From 389132400b6d3c1f7c767aaf3546d19e93909eb3 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Sun, 22 Jun 2025 13:50:15 -0400 Subject: Do text deserialization in chunks to avoid possible memory issues --- src/stdlib/text.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/stdlib/text.c b/src/stdlib/text.c index 2107c1df..8b01529d 100644 --- a/src/stdlib/text.c +++ b/src/stdlib/text.c @@ -1617,13 +1617,19 @@ public void Text$serialize(const void *obj, FILE *out, Table_t *pointers, const public void Text$deserialize(FILE *in, void *out, List_t *pointers, const TypeInfo_t *info) { (void)info; - int64_t len = -1; + int64_t len = 0; Int64$deserialize(in, &len, pointers, &Int64$info); - char *buf = GC_MALLOC_ATOMIC((size_t)len+1); - if (fread(buf, sizeof(char), (size_t)len, in) != (size_t)len) - fail("Not enough data in stream to deserialize"); - buf[len+1] = '\0'; - *(Text_t*)out = Text$from_strn(buf, (size_t)len); + if (len < 0) + fail("Invalid text length during deserialization!"); + Text_t text = EMPTY_TEXT; + while (text.length < len) { + static char chunk[1024] = {}; + size_t chunk_size = MIN(sizeof(chunk), (size_t)(len - text.length)); + if (fread(chunk, sizeof(char), chunk_size, in) != chunk_size) + fail("Not enough data in stream to deserialize"); + text = concat2(text, Text$from_strn(chunk, chunk_size)); + } + *(Text_t*)out = text; } public const TypeInfo_t Text$info = { -- cgit v1.2.3 From c09a4eece687c23846b922ec2ec8740859aaa210 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Tue, 24 Jun 2025 12:53:12 -0400 Subject: Revert "Do text deserialization in chunks to avoid possible memory issues" (Breaks with unicode text) This reverts commit 389132400b6d3c1f7c767aaf3546d19e93909eb3. --- src/stdlib/text.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/src/stdlib/text.c b/src/stdlib/text.c index 8b01529d..2107c1df 100644 --- a/src/stdlib/text.c +++ b/src/stdlib/text.c @@ -1617,19 +1617,13 @@ public void Text$serialize(const void *obj, FILE *out, Table_t *pointers, const public void Text$deserialize(FILE *in, void *out, List_t *pointers, const TypeInfo_t *info) { (void)info; - int64_t len = 0; + int64_t len = -1; Int64$deserialize(in, &len, pointers, &Int64$info); - if (len < 0) - fail("Invalid text length during deserialization!"); - Text_t text = EMPTY_TEXT; - while (text.length < len) { - static char chunk[1024] = {}; - size_t chunk_size = MIN(sizeof(chunk), (size_t)(len - text.length)); - if (fread(chunk, sizeof(char), chunk_size, in) != chunk_size) - fail("Not enough data in stream to deserialize"); - text = concat2(text, Text$from_strn(chunk, chunk_size)); - } - *(Text_t*)out = text; + char *buf = GC_MALLOC_ATOMIC((size_t)len+1); + if (fread(buf, sizeof(char), (size_t)len, in) != (size_t)len) + fail("Not enough data in stream to deserialize"); + buf[len+1] = '\0'; + *(Text_t*)out = Text$from_strn(buf, (size_t)len); } public const TypeInfo_t Text$info = { -- cgit v1.2.3 From 7f8735d84baeff4d41182d0682f8bf4aeaa07c8c Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Tue, 24 Jun 2025 12:54:49 -0400 Subject: Sanity check --- src/stdlib/text.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/stdlib/text.c b/src/stdlib/text.c index 2107c1df..9dedccdd 100644 --- a/src/stdlib/text.c +++ b/src/stdlib/text.c @@ -1617,8 +1617,10 @@ public void Text$serialize(const void *obj, FILE *out, Table_t *pointers, const public void Text$deserialize(FILE *in, void *out, List_t *pointers, const TypeInfo_t *info) { (void)info; - int64_t len = -1; + int64_t len = 0; Int64$deserialize(in, &len, pointers, &Int64$info); + if (len < 0) + fail("Cannot deserialize text with a negative length!"); char *buf = GC_MALLOC_ATOMIC((size_t)len+1); if (fread(buf, sizeof(char), (size_t)len, in) != (size_t)len) fail("Not enough data in stream to deserialize"); -- cgit v1.2.3 From e122e5525ef044c25c2cd888bc267f7a2bd91b4f Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Tue, 24 Jun 2025 12:56:23 -0400 Subject: Minor performance improvement for hash collisions in tables --- src/stdlib/tables.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/src/stdlib/tables.c b/src/stdlib/tables.c index 762afb06..9301914c 100644 --- a/src/stdlib/tables.c +++ b/src/stdlib/tables.c @@ -177,20 +177,16 @@ static void Table$set_bucket(Table_t *t, const void *entry, int32_t index, const // Move mid-chain entry to free space and update predecessor buckets[predecessor].next_bucket = t->bucket_info->last_free; buckets[t->bucket_info->last_free] = *bucket; - } else { // Collided with the start of a chain + + bucket->occupied = 1; + bucket->index = index; + bucket->next_bucket = END_OF_CHAIN; + } else { // Collided with the start of a chain, put the new entry in chain position #2 hdebug("Hit start of a chain\n"); - uint64_t end_of_chain = hash; - while (buckets[end_of_chain].next_bucket != END_OF_CHAIN) - end_of_chain = buckets[end_of_chain].next_bucket; - hdebug("Appending to chain\n"); - // Chain now ends on the free space: - buckets[end_of_chain].next_bucket = t->bucket_info->last_free; - bucket = &buckets[t->bucket_info->last_free]; - } - - bucket->occupied = 1; - bucket->index = index; - bucket->next_bucket = END_OF_CHAIN; + buckets[t->bucket_info->last_free] = (bucket_t){ + .occupied = 1, .index=index, .next_bucket=bucket->next_bucket}; + bucket->next_bucket = t->bucket_info->last_free; + } hshow(t); } -- cgit v1.2.3 From 271017ba9970e4220e1bd0dc83ce146afe9222a2 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Tue, 24 Jun 2025 13:37:09 -0400 Subject: Add Path.has_extension() and update manpages/api docs --- CHANGES.md | 3 ++ api/api.md | 77 +++++++++++++++++++++++++++++++++++++ api/paths.md | 28 ++++++++++++++ api/paths.yaml | 28 ++++++++++++++ api/sets.md | 24 ++++++++++++ api/tables.md | 27 +++++++++++++ man/man3/tomo-Path.has_extension.3 | 41 ++++++++++++++++++++ man/man3/tomo-Table.with_fallback.3 | 40 +++++++++++++++++++ man/man3/tomo-Table.xor.3 | 35 +++++++++++++++++ src/environment.c | 1 + src/stdlib/paths.c | 16 ++++++++ src/stdlib/paths.h | 1 + test/paths.tm | 16 ++++++++ 13 files changed, 337 insertions(+) create mode 100644 man/man3/tomo-Path.has_extension.3 create mode 100644 man/man3/tomo-Table.with_fallback.3 create mode 100644 man/man3/tomo-Table.xor.3 diff --git a/CHANGES.md b/CHANGES.md index 40749904..1d3c2909 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -11,11 +11,14 @@ - Added `tomo --prefix` to print the Tomo install prefix. - Sets now support infix operations for `and`, `or`, `xor`, and `-` - Added Path.sibling() +- Added Path.has_extension() - Added Table.with_fallback() - Fixed bugs: - Negative integers weren't converting to text properly. - Mutation of a collection during iteration was violating value semantics. - `extend` statements weren't properly checking that the type name was valid. + - Lazy recompilation wasn't happening when `use ./foo.c` was used for local + C/assembly files. ## v0.2 diff --git a/api/api.md b/api/api.md index cbaa7a54..464ec58a 100644 --- a/api/api.md +++ b/api/api.md @@ -3009,6 +3009,34 @@ follow_symlinks | `Bool` | Whether to follow symbolic links. | `yes` >> (/non/existent/file).group() = none +``` +## Path.has_extension + +```tomo +Path.has_extension : func(path: Path, extension: Text -> Bool) +``` + +Return whether or not a path has a given file extension. + +Argument | Type | Description | Default +---------|------|-------------|--------- +path | `Path` | A path. | - +extension | `Text` | A file extension (leading `.` is optional). If empty, the check will test if the file does not have any file extension. | - + +**Return:** Whether or not the path has the given extension. + + +**Example:** +```tomo +>> (/foo.txt).has_extension("txt") += yes +>> (/foo.txt).has_extension(".txt") += yes +>> (/foo.tar.gz).has_extension("gz") += yes +>> (/foo.tar.gz).has_extension("zip") += no + ``` ## Path.is_directory @@ -3879,6 +3907,55 @@ t.set("C", 3) >> t = {"A"=1, "B"=2, "C"=3} +``` +## Table.with_fallback + +```tomo +Table.with_fallback : func(t: {K=V}, fallback: {K=V}? -> {K=V}) +``` + +Return a copy of a table with a different fallback table. + +Argument | Type | Description | Default +---------|------|-------------|--------- +t | `{K=V}` | The table whose fallback will be replaced. | - +fallback | `{K=V}?` | The new fallback table value. | - + +**Return:** The original table with a different fallback. + + +**Example:** +```tomo +t := {"A"=1; fallback={"B"=2}} +t2 = t.with_fallback({"B"=3"}) +>> t2["B"] += 3? +t3 = t.with_fallback(none) +>> t2["B"] += none + +``` +## Table.xor + +```tomo +Table.xor : func(a: |T|, b: |T| -> |T|) +``` + +Return set with the elements in one, but not both of the arguments. This is also known as the symmetric difference or disjunctive union. + +Argument | Type | Description | Default +---------|------|-------------|--------- +a | `|T|` | The first set. | - +b | `|T|` | The second set. | - + +**Return:** A set with the symmetric difference of the arguments. + + +**Example:** +```tomo +>> |1, 2, 3|.xor(|2, 3, 4|) += |1, 4| + ``` # Text diff --git a/api/paths.md b/api/paths.md index ad6b894b..aade9b0f 100644 --- a/api/paths.md +++ b/api/paths.md @@ -488,6 +488,34 @@ follow_symlinks | `Bool` | Whether to follow symbolic links. | `yes` >> (/non/existent/file).group() = none +``` +## Path.has_extension + +```tomo +Path.has_extension : func(path: Path, extension: Text -> Bool) +``` + +Return whether or not a path has a given file extension. + +Argument | Type | Description | Default +---------|------|-------------|--------- +path | `Path` | A path. | - +extension | `Text` | A file extension (leading `.` is optional). If empty, the check will test if the file does not have any file extension. | - + +**Return:** Whether or not the path has the given extension. + + +**Example:** +```tomo +>> (/foo.txt).has_extension("txt") += yes +>> (/foo.txt).has_extension(".txt") += yes +>> (/foo.tar.gz).has_extension("gz") += yes +>> (/foo.tar.gz).has_extension("zip") += no + ``` ## Path.is_directory diff --git a/api/paths.yaml b/api/paths.yaml index 40813129..1bfe5d6d 100644 --- a/api/paths.yaml +++ b/api/paths.yaml @@ -463,6 +463,34 @@ Path.group: = "root" >> (/non/existent/file).group() = none + +Path.has_extension: + short: check if a path has a given extension + description: > + Return whether or not a path has a given file extension. + return: + type: 'Bool' + description: > + Whether or not the path has the given extension. + args: + path: + type: 'Path' + description: > + A path. + extension: + type: 'Text' + description: > + A file extension (leading `.` is optional). If empty, the check will + test if the file does not have any file extension. + example: | + >> (/foo.txt).has_extension("txt") + = yes + >> (/foo.txt).has_extension(".txt") + = yes + >> (/foo.tar.gz).has_extension("gz") + = yes + >> (/foo.tar.gz).has_extension("zip") + = no Path.is_directory: short: check if a path is a directory diff --git a/api/sets.md b/api/sets.md index b64439c2..8b07cf08 100644 --- a/api/sets.md +++ b/api/sets.md @@ -241,3 +241,27 @@ other | `|T|` | The set of items to remove from the original set. | - = |1| ``` + +# Table +## Table.xor + +```tomo +Table.xor : func(a: |T|, b: |T| -> |T|) +``` + +Return set with the elements in one, but not both of the arguments. This is also known as the symmetric difference or disjunctive union. + +Argument | Type | Description | Default +---------|------|-------------|--------- +a | `|T|` | The first set. | - +b | `|T|` | The second set. | - + +**Return:** A set with the symmetric difference of the arguments. + + +**Example:** +```tomo +>> |1, 2, 3|.xor(|2, 3, 4|) += |1, 4| + +``` diff --git a/api/tables.md b/api/tables.md index 580488f4..26bfe908 100644 --- a/api/tables.md +++ b/api/tables.md @@ -164,3 +164,30 @@ t.set("C", 3) = {"A"=1, "B"=2, "C"=3} ``` +## Table.with_fallback + +```tomo +Table.with_fallback : func(t: {K=V}, fallback: {K=V}? -> {K=V}) +``` + +Return a copy of a table with a different fallback table. + +Argument | Type | Description | Default +---------|------|-------------|--------- +t | `{K=V}` | The table whose fallback will be replaced. | - +fallback | `{K=V}?` | The new fallback table value. | - + +**Return:** The original table with a different fallback. + + +**Example:** +```tomo +t := {"A"=1; fallback={"B"=2}} +t2 = t.with_fallback({"B"=3"}) +>> t2["B"] += 3? +t3 = t.with_fallback(none) +>> t2["B"] += none + +``` diff --git a/man/man3/tomo-Path.has_extension.3 b/man/man3/tomo-Path.has_extension.3 new file mode 100644 index 00000000..4556c819 --- /dev/null +++ b/man/man3/tomo-Path.has_extension.3 @@ -0,0 +1,41 @@ +'\" t +.\" Copyright (c) 2025 Bruce Hill +.\" All rights reserved. +.\" +.TH Path.has_extension 3 2025-06-24 "Tomo man-pages" +.SH NAME +Path.has_extension \- check if a path has a given extension +.SH LIBRARY +Tomo Standard Library +.SH SYNOPSIS +.nf +.BI Path.has_extension\ :\ func(path:\ Path,\ extension:\ Text\ ->\ Bool) +.fi +.SH DESCRIPTION +Return whether or not a path has a given file extension. + + +.SH ARGUMENTS + +.TS +allbox; +lb lb lbx lb +l l l l. +Name Type Description Default +path Path A path. - +extension Text A file extension (leading `.` is optional). If empty, the check will test if the file does not have any file extension. - +.TE +.SH RETURN +Whether or not the path has the given extension. + +.SH EXAMPLES +.EX +>> (/foo.txt).has_extension("txt") += yes +>> (/foo.txt).has_extension(".txt") += yes +>> (/foo.tar.gz).has_extension("gz") += yes +>> (/foo.tar.gz).has_extension("zip") += no +.EE diff --git a/man/man3/tomo-Table.with_fallback.3 b/man/man3/tomo-Table.with_fallback.3 new file mode 100644 index 00000000..89c78fe1 --- /dev/null +++ b/man/man3/tomo-Table.with_fallback.3 @@ -0,0 +1,40 @@ +'\" t +.\" Copyright (c) 2025 Bruce Hill +.\" All rights reserved. +.\" +.TH Table.with_fallback 3 2025-06-24 "Tomo man-pages" +.SH NAME +Table.with_fallback \- return a table with a new fallback +.SH LIBRARY +Tomo Standard Library +.SH SYNOPSIS +.nf +.BI Table.with_fallback\ :\ func(t:\ {K=V},\ fallback:\ {K=V}?\ ->\ {K=V}) +.fi +.SH DESCRIPTION +Return a copy of a table with a different fallback table. + + +.SH ARGUMENTS + +.TS +allbox; +lb lb lbx lb +l l l l. +Name Type Description Default +t {K=V} The table whose fallback will be replaced. - +fallback {K=V}? The new fallback table value. - +.TE +.SH RETURN +The original table with a different fallback. + +.SH EXAMPLES +.EX +t := {"A"=1; fallback={"B"=2}} +t2 = t.with_fallback({"B"=3"}) +>> t2["B"] += 3? +t3 = t.with_fallback(none) +>> t2["B"] += none +.EE diff --git a/man/man3/tomo-Table.xor.3 b/man/man3/tomo-Table.xor.3 new file mode 100644 index 00000000..8d3cb8f2 --- /dev/null +++ b/man/man3/tomo-Table.xor.3 @@ -0,0 +1,35 @@ +'\" t +.\" Copyright (c) 2025 Bruce Hill +.\" All rights reserved. +.\" +.TH Table.xor 3 2025-06-24 "Tomo man-pages" +.SH NAME +Table.xor \- symmetric difference +.SH LIBRARY +Tomo Standard Library +.SH SYNOPSIS +.nf +.BI Table.xor\ :\ func(a:\ |T|,\ b:\ |T|\ ->\ |T|) +.fi +.SH DESCRIPTION +Return set with the elements in one, but not both of the arguments. This is also known as the symmetric difference or disjunctive union. + + +.SH ARGUMENTS + +.TS +allbox; +lb lb lbx lb +l l l l. +Name Type Description Default +a |T| The first set. - +b |T| The second set. - +.TE +.SH RETURN +A set with the symmetric difference of the arguments. + +.SH EXAMPLES +.EX +>> |1, 2, 3|.xor(|2, 3, 4|) += |1, 4| +.EE diff --git a/src/environment.c b/src/environment.c index db17da94..5fb6714a 100644 --- a/src/environment.c +++ b/src/environment.c @@ -308,6 +308,7 @@ env_t *global_env(bool source_mapping) {"from_components", "Path$from_components", "func(components:[Text] -> Path)"}, {"glob", "Path$glob", "func(path:Path -> [Path])"}, {"group", "Path$group", "func(path:Path, follow_symlinks=yes -> Text?)"}, + {"has_extension", "Path$has_extension", "func(path:Path, extension:Text -> Bool)"}, {"is_directory", "Path$is_directory", "func(path:Path, follow_symlinks=yes -> Bool)"}, {"is_file", "Path$is_file", "func(path:Path, follow_symlinks=yes -> Bool)"}, {"is_pipe", "Path$is_pipe", "func(path:Path, follow_symlinks=yes -> Bool)"}, diff --git a/src/stdlib/paths.c b/src/stdlib/paths.c index 5047d615..772fa1fd 100644 --- a/src/stdlib/paths.c +++ b/src/stdlib/paths.c @@ -609,6 +609,22 @@ public Text_t Path$extension(Path_t path, bool full) return Text$from_str(extension); } +public bool Path$has_extension(Path_t path, Text_t extension) +{ + if (path.components.length < 2) + return extension.length == 0; + + Text_t last = *(Text_t*)(path.components.data + path.components.stride*(path.components.length-1)); + + if (extension.length == 0) + return !Text$has(Text$from(last, I(2)), Text(".")) || Text$equal_values(last, Text("..")); + + if (!Text$starts_with(extension, Text("."))) + extension = Texts(Text("."), extension); + + return Text$ends_with(Text$from(last, I(2)), extension); +} + public Path_t Path$child(Path_t path, Text_t name) { if (Text$has(name, Text("/")) || Text$has(name, Text(";"))) diff --git a/src/stdlib/paths.h b/src/stdlib/paths.h index 4f94d3e4..3a9cdef7 100644 --- a/src/stdlib/paths.h +++ b/src/stdlib/paths.h @@ -51,6 +51,7 @@ Path_t Path$write_unique_bytes(Path_t path, List_t bytes); Path_t Path$parent(Path_t path); Text_t Path$base_name(Path_t path); Text_t Path$extension(Path_t path, bool full); +bool Path$has_extension(Path_t path, Text_t extension); Path_t Path$child(Path_t path, Text_t name); Path_t Path$sibling(Path_t path, Text_t name); Path_t Path$with_extension(Path_t path, Text_t extension, bool replace); diff --git a/test/paths.tm b/test/paths.tm index 7685d089..14a15299 100644 --- a/test/paths.tm +++ b/test/paths.tm @@ -60,6 +60,22 @@ func main() = "tar.gz" >> p.extension(full=no) = "gz" + >> p.has_extension("gz") + = yes + >> p.has_extension(".gz") + = yes + >> p.has_extension("tar.gz") + = yes + >> p.has_extension("txt") + = no + >> p.has_extension("") + = no + >> (./foo).has_extension("") + = yes + >> (..).has_extension("") + = yes + >> (~/.foo).has_extension("foo") + = no >> (~/.foo).extension() = "" >> (~/foo).extension() -- cgit v1.2.3 From 6e50989b21214c107b5075782a070e3df25aa333 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Tue, 24 Jun 2025 13:44:41 -0400 Subject: Correctly detect changes to `use`d C/header files and trigger recompilation --- src/tomo.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/tomo.c b/src/tomo.c index b534c491..32acf8e6 100644 --- a/src/tomo.c +++ b/src/tomo.c @@ -647,9 +647,21 @@ void build_file_dependency_graph(Path_t path, Table_t *to_compile, Table_t *to_l asm_path = Path$concat(Path$parent(path), asm_path); Text_t linker_text = Path$as_text(&asm_path, NULL, &Path$info); Table$set(to_link, &linker_text, NULL, Table$info(&Text$info, &Void$info)); + if (is_stale(build_file(path, ".o"), asm_path, false)) { + staleness.o = true; + Table$set(to_compile, &path, &staleness, Table$info(&Path$info, &Byte$info)); + } + break; + } + case USE_HEADER: case USE_C_CODE: { + Path_t dep_path = Path$from_str(use->path); + if (is_stale(build_file(path, ".o"), dep_path, false)) { + staleness.o = true; + Table$set(to_compile, &path, &staleness, Table$info(&Path$info, &Byte$info)); + } break; } - default: case USE_HEADER: break; + default: break; } } } -- cgit v1.2.3 From d3b78e14e4d26c877f88ee64022e541bbb367df1 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Wed, 25 Jun 2025 12:35:08 -0400 Subject: Trigger recompilation when downstream #includes for C files are updated. --- CHANGES.md | 2 +- src/tomo.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 1d3c2909..1fba98fc 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -18,7 +18,7 @@ - Mutation of a collection during iteration was violating value semantics. - `extend` statements weren't properly checking that the type name was valid. - Lazy recompilation wasn't happening when `use ./foo.c` was used for local - C/assembly files. + C/assembly files or their `#include`s. ## v0.2 diff --git a/src/tomo.c b/src/tomo.c index 32acf8e6..0f131513 100644 --- a/src/tomo.c +++ b/src/tomo.c @@ -666,6 +666,46 @@ void build_file_dependency_graph(Path_t path, Table_t *to_compile, Table_t *to_l } } +time_t latest_c_modification_time(Path_t path) +{ + static Table_t c_modification_times = {}; + const TypeInfo_t time_info = {.size=sizeof(time_t), .align=alignof(time_t), .tag=OpaqueInfo}; + time_t *cached_latest = Table$get(c_modification_times, &path, Table$info(&Path$info, &time_info)); + if (cached_latest) return *cached_latest; + + struct stat s; + time_t latest = 0; + if (stat(Path$as_c_string(path), &s) == 0) + latest = s.st_mtime; + Table$set(&c_modification_times, &path, &latest, Table$info(&Path$info, &time_info)); + + OptionalClosure_t by_line = Path$by_line(path); + if (by_line.fn == NULL) return 0; + OptionalText_t (*next_line)(void*) = by_line.fn; + Path_t parent = Path$parent(path); + for (Text_t line; (line=next_line(by_line.userdata)).length >= 0; ) { + line = Text$trim(line, Text(" \t"), true, false); + if (!Text$starts_with(line, Text("#include"))) + continue; + + List_t tokens = Text$split_any(line, Text(" \t")); + if (tokens.length < 2 || !Text$equal_values(*(Text_t*)tokens.data, Text("#include"))) + continue; + + Text_t include = *(Text_t*)(tokens.data + tokens.stride); + if (!Text$starts_with(include, Text("\""))) + continue; + + Path_t included_path = Path$resolved(Path$from_text(Text$trim(include, Text("\""), true, true)), parent); + time_t included_time = latest_c_modification_time(included_path); + if (included_time > latest) { + latest = included_time; + Table$set(&c_modification_times, &path, &latest, Table$info(&Path$info, &time_info)); + } + } + return latest; +} + bool is_stale(Path_t path, Path_t relative_to, bool ignore_missing) { struct stat target_stat; @@ -680,6 +720,11 @@ bool is_stale(Path_t path, Path_t relative_to, bool ignore_missing) return true; #endif + if (Path$has_extension(relative_to, Text("c")) || Path$has_extension(relative_to, Text("h"))) { + time_t mtime = latest_c_modification_time(relative_to); + return target_stat.st_mtime < mtime; + } + struct stat relative_to_stat; if (stat(Path$as_c_string(relative_to), &relative_to_stat) != 0) { if (ignore_missing) return false; -- cgit v1.2.3 From 45171e9a0f7632d3011f0f75a1f1691348e6632a Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Wed, 25 Jun 2025 12:55:17 -0400 Subject: Add fix for `.include` or `#included` files in assembly failing to trigger recompilation --- src/tomo.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/tomo.c b/src/tomo.c index 0f131513..47b22251 100644 --- a/src/tomo.c +++ b/src/tomo.c @@ -666,7 +666,7 @@ void build_file_dependency_graph(Path_t path, Table_t *to_compile, Table_t *to_l } } -time_t latest_c_modification_time(Path_t path) +time_t latest_included_modification_time(Path_t path) { static Table_t c_modification_times = {}; const TypeInfo_t time_info = {.size=sizeof(time_t), .align=alignof(time_t), .tag=OpaqueInfo}; @@ -683,13 +683,20 @@ time_t latest_c_modification_time(Path_t path) if (by_line.fn == NULL) return 0; OptionalText_t (*next_line)(void*) = by_line.fn; Path_t parent = Path$parent(path); + bool allow_dot_include = Path$has_extension(path, Text("s")) || Path$has_extension(path, Text("S")); for (Text_t line; (line=next_line(by_line.userdata)).length >= 0; ) { line = Text$trim(line, Text(" \t"), true, false); - if (!Text$starts_with(line, Text("#include"))) + if (!Text$starts_with(line, Text("#include")) && !(allow_dot_include && Text$starts_with(line, Text(".include")))) continue; List_t tokens = Text$split_any(line, Text(" \t")); - if (tokens.length < 2 || !Text$equal_values(*(Text_t*)tokens.data, Text("#include"))) + if (tokens.length < 2) + continue; + + // Previously, we just checked for `#include...`, but it could be + // `#includenotreally`, so we should check for those types of thing to be safe. + if (!Text$equal_values(*(Text_t*)tokens.data, Text("#include")) + && !(allow_dot_include && Text$equal_values(*(Text_t*)tokens.data, Text(".include")))) continue; Text_t include = *(Text_t*)(tokens.data + tokens.stride); @@ -697,7 +704,7 @@ time_t latest_c_modification_time(Path_t path) continue; Path_t included_path = Path$resolved(Path$from_text(Text$trim(include, Text("\""), true, true)), parent); - time_t included_time = latest_c_modification_time(included_path); + time_t included_time = latest_included_modification_time(included_path); if (included_time > latest) { latest = included_time; Table$set(&c_modification_times, &path, &latest, Table$info(&Path$info, &time_info)); @@ -720,8 +727,9 @@ bool is_stale(Path_t path, Path_t relative_to, bool ignore_missing) return true; #endif - if (Path$has_extension(relative_to, Text("c")) || Path$has_extension(relative_to, Text("h"))) { - time_t mtime = latest_c_modification_time(relative_to); + if (Path$has_extension(relative_to, Text("c")) || Path$has_extension(relative_to, Text("h")) + || Path$has_extension(relative_to, Text("s")) || Path$has_extension(relative_to, Text("S"))) { + time_t mtime = latest_included_modification_time(relative_to); return target_stat.st_mtime < mtime; } -- cgit v1.2.3 From c2653404944dbd5a6f737877f0bad6fd1de018f1 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Wed, 25 Jun 2025 13:05:30 -0400 Subject: Clean up #include parsing so it works correctly with comments --- src/tomo.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/tomo.c b/src/tomo.c index 47b22251..187495ce 100644 --- a/src/tomo.c +++ b/src/tomo.c @@ -689,21 +689,16 @@ time_t latest_included_modification_time(Path_t path) if (!Text$starts_with(line, Text("#include")) && !(allow_dot_include && Text$starts_with(line, Text(".include")))) continue; - List_t tokens = Text$split_any(line, Text(" \t")); - if (tokens.length < 2) + // Check for `"` after `#include` or `.include` and some spaces: + if (!Text$starts_with(Text$trim(Text$from(line, I(9)), Text(" \t"), true, false), Text("\""))) continue; - // Previously, we just checked for `#include...`, but it could be - // `#includenotreally`, so we should check for those types of thing to be safe. - if (!Text$equal_values(*(Text_t*)tokens.data, Text("#include")) - && !(allow_dot_include && Text$equal_values(*(Text_t*)tokens.data, Text(".include")))) + List_t chunks = Text$split(line, Text("\"")); + if (chunks.length < 3) // Should be `#include "foo" ...` -> ["#include ", "foo", "..."] continue; - Text_t include = *(Text_t*)(tokens.data + tokens.stride); - if (!Text$starts_with(include, Text("\""))) - continue; - - Path_t included_path = Path$resolved(Path$from_text(Text$trim(include, Text("\""), true, true)), parent); + Text_t included = *(Text_t*)(chunks.data + 1*chunks.stride); + Path_t included_path = Path$resolved(Path$from_text(included), parent); time_t included_time = latest_included_modification_time(included_path); if (included_time > latest) { latest = included_time; -- cgit v1.2.3 From 8a4d5dc57b14e7c947c25970bb4d4f4ef91450f4 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Thu, 26 Jun 2025 13:06:47 -0400 Subject: Add get_bit() method for Ints and Bytes --- CHANGES.md | 1 + api/api.md | 60 ++++++++++++++++++++++++++++++++++++++++++++ api/bytes.md | 30 ++++++++++++++++++++++ api/bytes.yaml | 30 ++++++++++++++++++++++ api/integers.md | 30 ++++++++++++++++++++++ api/integers.yaml | 36 +++++++++++++++++++++++++- man/man3/tomo-Byte.get_bit.3 | 44 ++++++++++++++++++++++++++++++++ man/man3/tomo-Int.get_bit.3 | 44 ++++++++++++++++++++++++++++++++ src/environment.c | 10 ++++++-- src/stdlib/bytes.c | 8 ++++++ src/stdlib/bytes.h | 1 + src/stdlib/integers.c | 20 +++++++++++++++ src/stdlib/integers.h | 2 ++ test/bytes.tm | 9 +++++++ test/integers.tm | 18 +++++++++++++ 15 files changed, 340 insertions(+), 3 deletions(-) create mode 100644 man/man3/tomo-Byte.get_bit.3 create mode 100644 man/man3/tomo-Int.get_bit.3 diff --git a/CHANGES.md b/CHANGES.md index 1fba98fc..ce93bc8c 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -13,6 +13,7 @@ - Added Path.sibling() - Added Path.has_extension() - Added Table.with_fallback() +- Added Int*.get_bit() and Byte.get_bit() - Fixed bugs: - Negative integers weren't converting to text properly. - Mutation of a collection during iteration was violating value semantics. diff --git a/api/api.md b/api/api.md index 464ec58a..9711ab44 100644 --- a/api/api.md +++ b/api/api.md @@ -210,6 +210,36 @@ text | `Text` | The string containing the boolean value. | - ``` # Byte +## Byte.get_bit + +```tomo +Byte.get_bit : func(i: Byte, bit_index: Int -> Bool) +``` + +In the binary representation of a byte, check whether a given bit index is set to 1 or not. + +The bit index must be between 1-8 or a runtime error will be produced. + +Argument | Type | Description | Default +---------|------|-------------|--------- +i | `Byte` | The byte whose bits are being inspected. | - +bit_index | `Int` | The index of the bit to check (1-indexed, range 1-8). | - + +**Return:** Whether or not the given bit index is set to 1 in the byte. + + +**Example:** +```tomo +>> Byte(6).get_bit(1) += no +>> Byte(6).get_bit(2) += yes +>> Byte(6).get_bit(3) += yes +>> Byte(6).get_bit(4) += no + +``` ## Byte.hex ```tomo @@ -401,6 +431,36 @@ n | `Int` | The integer to compute the factorial of. | - >> (10).factorial() = 3628800 +``` +## Int.get_bit + +```tomo +Int.get_bit : func(i: Int, bit_index: Int -> Bool) +``` + +In the binary representation of an integer, check whether a given bit index is set to 1 or not. + +For fixed-size integers, the bit index must be between 1 and the number of bits in that integer (i.e. 1-64 for `Int64`). For `Int`, the bit index must be between 1 and `Int64.max`. Values outside this range will produce a runtime error. + +Argument | Type | Description | Default +---------|------|-------------|--------- +i | `Int` | The integer whose bits are being inspected. | - +bit_index | `Int` | The index of the bit to check (1-indexed). | - + +**Return:** Whether or not the given bit index is set to 1 in the binary representation of the integer. + + +**Example:** +```tomo +>> (6).get_bit(1) += no +>> (6).get_bit(2) += yes +>> (6).get_bit(3) += yes +>> (6).get_bit(4) += no + ``` ## Int.hex diff --git a/api/bytes.md b/api/bytes.md index 598c92b7..908d78e2 100644 --- a/api/bytes.md +++ b/api/bytes.md @@ -3,6 +3,36 @@ # Builtins # Byte +## Byte.get_bit + +```tomo +Byte.get_bit : func(i: Byte, bit_index: Int -> Bool) +``` + +In the binary representation of a byte, check whether a given bit index is set to 1 or not. + +The bit index must be between 1-8 or a runtime error will be produced. + +Argument | Type | Description | Default +---------|------|-------------|--------- +i | `Byte` | The byte whose bits are being inspected. | - +bit_index | `Int` | The index of the bit to check (1-indexed, range 1-8). | - + +**Return:** Whether or not the given bit index is set to 1 in the byte. + + +**Example:** +```tomo +>> Byte(6).get_bit(1) += no +>> Byte(6).get_bit(2) += yes +>> Byte(6).get_bit(3) += yes +>> Byte(6).get_bit(4) += no + +``` ## Byte.hex ```tomo diff --git a/api/bytes.yaml b/api/bytes.yaml index 52f48528..2785513d 100644 --- a/api/bytes.yaml +++ b/api/bytes.yaml @@ -1,3 +1,33 @@ +Byte.get_bit: + short: check whether a bit is set + description: > + In the binary representation of a byte, check whether a given bit index is + set to 1 or not. + note: > + The bit index must be between 1-8 or a runtime error will be produced. + return: + type: 'Bool' + description: > + Whether or not the given bit index is set to 1 in the byte. + args: + i: + type: 'Byte' + description: > + The byte whose bits are being inspected. + bit_index: + type: 'Int' + description: > + The index of the bit to check (1-indexed, range 1-8). + example: | + >> Byte(6).get_bit(1) + = no + >> Byte(6).get_bit(2) + = yes + >> Byte(6).get_bit(3) + = yes + >> Byte(6).get_bit(4) + = no + Byte.hex: short: convert to hexidecimal description: > diff --git a/api/integers.md b/api/integers.md index 0865e93f..efb891bf 100644 --- a/api/integers.md +++ b/api/integers.md @@ -89,6 +89,36 @@ n | `Int` | The integer to compute the factorial of. | - >> (10).factorial() = 3628800 +``` +## Int.get_bit + +```tomo +Int.get_bit : func(i: Int, bit_index: Int -> Bool) +``` + +In the binary representation of an integer, check whether a given bit index is set to 1 or not. + +For fixed-size integers, the bit index must be between 1 and the number of bits in that integer (i.e. 1-64 for `Int64`). For `Int`, the bit index must be between 1 and `Int64.max`. Values outside this range will produce a runtime error. + +Argument | Type | Description | Default +---------|------|-------------|--------- +i | `Int` | The integer whose bits are being inspected. | - +bit_index | `Int` | The index of the bit to check (1-indexed). | - + +**Return:** Whether or not the given bit index is set to 1 in the binary representation of the integer. + + +**Example:** +```tomo +>> (6).get_bit(1) += no +>> (6).get_bit(2) += yes +>> (6).get_bit(3) += yes +>> (6).get_bit(4) += no + ``` ## Int.hex diff --git a/api/integers.yaml b/api/integers.yaml index f927c75f..a91a21ce 100644 --- a/api/integers.yaml +++ b/api/integers.yaml @@ -81,7 +81,41 @@ Int.factorial: example: | >> (10).factorial() = 3628800 - + +Int.get_bit: + short: check whether a bit is set + description: > + In the binary representation of an integer, check whether a given bit index + is set to 1 or not. + note: > + For fixed-size integers, the bit index must be between 1 and the number of + bits in that integer (i.e. 1-64 for `Int64`). For `Int`, the bit index must + be between 1 and `Int64.max`. Values outside this range will produce a + runtime error. + return: + type: 'Bool' + description: > + Whether or not the given bit index is set to 1 in the binary + representation of the integer. + args: + i: + type: 'Int' + description: > + The integer whose bits are being inspected. + bit_index: + type: 'Int' + description: > + The index of the bit to check (1-indexed). + example: | + >> (6).get_bit(1) + = no + >> (6).get_bit(2) + = yes + >> (6).get_bit(3) + = yes + >> (6).get_bit(4) + = no + Int.hex: short: convert to hexidecimal description: > diff --git a/man/man3/tomo-Byte.get_bit.3 b/man/man3/tomo-Byte.get_bit.3 new file mode 100644 index 00000000..85d6c2ae --- /dev/null +++ b/man/man3/tomo-Byte.get_bit.3 @@ -0,0 +1,44 @@ +'\" t +.\" Copyright (c) 2025 Bruce Hill +.\" All rights reserved. +.\" +.TH Byte.get_bit 3 2025-06-26 "Tomo man-pages" +.SH NAME +Byte.get_bit \- check whether a bit is set +.SH LIBRARY +Tomo Standard Library +.SH SYNOPSIS +.nf +.BI Byte.get_bit\ :\ func(i:\ Byte,\ bit_index:\ Int\ ->\ Bool) +.fi +.SH DESCRIPTION +In the binary representation of a byte, check whether a given bit index is set to 1 or not. + + +.SH ARGUMENTS + +.TS +allbox; +lb lb lbx lb +l l l l. +Name Type Description Default +i Byte The byte whose bits are being inspected. - +bit_index Int The index of the bit to check (1-indexed, range 1-8). - +.TE +.SH RETURN +Whether or not the given bit index is set to 1 in the byte. + +.SH NOTES +The bit index must be between 1-8 or a runtime error will be produced. + +.SH EXAMPLES +.EX +>> Byte(6).get_bit(1) += no +>> Byte(6).get_bit(2) += yes +>> Byte(6).get_bit(3) += yes +>> Byte(6).get_bit(4) += no +.EE diff --git a/man/man3/tomo-Int.get_bit.3 b/man/man3/tomo-Int.get_bit.3 new file mode 100644 index 00000000..e0b98909 --- /dev/null +++ b/man/man3/tomo-Int.get_bit.3 @@ -0,0 +1,44 @@ +'\" t +.\" Copyright (c) 2025 Bruce Hill +.\" All rights reserved. +.\" +.TH Int.get_bit 3 2025-06-25 "Tomo man-pages" +.SH NAME +Int.get_bit \- check whether a bit is set +.SH LIBRARY +Tomo Standard Library +.SH SYNOPSIS +.nf +.BI Int.get_bit\ :\ func(i:\ Int,\ bit_index:\ Int\ ->\ Bool) +.fi +.SH DESCRIPTION +In the binary representation of an integer, check whether a given bit index is set to 1 or not. + + +.SH ARGUMENTS + +.TS +allbox; +lb lb lbx lb +l l l l. +Name Type Description Default +i Int The integer whose bits are being inspected. - +bit_index Int The index of the bit to check (1-indexed). - +.TE +.SH RETURN +Whether or not the given bit index is set to 1 in the binary representation of the integer. + +.SH NOTES +For fixed-size integers, the bit index must be between 1 and the number of bits in that integer (i.e. 1-64 for `Int64`). For `Int`, the bit index must be between 1 and `Int64.max`. Values outside this range will produce a runtime error. + +.SH EXAMPLES +.EX +>> (6).get_bit(1) += no +>> (6).get_bit(2) += yes +>> (6).get_bit(3) += yes +>> (6).get_bit(4) += no +.EE diff --git a/src/environment.c b/src/environment.c index 5fb6714a..eb2d28be 100644 --- a/src/environment.c +++ b/src/environment.c @@ -90,10 +90,11 @@ env_t *global_env(bool source_mapping) )}, {"Byte", Type(ByteType), "Byte_t", "Byte$info", TypedList(ns_entry_t, {"max", "Byte$max", "Byte"}, + {"get_bit", "Byte$get_bit", "func(x:Byte, bit_index:Int -> Bool)"}, {"hex", "Byte$hex", "func(byte:Byte, uppercase=yes, prefix=no -> Text)"}, - {"is_between", "Byte$is_between", "func(x:Byte,low:Byte,high:Byte -> Bool)"}, + {"is_between", "Byte$is_between", "func(x:Byte, low:Byte, high:Byte -> Bool)"}, {"min", "Byte$min", "Byte"}, - {"to", "Byte$to", "func(first:Byte,last:Byte,step:Int8?=none -> func(->Byte?))"}, + {"to", "Byte$to", "func(first:Byte, last:Byte, step:Int8?=none -> func(->Byte?))"}, )}, {"Int", Type(BigIntType), "Int_t", "Int$info", TypedList(ns_entry_t, {"abs", "Int$abs", "func(x:Int -> Int)"}, @@ -105,6 +106,7 @@ env_t *global_env(bool source_mapping) {"divided_by", "Int$divided_by", "func(x,y:Int -> Int)"}, {"factorial", "Int$factorial", "func(x:Int -> Int)"}, {"gcd", "Int$gcd", "func(x,y:Int -> Int)"}, + {"get_bit", "Int$get_bit", "func(x,bit_index:Int -> Bool)"}, {"hex", "Int$hex", "func(i:Int, digits=0, uppercase=yes, prefix=yes -> Text)"}, {"is_between", "Int$is_between", "func(x:Int,low:Int,high:Int -> Bool)"}, {"is_prime", "Int$is_prime", "func(x:Int,reps=50 -> Bool)"}, @@ -137,6 +139,7 @@ env_t *global_env(bool source_mapping) {"divided_by", "Int64$divided_by", "func(x,y:Int64 -> Int64)"}, {"gcd", "Int64$gcd", "func(x,y:Int64 -> Int64)"}, {"parse", "Int64$parse", "func(text:Text -> Int64?)"}, + {"get_bit", "Int64$get_bit", "func(x:Int64, bit_index:Int -> Bool)"}, {"hex", "Int64$hex", "func(i:Int64, digits=0, uppercase=yes, prefix=yes -> Text)"}, {"is_between", "Int64$is_between", "func(x:Int64,low:Int64,high:Int64 -> Bool)"}, {"max", "Int64$max", "Int64"}, @@ -158,6 +161,7 @@ env_t *global_env(bool source_mapping) {"divided_by", "Int32$divided_by", "func(x,y:Int32 -> Int32)"}, {"gcd", "Int32$gcd", "func(x,y:Int32 -> Int32)"}, {"parse", "Int32$parse", "func(text:Text -> Int32?)"}, + {"get_bit", "Int32$get_bit", "func(x:Int32, bit_index:Int -> Bool)"}, {"hex", "Int32$hex", "func(i:Int32, digits=0, uppercase=yes, prefix=yes -> Text)"}, {"is_between", "Int32$is_between", "func(x:Int32,low:Int32,high:Int32 -> Bool)"}, {"max", "Int32$max", "Int32"}, @@ -179,6 +183,7 @@ env_t *global_env(bool source_mapping) {"divided_by", "Int16$divided_by", "func(x,y:Int16 -> Int16)"}, {"gcd", "Int16$gcd", "func(x,y:Int16 -> Int16)"}, {"parse", "Int16$parse", "func(text:Text -> Int16?)"}, + {"get_bit", "Int16$get_bit", "func(x:Int16, bit_index:Int -> Bool)"}, {"hex", "Int16$hex", "func(i:Int16, digits=0, uppercase=yes, prefix=yes -> Text)"}, {"is_between", "Int16$is_between", "func(x:Int16,low:Int16,high:Int16 -> Bool)"}, {"max", "Int16$max", "Int16"}, @@ -200,6 +205,7 @@ env_t *global_env(bool source_mapping) {"divided_by", "Int8$divided_by", "func(x,y:Int8 -> Int8)"}, {"gcd", "Int8$gcd", "func(x,y:Int8 -> Int8)"}, {"parse", "Int8$parse", "func(text:Text -> Int8?)"}, + {"get_bit", "Int8$get_bit", "func(x:Int8, bit_index:Int -> Bool)"}, {"hex", "Int8$hex", "func(i:Int8, digits=0, uppercase=yes, prefix=yes -> Text)"}, {"is_between", "Int8$is_between", "func(x:Int8,low:Int8,high:Int8 -> Bool)"}, {"max", "Int8$max", "Int8"}, diff --git a/src/stdlib/bytes.c b/src/stdlib/bytes.c index b5c10aa2..48c8b93b 100644 --- a/src/stdlib/bytes.c +++ b/src/stdlib/bytes.c @@ -49,6 +49,14 @@ public Text_t Byte$hex(Byte_t byte, bool uppercase, bool prefix) { return text; } +public bool Byte$get_bit(Byte_t x, Int_t bit_index) { + if (Int$compare_value(bit_index, I(1)) < 0) + fail("Invalid bit index (expected 1 or higher): ", bit_index); + if (Int$compare_value(bit_index, I(8)) > 0) + fail("Bit index is too large! There are only 8 bits in a byte, but index is: ", bit_index); + return ((x & (Byte_t)(1L << (Int64$from_int(bit_index, true)-1L))) != 0); +} + #ifdef __TINYC__ #define __builtin_add_overflow(x, y, result) ({ *(result) = (x) + (y); false; }) #endif diff --git a/src/stdlib/bytes.h b/src/stdlib/bytes.h index 5c64687d..e733c274 100644 --- a/src/stdlib/bytes.h +++ b/src/stdlib/bytes.h @@ -30,5 +30,6 @@ extern const Byte_t Byte$max; extern const TypeInfo_t Byte$info; Text_t Byte$hex(Byte_t byte, bool uppercase, bool prefix); +bool Byte$get_bit(Byte_t x, Int_t bit_index); // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/src/stdlib/integers.c b/src/stdlib/integers.c index 7c623663..018798ec 100644 --- a/src/stdlib/integers.c +++ b/src/stdlib/integers.c @@ -359,6 +359,19 @@ public OptionalInt_t Int$sqrt(Int_t i) return Int$from_mpz(result); } +public bool Int$get_bit(Int_t x, Int_t bit_index) +{ + mpz_t i; + mpz_init_set_int(i, x); + if (Int$compare_value(bit_index, I(1)) < 0) + fail("Invalid bit index (expected 1 or higher): ", bit_index); + if (Int$compare_value(bit_index, Int$from_int64(INT64_MAX)) > 0) + fail("Bit index is too large! ", bit_index); + + int is_bit_set = mpz_tstbit(i, (mp_bitcnt_t)(Int64$from_int(bit_index, true)-1)); + return (bool)is_bit_set; +} + typedef struct { OptionalInt_t current, last; Int_t step; @@ -620,6 +633,13 @@ public void Int32$deserialize(FILE *in, void *outval, List_t *pointers, const Ty } \ return bit_list; \ } \ + public bool KindOfInt ## $get_bit(c_type x, Int_t bit_index) { \ + if (Int$compare_value(bit_index, I(1)) < 0) \ + fail("Invalid bit index (expected 1 or higher): ", bit_index); \ + if (Int$compare_value(bit_index, Int$from_int64(sizeof(c_type)*8)) > 0) \ + fail("Bit index is too large! There are only ", sizeof(c_type)*8, " bits, but index is: ", bit_index); \ + return ((x & (c_type)(1L << (Int64$from_int(bit_index, true)-1L))) != 0); \ + } \ typedef struct { \ Optional##KindOfInt##_t current, last; \ KindOfInt##_t step; \ diff --git a/src/stdlib/integers.h b/src/stdlib/integers.h index 4eaac916..beb26bd6 100644 --- a/src/stdlib/integers.h +++ b/src/stdlib/integers.h @@ -29,6 +29,7 @@ Text_t type_name ## $hex(c_type i, Int_t digits, bool uppercase, bool prefix); \ Text_t type_name ## $octal(c_type i, Int_t digits, bool prefix); \ List_t type_name ## $bits(c_type x); \ + bool type_name ## $get_bit(c_type x, Int_t bit_index); \ Closure_t type_name ## $to(c_type first, c_type last, Optional ## type_name ## _t step); \ Closure_t type_name ## $onward(c_type first, c_type step); \ PUREFUNC Optional ## type_name ## _t type_name ## $parse(Text_t text); \ @@ -105,6 +106,7 @@ Int_t Int$abs(Int_t x); Int_t Int$power(Int_t base, Int_t exponent); Int_t Int$gcd(Int_t x, Int_t y); OptionalInt_t Int$sqrt(Int_t i); +bool Int$get_bit(Int_t x, Int_t bit_index); #define BIGGEST_SMALL_INT 0x3fffffff #define SMALLEST_SMALL_INT -0x40000000 diff --git a/test/bytes.tm b/test/bytes.tm index 25efbeb8..a207c633 100644 --- a/test/bytes.tm +++ b/test/bytes.tm @@ -14,3 +14,12 @@ func main() = "0x0F" >> b.hex(uppercase=no) = "0f" + + >> Byte(0x06).get_bit(1) + = no + >> Byte(0x06).get_bit(2) + = yes + >> Byte(0x06).get_bit(3) + = yes + >> Byte(0x06).get_bit(4) + = no diff --git a/test/integers.tm b/test/integers.tm index 6ba1559c..3035ad3b 100644 --- a/test/integers.tm +++ b/test/integers.tm @@ -139,3 +139,21 @@ func main() = yes >> (3).is_between(100, 200) = no + + >> (6).get_bit(1) + = no + >> (6).get_bit(2) + = yes + >> (6).get_bit(3) + = yes + >> (6).get_bit(4) + = no + + >> Int64(6).get_bit(1) + = no + >> Int64(6).get_bit(2) + = yes + >> Int64(6).get_bit(3) + = yes + >> Int64(6).get_bit(4) + = no -- cgit v1.2.3 From ba7161d6a3156a966c21ea3e06168bdac9803819 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Sat, 28 Jun 2025 14:18:59 -0400 Subject: Greatly increase the maximum free space allocated when growing lists (from 63 -> 281,474,976,710,655) to fix degenerate performance when appending to large lists. --- src/stdlib/datatypes.h | 11 ++++++----- src/stdlib/paths.c | 2 +- src/stdlib/text.c | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/stdlib/datatypes.h b/src/stdlib/datatypes.h index 77c09ddb..135fb811 100644 --- a/src/stdlib/datatypes.h +++ b/src/stdlib/datatypes.h @@ -7,12 +7,13 @@ #include #include -#define LIST_LENGTH_BITS 42 -#define LIST_FREE_BITS 6 +#define LIST_LENGTH_BITS 64 +#define LIST_FREE_BITS 48 +#define LIST_ATOMIC_BITS 1 #define LIST_REFCOUNT_BITS 3 #define LIST_STRIDE_BITS 12 -#define MAX_FOR_N_BITS(N) ((1<<(N))-1) +#define MAX_FOR_N_BITS(N) ((1L<<(N))-1L) #define LIST_MAX_STRIDE MAX_FOR_N_BITS(LIST_STRIDE_BITS-1) #define LIST_MIN_STRIDE (~MAX_FOR_N_BITS(LIST_STRIDE_BITS-1)) #define LIST_MAX_DATA_REFCOUNT MAX_FOR_N_BITS(LIST_REFCOUNT_BITS) @@ -42,8 +43,8 @@ typedef struct { // bit arithmetic to extract the necessary values, which is cheaper than // spilling onto the stack and needing to retrieve data from the stack. int64_t length:LIST_LENGTH_BITS; - uint8_t free:LIST_FREE_BITS; - bool atomic:1; + uint64_t free:LIST_FREE_BITS; + bool atomic:LIST_ATOMIC_BITS; uint8_t data_refcount:LIST_REFCOUNT_BITS; int16_t stride:LIST_STRIDE_BITS; } List_t; diff --git a/src/stdlib/paths.c b/src/stdlib/paths.c index 772fa1fd..94baf995 100644 --- a/src/stdlib/paths.c +++ b/src/stdlib/paths.c @@ -372,7 +372,7 @@ public OptionalList_t Path$read_bytes(Path_t path, OptionalInt_t count) close(fd); if (count.small != 0 && (int64_t)len < target_count) fail("Could not read ", target_count, " bytes from ", path, " (only got ", (uint64_t)len, ")"); - return (List_t){.data=content, .atomic=1, .stride=1, .length=len}; + return (List_t){.data=content, .atomic=1, .stride=1, .length=(int64_t)len}; } } diff --git a/src/stdlib/text.c b/src/stdlib/text.c index 9dedccdd..0f3f9519 100644 --- a/src/stdlib/text.c +++ b/src/stdlib/text.c @@ -1477,7 +1477,7 @@ public List_t Text$utf32_codepoints(Text_t text) public List_t Text$utf8_bytes(Text_t text) { const char *str = Text$as_c_string(text); - return (List_t){.length=strlen(str), .stride=1, .atomic=1, .data=(void*)str}; + return (List_t){.length=(int64_t)strlen(str), .stride=1, .atomic=1, .data=(void*)str}; } static INLINE const char *codepoint_name(ucs4_t c) -- cgit v1.2.3 From cd1e9b5fd52dbc993463d58c41895aba9cd78966 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Thu, 10 Jul 2025 14:44:33 -0400 Subject: Add text compression optimizations for unicode text --- docs/text.md | 63 +++++-- src/environment.c | 2 + src/stdlib/datatypes.h | 6 +- src/stdlib/text.c | 477 ++++++++++++++++++++++++++++++++++--------------- src/stdlib/text.h | 2 + 5 files changed, 384 insertions(+), 166 deletions(-) diff --git a/docs/text.md b/docs/text.md index f716de36..bff6ee4e 100644 --- a/docs/text.md +++ b/docs/text.md @@ -2,29 +2,30 @@ `Text` is Tomo's datatype to represent text. The name `Text` is used instead of "string" because Tomo text represents immutable, normalized unicode data with -fast indexing that has an implementation that is efficient for concatenation. -These are _not_ C-style NUL-terminated character lists. GNU libunistring is -used for full Unicode functionality (grapheme cluster counts, capitalization, -etc.). +fast indexing that has a tree-based implementation that is efficient for +concatenation. These are _not_ C-style NUL-terminated character arrays. GNU +libunistring is used for full Unicode functionality (grapheme cluster counts, +capitalization, etc.). ## Implementation Internally, Tomo text's implementation is based on [Raku/MoarVM's strings](https://docs.raku.org/language/unicode) and [Boehm et al's -Cords](https://www.cs.tufts.edu/comp/150FP/archive/hans-boehm/ropes.pdf). -Strings store their grapheme cluster count and either a compact list of 8-bit +Cords/Ropes](https://www.cs.tufts.edu/comp/150FP/archive/hans-boehm/ropes.pdf). +Texts store their grapheme cluster count and either a compact list of 8-bit ASCII characters (for ASCII text), a list of 32-bit normal-form grapheme -cluster values (see below), or a (roughly) balanced binary tree concatenation -of two texts. The upside is that repeated concatenations are typically a +cluster values (see below), a compressed form of grapheme clusters with a +lookup table, or a (roughly) balanced binary tree representing a concatenation. +The upside of this approach is that repeated concatenations are typically a constant-time operation, which will occasionally require a small rebalancing -operation. Index-based text operations (like retrieving an arbitrary index or -slicing) are very fast: typically a constant-time operation for arbitrary -unicode text, but in the worst case scenario (text built from many -concatenations), `O(log(n))` time with very generous constant factors typically -amounting to only a handful of steps. Since concatenations use shared -substructures, they are very memory-efficient and can be used efficiently for -applications like implementing a text editor that stores a full edit history of -a large file's contents. +operation. Text is stored in a format that is highly memory-efficient and +index-based text operations (like retrieving an arbitrary index or slicing) are +very fast: typically a constant-time operation for arbitrary unicode text, but +in the worst case scenario (text built from many concatenations), `O(log(n))` +time with very generous constant factors typically amounting to only a handful +of steps. Since concatenations use shared substructures, they are very +memory-efficient and can be used efficiently for applications like implementing +a text editor that stores a full edit history of a large file's contents. ### Normal-Form Graphemes @@ -49,11 +50,37 @@ codepoints." Here are some examples: - `👩🏽‍🚀` is a single graphical cluster, but it's made up of several combining codepoints (`["WOMAN", "EMOJI MODIFIER FITZPATRICK TYPE-4", "ZERO WITDH JOINER", "ROCKET"]`). Since this can't be represented with a single - codepoint, we must create a synthetic codepoint for it. If this was the `n`th + codepoint, we must create a synthetic codepoint for it. If this was the 3rd synthetic codepoint that we've found, then we would represent it with the - number `-n`, which can be used to look it up in a lookup table. The lookup + number `-3`, which can be used to look it up in a lookup table. The lookup table holds the full sequence of codepoints used in the grapheme cluster. +### Text Compression + +One common property in natural language text is that most text is comprised of a +relatively small set of grapheme clusters that are frequently reused. To take +advantage of this property, Tomo's Text implementation scans along in new text +objects looking for spans of text that use 256 or fewer unique grapheme clusters +with many repetitions. If such a span is found, the text is stored using a small +translation table and one 8-bit unsigned integer per grapheme cluster in that +chunk (which is possible when there are 256 or fewer clusters in the text). This +means that, for the cost of a small array of 32-bit integers, we can store the +entire text using only one byte per grapheme cluster instead of a full 32-bit +integer. For example, a Greek-language text will typically use the Greek +alphabet, plus a few punctuation marks. Thus, if you have a text with a few +thousand Greek letters, we can efficiently store the text as a small lookup +table of around a hundred 32-bit integers and use one byte per "letter" in the +text. This representation is around 4x more efficient than using the UTF32 +representation to store each Unicode codepoint as a 32-bit integer, and it's +about 2x more efficient than using UTF8 to store each non-ASCII codepoint as a +multi-byte sequence. Different languages will have different efficiencies, but +in general, text will be stored significantly more efficiently than UTF32 and +somewhat more efficiently than UTF8. However, the big advantage of this approach +is that we get the ability to do constant-time random access of grapheme +clusters, while getting space efficiency that is almost always better than +variable-width UTF8 encoding (which does not support fast random access of any +kind). + ## Syntax Text has a flexible syntax designed to make it easy to hold values from diff --git a/src/environment.c b/src/environment.c index eb2d28be..0dbe015d 100644 --- a/src/environment.c +++ b/src/environment.c @@ -355,9 +355,11 @@ env_t *global_env(bool source_mapping) {"from_text", "Path$from_text", "func(text:Text -> Path)"}, {"has", "Text$has", "func(text:Text, target:Text -> Bool)"}, {"join", "Text$join", "func(glue:Text, pieces:[Text] -> Text)"}, + {"layout", "Text$layout", "func(text:Text -> Text)"}, {"left_pad", "Text$left_pad", "func(text:Text, count:Int, pad=' ', language='C' -> Text)"}, {"lines", "Text$lines", "func(text:Text -> [Text])"}, {"lower", "Text$lower", "func(text:Text, language='C' -> Text)"}, + {"memory_size", "Text$memory_size", "func(text:Text -> Int)"}, {"middle_pad", "Text$middle_pad", "func(text:Text, count:Int, pad=' ', language='C' -> Text)"}, {"quoted", "Text$quoted", "func(text:Text, color=no, quotation_mark='\"' -> Text)"}, {"repeat", "Text$repeat", "func(text:Text, count:Int -> Text)"}, diff --git a/src/stdlib/datatypes.h b/src/stdlib/datatypes.h index 135fb811..fce1ea74 100644 --- a/src/stdlib/datatypes.h +++ b/src/stdlib/datatypes.h @@ -74,7 +74,7 @@ typedef struct { void *fn, *userdata; } Closure_t; -enum text_type { TEXT_ASCII, TEXT_GRAPHEMES, TEXT_CONCAT }; +enum text_type { TEXT_ASCII, TEXT_GRAPHEMES, TEXT_CONCAT, TEXT_BLOB }; typedef struct Text_s { int64_t length:54; // Number of grapheme clusters @@ -92,6 +92,10 @@ typedef struct Text_s { struct { const struct Text_s *left, *right; }; + struct { + const int32_t *map; + const uint8_t *bytes; + } blob; }; } Text_t; diff --git a/src/stdlib/text.c b/src/stdlib/text.c index 0f3f9519..80c267ed 100644 --- a/src/stdlib/text.c +++ b/src/stdlib/text.c @@ -1,31 +1,58 @@ // This file defines type info and methods for the Text datatype, which uses // libunistr for Unicode support and implements a datastructure based on a // hybrid of Raku/MoarVM's space-efficient grapheme cluster representation of -// strings and Cords (Boehm et al), which have good runtime performance for -// text constructed by a series of many concatenations. -// +// strings, combined with a mostly-balanced tree datastructure based on Cords +// (Boehm et al), which have good runtime performance for text constructed by a +// series of many concatenations. In practice, this means Tomo's Text has an +// extremely compact memory footprint (typically as compact as UTF8 or +// up to 2x better for some languages), with extremely fast operations +// including concatenation, random indexing, and taking substrings. + // For more information on MoarVM's grapheme cluster strings, see: // https://docs.raku.org/language/unicode -// https://github.com/MoarVM/MoarVM/blob/main/docs/strings.asciidoc For more -// information on Cords, see the paper "Ropes: an Alternative to Strings" -// (Boehm, Atkinson, Plass 1995): +// https://github.com/MoarVM/MoarVM/blob/main/docs/strings.asciidoc +// For more information on Cords, see the paper "Ropes: an Alternative to +// Strings" (Boehm, Atkinson, Plass 1995): // https://www.cs.tufts.edu/comp/150FP/archive/hans-boehm/ropes.pdf -// -// A note on grapheme clusters: In Unicode, codepoints can be represented using -// a 32-bit integer. Most codepoints correspond to the intuitive notion of a -// "letter", which is more formally known as a "grapheme cluster". A grapheme -// cluster is roughly speaking the amount of text that your cursor moves over -// when you press the arrow key once. However, some codepoints act as modifiers -// on other codepoints. For example, U+0301 (COMBINING ACUTE ACCENT) can modify -// a letter like "e" to form "é". During normalization, this frequently -// resolves down to a single unicode codepoint, in this case, "é" resolves to -// the single codepoint U+00E9 (LATIN SMALL LETTER E WITH ACUTE). However, in -// some cases, multiple codepoints make up a grapheme cluster but *don't* -// normalize to a single codepoint. For example, LATIN SMALL LETTER E (U+0065) -// + COMBINING VERTICAL LINE BELOW (U+0329) combine to form an unusual glyph -// that is not used frequently enough to warrant its own unique codepoint (this -// is basically what Zalgo text is). -// + +// Tomo's Text datatype represents Unicode text that is fully normalized using +// normalization form C (NFC). This means that all text created from source code +// or read in at runtime will respect normalization during comparison and other +// operations, and the original (potentially non-canonical) representation of +// text is not preserved. This also means that byte sequences that do not +// represent valid unicode text cannot be interpreted as the Text datatype. For +// example, a file with malformed UTF8 sequences cannot be read as Text. + +// A note on grapheme clusters: In Unicode, the fundamental unit is the +// "codepoint", which represents things like letters, symbols, emojis, +// combiners, and modifiers that alter other codepoints. However, most people +// have an intuitive notion of what a "letter" is that corresponds to the +// concept formally known as a grapheme cluster. A grapheme cluster is roughly +// speaking the amount of text that your cursor moves over when you press the +// left or right arrow key once. This often corresponds to a single codepoint, +// but some codepoints act as modifiers on other codepoints. For example, U+0301 +// (COMBINING ACUTE ACCENT) can modify a letter like "e" to form "é". During +// normalization, this frequently resolves down to a single unicode codepoint, +// in this case, "é" resolves to the single codepoint U+00E9 (LATIN SMALL LETTER +// E WITH ACUTE). However, in some cases, multiple codepoints make up a grapheme +// cluster but *don't* normalize to a single codepoint. For example, LATIN SMALL +// LETTER E (U+0065) + COMBINING VERTICAL LINE BELOW (U+0329) combine to form an +// unusual glyph that is not used frequently enough to warrant its own unique +// codepoint (this is basically what Zalgo text is). Emojis also use the ZERO +// WIDTH JOINER (U+200D) to add gender, skin tone, or other modifiers to emojis. +// Tomo takes an opinionated stance that grapheme clusters, not codepoints or +// bytes, are more useful to people when doing text operations like getting the +// "length" of a text or accessing the Nth "letter" of a text. If someone sends +// you a text with WOMAN (U+1F469) + ZERO WIDTH JOINER (U+200D) + ROCKET +// (U+1F680) followed by THUMBS UP (U+1F44D), it will render on your screen as +// two things: a female astronaut and a thumbs up, and this is how most people +// will think about the text. If you wish to operate on the raw codepoints that +// comprise the message, you are free to do so with the `.utf32_codepoints()` +// method and `Text.from_codepoints()`, but this is not the default behavior. +// The behavior for the given example is that `text.length == 2`, `text[1]` is +// the grapheme cluster representing a female astronaut emoji, and `text[2]` is +// the grapheme cluster representing the thumbs up emoji. + // There are a lot of benefits to storing unicode text with one grapheme // cluster per index in a densely packed list instead of storing the text as // variable-width UTF8-encoded bytes. It lets us have one canonical length for @@ -44,7 +71,7 @@ // things that would be nice if they had their own codepoint so things worked // out nicely because we're using them right now, and we'll give them a // negative number so it doesn't overlap with any real codepoints. -// + // Example 1: U+0048, U+00E9 AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E // WITH ACUTE This would be stored as: (int32_t[]){0x48, 0xE9} Example 2: // U+0048, U+0065, U+0309 AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E, @@ -52,6 +79,18 @@ // Where -2 is used as a lookup in a list that holds the actual unicode // codepoints: (ucs4_t[]){0x65, 0x0309} +// The text datastructure also uses a compact encoding (TEXT_BLOB) to store a +// per-chunk compressed form of the text when long stretches of text contain +// 256 or fewer unique grapheme clusters, which lets the text use a single byte +// for each grapheme cluster along with a lookup table. For typical text +// written in a variety of non-English natural languages (e.g. Spanish, Arabic, +// Japanese, Greek, German, Finnish, Basque), the in-memory representation +// takes up between 50-101% as much space as UTF8 encoding and between 24-39% +// as much space as UTF32 encoding, but with the advantage of extremely fast +// random access for indexing or slicing, unlike UTF8. In other words, this +// representation offers ASCII-like compactness and fast random access for +// non-English languages. + #include #include #include @@ -68,6 +107,7 @@ #include #include +#include "bytes.h" #include "datatypes.h" #include "lists.h" #include "integers.h" @@ -102,7 +142,6 @@ static int32_t num_synthetic_graphemes = 0; #define SHORT_ASCII_LENGTH 64 #define SHORT_GRAPHEMES_LENGTH 16 -static Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize); static Text_t simple_concatenation(Text_t a, Text_t b); public Text_t EMPTY_TEXT = { @@ -142,6 +181,9 @@ static const TypeInfo_t GraphemeClusterInfo = { #endif public int32_t get_synthetic_grapheme(const ucs4_t *codepoints, int64_t utf32_len) { + if (utf32_len == 1) + return (int32_t)*codepoints; + ucs4_t length_prefixed[1+utf32_len]; length_prefixed[0] = (ucs4_t)utf32_len; for (int i = 0; i < utf32_len; i++) @@ -173,6 +215,7 @@ public int32_t get_synthetic_grapheme(const ucs4_t *codepoints, int64_t utf32_le uint8_t u8_buf[64]; size_t u8_len = sizeof(u8_buf)/sizeof(u8_buf[0]); uint8_t *u8 = u32_to_u8(codepoints, (size_t)utf32_len, u8_buf, &u8_len); + if (u8 == NULL) fail("Invalid graphemes encountered!"); // For performance reasons, use an arena allocator here to ensure that // synthetic graphemes store all of their information in a densely packed @@ -247,6 +290,26 @@ public int Text$print(FILE *stream, Text_t t) uint8_t buf[8]; size_t len = sizeof(buf); uint8_t *u8 = u32_to_u8((ucs4_t*)&grapheme, 1, buf, &len); + if (u8 == NULL) fail("Invalid grapheme encountered: ", grapheme); + written += (int)fwrite(u8, sizeof(char), len, stream); + if (u8 != buf) free(u8); + } else { + const uint8_t *u8 = GRAPHEME_UTF8(grapheme); + assert(u8); + written += (int)fwrite(u8, sizeof(uint8_t), strlen((char*)u8), stream); + } + } + return written; + } + case TEXT_BLOB: { + int written = 0; + for (int64_t i = 0; i < t.length; i++) { + int32_t grapheme = t.blob.map[t.blob.bytes[i]]; + if (grapheme >= 0) { + uint8_t buf[8]; + size_t len = sizeof(buf); + uint8_t *u8 = u32_to_u8((ucs4_t*)&grapheme, 1, buf, &len); + if (u8 == NULL) fail("Invalid grapheme encountered: ", grapheme); written += (int)fwrite(u8, sizeof(char), len, stream); if (u8 != buf) free(u8); } else { @@ -258,8 +321,9 @@ public int Text$print(FILE *stream, Text_t t) return written; } case TEXT_CONCAT: { - return (Text$print(stream, *t.left) - + Text$print(stream, *t.right)); + int written = Text$print(stream, *t.left); + written += Text$print(stream, *t.right); + return written; } default: return 0; } @@ -275,18 +339,24 @@ static const int64_t min_len_for_depth[MAX_TEXT_DEPTH] = { #define IS_BALANCED_TEXT(t) ((t).length >= min_len_for_depth[(t).depth]) -static void insert_balanced(Text_t balanced_texts[MAX_TEXT_DEPTH], Text_t to_insert) +static void insert_balanced_recursive(Text_t balanced_texts[MAX_TEXT_DEPTH], Text_t text) { + if (text.tag == TEXT_CONCAT && (!IS_BALANCED_TEXT(text) || text.depth >= MAX_TEXT_DEPTH)) { + insert_balanced_recursive(balanced_texts, *text.left); + insert_balanced_recursive(balanced_texts, *text.right); + return; + } + int i = 0; Text_t accumulator = EMPTY_TEXT; - for (; to_insert.length > min_len_for_depth[i + 1]; i++) { + for (; text.length > min_len_for_depth[i + 1]; i++) { if (balanced_texts[i].length) { accumulator = simple_concatenation(balanced_texts[i], accumulator); balanced_texts[i] = EMPTY_TEXT; } } - accumulator = simple_concatenation(accumulator, to_insert); + accumulator = simple_concatenation(accumulator, text); while (accumulator.length >= min_len_for_depth[i]) { if (balanced_texts[i].length) { @@ -299,16 +369,6 @@ static void insert_balanced(Text_t balanced_texts[MAX_TEXT_DEPTH], Text_t to_ins balanced_texts[i] = accumulator; } -static void insert_balanced_recursive(Text_t balanced_texts[MAX_TEXT_DEPTH], Text_t text) -{ - if (text.tag == TEXT_CONCAT && (!IS_BALANCED_TEXT(text) || text.depth >= MAX_TEXT_DEPTH)) { - insert_balanced_recursive(balanced_texts, *text.left); - insert_balanced_recursive(balanced_texts, *text.right); - } else { - insert_balanced(balanced_texts, text); - } -} - static Text_t rebalanced(Text_t a, Text_t b) { Text_t balanced_texts[MAX_TEXT_DEPTH]; @@ -384,15 +444,25 @@ static Text_t concat2_assuming_safe(Text_t a, Text_t b) if (a.tag == TEXT_GRAPHEMES) { memcpy(dest, a.graphemes, sizeof(int32_t[a.length])); dest += a.length; - } else { + } else if (a.tag == TEXT_ASCII) { for (int64_t i = 0; i < a.length; i++) *(dest++) = (int32_t)a.ascii[i]; + } else if (a.tag == TEXT_BLOB) { + for (int64_t i = 0; i < a.length; i++) + *(dest++) = a.blob.map[a.blob.bytes[i]]; + } else { + errx(1, "Unreachable"); } if (b.tag == TEXT_GRAPHEMES) { memcpy(dest, b.graphemes, sizeof(int32_t[b.length])); - } else { + } else if (b.tag == TEXT_ASCII) { for (int64_t i = 0; i < b.length; i++) *(dest++) = (int32_t)b.ascii[i]; + } else if (b.tag == TEXT_BLOB) { + for (int64_t i = 0; i < b.length; i++) + *(dest++) = b.blob.map[b.blob.bytes[i]]; + } else { + errx(1, "Unreachable"); } return ret; } @@ -455,7 +525,7 @@ static Text_t concat2(Text_t a, Text_t b) return concat2_assuming_safe(a, b); } - Text_t glue = text_from_u32(norm_buf, (int64_t)norm_length, false); + Text_t glue = Text$from_codepoints((List_t){.data=norm_buf, .length=(int64_t)norm_length, .stride=sizeof(int32_t)}); if (normalized != norm_buf) free(normalized); @@ -590,8 +660,8 @@ public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int) last -= text.left->length; text = *text.right; } else { - return concat2(Text$slice(*text.left, I(first), I(text.length)), - Text$slice(*text.right, I(1), I(last-text.left->length))); + return concat2_assuming_safe(Text$slice(*text.left, I(first), I(text.length)), + Text$slice(*text.right, I(1), I(last-text.left->length))); } } @@ -610,6 +680,15 @@ public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int) .graphemes=text.graphemes + (first-1), }; } + case TEXT_BLOB: { + Text_t ret = (Text_t){ + .tag=TEXT_BLOB, + .length=last - first + 1, + .blob.map=text.blob.map, + .blob.bytes=text.blob.bytes + (first-1), + }; + return ret; + } default: errx(1, "Invalid tag"); } return EMPTY_TEXT; @@ -648,90 +727,64 @@ public Text_t Text$reversed(Text_t text) ((int32_t*)ret.graphemes)[text.length-1-i] = text.graphemes[i]; return ret; } - case TEXT_CONCAT: { - return concat2(Text$reversed(*text.right), Text$reversed(*text.left)); - } - default: errx(1, "Invalid tag"); - } - return EMPTY_TEXT; -} - -public PUREFUNC Text_t Text$cluster(Text_t text, Int_t index_int) -{ - int64_t index = Int64$from_int(index_int, false); - if (index == 0) fail("Invalid index: 0"); - - if (index < 0) index = text.length + index + 1; - - if (index > text.length || index < 1) - fail("Invalid index: ", index_int, " is beyond the length of the text (length = ", (int64_t)text.length, ")"); - - while (text.tag == TEXT_CONCAT) { - if (index <= text.left->length) - text = *text.left; - else - text = *text.right; - } - - switch (text.tag) { - case TEXT_ASCII: { + case TEXT_BLOB: { struct Text_s ret = { - .tag=TEXT_ASCII, - .length=1, - .ascii=GC_MALLOC_ATOMIC(sizeof(char)), + .tag=TEXT_BLOB, + .length=text.length, + .blob.map=text.blob.map, }; - *(char*)&ret.ascii[0] = text.ascii[index-1]; + ret.blob.bytes = GC_MALLOC_ATOMIC(sizeof(uint8_t[ret.length])); + for (int64_t i = 0; i < text.length; i++) + ((uint8_t*)ret.blob.bytes)[text.length-1-i] = text.graphemes[i]; return ret; } - case TEXT_GRAPHEMES: { - struct Text_s ret = { - .tag=TEXT_GRAPHEMES, - .length=1, - .graphemes=GC_MALLOC_ATOMIC(sizeof(int32_t)), - }; - *(int32_t*)&ret.graphemes[0] = text.graphemes[index-1]; - return ret; + case TEXT_CONCAT: { + return concat2_assuming_safe(Text$reversed(*text.right), Text$reversed(*text.left)); } default: errx(1, "Invalid tag"); } return EMPTY_TEXT; } -Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize) +public PUREFUNC Text_t Text$cluster(Text_t text, Int_t index) { - // Normalization is apparently guaranteed to never exceed 3x in the input length - ucs4_t norm_buf[MIN(256, 3*num_codepoints)]; - if (normalize) { - size_t norm_length = sizeof(norm_buf)/sizeof(norm_buf[0]); - ucs4_t *normalized = u32_normalize(UNINORM_NFC, codepoints, (size_t)num_codepoints, norm_buf, &norm_length); - codepoints = normalized; - num_codepoints = (int64_t)norm_length; - } + return Text$slice(text, index, index); +} - // Intentionally overallocate here: allocate assuming each codepoint is a - // grapheme cluster. If that's not true, we'll have extra space at the end - // of the list, but the length will still be calculated correctly. - int32_t *graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[num_codepoints])); - struct Text_s ret = { - .tag=TEXT_GRAPHEMES, - .length=0, - .graphemes=graphemes, - }; - const ucs4_t *src = codepoints; - while (src < &codepoints[num_codepoints]) { - // TODO: use grapheme breaks instead of u32_grapheme_next()? - const ucs4_t *next = u32_grapheme_next(src, &codepoints[num_codepoints]); - if (next == &src[1]) { - graphemes[ret.length] = (int32_t)*src; - } else { - // Synthetic grapheme - graphemes[ret.length] = get_synthetic_grapheme(src, next-src); +static Text_t Text$from_components(List_t graphemes, Table_t unique_clusters) +{ + struct { + int32_t map[unique_clusters.entries.length]; + uint8_t bytes[graphemes.length]; + } *blob; + // If blob optimization will save at least 200 bytes: + if (unique_clusters.entries.length <= 256 && sizeof(*blob) + 200 < sizeof(int32_t[graphemes.length])) { + Text_t ret = { + .tag=TEXT_BLOB, + .length=graphemes.length, + .depth=0, + }; + blob = GC_MALLOC_ATOMIC(sizeof(*blob)); + for (int64_t i = 0; i < unique_clusters.entries.length; i++) { + struct { int32_t g; uint8_t b; } *entry = unique_clusters.entries.data + i*unique_clusters.entries.stride; + blob->map[entry->b] = entry->g; } - ++ret.length; - src = next; + for (int64_t i = 0; i < graphemes.length; i++) { + int32_t g = *(int32_t*)(graphemes.data + i*graphemes.stride); + uint8_t *byte = Table$get(unique_clusters, &g, Table$info(&Int32$info, &Byte$info)); + assert(byte); + blob->bytes[i] = *byte; + } + ret.blob.map = &blob->map[0]; + ret.blob.bytes = &blob->bytes[0]; + return ret; + } else { + return (Text_t){ + .tag=TEXT_GRAPHEMES, + .length=graphemes.length, + .graphemes=graphemes.data, + }; } - if (normalize && codepoints != norm_buf) free(codepoints); - return ret; } public OptionalText_t Text$from_strn(const char *str, size_t len) @@ -748,18 +801,37 @@ public OptionalText_t Text$from_strn(const char *str, size_t len) .length=ascii_span, .ascii=copy, }; - } else { - if (u8_check((uint8_t*)str, len) != NULL) - return NONE_TEXT; - - ucs4_t buf[128]; - size_t length = sizeof(buf)/sizeof(buf[0]); + } + if (u8_check((uint8_t*)str, len) != NULL) + return NONE_TEXT; - ucs4_t *codepoints = u8_to_u32((uint8_t*)str, (size_t)ascii_span + strlen(str + ascii_span), buf, &length); - Text_t ret = text_from_u32(codepoints, (int64_t)length, true); - if (codepoints != buf) free(codepoints); - return ret; + List_t graphemes = {}; + Table_t unique_clusters = {}; + const uint8_t *pos = (const uint8_t*)str; + const uint8_t *end = (const uint8_t*)&str[len]; + // Iterate over grapheme clusters + for (const uint8_t *next; (next=u8_grapheme_next(pos, end)); pos = next) { + uint32_t buf[256]; + size_t u32_len = sizeof(buf)/sizeof(buf[0]); + uint32_t *u32s = u8_to_u32(pos, (size_t)(next-pos), buf, &u32_len); + + uint32_t buf2[256]; + size_t u32_normlen = sizeof(buf2)/sizeof(buf2[0]); + uint32_t *u32s_normalized = u32_normalize(UNINORM_NFC, u32s, u32_len, buf2, &u32_normlen); + + int32_t g = get_synthetic_grapheme(u32s_normalized, (int64_t)u32_normlen); + List$insert(&graphemes, &g, I(0), sizeof(int32_t)); + Table$get_or_setdefault(&unique_clusters, int32_t, uint8_t, g, (uint8_t)unique_clusters.entries.length, Table$info(&Int32$info, &Byte$info)); + + if (u32s != buf) free(u32s); + if (u32s_normalized != buf2) free(u32s_normalized); + + if (unique_clusters.entries.length >= 256) { + return concat2_assuming_safe(Text$from_components(graphemes, unique_clusters), Text$from_strn(next, (size_t)(end-next))); + } } + + return Text$from_components(graphemes, unique_clusters); } public OptionalText_t Text$from_str(const char *str) @@ -788,6 +860,7 @@ static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i uint8_t u8_buf[64]; size_t u8_len = sizeof(u8_buf); uint8_t *u8 = u32_to_u8((ucs4_t*)&graphemes[g], 1, u8_buf, &u8_len); + if (u8 == NULL) fail("Invalid grapheme encountered: ", graphemes[g]); if (*i + (int64_t)u8_len > (int64_t)*capacity) { *capacity = *i + (int64_t)u8_len + 1; @@ -811,6 +884,37 @@ static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i } break; } + case TEXT_BLOB: { + for (int64_t g = 0; g < text.length; g++) { + int32_t grapheme = text.blob.map[text.blob.bytes[g]]; + if (grapheme >= 0) { + uint8_t u8_buf[64]; + size_t u8_len = sizeof(u8_buf); + uint8_t *u8 = u32_to_u8((ucs4_t*)&grapheme, 1, u8_buf, &u8_len); + if (u8 == NULL) fail("Invalid grapheme encountered: ", grapheme); + + if (*i + (int64_t)u8_len > (int64_t)*capacity) { + *capacity = *i + (int64_t)u8_len + 1; + *buf = GC_REALLOC(*buf, (size_t)*capacity); + } + + memcpy(*buf + *i, u8, u8_len); + *i += (int64_t)u8_len; + if (u8 != u8_buf) free(u8); + } else { + const uint8_t *u8 = GRAPHEME_UTF8(grapheme); + size_t u8_len = u8_strlen(u8); + if (*i + (int64_t)u8_len > (int64_t)*capacity) { + *capacity = *i + (int64_t)u8_len + 1; + *buf = GC_REALLOC(*buf, (size_t)*capacity); + } + + memcpy(*buf + *i, u8, u8_len); + *i += (int64_t)u8_len; + } + } + break; + } case TEXT_CONCAT: { u8_buf_append(*text.left, buf, capacity, i); u8_buf_append(*text.right, buf, capacity, i); @@ -867,6 +971,15 @@ PUREFUNC public uint64_t Text$hash(const void *obj, const TypeInfo_t *info) int32_t last = text.length & 0x1 ? graphemes[text.length-1] : 0; // Odd number of graphemes return siphashfinish_last_part(&sh, (uint64_t)last); } + case TEXT_BLOB: { + for (int64_t i = 0; i + 1 < text.length; i += 2) { + tmp.chunks[0] = text.blob.map[text.blob.bytes[i]]; + tmp.chunks[1] = text.blob.map[text.blob.bytes[i+1]]; + siphashadd64bits(&sh, tmp.whole); + } + int32_t last = text.length & 0x1 ? text.blob.map[text.blob.bytes[text.length-1]] : 0; // Odd number of graphemes + return siphashfinish_last_part(&sh, (uint64_t)last); + } case TEXT_CONCAT: { TextIter_t state = NEW_TEXT_ITER_STATE(text); for (int64_t i = 0; i + 1 < text.length; i += 2) { @@ -928,6 +1041,7 @@ public int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index) switch (text.tag) { case TEXT_ASCII: return (int32_t)text.ascii[index - offset]; case TEXT_GRAPHEMES: return text.graphemes[index - offset]; + case TEXT_BLOB: return text.blob.map[text.blob.bytes[index - offset]]; default: errx(1, "Invalid text"); } return 0; @@ -1286,11 +1400,9 @@ public Text_t Text$upper(Text_t text, Text_t language) if (text.length == 0) return text; List_t codepoints = Text$utf32_codepoints(text); const char *uc_language = Text$as_c_string(language); - ucs4_t buf[128]; - size_t out_len = sizeof(buf)/sizeof(buf[0]); - ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len); - Text_t ret = text_from_u32(upper, (int64_t)out_len, false); - if (upper != buf) free(upper); + size_t out_len = 0; + ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, NULL, &out_len); + Text_t ret = Text$from_codepoints((List_t){.data=upper, .length=(int64_t)out_len, .stride=sizeof(int32_t)}); return ret; } @@ -1299,11 +1411,9 @@ public Text_t Text$lower(Text_t text, Text_t language) if (text.length == 0) return text; List_t codepoints = Text$utf32_codepoints(text); const char *uc_language = Text$as_c_string(language); - ucs4_t buf[128]; - size_t out_len = sizeof(buf)/sizeof(buf[0]); - ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len); - Text_t ret = text_from_u32(lower, (int64_t)out_len, false); - if (lower != buf) free(lower); + size_t out_len = 0; + ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, NULL, &out_len); + Text_t ret = Text$from_codepoints((List_t){.data=lower, .length=(int64_t)out_len, .stride=sizeof(int32_t)}); return ret; } @@ -1312,11 +1422,9 @@ public Text_t Text$title(Text_t text, Text_t language) if (text.length == 0) return text; List_t codepoints = Text$utf32_codepoints(text); const char *uc_language = Text$as_c_string(language); - ucs4_t buf[128]; - size_t out_len = sizeof(buf)/sizeof(buf[0]); - ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len); - Text_t ret = text_from_u32(title, (int64_t)out_len, false); - if (title != buf) free(title); + size_t out_len = 0; + ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, NULL, &out_len); + Text_t ret = Text$from_codepoints((List_t){.data=title, .length=(int64_t)out_len, .stride=sizeof(int32_t)}); return ret; } @@ -1332,12 +1440,20 @@ public Text_t Text$quoted(Text_t text, bool colorize, Text_t quotation_mark) ret = concat2_assuming_safe(ret, quotation_mark); int32_t quote_char = Text$get_grapheme(quotation_mark, 0); -#define add_escaped(str) ({ if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m")); \ +#define flush_unquoted() ({ \ + if (unquoted_span > 0) { \ + ret = concat2_assuming_safe(ret, Text$slice(text, I(i-unquoted_span+1), I(i))); \ + unquoted_span = 0; \ + } }) +#define add_escaped(str) ({ \ + flush_unquoted(); \ + if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m")); \ ret = concat2_assuming_safe(ret, Text("\\" str)); \ if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[0;35m")); }) TextIter_t state = NEW_TEXT_ITER_STATE(text); - // TODO: optimize for spans of non-escaped text - for (int64_t i = 0; i < text.length; i++) { + int64_t unquoted_span = 0; + int64_t i = 0; + for (i = 0; i < text.length; i++) { int32_t g = Text$get_grapheme_fast(&state, i); switch (g) { case '\a': add_escaped("a"); break; @@ -1358,6 +1474,7 @@ public Text_t Text$quoted(Text_t text, bool colorize, Text_t quotation_mark) } case '\x00' ... '\x06': case '\x0E' ... '\x1A': case '\x1C' ... '\x1F': case '\x7F' ... '\x7F': { + flush_unquoted(); if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m")); ret = concat2_assuming_safe(ret, Text("\\x")); char tmp[3] = { @@ -1372,15 +1489,21 @@ public Text_t Text$quoted(Text_t text, bool colorize, Text_t quotation_mark) } default: { if (g == quote_char) { + flush_unquoted(); + if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m")); + ret = concat2_assuming_safe(ret, Text("\\")); ret = concat2_assuming_safe(ret, quotation_mark); + if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[0;35m")); } else { - ret = concat2_assuming_safe(ret, Text$slice(text, I(i+1), I(i+1))); + unquoted_span += 1; } break; } } } + flush_unquoted(); #undef add_escaped +#undef flush_unquoted ret = concat2_assuming_safe(ret, quotation_mark); if (colorize) @@ -1513,10 +1636,38 @@ public List_t Text$codepoint_names(Text_t text) public Text_t Text$from_codepoints(List_t codepoints) { - if (codepoints.stride != sizeof(int32_t)) - List$compact(&codepoints, sizeof(int32_t)); - - return text_from_u32(codepoints.data, codepoints.length, true); + if (codepoints.stride != sizeof(uint32_t)) + List$compact(&codepoints, sizeof(uint32_t)); + + List_t graphemes = {}; + Table_t unique_clusters = {}; + const uint32_t *pos = (const uint32_t*)codepoints.data; + const uint32_t *end = (const uint32_t*)&pos[codepoints.length]; + // Iterate over grapheme clusters + for (const uint32_t *next; (next=u32_grapheme_next(pos, end)); pos = next) { + // Buffer for normalized cluster: + uint32_t buf[256]; + size_t u32_normlen = sizeof(buf)/sizeof(buf[0]); + uint32_t *u32s_normalized = u32_normalize(UNINORM_NFC, pos, (size_t)(next-pos), buf, &u32_normlen); + + int32_t g = get_synthetic_grapheme(u32s_normalized, (int64_t)u32_normlen); + List$insert(&graphemes, &g, I(0), sizeof(int32_t)); + Table$get_or_setdefault( + &unique_clusters, int32_t, uint8_t, g, (uint8_t)unique_clusters.entries.length, + Table$info(&Int32$info, &Byte$info)); + + if (u32s_normalized != buf) free(u32s_normalized); + + if (unique_clusters.entries.length == 256) { + List_t remaining_codepoints = { + .length=(int64_t)(end-next), + .data=(void*)next, + .stride=sizeof(int32_t), + }; + return concat2_assuming_safe(Text$from_components(graphemes, unique_clusters), Text$from_codepoints(remaining_codepoints)); + } + } + return Text$from_components(graphemes, unique_clusters); } public OptionalText_t Text$from_codepoint_names(List_t codepoint_names) @@ -1605,6 +1756,38 @@ PUREFUNC public bool Text$is_none(const void *t, const TypeInfo_t *info) return ((Text_t*)t)->length < 0; } +public Int_t Text$memory_size(Text_t text) +{ + switch (text.tag) { + case TEXT_ASCII: + return Int$from_int64((int64_t)sizeof(Text_t) + (int64_t)sizeof(char[text.length])); + case TEXT_GRAPHEMES: + return Int$from_int64((int64_t)sizeof(Text_t) + (int64_t)sizeof(int32_t[text.length])); + case TEXT_BLOB: + return Int$from_int64((int64_t)sizeof(Text_t) + (int64_t)((void*)text.blob.bytes - (void*)text.blob.map) + (int64_t)sizeof(uint8_t[text.length])); + case TEXT_CONCAT: + return Int$plus( + Int$from_int64((int64_t)sizeof(Text_t)), + Int$plus(Text$memory_size(*text.left), Text$memory_size(*text.right))); + default: errx(1, "Invalid text tag: ", text.tag); + } +} + +public Text_t Text$layout(Text_t text) +{ + switch (text.tag) { + case TEXT_ASCII: + return Texts(Text("ASCII("), Int64$as_text((int64_t[1]){text.length}, false, NULL), Text(")")); + case TEXT_GRAPHEMES: + return Texts(Text("Graphemes("), Int64$as_text((int64_t[1]){text.length}, false, NULL), Text(")")); + case TEXT_BLOB: + return Texts(Text("Blob("), Int64$as_text((int64_t[1]){text.length}, false, NULL), Text(")")); + case TEXT_CONCAT: + return Texts(Text("Concat("), Text$layout(*text.left), Text(", "), Text$layout(*text.right), Text(")")); + default: errx(1, "Invalid text tag: ", text.tag); + } +} + public void Text$serialize(const void *obj, FILE *out, Table_t *pointers, const TypeInfo_t *info) { (void)info; diff --git a/src/stdlib/text.h b/src/stdlib/text.h index 47ede6f1..fc336612 100644 --- a/src/stdlib/text.h +++ b/src/stdlib/text.h @@ -78,6 +78,8 @@ Text_t Text$right_pad(Text_t text, Int_t width, Text_t padding, Text_t language) Text_t Text$middle_pad(Text_t text, Int_t width, Text_t padding, Text_t language); int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index); uint32_t Text$get_main_grapheme_fast(TextIter_t *state, int64_t index); +Int_t Text$memory_size(Text_t text); +Text_t Text$layout(Text_t text); void Text$serialize(const void *obj, FILE *out, Table_t *, const TypeInfo_t *); void Text$deserialize(FILE *in, void *out, List_t *, const TypeInfo_t *); -- cgit v1.2.3