diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2024-03-04 01:34:12 -0500 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2024-03-04 01:34:12 -0500 |
| commit | 0b7ca098ae1dc5e31485bc505a441d3796f0702b (patch) | |
| tree | b0662c559c96bd704f7c33c9ccc3f33d38f66d8d | |
| parent | 1f6aa4cac7af0e00c826d5348b06aed70cbcb3e3 (diff) | |
Fix some bugs relating to NUL termination of strings
| -rw-r--r-- | builtins/text.c | 73 | ||||
| -rw-r--r-- | builtins/text.h | 4 | ||||
| -rw-r--r-- | compile.c | 7 | ||||
| -rw-r--r-- | environment.c | 4 | ||||
| -rw-r--r-- | test/text.tm | 31 |
5 files changed, 104 insertions, 15 deletions
diff --git a/builtins/text.c b/builtins/text.c index d4fbb32e..445d5ab0 100644 --- a/builtins/text.c +++ b/builtins/text.c @@ -10,6 +10,7 @@ #include <sys/param.h> #include <unicase.h> #include <unigbrk.h> +#include <uniname.h> #include <uninorm.h> #include <unistr.h> @@ -109,8 +110,8 @@ public uint32_t Text__hash(CORD *cord) const char *str = CORD_to_const_char_star(*cord); size_t len = strlen(str); uint8_t buf[128] = {0}; - size_t norm_len = sizeof(buf)-1; - uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len, buf, &norm_len); + size_t norm_len = sizeof(buf); + uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len+1, buf, &norm_len); if (!normalized) errx(1, "Unicode normalization error!"); uint32_t hash; @@ -266,14 +267,14 @@ public array_t Text__clusters(CORD text) array_t clusters = {.atomic=1}; const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); uint8_t buf[128] = {0}; - size_t norm_len = sizeof(buf)-1; - uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr), buf, &norm_len); + size_t norm_len = sizeof(buf); + uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, buf, &norm_len); if (!normalized) errx(1, "Unicode normalization error!"); const uint8_t *end = normalized + strlen((char*)normalized); for (const uint8_t *pos = normalized; pos != end; ) { const uint8_t *next = u8_grapheme_next(pos, end); - size_t len = next ? (size_t)(next - pos) : strlen((char*)pos); + size_t len = (size_t)(next - pos); char cluster_buf[len+1]; strlcpy(cluster_buf, (char*)pos, len+1); CORD cluster = CORD_from_char_star(cluster_buf); @@ -289,13 +290,13 @@ public array_t Text__codepoints(CORD text) { const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); uint8_t norm_buf[128] = {0}; - size_t norm_len = sizeof(norm_buf)-1; - uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr), norm_buf, &norm_len); + size_t norm_len = sizeof(norm_buf); + uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, norm_buf, &norm_len); if (!normalized) errx(1, "Unicode normalization error!"); uint32_t codepoint_buf[128] = {0}; - size_t codepoint_len = sizeof(codepoint_buf)-1; - uint32_t *codepoints = u8_to_u32(normalized, norm_len, codepoint_buf, &codepoint_len); + size_t codepoint_len = sizeof(codepoint_buf); + uint32_t *codepoints = u8_to_u32(normalized, norm_len-1, codepoint_buf, &codepoint_len); array_t ret = { .length=codepoint_len, .data=memcpy(GC_MALLOC_ATOMIC(sizeof(int32_t)*codepoint_len), codepoints, sizeof(int32_t)*codepoint_len), @@ -312,10 +313,11 @@ public array_t Text__bytes(CORD text) { const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); uint8_t norm_buf[128] = {0}; - size_t norm_len = sizeof(norm_buf)-1; - uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr), norm_buf, &norm_len); + size_t norm_len = sizeof(norm_buf); + uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, norm_buf, &norm_len); if (!normalized) errx(1, "Unicode normalization error!"); + --norm_len; // NUL byte array_t ret = { .length=norm_len, .data=memcpy(GC_MALLOC_ATOMIC(sizeof(uint8_t)*norm_len), normalized, sizeof(uint8_t)*norm_len), @@ -327,6 +329,55 @@ public array_t Text__bytes(CORD text) return ret; } +public int64_t Text__num_clusters(CORD text) +{ + const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); + int64_t num_clusters = 0; + const uint8_t *end = ustr + u8_strlen(ustr); + for (const uint8_t *pos = ustr; pos != end; ) { + const uint8_t *next = u8_grapheme_next(pos, end); + ++num_clusters; + pos = next; + } + return num_clusters; +} + +public int64_t Text__num_codepoints(CORD text) +{ + const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); + uint8_t buf[128] = {0}; + size_t norm_len = sizeof(buf); + uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, buf, &norm_len); + if (!normalized) errx(1, "Unicode normalization error!"); + int64_t num_codepoints = u8_mbsnlen(normalized, norm_len-1); + if (normalized != buf) free(normalized); + return num_codepoints; +} + +public int64_t Text__num_bytes(CORD text) +{ + const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); + uint8_t norm_buf[128] = {0}; + size_t norm_len = sizeof(norm_buf); + uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, norm_buf, &norm_len); + --norm_len; // NUL byte + if (!normalized) errx(1, "Unicode normalization error!"); + if (normalized != norm_buf) free(normalized); + return norm_len; +} + +public array_t Text__character_names(CORD text) +{ + array_t codepoints = Text__codepoints(text); + array_t ret = {.length=codepoints.length, .stride=sizeof(CORD), .data=GC_MALLOC(sizeof(CORD)*codepoints.length)}; + for (int64_t i = 0; i < codepoints.length; i++) { + char buf[UNINAME_MAX]; + unicode_character_name(*(ucs4_t*)(codepoints.data + codepoints.stride*i), buf); + *(CORD*)(ret.data + ret.stride*i) = CORD_from_char_star(buf); + } + return ret; +} + public const TypeInfo Text = { .size=sizeof(CORD), .align=__alignof__(CORD), diff --git a/builtins/text.h b/builtins/text.h index 44e2c270..f2fc47cd 100644 --- a/builtins/text.h +++ b/builtins/text.h @@ -32,6 +32,10 @@ CORD Text__join(CORD glue, array_t pieces); array_t Text__clusters(CORD text); array_t Text__codepoints(CORD text); array_t Text__bytes(CORD text); +int64_t Text__num_clusters(CORD text); +int64_t Text__num_codepoints(CORD text); +int64_t Text__num_bytes(CORD text); +array_t Text__character_names(CORD text); extern const TypeInfo Text; @@ -173,7 +173,7 @@ CORD compile(env_t *env, ast_t *ast) switch (value_type(t)->tag) { case TextType: { CORD str = compile_to_pointer_depth(env, expr, 0, false); - return CORD_all("CORD_len(", str, ")"); + return CORD_all("Text__num_clusters(", str, ")"); } case ArrayType: { if (t->tag == PointerType) { @@ -860,8 +860,9 @@ CORD compile(env_t *env, ast_t *ast) if (test->output) { const uint8_t *raw = (const uint8_t*)CORD_to_const_char_star(test->output); uint8_t buf[128] = {0}; - size_t norm_len = sizeof(buf)-1; - uint8_t *norm = u8_normalize(UNINORM_NFD, (uint8_t*)raw, strlen((char*)raw), buf, &norm_len); + size_t norm_len = sizeof(buf); + uint8_t *norm = u8_normalize(UNINORM_NFD, (uint8_t*)raw, strlen((char*)raw)+1, buf, &norm_len); + assert(norm[norm_len-1] == 0); output = CORD_from_char_star((char*)norm); if (norm && norm != buf) free(norm); } diff --git a/environment.c b/environment.c index 78c80c4e..483f123f 100644 --- a/environment.c +++ b/environment.c @@ -162,6 +162,10 @@ env_t *new_compilation_unit(void) {"clusters", "Text__clusters", "func(text:Text)->[Text]"}, {"codepoints", "Text__codepoints", "func(text:Text)->[Int32]"}, {"bytes", "Text__bytes", "func(text:Text)->[Int8]"}, + {"num_clusters", "Text__num_clusters", "func(text:Text)->Int"}, + {"num_codepoints", "Text__num_codepoints", "func(text:Text)->Int"}, + {"num_bytes", "Text__num_bytes", "func(text:Text)->Int"}, + {"character_names", "Text__character_names", "func(text:Text)->[Text]"}, )}, }; diff --git a/test/text.tm b/test/text.tm index 690bb57a..c842a36d 100644 --- a/test/text.tm +++ b/test/text.tm @@ -15,10 +15,39 @@ >> \UE9 == \U65\U301 = yes ->> amelie := "Amélie" +>> amelie := "Am{\UE9}lie" >> amelie:clusters() = ["A", "m", "é", "l", "i", "e"] : [Text] >> amelie:codepoints() = [65_i32, 109_i32, 101_i32, 769_i32, 108_i32, 105_i32, 101_i32] : [Int32] >> amelie:bytes() = [65_i8, 109_i8, 101_i8, -52_i8, -127_i8, 108_i8, 105_i8, 101_i8] : [Int8] +>> #amelie += 6 +>> amelie:num_clusters() += 6 +>> amelie:num_codepoints() += 7 +>> amelie:num_bytes() += 8 + +>> amelie2 := "Am{\U65\U301}lie" +>> amelie2:clusters() += ["A", "m", "é", "l", "i", "e"] : [Text] +>> amelie2:codepoints() += [65_i32, 109_i32, 101_i32, 769_i32, 108_i32, 105_i32, 101_i32] : [Int32] +>> amelie2:bytes() += [65_i8, 109_i8, 101_i8, -52_i8, -127_i8, 108_i8, 105_i8, 101_i8] : [Int8] +>> #amelie += 6 +>> amelie2:num_clusters() += 6 +>> amelie2:num_codepoints() += 7 +>> amelie2:num_bytes() += 8 + +>> amelie:character_names() += ["LATIN CAPITAL LETTER A", "LATIN SMALL LETTER M", "LATIN SMALL LETTER E", "COMBINING ACUTE ACCENT", "LATIN SMALL LETTER L", "LATIN SMALL LETTER I", "LATIN SMALL LETTER E"] +>> amelie2:character_names() += ["LATIN CAPITAL LETTER A", "LATIN SMALL LETTER M", "LATIN SMALL LETTER E", "COMBINING ACUTE ACCENT", "LATIN SMALL LETTER L", "LATIN SMALL LETTER I", "LATIN SMALL LETTER E"] |
