diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2024-03-04 01:34:12 -0500 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2024-03-04 01:34:12 -0500 |
| commit | 0b7ca098ae1dc5e31485bc505a441d3796f0702b (patch) | |
| tree | b0662c559c96bd704f7c33c9ccc3f33d38f66d8d /builtins | |
| parent | 1f6aa4cac7af0e00c826d5348b06aed70cbcb3e3 (diff) | |
Fix some bugs relating to NUL termination of strings
Diffstat (limited to 'builtins')
| -rw-r--r-- | builtins/text.c | 73 |
| -rw-r--r-- | builtins/text.h | 4 |
2 files changed, 66 insertions, 11 deletions
diff --git a/builtins/text.c b/builtins/text.c index d4fbb32e..445d5ab0 100644 --- a/builtins/text.c +++ b/builtins/text.c @@ -10,6 +10,7 @@ #include <sys/param.h> #include <unicase.h> #include <unigbrk.h> +#include <uniname.h> #include <uninorm.h> #include <unistr.h> @@ -109,8 +110,8 @@ public uint32_t Text__hash(CORD *cord) const char *str = CORD_to_const_char_star(*cord); size_t len = strlen(str); uint8_t buf[128] = {0}; - size_t norm_len = sizeof(buf)-1; - uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len, buf, &norm_len); + size_t norm_len = sizeof(buf); + uint8_t *normalized = u8_normalize(UNINORM_NFD, (uint8_t*)str, len+1, buf, &norm_len); if (!normalized) errx(1, "Unicode normalization error!"); uint32_t hash; @@ -266,14 +267,14 @@ public array_t Text__clusters(CORD text) array_t clusters = {.atomic=1}; const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); uint8_t buf[128] = {0}; - size_t norm_len = sizeof(buf)-1; - uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr), buf, &norm_len); + size_t norm_len = sizeof(buf); + uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, buf, &norm_len); if (!normalized) errx(1, "Unicode normalization error!"); const uint8_t *end = normalized + strlen((char*)normalized); for (const uint8_t *pos = normalized; pos != end; ) { const uint8_t *next = u8_grapheme_next(pos, end); - size_t len = next ? 
(size_t)(next - pos) : strlen((char*)pos); + size_t len = (size_t)(next - pos); char cluster_buf[len+1]; strlcpy(cluster_buf, (char*)pos, len+1); CORD cluster = CORD_from_char_star(cluster_buf); @@ -289,13 +290,13 @@ public array_t Text__codepoints(CORD text) { const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); uint8_t norm_buf[128] = {0}; - size_t norm_len = sizeof(norm_buf)-1; - uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr), norm_buf, &norm_len); + size_t norm_len = sizeof(norm_buf); + uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, norm_buf, &norm_len); if (!normalized) errx(1, "Unicode normalization error!"); uint32_t codepoint_buf[128] = {0}; - size_t codepoint_len = sizeof(codepoint_buf)-1; - uint32_t *codepoints = u8_to_u32(normalized, norm_len, codepoint_buf, &codepoint_len); + size_t codepoint_len = sizeof(codepoint_buf); + uint32_t *codepoints = u8_to_u32(normalized, norm_len-1, codepoint_buf, &codepoint_len); array_t ret = { .length=codepoint_len, .data=memcpy(GC_MALLOC_ATOMIC(sizeof(int32_t)*codepoint_len), codepoints, sizeof(int32_t)*codepoint_len), @@ -312,10 +313,11 @@ public array_t Text__bytes(CORD text) { const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); uint8_t norm_buf[128] = {0}; - size_t norm_len = sizeof(norm_buf)-1; - uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr), norm_buf, &norm_len); + size_t norm_len = sizeof(norm_buf); + uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, norm_buf, &norm_len); if (!normalized) errx(1, "Unicode normalization error!"); + --norm_len; // NUL byte array_t ret = { .length=norm_len, .data=memcpy(GC_MALLOC_ATOMIC(sizeof(uint8_t)*norm_len), normalized, sizeof(uint8_t)*norm_len), @@ -327,6 +329,55 @@ public array_t Text__bytes(CORD text) return ret; } +public int64_t Text__num_clusters(CORD text) +{ + const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); 
+ int64_t num_clusters = 0; + const uint8_t *end = ustr + u8_strlen(ustr); + for (const uint8_t *pos = ustr; pos != end; ) { + const uint8_t *next = u8_grapheme_next(pos, end); + ++num_clusters; + pos = next; + } + return num_clusters; +} + +public int64_t Text__num_codepoints(CORD text) +{ + const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); + uint8_t buf[128] = {0}; + size_t norm_len = sizeof(buf); + uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, buf, &norm_len); + if (!normalized) errx(1, "Unicode normalization error!"); + int64_t num_codepoints = u8_mbsnlen(normalized, norm_len-1); + if (normalized != buf) free(normalized); + return num_codepoints; +} + +public int64_t Text__num_bytes(CORD text) +{ + const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); + uint8_t norm_buf[128] = {0}; + size_t norm_len = sizeof(norm_buf); + uint8_t *normalized = u8_normalize(UNINORM_NFD, ustr, strlen((char*)ustr)+1, norm_buf, &norm_len); + --norm_len; // NUL byte + if (!normalized) errx(1, "Unicode normalization error!"); + if (normalized != norm_buf) free(normalized); + return norm_len; +} + +public array_t Text__character_names(CORD text) +{ + array_t codepoints = Text__codepoints(text); + array_t ret = {.length=codepoints.length, .stride=sizeof(CORD), .data=GC_MALLOC(sizeof(CORD)*codepoints.length)}; + for (int64_t i = 0; i < codepoints.length; i++) { + char buf[UNINAME_MAX]; + unicode_character_name(*(ucs4_t*)(codepoints.data + codepoints.stride*i), buf); + *(CORD*)(ret.data + ret.stride*i) = CORD_from_char_star(buf); + } + return ret; +} + public const TypeInfo Text = { .size=sizeof(CORD), .align=__alignof__(CORD), diff --git a/builtins/text.h b/builtins/text.h index 44e2c270..f2fc47cd 100644 --- a/builtins/text.h +++ b/builtins/text.h @@ -32,6 +32,10 @@ CORD Text__join(CORD glue, array_t pieces); array_t Text__clusters(CORD text); array_t Text__codepoints(CORD text); array_t Text__bytes(CORD text); +int64_t 
Text__num_clusters(CORD text); +int64_t Text__num_codepoints(CORD text); +int64_t Text__num_bytes(CORD text); +array_t Text__character_names(CORD text); extern const TypeInfo Text; |
