diff options
| -rw-r--r-- | builtins/text.c | 156 | ||||
| -rw-r--r-- | builtins/text.h | 5 | ||||
| -rw-r--r-- | compile.c | 2 | ||||
| -rw-r--r-- | environment.c | 13 | ||||
| -rw-r--r-- | test/text.tm | 54 | ||||
| -rw-r--r-- | types.c | 1 |
6 files changed, 121 insertions, 110 deletions
diff --git a/builtins/text.c b/builtins/text.c index cff3a7a8..63df117f 100644 --- a/builtins/text.c +++ b/builtins/text.c @@ -30,8 +30,10 @@ #include "text.h" #include "types.h" +#include "siphash.c" + static struct { - size_t num_codepoints; + int64_t num_codepoints; const uint32_t *codepoints; } synthetic_graphemes[1024] = {}; @@ -45,7 +47,7 @@ typedef struct { static int32_t _next_grapheme(Text_t text, iteration_state_t *state, int64_t index); -int32_t find_synthetic_grapheme(const uint32_t *codepoints, size_t len) +int32_t find_synthetic_grapheme(const uint32_t *codepoints, int64_t len) { int32_t lo = 0, hi = num_synthetic_graphemes; while (lo <= hi) { @@ -64,7 +66,7 @@ int32_t find_synthetic_grapheme(const uint32_t *codepoints, size_t len) return hi; } -int32_t get_synthetic_grapheme(const uint32_t *codepoints, size_t len) +int32_t get_synthetic_grapheme(const uint32_t *codepoints, int64_t len) { int32_t index = find_synthetic_grapheme(codepoints, len); if (index < num_synthetic_graphemes @@ -85,11 +87,11 @@ int32_t get_synthetic_grapheme(const uint32_t *codepoints, size_t len) } } -static inline size_t num_subtexts(Text_t t) +static inline int64_t num_subtexts(Text_t t) { if (t.tag != TEXT_SUBTEXT) return 1; - size_t len = t.length; - size_t n = 0; + int64_t len = t.length; + int64_t n = 0; while (len > 0) { len -= t.subtexts[n].length; ++n; @@ -110,7 +112,7 @@ int text_visualize(FILE *stream, Text_t t) } case TEXT_SUBTEXT: { int printed = fprintf(stream, "<text length=%ld>", t.length); - size_t to_print = t.length; + int64_t to_print = t.length; for (int i = 0; to_print > 0; ++i) { printed += fprintf(stream, "\n "); printed += text_visualize(stream, t.subtexts[i]); @@ -148,7 +150,7 @@ public int Text$print(FILE *stream, Text_t t) case TEXT_SUBTEXT: { int written = 0; int i = 0; - for (size_t to_print = t.length; to_print > 0; to_print -= t.subtexts[i].length, ++i) + for (int64_t to_print = t.length; to_print > 0; to_print -= t.subtexts[i].length, ++i) written += Text$print(stream, t.subtexts[i]); return written; } @@ -162,8 +164,8 @@ static Text_t concat2(Text_t a, Text_t b) if (b.length == 0) return a; if (a.tag == TEXT_SUBTEXT && b.tag == TEXT_SUBTEXT) { - size_t na = num_subtexts(a); - size_t nb = num_subtexts(b); + int64_t na = num_subtexts(a); + int64_t nb = num_subtexts(b); Text_t ret = { .length=a.length + b.length, .tag=TEXT_SUBTEXT, @@ -173,7 +175,7 @@ static Text_t concat2(Text_t a, Text_t b) memcpy(&ret.subtexts[na], b.subtexts, sizeof(Text_t[nb])); return ret; } else if (a.tag == TEXT_SUBTEXT) { - size_t n = num_subtexts(a); + int64_t n = num_subtexts(a); Text_t ret = { .length=a.length + b.length, .tag=TEXT_SUBTEXT, @@ -183,7 +185,7 @@ static Text_t concat2(Text_t a, Text_t b) ret.subtexts[n] = b; return ret; } else if (b.tag == TEXT_SUBTEXT) { - size_t n = num_subtexts(b); + int64_t n = num_subtexts(b); Text_t ret = { .length=a.length + b.length, .tag=TEXT_SUBTEXT, @@ -323,7 +325,7 @@ public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int) } } -Text_t text_from_u32(uint32_t *codepoints, size_t num_codepoints, bool normalize) +Text_t text_from_u32(uint32_t *codepoints, int64_t num_codepoints, bool normalize) { uint32_t norm_buf[128]; if (normalize) { @@ -370,7 +372,7 @@ Text_t text_from_u32(uint32_t *codepoints, size_t num_codepoints, bool normalize public Text_t Text$from_str(const char *str) { - size_t ascii_span = 0; + int64_t ascii_span = 0; while (str[ascii_span] && isascii(str[ascii_span])) ascii_span++; @@ -378,7 +380,7 @@ public Text_t Text$from_str(const char *str) Text_t ret = {.length=ascii_span}; if (ascii_span <= 8) { ret.tag = TEXT_SHORT_ASCII; - for (size_t i = 0; i < ascii_span; i++) + for (int64_t i = 0; i < ascii_span; i++) ret.short_ascii[i] = str[i]; } else { ret.tag = TEXT_ASCII; @@ -395,7 +397,7 @@ public Text_t Text$from_str(const char *str) } } -static void u8_buf_append(Text_t text, char **buf, size_t *capacity, int64_t *i) +static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i) { switch (text.tag) { case TEXT_ASCII: case TEXT_SHORT_ASCII: { @@ -413,7 +415,7 @@ static void u8_buf_append(Text_t text, char **buf, size_t *capacity, int64_t *i) const int32_t *graphemes = text.tag == TEXT_GRAPHEMES ? text.graphemes : text.short_graphemes; for (int64_t g = 0; g + 1 < text.length; g++) { const uint32_t *codepoints = graphemes[g] < 0 ? synthetic_graphemes[-graphemes[g]-1].codepoints : (uint32_t*)&graphemes[g]; - size_t num_codepoints = graphemes[g] < 0 ? synthetic_graphemes[-graphemes[g]-1].num_codepoints : 1; + int64_t num_codepoints = graphemes[g] < 0 ? synthetic_graphemes[-graphemes[g]-1].num_codepoints : 1; uint8_t u8_buf[64]; size_t u8_len = sizeof(u8_buf); uint8_t *u8 = u32_to_u8(codepoints, num_codepoints, u8_buf, &u8_len); @@ -442,7 +444,7 @@ static void u8_buf_append(Text_t text, char **buf, size_t *capacity, int64_t *i) public const char *Text$as_c_string(Text_t text) { - size_t capacity = text.length + 1; + int64_t capacity = text.length + 1; char *buf = GC_MALLOC_ATOMIC(capacity); int64_t i = 0; u8_buf_append(text, &buf, &capacity, &i); @@ -455,45 +457,6 @@ public const char *Text$as_c_string(Text_t text) return buf; } -uint32_t *text_to_u32(Text_t text, size_t *length) -{ - // Precalculate size: - size_t len = 0; - if (text.tag == TEXT_ASCII) { - len = text.length; - } else { - iteration_state_t state = {0, 0}; - for (int64_t i = 0; i < text.length; i++) { - int32_t grapheme = _next_grapheme(text, &state, i); - if (grapheme < 0) - len += synthetic_graphemes[-grapheme-1].num_codepoints; - else - len += 1; - } - } - assert(length); - *length = len; - - // Copy over codepoints one grapheme cluster at a time: - uint32_t *ret = GC_MALLOC_ATOMIC(sizeof(uint32_t[len])); - uint32_t *dest = ret; - iteration_state_t state = {0, 0}; - for (int64_t i = 0; i < text.length; i++) { - int32_t grapheme = _next_grapheme(text, &state, i); - if (grapheme < 0) { - const uint32_t *codepoints = synthetic_graphemes[-grapheme-1].codepoints; - size_t num_codepoints = synthetic_graphemes[-grapheme-1].num_codepoints; - for (size_t j = 0; j < num_codepoints; j++) - *(dest++) = codepoints[j]; - } else { - *(dest++) = (uint32_t)grapheme; - } - } - return ret; -} - -#include "siphash.c" - public uint64_t Text$hash(Text_t *text) { if (text->hash != 0) return text->hash; @@ -696,10 +659,10 @@ public bool Text$equal_ignoring_case(Text_t a, Text_t b) int32_t bi = _next_grapheme(b, &b_state, i); if (ai != bi) { const uint32_t *a_codepoints = ai >= 0 ? (uint32_t*)&ai : synthetic_graphemes[-ai-1].codepoints; - size_t a_len = ai >= 0 ? 1 : synthetic_graphemes[-ai-1].num_codepoints; + int64_t a_len = ai >= 0 ? 1 : synthetic_graphemes[-ai-1].num_codepoints; const uint32_t *b_codepoints = bi >= 0 ? (uint32_t*)&bi : synthetic_graphemes[-bi-1].codepoints; - size_t b_len = bi >= 0 ? 1 : synthetic_graphemes[-bi-1].num_codepoints; + int64_t b_len = bi >= 0 ? 1 : synthetic_graphemes[-bi-1].num_codepoints; int cmp; (void)u32_casecmp(a_codepoints, a_len, b_codepoints, b_len, language, UNINORM_NFC, &cmp); @@ -712,12 +675,11 @@ public bool Text$equal_ignoring_case(Text_t a, Text_t b) public Text_t Text$upper(Text_t text) { - size_t length; - uint32_t *codepoints = text_to_u32(text, &length); + uint32_t *codepoints = (uint32_t*)Text$as_c_string(text); const char *language = uc_locale_language(); uint32_t buf[128]; size_t out_len; - uint32_t *upper = u32_toupper(codepoints, length, language, UNINORM_NFC, buf, &out_len); + uint32_t *upper = u32_toupper(codepoints, strlen((char*)codepoints), language, UNINORM_NFC, buf, &out_len); Text_t ret = text_from_u32(upper, out_len, false); if (upper != buf) free(upper); return ret; @@ -725,12 +687,11 @@ public Text_t Text$upper(Text_t text) public Text_t Text$lower(Text_t text) { - size_t length; - uint32_t *codepoints = text_to_u32(text, &length); + uint32_t *codepoints = (uint32_t*)Text$as_c_string(text); const char *language = uc_locale_language(); uint32_t buf[128]; size_t out_len; - uint32_t *lower = u32_tolower(codepoints, length, language, UNINORM_NFC, buf, &out_len); + uint32_t *lower = u32_tolower(codepoints, strlen((char*)codepoints), language, UNINORM_NFC, buf, &out_len); Text_t ret = text_from_u32(lower, out_len, false); if (lower != codepoints) free(lower); return ret; @@ -738,12 +699,11 @@ public Text_t Text$lower(Text_t text) public Text_t Text$title(Text_t text) { - size_t length; - uint32_t *codepoints = text_to_u32(text, &length); + uint32_t *codepoints = (uint32_t*)Text$as_c_string(text); const char *language = uc_locale_language(); uint32_t buf[128]; size_t out_len; - uint32_t *title = u32_totitle(codepoints, length, language, UNINORM_NFC, buf, &out_len); + uint32_t *title = u32_totitle(codepoints, strlen((char*)codepoints), language, UNINORM_NFC, buf, &out_len); Text_t ret = text_from_u32(title, out_len, false); if (title != codepoints) free(title); return ret; @@ -1327,6 +1287,11 @@ public Int_t Text$find(Text_t text, Text_t pattern, Int_t from_index, int64_t *m return I(0); } +public bool Text$has(Text_t text, Text_t pattern) +{ + return !I_is_zero(Text$find(text, pattern, I_small(1), NULL)); +} + public int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n]) { if (n < 1) return -1; @@ -1456,6 +1421,61 @@ public Text_t Text$format(const char *fmt, ...) return ret; } +public array_t Text$clusters(Text_t text) +{ + array_t clusters = {.atomic=1}; + for (int64_t i = 0; i < text.length; i++) { + Text_t cluster = Text$slice(text, Int64_to_Int(i), Int64_to_Int(i)); + Array$insert(&clusters, &cluster, I_small(0), sizeof(Text_t)); + } + return clusters; +} + +public array_t Text$utf32_codepoints(Text_t text) +{ + array_t codepoints = {.atomic=1}; + iteration_state_t state = {0, 0}; + for (int64_t i = 0; i < text.length; i++) { + int32_t grapheme = _next_grapheme(text, &state, i); + if (grapheme < 0) { + for (int64_t c = 0; c < synthetic_graphemes[-grapheme-1].num_codepoints; c++) + Array$insert(&codepoints, &synthetic_graphemes[-grapheme-1].codepoints[c], I_small(0), sizeof(uint32_t)); + } else { + Array$insert(&codepoints, &grapheme, I_small(0), sizeof(uint32_t)); + } + } + return codepoints; +} + +public array_t Text$utf8_bytes(Text_t text) +{ + const char *str = Text$as_c_string(text); + return (array_t){.length=strlen(str), .stride=1, .atomic=1, .data=(void*)str}; +} + +public array_t Text$codepoint_names(Text_t text) +{ + array_t names = {}; + iteration_state_t state = {0, 0}; + for (int64_t i = 0; i < text.length; i++) { + int32_t grapheme = _next_grapheme(text, &state, i); + if (grapheme < 0) { + for (int64_t c = 0; c < synthetic_graphemes[-grapheme-1].num_codepoints; c++) { + char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); + name = unicode_character_name(synthetic_graphemes[-grapheme-1].codepoints[c], name); + Text_t name_text = (Text_t){.tag=TEXT_ASCII, .length=strlen(name), .ascii=name}; + Array$insert(&names, &name_text, I_small(0), sizeof(Text_t)); + } + } else { + char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); + name = unicode_character_name(grapheme, name); + Text_t name_text = (Text_t){.tag=TEXT_ASCII, .length=strlen(name), .ascii=name}; + Array$insert(&names, &name_text, I_small(0), sizeof(Text_t)); + } + } + return names; +} + public const TypeInfo $Text = { .size=sizeof(Text_t), .align=__alignof__(Text_t), diff --git a/builtins/text.h b/builtins/text.h index 1e671695..4fd3d5ac 100644 --- a/builtins/text.h +++ b/builtins/text.h @@ -61,8 +61,13 @@ Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info); Text_t Text$quoted(Text_t str, bool colorize); Text_t Text$replace(Text_t str, Text_t pat, Text_t replacement); Int_t Text$find(Text_t text, Text_t pattern, Int_t i, int64_t *match_length); +bool Text$has(Text_t text, Text_t pattern); const char *Text$as_c_string(Text_t text); public Text_t Text$format(const char *fmt, ...); +array_t Text$clusters(Text_t text); +array_t Text$utf32_codepoints(Text_t text); +array_t Text$utf8_bytes(Text_t text); +array_t Text$codepoint_names(Text_t text); extern const TypeInfo $Text; @@ -2679,6 +2679,8 @@ CORD compile(env_t *env, ast_t *ast) if (lang && streq(f->field, "text_content")) { CORD text = compile_to_pointer_depth(env, f->fielded, 0, false); return CORD_all("((Text_t)", text, ")"); + } else if (streq(f->field, "length")) { + return CORD_all("Int64_to_Int((", compile_to_pointer_depth(env, f->fielded, 0, false), ").length)"); } code_err(ast, "There is no '%s' field on %T values", f->field, value_t); } diff --git a/environment.c b/environment.c index d1639368..db01f4d7 100644 --- a/environment.c +++ b/environment.c @@ -249,25 +249,20 @@ env_t *new_compilation_unit(CORD *libname) {"Text", TEXT_TYPE, "Text_t", "$Text", TypedArray(ns_entry_t, // {"find", "Text$find", "func(text:Text, pattern:Text)->FindResult"}, {"as_c_string", "CORD_to_char_star", "func(text:Text)->CString"}, - {"bytes", "Text$bytes", "func(text:Text)->[Int8]"}, - {"character_names", "Text$character_names", "func(text:Text)->[Text]"}, {"clusters", "Text$clusters", "func(text:Text)->[Text]"}, - {"codepoints", "Text$codepoints", "func(text:Text)->[Int32]"}, + {"codepoint_names", "Text$codepoint_names", "func(text:Text)->[Text]"}, {"from_c_string", "CORD_from_char_star", "func(str:CString)->Text"}, - {"has", "Text$has", "func(text:Text, target:Text, where=Where.Anywhere)->Bool"}, + {"has", "Text$has", "func(text:Text, pattern:Text)->Bool"}, {"join", "Text$join", "func(glue:Text, pieces:[Text])->Text"}, {"lower", "Text$lower", "func(text:Text)->Text"}, - {"num_bytes", "Text$num_bytes", "func(text:Text)->Int"}, - {"num_clusters", "Text$num_clusters", "func(text:Text)->Int"}, - {"num_codepoints", "Text$num_codepoints", "func(text:Text)->Int"}, {"quoted", "Text$quoted", "func(text:Text, color=no)->Text"}, - {"read_line", "Text$read_line", "func(prompt='')->Text"}, {"replace", "Text$replace", "func(text:Text, pattern:Text, replacement:Text)->Text"}, {"split", "Text$split", "func(text:Text, split:Text)->[Text]"}, {"title", "Text$title", "func(text:Text)->Text"}, {"trimmed", "Text$trimmed", "func(text:Text, trim=\" {\\n\\r\\t}\", where=Where.Anywhere)->Text"}, {"upper", "Text$upper", "func(text:Text)->Text"}, - {"without", "Text$without", "func(text:Text, target:Text, where=Where.Anywhere)->Text"}, + {"utf32_codepoints", "Text$utf32_codepoints", "func(text:Text)->[Int32]"}, + {"utf8_bytes", "Text$utf8_bytes", "func(text:Text)->[Int8]"}, )}, {"Thread", THREAD_TYPE, "pthread_t*", "Thread", TypedArray(ns_entry_t, {"new", "Thread$new", "func(fn:func())->Thread"}, diff --git a/test/text.tm b/test/text.tm index ce8ed43f..9108012e 100644 --- a/test/text.tm +++ b/test/text.tm @@ -22,34 +22,22 @@ func main(): >> amelie := "Am$(\UE9)lie" >> amelie:clusters() = ["A", "m", "é", "l", "i", "e"] : [Text] - >> amelie:codepoints() + >> amelie:utf32_codepoints() = [65_i32, 109_i32, 101_i32, 769_i32, 108_i32, 105_i32, 101_i32] : [Int32] - >> amelie:bytes() + >> amelie:utf8_bytes() = [65_i8, 109_i8, 101_i8, -52_i8, -127_i8, 108_i8, 105_i8, 101_i8] : [Int8] - >> amelie:num_clusters() - = 6 - >> amelie:num_codepoints() - = 7 - >> amelie:num_bytes() - = 8 >> amelie2 := "Am$(\U65\U301)lie" >> amelie2:clusters() = ["A", "m", "é", "l", "i", "e"] : [Text] - >> amelie2:codepoints() + >> amelie2:utf32_codepoints() = [65_i32, 109_i32, 101_i32, 769_i32, 108_i32, 105_i32, 101_i32] : [Int32] - >> amelie2:bytes() + >> amelie2:utf8_bytes() = [65_i8, 109_i8, 101_i8, -52_i8, -127_i8, 108_i8, 105_i8, 101_i8] : [Int8] - >> amelie2:num_clusters() - = 6 - >> amelie2:num_codepoints() - = 7 - >> amelie2:num_bytes() - = 8 - - >> amelie:character_names() + + >> amelie:codepoint_names() = ["LATIN CAPITAL LETTER A", "LATIN SMALL LETTER M", "LATIN SMALL LETTER E", "COMBINING ACUTE ACCENT", "LATIN SMALL LETTER L", "LATIN SMALL LETTER I", "LATIN SMALL LETTER E"] - >> amelie2:character_names() + >> amelie2:codepoint_names() = ["LATIN CAPITAL LETTER A", "LATIN SMALL LETTER M", "LATIN SMALL LETTER E", "COMBINING ACUTE ACCENT", "LATIN SMALL LETTER L", "LATIN SMALL LETTER I", "LATIN SMALL LETTER E"] >> "Hello":replace("e", "X") @@ -57,38 +45,38 @@ func main(): >> "Hello":has("l") = yes - >> "Hello":has("l", End) + >> "Hello":has("l[..end]") = no - >> "Hello":has("l", Start) + >> "Hello":has("[..start]l") = no >> "Hello":has("o") = yes - >> "Hello":has("o", where=End) + >> "Hello":has("o[..end]") = yes - >> "Hello":has("o", where=Start) + >> "Hello":has("[..start]o") = no >> "Hello":has("H") = yes - >> "Hello":has("H", End) + >> "Hello":has("H[..end]") = no - >> "Hello":has("H", Start) + >> "Hello":has("[..start]H") = yes - >> "Hello":without("l") + >> "Hello":replace("l", "") = "Heo" - >> "xxxx":without("x") + >> "xxxx":replace("x", "") = "" - >> "xxxx":without("y") + >> "xxxx":replace("y", "") = "xxxx" - >> "One two three four five six":without("e ") + >> "One two three four five six":replace("e ", "") = "Ontwo threfour fivsix" - >> " one ":trimmed() - = "one" - >> " one ":trimmed(" aeiou") - = "n" + >> " one ":replace("[start][..space]", "") + = "one " + >> " one ":replace("[..space][end]", "") + = " one" >> amelie:has(amelie2) @@ -598,6 +598,7 @@ type_t *get_field_type(type_t *t, const char *field_name) case TextType: { if (Match(t, TextType)->lang && streq(field_name, "text_content")) return Type(TextType); + else if (streq(field_name, "length")) return INT_TYPE; return NULL; } case StructType: { |
