diff options
Diffstat (limited to 'src/stdlib/text.c')
| -rw-r--r-- | src/stdlib/text.c | 96 |
1 files changed, 48 insertions, 48 deletions
diff --git a/src/stdlib/text.c b/src/stdlib/text.c index b3e9cebb..3346bf4b 100644 --- a/src/stdlib/text.c +++ b/src/stdlib/text.c @@ -27,7 +27,7 @@ // is basically what Zalgo text is). // // There are a lot of benefits to storing unicode text with one grapheme -// cluster per index in a densely packed array instead of storing the text as +// cluster per index in a densely packed list instead of storing the text as // variable-width UTF8-encoded bytes. It lets us have one canonical length for // the text that can be precomputed and is meaningful to users. It lets us // quickly get the Nth "letter" in the text. Substring slicing is fast. @@ -38,7 +38,7 @@ // Graphemes, aka NFG). A synthetic grapheme is a negative 32-bit signed // integer that represents a multi-codepoint grapheme cluster that has been // encountered during the program's runtime. These clusters are stored in a -// lookup array and hash map so that we can rapidly convert between the +// lookup list and hash map so that we can rapidly convert between the // synthetic grapheme integer ID and the unicode codepoints associated with it. // Essentially, it's like we create a supplement to the unicode standard with // things that would be nice if they had their own codepoint so things worked @@ -49,7 +49,7 @@ // WITH ACUTE This would be stored as: (int32_t[]){0x48, 0xE9} Example 2: // U+0048, U+0065, U+0309 AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E, // COMBINING VERTICAL LINE BELOW This would be stored as: (int32_t[]){0x48, -2} -// Where -2 is used as a lookup in an array that holds the actual unicode +// Where -2 is used as a lookup in a list that holds the actual unicode // codepoints: (ucs4_t[]){0x65, 0x0309} #include <assert.h> @@ -68,7 +68,7 @@ #include <unistring/version.h> #include <uniwidth.h> -#include "arrays.h" +#include "lists.h" #include "integers.h" #include "tables.h" #include "text.h" @@ -86,7 +86,7 @@ typedef struct { // Synthetic grapheme clusters (clusters of more than one codepoint): static Table_t grapheme_ids_by_codepoints = {}; // ucs4_t* length-prefixed codepoints -> int32_t ID -// This will hold a dynamically growing array of synthetic graphemes: +// This will hold a dynamically growing list of synthetic graphemes: static synthetic_grapheme_t *synthetic_graphemes = NULL; static int32_t synthetic_grapheme_capacity = 0; static int32_t num_synthetic_graphemes = 0; @@ -733,7 +733,7 @@ Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize) // Intentionally overallocate here: allocate assuming each codepoint is a // grapheme cluster. If that's not true, we'll have extra space at the end - // of the array, but the length will still be calculated correctly. + // of the list, but the length will still be calculated correctly. int32_t *graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[num_codepoints])); struct Text_s ret = { .tag=TEXT_GRAPHEMES, @@ -1067,10 +1067,10 @@ public Text_t Text$translate(Text_t text, Table_t translations) TextIter_t text_state = NEW_TEXT_ITER_STATE(text); Text_t result = EMPTY_TEXT; int64_t span_start = 0; - Array_t replacement_array = translations.entries; + List_t replacement_list = translations.entries; for (int64_t i = 0; i < text.length; ) { - for (int64_t r = 0; r < replacement_array.length; r++) { - struct { Text_t target, replacement; } *entry = replacement_array.data + r*replacement_array.stride; + for (int64_t r = 0; r < replacement_list.length; r++) { + struct { Text_t target, replacement; } *entry = replacement_list.data + r*replacement_list.stride; TextIter_t target_state = NEW_TEXT_ITER_STATE(entry->target); if (_matches(&text_state, &target_state, i)) { if (i > span_start) @@ -1122,36 +1122,36 @@ public bool Text$has(Text_t text, Text_t target) return false; } -public Array_t Text$split(Text_t text, Text_t delimiters) +public List_t Text$split(Text_t text, Text_t delimiters) { if (delimiters.length == 0) return Text$clusters(text); TextIter_t text_state = NEW_TEXT_ITER_STATE(text), delim_state = NEW_TEXT_ITER_STATE(delimiters); - Array_t splits = {}; + List_t splits = {}; for (int64_t i = 0; i < text.length; ) { int64_t span_len = 0; while (i + span_len < text.length && !_matches(&text_state, &delim_state, i + span_len)) { span_len += 1; } Text_t slice = Text$slice(text, I(i+1), I(i+span_len)); - Array$insert(&splits, &slice, I(0), sizeof(slice)); + List$insert(&splits, &slice, I(0), sizeof(slice)); i += span_len + delimiters.length; if (i == text.length) { Text_t empty = Text(""); - Array$insert(&splits, &empty, I(0), sizeof(empty)); + List$insert(&splits, &empty, I(0), sizeof(empty)); } } return splits; } -public Array_t Text$split_any(Text_t text, Text_t delimiters) +public List_t Text$split_any(Text_t text, Text_t delimiters) { if (delimiters.length == 0) - return Array(text); + return List(text); TextIter_t text_state = NEW_TEXT_ITER_STATE(text), delim_state = NEW_TEXT_ITER_STATE(delimiters); - Array_t splits = {}; + List_t splits = {}; for (int64_t i = 0; i < text.length; ) { int64_t span_len = 0; while (i + span_len < text.length && !_has_grapheme(&delim_state, Text$get_grapheme_fast(&text_state, i + span_len))) { @@ -1159,14 +1159,14 @@ public Array_t Text$split_any(Text_t text, Text_t delimiters) } bool trailing_delim = i + span_len < text.length; Text_t slice = Text$slice(text, I(i+1), I(i+span_len)); - Array$insert(&splits, &slice, I(0), sizeof(slice)); + List$insert(&splits, &slice, I(0), sizeof(slice)); i += span_len + 1; while (i < text.length && _has_grapheme(&delim_state, Text$get_grapheme_fast(&text_state, i))) { i += 1; } if (i >= text.length && trailing_delim) { Text_t empty = Text(""); - Array$insert(&splits, &empty, I(0), sizeof(empty)); + List$insert(&splits, &empty, I(0), sizeof(empty)); } } return splits; @@ -1303,7 +1303,7 @@ PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b, Text_t languag public Text_t Text$upper(Text_t text, Text_t language) { if (text.length == 0) return text; - Array_t codepoints = Text$utf32_codepoints(text); + List_t codepoints = Text$utf32_codepoints(text); const char *uc_language = Text$as_c_string(language); ucs4_t buf[128]; size_t out_len = sizeof(buf)/sizeof(buf[0]); @@ -1316,7 +1316,7 @@ public Text_t Text$upper(Text_t text, Text_t language) public Text_t Text$lower(Text_t text, Text_t language) { if (text.length == 0) return text; - Array_t codepoints = Text$utf32_codepoints(text); + List_t codepoints = Text$utf32_codepoints(text); const char *uc_language = Text$as_c_string(language); ucs4_t buf[128]; size_t out_len = sizeof(buf)/sizeof(buf[0]); @@ -1329,7 +1329,7 @@ public Text_t Text$lower(Text_t text, Text_t language) public Text_t Text$title(Text_t text, Text_t language) { if (text.length == 0) return text; - Array_t codepoints = Text$utf32_codepoints(text); + List_t codepoints = Text$utf32_codepoints(text); const char *uc_language = Text$as_c_string(language); ucs4_t buf[128]; size_t out_len = sizeof(buf)/sizeof(buf[0]); @@ -1451,7 +1451,7 @@ public Text_t Text$as_text(const void *vtext, bool colorize, const TypeInfo_t *i return as_text; } -public Text_t Text$join(Text_t glue, Array_t pieces) +public Text_t Text$join(Text_t glue, List_t pieces) { if (pieces.length == 0) return EMPTY_TEXT; @@ -1478,38 +1478,38 @@ public Text_t Text$format(const char *fmt, ...) return ret; } -public Array_t Text$clusters(Text_t text) +public List_t Text$clusters(Text_t text) { - Array_t clusters = {}; + List_t clusters = {}; for (int64_t i = 1; i <= text.length; i++) { Text_t cluster = Text$slice(text, I(i), I(i)); - Array$insert(&clusters, &cluster, I_small(0), sizeof(Text_t)); + List$insert(&clusters, &cluster, I_small(0), sizeof(Text_t)); } return clusters; } -public Array_t Text$utf32_codepoints(Text_t text) +public List_t Text$utf32_codepoints(Text_t text) { - Array_t codepoints = {.atomic=1}; + List_t codepoints = {.atomic=1}; TextIter_t state = NEW_TEXT_ITER_STATE(text); for (int64_t i = 0; i < text.length; i++) { int32_t grapheme = Text$get_grapheme_fast(&state, i); if (grapheme < 0) { for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) { ucs4_t subg = GRAPHEME_CODEPOINTS(grapheme)[c]; - Array$insert(&codepoints, &subg, I_small(0), sizeof(ucs4_t)); + List$insert(&codepoints, &subg, I_small(0), sizeof(ucs4_t)); } } else { - Array$insert(&codepoints, &grapheme, I_small(0), sizeof(ucs4_t)); + List$insert(&codepoints, &grapheme, I_small(0), sizeof(ucs4_t)); } } return codepoints; } -public Array_t Text$utf8_bytes(Text_t text) +public List_t Text$utf8_bytes(Text_t text) { const char *str = Text$as_c_string(text); - return (Array_t){.length=strlen(str), .stride=1, .atomic=1, .data=(void*)str}; + return (List_t){.length=strlen(str), .stride=1, .atomic=1, .data=(void*)str}; } static INLINE const char *codepoint_name(ucs4_t c) @@ -1523,9 +1523,9 @@ static INLINE const char *codepoint_name(ucs4_t c) return name; } -public Array_t Text$codepoint_names(Text_t text) +public List_t Text$codepoint_names(Text_t text) { - Array_t names = {}; + List_t names = {}; TextIter_t state = NEW_TEXT_ITER_STATE(text); for (int64_t i = 0; i < text.length; i++) { int32_t grapheme = Text$get_grapheme_fast(&state, i); @@ -1533,65 +1533,65 @@ public Array_t Text$codepoint_names(Text_t text) for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) { const char *name = codepoint_name(GRAPHEME_CODEPOINTS(grapheme)[c]); Text_t name_text = Text$from_str(name); - Array$insert(&names, &name_text, I_small(0), sizeof(Text_t)); + List$insert(&names, &name_text, I_small(0), sizeof(Text_t)); } } else { const char *name = codepoint_name((ucs4_t)grapheme); Text_t name_text = Text$from_str(name); - Array$insert(&names, &name_text, I_small(0), sizeof(Text_t)); + List$insert(&names, &name_text, I_small(0), sizeof(Text_t)); } } return names; } -public Text_t Text$from_codepoints(Array_t codepoints) +public Text_t Text$from_codepoints(List_t codepoints) { if (codepoints.stride != sizeof(int32_t)) - Array$compact(&codepoints, sizeof(int32_t)); + List$compact(&codepoints, sizeof(int32_t)); return text_from_u32(codepoints.data, codepoints.length, true); } -public OptionalText_t Text$from_codepoint_names(Array_t codepoint_names) +public OptionalText_t Text$from_codepoint_names(List_t codepoint_names) { - Array_t codepoints = {}; + List_t codepoints = {}; for (int64_t i = 0; i < codepoint_names.length; i++) { Text_t *name = ((Text_t*)(codepoint_names.data + i*codepoint_names.stride)); const char *name_str = Text$as_c_string(*name); ucs4_t codepoint = unicode_name_character(name_str); if (codepoint == UNINAME_INVALID) return NONE_TEXT; - Array$insert(&codepoints, &codepoint, I_small(0), sizeof(ucs4_t)); + List$insert(&codepoints, &codepoint, I_small(0), sizeof(ucs4_t)); } return Text$from_codepoints(codepoints); } -public OptionalText_t Text$from_bytes(Array_t bytes) +public OptionalText_t Text$from_bytes(List_t bytes) { if (bytes.stride != sizeof(int8_t)) - Array$compact(&bytes, sizeof(int8_t)); + List$compact(&bytes, sizeof(int8_t)); return Text$from_strn(bytes.data, (size_t)bytes.length); } -public Array_t Text$lines(Text_t text) +public List_t Text$lines(Text_t text) { - Array_t lines = {}; + List_t lines = {}; TextIter_t state = NEW_TEXT_ITER_STATE(text); for (int64_t i = 0, line_start = 0; i < text.length; i++) { int32_t grapheme = Text$get_grapheme_fast(&state, i); if (grapheme == '\r' && Text$get_grapheme_fast(&state, i + 1) == '\n') { // CRLF Text_t line = Text$slice(text, I(line_start+1), I(i)); - Array$insert(&lines, &line, I_small(0), sizeof(Text_t)); + List$insert(&lines, &line, I_small(0), sizeof(Text_t)); i += 1; // skip one extra for CR line_start = i + 1; } else if (grapheme == '\n') { // newline Text_t line = Text$slice(text, I(line_start+1), I(i)); - Array$insert(&lines, &line, I_small(0), sizeof(Text_t)); + List$insert(&lines, &line, I_small(0), sizeof(Text_t)); line_start = i + 1; } else if (i == text.length-1 && line_start != i) { // last line Text_t line = Text$slice(text, I(line_start+1), I(i+1)); - Array$insert(&lines, &line, I_small(0), sizeof(Text_t)); + List$insert(&lines, &line, I_small(0), sizeof(Text_t)); } } return lines; @@ -1645,7 +1645,7 @@ public void Text$serialize(const void *obj, FILE *out, Table_t *pointers, const fwrite(str, sizeof(char), (size_t)len, out); } -public void Text$deserialize(FILE *in, void *out, Array_t *pointers, const TypeInfo_t *) +public void Text$deserialize(FILE *in, void *out, List_t *pointers, const TypeInfo_t *) { int64_t len = -1; Int64$deserialize(in, &len, pointers, &Int64$info); |
