diff options
| -rw-r--r-- | builtins/text.c | 304 |
1 files changed, 205 insertions, 99 deletions
diff --git a/builtins/text.c b/builtins/text.c index b3fe25eb..14d2e5bc 100644 --- a/builtins/text.c +++ b/builtins/text.c @@ -1,5 +1,51 @@ -// Type info and methods for Text datatype, which uses the Boehm "cord" library -// and libunistr +// Type info and methods for Text datatype, which uses libunistr for Unicode +// support and implements a datastructure based on Raku/MoarVM's strings to +// efficiently store arbitrary unicode data using a mix of densely packed plain +// ASCII, 32-bit integers representing grapheme clusters (see below), and ropes +// that represent text that is a composite of multiple subtexts. Subtexts are +// only nested one level deep, not arbitrarily deep trees. +// +// A note on grapheme clusters: In Unicode, codepoints can be represented using +// a 32-bit integer. Most codepoints correspond to the intuitive notion of a +// "letter", which is more formally known as a "grapheme cluster". A grapheme +// cluster is roughly speaking the amount of text that your cursor moves over +// when you press the arrow key once. However, some codepoints act as modifiers +// on other codepoints. For example, U+0301 (COMBINING ACUTE ACCENT) can modify +// a letter like "e" to form "é". During normalization, this frequently +// resolves down to a single unicode codepoint, in this case, "é" resolves to +// the single codepoint U+00E9 (LATIN SMALL LETTER E WITH ACUTE). However, in +// some cases, multiple codepoints make up a grapheme cluster but *don't* +// normalize to a single codepoint. For example, LATIN SMALL LETTER E (U+0065) +// + COMBINING VERTICAL LINE BELOW (U+0329) combine to form an unusual glyph +// that is not used frequently enough to warrant its own unique codepoint (this +// is basically what Zalgo text is). +// +// There are a lot of benefits to storing text with one grapheme cluster per +// index in a densely packed array. It lets us have one canonical length for +// the text that can be precomputed and is meaningful to users. It lets us +// quickly get the Nth "letter" in the text. Substring slicing is fast. +// However, since not all grapheme clusters take up the same number of +// codepoints, we're faced with the problem of how to jam multiple codepoints +// into a single 32-bit slot. Inspired by Raku and MoarVM's approach, this +// implementation uses "synthetic graphemes" (in Raku's terms, Normal Form +// Graphemes, aka NFG). A synthetic grapheme is a negative 32-bit signed +// integer that represents a multi-codepoint grapheme cluster that has been +// encountered during the program's runtime. These clusters are stored in a +// lookup array and hash map so that we can rapidly convert between the +// synthetic grapheme integer ID and the unicode codepoints associated with it. +// Essentially, it's like we create a supplement to the unicode standard with +// things that would be nice if they had their own codepoint so things worked +// out nicely because we're using them right now, and we'll give them a +// negative number so it doesn't overlap with any real codepoints. +// +// Example 1: U+0048, U+00E9 +// AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E WITH ACUTE +// This would be stored as: (int32_t[]){0x48, 0xE9} +// Example 2: U+0048, U+0065, U+0309 +// AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E, COMBINING VERTICAL LINE BELOW +// This would be stored as: (int32_t[]){0x48, -2} +// Where -2 is used as a lookup in an array that holds the actual unicode codepoints: +// (uint32_t[]){0x65, 0x0309} #include <assert.h> #include <ctype.h> @@ -25,88 +71,138 @@ #include "array.h" #include "functions.h" #include "integers.h" +#include "table.h" #include "text.h" #include "types.h" #include "siphash.c" typedef struct { - int64_t num_codepoints; - const uint32_t *codepoints; + uint32_t *utf32_cluster; // length-prefixed const uint8_t *utf8; } synthetic_grapheme_t; -#define MAX_SYNTHETIC_GRAPHEMES 1024 -#define MAX_BACKREFS 100 -static synthetic_grapheme_t synthetic_graphemes[MAX_SYNTHETIC_GRAPHEMES] = {}; -const int32_t CRLF_GRAPHEME = -MAX_SYNTHETIC_GRAPHEMES-1; - -static int32_t num_synthetic_graphemes = 0; - -static int32_t get_grapheme(Text_t text, int64_t index); - typedef struct { int64_t subtext, sum_of_previous_subtexts; } iteration_state_t; -static int32_t _next_grapheme(Text_t text, iteration_state_t *state, int64_t index); +#define MAX_BACKREFS 100 +#define MAX_SYNTHETIC_GRAPHEMES 1024 -int32_t find_synthetic_grapheme(const uint32_t *codepoints, int64_t len) -{ - int32_t lo = 0, hi = num_synthetic_graphemes; - while (lo <= hi) { - int32_t mid = (lo + hi) / 2; - int32_t cmp = (synthetic_graphemes[mid].num_codepoints > len) - (synthetic_graphemes[mid].num_codepoints < len); - if (cmp == 0) - cmp = memcmp(synthetic_graphemes[mid].codepoints, codepoints, sizeof(uint32_t[len])); - - if (cmp == 0) - return mid; - else if (cmp < 0) - lo = mid + 1; - else if (cmp > 0) - hi = mid - 1; - } - return hi; -} +// Synthetic grapheme clusters (clusters of more than one codepoint): +static table_t grapheme_ids_by_codepoints = {}; // uint32_t* length-prefixed codepoints -> int32_t ID -int32_t get_synthetic_grapheme(const uint32_t *codepoints, int64_t len) -{ - int32_t index = find_synthetic_grapheme(codepoints, len); - if (index >= 0 - && index < num_synthetic_graphemes - && synthetic_graphemes[index].num_codepoints == len - && memcmp(synthetic_graphemes[index].codepoints, codepoints, len) == 0) { - return -(index+1); - } else { - if (index < 0) index = 0; +static synthetic_grapheme_t synthetic_graphemes[MAX_SYNTHETIC_GRAPHEMES] = { + (synthetic_grapheme_t){ // CRLF + .utf32_cluster=(uint32_t[]){2, '\r', '\n'}, + .utf8=(uint8_t[]){'\r', '\n', '\0'}, + }, +}; - if (num_synthetic_graphemes >= MAX_SYNTHETIC_GRAPHEMES) - fail("Too many synthetic graphemes!"); +#define NUM_GRAPHEME_CODEPOINTS(id) (synthetic_graphemes[-(id)-1].utf32_cluster[0]) +#define GRAPHEME_CODEPOINTS(id) (&synthetic_graphemes[-(id)-1].utf32_cluster[1]) +#define GRAPHEME_UTF8(id) (synthetic_graphemes[-(id)-1].utf8) - if (num_synthetic_graphemes > 0 && index != num_synthetic_graphemes) { - memmove(&synthetic_graphemes[index + 1], &synthetic_graphemes[index], - sizeof(synthetic_grapheme_t[num_synthetic_graphemes - index])); - } +static int32_t num_synthetic_graphemes = 1; +const int32_t CRLF_GRAPHEME = -1; + +static int32_t get_grapheme(Text_t text, int64_t index); +static int32_t _next_grapheme(Text_t text, iteration_state_t *state, int64_t index); - uint32_t *buf = GC_MALLOC_ATOMIC(sizeof(uint32_t[len])); - memcpy(buf, codepoints, sizeof(uint32_t[len])); - synthetic_graphemes[index].codepoints = buf; - synthetic_graphemes[index].num_codepoints = len; +static bool graphemes_equal(uint32_t **a, uint32_t **b) { + if ((*a)[0] != (*b)[0]) return false; + for (int i = 0; i < (int)(*a)[0]; i++) + if ((*a)[i] != (*b)[i]) return false; + return true; +} - size_t u8_len = 0; - uint8_t *u8 = u32_to_u8(codepoints, len, NULL, &u8_len); - uint8_t *gc_u8 = GC_MALLOC_ATOMIC(u8_len + 1); - memcpy(gc_u8, u8, u8_len); - gc_u8[u8_len] = '\0'; - synthetic_graphemes[index].utf8 = gc_u8; - assert(gc_u8); - free(u8); +static uint64_t grapheme_hash(uint32_t **g) { + uint32_t *cluster = *g; + return siphash24((void*)&cluster[1], sizeof(uint32_t[cluster[0]]), (uint64_t*)TOMO_HASH_KEY); +} - ++num_synthetic_graphemes; +static const TypeInfo GraphemeClusterInfo = { + .size=sizeof(uint32_t*), + .align=__alignof__(uint32_t*), + .tag=CustomInfo, + .CustomInfo={.equal=(void*)graphemes_equal, .hash=(void*)grapheme_hash}, +}; - return -(index+1); - } +static const TypeInfo GraphemeIDLookupTableInfo = { + .size=sizeof(table_t), .align=__alignof__(table_t), + .tag=TableInfo, .TableInfo={.key=&GraphemeClusterInfo, .value=&$Int32}, +}; + +int32_t get_synthetic_grapheme(const uint32_t *codepoints, int64_t utf32_len) +{ + if (utf32_len == 2 && codepoints[0] == '\r' && codepoints[1] == '\n') + return CRLF_GRAPHEME; + + uint32_t length_prefixed[1+utf32_len] = {}; + length_prefixed[0] = (uint32_t)utf32_len; + for (int i = 0; i < utf32_len; i++) + length_prefixed[i+1] = codepoints[i]; + uint32_t *ptr = &length_prefixed[0]; + + // Optimization for common case of one frequently used synthetic grapheme: + static int32_t last_grapheme = CRLF_GRAPHEME; + if (graphemes_equal(&ptr, &synthetic_graphemes[-last_grapheme-1].utf32_cluster)) + return last_grapheme; + + int32_t *found = Table$get(grapheme_ids_by_codepoints, &ptr, &GraphemeIDLookupTableInfo); + if (found) return *found; + + // New synthetic grapheme: + if (num_synthetic_graphemes >= MAX_SYNTHETIC_GRAPHEMES) + fail("Too many synthetic graphemes!"); + + int32_t grapheme_id = -(num_synthetic_graphemes+1); + num_synthetic_graphemes += 1; + + // Get UTF8 representation: + uint8_t u8_buf[64]; + size_t u8_len = sizeof(u8_buf)/sizeof(u8_buf[0]); + uint8_t *u8 = u32_to_u8(codepoints, utf32_len, u8_buf, &u8_len); + + // For performance reasons, use an arena allocator here to ensure that + // synthetic graphemes store all of their information in a densely packed + // area with good cache locality: + static void *arena = NULL, *arena_end = NULL; + // Eat up any space needed to make arena 32-bit aligned: + if ((long)arena % __alignof__(uint32_t) != 0) + arena += __alignof__(uint32_t) - ((long)arena % __alignof__(uint32_t)); + + // If we have filled up this arena, allocate a new one: + size_t needed_memory = sizeof(uint32_t[1+utf32_len]) + sizeof(uint8_t[u8_len + 1]); + if (arena + needed_memory > arena_end) { + // Do reasonably big chunks at a time, so most synthetic codepoints are + // nearby each other in memory and cache locality is good. This is a + // rough guess at a good size: + size_t chunk_size = MAX(needed_memory, 512); + arena = GC_MALLOC_ATOMIC(chunk_size); + arena_end = arena + chunk_size; + } + + // Copy length-prefixed UTF32 codepoints into the arena and store where they live: + uint32_t *codepoint_copy = arena; + mempcpy(codepoint_copy, length_prefixed, sizeof(uint32_t[1+utf32_len])); + synthetic_graphemes[-grapheme_id-1].utf32_cluster = codepoint_copy; + arena += sizeof(uint32_t[1+utf32_len]); + + // Copy UTF8 bytes into the arena and store where they live: + uint8_t *utf8_final = arena; + memcpy(utf8_final, u8, sizeof(uint8_t[u8_len])); + utf8_final[u8_len] = '\0'; // Add a terminating NUL byte + synthetic_graphemes[-grapheme_id-1].utf8 = utf8_final; + arena += sizeof(uint8_t[u8_len + 1]); + + // Cleanup from unicode API: + if (u8 != u8_buf) free(u8); + + Table$set(&grapheme_ids_by_codepoints, &codepoint_copy, &grapheme_id, &GraphemeIDLookupTableInfo); + + last_grapheme = grapheme_id; + return grapheme_id; } static inline int64_t num_subtexts(Text_t t) @@ -167,7 +263,7 @@ public int Text$print(FILE *stream, Text_t t) written += fwrite(u8, sizeof(char), len, stream); if (u8 != buf) free(u8); } else { - const uint8_t *u8 = synthetic_graphemes[-grapheme-1].utf8; + const uint8_t *u8 = GRAPHEME_UTF8(grapheme); assert(u8); written += fwrite(u8, sizeof(uint8_t), strlen((char*)u8), stream); } @@ -484,20 +580,30 @@ static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: { const int32_t *graphemes = text.tag == TEXT_GRAPHEMES ? text.graphemes : text.short_graphemes; for (int64_t g = 0; g < text.length; g++) { - const uint32_t *codepoints = graphemes[g] < 0 ? synthetic_graphemes[-graphemes[g]-1].codepoints : (uint32_t*)&graphemes[g]; - int64_t num_codepoints = graphemes[g] < 0 ? synthetic_graphemes[-graphemes[g]-1].num_codepoints : 1; - uint8_t u8_buf[64]; - size_t u8_len = sizeof(u8_buf); - uint8_t *u8 = u32_to_u8(codepoints, num_codepoints, u8_buf, &u8_len); - - if (*i + (int64_t)u8_len > (int64_t)*capacity) { - *capacity = *i + u8_len + 1; - *buf = GC_REALLOC(*buf, *capacity); - } + if (graphemes[g] >= 0) { + uint8_t u8_buf[64]; + size_t u8_len = sizeof(u8_buf); + uint8_t *u8 = u32_to_u8((uint32_t*)&graphemes[g], 1, u8_buf, &u8_len); + + if (*i + (int64_t)u8_len > (int64_t)*capacity) { + *capacity = *i + u8_len + 1; + *buf = GC_REALLOC(*buf, *capacity); + } - memcpy(*buf + *i, u8, u8_len); - *i += u8_len; - if (u8 != u8_buf) free(u8); + memcpy(*buf + *i, u8, u8_len); + *i += u8_len; + if (u8 != u8_buf) free(u8); + } else { + const uint8_t *u8 = GRAPHEME_UTF8(graphemes[g]); + size_t u8_len = u8_strlen(u8); + if (*i + (int64_t)u8_len > (int64_t)*capacity) { + *capacity = *i + u8_len + 1; + *buf = GC_REALLOC(*buf, *capacity); + } + + memcpy(*buf + *i, u8, u8_len); + *i += u8_len; + } } break; } @@ -684,19 +790,19 @@ public int32_t Text$compare(const Text_t *a, const Text_t *b) } else if (ai > 0) { cmp = u32_cmp2( (uint32_t*)&ai, 1, - synthetic_graphemes[-bi-1].codepoints, - synthetic_graphemes[-bi-1].num_codepoints); + GRAPHEME_CODEPOINTS(bi), + NUM_GRAPHEME_CODEPOINTS(bi)); } else if (bi > 0) { cmp = u32_cmp2( - synthetic_graphemes[-ai-1].codepoints, - synthetic_graphemes[-ai-1].num_codepoints, + GRAPHEME_CODEPOINTS(ai), + NUM_GRAPHEME_CODEPOINTS(ai), (uint32_t*)&bi, 1); } else { cmp = u32_cmp2( - synthetic_graphemes[-ai-1].codepoints, - synthetic_graphemes[-ai-1].num_codepoints, - synthetic_graphemes[-bi-1].codepoints, - synthetic_graphemes[-bi-1].num_codepoints); + GRAPHEME_CODEPOINTS(ai), + NUM_GRAPHEME_CODEPOINTS(ai), + GRAPHEME_CODEPOINTS(bi), + NUM_GRAPHEME_CODEPOINTS(bi)); } if (cmp != 0) return cmp; } @@ -728,11 +834,11 @@ public bool Text$equal_ignoring_case(Text_t a, Text_t b) int32_t ai = _next_grapheme(a, &a_state, i); int32_t bi = _next_grapheme(b, &b_state, i); if (ai != bi) { - const uint32_t *a_codepoints = ai >= 0 ? (uint32_t*)&ai : synthetic_graphemes[-ai-1].codepoints; - int64_t a_len = ai >= 0 ? 1 : synthetic_graphemes[-ai-1].num_codepoints; + const uint32_t *a_codepoints = ai >= 0 ? (uint32_t*)&ai : GRAPHEME_CODEPOINTS(ai); + int64_t a_len = ai >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(ai); - const uint32_t *b_codepoints = bi >= 0 ? (uint32_t*)&bi : synthetic_graphemes[-bi-1].codepoints; - int64_t b_len = bi >= 0 ? 1 : synthetic_graphemes[-bi-1].num_codepoints; + const uint32_t *b_codepoints = bi >= 0 ? (uint32_t*)&bi : GRAPHEME_CODEPOINTS(bi); + int64_t b_len = bi >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(bi); int cmp; (void)u32_casecmp(a_codepoints, a_len, b_codepoints, b_len, language, UNINORM_NFC, &cmp); @@ -817,7 +923,7 @@ static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop) if (*i >= text.length) return false; int32_t grapheme = get_grapheme(text, *i); if (grapheme < 0) // TODO: check every codepoint in the cluster? - grapheme = synthetic_graphemes[-grapheme-1].codepoints[0]; + grapheme = GRAPHEME_CODEPOINTS(grapheme)[0]; if (uc_is_property(grapheme, prop)) { *i += 1; @@ -833,7 +939,7 @@ static int64_t parse_int(Text_t text, int64_t *i) for (;; *i += 1) { int32_t grapheme = _next_grapheme(text, &state, *i); if (grapheme < 0) - grapheme = synthetic_graphemes[-grapheme-1].codepoints[0]; + grapheme = GRAPHEME_CODEPOINTS(grapheme)[0]; int digit = uc_digit_value(grapheme); if (digit < 0) break; if (value >= INT64_MAX/10) break; @@ -900,7 +1006,7 @@ int64_t match_email(Text_t text, int64_t text_index) if (text_index > 0) { int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1); if (prev_codepoint < 0) - prev_codepoint = synthetic_graphemes[-prev_codepoint-1].codepoints[0]; + prev_codepoint = GRAPHEME_CODEPOINTS(prev_codepoint)[0]; if (uc_is_property_alphabetic(prev_codepoint)) return -1; } @@ -1017,7 +1123,7 @@ int64_t match_uri(Text_t text, int64_t text_index) if (text_index > 0) { int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1); if (prev_codepoint < 0) - prev_codepoint = synthetic_graphemes[-prev_codepoint-1].codepoints[0]; + prev_codepoint = GRAPHEME_CODEPOINTS(prev_codepoint)[0]; if (uc_is_property_alphabetic(prev_codepoint)) return -1; } @@ -1367,7 +1473,7 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter for (int64_t count = 0; count < max; ) { int32_t grapheme = _next_grapheme(text, &text_state, text_index); if (grapheme < 0) - grapheme = synthetic_graphemes[-grapheme-1].codepoints[0]; + grapheme = GRAPHEME_CODEPOINTS(grapheme)[0]; bool success; if (any) { @@ -1871,8 +1977,8 @@ public array_t Text$utf32_codepoints(Text_t text) for (int64_t i = 0; i < text.length; i++) { int32_t grapheme = _next_grapheme(text, &state, i); if (grapheme < 0) { - for (int64_t c = 0; c < synthetic_graphemes[-grapheme-1].num_codepoints; c++) - Array$insert(&codepoints, &synthetic_graphemes[-grapheme-1].codepoints[c], I_small(0), sizeof(uint32_t)); + for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) + Array$insert(&codepoints, &GRAPHEME_CODEPOINTS(grapheme)[c], I_small(0), sizeof(uint32_t)); } else { Array$insert(&codepoints, &grapheme, I_small(0), sizeof(uint32_t)); } @@ -1904,8 +2010,8 @@ public array_t Text$codepoint_names(Text_t text) for (int64_t i = 0; i < text.length; i++) { int32_t grapheme = _next_grapheme(text, &state, i); if (grapheme < 0) { - for (int64_t c = 0; c < synthetic_graphemes[-grapheme-1].num_codepoints; c++) { - const char *name = codepoint_name(synthetic_graphemes[-grapheme-1].codepoints[c]); + for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) { + const char *name = codepoint_name(GRAPHEME_CODEPOINTS(grapheme)[c]); Text_t name_text = (Text_t){.tag=TEXT_ASCII, .length=strlen(name), .ascii=name}; Array$insert(&names, &name_text, I_small(0), sizeof(Text_t)); } @@ -1988,7 +2094,7 @@ public Pattern_t Pattern$escape_text(Text_t text) iteration_state_t state = {0, 0}; for (int64_t i = 0; i < text.length; i++) { int32_t g = _next_grapheme(text, &state, i); - uint32_t g0 = g < 0 ? synthetic_graphemes[-g-1].codepoints[0] : (uint32_t)g; + uint32_t g0 = g < 0 ? GRAPHEME_CODEPOINTS(g)[0] : (uint32_t)g; if (g == '{') { add_str("{1{}"); |
