text.c - tomo

(1.9K lines)
   1 // This file defines type info and methods for the Text datatype, which uses
   2 // libunistr for Unicode support and implements a datastructure based on a
   3 // hybrid of Raku/MoarVM's space-efficient grapheme cluster representation of
   4 // strings, combined with a mostly-balanced tree datastructure based on Cords
   5 // (Boehm et al), which have good runtime performance for text constructed by a
   6 // series of many concatenations. In practice, this means Tomo's Text has an
   7 // extremely compact memory footprint (typically as compact as UTF8 or
   8 // up to 2x better for some languages), with extremely fast operations
   9 // including concatenation, random indexing, and taking substrings.
  10 
  11 // For more information on MoarVM's grapheme cluster strings, see:
  12 //     https://docs.raku.org/language/unicode
  13 //     https://github.com/MoarVM/MoarVM/blob/main/docs/strings.asciidoc
  14 // For more information on Cords, see the paper "Ropes: an Alternative to
  15 // Strings" (Boehm, Atkinson, Plass 1995):
  16 //     https://www.cs.tufts.edu/comp/150FP/archive/hans-boehm/ropes.pdf
  17 
  18 // Tomo's Text datatype represents Unicode text that is fully normalized using
  19 // normalization form C (NFC). This means that all text created from source code
  20 // or read in at runtime will respect normalization during comparison and other
  21 // operations, and the original (potentially non-canonical) representation of
  22 // text is not preserved. This also means that byte sequences that do not
  23 // represent valid unicode text cannot be interpreted as the Text datatype. For
  24 // example, a file with malformed UTF8 sequences cannot be read as Text.
  25 
  26 // A note on grapheme clusters: In Unicode, the fundamental unit is the
  27 // "codepoint", which represents things like letters, symbols, emojis,
  28 // combiners, and modifiers that alter other codepoints. However, most people
  29 // have an intuitive notion of what a "letter" is that corresponds to the
  30 // concept formally known as a grapheme cluster. A grapheme cluster is roughly
  31 // speaking the amount of text that your cursor moves over when you press the
  32 // left or right arrow key once. This often corresponds to a single codepoint,
  33 // but some codepoints act as modifiers on other codepoints. For example, U+0301
  34 // (COMBINING ACUTE ACCENT) can modify a letter like "e" to form "é". During
  35 // normalization, this frequently resolves down to a single unicode codepoint,
  36 // in this case, "é" resolves to the single codepoint U+00E9 (LATIN SMALL LETTER
  37 // E WITH ACUTE). However, in some cases, multiple codepoints make up a grapheme
  38 // cluster but *don't* normalize to a single codepoint. For example, LATIN SMALL
  39 // LETTER E (U+0065) + COMBINING VERTICAL LINE BELOW (U+0329) combine to form an
  40 // unusual glyph that is not used frequently enough to warrant its own unique
  41 // codepoint (this is basically what Zalgo text is). Emojis also use the ZERO
  42 // WIDTH JOINER (U+200D) to add gender, skin tone, or other modifiers to emojis.
  43 // Tomo takes an opinionated stance that grapheme clusters, not codepoints or
  44 // bytes, are more useful to people when doing text operations like getting the
  45 // "length" of a text or accessing the Nth "letter" of a text. If someone sends
  46 // you a text with WOMAN (U+1F469) + ZERO WIDTH JOINER (U+200D) + ROCKET
  47 // (U+1F680) followed by THUMBS UP (U+1F44D), it will render on your screen as
  48 // two things: a female astronaut and a thumbs up, and this is how most people
  49 // will think about the text. If you wish to operate on the raw codepoints that
  50 // comprise the message, you are free to do so with the `.utf32()`
  51 // method and `Text.from_utf32()`, but this is not the default behavior.
  52 // The behavior for the given example is that `text.length == 2`, `text[1]` is
  53 // the grapheme cluster representing a female astronaut emoji, and `text[2]` is
  54 // the grapheme cluster representing the thumbs up emoji.
  55 
  56 // There are a lot of benefits to storing unicode text with one grapheme
  57 // cluster per index in a densely packed list instead of storing the text as
  58 // variable-width UTF8-encoded bytes. It lets us have one canonical length for
  59 // the text that can be precomputed and is meaningful to users. It lets us
  60 // quickly get the Nth "letter" in the text. Substring slicing is fast.
  61 // However, since not all grapheme clusters take up the same number of
  62 // codepoints, we're faced with the problem of how to jam multiple codepoints
  63 // into a single 32-bit slot. Inspired by Raku and MoarVM's approach, this
  64 // implementation uses "synthetic graphemes" (in Raku's terms, Normal Form
  65 // Graphemes, aka NFG). A synthetic grapheme is a negative 32-bit signed
  66 // integer that represents a multi-codepoint grapheme cluster that has been
  67 // encountered during the program's runtime. These clusters are stored in a
  68 // lookup list and hash map so that we can rapidly convert between the
  69 // synthetic grapheme integer ID and the unicode codepoints associated with it.
  70 // Essentially, it's like we create a supplement to the unicode standard with
  71 // things that would be nice if they had their own codepoint so things worked
  72 // out nicely because we're using them right now, and we'll give them a
  73 // negative number so it doesn't overlap with any real codepoints.
  74 
  75 // Example 1: U+0048, U+00E9 AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E
  76 // WITH ACUTE This would be stored as: (int32_t[]){0x48, 0xE9} Example 2:
  77 // U+0048, U+0065, U+0309 AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E,
  78 // COMBINING VERTICAL LINE BELOW This would be stored as: (int32_t[]){0x48, -2}
  79 // Where -2 is used as a lookup in a list that holds the actual unicode
  80 // codepoints: (ucs4_t[]){0x65, 0x0309}
  81 
  82 // The text datastructure also uses a compact encoding (TEXT_BLOB) to store a
  83 // per-chunk compressed form of the text when long stretches of text contain
  84 // 256 or fewer unique grapheme clusters, which lets the text use a single byte
  85 // for each grapheme cluster along with a lookup table. For typical text
  86 // written in a variety of non-English natural languages (e.g. Spanish, Arabic,
  87 // Japanese, Greek, German, Finnish, Basque), the in-memory representation
  88 // takes up between 50-101% as much space as UTF8 encoding and between 24-39%
  89 // as much space as UTF32 encoding, but with the advantage of extremely fast
  90 // random access for indexing or slicing, unlike UTF8. In other words, this
  91 // representation offers ASCII-like compactness and fast random access for
  92 // non-English languages.
  93 
  94 #include <assert.h>
  95 #include <ctype.h>
  96 #include <fnmatch.h>
  97 #include <gc.h>
  98 #include <stdbool.h>
  99 #include <stdint.h>
 100 #include <stdlib.h>
 101 #include <sys/param.h>
 102 
 103 #include "../unistr-fixed.h"
 104 
 105 #include <unicase.h>
 106 #include <unictype.h>
 107 #include <unigbrk.h>
 108 #include <uniname.h>
 109 #include <unistring/version.h>
 110 #include <uniwidth.h>
 111 
 112 #include "bytes.h"
 113 #include "datatypes.h"
 114 #include "integers.h"
 115 #include "lists.h"
 116 #include "optionals.h"
 117 #include "tables.h"
 118 #include "text.h"
 119 
 120 // Use inline version of the siphash code for performance:
 121 #include "siphash-internals.h"
 122 #include "siphash.h"
 123 
 124 typedef struct {
 125     ucs4_t main_codepoint;
 126     ucs4_t *utf32_cluster; // length-prefixed
 127     const uint8_t *utf8;
 128 } synthetic_grapheme_t;
 129 
 130 // Synthetic grapheme clusters (clusters of more than one codepoint):
 131 static Table_t grapheme_ids_by_codepoints = EMPTY_TABLE; // ucs4_t* length-prefixed codepoints -> int32_t ID
 132 
 133 // This will hold a dynamically growing list of synthetic graphemes:
 134 static synthetic_grapheme_t *synthetic_graphemes = NULL;
 135 static int32_t synthetic_grapheme_capacity = 0;
 136 static int32_t num_synthetic_graphemes = 0;
 137 
 138 #define NUM_GRAPHEME_CODEPOINTS(id) (synthetic_graphemes[-(id) - 1].utf32_cluster[0])
 139 #define GRAPHEME_CODEPOINTS(id) (&synthetic_graphemes[-(id) - 1].utf32_cluster[1])
 140 #define GRAPHEME_UTF8(id) (synthetic_graphemes[-(id) - 1].utf8)
 141 
 142 // Somewhat arbitrarily chosen, if two short literal ASCII or grapheme chunks
 143 // are concatenated below this length threshold, we just merge them into a
 144 // single literal node instead of a concatenation node.
 145 #define SHORT_ASCII_LENGTH 64
 146 #define SHORT_GRAPHEMES_LENGTH 16
 147 
 148 static Text_t simple_concatenation(Text_t a, Text_t b);
 149 
 150 PUREFUNC static bool graphemes_equal(const void *va, const void *vb, const TypeInfo_t *info) {
 151     (void)info;
 152     ucs4_t *a = *(ucs4_t **)va;
 153     ucs4_t *b = *(ucs4_t **)vb;
 154     if (a[0] != b[0]) return false;
 155     for (int i = 0; i < (int)a[0]; i++)
 156         if (a[i] != b[i]) return false;
 157     return true;
 158 }
 159 
 160 PUREFUNC static uint64_t grapheme_hash(const void *g, const TypeInfo_t *info) {
 161     (void)info;
 162     ucs4_t *cluster = *(ucs4_t **)g;
 163     return siphash24((void *)&cluster[1], sizeof(ucs4_t[cluster[0]]));
 164 }
 165 
 166 static const TypeInfo_t GraphemeClusterInfo = {
 167     .size = sizeof(ucs4_t *),
 168     .align = __alignof__(ucs4_t *),
 169     .metamethods =
 170         {
 171             .equal = graphemes_equal,
 172             .hash = grapheme_hash,
 173         },
 174 };
 175 
 176 #ifdef __GNUC__
 177 #pragma GCC diagnostic push
 178 #pragma GCC diagnostic ignored "-Wstack-protector"
 179 #endif
 180 public
 181 int32_t get_synthetic_grapheme(const ucs4_t *codepoints, int64_t utf32_len) {
 182     if (utf32_len == 1) return (int32_t)*codepoints;
 183 
 184     ucs4_t length_prefixed[1 + utf32_len];
 185     length_prefixed[0] = (ucs4_t)utf32_len;
 186     for (int i = 0; i < utf32_len; i++)
 187         length_prefixed[i + 1] = codepoints[i];
 188     ucs4_t *ptr = &length_prefixed[0];
 189 
 190     // Optimization for common case of one frequently used synthetic grapheme:
 191     static int32_t last_grapheme = 0;
 192     if (last_grapheme != 0 && graphemes_equal(&ptr, &synthetic_graphemes[-last_grapheme - 1].utf32_cluster, NULL))
 193         return last_grapheme;
 194 
 195     TypeInfo_t GraphemeIDLookupTableInfo = *Table$info(&GraphemeClusterInfo, &Int32$info);
 196     int32_t *found = Table$get(grapheme_ids_by_codepoints, &ptr, &GraphemeIDLookupTableInfo);
 197     if (found) return *found;
 198 
 199     // New synthetic grapheme:
 200     if (num_synthetic_graphemes >= synthetic_grapheme_capacity) {
 201         // If we don't have space, allocate more:
 202         synthetic_grapheme_capacity = MAX(128, synthetic_grapheme_capacity * 2);
 203         synthetic_grapheme_t *new = GC_MALLOC_ATOMIC(sizeof(synthetic_grapheme_t[synthetic_grapheme_capacity]));
 204         memcpy(new, synthetic_graphemes, sizeof(synthetic_grapheme_t[num_synthetic_graphemes]));
 205         synthetic_graphemes = new;
 206     }
 207 
 208     int32_t grapheme_id = -(num_synthetic_graphemes + 1);
 209     num_synthetic_graphemes += 1;
 210 
 211     // Get UTF8 representation:
 212     uint8_t u8_buf[64];
 213     size_t u8_len = sizeof(u8_buf) / sizeof(u8_buf[0]);
 214     uint8_t *u8 = u32_to_u8(codepoints, (size_t)utf32_len, u8_buf, &u8_len);
 215     if (u8 == NULL) fail("Invalid graphemes encountered!");
 216 
 217     // For performance reasons, use an arena allocator here to ensure that
 218     // synthetic graphemes store all of their information in a densely packed
 219     // area with good cache locality:
 220     static void *arena = NULL, *arena_end = NULL;
 221     if (arena != NULL && (size_t)arena % __alignof__(ucs4_t) != 0)
 222         arena += __alignof__(ucs4_t) - ((size_t)arena % __alignof__(ucs4_t));
 223 
 224     // If we have filled up this arena, allocate a new one:
 225     size_t needed_memory = sizeof(ucs4_t[1 + utf32_len]) + sizeof(uint8_t[u8_len + 1]);
 226     if (arena == NULL || arena + needed_memory > arena_end) {
 227         // Do reasonably big chunks at a time, so most synthetic codepoints are
 228         // nearby each other in memory and cache locality is good. This is a
 229         // rough guess at a good size:
 230         size_t chunk_size = MAX(needed_memory, 512);
 231         arena = GC_MALLOC_ATOMIC(chunk_size);
 232         assert(arena != NULL);
 233         arena_end = arena + chunk_size;
 234     }
 235 
 236     // Copy length-prefixed UTF32 codepoints into the arena and store where they live:
 237     ucs4_t *codepoint_copy = arena;
 238     memcpy(codepoint_copy, length_prefixed, sizeof(ucs4_t[1 + utf32_len]));
 239     synthetic_graphemes[-grapheme_id - 1].utf32_cluster = codepoint_copy;
 240     arena += sizeof(ucs4_t[1 + utf32_len]);
 241 
 242     // Copy UTF8 units into the arena and store where they live:
 243     uint8_t *utf8_final = arena;
 244     memcpy(utf8_final, u8, sizeof(uint8_t[u8_len]));
 245     utf8_final[u8_len] = '\0'; // Add a terminating NUL byte
 246     synthetic_graphemes[-grapheme_id - 1].utf8 = utf8_final;
 247     arena += sizeof(uint8_t[u8_len + 1]);
 248 
 249     // Sickos at the unicode consortium decreed that you can have grapheme clusters
 250     // that begin with *prefix* modifiers, so we gotta check for that case:
 251     synthetic_graphemes[-grapheme_id - 1].main_codepoint = length_prefixed[1];
 252     for (ucs4_t i = 0; i < utf32_len; i++) {
 253 #if _LIBUNISTRING_VERSION >= 0x010200
 254         // libuinstring version 1.2.0 introduced uc_is_property_prepended_concatenation_mark()
 255         // It's not critical, but it's technically more correct to have this check:
 256         if (unlikely(uc_is_property_prepended_concatenation_mark(length_prefixed[1 + i]))) continue;
 257 #endif
 258         synthetic_graphemes[-grapheme_id - 1].main_codepoint = length_prefixed[1 + i];
 259         break;
 260     }
 261 
 262     // Cleanup from unicode API:
 263     if (u8 != u8_buf) free(u8);
 264 
 265     Table$set(&grapheme_ids_by_codepoints, &codepoint_copy, &grapheme_id, &GraphemeIDLookupTableInfo);
 266 
 267     last_grapheme = grapheme_id;
 268     return grapheme_id;
 269 }
 270 #ifdef __GNUC__
 271 #pragma GCC diagnostic pop
 272 #endif
 273 
 274 public
 275 int Text$print(FILE *stream, Text_t t) {
 276     if (t.length == 0) return 0;
 277 
 278     switch (t.tag) {
 279     case TEXT_ASCII: return fwrite(t.ascii, sizeof(char), (size_t)t.length, stream);
 280     case TEXT_GRAPHEMES: {
 281         const int32_t *graphemes = t.graphemes;
 282         int written = 0;
 283         for (int64_t i = 0; i < (int64_t)t.length; i++) {
 284             int32_t grapheme = graphemes[i];
 285             if (grapheme >= 0) {
 286                 uint8_t buf[8];
 287                 size_t len = sizeof(buf);
 288                 uint8_t *u8 = u32_to_u8((ucs4_t *)&grapheme, 1, buf, &len);
 289                 if (u8 == NULL) fail("Invalid grapheme encountered: ", grapheme);
 290                 written += (int)fwrite(u8, sizeof(char), len, stream);
 291                 if (u8 != buf) free(u8);
 292             } else {
 293                 const uint8_t *u8 = GRAPHEME_UTF8(grapheme);
 294                 assert(u8);
 295                 written += (int)fwrite(u8, sizeof(uint8_t), strlen((char *)u8), stream);
 296             }
 297         }
 298         return written;
 299     }
 300     case TEXT_BLOB: {
 301         int written = 0;
 302         for (int64_t i = 0; i < (int64_t)t.length; i++) {
 303             int32_t grapheme = t.blob.map[t.blob.bytes[i]];
 304             if (grapheme >= 0) {
 305                 uint8_t buf[8];
 306                 size_t len = sizeof(buf);
 307                 uint8_t *u8 = u32_to_u8((ucs4_t *)&grapheme, 1, buf, &len);
 308                 if (u8 == NULL) fail("Invalid grapheme encountered: ", grapheme);
 309                 written += (int)fwrite(u8, sizeof(char), len, stream);
 310                 if (u8 != buf) free(u8);
 311             } else {
 312                 const uint8_t *u8 = GRAPHEME_UTF8(grapheme);
 313                 assert(u8);
 314                 written += (int)fwrite(u8, sizeof(uint8_t), strlen((char *)u8), stream);
 315             }
 316         }
 317         return written;
 318     }
 319     case TEXT_CONCAT: {
 320         int written = Text$print(stream, *t.left);
 321         written += Text$print(stream, *t.right);
 322         return written;
 323     }
 324     default: return 0;
 325     }
 326 }
 327 
 328 static const uint64_t min_len_for_depth[MAX_TEXT_DEPTH] = {
 329     // Fibonacci numbers (skipping first two)
 330     1,         2,         3,         5,          8,          13,         21,         34,         55,        89,
 331     144,       233,       377,       610,        987,        1597,       2584,       4181,       6765,      10946,
 332     17711,     28657,     46368,     75025,      121393,     196418,     317811,     514229,     832040,    1346269,
 333     2178309,   3524578,   5702887,   9227465,    14930352,   24157817,   39088169,   63245986,   102334155, 165580141,
 334     267914296, 433494437, 701408733, 1134903170, 1836311903, 2971215073, 4807526976, 7778742049,
 335 };
 336 
 337 #define IS_BALANCED_TEXT(t) ((t).length >= min_len_for_depth[(t).depth])
 338 
 339 static void insert_balanced_recursive(Text_t balanced_texts[MAX_TEXT_DEPTH], Text_t text) {
 340     if (text.tag == TEXT_CONCAT && (!IS_BALANCED_TEXT(text) || text.depth >= MAX_TEXT_DEPTH)) {
 341         insert_balanced_recursive(balanced_texts, *text.left);
 342         insert_balanced_recursive(balanced_texts, *text.right);
 343         return;
 344     }
 345 
 346     int i = 0;
 347     Text_t accumulator = EMPTY_TEXT;
 348     for (; text.length > min_len_for_depth[i + 1]; i++) {
 349         if (balanced_texts[i].length) {
 350             accumulator = simple_concatenation(balanced_texts[i], accumulator);
 351             balanced_texts[i] = EMPTY_TEXT;
 352         }
 353     }
 354 
 355     accumulator = simple_concatenation(accumulator, text);
 356 
 357     while (accumulator.length >= min_len_for_depth[i]) {
 358         if (balanced_texts[i].length) {
 359             accumulator = simple_concatenation(balanced_texts[i], accumulator);
 360             balanced_texts[i] = EMPTY_TEXT;
 361         }
 362         i++;
 363     }
 364     i--;
 365     balanced_texts[i] = accumulator;
 366 }
 367 
 368 static Text_t rebalanced(Text_t a, Text_t b) {
 369     Text_t balanced_texts[MAX_TEXT_DEPTH];
 370     memset(balanced_texts, 0, sizeof(balanced_texts));
 371     insert_balanced_recursive(balanced_texts, a);
 372     insert_balanced_recursive(balanced_texts, b);
 373 
 374     Text_t ret = EMPTY_TEXT;
 375     for (int64_t i = 0; ret.length < a.length + b.length; i++) {
 376         if (balanced_texts[i].length) ret = simple_concatenation(balanced_texts[i], ret);
 377     }
 378     return ret;
 379 }
 380 
 381 Text_t simple_concatenation(Text_t a, Text_t b) {
 382     if (a.length == 0) return b;
 383     if (b.length == 0) return a;
 384 
 385     uint16_t new_depth = 1 + MAX(a.depth, b.depth);
 386     // Rebalance only if depth exceeds the maximum allowed. We don't require
 387     // every concatenation to yield a balanced text, since many concatenations
 388     // are ephemeral (e.g. doing a loop repeatedly concatenating without using
 389     // the intermediary values).
 390     if (new_depth >= MAX_TEXT_DEPTH) return rebalanced(a, b);
 391 
 392     Text_t *children = GC_MALLOC(sizeof(Text_t[2]));
 393     children[0] = a;
 394     children[1] = b;
 395     return (Text_t){
 396         .tag = TEXT_CONCAT,
 397         .length = a.length + b.length,
 398         .depth = new_depth,
 399         .left = &children[0],
 400         .right = &children[1],
 401     };
 402 }
 403 
 404 static Text_t concat2_assuming_safe(Text_t a, Text_t b) {
 405     if (a.length == 0) return b;
 406     if (b.length == 0) return a;
 407 
 408     if (a.tag == TEXT_ASCII && b.tag == TEXT_ASCII && (size_t)(a.length + b.length) <= SHORT_ASCII_LENGTH) {
 409         struct Text_s ret = {
 410             .tag = TEXT_ASCII,
 411             .length = a.length + b.length,
 412         };
 413         ret.ascii = GC_MALLOC_ATOMIC(sizeof(char[ret.length]));
 414         memcpy((char *)ret.ascii, a.ascii, sizeof(char[a.length]));
 415         memcpy((char *)&ret.ascii[a.length], b.ascii, sizeof(char[b.length]));
 416         return ret;
 417     } else if (a.tag == TEXT_GRAPHEMES && b.tag == TEXT_GRAPHEMES
 418                && (size_t)(a.length + b.length) <= SHORT_GRAPHEMES_LENGTH) {
 419         struct Text_s ret = {
 420             .tag = TEXT_GRAPHEMES,
 421             .length = a.length + b.length,
 422         };
 423         ret.graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[ret.length]));
 424         memcpy((int32_t *)ret.graphemes, a.graphemes, sizeof(int32_t[a.length]));
 425         memcpy((int32_t *)&ret.graphemes[a.length], b.graphemes, sizeof(int32_t[b.length]));
 426         return ret;
 427     } else if (a.tag != TEXT_CONCAT && b.tag != TEXT_CONCAT
 428                && (size_t)(a.length + b.length) <= SHORT_GRAPHEMES_LENGTH) {
 429         // Turn a small bit of ASCII into graphemes if it helps make things smaller
 430         // Text structs come with an extra 8 bytes, so allocate enough to hold the text
 431         struct Text_s ret = {
 432             .tag = TEXT_GRAPHEMES,
 433             .length = a.length + b.length,
 434         };
 435         ret.graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[ret.length]));
 436         int32_t *dest = (int32_t *)ret.graphemes;
 437         if (a.tag == TEXT_GRAPHEMES) {
 438             memcpy(dest, a.graphemes, sizeof(int32_t[a.length]));
 439             dest += a.length;
 440         } else if (a.tag == TEXT_ASCII) {
 441             for (int64_t i = 0; i < (int64_t)a.length; i++)
 442                 *(dest++) = (int32_t)a.ascii[i];
 443         } else if (a.tag == TEXT_BLOB) {
 444             for (int64_t i = 0; i < (int64_t)a.length; i++)
 445                 *(dest++) = a.blob.map[a.blob.bytes[i]];
 446         } else {
 447             errx(1, "Unreachable");
 448         }
 449         if (b.tag == TEXT_GRAPHEMES) {
 450             memcpy(dest, b.graphemes, sizeof(int32_t[b.length]));
 451         } else if (b.tag == TEXT_ASCII) {
 452             for (int64_t i = 0; i < (int64_t)b.length; i++)
 453                 *(dest++) = (int32_t)b.ascii[i];
 454         } else if (b.tag == TEXT_BLOB) {
 455             for (int64_t i = 0; i < (int64_t)b.length; i++)
 456                 *(dest++) = b.blob.map[b.blob.bytes[i]];
 457         } else {
 458             errx(1, "Unreachable");
 459         }
 460         return ret;
 461     }
 462 
 463     if (a.tag == TEXT_CONCAT && b.tag != TEXT_CONCAT && a.right->tag != TEXT_CONCAT)
 464         return concat2_assuming_safe(*a.left, concat2_assuming_safe(*a.right, b));
 465 
 466     return simple_concatenation(a, b);
 467 }
 468 
 469 static Text_t concat2(Text_t a, Text_t b) {
 470     if (a.length == 0) return b;
 471     if (b.length == 0) return a;
 472 
 473     int32_t last_a = Text$get_grapheme(a, a.length - 1);
 474     int32_t first_b = Text$get_grapheme(b, 0);
 475 
 476     // Magic number, we know that no codepoints below here trigger instability:
 477     static const int32_t LOWEST_CODEPOINT_TO_CHECK = 0x300; // COMBINING GRAVE ACCENT
 478     if (last_a >= 0 && last_a < LOWEST_CODEPOINT_TO_CHECK && first_b >= 0 && first_b < LOWEST_CODEPOINT_TO_CHECK)
 479         return concat2_assuming_safe(a, b);
 480 
 481     size_t len = (last_a >= 0) ? 1 : NUM_GRAPHEME_CODEPOINTS(last_a);
 482     len += (first_b >= 0) ? 1 : NUM_GRAPHEME_CODEPOINTS(first_b);
 483 
 484     ucs4_t codepoints[len];
 485     ucs4_t *dest = codepoints;
 486     if (last_a < 0) {
 487         memcpy(dest, GRAPHEME_CODEPOINTS(last_a), sizeof(ucs4_t[NUM_GRAPHEME_CODEPOINTS(last_a)]));
 488         dest += NUM_GRAPHEME_CODEPOINTS(last_a);
 489     } else {
 490         *(dest++) = (ucs4_t)last_a;
 491     }
 492 
 493     if (first_b < 0) {
 494         memcpy(dest, GRAPHEME_CODEPOINTS(first_b), sizeof(ucs4_t[NUM_GRAPHEME_CODEPOINTS(first_b)]));
 495     } else {
 496         *dest = (ucs4_t)first_b;
 497     }
 498 
 499     // Do a normalization run for these two codepoints and see if it looks different.
 500     // Normalization should not exceed 3x in the input length (but if it does, it will be
 501     // handled gracefully)
 502     ucs4_t norm_buf[3 * len];
 503     size_t norm_length = sizeof(norm_buf) / sizeof(norm_buf[0]);
 504     ucs4_t *normalized = u32_normalize(UNINORM_NFC, codepoints, len, norm_buf, &norm_length);
 505     bool stable = (norm_length == len && memcmp(codepoints, normalized, sizeof(codepoints)) == 0);
 506 
 507     if (stable) {
 508         const void *second_grapheme = u32_grapheme_next(normalized, &normalized[norm_length]);
 509         if (second_grapheme == &normalized[norm_length]) stable = false;
 510     }
 511 
 512     if likely (stable) {
 513         if (normalized != norm_buf) free(normalized);
 514         return concat2_assuming_safe(a, b);
 515     }
 516 
 517     OptionalText_t glue =
 518         Text$from_utf32((List_t){.data = normalized, .length = (uint64_t)norm_length, .stride = sizeof(ucs4_t)});
 519 
 520     if (normalized != norm_buf) free(normalized);
 521 
 522     if (a.length == 1 && b.length == 1) return glue;
 523     else if (a.length == 1) return concat2_assuming_safe(glue, Text$slice(b, I(2), I(b.length)));
 524     else if (b.length == 1) return concat2_assuming_safe(Text$slice(a, I(1), I(a.length - 1)), glue);
 525     else
 526         return concat2_assuming_safe(concat2_assuming_safe(Text$slice(a, I(1), I(a.length - 1)), glue),
 527                                      Text$slice(b, I(2), I(b.length)));
 528 }
 529 
 530 public
 531 Text_t Text$_concat(int n, Text_t items[n]) {
 532     if (n == 0) return EMPTY_TEXT;
 533 
 534     Text_t ret = items[0];
 535     for (int i = 1; i < n; i++) {
 536         if (items[i].length > 0) ret = concat2(ret, items[i]);
 537     }
 538     return ret;
 539 }
 540 
 541 public
 542 Text_t Text$repeat(Text_t text, Int_t count) {
 543     if (text.length == 0 || Int$is_negative(count)) return EMPTY_TEXT;
 544 
 545     Int_t result_len = Int$times(count, I(text.length));
 546     if (Int$compare_value(result_len, I(1l << 40)) > 0) fail("Text repeating would produce too big of an result!");
 547 
 548     int64_t count64 = Int64$from_int(count, false);
 549     Text_t ret = text;
 550     for (int64_t c = 1; c < count64; c++)
 551         ret = concat2(ret, text);
 552     return ret;
 553 }
 554 
 555 public
 556 Int_t Text$width(Text_t text, Text_t language) {
 557     int width = u8_strwidth((const uint8_t *)Text$as_c_string(text), Text$as_c_string(language));
 558     return Int$from_int32(width);
 559 }
 560 
 561 static Text_t Text$repeat_to_width(Text_t to_repeat, int64_t target_width, Text_t language) {
 562     if (target_width <= 0) return EMPTY_TEXT;
 563 
 564     const char *lang_str = Text$as_c_string(language);
 565     int64_t width = (int64_t)u8_strwidth((const uint8_t *)Text$as_c_string(to_repeat), lang_str);
 566     Text_t repeated = EMPTY_TEXT;
 567     int64_t repeated_width = 0;
 568     while (repeated_width + width <= target_width) {
 569         repeated = concat2(repeated, to_repeat);
 570         repeated_width += width;
 571     }
 572 
 573     if (repeated_width < target_width) {
 574         for (int64_t i = 0; repeated_width < target_width && i < (int64_t)to_repeat.length; i++) {
 575             Text_t c = Text$slice(to_repeat, I_small(i + 1), I_small(i + 1));
 576             int64_t w = (int64_t)u8_strwidth((const uint8_t *)Text$as_c_string(c), lang_str);
 577             if (repeated_width + w > target_width) {
 578                 repeated = concat2(repeated, Text$repeat(Text(" "), I(target_width - repeated_width)));
 579                 break;
 580             }
 581             repeated = concat2(repeated, c);
 582             repeated_width += w;
 583         }
 584     }
 585 
 586     return repeated;
 587 }
 588 
 589 public
 590 Text_t Text$left_pad(Text_t text, Int_t width, Text_t padding, Text_t language) {
 591     if (padding.length == 0) fail("Cannot pad with an empty text!");
 592 
 593     int64_t needed = Int64$from_int(width, false) - Int64$from_int(Text$width(text, language), false);
 594     return concat2(Text$repeat_to_width(padding, needed, language), text);
 595 }
 596 
 597 public
 598 Text_t Text$right_pad(Text_t text, Int_t width, Text_t padding, Text_t language) {
 599     if (padding.length == 0) fail("Cannot pad with an empty text!");
 600 
 601     int64_t needed = Int64$from_int(width, false) - Int64$from_int(Text$width(text, language), false);
 602     return concat2(text, Text$repeat_to_width(padding, needed, language));
 603 }
 604 
 605 public
 606 Text_t Text$middle_pad(Text_t text, Int_t width, Text_t padding, Text_t language) {
 607     if (padding.length == 0) fail("Cannot pad with an empty text!");
 608 
 609     int64_t needed = Int64$from_int(width, false) - Int64$from_int(Text$width(text, language), false);
 610     return Text$concat(Text$repeat_to_width(padding, needed / 2, language), text,
 611                        Text$repeat_to_width(padding, (needed + 1) / 2, language));
 612 }
 613 
 614 public
 615 Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int) {
 616     int64_t first = Int64$from_int(first_int, false);
 617     int64_t last = Int64$from_int(last_int, false);
 618     if (first == 0) fail("Invalid index: 0");
 619     if (last == 0) return EMPTY_TEXT;
 620 
 621     if (first < 0) first = (int64_t)text.length + first + 1;
 622     if (last < 0) last = (int64_t)text.length + last + 1;
 623 
 624     if (last > (int64_t)text.length) last = (int64_t)text.length;
 625 
 626     if (first > (int64_t)text.length || last < first) return EMPTY_TEXT;
 627 
 628     if (first == 1 && last == (int64_t)text.length) return text;
 629 
 630     while (text.tag == TEXT_CONCAT) {
 631         if (last < (int64_t)text.left->length) {
 632             text = *text.left;
 633         } else if (first > (int64_t)text.left->length) {
 634             first -= (int64_t)text.left->length;
 635             last -= (int64_t)text.left->length;
 636             text = *text.right;
 637         } else {
 638             return concat2_assuming_safe(Text$slice(*text.left, I(first), I(text.length)),
 639                                          Text$slice(*text.right, I(1), I(last - text.left->length)));
 640         }
 641     }
 642 
 643     switch (text.tag) {
 644     case TEXT_ASCII: {
 645         return (Text_t){
 646             .tag = TEXT_ASCII,
 647             .length = last - first + 1,
 648             .ascii = text.ascii + (first - 1),
 649         };
 650     }
 651     case TEXT_GRAPHEMES: {
 652         return (Text_t){
 653             .tag = TEXT_GRAPHEMES,
 654             .length = last - first + 1,
 655             .graphemes = text.graphemes + (first - 1),
 656         };
 657     }
 658     case TEXT_BLOB: {
 659         Text_t ret = (Text_t){
 660             .tag = TEXT_BLOB,
 661             .length = last - first + 1,
 662             .blob.map = text.blob.map,
 663             .blob.bytes = text.blob.bytes + (first - 1),
 664         };
 665         return ret;
 666     }
 667     default: errx(1, "Invalid tag");
 668     }
 669     return EMPTY_TEXT;
 670 }
 671 
 672 public
 673 Text_t Text$from(Text_t text, Int_t first) {
 674     return Text$slice(text, first, I_small(-1));
 675 }
 676 
 677 public
 678 Text_t Text$to(Text_t text, Int_t last) {
 679     return Text$slice(text, I_small(1), last);
 680 }
 681 
 682 public
 683 Text_t Text$reversed(Text_t text) {
 684     switch (text.tag) {
 685     case TEXT_ASCII: {
 686         struct Text_s ret = {
 687             .tag = TEXT_ASCII,
 688             .length = text.length,
 689         };
 690         ret.ascii = GC_MALLOC_ATOMIC(sizeof(char[ret.length]));
 691         for (int64_t i = 0; i < (int64_t)text.length; i++)
 692             ((char *)ret.ascii)[text.length - 1 - i] = text.ascii[i];
 693         return ret;
 694     }
 695     case TEXT_GRAPHEMES: {
 696         struct Text_s ret = {
 697             .tag = TEXT_GRAPHEMES,
 698             .length = text.length,
 699         };
 700         ret.graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[ret.length]));
 701         for (int64_t i = 0; i < (int64_t)text.length; i++)
 702             ((int32_t *)ret.graphemes)[text.length - 1 - i] = text.graphemes[i];
 703         return ret;
 704     }
 705     case TEXT_BLOB: {
 706         struct Text_s ret = {
 707             .tag = TEXT_BLOB,
 708             .length = text.length,
 709             .blob.map = text.blob.map,
 710         };
 711         ret.blob.bytes = GC_MALLOC_ATOMIC(sizeof(uint8_t[ret.length]));
 712         for (int64_t i = 0; i < (int64_t)text.length; i++)
 713             ((uint8_t *)ret.blob.bytes)[text.length - 1 - i] = text.graphemes[i];
 714         return ret;
 715     }
 716     case TEXT_CONCAT: {
 717         return concat2_assuming_safe(Text$reversed(*text.right), Text$reversed(*text.left));
 718     }
 719     default: errx(1, "Invalid tag");
 720     }
 721     return EMPTY_TEXT;
 722 }
 723 
 724 public
 725 PUREFUNC OptionalText_t Text$cluster(Text_t text, Int_t index) {
 726     Text_t slice = Text$slice(text, index, index);
 727     return slice.length <= 0 ? NONE_TEXT : slice;
 728 }
 729 
 730 static Text_t Text$from_components(List_t graphemes, Table_t unique_clusters) {
 731     if (graphemes.length == 0) return EMPTY_TEXT;
 732     size_t blob_size = (sizeof(int32_t[unique_clusters.entries.length]) + sizeof(uint8_t[graphemes.length]));
 733     // If blob optimization will save at least 200 bytes:
 734     if (unique_clusters.entries.length <= 256 && blob_size + 200 < sizeof(int32_t[graphemes.length])) {
 735         Text_t ret = {
 736             .tag = TEXT_BLOB,
 737             .length = graphemes.length,
 738             .depth = 0,
 739         };
 740         void *blob = GC_MALLOC_ATOMIC(blob_size);
 741         int32_t *map = blob;
 742         uint8_t *bytes = blob + sizeof(int32_t[unique_clusters.entries.length]);
 743         for (int64_t i = 0; i < (int64_t)unique_clusters.entries.length; i++) {
 744             struct {
 745                 int32_t g;
 746                 uint8_t b;
 747             } *entry = unique_clusters.entries.data + i * unique_clusters.entries.stride;
 748             map[entry->b] = entry->g;
 749         }
 750         for (int64_t i = 0; i < (int64_t)graphemes.length; i++) {
 751             int32_t g = *(int32_t *)(graphemes.data + i * graphemes.stride);
 752             uint8_t *byte = Table$get(unique_clusters, &g, Table$info(&Int32$info, &Byte$info));
 753             assert(byte);
 754             bytes[i] = *byte;
 755         }
 756         ret.blob.map = map;
 757         ret.blob.bytes = bytes;
 758         return ret;
 759     } else {
 760         return (Text_t){
 761             .tag = TEXT_GRAPHEMES,
 762             .length = graphemes.length,
 763             .graphemes = graphemes.data,
 764         };
 765     }
 766 }
 767 
 768 public
 769 OptionalText_t Text$from_strn(const char *str, size_t len) {
 770     int64_t ascii_span = 0;
 771     for (size_t i = 0; i < len && isascii(str[i]); i++) {
 772         ascii_span++;
 773         if (str[i] == 0) return NONE_TEXT;
 774     }
 775 
 776     if (ascii_span == (int64_t)len) { // All ASCII
 777         char *copy = GC_MALLOC_ATOMIC(len);
 778         memcpy(copy, str, len);
 779         return (Text_t){
 780             .tag = TEXT_ASCII,
 781             .length = ascii_span,
 782             .ascii = copy,
 783         };
 784     }
 785     if (u8_check((uint8_t *)str, len) != NULL) return NONE_TEXT;
 786 
 787     List_t graphemes = EMPTY_LIST;
 788     Table_t unique_clusters = EMPTY_TABLE;
 789     const uint8_t *pos = (const uint8_t *)str;
 790     const uint8_t *end = (const uint8_t *)&str[len];
 791     // Iterate over grapheme clusters
 792     for (const uint8_t *next; (next = u8_grapheme_next(pos, end)); pos = next) {
 793         uint32_t buf[256];
 794         size_t u32_len = sizeof(buf) / sizeof(buf[0]);
 795         uint32_t *u32s = u8_to_u32(pos, (size_t)(next - pos), buf, &u32_len);
 796         if (u32s == NULL) return NONE_TEXT;
 797 
 798         uint32_t buf2[256];
 799         size_t u32_normlen = sizeof(buf2) / sizeof(buf2[0]);
 800         uint32_t *u32s_normalized = u32_normalize(UNINORM_NFC, u32s, u32_len, buf2, &u32_normlen);
 801         if (u32s_normalized == NULL) return NONE_TEXT;
 802 
 803         int32_t g = get_synthetic_grapheme(u32s_normalized, (int64_t)u32_normlen);
 804         if (g == 0) return NONE_TEXT;
 805         List$insert(&graphemes, &g, I(0), sizeof(int32_t));
 806         Table$get_or_setdefault(&unique_clusters, int32_t, uint8_t, g, (uint8_t)unique_clusters.entries.length,
 807                                 Table$info(&Int32$info, &Byte$info));
 808 
 809         if (u32s != buf) free(u32s);
 810         if (u32s_normalized != buf2) free(u32s_normalized);
 811 
 812         if (unique_clusters.entries.length >= 256) {
 813             return concat2_assuming_safe(Text$from_components(graphemes, unique_clusters),
 814                                          Text$from_strn((const char *)next, (size_t)(end - next)));
 815         }
 816     }
 817 
 818     return Text$from_components(graphemes, unique_clusters);
 819 }
 820 
 821 public
 822 OptionalText_t Text$from_str(const char *str) {
 823     return str ? Text$from_strn(str, strlen(str)) : Text("");
 824 }
 825 
 826 static void u8_buf_append(Text_t text, Byte_t **buf, int64_t *capacity, int64_t *i) {
 827     switch (text.tag) {
 828     case TEXT_ASCII: {
 829         if (*i + (int64_t)text.length > (int64_t)*capacity) {
 830             *capacity = *i + (int64_t)text.length + 1;
 831             *buf = GC_REALLOC(*buf, sizeof(Byte_t[*capacity]));
 832         }
 833 
 834         const char *bytes = text.ascii;
 835         memcpy(*buf + *i, bytes, (size_t)text.length);
 836         *i += text.length;
 837         break;
 838     }
 839     case TEXT_GRAPHEMES: {
 840         const int32_t *graphemes = text.graphemes;
 841         for (int64_t g = 0; g < (int64_t)text.length; g++) {
 842             if (graphemes[g] >= 0) {
 843                 uint8_t u8_buf[64];
 844                 size_t u8_len = sizeof(u8_buf);
 845                 uint8_t *u8 = u32_to_u8((ucs4_t *)&graphemes[g], 1, u8_buf, &u8_len);
 846                 if (u8 == NULL) fail("Invalid grapheme encountered: ", graphemes[g]);
 847 
 848                 if (*i + (int64_t)u8_len > (int64_t)*capacity) {
 849                     *capacity = *i + (int64_t)u8_len + 1;
 850                     *buf = GC_REALLOC(*buf, sizeof(Byte_t[*capacity]));
 851                 }
 852 
 853                 memcpy(*buf + *i, u8, u8_len);
 854                 *i += (int64_t)u8_len;
 855                 if (u8 != u8_buf) free(u8);
 856             } else {
 857                 const uint8_t *u8 = GRAPHEME_UTF8(graphemes[g]);
 858                 size_t u8_len = u8_strlen(u8);
 859                 if (*i + (int64_t)u8_len > (int64_t)*capacity) {
 860                     *capacity = *i + (int64_t)u8_len + 1;
 861                     *buf = GC_REALLOC(*buf, sizeof(Byte_t[*capacity]));
 862                 }
 863 
 864                 memcpy(*buf + *i, u8, u8_len);
 865                 *i += (int64_t)u8_len;
 866             }
 867         }
 868         break;
 869     }
 870     case TEXT_BLOB: {
 871         for (int64_t g = 0; g < (int64_t)text.length; g++) {
 872             int32_t grapheme = text.blob.map[text.blob.bytes[g]];
 873             if (grapheme >= 0) {
 874                 uint8_t u8_buf[64];
 875                 size_t u8_len = sizeof(u8_buf);
 876                 uint8_t *u8 = u32_to_u8((ucs4_t *)&grapheme, 1, u8_buf, &u8_len);
 877                 if (u8 == NULL) fail("Invalid grapheme encountered: ", grapheme);
 878 
 879                 if (*i + (int64_t)u8_len > (int64_t)*capacity) {
 880                     *capacity = *i + (int64_t)u8_len + 1;
 881                     *buf = GC_REALLOC(*buf, sizeof(Byte_t[*capacity]));
 882                 }
 883 
 884                 memcpy(*buf + *i, u8, u8_len);
 885                 *i += (int64_t)u8_len;
 886                 if (u8 != u8_buf) free(u8);
 887             } else {
 888                 const uint8_t *u8 = GRAPHEME_UTF8(grapheme);
 889                 size_t u8_len = u8_strlen(u8);
 890                 if (*i + (int64_t)u8_len > (int64_t)*capacity) {
 891                     *capacity = *i + (int64_t)u8_len + 1;
 892                     *buf = GC_REALLOC(*buf, sizeof(Byte_t[*capacity]));
 893                 }
 894 
 895                 memcpy(*buf + *i, u8, u8_len);
 896                 *i += (int64_t)u8_len;
 897             }
 898         }
 899         break;
 900     }
 901     case TEXT_CONCAT: {
 902         u8_buf_append(*text.left, buf, capacity, i);
 903         u8_buf_append(*text.right, buf, capacity, i);
 904         break;
 905     }
 906     default: break;
 907     }
 908 }
 909 
 910 public
 911 const char *Text$as_c_string(Text_t text) {
 912     if (text.tag == TEXT_ASCII && text.ascii[text.length] == '\0') return text.ascii;
 913     int64_t capacity = text.length + 1;
 914     char *buf = GC_MALLOC_ATOMIC((size_t)capacity);
 915     int64_t i = 0;
 916     u8_buf_append(text, (Byte_t **)&buf, &capacity, &i);
 917 
 918     if (i + 1 > (int64_t)capacity) {
 919         capacity = i + 1;
 920         buf = GC_REALLOC(buf, (size_t)capacity);
 921     }
 922     buf[i] = '\0';
 923     return buf;
 924 }
 925 
 926 PUREFUNC public uint64_t Text$hash(const void *obj, const TypeInfo_t *info) {
 927     (void)info;
 928     Text_t text = *(Text_t *)obj;
 929     siphash sh;
 930     siphashinit(&sh, sizeof(int32_t[text.length]));
 931 
 932     union {
 933         int32_t chunks[2];
 934         uint64_t whole;
 935     } tmp;
 936     switch (text.tag) {
 937     case TEXT_ASCII: {
 938         const char *bytes = text.ascii;
 939         for (int64_t i = 0; i + 1 < (int64_t)text.length; i += 2) {
 940             tmp.chunks[0] = (int32_t)bytes[i];
 941             tmp.chunks[1] = (int32_t)bytes[i + 1];
 942             siphashadd64bits(&sh, tmp.whole);
 943         }
 944         int32_t last = text.length & 0x1 ? (int32_t)bytes[text.length - 1] : 0; // Odd number of graphemes
 945         return siphashfinish_last_part(&sh, (uint64_t)last);
 946     }
 947     case TEXT_GRAPHEMES: {
 948         const int32_t *graphemes = text.graphemes;
 949         for (int64_t i = 0; i + 1 < (int64_t)text.length; i += 2) {
 950             tmp.chunks[0] = graphemes[i];
 951             tmp.chunks[1] = graphemes[i + 1];
 952             siphashadd64bits(&sh, tmp.whole);
 953         }
 954         int32_t last = text.length & 0x1 ? graphemes[text.length - 1] : 0; // Odd number of graphemes
 955         return siphashfinish_last_part(&sh, (uint64_t)last);
 956     }
 957     case TEXT_BLOB: {
 958         for (int64_t i = 0; i + 1 < (int64_t)text.length; i += 2) {
 959             tmp.chunks[0] = text.blob.map[text.blob.bytes[i]];
 960             tmp.chunks[1] = text.blob.map[text.blob.bytes[i + 1]];
 961             siphashadd64bits(&sh, tmp.whole);
 962         }
 963         int32_t last =
 964             text.length & 0x1 ? text.blob.map[text.blob.bytes[text.length - 1]] : 0; // Odd number of graphemes
 965         return siphashfinish_last_part(&sh, (uint64_t)last);
 966     }
 967     case TEXT_CONCAT: {
 968         TextIter_t state = NEW_TEXT_ITER_STATE(text);
 969         for (int64_t i = 0; i + 1 < (int64_t)text.length; i += 2) {
 970             tmp.chunks[0] = Text$get_grapheme_fast(&state, i);
 971             tmp.chunks[1] = Text$get_grapheme_fast(&state, i + 1);
 972             siphashadd64bits(&sh, tmp.whole);
 973         }
 974 
 975         int32_t last = (text.length & 0x1) ? Text$get_grapheme_fast(&state, text.length - 1) : 0;
 976         return siphashfinish_last_part(&sh, (uint64_t)last);
 977     }
 978     default: errx(1, "Invalid text");
 979     }
 980     return 0;
 981 }
 982 
 983 public
 984 int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index) {
 985     if (index < 0) return 0;
 986     if (index >= (int64_t)state->stack[0].text.length) return 0;
 987 
 988     assert(state->stack[0].text.depth <= MAX_TEXT_DEPTH);
 989 
 990     // Go up the stack as needed:
 991     while (index < state->stack[state->stack_index].offset
 992            || index
 993                   >= state->stack[state->stack_index].offset + (int64_t)state->stack[state->stack_index].text.length) {
 994         state->stack_index -= 1;
 995         assert(state->stack_index >= 0);
 996     }
 997 
 998     assert(state->stack_index >= 0 && state->stack_index <= MAX_TEXT_DEPTH);
 999 
1000     // Go down the stack as needed:
1001     while (state->stack[state->stack_index].text.tag == TEXT_CONCAT) {
1002         Text_t text = state->stack[state->stack_index].text;
1003         int64_t offset = state->stack[state->stack_index].offset;
1004         assert(state->stack_index <= MAX_TEXT_DEPTH);
1005         assert(index >= offset);
1006         assert(index < offset + (int64_t)text.length);
1007 
1008         state->stack_index += 1;
1009         if (index < offset + (int64_t)text.left->length) {
1010             state->stack[state->stack_index].text = *text.left;
1011             state->stack[state->stack_index].offset = offset;
1012         } else {
1013             state->stack[state->stack_index].text = *text.right;
1014             state->stack[state->stack_index].offset = offset + text.left->length;
1015         }
1016         assert(state->stack_index >= 0 && state->stack_index <= MAX_TEXT_DEPTH);
1017     }
1018 
1019     Text_t text = state->stack[state->stack_index].text;
1020     int64_t offset = state->stack[state->stack_index].offset;
1021 
1022     if (index < offset || index >= offset + (int64_t)text.length) {
1023         return 0;
1024     }
1025 
1026     switch (text.tag) {
1027     case TEXT_ASCII: return (int32_t)text.ascii[index - offset];
1028     case TEXT_GRAPHEMES: return text.graphemes[index - offset];
1029     case TEXT_BLOB: return text.blob.map[text.blob.bytes[index - offset]];
1030     default: errx(1, "Invalid text");
1031     }
1032     return 0;
1033 }
1034 
1035 public
1036 uint32_t Text$get_main_grapheme_fast(TextIter_t *state, int64_t index) {
1037     int32_t g = Text$get_grapheme_fast(state, index);
1038     return (g) >= 0 ? (ucs4_t)(g) : synthetic_graphemes[-(g)-1].main_codepoint;
1039 }
1040 
1041 PUREFUNC public int32_t Text$compare(const void *va, const void *vb, const TypeInfo_t *info) {
1042     (void)info;
1043     if (va == vb) return 0;
1044     const Text_t a = *(const Text_t *)va;
1045     const Text_t b = *(const Text_t *)vb;
1046 
1047     // TODO: make this smarter and more efficient
1048     int64_t len = MAX(a.length, b.length);
1049     TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
1050     for (int64_t i = 0; i < len; i++) {
1051         int32_t ai = Text$get_grapheme_fast(&a_state, i);
1052         int32_t bi = Text$get_grapheme_fast(&b_state, i);
1053         if (ai == bi) continue;
1054         int32_t cmp;
1055         if (ai > 0 && bi > 0) {
1056             cmp = u32_cmp((ucs4_t *)&ai, (ucs4_t *)&bi, 1);
1057         } else if (ai > 0) {
1058             cmp = u32_cmp2((ucs4_t *)&ai, 1, GRAPHEME_CODEPOINTS(bi), NUM_GRAPHEME_CODEPOINTS(bi));
1059         } else if (bi > 0) {
1060             cmp = u32_cmp2(GRAPHEME_CODEPOINTS(ai), NUM_GRAPHEME_CODEPOINTS(ai), (ucs4_t *)&bi, 1);
1061         } else {
1062             cmp = u32_cmp2(GRAPHEME_CODEPOINTS(ai), NUM_GRAPHEME_CODEPOINTS(ai), GRAPHEME_CODEPOINTS(bi),
1063                            NUM_GRAPHEME_CODEPOINTS(bi));
1064         }
1065         if (cmp != 0) return cmp;
1066     }
1067     return 0;
1068 }
1069 
1070 bool _matches(TextIter_t *text_state, TextIter_t *target_state, int64_t pos) {
1071     for (int64_t i = 0; i < (int64_t)target_state->stack[0].text.length; i++) {
1072         int32_t text_i = Text$get_grapheme_fast(text_state, pos + i);
1073         int32_t target_i = Text$get_grapheme_fast(target_state, i);
1074         if (text_i != target_i) return false;
1075     }
1076     return true;
1077 }
1078 
1079 PUREFUNC public bool Text$starts_with(Text_t text, Text_t prefix, Text_t *remainder) {
1080     if (text.length < prefix.length) return false;
1081     TextIter_t text_state = NEW_TEXT_ITER_STATE(text), prefix_state = NEW_TEXT_ITER_STATE(prefix);
1082     if (_matches(&text_state, &prefix_state, 0)) {
1083         if (remainder) *remainder = Text$from(text, Int$from_int64(prefix.length + 1));
1084         return true;
1085     } else {
1086         if (remainder) *remainder = text;
1087         return false;
1088     }
1089 }
1090 
1091 PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix, Text_t *remainder) {
1092     if (text.length < suffix.length) return false;
1093     TextIter_t text_state = NEW_TEXT_ITER_STATE(text), suffix_state = NEW_TEXT_ITER_STATE(suffix);
1094     if (_matches(&text_state, &suffix_state, text.length - suffix.length)) {
1095         if (remainder) *remainder = Text$to(text, Int$from_int64(text.length - suffix.length));
1096         return true;
1097     } else {
1098         if (remainder) *remainder = text;
1099         return false;
1100     }
1101 }
1102 
1103 public
1104 Text_t Text$without_prefix(Text_t text, Text_t prefix) {
1105     return Text$starts_with(text, prefix, NULL) ? Text$slice(text, I(prefix.length + 1), I(text.length)) : text;
1106 }
1107 
1108 public
1109 Text_t Text$without_suffix(Text_t text, Text_t suffix) {
1110     return Text$ends_with(text, suffix, NULL) ? Text$slice(text, I(1), I(text.length - suffix.length)) : text;
1111 }
1112 
1113 static bool _has_grapheme(TextIter_t *text, int32_t g) {
1114     for (int64_t t = 0; t < (int64_t)text->stack[0].text.length; t++) {
1115         if (g == Text$get_grapheme_fast(text, t)) {
1116             return true;
1117         }
1118     }
1119     return false;
1120 }
1121 
1122 public
1123 OptionalInt_t Text$find(Text_t text, Text_t target, Int_t start) {
1124     if (text.length < target.length) return NONE_INT;
1125     if (target.length <= 0) return I(1);
1126     TextIter_t text_state = NEW_TEXT_ITER_STATE(text), target_state = NEW_TEXT_ITER_STATE(target);
1127     for (int64_t i = Int64$from_int(start, false) - 1; i < (int64_t)text.length - (int64_t)target.length + 1; i++) {
1128         if (_matches(&text_state, &target_state, i)) {
1129             return Int$from_int64(i + 1);
1130         }
1131     }
1132     return NONE_INT;
1133 }
1134 
1135 public
1136 bool Text$matches_glob(Text_t text, Text_t glob) {
1137     return !fnmatch(Text$as_c_string(glob), Text$as_c_string(text), 0);
1138 }
1139 
1140 public
1141 Text_t Text$trim(Text_t text, Text_t to_trim, bool left, bool right) {
1142     int64_t first = 0;
1143     TextIter_t text_state = NEW_TEXT_ITER_STATE(text), trim_state = NEW_TEXT_ITER_STATE(to_trim);
1144     if (left) {
1145         while (first < (int64_t)text.length && _has_grapheme(&trim_state, Text$get_grapheme_fast(&text_state, first))) {
1146             first += 1;
1147         }
1148     }
1149     int64_t last = text.length - 1;
1150     if (right) {
1151         while (last >= first && _has_grapheme(&trim_state, Text$get_grapheme_fast(&text_state, last))) {
1152             last -= 1;
1153         }
1154     }
1155     return (first != 0 || last != text.length - 1) ? Text$slice(text, I(first + 1), I(last + 1)) : text;
1156 }
1157 
1158 public
1159 Text_t Text$translate(Text_t text, Table_t translations) {
1160     TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
1161     Text_t result = EMPTY_TEXT;
1162     int64_t span_start = 0;
1163     List_t replacement_list = translations.entries;
1164     for (int64_t i = 0; i < (int64_t)text.length;) {
1165         for (int64_t r = 0; r < (int64_t)replacement_list.length; r++) {
1166             struct {
1167                 Text_t target, replacement;
1168             } *entry = replacement_list.data + r * replacement_list.stride;
1169             if (entry->target.length <= 0) continue;
1170             TextIter_t target_state = NEW_TEXT_ITER_STATE(entry->target);
1171             if (_matches(&text_state, &target_state, i)) {
1172                 if (i > span_start) result = concat2(result, Text$slice(text, I(span_start + 1), I(i)));
1173 
1174                 result = concat2(result, entry->replacement);
1175                 i += entry->target.length;
1176                 span_start = i;
1177                 goto found_match;
1178             }
1179         }
1180         i += 1;
1181     found_match:
1182         continue;
1183     }
1184     if (span_start < (int64_t)text.length)
1185         result = concat2(result, Text$slice(text, I(span_start + 1), I((int64_t)text.length)));
1186     return result;
1187 }
1188 
1189 public
1190 Text_t Text$replace(Text_t text, Text_t target, Text_t replacement) {
1191     if (target.length <= 0) return text;
1192     TextIter_t text_state = NEW_TEXT_ITER_STATE(text), target_state = NEW_TEXT_ITER_STATE(target);
1193     Text_t result = EMPTY_TEXT;
1194     int64_t span_start = 0;
1195     for (int64_t i = 0; i < (int64_t)text.length;) {
1196         if (_matches(&text_state, &target_state, i)) {
1197             if (i > span_start) result = concat2(result, Text$slice(text, I(span_start + 1), I(i)));
1198 
1199             result = concat2(result, replacement);
1200             i += target.length;
1201             span_start = i;
1202         } else {
1203             i += 1;
1204         }
1205     }
1206     if (span_start < (int64_t)text.length)
1207         result = concat2(result, Text$slice(text, I(span_start + 1), I((int64_t)text.length)));
1208     return result;
1209 }
1210 
1211 public
1212 PUREFUNC bool Text$has(Text_t text, Text_t target) {
1213     TextIter_t text_state = NEW_TEXT_ITER_STATE(text), target_state = NEW_TEXT_ITER_STATE(target);
1214     for (int64_t i = 0; i < (int64_t)text.length; i++) {
1215         if (_matches(&text_state, &target_state, i)) return true;
1216     }
1217     return false;
1218 }
1219 
1220 public
1221 List_t Text$split(Text_t text, Text_t delimiters) {
1222     if (delimiters.length == 0) return Text$clusters(text);
1223 
1224     TextIter_t text_state = NEW_TEXT_ITER_STATE(text), delim_state = NEW_TEXT_ITER_STATE(delimiters);
1225     List_t splits = EMPTY_LIST;
1226     for (int64_t i = 0; i < (int64_t)text.length;) {
1227         int64_t span_len = 0;
1228         while (i + span_len < (int64_t)text.length && !_matches(&text_state, &delim_state, i + span_len)) {
1229             span_len += 1;
1230         }
1231         Text_t slice = Text$slice(text, I(i + 1), I(i + span_len));
1232         List$insert(&splits, &slice, I(0), sizeof(slice));
1233         i += span_len + delimiters.length;
1234         if (i == text.length) {
1235             Text_t empty = Text("");
1236             List$insert(&splits, &empty, I(0), sizeof(empty));
1237         }
1238     }
1239     return splits;
1240 }
1241 
1242 public
1243 List_t Text$split_any(Text_t text, Text_t delimiters) {
1244     if (delimiters.length == 0) return List(text);
1245 
1246     TextIter_t text_state = NEW_TEXT_ITER_STATE(text), delim_state = NEW_TEXT_ITER_STATE(delimiters);
1247     List_t splits = EMPTY_LIST;
1248     for (int64_t i = 0; i < (int64_t)text.length;) {
1249         int64_t span_len = 0;
1250         while (i + span_len < (int64_t)text.length
1251                && !_has_grapheme(&delim_state, Text$get_grapheme_fast(&text_state, i + span_len))) {
1252             span_len += 1;
1253         }
1254         bool trailing_delim = i + span_len < (int64_t)text.length;
1255         Text_t slice = Text$slice(text, I(i + 1), I(i + span_len));
1256         List$insert(&splits, &slice, I(0), sizeof(slice));
1257         i += span_len + 1;
1258         while (i < (int64_t)text.length && _has_grapheme(&delim_state, Text$get_grapheme_fast(&text_state, i))) {
1259             i += 1;
1260         }
1261         if (i >= (int64_t)text.length && trailing_delim) {
1262             Text_t empty = Text("");
1263             List$insert(&splits, &empty, I(0), sizeof(empty));
1264         }
1265     }
1266     return splits;
1267 }
1268 
1269 typedef struct {
1270     TextIter_t state;
1271     int64_t i;
1272     Text_t delimiter;
1273 } split_iter_state_t;
1274 
1275 static OptionalText_t next_split(split_iter_state_t *state) {
1276     Text_t text = state->state.stack[0].text;
1277     if (state->i >= (int64_t)text.length) {
1278         if (state->delimiter.length > 0 && state->i == text.length) { // special case
1279             state->i = text.length + 1;
1280             return EMPTY_TEXT;
1281         }
1282         return NONE_TEXT;
1283     }
1284 
1285     if (state->delimiter.length == 0) { // special case
1286         state->i = text.length + 1;
1287         return text;
1288     }
1289 
1290     TextIter_t delim_state = NEW_TEXT_ITER_STATE(state->delimiter);
1291     int64_t i = state->i;
1292     int64_t span_len = 0;
1293     while (i + span_len < (int64_t)text.length && !_matches(&state->state, &delim_state, i + span_len)) {
1294         span_len += 1;
1295     }
1296     Text_t slice = Text$slice(text, I(i + 1), I(i + span_len));
1297     state->i = i + span_len + state->delimiter.length;
1298     return slice;
1299 }
1300 
1301 public
1302 Closure_t Text$by_split(Text_t text, Text_t delimiter) {
1303     return (Closure_t){
1304         .fn = (void *)next_split,
1305         .userdata = new (split_iter_state_t, .state = NEW_TEXT_ITER_STATE(text), .i = 0, .delimiter = delimiter),
1306     };
1307 }
1308 
1309 static OptionalText_t next_split_any(split_iter_state_t *state) {
1310     Text_t text = state->state.stack[0].text;
1311     if (state->i >= (int64_t)text.length) {
1312         if (state->delimiter.length > 0 && state->i == text.length) { // special case
1313             state->i = text.length + 1;
1314             return EMPTY_TEXT;
1315         }
1316         return NONE_TEXT;
1317     }
1318 
1319     if (state->delimiter.length == 0) { // special case
1320         Text_t ret = Text$cluster(text, I(state->i + 1));
1321         state->i += 1;
1322         return ret;
1323     }
1324 
1325     TextIter_t delim_state = NEW_TEXT_ITER_STATE(state->delimiter);
1326     int64_t i = state->i;
1327     int64_t span_len = 0;
1328     while (i + span_len < (int64_t)text.length
1329            && !_has_grapheme(&delim_state, Text$get_grapheme_fast(&state->state, i + span_len))) {
1330         span_len += 1;
1331     }
1332     Text_t slice = Text$slice(text, I(i + 1), I(i + span_len));
1333     i += span_len + 1;
1334     while (i < (int64_t)text.length && _has_grapheme(&delim_state, Text$get_grapheme_fast(&state->state, i))) {
1335         i += 1;
1336     }
1337     state->i = i;
1338     return slice;
1339 }
1340 
1341 public
1342 Closure_t Text$by_split_any(Text_t text, Text_t delimiters) {
1343     return (Closure_t){
1344         .fn = (void *)next_split_any,
1345         .userdata = new (split_iter_state_t, .state = NEW_TEXT_ITER_STATE(text), .i = 0, .delimiter = delimiters),
1346     };
1347 }
1348 
1349 PUREFUNC public bool Text$equal_values(Text_t a, Text_t b) {
1350     if (a.length != b.length) return false;
1351     int64_t len = a.length;
1352     TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
1353     // TODO: make this smarter and more efficient
1354     for (int64_t i = 0; i < len; i++) {
1355         int32_t ai = Text$get_grapheme_fast(&a_state, i);
1356         int32_t bi = Text$get_grapheme_fast(&b_state, i);
1357         if (ai != bi) return false;
1358     }
1359     return true;
1360 }
1361 
1362 PUREFUNC public bool Text$equal(const void *a, const void *b, const TypeInfo_t *info) {
1363     (void)info;
1364     if (a == b) return true;
1365     return Text$equal_values(*(Text_t *)a, *(Text_t *)b);
1366 }
1367 
1368 PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b, Text_t language) {
1369     if (a.length != b.length) return false;
1370     int64_t len = a.length;
1371     TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
1372     const char *uc_language = Text$as_c_string(language);
1373     for (int64_t i = 0; i < len; i++) {
1374         int32_t ai = Text$get_grapheme_fast(&a_state, i);
1375         int32_t bi = Text$get_grapheme_fast(&b_state, i);
1376         if (ai != bi) {
1377             const ucs4_t *a_codepoints = ai >= 0 ? (ucs4_t *)&ai : GRAPHEME_CODEPOINTS(ai);
1378             int64_t a_len = ai >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(ai);
1379 
1380             const ucs4_t *b_codepoints = bi >= 0 ? (ucs4_t *)&bi : GRAPHEME_CODEPOINTS(bi);
1381             int64_t b_len = bi >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(bi);
1382 
1383             int cmp = 0;
1384             (void)u32_casecmp(a_codepoints, (size_t)a_len, b_codepoints, (size_t)b_len, uc_language, UNINORM_NFC, &cmp);
1385             if (cmp != 0) return false;
1386         }
1387     }
1388     return true;
1389 }
1390 
1391 public
1392 Text_t Text$upper(Text_t text, Text_t language) {
1393     if (text.length == 0) return text;
1394     List_t codepoints = Text$utf32(text);
1395     if (codepoints.stride != sizeof(int32_t)) List$compact(&codepoints, sizeof(int32_t));
1396     const char *uc_language = Text$as_c_string(language);
1397     size_t out_len = 256;
1398     uint32_t buf[out_len];
1399     ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
1400     OptionalText_t ret =
1401         Text$from_utf32((List_t){.data = upper, .length = (uint64_t)out_len, .stride = sizeof(int32_t)});
1402     if (upper != buf) free(upper);
1403     return ret;
1404 }
1405 
1406 public
1407 Text_t Text$lower(Text_t text, Text_t language) {
1408     if (text.length == 0) return text;
1409     List_t codepoints = Text$utf32(text);
1410     if (codepoints.stride != sizeof(int32_t)) List$compact(&codepoints, sizeof(int32_t));
1411     const char *uc_language = Text$as_c_string(language);
1412     size_t out_len = 256;
1413     uint32_t buf[out_len];
1414     ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
1415     OptionalText_t ret =
1416         Text$from_utf32((List_t){.data = lower, .length = (uint64_t)out_len, .stride = sizeof(int32_t)});
1417     if (lower != buf) free(lower);
1418     return ret;
1419 }
1420 
1421 public
1422 Text_t Text$title(Text_t text, Text_t language) {
1423     if (text.length == 0) return text;
1424     List_t codepoints = Text$utf32(text);
1425     if (codepoints.stride != sizeof(int32_t)) List$compact(&codepoints, sizeof(int32_t));
1426     const char *uc_language = Text$as_c_string(language);
1427     size_t out_len = 256;
1428     uint32_t buf[out_len];
1429     ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
1430     OptionalText_t ret =
1431         Text$from_utf32((List_t){.data = title, .length = (uint64_t)out_len, .stride = sizeof(int32_t)});
1432     if (title != buf) free(title);
1433     return ret;
1434 }
1435 
1436 public
1437 double Text$distance(Text_t a, Text_t b, Text_t language) {
1438     if (a.length <= 0) return (double)b.length;
1439     if (b.length <= 0) return (double)a.length;
1440 
1441     // The current implementation of text distance uses a modified form
1442     // of Damerau–Levenshtein distance that gives slightly lower distances
1443     // for letters with the same main grapheme and slightly lower distances
1444     // for letters that are the same, but with different casing.
1445     double *distances = GC_MALLOC_ATOMIC(sizeof(uint32_t) * (a.length + 1) * (b.length + 1));
1446 #define DIST(x, y) distances[(x) * b.length + (y)]
1447     for (int64_t i = 0; i <= (int64_t)a.length; i++)
1448         DIST(i, 0) = i;
1449     for (int64_t j = 0; j <= (int64_t)b.length; j++)
1450         DIST(0, j) = j;
1451 
1452     TextIter_t a_state = NEW_TEXT_ITER_STATE(a);
1453     TextIter_t b_state = NEW_TEXT_ITER_STATE(b);
1454     const char *uc_language = Text$as_c_string(language);
1455     for (int64_t i = 1; i <= (int64_t)a.length; i++) {
1456         for (int64_t j = 1; j <= (int64_t)b.length; j++) {
1457             int32_t ai = Text$get_grapheme_fast(&a_state, i - 1);
1458             int32_t bj = Text$get_grapheme_fast(&b_state, j - 1);
1459             double cost = (double)(ai != bj);
1460             if (cost > 0) {
1461                 ucs4_t main_ai = (ai) >= 0 ? (ucs4_t)ai : synthetic_graphemes[-(ai)-1].main_codepoint;
1462                 ucs4_t main_bj = (bj) >= 0 ? (ucs4_t)bj : synthetic_graphemes[-(bj)-1].main_codepoint;
1463                 if (main_ai == main_bj) {
1464                     // Same main grapheme (different modifiers)
1465                     cost = 0.25;
1466                 } else {
1467                     int cmp;
1468                     (void)u32_casecmp(&main_ai, 1, &main_bj, 1, uc_language, UNINORM_NFC, &cmp);
1469                     if (cmp == 0) cost = 0.5;
1470                 }
1471             }
1472 
1473             double insertion = 1. + DIST(i - 1, j);
1474             double deletion = 1. + DIST(i, j - 1);
1475             double dist = MIN(insertion, deletion);
1476             double substitution = cost + DIST(i - 1, j - 1);
1477             dist = MIN(dist, substitution);
1478             // Check for transposition:
1479             if (i >= 2 && j >= 2 && cost != 0) {
1480                 int32_t ai_prev = Text$get_grapheme_fast(&a_state, i - 2);
1481                 int32_t bi_prev = Text$get_grapheme_fast(&b_state, i - 2);
1482                 if (ai == bi_prev && bj == ai_prev) {
1483                     double transposition = cost + DIST(i - 2, j - 2);
1484                     dist = MIN(dist, transposition);
1485                 }
1486             }
1487             DIST(i, j) = dist;
1488         }
1489     }
1490 #undef DIST
1491 
1492     return (double)distances[a.length * b.length + b.length];
1493 }
1494 
1495 public
1496 Text_t Text$escaped(Text_t text, bool colorize, Text_t extra_escapes) {
1497     Text_t ret = colorize ? Text("\x1b[35m") : EMPTY_TEXT;
1498 #define flush_unquoted()                                                                                               \
1499     ({                                                                                                                 \
1500         if (unquoted_span > 0) {                                                                                       \
1501             ret = concat2_assuming_safe(ret, Text$slice(text, I(i - unquoted_span + 1), I(i)));                        \
1502             unquoted_span = 0;                                                                                         \
1503         }                                                                                                              \
1504     })
1505 #define add_escaped(str)                                                                                               \
1506     ({                                                                                                                 \
1507         flush_unquoted();                                                                                              \
1508         if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m"));                                            \
1509         ret = concat2_assuming_safe(ret, Text("\\" str));                                                              \
1510         if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[0;35m"));                                            \
1511     })
1512     TextIter_t state = NEW_TEXT_ITER_STATE(text);
1513     int64_t unquoted_span = 0;
1514     int64_t i = 0;
1515     for (i = 0; i < (int64_t)text.length; i++) {
1516         int32_t g = Text$get_grapheme_fast(&state, i);
1517         switch (g) {
1518         case '\a': add_escaped("a"); break;
1519         case '\b': add_escaped("b"); break;
1520         case '\x1b': add_escaped("e"); break;
1521         case '\f': add_escaped("f"); break;
1522         case '\n': add_escaped("n"); break;
1523         case '\r': add_escaped("r"); break;
1524         case '\t': add_escaped("t"); break;
1525         case '\v': add_escaped("v"); break;
1526         case '\\': {
1527             add_escaped("\\");
1528             break;
1529         }
1530         case '$': {
1531             add_escaped("$");
1532             break;
1533         }
1534         case '\x00' ... '\x06':
1535         case '\x0E' ... '\x1A':
1536         case '\x1C' ... '\x1F':
1537         case '\x7F' ... '\x7F': {
1538             flush_unquoted();
1539             if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m"));
1540             ret = concat2_assuming_safe(ret, Text("\\x"));
1541             char tmp[3] = {
1542                 (g / 16) > 9 ? 'a' + (g / 16) - 10 : '0' + (g / 16),
1543                 (g & 15) > 9 ? 'a' + (g & 15) - 10 : '0' + (g & 15),
1544                 '\0',
1545             };
1546             ret = concat2_assuming_safe(ret, Text$from_strn(tmp, 2));
1547             if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[0;35m"));
1548             break;
1549         }
1550         default: {
1551             TextIter_t esc_state = NEW_TEXT_ITER_STATE(extra_escapes);
1552             for (int64_t j = 0; j < (int64_t)extra_escapes.length; j++) {
1553                 int32_t esc = Text$get_grapheme_fast(&esc_state, j);
1554                 if (g == esc) {
1555                     flush_unquoted();
1556                     if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m"));
1557                     ret = concat2_assuming_safe(ret, Text("\\"));
1558                     if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[0;35m"));
1559                     break;
1560                 }
1561             }
1562             unquoted_span += 1;
1563             break;
1564         }
1565         }
1566     }
1567     flush_unquoted();
1568 #undef add_escaped
1569 #undef flush_unquoted
1570     if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[m"));
1571     return ret;
1572 }
1573 
1574 public
1575 Text_t Text$quoted(Text_t text, bool colorize, Text_t quotation_mark) {
1576     if (quotation_mark.length != 1) fail("Invalid quote text: ", quotation_mark, " (must have length == 1)");
1577 
1578     Text_t ret = Text$escaped(text, colorize, quotation_mark);
1579     if (!(Text$equal_values(quotation_mark, Text("\"")) || Text$equal_values(quotation_mark, Text("'"))
1580           || Text$equal_values(quotation_mark, Text("`"))))
1581         ret = Text$concat(Text("$"), quotation_mark, ret, quotation_mark);
1582     else ret = Text$concat(quotation_mark, ret, quotation_mark);
1583     return ret;
1584 }
1585 
1586 public
1587 Text_t Text$as_text(const void *vtext, bool colorize, const TypeInfo_t *info) {
1588     (void)info;
1589     if (!vtext) return info && info->TextInfo.lang ? Text$from_str(info->TextInfo.lang) : Text("Text");
1590 
1591     Text_t text = *(Text_t *)vtext;
1592     // Figure out the best quotation mark to use:
1593     bool has_double_quote = false, has_backtick = false, has_single_quote = false, needs_escapes = false;
1594     TextIter_t state = NEW_TEXT_ITER_STATE(text);
1595     for (int64_t i = 0; i < (int64_t)text.length; i++) {
1596         int32_t g = Text$get_grapheme_fast(&state, i);
1597         if (g == '"') {
1598             has_double_quote = true;
1599         } else if (g == '`') {
1600             has_backtick = true;
1601         } else if (g == (g & 0x7F) && (g == '\'' || g == '\n' || g == '\r' || g == '\t' || !isprint((char)g))) {
1602             needs_escapes = true;
1603         }
1604     }
1605 
1606     Text_t quote;
1607     // If there's double quotes in the string, it would be nice to avoid
1608     // needing to escape them by using single quotes, but only if we don't have
1609     // single quotes or need to escape anything else (because single quotes
1610     // don't have interpolation):
1611     if (has_double_quote && !has_single_quote) quote = Text("'");
1612     // If there is a double quote, but no backtick, we can save a bit of
1613     // escaping by using backtick instead of double quote:
1614     else if (has_double_quote && has_single_quote && !has_backtick && !needs_escapes) quote = Text("`");
1615     // Otherwise fall back to double quotes as the default quoting style:
1616     else quote = Text("\"");
1617 
1618     Text_t as_text = Text$quoted(text, colorize, quote);
1619     if (info && info->TextInfo.lang && info != &Text$info)
1620         as_text = Text$concat(colorize ? Text("\x1b[1m$") : Text("$"), Text$from_str(info->TextInfo.lang),
1621                               colorize ? Text("\x1b[0m") : Text(""), as_text);
1622     return as_text;
1623 }
1624 
1625 public
1626 Text_t Text$join(Text_t glue, List_t pieces) {
1627     if (pieces.length == 0) return EMPTY_TEXT;
1628 
1629     Text_t result = *(Text_t *)pieces.data;
1630     for (int64_t i = 1; i < (int64_t)pieces.length; i++) {
1631         result = Text$concat(result, glue, *(Text_t *)(pieces.data + i * pieces.stride));
1632     }
1633     return result;
1634 }
1635 
1636 public
1637 List_t Text$clusters(Text_t text) {
1638     List_t clusters = EMPTY_LIST;
1639     for (int64_t i = 1; i <= (int64_t)text.length; i++) {
1640         Text_t cluster = Text$slice(text, I(i), I(i));
1641         List$insert(&clusters, &cluster, I_small(0), sizeof(Text_t));
1642     }
1643     return clusters;
1644 }
1645 
1646 public
1647 List_t Text$utf8(Text_t text) {
1648     if (text.length == 0) return EMPTY_ATOMIC_LIST;
1649     int64_t capacity = (int64_t)text.length;
1650     Byte_t *buf = GC_MALLOC_ATOMIC(sizeof(Byte_t[capacity]));
1651     int64_t i = 0;
1652     u8_buf_append(text, &buf, &capacity, &i);
1653     return (List_t){
1654         .data = buf, .length = (uint64_t)i, .stride = 1, .atomic = 1, .free = MIN(LIST_MAX_FREE_ENTRIES, capacity - i)};
1655 }
1656 
1657 public
1658 List_t Text$utf16(Text_t text) {
1659     if (text.length == 0) return EMPTY_ATOMIC_LIST;
1660     List_t utf32 = Text$utf32(text);
1661     if (utf32.length == 0) return EMPTY_ATOMIC_LIST;
1662     List_t utf16 = {.free = MIN(LIST_MAX_FREE_ENTRIES, (uint64_t)utf32.length), .atomic = 1};
1663     utf16.data = GC_MALLOC_ATOMIC(sizeof(int32_t[utf16.free]));
1664     for (int64_t i = 0; i < (int64_t)utf32.length; i++) {
1665         uint16_t u16_buf[4];
1666         size_t u16_len = sizeof(u16_buf) / sizeof(u16_buf[0]);
1667         uint16_t *chunk_u16 = u32_to_u16(utf32.data + utf32.stride * i, 1, u16_buf, &u16_len);
1668         if (chunk_u16 == NULL) fail("Invalid codepoints encountered!");
1669         List$insert_all(&utf16, (List_t){.data = chunk_u16, .stride = sizeof(uint16_t), .length = (uint64_t)u16_len},
1670                         I(0), sizeof(uint16_t));
1671         if (chunk_u16 != u16_buf) free(chunk_u16);
1672     }
1673     return utf16;
1674 }
1675 
1676 public
1677 List_t Text$utf32(Text_t text) {
1678     if (text.length == 0) return EMPTY_ATOMIC_LIST;
1679     List_t codepoints = EMPTY_ATOMIC_LIST;
1680     TextIter_t state = NEW_TEXT_ITER_STATE(text);
1681     for (int64_t i = 0; i < (int64_t)text.length; i++) {
1682         int32_t grapheme = Text$get_grapheme_fast(&state, i);
1683         if (grapheme < 0) {
1684             for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) {
1685                 ucs4_t subg = GRAPHEME_CODEPOINTS(grapheme)[c];
1686                 List$insert(&codepoints, &subg, I_small(0), sizeof(ucs4_t));
1687             }
1688         } else {
1689             List$insert(&codepoints, &grapheme, I_small(0), sizeof(ucs4_t));
1690         }
1691     }
1692     return codepoints;
1693 }
1694 
1695 static INLINE const char *codepoint_name(ucs4_t c) {
1696     char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
1697     char *found_name = unicode_character_name(c, name);
1698     if (found_name) return found_name;
1699     const uc_block_t *block = uc_block(c);
1700     if (!block) return "???";
1701     assert(block);
1702     return String(block->name, "-", hex(c, .no_prefix = true, .uppercase = true));
1703 }
1704 
1705 public
1706 List_t Text$codepoint_names(Text_t text) {
1707     List_t names = EMPTY_LIST;
1708     TextIter_t state = NEW_TEXT_ITER_STATE(text);
1709     for (int64_t i = 0; i < (int64_t)text.length; i++) {
1710         int32_t grapheme = Text$get_grapheme_fast(&state, i);
1711         if (grapheme < 0) {
1712             for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) {
1713                 const char *name = codepoint_name(GRAPHEME_CODEPOINTS(grapheme)[c]);
1714                 Text_t name_text = Text$from_str(name);
1715                 List$insert(&names, &name_text, I_small(0), sizeof(Text_t));
1716             }
1717         } else {
1718             const char *name = codepoint_name((ucs4_t)grapheme);
1719             Text_t name_text = Text$from_str(name);
1720             List$insert(&names, &name_text, I_small(0), sizeof(Text_t));
1721         }
1722     }
1723     return names;
1724 }
1725 
1726 public
1727 OptionalText_t Text$from_utf8(List_t units) {
1728     if (units.length == 0) return EMPTY_TEXT;
1729     if (units.stride != sizeof(uint8_t)) List$compact(&units, sizeof(uint8_t));
1730     return Text$from_strn(units.data, (size_t)units.length);
1731 }
1732 
1733 public
1734 OptionalText_t Text$from_utf16(List_t units) {
1735     if (units.length == 0) return EMPTY_TEXT;
1736     if (units.stride != sizeof(int16_t)) List$compact(&units, sizeof(int16_t));
1737 
1738     size_t length = 256;
1739     uint8_t buf[length];
1740     uint8_t *u8 = u16_to_u8(units.data, (size_t)units.length, buf, &length);
1741     Text_t ret =
1742         Text$from_utf8((List_t){.data = u8, .length = (uint64_t)length, .stride = sizeof(uint8_t), .atomic = 1});
1743     if (u8 != buf) free(u8);
1744     return ret;
1745 }
1746 
1747 public
1748 OptionalText_t Text$from_utf32(List_t codepoints) {
1749     if (codepoints.length == 0) return EMPTY_TEXT;
1750     if (codepoints.stride != sizeof(uint32_t)) List$compact(&codepoints, sizeof(uint32_t));
1751 
1752     List_t graphemes = EMPTY_ATOMIC_LIST;
1753     Table_t unique_clusters = EMPTY_TABLE;
1754     const uint32_t *pos = (const uint32_t *)codepoints.data;
1755     const uint32_t *end = (const uint32_t *)&pos[codepoints.length];
1756     // Iterate over grapheme clusters
1757     for (const uint32_t *next; (next = u32_grapheme_next(pos, end)); pos = next) {
1758         // Buffer for normalized cluster:
1759         uint32_t buf[256];
1760         size_t u32_normlen = sizeof(buf) / sizeof(buf[0]);
1761         if (u32_check(pos, (size_t)(next - pos)) != NULL) return NONE_TEXT;
1762         uint32_t *u32s_normalized = u32_normalize(UNINORM_NFC, pos, (size_t)(next - pos), buf, &u32_normlen);
1763 
1764         int32_t g = get_synthetic_grapheme(u32s_normalized, (int64_t)u32_normlen);
1765         List$insert(&graphemes, &g, I(0), sizeof(int32_t));
1766         Table$get_or_setdefault(&unique_clusters, int32_t, uint8_t, g, (uint8_t)unique_clusters.entries.length,
1767                                 Table$info(&Int32$info, &Byte$info));
1768 
1769         if (u32s_normalized != buf) free(u32s_normalized);
1770 
1771         if (unique_clusters.entries.length == 256) {
1772             List_t remaining_codepoints = {
1773                 .length = (uint64_t)(end - next),
1774                 .data = (void *)next,
1775                 .stride = sizeof(int32_t),
1776             };
1777             OptionalText_t remainder = Text$from_utf32(remaining_codepoints);
1778             if (remainder.tag == TEXT_NONE) return NONE_TEXT;
1779             return concat2_assuming_safe(Text$from_components(graphemes, unique_clusters), remainder);
1780         }
1781     }
1782     return Text$from_components(graphemes, unique_clusters);
1783 }
1784 
1785 public
1786 OptionalText_t Text$from_codepoint_names(List_t codepoint_names) {
1787     List_t codepoints = EMPTY_LIST;
1788     for (int64_t i = 0; i < (int64_t)codepoint_names.length; i++) {
1789         Text_t *name = ((Text_t *)(codepoint_names.data + i * codepoint_names.stride));
1790         const char *name_str = Text$as_c_string(*name);
1791         ucs4_t codepoint = unicode_name_character(name_str);
1792         if (codepoint == UNINAME_INVALID) return NONE_TEXT;
1793         List$insert(&codepoints, &codepoint, I_small(0), sizeof(ucs4_t));
1794     }
1795     return Text$from_utf32(codepoints);
1796 }
1797 
1798 public
1799 List_t Text$lines(Text_t text) {
1800     List_t lines = EMPTY_LIST;
1801     TextIter_t state = NEW_TEXT_ITER_STATE(text);
1802     for (int64_t i = 0, line_start = 0; i < (int64_t)text.length; i++) {
1803         int32_t grapheme = Text$get_grapheme_fast(&state, i);
1804         if (grapheme == '\r' && Text$get_grapheme_fast(&state, i + 1) == '\n') { // CRLF
1805             Text_t line = Text$slice(text, I(line_start + 1), I(i));
1806             List$insert(&lines, &line, I_small(0), sizeof(Text_t));
1807             i += 1; // skip one extra for CR
1808             line_start = i + 1;
1809         } else if (grapheme == '\n') { // newline
1810             Text_t line = Text$slice(text, I(line_start + 1), I(i));
1811             List$insert(&lines, &line, I_small(0), sizeof(Text_t));
1812             line_start = i + 1;
1813         } else if (i == text.length - 1 && line_start != i) { // last line
1814             Text_t line = Text$slice(text, I(line_start + 1), I(i + 1));
1815             List$insert(&lines, &line, I_small(0), sizeof(Text_t));
1816         }
1817     }
1818     return lines;
1819 }
1820 
1821 typedef struct {
1822     TextIter_t state;
1823     int64_t i;
1824 } line_iter_state_t;
1825 
1826 static OptionalText_t next_line(line_iter_state_t *state) {
1827     Text_t text = state->state.stack[0].text;
1828     for (int64_t i = state->i; i < (int64_t)text.length; i++) {
1829         int32_t grapheme = Text$get_grapheme_fast(&state->state, i);
1830         if (grapheme == '\r' && Text$get_grapheme_fast(&state->state, i + 1) == '\n') { // CRLF
1831             Text_t line = Text$slice(text, I(state->i + 1), I(i));
1832             state->i = i + 2; // skip one extra for CR
1833             return line;
1834         } else if (grapheme == '\n') { // newline
1835             Text_t line = Text$slice(text, I(state->i + 1), I(i));
1836             state->i = i + 1;
1837             return line;
1838         } else if (i == text.length - 1 && state->i != i) { // last line
1839             Text_t line = Text$slice(text, I(state->i + 1), I(i + 1));
1840             state->i = i + 1;
1841             return line;
1842         }
1843     }
1844     return NONE_TEXT;
1845 }
1846 
1847 public
1848 Closure_t Text$by_line(Text_t text) {
1849     return (Closure_t){
1850         .fn = (void *)next_line,
1851         .userdata = new (line_iter_state_t, .state = NEW_TEXT_ITER_STATE(text), .i = 0),
1852     };
1853 }
1854 
1855 PUREFUNC public bool Text$is_none(const void *t, const TypeInfo_t *info) {
1856     (void)info;
1857     return ((Text_t *)t)->tag == TEXT_NONE;
1858 }
1859 
1860 public
1861 Int_t Text$memory_size(Text_t text) {
1862     switch (text.tag) {
1863     case TEXT_ASCII: return Int$from_int64((int64_t)sizeof(Text_t) + (int64_t)sizeof(char[text.length]));
1864     case TEXT_GRAPHEMES: return Int$from_int64((int64_t)sizeof(Text_t) + (int64_t)sizeof(int32_t[text.length]));
1865     case TEXT_BLOB:
1866         return Int$from_int64((int64_t)sizeof(Text_t) + (int64_t)((void *)text.blob.bytes - (void *)text.blob.map)
1867                               + (int64_t)sizeof(uint8_t[text.length]));
1868     case TEXT_CONCAT:
1869         return Int$plus(Int$from_int64((int64_t)sizeof(Text_t)),
1870                         Int$plus(Text$memory_size(*text.left), Text$memory_size(*text.right)));
1871     default: errx(1, "Invalid text tag: %d", text.tag);
1872     }
1873 }
1874 
1875 public
1876 Text_t Text$layout(Text_t text) {
1877     switch (text.tag) {
1878     case TEXT_ASCII: return Text$concat(Text("ASCII("), Int64$value_as_text(text.length), Text(")"));
1879     case TEXT_GRAPHEMES: return Text$concat(Text("Graphemes("), Int64$value_as_text(text.length), Text(")"));
1880     case TEXT_BLOB: return Text$concat(Text("Blob("), Int64$value_as_text(text.length), Text(")"));
1881     case TEXT_CONCAT:
1882         return Text$concat(Text("Concat("), Text$layout(*text.left), Text(", "), Text$layout(*text.right), Text(")"));
1883     default: errx(1, "Invalid text tag: %d", text.tag);
1884     }
1885 }
1886 
1887 public
1888 void Text$serialize(const void *obj, FILE *out, Table_t *pointers, const TypeInfo_t *info) {
1889     (void)info;
1890     const char *str = Text$as_c_string(*(Text_t *)obj);
1891     int64_t len = (int64_t)strlen(str);
1892     Int64$serialize(&len, out, pointers, &Int64$info);
1893     fwrite(str, sizeof(char), (size_t)len, out);
1894 }
1895 
1896 public
1897 void Text$deserialize(FILE *in, void *out, List_t *pointers, const TypeInfo_t *info) {
1898     (void)info;
1899     int64_t len = 0;
1900     Int64$deserialize(in, &len, pointers, &Int64$info);
1901     if (len < 0) fail("Cannot deserialize text with a negative length!");
1902     char *buf = GC_MALLOC_ATOMIC((size_t)len + 1);
1903     if (fread(buf, sizeof(char), (size_t)len, in) != (size_t)len) fail("Not enough data in stream to deserialize");
1904     buf[len + 1] = '\0';
1905     *(Text_t *)out = Text$from_strn(buf, (size_t)len);
1906 }
1907 
1908 public
1909 const TypeInfo_t Text$info = {
1910     .size = sizeof(Text_t),
1911     .align = __alignof__(Text_t),
1912     .tag = TextInfo,
1913     .TextInfo = {.lang = "Text"},
1914     .metamethods = Text$metamethods,
1915 };