1 // This file defines type info and methods for the Text datatype, which uses
2 // libunistr for Unicode support and implements a datastructure based on a
3 // hybrid of Raku/MoarVM's space-efficient grapheme cluster representation of
4 // strings, combined with a mostly-balanced tree datastructure based on Cords
5 // (Boehm et al), which have good runtime performance for text constructed by a
6 // series of many concatenations. In practice, this means Tomo's Text has an
7 // extremely compact memory footprint (typically as compact as UTF8 or
8 // up to 2x better for some languages), with extremely fast operations
9 // including concatenation, random indexing, and taking substrings.
11 // For more information on MoarVM's grapheme cluster strings, see:
12 // https://docs.raku.org/language/unicode
13 // https://github.com/MoarVM/MoarVM/blob/main/docs/strings.asciidoc
14 // For more information on Cords, see the paper "Ropes: an Alternative to
15 // Strings" (Boehm, Atkinson, Plass 1995):
16 // https://www.cs.tufts.edu/comp/150FP/archive/hans-boehm/ropes.pdf
18 // Tomo's Text datatype represents Unicode text that is fully normalized using
19 // normalization form C (NFC). This means that all text created from source code
20 // or read in at runtime will respect normalization during comparison and other
21 // operations, and the original (potentially non-canonical) representation of
22 // text is not preserved. This also means that byte sequences that do not
23 // represent valid unicode text cannot be interpreted as the Text datatype. For
24 // example, a file with malformed UTF8 sequences cannot be read as Text.
26 // A note on grapheme clusters: In Unicode, the fundamental unit is the
27 // "codepoint", which represents things like letters, symbols, emojis,
28 // combiners, and modifiers that alter other codepoints. However, most people
29 // have an intuitive notion of what a "letter" is that corresponds to the
30 // concept formally known as a grapheme cluster. A grapheme cluster is roughly
31 // speaking the amount of text that your cursor moves over when you press the
32 // left or right arrow key once. This often corresponds to a single codepoint,
33 // but some codepoints act as modifiers on other codepoints. For example, U+0301
34 // (COMBINING ACUTE ACCENT) can modify a letter like "e" to form "é". During
35 // normalization, this frequently resolves down to a single unicode codepoint,
36 // in this case, "é" resolves to the single codepoint U+00E9 (LATIN SMALL LETTER
37 // E WITH ACUTE). However, in some cases, multiple codepoints make up a grapheme
38 // cluster but *don't* normalize to a single codepoint. For example, LATIN SMALL
39 // LETTER E (U+0065) + COMBINING VERTICAL LINE BELOW (U+0329) combine to form an
40 // unusual glyph that is not used frequently enough to warrant its own unique
41 // codepoint (this is basically what Zalgo text is). Emojis also use the ZERO
42 // WIDTH JOINER (U+200D) to add gender, skin tone, or other modifiers to emojis.
43 // Tomo takes an opinionated stance that grapheme clusters, not codepoints or
44 // bytes, are more useful to people when doing text operations like getting the
45 // "length" of a text or accessing the Nth "letter" of a text. If someone sends
46 // you a text with WOMAN (U+1F469) + ZERO WIDTH JOINER (U+200D) + ROCKET
47 // (U+1F680) followed by THUMBS UP (U+1F44D), it will render on your screen as
48 // two things: a female astronaut and a thumbs up, and this is how most people
49 // will think about the text. If you wish to operate on the raw codepoints that
50 // comprise the message, you are free to do so with the `.utf32()`
51 // method and `Text.from_utf32()`, but this is not the default behavior.
52 // The behavior for the given example is that `text.length == 2`, `text[1]` is
53 // the grapheme cluster representing a female astronaut emoji, and `text[2]` is
54 // the grapheme cluster representing the thumbs up emoji.
56 // There are a lot of benefits to storing unicode text with one grapheme
57 // cluster per index in a densely packed list instead of storing the text as
58 // variable-width UTF8-encoded bytes. It lets us have one canonical length for
59 // the text that can be precomputed and is meaningful to users. It lets us
60 // quickly get the Nth "letter" in the text. Substring slicing is fast.
61 // However, since not all grapheme clusters take up the same number of
62 // codepoints, we're faced with the problem of how to jam multiple codepoints
63 // into a single 32-bit slot. Inspired by Raku and MoarVM's approach, this
64 // implementation uses "synthetic graphemes" (in Raku's terms, Normal Form
65 // Graphemes, aka NFG). A synthetic grapheme is a negative 32-bit signed
66 // integer that represents a multi-codepoint grapheme cluster that has been
67 // encountered during the program's runtime. These clusters are stored in a
68 // lookup list and hash map so that we can rapidly convert between the
69 // synthetic grapheme integer ID and the unicode codepoints associated with it.
70 // Essentially, it's like we create a supplement to the unicode standard with
71 // things that would be nice if they had their own codepoint so things worked
72 // out nicely because we're using them right now, and we'll give them a
73 // negative number so it doesn't overlap with any real codepoints.
75 // Example 1: U+0048, U+00E9 AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E
76 // WITH ACUTE This would be stored as: (int32_t[]){0x48, 0xE9} Example 2:
77 // U+0048, U+0065, U+0309 AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E,
78 // COMBINING VERTICAL LINE BELOW This would be stored as: (int32_t[]){0x48, -2}
79 // Where -2 is used as a lookup in a list that holds the actual unicode
80 // codepoints: (ucs4_t[]){0x65, 0x0309}
82 // The text datastructure also uses a compact encoding (TEXT_BLOB) to store a
83 // per-chunk compressed form of the text when long stretches of text contain
84 // 256 or fewer unique grapheme clusters, which lets the text use a single byte
85 // for each grapheme cluster along with a lookup table. For typical text
86 // written in a variety of non-English natural languages (e.g. Spanish, Arabic,
87 // Japanese, Greek, German, Finnish, Basque), the in-memory representation
88 // takes up between 50-101% as much space as UTF8 encoding and between 24-39%
89 // as much space as UTF32 encoding, but with the advantage of extremely fast
90 // random access for indexing or slicing, unlike UTF8. In other words, this
91 // representation offers ASCII-like compactness and fast random access for
92 // non-English languages.
101 #include <sys/param.h>
103 #include "../unistr-fixed.h"
106 #include <unictype.h>
109 #include <unistring/version.h>
110 #include <uniwidth.h>
113 #include "datatypes.h"
114 #include "integers.h"
116 #include "optionals.h"
120 // Use inline version of the siphash code for performance:
121 #include "siphash-internals.h"
125 ucs4_t main_codepoint;
126 ucs4_t *utf32_cluster; // length-prefixed
128 } synthetic_grapheme_t;
130 // Synthetic grapheme clusters (clusters of more than one codepoint):
131 static Table_t grapheme_ids_by_codepoints = EMPTY_TABLE; // ucs4_t* length-prefixed codepoints -> int32_t ID
133 // This will hold a dynamically growing list of synthetic graphemes:
134 static synthetic_grapheme_t *synthetic_graphemes = NULL;
135 static int32_t synthetic_grapheme_capacity = 0;
136 static int32_t num_synthetic_graphemes = 0;
138 #define NUM_GRAPHEME_CODEPOINTS(id) (synthetic_graphemes[-(id) - 1].utf32_cluster[0])
139 #define GRAPHEME_CODEPOINTS(id) (&synthetic_graphemes[-(id) - 1].utf32_cluster[1])
140 #define GRAPHEME_UTF8(id) (synthetic_graphemes[-(id) - 1].utf8)
142 // Somewhat arbitrarily chosen, if two short literal ASCII or grapheme chunks
143 // are concatenated below this length threshold, we just merge them into a
144 // single literal node instead of a concatenation node.
145 #define SHORT_ASCII_LENGTH 64
146 #define SHORT_GRAPHEMES_LENGTH 16
148 static Text_t simple_concatenation(Text_t a, Text_t b);
150 PUREFUNC static bool graphemes_equal(const void *va, const void *vb, const TypeInfo_t *info) {
152 ucs4_t *a = *(ucs4_t **)va;
153 ucs4_t *b = *(ucs4_t **)vb;
154 if (a[0] != b[0]) return false;
155 for (int i = 0; i < (int)a[0]; i++)
156 if (a[i] != b[i]) return false;
160 PUREFUNC static uint64_t grapheme_hash(const void *g, const TypeInfo_t *info) {
162 ucs4_t *cluster = *(ucs4_t **)g;
163 return siphash24((void *)&cluster[1], sizeof(ucs4_t[cluster[0]]));
166 static const TypeInfo_t GraphemeClusterInfo = {
167 .size = sizeof(ucs4_t *),
168 .align = __alignof__(ucs4_t *),
171 .equal = graphemes_equal,
172 .hash = grapheme_hash,
177 #pragma GCC diagnostic push
178 #pragma GCC diagnostic ignored "-Wstack-protector"
181 int32_t get_synthetic_grapheme(const ucs4_t *codepoints, int64_t utf32_len) {
182 if (utf32_len == 1) return (int32_t)*codepoints;
184 ucs4_t length_prefixed[1 + utf32_len];
185 length_prefixed[0] = (ucs4_t)utf32_len;
186 for (int i = 0; i < utf32_len; i++)
187 length_prefixed[i + 1] = codepoints[i];
188 ucs4_t *ptr = &length_prefixed[0];
190 // Optimization for common case of one frequently used synthetic grapheme:
191 static int32_t last_grapheme = 0;
192 if (last_grapheme != 0 && graphemes_equal(&ptr, &synthetic_graphemes[-last_grapheme - 1].utf32_cluster, NULL))
193 return last_grapheme;
195 TypeInfo_t GraphemeIDLookupTableInfo = *Table$info(&GraphemeClusterInfo, &Int32$info);
196 int32_t *found = Table$get(grapheme_ids_by_codepoints, &ptr, &GraphemeIDLookupTableInfo);
197 if (found) return *found;
199 // New synthetic grapheme:
200 if (num_synthetic_graphemes >= synthetic_grapheme_capacity) {
201 // If we don't have space, allocate more:
202 synthetic_grapheme_capacity = MAX(128, synthetic_grapheme_capacity * 2);
203 synthetic_grapheme_t *new = GC_MALLOC_ATOMIC(sizeof(synthetic_grapheme_t[synthetic_grapheme_capacity]));
204 memcpy(new, synthetic_graphemes, sizeof(synthetic_grapheme_t[num_synthetic_graphemes]));
205 synthetic_graphemes = new;
208 int32_t grapheme_id = -(num_synthetic_graphemes + 1);
209 num_synthetic_graphemes += 1;
211 // Get UTF8 representation:
213 size_t u8_len = sizeof(u8_buf) / sizeof(u8_buf[0]);
214 uint8_t *u8 = u32_to_u8(codepoints, (size_t)utf32_len, u8_buf, &u8_len);
215 if (u8 == NULL) fail("Invalid graphemes encountered!");
217 // For performance reasons, use an arena allocator here to ensure that
218 // synthetic graphemes store all of their information in a densely packed
219 // area with good cache locality:
220 static void *arena = NULL, *arena_end = NULL;
221 if (arena != NULL && (size_t)arena % __alignof__(ucs4_t) != 0)
222 arena += __alignof__(ucs4_t) - ((size_t)arena % __alignof__(ucs4_t));
224 // If we have filled up this arena, allocate a new one:
225 size_t needed_memory = sizeof(ucs4_t[1 + utf32_len]) + sizeof(uint8_t[u8_len + 1]);
226 if (arena == NULL || arena + needed_memory > arena_end) {
227 // Do reasonably big chunks at a time, so most synthetic codepoints are
228 // nearby each other in memory and cache locality is good. This is a
229 // rough guess at a good size:
230 size_t chunk_size = MAX(needed_memory, 512);
231 arena = GC_MALLOC_ATOMIC(chunk_size);
232 assert(arena != NULL);
233 arena_end = arena + chunk_size;
236 // Copy length-prefixed UTF32 codepoints into the arena and store where they live:
237 ucs4_t *codepoint_copy = arena;
238 memcpy(codepoint_copy, length_prefixed, sizeof(ucs4_t[1 + utf32_len]));
239 synthetic_graphemes[-grapheme_id - 1].utf32_cluster = codepoint_copy;
240 arena += sizeof(ucs4_t[1 + utf32_len]);
242 // Copy UTF8 units into the arena and store where they live:
243 uint8_t *utf8_final = arena;
244 memcpy(utf8_final, u8, sizeof(uint8_t[u8_len]));
245 utf8_final[u8_len] = '\0'; // Add a terminating NUL byte
246 synthetic_graphemes[-grapheme_id - 1].utf8 = utf8_final;
247 arena += sizeof(uint8_t[u8_len + 1]);
249 // Sickos at the unicode consortium decreed that you can have grapheme clusters
250 // that begin with *prefix* modifiers, so we gotta check for that case:
251 synthetic_graphemes[-grapheme_id - 1].main_codepoint = length_prefixed[1];
252 for (ucs4_t i = 0; i < utf32_len; i++) {
253 #if _LIBUNISTRING_VERSION >= 0x010200
254 // libuinstring version 1.2.0 introduced uc_is_property_prepended_concatenation_mark()
255 // It's not critical, but it's technically more correct to have this check:
256 if (unlikely(uc_is_property_prepended_concatenation_mark(length_prefixed[1 + i]))) continue;
258 synthetic_graphemes[-grapheme_id - 1].main_codepoint = length_prefixed[1 + i];
262 // Cleanup from unicode API:
263 if (u8 != u8_buf) free(u8);
265 Table$set(&grapheme_ids_by_codepoints, &codepoint_copy, &grapheme_id, &GraphemeIDLookupTableInfo);
267 last_grapheme = grapheme_id;
271 #pragma GCC diagnostic pop
275 int Text$print(FILE *stream, Text_t t) {
276 if (t.length == 0) return 0;
279 case TEXT_ASCII: return fwrite(t.ascii, sizeof(char), (size_t)t.length, stream);
280 case TEXT_GRAPHEMES: {
281 const int32_t *graphemes = t.graphemes;
283 for (int64_t i = 0; i < (int64_t)t.length; i++) {
284 int32_t grapheme = graphemes[i];
287 size_t len = sizeof(buf);
288 uint8_t *u8 = u32_to_u8((ucs4_t *)&grapheme, 1, buf, &len);
289 if (u8 == NULL) fail("Invalid grapheme encountered: ", grapheme);
290 written += (int)fwrite(u8, sizeof(char), len, stream);
291 if (u8 != buf) free(u8);
293 const uint8_t *u8 = GRAPHEME_UTF8(grapheme);
295 written += (int)fwrite(u8, sizeof(uint8_t), strlen((char *)u8), stream);
302 for (int64_t i = 0; i < (int64_t)t.length; i++) {
303 int32_t grapheme = t.blob.map[t.blob.bytes[i]];
306 size_t len = sizeof(buf);
307 uint8_t *u8 = u32_to_u8((ucs4_t *)&grapheme, 1, buf, &len);
308 if (u8 == NULL) fail("Invalid grapheme encountered: ", grapheme);
309 written += (int)fwrite(u8, sizeof(char), len, stream);
310 if (u8 != buf) free(u8);
312 const uint8_t *u8 = GRAPHEME_UTF8(grapheme);
314 written += (int)fwrite(u8, sizeof(uint8_t), strlen((char *)u8), stream);
320 int written = Text$print(stream, *t.left);
321 written += Text$print(stream, *t.right);
328 static const uint64_t min_len_for_depth[MAX_TEXT_DEPTH] = {
329 // Fibonacci numbers (skipping first two)
330 1, 2, 3, 5, 8, 13, 21, 34, 55, 89,
331 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946,
332 17711, 28657, 46368, 75025, 121393, 196418, 317811, 514229, 832040, 1346269,
333 2178309, 3524578, 5702887, 9227465, 14930352, 24157817, 39088169, 63245986, 102334155, 165580141,
334 267914296, 433494437, 701408733, 1134903170, 1836311903, 2971215073, 4807526976, 7778742049,
337 #define IS_BALANCED_TEXT(t) ((t).length >= min_len_for_depth[(t).depth])
339 static void insert_balanced_recursive(Text_t balanced_texts[MAX_TEXT_DEPTH], Text_t text) {
340 if (text.tag == TEXT_CONCAT && (!IS_BALANCED_TEXT(text) || text.depth >= MAX_TEXT_DEPTH)) {
341 insert_balanced_recursive(balanced_texts, *text.left);
342 insert_balanced_recursive(balanced_texts, *text.right);
347 Text_t accumulator = EMPTY_TEXT;
348 for (; text.length > min_len_for_depth[i + 1]; i++) {
349 if (balanced_texts[i].length) {
350 accumulator = simple_concatenation(balanced_texts[i], accumulator);
351 balanced_texts[i] = EMPTY_TEXT;
355 accumulator = simple_concatenation(accumulator, text);
357 while (accumulator.length >= min_len_for_depth[i]) {
358 if (balanced_texts[i].length) {
359 accumulator = simple_concatenation(balanced_texts[i], accumulator);
360 balanced_texts[i] = EMPTY_TEXT;
365 balanced_texts[i] = accumulator;
368 static Text_t rebalanced(Text_t a, Text_t b) {
369 Text_t balanced_texts[MAX_TEXT_DEPTH];
370 memset(balanced_texts, 0, sizeof(balanced_texts));
371 insert_balanced_recursive(balanced_texts, a);
372 insert_balanced_recursive(balanced_texts, b);
374 Text_t ret = EMPTY_TEXT;
375 for (int64_t i = 0; ret.length < a.length + b.length; i++) {
376 if (balanced_texts[i].length) ret = simple_concatenation(balanced_texts[i], ret);
381 Text_t simple_concatenation(Text_t a, Text_t b) {
382 if (a.length == 0) return b;
383 if (b.length == 0) return a;
385 uint16_t new_depth = 1 + MAX(a.depth, b.depth);
386 // Rebalance only if depth exceeds the maximum allowed. We don't require
387 // every concatenation to yield a balanced text, since many concatenations
388 // are ephemeral (e.g. doing a loop repeatedly concatenating without using
389 // the intermediary values).
390 if (new_depth >= MAX_TEXT_DEPTH) return rebalanced(a, b);
392 Text_t *children = GC_MALLOC(sizeof(Text_t[2]));
397 .length = a.length + b.length,
399 .left = &children[0],
400 .right = &children[1],
404 static Text_t concat2_assuming_safe(Text_t a, Text_t b) {
405 if (a.length == 0) return b;
406 if (b.length == 0) return a;
408 if (a.tag == TEXT_ASCII && b.tag == TEXT_ASCII && (size_t)(a.length + b.length) <= SHORT_ASCII_LENGTH) {
409 struct Text_s ret = {
411 .length = a.length + b.length,
413 ret.ascii = GC_MALLOC_ATOMIC(sizeof(char[ret.length]));
414 memcpy((char *)ret.ascii, a.ascii, sizeof(char[a.length]));
415 memcpy((char *)&ret.ascii[a.length], b.ascii, sizeof(char[b.length]));
417 } else if (a.tag == TEXT_GRAPHEMES && b.tag == TEXT_GRAPHEMES
418 && (size_t)(a.length + b.length) <= SHORT_GRAPHEMES_LENGTH) {
419 struct Text_s ret = {
420 .tag = TEXT_GRAPHEMES,
421 .length = a.length + b.length,
423 ret.graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[ret.length]));
424 memcpy((int32_t *)ret.graphemes, a.graphemes, sizeof(int32_t[a.length]));
425 memcpy((int32_t *)&ret.graphemes[a.length], b.graphemes, sizeof(int32_t[b.length]));
427 } else if (a.tag != TEXT_CONCAT && b.tag != TEXT_CONCAT
428 && (size_t)(a.length + b.length) <= SHORT_GRAPHEMES_LENGTH) {
429 // Turn a small bit of ASCII into graphemes if it helps make things smaller
430 // Text structs come with an extra 8 bytes, so allocate enough to hold the text
431 struct Text_s ret = {
432 .tag = TEXT_GRAPHEMES,
433 .length = a.length + b.length,
435 ret.graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[ret.length]));
436 int32_t *dest = (int32_t *)ret.graphemes;
437 if (a.tag == TEXT_GRAPHEMES) {
438 memcpy(dest, a.graphemes, sizeof(int32_t[a.length]));
440 } else if (a.tag == TEXT_ASCII) {
441 for (int64_t i = 0; i < (int64_t)a.length; i++)
442 *(dest++) = (int32_t)a.ascii[i];
443 } else if (a.tag == TEXT_BLOB) {
444 for (int64_t i = 0; i < (int64_t)a.length; i++)
445 *(dest++) = a.blob.map[a.blob.bytes[i]];
447 errx(1, "Unreachable");
449 if (b.tag == TEXT_GRAPHEMES) {
450 memcpy(dest, b.graphemes, sizeof(int32_t[b.length]));
451 } else if (b.tag == TEXT_ASCII) {
452 for (int64_t i = 0; i < (int64_t)b.length; i++)
453 *(dest++) = (int32_t)b.ascii[i];
454 } else if (b.tag == TEXT_BLOB) {
455 for (int64_t i = 0; i < (int64_t)b.length; i++)
456 *(dest++) = b.blob.map[b.blob.bytes[i]];
458 errx(1, "Unreachable");
463 if (a.tag == TEXT_CONCAT && b.tag != TEXT_CONCAT && a.right->tag != TEXT_CONCAT)
464 return concat2_assuming_safe(*a.left, concat2_assuming_safe(*a.right, b));
466 return simple_concatenation(a, b);
469 static Text_t concat2(Text_t a, Text_t b) {
470 if (a.length == 0) return b;
471 if (b.length == 0) return a;
473 int32_t last_a = Text$get_grapheme(a, a.length - 1);
474 int32_t first_b = Text$get_grapheme(b, 0);
476 // Magic number, we know that no codepoints below here trigger instability:
477 static const int32_t LOWEST_CODEPOINT_TO_CHECK = 0x300; // COMBINING GRAVE ACCENT
478 if (last_a >= 0 && last_a < LOWEST_CODEPOINT_TO_CHECK && first_b >= 0 && first_b < LOWEST_CODEPOINT_TO_CHECK)
479 return concat2_assuming_safe(a, b);
481 size_t len = (last_a >= 0) ? 1 : NUM_GRAPHEME_CODEPOINTS(last_a);
482 len += (first_b >= 0) ? 1 : NUM_GRAPHEME_CODEPOINTS(first_b);
484 ucs4_t codepoints[len];
485 ucs4_t *dest = codepoints;
487 memcpy(dest, GRAPHEME_CODEPOINTS(last_a), sizeof(ucs4_t[NUM_GRAPHEME_CODEPOINTS(last_a)]));
488 dest += NUM_GRAPHEME_CODEPOINTS(last_a);
490 *(dest++) = (ucs4_t)last_a;
494 memcpy(dest, GRAPHEME_CODEPOINTS(first_b), sizeof(ucs4_t[NUM_GRAPHEME_CODEPOINTS(first_b)]));
496 *dest = (ucs4_t)first_b;
499 // Do a normalization run for these two codepoints and see if it looks different.
500 // Normalization should not exceed 3x in the input length (but if it does, it will be
501 // handled gracefully)
502 ucs4_t norm_buf[3 * len];
503 size_t norm_length = sizeof(norm_buf) / sizeof(norm_buf[0]);
504 ucs4_t *normalized = u32_normalize(UNINORM_NFC, codepoints, len, norm_buf, &norm_length);
505 bool stable = (norm_length == len && memcmp(codepoints, normalized, sizeof(codepoints)) == 0);
508 const void *second_grapheme = u32_grapheme_next(normalized, &normalized[norm_length]);
509 if (second_grapheme == &normalized[norm_length]) stable = false;
513 if (normalized != norm_buf) free(normalized);
514 return concat2_assuming_safe(a, b);
517 OptionalText_t glue =
518 Text$from_utf32((List_t){.data = normalized, .length = (uint64_t)norm_length, .stride = sizeof(ucs4_t)});
520 if (normalized != norm_buf) free(normalized);
522 if (a.length == 1 && b.length == 1) return glue;
523 else if (a.length == 1) return concat2_assuming_safe(glue, Text$slice(b, I(2), I(b.length)));
524 else if (b.length == 1) return concat2_assuming_safe(Text$slice(a, I(1), I(a.length - 1)), glue);
526 return concat2_assuming_safe(concat2_assuming_safe(Text$slice(a, I(1), I(a.length - 1)), glue),
527 Text$slice(b, I(2), I(b.length)));
531 Text_t Text$_concat(int n, Text_t items[n]) {
532 if (n == 0) return EMPTY_TEXT;
534 Text_t ret = items[0];
535 for (int i = 1; i < n; i++) {
536 if (items[i].length > 0) ret = concat2(ret, items[i]);
542 Text_t Text$repeat(Text_t text, Int_t count) {
543 if (text.length == 0 || Int$is_negative(count)) return EMPTY_TEXT;
545 Int_t result_len = Int$times(count, I(text.length));
546 if (Int$compare_value(result_len, I(1l << 40)) > 0) fail("Text repeating would produce too big of an result!");
548 int64_t count64 = Int64$from_int(count, false);
550 for (int64_t c = 1; c < count64; c++)
551 ret = concat2(ret, text);
556 Int_t Text$width(Text_t text, Text_t language) {
557 int width = u8_strwidth((const uint8_t *)Text$as_c_string(text), Text$as_c_string(language));
558 return Int$from_int32(width);
561 static Text_t Text$repeat_to_width(Text_t to_repeat, int64_t target_width, Text_t language) {
562 if (target_width <= 0) return EMPTY_TEXT;
564 const char *lang_str = Text$as_c_string(language);
565 int64_t width = (int64_t)u8_strwidth((const uint8_t *)Text$as_c_string(to_repeat), lang_str);
566 Text_t repeated = EMPTY_TEXT;
567 int64_t repeated_width = 0;
568 while (repeated_width + width <= target_width) {
569 repeated = concat2(repeated, to_repeat);
570 repeated_width += width;
573 if (repeated_width < target_width) {
574 for (int64_t i = 0; repeated_width < target_width && i < (int64_t)to_repeat.length; i++) {
575 Text_t c = Text$slice(to_repeat, I_small(i + 1), I_small(i + 1));
576 int64_t w = (int64_t)u8_strwidth((const uint8_t *)Text$as_c_string(c), lang_str);
577 if (repeated_width + w > target_width) {
578 repeated = concat2(repeated, Text$repeat(Text(" "), I(target_width - repeated_width)));
581 repeated = concat2(repeated, c);
590 Text_t Text$left_pad(Text_t text, Int_t width, Text_t padding, Text_t language) {
591 if (padding.length == 0) fail("Cannot pad with an empty text!");
593 int64_t needed = Int64$from_int(width, false) - Int64$from_int(Text$width(text, language), false);
594 return concat2(Text$repeat_to_width(padding, needed, language), text);
598 Text_t Text$right_pad(Text_t text, Int_t width, Text_t padding, Text_t language) {
599 if (padding.length == 0) fail("Cannot pad with an empty text!");
601 int64_t needed = Int64$from_int(width, false) - Int64$from_int(Text$width(text, language), false);
602 return concat2(text, Text$repeat_to_width(padding, needed, language));
606 Text_t Text$middle_pad(Text_t text, Int_t width, Text_t padding, Text_t language) {
607 if (padding.length == 0) fail("Cannot pad with an empty text!");
609 int64_t needed = Int64$from_int(width, false) - Int64$from_int(Text$width(text, language), false);
610 return Text$concat(Text$repeat_to_width(padding, needed / 2, language), text,
611 Text$repeat_to_width(padding, (needed + 1) / 2, language));
615 Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int) {
616 int64_t first = Int64$from_int(first_int, false);
617 int64_t last = Int64$from_int(last_int, false);
618 if (first == 0) fail("Invalid index: 0");
619 if (last == 0) return EMPTY_TEXT;
621 if (first < 0) first = (int64_t)text.length + first + 1;
622 if (last < 0) last = (int64_t)text.length + last + 1;
624 if (last > (int64_t)text.length) last = (int64_t)text.length;
626 if (first > (int64_t)text.length || last < first) return EMPTY_TEXT;
628 if (first == 1 && last == (int64_t)text.length) return text;
630 while (text.tag == TEXT_CONCAT) {
631 if (last < (int64_t)text.left->length) {
633 } else if (first > (int64_t)text.left->length) {
634 first -= (int64_t)text.left->length;
635 last -= (int64_t)text.left->length;
638 return concat2_assuming_safe(Text$slice(*text.left, I(first), I(text.length)),
639 Text$slice(*text.right, I(1), I(last - text.left->length)));
647 .length = last - first + 1,
648 .ascii = text.ascii + (first - 1),
651 case TEXT_GRAPHEMES: {
653 .tag = TEXT_GRAPHEMES,
654 .length = last - first + 1,
655 .graphemes = text.graphemes + (first - 1),
659 Text_t ret = (Text_t){
661 .length = last - first + 1,
662 .blob.map = text.blob.map,
663 .blob.bytes = text.blob.bytes + (first - 1),
667 default: errx(1, "Invalid tag");
673 Text_t Text$from(Text_t text, Int_t first) {
674 return Text$slice(text, first, I_small(-1));
678 Text_t Text$to(Text_t text, Int_t last) {
679 return Text$slice(text, I_small(1), last);
683 Text_t Text$reversed(Text_t text) {
686 struct Text_s ret = {
688 .length = text.length,
690 ret.ascii = GC_MALLOC_ATOMIC(sizeof(char[ret.length]));
691 for (int64_t i = 0; i < (int64_t)text.length; i++)
692 ((char *)ret.ascii)[text.length - 1 - i] = text.ascii[i];
695 case TEXT_GRAPHEMES: {
696 struct Text_s ret = {
697 .tag = TEXT_GRAPHEMES,
698 .length = text.length,
700 ret.graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[ret.length]));
701 for (int64_t i = 0; i < (int64_t)text.length; i++)
702 ((int32_t *)ret.graphemes)[text.length - 1 - i] = text.graphemes[i];
706 struct Text_s ret = {
708 .length = text.length,
709 .blob.map = text.blob.map,
711 ret.blob.bytes = GC_MALLOC_ATOMIC(sizeof(uint8_t[ret.length]));
712 for (int64_t i = 0; i < (int64_t)text.length; i++)
713 ((uint8_t *)ret.blob.bytes)[text.length - 1 - i] = text.graphemes[i];
717 return concat2_assuming_safe(Text$reversed(*text.right), Text$reversed(*text.left));
719 default: errx(1, "Invalid tag");
725 PUREFUNC OptionalText_t Text$cluster(Text_t text, Int_t index) {
726 Text_t slice = Text$slice(text, index, index);
727 return slice.length <= 0 ? NONE_TEXT : slice;
730 static Text_t Text$from_components(List_t graphemes, Table_t unique_clusters) {
731 if (graphemes.length == 0) return EMPTY_TEXT;
732 size_t blob_size = (sizeof(int32_t[unique_clusters.entries.length]) + sizeof(uint8_t[graphemes.length]));
733 // If blob optimization will save at least 200 bytes:
734 if (unique_clusters.entries.length <= 256 && blob_size + 200 < sizeof(int32_t[graphemes.length])) {
737 .length = graphemes.length,
740 void *blob = GC_MALLOC_ATOMIC(blob_size);
742 uint8_t *bytes = blob + sizeof(int32_t[unique_clusters.entries.length]);
743 for (int64_t i = 0; i < (int64_t)unique_clusters.entries.length; i++) {
747 } *entry = unique_clusters.entries.data + i * unique_clusters.entries.stride;
748 map[entry->b] = entry->g;
750 for (int64_t i = 0; i < (int64_t)graphemes.length; i++) {
751 int32_t g = *(int32_t *)(graphemes.data + i * graphemes.stride);
752 uint8_t *byte = Table$get(unique_clusters, &g, Table$info(&Int32$info, &Byte$info));
757 ret.blob.bytes = bytes;
761 .tag = TEXT_GRAPHEMES,
762 .length = graphemes.length,
763 .graphemes = graphemes.data,
769 OptionalText_t Text$from_strn(const char *str, size_t len) {
770 int64_t ascii_span = 0;
771 for (size_t i = 0; i < len && isascii(str[i]); i++) {
773 if (str[i] == 0) return NONE_TEXT;
776 if (ascii_span == (int64_t)len) { // All ASCII
777 char *copy = GC_MALLOC_ATOMIC(len);
778 memcpy(copy, str, len);
781 .length = ascii_span,
785 if (u8_check((uint8_t *)str, len) != NULL) return NONE_TEXT;
787 List_t graphemes = EMPTY_LIST;
788 Table_t unique_clusters = EMPTY_TABLE;
789 const uint8_t *pos = (const uint8_t *)str;
790 const uint8_t *end = (const uint8_t *)&str[len];
791 // Iterate over grapheme clusters
792 for (const uint8_t *next; (next = u8_grapheme_next(pos, end)); pos = next) {
794 size_t u32_len = sizeof(buf) / sizeof(buf[0]);
795 uint32_t *u32s = u8_to_u32(pos, (size_t)(next - pos), buf, &u32_len);
796 if (u32s == NULL) return NONE_TEXT;
799 size_t u32_normlen = sizeof(buf2) / sizeof(buf2[0]);
800 uint32_t *u32s_normalized = u32_normalize(UNINORM_NFC, u32s, u32_len, buf2, &u32_normlen);
801 if (u32s_normalized == NULL) return NONE_TEXT;
803 int32_t g = get_synthetic_grapheme(u32s_normalized, (int64_t)u32_normlen);
804 if (g == 0) return NONE_TEXT;
805 List$insert(&graphemes, &g, I(0), sizeof(int32_t));
806 Table$get_or_setdefault(&unique_clusters, int32_t, uint8_t, g, (uint8_t)unique_clusters.entries.length,
807 Table$info(&Int32$info, &Byte$info));
809 if (u32s != buf) free(u32s);
810 if (u32s_normalized != buf2) free(u32s_normalized);
812 if (unique_clusters.entries.length >= 256) {
813 return concat2_assuming_safe(Text$from_components(graphemes, unique_clusters),
814 Text$from_strn((const char *)next, (size_t)(end - next)));
818 return Text$from_components(graphemes, unique_clusters);
822 OptionalText_t Text$from_str(const char *str) {
823 return str ? Text$from_strn(str, strlen(str)) : Text("");
826 static void u8_buf_append(Text_t text, Byte_t **buf, int64_t *capacity, int64_t *i) {
829 if (*i + (int64_t)text.length > (int64_t)*capacity) {
830 *capacity = *i + (int64_t)text.length + 1;
831 *buf = GC_REALLOC(*buf, sizeof(Byte_t[*capacity]));
834 const char *bytes = text.ascii;
835 memcpy(*buf + *i, bytes, (size_t)text.length);
839 case TEXT_GRAPHEMES: {
840 const int32_t *graphemes = text.graphemes;
841 for (int64_t g = 0; g < (int64_t)text.length; g++) {
842 if (graphemes[g] >= 0) {
844 size_t u8_len = sizeof(u8_buf);
845 uint8_t *u8 = u32_to_u8((ucs4_t *)&graphemes[g], 1, u8_buf, &u8_len);
846 if (u8 == NULL) fail("Invalid grapheme encountered: ", graphemes[g]);
848 if (*i + (int64_t)u8_len > (int64_t)*capacity) {
849 *capacity = *i + (int64_t)u8_len + 1;
850 *buf = GC_REALLOC(*buf, sizeof(Byte_t[*capacity]));
853 memcpy(*buf + *i, u8, u8_len);
854 *i += (int64_t)u8_len;
855 if (u8 != u8_buf) free(u8);
857 const uint8_t *u8 = GRAPHEME_UTF8(graphemes[g]);
858 size_t u8_len = u8_strlen(u8);
859 if (*i + (int64_t)u8_len > (int64_t)*capacity) {
860 *capacity = *i + (int64_t)u8_len + 1;
861 *buf = GC_REALLOC(*buf, sizeof(Byte_t[*capacity]));
864 memcpy(*buf + *i, u8, u8_len);
865 *i += (int64_t)u8_len;
871 for (int64_t g = 0; g < (int64_t)text.length; g++) {
872 int32_t grapheme = text.blob.map[text.blob.bytes[g]];
875 size_t u8_len = sizeof(u8_buf);
876 uint8_t *u8 = u32_to_u8((ucs4_t *)&grapheme, 1, u8_buf, &u8_len);
877 if (u8 == NULL) fail("Invalid grapheme encountered: ", grapheme);
879 if (*i + (int64_t)u8_len > (int64_t)*capacity) {
880 *capacity = *i + (int64_t)u8_len + 1;
881 *buf = GC_REALLOC(*buf, sizeof(Byte_t[*capacity]));
884 memcpy(*buf + *i, u8, u8_len);
885 *i += (int64_t)u8_len;
886 if (u8 != u8_buf) free(u8);
888 const uint8_t *u8 = GRAPHEME_UTF8(grapheme);
889 size_t u8_len = u8_strlen(u8);
890 if (*i + (int64_t)u8_len > (int64_t)*capacity) {
891 *capacity = *i + (int64_t)u8_len + 1;
892 *buf = GC_REALLOC(*buf, sizeof(Byte_t[*capacity]));
895 memcpy(*buf + *i, u8, u8_len);
896 *i += (int64_t)u8_len;
902 u8_buf_append(*text.left, buf, capacity, i);
903 u8_buf_append(*text.right, buf, capacity, i);
911 const char *Text$as_c_string(Text_t text) {
912 if (text.tag == TEXT_ASCII && text.ascii[text.length] == '\0') return text.ascii;
913 int64_t capacity = text.length + 1;
914 char *buf = GC_MALLOC_ATOMIC((size_t)capacity);
916 u8_buf_append(text, (Byte_t **)&buf, &capacity, &i);
918 if (i + 1 > (int64_t)capacity) {
920 buf = GC_REALLOC(buf, (size_t)capacity);
926 PUREFUNC public uint64_t Text$hash(const void *obj, const TypeInfo_t *info) {
928 Text_t text = *(Text_t *)obj;
930 siphashinit(&sh, sizeof(int32_t[text.length]));
938 const char *bytes = text.ascii;
939 for (int64_t i = 0; i + 1 < (int64_t)text.length; i += 2) {
940 tmp.chunks[0] = (int32_t)bytes[i];
941 tmp.chunks[1] = (int32_t)bytes[i + 1];
942 siphashadd64bits(&sh, tmp.whole);
944 int32_t last = text.length & 0x1 ? (int32_t)bytes[text.length - 1] : 0; // Odd number of graphemes
945 return siphashfinish_last_part(&sh, (uint64_t)last);
947 case TEXT_GRAPHEMES: {
948 const int32_t *graphemes = text.graphemes;
949 for (int64_t i = 0; i + 1 < (int64_t)text.length; i += 2) {
950 tmp.chunks[0] = graphemes[i];
951 tmp.chunks[1] = graphemes[i + 1];
952 siphashadd64bits(&sh, tmp.whole);
954 int32_t last = text.length & 0x1 ? graphemes[text.length - 1] : 0; // Odd number of graphemes
955 return siphashfinish_last_part(&sh, (uint64_t)last);
958 for (int64_t i = 0; i + 1 < (int64_t)text.length; i += 2) {
959 tmp.chunks[0] = text.blob.map[text.blob.bytes[i]];
960 tmp.chunks[1] = text.blob.map[text.blob.bytes[i + 1]];
961 siphashadd64bits(&sh, tmp.whole);
964 text.length & 0x1 ? text.blob.map[text.blob.bytes[text.length - 1]] : 0; // Odd number of graphemes
965 return siphashfinish_last_part(&sh, (uint64_t)last);
968 TextIter_t state = NEW_TEXT_ITER_STATE(text);
969 for (int64_t i = 0; i + 1 < (int64_t)text.length; i += 2) {
970 tmp.chunks[0] = Text$get_grapheme_fast(&state, i);
971 tmp.chunks[1] = Text$get_grapheme_fast(&state, i + 1);
972 siphashadd64bits(&sh, tmp.whole);
975 int32_t last = (text.length & 0x1) ? Text$get_grapheme_fast(&state, text.length - 1) : 0;
976 return siphashfinish_last_part(&sh, (uint64_t)last);
978 default: errx(1, "Invalid text");
984 int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index) {
985 if (index < 0) return 0;
986 if (index >= (int64_t)state->stack[0].text.length) return 0;
988 assert(state->stack[0].text.depth <= MAX_TEXT_DEPTH);
990 // Go up the stack as needed:
991 while (index < state->stack[state->stack_index].offset
993 >= state->stack[state->stack_index].offset + (int64_t)state->stack[state->stack_index].text.length) {
994 state->stack_index -= 1;
995 assert(state->stack_index >= 0);
998 assert(state->stack_index >= 0 && state->stack_index <= MAX_TEXT_DEPTH);
1000 // Go down the stack as needed:
1001 while (state->stack[state->stack_index].text.tag == TEXT_CONCAT) {
1002 Text_t text = state->stack[state->stack_index].text;
1003 int64_t offset = state->stack[state->stack_index].offset;
1004 assert(state->stack_index <= MAX_TEXT_DEPTH);
1005 assert(index >= offset);
1006 assert(index < offset + (int64_t)text.length);
1008 state->stack_index += 1;
1009 if (index < offset + (int64_t)text.left->length) {
1010 state->stack[state->stack_index].text = *text.left;
1011 state->stack[state->stack_index].offset = offset;
1013 state->stack[state->stack_index].text = *text.right;
1014 state->stack[state->stack_index].offset = offset + text.left->length;
1016 assert(state->stack_index >= 0 && state->stack_index <= MAX_TEXT_DEPTH);
1019 Text_t text = state->stack[state->stack_index].text;
1020 int64_t offset = state->stack[state->stack_index].offset;
1022 if (index < offset || index >= offset + (int64_t)text.length) {
1027 case TEXT_ASCII: return (int32_t)text.ascii[index - offset];
1028 case TEXT_GRAPHEMES: return text.graphemes[index - offset];
1029 case TEXT_BLOB: return text.blob.map[text.blob.bytes[index - offset]];
1030 default: errx(1, "Invalid text");
1036 uint32_t Text$get_main_grapheme_fast(TextIter_t *state, int64_t index) {
1037 int32_t g = Text$get_grapheme_fast(state, index);
1038 return (g) >= 0 ? (ucs4_t)(g) : synthetic_graphemes[-(g)-1].main_codepoint;
1041 PUREFUNC public int32_t Text$compare(const void *va, const void *vb, const TypeInfo_t *info) {
1043 if (va == vb) return 0;
1044 const Text_t a = *(const Text_t *)va;
1045 const Text_t b = *(const Text_t *)vb;
1047 // TODO: make this smarter and more efficient
1048 int64_t len = MAX(a.length, b.length);
1049 TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
1050 for (int64_t i = 0; i < len; i++) {
1051 int32_t ai = Text$get_grapheme_fast(&a_state, i);
1052 int32_t bi = Text$get_grapheme_fast(&b_state, i);
1053 if (ai == bi) continue;
1055 if (ai > 0 && bi > 0) {
1056 cmp = u32_cmp((ucs4_t *)&ai, (ucs4_t *)&bi, 1);
1057 } else if (ai > 0) {
1058 cmp = u32_cmp2((ucs4_t *)&ai, 1, GRAPHEME_CODEPOINTS(bi), NUM_GRAPHEME_CODEPOINTS(bi));
1059 } else if (bi > 0) {
1060 cmp = u32_cmp2(GRAPHEME_CODEPOINTS(ai), NUM_GRAPHEME_CODEPOINTS(ai), (ucs4_t *)&bi, 1);
1062 cmp = u32_cmp2(GRAPHEME_CODEPOINTS(ai), NUM_GRAPHEME_CODEPOINTS(ai), GRAPHEME_CODEPOINTS(bi),
1063 NUM_GRAPHEME_CODEPOINTS(bi));
1065 if (cmp != 0) return cmp;
1070 bool _matches(TextIter_t *text_state, TextIter_t *target_state, int64_t pos) {
1071 for (int64_t i = 0; i < (int64_t)target_state->stack[0].text.length; i++) {
1072 int32_t text_i = Text$get_grapheme_fast(text_state, pos + i);
1073 int32_t target_i = Text$get_grapheme_fast(target_state, i);
1074 if (text_i != target_i) return false;
1079 PUREFUNC public bool Text$starts_with(Text_t text, Text_t prefix, Text_t *remainder) {
1080 if (text.length < prefix.length) return false;
1081 TextIter_t text_state = NEW_TEXT_ITER_STATE(text), prefix_state = NEW_TEXT_ITER_STATE(prefix);
1082 if (_matches(&text_state, &prefix_state, 0)) {
1083 if (remainder) *remainder = Text$from(text, Int$from_int64(prefix.length + 1));
1086 if (remainder) *remainder = text;
1091 PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix, Text_t *remainder) {
1092 if (text.length < suffix.length) return false;
1093 TextIter_t text_state = NEW_TEXT_ITER_STATE(text), suffix_state = NEW_TEXT_ITER_STATE(suffix);
1094 if (_matches(&text_state, &suffix_state, text.length - suffix.length)) {
1095 if (remainder) *remainder = Text$to(text, Int$from_int64(text.length - suffix.length));
1098 if (remainder) *remainder = text;
1104 Text_t Text$without_prefix(Text_t text, Text_t prefix) {
1105 return Text$starts_with(text, prefix, NULL) ? Text$slice(text, I(prefix.length + 1), I(text.length)) : text;
1109 Text_t Text$without_suffix(Text_t text, Text_t suffix) {
1110 return Text$ends_with(text, suffix, NULL) ? Text$slice(text, I(1), I(text.length - suffix.length)) : text;
1113 static bool _has_grapheme(TextIter_t *text, int32_t g) {
1114 for (int64_t t = 0; t < (int64_t)text->stack[0].text.length; t++) {
1115 if (g == Text$get_grapheme_fast(text, t)) {
1123 OptionalInt_t Text$find(Text_t text, Text_t target, Int_t start) {
1124 if (text.length < target.length) return NONE_INT;
1125 if (target.length <= 0) return I(1);
1126 TextIter_t text_state = NEW_TEXT_ITER_STATE(text), target_state = NEW_TEXT_ITER_STATE(target);
1127 for (int64_t i = Int64$from_int(start, false) - 1; i < (int64_t)text.length - (int64_t)target.length + 1; i++) {
1128 if (_matches(&text_state, &target_state, i)) {
1129 return Int$from_int64(i + 1);
1136 bool Text$matches_glob(Text_t text, Text_t glob) {
1137 return !fnmatch(Text$as_c_string(glob), Text$as_c_string(text), 0);
1141 Text_t Text$trim(Text_t text, Text_t to_trim, bool left, bool right) {
1143 TextIter_t text_state = NEW_TEXT_ITER_STATE(text), trim_state = NEW_TEXT_ITER_STATE(to_trim);
1145 while (first < (int64_t)text.length && _has_grapheme(&trim_state, Text$get_grapheme_fast(&text_state, first))) {
1149 int64_t last = text.length - 1;
1151 while (last >= first && _has_grapheme(&trim_state, Text$get_grapheme_fast(&text_state, last))) {
1155 return (first != 0 || last != text.length - 1) ? Text$slice(text, I(first + 1), I(last + 1)) : text;
1159 Text_t Text$translate(Text_t text, Table_t translations) {
1160 TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
1161 Text_t result = EMPTY_TEXT;
1162 int64_t span_start = 0;
1163 List_t replacement_list = translations.entries;
1164 for (int64_t i = 0; i < (int64_t)text.length;) {
1165 for (int64_t r = 0; r < (int64_t)replacement_list.length; r++) {
1167 Text_t target, replacement;
1168 } *entry = replacement_list.data + r * replacement_list.stride;
1169 if (entry->target.length <= 0) continue;
1170 TextIter_t target_state = NEW_TEXT_ITER_STATE(entry->target);
1171 if (_matches(&text_state, &target_state, i)) {
1172 if (i > span_start) result = concat2(result, Text$slice(text, I(span_start + 1), I(i)));
1174 result = concat2(result, entry->replacement);
1175 i += entry->target.length;
1184 if (span_start < (int64_t)text.length)
1185 result = concat2(result, Text$slice(text, I(span_start + 1), I((int64_t)text.length)));
1190 Text_t Text$replace(Text_t text, Text_t target, Text_t replacement) {
1191 if (target.length <= 0) return text;
1192 TextIter_t text_state = NEW_TEXT_ITER_STATE(text), target_state = NEW_TEXT_ITER_STATE(target);
1193 Text_t result = EMPTY_TEXT;
1194 int64_t span_start = 0;
1195 for (int64_t i = 0; i < (int64_t)text.length;) {
1196 if (_matches(&text_state, &target_state, i)) {
1197 if (i > span_start) result = concat2(result, Text$slice(text, I(span_start + 1), I(i)));
1199 result = concat2(result, replacement);
1206 if (span_start < (int64_t)text.length)
1207 result = concat2(result, Text$slice(text, I(span_start + 1), I((int64_t)text.length)));
1212 PUREFUNC bool Text$has(Text_t text, Text_t target) {
1213 TextIter_t text_state = NEW_TEXT_ITER_STATE(text), target_state = NEW_TEXT_ITER_STATE(target);
1214 for (int64_t i = 0; i < (int64_t)text.length; i++) {
1215 if (_matches(&text_state, &target_state, i)) return true;
1221 List_t Text$split(Text_t text, Text_t delimiters) {
1222 if (delimiters.length == 0) return Text$clusters(text);
1224 TextIter_t text_state = NEW_TEXT_ITER_STATE(text), delim_state = NEW_TEXT_ITER_STATE(delimiters);
1225 List_t splits = EMPTY_LIST;
1226 for (int64_t i = 0; i < (int64_t)text.length;) {
1227 int64_t span_len = 0;
1228 while (i + span_len < (int64_t)text.length && !_matches(&text_state, &delim_state, i + span_len)) {
1231 Text_t slice = Text$slice(text, I(i + 1), I(i + span_len));
1232 List$insert(&splits, &slice, I(0), sizeof(slice));
1233 i += span_len + delimiters.length;
1234 if (i == text.length) {
1235 Text_t empty = Text("");
1236 List$insert(&splits, &empty, I(0), sizeof(empty));
1243 List_t Text$split_any(Text_t text, Text_t delimiters) {
1244 if (delimiters.length == 0) return List(text);
1246 TextIter_t text_state = NEW_TEXT_ITER_STATE(text), delim_state = NEW_TEXT_ITER_STATE(delimiters);
1247 List_t splits = EMPTY_LIST;
1248 for (int64_t i = 0; i < (int64_t)text.length;) {
1249 int64_t span_len = 0;
1250 while (i + span_len < (int64_t)text.length
1251 && !_has_grapheme(&delim_state, Text$get_grapheme_fast(&text_state, i + span_len))) {
1254 bool trailing_delim = i + span_len < (int64_t)text.length;
1255 Text_t slice = Text$slice(text, I(i + 1), I(i + span_len));
1256 List$insert(&splits, &slice, I(0), sizeof(slice));
1258 while (i < (int64_t)text.length && _has_grapheme(&delim_state, Text$get_grapheme_fast(&text_state, i))) {
1261 if (i >= (int64_t)text.length && trailing_delim) {
1262 Text_t empty = Text("");
1263 List$insert(&splits, &empty, I(0), sizeof(empty));
1273 } split_iter_state_t;
1275 static OptionalText_t next_split(split_iter_state_t *state) {
1276 Text_t text = state->state.stack[0].text;
1277 if (state->i >= (int64_t)text.length) {
1278 if (state->delimiter.length > 0 && state->i == text.length) { // special case
1279 state->i = text.length + 1;
1285 if (state->delimiter.length == 0) { // special case
1286 state->i = text.length + 1;
1290 TextIter_t delim_state = NEW_TEXT_ITER_STATE(state->delimiter);
1291 int64_t i = state->i;
1292 int64_t span_len = 0;
1293 while (i + span_len < (int64_t)text.length && !_matches(&state->state, &delim_state, i + span_len)) {
1296 Text_t slice = Text$slice(text, I(i + 1), I(i + span_len));
1297 state->i = i + span_len + state->delimiter.length;
1302 Closure_t Text$by_split(Text_t text, Text_t delimiter) {
1304 .fn = (void *)next_split,
1305 .userdata = new (split_iter_state_t, .state = NEW_TEXT_ITER_STATE(text), .i = 0, .delimiter = delimiter),
1309 static OptionalText_t next_split_any(split_iter_state_t *state) {
1310 Text_t text = state->state.stack[0].text;
1311 if (state->i >= (int64_t)text.length) {
1312 if (state->delimiter.length > 0 && state->i == text.length) { // special case
1313 state->i = text.length + 1;
1319 if (state->delimiter.length == 0) { // special case
1320 Text_t ret = Text$cluster(text, I(state->i + 1));
1325 TextIter_t delim_state = NEW_TEXT_ITER_STATE(state->delimiter);
1326 int64_t i = state->i;
1327 int64_t span_len = 0;
1328 while (i + span_len < (int64_t)text.length
1329 && !_has_grapheme(&delim_state, Text$get_grapheme_fast(&state->state, i + span_len))) {
1332 Text_t slice = Text$slice(text, I(i + 1), I(i + span_len));
1334 while (i < (int64_t)text.length && _has_grapheme(&delim_state, Text$get_grapheme_fast(&state->state, i))) {
1342 Closure_t Text$by_split_any(Text_t text, Text_t delimiters) {
1344 .fn = (void *)next_split_any,
1345 .userdata = new (split_iter_state_t, .state = NEW_TEXT_ITER_STATE(text), .i = 0, .delimiter = delimiters),
1349 PUREFUNC public bool Text$equal_values(Text_t a, Text_t b) {
1350 if (a.length != b.length) return false;
1351 int64_t len = a.length;
1352 TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
1353 // TODO: make this smarter and more efficient
1354 for (int64_t i = 0; i < len; i++) {
1355 int32_t ai = Text$get_grapheme_fast(&a_state, i);
1356 int32_t bi = Text$get_grapheme_fast(&b_state, i);
1357 if (ai != bi) return false;
1362 PUREFUNC public bool Text$equal(const void *a, const void *b, const TypeInfo_t *info) {
1364 if (a == b) return true;
1365 return Text$equal_values(*(Text_t *)a, *(Text_t *)b);
1368 PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b, Text_t language) {
1369 if (a.length != b.length) return false;
1370 int64_t len = a.length;
1371 TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
1372 const char *uc_language = Text$as_c_string(language);
1373 for (int64_t i = 0; i < len; i++) {
1374 int32_t ai = Text$get_grapheme_fast(&a_state, i);
1375 int32_t bi = Text$get_grapheme_fast(&b_state, i);
1377 const ucs4_t *a_codepoints = ai >= 0 ? (ucs4_t *)&ai : GRAPHEME_CODEPOINTS(ai);
1378 int64_t a_len = ai >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(ai);
1380 const ucs4_t *b_codepoints = bi >= 0 ? (ucs4_t *)&bi : GRAPHEME_CODEPOINTS(bi);
1381 int64_t b_len = bi >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(bi);
1384 (void)u32_casecmp(a_codepoints, (size_t)a_len, b_codepoints, (size_t)b_len, uc_language, UNINORM_NFC, &cmp);
1385 if (cmp != 0) return false;
1392 Text_t Text$upper(Text_t text, Text_t language) {
1393 if (text.length == 0) return text;
1394 List_t codepoints = Text$utf32(text);
1395 if (codepoints.stride != sizeof(int32_t)) List$compact(&codepoints, sizeof(int32_t));
1396 const char *uc_language = Text$as_c_string(language);
1397 size_t out_len = 256;
1398 uint32_t buf[out_len];
1399 ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
1400 OptionalText_t ret =
1401 Text$from_utf32((List_t){.data = upper, .length = (uint64_t)out_len, .stride = sizeof(int32_t)});
1402 if (upper != buf) free(upper);
1407 Text_t Text$lower(Text_t text, Text_t language) {
1408 if (text.length == 0) return text;
1409 List_t codepoints = Text$utf32(text);
1410 if (codepoints.stride != sizeof(int32_t)) List$compact(&codepoints, sizeof(int32_t));
1411 const char *uc_language = Text$as_c_string(language);
1412 size_t out_len = 256;
1413 uint32_t buf[out_len];
1414 ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
1415 OptionalText_t ret =
1416 Text$from_utf32((List_t){.data = lower, .length = (uint64_t)out_len, .stride = sizeof(int32_t)});
1417 if (lower != buf) free(lower);
1422 Text_t Text$title(Text_t text, Text_t language) {
1423 if (text.length == 0) return text;
1424 List_t codepoints = Text$utf32(text);
1425 if (codepoints.stride != sizeof(int32_t)) List$compact(&codepoints, sizeof(int32_t));
1426 const char *uc_language = Text$as_c_string(language);
1427 size_t out_len = 256;
1428 uint32_t buf[out_len];
1429 ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
1430 OptionalText_t ret =
1431 Text$from_utf32((List_t){.data = title, .length = (uint64_t)out_len, .stride = sizeof(int32_t)});
1432 if (title != buf) free(title);
1437 double Text$distance(Text_t a, Text_t b, Text_t language) {
1438 if (a.length <= 0) return (double)b.length;
1439 if (b.length <= 0) return (double)a.length;
1441 // The current implementation of text distance uses a modified form
1442 // of Damerau–Levenshtein distance that gives slightly lower distances
1443 // for letters with the same main grapheme and slightly lower distances
1444 // for letters that are the same, but with different casing.
1445 double *distances = GC_MALLOC_ATOMIC(sizeof(uint32_t) * (a.length + 1) * (b.length + 1));
1446 #define DIST(x, y) distances[(x) * b.length + (y)]
1447 for (int64_t i = 0; i <= (int64_t)a.length; i++)
1449 for (int64_t j = 0; j <= (int64_t)b.length; j++)
1452 TextIter_t a_state = NEW_TEXT_ITER_STATE(a);
1453 TextIter_t b_state = NEW_TEXT_ITER_STATE(b);
1454 const char *uc_language = Text$as_c_string(language);
1455 for (int64_t i = 1; i <= (int64_t)a.length; i++) {
1456 for (int64_t j = 1; j <= (int64_t)b.length; j++) {
1457 int32_t ai = Text$get_grapheme_fast(&a_state, i - 1);
1458 int32_t bj = Text$get_grapheme_fast(&b_state, j - 1);
1459 double cost = (double)(ai != bj);
1461 ucs4_t main_ai = (ai) >= 0 ? (ucs4_t)ai : synthetic_graphemes[-(ai)-1].main_codepoint;
1462 ucs4_t main_bj = (bj) >= 0 ? (ucs4_t)bj : synthetic_graphemes[-(bj)-1].main_codepoint;
1463 if (main_ai == main_bj) {
1464 // Same main grapheme (different modifiers)
1468 (void)u32_casecmp(&main_ai, 1, &main_bj, 1, uc_language, UNINORM_NFC, &cmp);
1469 if (cmp == 0) cost = 0.5;
1473 double insertion = 1. + DIST(i - 1, j);
1474 double deletion = 1. + DIST(i, j - 1);
1475 double dist = MIN(insertion, deletion);
1476 double substitution = cost + DIST(i - 1, j - 1);
1477 dist = MIN(dist, substitution);
1478 // Check for transposition:
1479 if (i >= 2 && j >= 2 && cost != 0) {
1480 int32_t ai_prev = Text$get_grapheme_fast(&a_state, i - 2);
1481 int32_t bi_prev = Text$get_grapheme_fast(&b_state, i - 2);
1482 if (ai == bi_prev && bj == ai_prev) {
1483 double transposition = cost + DIST(i - 2, j - 2);
1484 dist = MIN(dist, transposition);
1492 return (double)distances[a.length * b.length + b.length];
1496 Text_t Text$escaped(Text_t text, bool colorize, Text_t extra_escapes) {
1497 Text_t ret = colorize ? Text("\x1b[35m") : EMPTY_TEXT;
1498 #define flush_unquoted() \
1500 if (unquoted_span > 0) { \
1501 ret = concat2_assuming_safe(ret, Text$slice(text, I(i - unquoted_span + 1), I(i))); \
1502 unquoted_span = 0; \
1505 #define add_escaped(str) \
1508 if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m")); \
1509 ret = concat2_assuming_safe(ret, Text("\\" str)); \
1510 if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[0;35m")); \
1512 TextIter_t state = NEW_TEXT_ITER_STATE(text);
1513 int64_t unquoted_span = 0;
1515 for (i = 0; i < (int64_t)text.length; i++) {
1516 int32_t g = Text$get_grapheme_fast(&state, i);
1518 case '\a': add_escaped("a"); break;
1519 case '\b': add_escaped("b"); break;
1520 case '\x1b': add_escaped("e"); break;
1521 case '\f': add_escaped("f"); break;
1522 case '\n': add_escaped("n"); break;
1523 case '\r': add_escaped("r"); break;
1524 case '\t': add_escaped("t"); break;
1525 case '\v': add_escaped("v"); break;
1534 case '\x00' ... '\x06':
1535 case '\x0E' ... '\x1A':
1536 case '\x1C' ... '\x1F':
1537 case '\x7F' ... '\x7F': {
1539 if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m"));
1540 ret = concat2_assuming_safe(ret, Text("\\x"));
1542 (g / 16) > 9 ? 'a' + (g / 16) - 10 : '0' + (g / 16),
1543 (g & 15) > 9 ? 'a' + (g & 15) - 10 : '0' + (g & 15),
1546 ret = concat2_assuming_safe(ret, Text$from_strn(tmp, 2));
1547 if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[0;35m"));
1551 TextIter_t esc_state = NEW_TEXT_ITER_STATE(extra_escapes);
1552 for (int64_t j = 0; j < (int64_t)extra_escapes.length; j++) {
1553 int32_t esc = Text$get_grapheme_fast(&esc_state, j);
1556 if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m"));
1557 ret = concat2_assuming_safe(ret, Text("\\"));
1558 if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[0;35m"));
1569 #undef flush_unquoted
1570 if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[m"));
1575 Text_t Text$quoted(Text_t text, bool colorize, Text_t quotation_mark) {
1576 if (quotation_mark.length != 1) fail("Invalid quote text: ", quotation_mark, " (must have length == 1)");
1578 Text_t ret = Text$escaped(text, colorize, quotation_mark);
1579 if (!(Text$equal_values(quotation_mark, Text("\"")) || Text$equal_values(quotation_mark, Text("'"))
1580 || Text$equal_values(quotation_mark, Text("`"))))
1581 ret = Text$concat(Text("$"), quotation_mark, ret, quotation_mark);
1582 else ret = Text$concat(quotation_mark, ret, quotation_mark);
1587 Text_t Text$as_text(const void *vtext, bool colorize, const TypeInfo_t *info) {
1589 if (!vtext) return info && info->TextInfo.lang ? Text$from_str(info->TextInfo.lang) : Text("Text");
1591 Text_t text = *(Text_t *)vtext;
1592 // Figure out the best quotation mark to use:
1593 bool has_double_quote = false, has_backtick = false, has_single_quote = false, needs_escapes = false;
1594 TextIter_t state = NEW_TEXT_ITER_STATE(text);
1595 for (int64_t i = 0; i < (int64_t)text.length; i++) {
1596 int32_t g = Text$get_grapheme_fast(&state, i);
1598 has_double_quote = true;
1599 } else if (g == '`') {
1600 has_backtick = true;
1601 } else if (g == (g & 0x7F) && (g == '\'' || g == '\n' || g == '\r' || g == '\t' || !isprint((char)g))) {
1602 needs_escapes = true;
1607 // If there's double quotes in the string, it would be nice to avoid
1608 // needing to escape them by using single quotes, but only if we don't have
1609 // single quotes or need to escape anything else (because single quotes
1610 // don't have interpolation):
1611 if (has_double_quote && !has_single_quote) quote = Text("'");
1612 // If there is a double quote, but no backtick, we can save a bit of
1613 // escaping by using backtick instead of double quote:
1614 else if (has_double_quote && has_single_quote && !has_backtick && !needs_escapes) quote = Text("`");
1615 // Otherwise fall back to double quotes as the default quoting style:
1616 else quote = Text("\"");
1618 Text_t as_text = Text$quoted(text, colorize, quote);
1619 if (info && info->TextInfo.lang && info != &Text$info)
1620 as_text = Text$concat(colorize ? Text("\x1b[1m$") : Text("$"), Text$from_str(info->TextInfo.lang),
1621 colorize ? Text("\x1b[0m") : Text(""), as_text);
1626 Text_t Text$join(Text_t glue, List_t pieces) {
1627 if (pieces.length == 0) return EMPTY_TEXT;
1629 Text_t result = *(Text_t *)pieces.data;
1630 for (int64_t i = 1; i < (int64_t)pieces.length; i++) {
1631 result = Text$concat(result, glue, *(Text_t *)(pieces.data + i * pieces.stride));
1637 List_t Text$clusters(Text_t text) {
1638 List_t clusters = EMPTY_LIST;
1639 for (int64_t i = 1; i <= (int64_t)text.length; i++) {
1640 Text_t cluster = Text$slice(text, I(i), I(i));
1641 List$insert(&clusters, &cluster, I_small(0), sizeof(Text_t));
1647 List_t Text$utf8(Text_t text) {
1648 if (text.length == 0) return EMPTY_ATOMIC_LIST;
1649 int64_t capacity = (int64_t)text.length;
1650 Byte_t *buf = GC_MALLOC_ATOMIC(sizeof(Byte_t[capacity]));
1652 u8_buf_append(text, &buf, &capacity, &i);
1654 .data = buf, .length = (uint64_t)i, .stride = 1, .atomic = 1, .free = MIN(LIST_MAX_FREE_ENTRIES, capacity - i)};
1658 List_t Text$utf16(Text_t text) {
1659 if (text.length == 0) return EMPTY_ATOMIC_LIST;
1660 List_t utf32 = Text$utf32(text);
1661 if (utf32.length == 0) return EMPTY_ATOMIC_LIST;
1662 List_t utf16 = {.free = MIN(LIST_MAX_FREE_ENTRIES, (uint64_t)utf32.length), .atomic = 1};
1663 utf16.data = GC_MALLOC_ATOMIC(sizeof(int32_t[utf16.free]));
1664 for (int64_t i = 0; i < (int64_t)utf32.length; i++) {
1665 uint16_t u16_buf[4];
1666 size_t u16_len = sizeof(u16_buf) / sizeof(u16_buf[0]);
1667 uint16_t *chunk_u16 = u32_to_u16(utf32.data + utf32.stride * i, 1, u16_buf, &u16_len);
1668 if (chunk_u16 == NULL) fail("Invalid codepoints encountered!");
1669 List$insert_all(&utf16, (List_t){.data = chunk_u16, .stride = sizeof(uint16_t), .length = (uint64_t)u16_len},
1670 I(0), sizeof(uint16_t));
1671 if (chunk_u16 != u16_buf) free(chunk_u16);
1677 List_t Text$utf32(Text_t text) {
1678 if (text.length == 0) return EMPTY_ATOMIC_LIST;
1679 List_t codepoints = EMPTY_ATOMIC_LIST;
1680 TextIter_t state = NEW_TEXT_ITER_STATE(text);
1681 for (int64_t i = 0; i < (int64_t)text.length; i++) {
1682 int32_t grapheme = Text$get_grapheme_fast(&state, i);
1684 for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) {
1685 ucs4_t subg = GRAPHEME_CODEPOINTS(grapheme)[c];
1686 List$insert(&codepoints, &subg, I_small(0), sizeof(ucs4_t));
1689 List$insert(&codepoints, &grapheme, I_small(0), sizeof(ucs4_t));
1695 static INLINE const char *codepoint_name(ucs4_t c) {
1696 char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
1697 char *found_name = unicode_character_name(c, name);
1698 if (found_name) return found_name;
1699 const uc_block_t *block = uc_block(c);
1700 if (!block) return "???";
1702 return String(block->name, "-", hex(c, .no_prefix = true, .uppercase = true));
1706 List_t Text$codepoint_names(Text_t text) {
1707 List_t names = EMPTY_LIST;
1708 TextIter_t state = NEW_TEXT_ITER_STATE(text);
1709 for (int64_t i = 0; i < (int64_t)text.length; i++) {
1710 int32_t grapheme = Text$get_grapheme_fast(&state, i);
1712 for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) {
1713 const char *name = codepoint_name(GRAPHEME_CODEPOINTS(grapheme)[c]);
1714 Text_t name_text = Text$from_str(name);
1715 List$insert(&names, &name_text, I_small(0), sizeof(Text_t));
1718 const char *name = codepoint_name((ucs4_t)grapheme);
1719 Text_t name_text = Text$from_str(name);
1720 List$insert(&names, &name_text, I_small(0), sizeof(Text_t));
1727 OptionalText_t Text$from_utf8(List_t units) {
1728 if (units.length == 0) return EMPTY_TEXT;
1729 if (units.stride != sizeof(uint8_t)) List$compact(&units, sizeof(uint8_t));
1730 return Text$from_strn(units.data, (size_t)units.length);
1734 OptionalText_t Text$from_utf16(List_t units) {
1735 if (units.length == 0) return EMPTY_TEXT;
1736 if (units.stride != sizeof(int16_t)) List$compact(&units, sizeof(int16_t));
1738 size_t length = 256;
1739 uint8_t buf[length];
1740 uint8_t *u8 = u16_to_u8(units.data, (size_t)units.length, buf, &length);
1742 Text$from_utf8((List_t){.data = u8, .length = (uint64_t)length, .stride = sizeof(uint8_t), .atomic = 1});
1743 if (u8 != buf) free(u8);
1748 OptionalText_t Text$from_utf32(List_t codepoints) {
1749 if (codepoints.length == 0) return EMPTY_TEXT;
1750 if (codepoints.stride != sizeof(uint32_t)) List$compact(&codepoints, sizeof(uint32_t));
1752 List_t graphemes = EMPTY_ATOMIC_LIST;
1753 Table_t unique_clusters = EMPTY_TABLE;
1754 const uint32_t *pos = (const uint32_t *)codepoints.data;
1755 const uint32_t *end = (const uint32_t *)&pos[codepoints.length];
1756 // Iterate over grapheme clusters
1757 for (const uint32_t *next; (next = u32_grapheme_next(pos, end)); pos = next) {
1758 // Buffer for normalized cluster:
1760 size_t u32_normlen = sizeof(buf) / sizeof(buf[0]);
1761 if (u32_check(pos, (size_t)(next - pos)) != NULL) return NONE_TEXT;
1762 uint32_t *u32s_normalized = u32_normalize(UNINORM_NFC, pos, (size_t)(next - pos), buf, &u32_normlen);
1764 int32_t g = get_synthetic_grapheme(u32s_normalized, (int64_t)u32_normlen);
1765 List$insert(&graphemes, &g, I(0), sizeof(int32_t));
1766 Table$get_or_setdefault(&unique_clusters, int32_t, uint8_t, g, (uint8_t)unique_clusters.entries.length,
1767 Table$info(&Int32$info, &Byte$info));
1769 if (u32s_normalized != buf) free(u32s_normalized);
1771 if (unique_clusters.entries.length == 256) {
1772 List_t remaining_codepoints = {
1773 .length = (uint64_t)(end - next),
1774 .data = (void *)next,
1775 .stride = sizeof(int32_t),
1777 OptionalText_t remainder = Text$from_utf32(remaining_codepoints);
1778 if (remainder.tag == TEXT_NONE) return NONE_TEXT;
1779 return concat2_assuming_safe(Text$from_components(graphemes, unique_clusters), remainder);
1782 return Text$from_components(graphemes, unique_clusters);
1786 OptionalText_t Text$from_codepoint_names(List_t codepoint_names) {
1787 List_t codepoints = EMPTY_LIST;
1788 for (int64_t i = 0; i < (int64_t)codepoint_names.length; i++) {
1789 Text_t *name = ((Text_t *)(codepoint_names.data + i * codepoint_names.stride));
1790 const char *name_str = Text$as_c_string(*name);
1791 ucs4_t codepoint = unicode_name_character(name_str);
1792 if (codepoint == UNINAME_INVALID) return NONE_TEXT;
1793 List$insert(&codepoints, &codepoint, I_small(0), sizeof(ucs4_t));
1795 return Text$from_utf32(codepoints);
1799 List_t Text$lines(Text_t text) {
1800 List_t lines = EMPTY_LIST;
1801 TextIter_t state = NEW_TEXT_ITER_STATE(text);
1802 for (int64_t i = 0, line_start = 0; i < (int64_t)text.length; i++) {
1803 int32_t grapheme = Text$get_grapheme_fast(&state, i);
1804 if (grapheme == '\r' && Text$get_grapheme_fast(&state, i + 1) == '\n') { // CRLF
1805 Text_t line = Text$slice(text, I(line_start + 1), I(i));
1806 List$insert(&lines, &line, I_small(0), sizeof(Text_t));
1807 i += 1; // skip one extra for CR
1809 } else if (grapheme == '\n') { // newline
1810 Text_t line = Text$slice(text, I(line_start + 1), I(i));
1811 List$insert(&lines, &line, I_small(0), sizeof(Text_t));
1813 } else if (i == text.length - 1 && line_start != i) { // last line
1814 Text_t line = Text$slice(text, I(line_start + 1), I(i + 1));
1815 List$insert(&lines, &line, I_small(0), sizeof(Text_t));
1824 } line_iter_state_t;
1826 static OptionalText_t next_line(line_iter_state_t *state) {
1827 Text_t text = state->state.stack[0].text;
1828 for (int64_t i = state->i; i < (int64_t)text.length; i++) {
1829 int32_t grapheme = Text$get_grapheme_fast(&state->state, i);
1830 if (grapheme == '\r' && Text$get_grapheme_fast(&state->state, i + 1) == '\n') { // CRLF
1831 Text_t line = Text$slice(text, I(state->i + 1), I(i));
1832 state->i = i + 2; // skip one extra for CR
1834 } else if (grapheme == '\n') { // newline
1835 Text_t line = Text$slice(text, I(state->i + 1), I(i));
1838 } else if (i == text.length - 1 && state->i != i) { // last line
1839 Text_t line = Text$slice(text, I(state->i + 1), I(i + 1));
1848 Closure_t Text$by_line(Text_t text) {
1850 .fn = (void *)next_line,
1851 .userdata = new (line_iter_state_t, .state = NEW_TEXT_ITER_STATE(text), .i = 0),
1855 PUREFUNC public bool Text$is_none(const void *t, const TypeInfo_t *info) {
1857 return ((Text_t *)t)->tag == TEXT_NONE;
1861 Int_t Text$memory_size(Text_t text) {
1863 case TEXT_ASCII: return Int$from_int64((int64_t)sizeof(Text_t) + (int64_t)sizeof(char[text.length]));
1864 case TEXT_GRAPHEMES: return Int$from_int64((int64_t)sizeof(Text_t) + (int64_t)sizeof(int32_t[text.length]));
1866 return Int$from_int64((int64_t)sizeof(Text_t) + (int64_t)((void *)text.blob.bytes - (void *)text.blob.map)
1867 + (int64_t)sizeof(uint8_t[text.length]));
1869 return Int$plus(Int$from_int64((int64_t)sizeof(Text_t)),
1870 Int$plus(Text$memory_size(*text.left), Text$memory_size(*text.right)));
1871 default: errx(1, "Invalid text tag: %d", text.tag);
1876 Text_t Text$layout(Text_t text) {
1878 case TEXT_ASCII: return Text$concat(Text("ASCII("), Int64$value_as_text(text.length), Text(")"));
1879 case TEXT_GRAPHEMES: return Text$concat(Text("Graphemes("), Int64$value_as_text(text.length), Text(")"));
1880 case TEXT_BLOB: return Text$concat(Text("Blob("), Int64$value_as_text(text.length), Text(")"));
1882 return Text$concat(Text("Concat("), Text$layout(*text.left), Text(", "), Text$layout(*text.right), Text(")"));
1883 default: errx(1, "Invalid text tag: %d", text.tag);
1888 void Text$serialize(const void *obj, FILE *out, Table_t *pointers, const TypeInfo_t *info) {
1890 const char *str = Text$as_c_string(*(Text_t *)obj);
1891 int64_t len = (int64_t)strlen(str);
1892 Int64$serialize(&len, out, pointers, &Int64$info);
1893 fwrite(str, sizeof(char), (size_t)len, out);
1897 void Text$deserialize(FILE *in, void *out, List_t *pointers, const TypeInfo_t *info) {
1900 Int64$deserialize(in, &len, pointers, &Int64$info);
1901 if (len < 0) fail("Cannot deserialize text with a negative length!");
1902 char *buf = GC_MALLOC_ATOMIC((size_t)len + 1);
1903 if (fread(buf, sizeof(char), (size_t)len, in) != (size_t)len) fail("Not enough data in stream to deserialize");
1904 buf[len + 1] = '\0';
1905 *(Text_t *)out = Text$from_strn(buf, (size_t)len);
1909 const TypeInfo_t Text$info = {
1910 .size = sizeof(Text_t),
1911 .align = __alignof__(Text_t),
1913 .TextInfo = {.lang = "Text"},
1914 .metamethods = Text$metamethods,