code / tomo

Lines41.3K C23.7K Markdown9.7K YAML5.0K Tomo2.3K
7 others 763
Python231 Shell230 make212 INI47 Text21 SVG16 Lua6
(1.9K lines)
1 // This file defines type info and methods for the Text datatype, which uses
2 // libunistr for Unicode support and implements a datastructure based on a
3 // hybrid of Raku/MoarVM's space-efficient grapheme cluster representation of
4 // strings, combined with a mostly-balanced tree datastructure based on Cords
5 // (Boehm et al), which have good runtime performance for text constructed by a
6 // series of many concatenations. In practice, this means Tomo's Text has an
7 // extremely compact memory footprint (typically as compact as UTF8 or
8 // up to 2x better for some languages), with extremely fast operations
9 // including concatenation, random indexing, and taking substrings.
11 // For more information on MoarVM's grapheme cluster strings, see:
12 // https://docs.raku.org/language/unicode
13 // https://github.com/MoarVM/MoarVM/blob/main/docs/strings.asciidoc
14 // For more information on Cords, see the paper "Ropes: an Alternative to
15 // Strings" (Boehm, Atkinson, Plass 1995):
16 // https://www.cs.tufts.edu/comp/150FP/archive/hans-boehm/ropes.pdf
18 // Tomo's Text datatype represents Unicode text that is fully normalized using
19 // normalization form C (NFC). This means that all text created from source code
20 // or read in at runtime will respect normalization during comparison and other
21 // operations, and the original (potentially non-canonical) representation of
22 // text is not preserved. This also means that byte sequences that do not
23 // represent valid unicode text cannot be interpreted as the Text datatype. For
24 // example, a file with malformed UTF8 sequences cannot be read as Text.
26 // A note on grapheme clusters: In Unicode, the fundamental unit is the
27 // "codepoint", which represents things like letters, symbols, emojis,
28 // combiners, and modifiers that alter other codepoints. However, most people
29 // have an intuitive notion of what a "letter" is that corresponds to the
30 // concept formally known as a grapheme cluster. A grapheme cluster is roughly
31 // speaking the amount of text that your cursor moves over when you press the
32 // left or right arrow key once. This often corresponds to a single codepoint,
33 // but some codepoints act as modifiers on other codepoints. For example, U+0301
34 // (COMBINING ACUTE ACCENT) can modify a letter like "e" to form "é". During
35 // normalization, this frequently resolves down to a single unicode codepoint,
36 // in this case, "é" resolves to the single codepoint U+00E9 (LATIN SMALL LETTER
37 // E WITH ACUTE). However, in some cases, multiple codepoints make up a grapheme
38 // cluster but *don't* normalize to a single codepoint. For example, LATIN SMALL
39 // LETTER E (U+0065) + COMBINING VERTICAL LINE BELOW (U+0329) combine to form an
40 // unusual glyph that is not used frequently enough to warrant its own unique
41 // codepoint (this is basically what Zalgo text is). Emojis also use the ZERO
42 // WIDTH JOINER (U+200D) to add gender, skin tone, or other modifiers to emojis.
43 // Tomo takes an opinionated stance that grapheme clusters, not codepoints or
44 // bytes, are more useful to people when doing text operations like getting the
45 // "length" of a text or accessing the Nth "letter" of a text. If someone sends
46 // you a text with WOMAN (U+1F469) + ZERO WIDTH JOINER (U+200D) + ROCKET
47 // (U+1F680) followed by THUMBS UP (U+1F44D), it will render on your screen as
48 // two things: a female astronaut and a thumbs up, and this is how most people
49 // will think about the text. If you wish to operate on the raw codepoints that
50 // comprise the message, you are free to do so with the `.utf32()`
51 // method and `Text.from_utf32()`, but this is not the default behavior.
52 // The behavior for the given example is that `text.length == 2`, `text[1]` is
53 // the grapheme cluster representing a female astronaut emoji, and `text[2]` is
54 // the grapheme cluster representing the thumbs up emoji.
56 // There are a lot of benefits to storing unicode text with one grapheme
57 // cluster per index in a densely packed list instead of storing the text as
58 // variable-width UTF8-encoded bytes. It lets us have one canonical length for
59 // the text that can be precomputed and is meaningful to users. It lets us
60 // quickly get the Nth "letter" in the text. Substring slicing is fast.
61 // However, since not all grapheme clusters take up the same number of
62 // codepoints, we're faced with the problem of how to jam multiple codepoints
63 // into a single 32-bit slot. Inspired by Raku and MoarVM's approach, this
64 // implementation uses "synthetic graphemes" (in Raku's terms, Normal Form
65 // Graphemes, aka NFG). A synthetic grapheme is a negative 32-bit signed
66 // integer that represents a multi-codepoint grapheme cluster that has been
67 // encountered during the program's runtime. These clusters are stored in a
68 // lookup list and hash map so that we can rapidly convert between the
69 // synthetic grapheme integer ID and the unicode codepoints associated with it.
70 // Essentially, it's like we create a supplement to the unicode standard with
71 // things that would be nice if they had their own codepoint so things worked
72 // out nicely because we're using them right now, and we'll give them a
73 // negative number so it doesn't overlap with any real codepoints.
75 // Example 1: U+0048, U+00E9 AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E
76 // WITH ACUTE This would be stored as: (int32_t[]){0x48, 0xE9} Example 2:
77 // U+0048, U+0065, U+0309 AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E,
78 // COMBINING VERTICAL LINE BELOW This would be stored as: (int32_t[]){0x48, -2}
79 // Where -2 is used as a lookup in a list that holds the actual unicode
80 // codepoints: (ucs4_t[]){0x65, 0x0309}
82 // The text datastructure also uses a compact encoding (TEXT_BLOB) to store a
83 // per-chunk compressed form of the text when long stretches of text contain
84 // 256 or fewer unique grapheme clusters, which lets the text use a single byte
85 // for each grapheme cluster along with a lookup table. For typical text
86 // written in a variety of non-English natural languages (e.g. Spanish, Arabic,
87 // Japanese, Greek, German, Finnish, Basque), the in-memory representation
88 // takes up between 50-101% as much space as UTF8 encoding and between 24-39%
89 // as much space as UTF32 encoding, but with the advantage of extremely fast
90 // random access for indexing or slicing, unlike UTF8. In other words, this
91 // representation offers ASCII-like compactness and fast random access for
92 // non-English languages.
94 #include <assert.h>
95 #include <ctype.h>
96 #include <fnmatch.h>
97 #include <gc.h>
98 #include <stdbool.h>
99 #include <stdint.h>
100 #include <stdlib.h>
101 #include <sys/param.h>
103 #include "../unistr-fixed.h"
105 #include <unicase.h>
106 #include <unictype.h>
107 #include <unigbrk.h>
108 #include <uniname.h>
109 #include <unistring/version.h>
110 #include <uniwidth.h>
112 #include "bytes.h"
113 #include "datatypes.h"
114 #include "integers.h"
115 #include "lists.h"
116 #include "optionals.h"
117 #include "tables.h"
118 #include "text.h"
120 // Use inline version of the siphash code for performance:
121 #include "siphash-internals.h"
122 #include "siphash.h"
124 typedef struct {
125 ucs4_t main_codepoint;
126 ucs4_t *utf32_cluster; // length-prefixed
127 const uint8_t *utf8;
128 } synthetic_grapheme_t;
130 // Synthetic grapheme clusters (clusters of more than one codepoint):
131 static Table_t grapheme_ids_by_codepoints = EMPTY_TABLE; // ucs4_t* length-prefixed codepoints -> int32_t ID
133 // This will hold a dynamically growing list of synthetic graphemes:
134 static synthetic_grapheme_t *synthetic_graphemes = NULL;
135 static int32_t synthetic_grapheme_capacity = 0;
136 static int32_t num_synthetic_graphemes = 0;
138 #define NUM_GRAPHEME_CODEPOINTS(id) (synthetic_graphemes[-(id) - 1].utf32_cluster[0])
139 #define GRAPHEME_CODEPOINTS(id) (&synthetic_graphemes[-(id) - 1].utf32_cluster[1])
140 #define GRAPHEME_UTF8(id) (synthetic_graphemes[-(id) - 1].utf8)
142 // Somewhat arbitrarily chosen, if two short literal ASCII or grapheme chunks
143 // are concatenated below this length threshold, we just merge them into a
144 // single literal node instead of a concatenation node.
145 #define SHORT_ASCII_LENGTH 64
146 #define SHORT_GRAPHEMES_LENGTH 16
148 static Text_t simple_concatenation(Text_t a, Text_t b);
150 PUREFUNC static bool graphemes_equal(const void *va, const void *vb, const TypeInfo_t *info) {
151 (void)info;
152 ucs4_t *a = *(ucs4_t **)va;
153 ucs4_t *b = *(ucs4_t **)vb;
154 if (a[0] != b[0]) return false;
155 for (int i = 0; i < (int)a[0]; i++)
156 if (a[i] != b[i]) return false;
157 return true;
160 PUREFUNC static uint64_t grapheme_hash(const void *g, const TypeInfo_t *info) {
161 (void)info;
162 ucs4_t *cluster = *(ucs4_t **)g;
163 return siphash24((void *)&cluster[1], sizeof(ucs4_t[cluster[0]]));
166 static const TypeInfo_t GraphemeClusterInfo = {
167 .size = sizeof(ucs4_t *),
168 .align = __alignof__(ucs4_t *),
169 .metamethods =
171 .equal = graphemes_equal,
172 .hash = grapheme_hash,
176 #ifdef __GNUC__
177 #pragma GCC diagnostic push
178 #pragma GCC diagnostic ignored "-Wstack-protector"
179 #endif
180 public
181 int32_t get_synthetic_grapheme(const ucs4_t *codepoints, int64_t utf32_len) {
182 if (utf32_len == 1) return (int32_t)*codepoints;
184 ucs4_t length_prefixed[1 + utf32_len];
185 length_prefixed[0] = (ucs4_t)utf32_len;
186 for (int i = 0; i < utf32_len; i++)
187 length_prefixed[i + 1] = codepoints[i];
188 ucs4_t *ptr = &length_prefixed[0];
190 // Optimization for common case of one frequently used synthetic grapheme:
191 static int32_t last_grapheme = 0;
192 if (last_grapheme != 0 && graphemes_equal(&ptr, &synthetic_graphemes[-last_grapheme - 1].utf32_cluster, NULL))
193 return last_grapheme;
195 TypeInfo_t GraphemeIDLookupTableInfo = *Table$info(&GraphemeClusterInfo, &Int32$info);
196 int32_t *found = Table$get(grapheme_ids_by_codepoints, &ptr, &GraphemeIDLookupTableInfo);
197 if (found) return *found;
199 // New synthetic grapheme:
200 if (num_synthetic_graphemes >= synthetic_grapheme_capacity) {
201 // If we don't have space, allocate more:
202 synthetic_grapheme_capacity = MAX(128, synthetic_grapheme_capacity * 2);
203 synthetic_grapheme_t *new = GC_MALLOC_ATOMIC(sizeof(synthetic_grapheme_t[synthetic_grapheme_capacity]));
204 memcpy(new, synthetic_graphemes, sizeof(synthetic_grapheme_t[num_synthetic_graphemes]));
205 synthetic_graphemes = new;
208 int32_t grapheme_id = -(num_synthetic_graphemes + 1);
209 num_synthetic_graphemes += 1;
211 // Get UTF8 representation:
212 uint8_t u8_buf[64];
213 size_t u8_len = sizeof(u8_buf) / sizeof(u8_buf[0]);
214 uint8_t *u8 = u32_to_u8(codepoints, (size_t)utf32_len, u8_buf, &u8_len);
215 if (u8 == NULL) fail("Invalid graphemes encountered!");
217 // For performance reasons, use an arena allocator here to ensure that
218 // synthetic graphemes store all of their information in a densely packed
219 // area with good cache locality:
220 static void *arena = NULL, *arena_end = NULL;
221 if (arena != NULL && (size_t)arena % __alignof__(ucs4_t) != 0)
222 arena += __alignof__(ucs4_t) - ((size_t)arena % __alignof__(ucs4_t));
224 // If we have filled up this arena, allocate a new one:
225 size_t needed_memory = sizeof(ucs4_t[1 + utf32_len]) + sizeof(uint8_t[u8_len + 1]);
226 if (arena == NULL || arena + needed_memory > arena_end) {
227 // Do reasonably big chunks at a time, so most synthetic codepoints are
228 // nearby each other in memory and cache locality is good. This is a
229 // rough guess at a good size:
230 size_t chunk_size = MAX(needed_memory, 512);
231 arena = GC_MALLOC_ATOMIC(chunk_size);
232 assert(arena != NULL);
233 arena_end = arena + chunk_size;
236 // Copy length-prefixed UTF32 codepoints into the arena and store where they live:
237 ucs4_t *codepoint_copy = arena;
238 memcpy(codepoint_copy, length_prefixed, sizeof(ucs4_t[1 + utf32_len]));
239 synthetic_graphemes[-grapheme_id - 1].utf32_cluster = codepoint_copy;
240 arena += sizeof(ucs4_t[1 + utf32_len]);
242 // Copy UTF8 units into the arena and store where they live:
243 uint8_t *utf8_final = arena;
244 memcpy(utf8_final, u8, sizeof(uint8_t[u8_len]));
245 utf8_final[u8_len] = '\0'; // Add a terminating NUL byte
246 synthetic_graphemes[-grapheme_id - 1].utf8 = utf8_final;
247 arena += sizeof(uint8_t[u8_len + 1]);
249 // Sickos at the unicode consortium decreed that you can have grapheme clusters
250 // that begin with *prefix* modifiers, so we gotta check for that case:
251 synthetic_graphemes[-grapheme_id - 1].main_codepoint = length_prefixed[1];
252 for (ucs4_t i = 0; i < utf32_len; i++) {
253 #if _LIBUNISTRING_VERSION >= 0x010200
254 // libuinstring version 1.2.0 introduced uc_is_property_prepended_concatenation_mark()
255 // It's not critical, but it's technically more correct to have this check:
256 if (unlikely(uc_is_property_prepended_concatenation_mark(length_prefixed[1 + i]))) continue;
257 #endif
258 synthetic_graphemes[-grapheme_id - 1].main_codepoint = length_prefixed[1 + i];
259 break;
262 // Cleanup from unicode API:
263 if (u8 != u8_buf) free(u8);
265 Table$set(&grapheme_ids_by_codepoints, &codepoint_copy, &grapheme_id, &GraphemeIDLookupTableInfo);
267 last_grapheme = grapheme_id;
268 return grapheme_id;
270 #ifdef __GNUC__
271 #pragma GCC diagnostic pop
272 #endif
274 public
275 int Text$print(FILE *stream, Text_t t) {
276 if (t.length == 0) return 0;
278 switch (t.tag) {
279 case TEXT_ASCII: return fwrite(t.ascii, sizeof(char), (size_t)t.length, stream);
280 case TEXT_GRAPHEMES: {
281 const int32_t *graphemes = t.graphemes;
282 int written = 0;
283 for (int64_t i = 0; i < (int64_t)t.length; i++) {
284 int32_t grapheme = graphemes[i];
285 if (grapheme >= 0) {
286 uint8_t buf[8];
287 size_t len = sizeof(buf);
288 uint8_t *u8 = u32_to_u8((ucs4_t *)&grapheme, 1, buf, &len);
289 if (u8 == NULL) fail("Invalid grapheme encountered: ", grapheme);
290 written += (int)fwrite(u8, sizeof(char), len, stream);
291 if (u8 != buf) free(u8);
292 } else {
293 const uint8_t *u8 = GRAPHEME_UTF8(grapheme);
294 assert(u8);
295 written += (int)fwrite(u8, sizeof(uint8_t), strlen((char *)u8), stream);
298 return written;
300 case TEXT_BLOB: {
301 int written = 0;
302 for (int64_t i = 0; i < (int64_t)t.length; i++) {
303 int32_t grapheme = t.blob.map[t.blob.bytes[i]];
304 if (grapheme >= 0) {
305 uint8_t buf[8];
306 size_t len = sizeof(buf);
307 uint8_t *u8 = u32_to_u8((ucs4_t *)&grapheme, 1, buf, &len);
308 if (u8 == NULL) fail("Invalid grapheme encountered: ", grapheme);
309 written += (int)fwrite(u8, sizeof(char), len, stream);
310 if (u8 != buf) free(u8);
311 } else {
312 const uint8_t *u8 = GRAPHEME_UTF8(grapheme);
313 assert(u8);
314 written += (int)fwrite(u8, sizeof(uint8_t), strlen((char *)u8), stream);
317 return written;
319 case TEXT_CONCAT: {
320 int written = Text$print(stream, *t.left);
321 written += Text$print(stream, *t.right);
322 return written;
324 default: return 0;
328 static const uint64_t min_len_for_depth[MAX_TEXT_DEPTH] = {
329 // Fibonacci numbers (skipping first two)
330 1, 2, 3, 5, 8, 13, 21, 34, 55, 89,
331 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946,
332 17711, 28657, 46368, 75025, 121393, 196418, 317811, 514229, 832040, 1346269,
333 2178309, 3524578, 5702887, 9227465, 14930352, 24157817, 39088169, 63245986, 102334155, 165580141,
334 267914296, 433494437, 701408733, 1134903170, 1836311903, 2971215073, 4807526976, 7778742049,
337 #define IS_BALANCED_TEXT(t) ((t).length >= min_len_for_depth[(t).depth])
339 static void insert_balanced_recursive(Text_t balanced_texts[MAX_TEXT_DEPTH], Text_t text) {
340 if (text.tag == TEXT_CONCAT && (!IS_BALANCED_TEXT(text) || text.depth >= MAX_TEXT_DEPTH)) {
341 insert_balanced_recursive(balanced_texts, *text.left);
342 insert_balanced_recursive(balanced_texts, *text.right);
343 return;
346 int i = 0;
347 Text_t accumulator = EMPTY_TEXT;
348 for (; text.length > min_len_for_depth[i + 1]; i++) {
349 if (balanced_texts[i].length) {
350 accumulator = simple_concatenation(balanced_texts[i], accumulator);
351 balanced_texts[i] = EMPTY_TEXT;
355 accumulator = simple_concatenation(accumulator, text);
357 while (accumulator.length >= min_len_for_depth[i]) {
358 if (balanced_texts[i].length) {
359 accumulator = simple_concatenation(balanced_texts[i], accumulator);
360 balanced_texts[i] = EMPTY_TEXT;
362 i++;
364 i--;
365 balanced_texts[i] = accumulator;
368 static Text_t rebalanced(Text_t a, Text_t b) {
369 Text_t balanced_texts[MAX_TEXT_DEPTH];
370 memset(balanced_texts, 0, sizeof(balanced_texts));
371 insert_balanced_recursive(balanced_texts, a);
372 insert_balanced_recursive(balanced_texts, b);
374 Text_t ret = EMPTY_TEXT;
375 for (int64_t i = 0; ret.length < a.length + b.length; i++) {
376 if (balanced_texts[i].length) ret = simple_concatenation(balanced_texts[i], ret);
378 return ret;
381 Text_t simple_concatenation(Text_t a, Text_t b) {
382 if (a.length == 0) return b;
383 if (b.length == 0) return a;
385 uint16_t new_depth = 1 + MAX(a.depth, b.depth);
386 // Rebalance only if depth exceeds the maximum allowed. We don't require
387 // every concatenation to yield a balanced text, since many concatenations
388 // are ephemeral (e.g. doing a loop repeatedly concatenating without using
389 // the intermediary values).
390 if (new_depth >= MAX_TEXT_DEPTH) return rebalanced(a, b);
392 Text_t *children = GC_MALLOC(sizeof(Text_t[2]));
393 children[0] = a;
394 children[1] = b;
395 return (Text_t){
396 .tag = TEXT_CONCAT,
397 .length = a.length + b.length,
398 .depth = new_depth,
399 .left = &children[0],
400 .right = &children[1],
404 static Text_t concat2_assuming_safe(Text_t a, Text_t b) {
405 if (a.length == 0) return b;
406 if (b.length == 0) return a;
408 if (a.tag == TEXT_ASCII && b.tag == TEXT_ASCII && (size_t)(a.length + b.length) <= SHORT_ASCII_LENGTH) {
409 struct Text_s ret = {
410 .tag = TEXT_ASCII,
411 .length = a.length + b.length,
413 ret.ascii = GC_MALLOC_ATOMIC(sizeof(char[ret.length]));
414 memcpy((char *)ret.ascii, a.ascii, sizeof(char[a.length]));
415 memcpy((char *)&ret.ascii[a.length], b.ascii, sizeof(char[b.length]));
416 return ret;
417 } else if (a.tag == TEXT_GRAPHEMES && b.tag == TEXT_GRAPHEMES
418 && (size_t)(a.length + b.length) <= SHORT_GRAPHEMES_LENGTH) {
419 struct Text_s ret = {
420 .tag = TEXT_GRAPHEMES,
421 .length = a.length + b.length,
423 ret.graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[ret.length]));
424 memcpy((int32_t *)ret.graphemes, a.graphemes, sizeof(int32_t[a.length]));
425 memcpy((int32_t *)&ret.graphemes[a.length], b.graphemes, sizeof(int32_t[b.length]));
426 return ret;
427 } else if (a.tag != TEXT_CONCAT && b.tag != TEXT_CONCAT
428 && (size_t)(a.length + b.length) <= SHORT_GRAPHEMES_LENGTH) {
429 // Turn a small bit of ASCII into graphemes if it helps make things smaller
430 // Text structs come with an extra 8 bytes, so allocate enough to hold the text
431 struct Text_s ret = {
432 .tag = TEXT_GRAPHEMES,
433 .length = a.length + b.length,
435 ret.graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[ret.length]));
436 int32_t *dest = (int32_t *)ret.graphemes;
437 if (a.tag == TEXT_GRAPHEMES) {
438 memcpy(dest, a.graphemes, sizeof(int32_t[a.length]));
439 dest += a.length;
440 } else if (a.tag == TEXT_ASCII) {
441 for (int64_t i = 0; i < (int64_t)a.length; i++)
442 *(dest++) = (int32_t)a.ascii[i];
443 } else if (a.tag == TEXT_BLOB) {
444 for (int64_t i = 0; i < (int64_t)a.length; i++)
445 *(dest++) = a.blob.map[a.blob.bytes[i]];
446 } else {
447 errx(1, "Unreachable");
449 if (b.tag == TEXT_GRAPHEMES) {
450 memcpy(dest, b.graphemes, sizeof(int32_t[b.length]));
451 } else if (b.tag == TEXT_ASCII) {
452 for (int64_t i = 0; i < (int64_t)b.length; i++)
453 *(dest++) = (int32_t)b.ascii[i];
454 } else if (b.tag == TEXT_BLOB) {
455 for (int64_t i = 0; i < (int64_t)b.length; i++)
456 *(dest++) = b.blob.map[b.blob.bytes[i]];
457 } else {
458 errx(1, "Unreachable");
460 return ret;
463 if (a.tag == TEXT_CONCAT && b.tag != TEXT_CONCAT && a.right->tag != TEXT_CONCAT)
464 return concat2_assuming_safe(*a.left, concat2_assuming_safe(*a.right, b));
466 return simple_concatenation(a, b);
469 static Text_t concat2(Text_t a, Text_t b) {
470 if (a.length == 0) return b;
471 if (b.length == 0) return a;
473 int32_t last_a = Text$get_grapheme(a, a.length - 1);
474 int32_t first_b = Text$get_grapheme(b, 0);
476 // Magic number, we know that no codepoints below here trigger instability:
477 static const int32_t LOWEST_CODEPOINT_TO_CHECK = 0x300; // COMBINING GRAVE ACCENT
478 if (last_a >= 0 && last_a < LOWEST_CODEPOINT_TO_CHECK && first_b >= 0 && first_b < LOWEST_CODEPOINT_TO_CHECK)
479 return concat2_assuming_safe(a, b);
481 size_t len = (last_a >= 0) ? 1 : NUM_GRAPHEME_CODEPOINTS(last_a);
482 len += (first_b >= 0) ? 1 : NUM_GRAPHEME_CODEPOINTS(first_b);
484 ucs4_t codepoints[len];
485 ucs4_t *dest = codepoints;
486 if (last_a < 0) {
487 memcpy(dest, GRAPHEME_CODEPOINTS(last_a), sizeof(ucs4_t[NUM_GRAPHEME_CODEPOINTS(last_a)]));
488 dest += NUM_GRAPHEME_CODEPOINTS(last_a);
489 } else {
490 *(dest++) = (ucs4_t)last_a;
493 if (first_b < 0) {
494 memcpy(dest, GRAPHEME_CODEPOINTS(first_b), sizeof(ucs4_t[NUM_GRAPHEME_CODEPOINTS(first_b)]));
495 } else {
496 *dest = (ucs4_t)first_b;
499 // Do a normalization run for these two codepoints and see if it looks different.
500 // Normalization should not exceed 3x in the input length (but if it does, it will be
501 // handled gracefully)
502 ucs4_t norm_buf[3 * len];
503 size_t norm_length = sizeof(norm_buf) / sizeof(norm_buf[0]);
504 ucs4_t *normalized = u32_normalize(UNINORM_NFC, codepoints, len, norm_buf, &norm_length);
505 bool stable = (norm_length == len && memcmp(codepoints, normalized, sizeof(codepoints)) == 0);
507 if (stable) {
508 const void *second_grapheme = u32_grapheme_next(normalized, &normalized[norm_length]);
509 if (second_grapheme == &normalized[norm_length]) stable = false;
512 if likely (stable) {
513 if (normalized != norm_buf) free(normalized);
514 return concat2_assuming_safe(a, b);
517 OptionalText_t glue =
518 Text$from_utf32((List_t){.data = normalized, .length = (uint64_t)norm_length, .stride = sizeof(ucs4_t)});
520 if (normalized != norm_buf) free(normalized);
522 if (a.length == 1 && b.length == 1) return glue;
523 else if (a.length == 1) return concat2_assuming_safe(glue, Text$slice(b, I(2), I(b.length)));
524 else if (b.length == 1) return concat2_assuming_safe(Text$slice(a, I(1), I(a.length - 1)), glue);
525 else
526 return concat2_assuming_safe(concat2_assuming_safe(Text$slice(a, I(1), I(a.length - 1)), glue),
527 Text$slice(b, I(2), I(b.length)));
530 public
531 Text_t Text$_concat(int n, Text_t items[n]) {
532 if (n == 0) return EMPTY_TEXT;
534 Text_t ret = items[0];
535 for (int i = 1; i < n; i++) {
536 if (items[i].length > 0) ret = concat2(ret, items[i]);
538 return ret;
541 public
542 Text_t Text$repeat(Text_t text, Int_t count) {
543 if (text.length == 0 || Int$is_negative(count)) return EMPTY_TEXT;
545 Int_t result_len = Int$times(count, I(text.length));
546 if (Int$compare_value(result_len, I(1l << 40)) > 0) fail("Text repeating would produce too big of an result!");
548 int64_t count64 = Int64$from_int(count, false);
549 Text_t ret = text;
550 for (int64_t c = 1; c < count64; c++)
551 ret = concat2(ret, text);
552 return ret;
555 public
556 Int_t Text$width(Text_t text, Text_t language) {
557 int width = u8_strwidth((const uint8_t *)Text$as_c_string(text), Text$as_c_string(language));
558 return Int$from_int32(width);
561 static Text_t Text$repeat_to_width(Text_t to_repeat, int64_t target_width, Text_t language) {
562 if (target_width <= 0) return EMPTY_TEXT;
564 const char *lang_str = Text$as_c_string(language);
565 int64_t width = (int64_t)u8_strwidth((const uint8_t *)Text$as_c_string(to_repeat), lang_str);
566 Text_t repeated = EMPTY_TEXT;
567 int64_t repeated_width = 0;
568 while (repeated_width + width <= target_width) {
569 repeated = concat2(repeated, to_repeat);
570 repeated_width += width;
573 if (repeated_width < target_width) {
574 for (int64_t i = 0; repeated_width < target_width && i < (int64_t)to_repeat.length; i++) {
575 Text_t c = Text$slice(to_repeat, I_small(i + 1), I_small(i + 1));
576 int64_t w = (int64_t)u8_strwidth((const uint8_t *)Text$as_c_string(c), lang_str);
577 if (repeated_width + w > target_width) {
578 repeated = concat2(repeated, Text$repeat(Text(" "), I(target_width - repeated_width)));
579 break;
581 repeated = concat2(repeated, c);
582 repeated_width += w;
586 return repeated;
589 public
590 Text_t Text$left_pad(Text_t text, Int_t width, Text_t padding, Text_t language) {
591 if (padding.length == 0) fail("Cannot pad with an empty text!");
593 int64_t needed = Int64$from_int(width, false) - Int64$from_int(Text$width(text, language), false);
594 return concat2(Text$repeat_to_width(padding, needed, language), text);
597 public
598 Text_t Text$right_pad(Text_t text, Int_t width, Text_t padding, Text_t language) {
599 if (padding.length == 0) fail("Cannot pad with an empty text!");
601 int64_t needed = Int64$from_int(width, false) - Int64$from_int(Text$width(text, language), false);
602 return concat2(text, Text$repeat_to_width(padding, needed, language));
605 public
606 Text_t Text$middle_pad(Text_t text, Int_t width, Text_t padding, Text_t language) {
607 if (padding.length == 0) fail("Cannot pad with an empty text!");
609 int64_t needed = Int64$from_int(width, false) - Int64$from_int(Text$width(text, language), false);
610 return Text$concat(Text$repeat_to_width(padding, needed / 2, language), text,
611 Text$repeat_to_width(padding, (needed + 1) / 2, language));
614 public
615 Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int) {
616 int64_t first = Int64$from_int(first_int, false);
617 int64_t last = Int64$from_int(last_int, false);
618 if (first == 0) fail("Invalid index: 0");
619 if (last == 0) return EMPTY_TEXT;
621 if (first < 0) first = (int64_t)text.length + first + 1;
622 if (last < 0) last = (int64_t)text.length + last + 1;
624 if (last > (int64_t)text.length) last = (int64_t)text.length;
626 if (first > (int64_t)text.length || last < first) return EMPTY_TEXT;
628 if (first == 1 && last == (int64_t)text.length) return text;
630 while (text.tag == TEXT_CONCAT) {
631 if (last < (int64_t)text.left->length) {
632 text = *text.left;
633 } else if (first > (int64_t)text.left->length) {
634 first -= (int64_t)text.left->length;
635 last -= (int64_t)text.left->length;
636 text = *text.right;
637 } else {
638 return concat2_assuming_safe(Text$slice(*text.left, I(first), I(text.length)),
639 Text$slice(*text.right, I(1), I(last - text.left->length)));
643 switch (text.tag) {
644 case TEXT_ASCII: {
645 return (Text_t){
646 .tag = TEXT_ASCII,
647 .length = last - first + 1,
648 .ascii = text.ascii + (first - 1),
651 case TEXT_GRAPHEMES: {
652 return (Text_t){
653 .tag = TEXT_GRAPHEMES,
654 .length = last - first + 1,
655 .graphemes = text.graphemes + (first - 1),
658 case TEXT_BLOB: {
659 Text_t ret = (Text_t){
660 .tag = TEXT_BLOB,
661 .length = last - first + 1,
662 .blob.map = text.blob.map,
663 .blob.bytes = text.blob.bytes + (first - 1),
665 return ret;
667 default: errx(1, "Invalid tag");
669 return EMPTY_TEXT;
672 public
673 Text_t Text$from(Text_t text, Int_t first) {
674 return Text$slice(text, first, I_small(-1));
677 public
678 Text_t Text$to(Text_t text, Int_t last) {
679 return Text$slice(text, I_small(1), last);
682 public
683 Text_t Text$reversed(Text_t text) {
684 switch (text.tag) {
685 case TEXT_ASCII: {
686 struct Text_s ret = {
687 .tag = TEXT_ASCII,
688 .length = text.length,
690 ret.ascii = GC_MALLOC_ATOMIC(sizeof(char[ret.length]));
691 for (int64_t i = 0; i < (int64_t)text.length; i++)
692 ((char *)ret.ascii)[text.length - 1 - i] = text.ascii[i];
693 return ret;
695 case TEXT_GRAPHEMES: {
696 struct Text_s ret = {
697 .tag = TEXT_GRAPHEMES,
698 .length = text.length,
700 ret.graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[ret.length]));
701 for (int64_t i = 0; i < (int64_t)text.length; i++)
702 ((int32_t *)ret.graphemes)[text.length - 1 - i] = text.graphemes[i];
703 return ret;
705 case TEXT_BLOB: {
706 struct Text_s ret = {
707 .tag = TEXT_BLOB,
708 .length = text.length,
709 .blob.map = text.blob.map,
711 ret.blob.bytes = GC_MALLOC_ATOMIC(sizeof(uint8_t[ret.length]));
712 for (int64_t i = 0; i < (int64_t)text.length; i++)
713 ((uint8_t *)ret.blob.bytes)[text.length - 1 - i] = text.graphemes[i];
714 return ret;
716 case TEXT_CONCAT: {
717 return concat2_assuming_safe(Text$reversed(*text.right), Text$reversed(*text.left));
719 default: errx(1, "Invalid tag");
721 return EMPTY_TEXT;
724 public
725 PUREFUNC OptionalText_t Text$cluster(Text_t text, Int_t index) {
726 Text_t slice = Text$slice(text, index, index);
727 return slice.length <= 0 ? NONE_TEXT : slice;
730 static Text_t Text$from_components(List_t graphemes, Table_t unique_clusters) {
731 if (graphemes.length == 0) return EMPTY_TEXT;
732 size_t blob_size = (sizeof(int32_t[unique_clusters.entries.length]) + sizeof(uint8_t[graphemes.length]));
733 // If blob optimization will save at least 200 bytes:
734 if (unique_clusters.entries.length <= 256 && blob_size + 200 < sizeof(int32_t[graphemes.length])) {
735 Text_t ret = {
736 .tag = TEXT_BLOB,
737 .length = graphemes.length,
738 .depth = 0,
740 void *blob = GC_MALLOC_ATOMIC(blob_size);
741 int32_t *map = blob;
742 uint8_t *bytes = blob + sizeof(int32_t[unique_clusters.entries.length]);
743 for (int64_t i = 0; i < (int64_t)unique_clusters.entries.length; i++) {
744 struct {
745 int32_t g;
746 uint8_t b;
747 } *entry = unique_clusters.entries.data + i * unique_clusters.entries.stride;
748 map[entry->b] = entry->g;
750 for (int64_t i = 0; i < (int64_t)graphemes.length; i++) {
751 int32_t g = *(int32_t *)(graphemes.data + i * graphemes.stride);
752 uint8_t *byte = Table$get(unique_clusters, &g, Table$info(&Int32$info, &Byte$info));
753 assert(byte);
754 bytes[i] = *byte;
756 ret.blob.map = map;
757 ret.blob.bytes = bytes;
758 return ret;
759 } else {
760 return (Text_t){
761 .tag = TEXT_GRAPHEMES,
762 .length = graphemes.length,
763 .graphemes = graphemes.data,
768 public
769 OptionalText_t Text$from_strn(const char *str, size_t len) {
770 int64_t ascii_span = 0;
771 for (size_t i = 0; i < len && isascii(str[i]); i++) {
772 ascii_span++;
773 if (str[i] == 0) return NONE_TEXT;
776 if (ascii_span == (int64_t)len) { // All ASCII
777 char *copy = GC_MALLOC_ATOMIC(len);
778 memcpy(copy, str, len);
779 return (Text_t){
780 .tag = TEXT_ASCII,
781 .length = ascii_span,
782 .ascii = copy,
785 if (u8_check((uint8_t *)str, len) != NULL) return NONE_TEXT;
787 List_t graphemes = EMPTY_LIST;
788 Table_t unique_clusters = EMPTY_TABLE;
789 const uint8_t *pos = (const uint8_t *)str;
790 const uint8_t *end = (const uint8_t *)&str[len];
791 // Iterate over grapheme clusters
792 for (const uint8_t *next; (next = u8_grapheme_next(pos, end)); pos = next) {
793 uint32_t buf[256];
794 size_t u32_len = sizeof(buf) / sizeof(buf[0]);
795 uint32_t *u32s = u8_to_u32(pos, (size_t)(next - pos), buf, &u32_len);
796 if (u32s == NULL) return NONE_TEXT;
798 uint32_t buf2[256];
799 size_t u32_normlen = sizeof(buf2) / sizeof(buf2[0]);
800 uint32_t *u32s_normalized = u32_normalize(UNINORM_NFC, u32s, u32_len, buf2, &u32_normlen);
801 if (u32s_normalized == NULL) return NONE_TEXT;
803 int32_t g = get_synthetic_grapheme(u32s_normalized, (int64_t)u32_normlen);
804 if (g == 0) return NONE_TEXT;
805 List$insert(&graphemes, &g, I(0), sizeof(int32_t));
806 Table$get_or_setdefault(&unique_clusters, int32_t, uint8_t, g, (uint8_t)unique_clusters.entries.length,
807 Table$info(&Int32$info, &Byte$info));
809 if (u32s != buf) free(u32s);
810 if (u32s_normalized != buf2) free(u32s_normalized);
812 if (unique_clusters.entries.length >= 256) {
813 return concat2_assuming_safe(Text$from_components(graphemes, unique_clusters),
814 Text$from_strn((const char *)next, (size_t)(end - next)));
818 return Text$from_components(graphemes, unique_clusters);
821 public
822 OptionalText_t Text$from_str(const char *str) {
823 return str ? Text$from_strn(str, strlen(str)) : Text("");
826 static void u8_buf_append(Text_t text, Byte_t **buf, int64_t *capacity, int64_t *i) {
827 switch (text.tag) {
828 case TEXT_ASCII: {
829 if (*i + (int64_t)text.length > (int64_t)*capacity) {
830 *capacity = *i + (int64_t)text.length + 1;
831 *buf = GC_REALLOC(*buf, sizeof(Byte_t[*capacity]));
834 const char *bytes = text.ascii;
835 memcpy(*buf + *i, bytes, (size_t)text.length);
836 *i += text.length;
837 break;
839 case TEXT_GRAPHEMES: {
840 const int32_t *graphemes = text.graphemes;
841 for (int64_t g = 0; g < (int64_t)text.length; g++) {
842 if (graphemes[g] >= 0) {
843 uint8_t u8_buf[64];
844 size_t u8_len = sizeof(u8_buf);
845 uint8_t *u8 = u32_to_u8((ucs4_t *)&graphemes[g], 1, u8_buf, &u8_len);
846 if (u8 == NULL) fail("Invalid grapheme encountered: ", graphemes[g]);
848 if (*i + (int64_t)u8_len > (int64_t)*capacity) {
849 *capacity = *i + (int64_t)u8_len + 1;
850 *buf = GC_REALLOC(*buf, sizeof(Byte_t[*capacity]));
853 memcpy(*buf + *i, u8, u8_len);
854 *i += (int64_t)u8_len;
855 if (u8 != u8_buf) free(u8);
856 } else {
857 const uint8_t *u8 = GRAPHEME_UTF8(graphemes[g]);
858 size_t u8_len = u8_strlen(u8);
859 if (*i + (int64_t)u8_len > (int64_t)*capacity) {
860 *capacity = *i + (int64_t)u8_len + 1;
861 *buf = GC_REALLOC(*buf, sizeof(Byte_t[*capacity]));
864 memcpy(*buf + *i, u8, u8_len);
865 *i += (int64_t)u8_len;
868 break;
870 case TEXT_BLOB: {
871 for (int64_t g = 0; g < (int64_t)text.length; g++) {
872 int32_t grapheme = text.blob.map[text.blob.bytes[g]];
873 if (grapheme >= 0) {
874 uint8_t u8_buf[64];
875 size_t u8_len = sizeof(u8_buf);
876 uint8_t *u8 = u32_to_u8((ucs4_t *)&grapheme, 1, u8_buf, &u8_len);
877 if (u8 == NULL) fail("Invalid grapheme encountered: ", grapheme);
879 if (*i + (int64_t)u8_len > (int64_t)*capacity) {
880 *capacity = *i + (int64_t)u8_len + 1;
881 *buf = GC_REALLOC(*buf, sizeof(Byte_t[*capacity]));
884 memcpy(*buf + *i, u8, u8_len);
885 *i += (int64_t)u8_len;
886 if (u8 != u8_buf) free(u8);
887 } else {
888 const uint8_t *u8 = GRAPHEME_UTF8(grapheme);
889 size_t u8_len = u8_strlen(u8);
890 if (*i + (int64_t)u8_len > (int64_t)*capacity) {
891 *capacity = *i + (int64_t)u8_len + 1;
892 *buf = GC_REALLOC(*buf, sizeof(Byte_t[*capacity]));
895 memcpy(*buf + *i, u8, u8_len);
896 *i += (int64_t)u8_len;
899 break;
901 case TEXT_CONCAT: {
902 u8_buf_append(*text.left, buf, capacity, i);
903 u8_buf_append(*text.right, buf, capacity, i);
904 break;
906 default: break;
910 public
911 const char *Text$as_c_string(Text_t text) {
912 if (text.tag == TEXT_ASCII && text.ascii[text.length] == '\0') return text.ascii;
913 int64_t capacity = text.length + 1;
914 char *buf = GC_MALLOC_ATOMIC((size_t)capacity);
915 int64_t i = 0;
916 u8_buf_append(text, (Byte_t **)&buf, &capacity, &i);
918 if (i + 1 > (int64_t)capacity) {
919 capacity = i + 1;
920 buf = GC_REALLOC(buf, (size_t)capacity);
922 buf[i] = '\0';
923 return buf;
926 PUREFUNC public uint64_t Text$hash(const void *obj, const TypeInfo_t *info) {
927 (void)info;
928 Text_t text = *(Text_t *)obj;
929 siphash sh;
930 siphashinit(&sh, sizeof(int32_t[text.length]));
932 union {
933 int32_t chunks[2];
934 uint64_t whole;
935 } tmp;
936 switch (text.tag) {
937 case TEXT_ASCII: {
938 const char *bytes = text.ascii;
939 for (int64_t i = 0; i + 1 < (int64_t)text.length; i += 2) {
940 tmp.chunks[0] = (int32_t)bytes[i];
941 tmp.chunks[1] = (int32_t)bytes[i + 1];
942 siphashadd64bits(&sh, tmp.whole);
944 int32_t last = text.length & 0x1 ? (int32_t)bytes[text.length - 1] : 0; // Odd number of graphemes
945 return siphashfinish_last_part(&sh, (uint64_t)last);
947 case TEXT_GRAPHEMES: {
948 const int32_t *graphemes = text.graphemes;
949 for (int64_t i = 0; i + 1 < (int64_t)text.length; i += 2) {
950 tmp.chunks[0] = graphemes[i];
951 tmp.chunks[1] = graphemes[i + 1];
952 siphashadd64bits(&sh, tmp.whole);
954 int32_t last = text.length & 0x1 ? graphemes[text.length - 1] : 0; // Odd number of graphemes
955 return siphashfinish_last_part(&sh, (uint64_t)last);
957 case TEXT_BLOB: {
958 for (int64_t i = 0; i + 1 < (int64_t)text.length; i += 2) {
959 tmp.chunks[0] = text.blob.map[text.blob.bytes[i]];
960 tmp.chunks[1] = text.blob.map[text.blob.bytes[i + 1]];
961 siphashadd64bits(&sh, tmp.whole);
963 int32_t last =
964 text.length & 0x1 ? text.blob.map[text.blob.bytes[text.length - 1]] : 0; // Odd number of graphemes
965 return siphashfinish_last_part(&sh, (uint64_t)last);
967 case TEXT_CONCAT: {
968 TextIter_t state = NEW_TEXT_ITER_STATE(text);
969 for (int64_t i = 0; i + 1 < (int64_t)text.length; i += 2) {
970 tmp.chunks[0] = Text$get_grapheme_fast(&state, i);
971 tmp.chunks[1] = Text$get_grapheme_fast(&state, i + 1);
972 siphashadd64bits(&sh, tmp.whole);
975 int32_t last = (text.length & 0x1) ? Text$get_grapheme_fast(&state, text.length - 1) : 0;
976 return siphashfinish_last_part(&sh, (uint64_t)last);
978 default: errx(1, "Invalid text");
980 return 0;
983 public
984 int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index) {
985 if (index < 0) return 0;
986 if (index >= (int64_t)state->stack[0].text.length) return 0;
988 assert(state->stack[0].text.depth <= MAX_TEXT_DEPTH);
990 // Go up the stack as needed:
991 while (index < state->stack[state->stack_index].offset
992 || index
993 >= state->stack[state->stack_index].offset + (int64_t)state->stack[state->stack_index].text.length) {
994 state->stack_index -= 1;
995 assert(state->stack_index >= 0);
998 assert(state->stack_index >= 0 && state->stack_index <= MAX_TEXT_DEPTH);
1000 // Go down the stack as needed:
1001 while (state->stack[state->stack_index].text.tag == TEXT_CONCAT) {
1002 Text_t text = state->stack[state->stack_index].text;
1003 int64_t offset = state->stack[state->stack_index].offset;
1004 assert(state->stack_index <= MAX_TEXT_DEPTH);
1005 assert(index >= offset);
1006 assert(index < offset + (int64_t)text.length);
1008 state->stack_index += 1;
1009 if (index < offset + (int64_t)text.left->length) {
1010 state->stack[state->stack_index].text = *text.left;
1011 state->stack[state->stack_index].offset = offset;
1012 } else {
1013 state->stack[state->stack_index].text = *text.right;
1014 state->stack[state->stack_index].offset = offset + text.left->length;
1016 assert(state->stack_index >= 0 && state->stack_index <= MAX_TEXT_DEPTH);
1019 Text_t text = state->stack[state->stack_index].text;
1020 int64_t offset = state->stack[state->stack_index].offset;
1022 if (index < offset || index >= offset + (int64_t)text.length) {
1023 return 0;
1026 switch (text.tag) {
1027 case TEXT_ASCII: return (int32_t)text.ascii[index - offset];
1028 case TEXT_GRAPHEMES: return text.graphemes[index - offset];
1029 case TEXT_BLOB: return text.blob.map[text.blob.bytes[index - offset]];
1030 default: errx(1, "Invalid text");
1032 return 0;
1035 public
1036 uint32_t Text$get_main_grapheme_fast(TextIter_t *state, int64_t index) {
1037 int32_t g = Text$get_grapheme_fast(state, index);
1038 return (g) >= 0 ? (ucs4_t)(g) : synthetic_graphemes[-(g)-1].main_codepoint;
1041 PUREFUNC public int32_t Text$compare(const void *va, const void *vb, const TypeInfo_t *info) {
1042 (void)info;
1043 if (va == vb) return 0;
1044 const Text_t a = *(const Text_t *)va;
1045 const Text_t b = *(const Text_t *)vb;
1047 // TODO: make this smarter and more efficient
1048 int64_t len = MAX(a.length, b.length);
1049 TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
1050 for (int64_t i = 0; i < len; i++) {
1051 int32_t ai = Text$get_grapheme_fast(&a_state, i);
1052 int32_t bi = Text$get_grapheme_fast(&b_state, i);
1053 if (ai == bi) continue;
1054 int32_t cmp;
1055 if (ai > 0 && bi > 0) {
1056 cmp = u32_cmp((ucs4_t *)&ai, (ucs4_t *)&bi, 1);
1057 } else if (ai > 0) {
1058 cmp = u32_cmp2((ucs4_t *)&ai, 1, GRAPHEME_CODEPOINTS(bi), NUM_GRAPHEME_CODEPOINTS(bi));
1059 } else if (bi > 0) {
1060 cmp = u32_cmp2(GRAPHEME_CODEPOINTS(ai), NUM_GRAPHEME_CODEPOINTS(ai), (ucs4_t *)&bi, 1);
1061 } else {
1062 cmp = u32_cmp2(GRAPHEME_CODEPOINTS(ai), NUM_GRAPHEME_CODEPOINTS(ai), GRAPHEME_CODEPOINTS(bi),
1063 NUM_GRAPHEME_CODEPOINTS(bi));
1065 if (cmp != 0) return cmp;
1067 return 0;
1070 bool _matches(TextIter_t *text_state, TextIter_t *target_state, int64_t pos) {
1071 for (int64_t i = 0; i < (int64_t)target_state->stack[0].text.length; i++) {
1072 int32_t text_i = Text$get_grapheme_fast(text_state, pos + i);
1073 int32_t target_i = Text$get_grapheme_fast(target_state, i);
1074 if (text_i != target_i) return false;
1076 return true;
1079 PUREFUNC public bool Text$starts_with(Text_t text, Text_t prefix, Text_t *remainder) {
1080 if (text.length < prefix.length) return false;
1081 TextIter_t text_state = NEW_TEXT_ITER_STATE(text), prefix_state = NEW_TEXT_ITER_STATE(prefix);
1082 if (_matches(&text_state, &prefix_state, 0)) {
1083 if (remainder) *remainder = Text$from(text, Int$from_int64(prefix.length + 1));
1084 return true;
1085 } else {
1086 if (remainder) *remainder = text;
1087 return false;
1091 PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix, Text_t *remainder) {
1092 if (text.length < suffix.length) return false;
1093 TextIter_t text_state = NEW_TEXT_ITER_STATE(text), suffix_state = NEW_TEXT_ITER_STATE(suffix);
1094 if (_matches(&text_state, &suffix_state, text.length - suffix.length)) {
1095 if (remainder) *remainder = Text$to(text, Int$from_int64(text.length - suffix.length));
1096 return true;
1097 } else {
1098 if (remainder) *remainder = text;
1099 return false;
1103 public
1104 Text_t Text$without_prefix(Text_t text, Text_t prefix) {
1105 return Text$starts_with(text, prefix, NULL) ? Text$slice(text, I(prefix.length + 1), I(text.length)) : text;
1108 public
1109 Text_t Text$without_suffix(Text_t text, Text_t suffix) {
1110 return Text$ends_with(text, suffix, NULL) ? Text$slice(text, I(1), I(text.length - suffix.length)) : text;
1113 static bool _has_grapheme(TextIter_t *text, int32_t g) {
1114 for (int64_t t = 0; t < (int64_t)text->stack[0].text.length; t++) {
1115 if (g == Text$get_grapheme_fast(text, t)) {
1116 return true;
1119 return false;
1122 public
1123 OptionalInt_t Text$find(Text_t text, Text_t target, Int_t start) {
1124 if (text.length < target.length) return NONE_INT;
1125 if (target.length <= 0) return I(1);
1126 TextIter_t text_state = NEW_TEXT_ITER_STATE(text), target_state = NEW_TEXT_ITER_STATE(target);
1127 for (int64_t i = Int64$from_int(start, false) - 1; i < (int64_t)text.length - (int64_t)target.length + 1; i++) {
1128 if (_matches(&text_state, &target_state, i)) {
1129 return Int$from_int64(i + 1);
1132 return NONE_INT;
1135 public
1136 bool Text$matches_glob(Text_t text, Text_t glob) {
1137 return !fnmatch(Text$as_c_string(glob), Text$as_c_string(text), 0);
1140 public
1141 Text_t Text$trim(Text_t text, Text_t to_trim, bool left, bool right) {
1142 int64_t first = 0;
1143 TextIter_t text_state = NEW_TEXT_ITER_STATE(text), trim_state = NEW_TEXT_ITER_STATE(to_trim);
1144 if (left) {
1145 while (first < (int64_t)text.length && _has_grapheme(&trim_state, Text$get_grapheme_fast(&text_state, first))) {
1146 first += 1;
1149 int64_t last = text.length - 1;
1150 if (right) {
1151 while (last >= first && _has_grapheme(&trim_state, Text$get_grapheme_fast(&text_state, last))) {
1152 last -= 1;
1155 return (first != 0 || last != text.length - 1) ? Text$slice(text, I(first + 1), I(last + 1)) : text;
1158 public
1159 Text_t Text$translate(Text_t text, Table_t translations) {
1160 TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
1161 Text_t result = EMPTY_TEXT;
1162 int64_t span_start = 0;
1163 List_t replacement_list = translations.entries;
1164 for (int64_t i = 0; i < (int64_t)text.length;) {
1165 for (int64_t r = 0; r < (int64_t)replacement_list.length; r++) {
1166 struct {
1167 Text_t target, replacement;
1168 } *entry = replacement_list.data + r * replacement_list.stride;
1169 if (entry->target.length <= 0) continue;
1170 TextIter_t target_state = NEW_TEXT_ITER_STATE(entry->target);
1171 if (_matches(&text_state, &target_state, i)) {
1172 if (i > span_start) result = concat2(result, Text$slice(text, I(span_start + 1), I(i)));
1174 result = concat2(result, entry->replacement);
1175 i += entry->target.length;
1176 span_start = i;
1177 goto found_match;
1180 i += 1;
1181 found_match:
1182 continue;
1184 if (span_start < (int64_t)text.length)
1185 result = concat2(result, Text$slice(text, I(span_start + 1), I((int64_t)text.length)));
1186 return result;
1189 public
1190 Text_t Text$replace(Text_t text, Text_t target, Text_t replacement) {
1191 if (target.length <= 0) return text;
1192 TextIter_t text_state = NEW_TEXT_ITER_STATE(text), target_state = NEW_TEXT_ITER_STATE(target);
1193 Text_t result = EMPTY_TEXT;
1194 int64_t span_start = 0;
1195 for (int64_t i = 0; i < (int64_t)text.length;) {
1196 if (_matches(&text_state, &target_state, i)) {
1197 if (i > span_start) result = concat2(result, Text$slice(text, I(span_start + 1), I(i)));
1199 result = concat2(result, replacement);
1200 i += target.length;
1201 span_start = i;
1202 } else {
1203 i += 1;
1206 if (span_start < (int64_t)text.length)
1207 result = concat2(result, Text$slice(text, I(span_start + 1), I((int64_t)text.length)));
1208 return result;
1211 public
1212 PUREFUNC bool Text$has(Text_t text, Text_t target) {
1213 TextIter_t text_state = NEW_TEXT_ITER_STATE(text), target_state = NEW_TEXT_ITER_STATE(target);
1214 for (int64_t i = 0; i < (int64_t)text.length; i++) {
1215 if (_matches(&text_state, &target_state, i)) return true;
1217 return false;
1220 public
1221 List_t Text$split(Text_t text, Text_t delimiters) {
1222 if (delimiters.length == 0) return Text$clusters(text);
1224 TextIter_t text_state = NEW_TEXT_ITER_STATE(text), delim_state = NEW_TEXT_ITER_STATE(delimiters);
1225 List_t splits = EMPTY_LIST;
1226 for (int64_t i = 0; i < (int64_t)text.length;) {
1227 int64_t span_len = 0;
1228 while (i + span_len < (int64_t)text.length && !_matches(&text_state, &delim_state, i + span_len)) {
1229 span_len += 1;
1231 Text_t slice = Text$slice(text, I(i + 1), I(i + span_len));
1232 List$insert(&splits, &slice, I(0), sizeof(slice));
1233 i += span_len + delimiters.length;
1234 if (i == text.length) {
1235 Text_t empty = Text("");
1236 List$insert(&splits, &empty, I(0), sizeof(empty));
1239 return splits;
1242 public
1243 List_t Text$split_any(Text_t text, Text_t delimiters) {
1244 if (delimiters.length == 0) return List(text);
1246 TextIter_t text_state = NEW_TEXT_ITER_STATE(text), delim_state = NEW_TEXT_ITER_STATE(delimiters);
1247 List_t splits = EMPTY_LIST;
1248 for (int64_t i = 0; i < (int64_t)text.length;) {
1249 int64_t span_len = 0;
1250 while (i + span_len < (int64_t)text.length
1251 && !_has_grapheme(&delim_state, Text$get_grapheme_fast(&text_state, i + span_len))) {
1252 span_len += 1;
1254 bool trailing_delim = i + span_len < (int64_t)text.length;
1255 Text_t slice = Text$slice(text, I(i + 1), I(i + span_len));
1256 List$insert(&splits, &slice, I(0), sizeof(slice));
1257 i += span_len + 1;
1258 while (i < (int64_t)text.length && _has_grapheme(&delim_state, Text$get_grapheme_fast(&text_state, i))) {
1259 i += 1;
1261 if (i >= (int64_t)text.length && trailing_delim) {
1262 Text_t empty = Text("");
1263 List$insert(&splits, &empty, I(0), sizeof(empty));
1266 return splits;
1269 typedef struct {
1270 TextIter_t state;
1271 int64_t i;
1272 Text_t delimiter;
1273 } split_iter_state_t;
1275 static OptionalText_t next_split(split_iter_state_t *state) {
1276 Text_t text = state->state.stack[0].text;
1277 if (state->i >= (int64_t)text.length) {
1278 if (state->delimiter.length > 0 && state->i == text.length) { // special case
1279 state->i = text.length + 1;
1280 return EMPTY_TEXT;
1282 return NONE_TEXT;
1285 if (state->delimiter.length == 0) { // special case
1286 state->i = text.length + 1;
1287 return text;
1290 TextIter_t delim_state = NEW_TEXT_ITER_STATE(state->delimiter);
1291 int64_t i = state->i;
1292 int64_t span_len = 0;
1293 while (i + span_len < (int64_t)text.length && !_matches(&state->state, &delim_state, i + span_len)) {
1294 span_len += 1;
1296 Text_t slice = Text$slice(text, I(i + 1), I(i + span_len));
1297 state->i = i + span_len + state->delimiter.length;
1298 return slice;
1301 public
1302 Closure_t Text$by_split(Text_t text, Text_t delimiter) {
1303 return (Closure_t){
1304 .fn = (void *)next_split,
1305 .userdata = new (split_iter_state_t, .state = NEW_TEXT_ITER_STATE(text), .i = 0, .delimiter = delimiter),
1309 static OptionalText_t next_split_any(split_iter_state_t *state) {
1310 Text_t text = state->state.stack[0].text;
1311 if (state->i >= (int64_t)text.length) {
1312 if (state->delimiter.length > 0 && state->i == text.length) { // special case
1313 state->i = text.length + 1;
1314 return EMPTY_TEXT;
1316 return NONE_TEXT;
1319 if (state->delimiter.length == 0) { // special case
1320 Text_t ret = Text$cluster(text, I(state->i + 1));
1321 state->i += 1;
1322 return ret;
1325 TextIter_t delim_state = NEW_TEXT_ITER_STATE(state->delimiter);
1326 int64_t i = state->i;
1327 int64_t span_len = 0;
1328 while (i + span_len < (int64_t)text.length
1329 && !_has_grapheme(&delim_state, Text$get_grapheme_fast(&state->state, i + span_len))) {
1330 span_len += 1;
1332 Text_t slice = Text$slice(text, I(i + 1), I(i + span_len));
1333 i += span_len + 1;
1334 while (i < (int64_t)text.length && _has_grapheme(&delim_state, Text$get_grapheme_fast(&state->state, i))) {
1335 i += 1;
1337 state->i = i;
1338 return slice;
1341 public
1342 Closure_t Text$by_split_any(Text_t text, Text_t delimiters) {
1343 return (Closure_t){
1344 .fn = (void *)next_split_any,
1345 .userdata = new (split_iter_state_t, .state = NEW_TEXT_ITER_STATE(text), .i = 0, .delimiter = delimiters),
1349 PUREFUNC public bool Text$equal_values(Text_t a, Text_t b) {
1350 if (a.length != b.length) return false;
1351 int64_t len = a.length;
1352 TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
1353 // TODO: make this smarter and more efficient
1354 for (int64_t i = 0; i < len; i++) {
1355 int32_t ai = Text$get_grapheme_fast(&a_state, i);
1356 int32_t bi = Text$get_grapheme_fast(&b_state, i);
1357 if (ai != bi) return false;
1359 return true;
1362 PUREFUNC public bool Text$equal(const void *a, const void *b, const TypeInfo_t *info) {
1363 (void)info;
1364 if (a == b) return true;
1365 return Text$equal_values(*(Text_t *)a, *(Text_t *)b);
1368 PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b, Text_t language) {
1369 if (a.length != b.length) return false;
1370 int64_t len = a.length;
1371 TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
1372 const char *uc_language = Text$as_c_string(language);
1373 for (int64_t i = 0; i < len; i++) {
1374 int32_t ai = Text$get_grapheme_fast(&a_state, i);
1375 int32_t bi = Text$get_grapheme_fast(&b_state, i);
1376 if (ai != bi) {
1377 const ucs4_t *a_codepoints = ai >= 0 ? (ucs4_t *)&ai : GRAPHEME_CODEPOINTS(ai);
1378 int64_t a_len = ai >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(ai);
1380 const ucs4_t *b_codepoints = bi >= 0 ? (ucs4_t *)&bi : GRAPHEME_CODEPOINTS(bi);
1381 int64_t b_len = bi >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(bi);
1383 int cmp = 0;
1384 (void)u32_casecmp(a_codepoints, (size_t)a_len, b_codepoints, (size_t)b_len, uc_language, UNINORM_NFC, &cmp);
1385 if (cmp != 0) return false;
1388 return true;
1391 public
1392 Text_t Text$upper(Text_t text, Text_t language) {
1393 if (text.length == 0) return text;
1394 List_t codepoints = Text$utf32(text);
1395 if (codepoints.stride != sizeof(int32_t)) List$compact(&codepoints, sizeof(int32_t));
1396 const char *uc_language = Text$as_c_string(language);
1397 size_t out_len = 256;
1398 uint32_t buf[out_len];
1399 ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
1400 OptionalText_t ret =
1401 Text$from_utf32((List_t){.data = upper, .length = (uint64_t)out_len, .stride = sizeof(int32_t)});
1402 if (upper != buf) free(upper);
1403 return ret;
1406 public
1407 Text_t Text$lower(Text_t text, Text_t language) {
1408 if (text.length == 0) return text;
1409 List_t codepoints = Text$utf32(text);
1410 if (codepoints.stride != sizeof(int32_t)) List$compact(&codepoints, sizeof(int32_t));
1411 const char *uc_language = Text$as_c_string(language);
1412 size_t out_len = 256;
1413 uint32_t buf[out_len];
1414 ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
1415 OptionalText_t ret =
1416 Text$from_utf32((List_t){.data = lower, .length = (uint64_t)out_len, .stride = sizeof(int32_t)});
1417 if (lower != buf) free(lower);
1418 return ret;
1421 public
1422 Text_t Text$title(Text_t text, Text_t language) {
1423 if (text.length == 0) return text;
1424 List_t codepoints = Text$utf32(text);
1425 if (codepoints.stride != sizeof(int32_t)) List$compact(&codepoints, sizeof(int32_t));
1426 const char *uc_language = Text$as_c_string(language);
1427 size_t out_len = 256;
1428 uint32_t buf[out_len];
1429 ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, uc_language, UNINORM_NFC, buf, &out_len);
1430 OptionalText_t ret =
1431 Text$from_utf32((List_t){.data = title, .length = (uint64_t)out_len, .stride = sizeof(int32_t)});
1432 if (title != buf) free(title);
1433 return ret;
1436 public
1437 double Text$distance(Text_t a, Text_t b, Text_t language) {
1438 if (a.length <= 0) return (double)b.length;
1439 if (b.length <= 0) return (double)a.length;
1441 // The current implementation of text distance uses a modified form
1442 // of Damerau–Levenshtein distance that gives slightly lower distances
1443 // for letters with the same main grapheme and slightly lower distances
1444 // for letters that are the same, but with different casing.
1445 double *distances = GC_MALLOC_ATOMIC(sizeof(uint32_t) * (a.length + 1) * (b.length + 1));
1446 #define DIST(x, y) distances[(x) * b.length + (y)]
1447 for (int64_t i = 0; i <= (int64_t)a.length; i++)
1448 DIST(i, 0) = i;
1449 for (int64_t j = 0; j <= (int64_t)b.length; j++)
1450 DIST(0, j) = j;
1452 TextIter_t a_state = NEW_TEXT_ITER_STATE(a);
1453 TextIter_t b_state = NEW_TEXT_ITER_STATE(b);
1454 const char *uc_language = Text$as_c_string(language);
1455 for (int64_t i = 1; i <= (int64_t)a.length; i++) {
1456 for (int64_t j = 1; j <= (int64_t)b.length; j++) {
1457 int32_t ai = Text$get_grapheme_fast(&a_state, i - 1);
1458 int32_t bj = Text$get_grapheme_fast(&b_state, j - 1);
1459 double cost = (double)(ai != bj);
1460 if (cost > 0) {
1461 ucs4_t main_ai = (ai) >= 0 ? (ucs4_t)ai : synthetic_graphemes[-(ai)-1].main_codepoint;
1462 ucs4_t main_bj = (bj) >= 0 ? (ucs4_t)bj : synthetic_graphemes[-(bj)-1].main_codepoint;
1463 if (main_ai == main_bj) {
1464 // Same main grapheme (different modifiers)
1465 cost = 0.25;
1466 } else {
1467 int cmp;
1468 (void)u32_casecmp(&main_ai, 1, &main_bj, 1, uc_language, UNINORM_NFC, &cmp);
1469 if (cmp == 0) cost = 0.5;
1473 double insertion = 1. + DIST(i - 1, j);
1474 double deletion = 1. + DIST(i, j - 1);
1475 double dist = MIN(insertion, deletion);
1476 double substitution = cost + DIST(i - 1, j - 1);
1477 dist = MIN(dist, substitution);
1478 // Check for transposition:
1479 if (i >= 2 && j >= 2 && cost != 0) {
1480 int32_t ai_prev = Text$get_grapheme_fast(&a_state, i - 2);
1481 int32_t bi_prev = Text$get_grapheme_fast(&b_state, i - 2);
1482 if (ai == bi_prev && bj == ai_prev) {
1483 double transposition = cost + DIST(i - 2, j - 2);
1484 dist = MIN(dist, transposition);
1487 DIST(i, j) = dist;
1490 #undef DIST
1492 return (double)distances[a.length * b.length + b.length];
1495 public
1496 Text_t Text$escaped(Text_t text, bool colorize, Text_t extra_escapes) {
1497 Text_t ret = colorize ? Text("\x1b[35m") : EMPTY_TEXT;
1498 #define flush_unquoted() \
1499 ({ \
1500 if (unquoted_span > 0) { \
1501 ret = concat2_assuming_safe(ret, Text$slice(text, I(i - unquoted_span + 1), I(i))); \
1502 unquoted_span = 0; \
1503 } \
1505 #define add_escaped(str) \
1506 ({ \
1507 flush_unquoted(); \
1508 if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m")); \
1509 ret = concat2_assuming_safe(ret, Text("\\" str)); \
1510 if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[0;35m")); \
1512 TextIter_t state = NEW_TEXT_ITER_STATE(text);
1513 int64_t unquoted_span = 0;
1514 int64_t i = 0;
1515 for (i = 0; i < (int64_t)text.length; i++) {
1516 int32_t g = Text$get_grapheme_fast(&state, i);
1517 switch (g) {
1518 case '\a': add_escaped("a"); break;
1519 case '\b': add_escaped("b"); break;
1520 case '\x1b': add_escaped("e"); break;
1521 case '\f': add_escaped("f"); break;
1522 case '\n': add_escaped("n"); break;
1523 case '\r': add_escaped("r"); break;
1524 case '\t': add_escaped("t"); break;
1525 case '\v': add_escaped("v"); break;
1526 case '\\': {
1527 add_escaped("\\");
1528 break;
1530 case '$': {
1531 add_escaped("$");
1532 break;
1534 case '\x00' ... '\x06':
1535 case '\x0E' ... '\x1A':
1536 case '\x1C' ... '\x1F':
1537 case '\x7F' ... '\x7F': {
1538 flush_unquoted();
1539 if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m"));
1540 ret = concat2_assuming_safe(ret, Text("\\x"));
1541 char tmp[3] = {
1542 (g / 16) > 9 ? 'a' + (g / 16) - 10 : '0' + (g / 16),
1543 (g & 15) > 9 ? 'a' + (g & 15) - 10 : '0' + (g & 15),
1544 '\0',
1546 ret = concat2_assuming_safe(ret, Text$from_strn(tmp, 2));
1547 if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[0;35m"));
1548 break;
1550 default: {
1551 TextIter_t esc_state = NEW_TEXT_ITER_STATE(extra_escapes);
1552 for (int64_t j = 0; j < (int64_t)extra_escapes.length; j++) {
1553 int32_t esc = Text$get_grapheme_fast(&esc_state, j);
1554 if (g == esc) {
1555 flush_unquoted();
1556 if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m"));
1557 ret = concat2_assuming_safe(ret, Text("\\"));
1558 if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[0;35m"));
1559 break;
1562 unquoted_span += 1;
1563 break;
1567 flush_unquoted();
1568 #undef add_escaped
1569 #undef flush_unquoted
1570 if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[m"));
1571 return ret;
1574 public
1575 Text_t Text$quoted(Text_t text, bool colorize, Text_t quotation_mark) {
1576 if (quotation_mark.length != 1) fail("Invalid quote text: ", quotation_mark, " (must have length == 1)");
1578 Text_t ret = Text$escaped(text, colorize, quotation_mark);
1579 if (!(Text$equal_values(quotation_mark, Text("\"")) || Text$equal_values(quotation_mark, Text("'"))
1580 || Text$equal_values(quotation_mark, Text("`"))))
1581 ret = Text$concat(Text("$"), quotation_mark, ret, quotation_mark);
1582 else ret = Text$concat(quotation_mark, ret, quotation_mark);
1583 return ret;
1586 public
1587 Text_t Text$as_text(const void *vtext, bool colorize, const TypeInfo_t *info) {
1588 (void)info;
1589 if (!vtext) return info && info->TextInfo.lang ? Text$from_str(info->TextInfo.lang) : Text("Text");
1591 Text_t text = *(Text_t *)vtext;
1592 // Figure out the best quotation mark to use:
1593 bool has_double_quote = false, has_backtick = false, has_single_quote = false, needs_escapes = false;
1594 TextIter_t state = NEW_TEXT_ITER_STATE(text);
1595 for (int64_t i = 0; i < (int64_t)text.length; i++) {
1596 int32_t g = Text$get_grapheme_fast(&state, i);
1597 if (g == '"') {
1598 has_double_quote = true;
1599 } else if (g == '`') {
1600 has_backtick = true;
1601 } else if (g == (g & 0x7F) && (g == '\'' || g == '\n' || g == '\r' || g == '\t' || !isprint((char)g))) {
1602 needs_escapes = true;
1606 Text_t quote;
1607 // If there's double quotes in the string, it would be nice to avoid
1608 // needing to escape them by using single quotes, but only if we don't have
1609 // single quotes or need to escape anything else (because single quotes
1610 // don't have interpolation):
1611 if (has_double_quote && !has_single_quote) quote = Text("'");
1612 // If there is a double quote, but no backtick, we can save a bit of
1613 // escaping by using backtick instead of double quote:
1614 else if (has_double_quote && has_single_quote && !has_backtick && !needs_escapes) quote = Text("`");
1615 // Otherwise fall back to double quotes as the default quoting style:
1616 else quote = Text("\"");
1618 Text_t as_text = Text$quoted(text, colorize, quote);
1619 if (info && info->TextInfo.lang && info != &Text$info)
1620 as_text = Text$concat(colorize ? Text("\x1b[1m$") : Text("$"), Text$from_str(info->TextInfo.lang),
1621 colorize ? Text("\x1b[0m") : Text(""), as_text);
1622 return as_text;
1625 public
1626 Text_t Text$join(Text_t glue, List_t pieces) {
1627 if (pieces.length == 0) return EMPTY_TEXT;
1629 Text_t result = *(Text_t *)pieces.data;
1630 for (int64_t i = 1; i < (int64_t)pieces.length; i++) {
1631 result = Text$concat(result, glue, *(Text_t *)(pieces.data + i * pieces.stride));
1633 return result;
1636 public
1637 List_t Text$clusters(Text_t text) {
1638 List_t clusters = EMPTY_LIST;
1639 for (int64_t i = 1; i <= (int64_t)text.length; i++) {
1640 Text_t cluster = Text$slice(text, I(i), I(i));
1641 List$insert(&clusters, &cluster, I_small(0), sizeof(Text_t));
1643 return clusters;
1646 public
1647 List_t Text$utf8(Text_t text) {
1648 if (text.length == 0) return EMPTY_ATOMIC_LIST;
1649 int64_t capacity = (int64_t)text.length;
1650 Byte_t *buf = GC_MALLOC_ATOMIC(sizeof(Byte_t[capacity]));
1651 int64_t i = 0;
1652 u8_buf_append(text, &buf, &capacity, &i);
1653 return (List_t){
1654 .data = buf, .length = (uint64_t)i, .stride = 1, .atomic = 1, .free = MIN(LIST_MAX_FREE_ENTRIES, capacity - i)};
1657 public
1658 List_t Text$utf16(Text_t text) {
1659 if (text.length == 0) return EMPTY_ATOMIC_LIST;
1660 List_t utf32 = Text$utf32(text);
1661 if (utf32.length == 0) return EMPTY_ATOMIC_LIST;
1662 List_t utf16 = {.free = MIN(LIST_MAX_FREE_ENTRIES, (uint64_t)utf32.length), .atomic = 1};
1663 utf16.data = GC_MALLOC_ATOMIC(sizeof(int32_t[utf16.free]));
1664 for (int64_t i = 0; i < (int64_t)utf32.length; i++) {
1665 uint16_t u16_buf[4];
1666 size_t u16_len = sizeof(u16_buf) / sizeof(u16_buf[0]);
1667 uint16_t *chunk_u16 = u32_to_u16(utf32.data + utf32.stride * i, 1, u16_buf, &u16_len);
1668 if (chunk_u16 == NULL) fail("Invalid codepoints encountered!");
1669 List$insert_all(&utf16, (List_t){.data = chunk_u16, .stride = sizeof(uint16_t), .length = (uint64_t)u16_len},
1670 I(0), sizeof(uint16_t));
1671 if (chunk_u16 != u16_buf) free(chunk_u16);
1673 return utf16;
1676 public
1677 List_t Text$utf32(Text_t text) {
1678 if (text.length == 0) return EMPTY_ATOMIC_LIST;
1679 List_t codepoints = EMPTY_ATOMIC_LIST;
1680 TextIter_t state = NEW_TEXT_ITER_STATE(text);
1681 for (int64_t i = 0; i < (int64_t)text.length; i++) {
1682 int32_t grapheme = Text$get_grapheme_fast(&state, i);
1683 if (grapheme < 0) {
1684 for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) {
1685 ucs4_t subg = GRAPHEME_CODEPOINTS(grapheme)[c];
1686 List$insert(&codepoints, &subg, I_small(0), sizeof(ucs4_t));
1688 } else {
1689 List$insert(&codepoints, &grapheme, I_small(0), sizeof(ucs4_t));
1692 return codepoints;
1695 static INLINE const char *codepoint_name(ucs4_t c) {
1696 char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
1697 char *found_name = unicode_character_name(c, name);
1698 if (found_name) return found_name;
1699 const uc_block_t *block = uc_block(c);
1700 if (!block) return "???";
1701 assert(block);
1702 return String(block->name, "-", hex(c, .no_prefix = true, .uppercase = true));
1705 public
1706 List_t Text$codepoint_names(Text_t text) {
1707 List_t names = EMPTY_LIST;
1708 TextIter_t state = NEW_TEXT_ITER_STATE(text);
1709 for (int64_t i = 0; i < (int64_t)text.length; i++) {
1710 int32_t grapheme = Text$get_grapheme_fast(&state, i);
1711 if (grapheme < 0) {
1712 for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) {
1713 const char *name = codepoint_name(GRAPHEME_CODEPOINTS(grapheme)[c]);
1714 Text_t name_text = Text$from_str(name);
1715 List$insert(&names, &name_text, I_small(0), sizeof(Text_t));
1717 } else {
1718 const char *name = codepoint_name((ucs4_t)grapheme);
1719 Text_t name_text = Text$from_str(name);
1720 List$insert(&names, &name_text, I_small(0), sizeof(Text_t));
1723 return names;
1726 public
1727 OptionalText_t Text$from_utf8(List_t units) {
1728 if (units.length == 0) return EMPTY_TEXT;
1729 if (units.stride != sizeof(uint8_t)) List$compact(&units, sizeof(uint8_t));
1730 return Text$from_strn(units.data, (size_t)units.length);
1733 public
1734 OptionalText_t Text$from_utf16(List_t units) {
1735 if (units.length == 0) return EMPTY_TEXT;
1736 if (units.stride != sizeof(int16_t)) List$compact(&units, sizeof(int16_t));
1738 size_t length = 256;
1739 uint8_t buf[length];
1740 uint8_t *u8 = u16_to_u8(units.data, (size_t)units.length, buf, &length);
1741 Text_t ret =
1742 Text$from_utf8((List_t){.data = u8, .length = (uint64_t)length, .stride = sizeof(uint8_t), .atomic = 1});
1743 if (u8 != buf) free(u8);
1744 return ret;
1747 public
1748 OptionalText_t Text$from_utf32(List_t codepoints) {
1749 if (codepoints.length == 0) return EMPTY_TEXT;
1750 if (codepoints.stride != sizeof(uint32_t)) List$compact(&codepoints, sizeof(uint32_t));
1752 List_t graphemes = EMPTY_ATOMIC_LIST;
1753 Table_t unique_clusters = EMPTY_TABLE;
1754 const uint32_t *pos = (const uint32_t *)codepoints.data;
1755 const uint32_t *end = (const uint32_t *)&pos[codepoints.length];
1756 // Iterate over grapheme clusters
1757 for (const uint32_t *next; (next = u32_grapheme_next(pos, end)); pos = next) {
1758 // Buffer for normalized cluster:
1759 uint32_t buf[256];
1760 size_t u32_normlen = sizeof(buf) / sizeof(buf[0]);
1761 if (u32_check(pos, (size_t)(next - pos)) != NULL) return NONE_TEXT;
1762 uint32_t *u32s_normalized = u32_normalize(UNINORM_NFC, pos, (size_t)(next - pos), buf, &u32_normlen);
1764 int32_t g = get_synthetic_grapheme(u32s_normalized, (int64_t)u32_normlen);
1765 List$insert(&graphemes, &g, I(0), sizeof(int32_t));
1766 Table$get_or_setdefault(&unique_clusters, int32_t, uint8_t, g, (uint8_t)unique_clusters.entries.length,
1767 Table$info(&Int32$info, &Byte$info));
1769 if (u32s_normalized != buf) free(u32s_normalized);
1771 if (unique_clusters.entries.length == 256) {
1772 List_t remaining_codepoints = {
1773 .length = (uint64_t)(end - next),
1774 .data = (void *)next,
1775 .stride = sizeof(int32_t),
1777 OptionalText_t remainder = Text$from_utf32(remaining_codepoints);
1778 if (remainder.tag == TEXT_NONE) return NONE_TEXT;
1779 return concat2_assuming_safe(Text$from_components(graphemes, unique_clusters), remainder);
1782 return Text$from_components(graphemes, unique_clusters);
1785 public
1786 OptionalText_t Text$from_codepoint_names(List_t codepoint_names) {
1787 List_t codepoints = EMPTY_LIST;
1788 for (int64_t i = 0; i < (int64_t)codepoint_names.length; i++) {
1789 Text_t *name = ((Text_t *)(codepoint_names.data + i * codepoint_names.stride));
1790 const char *name_str = Text$as_c_string(*name);
1791 ucs4_t codepoint = unicode_name_character(name_str);
1792 if (codepoint == UNINAME_INVALID) return NONE_TEXT;
1793 List$insert(&codepoints, &codepoint, I_small(0), sizeof(ucs4_t));
1795 return Text$from_utf32(codepoints);
1798 public
1799 List_t Text$lines(Text_t text) {
1800 List_t lines = EMPTY_LIST;
1801 TextIter_t state = NEW_TEXT_ITER_STATE(text);
1802 for (int64_t i = 0, line_start = 0; i < (int64_t)text.length; i++) {
1803 int32_t grapheme = Text$get_grapheme_fast(&state, i);
1804 if (grapheme == '\r' && Text$get_grapheme_fast(&state, i + 1) == '\n') { // CRLF
1805 Text_t line = Text$slice(text, I(line_start + 1), I(i));
1806 List$insert(&lines, &line, I_small(0), sizeof(Text_t));
1807 i += 1; // skip one extra for CR
1808 line_start = i + 1;
1809 } else if (grapheme == '\n') { // newline
1810 Text_t line = Text$slice(text, I(line_start + 1), I(i));
1811 List$insert(&lines, &line, I_small(0), sizeof(Text_t));
1812 line_start = i + 1;
1813 } else if (i == text.length - 1 && line_start != i) { // last line
1814 Text_t line = Text$slice(text, I(line_start + 1), I(i + 1));
1815 List$insert(&lines, &line, I_small(0), sizeof(Text_t));
1818 return lines;
1821 typedef struct {
1822 TextIter_t state;
1823 int64_t i;
1824 } line_iter_state_t;
1826 static OptionalText_t next_line(line_iter_state_t *state) {
1827 Text_t text = state->state.stack[0].text;
1828 for (int64_t i = state->i; i < (int64_t)text.length; i++) {
1829 int32_t grapheme = Text$get_grapheme_fast(&state->state, i);
1830 if (grapheme == '\r' && Text$get_grapheme_fast(&state->state, i + 1) == '\n') { // CRLF
1831 Text_t line = Text$slice(text, I(state->i + 1), I(i));
1832 state->i = i + 2; // skip one extra for CR
1833 return line;
1834 } else if (grapheme == '\n') { // newline
1835 Text_t line = Text$slice(text, I(state->i + 1), I(i));
1836 state->i = i + 1;
1837 return line;
1838 } else if (i == text.length - 1 && state->i != i) { // last line
1839 Text_t line = Text$slice(text, I(state->i + 1), I(i + 1));
1840 state->i = i + 1;
1841 return line;
1844 return NONE_TEXT;
1847 public
1848 Closure_t Text$by_line(Text_t text) {
1849 return (Closure_t){
1850 .fn = (void *)next_line,
1851 .userdata = new (line_iter_state_t, .state = NEW_TEXT_ITER_STATE(text), .i = 0),
1855 PUREFUNC public bool Text$is_none(const void *t, const TypeInfo_t *info) {
1856 (void)info;
1857 return ((Text_t *)t)->tag == TEXT_NONE;
1860 public
1861 Int_t Text$memory_size(Text_t text) {
1862 switch (text.tag) {
1863 case TEXT_ASCII: return Int$from_int64((int64_t)sizeof(Text_t) + (int64_t)sizeof(char[text.length]));
1864 case TEXT_GRAPHEMES: return Int$from_int64((int64_t)sizeof(Text_t) + (int64_t)sizeof(int32_t[text.length]));
1865 case TEXT_BLOB:
1866 return Int$from_int64((int64_t)sizeof(Text_t) + (int64_t)((void *)text.blob.bytes - (void *)text.blob.map)
1867 + (int64_t)sizeof(uint8_t[text.length]));
1868 case TEXT_CONCAT:
1869 return Int$plus(Int$from_int64((int64_t)sizeof(Text_t)),
1870 Int$plus(Text$memory_size(*text.left), Text$memory_size(*text.right)));
1871 default: errx(1, "Invalid text tag: %d", text.tag);
1875 public
1876 Text_t Text$layout(Text_t text) {
1877 switch (text.tag) {
1878 case TEXT_ASCII: return Text$concat(Text("ASCII("), Int64$value_as_text(text.length), Text(")"));
1879 case TEXT_GRAPHEMES: return Text$concat(Text("Graphemes("), Int64$value_as_text(text.length), Text(")"));
1880 case TEXT_BLOB: return Text$concat(Text("Blob("), Int64$value_as_text(text.length), Text(")"));
1881 case TEXT_CONCAT:
1882 return Text$concat(Text("Concat("), Text$layout(*text.left), Text(", "), Text$layout(*text.right), Text(")"));
1883 default: errx(1, "Invalid text tag: %d", text.tag);
1887 public
1888 void Text$serialize(const void *obj, FILE *out, Table_t *pointers, const TypeInfo_t *info) {
1889 (void)info;
1890 const char *str = Text$as_c_string(*(Text_t *)obj);
1891 int64_t len = (int64_t)strlen(str);
1892 Int64$serialize(&len, out, pointers, &Int64$info);
1893 fwrite(str, sizeof(char), (size_t)len, out);
1896 public
1897 void Text$deserialize(FILE *in, void *out, List_t *pointers, const TypeInfo_t *info) {
1898 (void)info;
1899 int64_t len = 0;
1900 Int64$deserialize(in, &len, pointers, &Int64$info);
1901 if (len < 0) fail("Cannot deserialize text with a negative length!");
1902 char *buf = GC_MALLOC_ATOMIC((size_t)len + 1);
1903 if (fread(buf, sizeof(char), (size_t)len, in) != (size_t)len) fail("Not enough data in stream to deserialize");
1904 buf[len + 1] = '\0';
1905 *(Text_t *)out = Text$from_strn(buf, (size_t)len);
1908 public
1909 const TypeInfo_t Text$info = {
1910 .size = sizeof(Text_t),
1911 .align = __alignof__(Text_t),
1912 .tag = TextInfo,
1913 .TextInfo = {.lang = "Text"},
1914 .metamethods = Text$metamethods,