From 5ee185a4896e43c67b6d299becfa616da78fb9f4 Mon Sep 17 00:00:00 2001
From: Bruce Hill <bruce@bruce-hill.com>
Date: Fri, 21 Mar 2025 21:48:53 -0400
Subject: Move stdlib into src/

---
 src/stdlib/text.c | 1499 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1499 insertions(+)
 create mode 100644 src/stdlib/text.c

(limited to 'src/stdlib/text.c')

diff --git a/src/stdlib/text.c b/src/stdlib/text.c
new file mode 100644
index 00000000..bfaa0581
--- /dev/null
+++ b/src/stdlib/text.c
@@ -0,0 +1,1499 @@
+// This file defines type info and methods for the Text datatype, which uses
+// libunistr for Unicode support and implements a datastructure based on a
+// hybrid of Raku/MoarVM's space-efficient grapheme cluster representation of
+// strings and Cords (Boehm et al), which have good runtime performance for
+// text constructed by a series of many concatenations.
+//
+// For more information on MoarVM's grapheme cluster strings, see:
+//     https://docs.raku.org/language/unicode
+//     https://github.com/MoarVM/MoarVM/blob/main/docs/strings.asciidoc
+// For more information on Cords, see the paper "Ropes: an Alternative to
+// Strings" (Boehm, Atkinson, Plass 1995):
+//     https://www.cs.tufts.edu/comp/150FP/archive/hans-boehm/ropes.pdf
+//
+// A note on grapheme clusters: In Unicode, codepoints can be represented using
+// a 32-bit integer. Most codepoints correspond to the intuitive notion of a
+// "letter", which is more formally known as a "grapheme cluster". A grapheme
+// cluster is roughly speaking the amount of text that your cursor moves over
+// when you press the arrow key once. However, some codepoints act as modifiers
+// on other codepoints. For example, U+0301 (COMBINING ACUTE ACCENT) can modify
+// a letter like "e" to form "é". During normalization, this frequently
+// resolves down to a single unicode codepoint, in this case, "é" resolves to
+// the single codepoint U+00E9 (LATIN SMALL LETTER E WITH ACUTE). However, in
+// some cases, multiple codepoints make up a grapheme cluster but *don't*
+// normalize to a single codepoint. For example, LATIN SMALL LETTER E (U+0065)
+// + COMBINING VERTICAL LINE BELOW (U+0329) combine to form an unusual glyph
+// that is not used frequently enough to warrant its own unique codepoint (this
+// is basically what Zalgo text is).
+//
+// There are a lot of benefits to storing unicode text with one grapheme
+// cluster per index in a densely packed array instead of storing the text as
+// variable-width UTF8-encoded bytes. It lets us have one canonical length for
+// the text that can be precomputed and is meaningful to users. It lets us
+// quickly get the Nth "letter" in the text. Substring slicing is fast.
+// However, since not all grapheme clusters take up the same number of
+// codepoints, we're faced with the problem of how to jam multiple codepoints
+// into a single 32-bit slot. Inspired by Raku and MoarVM's approach, this
+// implementation uses "synthetic graphemes" (in Raku's terms, Normal Form
+// Graphemes, aka NFG). A synthetic grapheme is a negative 32-bit signed
+// integer that represents a multi-codepoint grapheme cluster that has been
+// encountered during the program's runtime. These clusters are stored in a
+// lookup array and hash map so that we can rapidly convert between the
+// synthetic grapheme integer ID and the unicode codepoints associated with it.
+// Essentially, it's like we create a supplement to the unicode standard with
+// things that would be nice if they had their own codepoint so things worked
+// out nicely because we're using them right now, and we'll give them a
+// negative number so it doesn't overlap with any real codepoints.
+//
+// Example 1: U+0048, U+00E9
+//   AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E WITH ACUTE
+//   This would be stored as: (int32_t[]){0x48, 0xE9}
+// Example 2: U+0048, U+0065, U+0329
+//   AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E, COMBINING VERTICAL LINE BELOW
+//   This would be stored as: (int32_t[]){0x48, -2}
+//   Where -2 is used as a lookup in an array that holds the actual unicode
+//   codepoints: (ucs4_t[]){0x65, 0x0329}
+
+#include <assert.h>
+#include <ctype.h>
+#include <gc.h>
+#include <printf.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/param.h>
+
+#include <unicase.h>
+#include <unictype.h>
+#include <unigbrk.h>
+#include <uniname.h>
+#include <unistr.h>
+#include <unistring/version.h>
+#include <uniwidth.h>
+
+#include "arrays.h"
+#include "integers.h"
+#include "patterns.h"
+#include "tables.h"
+#include "text.h"
+
+// Use inline version of the siphash code for performance:
+#include "siphash.h"
+#include "siphash-internals.h"
+
+typedef struct { // One registered multi-codepoint grapheme cluster
+    ucs4_t main_codepoint; // set by get_synthetic_grapheme(): the first codepoint, skipping prepended concatenation marks when libunistring >= 1.2.0
+    ucs4_t *utf32_cluster; // length-prefixed: [0] holds the codepoint count, [1..] hold the codepoints
+    const uint8_t *utf8; // NUL-terminated UTF8 encoding of the cluster
+} synthetic_grapheme_t;
+
+// Synthetic grapheme clusters (clusters of more than one codepoint):
+static Table_t grapheme_ids_by_codepoints = {}; // ucs4_t* length-prefixed codepoints -> int32_t ID
+
+// This will hold a dynamically growing array of synthetic graphemes (ID -1 lives at index 0, ID -2 at index 1, ...):
+static synthetic_grapheme_t *synthetic_graphemes = NULL;
+static int32_t synthetic_grapheme_capacity = 0; // number of slots allocated in synthetic_graphemes
+static int32_t num_synthetic_graphemes = 0; // number of slots currently in use
+
+#define NUM_GRAPHEME_CODEPOINTS(id) (synthetic_graphemes[-(id)-1].utf32_cluster[0]) // codepoint count of synthetic grapheme `id` (id must be negative)
+#define GRAPHEME_CODEPOINTS(id) (&synthetic_graphemes[-(id)-1].utf32_cluster[1]) // pointer to the first codepoint of synthetic grapheme `id`
+#define GRAPHEME_UTF8(id) (synthetic_graphemes[-(id)-1].utf8) // NUL-terminated UTF8 bytes of synthetic grapheme `id`
+
+// Somewhat arbitrarily chosen, if two short literal ASCII or grapheme chunks
+// are concatenated at or below this length threshold, we just merge them into
+// a single literal node instead of a concatenation node.
+#define SHORT_ASCII_LENGTH 64 // largest merged length for two ASCII chunks
+#define SHORT_GRAPHEMES_LENGTH 16 // largest merged length when the merged chunk is stored as graphemes
+
+static Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize); // forward declaration (defined later in this file)
+static Text_t simple_concatenation(Text_t a, Text_t b); // forward declaration (defined later in this file)
+
+public Text_t EMPTY_TEXT = { // the zero-length text, returned wherever a result has no graphemes
+    .length=0,
+    .tag=TEXT_ASCII,
+    .ascii=0,
+};
+
+// Equality metamethod for length-prefixed codepoint clusters. `va` and `vb`
+// each point to a `ucs4_t*` whose element [0] is the number of codepoints and
+// whose elements [1..n] are the codepoints themselves. Returns true when both
+// clusters have the same length and the same codepoints.
+PUREFUNC static bool graphemes_equal(const void *va, const void *vb, const TypeInfo_t*) {
+    ucs4_t *a = *(ucs4_t**)va;
+    ucs4_t *b = *(ucs4_t**)vb;
+    if (a[0] != b[0]) return false;
+    // The codepoints occupy indices 1 through a[0] *inclusive*, so the loop
+    // has to reach index a[0] or the final codepoint is never compared:
+    for (ucs4_t i = 1; i <= a[0]; i++)
+        if (a[i] != b[i]) return false;
+    return true;
+}
+
+// Hash metamethod for length-prefixed codepoint clusters. `g` points to a
+// `ucs4_t*`; only the codepoints (elements [1..n]) are fed to the hash, not
+// the length prefix stored in element [0].
+PUREFUNC static uint64_t grapheme_hash(const void *g, const TypeInfo_t*) {
+    ucs4_t *length_prefixed = *(ucs4_t**)g;
+    ucs4_t num_codepoints = length_prefixed[0];
+    void *codepoints = (void*)(length_prefixed + 1);
+    size_t num_bytes = sizeof(ucs4_t) * (size_t)num_codepoints;
+    return siphash24(codepoints, num_bytes);
+}
+
+static const TypeInfo_t GraphemeClusterInfo = { // key type info used to build the grapheme_ids_by_codepoints table info
+    .size=sizeof(ucs4_t*), // each key is a pointer to a length-prefixed codepoint array
+    .align=__alignof__(ucs4_t*),
+    .metamethods={
+        .equal=graphemes_equal, // compares the pointed-to clusters, not the pointers
+        .hash=grapheme_hash, // hashes the pointed-to codepoints
+    },
+};
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstack-protector"
+// Look up (or register) the synthetic grapheme ID for a multi-codepoint
+// grapheme cluster.
+//   codepoints: the UTF32 codepoints that make up the cluster
+//   utf32_len:  how many codepoints are in `codepoints`
+// Returns a negative ID. A cluster that was registered before gets its
+// existing ID back; otherwise the next ID in the sequence -1, -2, -3, ... is
+// assigned and the cluster's UTF32 and UTF8 forms are stored so that
+// synthetic_graphemes[-id-1] describes it.
+// NOTE(review): the result of u32_to_u8() is used without a NULL check;
+// confirm callers only pass codepoints that can be encoded.
+public int32_t get_synthetic_grapheme(const ucs4_t *codepoints, int64_t utf32_len)
+{
+    ucs4_t length_prefixed[1+utf32_len];
+    length_prefixed[0] = (ucs4_t)utf32_len;
+    for (int i = 0; i < utf32_len; i++)
+        length_prefixed[i+1] = codepoints[i];
+    ucs4_t *ptr = &length_prefixed[0];
+
+    // Optimization for common case of one frequently used synthetic grapheme:
+    static int32_t last_grapheme = 0;
+    if (last_grapheme != 0 && graphemes_equal(&ptr, &synthetic_graphemes[-last_grapheme-1].utf32_cluster, NULL))
+        return last_grapheme;
+
+    TypeInfo_t GraphemeIDLookupTableInfo = *Table$info(&GraphemeClusterInfo, &Int32$info);
+    int32_t *found = Table$get(grapheme_ids_by_codepoints, &ptr, &GraphemeIDLookupTableInfo);
+    if (found) return *found;
+
+    // New synthetic grapheme:
+    if (num_synthetic_graphemes >= synthetic_grapheme_capacity) {
+        // If we don't have space, allocate more:
+        synthetic_grapheme_capacity = MAX(128, synthetic_grapheme_capacity * 2);
+        // Each entry holds pointers into garbage-collected arenas, so this
+        // array has to be allocated as pointer-containing memory. An "atomic"
+        // allocation is never scanned by the collector, which would leave the
+        // arenas reachable only by accident.
+        synthetic_grapheme_t *new = GC_MALLOC(sizeof(synthetic_grapheme_t[synthetic_grapheme_capacity]));
+        // On the very first growth the old array is NULL, and memcpy() must
+        // not be handed a NULL source even for a zero-byte copy:
+        if (num_synthetic_graphemes > 0)
+            memcpy(new, synthetic_graphemes, sizeof(synthetic_grapheme_t[num_synthetic_graphemes]));
+        synthetic_graphemes = new;
+    }
+
+    int32_t grapheme_id = -(num_synthetic_graphemes+1);
+    num_synthetic_graphemes += 1;
+
+    // Get UTF8 representation:
+    uint8_t u8_buf[64];
+    size_t u8_len = sizeof(u8_buf)/sizeof(u8_buf[0]);
+    uint8_t *u8 = u32_to_u8(codepoints, (size_t)utf32_len, u8_buf, &u8_len);
+
+    // For performance reasons, use an arena allocator here to ensure that
+    // synthetic graphemes store all of their information in a densely packed
+    // area with good cache locality:
+    static void *arena = NULL, *arena_end = NULL;
+    // Eat up any space needed to make arena 32-bit aligned:
+    if ((size_t)arena % __alignof__(ucs4_t) != 0)
+        arena += __alignof__(ucs4_t) - ((size_t)arena % __alignof__(ucs4_t));
+
+    // If we have filled up this arena, allocate a new one:
+    size_t needed_memory = sizeof(ucs4_t[1+utf32_len]) + sizeof(uint8_t[u8_len + 1]);
+    if (arena + needed_memory > arena_end) {
+        // Do reasonably big chunks at a time, so most synthetic codepoints are
+        // nearby each other in memory and cache locality is good. This is a
+        // rough guess at a good size:
+        size_t chunk_size = MAX(needed_memory, 512);
+        arena = GC_MALLOC_ATOMIC(chunk_size);
+        arena_end = arena + chunk_size;
+    }
+
+    // Copy length-prefixed UTF32 codepoints into the arena and store where they live:
+    ucs4_t *codepoint_copy = arena;
+    mempcpy(codepoint_copy, length_prefixed, sizeof(ucs4_t[1+utf32_len]));
+    synthetic_graphemes[-grapheme_id-1].utf32_cluster = codepoint_copy;
+    arena += sizeof(ucs4_t[1+utf32_len]);
+
+    // Copy UTF8 bytes into the arena and store where they live:
+    uint8_t *utf8_final = arena;
+    memcpy(utf8_final, u8, sizeof(uint8_t[u8_len]));
+    utf8_final[u8_len] = '\0'; // Add a terminating NUL byte
+    synthetic_graphemes[-grapheme_id-1].utf8 = utf8_final;
+    arena += sizeof(uint8_t[u8_len + 1]);
+
+    // Sickos at the unicode consortium decreed that you can have grapheme clusters
+    // that begin with *prefix* modifiers, so we gotta check for that case:
+    synthetic_graphemes[-grapheme_id-1].main_codepoint = length_prefixed[1];
+    for (ucs4_t i = 0; i < utf32_len; i++) {
+#if _LIBUNISTRING_VERSION >= 0x010200
+// libunistring version 1.2.0 introduced uc_is_property_prepended_concatenation_mark()
+// It's not critical, but it's technically more correct to have this check:
+        if (unlikely(uc_is_property_prepended_concatenation_mark(length_prefixed[1+i])))
+            continue;
+#endif
+        synthetic_graphemes[-grapheme_id-1].main_codepoint = length_prefixed[1+i];
+        break;
+    }
+
+    // Cleanup from unicode API:
+    if (u8 != u8_buf) free(u8);
+
+    // The table key is the arena copy (not the stack copy), so it stays valid
+    // after this function returns:
+    Table$set(&grapheme_ids_by_codepoints, &codepoint_copy, &grapheme_id, &GraphemeIDLookupTableInfo);
+
+    last_grapheme = grapheme_id;
+    return grapheme_id;
+}
+#pragma GCC diagnostic pop
+
+// Debugging helper: write an XML-like dump of a text's internal structure
+// (ASCII leaf, grapheme leaf, or concatenation node) to `stream`.
+//   stream: where to write the dump
+//   t:      the text to visualize
+//   depth:  current nesting level; children of a concatenation are indented
+//           by depth+1 spaces and its closing tag by depth spaces
+// Returns the number of bytes written (the sum of the stdio return values).
+int text_visualize(FILE *stream, Text_t t, int depth)
+{
+    switch (t.tag) {
+    case TEXT_ASCII:
+        // The "*" precision of "%.*s" must be passed as an `int`:
+        return fprintf(stream, "<ascii length=%ld>%.*s</ascii>", (long)t.length, (int)t.length, t.ascii);
+    case TEXT_GRAPHEMES: {
+        int printed = fprintf(stream, "<graphemes length=%ld>", (long)t.length);
+        printed += Text$print(stream, t);
+        printed += fprintf(stream, "</graphemes>");
+        return printed;
+    }
+    case TEXT_CONCAT: {
+        int printed = fprintf(stream, "<concat depth=%ld length=%ld>\n", (long)t.depth, (long)t.length);
+        // fputc() returns the character it wrote rather than a byte count, so
+        // indentation and newlines go through fprintf(), whose return value
+        // really is the number of bytes written. "%*s" with an empty string
+        // emits exactly `width` spaces (and nothing when the width is 0).
+        printed += fprintf(stream, "%*s", depth+1, "");
+        printed += text_visualize(stream, *t.left, depth+1);
+        printed += fprintf(stream, "\n%*s", depth+1, "");
+        printed += text_visualize(stream, *t.right, depth+1);
+        printed += fprintf(stream, "\n%*s", depth, "");
+        printed += fprintf(stream, "</concat>");
+        return printed;
+    }
+    default: return 0;
+    }
+}
+
+// Write the UTF8 encoding of a text to `stream`.
+//   stream: the file to write to
+//   t:      the text to print
+// Returns the number of bytes written.
+public int Text$print(FILE *stream, Text_t t)
+{
+    if (t.length == 0) return 0;
+
+    switch (t.tag) {
+    case TEXT_ASCII: return (int)fwrite(t.ascii, sizeof(char), (size_t)t.length, stream);
+    case TEXT_GRAPHEMES: {
+        const int32_t *graphemes = t.graphemes;
+        int written = 0;
+        for (int64_t i = 0; i < t.length; i++) {
+            int32_t grapheme = graphemes[i];
+            if (grapheme >= 0) {
+                // A non-negative value is a plain codepoint; encode it on the fly:
+                uint8_t buf[8];
+                size_t len = sizeof(buf);
+                uint8_t *u8 = u32_to_u8((ucs4_t*)&grapheme, 1, buf, &len);
+                written += (int)fwrite(u8, sizeof(char), len, stream);
+                if (u8 != buf) free(u8);
+            } else {
+                // A negative value is a synthetic grapheme ID whose UTF8 bytes
+                // were stored (NUL-terminated) when it was registered:
+                const uint8_t *u8 = GRAPHEME_UTF8(grapheme);
+                assert(u8);
+                written += (int)fwrite(u8, sizeof(uint8_t), strlen((char*)u8), stream);
+            }
+        }
+        return written;
+    }
+    case TEXT_CONCAT: {
+        // The operands of `+` may be evaluated in either order, so the two
+        // halves are printed in separate statements to guarantee that the
+        // left half reaches the stream before the right half:
+        int written = Text$print(stream, *t.left);
+        written += Text$print(stream, *t.right);
+        return written;
+    }
+    default: return 0;
+    }
+}
+
+static const int64_t min_len_for_depth[MAX_TEXT_DEPTH] = { // entry [d] is the smallest length a text of depth d may have and still count as balanced
+    // Fibonacci numbers (skipping first two)
+    1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946,
+    17711, 28657, 46368, 75025, 121393, 196418, 317811, 514229, 832040, 1346269, 2178309, 3524578,
+    5702887, 9227465, 14930352, 24157817, 39088169, 63245986, 102334155, 165580141, 267914296,
+    433494437, 701408733, 1134903170, 1836311903, 2971215073, 4807526976, 7778742049,
+};
+
+#define IS_BALANCED_TEXT(t) ((t).length >= min_len_for_depth[(t).depth]) // NOTE(review): indexes the table with (t).depth, so callers must ensure depth < MAX_TEXT_DEPTH
+
+// Add `to_insert` to the right-hand end of the sequence held in
+// `balanced_texts`. Occupied slots that are too small for the new piece are
+// emptied and joined (in order) in front of it, and the joined text is then
+// stored in the slot that matches its length according to min_len_for_depth.
+static void insert_balanced(Text_t balanced_texts[MAX_TEXT_DEPTH], Text_t to_insert)
+{
+    Text_t merged = EMPTY_TEXT;
+    int slot = 0;
+
+    // Sweep up every occupied slot below the one `to_insert` belongs in:
+    while (to_insert.length > min_len_for_depth[slot + 1]) {
+        if (balanced_texts[slot].length != 0) {
+            merged = simple_concatenation(balanced_texts[slot], merged);
+            balanced_texts[slot] = EMPTY_TEXT;
+        }
+        slot += 1;
+    }
+
+    merged = simple_concatenation(merged, to_insert);
+
+    // Keep absorbing occupied slots for as long as the merged text is long
+    // enough to qualify for the slot being examined:
+    for (; merged.length >= min_len_for_depth[slot]; slot++) {
+        if (balanced_texts[slot].length == 0)
+            continue;
+        merged = simple_concatenation(balanced_texts[slot], merged);
+        balanced_texts[slot] = EMPTY_TEXT;
+    }
+
+    // The loop stops one slot past the last one the merged text qualified for:
+    balanced_texts[slot - 1] = merged;
+}
+
+// Feed `text` into the `balanced_texts` slots, left to right. Concatenation
+// nodes that are too deep or not balanced are taken apart and their halves
+// inserted separately; everything else is inserted as a single piece.
+static void insert_balanced_recursive(Text_t balanced_texts[MAX_TEXT_DEPTH], Text_t text)
+{
+    // The depth bound is tested *before* IS_BALANCED_TEXT(), because that
+    // macro indexes min_len_for_depth[] with the depth and must not be
+    // evaluated for a depth that is outside the table:
+    if (text.tag == TEXT_CONCAT && (text.depth >= MAX_TEXT_DEPTH || !IS_BALANCED_TEXT(text))) {
+        insert_balanced_recursive(balanced_texts, *text.left);
+        insert_balanced_recursive(balanced_texts, *text.right);
+    } else {
+        insert_balanced(balanced_texts, text);
+    }
+}
+
+// Build a rebalanced text equivalent to `a` followed by `b`: both inputs are
+// distributed into length-indexed slots, and the slots are then joined back
+// together from the smallest slot upward.
+static Text_t rebalanced(Text_t a, Text_t b)
+{
+    Text_t slots[MAX_TEXT_DEPTH];
+    memset(slots, 0, sizeof(slots));
+    insert_balanced_recursive(slots, a);
+    insert_balanced_recursive(slots, b);
+
+    // Each higher slot holds text that comes *before* what has been collected
+    // so far, so it is prepended. Stop once every grapheme is accounted for.
+    Text_t result = EMPTY_TEXT;
+    int slot = 0;
+    while (result.length < a.length + b.length) {
+        if (slots[slot].length)
+            result = simple_concatenation(slots[slot], result);
+        slot++;
+    }
+    return result;
+}
+
+// Join two texts with a concatenation node, without trying to merge or
+// normalize anything at the seam. An empty operand yields the other operand
+// unchanged. If the new node would reach MAX_TEXT_DEPTH, the result is
+// rebalanced instead.
+Text_t simple_concatenation(Text_t a, Text_t b)
+{
+    if (a.length == 0)
+        return b;
+    else if (b.length == 0)
+        return a;
+
+    uint16_t new_depth = 1 + (a.depth > b.depth ? a.depth : b.depth);
+    // Only rebalance when the depth limit forces it. Plenty of concatenations
+    // are throwaway intermediates (e.g. appending in a loop), so balancing
+    // every one of them would be wasted effort.
+    if (new_depth >= MAX_TEXT_DEPTH)
+        return rebalanced(a, b);
+
+    // Both halves live side by side in a single allocation:
+    Text_t *halves = GC_MALLOC(sizeof(Text_t[2]));
+    halves[0] = a;
+    halves[1] = b;
+    return (Text_t){
+        .tag=TEXT_CONCAT,
+        .length=a.length + b.length,
+        .depth=new_depth,
+        .left=&halves[0],
+        .right=&halves[1],
+    };
+}
+
+// Concatenate two texts without checking whether the graphemes at the seam
+// would combine. Short leaf chunks are merged into a single new leaf instead
+// of creating a concatenation node.
+static Text_t concat2_assuming_safe(Text_t a, Text_t b)
+{
+    if (a.length == 0) return b;
+    if (b.length == 0) return a;
+
+    const size_t total = (size_t)(a.length + b.length);
+
+    // Two short ASCII chunks become one ASCII chunk:
+    if (a.tag == TEXT_ASCII && b.tag == TEXT_ASCII && total <= SHORT_ASCII_LENGTH) {
+        struct Text_s ret = {
+            .tag=TEXT_ASCII,
+            .length=a.length + b.length,
+        };
+        char *merged = GC_MALLOC_ATOMIC(sizeof(char[ret.length]));
+        memcpy(merged, a.ascii, sizeof(char[a.length]));
+        memcpy(merged + a.length, b.ascii, sizeof(char[b.length]));
+        ret.ascii = merged;
+        return ret;
+    }
+
+    // Two short grapheme chunks become one grapheme chunk:
+    if (a.tag == TEXT_GRAPHEMES && b.tag == TEXT_GRAPHEMES && total <= SHORT_GRAPHEMES_LENGTH) {
+        struct Text_s ret = {
+            .tag=TEXT_GRAPHEMES,
+            .length=a.length + b.length,
+        };
+        int32_t *merged = GC_MALLOC_ATOMIC(sizeof(int32_t[ret.length]));
+        memcpy(merged, a.graphemes, sizeof(int32_t[a.length]));
+        memcpy(merged + a.length, b.graphemes, sizeof(int32_t[b.length]));
+        ret.graphemes = merged;
+        return ret;
+    }
+
+    // A short mix of leaf chunks: widen the ASCII bytes to 32-bit graphemes
+    // so the pair still fits in a single small grapheme chunk.
+    if (a.tag != TEXT_CONCAT && b.tag != TEXT_CONCAT && total <= SHORT_GRAPHEMES_LENGTH) {
+        struct Text_s ret = {
+            .tag=TEXT_GRAPHEMES,
+            .length=a.length + b.length,
+        };
+        int32_t *merged = GC_MALLOC_ATOMIC(sizeof(int32_t[ret.length]));
+        int32_t *out = merged;
+        if (a.tag == TEXT_GRAPHEMES) {
+            memcpy(out, a.graphemes, sizeof(int32_t[a.length]));
+            out += a.length;
+        } else {
+            for (int64_t i = 0; i < a.length; i++)
+                *out++ = (int32_t)a.ascii[i];
+        }
+        if (b.tag == TEXT_GRAPHEMES) {
+            memcpy(out, b.graphemes, sizeof(int32_t[b.length]));
+        } else {
+            for (int64_t i = 0; i < b.length; i++)
+                *out++ = (int32_t)b.ascii[i];
+        }
+        ret.graphemes = merged;
+        return ret;
+    }
+
+    // When `a` ends in a leaf and `b` is a leaf, try joining those two leaves
+    // first, which gives them a chance to be merged by the cases above:
+    if (a.tag == TEXT_CONCAT && b.tag != TEXT_CONCAT && a.right->tag != TEXT_CONCAT)
+        return concat2_assuming_safe(*a.left, concat2_assuming_safe(*a.right, b));
+
+    return simple_concatenation(a, b);
+}
+
+// Concatenate two texts, taking care of the seam between them: the last
+// grapheme of `a` and the first grapheme of `b` may normalize differently or
+// fuse into a single grapheme cluster once they are adjacent. When that
+// happens, those two graphemes are replaced with a freshly built "glue" text;
+// otherwise the texts are joined as-is.
+// NOTE(review): the result of u32_normalize() is used without a NULL check.
+static Text_t concat2(Text_t a, Text_t b)
+{
+    if (a.length == 0) return b;
+    if (b.length == 0) return a;
+
+    int32_t last_a = Text$get_grapheme(a, a.length-1);
+    int32_t first_b = Text$get_grapheme(b, 0);
+
+    // Magic number, we know that no codepoints below here trigger instability:
+    static const int32_t LOWEST_CODEPOINT_TO_CHECK = 0x300; // COMBINING GRAVE ACCENT
+    if (last_a >= 0 && last_a < LOWEST_CODEPOINT_TO_CHECK && first_b >= 0 && first_b < LOWEST_CODEPOINT_TO_CHECK)
+        return concat2_assuming_safe(a, b);
+
+    // Gather the codepoints of both seam graphemes (a negative grapheme is a
+    // synthetic ID that expands to several codepoints):
+    size_t len = (last_a >= 0) ? 1 : NUM_GRAPHEME_CODEPOINTS(last_a);
+    len += (first_b >= 0) ? 1 : NUM_GRAPHEME_CODEPOINTS(first_b);
+
+    ucs4_t codepoints[len];
+    ucs4_t *dest = codepoints;
+    if (last_a < 0)
+        dest = mempcpy(dest, GRAPHEME_CODEPOINTS(last_a), sizeof(ucs4_t[NUM_GRAPHEME_CODEPOINTS(last_a)]));
+    else
+        *(dest++) = (ucs4_t)last_a;
+
+    if (first_b < 0)
+        dest = mempcpy(dest, GRAPHEME_CODEPOINTS(first_b), sizeof(ucs4_t[NUM_GRAPHEME_CODEPOINTS(first_b)]));
+    else
+        *(dest++) = (ucs4_t)first_b;
+
+    // Do a normalization run for these two codepoints and see if it looks different.
+    // Normalization should not exceed 3x in the input length (but if it does, it will be
+    // handled gracefully)
+    ucs4_t norm_buf[3*len];
+    size_t norm_length = sizeof(norm_buf)/sizeof(norm_buf[0]);
+    ucs4_t *normalized = u32_normalize(UNINORM_NFC, codepoints, len, norm_buf, &norm_length);
+    bool stable = (norm_length == len && memcmp(codepoints, normalized, sizeof(codepoints)) == 0);
+
+    if (stable) {
+        // Even if normalization changed nothing, the two graphemes may still
+        // join into one cluster (the first grapheme break is at the very end):
+        const void *second_grapheme = u32_grapheme_next(normalized, &normalized[norm_length]);
+        if (second_grapheme == &normalized[norm_length])
+            stable = false;
+    }
+
+    if likely (stable) {
+        if (normalized != norm_buf)
+            free(normalized);
+        return concat2_assuming_safe(a, b);
+    }
+
+    // Build the glue from `normalized`, not `norm_buf`: when the result did
+    // not fit in `norm_buf`, u32_normalize() returns a separately allocated
+    // buffer and `norm_buf` does not hold the normalized codepoints.
+    Text_t glue = text_from_u32(normalized, (int64_t)norm_length, false);
+
+    if (normalized != norm_buf)
+        free(normalized);
+
+    // Swap the two seam graphemes for the glue, keeping the rest of each side:
+    if (a.length == 1 && b.length == 1)
+        return glue;
+    else if (a.length == 1)
+        return concat2_assuming_safe(glue, Text$slice(b, I(2), I(b.length)));
+    else if (b.length == 1)
+        return concat2_assuming_safe(Text$slice(a, I(1), I(a.length-1)), glue);
+    else
+        return concat2_assuming_safe(
+            concat2_assuming_safe(Text$slice(a, I(1), I(a.length-1)), glue),
+            Text$slice(b, I(2), I(b.length)));
+}
+
+// Concatenate `n` texts in order.
+//   n:     the number of entries in `items`
+//   items: the texts to join
+// Returns the empty text when `n` is 0.
+public Text_t Text$_concat(int n, Text_t items[n])
+{
+    if (n == 0) return EMPTY_TEXT;
+
+    Text_t joined = items[0];
+    int i = 1;
+    while (i < n) {
+        Text_t next = items[i++];
+        // Empty items contribute nothing, so skip them:
+        if (next.length <= 0)
+            continue;
+        joined = concat2(joined, next);
+    }
+    return joined;
+}
+
+// Repeat a text `count` times.
+//   text:  the text to repeat
+//   count: how many copies to produce
+// Returns the empty text when `text` is empty or `count` is zero or negative.
+// Fails if the result would be longer than 2^40 graphemes.
+public Text_t Text$repeat(Text_t text, Int_t count)
+{
+    if (text.length == 0 || Int$is_negative(count))
+        return EMPTY_TEXT;
+
+    Int_t result_len = Int$times(count, I(text.length));
+    if (Int$compare_value(result_len, I(1l<<40)) > 0)
+        fail("Text repeating would produce too big of an result!");
+
+    int64_t count64 = Int64$from_int(count, false);
+    // Zero repetitions is the empty text. Without this check the loop below
+    // would not run and the original text would be returned as if the count
+    // had been 1.
+    if (count64 == 0)
+        return EMPTY_TEXT;
+
+    Text_t ret = text;
+    for (int64_t c = 1; c < count64; c++)
+        ret = concat2(ret, text);
+    return ret;
+}
+
+// Compute the display width of a text, as reported by libunistring's
+// u8_strwidth().
+//   text:     the text to measure
+//   language: passed to u8_strwidth() as its second argument
+public Int_t Text$width(Text_t text, Text_t language)
+{
+    const uint8_t *utf8 = (const uint8_t*)Text$as_c_string(text);
+    const char *lang_str = Text$as_c_string(language);
+    int columns = u8_strwidth(utf8, lang_str);
+    return Int$from_int32(columns);
+}
+
+// Build padding by repeating `to_repeat` until it fills `target_width`
+// display columns.
+//   to_repeat:    the text to repeat
+//   target_width: the number of columns to fill (<= 0 yields the empty text)
+//   language:     passed to u8_strwidth() when measuring widths
+// Whole copies are used while they fit; the remainder is filled one grapheme
+// at a time, and if the next grapheme would overshoot, spaces fill what is
+// left. Returns the empty text if `to_repeat` has no display width, since no
+// number of repetitions could fill any columns.
+static Text_t Text$repeat_to_width(Text_t to_repeat, int64_t target_width, Text_t language)
+{
+    if (target_width <= 0)
+        return EMPTY_TEXT;
+
+    const char *lang_str = Text$as_c_string(language);
+    int64_t width = (int64_t)u8_strwidth((const uint8_t*)Text$as_c_string(to_repeat), lang_str);
+    // Zero-width padding (e.g. only combining or zero-width characters) would
+    // never advance `repeated_width`, so the loop below would never finish:
+    if (width <= 0)
+        return EMPTY_TEXT;
+
+    Text_t repeated = EMPTY_TEXT;
+    int64_t repeated_width = 0;
+    while (repeated_width + width <= target_width) {
+        repeated = concat2(repeated, to_repeat);
+        repeated_width += width;
+    }
+
+    if (repeated_width < target_width) {
+        for (int64_t i = 0; repeated_width < target_width && i < to_repeat.length; i++) {
+            Text_t c = Text$slice(to_repeat, I_small(i+1), I_small(i+1));
+            int64_t w = (int64_t)u8_strwidth((const uint8_t*)Text$as_c_string(c), lang_str);
+            if (repeated_width + w > target_width) {
+                // This grapheme is too wide for the remaining space, so fill
+                // the rest with single-column spaces instead:
+                repeated = concat2(repeated, Text$repeat(Text(" "), I(target_width - repeated_width)));
+                repeated_width = target_width;
+                break;
+            }
+            repeated = concat2(repeated, c);
+            repeated_width += w;
+        }
+    }
+
+    return repeated;
+}
+
+// Pad a text on the left until it is `width` display columns wide.
+//   text:     the text to pad
+//   width:    the desired total display width
+//   padding:  the text repeated to make up the difference (must not be empty)
+//   language: used when measuring display widths
+public Text_t Text$left_pad(Text_t text, Int_t width, Text_t padding, Text_t language)
+{
+    if (padding.length == 0)
+        fail("Cannot pad with an empty text!");
+
+    int64_t target = Int64$from_int(width, false);
+    int64_t current = Int64$from_int(Text$width(text, language), false);
+    Text_t fill = Text$repeat_to_width(padding, target - current, language);
+    return concat2(fill, text);
+}
+
+// Pad a text on the right until it is `width` display columns wide.
+//   text:     the text to pad
+//   width:    the desired total display width
+//   padding:  the text repeated to make up the difference (must not be empty)
+//   language: used when measuring display widths
+public Text_t Text$right_pad(Text_t text, Int_t width, Text_t padding, Text_t language)
+{
+    if (padding.length == 0)
+        fail("Cannot pad with an empty text!");
+
+    int64_t target = Int64$from_int(width, false);
+    int64_t current = Int64$from_int(Text$width(text, language), false);
+    Text_t fill = Text$repeat_to_width(padding, target - current, language);
+    return concat2(text, fill);
+}
+
+// Pad a text on both sides until it is `width` display columns wide. When the
+// missing width is odd, the extra column goes on the right.
+//   text:     the text to pad
+//   width:    the desired total display width
+//   padding:  the text repeated to make up the difference (must not be empty)
+//   language: used when measuring display widths
+public Text_t Text$middle_pad(Text_t text, Int_t width, Text_t padding, Text_t language)
+{
+    if (padding.length == 0)
+        fail("Cannot pad with an empty text!");
+
+    int64_t target = Int64$from_int(width, false);
+    int64_t current = Int64$from_int(Text$width(text, language), false);
+    int64_t needed = target - current;
+    Text_t left_fill = Text$repeat_to_width(padding, needed/2, language);
+    Text_t right_fill = Text$repeat_to_width(padding, (needed+1)/2, language);
+    return Texts(left_fill, text, right_fill);
+}
+
+// Return the graphemes of `text` from index `first_int` through `last_int`
+// (1-indexed, inclusive).
+//   first_int: index of the first grapheme; negative values count back from
+//              the end (-1 is the last grapheme); 0 is an error
+//   last_int:  index of the last grapheme; negative values count back from
+//              the end; 0 yields the empty text
+// Indices that fall outside the text are clamped to its bounds, and an empty
+// range yields the empty text. A slice of a leaf points into the original
+// text's storage rather than copying it.
+public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int)
+{
+    int64_t first = Int64$from_int(first_int, false);
+    int64_t last = Int64$from_int(last_int, false);
+    if (first == 0) fail("Invalid index: 0");
+    if (last == 0) return EMPTY_TEXT;
+
+    if (first < 0) first = text.length + first + 1;
+    if (last < 0) last = text.length + last + 1;
+
+    // A negative `first` that reaches back past the start of the text is
+    // still below 1 here. Clamp it, otherwise the leaf cases below would
+    // compute a pointer before the start of the storage and a length longer
+    // than the text.
+    if (first < 1) first = 1;
+    if (last > text.length) last = text.length;
+
+    if (first > text.length || last < first)
+        return EMPTY_TEXT;
+
+    if (first == 1 && last == text.length)
+        return text;
+
+    // Walk down the tree for as long as the range lies entirely in one child:
+    while (text.tag == TEXT_CONCAT) {
+        if (last < text.left->length) {
+            text = *text.left;
+        } else if (first > text.left->length) {
+            first -= text.left->length;
+            last -= text.left->length;
+            text = *text.right;
+        } else {
+            // The range spans both children: slice each and join the pieces.
+            // (The left slice's end index is clamped to the left child's length.)
+            return concat2(Text$slice(*text.left, I(first), I(text.length)),
+                           Text$slice(*text.right, I(1), I(last-text.left->length)));
+        }
+    }
+
+    switch (text.tag) {
+    case TEXT_ASCII: {
+        return (Text_t){
+            .tag=TEXT_ASCII,
+            .length=last - first + 1,
+            .ascii=text.ascii + (first-1),
+        };
+    }
+    case TEXT_GRAPHEMES: {
+        return (Text_t){
+            .tag=TEXT_GRAPHEMES,
+            .length=last - first + 1,
+            .graphemes=text.graphemes + (first-1),
+        };
+    }
+    default: errx(1, "Invalid tag");
+    }
+}
+
+// Return the part of `text` that starts at index `first` and runs to the end.
+public Text_t Text$from(Text_t text, Int_t first)
+{
+    Int_t last = I_small(-1); // -1 refers to the final grapheme
+    return Text$slice(text, first, last);
+}
+
+// Return the part of `text` from its beginning through index `last`.
+public Text_t Text$to(Text_t text, Int_t last)
+{
+    Int_t first = I_small(1); // indices are 1-based
+    return Text$slice(text, first, last);
+}
+
+// Return a new text holding the graphemes of `text` in reverse order. Leaf
+// chunks are copied back-to-front into a fresh allocation; a concatenation is
+// reversed by reversing each half and swapping their order.
+public Text_t Text$reversed(Text_t text)
+{
+    switch (text.tag) {
+    case TEXT_ASCII: {
+        char *flipped = GC_MALLOC_ATOMIC(sizeof(char[text.length]));
+        const char *src = text.ascii;
+        // Fill the destination from its last slot down to its first:
+        for (int64_t dest = text.length - 1; dest >= 0; dest--)
+            flipped[dest] = *(src++);
+        struct Text_s ret = {
+            .tag=TEXT_ASCII,
+            .length=text.length,
+            .ascii=flipped,
+        };
+        return ret;
+    }
+    case TEXT_GRAPHEMES: {
+        int32_t *flipped = GC_MALLOC_ATOMIC(sizeof(int32_t[text.length]));
+        const int32_t *src = text.graphemes;
+        for (int64_t dest = text.length - 1; dest >= 0; dest--)
+            flipped[dest] = *(src++);
+        struct Text_s ret = {
+            .tag=TEXT_GRAPHEMES,
+            .length=text.length,
+            .graphemes=flipped,
+        };
+        return ret;
+    }
+    case TEXT_CONCAT: {
+        return concat2(Text$reversed(*text.right), Text$reversed(*text.left));
+    }
+    default: errx(1, "Invalid tag");
+    }
+}
+
+// Return the single grapheme cluster at a given position as a text of
+// length 1 (copied into a fresh allocation).
+//   text:      the text to index into
+//   index_int: 1-based index; negative values count back from the end
+//              (-1 is the last grapheme)
+// Fails if the index is 0 or lies outside the text.
+public PUREFUNC Text_t Text$cluster(Text_t text, Int_t index_int)
+{
+    int64_t index = Int64$from_int(index_int, false);
+    if (index == 0) fail("Invalid index: 0");
+
+    if (index < 0) index = text.length + index + 1;
+
+    if (index > text.length || index < 1)
+        fail("Invalid index: %ld is beyond the length of the text (length = %ld)",
+             Int64$from_int(index_int, false), text.length);
+
+    while (text.tag == TEXT_CONCAT) {
+        if (index <= text.left->length) {
+            text = *text.left;
+        } else {
+            // Moving into the right child: make the index relative to that
+            // child by discounting everything in the left child. Without
+            // this, the leaf lookup below reads past the end of the leaf.
+            index -= text.left->length;
+            text = *text.right;
+        }
+    }
+
+    switch (text.tag) {
+    case TEXT_ASCII: {
+        struct Text_s ret = {
+            .tag=TEXT_ASCII,
+            .length=1,
+            .ascii=GC_MALLOC_ATOMIC(sizeof(char)),
+        };
+        *(char*)&ret.ascii[0] = text.ascii[index-1];
+        return ret;
+    }
+    case TEXT_GRAPHEMES: {
+        struct Text_s ret = {
+            .tag=TEXT_GRAPHEMES,
+            .length=1,
+            .graphemes=GC_MALLOC_ATOMIC(sizeof(int32_t)),
+        };
+        *(int32_t*)&ret.graphemes[0] = text.graphemes[index-1];
+        return ret;
+    }
+    default: errx(1, "Invalid tag");
+    }
+}
+
+// Build a grapheme-array text from UTF32 codepoints.
+//   codepoints:     the input codepoints
+//   num_codepoints: how many codepoints are in the input
+//   normalize:      when true, the input is NFC-normalized first
+// Every single-codepoint grapheme cluster is stored directly; every
+// multi-codepoint cluster is stored as a (negative) synthetic grapheme ID.
+// NOTE(review): the result of u32_normalize() is used without a NULL check.
+Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize)
+{
+    // Normalization is apparently guaranteed to never exceed 3x in the input length.
+    // A variable-length array must have at least one element, so the size is
+    // kept at 1 or more even when the input is empty.
+    ucs4_t norm_buf[MAX(1, MIN(256, 3*num_codepoints))];
+    if (normalize) {
+        size_t norm_length = sizeof(norm_buf)/sizeof(norm_buf[0]);
+        ucs4_t *normalized = u32_normalize(UNINORM_NFC, codepoints, (size_t)num_codepoints, norm_buf, &norm_length);
+        codepoints = normalized;
+        num_codepoints = (int64_t)norm_length;
+    }
+
+    // Intentionally overallocate here: allocate assuming each codepoint is a
+    // grapheme cluster. If that's not true, we'll have extra space at the end
+    // of the array, but the length will still be calculated correctly.
+    int32_t *graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t) * (size_t)num_codepoints);
+    struct Text_s ret = {
+        .tag=TEXT_GRAPHEMES,
+        .length=0,
+        .graphemes=graphemes,
+    };
+    const ucs4_t *src = codepoints;
+    while (src < &codepoints[num_codepoints]) {
+        // TODO: use grapheme breaks instead of u32_grapheme_next()?
+        const ucs4_t *next = u32_grapheme_next(src, &codepoints[num_codepoints]);
+        if (next == &src[1]) {
+            // The cluster is exactly one codepoint, so store it directly:
+            graphemes[ret.length] = (int32_t)*src;
+        } else {
+            // Synthetic grapheme
+            graphemes[ret.length] = get_synthetic_grapheme(src, next-src);
+        }
+        ++ret.length;
+        src = next;
+    }
+    // u32_normalize() hands back its own allocation when the result did not
+    // fit in norm_buf, and that allocation has to be released here:
+    if (normalize && codepoints != norm_buf) free(codepoints);
+    return ret;
+}
+
+// Create a text from the first `len` bytes of a UTF8 string.
+//   str: the bytes to convert (does not need to be NUL-terminated)
+//   len: the number of bytes to read from `str`
+// Returns NONE_TEXT if the bytes are not valid UTF8. All-ASCII input is
+// copied verbatim into an ASCII text; anything else is NFC-normalized and
+// stored as graphemes.
+public OptionalText_t Text$from_strn(const char *str, size_t len)
+{
+    int64_t ascii_span = 0;
+    for (size_t i = 0; i < len && isascii(str[i]); i++)
+        ascii_span++;
+
+    if (ascii_span == (int64_t)len) { // All ASCII
+        char *copy = GC_MALLOC_ATOMIC(len);
+        memcpy(copy, str, len);
+        return (Text_t){
+            .tag=TEXT_ASCII,
+            .length=ascii_span,
+            .ascii=copy,
+        };
+    } else {
+        if (u8_check((uint8_t*)str, len) != NULL)
+            return NONE_TEXT;
+
+        ucs4_t buf[128];
+        size_t length = sizeof(buf)/sizeof(buf[0]);
+
+        // Convert exactly the `len` bytes that were validated above. Measuring
+        // with strlen() instead would read past `len` whenever the caller
+        // passed a prefix of a longer (or unterminated) string.
+        ucs4_t *codepoints = u8_to_u32((uint8_t*)str, len, buf, &length);
+        Text_t ret = text_from_u32(codepoints, (int64_t)length, true);
+        if (codepoints != buf) free(codepoints);
+        return ret;
+    }
+}
+
+// Create a text from a NUL-terminated UTF8 string. A NULL pointer is treated
+// as the empty string.
+public OptionalText_t Text$from_str(const char *str)
+{
+    if (!str)
+        return Text("");
+    return Text$from_strn(str, strlen(str));
+}
+
+// Make sure `*buf` can hold `extra` more bytes on top of the `used` bytes
+// already written, growing it (with one spare byte) when it cannot.
+static void u8_buf_reserve(char **buf, int64_t *capacity, int64_t used, int64_t extra)
+{
+    if (used + extra <= (int64_t)*capacity)
+        return;
+    *capacity = used + extra + 1;
+    *buf = GC_REALLOC(*buf, (size_t)*capacity);
+}
+
+// Append the UTF8 encoding of `text` to a growable byte buffer.
+//   text:     the text to encode
+//   buf:      the buffer (may be reallocated, so `*buf` can change)
+//   capacity: the buffer's current capacity in bytes (updated on growth)
+//   i:        the number of bytes written so far (advanced by this call)
+// No NUL terminator is written.
+static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i)
+{
+    if (text.tag == TEXT_CONCAT) {
+        u8_buf_append(*text.left, buf, capacity, i);
+        u8_buf_append(*text.right, buf, capacity, i);
+        return;
+    }
+
+    if (text.tag == TEXT_ASCII) {
+        u8_buf_reserve(buf, capacity, *i, text.length);
+        memcpy(*buf + *i, text.ascii, (size_t)text.length);
+        *i += text.length;
+        return;
+    }
+
+    if (text.tag != TEXT_GRAPHEMES)
+        return;
+
+    const int32_t *graphemes = text.graphemes;
+    for (int64_t g = 0; g < text.length; g++) {
+        uint8_t scratch[64];
+        uint8_t *encoded = NULL; // set only when a codepoint is encoded on the fly
+        const uint8_t *bytes;
+        size_t num_bytes;
+
+        if (graphemes[g] >= 0) {
+            // Plain codepoint: encode it now
+            num_bytes = sizeof(scratch);
+            encoded = u32_to_u8((ucs4_t*)&graphemes[g], 1, scratch, &num_bytes);
+            bytes = encoded;
+        } else {
+            // Synthetic grapheme: its UTF8 bytes were stored at registration
+            bytes = GRAPHEME_UTF8(graphemes[g]);
+            num_bytes = u8_strlen(bytes);
+        }
+
+        u8_buf_reserve(buf, capacity, *i, (int64_t)num_bytes);
+        memcpy(*buf + *i, bytes, num_bytes);
+        *i += (int64_t)num_bytes;
+
+        // Release the encoder's buffer if it had to allocate one:
+        if (encoded != NULL && encoded != scratch)
+            free(encoded);
+    }
+}
+
+// Flatten a Text into a NUL-terminated UTF-8 C string. The result lives in a
+// buffer obtained from GC_MALLOC_ATOMIC()/GC_REALLOC().
+public char *Text$as_c_string(Text_t text)
+{
+    // Start with one byte per grapheme plus a terminator; u8_buf_append()
+    // enlarges the buffer when graphemes need more than one byte each.
+    int64_t buf_size = text.length + 1;
+    int64_t used = 0;
+    char *bytes = GC_MALLOC_ATOMIC((size_t)buf_size);
+    u8_buf_append(text, &bytes, &buf_size, &used);
+
+    // Make sure there is a slot left for the terminator:
+    if (used >= buf_size) {
+        buf_size = used + 1;
+        bytes = GC_REALLOC(bytes, (size_t)buf_size);
+    }
+    bytes[used] = '\0';
+    return bytes;
+}
+
+// Hash a Text with siphash. Graphemes are fed to the hash two at a time
+// (packed into one 64-bit word); if the length is odd, the final grapheme is
+// passed to the finishing step, otherwise 0 is.
+//
+// All three representations (ASCII bytes, grapheme array, concatenation tree)
+// must feed exactly the same sequence of words, so that texts which compare
+// equal hash equally no matter how they are stored.
+PUREFUNC public uint64_t Text$hash(const void *obj, const TypeInfo_t*)
+{
+    Text_t text = *(Text_t*)obj;
+    siphash sh;
+    siphashinit(&sh, sizeof(int32_t[text.length]));
+
+    // Two 32-bit graphemes viewed as one 64-bit word:
+    union {
+        int32_t chunks[2];
+        uint64_t whole;
+    } tmp;
+    switch (text.tag) {
+    case TEXT_ASCII: {
+        const char *bytes = text.ascii;
+        for (int64_t i = 0; i + 1 < text.length; i += 2) {
+            tmp.chunks[0] = (int32_t)bytes[i];
+            tmp.chunks[1] = (int32_t)bytes[i+1];
+            siphashadd64bits(&sh, tmp.whole);
+        }
+        int32_t last = text.length & 0x1 ? (int32_t)bytes[text.length-1] : 0; // Odd number of graphemes
+        return siphashfinish_last_part(&sh, (uint64_t)last);
+    }
+    case TEXT_GRAPHEMES: {
+        const int32_t *graphemes = text.graphemes;
+        for (int64_t i = 0; i + 1 < text.length; i += 2) {
+            tmp.chunks[0] = graphemes[i];
+            tmp.chunks[1] = graphemes[i+1]; // second half of the pair, not graphemes[i] again
+            siphashadd64bits(&sh, tmp.whole);
+        }
+        int32_t last = text.length & 0x1 ? graphemes[text.length-1] : 0; // Odd number of graphemes
+        return siphashfinish_last_part(&sh, (uint64_t)last);
+    }
+    case TEXT_CONCAT: {
+        TextIter_t state = NEW_TEXT_ITER_STATE(text);
+        for (int64_t i = 0; i + 1 < text.length; i += 2) {
+            tmp.chunks[0] = Text$get_grapheme_fast(&state, i);
+            tmp.chunks[1] = Text$get_grapheme_fast(&state, i+1); // must fill chunks[1], not overwrite chunks[0]
+            siphashadd64bits(&sh, tmp.whole);
+        }
+
+        int32_t last = (text.length & 0x1) ? Text$get_grapheme_fast(&state, text.length-1) : 0;
+        return siphashfinish_last_part(&sh, (uint64_t)last);
+    }
+    default: errx(1, "Invalid text");
+    }
+}
+
+// Return the grapheme at 0-based `index` of the text held in `state->stack[0]`,
+// or 0 when `index` is out of range.
+//
+// `state` keeps a stack of (text, offset) frames describing the path taken
+// through the concatenation tree on the previous lookup, where `offset` is the
+// index (within the root text) of the frame's first grapheme. Each lookup
+// first pops frames that do not contain `index`, then pushes child frames
+// until it reaches a non-concat node, so nearby lookups reuse most of the
+// path.
+public int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index)
+{
+    if (index < 0 || index >= state->stack[0].text.length)
+        return 0;
+
+    assert(state->stack[0].text.depth <= MAX_TEXT_DEPTH);
+
+    // Pop frames until the current frame's span contains `index`:
+    for (;;) {
+        int64_t span_start = state->stack[state->stack_index].offset;
+        int64_t span_end = span_start + state->stack[state->stack_index].text.length;
+        if (index >= span_start && index < span_end)
+            break;
+        state->stack_index -= 1;
+        assert(state->stack_index >= 0);
+    }
+
+    assert(state->stack_index >= 0 && state->stack_index <= MAX_TEXT_DEPTH);
+
+    // Push child frames until the current frame is a leaf:
+    while (state->stack[state->stack_index].text.tag == TEXT_CONCAT) {
+        Text_t parent = state->stack[state->stack_index].text;
+        int64_t parent_offset = state->stack[state->stack_index].offset;
+        assert(state->stack_index <= MAX_TEXT_DEPTH);
+        assert(index >= parent_offset);
+        assert(index < parent_offset + parent.length);
+
+        // The right child begins where the left child ends:
+        int64_t right_offset = parent_offset + parent.left->length;
+
+        state->stack_index += 1;
+        if (index >= right_offset) {
+            state->stack[state->stack_index].text = *parent.right;
+            state->stack[state->stack_index].offset = right_offset;
+        } else {
+            state->stack[state->stack_index].text = *parent.left;
+            state->stack[state->stack_index].offset = parent_offset;
+        }
+        assert(state->stack_index >= 0 && state->stack_index <= MAX_TEXT_DEPTH);
+    }
+
+    Text_t leaf = state->stack[state->stack_index].text;
+    int64_t leaf_offset = state->stack[state->stack_index].offset;
+
+    if (index < leaf_offset || index >= leaf_offset + leaf.length)
+        return 0;
+
+    int64_t local_index = index - leaf_offset;
+    switch (leaf.tag) {
+    case TEXT_ASCII: return (int32_t)leaf.ascii[local_index];
+    case TEXT_GRAPHEMES: return leaf.graphemes[local_index];
+    default: errx(1, "Invalid text");
+    }
+    return 0;
+}
+
+// Like Text$get_grapheme_fast(), but always yields a codepoint: a negative
+// (synthetic) grapheme id is replaced by the `main_codepoint` recorded for it
+// in the synthetic grapheme table.
+public uint32_t Text$get_main_grapheme_fast(TextIter_t *state, int64_t index)
+{
+    int32_t grapheme = Text$get_grapheme_fast(state, index);
+    if (grapheme < 0)
+        return synthetic_graphemes[-grapheme - 1].main_codepoint;
+    return (ucs4_t)grapheme;
+}
+
+// Three-way comparison of two Texts (negative, zero, or positive).
+//
+// The texts are compared grapheme by grapheme over their common length, where
+// each grapheme is compared by its codepoint sequence. If one text is a prefix
+// of the other, the shorter one sorts first.
+//
+// Only the shared prefix is iterated: Text$get_grapheme_fast() returns 0 past
+// the end of a text, and 0 must never be treated as a synthetic grapheme id
+// (that would index the synthetic grapheme table at -1). For the same reason,
+// codepoint 0 counts as a plain codepoint (`>= 0`), matching the test used in
+// Text$equal_ignoring_case().
+PUREFUNC public int32_t Text$compare(const void *va, const void *vb, const TypeInfo_t*)
+{
+    if (va == vb) return 0;
+    const Text_t a = *(const Text_t*)va;
+    const Text_t b = *(const Text_t*)vb;
+
+    // TODO: make this smarter and more efficient
+    int64_t shared_len = a.length < b.length ? a.length : b.length;
+    TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
+    for (int64_t i = 0; i < shared_len; i++) {
+        int32_t ai = Text$get_grapheme_fast(&a_state, i);
+        int32_t bi = Text$get_grapheme_fast(&b_state, i);
+        if (ai == bi) continue;
+        int32_t cmp;
+        if (ai >= 0 && bi >= 0) {
+            // Two plain codepoints
+            cmp = u32_cmp((ucs4_t*)&ai, (ucs4_t*)&bi, 1);
+        } else if (ai >= 0) {
+            // Plain codepoint vs. synthetic cluster
+            cmp = u32_cmp2(
+                (ucs4_t*)&ai, 1,
+                GRAPHEME_CODEPOINTS(bi),
+                NUM_GRAPHEME_CODEPOINTS(bi));
+        } else if (bi >= 0) {
+            // Synthetic cluster vs. plain codepoint
+            cmp = u32_cmp2(
+                GRAPHEME_CODEPOINTS(ai),
+                NUM_GRAPHEME_CODEPOINTS(ai),
+                (ucs4_t*)&bi, 1);
+        } else {
+            // Two synthetic clusters
+            cmp = u32_cmp2(
+                GRAPHEME_CODEPOINTS(ai),
+                NUM_GRAPHEME_CODEPOINTS(ai),
+                GRAPHEME_CODEPOINTS(bi),
+                NUM_GRAPHEME_CODEPOINTS(bi));
+        }
+        if (cmp != 0) return cmp;
+    }
+    // Identical over the shared prefix: the shorter text sorts first.
+    return (a.length > b.length) - (a.length < b.length);
+}
+
+// Report whether the first `prefix.length` graphemes of `text` are exactly
+// the graphemes of `prefix`. An empty prefix always matches.
+PUREFUNC public bool Text$starts_with(Text_t text, Text_t prefix)
+{
+    if (prefix.length > text.length)
+        return false;
+
+    TextIter_t haystack = NEW_TEXT_ITER_STATE(text);
+    TextIter_t needle = NEW_TEXT_ITER_STATE(prefix);
+    int64_t pos = 0;
+    while (pos < prefix.length) {
+        int32_t from_text = Text$get_grapheme_fast(&haystack, pos);
+        int32_t from_prefix = Text$get_grapheme_fast(&needle, pos);
+        if (from_text != from_prefix)
+            return false;
+        pos += 1;
+    }
+    return true;
+}
+
+// Report whether the last `suffix.length` graphemes of `text` are exactly the
+// graphemes of `suffix`. An empty suffix always matches.
+PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix)
+{
+    if (suffix.length > text.length)
+        return false;
+
+    TextIter_t haystack = NEW_TEXT_ITER_STATE(text);
+    TextIter_t needle = NEW_TEXT_ITER_STATE(suffix);
+    // Index in `text` where the candidate suffix begins:
+    int64_t tail_start = text.length - suffix.length;
+    int64_t pos = 0;
+    while (pos < suffix.length) {
+        int32_t from_text = Text$get_grapheme_fast(&haystack, tail_start + pos);
+        int32_t from_suffix = Text$get_grapheme_fast(&needle, pos);
+        if (from_text != from_suffix)
+            return false;
+        pos += 1;
+    }
+    return true;
+}
+
+// Grapheme-wise equality of two Texts passed by value: they are equal when
+// they have the same length and the same grapheme at every index.
+PUREFUNC public bool Text$equal_values(Text_t a, Text_t b)
+{
+    if (a.length != b.length)
+        return false;
+
+    TextIter_t a_iter = NEW_TEXT_ITER_STATE(a);
+    TextIter_t b_iter = NEW_TEXT_ITER_STATE(b);
+    // TODO: make this smarter and more efficient
+    int64_t remaining = a.length;
+    for (int64_t pos = 0; remaining > 0; pos++, remaining--) {
+        int32_t from_a = Text$get_grapheme_fast(&a_iter, pos);
+        int32_t from_b = Text$get_grapheme_fast(&b_iter, pos);
+        if (from_a != from_b)
+            return false;
+    }
+    return true;
+}
+
+// Pointer-based equality entry point: identical pointers are trivially equal,
+// otherwise both Texts are loaded and compared with Text$equal_values().
+PUREFUNC public bool Text$equal(const void *a, const void *b, const TypeInfo_t*)
+{
+    if (a != b) {
+        Text_t lhs = *(Text_t*)a;
+        Text_t rhs = *(Text_t*)b;
+        return Text$equal_values(lhs, rhs);
+    }
+    return true;
+}
+
+// Case-insensitive equality of two Texts. The texts must have the same number
+// of graphemes; graphemes that are not identical are compared with
+// libunistring's u32_casecmp() using `language` (converted to a C string) as
+// the language code and NFC normalization.
+PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b, Text_t language)
+{
+    if (a.length != b.length)
+        return false;
+    int64_t len = a.length;
+    TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
+    const char *uc_language = Text$as_c_string(language);
+    for (int64_t i = 0; i < len; i++) {
+        int32_t ai = Text$get_grapheme_fast(&a_state, i);
+        int32_t bi = Text$get_grapheme_fast(&b_state, i);
+        if (ai == bi) continue;
+
+        // Non-negative graphemes are single codepoints; negative ones are
+        // synthetic clusters with a stored codepoint sequence.
+        const ucs4_t *a_codepoints = ai >= 0 ? (ucs4_t*)&ai : GRAPHEME_CODEPOINTS(ai);
+        int64_t a_len = ai >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(ai);
+
+        const ucs4_t *b_codepoints = bi >= 0 ? (ucs4_t*)&bi : GRAPHEME_CODEPOINTS(bi);
+        int64_t b_len = bi >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(bi);
+
+        int cmp = 0;
+        // The graphemes are known to differ here, so if the case-folding
+        // comparison itself fails (nonzero return), they cannot be declared
+        // equal: only a successful comparison yielding 0 counts as a match.
+        if (u32_casecmp(a_codepoints, (size_t)a_len, b_codepoints, (size_t)b_len, uc_language, UNINORM_NFC, &cmp) != 0)
+            return false;
+        if (cmp != 0)
+            return false;
+    }
+    return true;
+}
+
+// Return an uppercased copy of `text`, using libunistring's u32_toupper()
+// with `language` as the language code and NFC normalization. Empty text is
+// returned unchanged.
+public Text_t Text$upper(Text_t text, Text_t language)
+{
+    if (text.length == 0)
+        return text;
+
+    Array_t input = Text$utf32_codepoints(text);
+    const char *lang_code = Text$as_c_string(language);
+
+    // u32_toupper() writes into this stack buffer when the result fits and
+    // otherwise returns a malloc()ed buffer that must be freed.
+    ucs4_t stack_buf[128];
+    size_t result_len = sizeof(stack_buf)/sizeof(stack_buf[0]);
+    ucs4_t *result = u32_toupper(input.data, (size_t)input.length, lang_code, UNINORM_NFC, stack_buf, &result_len);
+
+    Text_t uppercased = text_from_u32(result, (int64_t)result_len, false);
+    if (result != stack_buf)
+        free(result);
+    return uppercased;
+}
+
+// Return a lowercased copy of `text`, using libunistring's u32_tolower()
+// with `language` as the language code and NFC normalization. Empty text is
+// returned unchanged.
+public Text_t Text$lower(Text_t text, Text_t language)
+{
+    if (text.length == 0)
+        return text;
+
+    Array_t input = Text$utf32_codepoints(text);
+    const char *lang_code = Text$as_c_string(language);
+
+    // u32_tolower() writes into this stack buffer when the result fits and
+    // otherwise returns a malloc()ed buffer that must be freed.
+    ucs4_t stack_buf[128];
+    size_t result_len = sizeof(stack_buf)/sizeof(stack_buf[0]);
+    ucs4_t *result = u32_tolower(input.data, (size_t)input.length, lang_code, UNINORM_NFC, stack_buf, &result_len);
+
+    Text_t lowercased = text_from_u32(result, (int64_t)result_len, false);
+    if (result != stack_buf)
+        free(result);
+    return lowercased;
+}
+
+// Return a title-cased copy of `text`, using libunistring's u32_totitle()
+// with `language` as the language code and NFC normalization. Empty text is
+// returned unchanged.
+public Text_t Text$title(Text_t text, Text_t language)
+{
+    if (text.length == 0)
+        return text;
+
+    Array_t input = Text$utf32_codepoints(text);
+    const char *lang_code = Text$as_c_string(language);
+
+    // u32_totitle() writes into this stack buffer when the result fits and
+    // otherwise returns a malloc()ed buffer that must be freed.
+    ucs4_t stack_buf[128];
+    size_t result_len = sizeof(stack_buf)/sizeof(stack_buf[0]);
+    ucs4_t *result = u32_totitle(input.data, (size_t)input.length, lang_code, UNINORM_NFC, stack_buf, &result_len);
+
+    Text_t titlecased = text_from_u32(result, (int64_t)result_len, false);
+    if (result != stack_buf)
+        free(result);
+    return titlecased;
+}
+
+// Argument-info callback for the custom Text printf conversion (presumably
+// registered with register_printf_specifier() elsewhere -- not shown here).
+// Declares that the conversion consumes a single pointer argument and returns
+// the number of arguments (1), or -1 if the output arrays have no room.
+public int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n])
+{
+    (void)info;
+    if (n == 0)
+        return -1;
+
+    sizes[0] = sizeof(Text_t);
+    argtypes[0] = PA_POINTER;
+    return 1;
+}
+
+// Output callback for the custom Text printf conversion. The argument is a
+// pointer to a Text_t. With the alternate-form flag set (`info->alt`), the
+// text's internal structure is written via text_visualize(); otherwise the
+// text itself is written via Text$print(). Returns whatever the chosen writer
+// returns.
+public int printf_text(FILE *stream, const struct printf_info *info, const void *const args[])
+{
+    Text_t *text = *(Text_t**)args[0];
+    if (!info->alt)
+        return Text$print(stream, *text);
+    return text_visualize(stream, *text, 0);
+}
+
+// Render `text` as a quoted literal delimited by `quote_char`.
+//   - Control characters are written as escapes (`\n`, `\t`, `\x1F`, ...).
+//     The named escapes are introduced by a `$` unless the previous thing
+//     emitted was also an escape (tracked by `just_escaped`).
+//   - `$` is escaped unless the literal uses single quotes.
+//   - A backslash is only escaped when it directly follows an escape.
+//   - Quote characters other than `"`, `'` and backtick get a leading `$`.
+// When `colorize` is set, ANSI color codes wrap the literal and its escapes.
+static INLINE Text_t _quoted(Text_t text, bool colorize, char quote_char)
+{
+    Text_t ret = colorize ? Text("\x1b[35m") : EMPTY_TEXT;
+    if (quote_char != '"' && quote_char != '\'' && quote_char != '`')
+        ret = concat2_assuming_safe(ret, Text("$"));
+
+    Text_t quote_text = Text$from_strn(&quote_char, 1);
+    ret = concat2_assuming_safe(ret, quote_text);
+
+#define add_escaped(str) ({ if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m")); \
+                          if (!just_escaped) ret = concat2_assuming_safe(ret, Text("$")); \
+                          ret = concat2_assuming_safe(ret, Text("\\" str)); \
+                          just_escaped = true; \
+                          if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[0;35m")); })
+    TextIter_t state = NEW_TEXT_ITER_STATE(text);
+    bool just_escaped = false;
+    // TODO: optimize for spans of non-escaped text
+    for (int64_t i = 0; i < text.length; i++) {
+        int32_t g = Text$get_grapheme_fast(&state, i);
+        switch (g) {
+        case '\a': add_escaped("a"); break;
+        case '\b': add_escaped("b"); break;
+        case '\x1b': add_escaped("e"); break;
+        case '\f': add_escaped("f"); break;
+        case '\n': add_escaped("n"); break;
+        case '\r': add_escaped("r"); break;
+        case '\t': add_escaped("t"); break;
+        case '\v': add_escaped("v"); break;
+        case '\\': {
+            if (just_escaped) {
+                add_escaped("\\");
+            } else {
+                ret = concat2_assuming_safe(ret, Text("\\"));
+                just_escaped = false;
+            }
+            break;
+        }
+        case '$': {
+            if (quote_char == '\'') {
+                ret = concat2_assuming_safe(ret, Text("$"));
+                just_escaped = false;
+            } else {
+                add_escaped("$");
+            }
+            break;
+        }
+        case '\x00' ... '\x06': case '\x0E' ... '\x1A':
+        case '\x1C' ... '\x1F': case '\x7F' ... '\x7F': {
+            // Remaining control characters are written as a two-digit hex escape.
+            // NOTE(review): unlike add_escaped(), this branch never emits the
+            // leading "$" -- confirm whether that is intended.
+            if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m"));
+            ret = concat2_assuming_safe(ret, Text("\\x"));
+            // Two hex digits plus the terminating NUL that snprintf() writes:
+            char hex[3];
+            snprintf(hex, sizeof(hex), "%02X", (unsigned)g);
+            ret = concat2_assuming_safe(ret, Text$from_strn(hex, 2));
+            if (colorize)
+                ret = concat2_assuming_safe(ret, Text("\x1b[0;35m"));
+            just_escaped = true;
+            break;
+        }
+        default: {
+            if (g == quote_char) {
+                // NOTE(review): the quote character is appended without any
+                // escape and `just_escaped` is left untouched -- confirm
+                // whether this is the intended output.
+                ret = concat2_assuming_safe(ret, quote_text);
+            } else {
+                // Text$slice() takes 1-based inclusive bounds, hence i+1:
+                ret = concat2_assuming_safe(ret, Text$slice(text, I(i+1), I(i+1)));
+                just_escaped = false;
+            }
+            break;
+        }
+        }
+    }
+#undef add_escaped
+
+    ret = concat2_assuming_safe(ret, quote_text);
+    if (colorize)
+        ret = concat2_assuming_safe(ret, Text("\x1b[m"));
+
+    return ret;
+}
+
+// Produce the printable representation of a Text value (or, when `vtext` is
+// NULL, of its type).
+//   vtext:    pointer to a Text_t, or NULL to get the type's name
+//   colorize: wrap the output in ANSI color codes
+//   info:     type info; may be NULL, in which case the value is treated as a
+//             plain text with no language name
+// Values whose language is "Path" are shown in parentheses; patterns are
+// quoted with `/` or `|`; everything else is quoted with whichever of `'`,
+// backtick or `"` requires the least escaping, and custom languages get a
+// `$lang` prefix.
+public Text_t Text$as_text(const void *vtext, bool colorize, const TypeInfo_t *info)
+{
+    // `info` is optional, so read the language name through a NULL check once:
+    const char *lang = info ? info->TextInfo.lang : NULL;
+
+    if (lang && streq(lang, "Path")) {
+        if (!vtext) return Text("Path");
+        Text_t text = *(Text_t*)vtext;
+        return Text$format("(%s%k%s)", colorize ? "\x1b[35m" : "", &text, colorize ? "\x1b[m" : "");
+    }
+
+    if (!vtext) return lang ? Text$from_str(lang) : Text("Text");
+
+    Text_t text = *(Text_t*)vtext;
+    char quote_char;
+    if (info == &Pattern$info) {
+        // Prefer `/`, unless the pattern contains `/` and does not contain `|`:
+        quote_char = Text$has(text, Pattern("/")) && !Text$has(text, Pattern("|")) ? '|' : '/';
+    } else {
+        // Figure out the best quotation mark to use:
+        bool has_dollar = false, has_double_quote = false, has_backtick = false,
+             has_single_quote = false, needs_escapes = false;
+        TextIter_t state = NEW_TEXT_ITER_STATE(text);
+        for (int64_t i = 0; i < text.length; i++) {
+            int32_t g = Text$get_grapheme_fast(&state, i);
+            if (g == '$') {
+                has_dollar = true;
+            } else if (g == '"') {
+                has_double_quote = true;
+            } else if (g == '`') {
+                has_backtick = true;
+            } else if (g == '\'') {
+                has_single_quote = true;
+            } else if (g == (g & 0x7F) && (g == '\n' || g == '\r' || g == '\t' || !isprint(g))) {
+                // Only 7-bit values reach isprint(), so `g` is a valid argument
+                needs_escapes = true;
+            }
+        }
+
+        // If there's dollar signs and/or double quotes in the string, it would
+        // be nice to avoid needing to escape them by using single quotes, but
+        // only if we don't have single quotes or need to escape anything else
+        // (because single quotes don't have interpolation):
+        if ((has_dollar || has_double_quote) && !has_single_quote && !needs_escapes)
+            quote_char = '\'';
+        // If there is a double quote, but no backtick, we can save a bit of
+        // escaping by using backtick instead of double quote:
+        else if (has_double_quote && !has_backtick)
+            quote_char = '`';
+        // Otherwise fall back to double quotes as the default quoting style:
+        else
+            quote_char = '"';
+    }
+
+    Text_t as_text = _quoted(text, colorize, quote_char);
+    // Custom text languages are prefixed with `$lang`:
+    if (lang && info != &Text$info && info != &Pattern$info)
+        as_text = Text$concat(
+            colorize ? Text("\x1b[1m$") : Text("$"),
+            Text$from_str(lang),
+            colorize ? Text("\x1b[0m") : Text(""),
+            as_text);
+    return as_text;
+}
+
+// Quote `text` as a double-quoted literal (see _quoted() for the escaping
+// rules), optionally with ANSI colors.
+public Text_t Text$quoted(Text_t text, bool colorize)
+{
+    const char double_quote = '"';
+    Text_t quoted = _quoted(text, colorize, double_quote);
+    return quoted;
+}
+
+// Concatenate the Texts in `pieces`, inserting `glue` between each adjacent
+// pair. An empty array yields the empty text.
+public Text_t Text$join(Text_t glue, Array_t pieces)
+{
+    if (pieces.length == 0)
+        return EMPTY_TEXT;
+
+    // Start from the first piece, then append (glue, piece) for the rest:
+    Text_t joined = *(Text_t*)pieces.data;
+    int64_t idx = 1;
+    while (idx < pieces.length) {
+        Text_t *piece = (Text_t*)(pieces.data + idx*pieces.stride);
+        joined = Text$concat(joined, glue, *piece);
+        idx += 1;
+    }
+    return joined;
+}
+
+// printf-style constructor for Text: formats the arguments according to `fmt`
+// into a GC-allocated buffer and converts the result with Text$from_str().
+// Returns the empty text if formatting fails.
+__attribute__((format(printf, 1, 2)))
+public Text_t Text$format(const char *fmt, ...)
+{
+    va_list args;
+    va_start(args, fmt);
+
+    // The arguments are walked twice (once to measure, once to format). A
+    // va_list is indeterminate after being passed to vsnprintf(), so the
+    // measuring pass must use its own copy.
+    va_list measure_args;
+    va_copy(measure_args, args);
+    int len = vsnprintf(NULL, 0, fmt, measure_args);
+    va_end(measure_args);
+
+    if (len < 0) { // formatting error
+        va_end(args);
+        return EMPTY_TEXT;
+    }
+
+    char *str = GC_MALLOC_ATOMIC((size_t)len + 1);
+    vsnprintf(str, (size_t)len + 1, fmt, args);
+    va_end(args);
+    return Text$from_str(str);
+}
+
+// Split `text` into an array of one-grapheme Texts, in order.
+public Array_t Text$clusters(Text_t text)
+{
+    Array_t clusters = {};
+    // Text$slice() takes 1-based inclusive bounds, so walk positions 1..length:
+    int64_t pos = 1;
+    while (pos <= text.length) {
+        Text_t single = Text$slice(text, I(pos), I(pos));
+        Array$insert(&clusters, &single, I_small(0), sizeof(Text_t));
+        pos += 1;
+    }
+    return clusters;
+}
+
+// Expand `text` into an array of UTF-32 codepoints. A non-negative grapheme
+// contributes itself; a negative (synthetic) grapheme contributes every
+// codepoint of its stored cluster.
+public Array_t Text$utf32_codepoints(Text_t text)
+{
+    Array_t codepoints = {.atomic=1};
+    TextIter_t iter = NEW_TEXT_ITER_STATE(text);
+    for (int64_t pos = 0; pos < text.length; pos++) {
+        int32_t grapheme = Text$get_grapheme_fast(&iter, pos);
+        if (grapheme >= 0) {
+            Array$insert(&codepoints, &grapheme, I_small(0), sizeof(ucs4_t));
+            continue;
+        }
+
+        const ucs4_t *cluster = GRAPHEME_CODEPOINTS(grapheme);
+        int64_t cluster_len = NUM_GRAPHEME_CODEPOINTS(grapheme);
+        for (int64_t c = 0; c < cluster_len; c++) {
+            ucs4_t codepoint = cluster[c];
+            Array$insert(&codepoints, &codepoint, I_small(0), sizeof(ucs4_t));
+        }
+    }
+    return codepoints;
+}
+
+// Return the UTF-8 encoding of `text` as a byte array. The array points
+// directly at the buffer produced by Text$as_c_string() and its length is
+// that string's strlen().
+public Array_t Text$utf8_bytes(Text_t text)
+{
+    const char *utf8 = Text$as_c_string(text);
+    Array_t bytes = {.length=strlen(utf8), .stride=1, .atomic=1, .data=(void*)utf8};
+    return bytes;
+}
+
+// Return a human-readable name for codepoint `c` in a GC-allocated buffer.
+// Uses the Unicode character name when libunistring knows one; otherwise
+// falls back to "<block name>-<hex>", or to "U+<hex>" for codepoints that do
+// not belong to any Unicode block.
+static INLINE const char *codepoint_name(ucs4_t c)
+{
+    char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
+    char *found_name = unicode_character_name(c, name);
+    if (found_name) return found_name;
+
+    // uc_block() returns NULL for codepoints outside every block, so the
+    // block name cannot be assumed to exist:
+    const uc_block_t *block = uc_block(c);
+    if (block)
+        snprintf(name, UNINAME_MAX, "%s-%X", block->name, (unsigned)c);
+    else
+        snprintf(name, UNINAME_MAX, "U+%04X", (unsigned)c);
+    return name;
+}
+
+// Return an array of Texts holding the name (see codepoint_name()) of every
+// codepoint in `text`. Synthetic graphemes contribute one name per codepoint
+// of their cluster.
+public Array_t Text$codepoint_names(Text_t text)
+{
+    Array_t names = {};
+    TextIter_t iter = NEW_TEXT_ITER_STATE(text);
+    for (int64_t pos = 0; pos < text.length; pos++) {
+        int32_t grapheme = Text$get_grapheme_fast(&iter, pos);
+
+        // View the grapheme as a run of codepoints: either the stored cluster
+        // of a synthetic (negative) grapheme, or the single codepoint itself.
+        ucs4_t single = (ucs4_t)grapheme;
+        const ucs4_t *run = grapheme < 0 ? GRAPHEME_CODEPOINTS(grapheme) : &single;
+        int64_t run_len = grapheme < 0 ? NUM_GRAPHEME_CODEPOINTS(grapheme) : 1;
+
+        for (int64_t c = 0; c < run_len; c++) {
+            Text_t name_text = Text$from_str(codepoint_name(run[c]));
+            Array$insert(&names, &name_text, I_small(0), sizeof(Text_t));
+        }
+    }
+    return names;
+}
+
+// Build a Text from an array of 32-bit codepoints. The array is compacted
+// first if its stride is not exactly one int32_t, so that text_from_u32()
+// receives contiguous data.
+public Text_t Text$from_codepoints(Array_t codepoints)
+{
+    bool is_contiguous = (codepoints.stride == sizeof(int32_t));
+    if (!is_contiguous)
+        Array$compact(&codepoints, sizeof(int32_t));
+
+    Text_t text = text_from_u32(codepoints.data, codepoints.length, true);
+    return text;
+}
+
+// Build a Text from an array of Unicode character names. Each name is looked
+// up with unicode_name_character(); if any lookup fails, NONE_TEXT is
+// returned.
+public OptionalText_t Text$from_codepoint_names(Array_t codepoint_names)
+{
+    Array_t codepoints = {};
+    for (int64_t n = 0; n < codepoint_names.length; n++) {
+        const Text_t *name = codepoint_names.data + n*codepoint_names.stride;
+        ucs4_t codepoint = unicode_name_character(Text$as_c_string(*name));
+        if (codepoint == UNINAME_INVALID)
+            return NONE_TEXT;
+        Array$insert(&codepoints, &codepoint, I_small(0), sizeof(ucs4_t));
+    }
+    return Text$from_codepoints(codepoints);
+}
+
+// Build a Text from an array of bytes by handing them to Text$from_strn().
+// The array is compacted first if its stride is not exactly one byte.
+public OptionalText_t Text$from_bytes(Array_t bytes)
+{
+    bool is_contiguous = (bytes.stride == sizeof(int8_t));
+    if (!is_contiguous)
+        Array$compact(&bytes, sizeof(int8_t));
+
+    size_t num_bytes = (size_t)bytes.length;
+    return Text$from_strn(bytes.data, num_bytes);
+}
+
+// Split `text` into an array of lines. Both "\n" and "\r\n" end a line and
+// are not included in it. A final line without a trailing newline is
+// included as well, even when it is only one grapheme long.
+public Array_t Text$lines(Text_t text)
+{
+    Array_t lines = {};
+    TextIter_t state = NEW_TEXT_ITER_STATE(text);
+    // `line_start` is the 0-based index of the first grapheme of the current
+    // line; Text$slice() takes 1-based inclusive bounds.
+    for (int64_t i = 0, line_start = 0; i < text.length; i++) {
+        int32_t grapheme = Text$get_grapheme_fast(&state, i);
+        if (grapheme == '\r' && Text$get_grapheme_fast(&state, i + 1) == '\n') { // CRLF
+            Text_t line = Text$slice(text, I(line_start+1), I(i));
+            Array$insert(&lines, &line, I_small(0), sizeof(Text_t));
+            i += 1; // skip the LF that completes the CRLF pair
+            line_start = i + 1;
+        } else if (grapheme == '\n') { // newline
+            Text_t line = Text$slice(text, I(line_start+1), I(i));
+            Array$insert(&lines, &line, I_small(0), sizeof(Text_t));
+            line_start = i + 1;
+        } else if (i == text.length-1) { // last line (no trailing newline)
+            // The current grapheme is not a line terminator, so the line
+            // [line_start, i] holds at least this grapheme and must be kept
+            // even when line_start == i.
+            Text_t line = Text$slice(text, I(line_start+1), I(i+1));
+            Array$insert(&lines, &line, I_small(0), sizeof(Text_t));
+        }
+    }
+    return lines;
+}
+
+typedef struct { // Per-iterator state for next_line(); allocated by Text$by_line()
+    TextIter_t state; // grapheme iterator; next_line() reads the full text back from state.stack[0].text
+    int64_t i; // 0-based grapheme index where the next line starts
+} line_iter_state_t;
+
+// Iterator step for Text$by_line(): return the next line of the text, or
+// NONE_TEXT once the text is exhausted. Both "\n" and "\r\n" end a line and
+// are not included in it. A final line without a trailing newline is
+// returned too, even when it is only one grapheme long.
+static OptionalText_t next_line(line_iter_state_t *state)
+{
+    Text_t text = state->state.stack[0].text;
+    // `state->i` is the 0-based start of the current line; Text$slice() takes
+    // 1-based inclusive bounds.
+    for (int64_t i = state->i; i < text.length; i++) {
+        int32_t grapheme = Text$get_grapheme_fast(&state->state, i);
+        if (grapheme == '\r' && Text$get_grapheme_fast(&state->state, i + 1) == '\n') { // CRLF
+            Text_t line = Text$slice(text, I(state->i+1), I(i));
+            state->i = i + 2; // resume after both the CR and the LF
+            return line;
+        } else if (grapheme == '\n') { // newline
+            Text_t line = Text$slice(text, I(state->i+1), I(i));
+            state->i = i + 1;
+            return line;
+        } else if (i == text.length-1) { // last line (no trailing newline)
+            // The current grapheme is not a line terminator, so the line
+            // [state->i, i] holds at least this grapheme and must be returned
+            // even when state->i == i.
+            Text_t line = Text$slice(text, I(state->i+1), I(i+1));
+            state->i = i + 1;
+            return line;
+        }
+    }
+    return NONE_TEXT;
+}
+
+// Create a line iterator over `text`: a closure whose function is next_line()
+// and whose userdata is a fresh line_iter_state_t positioned at index 0.
+public Closure_t Text$by_line(Text_t text)
+{
+    line_iter_state_t *iter = new(line_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0);
+    Closure_t closure = {
+        .fn=(void*)next_line,
+        .userdata=iter,
+    };
+    return closure;
+}
+
+// A Text is the "none" value when its length field is negative.
+PUREFUNC public bool Text$is_none(const void *t, const TypeInfo_t*)
+{
+    Text_t *text = (Text_t*)t;
+    bool is_none = text->length < 0;
+    return is_none;
+}
+
+// Write a Text to `out` as its UTF-8 byte count (serialized as an Int64)
+// followed by the UTF-8 bytes themselves, without a terminator.
+public void Text$serialize(const void *obj, FILE *out, Table_t *pointers, const TypeInfo_t *)
+{
+    Text_t text = *(Text_t*)obj;
+    const char *utf8 = Text$as_c_string(text);
+    int64_t num_bytes = (int64_t)strlen(utf8);
+
+    Int64$serialize(&num_bytes, out, pointers, &Int64$info);
+    fwrite(utf8, sizeof(char), (size_t)num_bytes, out);
+}
+
+// Read a Text from `in` in the format written by Text$serialize(): an Int64
+// byte count followed by that many UTF-8 bytes. The result is stored in
+// `*(Text_t*)out`. Exits with an error if the length is negative or the
+// stream ends before all the bytes have been read.
+public void Text$deserialize(FILE *in, void *out, Array_t *pointers, const TypeInfo_t *)
+{
+    int64_t len = -1;
+    Int64$deserialize(in, &len, pointers, &Int64$info);
+    // A negative length would turn into a bogus allocation/read size below:
+    if (len < 0)
+        errx(1, "Invalid length for serialized text");
+
+    char *buf = GC_MALLOC_ATOMIC((size_t)len+1);
+    // A short read would leave part of the buffer uninitialized:
+    if (fread(buf, sizeof(char), (size_t)len, in) != (size_t)len)
+        errx(1, "Unexpected end of input while reading serialized text");
+
+    // The buffer holds indices 0..len, so the terminator belongs at `len`:
+    buf[len] = '\0';
+    *(Text_t*)out = Text$from_strn(buf, (size_t)len);
+}
+
+public const TypeInfo_t Text$info = { // Type descriptor for the plain Text type
+    .size=sizeof(Text_t),
+    .align=__alignof__(Text_t),
+    .tag=TextInfo,
+    .TextInfo={.lang="Text"}, // language name; Text$as_text() adds no `$lang` prefix when info == &Text$info
+    .metamethods=Text$metamethods, // Text$metamethods is not defined in this part of the file
+};
+
+// Convert literal text into a pattern that matches it, by wrapping graphemes
+// that would otherwise be treated specially:
+//   - `{` becomes `{1{}`
+//   - `?`, quotation marks, and the left half of paired punctuation become
+//     `{1X}` (where X is the grapheme)
+// Everything else is copied through unchanged. For synthetic graphemes the
+// decision is based on the first codepoint of the cluster.
+public Pattern_t Pattern$escape_text(Text_t text)
+{
+    // TODO: optimize for spans of non-escaped text
+    Text_t escaped = EMPTY_TEXT;
+    TextIter_t iter = NEW_TEXT_ITER_STATE(text);
+    for (int64_t pos = 0; pos < text.length; pos++) {
+        int32_t g = Text$get_grapheme_fast(&iter, pos);
+        if (g == '{') {
+            escaped = concat2_assuming_safe(escaped, Text("{1{}"));
+            continue;
+        }
+
+        ucs4_t first_codepoint = g >= 0 ? (ucs4_t)g : GRAPHEME_CODEPOINTS(g)[0];
+        bool opens_pair = uc_is_property_paired_punctuation(first_codepoint)
+            && uc_is_property_left_of_pair(first_codepoint);
+        bool needs_wrapping = first_codepoint == '?'
+            || uc_is_property_quotation_mark(first_codepoint)
+            || opens_pair;
+
+        // Text$slice() takes 1-based inclusive bounds, hence pos+1:
+        if (needs_wrapping)
+            escaped = Text$concat(escaped, Text("{1"), Text$slice(text, I(pos+1), I(pos+1)), Text("}"));
+        else
+            escaped = concat2_assuming_safe(escaped, Text$slice(text, I(pos+1), I(pos+1)));
+    }
+    return escaped;
+}
+
+// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
-- 
cgit v1.2.3