tomo/builtins/text.c

// Type info and methods for Text datatype, which uses libunistr for Unicode
// support and implements a datastructure based on Raku/MoarVM's strings to
// efficiently store arbitrary unicode data using a mix of densely packed plain
// ASCII, 32-bit integers representing grapheme clusters (see below), and ropes
// that represent text that is a composite of multiple subtexts. Subtexts are
// only nested one level deep, not arbitrarily deep trees.
//
// A note on grapheme clusters: In Unicode, codepoints can be represented using
// a 32-bit integer. Most codepoints correspond to the intuitive notion of a
// "letter", which is more formally known as a "grapheme cluster". A grapheme
// cluster is roughly speaking the amount of text that your cursor moves over
// when you press the arrow key once. However, some codepoints act as modifiers
// on other codepoints. For example, U+0301 (COMBINING ACUTE ACCENT) can modify
// a letter like "e" to form "é". During normalization, this frequently
// resolves down to a single unicode codepoint, in this case, "é" resolves to
// the single codepoint U+00E9 (LATIN SMALL LETTER E WITH ACUTE). However, in
// some cases, multiple codepoints make up a grapheme cluster but *don't*
// normalize to a single codepoint. For example, LATIN SMALL LETTER E (U+0065)
// + COMBINING VERTICAL LINE BELOW (U+0329) combine to form an unusual glyph
// that is not used frequently enough to warrant its own unique codepoint (this
// is basically what Zalgo text is).
//
// There are a lot of benefits to storing text with one grapheme cluster per
// index in a densely packed array. It lets us have one canonical length for
// the text that can be precomputed and is meaningful to users. It lets us
// quickly get the Nth "letter" in the text. Substring slicing is fast.
// However, since not all grapheme clusters take up the same number of
// codepoints, we're faced with the problem of how to jam multiple codepoints
// into a single 32-bit slot. Inspired by Raku and MoarVM's approach, this
// implementation uses "synthetic graphemes" (in Raku's terms, Normal Form
// Graphemes, aka NFG). A synthetic grapheme is a negative 32-bit signed
// integer that represents a multi-codepoint grapheme cluster that has been
// encountered during the program's runtime. These clusters are stored in a
// lookup array and hash map so that we can rapidly convert between the
// synthetic grapheme integer ID and the unicode codepoints associated with it.
// Essentially, it's like we create a supplement to the unicode standard with
// things that would be nice if they had their own codepoint so things worked
// out nicely because we're using them right now, and we'll give them a
// negative number so it doesn't overlap with any real codepoints.
//
// Example 1: U+0048, U+00E9
//    AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E WITH ACUTE
//    This would be stored as: (int32_t[]){0x48, 0xE9}
// Example 2: U+0048, U+0065, U+0309
//    AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E, COMBINING VERTICAL LINE BELOW
//    This would be stored as: (int32_t[]){0x48, -2}
//    Where -2 is used as a lookup in an array that holds the actual unicode codepoints:
//       (ucs4_t[]){0x65, 0x0309}

#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <gc.h>
#include <limits.h>
#include <printf.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/param.h>

#include <unicase.h>
#include <unictype.h>
#include <unigbrk.h>
#include <uniname.h>
#include <uninorm.h>
#include <unistd.h>
#include <unistdio.h>
#include <unistr.h>

#include "array.h"
#include "functions.h"
#include "integers.h"
#include "table.h"
#include "text.h"
#include "types.h"

// Use inline version of the siphash code for performance:
#include "siphash.h"
#include "siphash-internals.h"

typedef struct {
    ucs4_t main_codepoint;
    ucs4_t *utf32_cluster; // length-prefixed
    const uint8_t *utf8;
} synthetic_grapheme_t;

typedef struct {
    int64_t subtext, sum_of_previous_subtexts;
} text_iter_t;

#define MAX_BACKREFS 100

// Synthetic grapheme clusters (clusters of more than one codepoint):
static Table_t grapheme_ids_by_codepoints = {}; // ucs4_t* length-prefixed codepoints -> int32_t ID

// This will hold a dynamically growing array of synthetic graphemes:
static synthetic_grapheme_t *synthetic_graphemes = NULL;
static int32_t synthetic_grapheme_capacity = 0;
static int32_t num_synthetic_graphemes = 0;

#define MAIN_GRAPHEME_CODEPOINT(_g) ({ int32_t g = _g; (g) >= 0 ? (ucs4_t)(g) : synthetic_graphemes[-(g)-1].main_codepoint; })
#define NUM_GRAPHEME_CODEPOINTS(id) (synthetic_graphemes[-(id)-1].utf32_cluster[0])
#define GRAPHEME_CODEPOINTS(id) (&synthetic_graphemes[-(id)-1].utf32_cluster[1])
#define GRAPHEME_UTF8(id) (synthetic_graphemes[-(id)-1].utf8)

static int32_t get_grapheme(Text_t text, int64_t index);
static int32_t _get_grapheme(Text_t text, text_iter_t *state, int64_t index);
#define _get_main_grapheme(...) MAIN_GRAPHEME_CODEPOINT(_get_grapheme(__VA_ARGS__))
static Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize);

PUREFUNC static bool graphemes_equal(ucs4_t **a, ucs4_t **b) {
    if ((*a)[0] != (*b)[0]) return false;
    for (int i = 0; i < (int)(*a)[0]; i++)
        if ((*a)[i] != (*b)[i]) return false;
    return true;
}

PUREFUNC static uint64_t grapheme_hash(ucs4_t **g) {
    ucs4_t *cluster = *g;
    return siphash24((void*)&cluster[1], sizeof(ucs4_t[cluster[0]]));
}

static const TypeInfo GraphemeClusterInfo = {
    .size=sizeof(ucs4_t*),
    .align=__alignof__(ucs4_t*),
    .tag=CustomInfo,
    .CustomInfo={.equal=(void*)graphemes_equal, .hash=(void*)grapheme_hash},
};

static const TypeInfo GraphemeIDLookupTableInfo = {
    .size=sizeof(Table_t), .align=__alignof__(Table_t),
    .tag=TableInfo, .TableInfo={.key=&GraphemeClusterInfo, .value=&Int32$info},
};

#pragma GCC diagnostic ignored "-Wstack-protector"
int32_t get_synthetic_grapheme(const ucs4_t *codepoints, int64_t utf32_len)
{
    ucs4_t length_prefixed[1+utf32_len] = {};
    length_prefixed[0] = (ucs4_t)utf32_len;
    for (int i = 0; i < utf32_len; i++)
        length_prefixed[i+1] = codepoints[i];
    ucs4_t *ptr = &length_prefixed[0];

    // Optimization for common case of one frequently used synthetic grapheme:
    static int32_t last_grapheme = 0;
    if (last_grapheme != 0 && graphemes_equal(&ptr, &synthetic_graphemes[-last_grapheme-1].utf32_cluster))
        return last_grapheme;

    int32_t *found = Table$get(grapheme_ids_by_codepoints, &ptr, &GraphemeIDLookupTableInfo);
    if (found) return *found;

    // New synthetic grapheme:
    if (num_synthetic_graphemes >= synthetic_grapheme_capacity) {
        // If we don't have space, allocate more:
        synthetic_grapheme_capacity = MAX(128, synthetic_grapheme_capacity * 2);
        synthetic_grapheme_t *new = GC_MALLOC(sizeof(synthetic_grapheme_t[synthetic_grapheme_capacity]));
        memcpy(new, synthetic_graphemes, sizeof(synthetic_grapheme_t[num_synthetic_graphemes]));
        synthetic_graphemes = new;
    }

    int32_t grapheme_id = -(num_synthetic_graphemes+1);
    num_synthetic_graphemes += 1;

    // Get UTF8 representation:
    uint8_t u8_buf[64];
    size_t u8_len = sizeof(u8_buf)/sizeof(u8_buf[0]);
    uint8_t *u8 = u32_to_u8(codepoints, (size_t)utf32_len, u8_buf, &u8_len);

    // For performance reasons, use an arena allocator here to ensure that
    // synthetic graphemes store all of their information in a densely packed
    // area with good cache locality:
    static void *arena = NULL, *arena_end = NULL;
    // Eat up any space needed to make arena 32-bit aligned:
    if ((size_t)arena % __alignof__(ucs4_t) != 0)
        arena += __alignof__(ucs4_t) - ((size_t)arena % __alignof__(ucs4_t));

    // If we have filled up this arena, allocate a new one:
    size_t needed_memory = sizeof(ucs4_t[1+utf32_len]) + sizeof(uint8_t[u8_len + 1]);
    if (arena + needed_memory > arena_end) {
        // Do reasonably big chunks at a time, so most synthetic codepoints are
        // nearby each other in memory and cache locality is good. This is a
        // rough guess at a good size:
        size_t chunk_size = MAX(needed_memory, 512);
        arena = GC_MALLOC_ATOMIC(chunk_size);
        arena_end = arena + chunk_size;
    }

    // Copy length-prefixed UTF32 codepoints into the arena and store where they live:
    ucs4_t *codepoint_copy = arena;
    mempcpy(codepoint_copy, length_prefixed, sizeof(ucs4_t[1+utf32_len]));
    synthetic_graphemes[-grapheme_id-1].utf32_cluster = codepoint_copy;
    arena += sizeof(ucs4_t[1+utf32_len]);

    // Copy UTF8 bytes into the arena and store where they live:
    uint8_t *utf8_final = arena;
    memcpy(utf8_final, u8, sizeof(uint8_t[u8_len]));
    utf8_final[u8_len] = '\0'; // Add a terminating NUL byte
    synthetic_graphemes[-grapheme_id-1].utf8 = utf8_final;
    arena += sizeof(uint8_t[u8_len + 1]);

    // Sickos at the unicode consortium decreed that you can have grapheme clusters
    // that begin with *prefix* modifiers, so we gotta check for that case:
    synthetic_graphemes[-grapheme_id-1].main_codepoint = length_prefixed[1];
    for (ucs4_t i = 0; i < utf32_len; i++) {
        if (!__builtin_expect(uc_is_property_prepended_concatenation_mark(length_prefixed[1+i]), 0)) {
            synthetic_graphemes[-grapheme_id-1].main_codepoint = length_prefixed[1+i];
            break;
        }
    }

    // Cleanup from unicode API:
    if (u8 != u8_buf) free(u8);

    Table$set(&grapheme_ids_by_codepoints, &codepoint_copy, &grapheme_id, &GraphemeIDLookupTableInfo);

    last_grapheme = grapheme_id;
    return grapheme_id;
}

PUREFUNC static inline int64_t num_subtexts(Text_t t)
{
    if (t.tag != TEXT_SUBTEXT) return 1;
    int64_t len = t.length;
    int64_t n = 0;
    while (len > 0) {
        len -= t.subtexts[n].length;
        ++n;
    }
    return n;
}

int text_visualize(FILE *stream, Text_t t)
{
    switch (t.tag) {
    case TEXT_SHORT_ASCII: return fprintf(stream, "<ascii length=%ld>%.*s</ascii>", t.length, t.length, t.short_ascii);
    case TEXT_ASCII: return fprintf(stream, "<ascii length=%ld>%.*s</ascii>", t.length, t.length, t.ascii);
    case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: {
        int printed = fprintf(stream, "<graphemes length=%ld>", t.length);
        printed += Text$print(stream, t);
        printed += fprintf(stream, "</graphemes>");
        return printed;
    }
    case TEXT_SUBTEXT: {
        int printed = fprintf(stream, "<text length=%ld>", t.length);
        int64_t to_print = t.length;
        for (int i = 0; to_print > 0; ++i) {
            printed += fprintf(stream, "\n  ");
            printed += text_visualize(stream, t.subtexts[i]);
            to_print -= t.subtexts[i].length;
            if (t.subtexts[i].length == 0) break;
        }
        printed += fprintf(stream, "\n</text>");
        return printed;
    }
    default: return 0;
    }
}

public int Text$print(FILE *stream, Text_t t)
{
    if (t.length == 0) return 0;

    switch (t.tag) {
    case TEXT_SHORT_ASCII: return fwrite(t.short_ascii, sizeof(char), (size_t)t.length, stream);
    case TEXT_ASCII: return fwrite(t.ascii, sizeof(char), (size_t)t.length, stream);
    case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: {
        const int32_t *graphemes = t.tag == TEXT_SHORT_GRAPHEMES ? t.short_graphemes : t.graphemes;
        int written = 0;
        for (int64_t i = 0; i < t.length; i++) {
            int32_t grapheme = graphemes[i];
            if (grapheme >= 0) {
                uint8_t buf[8];
                size_t len = sizeof(buf);
                uint8_t *u8 = u32_to_u8((ucs4_t*)&grapheme, 1, buf, &len);
                written += (int)fwrite(u8, sizeof(char), len, stream);
                if (u8 != buf) free(u8);
            } else {
                const uint8_t *u8 = GRAPHEME_UTF8(grapheme);
                assert(u8);
                written += (int)fwrite(u8, sizeof(uint8_t), strlen((char*)u8), stream);
            }
        }
        return written;
    }
    case TEXT_SUBTEXT: {
        int written = 0;
        int i = 0;
        for (int64_t to_print = t.length; to_print > 0; to_print -= t.subtexts[i].length, ++i)
            written += Text$print(stream, t.subtexts[i]);
        return written;
    }
    default: return 0;
    }
}

static bool is_concat_stable(Text_t a, Text_t b)
{
    if (a.length == 0 || b.length == 0)
        return true;

    int32_t last_a = get_grapheme(a, a.length-1);
    int32_t first_b = get_grapheme(b, 0);

    // Synthetic graphemes are weird and probably need to check with normalization:
    if (last_a < 0 || first_b < 0)
        return 0;

    // Magic number, we know that no codepoints below here trigger instability:
    static const int32_t LOWEST_CODEPOINT_TO_CHECK = 0x300;
    if (last_a < LOWEST_CODEPOINT_TO_CHECK && first_b < LOWEST_CODEPOINT_TO_CHECK)
        return true;

    // Do a normalization run for these two codepoints and see if it looks different:
    ucs4_t codepoints[2] = {(ucs4_t)last_a, (ucs4_t)first_b};
    ucs4_t norm_buf[3*2]; // Normalization should not exceed 3x in the input length
    size_t norm_length = sizeof(norm_buf)/sizeof(norm_buf[0]);
    ucs4_t *normalized = u32_normalize(UNINORM_NFC, codepoints, 2, norm_buf, &norm_length);
    if (norm_length != 2) {
        // Looks like these two codepoints merged into one (or maybe had a child, who knows?)
        if (normalized != norm_buf) free(normalized);
        return false;
    }

    // If there's still two codepoints, we might end up with a single grapheme
    // cluster which will need to turn into a synthetic grapheme:
    const void *second_grapheme = u32_grapheme_next(normalized, &normalized[2]);
    if (normalized != norm_buf) free(normalized);
    return (second_grapheme == &normalized[1]);
}

static Text_t concat2_assuming_safe(Text_t a, Text_t b)
{
    if (a.length == 0) return b;
    if (b.length == 0) return a;

    if (a.tag == TEXT_SUBTEXT && b.tag == TEXT_SUBTEXT) {
        int64_t na = num_subtexts(a);
        int64_t nb = num_subtexts(b);
        Text_t ret = {
            .length=a.length + b.length,
            .tag=TEXT_SUBTEXT,
            .subtexts=GC_MALLOC(sizeof(Text_t[na + nb])),
        };
        memcpy(&ret.subtexts[0], a.subtexts, sizeof(Text_t[na]));
        memcpy(&ret.subtexts[na], b.subtexts, sizeof(Text_t[nb]));
        return ret;
    } else if (a.tag == TEXT_SUBTEXT) {
        int64_t n = num_subtexts(a);
        Text_t ret = {
            .length=a.length + b.length,
            .tag=TEXT_SUBTEXT,
            .subtexts=GC_MALLOC(sizeof(Text_t[n + 1])),
        };
        memcpy(ret.subtexts, a.subtexts, sizeof(Text_t[n]));
        ret.subtexts[n] = b;
        return ret;
    } else if (b.tag == TEXT_SUBTEXT) {
        int64_t n = num_subtexts(b);
        Text_t ret = {
            .length=a.length + b.length,
            .tag=TEXT_SUBTEXT,
            .subtexts=GC_MALLOC(sizeof(Text_t[n + 1])),
        };
        ret.subtexts[0] = a;
        memcpy(&ret.subtexts[1], b.subtexts, sizeof(Text_t[n]));
        return ret;
    } else {
        Text_t ret = {
            .length=a.length + b.length,
            .tag=TEXT_SUBTEXT,
            .subtexts=GC_MALLOC(sizeof(Text_t[2])),
        };
        ret.subtexts[0] = a;
        ret.subtexts[1] = b;
        return ret;
    }
}

static Text_t concat2(Text_t a, Text_t b)
{
    if (a.length == 0) return b;
    if (b.length == 0) return a;

    if (__builtin_expect(is_concat_stable(a, b), 1))
        return concat2_assuming_safe(a, b);

    // Do full normalization of the last/first characters
    int32_t last_a = get_grapheme(a, a.length-1);
    int32_t first_b = get_grapheme(b, 0);

    size_t utf32_len = (last_a >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(last_a)) + (first_b >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(first_b));
    ucs4_t join_graphemes[utf32_len] = {};
    ucs4_t *p = &join_graphemes[0];
    if (last_a < 0) p = mempcpy(p, GRAPHEME_CODEPOINTS(last_a), NUM_GRAPHEME_CODEPOINTS(last_a));
    else *(p++) = (ucs4_t)last_a;
    if (first_b < 0) p = mempcpy(p, GRAPHEME_CODEPOINTS(first_b), NUM_GRAPHEME_CODEPOINTS(first_b));
    else *(p++) = (ucs4_t)first_b;

    Text_t glue = text_from_u32(join_graphemes, (int64_t)utf32_len, true);

    if (a.length == 1 && b.length == 1)
        return glue;
    else if (a.length == 1)
        return concat2_assuming_safe(glue, Text$slice(b, I(2), I(b.length)));
    else if (b.length == 1)
        return concat2_assuming_safe(Text$slice(a, I(1), I(a.length-1)), glue);
    else
        return concat2_assuming_safe(
            concat2_assuming_safe(Text$slice(a, I(1), I(a.length-1)), glue),
            b);
}

public Text_t Text$_concat(int n, Text_t items[n])
{
    if (n == 0) return (Text_t){.length=0};
    if (n == 1) return items[0];
    if (n == 2) return concat2(items[0], items[1]);

    int64_t len = 0, subtexts = 0;
    for (int i = 0; i < n; i++) {
        len += items[i].length;
        if (items[i].length > 0)
            subtexts += num_subtexts(items[i]);
    }

    Text_t ret = {
        .length=0,
        .tag=TEXT_SUBTEXT,
        .subtexts=GC_MALLOC(sizeof(Text_t[len])),
    };
    int64_t sub_i = 0;
    for (int i = 0; i < n; i++) {
        if (items[i].length == 0)
            continue;

        if (i > 0 && !__builtin_expect(is_concat_stable(items[i-1], items[i]), 1)) {
            // Oops, guess this wasn't stable for concatenation, let's break it
            // up into subtasks:
            return concat2(ret, Text$_concat(n-i, &items[i]));
        }

        if (items[i].tag == TEXT_SUBTEXT) {
            for (int64_t j = 0, remainder = items[i].length; remainder > 0; j++) {
                ret.subtexts[sub_i++] = items[i].subtexts[j];
                remainder -= items[i].subtexts[j].length;
            }
        } else {
            ret.subtexts[sub_i++] = items[i];
        }
        ret.length += items[i].length;
    }
    return ret;
}

public Text_t Text$repeat(Text_t text, Int_t count)
{
    if (text.length == 0 || Int$is_negative(count))
        return Text("");

    Int_t result_len = Int$times(count, I(text.length));
    if (Int$compare_value(result_len, I(1l<<40)) > 0)
        fail("Text repeating would produce too big of an result!");

    int64_t count64 = Int_to_Int64(count, false);
    if (text.tag == TEXT_SUBTEXT) {
        int64_t subtexts = num_subtexts(text);
        Text_t ret = {
            .length=text.length * count64,
            .tag=TEXT_SUBTEXT,
            .subtexts=GC_MALLOC(sizeof(Text_t[subtexts * count64])),
        };
        for (int64_t c = 0; c < count64; c++) {
            for (int64_t i = 0; i < subtexts; i++) {
                if (text.subtexts[i].length > 0)
                    ret.subtexts[c*subtexts + i] = text.subtexts[i];
            }
        }
        return ret;
    } else {
        Text_t ret = {
            .length=text.length * count64,
            .tag=TEXT_SUBTEXT,
            .subtexts=GC_MALLOC(sizeof(Text_t[count64])),
        };
        for (int64_t i = 0; i < count64; i++)
            ret.subtexts[i] = text;
        return ret;
    }
}

public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int)
{
    int64_t first = Int_to_Int64(first_int, false);
    int64_t last = Int_to_Int64(last_int, false);
    if (first == 0) fail("Invalid index: 0");
    if (last == 0) return (Text_t){.length=0};

    if (first < 0) first = text.length + first + 1;
    if (last < 0) last = text.length + last + 1;

    if (last > text.length) last = text.length;

    if (first > text.length || last < first)
        return (Text_t){.length=0};

    if (first == 1 && last == text.length)
        return text;

    switch (text.tag) {
    case TEXT_SHORT_ASCII: {
        Text_t ret = (Text_t) {
            .tag=TEXT_SHORT_ASCII,
            .length=last - first + 1,
        };
        memcpy(ret.short_ascii, text.short_ascii + (first-1), (size_t)ret.length);
        return ret;
    }
    case TEXT_ASCII: {
        Text_t ret = {
            .tag=TEXT_ASCII,
            .length=last - first + 1,
            .ascii=text.ascii + (first-1),
        };
        return ret;
    }
    case TEXT_SHORT_GRAPHEMES: {
        assert((first == 1 && last == 1) || (first == 2 && last == 2));
        Text_t ret = {
            .tag=TEXT_SHORT_GRAPHEMES,
            .length=1,
            .short_graphemes={text.short_graphemes[first-1]},
        };
        return ret;
    }
    case TEXT_GRAPHEMES: {
        Text_t ret = {
            .tag=TEXT_GRAPHEMES,
            .length=last - first + 1,
            .graphemes=text.graphemes + (first-1),
        };
        return ret;
    }
    case TEXT_SUBTEXT: {
        Text_t *subtexts = text.subtexts;
        while (first > subtexts[0].length) {
            first -= subtexts[0].length;
            last -= subtexts[0].length;
            ++subtexts;
        }

        int64_t needed_len = (last - first) + 1;
        int64_t num_subtexts = 0;
        for (int64_t included = 0; included < needed_len; ) {
            if (included == 0)
                included += subtexts[num_subtexts].length - first + 1;
            else
                included += subtexts[num_subtexts].length;
            num_subtexts += 1;
        }
        if (num_subtexts == 1)
            return Text$slice(subtexts[0], I(first), I(last));

        Text_t ret = {
            .length=needed_len,
            .tag=TEXT_SUBTEXT,
            .subtexts=GC_MALLOC(sizeof(Text_t[num_subtexts])),
        };
        for (int64_t i = 0; i < num_subtexts; i++) {
            ret.subtexts[i] = Text$slice(subtexts[i], I(first), I(last));
            first = 1;
            needed_len -= ret.subtexts[i].length;
            last = first + needed_len - 1;
        }
        return ret;
    }
    default: errx(1, "Invalid tag");
    }
}

Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize)
{
    // Normalization is apparently guaranteed to never exceed 3x in the input length
    ucs4_t norm_buf[MIN(256, 3*num_codepoints)];
    if (normalize) {
        size_t norm_length = sizeof(norm_buf)/sizeof(norm_buf[0]);
        ucs4_t *normalized = u32_normalize(UNINORM_NFC, codepoints, (size_t)num_codepoints, norm_buf, &norm_length);
        codepoints = normalized;
        num_codepoints = (int64_t)norm_length;
    }

    // char breaks[num_codepoints];
    // u32_grapheme_breaks(codepoints, num_codepoints, breaks);

    Text_t ret = {
        .length=0,
        .tag=TEXT_SHORT_GRAPHEMES,
    };
    const ucs4_t *src = codepoints;
    int32_t *graphemes = ret.short_graphemes;
    while (src < &codepoints[num_codepoints]) {
        if (ret.tag == TEXT_SHORT_GRAPHEMES && ret.length + 1 > 2) {
            graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[num_codepoints])); // May be a slight overallocation
            graphemes[0] = ret.short_graphemes[0];
            graphemes[1] = ret.short_graphemes[1];
            ret.tag = TEXT_GRAPHEMES;
            ret.graphemes = graphemes;
        }

        // TODO: use grapheme breaks instead of u32_grapheme_next()
        const ucs4_t *next = u32_grapheme_next(src, &codepoints[num_codepoints]);
        if (next == &src[1]) {
            graphemes[ret.length] = (int32_t)*src;
        } else {
            // Synthetic grapheme
            graphemes[ret.length] = get_synthetic_grapheme(src, next-src);
        }
        ++ret.length;
        src = next;
    }
    if (normalize && codepoints != norm_buf) free(codepoints);
    return ret;
}

public Text_t Text$from_strn(const char *str, size_t len)
{
    int64_t ascii_span = 0;
    for (size_t i = 0; i < len && isascii(str[i]); i++)
        ascii_span++;

    if (ascii_span == (int64_t)len) { // All ASCII
        Text_t ret = {.length=ascii_span};
        if (ascii_span <= 8) {
            ret.tag = TEXT_SHORT_ASCII;
            for (int64_t i = 0; i < ascii_span; i++)
                ret.short_ascii[i] = str[i];
        } else {
            ret.tag = TEXT_ASCII;
            ret.ascii = str;
        }
        return ret;
    } else {
        if (u8_check((uint8_t*)str, len) != NULL)
            return Text("");

        ucs4_t buf[128];
        size_t length = sizeof(buf)/sizeof(buf[0]);

        ucs4_t *codepoints = u8_to_u32((uint8_t*)str, (size_t)ascii_span + strlen(str + ascii_span), buf, &length);
        Text_t ret = text_from_u32(codepoints, (int64_t)length, true);
        if (codepoints != buf) free(codepoints);
        return ret;
    }
}

public Text_t Text$from_str(const char *str)
{
    return str ? Text$from_strn(str, strlen(str)) : Text("");
}

static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i)
{
    switch (text.tag) {
    case TEXT_ASCII: case TEXT_SHORT_ASCII: {
        if (*i + text.length > (int64_t)*capacity) {
            *capacity = *i + text.length + 1;
            *buf = GC_REALLOC(*buf, (size_t)*capacity);
        }

        const char *bytes = text.tag == TEXT_ASCII ? text.ascii : text.short_ascii;
        memcpy(*buf + *i, bytes, (size_t)text.length);
        *i += text.length;
        break;
    }
    case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: {
        const int32_t *graphemes = text.tag == TEXT_GRAPHEMES ? text.graphemes : text.short_graphemes;
        for (int64_t g = 0; g < text.length; g++) {
            if (graphemes[g] >= 0) {
                uint8_t u8_buf[64];
                size_t u8_len = sizeof(u8_buf);
                uint8_t *u8 = u32_to_u8((ucs4_t*)&graphemes[g], 1, u8_buf, &u8_len);

                if (*i + (int64_t)u8_len > (int64_t)*capacity) {
                    *capacity = *i + (int64_t)u8_len + 1;
                    *buf = GC_REALLOC(*buf, (size_t)*capacity);
                }

                memcpy(*buf + *i, u8, u8_len);
                *i += (int64_t)u8_len;
                if (u8 != u8_buf) free(u8);
            } else {
                const uint8_t *u8 = GRAPHEME_UTF8(graphemes[g]);
                size_t u8_len = u8_strlen(u8);
                if (*i + (int64_t)u8_len > (int64_t)*capacity) {
                    *capacity = *i + (int64_t)u8_len + 1;
                    *buf = GC_REALLOC(*buf, (size_t)*capacity);
                }

                memcpy(*buf + *i, u8, u8_len);
                *i += (int64_t)u8_len;
            }
        }
        break;
    }
    case TEXT_SUBTEXT: {
        for (int64_t s = 0, remaining = text.length; remaining > 0; s++) {
            u8_buf_append(text.subtexts[s], buf, capacity, i);
            remaining -= text.subtexts[s].length;
        }
        break;
    }
    default: break;
    }
}

public char *Text$as_c_string(Text_t text)
{
    int64_t capacity = text.length + 1;
    char *buf = GC_MALLOC_ATOMIC((size_t)capacity);
    int64_t i = 0;
    u8_buf_append(text, &buf, &capacity, &i);

    if (i + 1 > (int64_t)capacity) {
        capacity = i + 1;
        buf = GC_REALLOC(buf, (size_t)capacity);
    }
    buf[i] = '\0';
    return buf;
}

PUREFUNC public uint64_t Text$hash(Text_t *text)
{
    if (text->hash != 0) return text->hash;
    siphash sh;
    siphashinit(&sh, sizeof(int32_t[text->length]));

    union {
        int32_t chunks[2];
        uint64_t whole;
    } tmp;
    switch (text->tag) {
    case TEXT_ASCII: case TEXT_SHORT_ASCII: {
        const char *bytes = text->tag == TEXT_ASCII ? text->ascii : text->short_ascii;
        for (int64_t i = 0; i + 1 < text->length; i++) {
            tmp.chunks[0] = (int32_t)bytes[i];
            tmp.chunks[1] = (int32_t)bytes[i+1];
            siphashadd64bits(&sh, tmp.whole);
        }
        int32_t last = text->length & 0x1 ? (int32_t)bytes[text->length-1] : 0; // Odd number of graphemes
        text->hash = siphashfinish_last_part(&sh, (uint64_t)last);
        break;
    }
    case TEXT_GRAPHEMES: {
        const int32_t *graphemes = text->graphemes;
        for (int64_t i = 0; i + 1 < text->length; i++) {
            tmp.chunks[0] = graphemes[i];
            tmp.chunks[1] = graphemes[i];
            siphashadd64bits(&sh, tmp.whole);
        }
        int32_t last = text->length & 0x1 ? graphemes[text->length-1] : 0; // Odd number of graphemes
        text->hash = siphashfinish_last_part(&sh, (uint64_t)last);
        break;
    }
    case TEXT_SHORT_GRAPHEMES: {
        tmp.chunks[0] = text->short_graphemes[0];
        if (text->length > 1)
            tmp.chunks[1] = text->short_graphemes[1];
        text->hash = siphashfinish_last_part(&sh, (uint64_t)tmp.whole);
        break;
    }
    case TEXT_SUBTEXT: {
        int32_t leftover = 0;
        for (int64_t sub_i = 0, to_hash = text->length; to_hash > 0; ) {
            Text_t subtext = text->subtexts[sub_i];
            if (subtext.tag == TEXT_ASCII || subtext.tag == TEXT_SHORT_ASCII) {
                const char *bytes = subtext.tag == TEXT_ASCII ? subtext.ascii : subtext.short_ascii;
                int64_t grapheme = 0;
                if (leftover) {
                    tmp.chunks[0] = leftover;
                    tmp.chunks[1] = (int32_t)bytes[0];
                    siphashadd64bits(&sh, tmp.whole);
                    grapheme += 1;
                }
                for (; grapheme + 1 < subtext.length; grapheme += 2) {
                    tmp.chunks[0] = (int32_t)bytes[grapheme];
                    tmp.chunks[1] = (int32_t)bytes[grapheme+1];
                    siphashadd64bits(&sh, tmp.whole);
                }
                leftover = grapheme < subtext.length ? (int32_t)bytes[grapheme] : 0;
            } else if (subtext.tag == TEXT_SHORT_GRAPHEMES) {
                if (leftover) {
                    tmp.chunks[0] = leftover;
                    tmp.chunks[1] = subtext.short_graphemes[0];
                    siphashadd64bits(&sh, tmp.whole);
                    leftover = subtext.length > 1 ? subtext.short_graphemes[1] : 0;
                } else if (subtext.length == 1) {
                    leftover = subtext.short_graphemes[0];
                } else {
                    tmp.chunks[0] = subtext.short_graphemes[0];
                    tmp.chunks[1] = subtext.short_graphemes[1];
                    siphashadd64bits(&sh, tmp.whole);
                }
            } else if (subtext.tag == TEXT_GRAPHEMES) {
                const int32_t *graphemes = subtext.graphemes;
                int64_t grapheme = 0;
                if (leftover) {
                    tmp.chunks[0] = leftover;
                    tmp.chunks[1] = graphemes[0];
                    siphashadd64bits(&sh, tmp.whole);
                    grapheme += 1;
                }
                for (; grapheme + 1 < subtext.length; grapheme += 2) {
                    tmp.chunks[0] = graphemes[grapheme];
                    tmp.chunks[1] = graphemes[grapheme+1];
                    siphashadd64bits(&sh, tmp.whole);
                }
                leftover = grapheme < subtext.length ? graphemes[grapheme] : 0;
            }

            to_hash -= text->subtexts[sub_i].length;

            ++sub_i;
        }

        text->hash = siphashfinish_last_part(&sh, (uint64_t)leftover);
        break;
    }
    default: errx(1, "Invalid text");
    }

    if (text->hash == 0)
        text->hash = 1;

    return text->hash;
}

int32_t _get_grapheme(Text_t text, text_iter_t *state, int64_t index)
{
    switch (text.tag) {
    case TEXT_ASCII: return index < text.length ? (int32_t)text.ascii[index] : 0;
    case TEXT_SHORT_ASCII: return index < text.length ? (int32_t)text.short_ascii[index] : 0;
    case TEXT_GRAPHEMES: return index < text.length ? text.graphemes[index] : 0;
    case TEXT_SHORT_GRAPHEMES: return index < text.length ? text.short_graphemes[index] : 0;
    case TEXT_SUBTEXT: {
        text_iter_t backup_state = {0, 0};
        if (!state) state = &backup_state;

        if (index < 0 || index >= text.length)
            return 0;

        while (index < state->sum_of_previous_subtexts && state->subtext > 0) {
            state->sum_of_previous_subtexts -= text.subtexts[state->subtext].length;
            state->subtext -= 1;
        }
        for (;;) {
            if (index < state->sum_of_previous_subtexts + text.subtexts[state->subtext].length)
                return _get_grapheme(text.subtexts[state->subtext], NULL, index - state->sum_of_previous_subtexts);
            state->sum_of_previous_subtexts += text.subtexts[state->subtext].length;
            state->subtext += 1;
        }
        return 0;
    }
    default: errx(1, "Invalid text");
    }
    return 0;
}

int32_t get_grapheme(Text_t text, int64_t index)
{
    text_iter_t state = {0, 0};
    return _get_grapheme(text, &state, index);
}

PUREFUNC public int32_t Text$compare(const Text_t *a, const Text_t *b)
{
    if (a == b) return 0;

    int64_t len = MAX(a->length, b->length);
    text_iter_t a_state = {0, 0}, b_state = {0, 0};
    for (int64_t i = 0; i < len; i++) {
        int32_t ai = _get_grapheme(*a, &a_state, i);
        int32_t bi = _get_grapheme(*b, &b_state, i);
        if (ai == bi) continue;
        int32_t cmp;
        if (ai > 0 && bi > 0) {
            cmp = u32_cmp((ucs4_t*)&ai, (ucs4_t*)&bi, 1);
        } else if (ai > 0) {
            cmp = u32_cmp2(
                (ucs4_t*)&ai, 1,
                GRAPHEME_CODEPOINTS(bi),
                NUM_GRAPHEME_CODEPOINTS(bi));
        } else if (bi > 0) {
            cmp = u32_cmp2(
                GRAPHEME_CODEPOINTS(ai),
                NUM_GRAPHEME_CODEPOINTS(ai),
                (ucs4_t*)&bi, 1);
        } else {
            cmp = u32_cmp2(
                GRAPHEME_CODEPOINTS(ai),
                NUM_GRAPHEME_CODEPOINTS(ai),
                GRAPHEME_CODEPOINTS(bi),
                NUM_GRAPHEME_CODEPOINTS(bi));
        }
        if (cmp != 0) return cmp;
    }
    return 0;
}

PUREFUNC public bool Text$starts_with(Text_t text, Text_t prefix)
{
    if (text.length < prefix.length)
        return false;
    text_iter_t text_state = {0, 0}, prefix_state = {0, 0};
    for (int64_t i = 0; i < prefix.length; i++) {
        int32_t text_i = _get_grapheme(text, &text_state, i);
        int32_t prefix_i = _get_grapheme(prefix, &prefix_state, i);
        if (text_i != prefix_i) return false;
    }
    return true;
}

PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix)
{
    if (text.length < suffix.length)
        return false;
    text_iter_t text_state = {0, 0}, prefix_state = {0, 0};
    for (int64_t i = 0; i < suffix.length; i++) {
        int32_t text_i = _get_grapheme(text, &text_state, text.length - suffix.length + i);
        int32_t suffix_i = _get_grapheme(suffix, &prefix_state, i);
        if (text_i != suffix_i) return false;
    }
    return true;
}

PUREFUNC public bool Text$equal_values(Text_t a, Text_t b)
{
    if (a.length != b.length || (a.hash != 0 && b.hash != 0 && a.hash != b.hash))
        return false;
    int64_t len = a.length;
    text_iter_t a_state = {0, 0}, b_state = {0, 0};
    for (int64_t i = 0; i < len; i++) {
        int32_t ai = _get_grapheme(a, &a_state, i);
        int32_t bi = _get_grapheme(b, &b_state, i);
        if (ai != bi) return false;
    }
    return true;
}

PUREFUNC public bool Text$equal(const Text_t *a, const Text_t *b)
{
    if (a == b) return true;
    return Text$equal_values(*a, *b);
}

PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b)
{
    if (a.length != b.length)
        return false;
    int64_t len = a.length;
    text_iter_t a_state = {0, 0}, b_state = {0, 0};
    const char *language = uc_locale_language();
    for (int64_t i = 0; i < len; i++) {
        int32_t ai = _get_grapheme(a, &a_state, i);
        int32_t bi = _get_grapheme(b, &b_state, i);
        if (ai != bi) {
            const ucs4_t *a_codepoints = ai >= 0 ? (ucs4_t*)&ai : GRAPHEME_CODEPOINTS(ai);
            int64_t a_len = ai >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(ai);

            const ucs4_t *b_codepoints = bi >= 0 ? (ucs4_t*)&bi : GRAPHEME_CODEPOINTS(bi);
            int64_t b_len = bi >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(bi);

            int cmp = 0;
            (void)u32_casecmp(a_codepoints, (size_t)a_len, b_codepoints, (size_t)b_len, language, UNINORM_NFC, &cmp);
            if (cmp != 0)
                return false;
        }
    }
    return true;
}

public Text_t Text$upper(Text_t text)
{
    if (text.length == 0) return text;
    Array_t codepoints = Text$utf32_codepoints(text);
    const char *language = uc_locale_language();
    ucs4_t buf[128];
    size_t out_len = sizeof(buf)/sizeof(buf[0]);
    ucs4_t *upper = u32_toupper(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len);
    Text_t ret = text_from_u32(upper, (int64_t)out_len, false);
    if (upper != buf) free(upper);
    return ret;
}

public Text_t Text$lower(Text_t text)
{
    if (text.length == 0) return text;
    Array_t codepoints = Text$utf32_codepoints(text);
    const char *language = uc_locale_language();
    ucs4_t buf[128];
    size_t out_len = sizeof(buf)/sizeof(buf[0]);
    ucs4_t *lower = u32_tolower(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len);
    Text_t ret = text_from_u32(lower, (int64_t)out_len, false);
    if (lower != buf) free(lower);
    return ret;
}

public Text_t Text$title(Text_t text)
{
    if (text.length == 0) return text;
    Array_t codepoints = Text$utf32_codepoints(text);
    const char *language = uc_locale_language();
    ucs4_t buf[128];
    size_t out_len = sizeof(buf)/sizeof(buf[0]);
    ucs4_t *title = u32_totitle(codepoints.data, (size_t)codepoints.length, language, UNINORM_NFC, buf, &out_len);
    Text_t ret = text_from_u32(title, (int64_t)out_len, false);
    if (title != buf) free(title);
    return ret;
}

static inline void skip_whitespace(Text_t text, int64_t *i)
{
    text_iter_t state = {0, 0};
    while (*i < text.length) {
        int32_t grapheme = _get_grapheme(text, &state, *i);
        if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme))
            return;
        *i += 1;
    }
}

static inline bool match_grapheme(Text_t text, int64_t *i, int32_t grapheme)
{
    if (*i < text.length && get_grapheme(text, *i) == grapheme) {
        *i += 1;
        return true;
    }
    return false;
}

static inline bool match_str(Text_t text, int64_t *i, const char *str)
{
    text_iter_t state = {0, 0};
    int64_t matched = 0;
    while (matched[str]) {
        if (*i + matched >= text.length || _get_grapheme(text, &state, *i + matched) != str[matched])
            return false;
        matched += 1;
    }
    *i += matched;
    return true;
}

static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop)
{
    if (*i >= text.length) return false;
    int32_t grapheme = get_grapheme(text, *i);
    // TODO: check every codepoint in the cluster?
    if (uc_is_property(MAIN_GRAPHEME_CODEPOINT(grapheme), prop)) {
        *i += 1;
        return true;
    }
    return false;
}

static int64_t parse_int(Text_t text, int64_t *i)
{
    text_iter_t state = {0, 0};
    int64_t value = 0;
    for (;; *i += 1) {
        ucs4_t grapheme = _get_main_grapheme(text, &state, *i);
        int digit = uc_digit_value((ucs4_t)grapheme);
        if (digit < 0) break;
        if (value >= INT64_MAX/10) break;
        value = 10*value + digit;
    }
    return value;
}

const char *get_property_name(Text_t text, int64_t *i)
{
    skip_whitespace(text, i);
    char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
    char *dest = name;
    text_iter_t state = {0, 0};
    while (*i < text.length) {
        int32_t grapheme = _get_grapheme(text, &state, *i);
        if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) {
            *dest = (char)grapheme;
            ++dest;
            if (dest >= name + UNINAME_MAX - 1)
                break;
        } else {
            break;
        }
        *i += 1;
    }

    while (dest > name && dest[-1] == ' ')
        *(dest--) = '\0';

    if (dest == name) return NULL;
    *dest = '\0';
    return name;
}

#define EAT1(text, state, index, cond) ({\
        int32_t grapheme = _get_grapheme(text, state, index); \
        bool success = (cond); \
        if (success) index += 1; \
        success; })

#define EAT2(text, state, index, cond1, cond2) ({\
        int32_t grapheme = _get_grapheme(text, state, index); \
        bool success = (cond1); \
        if (success) { \
            grapheme = _get_grapheme(text, state, index + 1); \
            success = (cond2); \
            if (success) \
                index += 2; \
        } \
        success; })


#define EAT_MANY(text, state, index, cond) ({ int64_t _n = 0; while (EAT1(text, state, index, cond)) { _n += 1; } _n; })

int64_t match_email(Text_t text, int64_t index)
{
    // email = local "@" domain
    // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii)
    // domain = dns-label ("." dns-label)*
    // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii)

    text_iter_t state = {0, 0};
    if (index > 0) {
        ucs4_t prev_codepoint = _get_main_grapheme(text, &state, index - 1);
        if (uc_is_property_alphabetic((ucs4_t)prev_codepoint))
            return -1;
    }

    int64_t start_index = index;

    // Local part:
    int64_t local_len = 0;
    static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~";
    while (EAT1(text, &state, index,
                (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) {
        local_len += 1;
        if (local_len > 64) return -1;
    }

    if (!EAT1(text, &state, index, grapheme == '@'))
        return -1;

    // Host
    int64_t host_len = 0;
    do {
        int64_t label_len = 0;
        while (EAT1(text, &state, index,
                    (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) {
            label_len += 1;
            if (label_len > 63) return -1;
        }

        if (label_len == 0)
            return -1;

        host_len += label_len;
        if (host_len > 255)
            return -1;
        host_len += 1;
    } while (EAT1(text, &state, index, grapheme == '.'));

    return index - start_index;
}

int64_t match_ipv6(Text_t text, int64_t index)
{
    text_iter_t state = {0, 0};
    if (index > 0) {
        int32_t prev_codepoint = _get_grapheme(text, &state, index - 1);
        if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':'))
            return -1;
    }
    int64_t start_index = index;
    const int NUM_CLUSTERS = 8;
    bool double_colon_used = false;
    for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
        for (int digits = 0; digits < 4; digits++) {
            if (!EAT1(text, &state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
                break;
        }
        if (EAT1(text, &state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
            return -1; // Too many digits

        if (cluster == NUM_CLUSTERS-1) {
            break;
        } else if (!EAT1(text, &state, index, grapheme == ':')) {
            if (double_colon_used)
                break;
            return -1;
        }

        if (EAT1(text, &state, index, grapheme == ':')) {
            if (double_colon_used)
                return -1;
            double_colon_used = true;
        }
    }
    return index - start_index;
}

static int64_t match_ipv4(Text_t text, int64_t index)
{
    text_iter_t state = {0, 0};
    if (index > 0) {
        int32_t prev_codepoint = _get_grapheme(text, &state, index - 1);
        if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.'))
            return -1;
    }
    int64_t start_index = index;

    const int NUM_CLUSTERS = 4;
    for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
        for (int digits = 0; digits < 3; digits++) {
            if (!EAT1(text, &state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) {
                if (digits == 0) return -1;
                break;
            }
        }

        if (EAT1(text, &state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme)))
            return -1; // Too many digits

        if (cluster == NUM_CLUSTERS-1)
            break;
        else if (!EAT1(text, &state, index, grapheme == '.'))
            return -1;
    }
    return (index - start_index);
}

int64_t match_ip(Text_t text, int64_t index)
{
    int64_t len = match_ipv6(text, index);
    if (len >= 0) return len;
    len = match_ipv4(text, index);
    return (len >= 0) ? len : -1;
}

int64_t match_uri(Text_t text, int64_t index)
{
    // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment]
    // scheme = [a-zA-Z] [a-zA-Z0-9+.-]
    // authority = [userinfo "@"] host [":" port]

    text_iter_t state = {0, 0};
    if (index > 0) {
        int32_t prev_codepoint = _get_grapheme(text, &state, index - 1);
        if (uc_is_property_alphabetic(MAIN_GRAPHEME_CODEPOINT(prev_codepoint)))
            return -1;
    }

    int64_t start_index = index;

    // Scheme:
    if (!EAT1(text, &state, index, isalpha(grapheme)))
        return -1;

    EAT_MANY(text, &state, index,
             !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-'));

    if (index == start_index)
        return -1;

    if (!match_grapheme(text, &index, ':'))
        return -1;

    // Authority:
    if (match_str(text, &index, "//")) {
        int64_t authority_start = index;
        // Username or host:
        static const char *forbidden = "#?:@ \t\r\n<>[]{}\\^|\"`/";
        if (EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
            return -1;

        if (EAT1(text, &state, index, grapheme == '@')) {
            // Found a username, now get a host:
            if (EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
                return -1;
        } else {
            int64_t ip = authority_start;
            int64_t ipv4_len = match_ipv4(text, ip);
            if (ipv4_len > 0) {
                ip += ipv4_len;
            } else if (match_grapheme(text, &ip, '[')) {
                ip += match_ipv6(text, ip);
                if (ip > authority_start + 1 && match_grapheme(text, &ip, ']'))
                    index = ip;
            }
        }

        // Port:
        if (EAT1(text, &state, index, grapheme == ':')) {
            if (EAT_MANY(text, &state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0)
                return -1;
        }
        if (!EAT1(text, &state, index, grapheme == '/'))
            return (index - start_index); // No path
    } else {
        // Optional path root:
        EAT1(text, &state, index, grapheme == '/');
    }

    // Path:
    static const char *non_path = " \"#?<>[]{}\\^`|";
    EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme));

    if (EAT1(text, &state, index, grapheme == '?')) { // Query
        static const char *non_query = " \"#<>[]{}\\^`|";
        EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme));
    }

    if (EAT1(text, &state, index, grapheme == '#')) { // Fragment
        static const char *non_fragment = " \"#<>[]{}\\^`|";
        EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme));
    }
    return index - start_index;
}

int64_t match_url(Text_t text, int64_t index)
{
    int64_t lookahead = index;
    if (!(match_str(text, &lookahead, "https:")
        || match_str(text, &lookahead, "http:")
        || match_str(text, &lookahead, "ftp:")
        || match_str(text, &lookahead, "wss:")
        || match_str(text, &lookahead, "ws:")))
        return -1;

    return match_uri(text, index);
}

int64_t match_id(Text_t text, int64_t index)
{
    text_iter_t state = {0, 0};
    if (!EAT1(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START)))
        return -1;
    return 1 + EAT_MANY(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE));
}

int64_t match_int(Text_t text, int64_t index)
{
    text_iter_t state = {0, 0};
    int64_t len = EAT_MANY(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
    return len >= 0 ? len : -1;
}

int64_t match_num(Text_t text, int64_t index)
{
    text_iter_t state = {0, 0};
    bool negative = EAT1(text, &state, index, grapheme == '-') ? 1 : 0;
    int64_t pre_decimal = EAT_MANY(text, &state, index,
                                   uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
    bool decimal = (EAT1(text, &state, index, grapheme == '.') == 1);
    int64_t post_decimal = decimal ? EAT_MANY(text, &state, index,
                                              uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0;
    if (pre_decimal == 0 && post_decimal == 0)
        return -1;
    return negative + pre_decimal + decimal + post_decimal;
}

int64_t match_newline(Text_t text, int64_t index)
{
    if (index >= text.length)
        return -1;

    text_iter_t state = {0, 0};
    ucs4_t grapheme = index >= text.length ? 0 : _get_main_grapheme(text, &state, index);
    if (grapheme == '\n')
        return 1;
    if (grapheme == '\r' && _get_grapheme(text, &state, index + 1) == '\n')
        return 2;
    return -1;
}

typedef struct {
    int64_t index, length;
    bool occupied, recursive;
} capture_t;

typedef struct {
    enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag;
    bool negated, non_capturing;
    int64_t min, max;
    union {
        int32_t grapheme;
        uc_property_t property;
        int64_t (*fn)(Text_t, int64_t);
        int32_t quote_graphemes[2];
        int32_t pair_graphemes[2];
    };
} pat_t;

int64_t match_pat(Text_t text, text_iter_t *state, int64_t index, pat_t pat)
{
    int32_t grapheme = index >= text.length ? 0 : _get_grapheme(text, state, index);

    switch (pat.tag) {
    case PAT_START: {
        if (index == 0)
            return pat.negated ? -1 : 0;
        return pat.negated ? 0 : -1;
    }
    case PAT_END: {
        if (index >= text.length)
            return pat.negated ? -1 : 0;
        return pat.negated ? 0 : -1;
    }
    case PAT_ANY: {
        assert(!pat.negated);
        return (index < text.length) ? 1 : -1;
    }
    case PAT_GRAPHEME: {
        if (index >= text.length)
            return -1;
        else if (grapheme == pat.grapheme)
            return pat.negated ? -1 : 1;
        return pat.negated ? 1 : -1;
    }
    case PAT_PROPERTY: {
        if (index >= text.length)
            return -1;
        else if (uc_is_property((ucs4_t)grapheme, pat.property))
            return pat.negated ? -1 : 1;
        return pat.negated ? 1 : -1;
    }
    case PAT_PAIR: {
        // Nested punctuation: (?), [?], etc
        if (index >= text.length)
            return -1;

        int32_t open = pat.pair_graphemes[0];
        if (grapheme != open)
            return pat.negated ? 1 : -1;

        int32_t close = pat.pair_graphemes[1];
        int64_t depth = 1;
        int64_t match_len = 1;
        for (; depth > 0; match_len++) {
            if (index + match_len >= text.length)
                return pat.negated ? 1 : -1;

            int32_t c = _get_grapheme(text, state, index + match_len);
            if (c == open)
                depth += 1;
            else if (c == close)
                depth -= 1;
        }
        return pat.negated ? -1 : match_len;
    }
    case PAT_QUOTE: {
        // Nested quotes: "?", '?', etc
        if (index >= text.length)
            return -1;

        int32_t open = pat.quote_graphemes[0];
        if (grapheme != open)
            return pat.negated ? 1 : -1;

        int32_t close = pat.quote_graphemes[1];
        for (int64_t i = index + 1; i < text.length; i++) {
            int32_t c = _get_grapheme(text, state, i);
            if (c == close) {
                return pat.negated ? -1 : (i - index) + 1;
            } else if (c == '\\' && index + 1 < text.length) {
                i += 1; // Skip ahead an extra step
            }
        }
        return pat.negated ? 1 : -1;
    }
    case PAT_FUNCTION: {
        int64_t match_len = pat.fn(text, index);
        if (match_len >= 0)
            return pat.negated ? -1 : match_len;
        return pat.negated ? 1 : -1;
    }
    default: errx(1, "Invalid pattern");
    }
    errx(1, "Unreachable");
}

pat_t parse_next_pat(Text_t pattern, text_iter_t *state, int64_t *index)
{
    if (EAT2(pattern, state, *index,
             uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK),
             grapheme == '?')) {
        // Quotations: "?", '?', etc
        int32_t open = _get_grapheme(pattern, state, *index-2);
        int32_t close = open;
        uc_mirror_char((ucs4_t)open, (ucs4_t*)&close);
        if (!match_grapheme(pattern, index, close))
            fail("Pattern's closing quote is missing: %k", &pattern);

        return (pat_t){
            .tag=PAT_QUOTE,
            .min=1, .max=1,
            .quote_graphemes={open, close},
        };
    } else if (EAT2(pattern, state, *index,
                    uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION),
                    grapheme == '?')) {
        // Nested punctuation: (?), [?], etc
        int32_t open = _get_grapheme(pattern, state, *index-2);
        int32_t close = open;
        uc_mirror_char((ucs4_t)open, (ucs4_t*)&close);
        if (!match_grapheme(pattern, index, close))
            fail("Pattern's closing brace is missing: %k", &pattern);

        return (pat_t){
            .tag=PAT_PAIR,
            .min=1, .max=1,
            .pair_graphemes={open, close},
        };
    } else if (EAT1(pattern, state, *index,
                    grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc.
        skip_whitespace(pattern, index);
        int64_t min, max;
        if (uc_is_digit((ucs4_t)_get_grapheme(pattern, state, *index))) {
            min = parse_int(pattern, index);
            skip_whitespace(pattern, index);
            if (match_grapheme(pattern, index, '+')) {
                max = INT64_MAX;
            } else if (match_grapheme(pattern, index, '-')) {
                max = parse_int(pattern, index);
            } else {
                max = min;
            }
            if (min > max) fail("Minimum repetitions (%ld) is less than the maximum (%ld)", min, max);
        } else {
            min = -1, max = -1;
        }

        skip_whitespace(pattern, index);

        bool negated = match_grapheme(pattern, index, '!');
#define PAT(_tag, ...) ((pat_t){.min=min, .max=max, .negated=negated, .tag=_tag, __VA_ARGS__})
        const char *prop_name;
        if (match_str(pattern, index, ".."))
            prop_name = "..";
        else
            prop_name = get_property_name(pattern, index);

        if (!prop_name) {
            // Literal character, e.g. {1?}
            skip_whitespace(pattern, index);
            int32_t grapheme = _get_grapheme(pattern, state, (*index)++);
            if (!match_grapheme(pattern, index, '}'))
                fail("Missing closing '}' in pattern: %k", &pattern);
            return PAT(PAT_GRAPHEME, .grapheme=grapheme);
        } else if (strlen(prop_name) == 1) {
            // Single letter names: {1+ A}
            skip_whitespace(pattern, index);
            if (!match_grapheme(pattern, index, '}'))
                fail("Missing closing '}' in pattern: %k", &pattern);
            return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]);
        }

        skip_whitespace(pattern, index);
        if (!match_grapheme(pattern, index, '}'))
            fail("Missing closing '}' in pattern: %k", &pattern);

        switch (tolower(prop_name[0])) {
        case '.':
            if (prop_name[1] == '.') {
                if (negated)
                    return ((pat_t){.tag=PAT_END, .min=min, .max=max, .non_capturing=true});
                else
                    return PAT(PAT_ANY);
            }
            break;
        case 'd':
            if (strcasecmp(prop_name, "digit") == 0) {
                return PAT(PAT_PROPERTY, .property=UC_PROPERTY_DECIMAL_DIGIT);
            }
            break;
        case 'e':
            if (strcasecmp(prop_name, "end") == 0) {
                return PAT(PAT_END, .non_capturing=!negated);
            } else if (strcasecmp(prop_name, "email") == 0) {
                return PAT(PAT_FUNCTION, .fn=match_email);
            } else if (strcasecmp(prop_name, "emoji") == 0) {
                return PAT(PAT_PROPERTY, .property=UC_PROPERTY_EMOJI);
            }
            break;
        case 'i':
            if (strcasecmp(prop_name, "id") == 0) {
                return PAT(PAT_FUNCTION, .fn=match_id);
            } else if (strcasecmp(prop_name, "int") == 0) {
                return PAT(PAT_FUNCTION, .fn=match_int);
            } else if (strcasecmp(prop_name, "ipv4") == 0) {
                return PAT(PAT_FUNCTION, .fn=match_ipv4);
            } else if (strcasecmp(prop_name, "ipv6") == 0) {
                return PAT(PAT_FUNCTION, .fn=match_ipv6);
            } else if (strcasecmp(prop_name, "ip") == 0) {
                return PAT(PAT_FUNCTION, .fn=match_ip);
            }
            break;
        case 'n':
            if (strcasecmp(prop_name, "nl") == 0 || strcasecmp(prop_name, "newline") == 0
                || strcasecmp(prop_name, "crlf")) {
                return PAT(PAT_FUNCTION, .fn=match_newline);
            } else if (strcasecmp(prop_name, "num") == 0) {
                return PAT(PAT_FUNCTION, .fn=match_num);
            }
            break;
        case 's':
            if (strcasecmp(prop_name, "start") == 0) {
                return PAT(PAT_START, .non_capturing=!negated);
            }
            break;
        case 'u':
            if (strcasecmp(prop_name, "uri") == 0) {
                return PAT(PAT_FUNCTION, .fn=match_uri);
            } else if (strcasecmp(prop_name, "url") == 0) {
                return PAT(PAT_FUNCTION, .fn=match_url);
            }
            break;
        default: break;
        }

        uc_property_t prop = uc_property_byname(prop_name);
        if (uc_property_is_valid(prop))
            return PAT(PAT_PROPERTY, .property=prop);

        ucs4_t grapheme = unicode_name_character(prop_name);
        if (grapheme == UNINAME_INVALID)
            fail("Not a valid property or character name: %s", prop_name);
        return PAT(PAT_GRAPHEME, .grapheme=(int32_t)grapheme);
#undef PAT
    } else {
        return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=_get_grapheme(pattern, state, (*index)++)};
    }
}

int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index)
{
    if (pattern_index >= pattern.length) // End of the pattern
        return 0;

    int64_t start_index = text_index;
    text_iter_t pattern_state = {0, 0}, text_state = {0, 0};
    pat_t pat = parse_next_pat(pattern, &pattern_state, &pattern_index);

    if (pat.min == -1 && pat.max == -1) {
        if (pat.tag == PAT_ANY && pattern_index >= pattern.length) {
            pat.min = pat.max = MAX(1, text.length - text_index);
        } else {
            pat.min = 1;
            pat.max = INT64_MAX;
        }
    }

    int64_t capture_start = text_index;
    int64_t count = 0, capture_len = 0, next_match_len = 0;

    if (pat.tag == PAT_ANY && pattern_index >= pattern.length) {
        int64_t remaining = text.length - text_index;
        capture_len = remaining >= pat.min ? MIN(remaining, pat.max) : -1;
        text_index += capture_len;
        goto success;
    }

    if (pat.min == 0 && pattern_index < pattern.length) {
        next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1));
        if (next_match_len >= 0) {
            capture_len = 0;
            goto success;
        }
    }

    while (count < pat.max) {
        int64_t match_len = match_pat(text, &text_state, text_index, pat);
        if (match_len < 0)
            break;
        capture_len += match_len;
        text_index += match_len;
        count += 1;

        if (pattern_index < pattern.length) { // More stuff after this
            if (count < pat.min)
                next_match_len = -1;
            else
                next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1));
        } else {
            next_match_len = 0;
        }

        if (match_len == 0) {
            if (next_match_len >= 0) {
                // If we're good to go, no need to keep re-matching zero-length
                // matches till we hit max:
                count = pat.max;
                break;
            } else {
                return -1;
            }
        }

        if (pattern_index < pattern.length && next_match_len >= 0)
            break; // Next guy exists and wants to stop here

        if (text_index >= text.length)
            break;
    }

    if (count < pat.min || next_match_len < 0)
        return -1;

  success:
    if (captures && capture_index < MAX_BACKREFS && !pat.non_capturing) {
        if (pat.tag == PAT_PAIR || pat.tag == PAT_QUOTE) {
            assert(capture_len > 0);
            captures[capture_index] = (capture_t){
                .index=capture_start + 1, // Skip leading quote/paren
                .length=capture_len - 2, // Skip open/close
                .occupied=true,
                .recursive=(pat.tag == PAT_PAIR),
            };
        } else {
            captures[capture_index] = (capture_t){
                .index=capture_start,
                .length=capture_len,
                .occupied=true,
                .recursive=false,
            };
        }
    }
    return (text_index - start_index) + next_match_len;
}

#undef EAT1
#undef EAT2
#undef EAT_MANY

static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last, int64_t *match_length)
{
    int32_t first_grapheme = get_grapheme(pattern, 0);
    bool find_first = (first_grapheme != '{'
                       && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
                       && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));

    text_iter_t text_state = {0, 0};

    for (int64_t i = first; i <= last; i++) {
        // Optimization: quickly skip ahead to first char in pattern:
        if (find_first) {
            while (i < text.length && _get_grapheme(text, &text_state, i) != first_grapheme)
                ++i;
        }

        int64_t m = match(text, i, pattern, 0, NULL, 0);
        if (m >= 0) {
            if (match_length)
                *match_length = m;
            return i;
        }
    }
    if (match_length)
        *match_length = -1;
    return -1;
}

public Int_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index, int64_t *match_length)
{
    int64_t first = Int_to_Int64(from_index, false);
    if (first == 0) fail("Invalid index: 0");
    if (first < 0) first = text.length + first + 1;
    if (first > text.length || first < 1)
        return I(0);
    int64_t found = _find(text, pattern, first-1, text.length-1, match_length);
    return I(found+1);
}

PUREFUNC public bool Text$has(Text_t text, Pattern_t pattern)
{
    if (Text$starts_with(pattern, Text("{start}"))) {
        int64_t m = match(text, 0, pattern, 0, NULL, 0);
        return m >= 0;
    } else if (Text$ends_with(text, Text("{end}"))) {
        for (int64_t i = text.length-1; i >= 0; i--) {
            int64_t match_len = match(text, i, pattern, 0, NULL, 0);
            if (match_len >= 0 && i + match_len == text.length)
                return true;
        }
        return false;
    } else {
        int64_t found = _find(text, pattern, 0, text.length-1, NULL);
        return (found >= 0);
    }
}

PUREFUNC public bool Text$matches(Text_t text, Pattern_t pattern)
{
    int64_t m = match(text, 0, pattern, 0, NULL, 0);
    return m == text.length;
}

public int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n])
{
    if (n < 1) return -1;
    (void)info;
    argtypes[0] = PA_POINTER;
    sizes[0] = sizeof(Text_t*);
    return 1;
}

public int printf_text(FILE *stream, const struct printf_info *info, const void *const args[])
{
    Text_t t = **(Text_t**)args[0];
    if (info->alt)
        return text_visualize(stream, t);
    else
        return Text$print(stream, t);
}

static inline Text_t _quoted(Text_t text, bool colorize, char quote_char)
{
    // TODO: optimize for ASCII and short strings
    Array_t graphemes = {.atomic=1};
#define add_char(c) Array$insert_value(&graphemes, (ucs4_t)c, I_small(0), sizeof(ucs4_t))
#define add_str(s) ({ for (const char *_c = s; *_c; ++_c) Array$insert_value(&graphemes, (ucs4_t)*_c, I_small(0), sizeof(ucs4_t)); })
    if (colorize)
        add_str("\x1b[35m");
    if (quote_char != '"' && quote_char != '\'' && quote_char != '`')
        add_char('$');
    add_char(quote_char);

#define add_escaped(str) ({ if (colorize) add_str("\x1b[34;1m"); add_char('\\'); add_str(str); if (colorize) add_str("\x1b[0;35m"); })
    text_iter_t state = {0, 0};
    for (int64_t i = 0; i < text.length; i++) {
        int32_t g = _get_grapheme(text, &state, i);
        switch (g) {
        case '\a': add_escaped("a"); break;
        case '\b': add_escaped("b"); break;
        case '\x1b': add_escaped("e"); break;
        case '\f': add_escaped("f"); break;
        case '\n': add_escaped("n"); break;
        case '\r': add_escaped("r"); break;
        case '\t': add_escaped("t"); break;
        case '\v': add_escaped("v"); break;
        case '\\': add_escaped("\\"); break;
        case '\x00' ... '\x06': case '\x0E' ... '\x1A':
        case '\x1C' ... '\x1F': case '\x7F' ... '\x7F': {
            if (colorize) add_str("\x1b[34;1m");
            add_char('\\');
            add_char('x');
            char tmp[4];
            sprintf(tmp, "%02X", g);
            add_str(tmp);
            if (colorize)
                add_str("\x1b[0;35m");
            break;
        }
        default: {
            if (g == quote_char)
                add_escaped(((char[2]){quote_char, 0}));
            else
               add_char(g);
            break;
        }
        }
    }

    add_char(quote_char);
    if (colorize)
        add_str("\x1b[m");

    return (Text_t){.length=graphemes.length, .tag=TEXT_GRAPHEMES, .graphemes=graphemes.data};
#undef add_str
#undef add_char
#undef add_escaped
}

public Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info)
{
    (void)info;
    if (info->TextInfo.lang && streq(info->TextInfo.lang, "Path")) {
        if (!text) return Text("Path");
        return Text$format("(%s%k%s)", colorize ? "\x1b[35m" : "", text, colorize ? "\x1b[m" : "");
    }

    if (!text) return info && info->TextInfo.lang ? Text$from_str(info->TextInfo.lang) : Text("Text");
    Text_t as_text = _quoted(*(Text_t*)text, colorize, info == &Pattern$info ? '/' : '"');
    if (info && info->TextInfo.lang && info != &Text$info && info != &Pattern$info)
        as_text = Text$concat(
            colorize ? Text("\x1b[1m$") : Text("$"),
            Text$from_str(info->TextInfo.lang),
            colorize ? Text("\x1b[0m") : Text(""),
            as_text);
    return as_text;
}

public Text_t Text$quoted(Text_t text, bool colorize)
{
    return _quoted(text, colorize, '"');
}

public Array_t Text$find_all(Text_t text, Pattern_t pattern)
{
    if (pattern.length == 0) // special case
        return (Array_t){.length=0};

    Array_t matches = {};

    for (int64_t i = 0; ; ) {
        int64_t len = 0;
        int64_t found = _find(text, pattern, i, text.length-1, &len);
        if (found < 0) break;
        Text_t match = Text$slice(text, I(found+1), I(found + len));
        Array$insert(&matches, &match, I_small(0), sizeof(Text_t));
        i = found + MAX(len, 1);
    }

    return matches;
}

static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t replacement, Pattern_t backref_pat, capture_t *captures)
{
    if (backref_pat.length == 0)
        return replacement;

    int32_t first_grapheme = get_grapheme(backref_pat, 0);
    bool find_first = (first_grapheme != '{'
                       && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
                       && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));

    Text_t ret = Text("");
    text_iter_t state = {0, 0};
    int64_t nonmatching_pos = 0;
    for (int64_t pos = 0; pos < replacement.length; ) {
        // Optimization: quickly skip ahead to first char in the backref pattern:
        if (find_first) {
            while (pos < replacement.length && _get_grapheme(replacement, &state, pos) != first_grapheme)
                ++pos;
        }

        int64_t backref_len = match(replacement, pos, backref_pat, 0, NULL, 0);
        if (backref_len < 0) {
            pos += 1;
            continue;
        }

        int64_t after_backref = pos + backref_len;
        int64_t backref = parse_int(replacement, &after_backref);
        if (after_backref == pos + backref_len) { // Not actually a backref if there's no number
            pos += 1;
            continue;
        }
        if (backref < 0 || backref > 9) fail("Invalid backref index: %ld (only 0-%d are allowed)", backref, MAX_BACKREFS-1);
        backref_len = (after_backref - pos);

        if (_get_grapheme(replacement, &state, pos + backref_len) == ';')
            backref_len += 1; // skip optional semicolon

        if (!captures[backref].occupied)
            fail("There is no capture number %ld!", backref);

        Text_t backref_text = Text$slice(text, I(captures[backref].index+1), I(captures[backref].index + captures[backref].length));

        if (captures[backref].recursive && original_pattern.length > 0)
            backref_text = Text$replace(backref_text, original_pattern, replacement, backref_pat, true);

        if (pos > nonmatching_pos) {
            Text_t before_slice = Text$slice(replacement, I(nonmatching_pos+1), I(pos));
            ret = Text$concat(ret, before_slice, backref_text);
        } else {
            ret = concat2(ret, backref_text);
        }

        pos += backref_len;
        nonmatching_pos = pos;
    }
    if (nonmatching_pos < replacement.length) {
        Text_t last_slice = Text$slice(replacement, I(nonmatching_pos+1), I(replacement.length));
        ret = concat2(ret, last_slice);
    }
    return ret;
}

public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t backref_pat, bool recursive)
{
    Text_t ret = {.length=0};

    int32_t first_grapheme = get_grapheme(pattern, 0);
    bool find_first = (first_grapheme != '{'
                       && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
                       && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));

    text_iter_t text_state = {0, 0};
    int64_t nonmatching_pos = 0;
    for (int64_t pos = 0; pos < text.length; ) {
        // Optimization: quickly skip ahead to first char in pattern:
        if (find_first) {
            while (pos < text.length && _get_grapheme(text, &text_state, pos) != first_grapheme)
                ++pos;
        }

        capture_t captures[MAX_BACKREFS] = {};
        int64_t match_len = match(text, pos, pattern, 0, captures, 1);
        if (match_len < 0) {
            pos += 1;
            continue;
        }
        captures[0] = (capture_t){
            .index = pos, .length = match_len,
            .occupied = true, .recursive = false,
        };

        Text_t replacement_text = apply_backrefs(text, recursive ? pattern : Text(""), replacement, backref_pat, captures);
        if (pos > nonmatching_pos) {
            Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos));
            ret = Text$concat(ret, before_slice, replacement_text);
        } else {
            ret = concat2(ret, replacement_text);
        }
        nonmatching_pos = pos + match_len;
        pos += MAX(match_len, 1);
    }
    if (nonmatching_pos < text.length) {
        Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length));
        ret = concat2(ret, last_slice);
    }
    return ret;
}

public Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool trim_right)
{
    int64_t first = 0, last = text.length-1;
    if (trim_left) {
        int64_t match_len = match(text, 0, pattern, 0, NULL, 0);
        if (match_len > 0)
            first = match_len;
    }

    if (trim_right) {
        for (int64_t i = text.length-1; i >= first; i--) {
            int64_t match_len = match(text, i, pattern, 0, NULL, 0);
            if (match_len > 0 && i + match_len == text.length)
                last = i-1;
        }
    }
    return Text$slice(text, I(first+1), I(last+1));
}

public Text_t Text$map(Text_t text, Pattern_t pattern, closure_t fn)
{
    Text_t ret = {.length=0};

    int32_t first_grapheme = get_grapheme(pattern, 0);
    bool find_first = (first_grapheme != '{'
                       && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
                       && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));

    text_iter_t text_state = {0, 0};
    int64_t nonmatching_pos = 0;

    Text_t (*text_mapper)(Text_t, void*) = fn.fn;
    for (int64_t pos = 0; pos < text.length; pos++) {
        // Optimization: quickly skip ahead to first char in pattern:
        if (find_first) {
            while (pos < text.length && _get_grapheme(text, &text_state, pos) != first_grapheme)
                ++pos;
        }

        int64_t match_len = match(text, pos, pattern, 0, NULL, 0);
        if (match_len < 0) continue;

        Text_t replacement = text_mapper(Text$slice(text, I(pos+1), I(pos+match_len)), fn.userdata);
        if (pos > nonmatching_pos) {
            Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos));
            ret = Text$concat(ret, before_slice, replacement);
        } else {
            ret = concat2(ret, replacement);
        }
        nonmatching_pos = pos + match_len;
        pos += (match_len - 1);
    }
    if (nonmatching_pos < text.length) {
        Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length));
        ret = concat2(ret, last_slice);
    }
    return ret;
}

public Text_t Text$replace_all(Text_t text, Table_t replacements, Text_t backref_pat, bool recursive)
{
    if (replacements.entries.length == 0) return text;

    Text_t ret = {.length=0};

    int64_t nonmatch_pos = 0;
    for (int64_t pos = 0; pos < text.length; ) {
        // Find the first matching pattern at this position:
        for (int64_t i = 0; i < replacements.entries.length; i++) {
            Pattern_t pattern = *(Pattern_t*)(replacements.entries.data + i*replacements.entries.stride);
            capture_t captures[MAX_BACKREFS] = {};
            int64_t len = match(text, pos, pattern, 0, captures, 1);
            if (len < 0) continue;
            captures[0].index = pos;
            captures[0].length = len;

            // If we skipped over some non-matching text before finding a match, insert it here:
            if (pos > nonmatch_pos) {
                Text_t before_slice = Text$slice(text, I(nonmatch_pos+1), I(pos));
                ret = concat2(ret, before_slice);
            }

            // Concatenate the replacement:
            Text_t replacement = *(Text_t*)(replacements.entries.data + i*replacements.entries.stride + sizeof(Text_t));
            Text_t replacement_text = apply_backrefs(text, recursive ? pattern : Text(""), replacement, backref_pat, captures);
            ret = concat2(ret, replacement_text);
            pos += MAX(len, 1);
            nonmatch_pos = pos;
            goto next_pos;
        }

        pos += 1;
      next_pos:
        continue;
    }

    if (nonmatch_pos <= text.length) {
        Text_t last_slice = Text$slice(text, I(nonmatch_pos+1), I(text.length));
        ret = concat2(ret, last_slice);
    }
    return ret;
}

public Array_t Text$split(Text_t text, Pattern_t pattern)
{
    if (text.length == 0) // special case
        return (Array_t){.length=0};

    if (pattern.length == 0) // special case
        return Text$clusters(text);

    Array_t chunks = {};

    Int_t i = I_small(1);
    for (;;) {
        int64_t len = 0;
        Int_t found = Text$find(text, pattern, i, &len);
        if (I_is_zero(found)) break;
        Text_t chunk = Text$slice(text, i, Int$minus(found, I_small(1)));
        Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t));
        i = Int$plus(found, I(MAX(len, 1)));
    }

    Text_t last_chunk = Text$slice(text, i, I(text.length));
    Array$insert(&chunks, &last_chunk, I_small(0), sizeof(Text_t));

    return chunks;
}

public Text_t Text$join(Text_t glue, Array_t pieces)
{
    if (pieces.length == 0) return (Text_t){.length=0};

    Text_t result = *(Text_t*)pieces.data;
    for (int64_t i = 1; i < pieces.length; i++) {
        result = Text$concat(result, glue, *(Text_t*)(pieces.data + i*pieces.stride));
    }
    return result;
}

__attribute__((format(printf, 1, 2)))
public Text_t Text$format(const char *fmt, ...)
{
    va_list args;
    va_start(args, fmt);

    char buf[9];
    int len = vsnprintf(buf, sizeof(buf), fmt, args);
    Text_t ret;
    if (len <= 8) {
        ret = (Text_t){
            .length=len,
            .tag=TEXT_SHORT_ASCII,
        };
        for (int i = 0; i < len; i++)
            ret.short_ascii[i] = buf[i];
    } else {
        char *str = GC_MALLOC_ATOMIC((size_t)(len+1));
        vsnprintf(str, (size_t)(len+1), fmt, args);
        ret = Text$from_str(str);
    }
    va_end(args);
    return ret;
}

public Array_t Text$clusters(Text_t text)
{
    Array_t clusters = {.atomic=1};
    for (int64_t i = 1; i <= text.length; i++) {
        Text_t cluster = Text$slice(text, I(i), I(i));
        Array$insert(&clusters, &cluster, I_small(0), sizeof(Text_t));
    }
    return clusters;
}

public Array_t Text$utf32_codepoints(Text_t text)
{
    Array_t codepoints = {.atomic=1};
    text_iter_t state = {0, 0};
    for (int64_t i = 0; i < text.length; i++) {
        int32_t grapheme = _get_grapheme(text, &state, i);
        if (grapheme < 0) {
            for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) {
                ucs4_t subg = GRAPHEME_CODEPOINTS(grapheme)[c];
                Array$insert(&codepoints, &subg, I_small(0), sizeof(ucs4_t));
            }
        } else {
            Array$insert(&codepoints, &grapheme, I_small(0), sizeof(ucs4_t));
        }
    }
    return codepoints;
}

public Array_t Text$utf8_bytes(Text_t text)
{
    const char *str = Text$as_c_string(text);
    return (Array_t){.length=strlen(str), .stride=1, .atomic=1, .data=(void*)str};
}

static inline const char *codepoint_name(ucs4_t c)
{
    char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
    char *found_name = unicode_character_name(c, name);
    if (found_name) return found_name;
    const uc_block_t *block = uc_block(c);
    assert(block);
    snprintf(name, UNINAME_MAX, "%s-%X", block->name, c);
    return name;
}

public Array_t Text$codepoint_names(Text_t text)
{
    Array_t names = {};
    text_iter_t state = {0, 0};
    for (int64_t i = 0; i < text.length; i++) {
        int32_t grapheme = _get_grapheme(text, &state, i);
        if (grapheme < 0) {
            for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) {
                const char *name = codepoint_name(GRAPHEME_CODEPOINTS(grapheme)[c]);
                Text_t name_text = (Text_t){.tag=TEXT_ASCII, .length=(int64_t)strlen(name), .ascii=name};
                Array$insert(&names, &name_text, I_small(0), sizeof(Text_t));
            }
        } else {
            const char *name = codepoint_name((ucs4_t)grapheme);
            Text_t name_text = (Text_t){.tag=TEXT_ASCII, .length=(int64_t)strlen(name), .ascii=name};
            Array$insert(&names, &name_text, I_small(0), sizeof(Text_t));
        }
    }
    return names;
}

public Text_t Text$from_codepoints(Array_t codepoints)
{
    if (codepoints.stride != sizeof(int32_t))
        Array$compact(&codepoints, sizeof(int32_t));

    return text_from_u32(codepoints.data, codepoints.length, true);
}

public Text_t Text$from_codepoint_names(Array_t codepoint_names)
{
    Array_t codepoints = {};
    for (int64_t i = 0; i < codepoint_names.length; i++) {
        Text_t *name = ((Text_t*)(codepoint_names.data + i*codepoint_names.stride));
        const char *name_str = Text$as_c_string(*name);
        ucs4_t codepoint = unicode_name_character(name_str);
        if (codepoint != UNINAME_INVALID)
            Array$insert(&codepoints, &codepoint, I_small(0), sizeof(ucs4_t));
    }
    return Text$from_codepoints(codepoints);
}

public Text_t Text$from_bytes(Array_t bytes)
{
    if (bytes.stride != sizeof(int8_t))
        Array$compact(&bytes, sizeof(int8_t));

    int8_t nul = 0;
    Array$insert(&bytes, &nul, I_small(0), sizeof(int8_t));
    return Text$from_str(bytes.data);
}

public Array_t Text$lines(Text_t text)
{
    Array_t lines = {};
    text_iter_t state = {0, 0};
    for (int64_t i = 0, line_start = 0; i < text.length; i++) {
        int32_t grapheme = _get_grapheme(text, &state, i);
        if (grapheme == '\r' && _get_grapheme(text, &state, i + 1) == '\n') { // CRLF
            Text_t line = Text$slice(text, I(line_start+1), I(i));
            Array$insert(&lines, &line, I_small(0), sizeof(Text_t));
            i += 1; // skip one extra for CR
            line_start = i + 1;
        } else if (grapheme == '\n') { // newline
            Text_t line = Text$slice(text, I(line_start+1), I(i));
            Array$insert(&lines, &line, I_small(0), sizeof(Text_t));
            line_start = i + 1;
        } else if (i == text.length-1 && line_start != i) { // last line
            Text_t line = Text$slice(text, I(line_start+1), I(i+1));
            Array$insert(&lines, &line, I_small(0), sizeof(Text_t));
        }
    }
    return lines;
}

public const TypeInfo Text$info = {
    .size=sizeof(Text_t),
    .align=__alignof__(Text_t),
    .tag=TextInfo,
    .TextInfo={.lang="Text"},
};

public Pattern_t Pattern$escape_text(Text_t text)
{
    // TODO: optimize for ASCII and short strings
    Array_t graphemes = {.atomic=1};
#define add_char(c) Array$insert_value(&graphemes, (ucs4_t)c, I_small(0), sizeof(ucs4_t))
#define add_str(s) ({ for (const char *_c = s; *_c; ++_c) Array$insert_value(&graphemes, (ucs4_t)*_c, I_small(0), sizeof(ucs4_t)); })
    text_iter_t state = {0, 0};
    for (int64_t i = 0; i < text.length; i++) {
        int32_t g = _get_grapheme(text, &state, i);
        ucs4_t g0 = g < 0 ? GRAPHEME_CODEPOINTS(g)[0] : (ucs4_t)g;

        if (g == '{') {
            add_str("{1{}");
        } else if (g0 == '?'
                   || uc_is_property_quotation_mark(g0)
                   || (uc_is_property_paired_punctuation(g0) && uc_is_property_left_of_pair(g0))) {
            add_char('{');
            add_char('1');
            add_char(g);
            add_char('}');
        } else {
            add_char(g);
        }
    }
    return (Text_t){.length=graphemes.length, .tag=TEXT_GRAPHEMES, .graphemes=graphemes.data};
#undef add_str
#undef add_char
#undef add_escaped
}

public const TypeInfo Pattern$info = {
    .size=sizeof(Pattern_t),
    .align=__alignof__(Pattern_t),
    .tag=TextInfo,
    .TextInfo={.lang="Pattern"},
};

// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0