Initial WIP first past

author: Bruce Hill <bruce@bruce-hill.com> 2024-09-02 18:47:39 -0400
committer: Bruce Hill <bruce@bruce-hill.com> 2024-09-02 18:47:39 -0400
commit: 61e482f6f36aee6f72392a6188f2ec5c858b88fd (patch)
tree: bea4123fcc62dd834405ae89ce9fe260e90a0023 /builtins/text.c
parent: f0f8f218703ebb4512b3cd3f9e06b86a7d9861b0 (diff)
1 files changed, 1317 insertions, 280 deletions
diff --git a/builtins/text.c b/builtins/text.c
index 966018f1..ff709e02 100644
--- a/builtins/text.c
+++ b/builtins/text.c
@@ -1,417 +1,1454 @@
 // Type info and methods for Text datatype, which uses the Boehm "cord" library
 // and libunistr
+
 #include <assert.h>
 #include <ctype.h>
 #include <err.h>
 #include <gc.h>
-#include <gc/cord.h>
 #include <gmp.h>
 #include <limits.h>
+#include <printf.h>
 #include <readline/history.h>
 #include <readline/readline.h>
 #include <stdbool.h>
 #include <stdint.h>
 #include <stdlib.h>
 #include <sys/param.h>
+
 #include <unicase.h>
+#include <unictype.h>
 #include <unigbrk.h>
 #include <uniname.h>
 #include <uninorm.h>
+#include <unistd.h>
+#include <unistdio.h>
 #include <unistr.h>
 
 #include "array.h"
 #include "functions.h"
-#include "halfsiphash.h"
 #include "integers.h"
 #include "text.h"
 #include "types.h"
 
-#define CLAMP(x, lo, hi) MIN(hi, MAX(x,lo))
+static struct {
+    size_t num_codepoints;
+    const uint32_t *codepoints;
+} synthetic_graphemes[1024] = {};
+
+static int32_t num_synthetic_graphemes = 0;
+
+static int32_t get_grapheme(Text_t text, int64_t index);
+
+typedef struct {
+    int64_t subtext, sum_of_previous_subtexts;
+} iteration_state_t;
 
-static inline uint8_t *_normalize(CORD str, uint8_t *buf, size_t *len)
+static int32_t _next_grapheme(Text_t text, iteration_state_t *state, int64_t index);
+
+int32_t find_synthetic_grapheme(const uint32_t *codepoints, size_t len)
 {
-    const uint8_t *str_u8 = (const uint8_t*)CORD_to_const_char_star(str);
-    uint8_t *normalized = u8_normalize(UNINORM_NFD, str_u8, strlen((char*)str_u8)+1, buf, len);
-    if (!normalized) errx(1, "Unicode normalization error!");
-    return normalized;
+    int32_t lo = 0, hi = num_synthetic_graphemes;
+    while (lo <= hi) {
+        int32_t mid = (lo + hi) / 2;
+        int32_t cmp = (synthetic_graphemes[mid].num_codepoints > len) - (synthetic_graphemes[mid].num_codepoints < len);
+        if (cmp == 0)
+            cmp = memcmp(synthetic_graphemes[mid].codepoints, codepoints, sizeof(uint32_t[len]));
+
+        if (cmp == 0)
+            return mid;
+        else if (cmp < 0)
+            lo = mid + 1;    
+        else if (cmp > 0)
+            hi = mid - 1;
+    }
+    return hi;
 }
 
-public CORD Text$as_text(const void *text, bool colorize, const TypeInfo *info)
+int32_t get_synthetic_grapheme(const uint32_t *codepoints, size_t len)
 {
-    if (!text) return info->TextInfo.lang;
-    CORD ret = Text$quoted(*(CORD*)text, colorize);
-    if (!streq(info->TextInfo.lang, "Text"))
-        ret = colorize ? CORD_all("\x1b[1m$", info->TextInfo.lang, "\x1b[m", ret) : CORD_all("$", info->TextInfo.lang, ret);
-    return ret;
+    int32_t index = find_synthetic_grapheme(codepoints, len);
+    if (index < num_synthetic_graphemes
+        && synthetic_graphemes[index].num_codepoints == len
+        && memcmp(synthetic_graphemes[index].codepoints, codepoints, len) == 0) {
+        return -(index+1);
+    } else {
+        if (num_synthetic_graphemes > 0)
+            memmove(&synthetic_graphemes[index], &synthetic_graphemes[index + 1], num_synthetic_graphemes - index);
+
+        uint32_t *buf = GC_MALLOC_ATOMIC(sizeof(uint32_t[len]));
+        memcpy(buf, codepoints, sizeof(uint32_t[len]));
+        synthetic_graphemes[index].codepoints = buf;
+        synthetic_graphemes[index].num_codepoints = len;
+
+        ++num_synthetic_graphemes;
+        return -(index+1);
+    }
 }
 
-public CORD Text$quoted(CORD str, bool colorize)
-{
-    // Note: it's important to have unicode strings not get broken up with
-    // escapes, otherwise they won't print right.
-    if (colorize) {
-        CORD quoted = "\x1b[35m\"";
-        CORD_pos i;
-        CORD_FOR(i, str) {
-            char c = CORD_pos_fetch(i);
-            switch (c) {
-#define BACKSLASHED(esc) "\x1b[34m\\\x1b[1m" esc "\x1b[0;35m"
-            case '\a': quoted = CORD_cat(quoted, BACKSLASHED("a")); break;
-            case '\b': quoted = CORD_cat(quoted, BACKSLASHED("b")); break;
-            case '\x1b': quoted = CORD_cat(quoted, BACKSLASHED("e")); break;
-            case '\f': quoted = CORD_cat(quoted, BACKSLASHED("f")); break;
-            case '\n': quoted = CORD_cat(quoted, BACKSLASHED("n")); break;
-            case '\r': quoted = CORD_cat(quoted, BACKSLASHED("r")); break;
-            case '\t': quoted = CORD_cat(quoted, BACKSLASHED("t")); break;
-            case '\v': quoted = CORD_cat(quoted, BACKSLASHED("v")); break;
-            case '"': quoted = CORD_cat(quoted, BACKSLASHED("\"")); break;
-            case '\\': quoted = CORD_cat(quoted, BACKSLASHED("\\")); break;
-            case '\x00' ... '\x06': case '\x0E' ... '\x1A':
-            case '\x1C' ... '\x1F': case '\x7F' ... '\x7F':
-                CORD_sprintf(&quoted, "%r" BACKSLASHED("x%02X"), quoted, c);
-                break;
-            default: quoted = CORD_cat_char(quoted, c); break;
-#undef BACKSLASHED
+static inline size_t num_subtexts(Text_t t)
+{
+    if (t.tag != TEXT_SUBTEXT) return 1;
+    size_t len = t.length;
+    size_t n = 0;
+    while (len > 0) {
+        len -= t.subtexts[n].length;
+        ++n;
+    }
+    return n;
+}
+
+int text_visualize(FILE *stream, Text_t t)
+{
+    switch (t.tag) {
+    case TEXT_SHORT_ASCII: return fprintf(stream, "<ascii length=%ld>%.*s</ascii>", t.length, t.length, t.short_ascii);
+    case TEXT_ASCII: return fprintf(stream, "<ascii length=%ld>%.*s</ascii>", t.length, t.length, t.ascii);
+    case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: {
+        int printed = fprintf(stream, "<graphemes length=%ld>", t.length);
+        printed += Text$print(stream, t);
+        printed += fprintf(stream, "</graphemes>");
+        return printed;
+    }
+    case TEXT_SUBTEXT: {
+        int printed = fprintf(stream, "<text length=%ld>", t.length);
+        size_t to_print = t.length;
+        for (int i = 0; to_print > 0; ++i) {
+            printed += fprintf(stream, "\n  ");
+            printed += text_visualize(stream, t.subtexts[i]);
+            to_print -= t.subtexts[i].length;
+            if (t.subtexts[i].length == 0) break;
+        }
+        printed += fprintf(stream, "\n</text>");
+        return printed;
+    }
+    default: return 0;
+    }
+}
+
+public int Text$print(FILE *stream, Text_t t)
+{
+    switch (t.tag) {
+    case TEXT_SHORT_ASCII: return fwrite(t.short_ascii, sizeof(char), t.length, stream);
+    case TEXT_ASCII: return fwrite(t.ascii, sizeof(char), t.length, stream);
+    case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: {
+        int32_t *graphemes = t.tag == TEXT_SHORT_GRAPHEMES ? t.short_graphemes : t.graphemes;
+        int written = 0;
+        for (int64_t i = 0; i < t.length; i++) {
+            int32_t grapheme = graphemes[i];
+            if (grapheme >= 0) {
+                written += ulc_fprintf(stream, "%.*llU", 1, &grapheme);
+            } else {
+                written += ulc_fprintf(
+                    stream, "%.*llU",
+                    synthetic_graphemes[-grapheme-1].num_codepoints,
+                    synthetic_graphemes[-grapheme-1].codepoints);
             }
         }
-        quoted = CORD_cat(quoted, "\"\x1b[m");
-        return quoted;
+        return written;
+    }
+    case TEXT_SUBTEXT: {
+        int written = 0;
+        int i = 0;
+        for (size_t to_print = t.length; to_print > 0; to_print -= t.subtexts[i].length, ++i)
+            written += Text$print(stream, t.subtexts[i]);
+        return written;
+    }
+    default: return 0;
+    }
+}
+
+static Text_t concat2(Text_t a, Text_t b)
+{
+    if (a.length == 0) return b;
+    if (b.length == 0) return a;
+
+    if (a.tag == TEXT_SUBTEXT && b.tag == TEXT_SUBTEXT) {
+        size_t na = num_subtexts(a);
+        size_t nb = num_subtexts(b);
+        Text_t ret = {
+            .length=a.length + b.length,
+            .tag=TEXT_SUBTEXT,
+            .subtexts=GC_MALLOC(sizeof(Text_t[na + nb])),
+        };
+        memcpy(&ret.subtexts[0], a.subtexts, sizeof(Text_t[na]));
+        memcpy(&ret.subtexts[na], b.subtexts, sizeof(Text_t[nb]));
+        return ret;
+    } else if (a.tag == TEXT_SUBTEXT) {
+        size_t n = num_subtexts(a);
+        Text_t ret = {
+            .length=a.length + b.length,
+            .tag=TEXT_SUBTEXT,
+            .subtexts=GC_MALLOC(sizeof(Text_t[n + 1])),
+        };
+        memcpy(ret.subtexts, a.subtexts, sizeof(Text_t[n]));
+        ret.subtexts[n] = b;
+        return ret;
+    } else if (b.tag == TEXT_SUBTEXT) {
+        size_t n = num_subtexts(b);
+        Text_t ret = {
+            .length=a.length + b.length,
+            .tag=TEXT_SUBTEXT,
+            .subtexts=GC_MALLOC(sizeof(Text_t[n + 1])),
+        };
+        ret.subtexts[0] = a;
+        memcpy(&ret.subtexts[1], b.subtexts, sizeof(Text_t[n]));
+        return ret;
     } else {
-        CORD quoted = "\"";
-        CORD_pos i;
-        CORD_FOR(i, str) {
-            char c = CORD_pos_fetch(i);
-            switch (c) {
-            case '\a': quoted = CORD_cat(quoted, "\\a"); break;
-            case '\b': quoted = CORD_cat(quoted, "\\b"); break;
-            case '\x1b': quoted = CORD_cat(quoted, "\\e"); break;
-            case '\f': quoted = CORD_cat(quoted, "\\f"); break;
-            case '\n': quoted = CORD_cat(quoted, "\\n"); break;
-            case '\r': quoted = CORD_cat(quoted, "\\r"); break;
-            case '\t': quoted = CORD_cat(quoted, "\\t"); break;
-            case '\v': quoted = CORD_cat(quoted, "\\v"); break;
-            case '"': quoted = CORD_cat(quoted, "\\\""); break;
-            case '\\': quoted = CORD_cat(quoted, "\\\\"); break;
-            case '\x00' ... '\x06': case '\x0E' ... '\x1A':
-            case '\x1C' ... '\x1F': case '\x7F' ... '\x7F':
-                CORD_sprintf(&quoted, "%r\\x%02X", quoted, c);
-                break;
-            default: quoted = CORD_cat_char(quoted, c); break;
+        Text_t ret = {
+            .length=a.length + b.length,
+            .tag=TEXT_SUBTEXT,
+            .subtexts=GC_MALLOC(sizeof(Text_t[2])),
+        };
+        ret.subtexts[0] = a;
+        ret.subtexts[1] = b;
+        return ret;
+    }
+}
+
+public Text_t Text$_concat(int n, Text_t items[n])
+{
+    if (n == 0) return (Text_t){.length=0};
+    if (n == 1) return items[0];
+    if (n == 2) return concat2(items[0], items[1]);
+
+    int64_t len = 0, subtexts = 0;
+    for (int i = 0; i < n; i++) {
+        len += items[i].length;
+        subtexts += num_subtexts(items[i]);
+    }
+
+    Text_t ret = {
+        .length=len,
+        .tag=TEXT_SUBTEXT,
+        .subtexts=GC_MALLOC(sizeof(Text_t[len])),
+    };
+    int64_t sub_i = 0;
+    for (int i = 0; i < n; i++) {
+        if (items[i].tag == TEXT_SUBTEXT) {
+            for (int64_t j = 0, remainder = items[i].length; remainder > 0; j++) {
+                ret.subtexts[sub_i++] = items[i].subtexts[j];
+                remainder -= items[i].subtexts[j].length;
             }
+        } else {
+            ret.subtexts[sub_i++] = items[i];
         }
-        quoted = CORD_cat_char(quoted, '"');
-        return quoted;
     }
+    return ret;
 }
 
-public int Text$compare(const CORD *x, const CORD *y)
+public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int)
 {
-    uint8_t *xx = (uint8_t*)CORD_to_const_char_star(*x);
-    uint8_t *yy = (uint8_t*)CORD_to_const_char_star(*y);
-    int result = 0;
-    if (u8_normcmp(xx, strlen((char*)xx), yy, strlen((char*)yy), UNINORM_NFD, &result))
-        fail("Something went wrong while comparing text");
-    return result;
+    int64_t first = Int_to_Int64(first_int, false)-1;
+    int64_t last = Int_to_Int64(last_int, false)-1;
+    if (first == 0) errx(1, "Invalid index: 0");
+    if (last == 0) return (Text_t){.length=0};
+
+    if (first < 0) first = text.length + first + 1;
+    if (last < 0) last = text.length + last + 1;
+
+    if (last > text.length) last = text.length;
+
+    if (first > text.length || last < first)
+        return (Text_t){.length=0};
+
+    if (first == 1 && last == text.length)
+        return text;
+
+    switch (text.tag) {
+    case TEXT_SHORT_ASCII: {
+        Text_t ret = text;
+        ret.length = last - first + 1;
+        if (first > 1)
+            memcpy(ret.short_ascii, text.short_ascii + (first-1), ret.length);
+        return ret;
+    }
+    case TEXT_ASCII: {
+        Text_t ret = {
+            .tag=TEXT_ASCII,
+            .length=last - first + 1,
+            .ascii=text.ascii + (first-1),
+        };
+        return ret;
+    }
+    case TEXT_SHORT_GRAPHEMES: {
+        assert((first == 1 && last == 1) || (first == 2 && last == 2));
+        Text_t ret = {
+            .tag=TEXT_SHORT_GRAPHEMES,
+            .length=1,
+            .short_graphemes={text.short_graphemes[first-1]},
+        };
+        return ret;
+    }
+    case TEXT_GRAPHEMES: {
+        Text_t ret = {
+            .tag=TEXT_GRAPHEMES,
+            .length=last - first + 1,
+            .graphemes=text.graphemes + (first-1),
+        };
+        return ret;
+    }
+    case TEXT_SUBTEXT: {
+        Text_t *subtexts = text.subtexts;
+        while (first > subtexts[0].length) {
+            first -= subtexts[0].length;
+            last -= subtexts[0].length;
+            ++subtexts;
+        }
+
+        int64_t needed_len = (last - first) + 1;
+        int64_t num_subtexts = 0;
+        for (int64_t included = 0; included < needed_len; ) {
+            if (included == 0)
+                included += subtexts[num_subtexts].length - first + 1;
+            else
+                included += subtexts[num_subtexts].length;
+            num_subtexts += 1;
+        }
+        if (num_subtexts == 1)
+            return Text$slice(subtexts[0], Int64_to_Int(first+1), Int64_to_Int(last+1));
+
+        Text_t ret = {
+            .length=needed_len,
+            .tag=TEXT_SUBTEXT,
+            .subtexts=GC_MALLOC(sizeof(Text_t[num_subtexts])),
+        };
+        for (int64_t i = 0; i < num_subtexts; i++) {
+            ret.subtexts[i] = Text$slice(subtexts[i], Int64_to_Int(first+1), Int64_to_Int(last+1));
+            first = 1;
+            needed_len -= ret.subtexts[i].length;
+            last = first + needed_len - 1;
+        }
+        return ret;
+    }
+    default: errx(1, "Invalid tag");
+    }
 }
 
-public bool Text$equal(const CORD *x, const CORD *y)
+Text_t text_from_u32(uint32_t *codepoints, size_t num_codepoints, bool normalize)
 {
-    return Text$compare(x, y) == 0;
+    uint32_t norm_buf[128];
+    if (normalize) {
+        size_t norm_length = sizeof(norm_buf)/sizeof(norm_buf[0]);
+        uint32_t *normalized = u32_normalize(UNINORM_NFC, codepoints, num_codepoints, norm_buf, &norm_length);
+        codepoints = normalized;
+        num_codepoints = norm_length;
+    }
+
+    char breaks[num_codepoints];
+    u32_grapheme_breaks(codepoints, num_codepoints, breaks);
+
+    Text_t ret = {
+        .length=0,
+        .tag=TEXT_SHORT_GRAPHEMES,
+    };
+    const uint32_t *src = codepoints;
+    int32_t *dest = &ret.short_graphemes[0];
+    while (src != &codepoints[num_codepoints]) {
+        ++ret.length;
+
+        if (ret.tag == TEXT_SHORT_GRAPHEMES && ret.length > 2) {
+            int32_t *graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[num_codepoints])); // May be a slight overallocation
+            graphemes[0] = ret.short_graphemes[0];
+            graphemes[1] = ret.short_graphemes[1];
+            ret.tag = TEXT_GRAPHEMES;
+            ret.graphemes = graphemes;
+            dest = &graphemes[2];
+        }
+
+        const uint32_t *next = u32_grapheme_next(src, &codepoints[num_codepoints]);
+        if (next == &src[1]) {
+            *dest = (int32_t)*src;
+        } else {
+            // Synthetic grapheme
+            *dest = get_synthetic_grapheme(src, next-src);
+        }
+        ++dest;
+        src = next;
+    }
+    if (normalize && codepoints != norm_buf) free(codepoints);
+    return ret;
 }
 
-public uint32_t Text$hash(const CORD *cord)
+public Text_t Text$from_str(const char *str)
 {
-    if (!*cord) return 0;
+    size_t ascii_span = 0;
+    while (str[ascii_span] && isascii(str[ascii_span]))
+        ascii_span++;
 
-    uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf);
-    uint8_t *normalized = _normalize(*cord, buf, &norm_len);
+    if (str[ascii_span] == '\0') { // All ASCII
+        Text_t ret = {.length=ascii_span};
+        if (ascii_span <= 8) {
+            ret.tag = TEXT_SHORT_ASCII;
+            for (size_t i = 0; i < ascii_span; i++)
+                ret.short_ascii[i] = str[i];
+        } else {
+            ret.tag = TEXT_ASCII;
+            ret.ascii = str;
+        }
+        return ret;
+    } else {
+        uint32_t buf[128];
+        size_t length = sizeof(buf)/sizeof(buf[0]);
+        uint32_t *codepoints = u8_to_u32((uint8_t*)str, ascii_span + strlen(str + ascii_span), buf, &length);
+        Text_t ret = text_from_u32(codepoints, length, true);
+        if (codepoints != buf) free(codepoints);
+        return ret;
+    }
+}
+
+static void u8_buf_append(Text_t text, char **buf, size_t *capacity, int64_t *i)
+{
+    switch (text.tag) {
+    case TEXT_ASCII: case TEXT_SHORT_ASCII: {
+        if (*i + text.length > (int64_t)*capacity) {
+            *capacity = *i + text.length;
+            *buf = GC_REALLOC(*buf, *capacity);
+        }
+
+        const char *bytes = text.tag == TEXT_ASCII ? text.ascii : text.short_ascii;
+        memcpy(*buf + *i, bytes, text.length);
+        *i += text.length;
+        break;
+    }
+    case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: {
+        const int32_t *graphemes = text.tag == TEXT_GRAPHEMES ? text.graphemes : text.short_graphemes;
+        for (int64_t g = 0; g + 1 < text.length; g++) {
+            const uint32_t *codepoints = graphemes[g] < 0 ? synthetic_graphemes[-graphemes[g]-1].codepoints : (uint32_t*)&graphemes[g];
+            size_t num_codepoints = graphemes[g] < 0 ? synthetic_graphemes[-graphemes[g]-1].num_codepoints : 1;
+            uint8_t u8_buf[64];
+            size_t u8_len = sizeof(u8_buf);
+            uint8_t *u8 = u32_to_u8(codepoints, num_codepoints, u8_buf, &u8_len);
+
+            if (*i + (int64_t)u8_len > (int64_t)*capacity) {
+                *capacity = *i + u8_len;
+                *buf = GC_REALLOC(*buf, *capacity);
+            }
 
-    uint32_t hash;
-    halfsiphash(normalized, norm_len, TOMO_HASH_KEY, (uint8_t*)&hash, sizeof(hash));
-    if (normalized != buf) free(normalized);
-    return hash;
+            memcpy(*buf + *i, u8, u8_len);
+            *i += u8_len;
+            if (u8 != u8_buf) free(u8);
+        }
+        break;
+    }
+    case TEXT_SUBTEXT: {
+        for (int64_t s = 0, remaining = text.length; remaining > 0; s++) {
+            u8_buf_append(text.subtexts[s], buf, capacity, i);
+            remaining -= text.subtexts[s].length;
+        }
+        break;
+    }
+    default: break;
+    }
 }
 
-public CORD Text$upper(CORD str)
+public const char *Text$as_c_string(Text_t text)
 {
-    if (!str) return str;
-    size_t len = strlen(str) + 1;
-    uint8_t *dest = GC_MALLOC_ATOMIC(len);
-    dest[len-1] = 0;
-    return (CORD)u8_toupper((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len);
+    size_t capacity = text.length;
+    char *buf = GC_MALLOC_ATOMIC(capacity);
+    int64_t i = 0;
+    u8_buf_append(text, &buf, &capacity, &i);
+    return buf;
 }
 
-public CORD Text$lower(CORD str)
+uint32_t *text_to_u32(Text_t text, size_t *length)
 {
-    if (!str) return str;
-    size_t len = strlen(str) + 1;
-    uint8_t *dest = GC_MALLOC_ATOMIC(len);
-    dest[len-1] = 0;
-    return (CORD)u8_tolower((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len);
+    // Precalculate size:
+    size_t len = 0;
+    if (text.tag == TEXT_ASCII) {
+        len = text.length;
+    } else {
+        iteration_state_t state = {0, 0};
+        for (int64_t i = 0; i < text.length; i++) {
+            int32_t grapheme = _next_grapheme(text, &state, i);
+            if (grapheme < 0)
+                len += synthetic_graphemes[-grapheme-1].num_codepoints;
+            else
+                len += 1;
+        }
+    }
+    assert(length);
+    *length = len;
+
+    // Copy over codepoints one grapheme cluster at a time:
+    uint32_t *ret = GC_MALLOC_ATOMIC(sizeof(uint32_t[len]));
+    uint32_t *dest = ret;
+    iteration_state_t state = {0, 0};
+    for (int64_t i = 0; i < text.length; i++) {
+        int32_t grapheme = _next_grapheme(text, &state, i);
+        if (grapheme < 0) {
+            const uint32_t *codepoints = synthetic_graphemes[-grapheme-1].codepoints;
+            size_t num_codepoints = synthetic_graphemes[-grapheme-1].num_codepoints;
+            for (size_t j = 0; j < num_codepoints; j++)
+                *(dest++) = codepoints[j];
+        } else {
+            *(dest++) = (uint32_t)grapheme;
+        }
+    }
+    return ret;
 }
 
-public CORD Text$title(CORD str)
+#include "siphash.c"
+
+public uint64_t Text$hash(Text_t *text)
 {
-    if (!str) return str;
-    size_t len = strlen(str) + 1;
-    uint8_t *dest = GC_MALLOC_ATOMIC(len);
-    dest[len-1] = 0;
-    return (CORD)u8_totitle((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len);
+    if (text->hash != 0) return text->hash;
+    siphash sh;
+    siphashinit(&sh, sizeof(int32_t[text->length]), (uint64_t*)TOMO_HASH_KEY);
+
+    union {
+        int32_t chunks[2];
+        uint64_t whole;
+    } tmp;
+    switch (text->tag) {
+    case TEXT_ASCII: case TEXT_SHORT_ASCII: {
+        const char *bytes = text->tag == TEXT_ASCII ? text->ascii : text->short_ascii;
+        for (int64_t i = 0; i + 1 < text->length; i++) {
+            tmp.chunks[0] = (int32_t)bytes[i];
+            tmp.chunks[1] = (int32_t)bytes[i+1];
+            siphashadd64bits(&sh, tmp.whole);
+        }
+        int32_t last = text->length & 0x1 ? (int32_t)bytes[text->length-1] : 0; // Odd number of graphemes
+        text->hash = siphashfinish_last_part(&sh, (uint64_t)last);
+        break;
+    }
+    case TEXT_GRAPHEMES: {
+        const int32_t *graphemes = text->graphemes;
+        for (int64_t i = 0; i + 1 < text->length; i++) {
+            tmp.chunks[0] = graphemes[i];
+            tmp.chunks[1] = graphemes[i];
+            siphashadd64bits(&sh, tmp.whole);
+        }
+        int32_t last = text->length & 0x1 ? graphemes[text->length-1] : 0; // Odd number of graphemes
+        text->hash = siphashfinish_last_part(&sh, (uint64_t)last);
+        break;
+    }
+    case TEXT_SHORT_GRAPHEMES: {
+        tmp.chunks[0] = text->short_graphemes[0];
+        if (text->length > 1)
+            tmp.chunks[1] = text->short_graphemes[1];
+        text->hash = siphashfinish_last_part(&sh, (uint64_t)tmp.whole);
+        break;
+    }
+    case TEXT_SUBTEXT: {
+        int32_t leftover = 0;
+        for (int64_t sub_i = 0, to_hash = text->length; to_hash > 0; ) {
+            Text_t subtext = text->subtexts[sub_i];
+            if (subtext.tag == TEXT_ASCII || subtext.tag == TEXT_SHORT_ASCII) {
+                const char *bytes = subtext.tag == TEXT_ASCII ? subtext.ascii : subtext.short_ascii;
+                int64_t grapheme = 0;
+                if (leftover) {
+                    tmp.chunks[0] = leftover;
+                    tmp.chunks[1] = (int32_t)bytes[0];
+                    siphashadd64bits(&sh, tmp.whole);
+                    grapheme += 1;
+                }
+                for (; grapheme + 1 < subtext.length; grapheme += 2) {
+                    tmp.chunks[0] = (int32_t)bytes[grapheme];
+                    tmp.chunks[1] = (int32_t)bytes[grapheme+1];
+                    siphashadd64bits(&sh, tmp.whole);
+                }
+                leftover = grapheme < subtext.length ? (int32_t)bytes[grapheme] : 0;
+            } else if (subtext.tag == TEXT_SHORT_GRAPHEMES) {
+                if (leftover) {
+                    tmp.chunks[0] = leftover;
+                    tmp.chunks[1] = subtext.short_graphemes[0];
+                    siphashadd64bits(&sh, tmp.whole);
+                    leftover = subtext.length > 1 ? subtext.short_graphemes[1] : 0;
+                } else if (subtext.length == 1) {
+                    leftover = subtext.short_graphemes[0];
+                } else {
+                    tmp.chunks[0] = subtext.short_graphemes[0];
+                    tmp.chunks[1] = subtext.short_graphemes[1];
+                    siphashadd64bits(&sh, tmp.whole);
+                }
+            } else if (subtext.tag == TEXT_GRAPHEMES) {
+                int32_t *graphemes = subtext.graphemes;
+                int64_t grapheme = 0;
+                if (leftover) {
+                    tmp.chunks[0] = leftover;
+                    tmp.chunks[1] = graphemes[0];
+                    siphashadd64bits(&sh, tmp.whole);
+                    grapheme += 1;
+                }
+                for (; grapheme + 1 < subtext.length; grapheme += 2) {
+                    tmp.chunks[0] = graphemes[grapheme];
+                    tmp.chunks[1] = graphemes[grapheme+1];
+                    siphashadd64bits(&sh, tmp.whole);
+                }
+                leftover = grapheme < subtext.length ? graphemes[grapheme] : 0;
+            }
+        
+            to_hash -= text->subtexts[sub_i].length;
+
+            ++sub_i;
+        }
+
+        text->hash = siphashfinish_last_part(&sh, leftover);
+        break;
+    }
+    default: errx(1, "Invalid text");
+    }
+
+    if (text->hash == 0)
+        text->hash = 1;
+
+    return text->hash;
 }
 
-public bool Text$has(CORD str, CORD target, Where_t where)
+int32_t _next_grapheme(Text_t text, iteration_state_t *state, int64_t index)
 {
-    if (!target) return true;
-    if (!str) return false;
+    switch (text.tag) {
+    case TEXT_ASCII: return index < text.length ? (int32_t)text.ascii[index] : 0;
+    case TEXT_SHORT_ASCII: return index < text.length ? (int32_t)text.short_ascii[index] : 0;
+    case TEXT_GRAPHEMES: return index < text.length ? text.graphemes[index] : 0;
+    case TEXT_SHORT_GRAPHEMES: return index < text.length ? text.short_graphemes[index] : 0;
+    case TEXT_SUBTEXT: {
+        iteration_state_t backup_state = {0, 0};
+        if (!state) state = &backup_state;
 
-    uint8_t str_buf[128] = {0}; size_t str_norm_len = sizeof(str_buf);
-    uint8_t *str_normalized = _normalize(str, str_buf, &str_norm_len);
+        if (index < 0 || index >= text.length)
+            return 0;
 
-    uint8_t target_buf[128] = {0}; size_t target_norm_len = sizeof(target_buf);
-    uint8_t *target_normalized = _normalize(target, target_buf, &target_norm_len);
+        while (index < state->sum_of_previous_subtexts && state->subtext > 0) {
+            state->sum_of_previous_subtexts -= text.subtexts[state->subtext].length;
+            state->subtext -= 1;
+        }
+        for (;;) {
+            if (index < state->sum_of_previous_subtexts + text.subtexts[state->subtext].length)
+                return _next_grapheme(text.subtexts[state->subtext], NULL, index);
+            state->sum_of_previous_subtexts += text.subtexts[state->subtext].length;
+            state->subtext += 1;
+        }
+        return 0;
+    }
+    default: errx(1, "Invalid text");
+    }
+    return 0;
+}
 
-    if (target_norm_len > str_norm_len) return false;
+int32_t get_grapheme(Text_t text, int64_t index)
+{
+    iteration_state_t state = {0, 0};
+    return _next_grapheme(text, &state, index);
+}
 
-    bool ret;
-    if (where.tag == $tag$Where$Start) {
-        ret = (u8_strncmp(str_normalized, target_normalized, target_norm_len-1) == 0);
-    } else if (where.tag == $tag$Where$End) {
-        ret = (u8_strcmp(str_normalized + str_norm_len - target_norm_len, target_normalized) == 0);
-    } else {
-        assert(where.tag == $tag$Where$Anywhere);
-        ret = (u8_strstr(str_normalized, target_normalized) != NULL);
+int32_t Text$compare(const Text_t *a, const Text_t *b)
+{
+    int64_t len = MAX(a->length, b->length);
+    iteration_state_t a_state = {0, 0}, b_state = {0, 0};
+    for (int64_t i = 0; i < len; i++) {
+        int32_t ai = _next_grapheme(*a, &a_state, i);
+        int32_t bi = _next_grapheme(*b, &b_state, i);
+        if (ai == bi) continue;
+        int32_t cmp;
+        if (ai > 0 && bi > 0) {
+            cmp = u32_cmp((uint32_t*)&ai, (uint32_t*)&bi, 1);
+        } else if (ai > 0) {
+            cmp = u32_cmp2(
+                (uint32_t*)&ai, 1,
+                synthetic_graphemes[-bi-1].codepoints,
+                synthetic_graphemes[-bi-1].num_codepoints);
+        } else if (bi > 0) {
+            cmp = u32_cmp2(
+                synthetic_graphemes[-ai-1].codepoints,
+                synthetic_graphemes[-ai-1].num_codepoints,
+                (uint32_t*)&bi, 1);
+        } else {
+            cmp = u32_cmp2(
+                synthetic_graphemes[-ai-1].codepoints,
+                synthetic_graphemes[-ai-1].num_codepoints,
+                synthetic_graphemes[-bi-1].codepoints,
+                synthetic_graphemes[-bi-1].num_codepoints);
+        }
+        if (cmp != 0) return cmp;
     }
+    return 0;
+}
 
-    if (str_normalized != str_buf) free(str_normalized);
-    if (target_normalized != target_buf) free(target_normalized);
-    return ret;
+public bool Text$equal(const Text_t *a, const Text_t *b)
+{
+    if (a->length != b->length || (a->hash != 0 && b->hash != 0 && a->hash != b->hash))
+        return false;
+    int64_t len = a->length;
+    iteration_state_t a_state = {0, 0}, b_state = {0, 0};
+    for (int64_t i = 0; i < len; i++) {
+        int32_t ai = _next_grapheme(*a, &a_state, i);
+        int32_t bi = _next_grapheme(*b, &b_state, i);
+        if (ai != bi) return false;
+    }
+    return true;
 }
 
-public CORD Text$without(CORD str, CORD target, Where_t where)
+public bool Text$equal_ignoring_case(Text_t a, Text_t b)
 {
-    if (!str || !target) return str;
+    if (a.length != b.length)
+        return false;
+    int64_t len = a.length;
+    iteration_state_t a_state = {0, 0}, b_state = {0, 0};
+    const char *language = uc_locale_language();
+    for (int64_t i = 0; i < len; i++) {
+        int32_t ai = _next_grapheme(a, &a_state, i);
+        int32_t bi = _next_grapheme(b, &b_state, i);
+        if (ai != bi) {
+            const uint32_t *a_codepoints = ai >= 0 ? (uint32_t*)&ai : synthetic_graphemes[-ai-1].codepoints;
+            size_t a_len = ai >= 0 ? 1 : synthetic_graphemes[-ai-1].num_codepoints;
 
-    size_t target_len = CORD_len(target);
-    size_t str_len = CORD_len(str);
-    if (where.tag == $tag$Where$Start) {
-        if (CORD_ncmp(str, 0, target, 0, target_len) == 0)
-            return CORD_substr(str, target_len, str_len - target_len);
-        return str;
-    } else if (where.tag == $tag$Where$End) {
-        if (CORD_ncmp(str, str_len-target_len, target, 0, target_len) == 0)
-            return CORD_substr(str, 0, str_len - target_len);
-        return str;
-    } else {
-        CORD ret = CORD_EMPTY;
-        size_t i = 0;
-        for (;;) {
-            size_t match = CORD_str(str, i, target);
-            if (match == CORD_NOT_FOUND) {
-                if (i == 0) return str; // No matches!
-                ret = CORD_cat(ret, CORD_substr(str, i, str_len));
-                break;
-            }
-            ret = CORD_cat(ret, CORD_substr(str, i, (match-i)));
-            i = match + target_len;
+            const uint32_t *b_codepoints = bi >= 0 ? (uint32_t*)&bi : synthetic_graphemes[-bi-1].codepoints;
+            size_t b_len = bi >= 0 ? 1 : synthetic_graphemes[-bi-1].num_codepoints;
+
+            int cmp;
+            (void)u32_casecmp(a_codepoints, a_len, b_codepoints, b_len, language, UNINORM_NFC, &cmp);
+            if (cmp != 0)
+                return false;
         }
-        return ret;
     }
+    return true;
 }
 
-public CORD Text$trimmed(CORD str, CORD skip, Where_t where)
+public Text_t Text$upper(Text_t text)
 {
-    if (!str || !skip) return str;
-    const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(str);
-    const uint8_t *uskip = (const uint8_t*)CORD_to_const_char_star(skip);
-    // TODO: implement proper reverse iteration with u8_prev()
-    if (where.tag == $tag$Where$Start) {
-        size_t span = u8_strspn(ustr, uskip);
-        return (CORD)ustr + span;
-    } else if (where.tag == $tag$Where$End) {
-        size_t len = u8_strlen(ustr);
-        const uint8_t *back = ustr + len;
-        size_t back_span = 0;
-        while (back - back_span > ustr && u8_strspn(back-back_span-1, uskip) > back_span)
-            ++back_span;
-        return CORD_substr((CORD)ustr, 0, len - back_span);
-    } else {
-        size_t span = u8_strspn(ustr, uskip);
-        size_t len = u8_strlen(ustr);
-        const uint8_t *back = ustr + len;
-        size_t back_span = 0;
-        while (back - back_span > ustr + span && u8_strspn(back-back_span-1, uskip) > back_span)
-            ++back_span;
-        return CORD_substr((CORD)(ustr + span), 0, len - span - back_span);
+    size_t length;
+    uint32_t *codepoints = text_to_u32(text, &length);
+    const char *language = uc_locale_language();
+    uint32_t buf[128]; 
+    size_t out_len;
+    uint32_t *upper = u32_toupper(codepoints, length, language, UNINORM_NFC, buf, &out_len);
+    Text_t ret = text_from_u32(upper, out_len, false);
+    if (upper != buf) free(upper);
+    return ret;
+}
+
+public Text_t Text$lower(Text_t text)
+{
+    size_t length;
+    uint32_t *codepoints = text_to_u32(text, &length);
+    const char *language = uc_locale_language();
+    uint32_t buf[128]; 
+    size_t out_len;
+    uint32_t *lower = u32_tolower(codepoints, length, language, UNINORM_NFC, buf, &out_len);
+    Text_t ret = text_from_u32(lower, out_len, false);
+    if (lower != codepoints) free(lower);
+    return ret;
+}
+
+public Text_t Text$title(Text_t text)
+{
+    size_t length;
+    uint32_t *codepoints = text_to_u32(text, &length);
+    const char *language = uc_locale_language();
+    uint32_t buf[128]; 
+    size_t out_len;
+    uint32_t *title = u32_totitle(codepoints, length, language, UNINORM_NFC, buf, &out_len);
+    Text_t ret = text_from_u32(title, out_len, false);
+    if (title != codepoints) free(title);
+    return ret;
+}
+
+static inline void skip_whitespace(Text_t text, int64_t *i)
+{
+    iteration_state_t state = {0, 0};
+    while (*i < text.length) {
+        int32_t grapheme = _next_grapheme(text, &state, *i);
+        if (grapheme > 0 && !uc_is_property_white_space(grapheme))
+            return;
+        *i += 1;
     }
 }
 
-public find_result_t Text$find(CORD str, CORD pat)
+static inline bool match_grapheme(Text_t text, int64_t *i, int32_t grapheme)
 {
-    if (!pat) return (find_result_t){.status=FIND_SUCCESS, .index=1};
-    size_t pos = CORD_str(str, 0, pat);
-    return (pos == CORD_NOT_FOUND) ? (find_result_t){.status=FIND_FAILURE} : (find_result_t){.status=FIND_SUCCESS, .index=(int32_t)pos};
+    if (*i < text.length && get_grapheme(text, *i) == grapheme) {
+        *i += 1;
+        return true;
+    }
+    return false;
 }
 
-public CORD Text$replace(CORD text, CORD pat, CORD replacement, Int_t int_limit)
+static inline bool match_str(Text_t text, int64_t *i, const char *str)
 {
-    if (!text || !pat) return text;
-    CORD ret = CORD_EMPTY;
-    size_t pos = 0, pat_len = CORD_len(pat);
-    int64_t limit = Int_to_Int64(int_limit, false);
-    for (size_t found; limit != 0 && (found=CORD_str(text, pos, pat)) != CORD_NOT_FOUND; --limit) {
-        ret = CORD_all(ret, CORD_substr(text, pos, found - pos), replacement);
-        pos = found + pat_len;
+    iteration_state_t state = {0, 0};
+    int64_t matched = 0;
+    while (matched[str]) {
+        if (*i + matched >= text.length || _next_grapheme(text, &state, *i + matched) != str[matched])
+            return false;
+        matched += 1;
     }
-    size_t str_len = CORD_len(text);
-    return CORD_cat(ret, CORD_substr(text, pos, str_len - pos));
+    *i += matched;
+    return true;
 }
 
-public array_t Text$split(CORD str, CORD split)
+static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop)
 {
-    if (!str) return (array_t){.data=GC_MALLOC(sizeof(CORD)), .atomic=1, .length=1, .stride=sizeof(CORD)};
-    array_t strings = {.stride=sizeof(CORD), .atomic=1};
+    if (*i >= text.length) return false;
+    int32_t grapheme = get_grapheme(text, *i);
+    if (grapheme < 0) // TODO: check every codepoint in the cluster?
+        grapheme = synthetic_graphemes[-grapheme-1].codepoints[0];
 
-    const uint8_t *ustr = (uint8_t*)CORD_to_const_char_star(str);
-    const uint8_t *usplit = (uint8_t*)CORD_to_const_char_star(split);
-    for (int64_t i = 0; ; ) {
-        size_t non_split = u8_strcspn(ustr + i, usplit);
-        CORD chunk = CORD_substr((CORD)ustr, i, non_split);
-        Array$insert(&strings, &chunk, I(0), sizeof(CORD));
+    if (uc_is_property(grapheme, prop)) {
+        *i += 1;
+        return true;
+    }
+    return false;
+}
 
-        i += non_split;
+static int64_t parse_int(Text_t text, int64_t *i)
+{
+    iteration_state_t state = {0, 0};
+    int64_t value = 0;
+    for (;; *i += 1) {
+        int32_t grapheme = _next_grapheme(text, &state, *i);
+        if (grapheme < 0)
+            grapheme = synthetic_graphemes[-grapheme-1].codepoints[0];
+        int digit = uc_digit_value(grapheme);
+        if (digit < 0) break;
+        if (value >= INT64_MAX/10) break;
+        value = 10*value + digit;
+    }
+    return value;
+}
 
-        size_t split_span = u8_strspn(ustr + i, usplit);
-        if (split_span == 0) break;
-        i += split_span;
+const char *get_property_name(Text_t text, int64_t *i)
+{
+    skip_whitespace(text, i);
+    char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
+    char *dest = name;
+    iteration_state_t state = {0, 0};
+    while (*i < text.length) {
+        int32_t grapheme = _next_grapheme(text, &state, *i);
+        if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) {
+            *dest = (char)grapheme;
+            ++dest;
+            if (dest >= name + UNINAME_MAX - 1)
+                break;
+        } else if (dest == name && grapheme >= 0 && grapheme != ']') {
+            // Literal character escape: [..[] --> "LEFT SQUARE BRACKET"
+            name = unicode_character_name(grapheme, name);
+            *i += 1;
+            return name;
+        } else {
+            break;
+        }
+        *i += 1;
     }
-    return strings;
+    if (dest == name) return NULL;
+    *dest = '\0';
+    return name;
 }
 
-public CORD Text$join(CORD glue, array_t pieces)
+#define EAT1(state, cond) ({\
+        int32_t grapheme = _next_grapheme(text, state, text_index); \
+        bool success = (cond); \
+        if (success) text_index += 1; \
+        success; })
+
+#define EAT_MANY(state, cond) ({ int64_t n = 0; while (EAT1(state, cond)) { n += 1; } n; })
+
+int64_t match_email(Text_t text, int64_t text_index)
 {
-    if (pieces.length == 0) return CORD_EMPTY;
+    // email = local "@" domain
+    // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii)
+    // domain = dns-label ("." dns-label)*
+    // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii)
 
-    CORD ret = CORD_EMPTY;
-    for (int64_t i = 0; i < pieces.length; i++) {
-        if (i > 0) ret = CORD_cat(ret, glue);
-        ret = CORD_cat(ret, *(CORD*)((void*)pieces.data + i*pieces.stride));
+    iteration_state_t state = {0, 0};
+    if (text_index > 0) {
+        int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1);
+        if (prev_codepoint < 0)
+            prev_codepoint = synthetic_graphemes[-prev_codepoint-1].codepoints[0];
+        if (uc_is_property_alphabetic(prev_codepoint))
+            return -1;
     }
-    return ret;
+
+    int64_t start_index = text_index;
+
+    // Local part:
+    int64_t local_len = 0;
+    static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~";
+    while (EAT1(&state, (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) {
+        local_len += 1;
+        if (local_len > 64) return -1;
+    }
+    
+    if (!EAT1(&state, grapheme == '@'))
+        return -1;
+
+    // Host
+    int64_t host_len = 0;
+    do {
+        int64_t label_len = 0;
+        while (EAT1(&state, (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) {
+            label_len += 1;
+            if (label_len > 63) return -1;
+        }
+
+        if (label_len == 0)
+            return -1;
+
+        host_len += label_len;
+        if (host_len > 255)
+            return -1;
+        host_len += 1;
+    } while (EAT1(&state, grapheme == '.'));
+
+    return text_index - start_index;
 }
 
-public array_t Text$clusters(CORD text)
+int64_t match_ipv6(Text_t text, int64_t text_index)
 {
-    array_t clusters = {.atomic=1};
-    uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf);
-    uint8_t *normalized = _normalize(text, buf, &norm_len);
-    const uint8_t *end = normalized + strlen((char*)normalized);
-    for (const uint8_t *pos = normalized; pos != end; ) {
-        const uint8_t *next = u8_grapheme_next(pos, end);
-        size_t len = (size_t)(next - pos);
-        char cluster_buf[len+1];
-        strlcpy(cluster_buf, (char*)pos, len+1);
-        CORD cluster = CORD_from_char_star(cluster_buf);
-        Array$insert(&clusters, &cluster, I(0), sizeof(CORD));
-        pos = next;
+    iteration_state_t state = {0, 0};
+    if (text_index > 0) {
+        int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1);
+        if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':'))
+            return -1;
     }
+    int64_t start_index = text_index;
+    const int NUM_CLUSTERS = 8;
+    bool double_colon_used = false;
+    for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
+        for (int digits = 0; digits < 4; digits++) {
+            if (!EAT1(&state, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
+                break;
+        }
+        if (EAT1(&state, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
+            return -1; // Too many digits
+
+        if (cluster == NUM_CLUSTERS-1) {
+            break;
+        } else if (!EAT1(&state, grapheme == ':')) {
+            if (double_colon_used)
+                break;
+            return -1;
+        }
 
-    if (normalized != buf) free(normalized);
-    return clusters;
+        if (EAT1(&state, grapheme == ':')) {
+            if (double_colon_used)
+                return -1;
+            double_colon_used = true;
+        }
+    }
+    return text_index - start_index;
 }
 
-public array_t Text$codepoints(CORD text)
+static int64_t match_ipv4(Text_t text, int64_t text_index)
 {
-    uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf);
-    uint8_t *normalized = _normalize(text, norm_buf, &norm_len);
+    iteration_state_t state = {0, 0};
+    if (text_index > 0) {
+        int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1);
+        if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.'))
+            return -1;
+    }
+    int64_t start_index = text_index;
 
-    uint32_t codepoint_buf[128] = {0};
-    size_t codepoint_len = sizeof(codepoint_buf);
-    uint32_t *codepoints = u8_to_u32(normalized, norm_len-1, codepoint_buf, &codepoint_len);
-    array_t ret = {
-        .length=codepoint_len,
-        .data=memcpy(GC_MALLOC_ATOMIC(sizeof(int32_t[codepoint_len])), codepoints, sizeof(int32_t[codepoint_len])),
-        .stride=sizeof(int32_t),
-        .atomic=1,
-    };
+    const int NUM_CLUSTERS = 4;
+    for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
+        for (int digits = 0; digits < 3; digits++) {
+            if (!EAT1(&state, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) {
+                if (digits == 0) return -1;
+                break;
+            }
+        }
 
-    if (normalized != norm_buf) free(normalized);
-    if (codepoints != codepoint_buf) free(codepoints);
-    return ret;
+        if (EAT1(&state, ~(grapheme & ~0x7F) && isdigit((char)grapheme)))
+            return -1; // Too many digits
+
+        if (cluster == NUM_CLUSTERS-1)
+            break;
+        else if (!EAT1(&state, grapheme == '.'))
+            return -1;
+    }
+    return (text_index - start_index);
 }
 
-public array_t Text$bytes(CORD text)
+int64_t match_uri(Text_t text, int64_t text_index)
 {
-    uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf);
-    uint8_t *normalized = _normalize(text, norm_buf, &norm_len);
+    // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment]
+    // scheme = [a-zA-Z] [a-zA-Z0-9+.-]
+    // authority = [userinfo "@"] host [":" port]
 
-    --norm_len; // NUL byte
-    array_t ret = {
-        .length=norm_len,
-        .data=memcpy(GC_MALLOC_ATOMIC(sizeof(uint8_t[norm_len])), normalized, sizeof(uint8_t[norm_len])),
-        .stride=sizeof(uint8_t),
-        .atomic=1,
-    };
+    iteration_state_t state = {0, 0};
+    if (text_index > 0) {
+        int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1);
+        if (prev_codepoint < 0)
+            prev_codepoint = synthetic_graphemes[-prev_codepoint-1].codepoints[0];
+        if (uc_is_property_alphabetic(prev_codepoint))
+            return -1;
+    }
 
-    if (normalized != norm_buf) free(normalized);
-    return ret;
+    int64_t start_index = text_index;
+
+    // Scheme:
+    if (!EAT1(&state, isalpha(grapheme)))
+        return -1;
+
+    EAT_MANY(&state, !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-'));
+
+    if (text_index == start_index)
+        return -1;
+
+    if (!match_grapheme(text, &text_index, ':'))
+        return -1;
+
+    // Authority:
+    if (match_str(text, &text_index, "//")) {
+        int64_t authority_start = text_index;
+        // Username or host:
+        static const char *forbidden = "#?:@ \t\r\n<>[]{}\\^|\"`/";
+        if (EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
+            return -1;
+
+        if (EAT1(&state, grapheme == '@')) {
+            // Found a username, now get a host:
+            if (EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
+                return -1;
+        } else {
+            int64_t ip = authority_start;
+            int64_t ipv4_len = match_ipv4(text, ip);
+            if (ipv4_len > 0) {
+                ip += ipv4_len;
+            } else if (match_grapheme(text, &ip, '[')) {
+                ip += match_ipv6(text, ip);
+                if (ip > authority_start + 1 && match_grapheme(text, &ip, ']'))
+                    text_index = ip;
+            }
+        }
+
+        // Port:
+        if (EAT1(&state, grapheme == ':')) {
+            if (EAT_MANY(&state, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0)
+                return -1;
+        }
+        if (!EAT1(&state, grapheme == '/'))
+            return (text_index - start_index); // No path
+    } else {
+        // Optional path root:
+        EAT1(&state, grapheme == '/');
+    }
+
+    // Path:
+    static const char *non_path = " \"#?<>[]{}\\^`|";
+    EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme));
+
+    if (EAT1(&state, grapheme == '?')) { // Query
+        static const char *non_query = " \"#<>[]{}\\^`|";
+        EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme));
+    }
+    
+    if (EAT1(&state, grapheme == '#')) { // Fragment
+        static const char *non_fragment = " \"#<>[]{}\\^`|";
+        EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme));
+    }
+    return text_index - start_index;
 }
 
-public Int_t Text$num_clusters(CORD text)
+int64_t match(Text_t text, Text_t pattern, int64_t text_index, int64_t pattern_index)
 {
-    const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text);
-    int64_t num_clusters = 0;
-    const uint8_t *end = ustr + u8_strlen(ustr);
-    for (const uint8_t *pos = ustr; pos != end; ) {
-        const uint8_t *next = u8_grapheme_next(pos, end);
-        ++num_clusters;
-        pos = next;
+    if (pattern_index >= pattern.length) return 0;
+    int64_t start_index = text_index;
+    iteration_state_t pattern_state = {0, 0}, text_state = {0, 0};
+    while (pattern_index < pattern.length) {
+        int64_t old_pat_index = pattern_index;
+        if (match_str(pattern, &pattern_index, "[..")) {
+            skip_whitespace(pattern, &pattern_index);
+            int64_t min, max;
+            if (uc_is_digit(_next_grapheme(pattern, &pattern_state, pattern_index))) {
+                min = parse_int(pattern, &pattern_index);
+                skip_whitespace(pattern, &pattern_index);
+                if (match_grapheme(pattern, &pattern_index, '+')) {
+                    max = INT64_MAX;
+                } else if (match_grapheme(pattern, &pattern_index, '-')) {
+                    max = parse_int(pattern, &pattern_index);
+                } else {
+                    max = min;
+                }
+            } else {
+                min = 1, max = INT64_MAX;
+            }
+
+            skip_whitespace(pattern, &pattern_index);
+            bool want_to_match = !match_grapheme(pattern, &pattern_index, '!');
+            const char *prop_name = get_property_name(pattern, &pattern_index);
+
+            skip_whitespace(pattern, &pattern_index);
+            if (!match_grapheme(pattern, &pattern_index, ']'))
+                errx(1, "Missing closing ']' in pattern: \"%T\"", &pattern);
+
+            int64_t before_group = text_index;
+            bool any = false;
+            uc_property_t prop;
+            int32_t specific_codepoint = UNINAME_INVALID;
+
+#define FAIL() ({ if (min < 1) { text_index = before_group; continue; } else { return -1; } })
+            if (prop_name) {
+                switch (tolower(prop_name[0])) {
+                case 'd':
+                    if (strcasecmp(prop_name, "digit") == 0) {
+                        prop = UC_PROPERTY_DECIMAL_DIGIT;
+                        goto got_prop;
+                    }
+                    break;
+                case 'e':
+                    if (strcasecmp(prop_name, "end") == 0) {
+                        if (text_index != text.length)
+                            FAIL();
+                        continue;
+                    } else if (prop_name && strcasecmp(prop_name, "email") == 0) {
+                        int64_t len = match_email(text, text_index);
+                        if (len < 0)
+                            FAIL();
+                        text_index += len;
+                        continue;
+                    } else if (prop_name && strcasecmp(prop_name, "emoji") == 0) {
+                        prop = UC_PROPERTY_EMOJI;
+                        goto got_prop;
+                    }
+                    break;
+                case 'i':
+                    if (prop_name && strcasecmp(prop_name, "id") == 0) {
+                        if (!EAT1(&text_state, uc_is_property(grapheme, UC_PROPERTY_XID_START)))
+                            FAIL();
+                        EAT_MANY(&text_state, uc_is_property(grapheme, UC_PROPERTY_XID_CONTINUE));
+                        continue;
+                    } else if (prop_name && strcasecmp(prop_name, "ipv4") == 0) {
+                        int64_t len = match_ipv4(text, text_index);
+                        if (len < 0)
+                            FAIL();
+                        text_index += len;
+                        continue;
+                    } else if (prop_name && strcasecmp(prop_name, "ipv6") == 0) {
+                        int64_t len = match_ipv6(text, text_index);
+                        if (len < 0)
+                            FAIL();
+                        text_index += len;
+                        continue;
+                    } else if (prop_name && strcasecmp(prop_name, "ip") == 0) {
+                        int64_t len = match_ipv6(text, text_index);
+                        if (len < 0)
+                            len = match_ipv4(text, text_index);
+                        if (len < 0)
+                            FAIL();
+                        text_index += len;
+                        continue;
+                    }
+                    break;
+                case 's':
+                    if (strcasecmp(prop_name, "start") == 0) {
+                        if (text_index != 0) return -1;
+                        continue;
+                    }
+                    break;
+                case 'u':
+                    if (prop_name && strcasecmp(prop_name, "uri") == 0) {
+                        int64_t len = match_uri(text, text_index);
+                        if (len < 0)
+                            FAIL();
+                        text_index += len;
+                        continue;
+                    } else if (prop_name && strcasecmp(prop_name, "url") == 0) {
+                        int64_t lookahead = text_index;
+                        if (!(match_str(text, &lookahead, "https:")
+                            || match_str(text, &lookahead, "http:")
+                            || match_str(text, &lookahead, "ftp:")
+                            || match_str(text, &lookahead, "wss:")
+                            || match_str(text, &lookahead, "ws:")))
+                            FAIL();
+
+                        int64_t len = match_uri(text, text_index);
+                        if (len < 0)
+                            FAIL();
+                        text_index += len;
+                        continue;
+                    }
+                    break;
+                }
+
+                prop = uc_property_byname(prop_name);
+                if (!uc_property_is_valid(prop)) {
+                    specific_codepoint = unicode_name_character(prop_name);
+                    if (specific_codepoint == UNINAME_INVALID)
+                        errx(1, "Not a valid property or character name: %s", prop_name);
+                }
+            } else {
+                any = true;
+                prop = UC_PROPERTY_PRIVATE_USE;
+            }
+      got_prop:;
+
+            if (min == 0 && pattern_index < pattern.length) {
+                int64_t match_len = match(text, pattern, text_index, pattern_index);
+                if (match_len >= 0)
+                    return (text_index - start_index) + match_len;
+            }
+
+            for (int64_t count = 0; count < max; ) {
+                int32_t grapheme = _next_grapheme(text, &text_state, text_index);
+                if (grapheme < 0)
+                    grapheme = synthetic_graphemes[-grapheme-1].codepoints[0];
+
+                bool success;
+                if (any)
+                    success = true;
+                else if (specific_codepoint != UNINAME_INVALID)
+                    success = (grapheme == specific_codepoint);
+                else
+                    success = uc_is_property(grapheme, prop);
+
+                if (success != want_to_match) {
+                    if (count < min) return -1;
+                    else break;
+                }
+
+                text_index += 1;
+                count += 1;
+
+                if (count >= min) {
+                    if (pattern_index < pattern.length) {
+                        int64_t match_len = match(text, pattern, text_index, pattern_index);
+                        if (match_len >= 0) {
+                            return (text_index - start_index) + match_len;
+                        }
+                    } else if (text_index >= text.length) {
+                        break;
+                    }
+                }
+            }
+        } else if (uc_is_property(_next_grapheme(pattern, &pattern_state, pattern_index), UC_PROPERTY_QUOTATION_MARK)
+                   && (pattern_index += 1, match_grapheme(pattern, &pattern_index, '?'))) {
+            // Quotation: "?", '?', etc
+            int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2);
+            if (!match_grapheme(text, &text_index, open)) return -1;
+            int32_t close = open;
+            uc_mirror_char(open, (uint32_t*)&close);
+            if (!match_grapheme(pattern, &pattern_index, close))
+                errx(1, "I expected a closing brace");
+            while (text_index < text.length) {
+                int32_t c = _next_grapheme(text, &text_state, text_index);
+                if (c == close)
+                    return (text_index - start_index);
+
+                if (c == '\\' && text_index < text.length) {
+                    text_index += 2;
+                } else {
+                    text_index += 1;
+                }
+            }
+            return -1;
+        } else if (uc_is_property(_next_grapheme(pattern, &pattern_state, pattern_index), UC_PROPERTY_PAIRED_PUNCTUATION)
+                   && (pattern_index += 1, match_grapheme(pattern, &pattern_index, '?'))) {
+            // Nested punctuation: (?), [?], etc
+            int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2);
+            if (!match_grapheme(text, &text_index, open)) return -1;
+            int32_t close = open;
+            uc_mirror_char(open, (uint32_t*)&close);
+            if (!match_grapheme(pattern, &pattern_index, close))
+                errx(1, "I expected a closing brace");
+            int64_t depth = 1;
+            for (; depth > 0 && text_index < text.length; ++text_index) {
+                int32_t c = _next_grapheme(text, &text_state, text_index);
+                if (c == open)
+                    depth += 1;
+                else if (c == close)
+                    depth -= 1;
+            }
+            if (depth > 0) return -1;
+        } else {
+            // Plain character:
+            pattern_index = old_pat_index;
+            int32_t pat_grapheme = _next_grapheme(pattern, &pattern_state, pattern_index);
+
+            if (pattern_index == 0 && text_index > 0) {
+                int32_t pat_codepoint = pat_grapheme;
+                if (pat_codepoint < 0)
+                    pat_codepoint = synthetic_graphemes[-pat_codepoint-1].codepoints[0];
+
+                int32_t prev_codepoint = _next_grapheme(text, &text_state, text_index - 1);
+                if (prev_codepoint < 0)
+                    prev_codepoint = synthetic_graphemes[-prev_codepoint-1].codepoints[0];
+                if (uc_is_property_alphabetic(pat_codepoint) && uc_is_property_alphabetic(prev_codepoint))
+                    return -1;
+            }
+
+            int32_t text_grapheme = _next_grapheme(text, &text_state, text_index);
+            if (pat_grapheme != text_grapheme)
+                return -1;
+
+            pattern_index += 1;
+            text_index += 1;
+
+            if (pattern_index == pattern.length && text_index < text.length) {
+                int32_t pat_codepoint = pat_grapheme;
+                if (pat_codepoint < 0)
+                    pat_codepoint = synthetic_graphemes[-pat_codepoint-1].codepoints[0];
+
+                int32_t next_codepoint = _next_grapheme(text, &text_state, text_index);
+                if (next_codepoint < 0)
+                    next_codepoint = synthetic_graphemes[-next_codepoint-1].codepoints[0];
+                if (uc_is_property_alphabetic(pat_codepoint) && uc_is_property_alphabetic(next_codepoint))
+                    return -1;
+            }
+        }
+    }
+    if (text_index >= text.length && pattern_index < pattern.length)
+        return -1;
+    return (text_index - start_index);
+}
+
+#undef EAT1
+#undef EAT_MANY
+
+public Int_t Text$find(Text_t text, Text_t pattern, Int_t from_index, int64_t *match_length)
+{
+    int32_t first = get_grapheme(pattern, 0);
+    bool find_first = (first != '['
+                       && !uc_is_property(first, UC_PROPERTY_QUOTATION_MARK)
+                       && !uc_is_property(first, UC_PROPERTY_PAIRED_PUNCTUATION));
+
+    iteration_state_t text_state = {0, 0};
+    for (int64_t i = Int_to_Int64(from_index, false)-1; i < text.length; i++) {
+        // Optimization: quickly skip ahead to first char in pattern:
+        if (find_first) {
+            while (i < text.length && _next_grapheme(text, &text_state, i) != first)
+                ++i;
+        }
+
+        int64_t m = match(text, pattern, i, 0);
+        if (m >= 0) {
+            if (match_length)
+                *match_length = m;
+            return I(i+1);
+        }
     }
-    return I(num_clusters);
+    if (match_length)
+        *match_length = -1;
+    return I(0);
+}
+
+public int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n])
+{
+    if (n < 1) return -1;
+    (void)info;
+    argtypes[0] = PA_POINTER;
+    sizes[0] = sizeof(Text_t*);
+    return 1;
+}
+
+public int printf_text(FILE *stream, const struct printf_info *info, const void *const args[])
+{
+    (void)info;
+    Text_t t = **(Text_t**)args[0];
+    return Text$print(stream, t);
 }
 
-public Int_t Text$num_codepoints(CORD text)
+public Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info)
 {
-    uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf);
-    uint8_t *normalized = _normalize(text, buf, &norm_len);
-    int64_t num_codepoints = u8_mbsnlen(normalized, norm_len-1);
-    if (normalized != buf) free(normalized);
-    return I(num_codepoints);
+    (void)info;
+    if (!text) return Text$from_str("Text");
+    return Text$quoted(*(Text_t*)text, colorize);
 }
 
-public Int_t Text$num_bytes(CORD text)
+public Text_t Text$quoted(Text_t text, bool colorize)
 {
-    uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf);
-    uint8_t *normalized = _normalize(text, norm_buf, &norm_len);
-    --norm_len; // NUL byte
-    if (!normalized) errx(1, "Unicode normalization error!");
-    if (normalized != norm_buf) free(normalized);
-    return I(norm_len);
+    // TODO: optimize for ASCII and short strings
+    array_t graphemes = {.atomic=1};
+#define add_char(c) Array$insert_value(&graphemes, (uint32_t)c, I_small(0), sizeof(uint32_t))
+#define add_str(s) ({ for (char *_c = s; *_c; ++_c) Array$insert_value(&graphemes, (uint32_t)*_c, I_small(0), sizeof(uint32_t)); })
+    if (colorize)
+        add_str("\x1b[35m\"");
+    else
+        add_char('"');
+
+#define add_escaped(str) ({ if (colorize) add_str("\x1b[34;1m"); add_char('\\'); add_str(str); if (colorize) add_str("\x1b[0;35m"); })
+    iteration_state_t state = {0, 0};
+    for (int64_t i = 0; i < text.length; i++) {
+        int32_t g = _next_grapheme(text, &state, i);
+        switch (g) {
+        case '\a': add_escaped("a"); break;
+        case '\b': add_escaped("b"); break;
+        case '\x1b': add_escaped("e"); break;
+        case '\f': add_escaped("f"); break;
+        case '\n': add_escaped("n"); break;
+        case '\r': add_escaped("r"); break;
+        case '\t': add_escaped("t"); break;
+        case '\v': add_escaped("v"); break;
+        case '"': add_escaped("\""); break;
+        case '\\': add_escaped("\\"); break;
+        case '\x00' ... '\x06': case '\x0E' ... '\x1A':
+        case '\x1C' ... '\x1F': case '\x7F' ... '\x7F': {
+            if (colorize) add_str("\x1b[34;1m");
+            add_char('\\');
+            add_char('x');
+            char tmp[4];
+            sprintf(tmp, "%02X", g);
+            add_str(tmp);
+            if (colorize)
+                add_str("\x1b[0;35m");
+            break;
+        }
+        default: add_char(g); break;
+        }
+    }
+
+    if (colorize)
+        add_str("\"\x1b[m");
+    else
+        add_char('"');
+
+    return (Text_t){.length=graphemes.length, .tag=TEXT_GRAPHEMES, .graphemes=graphemes.data};
+#undef add_str
+#undef add_char
+#undef add_escaped
 }
 
-public array_t Text$character_names(CORD text)
+public Text_t Text$replace(Text_t text, Text_t pattern, Text_t replacement)
 {
-    array_t codepoints = Text$codepoints(text);
-    array_t ret = {.length=codepoints.length, .stride=sizeof(CORD), .data=GC_MALLOC(sizeof(CORD[codepoints.length]))};
-    for (int64_t i = 0; i < codepoints.length; i++) {
-        char buf[UNINAME_MAX];
-        unicode_character_name(*(ucs4_t*)(codepoints.data + codepoints.stride*i), buf);
-        *(CORD*)(ret.data + ret.stride*i) = CORD_from_char_star(buf);
+    Text_t ret = {.length=0};
+
+    Int_t i = I_small(0);
+    for (;;) {
+        int64_t len;
+        Int_t found = Text$find(text, pattern, i, &len);
+        if (found.small == I_small(0).small) break;
+        if (Int$compare(&found, &i, &$Text) > 0) {
+            ret = Text$concat(
+                ret,
+                Text$slice(text, i, Int$minus(found, I_small(1))),
+                replacement
+            );
+        } else {
+            ret = concat2(ret, replacement);
+        }
+    }
+    if (Int_to_Int64(i, false) <= text.length) {
+        ret = concat2(ret, Text$slice(text, i, Int64_to_Int(text.length)));
     }
     return ret;
 }
 
-public CORD Text$read_line(CORD prompt)
+public Text_t Text$format(const char *fmt, ...)
 {
-    char *line = readline(CORD_to_const_char_star(prompt));
-    if (!line) return CORD_EMPTY;
-    CORD ret = CORD_from_char_star(line);
-    free(line);
+    va_list args;
+    va_start(args, fmt);
+
+    char buf[8];
+    int len = vsnprintf(buf, sizeof(buf), fmt, args);
+    Text_t ret;
+    if (len <= (int)sizeof(buf)) {
+        ret = (Text_t){
+            .length=len,
+            .tag = TEXT_SHORT_ASCII,
+        };
+        for (int i = 0; i < len; i++)
+            ret.short_ascii[i] = buf[i];
+    } else {
+        char *str = GC_MALLOC_ATOMIC(len);
+        vsnprintf(str, len, fmt, args);
+        ret = Text$from_str(str);
+    }
+    va_end(args);
     return ret;
 }
 
 public const TypeInfo $Text = {
-    .size=sizeof(CORD),
-    .align=__alignof__(CORD),
+    .size=sizeof(Text_t),
+    .align=__alignof__(Text_t),
     .tag=TextInfo,
     .TextInfo={.lang="Text"},
 };
author	Bruce Hill <bruce@bruce-hill.com>	2024-09-02 18:47:39 -0400
committer	Bruce Hill <bruce@bruce-hill.com>	2024-09-02 18:47:39 -0400
commit	61e482f6f36aee6f72392a6188f2ec5c858b88fd (patch)
tree	bea4123fcc62dd834405ae89ce9fe260e90a0023 /builtins/text.c
parent	f0f8f218703ebb4512b3cd3f9e06b86a7d9861b0 (diff)