diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2024-09-02 18:47:39 -0400 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2024-09-02 18:47:39 -0400 |
| commit | 61e482f6f36aee6f72392a6188f2ec5c858b88fd (patch) | |
| tree | bea4123fcc62dd834405ae89ce9fe260e90a0023 /builtins/text.c | |
| parent | f0f8f218703ebb4512b3cd3f9e06b86a7d9861b0 (diff) | |
Initial WIP first past
Diffstat (limited to 'builtins/text.c')
| -rw-r--r-- | builtins/text.c | 1597 |
1 files changed, 1317 insertions, 280 deletions
diff --git a/builtins/text.c b/builtins/text.c index 966018f1..ff709e02 100644 --- a/builtins/text.c +++ b/builtins/text.c @@ -1,417 +1,1454 @@ // Type info and methods for Text datatype, which uses the Boehm "cord" library // and libunistr + #include <assert.h> #include <ctype.h> #include <err.h> #include <gc.h> -#include <gc/cord.h> #include <gmp.h> #include <limits.h> +#include <printf.h> #include <readline/history.h> #include <readline/readline.h> #include <stdbool.h> #include <stdint.h> #include <stdlib.h> #include <sys/param.h> + #include <unicase.h> +#include <unictype.h> #include <unigbrk.h> #include <uniname.h> #include <uninorm.h> +#include <unistd.h> +#include <unistdio.h> #include <unistr.h> #include "array.h" #include "functions.h" -#include "halfsiphash.h" #include "integers.h" #include "text.h" #include "types.h" -#define CLAMP(x, lo, hi) MIN(hi, MAX(x,lo)) +static struct { + size_t num_codepoints; + const uint32_t *codepoints; +} synthetic_graphemes[1024] = {}; + +static int32_t num_synthetic_graphemes = 0; + +static int32_t get_grapheme(Text_t text, int64_t index); + +typedef struct { + int64_t subtext, sum_of_previous_subtexts; +} iteration_state_t; -static inline uint8_t *_normalize(CORD str, uint8_t *buf, size_t *len) +static int32_t _next_grapheme(Text_t text, iteration_state_t *state, int64_t index); + +int32_t find_synthetic_grapheme(const uint32_t *codepoints, size_t len) { - const uint8_t *str_u8 = (const uint8_t*)CORD_to_const_char_star(str); - uint8_t *normalized = u8_normalize(UNINORM_NFD, str_u8, strlen((char*)str_u8)+1, buf, len); - if (!normalized) errx(1, "Unicode normalization error!"); - return normalized; + int32_t lo = 0, hi = num_synthetic_graphemes; + while (lo <= hi) { + int32_t mid = (lo + hi) / 2; + int32_t cmp = (synthetic_graphemes[mid].num_codepoints > len) - (synthetic_graphemes[mid].num_codepoints < len); + if (cmp == 0) + cmp = memcmp(synthetic_graphemes[mid].codepoints, codepoints, sizeof(uint32_t[len])); + + if (cmp == 0) + return mid; + else if (cmp < 0) + lo = mid + 1; + else if (cmp > 0) + hi = mid - 1; + } + return hi; } -public CORD Text$as_text(const void *text, bool colorize, const TypeInfo *info) +int32_t get_synthetic_grapheme(const uint32_t *codepoints, size_t len) { - if (!text) return info->TextInfo.lang; - CORD ret = Text$quoted(*(CORD*)text, colorize); - if (!streq(info->TextInfo.lang, "Text")) - ret = colorize ? CORD_all("\x1b[1m$", info->TextInfo.lang, "\x1b[m", ret) : CORD_all("$", info->TextInfo.lang, ret); - return ret; + int32_t index = find_synthetic_grapheme(codepoints, len); + if (index < num_synthetic_graphemes + && synthetic_graphemes[index].num_codepoints == len + && memcmp(synthetic_graphemes[index].codepoints, codepoints, len) == 0) { + return -(index+1); + } else { + if (num_synthetic_graphemes > 0) + memmove(&synthetic_graphemes[index], &synthetic_graphemes[index + 1], num_synthetic_graphemes - index); + + uint32_t *buf = GC_MALLOC_ATOMIC(sizeof(uint32_t[len])); + memcpy(buf, codepoints, sizeof(uint32_t[len])); + synthetic_graphemes[index].codepoints = buf; + synthetic_graphemes[index].num_codepoints = len; + + ++num_synthetic_graphemes; + return -(index+1); + } } -public CORD Text$quoted(CORD str, bool colorize) -{ - // Note: it's important to have unicode strings not get broken up with - // escapes, otherwise they won't print right. - if (colorize) { - CORD quoted = "\x1b[35m\""; - CORD_pos i; - CORD_FOR(i, str) { - char c = CORD_pos_fetch(i); - switch (c) { -#define BACKSLASHED(esc) "\x1b[34m\\\x1b[1m" esc "\x1b[0;35m" - case '\a': quoted = CORD_cat(quoted, BACKSLASHED("a")); break; - case '\b': quoted = CORD_cat(quoted, BACKSLASHED("b")); break; - case '\x1b': quoted = CORD_cat(quoted, BACKSLASHED("e")); break; - case '\f': quoted = CORD_cat(quoted, BACKSLASHED("f")); break; - case '\n': quoted = CORD_cat(quoted, BACKSLASHED("n")); break; - case '\r': quoted = CORD_cat(quoted, BACKSLASHED("r")); break; - case '\t': quoted = CORD_cat(quoted, BACKSLASHED("t")); break; - case '\v': quoted = CORD_cat(quoted, BACKSLASHED("v")); break; - case '"': quoted = CORD_cat(quoted, BACKSLASHED("\"")); break; - case '\\': quoted = CORD_cat(quoted, BACKSLASHED("\\")); break; - case '\x00' ... '\x06': case '\x0E' ... '\x1A': - case '\x1C' ... '\x1F': case '\x7F' ... '\x7F': - CORD_sprintf("ed, "%r" BACKSLASHED("x%02X"), quoted, c); - break; - default: quoted = CORD_cat_char(quoted, c); break; -#undef BACKSLASHED +static inline size_t num_subtexts(Text_t t) +{ + if (t.tag != TEXT_SUBTEXT) return 1; + size_t len = t.length; + size_t n = 0; + while (len > 0) { + len -= t.subtexts[n].length; + ++n; + } + return n; +} + +int text_visualize(FILE *stream, Text_t t) +{ + switch (t.tag) { + case TEXT_SHORT_ASCII: return fprintf(stream, "<ascii length=%ld>%.*s</ascii>", t.length, t.length, t.short_ascii); + case TEXT_ASCII: return fprintf(stream, "<ascii length=%ld>%.*s</ascii>", t.length, t.length, t.ascii); + case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: { + int printed = fprintf(stream, "<graphemes length=%ld>", t.length); + printed += Text$print(stream, t); + printed += fprintf(stream, "</graphemes>"); + return printed; + } + case TEXT_SUBTEXT: { + int printed = fprintf(stream, "<text length=%ld>", t.length); + size_t to_print = t.length; + for (int i = 0; to_print > 0; ++i) { + printed += fprintf(stream, "\n "); + printed += text_visualize(stream, t.subtexts[i]); + to_print -= t.subtexts[i].length; + if (t.subtexts[i].length == 0) break; + } + printed += fprintf(stream, "\n</text>"); + return printed; + } + default: return 0; + } +} + +public int Text$print(FILE *stream, Text_t t) +{ + switch (t.tag) { + case TEXT_SHORT_ASCII: return fwrite(t.short_ascii, sizeof(char), t.length, stream); + case TEXT_ASCII: return fwrite(t.ascii, sizeof(char), t.length, stream); + case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: { + int32_t *graphemes = t.tag == TEXT_SHORT_GRAPHEMES ? t.short_graphemes : t.graphemes; + int written = 0; + for (int64_t i = 0; i < t.length; i++) { + int32_t grapheme = graphemes[i]; + if (grapheme >= 0) { + written += ulc_fprintf(stream, "%.*llU", 1, &grapheme); + } else { + written += ulc_fprintf( + stream, "%.*llU", + synthetic_graphemes[-grapheme-1].num_codepoints, + synthetic_graphemes[-grapheme-1].codepoints); } } - quoted = CORD_cat(quoted, "\"\x1b[m"); - return quoted; + return written; + } + case TEXT_SUBTEXT: { + int written = 0; + int i = 0; + for (size_t to_print = t.length; to_print > 0; to_print -= t.subtexts[i].length, ++i) + written += Text$print(stream, t.subtexts[i]); + return written; + } + default: return 0; + } +} + +static Text_t concat2(Text_t a, Text_t b) +{ + if (a.length == 0) return b; + if (b.length == 0) return a; + + if (a.tag == TEXT_SUBTEXT && b.tag == TEXT_SUBTEXT) { + size_t na = num_subtexts(a); + size_t nb = num_subtexts(b); + Text_t ret = { + .length=a.length + b.length, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[na + nb])), + }; + memcpy(&ret.subtexts[0], a.subtexts, sizeof(Text_t[na])); + memcpy(&ret.subtexts[na], b.subtexts, sizeof(Text_t[nb])); + return ret; + } else if (a.tag == TEXT_SUBTEXT) { + size_t n = num_subtexts(a); + Text_t ret = { + .length=a.length + b.length, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[n + 1])), + }; + memcpy(ret.subtexts, a.subtexts, sizeof(Text_t[n])); + ret.subtexts[n] = b; + return ret; + } else if (b.tag == TEXT_SUBTEXT) { + size_t n = num_subtexts(b); + Text_t ret = { + .length=a.length + b.length, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[n + 1])), + }; + ret.subtexts[0] = a; + memcpy(&ret.subtexts[1], b.subtexts, sizeof(Text_t[n])); + return ret; } else { - CORD quoted = "\""; - CORD_pos i; - CORD_FOR(i, str) { - char c = CORD_pos_fetch(i); - switch (c) { - case '\a': quoted = CORD_cat(quoted, "\\a"); break; - case '\b': quoted = CORD_cat(quoted, "\\b"); break; - case '\x1b': quoted = CORD_cat(quoted, "\\e"); break; - case '\f': quoted = CORD_cat(quoted, "\\f"); break; - case '\n': quoted = CORD_cat(quoted, "\\n"); break; - case '\r': quoted = CORD_cat(quoted, "\\r"); break; - case '\t': quoted = CORD_cat(quoted, "\\t"); break; - case '\v': quoted = CORD_cat(quoted, "\\v"); break; - case '"': quoted = CORD_cat(quoted, "\\\""); break; - case '\\': quoted = CORD_cat(quoted, "\\\\"); break; - case '\x00' ... '\x06': case '\x0E' ... '\x1A': - case '\x1C' ... '\x1F': case '\x7F' ... '\x7F': - CORD_sprintf("ed, "%r\\x%02X", quoted, c); - break; - default: quoted = CORD_cat_char(quoted, c); break; + Text_t ret = { + .length=a.length + b.length, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[2])), + }; + ret.subtexts[0] = a; + ret.subtexts[1] = b; + return ret; + } +} + +public Text_t Text$_concat(int n, Text_t items[n]) +{ + if (n == 0) return (Text_t){.length=0}; + if (n == 1) return items[0]; + if (n == 2) return concat2(items[0], items[1]); + + int64_t len = 0, subtexts = 0; + for (int i = 0; i < n; i++) { + len += items[i].length; + subtexts += num_subtexts(items[i]); + } + + Text_t ret = { + .length=len, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[len])), + }; + int64_t sub_i = 0; + for (int i = 0; i < n; i++) { + if (items[i].tag == TEXT_SUBTEXT) { + for (int64_t j = 0, remainder = items[i].length; remainder > 0; j++) { + ret.subtexts[sub_i++] = items[i].subtexts[j]; + remainder -= items[i].subtexts[j].length; } + } else { + ret.subtexts[sub_i++] = items[i]; } - quoted = CORD_cat_char(quoted, '"'); - return quoted; } + return ret; } -public int Text$compare(const CORD *x, const CORD *y) +public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int) { - uint8_t *xx = (uint8_t*)CORD_to_const_char_star(*x); - uint8_t *yy = (uint8_t*)CORD_to_const_char_star(*y); - int result = 0; - if (u8_normcmp(xx, strlen((char*)xx), yy, strlen((char*)yy), UNINORM_NFD, &result)) - fail("Something went wrong while comparing text"); - return result; + int64_t first = Int_to_Int64(first_int, false)-1; + int64_t last = Int_to_Int64(last_int, false)-1; + if (first == 0) errx(1, "Invalid index: 0"); + if (last == 0) return (Text_t){.length=0}; + + if (first < 0) first = text.length + first + 1; + if (last < 0) last = text.length + last + 1; + + if (last > text.length) last = text.length; + + if (first > text.length || last < first) + return (Text_t){.length=0}; + + if (first == 1 && last == text.length) + return text; + + switch (text.tag) { + case TEXT_SHORT_ASCII: { + Text_t ret = text; + ret.length = last - first + 1; + if (first > 1) + memcpy(ret.short_ascii, text.short_ascii + (first-1), ret.length); + return ret; + } + case TEXT_ASCII: { + Text_t ret = { + .tag=TEXT_ASCII, + .length=last - first + 1, + .ascii=text.ascii + (first-1), + }; + return ret; + } + case TEXT_SHORT_GRAPHEMES: { + assert((first == 1 && last == 1) || (first == 2 && last == 2)); + Text_t ret = { + .tag=TEXT_SHORT_GRAPHEMES, + .length=1, + .short_graphemes={text.short_graphemes[first-1]}, + }; + return ret; + } + case TEXT_GRAPHEMES: { + Text_t ret = { + .tag=TEXT_GRAPHEMES, + .length=last - first + 1, + .graphemes=text.graphemes + (first-1), + }; + return ret; + } + case TEXT_SUBTEXT: { + Text_t *subtexts = text.subtexts; + while (first > subtexts[0].length) { + first -= subtexts[0].length; + last -= subtexts[0].length; + ++subtexts; + } + + int64_t needed_len = (last - first) + 1; + int64_t num_subtexts = 0; + for (int64_t included = 0; included < needed_len; ) { + if (included == 0) + included += subtexts[num_subtexts].length - first + 1; + else + included += subtexts[num_subtexts].length; + num_subtexts += 1; + } + if (num_subtexts == 1) + return Text$slice(subtexts[0], Int64_to_Int(first+1), Int64_to_Int(last+1)); + + Text_t ret = { + .length=needed_len, + .tag=TEXT_SUBTEXT, + .subtexts=GC_MALLOC(sizeof(Text_t[num_subtexts])), + }; + for (int64_t i = 0; i < num_subtexts; i++) { + ret.subtexts[i] = Text$slice(subtexts[i], Int64_to_Int(first+1), Int64_to_Int(last+1)); + first = 1; + needed_len -= ret.subtexts[i].length; + last = first + needed_len - 1; + } + return ret; + } + default: errx(1, "Invalid tag"); + } } -public bool Text$equal(const CORD *x, const CORD *y) +Text_t text_from_u32(uint32_t *codepoints, size_t num_codepoints, bool normalize) { - return Text$compare(x, y) == 0; + uint32_t norm_buf[128]; + if (normalize) { + size_t norm_length = sizeof(norm_buf)/sizeof(norm_buf[0]); + uint32_t *normalized = u32_normalize(UNINORM_NFC, codepoints, num_codepoints, norm_buf, &norm_length); + codepoints = normalized; + num_codepoints = norm_length; + } + + char breaks[num_codepoints]; + u32_grapheme_breaks(codepoints, num_codepoints, breaks); + + Text_t ret = { + .length=0, + .tag=TEXT_SHORT_GRAPHEMES, + }; + const uint32_t *src = codepoints; + int32_t *dest = &ret.short_graphemes[0]; + while (src != &codepoints[num_codepoints]) { + ++ret.length; + + if (ret.tag == TEXT_SHORT_GRAPHEMES && ret.length > 2) { + int32_t *graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[num_codepoints])); // May be a slight overallocation + graphemes[0] = ret.short_graphemes[0]; + graphemes[1] = ret.short_graphemes[1]; + ret.tag = TEXT_GRAPHEMES; + ret.graphemes = graphemes; + dest = &graphemes[2]; + } + + const uint32_t *next = u32_grapheme_next(src, &codepoints[num_codepoints]); + if (next == &src[1]) { + *dest = (int32_t)*src; + } else { + // Synthetic grapheme + *dest = get_synthetic_grapheme(src, next-src); + } + ++dest; + src = next; + } + if (normalize && codepoints != norm_buf) free(codepoints); + return ret; } -public uint32_t Text$hash(const CORD *cord) +public Text_t Text$from_str(const char *str) { - if (!*cord) return 0; + size_t ascii_span = 0; + while (str[ascii_span] && isascii(str[ascii_span])) + ascii_span++; - uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf); - uint8_t *normalized = _normalize(*cord, buf, &norm_len); + if (str[ascii_span] == '\0') { // All ASCII + Text_t ret = {.length=ascii_span}; + if (ascii_span <= 8) { + ret.tag = TEXT_SHORT_ASCII; + for (size_t i = 0; i < ascii_span; i++) + ret.short_ascii[i] = str[i]; + } else { + ret.tag = TEXT_ASCII; + ret.ascii = str; + } + return ret; + } else { + uint32_t buf[128]; + size_t length = sizeof(buf)/sizeof(buf[0]); + uint32_t *codepoints = u8_to_u32((uint8_t*)str, ascii_span + strlen(str + ascii_span), buf, &length); + Text_t ret = text_from_u32(codepoints, length, true); + if (codepoints != buf) free(codepoints); + return ret; + } +} + +static void u8_buf_append(Text_t text, char **buf, size_t *capacity, int64_t *i) +{ + switch (text.tag) { + case TEXT_ASCII: case TEXT_SHORT_ASCII: { + if (*i + text.length > (int64_t)*capacity) { + *capacity = *i + text.length; + *buf = GC_REALLOC(*buf, *capacity); + } + + const char *bytes = text.tag == TEXT_ASCII ? text.ascii : text.short_ascii; + memcpy(*buf + *i, bytes, text.length); + *i += text.length; + break; + } + case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: { + const int32_t *graphemes = text.tag == TEXT_GRAPHEMES ? text.graphemes : text.short_graphemes; + for (int64_t g = 0; g + 1 < text.length; g++) { + const uint32_t *codepoints = graphemes[g] < 0 ? synthetic_graphemes[-graphemes[g]-1].codepoints : (uint32_t*)&graphemes[g]; + size_t num_codepoints = graphemes[g] < 0 ? synthetic_graphemes[-graphemes[g]-1].num_codepoints : 1; + uint8_t u8_buf[64]; + size_t u8_len = sizeof(u8_buf); + uint8_t *u8 = u32_to_u8(codepoints, num_codepoints, u8_buf, &u8_len); + + if (*i + (int64_t)u8_len > (int64_t)*capacity) { + *capacity = *i + u8_len; + *buf = GC_REALLOC(*buf, *capacity); + } - uint32_t hash; - halfsiphash(normalized, norm_len, TOMO_HASH_KEY, (uint8_t*)&hash, sizeof(hash)); - if (normalized != buf) free(normalized); - return hash; + memcpy(*buf + *i, u8, u8_len); + *i += u8_len; + if (u8 != u8_buf) free(u8); + } + break; + } + case TEXT_SUBTEXT: { + for (int64_t s = 0, remaining = text.length; remaining > 0; s++) { + u8_buf_append(text.subtexts[s], buf, capacity, i); + remaining -= text.subtexts[s].length; + } + break; + } + default: break; + } } -public CORD Text$upper(CORD str) +public const char *Text$as_c_string(Text_t text) { - if (!str) return str; - size_t len = strlen(str) + 1; - uint8_t *dest = GC_MALLOC_ATOMIC(len); - dest[len-1] = 0; - return (CORD)u8_toupper((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len); + size_t capacity = text.length; + char *buf = GC_MALLOC_ATOMIC(capacity); + int64_t i = 0; + u8_buf_append(text, &buf, &capacity, &i); + return buf; } -public CORD Text$lower(CORD str) +uint32_t *text_to_u32(Text_t text, size_t *length) { - if (!str) return str; - size_t len = strlen(str) + 1; - uint8_t *dest = GC_MALLOC_ATOMIC(len); - dest[len-1] = 0; - return (CORD)u8_tolower((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len); + // Precalculate size: + size_t len = 0; + if (text.tag == TEXT_ASCII) { + len = text.length; + } else { + iteration_state_t state = {0, 0}; + for (int64_t i = 0; i < text.length; i++) { + int32_t grapheme = _next_grapheme(text, &state, i); + if (grapheme < 0) + len += synthetic_graphemes[-grapheme-1].num_codepoints; + else + len += 1; + } + } + assert(length); + *length = len; + + // Copy over codepoints one grapheme cluster at a time: + uint32_t *ret = GC_MALLOC_ATOMIC(sizeof(uint32_t[len])); + uint32_t *dest = ret; + iteration_state_t state = {0, 0}; + for (int64_t i = 0; i < text.length; i++) { + int32_t grapheme = _next_grapheme(text, &state, i); + if (grapheme < 0) { + const uint32_t *codepoints = synthetic_graphemes[-grapheme-1].codepoints; + size_t num_codepoints = synthetic_graphemes[-grapheme-1].num_codepoints; + for (size_t j = 0; j < num_codepoints; j++) + *(dest++) = codepoints[j]; + } else { + *(dest++) = (uint32_t)grapheme; + } + } + return ret; } -public CORD Text$title(CORD str) +#include "siphash.c" + +public uint64_t Text$hash(Text_t *text) { - if (!str) return str; - size_t len = strlen(str) + 1; - uint8_t *dest = GC_MALLOC_ATOMIC(len); - dest[len-1] = 0; - return (CORD)u8_totitle((const uint8_t*)str, len-1, uc_locale_language(), NULL, dest, &len); + if (text->hash != 0) return text->hash; + siphash sh; + siphashinit(&sh, sizeof(int32_t[text->length]), (uint64_t*)TOMO_HASH_KEY); + + union { + int32_t chunks[2]; + uint64_t whole; + } tmp; + switch (text->tag) { + case TEXT_ASCII: case TEXT_SHORT_ASCII: { + const char *bytes = text->tag == TEXT_ASCII ? text->ascii : text->short_ascii; + for (int64_t i = 0; i + 1 < text->length; i++) { + tmp.chunks[0] = (int32_t)bytes[i]; + tmp.chunks[1] = (int32_t)bytes[i+1]; + siphashadd64bits(&sh, tmp.whole); + } + int32_t last = text->length & 0x1 ? (int32_t)bytes[text->length-1] : 0; // Odd number of graphemes + text->hash = siphashfinish_last_part(&sh, (uint64_t)last); + break; + } + case TEXT_GRAPHEMES: { + const int32_t *graphemes = text->graphemes; + for (int64_t i = 0; i + 1 < text->length; i++) { + tmp.chunks[0] = graphemes[i]; + tmp.chunks[1] = graphemes[i]; + siphashadd64bits(&sh, tmp.whole); + } + int32_t last = text->length & 0x1 ? graphemes[text->length-1] : 0; // Odd number of graphemes + text->hash = siphashfinish_last_part(&sh, (uint64_t)last); + break; + } + case TEXT_SHORT_GRAPHEMES: { + tmp.chunks[0] = text->short_graphemes[0]; + if (text->length > 1) + tmp.chunks[1] = text->short_graphemes[1]; + text->hash = siphashfinish_last_part(&sh, (uint64_t)tmp.whole); + break; + } + case TEXT_SUBTEXT: { + int32_t leftover = 0; + for (int64_t sub_i = 0, to_hash = text->length; to_hash > 0; ) { + Text_t subtext = text->subtexts[sub_i]; + if (subtext.tag == TEXT_ASCII || subtext.tag == TEXT_SHORT_ASCII) { + const char *bytes = subtext.tag == TEXT_ASCII ? subtext.ascii : subtext.short_ascii; + int64_t grapheme = 0; + if (leftover) { + tmp.chunks[0] = leftover; + tmp.chunks[1] = (int32_t)bytes[0]; + siphashadd64bits(&sh, tmp.whole); + grapheme += 1; + } + for (; grapheme + 1 < subtext.length; grapheme += 2) { + tmp.chunks[0] = (int32_t)bytes[grapheme]; + tmp.chunks[1] = (int32_t)bytes[grapheme+1]; + siphashadd64bits(&sh, tmp.whole); + } + leftover = grapheme < subtext.length ? (int32_t)bytes[grapheme] : 0; + } else if (subtext.tag == TEXT_SHORT_GRAPHEMES) { + if (leftover) { + tmp.chunks[0] = leftover; + tmp.chunks[1] = subtext.short_graphemes[0]; + siphashadd64bits(&sh, tmp.whole); + leftover = subtext.length > 1 ? subtext.short_graphemes[1] : 0; + } else if (subtext.length == 1) { + leftover = subtext.short_graphemes[0]; + } else { + tmp.chunks[0] = subtext.short_graphemes[0]; + tmp.chunks[1] = subtext.short_graphemes[1]; + siphashadd64bits(&sh, tmp.whole); + } + } else if (subtext.tag == TEXT_GRAPHEMES) { + int32_t *graphemes = subtext.graphemes; + int64_t grapheme = 0; + if (leftover) { + tmp.chunks[0] = leftover; + tmp.chunks[1] = graphemes[0]; + siphashadd64bits(&sh, tmp.whole); + grapheme += 1; + } + for (; grapheme + 1 < subtext.length; grapheme += 2) { + tmp.chunks[0] = graphemes[grapheme]; + tmp.chunks[1] = graphemes[grapheme+1]; + siphashadd64bits(&sh, tmp.whole); + } + leftover = grapheme < subtext.length ? graphemes[grapheme] : 0; + } + + to_hash -= text->subtexts[sub_i].length; + + ++sub_i; + } + + text->hash = siphashfinish_last_part(&sh, leftover); + break; + } + default: errx(1, "Invalid text"); + } + + if (text->hash == 0) + text->hash = 1; + + return text->hash; } -public bool Text$has(CORD str, CORD target, Where_t where) +int32_t _next_grapheme(Text_t text, iteration_state_t *state, int64_t index) { - if (!target) return true; - if (!str) return false; + switch (text.tag) { + case TEXT_ASCII: return index < text.length ? (int32_t)text.ascii[index] : 0; + case TEXT_SHORT_ASCII: return index < text.length ? (int32_t)text.short_ascii[index] : 0; + case TEXT_GRAPHEMES: return index < text.length ? text.graphemes[index] : 0; + case TEXT_SHORT_GRAPHEMES: return index < text.length ? text.short_graphemes[index] : 0; + case TEXT_SUBTEXT: { + iteration_state_t backup_state = {0, 0}; + if (!state) state = &backup_state; - uint8_t str_buf[128] = {0}; size_t str_norm_len = sizeof(str_buf); - uint8_t *str_normalized = _normalize(str, str_buf, &str_norm_len); + if (index < 0 || index >= text.length) + return 0; - uint8_t target_buf[128] = {0}; size_t target_norm_len = sizeof(target_buf); - uint8_t *target_normalized = _normalize(target, target_buf, &target_norm_len); + while (index < state->sum_of_previous_subtexts && state->subtext > 0) { + state->sum_of_previous_subtexts -= text.subtexts[state->subtext].length; + state->subtext -= 1; + } + for (;;) { + if (index < state->sum_of_previous_subtexts + text.subtexts[state->subtext].length) + return _next_grapheme(text.subtexts[state->subtext], NULL, index); + state->sum_of_previous_subtexts += text.subtexts[state->subtext].length; + state->subtext += 1; + } + return 0; + } + default: errx(1, "Invalid text"); + } + return 0; +} - if (target_norm_len > str_norm_len) return false; +int32_t get_grapheme(Text_t text, int64_t index) +{ + iteration_state_t state = {0, 0}; + return _next_grapheme(text, &state, index); +} - bool ret; - if (where.tag == $tag$Where$Start) { - ret = (u8_strncmp(str_normalized, target_normalized, target_norm_len-1) == 0); - } else if (where.tag == $tag$Where$End) { - ret = (u8_strcmp(str_normalized + str_norm_len - target_norm_len, target_normalized) == 0); - } else { - assert(where.tag == $tag$Where$Anywhere); - ret = (u8_strstr(str_normalized, target_normalized) != NULL); +int32_t Text$compare(const Text_t *a, const Text_t *b) +{ + int64_t len = MAX(a->length, b->length); + iteration_state_t a_state = {0, 0}, b_state = {0, 0}; + for (int64_t i = 0; i < len; i++) { + int32_t ai = _next_grapheme(*a, &a_state, i); + int32_t bi = _next_grapheme(*b, &b_state, i); + if (ai == bi) continue; + int32_t cmp; + if (ai > 0 && bi > 0) { + cmp = u32_cmp((uint32_t*)&ai, (uint32_t*)&bi, 1); + } else if (ai > 0) { + cmp = u32_cmp2( + (uint32_t*)&ai, 1, + synthetic_graphemes[-bi-1].codepoints, + synthetic_graphemes[-bi-1].num_codepoints); + } else if (bi > 0) { + cmp = u32_cmp2( + synthetic_graphemes[-ai-1].codepoints, + synthetic_graphemes[-ai-1].num_codepoints, + (uint32_t*)&bi, 1); + } else { + cmp = u32_cmp2( + synthetic_graphemes[-ai-1].codepoints, + synthetic_graphemes[-ai-1].num_codepoints, + synthetic_graphemes[-bi-1].codepoints, + synthetic_graphemes[-bi-1].num_codepoints); + } + if (cmp != 0) return cmp; } + return 0; +} - if (str_normalized != str_buf) free(str_normalized); - if (target_normalized != target_buf) free(target_normalized); - return ret; +public bool Text$equal(const Text_t *a, const Text_t *b) +{ + if (a->length != b->length || (a->hash != 0 && b->hash != 0 && a->hash != b->hash)) + return false; + int64_t len = a->length; + iteration_state_t a_state = {0, 0}, b_state = {0, 0}; + for (int64_t i = 0; i < len; i++) { + int32_t ai = _next_grapheme(*a, &a_state, i); + int32_t bi = _next_grapheme(*b, &b_state, i); + if (ai != bi) return false; + } + return true; } -public CORD Text$without(CORD str, CORD target, Where_t where) +public bool Text$equal_ignoring_case(Text_t a, Text_t b) { - if (!str || !target) return str; + if (a.length != b.length) + return false; + int64_t len = a.length; + iteration_state_t a_state = {0, 0}, b_state = {0, 0}; + const char *language = uc_locale_language(); + for (int64_t i = 0; i < len; i++) { + int32_t ai = _next_grapheme(a, &a_state, i); + int32_t bi = _next_grapheme(b, &b_state, i); + if (ai != bi) { + const uint32_t *a_codepoints = ai >= 0 ? (uint32_t*)&ai : synthetic_graphemes[-ai-1].codepoints; + size_t a_len = ai >= 0 ? 1 : synthetic_graphemes[-ai-1].num_codepoints; - size_t target_len = CORD_len(target); - size_t str_len = CORD_len(str); - if (where.tag == $tag$Where$Start) { - if (CORD_ncmp(str, 0, target, 0, target_len) == 0) - return CORD_substr(str, target_len, str_len - target_len); - return str; - } else if (where.tag == $tag$Where$End) { - if (CORD_ncmp(str, str_len-target_len, target, 0, target_len) == 0) - return CORD_substr(str, 0, str_len - target_len); - return str; - } else { - CORD ret = CORD_EMPTY; - size_t i = 0; - for (;;) { - size_t match = CORD_str(str, i, target); - if (match == CORD_NOT_FOUND) { - if (i == 0) return str; // No matches! - ret = CORD_cat(ret, CORD_substr(str, i, str_len)); - break; - } - ret = CORD_cat(ret, CORD_substr(str, i, (match-i))); - i = match + target_len; + const uint32_t *b_codepoints = bi >= 0 ? (uint32_t*)&bi : synthetic_graphemes[-bi-1].codepoints; + size_t b_len = bi >= 0 ? 1 : synthetic_graphemes[-bi-1].num_codepoints; + + int cmp; + (void)u32_casecmp(a_codepoints, a_len, b_codepoints, b_len, language, UNINORM_NFC, &cmp); + if (cmp != 0) + return false; } - return ret; } + return true; } -public CORD Text$trimmed(CORD str, CORD skip, Where_t where) +public Text_t Text$upper(Text_t text) { - if (!str || !skip) return str; - const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(str); - const uint8_t *uskip = (const uint8_t*)CORD_to_const_char_star(skip); - // TODO: implement proper reverse iteration with u8_prev() - if (where.tag == $tag$Where$Start) { - size_t span = u8_strspn(ustr, uskip); - return (CORD)ustr + span; - } else if (where.tag == $tag$Where$End) { - size_t len = u8_strlen(ustr); - const uint8_t *back = ustr + len; - size_t back_span = 0; - while (back - back_span > ustr && u8_strspn(back-back_span-1, uskip) > back_span) - ++back_span; - return CORD_substr((CORD)ustr, 0, len - back_span); - } else { - size_t span = u8_strspn(ustr, uskip); - size_t len = u8_strlen(ustr); - const uint8_t *back = ustr + len; - size_t back_span = 0; - while (back - back_span > ustr + span && u8_strspn(back-back_span-1, uskip) > back_span) - ++back_span; - return CORD_substr((CORD)(ustr + span), 0, len - span - back_span); + size_t length; + uint32_t *codepoints = text_to_u32(text, &length); + const char *language = uc_locale_language(); + uint32_t buf[128]; + size_t out_len; + uint32_t *upper = u32_toupper(codepoints, length, language, UNINORM_NFC, buf, &out_len); + Text_t ret = text_from_u32(upper, out_len, false); + if (upper != buf) free(upper); + return ret; +} + +public Text_t Text$lower(Text_t text) +{ + size_t length; + uint32_t *codepoints = text_to_u32(text, &length); + const char *language = uc_locale_language(); + uint32_t buf[128]; + size_t out_len; + uint32_t *lower = u32_tolower(codepoints, length, language, UNINORM_NFC, buf, &out_len); + Text_t ret = text_from_u32(lower, out_len, false); + if (lower != codepoints) free(lower); + return ret; +} + +public Text_t Text$title(Text_t text) +{ + size_t length; + uint32_t *codepoints = text_to_u32(text, &length); + const char *language = uc_locale_language(); + uint32_t buf[128]; + size_t out_len; + uint32_t *title = u32_totitle(codepoints, length, language, UNINORM_NFC, buf, &out_len); + Text_t ret = text_from_u32(title, out_len, false); + if (title != codepoints) free(title); + return ret; +} + +static inline void skip_whitespace(Text_t text, int64_t *i) +{ + iteration_state_t state = {0, 0}; + while (*i < text.length) { + int32_t grapheme = _next_grapheme(text, &state, *i); + if (grapheme > 0 && !uc_is_property_white_space(grapheme)) + return; + *i += 1; } } -public find_result_t Text$find(CORD str, CORD pat) +static inline bool match_grapheme(Text_t text, int64_t *i, int32_t grapheme) { - if (!pat) return (find_result_t){.status=FIND_SUCCESS, .index=1}; - size_t pos = CORD_str(str, 0, pat); - return (pos == CORD_NOT_FOUND) ? (find_result_t){.status=FIND_FAILURE} : (find_result_t){.status=FIND_SUCCESS, .index=(int32_t)pos}; + if (*i < text.length && get_grapheme(text, *i) == grapheme) { + *i += 1; + return true; + } + return false; } -public CORD Text$replace(CORD text, CORD pat, CORD replacement, Int_t int_limit) +static inline bool match_str(Text_t text, int64_t *i, const char *str) { - if (!text || !pat) return text; - CORD ret = CORD_EMPTY; - size_t pos = 0, pat_len = CORD_len(pat); - int64_t limit = Int_to_Int64(int_limit, false); - for (size_t found; limit != 0 && (found=CORD_str(text, pos, pat)) != CORD_NOT_FOUND; --limit) { - ret = CORD_all(ret, CORD_substr(text, pos, found - pos), replacement); - pos = found + pat_len; + iteration_state_t state = {0, 0}; + int64_t matched = 0; + while (matched[str]) { + if (*i + matched >= text.length || _next_grapheme(text, &state, *i + matched) != str[matched]) + return false; + matched += 1; } - size_t str_len = CORD_len(text); - return CORD_cat(ret, CORD_substr(text, pos, str_len - pos)); + *i += matched; + return true; } -public array_t Text$split(CORD str, CORD split) +static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop) { - if (!str) return (array_t){.data=GC_MALLOC(sizeof(CORD)), .atomic=1, .length=1, .stride=sizeof(CORD)}; - array_t strings = {.stride=sizeof(CORD), .atomic=1}; + if (*i >= text.length) return false; + int32_t grapheme = get_grapheme(text, *i); + if (grapheme < 0) // TODO: check every codepoint in the cluster? + grapheme = synthetic_graphemes[-grapheme-1].codepoints[0]; - const uint8_t *ustr = (uint8_t*)CORD_to_const_char_star(str); - const uint8_t *usplit = (uint8_t*)CORD_to_const_char_star(split); - for (int64_t i = 0; ; ) { - size_t non_split = u8_strcspn(ustr + i, usplit); - CORD chunk = CORD_substr((CORD)ustr, i, non_split); - Array$insert(&strings, &chunk, I(0), sizeof(CORD)); + if (uc_is_property(grapheme, prop)) { + *i += 1; + return true; + } + return false; +} - i += non_split; +static int64_t parse_int(Text_t text, int64_t *i) +{ + iteration_state_t state = {0, 0}; + int64_t value = 0; + for (;; *i += 1) { + int32_t grapheme = _next_grapheme(text, &state, *i); + if (grapheme < 0) + grapheme = synthetic_graphemes[-grapheme-1].codepoints[0]; + int digit = uc_digit_value(grapheme); + if (digit < 0) break; + if (value >= INT64_MAX/10) break; + value = 10*value + digit; + } + return value; +} - size_t split_span = u8_strspn(ustr + i, usplit); - if (split_span == 0) break; - i += split_span; +const char *get_property_name(Text_t text, int64_t *i) +{ + skip_whitespace(text, i); + char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); + char *dest = name; + iteration_state_t state = {0, 0}; + while (*i < text.length) { + int32_t grapheme = _next_grapheme(text, &state, *i); + if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) { + *dest = (char)grapheme; + ++dest; + if (dest >= name + UNINAME_MAX - 1) + break; + } else if (dest == name && grapheme >= 0 && grapheme != ']') { + // Literal character escape: [..[] --> "LEFT SQUARE BRACKET" + name = unicode_character_name(grapheme, name); + *i += 1; + return name; + } else { + break; + } + *i += 1; } - return strings; + if (dest == name) return NULL; + *dest = '\0'; + return name; } -public CORD Text$join(CORD glue, array_t pieces) +#define EAT1(state, cond) ({\ + int32_t grapheme = _next_grapheme(text, state, text_index); \ + bool success = (cond); \ + if (success) text_index += 1; \ + success; }) + +#define EAT_MANY(state, cond) ({ int64_t n = 0; while (EAT1(state, cond)) { n += 1; } n; }) + +int64_t match_email(Text_t text, int64_t text_index) { - if (pieces.length == 0) return CORD_EMPTY; + // email = local "@" domain + // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii) + // domain = dns-label ("." dns-label)* + // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii) - CORD ret = CORD_EMPTY; - for (int64_t i = 0; i < pieces.length; i++) { - if (i > 0) ret = CORD_cat(ret, glue); - ret = CORD_cat(ret, *(CORD*)((void*)pieces.data + i*pieces.stride)); + iteration_state_t state = {0, 0}; + if (text_index > 0) { + int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1); + if (prev_codepoint < 0) + prev_codepoint = synthetic_graphemes[-prev_codepoint-1].codepoints[0]; + if (uc_is_property_alphabetic(prev_codepoint)) + return -1; } - return ret; + + int64_t start_index = text_index; + + // Local part: + int64_t local_len = 0; + static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~"; + while (EAT1(&state, (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) { + local_len += 1; + if (local_len > 64) return -1; + } + + if (!EAT1(&state, grapheme == '@')) + return -1; + + // Host + int64_t host_len = 0; + do { + int64_t label_len = 0; + while (EAT1(&state, (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) { + label_len += 1; + if (label_len > 63) return -1; + } + + if (label_len == 0) + return -1; + + host_len += label_len; + if (host_len > 255) + return -1; + host_len += 1; + } while (EAT1(&state, grapheme == '.')); + + return text_index - start_index; } -public array_t Text$clusters(CORD text) +int64_t match_ipv6(Text_t text, int64_t text_index) { - array_t clusters = {.atomic=1}; - uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf); - uint8_t *normalized = _normalize(text, buf, &norm_len); - const uint8_t *end = normalized + strlen((char*)normalized); - for (const uint8_t *pos = normalized; pos != end; ) { - const uint8_t *next = u8_grapheme_next(pos, end); - size_t len = (size_t)(next - pos); - char cluster_buf[len+1]; - strlcpy(cluster_buf, (char*)pos, len+1); - CORD cluster = CORD_from_char_star(cluster_buf); - Array$insert(&clusters, &cluster, I(0), sizeof(CORD)); - pos = next; + iteration_state_t state = {0, 0}; + if (text_index > 0) { + int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1); + if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':')) + return -1; } + int64_t start_index = text_index; + const int NUM_CLUSTERS = 8; + bool double_colon_used = false; + for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { + for (int digits = 0; digits < 4; digits++) { + if (!EAT1(&state, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) + break; + } + if (EAT1(&state, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) + return -1; // Too many digits + + if (cluster == NUM_CLUSTERS-1) { + break; + } else if (!EAT1(&state, grapheme == ':')) { + if (double_colon_used) + break; + return -1; + } - if (normalized != buf) free(normalized); - return clusters; + if (EAT1(&state, grapheme == ':')) { + if (double_colon_used) + return -1; + double_colon_used = true; + } + } + return text_index - start_index; } -public array_t Text$codepoints(CORD text) +static int64_t match_ipv4(Text_t text, int64_t text_index) { - uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf); - uint8_t *normalized = _normalize(text, norm_buf, &norm_len); + iteration_state_t state = {0, 0}; + if (text_index > 0) { + int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1); + if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.')) + return -1; + } + int64_t start_index = text_index; - uint32_t codepoint_buf[128] = {0}; - size_t codepoint_len = sizeof(codepoint_buf); - uint32_t *codepoints = u8_to_u32(normalized, norm_len-1, codepoint_buf, &codepoint_len); - array_t ret = { - .length=codepoint_len, - .data=memcpy(GC_MALLOC_ATOMIC(sizeof(int32_t[codepoint_len])), codepoints, sizeof(int32_t[codepoint_len])), - .stride=sizeof(int32_t), - .atomic=1, - }; + const int NUM_CLUSTERS = 4; + for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { + for (int digits = 0; digits < 3; digits++) { + if (!EAT1(&state, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) { + if (digits == 0) return -1; + break; + } + } - if (normalized != norm_buf) free(normalized); - if (codepoints != codepoint_buf) free(codepoints); - return ret; + if (EAT1(&state, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) + return -1; // Too many digits + + if (cluster == NUM_CLUSTERS-1) + break; + else if (!EAT1(&state, grapheme == '.')) + return -1; + } + return (text_index - start_index); } -public array_t Text$bytes(CORD text) +int64_t match_uri(Text_t text, int64_t text_index) { - uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf); - uint8_t *normalized = _normalize(text, norm_buf, &norm_len); + // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment] + // scheme = [a-zA-Z] [a-zA-Z0-9+.-] + // authority = [userinfo "@"] host [":" port] - --norm_len; // NUL byte - array_t ret = { - .length=norm_len, - .data=memcpy(GC_MALLOC_ATOMIC(sizeof(uint8_t[norm_len])), normalized, sizeof(uint8_t[norm_len])), - .stride=sizeof(uint8_t), - .atomic=1, - }; + iteration_state_t state = {0, 0}; + if (text_index > 0) { + int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1); + if (prev_codepoint < 0) + prev_codepoint = synthetic_graphemes[-prev_codepoint-1].codepoints[0]; + if (uc_is_property_alphabetic(prev_codepoint)) + return -1; + } - if (normalized != norm_buf) free(normalized); - return ret; + int64_t start_index = text_index; + + // Scheme: + if (!EAT1(&state, isalpha(grapheme))) + return -1; + + EAT_MANY(&state, !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-')); + + if (text_index == start_index) + return -1; + + if (!match_grapheme(text, &text_index, ':')) + return -1; + + // Authority: + if (match_str(text, &text_index, "//")) { + int64_t authority_start = text_index; + // Username or host: + static const char *forbidden = "#?:@ \t\r\n<>[]{}\\^|\"`/"; + if (EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0) + return -1; + + if (EAT1(&state, grapheme == '@')) { + // Found a username, now get a host: + if (EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0) + return -1; + } else { + int64_t ip = authority_start; + int64_t ipv4_len = match_ipv4(text, ip); + if (ipv4_len > 0) { + ip += ipv4_len; + } else if (match_grapheme(text, &ip, '[')) { + ip += match_ipv6(text, ip); + if (ip > authority_start + 1 && match_grapheme(text, &ip, ']')) + text_index = ip; + } + } + + // Port: + if (EAT1(&state, grapheme == ':')) { + if (EAT_MANY(&state, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0) + return -1; + } + if (!EAT1(&state, grapheme == '/')) + return (text_index - start_index); // No path + } else { + // Optional path root: + EAT1(&state, grapheme == '/'); + } + + // Path: + static const char *non_path = " \"#?<>[]{}\\^`|"; + EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme)); + + if (EAT1(&state, grapheme == '?')) { // Query + static const char *non_query = " \"#<>[]{}\\^`|"; + EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme)); + } + + if (EAT1(&state, grapheme == '#')) { // Fragment + static const char *non_fragment = " \"#<>[]{}\\^`|"; + EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme)); + } + return text_index - start_index; } -public Int_t Text$num_clusters(CORD text) +int64_t match(Text_t text, Text_t pattern, int64_t text_index, int64_t pattern_index) { - const uint8_t *ustr = (const uint8_t*)CORD_to_const_char_star(text); - int64_t num_clusters = 0; - const uint8_t *end = ustr + u8_strlen(ustr); - for (const uint8_t *pos = ustr; pos != end; ) { - const uint8_t *next = u8_grapheme_next(pos, end); - ++num_clusters; - pos = next; + if (pattern_index >= pattern.length) return 0; + int64_t start_index = text_index; + iteration_state_t pattern_state = {0, 0}, text_state = {0, 0}; + while (pattern_index < pattern.length) { + int64_t old_pat_index = pattern_index; + if (match_str(pattern, &pattern_index, "[..")) { + skip_whitespace(pattern, &pattern_index); + int64_t min, max; + if (uc_is_digit(_next_grapheme(pattern, &pattern_state, pattern_index))) { + min = parse_int(pattern, &pattern_index); + skip_whitespace(pattern, &pattern_index); + if (match_grapheme(pattern, &pattern_index, '+')) { + max = INT64_MAX; + } else if (match_grapheme(pattern, &pattern_index, '-')) { + max = parse_int(pattern, &pattern_index); + } else { + max = min; + } + } else { + min = 1, max = INT64_MAX; + } + + skip_whitespace(pattern, &pattern_index); + bool want_to_match = !match_grapheme(pattern, &pattern_index, '!'); + const char *prop_name = get_property_name(pattern, &pattern_index); + + skip_whitespace(pattern, &pattern_index); + if (!match_grapheme(pattern, &pattern_index, ']')) + errx(1, "Missing closing ']' in pattern: \"%T\"", &pattern); + + int64_t before_group = text_index; + bool any = false; + uc_property_t prop; + int32_t specific_codepoint = UNINAME_INVALID; + +#define FAIL() ({ if (min < 1) { text_index = before_group; continue; } else { return -1; } }) + if (prop_name) { + switch (tolower(prop_name[0])) { + case 'd': + if (strcasecmp(prop_name, "digit") == 0) { + prop = UC_PROPERTY_DECIMAL_DIGIT; + goto got_prop; + } + break; + case 'e': + if (strcasecmp(prop_name, "end") == 0) { + if (text_index != text.length) + FAIL(); + continue; + } else if (prop_name && strcasecmp(prop_name, "email") == 0) { + int64_t len = match_email(text, text_index); + if (len < 0) + FAIL(); + text_index += len; + continue; + } else if (prop_name && strcasecmp(prop_name, "emoji") == 0) { + prop = UC_PROPERTY_EMOJI; + goto got_prop; + } + break; + case 'i': + if (prop_name && strcasecmp(prop_name, "id") == 0) { + if (!EAT1(&text_state, uc_is_property(grapheme, UC_PROPERTY_XID_START))) + FAIL(); + EAT_MANY(&text_state, uc_is_property(grapheme, UC_PROPERTY_XID_CONTINUE)); + continue; + } else if (prop_name && strcasecmp(prop_name, "ipv4") == 0) { + int64_t len = match_ipv4(text, text_index); + if (len < 0) + FAIL(); + text_index += len; + continue; + } else if (prop_name && strcasecmp(prop_name, "ipv6") == 0) { + int64_t len = match_ipv6(text, text_index); + if (len < 0) + FAIL(); + text_index += len; + continue; + } else if (prop_name && strcasecmp(prop_name, "ip") == 0) { + int64_t len = match_ipv6(text, text_index); + if (len < 0) + len = match_ipv4(text, text_index); + if (len < 0) + FAIL(); + text_index += len; + continue; + } + break; + case 's': + if (strcasecmp(prop_name, "start") == 0) { + if (text_index != 0) return -1; + continue; + } + break; + case 'u': + if (prop_name && strcasecmp(prop_name, "uri") == 0) { + int64_t len = match_uri(text, text_index); + if (len < 0) + FAIL(); + text_index += len; + continue; + } else if (prop_name && strcasecmp(prop_name, "url") == 0) { + int64_t lookahead = text_index; + if (!(match_str(text, &lookahead, "https:") + || match_str(text, &lookahead, "http:") + || match_str(text, &lookahead, "ftp:") + || match_str(text, &lookahead, "wss:") + || match_str(text, &lookahead, "ws:"))) + FAIL(); + + int64_t len = match_uri(text, text_index); + if (len < 0) + FAIL(); + text_index += len; + continue; + } + break; + } + + prop = uc_property_byname(prop_name); + if (!uc_property_is_valid(prop)) { + specific_codepoint = unicode_name_character(prop_name); + if (specific_codepoint == UNINAME_INVALID) + errx(1, "Not a valid property or character name: %s", prop_name); + } + } else { + any = true; + prop = UC_PROPERTY_PRIVATE_USE; + } + got_prop:; + + if (min == 0 && pattern_index < pattern.length) { + int64_t match_len = match(text, pattern, text_index, pattern_index); + if (match_len >= 0) + return (text_index - start_index) + match_len; + } + + for (int64_t count = 0; count < max; ) { + int32_t grapheme = _next_grapheme(text, &text_state, text_index); + if (grapheme < 0) + grapheme = synthetic_graphemes[-grapheme-1].codepoints[0]; + + bool success; + if (any) + success = true; + else if (specific_codepoint != UNINAME_INVALID) + success = (grapheme == specific_codepoint); + else + success = uc_is_property(grapheme, prop); + + if (success != want_to_match) { + if (count < min) return -1; + else break; + } + + text_index += 1; + count += 1; + + if (count >= min) { + if (pattern_index < pattern.length) { + int64_t match_len = match(text, pattern, text_index, pattern_index); + if (match_len >= 0) { + return (text_index - start_index) + match_len; + } + } else if (text_index >= text.length) { + break; + } + } + } + } else if (uc_is_property(_next_grapheme(pattern, &pattern_state, pattern_index), UC_PROPERTY_QUOTATION_MARK) + && (pattern_index += 1, match_grapheme(pattern, &pattern_index, '?'))) { + // Quotation: "?", '?', etc + int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2); + if (!match_grapheme(text, &text_index, open)) return -1; + int32_t close = open; + uc_mirror_char(open, (uint32_t*)&close); + if (!match_grapheme(pattern, &pattern_index, close)) + errx(1, "I expected a closing brace"); + while (text_index < text.length) { + int32_t c = _next_grapheme(text, &text_state, text_index); + if (c == close) + return (text_index - start_index); + + if (c == '\\' && text_index < text.length) { + text_index += 2; + } else { + text_index += 1; + } + } + return -1; + } else if (uc_is_property(_next_grapheme(pattern, &pattern_state, pattern_index), UC_PROPERTY_PAIRED_PUNCTUATION) + && (pattern_index += 1, match_grapheme(pattern, &pattern_index, '?'))) { + // Nested punctuation: (?), [?], etc + int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2); + if (!match_grapheme(text, &text_index, open)) return -1; + int32_t close = open; + uc_mirror_char(open, (uint32_t*)&close); + if (!match_grapheme(pattern, &pattern_index, close)) + errx(1, "I expected a closing brace"); + int64_t depth = 1; + for (; depth > 0 && text_index < text.length; ++text_index) { + int32_t c = _next_grapheme(text, &text_state, text_index); + if (c == open) + depth += 1; + else if (c == close) + depth -= 1; + } + if (depth > 0) return -1; + } else { + // Plain character: + pattern_index = old_pat_index; + int32_t pat_grapheme = _next_grapheme(pattern, &pattern_state, pattern_index); + + if (pattern_index == 0 && text_index > 0) { + int32_t pat_codepoint = pat_grapheme; + if (pat_codepoint < 0) + pat_codepoint = synthetic_graphemes[-pat_codepoint-1].codepoints[0]; + + int32_t prev_codepoint = _next_grapheme(text, &text_state, text_index - 1); + if (prev_codepoint < 0) + prev_codepoint = synthetic_graphemes[-prev_codepoint-1].codepoints[0]; + if (uc_is_property_alphabetic(pat_codepoint) && uc_is_property_alphabetic(prev_codepoint)) + return -1; + } + + int32_t text_grapheme = _next_grapheme(text, &text_state, text_index); + if (pat_grapheme != text_grapheme) + return -1; + + pattern_index += 1; + text_index += 1; + + if (pattern_index == pattern.length && text_index < text.length) { + int32_t pat_codepoint = pat_grapheme; + if (pat_codepoint < 0) + pat_codepoint = synthetic_graphemes[-pat_codepoint-1].codepoints[0]; + + int32_t next_codepoint = _next_grapheme(text, &text_state, text_index); + if (next_codepoint < 0) + next_codepoint = synthetic_graphemes[-next_codepoint-1].codepoints[0]; + if (uc_is_property_alphabetic(pat_codepoint) && uc_is_property_alphabetic(next_codepoint)) + return -1; + } + } + } + if (text_index >= text.length && pattern_index < pattern.length) + return -1; + return (text_index - start_index); +} + +#undef EAT1 +#undef EAT_MANY + +public Int_t Text$find(Text_t text, Text_t pattern, Int_t from_index, int64_t *match_length) +{ + int32_t first = get_grapheme(pattern, 0); + bool find_first = (first != '[' + && !uc_is_property(first, UC_PROPERTY_QUOTATION_MARK) + && !uc_is_property(first, UC_PROPERTY_PAIRED_PUNCTUATION)); + + iteration_state_t text_state = {0, 0}; + for (int64_t i = Int_to_Int64(from_index, false)-1; i < text.length; i++) { + // Optimization: quickly skip ahead to first char in pattern: + if (find_first) { + while (i < text.length && _next_grapheme(text, &text_state, i) != first) + ++i; + } + + int64_t m = match(text, pattern, i, 0); + if (m >= 0) { + if (match_length) + *match_length = m; + return I(i+1); + } } - return I(num_clusters); + if (match_length) + *match_length = -1; + return I(0); +} + +public int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n]) +{ + if (n < 1) return -1; + (void)info; + argtypes[0] = PA_POINTER; + sizes[0] = sizeof(Text_t*); + return 1; +} + +public int printf_text(FILE *stream, const struct printf_info *info, const void *const args[]) +{ + (void)info; + Text_t t = **(Text_t**)args[0]; + return Text$print(stream, t); } -public Int_t Text$num_codepoints(CORD text) +public Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info) { - uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf); - uint8_t *normalized = _normalize(text, buf, &norm_len); - int64_t num_codepoints = u8_mbsnlen(normalized, norm_len-1); - if (normalized != buf) free(normalized); - return I(num_codepoints); + (void)info; + if (!text) return Text$from_str("Text"); + return Text$quoted(*(Text_t*)text, colorize); } -public Int_t Text$num_bytes(CORD text) +public Text_t Text$quoted(Text_t text, bool colorize) { - uint8_t norm_buf[128] = {0}; size_t norm_len = sizeof(norm_buf); - uint8_t *normalized = _normalize(text, norm_buf, &norm_len); - --norm_len; // NUL byte - if (!normalized) errx(1, "Unicode normalization error!"); - if (normalized != norm_buf) free(normalized); - return I(norm_len); + // TODO: optimize for ASCII and short strings + array_t graphemes = {.atomic=1}; +#define add_char(c) Array$insert_value(&graphemes, (uint32_t)c, I_small(0), sizeof(uint32_t)) +#define add_str(s) ({ for (char *_c = s; *_c; ++_c) Array$insert_value(&graphemes, (uint32_t)*_c, I_small(0), sizeof(uint32_t)); }) + if (colorize) + add_str("\x1b[35m\""); + else + add_char('"'); + +#define add_escaped(str) ({ if (colorize) add_str("\x1b[34;1m"); add_char('\\'); add_str(str); if (colorize) add_str("\x1b[0;35m"); }) + iteration_state_t state = {0, 0}; + for (int64_t i = 0; i < text.length; i++) { + int32_t g = _next_grapheme(text, &state, i); + switch (g) { + case '\a': add_escaped("a"); break; + case '\b': add_escaped("b"); break; + case '\x1b': add_escaped("e"); break; + case '\f': add_escaped("f"); break; + case '\n': add_escaped("n"); break; + case '\r': add_escaped("r"); break; + case '\t': add_escaped("t"); break; + case '\v': add_escaped("v"); break; + case '"': add_escaped("\""); break; + case '\\': add_escaped("\\"); break; + case '\x00' ... '\x06': case '\x0E' ... '\x1A': + case '\x1C' ... '\x1F': case '\x7F' ... '\x7F': { + if (colorize) add_str("\x1b[34;1m"); + add_char('\\'); + add_char('x'); + char tmp[4]; + sprintf(tmp, "%02X", g); + add_str(tmp); + if (colorize) + add_str("\x1b[0;35m"); + break; + } + default: add_char(g); break; + } + } + + if (colorize) + add_str("\"\x1b[m"); + else + add_char('"'); + + return (Text_t){.length=graphemes.length, .tag=TEXT_GRAPHEMES, .graphemes=graphemes.data}; +#undef add_str +#undef add_char +#undef add_escaped } -public array_t Text$character_names(CORD text) +public Text_t Text$replace(Text_t text, Text_t pattern, Text_t replacement) { - array_t codepoints = Text$codepoints(text); - array_t ret = {.length=codepoints.length, .stride=sizeof(CORD), .data=GC_MALLOC(sizeof(CORD[codepoints.length]))}; - for (int64_t i = 0; i < codepoints.length; i++) { - char buf[UNINAME_MAX]; - unicode_character_name(*(ucs4_t*)(codepoints.data + codepoints.stride*i), buf); - *(CORD*)(ret.data + ret.stride*i) = CORD_from_char_star(buf); + Text_t ret = {.length=0}; + + Int_t i = I_small(0); + for (;;) { + int64_t len; + Int_t found = Text$find(text, pattern, i, &len); + if (found.small == I_small(0).small) break; + if (Int$compare(&found, &i, &$Text) > 0) { + ret = Text$concat( + ret, + Text$slice(text, i, Int$minus(found, I_small(1))), + replacement + ); + } else { + ret = concat2(ret, replacement); + } + } + if (Int_to_Int64(i, false) <= text.length) { + ret = concat2(ret, Text$slice(text, i, Int64_to_Int(text.length))); } return ret; } -public CORD Text$read_line(CORD prompt) +public Text_t Text$format(const char *fmt, ...) { - char *line = readline(CORD_to_const_char_star(prompt)); - if (!line) return CORD_EMPTY; - CORD ret = CORD_from_char_star(line); - free(line); + va_list args; + va_start(args, fmt); + + char buf[8]; + int len = vsnprintf(buf, sizeof(buf), fmt, args); + Text_t ret; + if (len <= (int)sizeof(buf)) { + ret = (Text_t){ + .length=len, + .tag = TEXT_SHORT_ASCII, + }; + for (int i = 0; i < len; i++) + ret.short_ascii[i] = buf[i]; + } else { + char *str = GC_MALLOC_ATOMIC(len); + vsnprintf(str, len, fmt, args); + ret = Text$from_str(str); + } + va_end(args); return ret; } public const TypeInfo $Text = { - .size=sizeof(CORD), - .align=__alignof__(CORD), + .size=sizeof(Text_t), + .align=__alignof__(Text_t), .tag=TextInfo, .TextInfo={.lang="Text"}, }; |
