tomo/builtins/text.c

1884 lines
66 KiB
C
Raw Normal View History

2024-03-18 09:57:49 -07:00
// Type info and methods for Text datatype, which uses the Boehm "cord" library
// and libunistring
2024-09-02 15:47:39 -07:00
2024-02-04 18:13:50 -08:00
#include <assert.h>
2024-02-16 10:29:02 -08:00
#include <ctype.h>
#include <err.h>
2024-02-04 18:13:50 -08:00
#include <gc.h>
#include <gmp.h>
2024-02-16 10:29:02 -08:00
#include <limits.h>
2024-09-02 15:47:39 -07:00
#include <printf.h>
#include <readline/history.h>
#include <readline/readline.h>
2024-02-04 18:13:50 -08:00
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/param.h>
2024-09-02 15:47:39 -07:00
2024-02-16 10:29:02 -08:00
#include <unicase.h>
2024-09-02 15:47:39 -07:00
#include <unictype.h>
#include <unigbrk.h>
#include <uniname.h>
#include <uninorm.h>
2024-09-02 15:47:39 -07:00
#include <unistd.h>
#include <unistdio.h>
#include <unistr.h>
2024-02-04 18:13:50 -08:00
#include "array.h"
#include "functions.h"
#include "integers.h"
2024-03-03 15:15:45 -08:00
#include "text.h"
#include "types.h"
2024-02-04 18:13:50 -08:00
2024-09-02 17:22:13 -07:00
#include "siphash.c"
// One entry of the synthetic grapheme table: a grapheme cluster that spans
// more than one codepoint, kept in both UTF-32 and UTF-8 form.
typedef struct {
    int64_t num_codepoints;      // Number of entries in `codepoints`
    const uint32_t *codepoints;  // The cluster's UTF-32 codepoints
    const uint8_t *utf8;         // NUL-terminated UTF-8 encoding of the cluster
} synthetic_grapheme_t;
// Fixed capacity of the synthetic grapheme table
#define MAX_SYNTHETIC_GRAPHEMES 1024

// Global table of synthetic graphemes; only the first
// `num_synthetic_graphemes` entries are in use.
static synthetic_grapheme_t synthetic_graphemes[MAX_SYNTHETIC_GRAPHEMES] = {0};
static int32_t num_synthetic_graphemes = 0;

static int32_t get_grapheme(Text_t text, int64_t index);
typedef struct {
int64_t subtext, sum_of_previous_subtexts;
} iteration_state_t;
2024-02-04 18:13:50 -08:00
2024-09-02 15:47:39 -07:00
static int32_t _next_grapheme(Text_t text, iteration_state_t *state, int64_t index);
2024-09-02 17:22:13 -07:00
int32_t find_synthetic_grapheme(const uint32_t *codepoints, int64_t len)
{
2024-09-02 15:47:39 -07:00
int32_t lo = 0, hi = num_synthetic_graphemes;
while (lo <= hi) {
int32_t mid = (lo + hi) / 2;
int32_t cmp = (synthetic_graphemes[mid].num_codepoints > len) - (synthetic_graphemes[mid].num_codepoints < len);
if (cmp == 0)
cmp = memcmp(synthetic_graphemes[mid].codepoints, codepoints, sizeof(uint32_t[len]));
if (cmp == 0)
return mid;
else if (cmp < 0)
lo = mid + 1;
else if (cmp > 0)
hi = mid - 1;
}
return hi;
}
2024-09-02 17:22:13 -07:00
// Look up the synthetic grapheme for a multi-codepoint cluster, registering
// it in the global table if it is not there yet.
//
// codepoints/len: the cluster's UTF-32 codepoints and how many there are.
// Returns the grapheme's ID, which is always negative: -(table index + 1).
// Fails when the table already holds MAX_SYNTHETIC_GRAPHEMES entries or the
// cluster cannot be encoded as UTF-8.
//
// NOTE(review): inserting into the middle of the table moves every later
// entry up by one slot, so IDs that were handed out earlier may afterwards
// refer to a different cluster -- confirm whether callers need stable IDs.
int32_t get_synthetic_grapheme(const uint32_t *codepoints, int64_t len)
{
    // `len` counts uint32_t codepoints; every comparison and copy must use bytes.
    size_t num_bytes = (size_t)len * sizeof(uint32_t);

    int32_t index = find_synthetic_grapheme(codepoints, len);
    if (index >= 0
        && index < num_synthetic_graphemes
        && synthetic_graphemes[index].num_codepoints == len
        && memcmp(synthetic_graphemes[index].codepoints, codepoints, num_bytes) == 0) {
        return -(index+1);
    }

    if (index < 0) index = 0;
    if (index > num_synthetic_graphemes) index = num_synthetic_graphemes;

    if (num_synthetic_graphemes >= MAX_SYNTHETIC_GRAPHEMES)
        fail("Too many synthetic graphemes!");

    // Build both encodings before the table is modified.
    uint32_t *buf = GC_MALLOC_ATOMIC(num_bytes);
    memcpy(buf, codepoints, num_bytes);

    size_t u8_len = 0;
    uint8_t *u8 = u32_to_u8(codepoints, (size_t)len, NULL, &u8_len);
    if (!u8)
        fail("Could not encode a synthetic grapheme as UTF-8!");
    uint8_t *gc_u8 = GC_MALLOC_ATOMIC(u8_len + 1);
    assert(gc_u8);
    memcpy(gc_u8, u8, u8_len);
    gc_u8[u8_len] = '\0';
    free(u8); // u32_to_u8() allocated this with malloc()

    // Open up a slot at `index` by moving the tail of the table up by one.
    if (index < num_synthetic_graphemes) {
        memmove(&synthetic_graphemes[index + 1], &synthetic_graphemes[index],
                sizeof(synthetic_grapheme_t[num_synthetic_graphemes - index]));
    }

    synthetic_graphemes[index].codepoints = buf;
    synthetic_graphemes[index].num_codepoints = len;
    synthetic_graphemes[index].utf8 = gc_u8;
    ++num_synthetic_graphemes;
    return -(index+1);
}
2024-09-02 17:22:13 -07:00
// Count how many entries of `t.subtexts` it takes to cover `t.length`
// graphemes. A text that is not a TEXT_SUBTEXT counts as one piece.
static inline int64_t num_subtexts(Text_t t)
{
    if (t.tag != TEXT_SUBTEXT)
        return 1;

    int64_t count = 0;
    for (int64_t remaining = t.length; remaining > 0; ++count)
        remaining -= t.subtexts[count].length;
    return count;
}
// Debugging aid: write a tagged dump of a text's internal representation to
// `stream`, recursing into subtexts. Returns the sum of the fprintf() /
// Text$print() results.
int text_visualize(FILE *stream, Text_t t)
{
    // printf's `.*` precision must be an `int`, and `%lld` needs a
    // `long long`, so the length is converted explicitly for each use.
    long long length = (long long)t.length;
    int precision = (int)t.length;

    switch (t.tag) {
    case TEXT_SHORT_ASCII:
        return fprintf(stream, "<ascii length=%lld>%.*s</ascii>", length, precision, t.short_ascii);
    case TEXT_ASCII:
        return fprintf(stream, "<ascii length=%lld>%.*s</ascii>", length, precision, t.ascii);
    case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: {
        int printed = fprintf(stream, "<graphemes length=%lld>", length);
        printed += Text$print(stream, t);
        printed += fprintf(stream, "</graphemes>");
        return printed;
    }
    case TEXT_SUBTEXT: {
        int printed = fprintf(stream, "<text length=%lld>", length);
        int64_t to_print = t.length;
        for (int i = 0; to_print > 0; ++i) {
            printed += fprintf(stream, "\n  ");
            printed += text_visualize(stream, t.subtexts[i]);
            to_print -= t.subtexts[i].length;
            // An empty subtext makes no progress; stop instead of running on.
            if (t.subtexts[i].length == 0) break;
        }
        printed += fprintf(stream, "\n</text>");
        return printed;
    }
    default: return 0;
    }
}
// Write a text to `stream` as UTF-8. Returns the accumulated fwrite() counts.
public int Text$print(FILE *stream, Text_t t)
{
    if (t.length == 0) return 0;

    if (t.tag == TEXT_SHORT_ASCII)
        return fwrite(t.short_ascii, sizeof(char), t.length, stream);

    if (t.tag == TEXT_ASCII)
        return fwrite(t.ascii, sizeof(char), t.length, stream);

    if (t.tag == TEXT_GRAPHEMES || t.tag == TEXT_SHORT_GRAPHEMES) {
        int32_t *clusters = (t.tag == TEXT_SHORT_GRAPHEMES) ? t.short_graphemes : t.graphemes;
        int total = 0;
        for (int64_t g = 0; g < t.length; g++) {
            int32_t id = clusters[g];
            if (id < 0) {
                // Negative IDs index the synthetic grapheme table, which
                // already stores a NUL-terminated UTF-8 form.
                const uint8_t *utf8 = synthetic_graphemes[-id-1].utf8;
                assert(utf8);
                total += fwrite(utf8, sizeof(uint8_t), strlen((char*)utf8), stream);
                continue;
            }
            // Non-negative IDs are single codepoints; encode them on the fly.
            uint8_t scratch[8];
            size_t num_bytes = sizeof(scratch);
            uint8_t *encoded = u32_to_u8((uint32_t*)&id, 1, scratch, &num_bytes);
            total += fwrite(encoded, sizeof(char), num_bytes, stream);
            if (encoded != scratch) free(encoded);
        }
        return total;
    }

    if (t.tag == TEXT_SUBTEXT) {
        int total = 0;
        int64_t remaining = t.length;
        for (int s = 0; remaining > 0; ++s) {
            total += Text$print(stream, t.subtexts[s]);
            remaining -= t.subtexts[s].length;
        }
        return total;
    }

    return 0;
}
// Concatenate two texts. An empty operand yields the other operand as-is;
// otherwise the result is a TEXT_SUBTEXT whose pieces are the pieces of `a`
// followed by the pieces of `b`.
static Text_t concat2(Text_t a, Text_t b)
{
    if (a.length == 0) return b;
    if (b.length == 0) return a;

    // A subtext contributes each of its pieces; any other text is one piece.
    bool a_is_subtext = (a.tag == TEXT_SUBTEXT);
    bool b_is_subtext = (b.tag == TEXT_SUBTEXT);
    int64_t a_pieces = a_is_subtext ? num_subtexts(a) : 1;
    int64_t b_pieces = b_is_subtext ? num_subtexts(b) : 1;

    Text_t *pieces = GC_MALLOC(sizeof(Text_t[a_pieces + b_pieces]));

    if (a_is_subtext)
        memcpy(&pieces[0], a.subtexts, sizeof(Text_t[a_pieces]));
    else
        pieces[0] = a;

    if (b_is_subtext)
        memcpy(&pieces[a_pieces], b.subtexts, sizeof(Text_t[b_pieces]));
    else
        pieces[a_pieces] = b;

    return (Text_t){
        .length=a.length + b.length,
        .tag=TEXT_SUBTEXT,
        .subtexts=pieces,
    };
}
// Concatenate `n` texts into one. Zero, one and two items are handled
// directly; otherwise the result is a single flat TEXT_SUBTEXT holding every
// non-empty item (subtext items contribute their own pieces).
public Text_t Text$_concat(int n, Text_t items[n])
{
    if (n == 0) return (Text_t){.length=0};
    if (n == 1) return items[0];
    if (n == 2) return concat2(items[0], items[1]);

    // First pass: total grapheme length and number of pieces needed.
    int64_t len = 0, subtexts = 0;
    for (int i = 0; i < n; i++) {
        len += items[i].length;
        if (items[i].length > 0)
            subtexts += num_subtexts(items[i]);
    }

    // Nothing but empty items: return a plain empty text rather than a
    // subtext with no pieces.
    if (len == 0) return (Text_t){.length=0};

    Text_t ret = {
        .length=len,
        .tag=TEXT_SUBTEXT,
        // One slot per piece (not one per grapheme).
        .subtexts=GC_MALLOC(sizeof(Text_t[subtexts])),
    };

    // Second pass: copy the pieces over in order.
    int64_t sub_i = 0;
    for (int i = 0; i < n; i++) {
        if (items[i].length == 0)
            continue;

        if (items[i].tag == TEXT_SUBTEXT) {
            for (int64_t j = 0, remainder = items[i].length; remainder > 0; j++) {
                ret.subtexts[sub_i++] = items[i].subtexts[j];
                remainder -= items[i].subtexts[j].length;
            }
        } else {
            ret.subtexts[sub_i++] = items[i];
        }
    }
    return ret;
}
2024-09-02 15:47:39 -07:00
// Return the graphemes of `text` from `first_int` to `last_int` (1-based,
// inclusive). Negative indices count back from the end (-1 is the last
// grapheme). A `first` of 0 is an error; a `last` of 0 gives an empty text.
// Indices that fall outside the text are clamped to it. ASCII and grapheme
// slices point into the original storage instead of copying it.
public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int)
{
    int64_t first = Int_to_Int64(first_int, false);
    int64_t last = Int_to_Int64(last_int, false);
    if (first == 0) fail("Invalid index: 0");
    if (last == 0) return (Text_t){.length=0};

    if (first < 0) first = text.length + first + 1;
    if (last < 0) last = text.length + last + 1;

    // A negative `first` that reaches back past the start would otherwise
    // stay <= 0 and index memory before the text.
    if (first < 1) first = 1;
    if (last > text.length) last = text.length;

    if (first > text.length || last < first)
        return (Text_t){.length=0};

    if (first == 1 && last == text.length)
        return text;

    switch (text.tag) {
    case TEXT_SHORT_ASCII: {
        Text_t ret = (Text_t) {
            .tag=TEXT_SHORT_ASCII,
            .length=last - first + 1,
        };
        memcpy(ret.short_ascii, text.short_ascii + (first-1), ret.length);
        return ret;
    }
    case TEXT_ASCII: {
        Text_t ret = {
            .tag=TEXT_ASCII,
            .length=last - first + 1,
            .ascii=text.ascii + (first-1),
        };
        return ret;
    }
    case TEXT_SHORT_GRAPHEMES: {
        // The whole-text case returned above, so only a one-grapheme slice
        // of a two-grapheme text can get here.
        assert((first == 1 && last == 1) || (first == 2 && last == 2));
        Text_t ret = {
            .tag=TEXT_SHORT_GRAPHEMES,
            .length=1,
            .short_graphemes={text.short_graphemes[first-1]},
        };
        return ret;
    }
    case TEXT_GRAPHEMES: {
        Text_t ret = {
            .tag=TEXT_GRAPHEMES,
            .length=last - first + 1,
            .graphemes=text.graphemes + (first-1),
        };
        return ret;
    }
    case TEXT_SUBTEXT: {
        // Skip the pieces that end before `first`, rebasing both indices
        // onto the first piece that is actually needed.
        Text_t *subtexts = text.subtexts;
        while (first > subtexts[0].length) {
            first -= subtexts[0].length;
            last -= subtexts[0].length;
            ++subtexts;
        }

        // Count how many pieces the slice touches.
        int64_t needed_len = (last - first) + 1;
        int64_t num_pieces = 0;
        for (int64_t included = 0; included < needed_len; ) {
            if (included == 0)
                included += subtexts[num_pieces].length - first + 1;
            else
                included += subtexts[num_pieces].length;
            num_pieces += 1;
        }
        if (num_pieces == 1)
            return Text$slice(subtexts[0], Int64_to_Int(first), Int64_to_Int(last));

        Text_t ret = {
            .length=needed_len,
            .tag=TEXT_SUBTEXT,
            .subtexts=GC_MALLOC(sizeof(Text_t[num_pieces])),
        };
        for (int64_t i = 0; i < num_pieces; i++) {
            // `last` may run past this piece; the recursive call clamps it.
            ret.subtexts[i] = Text$slice(subtexts[i], Int64_to_Int(first), Int64_to_Int(last));
            first = 1;
            needed_len -= ret.subtexts[i].length;
            last = first + needed_len - 1;
        }
        return ret;
    }
    default: errx(1, "Invalid tag");
    }
}
2024-09-02 17:22:13 -07:00
Text_t text_from_u32(uint32_t *codepoints, int64_t num_codepoints, bool normalize)
2024-02-04 18:13:50 -08:00
{
2024-09-02 15:47:39 -07:00
uint32_t norm_buf[128];
if (normalize) {
size_t norm_length = sizeof(norm_buf)/sizeof(norm_buf[0]);
uint32_t *normalized = u32_normalize(UNINORM_NFC, codepoints, num_codepoints, norm_buf, &norm_length);
codepoints = normalized;
num_codepoints = norm_length;
}
char breaks[num_codepoints];
u32_grapheme_breaks(codepoints, num_codepoints, breaks);
Text_t ret = {
.length=0,
.tag=TEXT_SHORT_GRAPHEMES,
};
const uint32_t *src = codepoints;
2024-09-02 17:47:58 -07:00
int32_t *graphemes = ret.short_graphemes;
while (src < &codepoints[num_codepoints]) {
if (ret.tag == TEXT_SHORT_GRAPHEMES && ret.length + 1 > 2) {
graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[num_codepoints])); // May be a slight overallocation
2024-09-02 15:47:39 -07:00
graphemes[0] = ret.short_graphemes[0];
graphemes[1] = ret.short_graphemes[1];
ret.tag = TEXT_GRAPHEMES;
ret.graphemes = graphemes;
}
const uint32_t *next = u32_grapheme_next(src, &codepoints[num_codepoints]);
if (next == &src[1]) {
2024-09-02 17:47:58 -07:00
graphemes[ret.length] = (int32_t)*src;
2024-09-02 15:47:39 -07:00
} else {
// Synthetic grapheme
2024-09-02 17:47:58 -07:00
graphemes[ret.length] = get_synthetic_grapheme(src, next-src);
2024-09-02 15:47:39 -07:00
}
2024-09-02 17:47:58 -07:00
++ret.length;
2024-09-02 15:47:39 -07:00
src = next;
}
if (normalize && codepoints != norm_buf) free(codepoints);
return ret;
2024-02-04 18:13:50 -08:00
}
2024-09-02 15:47:39 -07:00
// Build a text from a NUL-terminated C string.
// An all-ASCII string of up to 8 bytes is copied inline; a longer all-ASCII
// string is referenced directly (not copied). Anything else is decoded as
// UTF-8 and NFC-normalized. Fails if the string cannot be decoded.
public Text_t Text$from_str(const char *str)
{
    int64_t ascii_span = 0;
    while (str[ascii_span] && isascii(str[ascii_span]))
        ascii_span++;

    if (str[ascii_span] == '\0') { // All ASCII
        Text_t ret = {.length=ascii_span};
        if (ascii_span <= 8) {
            ret.tag = TEXT_SHORT_ASCII;
            for (int64_t i = 0; i < ascii_span; i++)
                ret.short_ascii[i] = str[i];
        } else {
            ret.tag = TEXT_ASCII;
            ret.ascii = str;
        }
        return ret;
    }

    uint32_t buf[128];
    size_t length = sizeof(buf)/sizeof(buf[0]);
    uint32_t *codepoints = u8_to_u32((uint8_t*)str, ascii_span + strlen(str + ascii_span), buf, &length);
    // u8_to_u32() returns NULL (e.g. on malformed UTF-8); `length` is not
    // meaningful in that case, so the result must not be used.
    if (!codepoints)
        fail("Could not decode text as UTF-8!");
    Text_t ret = text_from_u32(codepoints, length, true);
    if (codepoints != buf) free(codepoints);
    return ret;
}
2024-09-02 17:22:13 -07:00
// Append the UTF-8 encoding of `text` to the growable buffer `*buf`.
// `*i` is the number of bytes already written and `*capacity` the buffer's
// size; both are updated, and `*buf` is reallocated when more room is needed.
static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i)
{
    if (text.tag == TEXT_SUBTEXT) {
        int64_t left = text.length;
        for (int64_t s = 0; left > 0; s++) {
            u8_buf_append(text.subtexts[s], buf, capacity, i);
            left -= text.subtexts[s].length;
        }
        return;
    }

    if (text.tag == TEXT_ASCII || text.tag == TEXT_SHORT_ASCII) {
        int64_t end = *i + text.length;
        if (end > *capacity) {
            *capacity = end + 1;
            *buf = GC_REALLOC(*buf, *capacity);
        }
        const char *src = (text.tag == TEXT_ASCII) ? text.ascii : text.short_ascii;
        memcpy(*buf + *i, src, text.length);
        *i = end;
        return;
    }

    if (text.tag == TEXT_GRAPHEMES || text.tag == TEXT_SHORT_GRAPHEMES) {
        const int32_t *ids = (text.tag == TEXT_GRAPHEMES) ? text.graphemes : text.short_graphemes;
        for (int64_t g = 0; g < text.length; g++) {
            // A negative ID refers to a stored cluster; anything else is a
            // single codepoint that can be encoded in place.
            const uint32_t *codepoints;
            int64_t num_codepoints;
            if (ids[g] < 0) {
                codepoints = synthetic_graphemes[-ids[g]-1].codepoints;
                num_codepoints = synthetic_graphemes[-ids[g]-1].num_codepoints;
            } else {
                codepoints = (uint32_t*)&ids[g];
                num_codepoints = 1;
            }

            uint8_t scratch[64];
            size_t num_bytes = sizeof(scratch);
            uint8_t *encoded = u32_to_u8(codepoints, num_codepoints, scratch, &num_bytes);

            int64_t end = *i + (int64_t)num_bytes;
            if (end > *capacity) {
                *capacity = end + 1;
                *buf = GC_REALLOC(*buf, *capacity);
            }
            memcpy(*buf + *i, encoded, num_bytes);
            *i = end;
            if (encoded != scratch) free(encoded);
        }
        return;
    }
}
2024-09-02 15:47:39 -07:00
// Encode a text as a NUL-terminated UTF-8 string in GC-allocated memory.
public const char *Text$as_c_string(Text_t text)
{
    // Start with one byte per grapheme plus the terminator; the append
    // helper grows the buffer when the encoding needs more.
    int64_t size = text.length + 1;
    char *utf8 = GC_MALLOC_ATOMIC(size);
    int64_t used = 0;
    u8_buf_append(text, &utf8, &size, &used);

    // Make sure there is room left for the terminator
    if (used >= size) {
        size = used + 1;
        utf8 = GC_REALLOC(utf8, size);
    }
    utf8[used] = '\0';
    return utf8;
}
2024-09-02 15:47:39 -07:00
// Compute the siphash of a text and cache it in `text->hash`. A cached value
// of 0 means "not computed yet", so a computed hash of 0 is stored as 1.
//
// The hash is taken over the grapheme sequence, two graphemes per 64-bit
// word with an odd trailing grapheme as the final part. Walking the sequence
// through _next_grapheme() makes the result independent of how the text is
// stored, so equal texts always hash the same -- which the hash shortcut in
// Text$equal() depends on.
public uint64_t Text$hash(Text_t *text)
{
    if (text->hash != 0) return text->hash;

    siphash sh;
    siphashinit(&sh, (size_t)text->length * sizeof(int32_t), (uint64_t*)TOMO_HASH_KEY);

    union {
        int32_t chunks[2];
        uint64_t whole;
    } tmp;

    iteration_state_t state = {0, 0};
    int64_t i = 0;
    // Feed complete pairs of graphemes
    for (; i + 1 < text->length; i += 2) {
        tmp.chunks[0] = _next_grapheme(*text, &state, i);
        tmp.chunks[1] = _next_grapheme(*text, &state, i + 1);
        siphashadd64bits(&sh, tmp.whole);
    }
    // Odd number of graphemes: the last one has no partner
    int32_t last = (i < text->length) ? _next_grapheme(*text, &state, i) : 0;
    text->hash = siphashfinish_last_part(&sh, (uint64_t)last);

    if (text->hash == 0)
        text->hash = 1;
    return text->hash;
}
2024-09-02 15:47:39 -07:00
// Return the grapheme at 0-based `index`, or 0 when `index` is outside the
// text. For a TEXT_SUBTEXT, `state` remembers which subtext the previous
// lookup landed in so that nearby lookups do not rescan from the start;
// `state` may be NULL, in which case the scan starts at the first subtext.
int32_t _next_grapheme(Text_t text, iteration_state_t *state, int64_t index)
{
    bool in_range = (index >= 0 && index < text.length);
    switch (text.tag) {
    case TEXT_ASCII: return in_range ? (int32_t)text.ascii[index] : 0;
    case TEXT_SHORT_ASCII: return in_range ? (int32_t)text.short_ascii[index] : 0;
    case TEXT_GRAPHEMES: return in_range ? text.graphemes[index] : 0;
    case TEXT_SHORT_GRAPHEMES: return in_range ? text.short_graphemes[index] : 0;
    case TEXT_SUBTEXT: {
        iteration_state_t backup_state = {0, 0};
        if (!state) state = &backup_state;

        if (!in_range)
            return 0;

        // Rewind while the target lies before the current subtext.
        // `sum_of_previous_subtexts` covers subtexts [0, subtext), so stepping
        // back one subtext removes the length of the one being stepped onto.
        while (index < state->sum_of_previous_subtexts && state->subtext > 0) {
            state->subtext -= 1;
            state->sum_of_previous_subtexts -= text.subtexts[state->subtext].length;
        }

        // Advance until the target falls inside the current subtext
        for (;;) {
            if (index < state->sum_of_previous_subtexts + text.subtexts[state->subtext].length)
                return _next_grapheme(text.subtexts[state->subtext], NULL, index - state->sum_of_previous_subtexts);
            state->sum_of_previous_subtexts += text.subtexts[state->subtext].length;
            state->subtext += 1;
        }
        return 0;
    }
    default: errx(1, "Invalid text");
    }
    return 0;
}
2024-09-02 15:47:39 -07:00
// One-off lookup of the grapheme at 0-based `index`, using a fresh cursor.
int32_t get_grapheme(Text_t text, int64_t index)
{
    iteration_state_t cursor = {.subtext = 0, .sum_of_previous_subtexts = 0};
    return _next_grapheme(text, &cursor, index);
}
2024-09-02 16:18:21 -07:00
// Three-way comparison of two texts, grapheme by grapheme, by codepoint
// value. Returns a negative value, zero, or a positive value. When one text
// is a prefix of the other, the shorter one sorts first.
public int32_t Text$compare(const Text_t *a, const Text_t *b)
{
    // Only positions present in both texts are compared; reading past the
    // end of the shorter one would yield 0, which is not a real grapheme.
    int64_t len = MIN(a->length, b->length);
    iteration_state_t a_state = {0, 0}, b_state = {0, 0};
    for (int64_t i = 0; i < len; i++) {
        int32_t ai = _next_grapheme(*a, &a_state, i);
        int32_t bi = _next_grapheme(*b, &b_state, i);
        if (ai == bi) continue;

        // Non-negative values are codepoints (0 included); only negative
        // values are IDs into the synthetic grapheme table.
        const uint32_t *a_codepoints = ai >= 0 ? (uint32_t*)&ai : synthetic_graphemes[-ai-1].codepoints;
        int64_t a_len = ai >= 0 ? 1 : synthetic_graphemes[-ai-1].num_codepoints;
        const uint32_t *b_codepoints = bi >= 0 ? (uint32_t*)&bi : synthetic_graphemes[-bi-1].codepoints;
        int64_t b_len = bi >= 0 ? 1 : synthetic_graphemes[-bi-1].num_codepoints;

        int32_t cmp = u32_cmp2(a_codepoints, a_len, b_codepoints, b_len);
        if (cmp != 0) return cmp;
    }
    return (a->length > b->length) - (a->length < b->length);
}
2024-09-02 15:47:39 -07:00
// Whether two texts consist of exactly the same grapheme sequence.
public bool Text$equal(const Text_t *a, const Text_t *b)
{
    if (a->length != b->length)
        return false;

    // Two cached (non-zero) hashes that disagree settle it without a scan.
    if (a->hash != 0 && b->hash != 0 && a->hash != b->hash)
        return false;

    iteration_state_t a_cursor = {0, 0}, b_cursor = {0, 0};
    for (int64_t pos = 0; pos < a->length; pos++) {
        int32_t from_a = _next_grapheme(*a, &a_cursor, pos);
        int32_t from_b = _next_grapheme(*b, &b_cursor, pos);
        if (from_a != from_b)
            return false;
    }
    return true;
}
2024-09-02 15:47:39 -07:00
// Whether two texts are equal when case is ignored. The texts must have the
// same number of graphemes; graphemes that differ are compared with
// u32_casecmp() using the locale's language.
public bool Text$equal_ignoring_case(Text_t a, Text_t b)
{
    if (a.length != b.length)
        return false;
    int64_t len = a.length;
    iteration_state_t a_state = {0, 0}, b_state = {0, 0};
    const char *language = uc_locale_language();
    for (int64_t i = 0; i < len; i++) {
        int32_t ai = _next_grapheme(a, &a_state, i);
        int32_t bi = _next_grapheme(b, &b_state, i);
        if (ai == bi)
            continue;

        const uint32_t *a_codepoints = ai >= 0 ? (uint32_t*)&ai : synthetic_graphemes[-ai-1].codepoints;
        int64_t a_len = ai >= 0 ? 1 : synthetic_graphemes[-ai-1].num_codepoints;

        const uint32_t *b_codepoints = bi >= 0 ? (uint32_t*)&bi : synthetic_graphemes[-bi-1].codepoints;
        int64_t b_len = bi >= 0 ? 1 : synthetic_graphemes[-bi-1].num_codepoints;

        // `cmp` is only written when u32_casecmp() succeeds, so a failed
        // call is treated as "not equal" instead of reading an unset value.
        int cmp = 0;
        if (u32_casecmp(a_codepoints, a_len, b_codepoints, b_len, language, UNINORM_NFC, &cmp) != 0)
            return false;
        if (cmp != 0)
            return false;
    }
    return true;
}
2024-09-02 15:47:39 -07:00
// Return an upper-cased copy of `text`, using the locale's language for the
// case mapping.
public Text_t Text$upper(Text_t text)
{
    array_t codepoints = Text$utf32_codepoints(text);
    const char *language = uc_locale_language();
    uint32_t buf[128];
    // On input, the length argument tells libunistring how many elements
    // `buf` can hold; on output it holds the length of the result.
    size_t out_len = sizeof(buf)/sizeof(buf[0]);
    uint32_t *upper = u32_toupper(codepoints.data, codepoints.length, language, UNINORM_NFC, buf, &out_len);
    Text_t ret = text_from_u32(upper, out_len, false);
    if (upper != buf) free(upper);
    return ret;
}
// Return a lower-cased copy of `text`, using the locale's language for the
// case mapping.
public Text_t Text$lower(Text_t text)
{
    array_t codepoints = Text$utf32_codepoints(text);
    const char *language = uc_locale_language();
    uint32_t buf[128];
    // On input, the length argument tells libunistring how many elements
    // `buf` can hold; on output it holds the length of the result.
    size_t out_len = sizeof(buf)/sizeof(buf[0]);
    uint32_t *lower = u32_tolower(codepoints.data, codepoints.length, language, UNINORM_NFC, buf, &out_len);
    Text_t ret = text_from_u32(lower, out_len, false);
    if (lower != buf) free(lower);
    return ret;
}
// Return a title-cased copy of `text`, using the locale's language for the
// case mapping.
public Text_t Text$title(Text_t text)
{
    array_t codepoints = Text$utf32_codepoints(text);
    const char *language = uc_locale_language();
    uint32_t buf[128];
    // On input, the length argument tells libunistring how many elements
    // `buf` can hold; on output it holds the length of the result.
    size_t out_len = sizeof(buf)/sizeof(buf[0]);
    uint32_t *title = u32_totitle(codepoints.data, codepoints.length, language, UNINORM_NFC, buf, &out_len);
    Text_t ret = text_from_u32(title, out_len, false);
    if (title != buf) free(title);
    return ret;
}
// Advance `*i` past any whitespace graphemes, stopping at the first grapheme
// that is not whitespace or at the end of the text.
static inline void skip_whitespace(Text_t text, int64_t *i)
{
    iteration_state_t state = {0, 0};
    while (*i < text.length) {
        int32_t grapheme = _next_grapheme(text, &state, *i);
        // A multi-codepoint cluster is judged by its first codepoint, so it
        // is not skipped merely because its ID is negative.
        if (grapheme < 0)
            grapheme = synthetic_graphemes[-grapheme-1].codepoints[0];
        if (!uc_is_property_white_space(grapheme))
            return;
        *i += 1;
    }
}
2024-09-02 15:47:39 -07:00
// If the grapheme at `*i` equals `grapheme`, step `*i` past it and return
// true; otherwise leave `*i` alone and return false.
static inline bool match_grapheme(Text_t text, int64_t *i, int32_t grapheme)
{
    bool matched = (*i < text.length) && (get_grapheme(text, *i) == grapheme);
    if (matched)
        *i += 1;
    return matched;
}
2024-09-02 15:47:39 -07:00
// If the text at `*i` starts with the characters of `str`, step `*i` past
// them and return true; otherwise leave `*i` alone and return false.
static inline bool match_str(Text_t text, int64_t *i, const char *str)
{
    iteration_state_t cursor = {0, 0};
    int64_t n;
    for (n = 0; str[n] != '\0'; n++) {
        int64_t pos = *i + n;
        if (pos >= text.length)
            return false;
        if (_next_grapheme(text, &cursor, pos) != str[n])
            return false;
    }
    *i += n;
    return true;
}
2024-09-02 15:47:39 -07:00
// If the grapheme at `*i` has the Unicode property `prop`, step `*i` past it
// and return true; otherwise leave `*i` alone and return false.
static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop)
{
    if (*i >= text.length)
        return false;

    int32_t codepoint = get_grapheme(text, *i);
    if (codepoint < 0) {
        // TODO: check every codepoint in the cluster?
        codepoint = synthetic_graphemes[-codepoint-1].codepoints[0];
    }

    if (!uc_is_property(codepoint, prop))
        return false;

    *i += 1;
    return true;
}
2024-09-02 15:47:39 -07:00
// Read a run of decimal digits starting at `*i` and return its value,
// leaving `*i` on the first grapheme that was not consumed. Reading also
// stops, without consuming the digit, once the value reaches INT64_MAX/10.
static int64_t parse_int(Text_t text, int64_t *i)
{
    iteration_state_t cursor = {0, 0};
    int64_t total = 0;
    while (true) {
        int32_t codepoint = _next_grapheme(text, &cursor, *i);
        if (codepoint < 0)
            codepoint = synthetic_graphemes[-codepoint-1].codepoints[0];

        int digit = uc_digit_value(codepoint);
        if (digit < 0 || total >= INT64_MAX/10)
            break;

        total = 10*total + digit;
        *i += 1;
    }
    return total;
}
2024-09-02 15:47:39 -07:00
// Read a property name at `*i`: after skipping whitespace, collect
// alphanumerics, spaces, '_' and '-' into a freshly allocated string, then
// drop trailing spaces. Returns NULL when nothing was collected.
const char *get_property_name(Text_t text, int64_t *i)
{
    skip_whitespace(text, i);
    char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
    char *dest = name;
    iteration_state_t cursor = {0, 0};
    while (*i < text.length) {
        int32_t grapheme = _next_grapheme(text, &cursor, *i);
        bool is_name_char = !(grapheme & ~0xFF)
            && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-');
        if (!is_name_char)
            break;

        *dest++ = (char)grapheme;
        // Buffer full (one byte is kept for the terminator). `*i` is left
        // on the character that was just stored.
        if (dest >= name + UNINAME_MAX - 1)
            break;
        *i += 1;
    }

    // Trim trailing spaces
    while (dest > name && dest[-1] == ' ')
        --dest;

    if (dest == name) return NULL;
    *dest = '\0';
    return name;
}
// Matching helpers. In each macro the condition arguments are expressions
// that may refer to `grapheme`, the grapheme currently being tested, and
// `index` must be a modifiable lvalue.

// EAT1: test the grapheme at `index` against `cond`; on success advance
// `index` by one. Evaluates to whether the test succeeded.
#define EAT1(text, state, index, cond) ({ \
        int32_t grapheme = _next_grapheme(text, state, index); \
        bool success = (cond); \
        if (success) index += 1; \
        success; })

// EAT2: test the grapheme at `index` against `cond1` and the one after it
// against `cond2`; advance `index` by two only if both succeed. Evaluates to
// whether both tests succeeded.
#define EAT2(text, state, index, cond1, cond2) ({ \
        int32_t grapheme = _next_grapheme(text, state, index); \
        bool success = (cond1); \
        if (success) { \
            grapheme = _next_grapheme(text, state, index + 1); \
            success = (cond2); \
            if (success) \
                index += 2; \
        } \
        success; })

// EAT_MANY: apply EAT1 repeatedly; evaluates to the number of graphemes consumed.
#define EAT_MANY(text, state, index, cond) ({ \
        int64_t _n = 0; \
        while (EAT1(text, state, index, cond)) { _n += 1; } \
        _n; })
2024-09-02 15:47:39 -07:00
// Try to match an email address starting at `text_index`.
// Returns the number of graphemes matched, or -1 if there is no match
// (including when the preceding grapheme is alphabetic, i.e. the position is
// in the middle of a word).
int64_t match_email(Text_t text, int64_t text_index)
{
    // email = local "@" domain
    // local = 1-64 ([a-zA-Z0-9!#$%&*+/=?^_`.{|}~] | non-ascii)
    // domain = dns-label ("." dns-label)*
    // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii)

    iteration_state_t state = {0, 0};
    if (text_index > 0) {
        int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1);
        if (prev_codepoint < 0)
            prev_codepoint = synthetic_graphemes[-prev_codepoint-1].codepoints[0];
        if (uc_is_property_alphabetic(prev_codepoint))
            return -1;
    }

    int64_t start_index = text_index;

    // Local part:
    int64_t local_len = 0;
    static const char *allowed_local = "!#$%&*+/=?^_`.{|}~";
    // strchr() also finds the string's terminator, so a grapheme of 0 (which
    // is what reading past the end of the text yields) is excluded explicitly.
    while (EAT1(text, &state, text_index,
                (grapheme & ~0x7F) || isalnum((char)grapheme)
                || (grapheme != 0 && strchr(allowed_local, (char)grapheme)))) {
        local_len += 1;
        if (local_len > 64) return -1;
    }
    // The local part needs at least one character
    if (local_len == 0)
        return -1;

    if (!EAT1(text, &state, text_index, grapheme == '@'))
        return -1;

    // Host
    int64_t host_len = 0;
    do {
        int64_t label_len = 0;
        while (EAT1(text, &state, text_index,
                    (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) {
            label_len += 1;
            if (label_len > 63) return -1;
        }

        if (label_len == 0)
            return -1;

        host_len += label_len;
        if (host_len > 255)
            return -1;
        host_len += 1; // Count the "." separator
    } while (EAT1(text, &state, text_index, grapheme == '.'));

    return text_index - start_index;
}
2024-09-02 15:47:39 -07:00
// Try to match an IPv6 address starting at `text_index`: up to eight groups
// of at most four hex digits separated by ':', with "::" allowed once.
// Returns the number of graphemes matched, or -1 if there is no match
// (including when the preceding grapheme is a hex digit or ':').
int64_t match_ipv6(Text_t text, int64_t text_index)
{
    iteration_state_t state = {0, 0};
    if (text_index > 0) {
        int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1);
        // Only ASCII values may be handed to isxdigit()
        if (!(prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':'))
            return -1;
    }
    int64_t start_index = text_index;
    const int NUM_CLUSTERS = 8;
    bool double_colon_used = false;
    for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
        // `!(grapheme & ~0x7F)` restricts the test to ASCII, so a non-ASCII
        // or synthetic grapheme whose low byte happens to look like a hex
        // digit is not accepted.
        for (int digits = 0; digits < 4; digits++) {
            if (!EAT1(text, &state, text_index, !(grapheme & ~0x7F) && isxdigit((char)grapheme)))
                break;
        }
        if (EAT1(text, &state, text_index, !(grapheme & ~0x7F) && isxdigit((char)grapheme)))
            return -1; // Too many digits

        if (cluster == NUM_CLUSTERS-1) {
            break;
        } else if (!EAT1(text, &state, text_index, grapheme == ':')) {
            if (double_colon_used)
                break;
            return -1;
        }

        if (EAT1(text, &state, text_index, grapheme == ':')) {
            if (double_colon_used)
                return -1;
            double_colon_used = true;
        }
    }
    return text_index - start_index;
}
2024-09-02 15:47:39 -07:00
// Try to match a dotted IPv4 address starting at `text_index`: four groups
// of one to three decimal digits separated by '.'.
// Returns the number of graphemes matched, or -1 if there is no match
// (including when the preceding grapheme is a digit or '.').
static int64_t match_ipv4(Text_t text, int64_t text_index)
{
    iteration_state_t state = {0, 0};
    if (text_index > 0) {
        int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1);
        // Only ASCII values may be handed to isdigit()
        if (!(prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.'))
            return -1;
    }
    int64_t start_index = text_index;

    const int NUM_CLUSTERS = 4;
    for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
        // `!(grapheme & ~0x7F)` restricts the test to ASCII, so a non-ASCII
        // or synthetic grapheme whose low byte happens to look like a digit
        // is not accepted.
        for (int digits = 0; digits < 3; digits++) {
            if (!EAT1(text, &state, text_index, !(grapheme & ~0x7F) && isdigit((char)grapheme))) {
                if (digits == 0) return -1;
                break;
            }
        }

        if (EAT1(text, &state, text_index, !(grapheme & ~0x7F) && isdigit((char)grapheme)))
            return -1; // Too many digits

        if (cluster == NUM_CLUSTERS-1)
            break;
        else if (!EAT1(text, &state, text_index, grapheme == '.'))
            return -1;
    }
    return (text_index - start_index);
}
2024-09-02 15:47:39 -07:00
// Try to match a URI in `text`, starting at grapheme index `text_index`.
// The match is rejected if the preceding grapheme is alphabetic (so a URI
// can't begin in the middle of a word).
//
// Returns the number of graphemes matched, or -1 if there is no match.
int64_t match_uri(Text_t text, int64_t text_index)
{
    // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment]
    // scheme = [a-zA-Z] [a-zA-Z0-9+.-]*
    // authority = [userinfo "@"] host [":" port]

    iteration_state_t state = {0, 0};
    if (text_index > 0) {
        int32_t prev_codepoint = _next_grapheme(text, &state, text_index - 1);
        // Synthetic (multi-codepoint) graphemes are negative; judge them by
        // their first codepoint.
        if (prev_codepoint < 0)
            prev_codepoint = synthetic_graphemes[-prev_codepoint-1].codepoints[0];
        if (uc_is_property_alphabetic(prev_codepoint))
            return -1;
    }

    int64_t start_index = text_index;

    // Scheme:
    // isalpha() is only defined for unsigned-char values, so restrict it to
    // ASCII graphemes, the same way the other <ctype.h> calls below do.
    if (!EAT1(text, &state, text_index, !(grapheme & ~0x7F) && isalpha(grapheme)))
        return -1;

    EAT_MANY(text, &state, text_index,
             !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-'));

    if (text_index == start_index)
        return -1;

    if (!match_grapheme(text, &text_index, ':'))
        return -1;

    // Authority:
    if (match_str(text, &text_index, "//")) {
        int64_t authority_start = text_index;
        // Username or host:
        static const char *forbidden = "#?:@ \t\r\n<>[]{}\\^|\"`/";
        if (EAT_MANY(text, &state, text_index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
            return -1;

        if (EAT1(text, &state, text_index, grapheme == '@')) {
            // Found a username, now get a host:
            if (EAT_MANY(text, &state, text_index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
                return -1;
        } else {
            // NOTE(review): '[' is in `forbidden`, so an authority that begins
            // with a bracketed IPv6 literal has already been rejected by the
            // EAT_MANY() check above, and the IPv4 result is never written
            // back to text_index. This branch looks like it has no effect;
            // confirm the intended handling of IP hosts.
            int64_t ip = authority_start;
            int64_t ipv4_len = match_ipv4(text, ip);
            if (ipv4_len > 0) {
                ip += ipv4_len;
            } else if (match_grapheme(text, &ip, '[')) {
                ip += match_ipv6(text, ip);
                if (ip > authority_start + 1 && match_grapheme(text, &ip, ']'))
                    text_index = ip;
            }
        }

        // Port:
        if (EAT1(text, &state, text_index, grapheme == ':')) {
            if (EAT_MANY(text, &state, text_index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0)
                return -1;
        }
        if (!EAT1(text, &state, text_index, grapheme == '/'))
            return (text_index - start_index); // No path
    } else {
        // Optional path root:
        EAT1(text, &state, text_index, grapheme == '/');
    }

    // Path:
    static const char *non_path = " \"#?<>[]{}\\^`|";
    EAT_MANY(text, &state, text_index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme));

    if (EAT1(text, &state, text_index, grapheme == '?')) { // Query
        static const char *non_query = " \"#<>[]{}\\^`|";
        EAT_MANY(text, &state, text_index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme));
    }

    if (EAT1(text, &state, text_index, grapheme == '#')) { // Fragment
        static const char *non_fragment = " \"#<>[]{}\\^`|";
        EAT_MANY(text, &state, text_index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme));
    }
    return text_index - start_index;
}
// Match `pattern` (from grapheme index `pattern_index` onward) against `text`
// beginning exactly at grapheme index `text_index`.
//
// Pattern syntax handled here:
//   "?" '?' etc.  a quotation mark followed by `?` and the mirrored closing
//                 mark: matches a quoted span (backslash skips a grapheme)
//   (?) [?] etc.  paired punctuation around `?`: matches a balanced span
//   {...}         a named and/or counted group: {id}, {2-3 hex}, {1+ !digit},
//                 {..}, {end}, {email}, {uri}, ...
//   anything else a literal grapheme
//
// Returns the number of text graphemes matched, or -1 if there is no match.
// Calls fail() if the pattern itself is malformed.
int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t pattern_index)
{
    if (pattern_index >= pattern.length) return 0;
    int64_t start_index = text_index;
    iteration_state_t pattern_state = {0, 0}, text_state = {0, 0};
    while (pattern_index < pattern.length) {
        int64_t old_pat_index = pattern_index;
        if (EAT2(pattern, &pattern_state, pattern_index,
                 uc_is_property(grapheme, UC_PROPERTY_QUOTATION_MARK),
                 grapheme == '?')) {
            // Quotations: "?", '?', etc
            int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2);
            if (!match_grapheme(text, &text_index, open)) return -1;
            int32_t close = open;
            uc_mirror_char(open, (uint32_t*)&close);
            if (!match_grapheme(pattern, &pattern_index, close))
                fail("Pattern's closing brace is missing: %k", &pattern);

            // Scan forward to the closing quote. The closing quote is part of
            // the match and the rest of the pattern still has to match after
            // it (this used to return as soon as the closing quote was seen,
            // which left the quote out of the match length and ignored any
            // remaining pattern).
            bool found_close = false;
            while (text_index < text.length) {
                int32_t c = _next_grapheme(text, &text_state, text_index);
                if (c == close) {
                    text_index += 1;
                    found_close = true;
                    break;
                }
                // A backslash skips over the grapheme that follows it:
                text_index += (c == '\\') ? 2 : 1;
            }
            if (!found_close)
                return -1;
        } else if (EAT2(pattern, &pattern_state, pattern_index,
                        uc_is_property(grapheme, UC_PROPERTY_PAIRED_PUNCTUATION),
                        grapheme == '?')) {
            // Nested punctuation: (?), [?], etc
            int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2);
            if (!match_grapheme(text, &text_index, open)) return -1;
            int32_t close = open;
            uc_mirror_char(open, (uint32_t*)&close);
            if (!match_grapheme(pattern, &pattern_index, close))
                fail("Pattern's closing brace is missing: %k", &pattern);
            // Track nesting depth until the opening delimiter is balanced:
            int64_t depth = 1;
            for (; depth > 0 && text_index < text.length; ++text_index) {
                int32_t c = _next_grapheme(text, &text_state, text_index);
                if (c == open)
                    depth += 1;
                else if (c == close)
                    depth -= 1;
            }
            if (depth > 0) return -1;
        } else if (EAT1(pattern, &pattern_state, pattern_index,
                        grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc.
            skip_whitespace(pattern, &pattern_index);

            // Optional repetition count: `N` (exactly N), `N+` (N or more),
            // or `N-M` (between N and M). The default is one or more.
            int64_t min, max;
            if (uc_is_digit(_next_grapheme(pattern, &pattern_state, pattern_index))) {
                min = parse_int(pattern, &pattern_index);
                skip_whitespace(pattern, &pattern_index);
                if (match_grapheme(pattern, &pattern_index, '+')) {
                    max = INT64_MAX;
                } else if (match_grapheme(pattern, &pattern_index, '-')) {
                    max = parse_int(pattern, &pattern_index);
                } else {
                    max = min;
                }
            } else {
                min = 1, max = INT64_MAX;
            }

            skip_whitespace(pattern, &pattern_index);

            bool any = false;
            uc_property_t prop; // Only read when a property name was resolved
            int32_t specific_codepoint = UNINAME_INVALID;
            // A leading `!` negates the group:
            bool want_to_match = !match_grapheme(pattern, &pattern_index, '!');
            const char *prop_name;
            if (match_str(pattern, &pattern_index, ".."))
                prop_name = "..";
            else
                prop_name = get_property_name(pattern, &pattern_index);

            if (!prop_name) {
                // Literal character, e.g. {1?}
                specific_codepoint = _next_grapheme(pattern, &pattern_state, pattern_index);
                pattern_index += 1;
            } else if (strlen(prop_name) == 1) {
                // Single letter names: {1+ A}
                specific_codepoint = prop_name[0];
                prop_name = NULL;
            }

            skip_whitespace(pattern, &pattern_index);
            if (!match_grapheme(pattern, &pattern_index, '}'))
                fail("Missing closing '}' in pattern: %k", &pattern);

            int64_t before_group = text_index;
            // An optional group (min == 0) that fails to match is skipped and
            // matching continues with the rest of the pattern; a required
            // group that fails makes the whole match fail.
#define FAIL() ({ if (min < 1) { text_index = before_group; continue; } else { return -1; } })
            if (prop_name) {
                // Built-in names are dispatched on their first letter; any
                // name not handled here falls through to the Unicode
                // property/character-name lookup below.
                switch (tolower(prop_name[0])) {
                case '.':
                    if (prop_name[1] == '.') {
                        any = true;
                        prop = UC_PROPERTY_PRIVATE_USE; // Placeholder; not consulted when `any` is set
                        goto got_prop;
                    }
                    break;
                case 'd':
                    if (strcasecmp(prop_name, "digit") == 0) {
                        prop = UC_PROPERTY_DECIMAL_DIGIT;
                        goto got_prop;
                    }
                    break;
                case 'e':
                    if (strcasecmp(prop_name, "end") == 0) {
                        if (text_index != text.length)
                            FAIL();
                        continue;
                    } else if (strcasecmp(prop_name, "email") == 0) {
                        int64_t len = match_email(text, text_index);
                        if (len < 0)
                            FAIL();
                        text_index += len;
                        continue;
                    } else if (strcasecmp(prop_name, "emoji") == 0) {
                        prop = UC_PROPERTY_EMOJI;
                        goto got_prop;
                    }
                    break;
                case 'i':
                    if (strcasecmp(prop_name, "id") == 0) {
                        if (!EAT1(text, &text_state, text_index,
                                  uc_is_property(grapheme, UC_PROPERTY_XID_START)))
                            FAIL();
                        EAT_MANY(text, &text_state, text_index,
                                 uc_is_property(grapheme, UC_PROPERTY_XID_CONTINUE));
                        continue;
                    } else if (strcasecmp(prop_name, "int") == 0) {
                        EAT1(text, &text_state, text_index, grapheme == '-');
                        int64_t n = EAT_MANY(text, &text_state, text_index,
                                             uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT));
                        if (n <= 0)
                            FAIL();
                        continue;
                    } else if (strcasecmp(prop_name, "ipv4") == 0) {
                        int64_t len = match_ipv4(text, text_index);
                        if (len < 0)
                            FAIL();
                        text_index += len;
                        continue;
                    } else if (strcasecmp(prop_name, "ipv6") == 0) {
                        int64_t len = match_ipv6(text, text_index);
                        if (len < 0)
                            FAIL();
                        text_index += len;
                        continue;
                    } else if (strcasecmp(prop_name, "ip") == 0) {
                        int64_t len = match_ipv6(text, text_index);
                        if (len < 0)
                            len = match_ipv4(text, text_index);
                        if (len < 0)
                            FAIL();
                        text_index += len;
                        continue;
                    }
                    break;
                case 'n':
                    if (strcasecmp(prop_name, "num") == 0) {
                        EAT1(text, &text_state, text_index, grapheme == '-');
                        int64_t pre_decimal = EAT_MANY(text, &text_state, text_index,
                                                       uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT));
                        bool decimal = (EAT1(text, &text_state, text_index, grapheme == '.') == 1);
                        int64_t post_decimal = decimal ? EAT_MANY(text, &text_state, text_index,
                                                                  uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0;
                        if (pre_decimal == 0 && post_decimal == 0)
                            FAIL();
                        continue;
                    }
                    break;
                case 's':
                    if (strcasecmp(prop_name, "start") == 0) {
                        if (text_index != 0) return -1;
                        continue;
                    }
                    break;
                case 'u':
                    if (strcasecmp(prop_name, "uri") == 0) {
                        int64_t len = match_uri(text, text_index);
                        if (len < 0)
                            FAIL();
                        text_index += len;
                        continue;
                    } else if (strcasecmp(prop_name, "url") == 0) {
                        // A URL is a URI restricted to a few web schemes:
                        int64_t lookahead = text_index;
                        if (!(match_str(text, &lookahead, "https:")
                            || match_str(text, &lookahead, "http:")
                            || match_str(text, &lookahead, "ftp:")
                            || match_str(text, &lookahead, "wss:")
                            || match_str(text, &lookahead, "ws:")))
                            FAIL();

                        int64_t len = match_uri(text, text_index);
                        if (len < 0)
                            FAIL();
                        text_index += len;
                        continue;
                    }
                    break;
                default:
                    break;
                }

                // Not a built-in name: try a Unicode property name, then a
                // Unicode character name.
                prop = uc_property_byname(prop_name);
                if (!uc_property_is_valid(prop)) {
                    specific_codepoint = unicode_name_character(prop_name);
                    if (specific_codepoint == UNINAME_INVALID)
                        fail("Not a valid property or character name: %s", prop_name);
                }
            }
          got_prop:;

            // For an optional group, first try matching the rest of the
            // pattern without consuming anything:
            if (min == 0 && pattern_index < pattern.length) {
                int64_t match_len = match(text, pattern, text_index, pattern_index);
                if (match_len >= 0)
                    return (text_index - start_index) + match_len;
            }

            // Consume graphemes one at a time; once the minimum count has
            // been reached, stop as soon as the rest of the pattern matches.
            for (int64_t count = 0; count < max; ) {
                int32_t grapheme = _next_grapheme(text, &text_state, text_index);
                // Synthetic graphemes are classified by their first codepoint:
                if (grapheme < 0)
                    grapheme = synthetic_graphemes[-grapheme-1].codepoints[0];

                bool success;
                if (any)
                    success = true;
                else if (specific_codepoint != UNINAME_INVALID)
                    success = (grapheme == specific_codepoint);
                else
                    success = uc_is_property(grapheme, prop);

                if (success != want_to_match) {
                    if (count < min) return -1;
                    else break;
                }

                text_index += 1;
                count += 1;

                if (count >= min) {
                    if (pattern_index < pattern.length) {
                        int64_t match_len = match(text, pattern, text_index, pattern_index);
                        if (match_len >= 0) {
                            return (text_index - start_index) + match_len;
                        }
                    } else if (text_index >= text.length) {
                        break;
                    }
                }
            }
        } else {
            // Plain character:
            pattern_index = old_pat_index;
            int32_t pat_grapheme = _next_grapheme(pattern, &pattern_state, pattern_index);
            int32_t text_grapheme = _next_grapheme(text, &text_state, text_index);
            if (pat_grapheme != text_grapheme)
                return -1;

            pattern_index += 1;
            text_index += 1;
        }
    }
    if (text_index >= text.length && pattern_index < pattern.length)
        return -1;
    return (text_index - start_index);
}

#undef FAIL
#undef EAT1
#undef EAT2
#undef EAT_MANY
// Find the first place at or after `from_index` where `pattern` matches.
//
// `from_index` is 1-based; a negative value counts back from the end of the
// text (-1 is the last grapheme). An index of 0 is an error.
//
// Returns the 1-based index of the match, or 0 if there is none. If
// `match_length` is non-NULL, it receives the length of the match in
// graphemes, or -1 when nothing was found.
public Int_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index, int64_t *match_length)
{
    int64_t first = Int_to_Int64(from_index, false);
    if (first == 0) fail("Invalid index: 0");
    if (first < 0) first = text.length + first + 1;
    if (first > text.length || first < 1) {
        // Report "no match" through the out-parameter on this path too:
        if (match_length)
            *match_length = -1;
        return I_small(0);
    }

    // If the pattern starts with a plain literal grapheme, we can skip
    // directly to occurrences of that grapheme. An empty pattern has no
    // first grapheme to look at.
    int32_t first_grapheme = 0;
    bool find_first = false;
    if (pattern.length > 0) {
        first_grapheme = get_grapheme(pattern, 0);
        find_first = (first_grapheme != '{'
                      && !uc_is_property(first_grapheme, UC_PROPERTY_QUOTATION_MARK)
                      && !uc_is_property(first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
    }

    iteration_state_t text_state = {0, 0};
    for (int64_t i = first-1; i < text.length; i++) {
        // Optimization: quickly skip ahead to first char in pattern:
        if (find_first) {
            while (i < text.length && _next_grapheme(text, &text_state, i) != first_grapheme)
                ++i;
            // The literal never occurs again, so there is nothing to match:
            if (i >= text.length)
                break;
        }

        int64_t m = match(text, pattern, i, 0);
        if (m >= 0) {
            if (match_length)
                *match_length = m;
            return I(i+1);
        }
    }
    if (match_length)
        *match_length = -1;
    return I_small(0);
}
// Check whether `pattern` matches anywhere in `text`.
public bool Text$has(Text_t text, Pattern_t pattern)
{
    // Text$find() returns 0 for "not found"; the match length isn't needed.
    Int_t position = Text$find(text, pattern, I_small(1), NULL);
    if (I_is_zero(position))
        return false;
    return true;
}
2024-09-02 15:47:39 -07:00
// Argument-info callback for the custom printf conversion that prints Text
// values: declares that the conversion consumes one pointer-sized argument
// (a Text_t*). Returns the number of arguments consumed (1), or -1 if the
// caller provided no room to describe it.
public int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n])
{
    (void)info;
    if (n >= 1) {
        argtypes[0] = PA_POINTER;
        sizes[0] = sizeof(Text_t*);
        return 1;
    }
    return -1;
}
public int printf_text(FILE *stream, const struct printf_info *info, const void *const args[])
{
Text_t t = **(Text_t**)args[0];
if (info->alt)
return text_visualize(stream, t);
else
return Text$print(stream, t);
}
// Build a quoted, escaped version of `text` delimited by `quote_char`.
// Control characters, backslashes, and occurrences of `quote_char` are
// backslash-escaped. If `quote_char` is not one of the standard quotes
// (`"`, `'`, or backtick), the result is prefixed with `$`. With `colorize`,
// ANSI color escapes are added around the text and around each escape.
static inline Text_t _quoted(Text_t text, bool colorize, char quote_char)
{
    // TODO: optimize for ASCII and short strings
    array_t graphemes = {.atomic=1};
#define add_char(c) Array$insert_value(&graphemes, (uint32_t)c, I_small(0), sizeof(uint32_t))
#define add_str(s) ({ for (const char *_c = s; *_c; ++_c) Array$insert_value(&graphemes, (uint32_t)*_c, I_small(0), sizeof(uint32_t)); })
    if (colorize)
        add_str("\x1b[35m");
    // The second comparison used to be '\"', which is the same character as
    // '"'; the single quote was presumably intended.
    if (quote_char != '"' && quote_char != '\'' && quote_char != '`')
        add_char('$');
    add_char(quote_char);

#define add_escaped(str) ({ if (colorize) add_str("\x1b[34;1m"); add_char('\\'); add_str(str); if (colorize) add_str("\x1b[0;35m"); })
    iteration_state_t state = {0, 0};
    for (int64_t i = 0; i < text.length; i++) {
        int32_t g = _next_grapheme(text, &state, i);
        switch (g) {
        case '\a': add_escaped("a"); break;
        case '\b': add_escaped("b"); break;
        case '\x1b': add_escaped("e"); break;
        case '\f': add_escaped("f"); break;
        case '\n': add_escaped("n"); break;
        case '\r': add_escaped("r"); break;
        case '\t': add_escaped("t"); break;
        case '\v': add_escaped("v"); break;
        case '\\': add_escaped("\\"); break;
        // Remaining control characters are written as two-digit hex escapes:
        case '\x00' ... '\x06': case '\x0E' ... '\x1A':
        case '\x1C' ... '\x1F': case '\x7F': {
            if (colorize) add_str("\x1b[34;1m");
            add_char('\\');
            add_char('x');
            char tmp[4];
            snprintf(tmp, sizeof(tmp), "%02X", (unsigned)g);
            add_str(tmp);
            if (colorize)
                add_str("\x1b[0;35m");
            break;
        }
        default: {
            if (g == quote_char)
                add_escaped(((char[2]){quote_char, 0}));
            else
                add_char(g);
            break;
        }
        }
    }

    add_char(quote_char);
    if (colorize)
        add_str("\x1b[m");

    return (Text_t){.length=graphemes.length, .tag=TEXT_GRAPHEMES, .graphemes=graphemes.data};
#undef add_str
#undef add_char
#undef add_escaped
}
// Produce the display form of a Text value (or, when `text` is NULL, the
// name of its type). Patterns are delimited with `/`, everything else with
// `"`. Texts of a custom language (anything other than Text and Pattern)
// are additionally prefixed with `$` and the language name.
public Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info)
{
    const char *lang = info ? info->TextInfo.lang : NULL;
    if (!text)
        return lang ? Text$from_str(lang) : Text("Text");

    char quote_char = (info == &Pattern) ? '/' : '"';
    Text_t as_text = _quoted(*(Text_t*)text, colorize, quote_char);

    bool is_custom_lang = lang && info != &$Text && info != &Pattern;
    if (is_custom_lang)
        as_text = Text$concat(colorize ? Text("\x1b[1m$") : Text("$"), Text$from_str(lang), as_text);
    return as_text;
}
// Return `text` wrapped in double quotes with special characters escaped
// (optionally colorized with ANSI escapes).
public Text_t Text$quoted(Text_t text, bool colorize)
{
    const char double_quote = '"';
    return _quoted(text, colorize, double_quote);
}
// Collect every non-overlapping match of `pattern` in `text`, in order, as
// an array of Text slices. An empty pattern yields an empty array.
public array_t Text$find_all(Text_t text, Pattern_t pattern)
{
    if (pattern.length == 0) // special case
        return (array_t){.length=0};

    array_t matches = {};
    Int_t search_from = I_small(1);
    while (true) {
        int64_t match_len;
        Int_t found = Text$find(text, pattern, search_from, &match_len);
        if (I_is_zero(found))
            break;

        Int_t last = Int$plus(found, Int64_to_Int(match_len-1));
        Text_t matched = Text$slice(text, found, last);
        Array$insert(&matches, &matched, I_small(0), sizeof(Text_t));

        // Always advance by at least one so zero-length matches can't loop forever:
        int64_t step = (match_len <= 0) ? 1 : match_len;
        search_from = Int$plus(found, Int64_to_Int(step));
    }
    return matches;
}
// Replace every match of `pattern` in `text` with `replacement`.
//
// If `placeholder` is non-empty, each occurrence of `placeholder` inside
// `replacement` is first substituted with the text that was matched.
//
// Returns the new text; `text` itself is not modified.
public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t placeholder)
{
    Text_t ret = {.length=0};

    // Two cursors (both 1-based): where the next search starts, and the
    // first grapheme of `text` that has not been copied to `ret` yet. They
    // differ only after a zero-length match, where the search has to move
    // forward one grapheme but that grapheme must still be copied (a single
    // shared cursor used to drop it from the output).
    Int_t search_from = I_small(1);
    Int_t copied_up_to = I_small(1);
    for (;;) {
        int64_t len;
        Int_t found = Text$find(text, pattern, search_from, &len);
        if (I_is_zero(found)) break;

        Text_t replacement_text = replacement;
        if (placeholder.length > 0) {
            Text_t matched_text = Text$slice(text, found, Int$plus(found, Int64_to_Int(len-1)));
            replacement_text = Text$replace(replacement, placeholder, matched_text, Text(""));
        }

        // NOTE(review): &$Text is passed as the type info for an Int
        // comparison here; presumably Int$compare ignores it -- confirm.
        if (Int$compare(&found, &copied_up_to, &$Text) > 0) {
            Text_t before_slice = Text$slice(text, copied_up_to, Int$minus(found, I_small(1)));
            ret = Text$concat(ret, before_slice, replacement_text);
        } else {
            ret = concat2(ret, replacement_text);
        }

        copied_up_to = Int$plus(found, Int64_to_Int(len < 0 ? 0 : len));
        search_from = Int$plus(found, Int64_to_Int(len <= 0 ? 1 : len));
    }

    // Copy whatever is left after the last match:
    if (Int_to_Int64(copied_up_to, false) <= text.length) {
        Text_t last_slice = Text$slice(text, copied_up_to, Int64_to_Int(text.length));
        ret = concat2(ret, last_slice);
    }
    return ret;
}
// Replace every place in `text` where all of `patterns` match back-to-back,
// substituting the i-th replacement for the text matched by the i-th pattern.
//
// A `?` pattern sandwiched between a pattern ending in an opening quote or
// bracket and a pattern starting with the mirrored closing one is treated as
// "everything up to the matching close" instead of a literal `?`.
//
// If `placeholder` is non-empty, each occurrence of it inside a replacement
// is substituted with the text matched by the corresponding pattern.
//
// Calls fail() if `patterns` and `replacements` have different lengths.
public Text_t Text$replace_chain(Text_t text, array_t patterns, array_t replacements, Text_t placeholder)
{
    if (patterns.length != replacements.length)
        fail("The number of patterns given (%ld) is not the same as the number of replacements (%ld)",
             patterns.length, replacements.length);

    if (patterns.length == 0) return text;

    Text_t ret = {.length=0};

    // If the first pattern starts with a plain literal grapheme, we can skip
    // directly to occurrences of it. An empty pattern has no first grapheme.
    Pattern_t first_pattern = *(Pattern_t*)(patterns.data);
    int32_t first_grapheme = 0;
    bool find_first = false;
    if (first_pattern.length > 0) {
        first_grapheme = get_grapheme(first_pattern, 0);
        find_first = (first_grapheme != '{'
                      && !uc_is_property(first_grapheme, UC_PROPERTY_QUOTATION_MARK)
                      && !uc_is_property(first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
    }

    iteration_state_t text_state = {0, 0};
    int64_t nonmatch_pos = 0; // 0-based index of the first grapheme not yet copied to `ret`
    for (int64_t pos = 0; pos < text.length; ) {
        // Optimization: quickly skip ahead to first char in pattern:
        if (find_first) {
            while (pos < text.length && _next_grapheme(text, &text_state, pos) != first_grapheme)
                ++pos;
            // The literal never occurs again, so no further match is possible:
            if (pos >= text.length)
                break;
        }

        // Get all match lengths:
        int64_t lengths[patterns.length] = {};
        for (int64_t i = 0, match_pos = pos; i < patterns.length; i++) {
            Pattern_t pattern = *(Pattern_t*)(patterns.data + i*patterns.stride);

            // If one of the patterns is `?` sandwiched between two pats
            if (i > 0 && i < patterns.length-1 && Text$equal(&pattern, (Pattern_t[1]){Text("?")})) {
                Pattern_t prev_pat = *(Pattern_t*)(patterns.data + (i-1)*patterns.stride);
                int32_t prev_last_grapheme = get_grapheme(prev_pat, prev_pat.length-1);
                if (prev_last_grapheme < 0) goto literal_pat;
                Pattern_t next_pat = *(Pattern_t*)(patterns.data + (i+1)*patterns.stride);
                int32_t next_first_grapheme = get_grapheme(next_pat, 0);
                if (next_first_grapheme < 0) goto literal_pat;

                int32_t mirrored = prev_last_grapheme;
                uc_mirror_char(prev_last_grapheme, (uint32_t*)&mirrored);
                if (next_first_grapheme != mirrored)
                    goto literal_pat;

                if ((uc_is_property_quotation_mark(prev_last_grapheme) && uc_is_property_quotation_mark(next_first_grapheme))
                    || ((uc_is_property_paired_punctuation(prev_last_grapheme)
                         && uc_is_property_paired_punctuation(next_first_grapheme)
                         && uc_is_property_left_of_pair(prev_last_grapheme)))) {
                    // $/"/, $/?/, $/"/
                    // $/(/, $/?/, $/)/
                    // Re-match from the opening delimiter (already consumed by
                    // the previous pattern) through to its matching close:
                    Pattern_t matching_pair_pat = text_from_u32((uint32_t[3]){prev_last_grapheme, '?', next_first_grapheme}, 3, false);
                    int64_t enclosing_len = match(text, matching_pair_pat, match_pos-1, 0);
                    if (enclosing_len < 0) goto no_match;
                    assert(enclosing_len >= 2);
                    lengths[i] = enclosing_len - 2; // Exclude '(' and ')' or whatever delims
                    goto found_match;
                }
            }

          literal_pat:;
            lengths[i] = match(text, pattern, match_pos, 0);
            if (lengths[i] < 0)
                goto no_match;

          found_match:
            match_pos += lengths[i];
        }

        // If we skipped over some non-matching text before finding a match, insert it here:
        if (pos > nonmatch_pos) {
            Text_t before_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(pos));
            ret = concat2(ret, before_slice);
        }

        // Concatenate the slices/replacements
        for (int64_t i = 0, replace_pos = pos; i < patterns.length; i++) {
            Text_t replacement = *(Text_t*)(replacements.data + i*replacements.stride);
            if (placeholder.length > 0) {
                Text_t matched_text = Text$slice(text, Int64_to_Int(replace_pos+1), Int64_to_Int(replace_pos + lengths[i]));
                replacement = Text$replace(replacement, placeholder, matched_text, Text(""));
            }
            ret = concat2(ret, replacement);
            replace_pos += lengths[i];
        }

        int64_t total_match_len = 0;
        for (int64_t i = 0; i < patterns.length; i++)
            total_match_len += lengths[i];

        // Only the matched graphemes count as consumed. After a zero-length
        // match the scan still has to step forward by one, but that grapheme
        // was not replaced and must be copied later (it used to be dropped).
        nonmatch_pos = pos + total_match_len;
        pos += (total_match_len <= 0) ? 1 : total_match_len;
        continue;

      no_match:
        pos += 1;
        continue;
    }
    if (nonmatch_pos <= text.length) {
        Text_t last_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(text.length));
        ret = concat2(ret, last_slice);
    }
    return ret;
}
// Apply a table of {pattern: replacement} entries to `text` in one pass. At
// each position the entries are tried in table order and the first pattern
// that matches is replaced.
//
// If `placeholder` is non-empty, each occurrence of it inside a replacement
// is substituted with the text that was matched.
public Text_t Text$replace_all(Text_t text, table_t replacements, Text_t placeholder)
{
    if (replacements.entries.length == 0) return text;

    Text_t ret = {.length=0};

    int64_t nonmatch_pos = 0; // 0-based index of the first grapheme not yet copied to `ret`
    for (int64_t pos = 0; pos < text.length; ) {
        // Find the first matching pattern at this position:
        for (int64_t i = 0; i < replacements.entries.length; i++) {
            Pattern_t pattern = *(Pattern_t*)(replacements.entries.data + i*replacements.entries.stride);
            int64_t len = match(text, pattern, pos, 0);
            if (len < 0) continue;

            // If we skipped over some non-matching text before finding a match, insert it here:
            if (pos > nonmatch_pos) {
                Text_t before_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(pos));
                ret = concat2(ret, before_slice);
            }

            // Concatenate the replacement:
            // (each table entry holds the pattern followed by its replacement)
            Text_t replacement = *(Text_t*)(replacements.entries.data + i*replacements.entries.stride + sizeof(Text_t));
            if (placeholder.length > 0) {
                Text_t matched_text = Text$slice(text, Int64_to_Int(pos+1), Int64_to_Int(pos + len));
                replacement = Text$replace(replacement, placeholder, matched_text, Text(""));
            }
            ret = concat2(ret, replacement);

            // Only the matched graphemes count as consumed. After a
            // zero-length match the scan still steps forward by one, but
            // that grapheme was not replaced and must be copied later (it
            // used to be dropped from the output).
            nonmatch_pos = pos + len;
            pos += len > 0 ? len : 1;
            goto next_pos;
        }

        pos += 1;
      next_pos:
        continue;
    }

    if (nonmatch_pos <= text.length) {
        Text_t last_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(text.length));
        ret = concat2(ret, last_slice);
    }
    return ret;
}
// Split `text` into an array of chunks separated by matches of `pattern`.
// Empty text gives an empty array; an empty pattern splits the text into
// its individual grapheme clusters.
public array_t Text$split(Text_t text, Pattern_t pattern)
{
    if (text.length == 0) // special case
        return (array_t){.length=0};

    if (pattern.length == 0) // special case
        return Text$clusters(text);

    array_t chunks = {};

    // Two cursors (both 1-based): where the next search starts, and where
    // the current chunk starts. They differ only after a zero-length match,
    // where the search has to move forward one grapheme but that grapheme
    // still belongs to the next chunk (a single shared cursor used to drop it).
    Int_t search_from = I_small(1);
    Int_t chunk_start = I_small(1);
    for (;;) {
        int64_t len;
        Int_t found = Text$find(text, pattern, search_from, &len);
        if (I_is_zero(found)) break;
        Text_t chunk = Text$slice(text, chunk_start, Int$minus(found, I_small(1)));
        Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t));
        chunk_start = Int$plus(found, Int64_to_Int(len < 0 ? 0 : len));
        search_from = Int$plus(found, Int64_to_Int(len <= 0 ? 1 : len));
    }

    Text_t last_chunk = Text$slice(text, chunk_start, Int64_to_Int(text.length));
    Array$insert(&chunks, &last_chunk, I_small(0), sizeof(Text_t));

    return chunks;
}
2024-09-02 20:13:02 -07:00
// Concatenate all of `pieces` (an array of Text_t) with `glue` inserted
// between consecutive pieces. An empty array gives empty text.
public Text_t Text$join(Text_t glue, array_t pieces)
{
    if (pieces.length == 0)
        return (Text_t){.length=0};

    Text_t joined = *(Text_t*)pieces.data;
    int64_t index = 1;
    while (index < pieces.length) {
        Text_t piece = *(Text_t*)(pieces.data + index*pieces.stride);
        joined = Text$concat(joined, glue, piece);
        index += 1;
    }
    return joined;
}
2024-09-02 15:47:39 -07:00
// Build a Text from a printf-style format string and arguments.
// Results of up to 8 bytes are stored inline as short ASCII text; longer
// results are formatted into GC-allocated memory and converted with
// Text$from_str(). If formatting fails, empty text is returned.
public Text_t Text$format(const char *fmt, ...)
{
    va_list args;
    va_start(args, fmt);
    // The arguments may have to be walked twice (once to measure/format the
    // short case, once to format the long case). A va_list can't be reused
    // after being passed to vsnprintf(), so take a copy up front.
    va_list args_copy;
    va_copy(args_copy, args);

    char buf[9];
    int len = vsnprintf(buf, sizeof(buf), fmt, args);
    va_end(args);

    Text_t ret;
    if (len < 0) {
        // Output error from vsnprintf()
        ret = (Text_t){.length=0};
    } else if (len <= 8) {
        ret = (Text_t){
            .length=len,
            .tag=TEXT_SHORT_ASCII,
        };
        for (int i = 0; i < len; i++)
            ret.short_ascii[i] = buf[i];
    } else {
        char *str = GC_MALLOC_ATOMIC(len+1);
        vsnprintf(str, len+1, fmt, args_copy);
        ret = Text$from_str(str);
    }
    va_end(args_copy);
    return ret;
}
2024-09-02 17:22:13 -07:00
// Split `text` into an array of single-grapheme-cluster Text values.
public array_t Text$clusters(Text_t text)
{
    // Not flagged `.atomic`, matching the other arrays of Text_t built in
    // this file (Text$split, Text$find_all, Text$lines).
    // NOTE(review): presumably `.atomic` selects pointer-free GC allocation,
    // which would be unsafe for Text_t elements that reference other memory
    // -- confirm against the array implementation.
    array_t clusters = {};
    for (int64_t i = 1; i <= text.length; i++) {
        Text_t cluster = Text$slice(text, Int64_to_Int(i), Int64_to_Int(i));
        Array$insert(&clusters, &cluster, I_small(0), sizeof(Text_t));
    }
    return clusters;
}
// Return the codepoints of `text` as an array of 32-bit values. Synthetic
// (multi-codepoint) graphemes are expanded into their constituent codepoints.
public array_t Text$utf32_codepoints(Text_t text)
{
    array_t codepoints = {.atomic=1};
    iteration_state_t state = {0, 0};
    for (int64_t i = 0; i < text.length; i++) {
        int32_t grapheme = _next_grapheme(text, &state, i);
        if (grapheme >= 0) {
            // An ordinary grapheme is its own codepoint:
            Array$insert(&codepoints, &grapheme, I_small(0), sizeof(uint32_t));
            continue;
        }
        // Negative values index into the synthetic grapheme table:
        int64_t synthetic_index = -grapheme-1;
        for (int64_t c = 0; c < synthetic_graphemes[synthetic_index].num_codepoints; c++)
            Array$insert(&codepoints, &synthetic_graphemes[synthetic_index].codepoints[c], I_small(0), sizeof(uint32_t));
    }
    return codepoints;
}
// Return the UTF-8 encoding of `text` as an array of bytes (without a
// trailing NUL). The array points directly at the C string produced by
// Text$as_c_string().
public array_t Text$utf8_bytes(Text_t text)
{
    const char *utf8 = Text$as_c_string(text);
    array_t bytes = {.length=strlen(utf8), .stride=1, .atomic=1, .data=(void*)utf8};
    return bytes;
}
// Get a name for codepoint `c`: its Unicode character name if it has one,
// otherwise "<block name>-<hex value>". The string lives in GC-allocated
// memory.
static inline const char *codepoint_name(uint32_t c)
{
    char *buffer = GC_MALLOC_ATOMIC(UNINAME_MAX);
    char *official_name = unicode_character_name(c, buffer);
    if (!official_name) {
        // No character name; fall back to the name of the containing block:
        const uc_block_t *block = uc_block(c);
        assert(block);
        snprintf(buffer, UNINAME_MAX, "%s-%X", block->name, c);
        return buffer;
    }
    return official_name;
}
2024-09-02 17:22:13 -07:00
// Return an array with the name of every codepoint in `text`, in order.
// Synthetic (multi-codepoint) graphemes contribute one name per codepoint.
public array_t Text$codepoint_names(Text_t text)
{
    array_t names = {};
    iteration_state_t state = {0, 0};
    for (int64_t i = 0; i < text.length; i++) {
        int32_t grapheme = _next_grapheme(text, &state, i);

        // Treat every grapheme as a run of codepoints: an ordinary grapheme
        // is a run of one, a synthetic grapheme is looked up in the table.
        uint32_t single = (uint32_t)grapheme;
        const uint32_t *run = &single;
        int64_t run_length = 1;
        if (grapheme < 0) {
            run = synthetic_graphemes[-grapheme-1].codepoints;
            run_length = synthetic_graphemes[-grapheme-1].num_codepoints;
        }

        for (int64_t c = 0; c < run_length; c++) {
            const char *name = codepoint_name(run[c]);
            Text_t name_text = (Text_t){.tag=TEXT_ASCII, .length=strlen(name), .ascii=name};
            Array$insert(&names, &name_text, I_small(0), sizeof(Text_t));
        }
    }
    return names;
}
// Build a Text from an array of 32-bit codepoints.
public Text_t Text$from_codepoints(array_t codepoints)
{
    // text_from_u32() is handed the raw data pointer, so make sure the
    // items are laid out contiguously first:
    bool is_packed = (codepoints.stride == sizeof(int32_t));
    if (!is_packed)
        Array$compact(&codepoints, sizeof(int32_t));

    Text_t result = text_from_u32(codepoints.data, codepoints.length, true);
    return result;
}
// Build a Text from an array of Unicode character names (as Text values).
// Names that are not recognized are silently skipped.
public Text_t Text$from_codepoint_names(array_t codepoint_names)
{
    array_t codepoints = {};
    for (int64_t i = 0; i < codepoint_names.length; i++) {
        Text_t name = *(Text_t*)(codepoint_names.data + i*codepoint_names.stride);
        uint32_t codepoint = unicode_name_character(Text$as_c_string(name));
        if (codepoint == UNINAME_INVALID)
            continue;
        Array$insert(&codepoints, &codepoint, I_small(0), sizeof(uint32_t));
    }
    return Text$from_codepoints(codepoints);
}
// Build a Text from an array of UTF-8 bytes.
public Text_t Text$from_bytes(array_t bytes)
{
    // Text$from_str() takes a C string, so the bytes need to be contiguous
    // and NUL-terminated:
    bool is_packed = (bytes.stride == sizeof(int8_t));
    if (!is_packed)
        Array$compact(&bytes, sizeof(int8_t));

    int8_t terminator = 0;
    Array$insert(&bytes, &terminator, I_small(0), sizeof(int8_t));

    const char *c_string = bytes.data;
    return Text$from_str(c_string);
}
2024-09-02 19:57:49 -07:00
// Split `text` into an array of lines. Lines are terminated by "\n" or
// "\r\n"; the terminators are not included in the results. A final line
// with no terminator is included, and a trailing terminator does not
// produce an extra empty line.
public array_t Text$lines(Text_t text)
{
    array_t lines = {};
    iteration_state_t state = {0, 0};
    for (int64_t i = 0, line_start = 0; i < text.length; i++) {
        int32_t grapheme = _next_grapheme(text, &state, i);
        // Only look ahead for the LF if there is a next grapheme:
        if (grapheme == '\r' && i + 1 < text.length
            && _next_grapheme(text, &state, i + 1) == '\n') { // CRLF
            Text_t line = Text$slice(text, Int64_to_Int(line_start+1), Int64_to_Int(i));
            Array$insert(&lines, &line, I_small(0), sizeof(Text_t));
            i += 1; // skip one extra for the LF
            line_start = i + 1;
        } else if (grapheme == '\n') { // newline
            Text_t line = Text$slice(text, Int64_to_Int(line_start+1), Int64_to_Int(i));
            Array$insert(&lines, &line, I_small(0), sizeof(Text_t));
            line_start = i + 1;
        } else if (i == text.length-1) { // last line (no trailing newline)
            // This grapheme is not a line terminator, so the last line has
            // at least one grapheme, even when it starts at `i` (one-grapheme
            // final lines used to be dropped).
            Text_t line = Text$slice(text, Int64_to_Int(line_start+1), Int64_to_Int(i+1));
            Array$insert(&lines, &line, I_small(0), sizeof(Text_t));
        }
    }
    return lines;
}
public const TypeInfo $Text = {
2024-09-02 15:47:39 -07:00
.size=sizeof(Text_t),
.align=__alignof__(Text_t),
2024-03-09 15:22:12 -08:00
.tag=TextInfo,
.TextInfo={.lang="Text"},
2024-02-04 18:13:50 -08:00
};
// Convert `text` into a pattern that matches it literally: graphemes with a
// special meaning in patterns (`{`, `?`, quotation marks, and opening paired
// punctuation) are wrapped as `{1X}` groups.
public Pattern_t Pattern$escape_text(Text_t text)
{
    // TODO: optimize for ASCII and short strings
    array_t graphemes = {.atomic=1};
#define add_char(c) Array$insert_value(&graphemes, (uint32_t)c, I_small(0), sizeof(uint32_t))
#define add_str(s) ({ for (const char *_c = s; *_c; ++_c) Array$insert_value(&graphemes, (uint32_t)*_c, I_small(0), sizeof(uint32_t)); })
    iteration_state_t state = {0, 0};
    for (int64_t i = 0; i < text.length; i++) {
        int32_t g = _next_grapheme(text, &state, i);
        if (g == '{') {
            add_str("{1{}");
            continue;
        }

        // Synthetic graphemes are classified by their first codepoint:
        uint32_t g0 = g < 0 ? synthetic_graphemes[-g-1].codepoints[0] : (uint32_t)g;
        bool needs_escape = (g0 == '?'
                             || uc_is_property_quotation_mark(g0)
                             || (uc_is_property_paired_punctuation(g0) && uc_is_property_left_of_pair(g0)));
        if (!needs_escape) {
            add_char(g);
            continue;
        }

        add_char('{');
        add_char('1');
        add_char(g);
        add_char('}');
    }
    return (Text_t){.length=graphemes.length, .tag=TEXT_GRAPHEMES, .graphemes=graphemes.data};
#undef add_str
#undef add_char
}
// Runtime type information for the Pattern type (a Text with its own
// language name).
public const TypeInfo Pattern = {
    .tag = TextInfo,
    .size = sizeof(Text_t),
    .align = __alignof__(Text_t),
    .TextInfo = {.lang = "Pattern"},
};
2024-02-04 18:13:50 -08:00
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0