tomo/stdlib/text.c
2025-03-21 16:30:18 -04:00

1500 lines
54 KiB
C

// This file defines type info and methods for the Text datatype, which uses
// libunistr for Unicode support and implements a datastructure based on a
// hybrid of Raku/MoarVM's space-efficient grapheme cluster representation of
// strings and Cords (Boehm et al), which have good runtime performance for
// text constructed by a series of many concatenations.
//
// For more information on MoarVM's grapheme cluster strings, see:
// https://docs.raku.org/language/unicode
// https://github.com/MoarVM/MoarVM/blob/main/docs/strings.asciidoc
//
// For more information on Cords, see the paper "Ropes: an Alternative to
// Strings" (Boehm, Atkinson, Plass 1995):
// https://www.cs.tufts.edu/comp/150FP/archive/hans-boehm/ropes.pdf
//
// A note on grapheme clusters: In Unicode, codepoints can be represented using
// a 32-bit integer. Most codepoints correspond to the intuitive notion of a
// "letter", which is more formally known as a "grapheme cluster". A grapheme
// cluster is roughly speaking the amount of text that your cursor moves over
// when you press the arrow key once. However, some codepoints act as modifiers
// on other codepoints. For example, U+0301 (COMBINING ACUTE ACCENT) can modify
// a letter like "e" to form "é". During normalization, this frequently
// resolves down to a single unicode codepoint, in this case, "é" resolves to
// the single codepoint U+00E9 (LATIN SMALL LETTER E WITH ACUTE). However, in
// some cases, multiple codepoints make up a grapheme cluster but *don't*
// normalize to a single codepoint. For example, LATIN SMALL LETTER E (U+0065)
// + COMBINING VERTICAL LINE BELOW (U+0329) combine to form an unusual glyph
// that is not used frequently enough to warrant its own unique codepoint (this
// is basically what Zalgo text is).
//
// There are a lot of benefits to storing unicode text with one grapheme
// cluster per index in a densely packed array instead of storing the text as
// variable-width UTF8-encoded bytes. It lets us have one canonical length for
// the text that can be precomputed and is meaningful to users. It lets us
// quickly get the Nth "letter" in the text. Substring slicing is fast.
// However, since not all grapheme clusters take up the same number of
// codepoints, we're faced with the problem of how to jam multiple codepoints
// into a single 32-bit slot. Inspired by Raku and MoarVM's approach, this
// implementation uses "synthetic graphemes" (in Raku's terms, Normal Form
// Graphemes, aka NFG). A synthetic grapheme is a negative 32-bit signed
// integer that represents a multi-codepoint grapheme cluster that has been
// encountered during the program's runtime. These clusters are stored in a
// lookup array and hash map so that we can rapidly convert between the
// synthetic grapheme integer ID and the unicode codepoints associated with it.
// Essentially, it's like we create a supplement to the unicode standard with
// things that would be nice if they had their own codepoint so things worked
// out nicely because we're using them right now, and we'll give them a
// negative number so it doesn't overlap with any real codepoints.
//
// Example 1: U+0048, U+00E9
//   AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E WITH ACUTE
//   This would be stored as: (int32_t[]){0x48, 0xE9}
//
// Example 2: U+0048, U+0065, U+0329
//   AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E, COMBINING VERTICAL LINE BELOW
//   This would be stored as: (int32_t[]){0x48, -2}
//   Where -2 is used as a lookup in an array that holds the actual unicode
//   codepoints: (ucs4_t[]){0x65, 0x0329}
#include <assert.h>
#include <ctype.h>
#include <gc.h>
#include <printf.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <sys/param.h>
#include <unicase.h>
#include <unictype.h>
#include <unigbrk.h>
#include <uniname.h>
#include <unistr.h>
#include <unistring/version.h>
#include <uniwidth.h>
#include "arrays.h"
#include "integers.h"
#include "patterns.h"
#include "tables.h"
#include "text.h"
// Use inline version of the siphash code for performance:
#include "siphash.h"
#include "siphash-internals.h"
// One record per synthetic (multi-codepoint) grapheme cluster. Records are
// stored in the `synthetic_graphemes` array and filled in by
// get_synthetic_grapheme().
typedef struct {
// The codepoint used to represent the cluster as a whole: the first codepoint
// that is not a prepended concatenation mark (see get_synthetic_grapheme()).
ucs4_t main_codepoint;
ucs4_t *utf32_cluster; // length-prefixed
// UTF-8 encoding of the cluster; get_synthetic_grapheme() NUL-terminates it.
const uint8_t *utf8;
} synthetic_grapheme_t;
// Synthetic grapheme clusters (clusters of more than one codepoint):
static Table_t grapheme_ids_by_codepoints = {}; // ucs4_t* length-prefixed codepoints -> int32_t ID
// This will hold a dynamically growing array of synthetic graphemes:
static synthetic_grapheme_t *synthetic_graphemes = NULL;
// Number of slots allocated in `synthetic_graphemes`:
static int32_t synthetic_grapheme_capacity = 0;
// Number of slots in `synthetic_graphemes` that are currently in use:
static int32_t num_synthetic_graphemes = 0;
// Synthetic grapheme IDs are negative: ID -1 lives at array index 0, ID -2 at
// index 1, and so on (index = -(id)-1). These macros must only be given
// negative IDs.
// Number of codepoints in the cluster (the length prefix):
#define NUM_GRAPHEME_CODEPOINTS(id) (synthetic_graphemes[-(id)-1].utf32_cluster[0])
// Pointer to the first codepoint of the cluster (just past the length prefix):
#define GRAPHEME_CODEPOINTS(id) (&synthetic_graphemes[-(id)-1].utf32_cluster[1])
// The cluster's stored UTF-8 bytes:
#define GRAPHEME_UTF8(id) (synthetic_graphemes[-(id)-1].utf8)
// Somewhat arbitrarily chosen, if two short literal ASCII or grapheme chunks
// are concatenated below this length threshold, we just merge them into a
// single literal node instead of a concatenation node.
// (See concat2_assuming_safe(), which is where these thresholds are applied.)
#define SHORT_ASCII_LENGTH 64
#define SHORT_GRAPHEMES_LENGTH 16
// Forward declarations for helpers that are used before they are defined:
static Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize);
static Text_t simple_concatenation(Text_t a, Text_t b);
// The canonical zero-length text: an ASCII-tagged text with no bytes.
public Text_t EMPTY_TEXT = {
.length=0,
.tag=TEXT_ASCII,
.ascii=0,
};
PUREFUNC static bool graphemes_equal(const void *va, const void *vb, const TypeInfo_t*) {
ucs4_t *a = *(ucs4_t**)va;
ucs4_t *b = *(ucs4_t**)vb;
if (a[0] != b[0]) return false;
for (int i = 0; i < (int)a[0]; i++)
if (a[i] != b[i]) return false;
return true;
}
PUREFUNC static uint64_t grapheme_hash(const void *g, const TypeInfo_t*) {
ucs4_t *cluster = *(ucs4_t**)g;
return siphash24((void*)&cluster[1], sizeof(ucs4_t[cluster[0]]));
}
// Type info for the keys of `grapheme_ids_by_codepoints`: each key is a
// pointer to a length-prefixed codepoint array, compared with
// graphemes_equal() and hashed with grapheme_hash().
static const TypeInfo_t GraphemeClusterInfo = {
.size=sizeof(ucs4_t*),
.align=__alignof__(ucs4_t*),
.metamethods={
.equal=graphemes_equal,
.hash=grapheme_hash,
},
};
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wstack-protector"
// Return the (negative) synthetic grapheme ID for a multi-codepoint cluster,
// registering a new ID the first time a given cluster is seen.
//
//   codepoints: the cluster's codepoints (not length-prefixed)
//   utf32_len:  number of codepoints in `codepoints`
//
// Returns an ID `g < 0` such that synthetic_graphemes[-g-1] describes the cluster.
public int32_t get_synthetic_grapheme(const ucs4_t *codepoints, int64_t utf32_len)
{
    // Build a length-prefixed copy on the stack so it can be used as a lookup key:
    ucs4_t length_prefixed[1+utf32_len];
    length_prefixed[0] = (ucs4_t)utf32_len;
    for (int64_t i = 0; i < utf32_len; i++)
        length_prefixed[i+1] = codepoints[i];
    ucs4_t *ptr = &length_prefixed[0];

    // Optimization for common case of one frequently used synthetic grapheme:
    static int32_t last_grapheme = 0;
    if (last_grapheme != 0 && graphemes_equal(&ptr, &synthetic_graphemes[-last_grapheme-1].utf32_cluster, NULL))
        return last_grapheme;

    TypeInfo_t GraphemeIDLookupTableInfo = *Table$info(&GraphemeClusterInfo, &Int32$info);
    int32_t *found = Table$get(grapheme_ids_by_codepoints, &ptr, &GraphemeIDLookupTableInfo);
    if (found) return *found;

    // New synthetic grapheme:
    if (num_synthetic_graphemes >= synthetic_grapheme_capacity) {
        // If we don't have space, allocate more:
        synthetic_grapheme_capacity = MAX(128, synthetic_grapheme_capacity * 2);
        // Each entry stores pointers into GC-allocated arena chunks, so this
        // array must be scanned by the collector. GC_MALLOC_ATOMIC memory is
        // not scanned for pointers, so it must not be used here.
        synthetic_grapheme_t *new = GC_MALLOC(sizeof(synthetic_grapheme_t[synthetic_grapheme_capacity]));
        // On the first growth the old array is NULL; avoid handing NULL to memcpy.
        if (num_synthetic_graphemes > 0)
            memcpy(new, synthetic_graphemes, sizeof(synthetic_grapheme_t[num_synthetic_graphemes]));
        synthetic_graphemes = new;
    }

    int32_t grapheme_id = -(num_synthetic_graphemes+1);
    num_synthetic_graphemes += 1;

    // Get UTF8 representation:
    // NOTE(review): u32_to_u8() can return NULL on failure; that case is not handled here.
    uint8_t u8_buf[64];
    size_t u8_len = sizeof(u8_buf)/sizeof(u8_buf[0]);
    uint8_t *u8 = u32_to_u8(codepoints, (size_t)utf32_len, u8_buf, &u8_len);

    // For performance reasons, use an arena allocator here to ensure that
    // synthetic graphemes store all of their information in a densely packed
    // area with good cache locality:
    static void *arena = NULL, *arena_end = NULL;
    // Eat up any space needed to make arena 32-bit aligned:
    if ((size_t)arena % __alignof__(ucs4_t) != 0)
        arena += __alignof__(ucs4_t) - ((size_t)arena % __alignof__(ucs4_t));

    // If we have filled up this arena, allocate a new one:
    size_t needed_memory = sizeof(ucs4_t[1+utf32_len]) + sizeof(uint8_t[u8_len + 1]);
    if (arena + needed_memory > arena_end) {
        // Do reasonably big chunks at a time, so most synthetic codepoints are
        // nearby each other in memory and cache locality is good. This is a
        // rough guess at a good size:
        size_t chunk_size = MAX(needed_memory, 512);
        arena = GC_MALLOC_ATOMIC(chunk_size);
        arena_end = arena + chunk_size;
    }

    // Copy length-prefixed UTF32 codepoints into the arena and store where they live:
    ucs4_t *codepoint_copy = arena;
    memcpy(codepoint_copy, length_prefixed, sizeof(ucs4_t[1+utf32_len]));
    synthetic_graphemes[-grapheme_id-1].utf32_cluster = codepoint_copy;
    arena += sizeof(ucs4_t[1+utf32_len]);

    // Copy UTF8 bytes into the arena and store where they live:
    uint8_t *utf8_final = arena;
    memcpy(utf8_final, u8, sizeof(uint8_t[u8_len]));
    utf8_final[u8_len] = '\0'; // Add a terminating NUL byte
    synthetic_graphemes[-grapheme_id-1].utf8 = utf8_final;
    arena += sizeof(uint8_t[u8_len + 1]);

    // Sickos at the unicode consortium decreed that you can have grapheme clusters
    // that begin with *prefix* modifiers, so we gotta check for that case:
    synthetic_graphemes[-grapheme_id-1].main_codepoint = length_prefixed[1];
    for (ucs4_t i = 0; i < utf32_len; i++) {
#if _LIBUNISTRING_VERSION >= 0x010200
        // libunistring version 1.2.0 introduced uc_is_property_prepended_concatenation_mark()
        // It's not critical, but it's technically more correct to have this check:
        if (unlikely(uc_is_property_prepended_concatenation_mark(length_prefixed[1+i])))
            continue;
#endif
        synthetic_graphemes[-grapheme_id-1].main_codepoint = length_prefixed[1+i];
        break;
    }

    // Cleanup from unicode API:
    if (u8 != u8_buf) free(u8);

    Table$set(&grapheme_ids_by_codepoints, &codepoint_copy, &grapheme_id, &GraphemeIDLookupTableInfo);

    last_grapheme = grapheme_id;
    return grapheme_id;
}
#pragma GCC diagnostic pop
// Debugging aid: write an XML-like dump of the internal structure of `t` to
// `stream`. `depth` is the current nesting level and controls how many spaces
// of indentation are written before nested children.
// Returns the sum of the character counts reported by the output calls.
int text_visualize(FILE *stream, Text_t t, int depth)
{
    switch (t.tag) {
    case TEXT_ASCII:
        // `%.*s` requires an `int` precision argument, and `%ld` requires a
        // `long`, so the arguments are converted explicitly.
        return fprintf(stream, "<ascii length=%ld>%.*s</ascii>", (long)t.length, (int)t.length, t.ascii);
    case TEXT_GRAPHEMES: {
        int printed = fprintf(stream, "<graphemes length=%ld>", (long)t.length);
        printed += Text$print(stream, t);
        printed += fprintf(stream, "</graphemes>");
        return printed;
    }
    case TEXT_CONCAT: {
        int printed = fprintf(stream, "<concat depth=%ld length=%ld>\n", (long)t.depth, (long)t.length);
        // fputc() returns the character written (or EOF), not a count, so
        // each successful call contributes exactly one character.
        for (int i = 0; i < depth+1; i++)
            printed += (fputc(' ', stream) != EOF);
        printed += text_visualize(stream, *t.left, depth+1);
        printed += (fputc('\n', stream) != EOF);
        for (int i = 0; i < depth+1; i++)
            printed += (fputc(' ', stream) != EOF);
        printed += text_visualize(stream, *t.right, depth+1);
        printed += (fputc('\n', stream) != EOF);
        for (int i = 0; i < depth; i++)
            printed += (fputc(' ', stream) != EOF);
        printed += fprintf(stream, "</concat>");
        return printed;
    }
    default: return 0;
    }
}
// Write the UTF-8 encoding of `t` to `stream`.
// Returns the number of bytes reported written by fwrite().
public int Text$print(FILE *stream, Text_t t)
{
    if (t.length == 0) return 0;

    switch (t.tag) {
    case TEXT_ASCII: return (int)fwrite(t.ascii, sizeof(char), (size_t)t.length, stream);
    case TEXT_GRAPHEMES: {
        const int32_t *graphemes = t.graphemes;
        int written = 0;
        for (int64_t i = 0; i < t.length; i++) {
            int32_t grapheme = graphemes[i];
            if (grapheme >= 0) {
                // Ordinary codepoint: encode it as UTF-8 on the fly.
                uint8_t buf[8];
                size_t len = sizeof(buf);
                uint8_t *u8 = u32_to_u8((ucs4_t*)&grapheme, 1, buf, &len);
                written += (int)fwrite(u8, sizeof(char), len, stream);
                if (u8 != buf) free(u8);
            } else {
                // Synthetic grapheme: use its stored NUL-terminated UTF-8 bytes.
                const uint8_t *u8 = GRAPHEME_UTF8(grapheme);
                assert(u8);
                written += (int)fwrite(u8, sizeof(uint8_t), strlen((char*)u8), stream);
            }
        }
        return written;
    }
    case TEXT_CONCAT: {
        // The operands of `+` may be evaluated in either order, so the two
        // halves are printed in separate statements to guarantee that the
        // left half reaches the stream first.
        int written = Text$print(stream, *t.left);
        written += Text$print(stream, *t.right);
        return written;
    }
    default: return 0;
    }
}
// min_len_for_depth[d] is the minimum length a text of depth `d` must have in
// order to count as balanced (see IS_BALANCED_TEXT below).
static const int64_t min_len_for_depth[MAX_TEXT_DEPTH] = {
// Fibonacci numbers (skipping first two)
1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946,
17711, 28657, 46368, 75025, 121393, 196418, 317811, 514229, 832040, 1346269, 2178309, 3524578,
5702887, 9227465, 14930352, 24157817, 39088169, 63245986, 102334155, 165580141, 267914296,
433494437, 701408733, 1134903170, 1836311903, 2971215073, 4807526976, 7778742049,
};
// True when `t` is at least as long as the table requires for its depth.
// NOTE(review): indexes the table with (t).depth without a bounds check, so
// the depth must be below MAX_TEXT_DEPTH.
#define IS_BALANCED_TEXT(t) ((t).length >= min_len_for_depth[(t).depth])
// Insert `to_insert` into `balanced_texts`, an array of slots indexed in step
// with `min_len_for_depth`. Occupied slots visited along the way are folded
// into an accumulator (and emptied), and the accumulated text is finally
// stored into a single slot.
// NOTE(review): neither loop checks `i` against MAX_TEXT_DEPTH; this appears to
// rely on text lengths staying below the largest table entries — confirm.
static void insert_balanced(Text_t balanced_texts[MAX_TEXT_DEPTH], Text_t to_insert)
{
int i = 0;
Text_t accumulator = EMPTY_TEXT;
// Sweep up every occupied slot whose next threshold is still smaller than
// the text being inserted. Each slot's text is placed to the *left* of what
// has been accumulated so far.
for (; to_insert.length > min_len_for_depth[i + 1]; i++) {
if (balanced_texts[i].length) {
accumulator = simple_concatenation(balanced_texts[i], accumulator);
balanced_texts[i] = EMPTY_TEXT;
}
}
// The new text goes to the right of everything swept up so far:
accumulator = simple_concatenation(accumulator, to_insert);
// Keep climbing (absorbing occupied slots) for as long as the accumulator
// meets the length threshold for slot `i`:
while (accumulator.length >= min_len_for_depth[i]) {
if (balanced_texts[i].length) {
accumulator = simple_concatenation(balanced_texts[i], accumulator);
balanced_texts[i] = EMPTY_TEXT;
}
i++;
}
// The loop stopped one past the last slot whose threshold was met:
i--;
balanced_texts[i] = accumulator;
}
// Feed `text` into the balancing slots. Leaves, and concatenation nodes that
// are balanced and below the depth limit, are inserted whole; any other
// concatenation node is split and its children are inserted left-to-right.
static void insert_balanced_recursive(Text_t balanced_texts[MAX_TEXT_DEPTH], Text_t text)
{
    bool is_leaf = (text.tag != TEXT_CONCAT);
    if (is_leaf || (IS_BALANCED_TEXT(text) && text.depth < MAX_TEXT_DEPTH)) {
        insert_balanced(balanced_texts, text);
        return;
    }
    insert_balanced_recursive(balanced_texts, *text.left);
    insert_balanced_recursive(balanced_texts, *text.right);
}
// Build a rebalanced text holding the content of `a` followed by `b`: both
// inputs are distributed into per-depth slots, then the occupied slots are
// stitched back together from the lowest slot upward.
static Text_t rebalanced(Text_t a, Text_t b)
{
    Text_t slots[MAX_TEXT_DEPTH];
    memset(slots, 0, sizeof(slots));
    insert_balanced_recursive(slots, a);
    insert_balanced_recursive(slots, b);

    const int64_t total_length = a.length + b.length;
    Text_t merged = EMPTY_TEXT;
    int slot = 0;
    // Stop as soon as every grapheme has been collected:
    while (merged.length < total_length) {
        if (slots[slot].length)
            merged = simple_concatenation(slots[slot], merged);
        slot++;
    }
    return merged;
}
// Join `a` and `b` under a new concatenation node without inspecting their
// contents. An empty operand yields the other operand unchanged.
Text_t simple_concatenation(Text_t a, Text_t b)
{
    if (a.length == 0) return b;
    if (b.length == 0) return a;

    const uint16_t new_depth = 1 + MAX(a.depth, b.depth);
    // Rebalancing is deferred until the depth limit is reached. Intermediate
    // results of repeated concatenation are often thrown away, so keeping
    // every one of them balanced would be wasted work.
    if (new_depth >= MAX_TEXT_DEPTH)
        return rebalanced(a, b);

    // Both children share one allocation:
    Text_t *pair = GC_MALLOC(sizeof(Text_t[2]));
    pair[0] = a;
    pair[1] = b;
    Text_t node = {
        .tag=TEXT_CONCAT,
        .length=a.length + b.length,
        .depth=new_depth,
        .left=pair,
        .right=pair + 1,
    };
    return node;
}
// Concatenate `a` and `b` without checking whether the graphemes at the seam
// would combine under normalization (concat2() performs that check before
// calling this). Short leaf texts are merged into one flat leaf; everything
// else becomes a concatenation node.
static Text_t concat2_assuming_safe(Text_t a, Text_t b)
{
if (a.length == 0) return b;
if (b.length == 0) return a;
// Case 1: two short ASCII leaves are copied into one new ASCII leaf.
if (a.tag == TEXT_ASCII && b.tag == TEXT_ASCII && (size_t)(a.length + b.length) <= SHORT_ASCII_LENGTH) {
struct Text_s ret = {
.tag=TEXT_ASCII,
.length=a.length + b.length,
};
ret.ascii = GC_MALLOC_ATOMIC(sizeof(char[ret.length]));
memcpy((char*)ret.ascii, a.ascii, sizeof(char[a.length]));
memcpy((char*)&ret.ascii[a.length], b.ascii, sizeof(char[b.length]));
return ret;
// Case 2: two short grapheme leaves are copied into one new grapheme leaf.
} else if (a.tag == TEXT_GRAPHEMES && b.tag == TEXT_GRAPHEMES && (size_t)(a.length + b.length) <= SHORT_GRAPHEMES_LENGTH) {
struct Text_s ret = {
.tag=TEXT_GRAPHEMES,
.length=a.length + b.length,
};
ret.graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[ret.length]));
memcpy((int32_t*)ret.graphemes, a.graphemes, sizeof(int32_t[a.length]));
memcpy((int32_t*)&ret.graphemes[a.length], b.graphemes, sizeof(int32_t[b.length]));
return ret;
// Case 3: two short leaves of any remaining tag combination are merged into
// one grapheme leaf, widening ASCII bytes to 32-bit graphemes.
} else if (a.tag != TEXT_CONCAT && b.tag != TEXT_CONCAT && (size_t)(a.length + b.length) <= SHORT_GRAPHEMES_LENGTH) {
// Turn a small bit of ASCII into graphemes if it helps make things smaller
// Text structs come with an extra 8 bytes, so allocate enough to hold the text
struct Text_s ret = {
.tag=TEXT_GRAPHEMES,
.length=a.length + b.length,
};
ret.graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[ret.length]));
int32_t *dest = (int32_t*)ret.graphemes;
if (a.tag == TEXT_GRAPHEMES) {
// mempcpy returns the position just past the copied data:
dest = mempcpy(dest, a.graphemes, sizeof(int32_t[a.length]));
} else {
for (int64_t i = 0; i < a.length; i++)
*(dest++) = (int32_t)a.ascii[i];
}
if (b.tag == TEXT_GRAPHEMES) {
memcpy(dest, b.graphemes, sizeof(int32_t[b.length]));
} else {
for (int64_t i = 0; i < b.length; i++)
*(dest++) = (int32_t)b.ascii[i];
}
return ret;
}
// When a leaf is appended to a concatenation whose right child is itself a
// leaf, first try joining that right child with `b` (which may merge them
// into one leaf via the cases above), then reattach the result to `a`'s left
// child.
if (a.tag == TEXT_CONCAT && b.tag != TEXT_CONCAT && a.right->tag != TEXT_CONCAT)
return concat2_assuming_safe(*a.left, concat2_assuming_safe(*a.right, b));
// Otherwise, just make a concatenation node:
return simple_concatenation(a, b);
}
// Concatenate `a` and `b`, taking care of the seam: the last grapheme of `a`
// and the first grapheme of `b` may combine or change under NFC
// normalization. When they do, the two seam graphemes are replaced with a
// "glue" text built from the normalized codepoints.
static Text_t concat2(Text_t a, Text_t b)
{
    if (a.length == 0) return b;
    if (b.length == 0) return a;

    int32_t last_a = Text$get_grapheme(a, a.length-1);
    int32_t first_b = Text$get_grapheme(b, 0);

    // Magic number, we know that no codepoints below here trigger instability:
    static const int32_t LOWEST_CODEPOINT_TO_CHECK = 0x300; // COMBINING GRAVE ACCENT
    if (last_a >= 0 && last_a < LOWEST_CODEPOINT_TO_CHECK && first_b >= 0 && first_b < LOWEST_CODEPOINT_TO_CHECK)
        return concat2_assuming_safe(a, b);

    // Gather the codepoints of both seam graphemes into one buffer:
    size_t len = (last_a >= 0) ? 1 : NUM_GRAPHEME_CODEPOINTS(last_a);
    len += (first_b >= 0) ? 1 : NUM_GRAPHEME_CODEPOINTS(first_b);

    ucs4_t codepoints[len];
    ucs4_t *dest = codepoints;
    if (last_a < 0)
        dest = mempcpy(dest, GRAPHEME_CODEPOINTS(last_a), sizeof(ucs4_t[NUM_GRAPHEME_CODEPOINTS(last_a)]));
    else
        *(dest++) = (ucs4_t)last_a;

    if (first_b < 0)
        dest = mempcpy(dest, GRAPHEME_CODEPOINTS(first_b), sizeof(ucs4_t[NUM_GRAPHEME_CODEPOINTS(first_b)]));
    else
        *(dest++) = (ucs4_t)first_b;

    // Do a normalization run for these two codepoints and see if it looks different.
    // Normalization should not exceed 3x in the input length (but if it does, the
    // result is returned in a separately allocated buffer, which is why
    // `normalized` must be used below rather than `norm_buf`).
    // NOTE(review): a NULL return from u32_normalize() is not handled here.
    ucs4_t norm_buf[3*len];
    size_t norm_length = sizeof(norm_buf)/sizeof(norm_buf[0]);
    ucs4_t *normalized = u32_normalize(UNINORM_NFC, codepoints, len, norm_buf, &norm_length);
    bool stable = (norm_length == len && memcmp(codepoints, normalized, sizeof(codepoints)) == 0);

    if (stable) {
        // Even if normalization changed nothing, the two graphemes must be
        // merged when together they form a single grapheme cluster:
        const void *second_grapheme = u32_grapheme_next(normalized, &normalized[norm_length]);
        if (second_grapheme == &normalized[norm_length])
            stable = false;
    }

    if likely (stable) {
        if (normalized != norm_buf)
            free(normalized);
        return concat2_assuming_safe(a, b);
    }

    // Build the glue from the buffer that actually holds the normalized
    // codepoints. `norm_buf` holds them only when the result fit in it.
    Text_t glue = text_from_u32(normalized, (int64_t)norm_length, false);

    if (normalized != norm_buf)
        free(normalized);

    // Replace the last grapheme of `a` and the first grapheme of `b` with the glue:
    if (a.length == 1 && b.length == 1)
        return glue;
    else if (a.length == 1)
        return concat2_assuming_safe(glue, Text$slice(b, I(2), I(b.length)));
    else if (b.length == 1)
        return concat2_assuming_safe(Text$slice(a, I(1), I(a.length-1)), glue);
    else
        return concat2_assuming_safe(
            concat2_assuming_safe(Text$slice(a, I(1), I(a.length-1)), glue),
            Text$slice(b, I(2), I(b.length)));
}
// Concatenate `n` texts in order. Returns EMPTY_TEXT when `n` is zero.
public Text_t Text$_concat(int n, Text_t items[n])
{
    if (n == 0) return EMPTY_TEXT;

    Text_t joined = items[0];
    for (int i = 1; i < n; i++) {
        // Empty items contribute nothing:
        if (items[i].length <= 0)
            continue;
        joined = concat2(joined, items[i]);
    }
    return joined;
}
// Return `text` repeated `count` times.
// An empty `text`, or a `count` that is zero or negative, yields EMPTY_TEXT.
// Fails if the result would be longer than 2^40 graphemes.
public Text_t Text$repeat(Text_t text, Int_t count)
{
    if (text.length == 0 || Int$is_negative(count))
        return EMPTY_TEXT;

    Int_t result_len = Int$times(count, I(text.length));
    if (Int$compare_value(result_len, I(1l<<40)) > 0)
        fail("Text repeating would produce too big of an result!");

    int64_t count64 = Int64$from_int(count, false);
    // Zero repetitions must produce nothing. Without this check, the
    // accumulator below would start out holding one copy of `text`.
    if (count64 <= 0)
        return EMPTY_TEXT;

    Text_t ret = text;
    for (int64_t c = 1; c < count64; c++)
        ret = concat2(ret, text);
    return ret;
}
// Return the display width of `text` as computed by u8_strwidth(), with
// `language` converted to a C string and passed as its second argument.
public Int_t Text$width(Text_t text, Text_t language)
{
    const uint8_t *utf8 = (const uint8_t*)Text$as_c_string(text);
    const char *lang = Text$as_c_string(language);
    int columns = u8_strwidth(utf8, lang);
    return Int$from_int32(columns);
}
// Build a text that is exactly `target_width` columns wide by repeating
// `to_repeat`. Whole copies are used while they fit; the remainder is filled
// with leading graphemes of `to_repeat`, and with spaces if the next grapheme
// would overshoot the target.
// Returns EMPTY_TEXT when `target_width` is not positive, or when `to_repeat`
// has no display width (no amount of repetition could reach the target).
static Text_t Text$repeat_to_width(Text_t to_repeat, int64_t target_width, Text_t language)
{
    if (target_width <= 0)
        return EMPTY_TEXT;

    const char *lang_str = Text$as_c_string(language);
    int64_t width = (int64_t)u8_strwidth((const uint8_t*)Text$as_c_string(to_repeat), lang_str);
    // A zero-width unit would never advance `repeated_width`, so the loop
    // below would never terminate.
    if (width <= 0)
        return EMPTY_TEXT;

    Text_t repeated = EMPTY_TEXT;
    int64_t repeated_width = 0;
    // Whole copies:
    while (repeated_width + width <= target_width) {
        repeated = concat2(repeated, to_repeat);
        repeated_width += width;
    }

    // Partial copy, one grapheme at a time:
    if (repeated_width < target_width) {
        for (int64_t i = 0; repeated_width < target_width && i < to_repeat.length; i++) {
            Text_t c = Text$slice(to_repeat, I_small(i+1), I_small(i+1));
            int64_t w = (int64_t)u8_strwidth((const uint8_t*)Text$as_c_string(c), lang_str);
            if (repeated_width + w > target_width) {
                // The next grapheme is too wide to fit; fill the gap with spaces.
                repeated = concat2(repeated, Text$repeat(Text(" "), I(target_width - repeated_width)));
                repeated_width = target_width;
                break;
            }
            repeated = concat2(repeated, c);
            repeated_width += w;
        }
    }

    return repeated;
}
// Pad `text` on the left with repetitions of `padding` until it is `width`
// columns wide. Fails if `padding` is empty.
public Text_t Text$left_pad(Text_t text, Int_t width, Text_t padding, Text_t language)
{
    if (padding.length == 0)
        fail("Cannot pad with an empty text!");

    int64_t missing = Int64$from_int(width, false) - Int64$from_int(Text$width(text, language), false);
    Text_t fill = Text$repeat_to_width(padding, missing, language);
    return concat2(fill, text);
}
// Pad `text` on the right with repetitions of `padding` until it is `width`
// columns wide. Fails if `padding` is empty.
public Text_t Text$right_pad(Text_t text, Int_t width, Text_t padding, Text_t language)
{
    if (padding.length == 0)
        fail("Cannot pad with an empty text!");

    int64_t missing = Int64$from_int(width, false) - Int64$from_int(Text$width(text, language), false);
    Text_t fill = Text$repeat_to_width(padding, missing, language);
    return concat2(text, fill);
}
// Center `text` within `width` columns using repetitions of `padding`. When
// the missing width is odd, the extra column goes on the right. Fails if
// `padding` is empty.
public Text_t Text$middle_pad(Text_t text, Int_t width, Text_t padding, Text_t language)
{
    if (padding.length == 0)
        fail("Cannot pad with an empty text!");

    int64_t missing = Int64$from_int(width, false) - Int64$from_int(Text$width(text, language), false);
    int64_t left_width = missing/2;
    int64_t right_width = (missing+1)/2;
    return Texts(Text$repeat_to_width(padding, left_width, language), text, Text$repeat_to_width(padding, right_width, language));
}
// Return the graphemes of `text` from 1-based index `first_int` through
// `last_int`, inclusive. Negative indices count back from the end (-1 is the
// last grapheme). Indices that run off either end are clamped to the text.
// Fails when `first_int` is 0; returns EMPTY_TEXT when `last_int` is 0 or the
// range is empty. Leaf results point into the original storage (no copy).
public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int)
{
    int64_t first = Int64$from_int(first_int, false);
    int64_t last = Int64$from_int(last_int, false);
    if (first == 0) fail("Invalid index: 0");
    if (last == 0) return EMPTY_TEXT;

    if (first < 0) first = text.length + first + 1;
    if (last < 0) last = text.length + last + 1;

    // A negative `first` further back than the start of the text is still
    // below 1 here. Clamp it (mirroring the clamp on `last` below) so the
    // leaf cases never compute a pointer before the start of the storage.
    if (first < 1) first = 1;
    if (last > text.length) last = text.length;

    if (first > text.length || last < first)
        return EMPTY_TEXT;

    if (first == 1 && last == text.length)
        return text;

    // Walk down the tree until the range lies within a single leaf, or
    // straddles the two children of a concatenation node:
    while (text.tag == TEXT_CONCAT) {
        if (last < text.left->length) {
            text = *text.left;
        } else if (first > text.left->length) {
            first -= text.left->length;
            last -= text.left->length;
            text = *text.right;
        } else {
            // The upper bound passed for the left child is clamped to the
            // left child's own length by the recursive call.
            return concat2(Text$slice(*text.left, I(first), I(text.length)),
                           Text$slice(*text.right, I(1), I(last-text.left->length)));
        }
    }

    switch (text.tag) {
    case TEXT_ASCII: {
        return (Text_t){
            .tag=TEXT_ASCII,
            .length=last - first + 1,
            .ascii=text.ascii + (first-1),
        };
    }
    case TEXT_GRAPHEMES: {
        return (Text_t){
            .tag=TEXT_GRAPHEMES,
            .length=last - first + 1,
            .graphemes=text.graphemes + (first-1),
        };
    }
    default: errx(1, "Invalid tag");
    }
}
// Return the part of `text` starting at 1-based index `first` and running to
// the end of the text.
public Text_t Text$from(Text_t text, Int_t first)
{
    Text_t tail = Text$slice(text, first, I_small(-1));
    return tail;
}
// Return the part of `text` from its first grapheme through 1-based index
// `last`, inclusive.
public Text_t Text$to(Text_t text, Int_t last)
{
    Text_t head = Text$slice(text, I_small(1), last);
    return head;
}
// Return a new text with the graphemes of `text` in reverse order. Leaves are
// copied back-to-front; concatenation nodes are reversed by swapping and
// reversing their children.
public Text_t Text$reversed(Text_t text)
{
    switch (text.tag) {
    case TEXT_ASCII: {
        char *flipped = GC_MALLOC_ATOMIC(sizeof(char[text.length]));
        for (int64_t src = 0, dst = text.length - 1; src < text.length; src++, dst--)
            flipped[dst] = text.ascii[src];
        return (Text_t){
            .tag=TEXT_ASCII,
            .length=text.length,
            .ascii=flipped,
        };
    }
    case TEXT_GRAPHEMES: {
        int32_t *flipped = GC_MALLOC_ATOMIC(sizeof(int32_t[text.length]));
        for (int64_t src = 0, dst = text.length - 1; src < text.length; src++, dst--)
            flipped[dst] = text.graphemes[src];
        return (Text_t){
            .tag=TEXT_GRAPHEMES,
            .length=text.length,
            .graphemes=flipped,
        };
    }
    case TEXT_CONCAT: {
        Text_t new_left = Text$reversed(*text.right);
        Text_t new_right = Text$reversed(*text.left);
        return concat2(new_left, new_right);
    }
    default: errx(1, "Invalid tag");
    }
}
// Return a length-1 text holding the grapheme cluster at 1-based position
// `index_int`. Negative indices count back from the end of the text.
// Fails when the index is 0 or falls outside the text.
public PUREFUNC Text_t Text$cluster(Text_t text, Int_t index_int)
{
    int64_t index = Int64$from_int(index_int, false);
    if (index == 0) fail("Invalid index: 0");

    if (index < 0) index = text.length + index + 1;

    if (index > text.length || index < 1)
        fail("Invalid index: %ld is beyond the length of the text (length = %ld)",
             Int64$from_int(index_int, false), text.length);

    while (text.tag == TEXT_CONCAT) {
        if (index <= text.left->length) {
            text = *text.left;
        } else {
            // Rebase the index so it is relative to the right child;
            // otherwise the leaf lookup below reads past the leaf's storage.
            index -= text.left->length;
            text = *text.right;
        }
    }

    switch (text.tag) {
    case TEXT_ASCII: {
        struct Text_s ret = {
            .tag=TEXT_ASCII,
            .length=1,
            .ascii=GC_MALLOC_ATOMIC(sizeof(char)),
        };
        *(char*)&ret.ascii[0] = text.ascii[index-1];
        return ret;
    }
    case TEXT_GRAPHEMES: {
        struct Text_s ret = {
            .tag=TEXT_GRAPHEMES,
            .length=1,
            .graphemes=GC_MALLOC_ATOMIC(sizeof(int32_t)),
        };
        *(int32_t*)&ret.graphemes[0] = text.graphemes[index-1];
        return ret;
    }
    default: errx(1, "Invalid tag");
    }
}
// Build a grapheme-tagged text from `num_codepoints` UTF-32 codepoints.
// When `normalize` is true the input is first converted to NFC. Each grapheme
// cluster becomes one array entry: single-codepoint clusters are stored
// directly and longer clusters are stored as synthetic grapheme IDs.
// NOTE(review): `norm_buf` has size MIN(256, 3*num_codepoints), which is 0 when
// num_codepoints is 0; callers visible here appear to always pass at least
// one codepoint — confirm.
Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize)
{
// Normalization is apparently guaranteed to never exceed 3x in the input length
ucs4_t norm_buf[MIN(256, 3*num_codepoints)];
if (normalize) {
size_t norm_length = sizeof(norm_buf)/sizeof(norm_buf[0]);
// The result may be `norm_buf` itself or a different buffer (hence the
// pointer comparison before free() at the end of this function).
// NOTE(review): a NULL return from u32_normalize() is not handled.
ucs4_t *normalized = u32_normalize(UNINORM_NFC, codepoints, (size_t)num_codepoints, norm_buf, &norm_length);
codepoints = normalized;
num_codepoints = (int64_t)norm_length;
}
// Intentionally overallocate here: allocate assuming each codepoint is a
// grapheme cluster. If that's not true, we'll have extra space at the end
// of the array, but the length will still be calculated correctly.
int32_t *graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[num_codepoints]));
struct Text_s ret = {
.tag=TEXT_GRAPHEMES,
.length=0,
.graphemes=graphemes,
};
const ucs4_t *src = codepoints;
// Consume one grapheme cluster per iteration:
while (src < &codepoints[num_codepoints]) {
// TODO: use grapheme breaks instead of u32_grapheme_next()?
const ucs4_t *next = u32_grapheme_next(src, &codepoints[num_codepoints]);
if (next == &src[1]) {
// Single-codepoint cluster: store the codepoint itself
graphemes[ret.length] = (int32_t)*src;
} else {
// Synthetic grapheme
graphemes[ret.length] = get_synthetic_grapheme(src, next-src);
}
++ret.length;
src = next;
}
// Only free the normalized codepoints if they did not land in `norm_buf`:
if (normalize && codepoints != norm_buf) free(codepoints);
return ret;
}
// Build a text from the first `len` bytes of `str`.
// Pure-ASCII input is copied into an ASCII-tagged text. Anything else must be
// valid UTF-8 and is decoded and NFC-normalized.
// Returns NONE_TEXT when the bytes are not valid UTF-8.
public OptionalText_t Text$from_strn(const char *str, size_t len)
{
    // Measure the leading run of ASCII bytes:
    int64_t ascii_span = 0;
    for (size_t i = 0; i < len && isascii(str[i]); i++)
        ascii_span++;

    if (ascii_span == (int64_t)len) { // All ASCII
        char *copy = GC_MALLOC_ATOMIC(len);
        memcpy(copy, str, len);
        return (Text_t){
            .tag=TEXT_ASCII,
            .length=ascii_span,
            .ascii=copy,
        };
    } else {
        if (u8_check((uint8_t*)str, len) != NULL)
            return NONE_TEXT;

        ucs4_t buf[128];
        size_t length = sizeof(buf)/sizeof(buf[0]);

        // Decode exactly the `len` bytes that were validated above. Measuring
        // with strlen() instead would read past `len` when the caller's
        // buffer is longer than `len` or is not NUL-terminated.
        ucs4_t *codepoints = u8_to_u32((uint8_t*)str, len, buf, &length);
        Text_t ret = text_from_u32(codepoints, (int64_t)length, true);
        if (codepoints != buf) free(codepoints);
        return ret;
    }
}
// Build a text from a NUL-terminated C string. A NULL pointer yields an empty
// text.
public OptionalText_t Text$from_str(const char *str)
{
    if (!str)
        return Text("");
    return Text$from_strn(str, strlen(str));
}
// Append the UTF-8 encoding of `text` to the growable buffer `*buf`.
//   buf:      the buffer; reallocated with GC_REALLOC when it is too small
//   capacity: size of `*buf` in bytes; updated whenever the buffer grows
//   i:        number of bytes already used in `*buf`; advanced past the
//             appended bytes
static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i)
{
    if (text.tag == TEXT_CONCAT) {
        u8_buf_append(*text.left, buf, capacity, i);
        u8_buf_append(*text.right, buf, capacity, i);
        return;
    }

    if (text.tag == TEXT_ASCII) {
        // ASCII bytes are already valid UTF-8 and can be copied directly.
        int64_t end = *i + text.length;
        if (end > (int64_t)*capacity) {
            *capacity = end + 1;
            *buf = GC_REALLOC(*buf, (size_t)*capacity);
        }
        memcpy(*buf + *i, text.ascii, (size_t)text.length);
        *i = end;
        return;
    }

    if (text.tag != TEXT_GRAPHEMES)
        return;

    const int32_t *graphemes = text.graphemes;
    for (int64_t g = 0; g < text.length; g++) {
        uint8_t scratch[64];
        size_t num_bytes = sizeof(scratch);
        uint8_t *converted = NULL;
        const uint8_t *bytes;
        if (graphemes[g] >= 0) {
            // Ordinary codepoint: encode it now.
            converted = u32_to_u8((ucs4_t*)&graphemes[g], 1, scratch, &num_bytes);
            bytes = converted;
        } else {
            // Synthetic grapheme: use its stored UTF-8 bytes.
            bytes = GRAPHEME_UTF8(graphemes[g]);
            num_bytes = u8_strlen(bytes);
        }

        int64_t end = *i + (int64_t)num_bytes;
        if (end > (int64_t)*capacity) {
            *capacity = end + 1;
            *buf = GC_REALLOC(*buf, (size_t)*capacity);
        }
        memcpy(*buf + *i, bytes, num_bytes);
        *i = end;

        // Only free the encoder's output if it did not land in `scratch`:
        if (converted != NULL && converted != scratch)
            free(converted);
    }
}
// Return a freshly GC-allocated, NUL-terminated UTF-8 copy of `text`.
public char *Text$as_c_string(Text_t text)
{
    // Start with one byte per grapheme plus the terminator; u8_buf_append()
    // grows the buffer when the encoding needs more room.
    int64_t capacity = text.length + 1;
    int64_t used = 0;
    char *utf8 = GC_MALLOC_ATOMIC((size_t)capacity);
    u8_buf_append(text, &utf8, &capacity, &used);

    // Make sure there is room for the terminating NUL:
    if (used >= (int64_t)capacity) {
        capacity = used + 1;
        utf8 = GC_REALLOC(utf8, (size_t)capacity);
    }
    utf8[used] = '\0';
    return utf8;
}
// Hash metamethod for Text. The text is hashed as a sequence of 32-bit
// graphemes fed to siphash two at a time, so that texts with the same
// graphemes hash identically whether they are stored as ASCII, as a grapheme
// array, or as a concatenation tree.
PUREFUNC public uint64_t Text$hash(const void *obj, const TypeInfo_t*)
{
    Text_t text = *(Text_t*)obj;
    siphash sh;
    siphashinit(&sh, sizeof(int32_t[text.length]));

    // Two graphemes are packed into each 64-bit word handed to siphash:
    union {
        int32_t chunks[2];
        uint64_t whole;
    } tmp;

    switch (text.tag) {
    case TEXT_ASCII: {
        const char *bytes = text.ascii;
        for (int64_t i = 0; i + 1 < text.length; i += 2) {
            tmp.chunks[0] = (int32_t)bytes[i];
            tmp.chunks[1] = (int32_t)bytes[i+1];
            siphashadd64bits(&sh, tmp.whole);
        }
        int32_t last = text.length & 0x1 ? (int32_t)bytes[text.length-1] : 0; // Odd number of graphemes
        return siphashfinish_last_part(&sh, (uint64_t)last);
    }
    case TEXT_GRAPHEMES: {
        const int32_t *graphemes = text.graphemes;
        for (int64_t i = 0; i + 1 < text.length; i += 2) {
            tmp.chunks[0] = graphemes[i];
            // The second half of the word is the *next* grapheme:
            tmp.chunks[1] = graphemes[i+1];
            siphashadd64bits(&sh, tmp.whole);
        }
        int32_t last = text.length & 0x1 ? graphemes[text.length-1] : 0; // Odd number of graphemes
        return siphashfinish_last_part(&sh, (uint64_t)last);
    }
    case TEXT_CONCAT: {
        TextIter_t state = NEW_TEXT_ITER_STATE(text);
        for (int64_t i = 0; i < (text.length & ~0x1); i += 2) {
            tmp.chunks[0] = Text$get_grapheme_fast(&state, i);
            // Fill the second half of the word rather than overwriting the first:
            tmp.chunks[1] = Text$get_grapheme_fast(&state, i+1);
            siphashadd64bits(&sh, tmp.whole);
        }

        int32_t last = (text.length & 0x1) ? Text$get_grapheme_fast(&state, text.length-1) : 0;
        return siphashfinish_last_part(&sh, (uint64_t)last);
    }
    default: errx(1, "Invalid text");
    }
}
// Return the grapheme at 0-based position `index` of the text that `state`
// iterates over, or 0 when `index` is out of range.
// `state` keeps a stack of (subtext, offset) entries describing the path from
// the root (stack[0]) down to the most recently visited node, so lookups near
// the previous one do not have to restart from the root.
public int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index)
{
if (index < 0) return 0;
if (index >= state->stack[0].text.length) return 0;
assert(state->stack[0].text.depth <= MAX_TEXT_DEPTH);
// Go up the stack as needed:
// (pop entries until the current entry's range [offset, offset+length) contains `index`)
while (index < state->stack[state->stack_index].offset
|| index >= state->stack[state->stack_index].offset + state->stack[state->stack_index].text.length) {
state->stack_index -= 1;
assert(state->stack_index >= 0);
}
assert(state->stack_index >= 0 && state->stack_index <= MAX_TEXT_DEPTH);
// Go down the stack as needed:
// (push the child that contains `index` until a leaf is reached)
while (state->stack[state->stack_index].text.tag == TEXT_CONCAT) {
Text_t text = state->stack[state->stack_index].text;
int64_t offset = state->stack[state->stack_index].offset;
assert(state->stack_index <= MAX_TEXT_DEPTH);
assert(index >= offset);
assert(index < offset + text.length);
state->stack_index += 1;
if (index < offset + text.left->length) {
state->stack[state->stack_index].text = *text.left;
state->stack[state->stack_index].offset = offset;
} else {
// The right child starts where the left child ends:
state->stack[state->stack_index].text = *text.right;
state->stack[state->stack_index].offset = offset + text.left->length;
}
assert(state->stack_index >= 0 && state->stack_index <= MAX_TEXT_DEPTH);
}
// The top of the stack is now a leaf; double-check that it covers `index`:
Text_t text = state->stack[state->stack_index].text;
int64_t offset = state->stack[state->stack_index].offset;
if (index < offset || index >= offset + text.length) {
return 0;
}
switch (text.tag) {
case TEXT_ASCII: return (int32_t)text.ascii[index - offset];
case TEXT_GRAPHEMES: return text.graphemes[index - offset];
default: errx(1, "Invalid text");
}
return 0;
}
// Like Text$get_grapheme_fast(), but a synthetic grapheme is replaced with
// its recorded main codepoint, so the result is always a codepoint value.
public uint32_t Text$get_main_grapheme_fast(TextIter_t *state, int64_t index)
{
    int32_t grapheme = Text$get_grapheme_fast(state, index);
    if (grapheme < 0)
        return synthetic_graphemes[-grapheme-1].main_codepoint;
    return (ucs4_t)grapheme;
}
// Comparison metamethod for Text: compares grapheme by grapheme, with each
// grapheme compared by its codepoints. When one text is a prefix of the
// other, the shorter text orders first.
// Returns a negative, zero, or positive value.
PUREFUNC public int32_t Text$compare(const void *va, const void *vb, const TypeInfo_t*)
{
    if (va == vb) return 0;
    const Text_t a = *(const Text_t*)va;
    const Text_t b = *(const Text_t*)vb;

    // TODO: make this smarter and more efficient
    int64_t len = MAX(a.length, b.length);
    TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
    for (int64_t i = 0; i < len; i++) {
        // Past the end of the shorter text, Text$get_grapheme_fast() yields 0.
        int32_t ai = Text$get_grapheme_fast(&a_state, i);
        int32_t bi = Text$get_grapheme_fast(&b_state, i);
        if (ai == bi) continue;
        int32_t cmp;
        // Only *negative* values are synthetic grapheme IDs. Zero must be
        // handled as an ordinary codepoint; treating it as synthetic would
        // index one slot before the start of the synthetic grapheme array.
        if (ai >= 0 && bi >= 0) {
            cmp = u32_cmp((ucs4_t*)&ai, (ucs4_t*)&bi, 1);
        } else if (ai >= 0) {
            cmp = u32_cmp2(
                (ucs4_t*)&ai, 1,
                GRAPHEME_CODEPOINTS(bi),
                NUM_GRAPHEME_CODEPOINTS(bi));
        } else if (bi >= 0) {
            cmp = u32_cmp2(
                GRAPHEME_CODEPOINTS(ai),
                NUM_GRAPHEME_CODEPOINTS(ai),
                (ucs4_t*)&bi, 1);
        } else {
            cmp = u32_cmp2(
                GRAPHEME_CODEPOINTS(ai),
                NUM_GRAPHEME_CODEPOINTS(ai),
                GRAPHEME_CODEPOINTS(bi),
                NUM_GRAPHEME_CODEPOINTS(bi));
        }
        if (cmp != 0) return cmp;
    }
    // Every position compared equal. The lengths can still differ if the
    // longer text ends in U+0000 (indistinguishable from the past-the-end
    // value above), so break the tie on length to stay consistent with
    // Text$equal_values().
    return (int32_t)(a.length > b.length) - (int32_t)(a.length < b.length);
}
// Return true when the first graphemes of `text` are exactly the graphemes of
// `prefix`.
PUREFUNC public bool Text$starts_with(Text_t text, Text_t prefix)
{
    if (prefix.length > text.length)
        return false;

    TextIter_t text_state = NEW_TEXT_ITER_STATE(text), prefix_state = NEW_TEXT_ITER_STATE(prefix);
    int64_t pos = 0;
    while (pos < prefix.length) {
        int32_t expected = Text$get_grapheme_fast(&prefix_state, pos);
        int32_t actual = Text$get_grapheme_fast(&text_state, pos);
        if (actual != expected)
            return false;
        pos++;
    }
    return true;
}
// Return true when the last graphemes of `text` are exactly the graphemes of
// `suffix`.
PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix)
{
    if (suffix.length > text.length)
        return false;

    TextIter_t text_state = NEW_TEXT_ITER_STATE(text), suffix_state = NEW_TEXT_ITER_STATE(suffix);
    // Position in `text` where the suffix would have to begin:
    const int64_t start = text.length - suffix.length;
    int64_t pos = 0;
    while (pos < suffix.length) {
        int32_t actual = Text$get_grapheme_fast(&text_state, start + pos);
        int32_t expected = Text$get_grapheme_fast(&suffix_state, pos);
        if (actual != expected)
            return false;
        pos++;
    }
    return true;
}
// Return true when `a` and `b` have the same length and the same grapheme at
// every position, regardless of how each text is stored internally.
PUREFUNC public bool Text$equal_values(Text_t a, Text_t b)
{
    if (a.length != b.length)
        return false;

    const int64_t count = a.length;
    TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
    // TODO: make this smarter and more efficient
    int64_t pos = 0;
    while (pos < count) {
        int32_t from_a = Text$get_grapheme_fast(&a_state, pos);
        int32_t from_b = Text$get_grapheme_fast(&b_state, pos);
        if (from_a != from_b)
            return false;
        pos++;
    }
    return true;
}
// Equality with the untyped-pointer signature (type info unused): identical
// pointers are trivially equal, otherwise the pointed-to texts are compared
// by value with Text$equal_values().
PUREFUNC public bool Text$equal(const void *a, const void *b, const TypeInfo_t*)
{
    return a == b || Text$equal_values(*(Text_t*)a, *(Text_t*)b);
}
// Case-insensitive equality of two texts.
//
// The texts must have the same number of graphemes. Graphemes with identical
// values match immediately; any other pair is compared with libunistring's
// u32_casecmp() under NFC normalization, using `language` (converted to a C
// string) as the language argument.
//
// Returns true only if every grapheme pair matches. If u32_casecmp() itself
// reports an error, the texts are treated as *not* equal rather than
// silently reported as equal.
PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b, Text_t language)
{
    if (a.length != b.length)
        return false;
    int64_t len = a.length;
    TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b);
    const char *uc_language = Text$as_c_string(language);
    for (int64_t i = 0; i < len; i++) {
        int32_t ai = Text$get_grapheme_fast(&a_state, i);
        int32_t bi = Text$get_grapheme_fast(&b_state, i);
        if (ai == bi) continue;

        // Non-negative values are single codepoints; negative values are
        // synthetic grapheme ids that expand to several codepoints.
        const ucs4_t *a_codepoints = ai >= 0 ? (ucs4_t*)&ai : GRAPHEME_CODEPOINTS(ai);
        int64_t a_len = ai >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(ai);
        const ucs4_t *b_codepoints = bi >= 0 ? (ucs4_t*)&bi : GRAPHEME_CODEPOINTS(bi);
        int64_t b_len = bi >= 0 ? 1 : NUM_GRAPHEME_CODEPOINTS(bi);

        int cmp = 0;
        // u32_casecmp() returns 0 on success and stores the ordering in
        // `cmp`; a nonzero return means `cmp` cannot be trusted.
        int status = u32_casecmp(a_codepoints, (size_t)a_len, b_codepoints, (size_t)b_len,
                                 uc_language, UNINORM_NFC, &cmp);
        if (status != 0 || cmp != 0)
            return false;
    }
    return true;
}
// Return an uppercased copy of `text`, using libunistring's u32_toupper()
// with NFC normalization and `language` as the language argument. Empty text
// is returned unchanged.
public Text_t Text$upper(Text_t text, Text_t language)
{
    if (text.length == 0) return text;

    enum { STACK_CAPACITY = 128 };
    Array_t input = Text$utf32_codepoints(text);
    const char *lang_code = Text$as_c_string(language);

    // Offer a stack buffer first; u32_toupper() hands back a different
    // pointer when it did not use that buffer, and that result is released
    // with free() below.
    ucs4_t stack_buf[STACK_CAPACITY];
    size_t result_len = STACK_CAPACITY;
    ucs4_t *result = u32_toupper(input.data, (size_t)input.length, lang_code, UNINORM_NFC, stack_buf, &result_len);
    // NOTE(review): a NULL result (conversion failure) is not checked here.
    Text_t uppercased = text_from_u32(result, (int64_t)result_len, false);
    if (result != stack_buf)
        free(result);
    return uppercased;
}
// Return a lowercased copy of `text`, using libunistring's u32_tolower()
// with NFC normalization and `language` as the language argument. Empty text
// is returned unchanged.
public Text_t Text$lower(Text_t text, Text_t language)
{
    if (text.length == 0) return text;

    enum { STACK_CAPACITY = 128 };
    Array_t input = Text$utf32_codepoints(text);
    const char *lang_code = Text$as_c_string(language);

    // Offer a stack buffer first; u32_tolower() hands back a different
    // pointer when it did not use that buffer, and that result is released
    // with free() below.
    ucs4_t stack_buf[STACK_CAPACITY];
    size_t result_len = STACK_CAPACITY;
    ucs4_t *result = u32_tolower(input.data, (size_t)input.length, lang_code, UNINORM_NFC, stack_buf, &result_len);
    // NOTE(review): a NULL result (conversion failure) is not checked here.
    Text_t lowercased = text_from_u32(result, (int64_t)result_len, false);
    if (result != stack_buf)
        free(result);
    return lowercased;
}
// Return a title-cased copy of `text`, using libunistring's u32_totitle()
// with NFC normalization and `language` as the language argument. Empty text
// is returned unchanged.
public Text_t Text$title(Text_t text, Text_t language)
{
    if (text.length == 0) return text;

    enum { STACK_CAPACITY = 128 };
    Array_t input = Text$utf32_codepoints(text);
    const char *lang_code = Text$as_c_string(language);

    // Offer a stack buffer first; u32_totitle() hands back a different
    // pointer when it did not use that buffer, and that result is released
    // with free() below.
    ucs4_t stack_buf[STACK_CAPACITY];
    size_t result_len = STACK_CAPACITY;
    ucs4_t *result = u32_totitle(input.data, (size_t)input.length, lang_code, UNINORM_NFC, stack_buf, &result_len);
    // NOTE(review): a NULL result (conversion failure) is not checked here.
    Text_t titlecased = text_from_u32(result, (int64_t)result_len, false);
    if (result != stack_buf)
        free(result);
    return titlecased;
}
// Argument-info callback for the Text printf conversion: declares that the
// conversion consumes one argument, passed as a pointer, and records
// sizeof(Text_t) as its size. Returns 1 (the argument count), or -1 when the
// caller provided no room in `argtypes`/`sizes`.
// NOTE(review): the signature matches glibc's custom-conversion arginfo
// hook; presumably this is registered alongside printf_text() -- confirm.
public int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n])
{
    (void)info;
    if (n == 0)
        return -1;

    argtypes[0] = PA_POINTER;
    sizes[0] = sizeof(Text_t);
    return 1;
}
public int printf_text(FILE *stream, const struct printf_info *info, const void *const args[])
{
Text_t *t = *(Text_t**)args[0];
if (info->alt)
return text_visualize(stream, *t, 0);
else
return Text$print(stream, *t);
}
// Build a quoted, escaped representation of `text` delimited by `quote_char`.
//
// - If `colorize` is true, ANSI color sequences are emitted around the
//   quoted text and around each escape sequence.
// - If `quote_char` is not one of `"`, `'` or `` ` ``, the opening quote is
//   prefixed with `$`.
// - Common control characters are written as backslash escapes (`\n`, `\t`,
//   ...); other control characters (and DEL) are written as `\xNN`. The
//   first escape in a run is introduced with `$`; directly following escapes
//   continue the run without another `$` (tracked by `just_escaped`).
// - `$` is written literally inside single quotes and escaped otherwise.
static INLINE Text_t _quoted(Text_t text, bool colorize, char quote_char)
{
    Text_t ret = colorize ? Text("\x1b[35m") : EMPTY_TEXT;
    if (quote_char != '"' && quote_char != '\'' && quote_char != '`')
        ret = concat2_assuming_safe(ret, Text("$"));

    Text_t quote_text = Text$from_strn(&quote_char, 1);
    ret = concat2_assuming_safe(ret, quote_text);

// Append the escape `\<str>`, opening a new `$` escape run if the previous
// grapheme was not itself an escape.
#define add_escaped(str) ({ if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m")); \
                          if (!just_escaped) ret = concat2_assuming_safe(ret, Text("$")); \
                          ret = concat2_assuming_safe(ret, Text("\\" str)); \
                          just_escaped = true; \
                          if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[0;35m")); })
    TextIter_t state = NEW_TEXT_ITER_STATE(text);
    bool just_escaped = false;
    // TODO: optimize for spans of non-escaped text
    for (int64_t i = 0; i < text.length; i++) {
        int32_t g = Text$get_grapheme_fast(&state, i);
        switch (g) {
        case '\a': add_escaped("a"); break;
        case '\b': add_escaped("b"); break;
        case '\x1b': add_escaped("e"); break;
        case '\f': add_escaped("f"); break;
        case '\n': add_escaped("n"); break;
        case '\r': add_escaped("r"); break;
        case '\t': add_escaped("t"); break;
        case '\v': add_escaped("v"); break;
        case '\\': {
            // A backslash right after an escape would read as a continuation
            // of that escape run, so it has to be escaped itself there.
            if (just_escaped) {
                add_escaped("\\");
            } else {
                ret = concat2_assuming_safe(ret, Text("\\"));
                just_escaped = false;
            }
            break;
        }
        case '$': {
            if (quote_char == '\'') {
                ret = concat2_assuming_safe(ret, Text("$"));
                just_escaped = false;
            } else {
                add_escaped("$");
            }
            break;
        }
        case '\x00' ... '\x06': case '\x0E' ... '\x1A':
        case '\x1C' ... '\x1F': case '\x7F' ... '\x7F': {
            if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m"));
            ret = concat2_assuming_safe(ret, Text("\\x"));
            // Two hex digits plus the terminating NUL that snprintf() writes:
            // the buffer must hold 3 bytes, not 2.
            char tmp[3];
            snprintf(tmp, sizeof(tmp), "%02X", (unsigned)g);
            ret = concat2_assuming_safe(ret, Text$from_strn(tmp, 2));
            if (colorize)
                ret = concat2_assuming_safe(ret, Text("\x1b[0;35m"));
            just_escaped = true;
            break;
        }
        default: {
            if (g == quote_char) {
                // NOTE(review): the quote character is appended as-is (no
                // escape prefix) and `just_escaped` is left untouched --
                // confirm this is the intended output.
                ret = concat2_assuming_safe(ret, quote_text);
            } else {
                ret = concat2_assuming_safe(ret, Text$slice(text, I(i+1), I(i+1)));
                just_escaped = false;
            }
            break;
        }
        }
    }
#undef add_escaped

    ret = concat2_assuming_safe(ret, quote_text);
    if (colorize)
        ret = concat2_assuming_safe(ret, Text("\x1b[m"));

    return ret;
}
// Produce the printable (source-like) representation of a text value.
//
// vtext:    pointer to the Text_t to render, or NULL to get the *type name*
//           instead of a value.
// colorize: whether to include ANSI color escape sequences.
// info:     type info for the text's language; may be NULL, in which case
//           the value is treated as plain Text.
//
// Behavior:
// - Language "Path": NULL renders as `Path`; a value renders as `(<text>)`.
// - NULL vtext otherwise renders as the language name (or `Text`).
// - Patterns are delimited with `/`, or with `|` when the text contains `/`
//   but no `|`.
// - Other texts pick the quote character that minimizes escaping (see the
//   comments below) and are rendered with _quoted().
// - Texts with a language other than Text/Pattern get a `$<lang>` prefix.
public Text_t Text$as_text(const void *vtext, bool colorize, const TypeInfo_t *info)
{
    // `info` may be NULL (every later use guards against it), so it has to
    // be checked before its language is inspected here as well.
    if (info && info->TextInfo.lang && streq(info->TextInfo.lang, "Path")) {
        if (!vtext) return Text("Path");
        Text_t text = *(Text_t*)vtext;
        return Text$format("(%s%k%s)", colorize ? "\x1b[35m" : "", &text, colorize ? "\x1b[m" : "");
    }

    if (!vtext) return info && info->TextInfo.lang ? Text$from_str(info->TextInfo.lang) : Text("Text");

    Text_t text = *(Text_t*)vtext;
    char quote_char;
    if (info == &Pattern$info) {
        quote_char = Text$has(text, Pattern("/")) && !Text$has(text, Pattern("|")) ? '|' : '/';
    } else {
        // Figure out the best quotation mark to use:
        bool has_dollar = false, has_double_quote = false, has_backtick = false,
             has_single_quote = false, needs_escapes = false;
        TextIter_t state = NEW_TEXT_ITER_STATE(text);
        for (int64_t i = 0; i < text.length; i++) {
            int32_t g = Text$get_grapheme_fast(&state, i);
            if (g == '$') {
                has_dollar = true;
            } else if (g == '"') {
                has_double_quote = true;
            } else if (g == '`') {
                has_backtick = true;
            } else if (g == '\'') {
                has_single_quote = true;
            } else if (g == (g & 0x7F) && (g == '\n' || g == '\r' || g == '\t' || !isprint((unsigned char)g))) {
                // Only 7-bit values are classified here; the mask also
                // excludes negative (synthetic grapheme) values.
                needs_escapes = true;
            }
        }

        // If there's dollar signs and/or double quotes in the string, it would
        // be nice to avoid needing to escape them by using single quotes, but
        // only if we don't have single quotes or need to escape anything else
        // (because single quotes don't have interpolation):
        if ((has_dollar || has_double_quote) && !has_single_quote && !needs_escapes)
            quote_char = '\'';
        // If there is a double quote, but no backtick, we can save a bit of
        // escaping by using backtick instead of double quote:
        else if (has_double_quote && !has_backtick)
            quote_char = '`';
        // Otherwise fall back to double quotes as the default quoting style:
        else
            quote_char = '"';
    }

    Text_t as_text = _quoted(text, colorize, quote_char);
    if (info && info->TextInfo.lang && info != &Text$info && info != &Pattern$info)
        as_text = Text$concat(
            colorize ? Text("\x1b[1m$") : Text("$"),
            Text$from_str(info->TextInfo.lang),
            colorize ? Text("\x1b[0m") : Text(""),
            as_text);
    return as_text;
}
// Quote and escape `text` using double quotes as the delimiter, optionally
// with ANSI colors.
public Text_t Text$quoted(Text_t text, bool colorize)
{
    const char double_quote = '"';
    return _quoted(text, colorize, double_quote);
}
// Concatenate the texts in `pieces`, inserting `glue` between neighbors.
// An empty array yields EMPTY_TEXT; a single piece is returned as-is.
public Text_t Text$join(Text_t glue, Array_t pieces)
{
    if (pieces.length == 0) return EMPTY_TEXT;

    // Seed with the first piece, then append "<glue><piece>" for the rest.
    Text_t joined = *(Text_t*)pieces.data;
    for (int64_t idx = 1; idx < pieces.length; idx++) {
        Text_t *piece = (Text_t*)(pieces.data + idx*pieces.stride);
        joined = Text$concat(joined, glue, *piece);
    }
    return joined;
}
// printf-style constructor: format the arguments according to `fmt` and
// return the result as a Text.
//
// The output is formatted twice: once with a zero-sized buffer to learn the
// required length, then into a GC-allocated buffer of exactly that size.
// Each pass needs its own va_list, since a va_list that has been consumed
// by vsnprintf() must not be reused. If formatting fails (negative length),
// EMPTY_TEXT is returned.
__attribute__((format(printf, 1, 2)))
public Text_t Text$format(const char *fmt, ...)
{
    va_list args;
    va_start(args, fmt);

    // Pass 1: measure, using a private copy of the argument list.
    va_list measure_args;
    va_copy(measure_args, args);
    int len = vsnprintf(NULL, 0, fmt, measure_args);
    va_end(measure_args);

    if (len < 0) {
        va_end(args);
        return EMPTY_TEXT;
    }

    // Pass 2: format for real into a buffer with room for the NUL byte.
    char *str = GC_MALLOC_ATOMIC((size_t)len + 1);
    vsnprintf(str, (size_t)len + 1, fmt, args);
    va_end(args);

    return Text$from_str(str);
}
// Split `text` into an array of one-grapheme texts, in order. Each element
// is produced by slicing the text at a single (1-based) position.
public Array_t Text$clusters(Text_t text)
{
    Array_t result = {};
    int64_t pos = 1;
    while (pos <= text.length) {
        Text_t single = Text$slice(text, I(pos), I(pos));
        Array$insert(&result, &single, I_small(0), sizeof(Text_t));
        pos++;
    }
    return result;
}
// Flatten `text` into an array of UTF-32 codepoints. A non-negative grapheme
// value contributes itself; a negative (synthetic) grapheme contributes each
// of its constituent codepoints in order.
public Array_t Text$utf32_codepoints(Text_t text)
{
    Array_t out = {.atomic=1};
    TextIter_t iter = NEW_TEXT_ITER_STATE(text);
    for (int64_t pos = 0; pos < text.length; pos++) {
        int32_t grapheme = Text$get_grapheme_fast(&iter, pos);
        if (grapheme >= 0) {
            Array$insert(&out, &grapheme, I_small(0), sizeof(ucs4_t));
            continue;
        }

        // Synthetic grapheme: expand into its individual codepoints.
        for (int64_t k = 0; k < NUM_GRAPHEME_CODEPOINTS(grapheme); k++) {
            ucs4_t member = GRAPHEME_CODEPOINTS(grapheme)[k];
            Array$insert(&out, &member, I_small(0), sizeof(ucs4_t));
        }
    }
    return out;
}
// View the text's C-string form as a byte array. The array points directly
// at the string returned by Text$as_c_string(), has a stride of one byte,
// and its length is the string's strlen() (so the terminating NUL is not
// included).
public Array_t Text$utf8_bytes(Text_t text)
{
    const char *utf8 = Text$as_c_string(text);
    Array_t bytes = {.length=strlen(utf8), .stride=1, .atomic=1, .data=(void*)utf8};
    return bytes;
}
// Return a name for codepoint `c` in a freshly GC-allocated buffer. The
// result is the name from unicode_character_name() when it finds one;
// otherwise a fallback of the form "<block name>-<HEX>" is built from the
// codepoint's Unicode block (which is asserted to exist).
static INLINE const char *codepoint_name(ucs4_t c)
{
    char *buf = GC_MALLOC_ATOMIC(UNINAME_MAX);
    char *official = unicode_character_name(c, buf);
    if (!official) {
        const uc_block_t *block = uc_block(c);
        assert(block);
        snprintf(buf, UNINAME_MAX, "%s-%X", block->name, c);
        return buf;
    }
    return official;
}
// Return an array of Texts holding the name of every codepoint in `text`,
// in order. Synthetic (negative) graphemes contribute one name per
// constituent codepoint. Names come from codepoint_name().
public Array_t Text$codepoint_names(Text_t text)
{
    Array_t result = {};
    TextIter_t iter = NEW_TEXT_ITER_STATE(text);
    for (int64_t pos = 0; pos < text.length; pos++) {
        int32_t grapheme = Text$get_grapheme_fast(&iter, pos);

        // Treat both cases uniformly as "a run of codepoints": either the
        // synthetic grapheme's members or the lone codepoint itself.
        ucs4_t lone = (ucs4_t)grapheme;
        const ucs4_t *members = grapheme < 0 ? GRAPHEME_CODEPOINTS(grapheme) : &lone;
        int64_t member_count = grapheme < 0 ? NUM_GRAPHEME_CODEPOINTS(grapheme) : 1;

        for (int64_t k = 0; k < member_count; k++) {
            Text_t as_text = Text$from_str(codepoint_name(members[k]));
            Array$insert(&result, &as_text, I_small(0), sizeof(Text_t));
        }
    }
    return result;
}
// Build a Text from an array of 32-bit codepoints. The array is compacted
// first when its stride is not exactly sizeof(int32_t), so the data handed
// to text_from_u32() is a dense run of codepoints.
public Text_t Text$from_codepoints(Array_t codepoints)
{
    bool densely_packed = (codepoints.stride == sizeof(int32_t));
    if (!densely_packed)
        Array$compact(&codepoints, sizeof(int32_t));

    Text_t text = text_from_u32(codepoints.data, codepoints.length, true);
    return text;
}
// Build a Text from an array of Unicode character names. Each name is
// resolved with unicode_name_character(); if any name is not recognized,
// NONE_TEXT is returned instead of a text.
public OptionalText_t Text$from_codepoint_names(Array_t codepoint_names)
{
    Array_t resolved = {};
    int64_t idx = 0;
    while (idx < codepoint_names.length) {
        Text_t *entry = (Text_t*)(codepoint_names.data + idx*codepoint_names.stride);
        ucs4_t codepoint = unicode_name_character(Text$as_c_string(*entry));
        if (codepoint == UNINAME_INVALID)
            return NONE_TEXT;
        Array$insert(&resolved, &codepoint, I_small(0), sizeof(ucs4_t));
        idx++;
    }
    return Text$from_codepoints(resolved);
}
// Build a Text from an array of bytes. The array is compacted first when its
// stride is not one byte, then its contents are handed to Text$from_strn()
// together with the byte count.
public OptionalText_t Text$from_bytes(Array_t bytes)
{
    bool densely_packed = (bytes.stride == sizeof(int8_t));
    if (!densely_packed)
        Array$compact(&bytes, sizeof(int8_t));

    size_t byte_count = (size_t)bytes.length;
    return Text$from_strn(bytes.data, byte_count);
}
// Split `text` into an array of lines.
//
// A line ends at "\n" or at "\r" immediately followed by "\n"; the
// terminator is not included in the line. Text after the final terminator
// forms a last line. A trailing terminator does not produce an extra empty
// line, and empty text yields an empty array.
public Array_t Text$lines(Text_t text)
{
    Array_t lines = {};
    TextIter_t state = NEW_TEXT_ITER_STATE(text);
    // `line_start` is the 0-based index of the first grapheme of the line
    // currently being scanned; Text$slice() takes 1-based inclusive bounds.
    for (int64_t i = 0, line_start = 0; i < text.length; i++) {
        int32_t grapheme = Text$get_grapheme_fast(&state, i);
        if (grapheme == '\r' && i + 1 < text.length
            && Text$get_grapheme_fast(&state, i + 1) == '\n') { // CRLF
            Text_t line = Text$slice(text, I(line_start+1), I(i));
            Array$insert(&lines, &line, I_small(0), sizeof(Text_t));
            i += 1; // skip the LF that follows the CR
            line_start = i + 1;
        } else if (grapheme == '\n') { // newline
            Text_t line = Text$slice(text, I(line_start+1), I(i));
            Array$insert(&lines, &line, I_small(0), sizeof(Text_t));
            line_start = i + 1;
        } else if (i == text.length-1) { // last line (no trailing newline)
            // Reaching this branch means the final grapheme is not a line
            // terminator, so the line holds at least that one grapheme and
            // must be emitted even when it is exactly one grapheme long.
            Text_t line = Text$slice(text, I(line_start+1), I(i+1));
            Array$insert(&lines, &line, I_small(0), sizeof(Text_t));
        }
    }
    return lines;
}
// State carried between calls of the line iterator (see next_line() and
// Text$by_line()).
typedef struct {
// Grapheme iterator over the text being split; next_line() also reads the
// text itself back out of state.stack[0].text.
TextIter_t state;
// 0-based grapheme index at which the next line to be returned begins.
int64_t i;
} line_iter_state_t;
// Iterator step for Text$by_line(): return the next line of the text held in
// `state`, or NONE_TEXT when there are no more lines.
//
// A line ends at "\n" or at "\r" immediately followed by "\n"; the
// terminator is not part of the returned line. `state->i` is advanced past
// the returned line (and its terminator) so the next call continues from
// there.
static OptionalText_t next_line(line_iter_state_t *state)
{
    Text_t text = state->state.stack[0].text;
    for (int64_t i = state->i; i < text.length; i++) {
        int32_t grapheme = Text$get_grapheme_fast(&state->state, i);
        if (grapheme == '\r' && i + 1 < text.length
            && Text$get_grapheme_fast(&state->state, i + 1) == '\n') { // CRLF
            Text_t line = Text$slice(text, I(state->i+1), I(i));
            state->i = i + 2; // resume after both the CR and the LF
            return line;
        } else if (grapheme == '\n') { // newline
            Text_t line = Text$slice(text, I(state->i+1), I(i));
            state->i = i + 1;
            return line;
        } else if (i == text.length-1) { // last line (no trailing newline)
            // The final grapheme is not a terminator here, so the remaining
            // line is non-empty and must be returned even when it is a
            // single grapheme long.
            Text_t line = Text$slice(text, I(state->i+1), I(i+1));
            state->i = i + 1;
            return line;
        }
    }
    return NONE_TEXT;
}
// Create a line iterator for `text`: a closure whose function is next_line()
// and whose userdata is a newly allocated iteration state positioned at the
// start of the text.
public Closure_t Text$by_line(Text_t text)
{
    line_iter_state_t *iter_state = new(line_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0);
    Closure_t iterator = {
        .fn=(void*)next_line,
        .userdata=iter_state,
    };
    return iterator;
}
PUREFUNC public bool Text$is_none(const void *t, const TypeInfo_t*)
{
return ((Text_t*)t)->length < 0;
}
// Write the Text that `obj` points to onto `out`: first the byte length of
// its C-string form (serialized as an Int64), then that many raw bytes,
// without a terminating NUL.
public void Text$serialize(const void *obj, FILE *out, Table_t *pointers, const TypeInfo_t *)
{
    Text_t *text = (Text_t*)obj;
    const char *utf8 = Text$as_c_string(*text);
    int64_t byte_count = (int64_t)strlen(utf8);

    Int64$serialize(&byte_count, out, pointers, &Int64$info);
    fwrite(utf8, sizeof(char), (size_t)byte_count, out);
}
// Read a Text from `in` and store it in `*out` (which must point to a
// Text_t). The expected layout is an Int64 byte length followed by that many
// raw bytes.
//
// - If the length cannot be read or is negative, `*out` is set to
//   EMPTY_TEXT and nothing further is read.
// - If fewer bytes than announced are available, the text is built from the
//   bytes that were actually read.
public void Text$deserialize(FILE *in, void *out, Array_t *pointers, const TypeInfo_t *)
{
    int64_t len = -1;
    Int64$deserialize(in, &len, pointers, &Int64$info);
    if (len < 0) {
        // A negative length would turn into an enormous size_t below.
        // NOTE(review): consider reporting this through the project's error
        // mechanism instead of yielding an empty text.
        *(Text_t*)out = EMPTY_TEXT;
        return;
    }

    // One extra byte for the NUL terminator, stored at index `bytes_read`
    // (at most `len`), i.e. inside the allocation.
    char *buf = GC_MALLOC_ATOMIC((size_t)len+1);
    size_t bytes_read = fread(buf, sizeof(char), (size_t)len, in);
    buf[bytes_read] = '\0';
    *(Text_t*)out = Text$from_strn(buf, bytes_read);
}
// Type descriptor for the Text type: records the size and alignment of
// Text_t, tags the type as TextInfo with the language name "Text", and
// attaches the Text$metamethods table.
public const TypeInfo_t Text$info = {
.size=sizeof(Text_t),
.align=__alignof__(Text_t),
.tag=TextInfo,
// Text$as_text() compares against &Text$info to decide whether a `$<lang>`
// prefix is needed, so plain Text values print without one.
.TextInfo={.lang="Text"},
.metamethods=Text$metamethods,
};
// Convert literal `text` into a pattern, one grapheme at a time:
// - `{` becomes `{1{}`;
// - a grapheme whose first codepoint is `?`, a quotation mark, or the left
//   half of a paired punctuation mark is wrapped as `{1<grapheme>}`;
// - everything else is copied through unchanged.
public Pattern_t Pattern$escape_text(Text_t text)
{
    // TODO: optimize for spans of non-escaped text
    Text_t escaped = EMPTY_TEXT;
    TextIter_t iter = NEW_TEXT_ITER_STATE(text);
    for (int64_t pos = 0; pos < text.length; pos++) {
        int32_t g = Text$get_grapheme_fast(&iter, pos);
        if (g == '{') {
            escaped = concat2_assuming_safe(escaped, Text("{1{}"));
            continue;
        }

        // Classify by the first codepoint (for synthetic graphemes, the
        // first of their constituent codepoints).
        ucs4_t first = g < 0 ? GRAPHEME_CODEPOINTS(g)[0] : (ucs4_t)g;
        bool needs_wrapping = first == '?'
            || uc_is_property_quotation_mark(first)
            || (uc_is_property_paired_punctuation(first) && uc_is_property_left_of_pair(first));

        Text_t cluster = Text$slice(text, I(pos+1), I(pos+1));
        if (needs_wrapping)
            escaped = Text$concat(escaped, Text("{1"), cluster, Text("}"));
        else
            escaped = concat2_assuming_safe(escaped, cluster);
    }
    return escaped;
}
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0