Overhaul of Text implementation to be more like Cords and have much

better performance for long sequences of repeated concatenation.
This commit is contained in:
Bruce Hill 2025-01-23 15:33:56 -05:00
parent c60ea2079f
commit f93dde1449
10 changed files with 477 additions and 558 deletions

View File

@ -34,6 +34,7 @@ static CORD compile_none(type_t *t);
static CORD compile_to_type(env_t *env, ast_t *ast, type_t *t);
static CORD check_none(type_t *t, CORD value);
static CORD optional_into_nonnone(type_t *t, CORD value);
static CORD compile_string_literal(CORD literal);
CORD promote_to_optional(type_t *t, CORD code)
{
@ -569,7 +570,7 @@ CORD compile_statement(env_t *env, ast_t *ast)
const uint8_t *raw = (const uint8_t*)CORD_to_const_char_star(test->output);
uint8_t buf[128] = {0};
size_t norm_len = sizeof(buf);
uint8_t *norm = u8_normalize(UNINORM_NFD, (uint8_t*)raw, strlen((char*)raw)+1, buf, &norm_len);
uint8_t *norm = u8_normalize(UNINORM_NFC, (uint8_t*)raw, strlen((char*)raw)+1, buf, &norm_len);
assert(norm[norm_len-1] == 0);
output = CORD_from_char_star((char*)norm);
if (norm && norm != buf) free(norm);
@ -579,7 +580,7 @@ CORD compile_statement(env_t *env, ast_t *ast)
auto decl = Match(test->expr, Declare);
const char *varname = Match(decl->var, Var)->name;
if (streq(varname, "_"))
return compile_statement(env, WrapAST(ast, DocTest, .expr=decl->value, .output=test->output, .skip_source=test->skip_source));
return compile_statement(env, WrapAST(ast, DocTest, .expr=decl->value, .output=output, .skip_source=test->skip_source));
CORD var = CORD_all("$", Match(decl->var, Var)->name);
type_t *t = get_type(env, decl->value);
CORD val_code = compile_maybe_incref(env, decl->value, t);
@ -593,7 +594,7 @@ CORD compile_statement(env_t *env, ast_t *ast)
compile_declaration(t, var),
var, val_code,
compile_type_info(env, get_type(env, decl->value)),
CORD_quoted(output),
compile_string_literal(output),
(int64_t)(test->expr->start - test->expr->file->text),
(int64_t)(test->expr->end - test->expr->file->text));
} else if (test->expr->tag == Assign) {
@ -612,12 +613,12 @@ CORD compile_statement(env_t *env, ast_t *ast)
"test((%r), %r, %r, %ld, %ld);",
compile_assignment(env, assign->targets->ast, value),
compile_type_info(env, lhs_t),
CORD_quoted(test->output),
compile_string_literal(output),
(int64_t)(test->expr->start - test->expr->file->text),
(int64_t)(test->expr->end - test->expr->file->text));
} else {
// Multi-assign or assignment to potentially non-idempotent targets
if (test->output && assign->targets->next)
if (output && assign->targets->next)
code_err(ast, "Sorry, but doctesting with '=' is not supported for multi-assignments");
CORD code = "test(({ // Assignment\n";
@ -643,7 +644,7 @@ CORD compile_statement(env_t *env, ast_t *ast)
CORD_appendf(&code, "$1; }), %r, %r, %ld, %ld);",
compile_type_info(env, first_type),
CORD_quoted(test->output),
compile_string_literal(output),
(int64_t)(test->expr->start - test->expr->file->text),
(int64_t)(test->expr->end - test->expr->file->text));
return code;
@ -667,7 +668,7 @@ CORD compile_statement(env_t *env, ast_t *ast)
compile_lvalue(env, update->lhs),
compile_statement(env, update_var),
compile_type_info(env, lhs_t),
CORD_quoted(test->output),
compile_string_literal(output),
(int64_t)(test->expr->start - test->expr->file->text),
(int64_t)(test->expr->end - test->expr->file->text));
} else if (expr_t->tag == VoidType || expr_t->tag == AbortType || expr_t->tag == ReturnType) {
@ -681,7 +682,7 @@ CORD compile_statement(env_t *env, ast_t *ast)
"test(%r, %r, %r, %ld, %ld);",
compile(env, test->expr),
compile_type_info(env, expr_t),
CORD_quoted(output),
compile_string_literal(output),
(int64_t)(test->expr->start - test->expr->file->text),
(int64_t)(test->expr->end - test->expr->file->text));
}
@ -1953,7 +1954,7 @@ CORD compile_math_method(env_t *env, binop_e op, ast_t *lhs, ast_t *rhs, type_t
return CORD_EMPTY;
}
static CORD compile_string_literal(CORD literal)
CORD compile_string_literal(CORD literal)
{
CORD code = "\"";
CORD_pos i;

View File

@ -9,12 +9,22 @@ etc.).
## Implementation
Internally, Tomo text's implementation is based on [Raku's
strings](https://docs.raku.org/language/unicode). Strings store their grapheme
cluster count and either a compact array of 8-bit ASCII characters (for ASCII
text), an array of 32-bit normal-form grapheme cluster values (see below), or a
flat subarray of multiple texts that are either ASCII or graphemes (the
structure is not arbitrarily nested).
Internally, the implementation of Tomo's text is based on [Raku/MoarVM's
strings](https://docs.raku.org/language/unicode) and [Boehm et al.'s
Cords](https://www.cs.tufts.edu/comp/150FP/archive/hans-boehm/ropes.pdf).
Each text stores its grapheme cluster count and one of the following: a compact
array of 8-bit ASCII characters (for ASCII text), an array of 32-bit normal-form
grapheme cluster values (see below), or a (roughly) balanced binary tree
concatenation of two texts. The upside is that each concatenation is typically
a constant-time operation, with an occasional small rebalancing step.
Index-based text operations (like retrieving an arbitrary index or slicing) are
very fast: typically constant time for arbitrary Unicode text, and in the
worst-case scenario (text built from many concatenations) `O(log(n))` time with
small constant factors, typically amounting to only a handful of steps. Since
concatenations share substructures, they are very memory-efficient and are well
suited to applications like a text editor that stores the full edit history of
a large file's contents.
### Normal-Form Graphemes

View File

@ -17,15 +17,16 @@ PUREFUNC public Text_t Byte$as_text(const void *b, bool colorize, const TypeInfo
}
// Render `byte` as two hexadecimal digits. `uppercase` selects A-F over a-f,
// and `prefix` adds a leading "0x". The text length is taken from snprintf's
// return value.
public Text_t Byte$hex(Byte_t byte, bool uppercase, bool prefix) {
Text_t text = {.tag=TEXT_SHORT_ASCII};
struct Text_s text = {.tag=TEXT_ASCII};
// 8 bytes covers the longest output ("0x" + two digits + NUL), assuming
// Byte_t is 8 bits wide -- TODO confirm.
text.ascii = GC_MALLOC_ATOMIC(8);
if (prefix && uppercase)
text.length = (int64_t)snprintf(text.short_ascii, sizeof(text.short_ascii), "0x%02X", byte);
text.length = (int64_t)snprintf((char*)text.ascii, 8, "0x%02X", byte);
else if (prefix && !uppercase)
text.length = (int64_t)snprintf(text.short_ascii, sizeof(text.short_ascii), "0x%02x", byte);
text.length = (int64_t)snprintf((char*)text.ascii, 8, "0x%02x", byte);
else if (!prefix && uppercase)
text.length = (int64_t)snprintf(text.short_ascii, sizeof(text.short_ascii), "%02X", byte);
text.length = (int64_t)snprintf((char*)text.ascii, 8, "%02X", byte);
else if (!prefix && !uppercase)
text.length = (int64_t)snprintf(text.short_ascii, sizeof(text.short_ascii), "%02x", byte);
text.length = (int64_t)snprintf((char*)text.ascii, 8, "%02x", byte);
return text;
}

View File

@ -66,18 +66,24 @@ typedef struct Range_s {
Int_t first, last, step;
} Range_t;
enum text_type { TEXT_SHORT_ASCII, TEXT_ASCII, TEXT_SHORT_GRAPHEMES, TEXT_GRAPHEMES, TEXT_SUBTEXT };
enum text_type { TEXT_ASCII, TEXT_GRAPHEMES, TEXT_CONCAT };
typedef struct Text_s {
int64_t length; // Number of grapheme clusters
uint64_t hash:61;
uint8_t tag:3;
int64_t length:54; // Number of grapheme clusters
uint8_t depth:8;
uint8_t tag:2;
union {
char short_ascii[8];
const char *ascii;
int32_t short_graphemes[2];
const int32_t *graphemes;
struct Text_s *subtexts;
struct {
const char *ascii;
// char ascii_buf[8];
};
struct {
const int32_t *graphemes;
// int32_t grapheme_buf[2];
};
struct {
const struct Text_s *left, *right;
};
};
} Text_t;

View File

@ -74,7 +74,7 @@ public void Optional$deserialize(FILE *in, void *outval, Array_t *pointers, cons
_deserialize(in, outval, pointers, nonnull);
} else {
if (nonnull->tag == TextInfo)
*(Text_t*)outval = (Text_t){.length=-1};
*(Text_t*)outval = NONE_TEXT;
else if (nonnull->tag == ArrayInfo)
*(Array_t*)outval = (Array_t){.length=-1};
else if (nonnull->tag == TableInfo)

View File

@ -36,7 +36,7 @@ typedef struct {
static INLINE void skip_whitespace(TextIter_t *state, int64_t *i)
{
while (*i < state->text.length) {
while (*i < state->stack[0].text.length) {
int32_t grapheme = Text$get_grapheme_fast(state, *i);
if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme))
return;
@ -46,7 +46,7 @@ static INLINE void skip_whitespace(TextIter_t *state, int64_t *i)
static INLINE bool match_grapheme(TextIter_t *state, int64_t *i, int32_t grapheme)
{
if (*i < state->text.length && Text$get_grapheme_fast(state, *i) == grapheme) {
if (*i < state->stack[0].text.length && Text$get_grapheme_fast(state, *i) == grapheme) {
*i += 1;
return true;
}
@ -57,7 +57,7 @@ static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str)
{
int64_t matched = 0;
while (matched[str]) {
if (*i + matched >= state->text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched])
if (*i + matched >= state->stack[0].text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched])
return false;
matched += 1;
}
@ -67,7 +67,7 @@ static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str)
static INLINE bool match_property(TextIter_t *state, int64_t *i, uc_property_t prop)
{
if (*i >= state->text.length) return false;
if (*i >= state->stack[0].text.length) return false;
uint32_t grapheme = Text$get_main_grapheme_fast(state, *i);
// TODO: check every codepoint in the cluster?
if (uc_is_property(grapheme, prop)) {
@ -95,7 +95,7 @@ static const char *get_property_name(TextIter_t *state, int64_t *i)
skip_whitespace(state, i);
char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
char *dest = name;
while (*i < state->text.length) {
while (*i < state->stack[0].text.length) {
int32_t grapheme = Text$get_grapheme_fast(state, *i);
if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) {
*dest = (char)grapheme;
@ -406,10 +406,10 @@ static int64_t match_num(TextIter_t *state, int64_t index)
static int64_t match_newline(TextIter_t *state, int64_t index)
{
if (index >= state->text.length)
if (index >= state->stack[0].text.length)
return -1;
uint32_t grapheme = index >= state->text.length ? 0 : Text$get_main_grapheme_fast(state, index);
uint32_t grapheme = index >= state->stack[0].text.length ? 0 : Text$get_main_grapheme_fast(state, index);
if (grapheme == '\n')
return 1;
if (grapheme == '\r' && Text$get_grapheme_fast(state, index + 1) == '\n')
@ -419,7 +419,7 @@ static int64_t match_newline(TextIter_t *state, int64_t index)
static int64_t match_pat(TextIter_t *state, int64_t index, pat_t pat)
{
Text_t text = state->text;
Text_t text = state->stack[0].text;
int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(state, index);
switch (pat.tag) {
@ -516,7 +516,7 @@ static pat_t parse_next_pat(TextIter_t *state, int64_t *index)
int32_t close = open;
uc_mirror_char((ucs4_t)open, (ucs4_t*)&close);
if (!match_grapheme(state, index, close))
fail("Pattern's closing quote is missing: %k", &state->text);
fail("Pattern's closing quote is missing: %k", &state->stack[0].text);
return (pat_t){
.tag=PAT_QUOTE,
@ -531,7 +531,7 @@ static pat_t parse_next_pat(TextIter_t *state, int64_t *index)
int32_t close = open;
uc_mirror_char((ucs4_t)open, (ucs4_t*)&close);
if (!match_grapheme(state, index, close))
fail("Pattern's closing brace is missing: %k", &state->text);
fail("Pattern's closing brace is missing: %k", &state->stack[0].text);
return (pat_t){
.tag=PAT_PAIR,
@ -571,19 +571,19 @@ static pat_t parse_next_pat(TextIter_t *state, int64_t *index)
skip_whitespace(state, index);
int32_t grapheme = Text$get_grapheme_fast(state, (*index)++);
if (!match_grapheme(state, index, '}'))
fail("Missing closing '}' in pattern: %k", &state->text);
fail("Missing closing '}' in pattern: %k", &state->stack[0].text);
return PAT(PAT_GRAPHEME, .grapheme=grapheme);
} else if (strlen(prop_name) == 1) {
// Single letter names: {1+ A}
skip_whitespace(state, index);
if (!match_grapheme(state, index, '}'))
fail("Missing closing '}' in pattern: %k", &state->text);
fail("Missing closing '}' in pattern: %k", &state->stack[0].text);
return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]);
}
skip_whitespace(state, index);
if (!match_grapheme(state, index, '}'))
fail("Missing closing '}' in pattern: %k", &state->text);
fail("Missing closing '}' in pattern: %k", &state->stack[0].text);
switch (tolower(prop_name[0])) {
case '.':
@ -677,7 +677,7 @@ static int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t
return 0;
int64_t start_index = text_index;
TextIter_t pattern_state = {pattern, 0, 0}, text_state = {text, 0, 0};
TextIter_t pattern_state = NEW_TEXT_ITER_STATE(pattern), text_state = NEW_TEXT_ITER_STATE(text);
pat_t pat = parse_next_pat(&pattern_state, &pattern_index);
if (pat.min == -1 && pat.max == -1) {
@ -778,7 +778,7 @@ static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
TextIter_t text_state = {text, 0, 0};
TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
for (int64_t i = first; i <= last; i++) {
// Optimization: quickly skip ahead to first char in pattern:
if (find_first) {
@ -881,12 +881,12 @@ typedef struct {
static OptionalMatch_t next_match(match_iter_state_t *state)
{
if (Int_to_Int64(state->i, false) > state->state.text.length)
if (Int_to_Int64(state->i, false) > state->state.stack[0].text.length)
return NONE_MATCH;
OptionalMatch_t m = Text$find(state->state.text, state->pattern, state->i);
OptionalMatch_t m = Text$find(state->state.stack[0].text, state->pattern, state->i);
if (m.index.small == 0) // No match
state->i = I(state->state.text.length + 1);
state->i = I(state->state.stack[0].text.length + 1);
else
state->i = Int$plus(m.index, I(MAX(1, m.text.length)));
return m;
@ -896,7 +896,7 @@ public Closure_t Text$by_match(Text_t text, Pattern_t pattern)
{
return (Closure_t){
.fn=(void*)next_match,
.userdata=new(match_iter_state_t, .state={text, 0, 0}, .i=I_small(1), .pattern=pattern),
.userdata=new(match_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=I_small(1), .pattern=pattern),
};
}
@ -911,7 +911,7 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
Text_t ret = Text("");
TextIter_t replacement_state = {replacement, 0, 0};
TextIter_t replacement_state = NEW_TEXT_ITER_STATE(replacement);
int64_t nonmatching_pos = 0;
for (int64_t pos = 0; pos < replacement.length; ) {
// Optimization: quickly skip ahead to first char in the backref pattern:
@ -965,14 +965,14 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep
public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t backref_pat, bool recursive)
{
Text_t ret = {.length=0};
Text_t ret = EMPTY_TEXT;
int32_t first_grapheme = Text$get_grapheme(pattern, 0);
bool find_first = (first_grapheme != '{'
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
TextIter_t text_state = {text, 0, 0};
TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
int64_t nonmatching_pos = 0;
for (int64_t pos = 0; pos < text.length; ) {
// Optimization: quickly skip ahead to first char in pattern:
@ -1030,14 +1030,14 @@ public Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool tri
public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn)
{
Text_t ret = {.length=0};
Text_t ret = EMPTY_TEXT;
int32_t first_grapheme = Text$get_grapheme(pattern, 0);
bool find_first = (first_grapheme != '{'
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
TextIter_t text_state = {text, 0, 0};
TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
int64_t nonmatching_pos = 0;
Text_t (*text_mapper)(Match_t, void*) = fn.fn;
@ -1086,7 +1086,7 @@ public void Text$each(Text_t text, Pattern_t pattern, Closure_t fn)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
TextIter_t text_state = {text, 0, 0};
TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
void (*action)(Match_t, void*) = fn.fn;
for (int64_t pos = 0; pos < text.length; pos++) {
// Optimization: quickly skip ahead to first char in pattern:
@ -1118,7 +1118,7 @@ public Text_t Text$replace_all(Text_t text, Table_t replacements, Text_t backref
{
if (replacements.entries.length == 0) return text;
Text_t ret = {.length=0};
Text_t ret = EMPTY_TEXT;
int64_t nonmatch_pos = 0;
for (int64_t pos = 0; pos < text.length; ) {
@ -1194,11 +1194,11 @@ typedef struct {
static OptionalText_t next_split(split_iter_state_t *state)
{
Text_t text = state->state.text;
Text_t text = state->state.stack[0].text;
if (state->i >= text.length) {
if (state->pattern.length > 0 && state->i == text.length) { // special case
state->i = text.length + 1;
return (Text_t){.length=0};
return EMPTY_TEXT;
}
return NONE_TEXT;
}
@ -1220,7 +1220,7 @@ static OptionalText_t next_split(split_iter_state_t *state)
state->i = MAX(found + len, state->i + 1);
return Text$slice(text, I(start+1), I(found));
} else {
state->i = state->state.text.length + 1;
state->i = state->state.stack[0].text.length + 1;
return Text$slice(text, I(start+1), I(text.length));
}
}
@ -1229,7 +1229,7 @@ public Closure_t Text$by_split(Text_t text, Pattern_t pattern)
{
return (Closure_t){
.fn=(void*)next_split,
.userdata=new(split_iter_state_t, .state={text, 0, 0}, .i=0, .pattern=pattern),
.userdata=new(split_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0, .pattern=pattern),
};
}

View File

@ -14,24 +14,7 @@
// Quote `text` so that a POSIX shell reads it as one literal word.
// The result is wrapped in single quotes, and every embedded single quote is
// rewritten as '"'"': close the single-quoted run, emit a double-quoted
// single quote, then reopen the single-quoted run.
public Shell_t Shell$escape_text(Text_t text)
{
    // The empty backref pattern and recursive=false are intended to make the
    // replacement text be inserted verbatim -- NOTE(review): confirm against
    // Text$replace's handling of an empty backref pattern.
    Text_t escaped = Text$replace(text, Text("'"), Text("'\"'\"'"), Text(""), false);
    // The surrounding quotes are required: without them unquoted input passes
    // through unescaped and the '"'"' sequences are left unbalanced.
    return Texts(Text("'"), escaped, Text("'"));
}
public Shell_t Shell$escape_text_array(Array_t texts)

View File

@ -500,12 +500,12 @@ public void end_test(const void *expr, const TypeInfo_t *type, const char *expec
if (expected && expected[0]) {
Text_t expected_text = Text$from_str(expected);
Text_t expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_text;
bool success = Text$equal(&expr_plain, &expected_text, &Text$info);
bool success = Text$equal_values(expr_plain, expected_text);
if (!success) {
OptionalMatch_t colon = Text$find(expected_text, Text(":"), I_small(1));
if (colon.index.small) {
Text_t with_type = Text$concat(expr_plain, Text(" : "), type_name);
success = Text$equal(&with_type, &expected_text, &Text$info);
success = Text$equal_values(with_type, expected_text);
}
}
@ -594,7 +594,7 @@ public bool pop_flag(char **argv, int *i, const char *flag, Text_t *result)
if (argv[*i][0] != '-' || argv[*i][1] != '-') {
return false;
} else if (streq(argv[*i] + 2, flag)) {
*result = (Text_t){.length=0};
*result = EMPTY_TEXT;
argv[*i] = NULL;
*i += 1;
return true;

File diff suppressed because it is too large Load Diff

View File

@ -13,18 +13,24 @@
#include "types.h"
#include "util.h"
#define MAX_TEXT_DEPTH 48
typedef struct {
Text_t text;
int64_t subtext, sum_of_previous_subtexts;
struct {
Text_t text;
int64_t offset;
} stack[MAX_TEXT_DEPTH];
int64_t stack_index;
} TextIter_t;
#define NEW_TEXT_ITER_STATE(t) (TextIter_t){.stack={{t, 0}}, .stack_index=0}
int printf_text(FILE *stream, const struct printf_info *info, const void *const args[]);
int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n]);
#define Text(str) ((Text_t){.length=sizeof(str)-1, .tag=TEXT_ASCII, .ascii="" str})
int Text$print(FILE *stream, Text_t t);
void Text$visualize(Text_t t);
Text_t Text$_concat(int n, Text_t items[n]);
#define Text$concat(...) Text$_concat(sizeof((Text_t[]){__VA_ARGS__})/sizeof(Text_t), (Text_t[]){__VA_ARGS__})
#define Texts(...) Text$concat(__VA_ARGS__)
@ -69,11 +75,12 @@ void Text$deserialize(FILE *in, void *out, Array_t *, const TypeInfo_t *);
// Convenience lookup for callers that do not keep a TextIter_t around:
// builds a fresh iterator state for `text` and passes `index` through
// unchanged to Text$get_grapheme_fast, returning its result.
MACROLIKE int32_t Text$get_grapheme(Text_t text, int64_t index)
{
TextIter_t state = {text, 0, 0};
TextIter_t state = NEW_TEXT_ITER_STATE(text);
return Text$get_grapheme_fast(&state, index);
}
extern const TypeInfo_t Text$info;
extern Text_t EMPTY_TEXT;
#define Text$metamethods ((metamethods_t){ \
.as_text=Text$as_text, \