diff --git a/compile.c b/compile.c index e6a22b0..ca89062 100644 --- a/compile.c +++ b/compile.c @@ -34,6 +34,7 @@ static CORD compile_none(type_t *t); static CORD compile_to_type(env_t *env, ast_t *ast, type_t *t); static CORD check_none(type_t *t, CORD value); static CORD optional_into_nonnone(type_t *t, CORD value); +static CORD compile_string_literal(CORD literal); CORD promote_to_optional(type_t *t, CORD code) { @@ -569,7 +570,7 @@ CORD compile_statement(env_t *env, ast_t *ast) const uint8_t *raw = (const uint8_t*)CORD_to_const_char_star(test->output); uint8_t buf[128] = {0}; size_t norm_len = sizeof(buf); - uint8_t *norm = u8_normalize(UNINORM_NFD, (uint8_t*)raw, strlen((char*)raw)+1, buf, &norm_len); + uint8_t *norm = u8_normalize(UNINORM_NFC, (uint8_t*)raw, strlen((char*)raw)+1, buf, &norm_len); assert(norm[norm_len-1] == 0); output = CORD_from_char_star((char*)norm); if (norm && norm != buf) free(norm); @@ -579,7 +580,7 @@ CORD compile_statement(env_t *env, ast_t *ast) auto decl = Match(test->expr, Declare); const char *varname = Match(decl->var, Var)->name; if (streq(varname, "_")) - return compile_statement(env, WrapAST(ast, DocTest, .expr=decl->value, .output=test->output, .skip_source=test->skip_source)); + return compile_statement(env, WrapAST(ast, DocTest, .expr=decl->value, .output=output, .skip_source=test->skip_source)); CORD var = CORD_all("$", Match(decl->var, Var)->name); type_t *t = get_type(env, decl->value); CORD val_code = compile_maybe_incref(env, decl->value, t); @@ -593,7 +594,7 @@ CORD compile_statement(env_t *env, ast_t *ast) compile_declaration(t, var), var, val_code, compile_type_info(env, get_type(env, decl->value)), - CORD_quoted(output), + compile_string_literal(output), (int64_t)(test->expr->start - test->expr->file->text), (int64_t)(test->expr->end - test->expr->file->text)); } else if (test->expr->tag == Assign) { @@ -612,12 +613,12 @@ CORD compile_statement(env_t *env, ast_t *ast) "test((%r), %r, %r, %ld, %ld);", 
compile_assignment(env, assign->targets->ast, value), compile_type_info(env, lhs_t), - CORD_quoted(test->output), + compile_string_literal(output), (int64_t)(test->expr->start - test->expr->file->text), (int64_t)(test->expr->end - test->expr->file->text)); } else { // Multi-assign or assignment to potentially non-idempotent targets - if (test->output && assign->targets->next) + if (output && assign->targets->next) code_err(ast, "Sorry, but doctesting with '=' is not supported for multi-assignments"); CORD code = "test(({ // Assignment\n"; @@ -643,7 +644,7 @@ CORD compile_statement(env_t *env, ast_t *ast) CORD_appendf(&code, "$1; }), %r, %r, %ld, %ld);", compile_type_info(env, first_type), - CORD_quoted(test->output), + compile_string_literal(output), (int64_t)(test->expr->start - test->expr->file->text), (int64_t)(test->expr->end - test->expr->file->text)); return code; @@ -667,7 +668,7 @@ CORD compile_statement(env_t *env, ast_t *ast) compile_lvalue(env, update->lhs), compile_statement(env, update_var), compile_type_info(env, lhs_t), - CORD_quoted(test->output), + compile_string_literal(output), (int64_t)(test->expr->start - test->expr->file->text), (int64_t)(test->expr->end - test->expr->file->text)); } else if (expr_t->tag == VoidType || expr_t->tag == AbortType || expr_t->tag == ReturnType) { @@ -681,7 +682,7 @@ CORD compile_statement(env_t *env, ast_t *ast) "test(%r, %r, %r, %ld, %ld);", compile(env, test->expr), compile_type_info(env, expr_t), - CORD_quoted(output), + compile_string_literal(output), (int64_t)(test->expr->start - test->expr->file->text), (int64_t)(test->expr->end - test->expr->file->text)); } @@ -1953,7 +1954,7 @@ CORD compile_math_method(env_t *env, binop_e op, ast_t *lhs, ast_t *rhs, type_t return CORD_EMPTY; } -static CORD compile_string_literal(CORD literal) +CORD compile_string_literal(CORD literal) { CORD code = "\""; CORD_pos i; diff --git a/docs/text.md b/docs/text.md index 0384aa4..c8a105d 100644 --- a/docs/text.md +++ b/docs/text.md 
@@ -9,12 +9,22 @@ etc.). ## Implementation -Internally, Tomo text's implementation is based on [Raku's -strings](https://docs.raku.org/language/unicode). Strings store their grapheme -cluster count and either a compact array of 8-bit ASCII characters (for ASCII -text), an array of 32-bit normal-form grapheme cluster values (see below), or a -flat subarray of multiple texts that are either ASCII or graphemes (the -structure is not arbitrarily nested). +Internally, Tomo text's implementation is based on [Raku/MoarVM's +strings](https://docs.raku.org/language/unicode) and [Boehm et al.'s +Cords](https://www.cs.tufts.edu/comp/150FP/archive/hans-boehm/ropes.pdf). +Strings store their grapheme cluster count and either a compact array of 8-bit +ASCII characters (for ASCII text), an array of 32-bit normal-form grapheme +cluster values (see below), or a (roughly) balanced binary tree concatenation +of two texts. The upside is that each concatenation is typically a +constant-time operation that only occasionally requires a small rebalancing +step. Index-based text operations (like retrieving an arbitrary index or +slicing) are very fast: typically a constant-time operation for arbitrary +Unicode text, but in the worst-case scenario (text built from many +concatenations), `O(log(n))` time with small constant factors, typically +amounting to only a handful of steps. Since concatenations use shared +substructures, they are very memory-efficient and are well suited to +applications like implementing a text editor that stores a full edit history of +a large file's contents.
### Normal-Form Graphemes diff --git a/stdlib/bytes.c b/stdlib/bytes.c index 6c94c05..1e889f6 100644 --- a/stdlib/bytes.c +++ b/stdlib/bytes.c @@ -17,15 +17,16 @@ PUREFUNC public Text_t Byte$as_text(const void *b, bool colorize, const TypeInfo } public Text_t Byte$hex(Byte_t byte, bool uppercase, bool prefix) { - Text_t text = {.tag=TEXT_SHORT_ASCII}; + struct Text_s text = {.tag=TEXT_ASCII}; + text.ascii = GC_MALLOC_ATOMIC(8); if (prefix && uppercase) - text.length = (int64_t)snprintf(text.short_ascii, sizeof(text.short_ascii), "0x%02X", byte); + text.length = (int64_t)snprintf((char*)text.ascii, 8, "0x%02X", byte); else if (prefix && !uppercase) - text.length = (int64_t)snprintf(text.short_ascii, sizeof(text.short_ascii), "0x%02x", byte); + text.length = (int64_t)snprintf((char*)text.ascii, 8, "0x%02x", byte); else if (!prefix && uppercase) - text.length = (int64_t)snprintf(text.short_ascii, sizeof(text.short_ascii), "%02X", byte); + text.length = (int64_t)snprintf((char*)text.ascii, 8, "%02X", byte); else if (!prefix && !uppercase) - text.length = (int64_t)snprintf(text.short_ascii, sizeof(text.short_ascii), "%02x", byte); + text.length = (int64_t)snprintf((char*)text.ascii, 8, "%02x", byte); return text; } diff --git a/stdlib/datatypes.h b/stdlib/datatypes.h index 11ae130..80591ad 100644 --- a/stdlib/datatypes.h +++ b/stdlib/datatypes.h @@ -66,18 +66,24 @@ typedef struct Range_s { Int_t first, last, step; } Range_t; -enum text_type { TEXT_SHORT_ASCII, TEXT_ASCII, TEXT_SHORT_GRAPHEMES, TEXT_GRAPHEMES, TEXT_SUBTEXT }; +enum text_type { TEXT_ASCII, TEXT_GRAPHEMES, TEXT_CONCAT }; typedef struct Text_s { - int64_t length; // Number of grapheme clusters - uint64_t hash:61; - uint8_t tag:3; + int64_t length:54; // Number of grapheme clusters + uint8_t depth:8; + uint8_t tag:2; union { - char short_ascii[8]; - const char *ascii; - int32_t short_graphemes[2]; - const int32_t *graphemes; - struct Text_s *subtexts; + struct { + const char *ascii; + // char ascii_buf[8]; + 
}; + struct { + const int32_t *graphemes; + // int32_t grapheme_buf[2]; + }; + struct { + const struct Text_s *left, *right; + }; }; } Text_t; diff --git a/stdlib/optionals.c b/stdlib/optionals.c index ccda980..990ca13 100644 --- a/stdlib/optionals.c +++ b/stdlib/optionals.c @@ -74,7 +74,7 @@ public void Optional$deserialize(FILE *in, void *outval, Array_t *pointers, cons _deserialize(in, outval, pointers, nonnull); } else { if (nonnull->tag == TextInfo) - *(Text_t*)outval = (Text_t){.length=-1}; + *(Text_t*)outval = NONE_TEXT; else if (nonnull->tag == ArrayInfo) *(Array_t*)outval = (Array_t){.length=-1}; else if (nonnull->tag == TableInfo) diff --git a/stdlib/patterns.c b/stdlib/patterns.c index 48f43ae..bee8476 100644 --- a/stdlib/patterns.c +++ b/stdlib/patterns.c @@ -36,7 +36,7 @@ typedef struct { static INLINE void skip_whitespace(TextIter_t *state, int64_t *i) { - while (*i < state->text.length) { + while (*i < state->stack[0].text.length) { int32_t grapheme = Text$get_grapheme_fast(state, *i); if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme)) return; @@ -46,7 +46,7 @@ static INLINE void skip_whitespace(TextIter_t *state, int64_t *i) static INLINE bool match_grapheme(TextIter_t *state, int64_t *i, int32_t grapheme) { - if (*i < state->text.length && Text$get_grapheme_fast(state, *i) == grapheme) { + if (*i < state->stack[0].text.length && Text$get_grapheme_fast(state, *i) == grapheme) { *i += 1; return true; } @@ -57,7 +57,7 @@ static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str) { int64_t matched = 0; while (matched[str]) { - if (*i + matched >= state->text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched]) + if (*i + matched >= state->stack[0].text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched]) return false; matched += 1; } @@ -67,7 +67,7 @@ static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str) static INLINE bool match_property(TextIter_t *state, 
int64_t *i, uc_property_t prop) { - if (*i >= state->text.length) return false; + if (*i >= state->stack[0].text.length) return false; uint32_t grapheme = Text$get_main_grapheme_fast(state, *i); // TODO: check every codepoint in the cluster? if (uc_is_property(grapheme, prop)) { @@ -95,7 +95,7 @@ static const char *get_property_name(TextIter_t *state, int64_t *i) skip_whitespace(state, i); char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); char *dest = name; - while (*i < state->text.length) { + while (*i < state->stack[0].text.length) { int32_t grapheme = Text$get_grapheme_fast(state, *i); if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) { *dest = (char)grapheme; @@ -406,10 +406,10 @@ static int64_t match_num(TextIter_t *state, int64_t index) static int64_t match_newline(TextIter_t *state, int64_t index) { - if (index >= state->text.length) + if (index >= state->stack[0].text.length) return -1; - uint32_t grapheme = index >= state->text.length ? 0 : Text$get_main_grapheme_fast(state, index); + uint32_t grapheme = index >= state->stack[0].text.length ? 0 : Text$get_main_grapheme_fast(state, index); if (grapheme == '\n') return 1; if (grapheme == '\r' && Text$get_grapheme_fast(state, index + 1) == '\n') @@ -419,7 +419,7 @@ static int64_t match_newline(TextIter_t *state, int64_t index) static int64_t match_pat(TextIter_t *state, int64_t index, pat_t pat) { - Text_t text = state->text; + Text_t text = state->stack[0].text; int32_t grapheme = index >= text.length ? 
0 : Text$get_grapheme_fast(state, index); switch (pat.tag) { @@ -516,7 +516,7 @@ static pat_t parse_next_pat(TextIter_t *state, int64_t *index) int32_t close = open; uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); if (!match_grapheme(state, index, close)) - fail("Pattern's closing quote is missing: %k", &state->text); + fail("Pattern's closing quote is missing: %k", &state->stack[0].text); return (pat_t){ .tag=PAT_QUOTE, @@ -531,7 +531,7 @@ static pat_t parse_next_pat(TextIter_t *state, int64_t *index) int32_t close = open; uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); if (!match_grapheme(state, index, close)) - fail("Pattern's closing brace is missing: %k", &state->text); + fail("Pattern's closing brace is missing: %k", &state->stack[0].text); return (pat_t){ .tag=PAT_PAIR, @@ -571,19 +571,19 @@ static pat_t parse_next_pat(TextIter_t *state, int64_t *index) skip_whitespace(state, index); int32_t grapheme = Text$get_grapheme_fast(state, (*index)++); if (!match_grapheme(state, index, '}')) - fail("Missing closing '}' in pattern: %k", &state->text); + fail("Missing closing '}' in pattern: %k", &state->stack[0].text); return PAT(PAT_GRAPHEME, .grapheme=grapheme); } else if (strlen(prop_name) == 1) { // Single letter names: {1+ A} skip_whitespace(state, index); if (!match_grapheme(state, index, '}')) - fail("Missing closing '}' in pattern: %k", &state->text); + fail("Missing closing '}' in pattern: %k", &state->stack[0].text); return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]); } skip_whitespace(state, index); if (!match_grapheme(state, index, '}')) - fail("Missing closing '}' in pattern: %k", &state->text); + fail("Missing closing '}' in pattern: %k", &state->stack[0].text); switch (tolower(prop_name[0])) { case '.': @@ -677,7 +677,7 @@ static int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t return 0; int64_t start_index = text_index; - TextIter_t pattern_state = {pattern, 0, 0}, text_state = {text, 0, 0}; + TextIter_t pattern_state = 
NEW_TEXT_ITER_STATE(pattern), text_state = NEW_TEXT_ITER_STATE(text); pat_t pat = parse_next_pat(&pattern_state, &pattern_index); if (pat.min == -1 && pat.max == -1) { @@ -778,7 +778,7 @@ static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - TextIter_t text_state = {text, 0, 0}; + TextIter_t text_state = NEW_TEXT_ITER_STATE(text); for (int64_t i = first; i <= last; i++) { // Optimization: quickly skip ahead to first char in pattern: if (find_first) { @@ -881,12 +881,12 @@ typedef struct { static OptionalMatch_t next_match(match_iter_state_t *state) { - if (Int_to_Int64(state->i, false) > state->state.text.length) + if (Int_to_Int64(state->i, false) > state->state.stack[0].text.length) return NONE_MATCH; - OptionalMatch_t m = Text$find(state->state.text, state->pattern, state->i); + OptionalMatch_t m = Text$find(state->state.stack[0].text, state->pattern, state->i); if (m.index.small == 0) // No match - state->i = I(state->state.text.length + 1); + state->i = I(state->state.stack[0].text.length + 1); else state->i = Int$plus(m.index, I(MAX(1, m.text.length))); return m; @@ -896,7 +896,7 @@ public Closure_t Text$by_match(Text_t text, Pattern_t pattern) { return (Closure_t){ .fn=(void*)next_match, - .userdata=new(match_iter_state_t, .state={text, 0, 0}, .i=I_small(1), .pattern=pattern), + .userdata=new(match_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=I_small(1), .pattern=pattern), }; } @@ -911,7 +911,7 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); Text_t ret = Text(""); - TextIter_t replacement_state = {replacement, 0, 0}; + TextIter_t replacement_state = NEW_TEXT_ITER_STATE(replacement); int64_t nonmatching_pos = 0; for (int64_t pos = 0; pos < replacement.length; ) { // 
Optimization: quickly skip ahead to first char in the backref pattern: @@ -965,14 +965,14 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t backref_pat, bool recursive) { - Text_t ret = {.length=0}; + Text_t ret = EMPTY_TEXT; int32_t first_grapheme = Text$get_grapheme(pattern, 0); bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - TextIter_t text_state = {text, 0, 0}; + TextIter_t text_state = NEW_TEXT_ITER_STATE(text); int64_t nonmatching_pos = 0; for (int64_t pos = 0; pos < text.length; ) { // Optimization: quickly skip ahead to first char in pattern: @@ -1030,14 +1030,14 @@ public Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool tri public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn) { - Text_t ret = {.length=0}; + Text_t ret = EMPTY_TEXT; int32_t first_grapheme = Text$get_grapheme(pattern, 0); bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - TextIter_t text_state = {text, 0, 0}; + TextIter_t text_state = NEW_TEXT_ITER_STATE(text); int64_t nonmatching_pos = 0; Text_t (*text_mapper)(Match_t, void*) = fn.fn; @@ -1086,7 +1086,7 @@ public void Text$each(Text_t text, Pattern_t pattern, Closure_t fn) && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - TextIter_t text_state = {text, 0, 0}; + TextIter_t text_state = NEW_TEXT_ITER_STATE(text); void (*action)(Match_t, void*) = fn.fn; for (int64_t pos = 0; pos < text.length; pos++) { // Optimization: quickly skip ahead to first char in pattern: @@ -1118,7 +1118,7 @@ public Text_t 
Text$replace_all(Text_t text, Table_t replacements, Text_t backref { if (replacements.entries.length == 0) return text; - Text_t ret = {.length=0}; + Text_t ret = EMPTY_TEXT; int64_t nonmatch_pos = 0; for (int64_t pos = 0; pos < text.length; ) { @@ -1194,11 +1194,11 @@ typedef struct { static OptionalText_t next_split(split_iter_state_t *state) { - Text_t text = state->state.text; + Text_t text = state->state.stack[0].text; if (state->i >= text.length) { if (state->pattern.length > 0 && state->i == text.length) { // special case state->i = text.length + 1; - return (Text_t){.length=0}; + return EMPTY_TEXT; } return NONE_TEXT; } @@ -1220,7 +1220,7 @@ static OptionalText_t next_split(split_iter_state_t *state) state->i = MAX(found + len, state->i + 1); return Text$slice(text, I(start+1), I(found)); } else { - state->i = state->state.text.length + 1; + state->i = state->state.stack[0].text.length + 1; return Text$slice(text, I(start+1), I(text.length)); } } @@ -1229,7 +1229,7 @@ public Closure_t Text$by_split(Text_t text, Pattern_t pattern) { return (Closure_t){ .fn=(void*)next_split, - .userdata=new(split_iter_state_t, .state={text, 0, 0}, .i=0, .pattern=pattern), + .userdata=new(split_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0, .pattern=pattern), }; } diff --git a/stdlib/shell.c b/stdlib/shell.c index 4a48f5c..694d155 100644 --- a/stdlib/shell.c +++ b/stdlib/shell.c @@ -14,24 +14,7 @@ public Shell_t Shell$escape_text(Text_t text) { - // TODO: optimize for ASCII and short strings - Array_t shell_graphemes = {.atomic=1}; -#define add_char(c) Array$insert(&shell_graphemes, (uint32_t[1]){c}, I_small(0), sizeof(uint32_t)) - add_char('\''); - const char *text_utf8 = Text$as_c_string(text); - for (const char *p = text_utf8; *p; p++) { - if (*p == '\'') { - add_char('\''); - add_char('"'); - add_char('\''); - add_char('"'); - add_char('\''); - } else - add_char((uint8_t)*p); - } - add_char('\''); -#undef add_char - return (Text_t){.length=shell_graphemes.length, 
.tag=TEXT_GRAPHEMES, .graphemes=shell_graphemes.data}; + return Text$replace(text, Text("'"), Text("'\"'\"'"), Text(""), false); } public Shell_t Shell$escape_text_array(Array_t texts) diff --git a/stdlib/stdlib.c b/stdlib/stdlib.c index cb9d221..bab9790 100644 --- a/stdlib/stdlib.c +++ b/stdlib/stdlib.c @@ -500,12 +500,12 @@ public void end_test(const void *expr, const TypeInfo_t *type, const char *expec if (expected && expected[0]) { Text_t expected_text = Text$from_str(expected); Text_t expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_text; - bool success = Text$equal(&expr_plain, &expected_text, &Text$info); + bool success = Text$equal_values(expr_plain, expected_text); if (!success) { OptionalMatch_t colon = Text$find(expected_text, Text(":"), I_small(1)); if (colon.index.small) { Text_t with_type = Text$concat(expr_plain, Text(" : "), type_name); - success = Text$equal(&with_type, &expected_text, &Text$info); + success = Text$equal_values(with_type, expected_text); } } @@ -594,7 +594,7 @@ public bool pop_flag(char **argv, int *i, const char *flag, Text_t *result) if (argv[*i][0] != '-' || argv[*i][1] != '-') { return false; } else if (streq(argv[*i] + 2, flag)) { - *result = (Text_t){.length=0}; + *result = EMPTY_TEXT; argv[*i] = NULL; *i += 1; return true; diff --git a/stdlib/text.c b/stdlib/text.c index e25e7bc..96ea02d 100644 --- a/stdlib/text.c +++ b/stdlib/text.c @@ -1,9 +1,15 @@ -// Type info and methods for Text datatype, which uses libunistr for Unicode -// support and implements a datastructure based on Raku/MoarVM's strings to -// efficiently store arbitrary unicode data using a mix of densely packed plain -// ASCII, 32-bit integers representing grapheme clusters (see below), and ropes -// that represent text that is a composite of multiple subtexts. Subtexts are -// only nested one level deep, not arbitrarily deep trees. 
+// This file defines type info and methods for the Text datatype, which uses +// libunistring for Unicode support and implements a datastructure based on a +// hybrid of Raku/MoarVM's space-efficient grapheme cluster representation of +// strings and Cords (Boehm et al.), which have good runtime performance for +// text constructed by a series of many concatenations. +// +// For more information on MoarVM's grapheme cluster strings, see: +// https://docs.raku.org/language/unicode +// https://github.com/MoarVM/MoarVM/blob/main/docs/strings.asciidoc +// For more information on Cords, see the paper "Ropes: an Alternative to +// Strings" (Boehm, Atkinson, Plass 1995): +// https://www.cs.tufts.edu/comp/150FP/archive/hans-boehm/ropes.pdf // // A note on grapheme clusters: In Unicode, codepoints can be represented using // a 32-bit integer. Most codepoints correspond to the intuitive notion of a @@ -20,8 +26,9 @@ // that is not used frequently enough to warrant its own unique codepoint (this // is basically what Zalgo text is). // -// There are a lot of benefits to storing text with one grapheme cluster per -// index in a densely packed array. It lets us have one canonical length for +// There are a lot of benefits to storing Unicode text with one grapheme +// cluster per index in a densely packed array instead of storing the text as +// variable-width UTF-8-encoded bytes. It lets us have one canonical length for // the text that can be precomputed and is meaningful to users. It lets us // quickly get the Nth "letter" in the text. Substring slicing is fast. // However, since not all grapheme clusters take up the same number of @@ -38,14 +45,12 @@ // out nicely because we're using them right now, and we'll give them a // negative number so it doesn't overlap with any real codepoints.
// -// Example 1: U+0048, U+00E9 -// AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E WITH ACUTE -// This would be stored as: (int32_t[]){0x48, 0xE9} -// Example 2: U+0048, U+0065, U+0309 -// AKA: LATIN CAPITAL LETTER H, LATIN SMALL LETTER E, COMBINING VERTICAL LINE BELOW -// This would be stored as: (int32_t[]){0x48, -2} -// Where -2 is used as a lookup in an array that holds the actual unicode codepoints: -// (ucs4_t[]){0x65, 0x0309} +// Example 1: U+0048, U+00E9 (LATIN CAPITAL LETTER H, LATIN SMALL LETTER E +// WITH ACUTE) would be stored as: (int32_t[]){0x48, 0xE9} +// Example 2: U+0048, U+0065, U+0309 (LATIN CAPITAL LETTER H, LATIN SMALL +// LETTER E, COMBINING HOOK ABOVE) would be stored as: (int32_t[]){0x48, -2}, +// where -2 is used as a lookup in an array that holds the actual Unicode +// codepoints: (ucs4_t[]){0x65, 0x0309} #include #include @@ -90,7 +95,20 @@ static int32_t num_synthetic_graphemes = 0; #define GRAPHEME_CODEPOINTS(id) (&synthetic_graphemes[-(id)-1].utf32_cluster[1]) #define GRAPHEME_UTF8(id) (synthetic_graphemes[-(id)-1].utf8) +// Somewhat arbitrarily chosen: if two short literal ASCII or grapheme chunks +// are concatenated below this length threshold, we just merge them into a +// single literal node instead of a concatenation node.
+#define SHORT_ASCII_LENGTH 64 +#define SHORT_GRAPHEMES_LENGTH 16 + static Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize); +static Text_t simple_concatenation(Text_t a, Text_t b); + +public Text_t EMPTY_TEXT = { + .length=0, + .tag=TEXT_ASCII, + .ascii=0, +}; PUREFUNC static bool graphemes_equal(const void *va, const void *vb, const TypeInfo_t*) { ucs4_t *a = *(ucs4_t**)va; @@ -138,7 +156,7 @@ public int32_t get_synthetic_grapheme(const ucs4_t *codepoints, int64_t utf32_le if (num_synthetic_graphemes >= synthetic_grapheme_capacity) { // If we don't have space, allocate more: synthetic_grapheme_capacity = MAX(128, synthetic_grapheme_capacity * 2); - synthetic_grapheme_t *new = GC_MALLOC(sizeof(synthetic_grapheme_t[synthetic_grapheme_capacity])); + synthetic_grapheme_t *new = GC_MALLOC_ATOMIC(sizeof(synthetic_grapheme_t[synthetic_grapheme_capacity])); memcpy(new, synthetic_graphemes, sizeof(synthetic_grapheme_t[num_synthetic_graphemes])); synthetic_graphemes = new; } @@ -203,39 +221,29 @@ public int32_t get_synthetic_grapheme(const ucs4_t *codepoints, int64_t utf32_le } #pragma GCC diagnostic pop -PUREFUNC static inline int64_t num_subtexts(Text_t t) -{ - if (t.tag != TEXT_SUBTEXT) return 1; - int64_t remaining = t.length; - int64_t subtexts = 0; - while (remaining > 0) { - remaining -= t.subtexts[subtexts].length; - ++subtexts; - } - return subtexts; -} - -int text_visualize(FILE *stream, Text_t t) +int text_visualize(FILE *stream, Text_t t, int depth) { switch (t.tag) { - case TEXT_SHORT_ASCII: return fprintf(stream, "%.*s", t.length, t.length, t.short_ascii); case TEXT_ASCII: return fprintf(stream, "%.*s", t.length, t.length, t.ascii); - case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: { + case TEXT_GRAPHEMES: { int printed = fprintf(stream, "", t.length); printed += Text$print(stream, t); printed += fprintf(stream, ""); return printed; } - case TEXT_SUBTEXT: { - int printed = fprintf(stream, "", t.length); - int64_t to_print = 
t.length; - for (int i = 0; to_print > 0; ++i) { - printed += fprintf(stream, "\n "); - printed += text_visualize(stream, t.subtexts[i]); - to_print -= t.subtexts[i].length; - if (t.subtexts[i].length == 0) break; - } - printed += fprintf(stream, "\n"); + case TEXT_CONCAT: { + int printed = fprintf(stream, "\n", t.depth, t.length); + for (int i = 0; i < depth+1; i++) + printed += fputc(' ', stream); + printed += text_visualize(stream, *t.left, depth+1); + printed += fputc('\n', stream); + for (int i = 0; i < depth+1; i++) + printed += fputc(' ', stream); + printed += text_visualize(stream, *t.right, depth+1); + printed += fputc('\n', stream); + for (int i = 0; i < depth; i++) + printed += fputc(' ', stream); + printed += fprintf(stream, ""); return printed; } default: return 0; @@ -247,10 +255,9 @@ public int Text$print(FILE *stream, Text_t t) if (t.length == 0) return 0; switch (t.tag) { - case TEXT_SHORT_ASCII: return fwrite(t.short_ascii, sizeof(char), (size_t)t.length, stream); case TEXT_ASCII: return fwrite(t.ascii, sizeof(char), (size_t)t.length, stream); - case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: { - const int32_t *graphemes = t.tag == TEXT_SHORT_GRAPHEMES ? 
t.short_graphemes : t.graphemes; + case TEXT_GRAPHEMES: { + const int32_t *graphemes = t.graphemes; int written = 0; for (int64_t i = 0; i < t.length; i++) { int32_t grapheme = graphemes[i]; @@ -268,12 +275,9 @@ public int Text$print(FILE *stream, Text_t t) } return written; } - case TEXT_SUBTEXT: { - int written = 0; - int i = 0; - for (int64_t to_print = t.length; to_print > 0; to_print -= t.subtexts[i].length, ++i) - written += Text$print(stream, t.subtexts[i]); - return written; + case TEXT_CONCAT: { + return (Text$print(stream, *t.left) + + Text$print(stream, *t.right)); } default: return 0; } @@ -314,52 +318,140 @@ static bool is_concat_stable(Text_t a, Text_t b) return (second_grapheme == &normalized[1]); } +static const int64_t min_len_for_depth[MAX_TEXT_DEPTH] = { + // Fibonacci numbers (skipping first two) + 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, + 17711, 28657, 46368, 75025, 121393, 196418, 317811, 514229, 832040, 1346269, 2178309, 3524578, + 5702887, 9227465, 14930352, 24157817, 39088169, 63245986, 102334155, 165580141, 267914296, + 433494437, 701408733, 1134903170, 1836311903, 2971215073, 4807526976, 7778742049, +}; + +#define IS_BALANCED_TEXT(t) ((t).length >= min_len_for_depth[(t).depth]) + +static void insert_balanced(Text_t balanced_texts[MAX_TEXT_DEPTH], Text_t to_insert) +{ + int i = 0; + Text_t accumulator = EMPTY_TEXT; + for (; to_insert.length > min_len_for_depth[i + 1]; i++) { + if (balanced_texts[i].length) { + accumulator = simple_concatenation(balanced_texts[i], accumulator); + balanced_texts[i] = EMPTY_TEXT; + } + } + + accumulator = simple_concatenation(accumulator, to_insert); + + while (accumulator.length >= min_len_for_depth[i]) { + if (balanced_texts[i].length) { + accumulator = simple_concatenation(balanced_texts[i], accumulator); + balanced_texts[i] = EMPTY_TEXT; + } + i++; + } + i--; + balanced_texts[i] = accumulator; +} + +static void insert_balanced_recursive(Text_t 
balanced_texts[MAX_TEXT_DEPTH], Text_t text) +{ + if (text.tag == TEXT_CONCAT && (!IS_BALANCED_TEXT(text) || text.depth >= MAX_TEXT_DEPTH)) { + insert_balanced_recursive(balanced_texts, *text.left); + insert_balanced_recursive(balanced_texts, *text.right); + } else { + insert_balanced(balanced_texts, text); + } +} + +static Text_t rebalanced(Text_t a, Text_t b) +{ + Text_t balanced_texts[MAX_TEXT_DEPTH] = {}; + insert_balanced_recursive(balanced_texts, a); + insert_balanced_recursive(balanced_texts, b); + + Text_t ret = EMPTY_TEXT; + for (int i = 0; ret.length < a.length + b.length; i++) { + if (balanced_texts[i].length) + ret = simple_concatenation(balanced_texts[i], ret); + } + return ret; +} + +Text_t simple_concatenation(Text_t a, Text_t b) +{ + if (a.length == 0) return b; + if (b.length == 0) return a; + + uint16_t new_depth = 1 + MAX(a.depth, b.depth); + // Rebalance only if depth exceeds the maximum allowed. We don't require + // every concatenation to yield a balanced text, since many concatenations + // are ephemeral (e.g. doing a loop repeatedly concatenating without using + // the intermediary values). 
+ if (new_depth >= MAX_TEXT_DEPTH) + return rebalanced(a, b); + + Text_t *children = GC_MALLOC(sizeof(Text_t[2])); + children[0] = a; + children[1] = b; + return (Text_t){ + .tag=TEXT_CONCAT, + .length=a.length + b.length, + .depth=new_depth, + .left=&children[0], + .right=&children[1], + }; +} + static Text_t concat2_assuming_safe(Text_t a, Text_t b) { if (a.length == 0) return b; if (b.length == 0) return a; - if (a.tag == TEXT_SUBTEXT && b.tag == TEXT_SUBTEXT) { - int64_t na = num_subtexts(a); - int64_t nb = num_subtexts(b); - Text_t ret = { + if (a.tag == TEXT_ASCII && b.tag == TEXT_ASCII && (size_t)(a.length + b.length) <= SHORT_ASCII_LENGTH) { + struct Text_s ret = { + .tag=TEXT_ASCII, .length=a.length + b.length, - .tag=TEXT_SUBTEXT, - .subtexts=GC_MALLOC(sizeof(Text_t[na + nb])), }; - memcpy(&ret.subtexts[0], a.subtexts, sizeof(Text_t[na])); - memcpy(&ret.subtexts[na], b.subtexts, sizeof(Text_t[nb])); + ret.ascii = GC_MALLOC_ATOMIC(sizeof(char[ret.length])); + memcpy((char*)ret.ascii, a.ascii, sizeof(char[a.length])); + memcpy((char*)&ret.ascii[a.length], b.ascii, sizeof(char[b.length])); return ret; - } else if (a.tag == TEXT_SUBTEXT) { - int64_t n = num_subtexts(a); - Text_t ret = { + } else if (a.tag == TEXT_GRAPHEMES && b.tag == TEXT_GRAPHEMES && (size_t)(a.length + b.length) <= SHORT_GRAPHEMES_LENGTH) { + struct Text_s ret = { + .tag=TEXT_GRAPHEMES, .length=a.length + b.length, - .tag=TEXT_SUBTEXT, - .subtexts=GC_MALLOC(sizeof(Text_t[n + 1])), }; - memcpy(ret.subtexts, a.subtexts, sizeof(Text_t[n])); - ret.subtexts[n] = b; + ret.graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[ret.length])); + memcpy((int32_t*)ret.graphemes, a.graphemes, sizeof(int32_t[a.length])); + memcpy((int32_t*)&ret.graphemes[a.length], b.graphemes, sizeof(int32_t[b.length])); return ret; - } else if (b.tag == TEXT_SUBTEXT) { - int64_t n = num_subtexts(b); - Text_t ret = { + } else if (a.tag != TEXT_CONCAT && b.tag != TEXT_CONCAT && (size_t)(a.length + b.length) <= 
SHORT_GRAPHEMES_LENGTH) { + // Turn a small bit of ASCII into graphemes if it helps make things smaller + // Text structs come with an extra 8 bytes, so allocate enough to hold the text + struct Text_s ret = { + .tag=TEXT_GRAPHEMES, .length=a.length + b.length, - .tag=TEXT_SUBTEXT, - .subtexts=GC_MALLOC(sizeof(Text_t[n + 1])), }; - ret.subtexts[0] = a; - memcpy(&ret.subtexts[1], b.subtexts, sizeof(Text_t[n])); - return ret; - } else { - Text_t ret = { - .length=a.length + b.length, - .tag=TEXT_SUBTEXT, - .subtexts=GC_MALLOC(sizeof(Text_t[2])), - }; - ret.subtexts[0] = a; - ret.subtexts[1] = b; + ret.graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[ret.length])); + int32_t *dest = (int32_t*)ret.graphemes; + if (a.tag == TEXT_GRAPHEMES) { + dest = mempcpy(dest, a.graphemes, sizeof(int32_t[a.length])); + } else { + for (int64_t i = 0; i < a.length; i++) + *(dest++) = (int32_t)a.ascii[i]; + } + if (b.tag == TEXT_GRAPHEMES) { + memcpy(dest, b.graphemes, sizeof(int32_t[b.length])); + } else { + for (int64_t i = 0; i < b.length; i++) + *(dest++) = (int32_t)b.ascii[i]; + } return ret; } + + if (a.tag == TEXT_CONCAT && b.tag != TEXT_CONCAT && a.right->tag != TEXT_CONCAT) + return concat2_assuming_safe(*a.left, concat2_assuming_safe(*a.right, b)); + + return simple_concatenation(a, b); } static Text_t concat2(Text_t a, Text_t b) @@ -398,41 +490,12 @@ static Text_t concat2(Text_t a, Text_t b) public Text_t Text$_concat(int n, Text_t items[n]) { - if (n == 0) return (Text_t){.length=0}; - if (n == 1) return items[0]; - if (n == 2) return concat2(items[0], items[1]); + if (n == 0) return EMPTY_TEXT; - int64_t subtexts = 0; - for (int i = 0; i < n; i++) { + Text_t ret = items[0]; + for (int i = 1; i < n; i++) { if (items[i].length > 0) - subtexts += num_subtexts(items[i]); - } - - Text_t ret = { - .length=0, - .tag=TEXT_SUBTEXT, - .subtexts=GC_MALLOC(sizeof(Text_t[subtexts])), - }; - int64_t sub_i = 0; - for (int i = 0; i < n; i++) { - if (items[i].length == 0) - continue; - - if (i 
> 0 && unlikely(!is_concat_stable(ret, items[i]))) { - // Oops, guess this wasn't stable for concatenation, let's break it - // up into subtasks: - return concat2(ret, Text$_concat(n-i, &items[i])); - } - - if (items[i].tag == TEXT_SUBTEXT) { - for (int64_t j = 0, remainder = items[i].length; remainder > 0; j++) { - ret.subtexts[sub_i++] = items[i].subtexts[j]; - remainder -= items[i].subtexts[j].length; - } - } else { - ret.subtexts[sub_i++] = items[i]; - } - ret.length += items[i].length; + ret = concat2(ret, items[i]); } return ret; } @@ -440,37 +503,17 @@ public Text_t Text$_concat(int n, Text_t items[n]) public Text_t Text$repeat(Text_t text, Int_t count) { if (text.length == 0 || Int$is_negative(count)) - return Text(""); + return EMPTY_TEXT; Int_t result_len = Int$times(count, I(text.length)); if (Int$compare_value(result_len, I(1l<<40)) > 0) fail("Text repeating would produce too big of an result!"); int64_t count64 = Int_to_Int64(count, false); - if (text.tag == TEXT_SUBTEXT) { - int64_t subtexts = num_subtexts(text); - Text_t ret = { - .length=text.length * count64, - .tag=TEXT_SUBTEXT, - .subtexts=GC_MALLOC(sizeof(Text_t[subtexts * count64])), - }; - for (int64_t c = 0; c < count64; c++) { - for (int64_t i = 0; i < subtexts; i++) { - if (text.subtexts[i].length > 0) - ret.subtexts[c*subtexts + i] = text.subtexts[i]; - } - } - return ret; - } else { - Text_t ret = { - .length=text.length * count64, - .tag=TEXT_SUBTEXT, - .subtexts=GC_MALLOC(sizeof(Text_t[count64])), - }; - for (int64_t i = 0; i < count64; i++) - ret.subtexts[i] = text; - return ret; - } + Text_t ret = text; + for (int64_t c = 1; c < count64; c++) + ret = concat2(ret, text); + return ret; } public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int) @@ -478,7 +521,7 @@ public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int) int64_t first = Int_to_Int64(first_int, false); int64_t last = Int_to_Int64(last_int, false); if (first == 0) fail("Invalid index: 0"); - if 
(last == 0) return (Text_t){.length=0}; + if (last == 0) return EMPTY_TEXT; if (first < 0) first = text.length + first + 1; if (last < 0) last = text.length + last + 1; @@ -486,77 +529,38 @@ public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int) if (last > text.length) last = text.length; if (first > text.length || last < first) - return (Text_t){.length=0}; + return EMPTY_TEXT; if (first == 1 && last == text.length) return text; - switch (text.tag) { - case TEXT_SHORT_ASCII: { - Text_t ret = (Text_t) { - .tag=TEXT_SHORT_ASCII, - .length=last - first + 1, - }; - memcpy(ret.short_ascii, text.short_ascii + (first-1), (size_t)ret.length); - return ret; + while (text.tag == TEXT_CONCAT) { + if (last < text.left->length) { + text = *text.left; + } else if (first > text.left->length) { + first -= text.left->length; + last -= text.left->length; + text = *text.right; + } else { + return concat2(Text$slice(*text.left, I(first), I(text.length)), + Text$slice(*text.right, I(1), I(last-text.left->length))); + } } + + switch (text.tag) { case TEXT_ASCII: { - Text_t ret = { + return (Text_t){ .tag=TEXT_ASCII, .length=last - first + 1, .ascii=text.ascii + (first-1), }; - return ret; - } - case TEXT_SHORT_GRAPHEMES: { - assert((first == 1 && last == 1) || (first == 2 && last == 2)); - Text_t ret = { - .tag=TEXT_SHORT_GRAPHEMES, - .length=1, - .short_graphemes={text.short_graphemes[first-1]}, - }; - return ret; } case TEXT_GRAPHEMES: { - Text_t ret = { + return (Text_t){ .tag=TEXT_GRAPHEMES, .length=last - first + 1, .graphemes=text.graphemes + (first-1), }; - return ret; - } - case TEXT_SUBTEXT: { - Text_t *subtexts = text.subtexts; - while (first > subtexts[0].length) { - first -= subtexts[0].length; - last -= subtexts[0].length; - ++subtexts; - } - - int64_t needed_len = (last - first) + 1; - int64_t num_subtexts = 0; - for (int64_t included = 0; included < needed_len; ) { - if (included == 0) - included += subtexts[num_subtexts].length - first + 1; - else - 
included += subtexts[num_subtexts].length; - num_subtexts += 1; - } - if (num_subtexts == 1) - return Text$slice(subtexts[0], I(first), I(last)); - - Text_t ret = { - .length=needed_len, - .tag=TEXT_SUBTEXT, - .subtexts=GC_MALLOC(sizeof(Text_t[num_subtexts])), - }; - for (int64_t i = 0; i < num_subtexts; i++) { - ret.subtexts[i] = Text$slice(subtexts[i], I(first), I(last)); - first = 1; - needed_len -= ret.subtexts[i].length; - last = first + needed_len - 1; - } - return ret; } default: errx(1, "Invalid tag"); } @@ -575,45 +579,34 @@ public Text_t Text$to(Text_t text, Int_t last) public Text_t Text$reversed(Text_t text) { switch (text.tag) { - case TEXT_SHORT_ASCII: { - Text_t ret = text; - for (int64_t i = 0; i < text.length; i++) - ret.short_ascii[text.length-1-i] = text.short_ascii[i]; - return ret; - } case TEXT_ASCII: { - Text_t ret = text; - ret.ascii = GC_MALLOC_ATOMIC(sizeof(uint8_t[text.length])); + struct Text_s ret = { + .tag=TEXT_ASCII, + .length=text.length, + }; + ret.ascii = GC_MALLOC_ATOMIC(sizeof(char[ret.length])); for (int64_t i = 0; i < text.length; i++) ((char*)ret.ascii)[text.length-1-i] = text.ascii[i]; return ret; } - case TEXT_SHORT_GRAPHEMES: { - Text_t ret = text; - for (int64_t i = 0; i < text.length; i++) - ret.short_graphemes[text.length-1-i] = text.short_graphemes[i]; - return ret; - } case TEXT_GRAPHEMES: { - Text_t ret = text; - ret.graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[text.length])); + struct Text_s ret = { + .tag=TEXT_GRAPHEMES, + .length=text.length, + }; + ret.graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[ret.length])); for (int64_t i = 0; i < text.length; i++) ((int32_t*)ret.graphemes)[text.length-1-i] = text.graphemes[i]; return ret; } - case TEXT_SUBTEXT: { - Text_t ret = text; - int64_t n = num_subtexts(text); - ret.subtexts = GC_MALLOC(sizeof(Text_t*[n])); - for (int64_t i = 0; i < n; i++) - ret.subtexts[n-1-i] = Text$reversed(text.subtexts[i]); - return ret; + case TEXT_CONCAT: { + return 
concat2(Text$reversed(*text.right), Text$reversed(*text.left)); } default: errx(1, "Invalid tag"); } } -public Text_t Text$cluster(Text_t text, Int_t index_int) +public PUREFUNC Text_t Text$cluster(Text_t text, Int_t index_int) { int64_t index = Int_to_Int64(index_int, false); if (index == 0) fail("Invalid index: 0"); @@ -624,42 +617,31 @@ public Text_t Text$cluster(Text_t text, Int_t index_int) fail("Invalid index: %ld is beyond the length of the text (length = %ld)", Int_to_Int64(index_int, false), text.length); + while (text.tag == TEXT_CONCAT) { + if (index <= text.left->length) + text = *text.left; + else + { index -= text.left->length; text = *text.right; } /* re-base the 1-based index into the right subtree before descending */ + } + switch (text.tag) { - case TEXT_SHORT_ASCII: { - return (Text_t) { - .tag=TEXT_SHORT_ASCII, - .length=1, - .short_ascii={text.short_ascii[index-1]}, - }; - } case TEXT_ASCII: { - return (Text_t) { - .tag=TEXT_SHORT_ASCII, + struct Text_s ret = { + .tag=TEXT_ASCII, .length=1, - .short_ascii={text.ascii[index-1]}, - }; - } - case TEXT_SHORT_GRAPHEMES: { - return (Text_t) { - .tag=TEXT_SHORT_GRAPHEMES, - .length=1, - .short_graphemes={text.short_graphemes[index-1]}, + .ascii=GC_MALLOC_ATOMIC(sizeof(char)), }; + *(char*)&ret.ascii[0] = text.ascii[index-1]; + return ret; } case TEXT_GRAPHEMES: { - return (Text_t) { - .tag=TEXT_SHORT_GRAPHEMES, + struct Text_s ret = { + .tag=TEXT_GRAPHEMES, .length=1, - .short_graphemes={text.graphemes[index-1]}, + .graphemes=GC_MALLOC_ATOMIC(sizeof(int32_t)), }; - } - case TEXT_SUBTEXT: { - Text_t *subtext = text.subtexts; - while (index > subtext[0].length) { - index -= subtext[0].length; - ++subtext; - } - return Text$cluster(*subtext, I(index)); + *(int32_t*)&ret.graphemes[0] = text.graphemes[index-1]; + return ret; } default: errx(1, "Invalid tag"); } @@ -676,25 +658,18 @@ Text_t text_from_u32(ucs4_t *codepoints, int64_t num_codepoints, bool normalize) num_codepoints = (int64_t)norm_length; } - // char breaks[num_codepoints]; - // u32_grapheme_breaks(codepoints, num_codepoints, breaks); - - Text_t
ret = { + // Intentionally overallocate here: allocate assuming each codepoint is a + // grapheme cluster. If that's not true, we'll have extra space at the end + // of the array, but the length will still be calculated correctly. + int32_t *graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[num_codepoints])); + struct Text_s ret = { + .tag=TEXT_GRAPHEMES, .length=0, - .tag=TEXT_SHORT_GRAPHEMES, + .graphemes=graphemes, }; const ucs4_t *src = codepoints; - int32_t *graphemes = ret.short_graphemes; while (src < &codepoints[num_codepoints]) { - if (ret.tag == TEXT_SHORT_GRAPHEMES && ret.length + 1 > 2) { - graphemes = GC_MALLOC_ATOMIC(sizeof(int32_t[num_codepoints])); // May be a slight overallocation - graphemes[0] = ret.short_graphemes[0]; - graphemes[1] = ret.short_graphemes[1]; - ret.tag = TEXT_GRAPHEMES; - ret.graphemes = graphemes; - } - - // TODO: use grapheme breaks instead of u32_grapheme_next() + // TODO: use grapheme breaks instead of u32_grapheme_next()? const ucs4_t *next = u32_grapheme_next(src, &codepoints[num_codepoints]); if (next == &src[1]) { graphemes[ret.length] = (int32_t)*src; @@ -716,16 +691,13 @@ public OptionalText_t Text$from_strn(const char *str, size_t len) ascii_span++; if (ascii_span == (int64_t)len) { // All ASCII - Text_t ret = {.length=ascii_span}; - if (ascii_span <= 8) { - ret.tag = TEXT_SHORT_ASCII; - for (int64_t i = 0; i < ascii_span; i++) - ret.short_ascii[i] = str[i]; - } else { - ret.tag = TEXT_ASCII; - ret.ascii = str; - } - return ret; + char *copy = GC_MALLOC_ATOMIC(len); + memcpy(copy, str, len); + return (Text_t){ + .tag=TEXT_ASCII, + .length=ascii_span, + .ascii=copy, + }; } else { if (u8_check((uint8_t*)str, len) != NULL) return NONE_TEXT; @@ -748,19 +720,19 @@ public OptionalText_t Text$from_str(const char *str) static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i) { switch (text.tag) { - case TEXT_ASCII: case TEXT_SHORT_ASCII: { + case TEXT_ASCII: { if (*i + text.length > (int64_t)*capacity) { 
*capacity = *i + text.length + 1; *buf = GC_REALLOC(*buf, (size_t)*capacity); } - const char *bytes = text.tag == TEXT_ASCII ? text.ascii : text.short_ascii; + const char *bytes = text.ascii; memcpy(*buf + *i, bytes, (size_t)text.length); *i += text.length; break; } - case TEXT_GRAPHEMES: case TEXT_SHORT_GRAPHEMES: { - const int32_t *graphemes = text.tag == TEXT_GRAPHEMES ? text.graphemes : text.short_graphemes; + case TEXT_GRAPHEMES: { + const int32_t *graphemes = text.graphemes; for (int64_t g = 0; g < text.length; g++) { if (graphemes[g] >= 0) { uint8_t u8_buf[64]; @@ -789,11 +761,9 @@ static void u8_buf_append(Text_t text, char **buf, int64_t *capacity, int64_t *i } break; } - case TEXT_SUBTEXT: { - for (int64_t s = 0, remaining = text.length; remaining > 0; s++) { - u8_buf_append(text.subtexts[s], buf, capacity, i); - remaining -= text.subtexts[s].length; - } + case TEXT_CONCAT: { + u8_buf_append(*text.left, buf, capacity, i); + u8_buf_append(*text.right, buf, capacity, i); break; } default: break; @@ -817,135 +787,95 @@ public char *Text$as_c_string(Text_t text) PUREFUNC public uint64_t Text$hash(const void *obj, const TypeInfo_t*) { - Text_t *text = (Text_t*)obj; - if (text->hash != 0) return text->hash; + Text_t text = *(Text_t*)obj; siphash sh; - siphashinit(&sh, sizeof(int32_t[text->length])); + siphashinit(&sh, sizeof(int32_t[text.length])); union { int32_t chunks[2]; uint64_t whole; } tmp; - switch (text->tag) { - case TEXT_ASCII: case TEXT_SHORT_ASCII: { - const char *bytes = text->tag == TEXT_ASCII ? text->ascii : text->short_ascii; - for (int64_t i = 0; i + 1 < text->length; i += 2) { + switch (text.tag) { + case TEXT_ASCII: { + const char *bytes = text.ascii; + for (int64_t i = 0; i + 1 < text.length; i += 2) { tmp.chunks[0] = (int32_t)bytes[i]; tmp.chunks[1] = (int32_t)bytes[i+1]; siphashadd64bits(&sh, tmp.whole); } - int32_t last = text->length & 0x1 ? 
(int32_t)bytes[text->length-1] : 0; // Odd number of graphemes - text->hash = siphashfinish_last_part(&sh, (uint64_t)last); - break; + int32_t last = text.length & 0x1 ? (int32_t)bytes[text.length-1] : 0; // Odd number of graphemes + return siphashfinish_last_part(&sh, (uint64_t)last); } case TEXT_GRAPHEMES: { - const int32_t *graphemes = text->graphemes; - for (int64_t i = 0; i + 1 < text->length; i += 2) { + const int32_t *graphemes = text.graphemes; + for (int64_t i = 0; i + 1 < text.length; i += 2) { tmp.chunks[0] = graphemes[i]; tmp.chunks[1] = graphemes[i]; siphashadd64bits(&sh, tmp.whole); } - int32_t last = text->length & 0x1 ? graphemes[text->length-1] : 0; // Odd number of graphemes - text->hash = siphashfinish_last_part(&sh, (uint64_t)last); - break; + int32_t last = text.length & 0x1 ? graphemes[text.length-1] : 0; // Odd number of graphemes + return siphashfinish_last_part(&sh, (uint64_t)last); } - case TEXT_SHORT_GRAPHEMES: { - tmp.chunks[0] = text->short_graphemes[0]; - if (text->length > 1) - tmp.chunks[1] = text->short_graphemes[1]; - text->hash = siphashfinish_last_part(&sh, (uint64_t)tmp.whole); - break; - } - case TEXT_SUBTEXT: { - int32_t leftover = 0; - for (int64_t sub_i = 0, to_hash = text->length; to_hash > 0; ) { - Text_t subtext = text->subtexts[sub_i]; - if (subtext.tag == TEXT_ASCII || subtext.tag == TEXT_SHORT_ASCII) { - const char *bytes = subtext.tag == TEXT_ASCII ? subtext.ascii : subtext.short_ascii; - int64_t grapheme = 0; - if (leftover) { - tmp.chunks[0] = leftover; - tmp.chunks[1] = (int32_t)bytes[0]; - siphashadd64bits(&sh, tmp.whole); - grapheme += 1; - } - for (; grapheme + 1 < subtext.length; grapheme += 2) { - tmp.chunks[0] = (int32_t)bytes[grapheme]; - tmp.chunks[1] = (int32_t)bytes[grapheme+1]; - siphashadd64bits(&sh, tmp.whole); - } - leftover = grapheme < subtext.length ? 
(int32_t)bytes[grapheme] : 0; - } else if (subtext.tag == TEXT_SHORT_GRAPHEMES) { - if (leftover) { - tmp.chunks[0] = leftover; - tmp.chunks[1] = subtext.short_graphemes[0]; - siphashadd64bits(&sh, tmp.whole); - leftover = subtext.length > 1 ? subtext.short_graphemes[1] : 0; - } else if (subtext.length == 1) { - leftover = subtext.short_graphemes[0]; - } else { - tmp.chunks[0] = subtext.short_graphemes[0]; - tmp.chunks[1] = subtext.short_graphemes[1]; - siphashadd64bits(&sh, tmp.whole); - } - } else if (subtext.tag == TEXT_GRAPHEMES) { - const int32_t *graphemes = subtext.graphemes; - int64_t grapheme = 0; - if (leftover) { - tmp.chunks[0] = leftover; - tmp.chunks[1] = graphemes[0]; - siphashadd64bits(&sh, tmp.whole); - grapheme += 1; - } - for (; grapheme + 1 < subtext.length; grapheme += 2) { - tmp.chunks[0] = graphemes[grapheme]; - tmp.chunks[1] = graphemes[grapheme+1]; - siphashadd64bits(&sh, tmp.whole); - } - leftover = grapheme < subtext.length ? graphemes[grapheme] : 0; - } - - to_hash -= text->subtexts[sub_i].length; - - ++sub_i; + case TEXT_CONCAT: { + TextIter_t state = NEW_TEXT_ITER_STATE(text); + for (int64_t i = 0; i < (text.length & ~0x1); i += 2) { + tmp.chunks[0] = Text$get_grapheme_fast(&state, i); + tmp.chunks[1] = Text$get_grapheme_fast(&state, i+1); /* second grapheme of the pair; both halves must feed the hash */ + siphashadd64bits(&sh, tmp.whole); } - text->hash = siphashfinish_last_part(&sh, (uint64_t)leftover); - break; + int32_t last = (text.length & 0x1) ? Text$get_grapheme_fast(&state, text.length-1) : 0; + return siphashfinish_last_part(&sh, (uint64_t)last); } default: errx(1, "Invalid text"); } - - if (text->hash == 0) - text->hash = 1; - - return text->hash; } public int32_t Text$get_grapheme_fast(TextIter_t *state, int64_t index) { - Text_t text = state->text; - switch (text.tag) { - case TEXT_ASCII: return index < text.length ? (int32_t)text.ascii[index] : 0; - case TEXT_SHORT_ASCII: return index < text.length ?
(int32_t)text.short_ascii[index] : 0; - case TEXT_GRAPHEMES: return index < text.length ? text.graphemes[index] : 0; - case TEXT_SHORT_GRAPHEMES: return index < text.length ? text.short_graphemes[index] : 0; - case TEXT_SUBTEXT: { - if (index < 0 || index >= text.length) - return 0; + if (index < 0) return 0; + if (index >= state->stack[0].text.length) return 0; - while (index < state->sum_of_previous_subtexts && state->subtext > 0) { - state->sum_of_previous_subtexts -= text.subtexts[state->subtext].length; - state->subtext -= 1; - } - for (;;) { - if (index < state->sum_of_previous_subtexts + text.subtexts[state->subtext].length) - return Text$get_grapheme(text.subtexts[state->subtext], index - state->sum_of_previous_subtexts); - state->sum_of_previous_subtexts += text.subtexts[state->subtext].length; - state->subtext += 1; + assert(state->stack[0].text.depth <= MAX_TEXT_DEPTH); + + // Go up the stack as needed: + while (index < state->stack[state->stack_index].offset + || index >= state->stack[state->stack_index].offset + state->stack[state->stack_index].text.length) { + state->stack_index -= 1; + assert(state->stack_index >= 0); + } + + assert(state->stack_index >= 0 && state->stack_index <= MAX_TEXT_DEPTH); + + // Go down the stack as needed: + while (state->stack[state->stack_index].text.tag == TEXT_CONCAT) { + Text_t text = state->stack[state->stack_index].text; + int64_t offset = state->stack[state->stack_index].offset; + assert(state->stack_index <= MAX_TEXT_DEPTH); + assert(index >= offset); + assert(index < offset + text.length); + + state->stack_index += 1; + if (index < offset + text.left->length) { + state->stack[state->stack_index].text = *text.left; + state->stack[state->stack_index].offset = offset; + } else { + state->stack[state->stack_index].text = *text.right; + state->stack[state->stack_index].offset = offset + text.left->length; } + assert(state->stack_index >= 0 && state->stack_index <= MAX_TEXT_DEPTH); + } + + Text_t text = 
state->stack[state->stack_index].text; + int64_t offset = state->stack[state->stack_index].offset; + + if (index < offset || index >= offset + text.length) { return 0; } + + switch (text.tag) { + case TEXT_ASCII: return (int32_t)text.ascii[index - offset]; + case TEXT_GRAPHEMES: return text.graphemes[index - offset]; default: errx(1, "Invalid text"); } return 0; @@ -960,11 +890,12 @@ public uint32_t Text$get_main_grapheme_fast(TextIter_t *state, int64_t index) PUREFUNC public int32_t Text$compare(const void *va, const void *vb, const TypeInfo_t*) { if (va == vb) return 0; - const Text_t *a = (const Text_t*)va; - const Text_t *b = (const Text_t*)vb; + const Text_t a = *(const Text_t*)va; + const Text_t b = *(const Text_t*)vb; - int64_t len = MAX(a->length, b->length); - TextIter_t a_state = {*a, 0, 0}, b_state = {*b, 0, 0}; + // TODO: make this smarter and more efficient + int64_t len = MAX(a.length, b.length); + TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b); for (int64_t i = 0; i < len; i++) { int32_t ai = Text$get_grapheme_fast(&a_state, i); int32_t bi = Text$get_grapheme_fast(&b_state, i); @@ -998,7 +929,7 @@ PUREFUNC public bool Text$starts_with(Text_t text, Text_t prefix) { if (text.length < prefix.length) return false; - TextIter_t text_state = {text, 0, 0}, prefix_state = {prefix, 0, 0}; + TextIter_t text_state = NEW_TEXT_ITER_STATE(text), prefix_state = NEW_TEXT_ITER_STATE(prefix); for (int64_t i = 0; i < prefix.length; i++) { int32_t text_i = Text$get_grapheme_fast(&text_state, i); int32_t prefix_i = Text$get_grapheme_fast(&prefix_state, i); @@ -1011,7 +942,7 @@ PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix) { if (text.length < suffix.length) return false; - TextIter_t text_state = {text, 0, 0}, suffix_state = {suffix, 0, 0}; + TextIter_t text_state = NEW_TEXT_ITER_STATE(text), suffix_state = NEW_TEXT_ITER_STATE(suffix); for (int64_t i = 0; i < suffix.length; i++) { int32_t text_i = 
Text$get_grapheme_fast(&text_state, text.length - suffix.length + i); int32_t suffix_i = Text$get_grapheme_fast(&suffix_state, i); @@ -1022,10 +953,11 @@ PUREFUNC public bool Text$ends_with(Text_t text, Text_t suffix) PUREFUNC public bool Text$equal_values(Text_t a, Text_t b) { - if (a.length != b.length || (a.hash != 0 && b.hash != 0 && a.hash != b.hash)) + if (a.length != b.length) return false; int64_t len = a.length; - TextIter_t a_state = {a, 0, 0}, b_state = {b, 0, 0}; + TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b); + // TODO: make this smarter and more efficient for (int64_t i = 0; i < len; i++) { int32_t ai = Text$get_grapheme_fast(&a_state, i); int32_t bi = Text$get_grapheme_fast(&b_state, i); @@ -1045,7 +977,7 @@ PUREFUNC public bool Text$equal_ignoring_case(Text_t a, Text_t b) if (a.length != b.length) return false; int64_t len = a.length; - TextIter_t a_state = {a, 0, 0}, b_state = {b, 0, 0}; + TextIter_t a_state = NEW_TEXT_ITER_STATE(a), b_state = NEW_TEXT_ITER_STATE(b); const char *language = uc_locale_language(); for (int64_t i = 0; i < len; i++) { int32_t ai = Text$get_grapheme_fast(&a_state, i); @@ -1110,37 +1042,36 @@ public int printf_text_size(const struct printf_info *info, size_t n, int argtyp if (n < 1) return -1; (void)info; argtypes[0] = PA_POINTER; - sizes[0] = sizeof(Text_t*); + sizes[0] = sizeof(Text_t); return 1; } public int printf_text(FILE *stream, const struct printf_info *info, const void *const args[]) { - Text_t t = **(Text_t**)args[0]; + Text_t *t = *(Text_t**)args[0]; if (info->alt) - return text_visualize(stream, t); + return text_visualize(stream, *t, 0); else - return Text$print(stream, t); + return Text$print(stream, *t); } static INLINE Text_t _quoted(Text_t text, bool colorize, char quote_char) { - // TODO: optimize for ASCII and short strings - Array_t graphemes = {.atomic=1}; -#define add_char(c) Array$insert_value(&graphemes, (ucs4_t)c, I_small(0), sizeof(ucs4_t)) -#define add_str(s) ({ 
for (const char *_c = s; *_c; ++_c) Array$insert_value(&graphemes, (ucs4_t)*_c, I_small(0), sizeof(ucs4_t)); }) - if (colorize) - add_str("\x1b[35m"); + Text_t ret = colorize ? Text("\x1b[35m") : EMPTY_TEXT; if (quote_char != '"' && quote_char != '\'' && quote_char != '`') - add_char('$'); - add_char(quote_char); + ret = concat2_assuming_safe(ret, Text("$")); -#define add_escaped(str) ({ if (colorize) add_str("\x1b[34;1m"); \ - if (!just_escaped) add_char('$'); \ - add_char('\\'); add_str(str); just_escaped = true; \ - if (colorize) add_str("\x1b[0;35m"); }) - TextIter_t state = {text, 0, 0}; + Text_t quote_text = Text$from_strn(&quote_char, 1); + ret = concat2_assuming_safe(ret, quote_text); + +#define add_escaped(str) ({ if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m")); \ + if (!just_escaped) ret = concat2_assuming_safe(ret, Text("$")); \ + ret = concat2_assuming_safe(ret, Text("\\" str)); \ + just_escaped = true; \ + if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[0;35m")); }) + TextIter_t state = NEW_TEXT_ITER_STATE(text); bool just_escaped = false; + // TODO: optimize for spans of non-escaped text for (int64_t i = 0; i < text.length; i++) { int32_t g = Text$get_grapheme_fast(&state, i); switch (g) { @@ -1156,14 +1087,14 @@ static INLINE Text_t _quoted(Text_t text, bool colorize, char quote_char) if (just_escaped) { add_escaped("\\"); } else { - add_char('\\'); + ret = concat2_assuming_safe(ret, Text("\\")); just_escaped = false; } break; } case '$': { if (quote_char == '\'') { - add_char('$'); + ret = concat2_assuming_safe(ret, Text("$")); just_escaped = false; } else { add_escaped("$"); @@ -1172,57 +1103,57 @@ static INLINE Text_t _quoted(Text_t text, bool colorize, char quote_char) } case '\x00' ... '\x06': case '\x0E' ... '\x1A': case '\x1C' ... '\x1F': case '\x7F' ...
'\x7F': { - if (colorize) add_str("\x1b[34;1m"); - add_char('\\'); - add_char('x'); - char tmp[4]; + if (colorize) ret = concat2_assuming_safe(ret, Text("\x1b[34;1m")); + ret = concat2_assuming_safe(ret, Text("\\x")); + char tmp[3]; /* two hex digits plus the NUL terminator that sprintf writes */ sprintf(tmp, "%02X", g); - add_str(tmp); + ret = concat2_assuming_safe(ret, Text$from_strn(tmp, 2)); if (colorize) - add_str("\x1b[0;35m"); + ret = concat2_assuming_safe(ret, Text("\x1b[0;35m")); just_escaped = true; break; } default: { if (g == quote_char) { - add_escaped(((char[2]){quote_char, 0})); + ret = concat2_assuming_safe(ret, quote_text); } else { - add_char(g); - just_escaped = false; + ret = concat2_assuming_safe(ret, Text$slice(text, I(i+1), I(i+1))); + just_escaped = false; } break; } } } - - add_char(quote_char); - if (colorize) - add_str("\x1b[m"); - - return (Text_t){.length=graphemes.length, .tag=TEXT_GRAPHEMES, .graphemes=graphemes.data}; -#undef add_str -#undef add_char #undef add_escaped + + ret = concat2_assuming_safe(ret, quote_text); + if (colorize) + ret = concat2_assuming_safe(ret, Text("\x1b[m")); + + return ret; } -public Text_t Text$as_text(const void *text, bool colorize, const TypeInfo_t *info) +public Text_t Text$as_text(const void *vtext, bool colorize, const TypeInfo_t *info) { (void)info; if (info->TextInfo.lang && streq(info->TextInfo.lang, "Path")) { - if (!text) return Text("Path"); - return Text$format("(%s%k%s)", colorize ? "\x1b[35m" : "", text, colorize ? "\x1b[m" : ""); + if (!vtext) return Text("Path"); + Text_t text = *(Text_t*)vtext; + return Text$format("(%s%k%s)", colorize ? "\x1b[35m" : "", &text, colorize ? "\x1b[m" : ""); } - if (!text) return info && info->TextInfo.lang ? Text$from_str(info->TextInfo.lang) : Text("Text"); + if (!vtext) return info && info->TextInfo.lang ?
Text$from_str(info->TextInfo.lang) : Text("Text"); + + Text_t text = *(Text_t*)vtext; char quote_char; if (info == &Pattern$info) { - quote_char = Text$has(*(Text_t*)text, Pattern("/")) && !Text$has(*(Text_t*)text, Pattern("|")) ? '|' : '/'; + quote_char = Text$has(text, Pattern("/")) && !Text$has(text, Pattern("|")) ? '|' : '/'; } else { // Figure out the best quotation mark to use: bool has_dollar = false, has_double_quote = false, has_backtick = false, has_single_quote = false, needs_escapes = false; - TextIter_t state = {*(Text_t*)text, 0, 0}; - for (int64_t i = 0; i < state.text.length; i++) { + TextIter_t state = NEW_TEXT_ITER_STATE(text); + for (int64_t i = 0; i < text.length; i++) { int32_t g = Text$get_grapheme_fast(&state, i); if (g == '$') { has_dollar = true; @@ -1250,7 +1181,7 @@ public Text_t Text$as_text(const void *text, bool colorize, const TypeInfo_t *in quote_char = '"'; } - Text_t as_text = _quoted(*(Text_t*)text, colorize, quote_char); + Text_t as_text = _quoted(text, colorize, quote_char); if (info && info->TextInfo.lang && info != &Text$info && info != &Pattern$info) as_text = Text$concat( colorize ? Text("\x1b[1m$") : Text("$"), @@ -1267,7 +1198,7 @@ public Text_t Text$quoted(Text_t text, bool colorize) public Text_t Text$join(Text_t glue, Array_t pieces) { - if (pieces.length == 0) return (Text_t){.length=0}; + if (pieces.length == 0) return EMPTY_TEXT; Text_t result = *(Text_t*)pieces.data; for (int64_t i = 1; i < pieces.length; i++) { @@ -1284,19 +1215,9 @@ public Text_t Text$format(const char *fmt, ...) 
char buf[9]; int len = vsnprintf(buf, sizeof(buf), fmt, args); - Text_t ret; - if (len <= 8) { - ret = (Text_t){ - .length=len, - .tag=TEXT_SHORT_ASCII, - }; - for (int i = 0; i < len; i++) - ret.short_ascii[i] = buf[i]; - } else { - char *str = GC_MALLOC_ATOMIC((size_t)(len+1)); - vsnprintf(str, (size_t)(len+1), fmt, args); - ret = Text$from_str(str); - } + char *str = GC_MALLOC_ATOMIC((size_t)(len+1)); + vsnprintf(str, (size_t)(len+1), fmt, args); + Text_t ret = Text$from_str(str); va_end(args); return ret; } @@ -1314,7 +1235,7 @@ public Array_t Text$clusters(Text_t text) public Array_t Text$utf32_codepoints(Text_t text) { Array_t codepoints = {.atomic=1}; - TextIter_t state = {text, 0, 0}; + TextIter_t state = NEW_TEXT_ITER_STATE(text); for (int64_t i = 0; i < text.length; i++) { int32_t grapheme = Text$get_grapheme_fast(&state, i); if (grapheme < 0) { @@ -1349,18 +1270,18 @@ static INLINE const char *codepoint_name(ucs4_t c) public Array_t Text$codepoint_names(Text_t text) { Array_t names = {}; - TextIter_t state = {text, 0, 0}; + TextIter_t state = NEW_TEXT_ITER_STATE(text); for (int64_t i = 0; i < text.length; i++) { int32_t grapheme = Text$get_grapheme_fast(&state, i); if (grapheme < 0) { for (int64_t c = 0; c < NUM_GRAPHEME_CODEPOINTS(grapheme); c++) { const char *name = codepoint_name(GRAPHEME_CODEPOINTS(grapheme)[c]); - Text_t name_text = (Text_t){.tag=TEXT_ASCII, .length=(int64_t)strlen(name), .ascii=name}; + Text_t name_text = Text$from_str(name); Array$insert(&names, &name_text, I_small(0), sizeof(Text_t)); } } else { const char *name = codepoint_name((ucs4_t)grapheme); - Text_t name_text = (Text_t){.tag=TEXT_ASCII, .length=(int64_t)strlen(name), .ascii=name}; + Text_t name_text = Text$from_str(name); Array$insert(&names, &name_text, I_small(0), sizeof(Text_t)); } } @@ -1394,15 +1315,13 @@ public OptionalText_t Text$from_bytes(Array_t bytes) if (bytes.stride != sizeof(int8_t)) Array$compact(&bytes, sizeof(int8_t)); - int8_t nul = 0; - 
Array$insert(&bytes, &nul, I_small(0), sizeof(int8_t)); - return Text$from_str(bytes.data); + return Text$from_strn(bytes.data, (size_t)bytes.length); } public Array_t Text$lines(Text_t text) { Array_t lines = {}; - TextIter_t state = {text, 0, 0}; + TextIter_t state = NEW_TEXT_ITER_STATE(text); for (int64_t i = 0, line_start = 0; i < text.length; i++) { int32_t grapheme = Text$get_grapheme_fast(&state, i); if (grapheme == '\r' && Text$get_grapheme_fast(&state, i + 1) == '\n') { // CRLF @@ -1429,7 +1348,7 @@ typedef struct { static OptionalText_t next_line(line_iter_state_t *state) { - Text_t text = state->state.text; + Text_t text = state->state.stack[0].text; for (int64_t i = state->i; i < text.length; i++) { int32_t grapheme = Text$get_grapheme_fast(&state->state, i); if (grapheme == '\r' && Text$get_grapheme_fast(&state->state, i + 1) == '\n') { // CRLF @@ -1453,7 +1372,7 @@ public Closure_t Text$by_line(Text_t text) { return (Closure_t){ .fn=(void*)next_line, - .userdata=new(line_iter_state_t, .state={text, 0, 0}, .i=0), + .userdata=new(line_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0), }; } @@ -1490,32 +1409,24 @@ public const TypeInfo_t Text$info = { public Pattern_t Pattern$escape_text(Text_t text) { - // TODO: optimize for ASCII and short strings - Array_t graphemes = {.atomic=1}; -#define add_char(c) Array$insert_value(&graphemes, (ucs4_t)c, I_small(0), sizeof(ucs4_t)) -#define add_str(s) ({ for (const char *_c = s; *_c; ++_c) Array$insert_value(&graphemes, (ucs4_t)*_c, I_small(0), sizeof(ucs4_t)); }) - TextIter_t state = {text, 0, 0}; + // TODO: optimize for spans of non-escaped text + Text_t ret = EMPTY_TEXT; + TextIter_t state = NEW_TEXT_ITER_STATE(text); for (int64_t i = 0; i < text.length; i++) { int32_t g = Text$get_grapheme_fast(&state, i); ucs4_t g0 = g < 0 ? GRAPHEME_CODEPOINTS(g)[0] : (ucs4_t)g; if (g == '{') { - add_str("{1{}"); + ret = concat2_assuming_safe(ret, Text("{1{}")); } else if (g0 == '?' 
|| uc_is_property_quotation_mark(g0) || (uc_is_property_paired_punctuation(g0) && uc_is_property_left_of_pair(g0))) { - add_char('{'); - add_char('1'); - add_char(g); - add_char('}'); + ret = Text$concat(ret, Text("{1"), Text$slice(text, I(i+1), I(i+1)), Text("}")); } else { - add_char(g); + ret = concat2_assuming_safe(ret, Text$slice(text, I(i+1), I(i+1))); } } - return (Text_t){.length=graphemes.length, .tag=TEXT_GRAPHEMES, .graphemes=graphemes.data}; -#undef add_str -#undef add_char -#undef add_escaped + return ret; } // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/stdlib/text.h b/stdlib/text.h index 79c094a..64cf86f 100644 --- a/stdlib/text.h +++ b/stdlib/text.h @@ -13,18 +13,24 @@ #include "types.h" #include "util.h" +#define MAX_TEXT_DEPTH 48 + typedef struct { - Text_t text; - int64_t subtext, sum_of_previous_subtexts; + struct { + Text_t text; + int64_t offset; + } stack[MAX_TEXT_DEPTH]; + int64_t stack_index; } TextIter_t; +#define NEW_TEXT_ITER_STATE(t) (TextIter_t){.stack={{t, 0}}, .stack_index=0} + int printf_text(FILE *stream, const struct printf_info *info, const void *const args[]); int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n]); #define Text(str) ((Text_t){.length=sizeof(str)-1, .tag=TEXT_ASCII, .ascii="" str}) int Text$print(FILE *stream, Text_t t); -void Text$visualize(Text_t t); Text_t Text$_concat(int n, Text_t items[n]); #define Text$concat(...) Text$_concat(sizeof((Text_t[]){__VA_ARGS__})/sizeof(Text_t), (Text_t[]){__VA_ARGS__}) #define Texts(...) Text$concat(__VA_ARGS__) @@ -69,11 +75,12 @@ void Text$deserialize(FILE *in, void *out, Array_t *, const TypeInfo_t *); MACROLIKE int32_t Text$get_grapheme(Text_t text, int64_t index) { - TextIter_t state = {text, 0, 0}; + TextIter_t state = NEW_TEXT_ITER_STATE(text); return Text$get_grapheme_fast(&state, index); } extern const TypeInfo_t Text$info; +extern Text_t EMPTY_TEXT; #define Text$metamethods ((metamethods_t){ \ .as_text=Text$as_text, \