Overhaul of Text implementation to be more like Cords and have much
better performance for long sequences of repeated concatenation.
This commit is contained in:
parent
c60ea2079f
commit
f93dde1449
19
compile.c
19
compile.c
@ -34,6 +34,7 @@ static CORD compile_none(type_t *t);
|
||||
static CORD compile_to_type(env_t *env, ast_t *ast, type_t *t);
|
||||
static CORD check_none(type_t *t, CORD value);
|
||||
static CORD optional_into_nonnone(type_t *t, CORD value);
|
||||
static CORD compile_string_literal(CORD literal);
|
||||
|
||||
CORD promote_to_optional(type_t *t, CORD code)
|
||||
{
|
||||
@ -569,7 +570,7 @@ CORD compile_statement(env_t *env, ast_t *ast)
|
||||
const uint8_t *raw = (const uint8_t*)CORD_to_const_char_star(test->output);
|
||||
uint8_t buf[128] = {0};
|
||||
size_t norm_len = sizeof(buf);
|
||||
uint8_t *norm = u8_normalize(UNINORM_NFD, (uint8_t*)raw, strlen((char*)raw)+1, buf, &norm_len);
|
||||
uint8_t *norm = u8_normalize(UNINORM_NFC, (uint8_t*)raw, strlen((char*)raw)+1, buf, &norm_len);
|
||||
assert(norm[norm_len-1] == 0);
|
||||
output = CORD_from_char_star((char*)norm);
|
||||
if (norm && norm != buf) free(norm);
|
||||
@ -579,7 +580,7 @@ CORD compile_statement(env_t *env, ast_t *ast)
|
||||
auto decl = Match(test->expr, Declare);
|
||||
const char *varname = Match(decl->var, Var)->name;
|
||||
if (streq(varname, "_"))
|
||||
return compile_statement(env, WrapAST(ast, DocTest, .expr=decl->value, .output=test->output, .skip_source=test->skip_source));
|
||||
return compile_statement(env, WrapAST(ast, DocTest, .expr=decl->value, .output=output, .skip_source=test->skip_source));
|
||||
CORD var = CORD_all("$", Match(decl->var, Var)->name);
|
||||
type_t *t = get_type(env, decl->value);
|
||||
CORD val_code = compile_maybe_incref(env, decl->value, t);
|
||||
@ -593,7 +594,7 @@ CORD compile_statement(env_t *env, ast_t *ast)
|
||||
compile_declaration(t, var),
|
||||
var, val_code,
|
||||
compile_type_info(env, get_type(env, decl->value)),
|
||||
CORD_quoted(output),
|
||||
compile_string_literal(output),
|
||||
(int64_t)(test->expr->start - test->expr->file->text),
|
||||
(int64_t)(test->expr->end - test->expr->file->text));
|
||||
} else if (test->expr->tag == Assign) {
|
||||
@ -612,12 +613,12 @@ CORD compile_statement(env_t *env, ast_t *ast)
|
||||
"test((%r), %r, %r, %ld, %ld);",
|
||||
compile_assignment(env, assign->targets->ast, value),
|
||||
compile_type_info(env, lhs_t),
|
||||
CORD_quoted(test->output),
|
||||
compile_string_literal(output),
|
||||
(int64_t)(test->expr->start - test->expr->file->text),
|
||||
(int64_t)(test->expr->end - test->expr->file->text));
|
||||
} else {
|
||||
// Multi-assign or assignment to potentially non-idempotent targets
|
||||
if (test->output && assign->targets->next)
|
||||
if (output && assign->targets->next)
|
||||
code_err(ast, "Sorry, but doctesting with '=' is not supported for multi-assignments");
|
||||
|
||||
CORD code = "test(({ // Assignment\n";
|
||||
@ -643,7 +644,7 @@ CORD compile_statement(env_t *env, ast_t *ast)
|
||||
|
||||
CORD_appendf(&code, "$1; }), %r, %r, %ld, %ld);",
|
||||
compile_type_info(env, first_type),
|
||||
CORD_quoted(test->output),
|
||||
compile_string_literal(output),
|
||||
(int64_t)(test->expr->start - test->expr->file->text),
|
||||
(int64_t)(test->expr->end - test->expr->file->text));
|
||||
return code;
|
||||
@ -667,7 +668,7 @@ CORD compile_statement(env_t *env, ast_t *ast)
|
||||
compile_lvalue(env, update->lhs),
|
||||
compile_statement(env, update_var),
|
||||
compile_type_info(env, lhs_t),
|
||||
CORD_quoted(test->output),
|
||||
compile_string_literal(output),
|
||||
(int64_t)(test->expr->start - test->expr->file->text),
|
||||
(int64_t)(test->expr->end - test->expr->file->text));
|
||||
} else if (expr_t->tag == VoidType || expr_t->tag == AbortType || expr_t->tag == ReturnType) {
|
||||
@ -681,7 +682,7 @@ CORD compile_statement(env_t *env, ast_t *ast)
|
||||
"test(%r, %r, %r, %ld, %ld);",
|
||||
compile(env, test->expr),
|
||||
compile_type_info(env, expr_t),
|
||||
CORD_quoted(output),
|
||||
compile_string_literal(output),
|
||||
(int64_t)(test->expr->start - test->expr->file->text),
|
||||
(int64_t)(test->expr->end - test->expr->file->text));
|
||||
}
|
||||
@ -1953,7 +1954,7 @@ CORD compile_math_method(env_t *env, binop_e op, ast_t *lhs, ast_t *rhs, type_t
|
||||
return CORD_EMPTY;
|
||||
}
|
||||
|
||||
static CORD compile_string_literal(CORD literal)
|
||||
CORD compile_string_literal(CORD literal)
|
||||
{
|
||||
CORD code = "\"";
|
||||
CORD_pos i;
|
||||
|
22
docs/text.md
22
docs/text.md
@ -9,12 +9,22 @@ etc.).
|
||||
|
||||
## Implementation
|
||||
|
||||
Internally, Tomo text's implementation is based on [Raku's
|
||||
strings](https://docs.raku.org/language/unicode). Strings store their grapheme
|
||||
cluster count and either a compact array of 8-bit ASCII characters (for ASCII
|
||||
text), an array of 32-bit normal-form grapheme cluster values (see below), or a
|
||||
flat subarray of multiple texts that are either ASCII or graphemes (the
|
||||
structure is not arbitrarily nested).
|
||||
Internally, Tomo text's implementation is based on [Raku/MoarVM's
|
||||
strings](https://docs.raku.org/language/unicode) and [Boehm et al.'s
|
||||
Cords](https://www.cs.tufts.edu/comp/150FP/archive/hans-boehm/ropes.pdf).
|
||||
Strings store their grapheme cluster count and either a compact array of 8-bit
|
||||
ASCII characters (for ASCII text), an array of 32-bit normal-form grapheme
|
||||
cluster values (see below), or a (roughly) balanced binary tree concatenation
|
||||
of two texts. The upside is that repeated concatenations are typically a
|
||||
constant-time operation, though they occasionally require a small rebalancing
|
||||
operation. Index-based text operations (like retrieving an arbitrary index or
|
||||
slicing) are very fast: typically a constant-time operation for arbitrary
|
||||
Unicode text, but in the worst-case scenario (text built from many
|
||||
concatenations), `O(log(n))` time with very favorable constant factors, typically
|
||||
amounting to only a handful of steps. Since concatenations use shared
|
||||
substructures, they are very memory-efficient and can be used efficiently for
|
||||
applications like implementing a text editor that stores a full edit history of
|
||||
a large file's contents.
|
||||
|
||||
### Normal-Form Graphemes
|
||||
|
||||
|
@ -17,15 +17,16 @@ PUREFUNC public Text_t Byte$as_text(const void *b, bool colorize, const TypeInfo
|
||||
}
|
||||
|
||||
public Text_t Byte$hex(Byte_t byte, bool uppercase, bool prefix) {
|
||||
Text_t text = {.tag=TEXT_SHORT_ASCII};
|
||||
struct Text_s text = {.tag=TEXT_ASCII};
|
||||
text.ascii = GC_MALLOC_ATOMIC(8);
|
||||
if (prefix && uppercase)
|
||||
text.length = (int64_t)snprintf(text.short_ascii, sizeof(text.short_ascii), "0x%02X", byte);
|
||||
text.length = (int64_t)snprintf((char*)text.ascii, 8, "0x%02X", byte);
|
||||
else if (prefix && !uppercase)
|
||||
text.length = (int64_t)snprintf(text.short_ascii, sizeof(text.short_ascii), "0x%02x", byte);
|
||||
text.length = (int64_t)snprintf((char*)text.ascii, 8, "0x%02x", byte);
|
||||
else if (!prefix && uppercase)
|
||||
text.length = (int64_t)snprintf(text.short_ascii, sizeof(text.short_ascii), "%02X", byte);
|
||||
text.length = (int64_t)snprintf((char*)text.ascii, 8, "%02X", byte);
|
||||
else if (!prefix && !uppercase)
|
||||
text.length = (int64_t)snprintf(text.short_ascii, sizeof(text.short_ascii), "%02x", byte);
|
||||
text.length = (int64_t)snprintf((char*)text.ascii, 8, "%02x", byte);
|
||||
return text;
|
||||
}
|
||||
|
||||
|
@ -66,18 +66,24 @@ typedef struct Range_s {
|
||||
Int_t first, last, step;
|
||||
} Range_t;
|
||||
|
||||
enum text_type { TEXT_SHORT_ASCII, TEXT_ASCII, TEXT_SHORT_GRAPHEMES, TEXT_GRAPHEMES, TEXT_SUBTEXT };
|
||||
enum text_type { TEXT_ASCII, TEXT_GRAPHEMES, TEXT_CONCAT };
|
||||
|
||||
typedef struct Text_s {
|
||||
int64_t length; // Number of grapheme clusters
|
||||
uint64_t hash:61;
|
||||
uint8_t tag:3;
|
||||
int64_t length:54; // Number of grapheme clusters
|
||||
uint8_t depth:8;
|
||||
uint8_t tag:2;
|
||||
union {
|
||||
char short_ascii[8];
|
||||
const char *ascii;
|
||||
int32_t short_graphemes[2];
|
||||
const int32_t *graphemes;
|
||||
struct Text_s *subtexts;
|
||||
struct {
|
||||
const char *ascii;
|
||||
// char ascii_buf[8];
|
||||
};
|
||||
struct {
|
||||
const int32_t *graphemes;
|
||||
// int32_t grapheme_buf[2];
|
||||
};
|
||||
struct {
|
||||
const struct Text_s *left, *right;
|
||||
};
|
||||
};
|
||||
} Text_t;
|
||||
|
||||
|
@ -74,7 +74,7 @@ public void Optional$deserialize(FILE *in, void *outval, Array_t *pointers, cons
|
||||
_deserialize(in, outval, pointers, nonnull);
|
||||
} else {
|
||||
if (nonnull->tag == TextInfo)
|
||||
*(Text_t*)outval = (Text_t){.length=-1};
|
||||
*(Text_t*)outval = NONE_TEXT;
|
||||
else if (nonnull->tag == ArrayInfo)
|
||||
*(Array_t*)outval = (Array_t){.length=-1};
|
||||
else if (nonnull->tag == TableInfo)
|
||||
|
@ -36,7 +36,7 @@ typedef struct {
|
||||
|
||||
static INLINE void skip_whitespace(TextIter_t *state, int64_t *i)
|
||||
{
|
||||
while (*i < state->text.length) {
|
||||
while (*i < state->stack[0].text.length) {
|
||||
int32_t grapheme = Text$get_grapheme_fast(state, *i);
|
||||
if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme))
|
||||
return;
|
||||
@ -46,7 +46,7 @@ static INLINE void skip_whitespace(TextIter_t *state, int64_t *i)
|
||||
|
||||
static INLINE bool match_grapheme(TextIter_t *state, int64_t *i, int32_t grapheme)
|
||||
{
|
||||
if (*i < state->text.length && Text$get_grapheme_fast(state, *i) == grapheme) {
|
||||
if (*i < state->stack[0].text.length && Text$get_grapheme_fast(state, *i) == grapheme) {
|
||||
*i += 1;
|
||||
return true;
|
||||
}
|
||||
@ -57,7 +57,7 @@ static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str)
|
||||
{
|
||||
int64_t matched = 0;
|
||||
while (matched[str]) {
|
||||
if (*i + matched >= state->text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched])
|
||||
if (*i + matched >= state->stack[0].text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched])
|
||||
return false;
|
||||
matched += 1;
|
||||
}
|
||||
@ -67,7 +67,7 @@ static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str)
|
||||
|
||||
static INLINE bool match_property(TextIter_t *state, int64_t *i, uc_property_t prop)
|
||||
{
|
||||
if (*i >= state->text.length) return false;
|
||||
if (*i >= state->stack[0].text.length) return false;
|
||||
uint32_t grapheme = Text$get_main_grapheme_fast(state, *i);
|
||||
// TODO: check every codepoint in the cluster?
|
||||
if (uc_is_property(grapheme, prop)) {
|
||||
@ -95,7 +95,7 @@ static const char *get_property_name(TextIter_t *state, int64_t *i)
|
||||
skip_whitespace(state, i);
|
||||
char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
|
||||
char *dest = name;
|
||||
while (*i < state->text.length) {
|
||||
while (*i < state->stack[0].text.length) {
|
||||
int32_t grapheme = Text$get_grapheme_fast(state, *i);
|
||||
if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) {
|
||||
*dest = (char)grapheme;
|
||||
@ -406,10 +406,10 @@ static int64_t match_num(TextIter_t *state, int64_t index)
|
||||
|
||||
static int64_t match_newline(TextIter_t *state, int64_t index)
|
||||
{
|
||||
if (index >= state->text.length)
|
||||
if (index >= state->stack[0].text.length)
|
||||
return -1;
|
||||
|
||||
uint32_t grapheme = index >= state->text.length ? 0 : Text$get_main_grapheme_fast(state, index);
|
||||
uint32_t grapheme = index >= state->stack[0].text.length ? 0 : Text$get_main_grapheme_fast(state, index);
|
||||
if (grapheme == '\n')
|
||||
return 1;
|
||||
if (grapheme == '\r' && Text$get_grapheme_fast(state, index + 1) == '\n')
|
||||
@ -419,7 +419,7 @@ static int64_t match_newline(TextIter_t *state, int64_t index)
|
||||
|
||||
static int64_t match_pat(TextIter_t *state, int64_t index, pat_t pat)
|
||||
{
|
||||
Text_t text = state->text;
|
||||
Text_t text = state->stack[0].text;
|
||||
int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(state, index);
|
||||
|
||||
switch (pat.tag) {
|
||||
@ -516,7 +516,7 @@ static pat_t parse_next_pat(TextIter_t *state, int64_t *index)
|
||||
int32_t close = open;
|
||||
uc_mirror_char((ucs4_t)open, (ucs4_t*)&close);
|
||||
if (!match_grapheme(state, index, close))
|
||||
fail("Pattern's closing quote is missing: %k", &state->text);
|
||||
fail("Pattern's closing quote is missing: %k", &state->stack[0].text);
|
||||
|
||||
return (pat_t){
|
||||
.tag=PAT_QUOTE,
|
||||
@ -531,7 +531,7 @@ static pat_t parse_next_pat(TextIter_t *state, int64_t *index)
|
||||
int32_t close = open;
|
||||
uc_mirror_char((ucs4_t)open, (ucs4_t*)&close);
|
||||
if (!match_grapheme(state, index, close))
|
||||
fail("Pattern's closing brace is missing: %k", &state->text);
|
||||
fail("Pattern's closing brace is missing: %k", &state->stack[0].text);
|
||||
|
||||
return (pat_t){
|
||||
.tag=PAT_PAIR,
|
||||
@ -571,19 +571,19 @@ static pat_t parse_next_pat(TextIter_t *state, int64_t *index)
|
||||
skip_whitespace(state, index);
|
||||
int32_t grapheme = Text$get_grapheme_fast(state, (*index)++);
|
||||
if (!match_grapheme(state, index, '}'))
|
||||
fail("Missing closing '}' in pattern: %k", &state->text);
|
||||
fail("Missing closing '}' in pattern: %k", &state->stack[0].text);
|
||||
return PAT(PAT_GRAPHEME, .grapheme=grapheme);
|
||||
} else if (strlen(prop_name) == 1) {
|
||||
// Single letter names: {1+ A}
|
||||
skip_whitespace(state, index);
|
||||
if (!match_grapheme(state, index, '}'))
|
||||
fail("Missing closing '}' in pattern: %k", &state->text);
|
||||
fail("Missing closing '}' in pattern: %k", &state->stack[0].text);
|
||||
return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]);
|
||||
}
|
||||
|
||||
skip_whitespace(state, index);
|
||||
if (!match_grapheme(state, index, '}'))
|
||||
fail("Missing closing '}' in pattern: %k", &state->text);
|
||||
fail("Missing closing '}' in pattern: %k", &state->stack[0].text);
|
||||
|
||||
switch (tolower(prop_name[0])) {
|
||||
case '.':
|
||||
@ -677,7 +677,7 @@ static int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t
|
||||
return 0;
|
||||
|
||||
int64_t start_index = text_index;
|
||||
TextIter_t pattern_state = {pattern, 0, 0}, text_state = {text, 0, 0};
|
||||
TextIter_t pattern_state = NEW_TEXT_ITER_STATE(pattern), text_state = NEW_TEXT_ITER_STATE(text);
|
||||
pat_t pat = parse_next_pat(&pattern_state, &pattern_index);
|
||||
|
||||
if (pat.min == -1 && pat.max == -1) {
|
||||
@ -778,7 +778,7 @@ static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last
|
||||
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
|
||||
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
|
||||
|
||||
TextIter_t text_state = {text, 0, 0};
|
||||
TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
|
||||
for (int64_t i = first; i <= last; i++) {
|
||||
// Optimization: quickly skip ahead to first char in pattern:
|
||||
if (find_first) {
|
||||
@ -881,12 +881,12 @@ typedef struct {
|
||||
|
||||
static OptionalMatch_t next_match(match_iter_state_t *state)
|
||||
{
|
||||
if (Int_to_Int64(state->i, false) > state->state.text.length)
|
||||
if (Int_to_Int64(state->i, false) > state->state.stack[0].text.length)
|
||||
return NONE_MATCH;
|
||||
|
||||
OptionalMatch_t m = Text$find(state->state.text, state->pattern, state->i);
|
||||
OptionalMatch_t m = Text$find(state->state.stack[0].text, state->pattern, state->i);
|
||||
if (m.index.small == 0) // No match
|
||||
state->i = I(state->state.text.length + 1);
|
||||
state->i = I(state->state.stack[0].text.length + 1);
|
||||
else
|
||||
state->i = Int$plus(m.index, I(MAX(1, m.text.length)));
|
||||
return m;
|
||||
@ -896,7 +896,7 @@ public Closure_t Text$by_match(Text_t text, Pattern_t pattern)
|
||||
{
|
||||
return (Closure_t){
|
||||
.fn=(void*)next_match,
|
||||
.userdata=new(match_iter_state_t, .state={text, 0, 0}, .i=I_small(1), .pattern=pattern),
|
||||
.userdata=new(match_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=I_small(1), .pattern=pattern),
|
||||
};
|
||||
}
|
||||
|
||||
@ -911,7 +911,7 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep
|
||||
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
|
||||
|
||||
Text_t ret = Text("");
|
||||
TextIter_t replacement_state = {replacement, 0, 0};
|
||||
TextIter_t replacement_state = NEW_TEXT_ITER_STATE(replacement);
|
||||
int64_t nonmatching_pos = 0;
|
||||
for (int64_t pos = 0; pos < replacement.length; ) {
|
||||
// Optimization: quickly skip ahead to first char in the backref pattern:
|
||||
@ -965,14 +965,14 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep
|
||||
|
||||
public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t backref_pat, bool recursive)
|
||||
{
|
||||
Text_t ret = {.length=0};
|
||||
Text_t ret = EMPTY_TEXT;
|
||||
|
||||
int32_t first_grapheme = Text$get_grapheme(pattern, 0);
|
||||
bool find_first = (first_grapheme != '{'
|
||||
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
|
||||
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
|
||||
|
||||
TextIter_t text_state = {text, 0, 0};
|
||||
TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
|
||||
int64_t nonmatching_pos = 0;
|
||||
for (int64_t pos = 0; pos < text.length; ) {
|
||||
// Optimization: quickly skip ahead to first char in pattern:
|
||||
@ -1030,14 +1030,14 @@ public Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool tri
|
||||
|
||||
public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn)
|
||||
{
|
||||
Text_t ret = {.length=0};
|
||||
Text_t ret = EMPTY_TEXT;
|
||||
|
||||
int32_t first_grapheme = Text$get_grapheme(pattern, 0);
|
||||
bool find_first = (first_grapheme != '{'
|
||||
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
|
||||
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
|
||||
|
||||
TextIter_t text_state = {text, 0, 0};
|
||||
TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
|
||||
int64_t nonmatching_pos = 0;
|
||||
|
||||
Text_t (*text_mapper)(Match_t, void*) = fn.fn;
|
||||
@ -1086,7 +1086,7 @@ public void Text$each(Text_t text, Pattern_t pattern, Closure_t fn)
|
||||
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
|
||||
&& !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
|
||||
|
||||
TextIter_t text_state = {text, 0, 0};
|
||||
TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
|
||||
void (*action)(Match_t, void*) = fn.fn;
|
||||
for (int64_t pos = 0; pos < text.length; pos++) {
|
||||
// Optimization: quickly skip ahead to first char in pattern:
|
||||
@ -1118,7 +1118,7 @@ public Text_t Text$replace_all(Text_t text, Table_t replacements, Text_t backref
|
||||
{
|
||||
if (replacements.entries.length == 0) return text;
|
||||
|
||||
Text_t ret = {.length=0};
|
||||
Text_t ret = EMPTY_TEXT;
|
||||
|
||||
int64_t nonmatch_pos = 0;
|
||||
for (int64_t pos = 0; pos < text.length; ) {
|
||||
@ -1194,11 +1194,11 @@ typedef struct {
|
||||
|
||||
static OptionalText_t next_split(split_iter_state_t *state)
|
||||
{
|
||||
Text_t text = state->state.text;
|
||||
Text_t text = state->state.stack[0].text;
|
||||
if (state->i >= text.length) {
|
||||
if (state->pattern.length > 0 && state->i == text.length) { // special case
|
||||
state->i = text.length + 1;
|
||||
return (Text_t){.length=0};
|
||||
return EMPTY_TEXT;
|
||||
}
|
||||
return NONE_TEXT;
|
||||
}
|
||||
@ -1220,7 +1220,7 @@ static OptionalText_t next_split(split_iter_state_t *state)
|
||||
state->i = MAX(found + len, state->i + 1);
|
||||
return Text$slice(text, I(start+1), I(found));
|
||||
} else {
|
||||
state->i = state->state.text.length + 1;
|
||||
state->i = state->state.stack[0].text.length + 1;
|
||||
return Text$slice(text, I(start+1), I(text.length));
|
||||
}
|
||||
}
|
||||
@ -1229,7 +1229,7 @@ public Closure_t Text$by_split(Text_t text, Pattern_t pattern)
|
||||
{
|
||||
return (Closure_t){
|
||||
.fn=(void*)next_split,
|
||||
.userdata=new(split_iter_state_t, .state={text, 0, 0}, .i=0, .pattern=pattern),
|
||||
.userdata=new(split_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0, .pattern=pattern),
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -14,24 +14,7 @@
|
||||
|
||||
public Shell_t Shell$escape_text(Text_t text)
|
||||
{
|
||||
// TODO: optimize for ASCII and short strings
|
||||
Array_t shell_graphemes = {.atomic=1};
|
||||
#define add_char(c) Array$insert(&shell_graphemes, (uint32_t[1]){c}, I_small(0), sizeof(uint32_t))
|
||||
add_char('\'');
|
||||
const char *text_utf8 = Text$as_c_string(text);
|
||||
for (const char *p = text_utf8; *p; p++) {
|
||||
if (*p == '\'') {
|
||||
add_char('\'');
|
||||
add_char('"');
|
||||
add_char('\'');
|
||||
add_char('"');
|
||||
add_char('\'');
|
||||
} else
|
||||
add_char((uint8_t)*p);
|
||||
}
|
||||
add_char('\'');
|
||||
#undef add_char
|
||||
return (Text_t){.length=shell_graphemes.length, .tag=TEXT_GRAPHEMES, .graphemes=shell_graphemes.data};
|
||||
return Text$replace(text, Text("'"), Text("'\"'\"'"), Text(""), false);
|
||||
}
|
||||
|
||||
public Shell_t Shell$escape_text_array(Array_t texts)
|
||||
|
@ -500,12 +500,12 @@ public void end_test(const void *expr, const TypeInfo_t *type, const char *expec
|
||||
if (expected && expected[0]) {
|
||||
Text_t expected_text = Text$from_str(expected);
|
||||
Text_t expr_plain = USE_COLOR ? generic_as_text(expr, false, type) : expr_text;
|
||||
bool success = Text$equal(&expr_plain, &expected_text, &Text$info);
|
||||
bool success = Text$equal_values(expr_plain, expected_text);
|
||||
if (!success) {
|
||||
OptionalMatch_t colon = Text$find(expected_text, Text(":"), I_small(1));
|
||||
if (colon.index.small) {
|
||||
Text_t with_type = Text$concat(expr_plain, Text(" : "), type_name);
|
||||
success = Text$equal(&with_type, &expected_text, &Text$info);
|
||||
success = Text$equal_values(with_type, expected_text);
|
||||
}
|
||||
}
|
||||
|
||||
@ -594,7 +594,7 @@ public bool pop_flag(char **argv, int *i, const char *flag, Text_t *result)
|
||||
if (argv[*i][0] != '-' || argv[*i][1] != '-') {
|
||||
return false;
|
||||
} else if (streq(argv[*i] + 2, flag)) {
|
||||
*result = (Text_t){.length=0};
|
||||
*result = EMPTY_TEXT;
|
||||
argv[*i] = NULL;
|
||||
*i += 1;
|
||||
return true;
|
||||
|
857
stdlib/text.c
857
stdlib/text.c
File diff suppressed because it is too large
Load Diff
@ -13,18 +13,24 @@
|
||||
#include "types.h"
|
||||
#include "util.h"
|
||||
|
||||
#define MAX_TEXT_DEPTH 48
|
||||
|
||||
typedef struct {
|
||||
Text_t text;
|
||||
int64_t subtext, sum_of_previous_subtexts;
|
||||
struct {
|
||||
Text_t text;
|
||||
int64_t offset;
|
||||
} stack[MAX_TEXT_DEPTH];
|
||||
int64_t stack_index;
|
||||
} TextIter_t;
|
||||
|
||||
#define NEW_TEXT_ITER_STATE(t) (TextIter_t){.stack={{t, 0}}, .stack_index=0}
|
||||
|
||||
int printf_text(FILE *stream, const struct printf_info *info, const void *const args[]);
|
||||
int printf_text_size(const struct printf_info *info, size_t n, int argtypes[n], int sizes[n]);
|
||||
|
||||
#define Text(str) ((Text_t){.length=sizeof(str)-1, .tag=TEXT_ASCII, .ascii="" str})
|
||||
|
||||
int Text$print(FILE *stream, Text_t t);
|
||||
void Text$visualize(Text_t t);
|
||||
Text_t Text$_concat(int n, Text_t items[n]);
|
||||
#define Text$concat(...) Text$_concat(sizeof((Text_t[]){__VA_ARGS__})/sizeof(Text_t), (Text_t[]){__VA_ARGS__})
|
||||
#define Texts(...) Text$concat(__VA_ARGS__)
|
||||
@ -69,11 +75,12 @@ void Text$deserialize(FILE *in, void *out, Array_t *, const TypeInfo_t *);
|
||||
|
||||
MACROLIKE int32_t Text$get_grapheme(Text_t text, int64_t index)
|
||||
{
|
||||
TextIter_t state = {text, 0, 0};
|
||||
TextIter_t state = NEW_TEXT_ITER_STATE(text);
|
||||
return Text$get_grapheme_fast(&state, index);
|
||||
}
|
||||
|
||||
extern const TypeInfo_t Text$info;
|
||||
extern Text_t EMPTY_TEXT;
|
||||
|
||||
#define Text$metamethods ((metamethods_t){ \
|
||||
.as_text=Text$as_text, \
|
||||
|
Loading…
Reference in New Issue
Block a user