Overhaul of Text implementation to be more like Cords and have much better performance for long sequences of repeated concatenation.
author: Bruce Hill <bruce@bruce-hill.com> 2025-01-23 15:33:56 -0500
committer: Bruce Hill <bruce@bruce-hill.com> 2025-01-23 15:33:56 -0500
commit: f93dde14496ef784df6b7b3e1de1030a868be985 (patch)
tree: e4f5bcc1852d13e2f2d853a1f6590ccdd93e18a2 /stdlib/patterns.c
parent: c60ea2079fb230213308904cd0966e5481d2d994 (diff)
1 files changed, 30 insertions, 30 deletions
diff --git a/stdlib/patterns.c b/stdlib/patterns.c
index 48f43aed..bee84760 100644
--- a/stdlib/patterns.c
+++ b/stdlib/patterns.c
@@ -36,7 +36,7 @@ typedef struct {
 
 static INLINE void skip_whitespace(TextIter_t *state, int64_t *i)
 {
-    while (*i < state->text.length) {
+    while (*i < state->stack[0].text.length) {
         int32_t grapheme = Text$get_grapheme_fast(state, *i);
         if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme))
             return;
@@ -46,7 +46,7 @@ static INLINE void skip_whitespace(TextIter_t *state, int64_t *i)
 
 static INLINE bool match_grapheme(TextIter_t *state, int64_t *i, int32_t grapheme)
 {
-    if (*i < state->text.length && Text$get_grapheme_fast(state, *i) == grapheme) {
+    if (*i < state->stack[0].text.length && Text$get_grapheme_fast(state, *i) == grapheme) {
         *i += 1;
         return true;
     }
@@ -57,7 +57,7 @@ static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str)
 {
     int64_t matched = 0;
     while (matched[str]) {
-        if (*i + matched >= state->text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched])
+        if (*i + matched >= state->stack[0].text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched])
             return false;
         matched += 1;
     }
@@ -67,7 +67,7 @@ static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str)
 
 static INLINE bool match_property(TextIter_t *state, int64_t *i, uc_property_t prop)
 {
-    if (*i >= state->text.length) return false;
+    if (*i >= state->stack[0].text.length) return false;
     uint32_t grapheme = Text$get_main_grapheme_fast(state, *i);
     // TODO: check every codepoint in the cluster?
     if (uc_is_property(grapheme, prop)) {
@@ -95,7 +95,7 @@ static const char *get_property_name(TextIter_t *state, int64_t *i)
     skip_whitespace(state, i);
     char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
     char *dest = name;
-    while (*i < state->text.length) {
+    while (*i < state->stack[0].text.length) {
         int32_t grapheme = Text$get_grapheme_fast(state, *i);
         if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) {
             *dest = (char)grapheme;
@@ -406,10 +406,10 @@ static int64_t match_num(TextIter_t *state, int64_t index)
 
 static int64_t match_newline(TextIter_t *state, int64_t index)
 {
-    if (index >= state->text.length)
+    if (index >= state->stack[0].text.length)
         return -1;
 
-    uint32_t grapheme = index >= state->text.length ? 0 : Text$get_main_grapheme_fast(state, index);
+    uint32_t grapheme = index >= state->stack[0].text.length ? 0 : Text$get_main_grapheme_fast(state, index);
     if (grapheme == '\n')
         return 1;
     if (grapheme == '\r' && Text$get_grapheme_fast(state, index + 1) == '\n')
@@ -419,7 +419,7 @@ static int64_t match_newline(TextIter_t *state, int64_t index)
 
 static int64_t match_pat(TextIter_t *state, int64_t index, pat_t pat)
 {
-    Text_t text = state->text;
+    Text_t text = state->stack[0].text;
     int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(state, index);
 
     switch (pat.tag) {
@@ -516,7 +516,7 @@ static pat_t parse_next_pat(TextIter_t *state, int64_t *index)
         int32_t close = open;
         uc_mirror_char((ucs4_t)open, (ucs4_t*)&close);
         if (!match_grapheme(state, index, close))
-            fail("Pattern's closing quote is missing: %k", &state->text);
+            fail("Pattern's closing quote is missing: %k", &state->stack[0].text);
 
         return (pat_t){
             .tag=PAT_QUOTE,
@@ -531,7 +531,7 @@ static pat_t parse_next_pat(TextIter_t *state, int64_t *index)
         int32_t close = open;
         uc_mirror_char((ucs4_t)open, (ucs4_t*)&close);
         if (!match_grapheme(state, index, close))
-            fail("Pattern's closing brace is missing: %k", &state->text);
+            fail("Pattern's closing brace is missing: %k", &state->stack[0].text);
         
         return (pat_t){
             .tag=PAT_PAIR,
@@ -571,19 +571,19 @@ static pat_t parse_next_pat(TextIter_t *state, int64_t *index)
             skip_whitespace(state, index);
             int32_t grapheme = Text$get_grapheme_fast(state, (*index)++);
             if (!match_grapheme(state, index, '}'))
-                fail("Missing closing '}' in pattern: %k", &state->text);
+                fail("Missing closing '}' in pattern: %k", &state->stack[0].text);
             return PAT(PAT_GRAPHEME, .grapheme=grapheme);
         } else if (strlen(prop_name) == 1) {
             // Single letter names: {1+ A}
             skip_whitespace(state, index);
             if (!match_grapheme(state, index, '}'))
-                fail("Missing closing '}' in pattern: %k", &state->text);
+                fail("Missing closing '}' in pattern: %k", &state->stack[0].text);
             return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]);
         }
 
         skip_whitespace(state, index);
         if (!match_grapheme(state, index, '}'))
-            fail("Missing closing '}' in pattern: %k", &state->text);
+            fail("Missing closing '}' in pattern: %k", &state->stack[0].text);
 
         switch (tolower(prop_name[0])) {
         case '.':
@@ -677,7 +677,7 @@ static int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t
         return 0;
 
     int64_t start_index = text_index;
-    TextIter_t pattern_state = {pattern, 0, 0}, text_state = {text, 0, 0};
+    TextIter_t pattern_state = NEW_TEXT_ITER_STATE(pattern), text_state = NEW_TEXT_ITER_STATE(text);
     pat_t pat = parse_next_pat(&pattern_state, &pattern_index);
 
     if (pat.min == -1 && pat.max == -1) {
@@ -778,7 +778,7 @@ static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last
                        && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
                        && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
 
-    TextIter_t text_state = {text, 0, 0};
+    TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
     for (int64_t i = first; i <= last; i++) {
         // Optimization: quickly skip ahead to first char in pattern:
         if (find_first) {
@@ -881,12 +881,12 @@ typedef struct {
 
 static OptionalMatch_t next_match(match_iter_state_t *state)
 {
-    if (Int_to_Int64(state->i, false) > state->state.text.length)
+    if (Int_to_Int64(state->i, false) > state->state.stack[0].text.length)
         return NONE_MATCH;
 
-    OptionalMatch_t m = Text$find(state->state.text, state->pattern, state->i);
+    OptionalMatch_t m = Text$find(state->state.stack[0].text, state->pattern, state->i);
     if (m.index.small == 0) // No match
-        state->i = I(state->state.text.length + 1);
+        state->i = I(state->state.stack[0].text.length + 1);
     else
         state->i = Int$plus(m.index, I(MAX(1, m.text.length)));
     return m;
@@ -896,7 +896,7 @@ public Closure_t Text$by_match(Text_t text, Pattern_t pattern)
 {
     return (Closure_t){
         .fn=(void*)next_match,
-        .userdata=new(match_iter_state_t, .state={text, 0, 0}, .i=I_small(1), .pattern=pattern),
+        .userdata=new(match_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=I_small(1), .pattern=pattern),
     };
 }
 
@@ -911,7 +911,7 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep
                        && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
 
     Text_t ret = Text("");
-    TextIter_t replacement_state = {replacement, 0, 0};
+    TextIter_t replacement_state = NEW_TEXT_ITER_STATE(replacement);
     int64_t nonmatching_pos = 0;
     for (int64_t pos = 0; pos < replacement.length; ) {
         // Optimization: quickly skip ahead to first char in the backref pattern:
@@ -965,14 +965,14 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep
 
 public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t backref_pat, bool recursive)
 {
-    Text_t ret = {.length=0};
+    Text_t ret = EMPTY_TEXT;
 
     int32_t first_grapheme = Text$get_grapheme(pattern, 0);
     bool find_first = (first_grapheme != '{'
                        && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
                        && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
 
-    TextIter_t text_state = {text, 0, 0};
+    TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
     int64_t nonmatching_pos = 0;
     for (int64_t pos = 0; pos < text.length; ) {
         // Optimization: quickly skip ahead to first char in pattern:
@@ -1030,14 +1030,14 @@ public Text_t Text$trim(Text_t text, Pattern_t pattern, bool trim_left, bool tri
 
 public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn)
 {
-    Text_t ret = {.length=0};
+    Text_t ret = EMPTY_TEXT;
 
     int32_t first_grapheme = Text$get_grapheme(pattern, 0);
     bool find_first = (first_grapheme != '{'
                        && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
                        && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
 
-    TextIter_t text_state = {text, 0, 0};
+    TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
     int64_t nonmatching_pos = 0;
 
     Text_t (*text_mapper)(Match_t, void*) = fn.fn;
@@ -1086,7 +1086,7 @@ public void Text$each(Text_t text, Pattern_t pattern, Closure_t fn)
                        && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
                        && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
 
-    TextIter_t text_state = {text, 0, 0};
+    TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
     void (*action)(Match_t, void*) = fn.fn;
     for (int64_t pos = 0; pos < text.length; pos++) {
         // Optimization: quickly skip ahead to first char in pattern:
@@ -1118,7 +1118,7 @@ public Text_t Text$replace_all(Text_t text, Table_t replacements, Text_t backref
 {
     if (replacements.entries.length == 0) return text;
 
-    Text_t ret = {.length=0};
+    Text_t ret = EMPTY_TEXT;
 
     int64_t nonmatch_pos = 0;
     for (int64_t pos = 0; pos < text.length; ) {
@@ -1194,11 +1194,11 @@ typedef struct {
 
 static OptionalText_t next_split(split_iter_state_t *state)
 {
-    Text_t text = state->state.text;
+    Text_t text = state->state.stack[0].text;
     if (state->i >= text.length) {
         if (state->pattern.length > 0 && state->i == text.length) { // special case
             state->i = text.length + 1;
-            return (Text_t){.length=0};
+            return EMPTY_TEXT;
         }
         return NONE_TEXT;
     }
@@ -1220,7 +1220,7 @@ static OptionalText_t next_split(split_iter_state_t *state)
         state->i = MAX(found + len, state->i + 1);
         return Text$slice(text, I(start+1), I(found));
     } else {
-        state->i = state->state.text.length + 1;
+        state->i = state->state.stack[0].text.length + 1;
         return Text$slice(text, I(start+1), I(text.length));
     }
 }
@@ -1229,7 +1229,7 @@ public Closure_t Text$by_split(Text_t text, Pattern_t pattern)
 {
     return (Closure_t){
         .fn=(void*)next_split,
-        .userdata=new(split_iter_state_t, .state={text, 0, 0}, .i=0, .pattern=pattern),
+        .userdata=new(split_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0, .pattern=pattern),
     };
 }
author	Bruce Hill <bruce@bruce-hill.com>	2025-01-23 15:33:56 -0500
committer	Bruce Hill <bruce@bruce-hill.com>	2025-01-23 15:33:56 -0500
commit	f93dde14496ef784df6b7b3e1de1030a868be985 (patch)
tree	e4f5bcc1852d13e2f2d853a1f6590ccdd93e18a2 /stdlib/patterns.c
parent	c60ea2079fb230213308904cd0966e5481d2d994 (diff)