Add text:by_line()/:by_split()/:by_match()

author: Bruce Hill <bruce@bruce-hill.com> 2024-12-21 16:32:22 -0500
committer: Bruce Hill <bruce@bruce-hill.com> 2024-12-21 16:32:22 -0500
commit: 46b61d3ed2ae5bd5f74c9d580f5501b1226d9f4e (patch)
tree: 15256a67533f176332fec7a3feb56c6b4b2e4b23 /stdlib/patterns.c
parent: 325b367a1342826fe7174ce45cfab92091d4dbb5 (diff)
1 files changed, 77 insertions, 1 deletions
diff --git a/stdlib/patterns.c b/stdlib/patterns.c
index 66855d2d..48f43aed 100644
--- a/stdlib/patterns.c
+++ b/stdlib/patterns.c
@@ -873,6 +873,33 @@ public Array_t Text$find_all(Text_t text, Pattern_t pattern)
     return matches;
 }
 
+typedef struct {
+    TextIter_t state;
+    Int_t i;
+    Pattern_t pattern;
+} match_iter_state_t;
+
+static OptionalMatch_t next_match(match_iter_state_t *state)
+{
+    if (Int_to_Int64(state->i, false) > state->state.text.length)
+        return NONE_MATCH;
+
+    OptionalMatch_t m = Text$find(state->state.text, state->pattern, state->i);
+    if (m.index.small == 0) // No match
+        state->i = I(state->state.text.length + 1);
+    else
+        state->i = Int$plus(m.index, I(MAX(1, m.text.length)));
+    return m;
+}
+
+public Closure_t Text$by_match(Text_t text, Pattern_t pattern)
+{
+    return (Closure_t){
+        .fn=(void*)next_match,
+        .userdata=new(match_iter_state_t, .state={text, 0, 0}, .i=I_small(1), .pattern=pattern),
+    };
+}
+
 static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t replacement, Pattern_t backref_pat, capture_t *captures)
 {
     if (backref_pat.length == 0)
@@ -1145,10 +1172,12 @@ public Array_t Text$split(Text_t text, Pattern_t pattern)
     for (;;) {
         int64_t len = 0;
         int64_t found = _find(text, pattern, i, text.length-1, &len, NULL);
+        if (found == i && len == 0)
+            found = _find(text, pattern, i + 1, text.length-1, &len, NULL);
         if (found < 0) break;
         Text_t chunk = Text$slice(text, I(i+1), I(found));
         Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t));
-        i = found + MAX(len, 1);
+        i = MAX(found + len, i + 1);
     }
 
     Text_t last_chunk = Text$slice(text, I(i+1), I(text.length));
@@ -1157,6 +1186,53 @@ public Array_t Text$split(Text_t text, Pattern_t pattern)
     return chunks;
 }
 
+typedef struct {
+    TextIter_t state;
+    int64_t i;
+    Pattern_t pattern;
+} split_iter_state_t;
+
+static OptionalText_t next_split(split_iter_state_t *state)
+{
+    Text_t text = state->state.text;
+    if (state->i >= text.length) {
+        if (state->pattern.length > 0 && state->i == text.length) { // special case
+            state->i = text.length + 1;
+            return (Text_t){.length=0};
+        }
+        return NONE_TEXT;
+    }
+
+    if (state->pattern.length == 0) { // special case
+        Text_t ret = Text$cluster(text, I(state->i+1));
+        state->i += 1;
+        return ret;
+    }
+
+    int64_t start = state->i;
+    int64_t len = 0;
+    int64_t found = _find(text, state->pattern, start, text.length-1, &len, NULL);
+
+    if (found == start && len == 0)
+        found = _find(text, state->pattern, start + 1, text.length-1, &len, NULL);
+
+    if (found >= 0) {
+        state->i = MAX(found + len, state->i + 1);
+        return Text$slice(text, I(start+1), I(found));
+    } else {
+        state->i = state->state.text.length + 1;
+        return Text$slice(text, I(start+1), I(text.length));
+    }
+}
+
+public Closure_t Text$by_split(Text_t text, Pattern_t pattern)
+{
+    return (Closure_t){
+        .fn=(void*)next_split,
+        .userdata=new(split_iter_state_t, .state={text, 0, 0}, .i=0, .pattern=pattern),
+    };
+}
+
 public const TypeInfo_t Pattern$info = {
     .size=sizeof(Pattern_t),
     .align=__alignof__(Pattern_t),
author	Bruce Hill <bruce@bruce-hill.com>	2024-12-21 16:32:22 -0500
committer	Bruce Hill <bruce@bruce-hill.com>	2024-12-21 16:32:22 -0500
commit	46b61d3ed2ae5bd5f74c9d580f5501b1226d9f4e (patch)
tree	15256a67533f176332fec7a3feb56c6b4b2e4b23 /stdlib/patterns.c
parent	325b367a1342826fe7174ce45cfab92091d4dbb5 (diff)