about summary refs log tree commit diff
path: root/stdlib/patterns.c
diff options
context:
space:
mode:
Diffstat (limited to 'stdlib/patterns.c')
-rw-r--r-- stdlib/patterns.c 87
1 files changed, 66 insertions, 21 deletions
diff --git a/stdlib/patterns.c b/stdlib/patterns.c
index fdc7a79f..a5ca6971 100644
--- a/stdlib/patterns.c
+++ b/stdlib/patterns.c
@@ -380,7 +380,7 @@ static int64_t match_id(TextIter_t *state, int64_t index)
static int64_t match_int(TextIter_t *state, int64_t index)
{
int64_t len = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
- return len >= 0 ? len : -1;
+ return len > 0 ? len : -1; // only a positive len is returned; a zero-length result now becomes -1 (presumably len is the number of decimal-digit graphemes EAT_MANY consumed -- confirm against the macro)
}
static int64_t match_alphanumeric(TextIter_t *state, int64_t index)
@@ -769,7 +769,7 @@ static int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t
#undef EAT2
#undef EAT_MANY
-static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last, int64_t *match_length)
+static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last, int64_t *match_length, capture_t *captures)
{
int32_t first_grapheme = Text$get_grapheme(pattern, 0);
bool find_first = (first_grapheme != '{'
@@ -784,7 +784,7 @@ static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last
++i;
}
- int64_t m = match(text, i, pattern, 0, NULL, 0);
+ int64_t m = match(text, i, pattern, 0, captures, 0);
if (m >= 0) {
if (match_length)
*match_length = m;
@@ -796,15 +796,30 @@ static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last
return -1;
}
-public OptionalInt_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index)
+public OptionalMatch_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index) // Search `text` for `pattern` starting at the 1-based `from_index` (a negative value counts back from the end); returns the matched text, its 1-based index and its captures, or NULL_MATCH when nothing is found.
{
int64_t first = Int_to_Int64(from_index, false);
if (first == 0) fail("Invalid index: 0");
if (first < 0) first = text.length + first + 1;
if (first > text.length || first < 1)
- return NULL_INT;
- int64_t found = _find(text, pattern, first-1, text.length-1, NULL);
- return found == -1 ? NULL_INT : I(found+1);
+ return NULL_MATCH; // the start position falls outside the text
+
+ capture_t captures[MAX_BACKREFS] = {}; // zero-initialized, so untouched slots read as not occupied
+ int64_t len = 0; // receives the match length from _find
+ int64_t found = _find(text, pattern, first-1, text.length-1, &len, captures); // _find is given 0-based positions
+ if (found == -1)
+ return NULL_MATCH; // no match at or after `first`
+
+ Array_t capture_array = {};
+ for (int i = 0; i < MAX_BACKREFS && captures[i].occupied; i++) { // bounded: a completely full capture table must not be read past its end
+ Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); // 0-based offset and length turned into 1-based first/last bounds
+ Array$insert(&capture_array, &capture, I(0), sizeof(Text_t));
+ }
+ return (OptionalMatch_t){
+ .text=Text$slice(text, I(found+1), I(found+len)),
+ .index=I(found+1), // the reported index is 1-based
+ .captures=capture_array,
+ };
}
PUREFUNC public bool Text$has(Text_t text, Pattern_t pattern)
@@ -820,7 +835,7 @@ PUREFUNC public bool Text$has(Text_t text, Pattern_t pattern)
}
return false;
} else {
- int64_t found = _find(text, pattern, 0, text.length-1, NULL);
+ int64_t found = _find(text, pattern, 0, text.length-1, NULL, NULL);
return (found >= 0);
}
}
@@ -846,16 +861,13 @@ public Array_t Text$find_all(Text_t text, Pattern_t pattern)
return (Array_t){.length=0};
Array_t matches = {};
-
- for (int64_t i = 0; ; ) {
- int64_t len = 0;
- int64_t found = _find(text, pattern, i, text.length-1, &len);
- if (found < 0) break;
- Text_t match = Text$slice(text, I(found+1), I(found + len));
- Array$insert(&matches, &match, I_small(0), sizeof(Text_t));
- i = found + MAX(len, 1);
+ for (int64_t i = 1; ; ) {
+ OptionalMatch_t m = Text$find(text, pattern, I(i));
+ if (!m.index.small)
+ break;
+ i = Int_to_Int64(m.index, false) + m.text.length;
+ Array$insert(&matches, &m, I_small(0), sizeof(Match_t));
}
-
return matches;
}
@@ -999,7 +1011,7 @@ public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn)
TextIter_t text_state = {text, 0, 0};
int64_t nonmatching_pos = 0;
- Text_t (*text_mapper)(Text_t, void*) = fn.fn;
+ Text_t (*text_mapper)(Match_t, void*) = fn.fn;
for (int64_t pos = 0; pos < text.length; pos++) {
// Optimization: quickly skip ahead to first char in pattern:
if (find_first) {
@@ -1007,10 +1019,21 @@ public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn)
++pos;
}
- int64_t match_len = match(text, pos, pattern, 0, NULL, 0);
+ capture_t captures[MAX_BACKREFS] = {};
+ int64_t match_len = match(text, pos, pattern, 0, captures, 0);
if (match_len < 0) continue;
- Text_t replacement = text_mapper(Text$slice(text, I(pos+1), I(pos+match_len)), fn.userdata);
+ Match_t m = {
+ .text=Text$slice(text, I(pos+1), I(pos+match_len)),
+ .index=I(pos+1),
+ .captures={},
+ };
+ for (int i = 0; captures[i].occupied; i++) {
+ Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length));
+ Array$insert(&m.captures, &capture, I(0), sizeof(Text_t));
+ }
+
+ Text_t replacement = text_mapper(m, fn.userdata);
if (pos > nonmatching_pos) {
Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos));
ret = Text$concat(ret, before_slice, replacement);
@@ -1084,7 +1107,7 @@ public Array_t Text$split(Text_t text, Pattern_t pattern)
int64_t i = 0;
for (;;) {
int64_t len = 0;
- int64_t found = _find(text, pattern, i, text.length-1, &len);
+ int64_t found = _find(text, pattern, i, text.length-1, &len, NULL);
if (found < 0) break;
Text_t chunk = Text$slice(text, I(i+1), I(found));
Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t));
@@ -1097,5 +1120,27 @@ public Array_t Text$split(Text_t text, Pattern_t pattern)
return chunks;
}
+public const TypeInfo_t Pattern$info = { // type descriptor for Pattern_t
+ .size=sizeof(Pattern_t),
+ .align=__alignof__(Pattern_t),
+ .tag=TextInfo, // tagged as text; the TextInfo member below supplies the language name
+ .TextInfo={.lang="Pattern"},
+};
+
+static NamedType_t _match_fields[3] = { // name/type-descriptor pairs used as the field table of Match$info
+ {"text", &Text$info},
+ {"index", &Int$info},
+ {"captures", Array$info(&Text$info)}, // presumably the descriptor for an array of Text values -- confirm against Array$info
+};
+public const TypeInfo_t Match$info = { // type descriptor for Match_t, described as a struct named "Match"
+ .size=sizeof(Match_t),
+ .align=__alignof__(Match_t),
+ .tag=StructInfo,
+ .StructInfo={
+ .name="Match",
+ .num_fields=3, // must stay equal to the number of entries in _match_fields
+ .fields=_match_fields,
+ },
+};
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0