diff options
Diffstat (limited to 'stdlib/patterns.c')
| -rw-r--r-- | stdlib/patterns.c | 87 |
1 files changed, 66 insertions, 21 deletions
diff --git a/stdlib/patterns.c b/stdlib/patterns.c index fdc7a79f..a5ca6971 100644 --- a/stdlib/patterns.c +++ b/stdlib/patterns.c @@ -380,7 +380,7 @@ static int64_t match_id(TextIter_t *state, int64_t index) static int64_t match_int(TextIter_t *state, int64_t index) { int64_t len = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); - return len >= 0 ? len : -1; + return len > 0 ? len : -1; } static int64_t match_alphanumeric(TextIter_t *state, int64_t index) @@ -769,7 +769,7 @@ static int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t #undef EAT2 #undef EAT_MANY -static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last, int64_t *match_length) +static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last, int64_t *match_length, capture_t *captures) { int32_t first_grapheme = Text$get_grapheme(pattern, 0); bool find_first = (first_grapheme != '{' @@ -784,7 +784,7 @@ static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last ++i; } - int64_t m = match(text, i, pattern, 0, NULL, 0); + int64_t m = match(text, i, pattern, 0, captures, 0); if (m >= 0) { if (match_length) *match_length = m; @@ -796,15 +796,30 @@ static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last return -1; } -public OptionalInt_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index) +public OptionalMatch_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index) { int64_t first = Int_to_Int64(from_index, false); if (first == 0) fail("Invalid index: 0"); if (first < 0) first = text.length + first + 1; if (first > text.length || first < 1) - return NULL_INT; - int64_t found = _find(text, pattern, first-1, text.length-1, NULL); - return found == -1 ? NULL_INT : I(found+1); + return NULL_MATCH; + + capture_t captures[MAX_BACKREFS] = {}; + int64_t len = 0; + int64_t found = _find(text, pattern, first-1, text.length-1, &len, captures); + if (found == -1) + return NULL_MATCH; + + Array_t capture_array = {}; + for (int i = 0; captures[i].occupied; i++) { + Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); + Array$insert(&capture_array, &capture, I(0), sizeof(Text_t)); + } + return (OptionalMatch_t){ + .text=Text$slice(text, I(found+1), I(found+len)), + .index=I(found+1), + .captures=capture_array, + }; } PUREFUNC public bool Text$has(Text_t text, Pattern_t pattern) @@ -820,7 +835,7 @@ PUREFUNC public bool Text$has(Text_t text, Pattern_t pattern) } return false; } else { - int64_t found = _find(text, pattern, 0, text.length-1, NULL); + int64_t found = _find(text, pattern, 0, text.length-1, NULL, NULL); return (found >= 0); } } @@ -846,16 +861,13 @@ public Array_t Text$find_all(Text_t text, Pattern_t pattern) return (Array_t){.length=0}; Array_t matches = {}; - - for (int64_t i = 0; ; ) { - int64_t len = 0; - int64_t found = _find(text, pattern, i, text.length-1, &len); - if (found < 0) break; - Text_t match = Text$slice(text, I(found+1), I(found + len)); - Array$insert(&matches, &match, I_small(0), sizeof(Text_t)); - i = found + MAX(len, 1); + for (int64_t i = 1; ; ) { + OptionalMatch_t m = Text$find(text, pattern, I(i)); + if (!m.index.small) + break; + i = Int_to_Int64(m.index, false) + m.text.length; + Array$insert(&matches, &m, I_small(0), sizeof(Match_t)); } - return matches; } @@ -999,7 +1011,7 @@ public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn) TextIter_t text_state = {text, 0, 0}; int64_t nonmatching_pos = 0; - Text_t (*text_mapper)(Text_t, void*) = fn.fn; + Text_t (*text_mapper)(Match_t, void*) = fn.fn; for (int64_t pos = 0; pos < text.length; pos++) { // Optimization: quickly skip ahead to first char in pattern: if (find_first) { @@ -1007,10 +1019,21 @@ public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn) ++pos; } - int64_t match_len = match(text, pos, pattern, 0, NULL, 0); + capture_t captures[MAX_BACKREFS] = {}; + int64_t match_len = match(text, pos, pattern, 0, captures, 0); if (match_len < 0) continue; - Text_t replacement = text_mapper(Text$slice(text, I(pos+1), I(pos+match_len)), fn.userdata); + Match_t m = { + .text=Text$slice(text, I(pos+1), I(pos+match_len)), + .index=I(pos+1), + .captures={}, + }; + for (int i = 0; captures[i].occupied; i++) { + Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); + Array$insert(&m.captures, &capture, I(0), sizeof(Text_t)); + } + + Text_t replacement = text_mapper(m, fn.userdata); if (pos > nonmatching_pos) { Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos)); ret = Text$concat(ret, before_slice, replacement); @@ -1084,7 +1107,7 @@ public Array_t Text$split(Text_t text, Pattern_t pattern) int64_t i = 0; for (;;) { int64_t len = 0; - int64_t found = _find(text, pattern, i, text.length-1, &len); + int64_t found = _find(text, pattern, i, text.length-1, &len, NULL); if (found < 0) break; Text_t chunk = Text$slice(text, I(i+1), I(found)); Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t)); @@ -1097,5 +1120,27 @@ public Array_t Text$split(Text_t text, Pattern_t pattern) return chunks; } +public const TypeInfo_t Pattern$info = { + .size=sizeof(Pattern_t), + .align=__alignof__(Pattern_t), + .tag=TextInfo, + .TextInfo={.lang="Pattern"}, +}; + +static NamedType_t _match_fields[3] = { + {"text", &Text$info}, + {"index", &Int$info}, + {"captures", Array$info(&Text$info)}, +}; +public const TypeInfo_t Match$info = { + .size=sizeof(Match_t), + .align=__alignof__(Match_t), + .tag=StructInfo, + .StructInfo={ + .name="Match", + .num_fields=3, + .fields=_match_fields, + }, +}; // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 |
