Add recursive mode to text replacement and update docs

This commit is contained in:
Bruce Hill 2024-09-03 23:16:45 -04:00
parent 02dbcbf8b5
commit b8bb4ada8b
5 changed files with 63 additions and 24 deletions

View File

@ -1051,6 +1051,7 @@ int64_t match_uri(Text_t text, int64_t text_index)
// One captured span recorded by match() while matching a pattern against text.
typedef struct {
// Offset and length of the captured text inside the text being matched;
// apply_backrefs() slices it out as (index+1) .. (index+length).
int64_t index, length;
// occupied: this slot really holds a capture (a backreference to an unoccupied
// slot is reported as an error).
// recursive: the captured text may have the same pattern/replacement re-applied
// to it when a recursive replacement is requested.
bool occupied, recursive;
} capture_t;
int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t pattern_index, capture_t *captures, int64_t capture_index)
@ -1077,8 +1078,10 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
// Save this as a capture, including only the interior text:
if (captures && capture_index < MAX_BACKREFS) {
captures[capture_index++] = (capture_t){
start_of_quoted_text,
text_index - start_of_quoted_text,
.index=start_of_quoted_text,
.length=text_index - start_of_quoted_text,
.occupied=true,
.recursive=false,
};
}
@ -1116,8 +1119,10 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
// Save this as a capture, including only the interior text:
if (captures && capture_index < MAX_BACKREFS) {
captures[capture_index++] = (capture_t){
start_of_interior,
text_index - start_of_interior - 1,
.index=start_of_interior,
.length=text_index - start_of_interior - 1,
.occupied=true,
.recursive=true,
};
}
@ -1171,8 +1176,10 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
#define SUCCESS() ({ \
if (captures && capture_index < MAX_BACKREFS) { \
captures[capture_index++] = (capture_t){ \
before_group, \
(text_index - before_group), \
.index=before_group, \
.length=(text_index - before_group), \
.occupied=true, \
.recursive=false, \
}; \
}; continue; 0; })
if (prop_name) {
@ -1303,8 +1310,10 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
// Save this as a capture, including only the interior text:
if (captures && capture_index < MAX_BACKREFS) {
captures[capture_index++] = (capture_t){
before_group,
(text_index - before_group) + match_len,
.index=before_group,
.length=(text_index - before_group) + match_len,
.occupied=true,
.recursive=false,
};
}
return (text_index - start_index) + match_len;
@ -1340,8 +1349,10 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
// Save this as a capture, including only the interior text:
if (captures && capture_index < MAX_BACKREFS) {
captures[capture_index++] = (capture_t){
before_group,
(text_index - before_group) + match_len,
.index=before_group,
.length=(text_index - before_group) + match_len,
.occupied=true,
.recursive=false,
};
}
@ -1525,7 +1536,7 @@ public array_t Text$find_all(Text_t text, Pattern_t pattern)
return matches;
}
static Text_t apply_backrefs(Text_t text, Text_t replacement, Pattern_t backref_pat, capture_t *captures)
static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t replacement, Pattern_t backref_pat, capture_t *captures)
{
if (backref_pat.length == 0)
return replacement;
@ -1563,7 +1574,14 @@ static Text_t apply_backrefs(Text_t text, Text_t replacement, Pattern_t backref_
if (_next_grapheme(replacement, &state, pos + backref_len) == ';')
backref_len += 1; // skip optional semicolon
if (!captures[backref].occupied)
fail("There is no capture number %ld!", backref);
Text_t backref_text = Text$slice(text, I(captures[backref].index+1), I(captures[backref].index + captures[backref].length));
if (captures[backref].recursive && original_pattern.length > 0)
backref_text = Text$replace(backref_text, original_pattern, replacement, backref_pat, true);
if (pos > nonmatching_pos) {
Text_t before_slice = Text$slice(replacement, I(nonmatching_pos+1), I(pos));
ret = Text$concat(ret, before_slice, backref_text);
@ -1581,7 +1599,7 @@ static Text_t apply_backrefs(Text_t text, Text_t replacement, Pattern_t backref_
return ret;
}
public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t backref_pat)
public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t backref_pat, bool recursive)
{
Text_t ret = {.length=0};
@ -1602,10 +1620,12 @@ public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, P
capture_t captures[MAX_BACKREFS] = {};
int64_t match_len = match(text, pattern, pos, 0, captures, 1);
if (match_len < 0) continue;
captures[0].index = pos;
captures[0].length = match_len;
captures[0] = (capture_t){
.index = pos, .length = match_len,
.occupied = true, .recursive = false,
};
Text_t replacement_text = apply_backrefs(text, replacement, backref_pat, captures);
Text_t replacement_text = apply_backrefs(text, recursive ? pattern : Text(""), replacement, backref_pat, captures);
if (pos > nonmatching_pos) {
Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos));
ret = Text$concat(ret, before_slice, replacement_text);
@ -1622,7 +1642,7 @@ public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, P
return ret;
}
public Text_t Text$replace_all(Text_t text, table_t replacements, Text_t backref_pat)
public Text_t Text$replace_all(Text_t text, table_t replacements, Text_t backref_pat, bool recursive)
{
if (replacements.entries.length == 0) return text;
@ -1647,7 +1667,7 @@ public Text_t Text$replace_all(Text_t text, table_t replacements, Text_t backref
// Concatenate the replacement:
Text_t replacement = *(Text_t*)(replacements.entries.data + i*replacements.entries.stride + sizeof(Text_t));
Text_t replacement_text = apply_backrefs(text, replacement, backref_pat, captures);
Text_t replacement_text = apply_backrefs(text, recursive ? pattern : Text(""), replacement, backref_pat, captures);
ret = concat2(ret, replacement_text);
pos += len > 0 ? len : 1;
nonmatch_pos = pos;

View File

@ -31,8 +31,8 @@ Text_t Text$lower(Text_t text);
Text_t Text$title(Text_t text);
Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info);
Text_t Text$quoted(Text_t str, bool colorize);
Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t backref_pat);
Text_t Text$replace_all(Text_t text, table_t replacements, Pattern_t backref_pat);
Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t backref_pat, bool recursive);
Text_t Text$replace_all(Text_t text, table_t replacements, Pattern_t backref_pat, bool recursive);
array_t Text$split(Text_t text, Pattern_t pattern);
Int_t Text$find(Text_t text, Pattern_t pattern, Int_t i, int64_t *match_length);
array_t Text$find_all(Text_t text, Pattern_t pattern);

View File

@ -840,7 +840,7 @@ See [Patterns](#patterns) for more information about patterns.
**Usage:**
```tomo
replace(text: Text, pattern: Text, replacement: Text, backref: Pattern = $/\/) -> Text
replace(text: Text, pattern: Pattern, replacement: Text, backref: Pattern = $/\/, recursive: Bool = yes) -> Text
```
**Parameters:**
@ -852,6 +852,9 @@ replace(text: Text, pattern: Text, replacement: Text, backref: Pattern = $/\/) -
pattern followed by a number replaced with the corresponding backreference.
By default, the backreference pattern is a single backslash, so
backreferences look like `\0`, `\1`, etc.
- `recursive`: Controls how backreferences to nested captures are handled. If
`recursive` is `yes` (the default), the same pattern and replacement are
applied recursively to the captured text before it is inserted into the
replacement. If it is `no`, the captured text is inserted unchanged.
**Backreferences**
If a backreference pattern is in the replacement, then that backreference is
@ -879,11 +882,18 @@ The text with occurrences of the pattern replaced.
>> "Hello world":replace($/{id}/, "(\0)")
= "(Hello) (world)"
>> "Hello world":replace($/{id}/, "(@0)", backref=$/@/)
= "(Hello) (world)"
>> "Hello world":replace($/{id} {id}/, "just \2")
= "just world"
>> " foo(x, fn(), y) ":replace($/foo(?)/, "baz(\1)")
= " baz(x, fn(), y) "
# Recursive is the default behavior:
>> " BAD(x, BAD(y), z) ":replace($/BAD(?)/, "good(\1)", recursive=yes)
= " good(x, good(y), z) "
>> " BAD(x, BAD(y), z) ":replace($/BAD(?)/, "good(\1)", recursive=no)
= " good(x, BAD(y), z) "
```
---
@ -911,6 +921,9 @@ replace_all(replacements:{Pattern:Text}, backref: Pattern = $/\/) -> Text
pattern followed by a number replaced with the corresponding backreference.
By default, the backreference pattern is a single backslash, so
backreferences look like `\0`, `\1`, etc.
- `recursive`: Controls how backreferences to nested captures are handled. If
`recursive` is `yes` (the default), the pattern that matched and its
replacement are applied recursively to the captured text before it is inserted
into the replacement. If it is `no`, the captured text is inserted unchanged.
**Returns:**
The text with all occurrences of the patterns replaced with their corresponding

View File

@ -248,8 +248,8 @@ env_t *new_compilation_unit(CORD *libname)
{"lines", "Text$lines", "func(text:Text)->[Text]"},
{"lower", "Text$lower", "func(text:Text)->Text"},
{"quoted", "Text$quoted", "func(text:Text, color=no)->Text"},
{"replace", "Text$replace", "func(text:Text, pattern:Pattern, replacement:Text, placeholder=$/\\/)->Text"},
{"replace_all", "Text$replace_all", "func(text:Text, replacements:{Pattern:Text}, placeholder=$/\\/)->Text"},
{"replace", "Text$replace", "func(text:Text, pattern:Pattern, replacement:Text, backref=$/\\/, recursive=yes)->Text"},
{"replace_all", "Text$replace_all", "func(text:Text, replacements:{Pattern:Text}, backref=$/\\/, recursive=yes)->Text"},
{"split", "Text$split", "func(text:Text, pattern=$Pattern'')->[Text]"},
{"slice", "Text$slice", "func(text:Text, from=1, to=-1)->Text"},
{"title", "Text$title", "func(text:Text)->Text"},

View File

@ -236,3 +236,9 @@ func main():
>> "<tag>":replace_all({$/</:"&lt;", $/>/:"&gt;"})
= "&lt;tag&gt;"
>> " BAD(x, fn(y), BAD(z), w) ":replace($/BAD(?)/, "good(\1)", recursive=yes)
= " good(x, fn(y), good(z), w) "
>> " BAD(x, fn(y), BAD(z), w) ":replace($/BAD(?)/, "good(\1)", recursive=no)
= " good(x, fn(y), BAD(z), w) "