Add Text.replace_all({Pattern:Text}) and tweak API for replacement to

support placeholders
This commit is contained in:
Bruce Hill 2024-09-03 20:48:11 -04:00
parent 3c2c1a308b
commit c14ed3e3e7
6 changed files with 283 additions and 13 deletions

View File

@ -1472,7 +1472,7 @@ public array_t Text$find_all(Text_t text, Pattern_t pattern)
return matches;
}
public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement)
public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t placeholder)
{
Text_t ret = {.length=0};
@ -1481,11 +1481,18 @@ public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement)
int64_t len;
Int_t found = Text$find(text, pattern, i, &len);
if (I_is_zero(found)) break;
Text_t replacement_text = replacement;
if (placeholder.length > 0) {
Text_t matched_text = Text$slice(text, found, Int$plus(found, Int64_to_Int(len-1)));
replacement_text = Text$replace(replacement, placeholder, matched_text, Text(""));
}
if (Int$compare(&found, &i, &$Text) > 0) {
Text_t before_slice = Text$slice(text, i, Int$minus(found, I_small(1)));
ret = Text$concat(ret, before_slice, replacement);
ret = Text$concat(ret, before_slice, replacement_text);
} else {
ret = concat2(ret, replacement);
ret = concat2(ret, replacement_text);
}
i = Int$plus(found, Int64_to_Int(len <= 0 ? 1 : len));
}
@ -1496,6 +1503,160 @@ public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement)
return ret;
}
// Replace every place in `text` where all of `patterns` match back-to-back (pattern 0, then
// pattern 1 starting where pattern 0's match ended, and so on). Each pattern's matched span is
// replaced by the replacement text at the same index in `replacements`.
//
// Parameters:
//   text         - the text to search
//   patterns     - array of Pattern_t that must match consecutively
//   replacements - array of Text_t; must have the same length as `patterns` (otherwise fail()
//                  is called with a descriptive message)
//   placeholder  - if non-empty, each occurrence of this pattern inside a replacement text is
//                  itself replaced by the text that the corresponding pattern matched
//
// Returns: the rewritten text, or `text` itself when `patterns` is empty.
//
// Special case: a pattern equal to `?` that sits between a pattern ending in an opening
// quote/bracket and a pattern starting with the mirrored closing one is matched as "everything
// inside the delimited pair" rather than as a plain `?` pattern.
public Text_t Text$replace_chain(Text_t text, array_t patterns, array_t replacements, Text_t placeholder)
{
    // Cast so the arguments are guaranteed to match `%ld` whatever the declared type of `length` is.
    if (patterns.length != replacements.length)
        fail("The number of patterns given (%ld) is not the same as the number of replacements (%ld)",
             (long)patterns.length, (long)replacements.length);

    if (patterns.length == 0) return text;

    Text_t ret = {.length=0};

    // The skip-ahead optimization below is only used when the first pattern does not begin with
    // `{`, a quotation mark, or paired punctuation.
    // NOTE(review): presumably those graphemes can start non-literal pattern syntax -- confirm
    // against match().
    Pattern_t first_pattern = *(Pattern_t*)(patterns.data);
    int32_t first_grapheme = get_grapheme(first_pattern, 0);
    bool find_first = (first_grapheme != '{'
                       && !uc_is_property(first_grapheme, UC_PROPERTY_QUOTATION_MARK)
                       && !uc_is_property(first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));

    iteration_state_t text_state = {0, 0};
    // 0-based offset of the first grapheme that has not yet been copied into `ret`
    int64_t nonmatch_pos = 0;
    for (int64_t pos = 0; pos < text.length; ) {
        // Optimization: quickly skip ahead to first char in pattern:
        if (find_first) {
            while (pos < text.length && _next_grapheme(text, &text_state, pos) != first_grapheme)
                ++pos;
        }

        // Get all match lengths:
        int64_t lengths[patterns.length] = {};
        for (int64_t i = 0, match_pos = pos; i < patterns.length; i++) {
            Pattern_t pattern = *(Pattern_t*)(patterns.data + i*patterns.stride);

            // If one of the patterns is `?` sandwiched between two pats
            if (i > 0 && i < patterns.length-1 && Text$equal(&pattern, (Pattern_t[1]){Text("?")})) {
                Pattern_t prev_pat = *(Pattern_t*)(patterns.data + (i-1)*patterns.stride);
                int32_t prev_last_grapheme = get_grapheme(prev_pat, prev_pat.length-1);
                if (prev_last_grapheme < 0) goto literal_pat;

                Pattern_t next_pat = *(Pattern_t*)(patterns.data + (i+1)*patterns.stride);
                int32_t next_first_grapheme = get_grapheme(next_pat, 0);
                if (next_first_grapheme < 0) goto literal_pat;

                // The closing delimiter must be the mirror image of the opening one
                int32_t mirrored = prev_last_grapheme;
                uc_mirror_char(prev_last_grapheme, (uint32_t*)&mirrored);
                if (next_first_grapheme != mirrored)
                    goto literal_pat;

                if ((uc_is_property_quotation_mark(prev_last_grapheme) && uc_is_property_quotation_mark(next_first_grapheme))
                    || ((uc_is_property_paired_punctuation(prev_last_grapheme)
                         && uc_is_property_paired_punctuation(next_first_grapheme)
                         && uc_is_property_left_of_pair(prev_last_grapheme)))) {
                    // $/"/, $/?/, $/"/
                    // $/(/, $/?/, $/)/
                    Pattern_t matching_pair_pat = text_from_u32((uint32_t[3]){prev_last_grapheme, '?', next_first_grapheme}, 3, false);
                    // Re-match starting one grapheme back, i.e. at the opening delimiter that the
                    // previous pattern ended on
                    int64_t enclosing_len = match(text, matching_pair_pat, match_pos-1, 0);
                    if (enclosing_len < 0) goto no_match;
                    assert(enclosing_len >= 2);
                    lengths[i] = enclosing_len - 2; // Exclude '(' and ')' or whatever delims
                    goto found_match;
                }
            }

          literal_pat:;
            lengths[i] = match(text, pattern, match_pos, 0);
            if (lengths[i] < 0)
                goto no_match;

          found_match:
            match_pos += lengths[i];
        }

        // If we skipped over some non-matching text before finding a match, insert it here:
        if (pos > nonmatch_pos) {
            Text_t before_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(pos));
            ret = concat2(ret, before_slice);
        }

        // Concatenate the slices/replacements
        for (int64_t i = 0, replace_pos = pos; i < patterns.length; i++) {
            Text_t replacement = *(Text_t*)(replacements.data + i*replacements.stride);
            if (placeholder.length > 0) {
                Text_t matched_text = Text$slice(text, Int64_to_Int(replace_pos+1), Int64_to_Int(replace_pos + lengths[i]));
                replacement = Text$replace(replacement, placeholder, matched_text, Text(""));
            }
            ret = concat2(ret, replacement);
            replace_pos += lengths[i];
        }

        int64_t total_match_len = 0;
        for (int64_t i = 0; i < patterns.length; i++)
            total_match_len += lengths[i];

        // Uncopied text resumes exactly where the match ended. This is tracked separately from
        // `pos`, which is forced forward by at least one grapheme so that an empty match cannot
        // loop forever; the grapheme stepped over that way is still emitted later as ordinary
        // non-matching text instead of being dropped from the output.
        nonmatch_pos = pos + total_match_len;
        pos += (total_match_len <= 0) ? 1 : total_match_len;
        continue;

      no_match:
        pos += 1;
        continue;
    }

    // Copy whatever trails the final match
    if (nonmatch_pos <= text.length) {
        Text_t last_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(text.length));
        ret = concat2(ret, last_slice);
    }
    return ret;
}
// Walk through `text` once and, at each position, apply the replacement of the first table
// entry whose pattern matches there. Scanning resumes after the matched text, so inserted
// replacement text is never itself re-scanned.
//
// Parameters:
//   text         - the text to search
//   replacements - table whose entries hold a Pattern_t key immediately followed by its Text_t
//                  replacement (the value is read at offset sizeof(Text_t) within each entry)
//   placeholder  - if non-empty, each occurrence of this pattern inside the replacement text is
//                  itself replaced by the text that was matched
//
// Returns: the rewritten text, or `text` itself when the table has no entries.
public Text_t Text$replace_all(Text_t text, table_t replacements, Text_t placeholder)
{
    if (replacements.entries.length == 0) return text;

    Text_t ret = {.length=0};

    // 0-based offset of the first grapheme that has not yet been copied into `ret`
    int64_t nonmatch_pos = 0;
    for (int64_t pos = 0; pos < text.length; ) {
        // Find the first matching pattern at this position:
        for (int64_t i = 0; i < replacements.entries.length; i++) {
            Pattern_t pattern = *(Pattern_t*)(replacements.entries.data + i*replacements.entries.stride);
            int64_t len = match(text, pattern, pos, 0);
            if (len < 0) continue;

            // If we skipped over some non-matching text before finding a match, insert it here:
            if (pos > nonmatch_pos) {
                Text_t before_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(pos));
                ret = concat2(ret, before_slice);
            }

            // Concatenate the replacement:
            Text_t replacement = *(Text_t*)(replacements.entries.data + i*replacements.entries.stride + sizeof(Text_t));
            if (placeholder.length > 0) {
                Text_t matched_text = Text$slice(text, Int64_to_Int(pos+1), Int64_to_Int(pos + len));
                replacement = Text$replace(replacement, placeholder, matched_text, Text(""));
            }
            ret = concat2(ret, replacement);

            // Uncopied text resumes exactly where the match ended. `pos` is advanced by at least
            // one grapheme so an empty match cannot loop forever, but that grapheme must still
            // reach the output, so `nonmatch_pos` is not moved past it.
            nonmatch_pos = pos + len;
            pos += len > 0 ? len : 1;
            goto next_pos;
        }

        // No pattern matched here; try the next position
        pos += 1;

      next_pos:
        continue;
    }

    // Copy whatever trails the final match
    if (nonmatch_pos <= text.length) {
        Text_t last_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(text.length));
        ret = concat2(ret, last_slice);
    }
    return ret;
}
public array_t Text$split(Text_t text, Pattern_t pattern)
{
if (text.length == 0) // special case

View File

@ -31,7 +31,9 @@ Text_t Text$lower(Text_t text);
Text_t Text$title(Text_t text);
Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info);
Text_t Text$quoted(Text_t str, bool colorize);
Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement);
Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t placeholder);
Text_t Text$replace_chain(Text_t text, array_t patterns, array_t replacements, Pattern_t placeholder);
Text_t Text$replace_all(Text_t text, table_t replacements, Pattern_t placeholder);
array_t Text$split(Text_t text, Pattern_t pattern);
Int_t Text$find(Text_t text, Pattern_t pattern, Int_t i, int64_t *match_length);
array_t Text$find_all(Text_t text, Pattern_t pattern);

View File

@ -271,11 +271,13 @@ Patterns are used in a small, but very powerful API that handles many text
functions that would normally be handled by a more extensive API:
```
Text.has(pattern:Pattern)->Bool
Text.find(pattern:Pattern, start=1, length=!&Int64?)->Int
Text.find_all(pattern:Pattern)->[Text]
Text.split(pattern:Pattern)->[Text]
Text.replace(pattern:Pattern, replacement:Text)->[Text]
Text.has(pattern:Pattern)->Bool
Text.replace(pattern:Pattern, replacement:Text, placeholder:Pattern=$//)->Text
Text.replace_all(replacements:{Pattern:Text}, placeholder:Pattern=$//)->Text
Text.replace_chain(patterns:[Pattern], replacements:[Text], placeholder:Pattern=$//)->Text
```
See [Text Functions](#Text-Functions) for the full API documentation.
@ -838,7 +840,7 @@ See [Patterns](#patterns) for more information about patterns.
**Usage:**
```tomo
replace(text: Text, pattern: Text, replacement: Text) -> Text
replace(text: Text, pattern: Pattern, replacement: Text, placeholder: Pattern = $//) -> Text
```
**Parameters:**
@ -846,6 +848,8 @@ replace(text: Text, pattern: Text, replacement: Text) -> Text
- `text`: The text in which to perform replacements.
- `pattern`: The pattern to be replaced.
- `replacement`: The text to replace the pattern with.
- `placeholder`: If non-empty, the replacement text will have occurrences of the placeholder
pattern replaced with the matching text.
**Returns:**
The text with occurrences of the pattern replaced.
@ -857,6 +861,96 @@ The text with occurrences of the pattern replaced.
>> "Hello world":replace("{id}", "xxx")
= "xxx xxx"
>> "Hello world":replace("{id}", "(@)", placeholder=$/@/)
= "(Hello) (world)"
```
---
## `replace_all`
**Description:**
Takes a table mapping patterns to replacement texts and performs all the
replacements in the table on the whole text. At each position, the first
matching pattern's replacement is applied and the pattern matching moves on to
*after* the replacement text, so replacement text is not recursively modified.
See [Patterns](#patterns) for more information about patterns.
**Usage:**
```tomo
replace_all(text: Text, replacements: {Pattern:Text}, placeholder: Pattern = $//) -> Text
```
**Parameters:**
- `text`: The text in which to perform replacements.
- `replacements`: A table mapping from patterns to the replacement text
associated with that pattern.
- `placeholder`: If non-empty, the replacement text will have occurrences of
the placeholder pattern replaced with the matching text.
**Returns:**
The text with all occurrences of the patterns replaced with their corresponding
replacement text.
**Example:**
```tomo
>> "A <tag> & an ampersand":replace_all({
    $/&/: "&amp;",
    $/</: "&lt;",
    $/>/: "&gt;",
    $/"/: "&quot;",
    $/'/: "&#39;",
})
= "A &lt;tag&gt; &amp; an ampersand"
>> "Hello":replace_all({$/{lower}/:"[@]", $/{upper}/:"{@}"}, placeholder=$/@/)
= "{H}[ello]"
```
---
## `replace_chain`
**Description:**
Takes an array of patterns and a corresponding array of replacement texts and
if all patterns in the patterns array match _consecutively_, then the
replacement texts are substituted for the corresponding matches. This is useful
if you want to replace parts of a match with new text while leaving other parts
unchanged.
As a special case, if a pattern `$/?/` is sandwiched between corresponding
matching quotes or matching braces, it will match the inside part of the nested
or quoted pair.
See [Patterns](#patterns) for more information about patterns.
**Usage:**
```tomo
replace_chain(text: Text, patterns: [Pattern], replacements: [Text], placeholder: Pattern = $//) -> Text
```
**Parameters:**
- `text`: The text in which to perform replacements.
- `patterns`: An array of patterns to be matched consecutively.
- `replacements`: An array of replacement texts corresponding to each pattern.
This must be the same length as `patterns` or an error will be raised.
- `placeholder`: If non-empty, the replacement text will have occurrences of
the placeholder pattern replaced with the matching text.
**Returns:**
The text with all occurrences of the patterns replaced with their corresponding
replacement text.
**Example:**
```tomo
>> " foo(blah(), 2) ":replace_chain([$/foo(/, $/?/, $/)/], ["baz(", "@", ")"], placeholder=$/@/)
= " baz(blah(), 2) "
>> " foo.field_name ":replace_chain([$/{id}/, $/.field_name/], ["@", ".other_field"], placeholder=$/@/)
= " foo.other_field "
```
---

View File

@ -248,7 +248,9 @@ env_t *new_compilation_unit(CORD *libname)
{"lines", "Text$lines", "func(text:Text)->[Text]"},
{"lower", "Text$lower", "func(text:Text)->Text"},
{"quoted", "Text$quoted", "func(text:Text, color=no)->Text"},
{"replace", "Text$replace", "func(text:Text, pattern:Pattern, replacement:Text)->Text"},
{"replace", "Text$replace", "func(text:Text, pattern:Pattern, replacement:Text, placeholder=$//)->Text"},
{"replace_chain", "Text$replace_chain", "func(text:Text, patterns:[Pattern], replacements:[Text], placeholder=$//)->Text"},
{"replace_all", "Text$replace_all", "func(text:Text, replacements:{Pattern:Text}, placeholder=$//)->Text"},
{"split", "Text$split", "func(text:Text, pattern=$Pattern'')->[Text]"},
{"slice", "Text$slice", "func(text:Text, from=1, to=-1)->Text"},
{"title", "Text$title", "func(text:Text)->Text"},

View File

@ -1,11 +1,14 @@
lang HTML:
HEADER := $HTML"<!DOCTYPE HTML>"
func escape(t:Text)->HTML:
    t = t:replace_all({
        $/&/: "&amp;",
        $/</: "&lt;",
        $/>/: "&gt;",
        $/"/: "&quot;",
        $/'/: "&#39;",
    })
    return HTML.from_unsafe_text(t)
func escape_int(i:Int)->HTML:

View File

@ -227,4 +227,12 @@ func main():
>> $/$malicious/
= $/{1{}xxx}/
>> "Hello":replace($/{lower}/, "(@)", $/@/)
= "H(ello)"
>> " foo(xyz) foo(yyy) foo(z()) ":replace_chain([$/foo(/, $/?/, $/)/], ["baz[", "@", "]"], $/@/)
= " baz[xyz] baz[yyy] baz[z()] "
>> "<tag>":replace_all({$/</:"&lt;", $/>/:"&gt;"})
= "&lt;tag&gt;"