Add Text.replace_all({Pattern:Text}) and tweak API for replacement to
support placeholders
This commit is contained in:
parent
3c2c1a308b
commit
c14ed3e3e7
167
builtins/text.c
167
builtins/text.c
@ -1472,7 +1472,7 @@ public array_t Text$find_all(Text_t text, Pattern_t pattern)
|
|||||||
return matches;
|
return matches;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement)
|
public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t placeholder)
|
||||||
{
|
{
|
||||||
Text_t ret = {.length=0};
|
Text_t ret = {.length=0};
|
||||||
|
|
||||||
@ -1481,11 +1481,18 @@ public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement)
|
|||||||
int64_t len;
|
int64_t len;
|
||||||
Int_t found = Text$find(text, pattern, i, &len);
|
Int_t found = Text$find(text, pattern, i, &len);
|
||||||
if (I_is_zero(found)) break;
|
if (I_is_zero(found)) break;
|
||||||
|
|
||||||
|
Text_t replacement_text = replacement;
|
||||||
|
if (placeholder.length > 0) {
|
||||||
|
Text_t matched_text = Text$slice(text, found, Int$plus(found, Int64_to_Int(len-1)));
|
||||||
|
replacement_text = Text$replace(replacement, placeholder, matched_text, Text(""));
|
||||||
|
}
|
||||||
|
|
||||||
if (Int$compare(&found, &i, &$Text) > 0) {
|
if (Int$compare(&found, &i, &$Text) > 0) {
|
||||||
Text_t before_slice = Text$slice(text, i, Int$minus(found, I_small(1)));
|
Text_t before_slice = Text$slice(text, i, Int$minus(found, I_small(1)));
|
||||||
ret = Text$concat(ret, before_slice, replacement);
|
ret = Text$concat(ret, before_slice, replacement_text);
|
||||||
} else {
|
} else {
|
||||||
ret = concat2(ret, replacement);
|
ret = concat2(ret, replacement_text);
|
||||||
}
|
}
|
||||||
i = Int$plus(found, Int64_to_Int(len <= 0 ? 1 : len));
|
i = Int$plus(found, Int64_to_Int(len <= 0 ? 1 : len));
|
||||||
}
|
}
|
||||||
@ -1496,6 +1503,160 @@ public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement)
|
|||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Replace every place in `text` where all of `patterns` match back-to-back
// (pattern 0, immediately followed by pattern 1, and so on) with the
// corresponding entries of `replacements`.
//
// Parameters:
//   text         - the text to search.
//   patterns     - array of Pattern_t that must match consecutively.
//   replacements - array of Text_t, one per pattern; must be the same length
//                  as `patterns`, otherwise fail() is called.
//   placeholder  - if non-empty, each occurrence of this pattern inside a
//                  replacement is substituted with the text that the
//                  corresponding pattern matched.
//
// Returns: the rewritten text. `text` is returned unchanged when `patterns`
// is empty.
//
// Special case: a pattern equal to `?` that sits between a pattern ending in
// an opening quote/bracket and a pattern starting with the mirrored closing
// quote/bracket matches the *inside* of that quoted/nested pair.
//
// NOTE(review): positions (`pos`, `match_pos`, `nonmatch_pos`) are 0-based
// offsets, while Text$slice appears to take 1-based inclusive bounds, hence
// the `+1` on slice starts below -- confirm against Text$slice.
public Text_t Text$replace_chain(Text_t text, array_t patterns, array_t replacements, Pattern_t placeholder)
{
    if (patterns.length != replacements.length)
        fail("The number of patterns given (%ld) is not the same as the number of replacements (%ld)",
             patterns.length, replacements.length);

    if (patterns.length == 0) return text;

    Text_t ret = {.length=0};

    // If the first pattern begins with a plain literal grapheme (not a `{...}`
    // directive, a quote, or paired punctuation), we can scan for that
    // grapheme before running the full matcher.
    Pattern_t first_pattern = *(Pattern_t*)(patterns.data);
    int32_t first_grapheme = get_grapheme(first_pattern, 0);
    bool find_first = (first_grapheme != '{'
                       && !uc_is_property(first_grapheme, UC_PROPERTY_QUOTATION_MARK)
                       && !uc_is_property(first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));

    iteration_state_t text_state = {0, 0};

    // Everything in [nonmatch_pos, pos) is text that has been scanned but not
    // yet copied into `ret`.
    int64_t nonmatch_pos = 0;
    for (int64_t pos = 0; pos < text.length; ) {
        // Optimization: quickly skip ahead to first char in pattern:
        if (find_first) {
            while (pos < text.length && _next_grapheme(text, &text_state, pos) != first_grapheme)
                ++pos;
            // Ran off the end without seeing the first grapheme: nothing left
            // to match, so don't invoke the matcher at end-of-text.
            if (pos >= text.length) break;
        }

        // Get all match lengths. Zeroed with a loop because a variable-length
        // array may not carry an initializer before C23.
        int64_t lengths[patterns.length];
        for (int64_t i = 0; i < patterns.length; i++)
            lengths[i] = 0;

        for (int64_t i = 0, match_pos = pos; i < patterns.length; i++) {
            Pattern_t pattern = *(Pattern_t*)(patterns.data + i*patterns.stride);

            // If one of the patterns is `?` sandwiched between two pats
            if (i > 0 && i < patterns.length-1 && Text$equal(&pattern, (Pattern_t[1]){Text("?")})) {
                Pattern_t prev_pat = *(Pattern_t*)(patterns.data + (i-1)*patterns.stride);
                int32_t prev_last_grapheme = get_grapheme(prev_pat, prev_pat.length-1);
                if (prev_last_grapheme < 0) goto literal_pat;

                Pattern_t next_pat = *(Pattern_t*)(patterns.data + (i+1)*patterns.stride);
                int32_t next_first_grapheme = get_grapheme(next_pat, 0);
                if (next_first_grapheme < 0) goto literal_pat;

                // The closing delimiter must be the mirror image of the
                // opening one (or the same character if it has no mirror).
                int32_t mirrored = prev_last_grapheme;
                uc_mirror_char(prev_last_grapheme, (uint32_t*)&mirrored);

                if (next_first_grapheme != mirrored)
                    goto literal_pat;

                if ((uc_is_property_quotation_mark(prev_last_grapheme) && uc_is_property_quotation_mark(next_first_grapheme))
                    || ((uc_is_property_paired_punctuation(prev_last_grapheme)
                         && uc_is_property_paired_punctuation(next_first_grapheme)
                         && uc_is_property_left_of_pair(prev_last_grapheme)))) {
                    // $/"/, $/?/, $/"/
                    // $/(/, $/?/, $/)/

                    // Match the whole delimited group, starting one position
                    // back so the opening delimiter (already consumed by the
                    // previous pattern) is included.
                    Pattern_t matching_pair_pat = text_from_u32((uint32_t[3]){prev_last_grapheme, '?', next_first_grapheme}, 3, false);
                    int64_t enclosing_len = match(text, matching_pair_pat, match_pos-1, 0);
                    if (enclosing_len < 0) goto no_match;

                    assert(enclosing_len >= 2);
                    lengths[i] = enclosing_len - 2; // Exclude '(' and ')' or whatever delims
                    goto found_match;
                }
            }

          literal_pat:;
            // A negative length from match() is treated as "no match here".
            lengths[i] = match(text, pattern, match_pos, 0);
            if (lengths[i] < 0)
                goto no_match;

          found_match:
            match_pos += lengths[i];
        }

        // If we skipped over some non-matching text before finding a match, insert it here:
        if (pos > nonmatch_pos) {
            Text_t before_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(pos));
            ret = concat2(ret, before_slice);
        }

        // Concatenate the slices/replacements
        for (int64_t i = 0, replace_pos = pos; i < patterns.length; i++) {
            Text_t replacement = *(Text_t*)(replacements.data + i*replacements.stride);
            if (placeholder.length > 0) {
                Text_t matched_text = Text$slice(text, Int64_to_Int(replace_pos+1), Int64_to_Int(replace_pos + lengths[i]));
                replacement = Text$replace(replacement, placeholder, matched_text, Text(""));
            }

            ret = concat2(ret, replacement);
            replace_pos += lengths[i];
        }

        int64_t total_match_len = 0;
        for (int64_t i = 0; i < patterns.length; i++)
            total_match_len += lengths[i];

        // Only the text that was actually matched counts as consumed. For a
        // zero-length match we still step forward one position to guarantee
        // progress, but that grapheme stays in the pending unmatched region
        // so it is copied to the output instead of being dropped.
        nonmatch_pos = pos + total_match_len;
        pos += (total_match_len <= 0) ? 1 : total_match_len;
        continue;

      no_match:
        pos += 1;
        continue;
    }

    // Copy whatever unmatched text remains after the last match.
    if (nonmatch_pos < text.length) {
        Text_t last_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(text.length));
        ret = concat2(ret, last_slice);
    }
    return ret;
}
|
||||||
|
|
||||||
|
// Apply a table of pattern -> replacement substitutions to `text` in a single
// left-to-right pass.
//
// Parameters:
//   text         - the text to search.
//   replacements - table whose entries are read as a Pattern_t key followed
//                  by a Text_t value.
//   placeholder  - if non-empty, each occurrence of this pattern inside the
//                  chosen replacement is substituted with the matched text.
//
// Returns: the rewritten text. `text` is returned unchanged when the table
// has no entries.
//
// At each position the table entries are tried in order and the first one
// that matches wins; scanning resumes after the matched text, so replacement
// output is never re-scanned.
//
// NOTE(review): positions (`pos`, `nonmatch_pos`) are 0-based offsets, while
// Text$slice appears to take 1-based inclusive bounds, hence the `+1` on
// slice starts below -- confirm against Text$slice.
public Text_t Text$replace_all(Text_t text, table_t replacements, Pattern_t placeholder)
{
    if (replacements.entries.length == 0) return text;

    Text_t ret = {.length=0};

    // Everything in [nonmatch_pos, pos) is text that has been scanned but not
    // yet copied into `ret`.
    int64_t nonmatch_pos = 0;
    for (int64_t pos = 0; pos < text.length; ) {
        // Find the first matching pattern at this position:
        for (int64_t i = 0; i < replacements.entries.length; i++) {
            Pattern_t pattern = *(Pattern_t*)(replacements.entries.data + i*replacements.entries.stride);
            // A negative length from match() is treated as "no match here".
            int64_t len = match(text, pattern, pos, 0);
            if (len < 0) continue;

            // If we skipped over some non-matching text before finding a match, insert it here:
            if (pos > nonmatch_pos) {
                Text_t before_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(pos));
                ret = concat2(ret, before_slice);
            }

            // Concatenate the replacement. The value is read at offset
            // sizeof(Text_t) from the start of the entry, i.e. directly after
            // the key.
            Text_t replacement = *(Text_t*)(replacements.entries.data + i*replacements.entries.stride + sizeof(Text_t));
            if (placeholder.length > 0) {
                Text_t matched_text = Text$slice(text, Int64_to_Int(pos+1), Int64_to_Int(pos + len));
                replacement = Text$replace(replacement, placeholder, matched_text, Text(""));
            }
            ret = concat2(ret, replacement);

            // Only the text that was actually matched counts as consumed. For
            // a zero-length match we still step forward one position to
            // guarantee progress, but that grapheme stays in the pending
            // unmatched region so it is copied to the output instead of being
            // dropped.
            nonmatch_pos = pos + len;
            pos += len > 0 ? len : 1;
            goto next_pos;
        }

        pos += 1;
      next_pos:
        continue;
    }

    // Copy whatever unmatched text remains after the last match.
    if (nonmatch_pos < text.length) {
        Text_t last_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(text.length));
        ret = concat2(ret, last_slice);
    }
    return ret;
}
|
||||||
|
|
||||||
public array_t Text$split(Text_t text, Pattern_t pattern)
|
public array_t Text$split(Text_t text, Pattern_t pattern)
|
||||||
{
|
{
|
||||||
if (text.length == 0) // special case
|
if (text.length == 0) // special case
|
||||||
|
@ -31,7 +31,9 @@ Text_t Text$lower(Text_t text);
|
|||||||
Text_t Text$title(Text_t text);
|
Text_t Text$title(Text_t text);
|
||||||
Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info);
|
Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info);
|
||||||
Text_t Text$quoted(Text_t str, bool colorize);
|
Text_t Text$quoted(Text_t str, bool colorize);
|
||||||
Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement);
|
Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t placeholder);
|
||||||
|
Text_t Text$replace_chain(Text_t text, array_t patterns, array_t replacements, Pattern_t placeholder);
|
||||||
|
Text_t Text$replace_all(Text_t text, table_t replacements, Pattern_t placeholder);
|
||||||
array_t Text$split(Text_t text, Pattern_t pattern);
|
array_t Text$split(Text_t text, Pattern_t pattern);
|
||||||
Int_t Text$find(Text_t text, Pattern_t pattern, Int_t i, int64_t *match_length);
|
Int_t Text$find(Text_t text, Pattern_t pattern, Int_t i, int64_t *match_length);
|
||||||
array_t Text$find_all(Text_t text, Pattern_t pattern);
|
array_t Text$find_all(Text_t text, Pattern_t pattern);
|
||||||
|
100
docs/text.md
100
docs/text.md
@ -271,11 +271,13 @@ Patterns are used in a small, but very powerful API that handles many text
|
|||||||
functions that would normally be handled by a more extensive API:
|
functions that would normally be handled by a more extensive API:
|
||||||
|
|
||||||
```
|
```
|
||||||
|
Text.has(pattern:Pattern)->Bool
|
||||||
Text.find(pattern:Pattern, start=1, length=!&Int64?)->Int
|
Text.find(pattern:Pattern, start=1, length=!&Int64?)->Int
|
||||||
Text.find_all(pattern:Pattern)->[Text]
|
Text.find_all(pattern:Pattern)->[Text]
|
||||||
Text.split(pattern:Pattern)->[Text]
|
Text.split(pattern:Pattern)->[Text]
|
||||||
Text.replace(pattern:Pattern, replacement:Text)->[Text]
|
Text.replace(pattern:Pattern, replacement:Text, placeholder:Pattern=$//)->Text
|
||||||
Text.has(pattern:Pattern)->Bool
|
Text.replace_all(replacements:{Pattern:Text}, placeholder:Pattern=$//)->Text
|
||||||
|
Text.replace_chain(patterns:[Pattern], replacements:[Text], placeholder:Pattern=$//)->Text
|
||||||
```
|
```
|
||||||
|
|
||||||
See [Text Functions](#Text-Functions) for the full API documentation.
|
See [Text Functions](#Text-Functions) for the full API documentation.
|
||||||
@ -838,7 +840,7 @@ See [Patterns](#patterns) for more information about patterns.
|
|||||||
|
|
||||||
**Usage:**
|
**Usage:**
|
||||||
```tomo
|
```tomo
|
||||||
replace(text: Text, pattern: Text, replacement: Text) -> Text
|
replace(text: Text, pattern: Pattern, replacement: Text, placeholder: Pattern = $//) -> Text
|
||||||
```
|
```
|
||||||
|
|
||||||
**Parameters:**
|
**Parameters:**
|
||||||
@ -846,6 +848,8 @@ replace(text: Text, pattern: Text, replacement: Text) -> Text
|
|||||||
- `text`: The text in which to perform replacements.
|
- `text`: The text in which to perform replacements.
|
||||||
- `pattern`: The pattern to be replaced.
|
- `pattern`: The pattern to be replaced.
|
||||||
- `replacement`: The text to replace the pattern with.
|
- `replacement`: The text to replace the pattern with.
|
||||||
|
- `placeholder`: If non-empty, the replacement text will have occurrences of the placeholder
|
||||||
|
pattern replaced with the matching text.
|
||||||
|
|
||||||
**Returns:**
|
**Returns:**
|
||||||
The text with occurrences of the pattern replaced.
|
The text with occurrences of the pattern replaced.
|
||||||
@ -857,6 +861,96 @@ The text with occurrences of the pattern replaced.
|
|||||||
|
|
||||||
>> "Hello world":replace("{id}", "xxx")
|
>> "Hello world":replace("{id}", "xxx")
|
||||||
= "xxx xxx"
|
= "xxx xxx"
|
||||||
|
|
||||||
|
>> "Hello world":replace("{id}", "(@)", placeholder=$/@/)
|
||||||
|
= "(Hello) (world)"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## `replace_all`
|
||||||
|
|
||||||
|
**Description:**
|
||||||
|
Takes a table mapping patterns to replacement texts and performs all the
|
||||||
|
replacements in the table on the whole text. At each position, the first
|
||||||
|
matching pattern's replacement is applied and the pattern matching moves on to
|
||||||
|
*after* the replacement text, so replacement text is not recursively modified.
|
||||||
|
See [Patterns](#patterns) for more information about patterns.
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
```tomo
|
||||||
|
replace_all(text: Text, replacements: {Pattern:Text}, placeholder: Pattern = $//) -> Text
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
|
||||||
|
- `text`: The text in which to perform replacements.
|
||||||
|
- `replacements`: A table mapping from patterns to the replacement text
|
||||||
|
associated with that pattern.
|
||||||
|
- `placeholder`: If non-empty, the replacement text will have occurrences of
|
||||||
|
the placeholder pattern replaced with the matching text.
|
||||||
|
|
||||||
|
**Returns:**
|
||||||
|
The text with all occurrences of the patterns replaced with their corresponding
|
||||||
|
replacement text.
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```tomo
|
||||||
|
>> "A <tag> & an ampersand":replace_all({
|
||||||
|
$/&/: "&amp;",
|
||||||
|
$/</: "&lt;",
|
||||||
|
$/>/: "&gt;",
|
||||||
|
$/"/: "&quot;",
|
||||||
|
$/'/: "&#39;",
|
||||||
|
})
|
||||||
|
= "A &lt;tag&gt; &amp; an ampersand"
|
||||||
|
|
||||||
|
>> "Hello":replace_all({$/{lower}/:"[@]", $/{upper}/:"{@}"}, placeholder=$/@/)
|
||||||
|
= "{H}[ello]"
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## `replace_chain`
|
||||||
|
|
||||||
|
**Description:**
|
||||||
|
Takes an array of patterns and a corresponding array of replacement texts and
|
||||||
|
if all patterns in the patterns array match _consecutively_, then the
|
||||||
|
replacement texts are substituted for the corresponding matches. This is useful
|
||||||
|
if you want to replace parts of a match with new text while leaving other parts
|
||||||
|
unchanged.
|
||||||
|
|
||||||
|
As a special case, if a pattern `$/?/` is sandwiched between corresponding
|
||||||
|
matching quotes or matching braces, it will match the inside part of the nested
|
||||||
|
or quoted pair.
|
||||||
|
|
||||||
|
See [Patterns](#patterns) for more information about patterns.
|
||||||
|
|
||||||
|
**Usage:**
|
||||||
|
```tomo
|
||||||
|
replace_chain(text: Text, patterns: [Pattern], replacements: [Text], placeholder: Pattern = $//) -> Text
|
||||||
|
```
|
||||||
|
|
||||||
|
**Parameters:**
|
||||||
|
|
||||||
|
- `text`: The text in which to perform replacements.
|
||||||
|
- `patterns`: An array of patterns to be matched consecutively.
|
||||||
|
- `replacements`: An array of replacement texts corresponding to each pattern.
|
||||||
|
This must be the same length as `patterns` or an error will be raised.
|
||||||
|
- `placeholder`: If non-empty, the replacement text will have occurrences of
|
||||||
|
the placeholder pattern replaced with the matching text.
|
||||||
|
|
||||||
|
**Returns:**
|
||||||
|
The text with all occurrences of the patterns replaced with their corresponding
|
||||||
|
replacement text.
|
||||||
|
|
||||||
|
**Example:**
|
||||||
|
```tomo
|
||||||
|
>> " foo(blah(), 2) ":replace_chain([$/foo(/, $/?/, $/)/], ["baz(", "@", ")"], placeholder=$/@/)
|
||||||
|
= " baz(blah(), 2) "
|
||||||
|
|
||||||
|
>> " foo.field_name ":replace_chain([$/{id}/, $/.field_name/], ["@", ".other_field"], placeholder=$/@/)
|
||||||
|
= " foo.other_field "
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
@ -248,7 +248,9 @@ env_t *new_compilation_unit(CORD *libname)
|
|||||||
{"lines", "Text$lines", "func(text:Text)->[Text]"},
|
{"lines", "Text$lines", "func(text:Text)->[Text]"},
|
||||||
{"lower", "Text$lower", "func(text:Text)->Text"},
|
{"lower", "Text$lower", "func(text:Text)->Text"},
|
||||||
{"quoted", "Text$quoted", "func(text:Text, color=no)->Text"},
|
{"quoted", "Text$quoted", "func(text:Text, color=no)->Text"},
|
||||||
{"replace", "Text$replace", "func(text:Text, pattern:Pattern, replacement:Text)->Text"},
|
{"replace", "Text$replace", "func(text:Text, pattern:Pattern, replacement:Text, placeholder=$//)->Text"},
|
||||||
|
{"replace_chain", "Text$replace_chain", "func(text:Text, patterns:[Pattern], replacements:[Text], placeholder=$//)->Text"},
|
||||||
|
{"replace_all", "Text$replace_all", "func(text:Text, replacements:{Pattern:Text}, placeholder=$//)->Text"},
|
||||||
{"split", "Text$split", "func(text:Text, pattern=$Pattern'')->[Text]"},
|
{"split", "Text$split", "func(text:Text, pattern=$Pattern'')->[Text]"},
|
||||||
{"slice", "Text$slice", "func(text:Text, from=1, to=-1)->Text"},
|
{"slice", "Text$slice", "func(text:Text, from=1, to=-1)->Text"},
|
||||||
{"title", "Text$title", "func(text:Text)->Text"},
|
{"title", "Text$title", "func(text:Text)->Text"},
|
||||||
|
13
test/lang.tm
13
test/lang.tm
@ -1,11 +1,14 @@
|
|||||||
lang HTML:
|
lang HTML:
|
||||||
HEADER := $HTML"<!DOCTYPE HTML>"
|
HEADER := $HTML"<!DOCTYPE HTML>"
|
||||||
func escape(t:Text)->HTML:
|
func escape(t:Text)->HTML:
|
||||||
t = t:replace($/&/, "&")
|
t = t:replace_all({
|
||||||
t = t:replace($/</, "<")
|
$/&/: "&",
|
||||||
t = t:replace($/>/, ">")
|
$/</: "<",
|
||||||
t = t:replace($/"/, """)
|
$/>/: ">",
|
||||||
t = t:replace($/'/, "'")
|
$/"/: """,
|
||||||
|
$/'/: "'",
|
||||||
|
})
|
||||||
|
|
||||||
return HTML.from_unsafe_text(t)
|
return HTML.from_unsafe_text(t)
|
||||||
|
|
||||||
func escape_int(i:Int)->HTML:
|
func escape_int(i:Int)->HTML:
|
||||||
|
@ -227,4 +227,12 @@ func main():
|
|||||||
>> $/$malicious/
|
>> $/$malicious/
|
||||||
= $/{1{}xxx}/
|
= $/{1{}xxx}/
|
||||||
|
|
||||||
|
>> "Hello":replace($/{lower}/, "(@)", $/@/)
|
||||||
|
= "H(ello)"
|
||||||
|
|
||||||
|
>> " foo(xyz) foo(yyy) foo(z()) ":replace_chain([$/foo(/, $/?/, $/)/], ["baz[", "@", "]"], $/@/)
|
||||||
|
= " baz[xyz] baz[yyy] baz[z()] "
|
||||||
|
|
||||||
|
>> "<tag>":replace_all({$/</:"<", $/>/:">"})
|
||||||
|
= "<tag>"
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user