diff --git a/builtins/text.c b/builtins/text.c index 1bffbf2..c318b2c 100644 --- a/builtins/text.c +++ b/builtins/text.c @@ -1472,7 +1472,7 @@ public array_t Text$find_all(Text_t text, Pattern_t pattern) return matches; } -public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement) +public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t placeholder) { Text_t ret = {.length=0}; @@ -1481,11 +1481,18 @@ public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement) int64_t len; Int_t found = Text$find(text, pattern, i, &len); if (I_is_zero(found)) break; + + Text_t replacement_text = replacement; + if (placeholder.length > 0) { + Text_t matched_text = Text$slice(text, found, Int$plus(found, Int64_to_Int(len-1))); + replacement_text = Text$replace(replacement, placeholder, matched_text, Text("")); + } + if (Int$compare(&found, &i, &$Text) > 0) { Text_t before_slice = Text$slice(text, i, Int$minus(found, I_small(1))); - ret = Text$concat(ret, before_slice, replacement); + ret = Text$concat(ret, before_slice, replacement_text); } else { - ret = concat2(ret, replacement); + ret = concat2(ret, replacement_text); } i = Int$plus(found, Int64_to_Int(len <= 0 ? 
1 : len));
     }
@@ -1496,6 +1503,178 @@ public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement)
     return ret;
 }
 
+// Replace every place in `text` where all of `patterns` match consecutively,
+// substituting replacements[i] for the text matched by patterns[i]. Fails if
+// the two arrays differ in length; returns `text` unchanged if they are empty.
+// If `placeholder` is non-empty, each occurrence of it inside a replacement is
+// itself replaced with the text that the corresponding pattern matched.
+public Text_t Text$replace_chain(Text_t text, array_t patterns, array_t replacements, Pattern_t placeholder)
+{
+    if (patterns.length != replacements.length)
+        fail("The number of patterns given (%ld) is not the same as the number of replacements (%ld)",
+             patterns.length, replacements.length);
+
+    if (patterns.length == 0) return text;
+
+    Text_t ret = {.length=0};
+
+    Pattern_t first_pattern = *(Pattern_t*)(patterns.data);
+    int32_t first_grapheme = get_grapheme(first_pattern, 0);
+    bool find_first = (first_grapheme != '{'
+                       && !uc_is_property(first_grapheme, UC_PROPERTY_QUOTATION_MARK)
+                       && !uc_is_property(first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
+
+    iteration_state_t text_state = {0, 0};
+
+    int64_t nonmatch_pos = 0;
+    for (int64_t pos = 0; pos < text.length; ) {
+        // Optimization: quickly skip ahead to first char in pattern:
+        if (find_first) {
+            while (pos < text.length && _next_grapheme(text, &text_state, pos) != first_grapheme)
+                ++pos;
+            if (pos >= text.length) break; // Nothing left that could begin a match
+        }
+
+        // Get all match lengths:
+        int64_t lengths[patterns.length] = {};
+        for (int64_t i = 0, match_pos = pos; i < patterns.length; i++) {
+            Pattern_t pattern = *(Pattern_t*)(patterns.data + i*patterns.stride);
+
+            // If one of the patterns is `?` sandwiched between two pats
+            if (i > 0 && i < patterns.length-1 && Text$equal(&pattern, (Pattern_t[1]){Text("?")})) {
+                Pattern_t prev_pat = *(Pattern_t*)(patterns.data + (i-1)*patterns.stride);
+                int32_t prev_last_grapheme = get_grapheme(prev_pat, prev_pat.length-1);
+                if (prev_last_grapheme < 0) goto literal_pat;
+
+                Pattern_t next_pat = *(Pattern_t*)(patterns.data + (i+1)*patterns.stride);
+                int32_t next_first_grapheme = get_grapheme(next_pat, 0);
+                if (next_first_grapheme < 0) goto literal_pat;
+
+                int32_t mirrored = prev_last_grapheme;
+                uc_mirror_char(prev_last_grapheme, (uint32_t*)&mirrored);
+
+                if (next_first_grapheme != mirrored)
+                    goto literal_pat;
+
+                if ((uc_is_property_quotation_mark(prev_last_grapheme) && uc_is_property_quotation_mark(next_first_grapheme))
+                    || ((uc_is_property_paired_punctuation(prev_last_grapheme)
+                         && uc_is_property_paired_punctuation(next_first_grapheme)
+                         && uc_is_property_left_of_pair(prev_last_grapheme)))) {
+                    // $/"/, $/?/, $/"/
+                    // $/(/, $/?/, $/)/
+
+                    Pattern_t matching_pair_pat = text_from_u32((uint32_t[3]){prev_last_grapheme, '?', next_first_grapheme}, 3, false);
+                    // Start one position back so the pair pattern begins on the opening delimiter:
+                    int64_t enclosing_len = match(text, matching_pair_pat, match_pos-1, 0);
+                    if (enclosing_len < 0) goto no_match;
+
+                    assert(enclosing_len >= 2);
+                    lengths[i] = enclosing_len - 2; // Exclude '(' and ')' or whatever delims
+                    goto found_match;
+                }
+            }
+
+          literal_pat:;
+            lengths[i] = match(text, pattern, match_pos, 0);
+            if (lengths[i] < 0)
+                goto no_match;
+
+          found_match:
+            match_pos += lengths[i];
+        }
+
+        // If we skipped over some non-matching text before finding a match, insert it here:
+        if (pos > nonmatch_pos) {
+            Text_t before_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(pos));
+            ret = concat2(ret, before_slice);
+        }
+
+        // Concatenate the slices/replacements
+        for (int64_t i = 0, replace_pos = pos; i < patterns.length; i++) {
+            Text_t replacement = *(Text_t*)(replacements.data + i*replacements.stride);
+            if (placeholder.length > 0) {
+                Text_t matched_text = Text$slice(text, Int64_to_Int(replace_pos+1), Int64_to_Int(replace_pos + lengths[i]));
+                replacement = Text$replace(replacement, placeholder, matched_text, Text(""));
+            }
+
+            ret = concat2(ret, replacement);
+            replace_pos += lengths[i];
+        }
+
+        int64_t total_match_len = 0;
+        for (int64_t i = 0; i < patterns.length; i++)
+            total_match_len += lengths[i];
+
+        // The match ends at pos + total_match_len. On a zero-length match we
+        // still step forward one position to guarantee progress, but that
+        // position was not consumed, so it must stay in the unmatched text:
+        nonmatch_pos = pos + total_match_len;
+        pos += (total_match_len <= 0) ? 1 : total_match_len;
+        continue;
+
+      no_match:
+        pos += 1;
+        continue;
+    }
+    if (nonmatch_pos <= text.length) {
+        Text_t last_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(text.length));
+        ret = concat2(ret, last_slice);
+    }
+    return ret;
+}
+
+// Apply a table of pattern -> replacement entries to `text`. At each position
+// the entries are tried in order and the first one that matches is used;
+// scanning then resumes after the matched text, so replacement text is never
+// re-scanned. If `placeholder` is non-empty, each occurrence of it inside the
+// replacement is itself replaced with the matched text.
+public Text_t Text$replace_all(Text_t text, table_t replacements, Pattern_t placeholder)
+{
+    if (replacements.entries.length == 0) return text;
+
+    Text_t ret = {.length=0};
+
+    int64_t nonmatch_pos = 0;
+    for (int64_t pos = 0; pos < text.length; ) {
+        // Find the first matching pattern at this position:
+        for (int64_t i = 0; i < replacements.entries.length; i++) {
+            Pattern_t pattern = *(Pattern_t*)(replacements.entries.data + i*replacements.entries.stride);
+            int64_t len = match(text, pattern, pos, 0);
+            if (len < 0) continue;
+
+            // If we skipped over some non-matching text before finding a match, insert it here:
+            if (pos > nonmatch_pos) {
+                Text_t before_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(pos));
+                ret = concat2(ret, before_slice);
+            }
+
+            // Concatenate the replacement:
+            // (the replacement is read from the same entry, one Text_t past the pattern)
+            Text_t replacement = *(Text_t*)(replacements.entries.data + i*replacements.entries.stride + sizeof(Text_t));
+            if (placeholder.length > 0) {
+                Text_t matched_text = Text$slice(text, Int64_to_Int(pos+1), Int64_to_Int(pos + len));
+                replacement = Text$replace(replacement, placeholder, matched_text, Text(""));
+            }
+            ret = concat2(ret, replacement);
+            // A zero-length match still advances one position to guarantee
+            // progress, but the skipped position was not consumed, so keep it:
+            nonmatch_pos = pos + len;
+            pos += len > 0 ? len : 1;
+            goto next_pos;
+        }
+
+        pos += 1;
+      next_pos:
+        continue;
+    }
+
+    if (nonmatch_pos <= text.length) {
+        Text_t last_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(text.length));
+        ret = concat2(ret, last_slice);
+    }
+    return ret;
+}
+
 public array_t Text$split(Text_t text, Pattern_t pattern)
 {
     if (text.length == 0) // special case
diff --git a/builtins/text.h b/builtins/text.h
index ec65a7f..7c7e243 100644
--- a/builtins/text.h
+++ b/builtins/text.h
@@ -31,7 +31,9 @@ Text_t Text$lower(Text_t text);
 Text_t Text$title(Text_t text);
 Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info);
 Text_t Text$quoted(Text_t str, bool colorize);
-Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement);
+Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t placeholder);
+Text_t Text$replace_chain(Text_t text, array_t patterns, array_t replacements, Pattern_t placeholder);
+Text_t Text$replace_all(Text_t text, table_t replacements, Pattern_t placeholder);
 array_t Text$split(Text_t text, Pattern_t pattern);
 Int_t Text$find(Text_t text, Pattern_t pattern, Int_t i, int64_t *match_length);
 array_t Text$find_all(Text_t text, Pattern_t pattern);
diff --git a/docs/text.md b/docs/text.md
index cf60f6a..47f60e3 100644
--- a/docs/text.md
+++ b/docs/text.md
@@ -271,11 +271,13 @@ Patterns are used in a small, but very powerful API that handles many text
 functions that would normally be handled by a more extensive API:
 
 ```
+Text.has(pattern:Pattern)->Bool
 Text.find(pattern:Pattern, start=1, length=!&Int64?)->Int
 Text.find_all(pattern:Pattern)->[Text]
 Text.split(pattern:Pattern)->[Text]
-Text.replace(pattern:Pattern, replacement:Text)->[Text]
-Text.has(pattern:Pattern)->Bool
+Text.replace(pattern:Pattern, replacement:Text, placeholder:Pattern=$//)->Text
+Text.replace_all(replacements:{Pattern:Text}, placeholder:Pattern=$//)->Text
+Text.replace_chain(patterns:[Pattern], replacements:[Text], placeholder:Pattern=$//)->Text
 ```
 
 See [Text Functions](#Text-Functions) for the full API documentation.
@@ -838,7 +840,7 @@ See [Patterns](#patterns) for more information about patterns.
 
 **Usage:**
 ```tomo
-replace(text: Text, pattern: Text, replacement: Text) -> Text
+replace(text: Text, pattern: Pattern, replacement: Text, placeholder: Pattern = $//) -> Text
 ```
 
 **Parameters:**
@@ -846,6 +848,8 @@ replace(text: Text, pattern: Text, replacement: Text) -> Text
 - `text`: The text in which to perform replacements.
 - `pattern`: The pattern to be replaced.
 - `replacement`: The text to replace the pattern with.
+- `placeholder`: If non-empty, the replacement text will have occurrences of the placeholder
+  pattern replaced with the matching text.
 
 **Returns:**
 The text with occurrences of the pattern replaced.
@@ -857,6 +861,96 @@ The text with occurrences of the pattern replaced.
 
 >> "Hello world":replace("{id}", "xxx")
 = "xxx xxx"
+
+>> "Hello world":replace("{id}", "(@)", placeholder=$/@/)
+= "(Hello) (world)"
+```
+
+---
+
+## `replace_all`
+
+**Description:**
+Takes a table mapping patterns to replacement texts and performs all the
+replacements in the table on the whole text. At each position, the first
+matching pattern's replacement is applied and the pattern matching moves on to
+*after* the replacement text, so replacement text is not recursively modified.
+See [Patterns](#patterns) for more information about patterns.
+
+**Usage:**
+```tomo
+replace_all(text: Text, replacements:{Pattern:Text}, placeholder: Pattern = $//) -> Text
+```
+
+**Parameters:**
+
+- `text`: The text in which to perform replacements.
+- `replacements`: A table mapping from patterns to the replacement text
+  associated with that pattern.
+- `placeholder`: If non-empty, the replacement text will have occurrences of
+  the placeholder pattern replaced with the matching text.
+
+**Returns:**
+The text with all occurrences of the patterns replaced with their corresponding
+replacement text.
+
+**Example:**
+```tomo
+>> "A <tag> & an ampersand":replace_all({
+    $/&/: "&amp;",
+    $/</: "&lt;",
+    $/>/: "&gt;",
+    $/"/: "&quot;",
+    $/'/: "&#39;",
+})
+= "A &lt;tag&gt; &amp; an ampersand"
+
+>> "Hello":replace_all({$/{lower}/:"[@]", $/{upper}/:"{@}"}, placeholder=$/@/)
+= "{H}[ello]"
+```
+
+---
+
+## `replace_chain`
+
+**Description:**
+Takes an array of patterns and a corresponding array of replacement texts and
+if all patterns in the patterns array match _consecutively_, then the
+replacement texts are substituted for the corresponding matches. This is useful
+if you want to replace parts of a match with new text while leaving other parts
+unchanged.
+
+As a special case, if a pattern `$/?/` is sandwiched between corresponding
+matching quotes or matching braces, it will match the inside part of the nested
+or quoted pair.
+
+See [Patterns](#patterns) for more information about patterns.
+
+**Usage:**
+```tomo
+replace_chain(text: Text, patterns:[Pattern], replacements:[Text], placeholder: Pattern = $//) -> Text
+```
+
+**Parameters:**
+
+- `text`: The text in which to perform replacements.
+- `patterns`: An array of patterns to be matched consecutively.
+- `replacements`: An array of replacement texts corresponding to each pattern.
+  This must be the same length as `patterns` or an error will be raised.
+- `placeholder`: If non-empty, the replacement text will have occurrences of
+  the placeholder pattern replaced with the matching text.
+
+**Returns:**
+The text with all occurrences of the patterns replaced with their corresponding
+replacement text.
+ +**Example:** +```tomo +>> " foo(blah(), 2) ":replace_chain([$/foo(/, $/?/, $/)/], ["baz(", "@", ")"], placeholder=$/@/) += " baz(blah(), 2) " + +>> " foo.field_name ":replace_chain([$/{id}/, $/.field_name/], ["@", ".other_field"], placeholder=$/@/) += " foo.other_field " ``` --- diff --git a/environment.c b/environment.c index 1eee6ec..431672b 100644 --- a/environment.c +++ b/environment.c @@ -248,7 +248,9 @@ env_t *new_compilation_unit(CORD *libname) {"lines", "Text$lines", "func(text:Text)->[Text]"}, {"lower", "Text$lower", "func(text:Text)->Text"}, {"quoted", "Text$quoted", "func(text:Text, color=no)->Text"}, - {"replace", "Text$replace", "func(text:Text, pattern:Pattern, replacement:Text)->Text"}, + {"replace", "Text$replace", "func(text:Text, pattern:Pattern, replacement:Text, placeholder=$//)->Text"}, + {"replace_chain", "Text$replace_chain", "func(text:Text, patterns:[Pattern], replacements:[Text], placeholder=$//)->Text"}, + {"replace_all", "Text$replace_all", "func(text:Text, replacements:{Pattern:Text}, placeholder=$//)->Text"}, {"split", "Text$split", "func(text:Text, pattern=$Pattern'')->[Text]"}, {"slice", "Text$slice", "func(text:Text, from=1, to=-1)->Text"}, {"title", "Text$title", "func(text:Text)->Text"}, diff --git a/test/lang.tm b/test/lang.tm index e2093a6..e23410e 100644 --- a/test/lang.tm +++ b/test/lang.tm @@ -1,11 +1,14 @@ lang HTML: HEADER := $HTML"" func escape(t:Text)->HTML: - t = t:replace($/&/, "&") - t = t:replace($//, ">") - t = t:replace($/"/, """) - t = t:replace($/'/, "'") + t = t:replace_all({ + $/&/: "&", + $//: ">", + $/"/: """, + $/'/: "'", + }) + return HTML.from_unsafe_text(t) func escape_int(i:Int)->HTML: diff --git a/test/text.tm b/test/text.tm index 52c74af..d75a56a 100644 --- a/test/text.tm +++ b/test/text.tm @@ -227,4 +227,12 @@ func main(): >> $/$malicious/ = $/{1{}xxx}/ + >> "Hello":replace($/{lower}/, "(@)", $/@/) + = "H(ello)" + + >> " foo(xyz) foo(yyy) foo(z()) ":replace_chain([$/foo(/, $/?/, $/)/], ["baz[", "@", 
"]"], $/@/) + = " baz[xyz] baz[yyy] baz[z()] " + + >> "":replace_all({$//:">"}) + = "<tag>"