Add Text.replace_all({Pattern:Text}) and tweak API for replacement to
support placeholders
2024-09-03 20:48:11 -04:00 · 2024-09-03 20:48:11 -04:00 · c14ed3e3e7
commit c14ed3e3e7
parent 3c2c1a308b
6 changed files with 283 additions and 13 deletions
--- a/builtins/text.c
+++ b/builtins/text.c
@ -1472,7 +1472,7 @@ public array_t Text$find_all(Text_t text, Pattern_t pattern)
    return matches;
 }
-public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement)
+public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t placeholder)
 {
    Text_t ret = {.length=0};
@ -1481,11 +1481,18 @@ public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement)
        int64_t len;
        Int_t found = Text$find(text, pattern, i, &len);
        if (I_is_zero(found)) break;
        Text_t replacement_text = replacement;
        if (placeholder.length > 0) {
            Text_t matched_text = Text$slice(text, found, Int$plus(found, Int64_to_Int(len-1)));
            replacement_text = Text$replace(replacement, placeholder, matched_text, Text(""));
        }
        if (Int$compare(&found, &i, &$Text) > 0) {
            Text_t before_slice = Text$slice(text, i, Int$minus(found, I_small(1)));
-            ret = Text$concat(ret, before_slice, replacement);
+            ret = Text$concat(ret, before_slice, replacement_text);
        } else {
-            ret = concat2(ret, replacement);
+            ret = concat2(ret, replacement_text);
        }
        i = Int$plus(found, Int64_to_Int(len <= 0 ? 1 : len));
    }
@ -1496,6 +1503,160 @@ public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement)
    return ret;
 }
 // Replace every spot in `text` where all of `patterns` match one after another
 // (pattern i+1 must match exactly where pattern i's match ended). Each matched
 // piece is replaced by the entry of `replacements` at the same index; text
 // between chain matches is copied through unchanged.
 //
 // Parameters:
 //   text         - the text to search.
 //   patterns     - array of Pattern_t values that must match consecutively.
 //   replacements - array of Text_t values, one per pattern.
 //   placeholder  - if non-empty, each replacement first has occurrences of
 //                  this pattern replaced with the text its own pattern matched.
 //
 // Returns the rewritten text, or `text` itself when `patterns` is empty.
 // Calls fail() when the two arrays differ in length.
 //
 // NOTE(review): the prototype in builtins/text.h declares `placeholder` as
 // Pattern_t while this definition uses Text_t -- presumably the two names are
 // aliases; confirm and make the spellings agree.
 public Text_t Text$replace_chain(Text_t text, array_t patterns, array_t replacements, Text_t placeholder)
 {
    if (patterns.length != replacements.length)
        fail("The number of patterns given (%ld) is not the same as the number of replacements (%ld)",
             patterns.length, replacements.length);
    if (patterns.length == 0) return text;
    Text_t ret = {.length=0};
    Pattern_t first_pattern = *(Pattern_t*)(patterns.data);
    int32_t first_grapheme = get_grapheme(first_pattern, 0);
    // The skip-ahead scan below is only enabled when the first pattern does not
    // begin with '{', a quotation mark, or paired punctuation.
    // NOTE(review): presumably those graphemes can start non-literal pattern
    // syntax, so a literal scan for them would be wrong -- confirm against match().
    bool find_first = (first_grapheme != '{'
                       && !uc_is_property(first_grapheme, UC_PROPERTY_QUOTATION_MARK)
                       && !uc_is_property(first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
    iteration_state_t text_state = {0, 0};
    // 0-based offset just past the last emitted match; everything from here up
    // to the next match still has to be copied into `ret`.
    int64_t nonmatch_pos = 0;
    for (int64_t pos = 0; pos < text.length; ) {
        // Optimization: quickly skip ahead to first char in pattern:
        if (find_first) {
            while (pos < text.length && _next_grapheme(text, &text_state, pos) != first_grapheme)
                ++pos;
        }
        // Get all match lengths:
        // (lengths[i] is how much text pattern i consumed; match_pos is where
        // the next pattern in the chain has to start matching.)
        int64_t lengths[patterns.length] = {};
        for (int64_t i = 0, match_pos = pos; i < patterns.length; i++) {
            Pattern_t pattern = *(Pattern_t*)(patterns.data + i*patterns.stride);
            // Special case: a lone `?` pattern that is neither first nor last
            // and sits between an opening delimiter (last grapheme of the
            // previous pattern) and its corresponding closing delimiter (first
            // grapheme of the next pattern). Any failed check below falls back
            // to matching `?` as an ordinary pattern.
            if (i > 0 && i < patterns.length-1 && Text$equal(&pattern, (Pattern_t[1]){Text("?")})) {
                Pattern_t prev_pat = *(Pattern_t*)(patterns.data + (i-1)*patterns.stride);
                int32_t prev_last_grapheme = get_grapheme(prev_pat, prev_pat.length-1);
                if (prev_last_grapheme < 0) goto literal_pat;
                Pattern_t next_pat = *(Pattern_t*)(patterns.data + (i+1)*patterns.stride);
                int32_t next_first_grapheme = get_grapheme(next_pat, 0);
                if (next_first_grapheme < 0) goto literal_pat;
                // `mirrored` starts out equal to the opening grapheme, so if
                // uc_mirror_char() does not write to it, the closing delimiter
                // has to be the same character as the opening one.
                int32_t mirrored = prev_last_grapheme;
                uc_mirror_char(prev_last_grapheme, (uint32_t*)&mirrored);
                if (next_first_grapheme != mirrored)
                    goto literal_pat;
                if ((uc_is_property_quotation_mark(prev_last_grapheme) && uc_is_property_quotation_mark(next_first_grapheme))
                    || ((uc_is_property_paired_punctuation(prev_last_grapheme)
                         && uc_is_property_paired_punctuation(next_first_grapheme)
                         && uc_is_property_left_of_pair(prev_last_grapheme)))) {
                    // $/"/, $/?/, $/"/
                    // $/(/, $/?/, $/)/
                    // Build the three-grapheme pattern <open>?<close> and match
                    // it starting one position before match_pos.
                    // NOTE(review): this presumably re-covers the opening
                    // delimiter already consumed by the previous pattern.
                    Pattern_t matching_pair_pat = text_from_u32((uint32_t[3]){prev_last_grapheme, '?', next_first_grapheme}, 3, false);
                    int64_t enclosing_len = match(text, matching_pair_pat, match_pos-1, 0);
                    if (enclosing_len < 0) goto no_match;
                    assert(enclosing_len >= 2);
                    lengths[i] = enclosing_len - 2; // Exclude '(' and ')' or whatever delims
                    goto found_match;
                }
            }
          literal_pat:;
            // Ordinary case: match this pattern at the current chain position;
            // a negative result means the whole chain fails at `pos`.
            lengths[i] = match(text, pattern, match_pos, 0);
            if (lengths[i] < 0)
                goto no_match;
          found_match:
            match_pos += lengths[i];
        }
        // If we skipped over some non-matching text before finding a match, insert it here:
        // (Text$slice is called with nonmatch_pos+1 .. pos, which covers the
        // 0-based range [nonmatch_pos, pos).)
        if (pos > nonmatch_pos) {
            Text_t before_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(pos));
            ret = concat2(ret, before_slice);
        }
        // Concatenate the slices/replacements
        // (replace_pos walks through the matched region so each replacement's
        // placeholder is filled with the text matched by its own pattern.)
        for (int64_t i = 0, replace_pos = pos; i < patterns.length; i++) {
            Text_t replacement = *(Text_t*)(replacements.data + i*replacements.stride);
            if (placeholder.length > 0) {
                Text_t matched_text = Text$slice(text, Int64_to_Int(replace_pos+1), Int64_to_Int(replace_pos + lengths[i]));
                replacement = Text$replace(replacement, placeholder, matched_text, Text(""));
            }
            ret = concat2(ret, replacement);
            replace_pos += lengths[i];
        }
        int64_t total_match_len = 0;
        for (int64_t i = 0; i < patterns.length; i++)
            total_match_len += lengths[i];
        // Always advance by at least one position so that a zero-length chain
        // match cannot stall the loop.
        pos += (total_match_len <= 0) ? 1 : total_match_len;
        nonmatch_pos = pos;
        continue;
      no_match:
        // The chain did not match here: try the next position and leave
        // nonmatch_pos alone so the skipped text is copied later.
        pos += 1;
        continue;
    }
    // Append whatever follows the last match (all of `text` if nothing matched).
    if (nonmatch_pos <= text.length) {
        Text_t last_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(text.length));
        ret = concat2(ret, last_slice);
    }
    return ret;
 }
 // Apply a table of pattern -> replacement pairs across `text` in one pass.
 // At each position the table entries are tried in order and the first pattern
 // that matches wins; scanning then resumes after the matched text, so the
 // inserted replacement is never itself re-scanned.
 //
 // Parameters:
 //   text         - the text to search.
 //   replacements - table whose entries hold a Pattern_t key followed by a
 //                  Text_t replacement value.
 //   placeholder  - if non-empty, occurrences of this pattern inside the
 //                  chosen replacement are replaced with the matched text.
 //
 // Returns the rewritten text, or `text` itself when the table is empty.
 //
 // NOTE(review): the prototype in builtins/text.h declares `placeholder` as
 // Pattern_t while this definition uses Text_t -- presumably the two names are
 // aliases; confirm and make the spellings agree.
 public Text_t Text$replace_all(Text_t text, table_t replacements, Text_t placeholder)
 {
    if (replacements.entries.length == 0) return text;
    Text_t ret = {.length=0};
    // 0-based offset just past the last emitted match; text from here up to
    // the next match still has to be copied into `ret`.
    int64_t nonmatch_pos = 0;
    for (int64_t pos = 0; pos < text.length; ) {
        // Find the first matching pattern at this position:
        for (int64_t i = 0; i < replacements.entries.length; i++) {
            // The key (pattern) is read from the start of entry i.
            Pattern_t pattern = *(Pattern_t*)(replacements.entries.data + i*replacements.entries.stride);
            int64_t len = match(text, pattern, pos, 0);
            if (len < 0) continue;
            // If we skipped over some non-matching text before finding a match, insert it here:
            if (pos > nonmatch_pos) {
                Text_t before_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(pos));
                ret = concat2(ret, before_slice);
            }
            // Concatenate the replacement:
            // The value is read sizeof(Text_t) bytes into the entry.
            // NOTE(review): this assumes the Pattern_t key occupies exactly
            // sizeof(Text_t) bytes with no padding -- confirm.
            Text_t replacement = *(Text_t*)(replacements.entries.data + i*replacements.entries.stride + sizeof(Text_t));
            if (placeholder.length > 0) {
                Text_t matched_text = Text$slice(text, Int64_to_Int(pos+1), Int64_to_Int(pos + len));
                replacement = Text$replace(replacement, placeholder, matched_text, Text(""));
            }
            ret = concat2(ret, replacement);
            // Advance by at least one position so a zero-length match cannot
            // stall the scan.
            pos += len > 0 ? len : 1;
            nonmatch_pos = pos;
            goto next_pos;
        }
        // No pattern matched at this position; move on and leave nonmatch_pos
        // alone so the skipped text is copied later.
        pos += 1;
      next_pos:
        continue;
    }
    // Append whatever follows the last match (all of `text` if nothing matched).
    if (nonmatch_pos <= text.length) {
        Text_t last_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(text.length));
        ret = concat2(ret, last_slice);
    }
    return ret;
 }
 public array_t Text$split(Text_t text, Pattern_t pattern)
 {
    if (text.length == 0) // special case
--- a/builtins/text.h
+++ b/builtins/text.h
@ -31,7 +31,9 @@ Text_t Text$lower(Text_t text);
 Text_t Text$title(Text_t text);
 Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info);
 Text_t Text$quoted(Text_t str, bool colorize);
-Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement);
+Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t placeholder);
 Text_t Text$replace_chain(Text_t text, array_t patterns, array_t replacements, Pattern_t placeholder);
 Text_t Text$replace_all(Text_t text, table_t replacements, Pattern_t placeholder);
 array_t Text$split(Text_t text, Pattern_t pattern);
 Int_t Text$find(Text_t text, Pattern_t pattern, Int_t i, int64_t *match_length);
 array_t Text$find_all(Text_t text, Pattern_t pattern);
--- a/docs/text.md
+++ b/docs/text.md
@ -271,11 +271,13 @@ Patterns are used in a small, but very powerful API that handles many text
 functions that would normally be handled by a more extensive API:
 ```
 Text.has(pattern:Pattern)->Bool
 Text.find(pattern:Pattern, start=1, length=!&Int64?)->Int
 Text.find_all(pattern:Pattern)->[Text]
 Text.split(pattern:Pattern)->[Text]
-Text.replace(pattern:Pattern, replacement:Text)->[Text]
+Text.replace(pattern:Pattern, replacement:Text, placeholder:Pattern=$//)->Text
-Text.has(pattern:Pattern)->Bool
+Text.replace_all(replacements:{Pattern:Text}, placeholder:Pattern=$//)->Text
 Text.replace_chain(patterns:[Pattern], replacements:[Text], placeholder:Pattern=$//)->Text
 ```
 See [Text Functions](#Text-Functions) for the full API documentation.
@ -838,7 +840,7 @@ See [Patterns](#patterns) for more information about patterns.
 **Usage:**  
 ```tomo
-replace(text: Text, pattern: Text, replacement: Text) -> Text
+replace(text: Text, pattern: Pattern, replacement: Text, placeholder: Pattern = $//) -> Text
 ```
 **Parameters:**
@ -846,6 +848,8 @@ replace(text: Text, pattern: Text, replacement: Text) -> Text
 - `text`: The text in which to perform replacements.
 - `pattern`: The pattern to be replaced.
 - `replacement`: The text to replace the pattern with.
 - `placeholder`: If non-empty, the replacement text will have occurrences of the placeholder
  pattern replaced with the matching text.
 **Returns:**  
 The text with occurrences of the pattern replaced.
@ -857,6 +861,96 @@ The text with occurrences of the pattern replaced.
 >> "Hello world":replace($/{id}/, "xxx")
 = "xxx xxx"
 >> "Hello world":replace($/{id}/, "(@)", placeholder=$/@/)
 = "(Hello) (world)"
 ```
 ---
 ## `replace_all`
 **Description:**  
 Takes a table mapping patterns to replacement texts and performs all the
 replacements in the table on the whole text. At each position, the first
 matching pattern's replacement is applied and the pattern matching moves on to
 *after* the replacement text, so replacement text is not recursively modified.
 See [Patterns](#patterns) for more information about patterns.
 **Usage:**  
 ```tomo
 replace_all(text: Text, replacements: {Pattern:Text}, placeholder: Pattern = $//) -> Text
 ```
 **Parameters:**
 - `text`: The text in which to perform replacements.
 - `replacements`: A table mapping from patterns to the replacement text
  associated with that pattern.
 - `placeholder`: If non-empty, the replacement text will have occurrences of
  the placeholder pattern replaced with the matching text.
 **Returns:**  
 The text with all occurrences of the patterns replaced with their corresponding
 replacement text.
 **Example:**  
 ```tomo
 >> "A <tag> & an ampersand":replace_all({
    $/&/: "&amp;",
    $/</: "&lt;",
    $/>/: "&gt;",
    $/"/: "&quot;",
    $/'/: "&#39;",
 })
 = "A &lt;tag&gt; &amp; an ampersand"
 >> "Hello":replace_all({$/{lower}/:"[@]", $/{upper}/:"{@}"}, placeholder=$/@/)
 = "{H}[ello]"
 ```
 ---
 ## `replace_chain`
 **Description:**  
 Takes an array of patterns and a corresponding array of replacement texts and
 if all patterns in the patterns array match _consecutively_, then the
 replacement texts are substituted for the corresponding matches. This is useful
 if you want to replace parts of a match with new text while leaving other parts
 unchanged.
 As a special case, if a pattern `$/?/` is sandwiched between corresponding
 matching quotes or matching braces, it will match the inside part of the nested
 or quoted pair.
 See [Patterns](#patterns) for more information about patterns.
 **Usage:**  
 ```tomo
 replace_chain(text: Text, patterns: [Pattern], replacements: [Text], placeholder: Pattern = $//) -> Text
 ```
 **Parameters:**
 - `text`: The text in which to perform replacements.
 - `patterns`: An array of patterns to be matched consecutively.
 - `replacements`: An array of replacement texts corresponding to each pattern.
  This must be the same length as `patterns` or an error will be raised.
 - `placeholder`: If non-empty, the replacement text will have occurrences of
  the placeholder pattern replaced with the matching text.
 **Returns:**  
 The text with all occurrences of the patterns replaced with their corresponding
 replacement text.
 **Example:**  
 ```tomo
 >> "  foo(blah(), 2)  ":replace_chain([$/foo(/, $/?/, $/)/], ["baz(", "@", ")"], placeholder=$/@/)
 = "  baz(blah(), 2)  "
 >> "  foo.field_name  ":replace_chain([$/{id}/, $/.field_name/], ["@", ".other_field"], placeholder=$/@/)
 = "  foo.other_field  "
 ```
 ---
--- a/environment.c
+++ b/environment.c
@ -248,7 +248,9 @@ env_t *new_compilation_unit(CORD *libname)
            {"lines", "Text$lines", "func(text:Text)->[Text]"},
            {"lower", "Text$lower", "func(text:Text)->Text"},
            {"quoted", "Text$quoted", "func(text:Text, color=no)->Text"},
-            {"replace", "Text$replace", "func(text:Text, pattern:Pattern, replacement:Text)->Text"},
+            {"replace", "Text$replace", "func(text:Text, pattern:Pattern, replacement:Text, placeholder=$//)->Text"},
            {"replace_chain", "Text$replace_chain", "func(text:Text, patterns:[Pattern], replacements:[Text], placeholder=$//)->Text"},
            {"replace_all", "Text$replace_all", "func(text:Text, replacements:{Pattern:Text}, placeholder=$//)->Text"},
            {"split", "Text$split", "func(text:Text, pattern=$Pattern'')->[Text]"},
            {"slice", "Text$slice", "func(text:Text, from=1, to=-1)->Text"},
            {"title", "Text$title", "func(text:Text)->Text"},
--- a/test/lang.tm
+++ b/test/lang.tm
@ -1,11 +1,14 @@
 lang HTML:
 	HEADER := $HTML"<!DOCTYPE HTML>"
 	func escape(t:Text)->HTML:
-		t = t:replace($/&/, "&amp;")
+		t = t:replace_all({
-		t = t:replace($/</, "&lt;")
+			$/&/: "&amp;",
-		t = t:replace($/>/, "&gt;")
+			$/</: "&lt;",
-		t = t:replace($/"/, "&quot;")
+			$/>/: "&gt;",
-		t = t:replace($/'/, "&#39;")
+			$/"/: "&quot;",
 			$/'/: "&#39;",
 		})
 		return HTML.from_unsafe_text(t)
 	func escape_int(i:Int)->HTML:
--- a/test/text.tm
+++ b/test/text.tm
@ -227,4 +227,12 @@ func main():
 	>> $/$malicious/
 	= $/{1{}xxx}/
 	>> "Hello":replace($/{lower}/, "(@)", $/@/)
 	= "H(ello)"
 	>> " foo(xyz) foo(yyy) foo(z()) ":replace_chain([$/foo(/, $/?/, $/)/], ["baz[", "@", "]"], $/@/)
 	= " baz[xyz] baz[yyy] baz[z()] "
 	>> "<tag>":replace_all({$/</:"&lt;", $/>/:"&gt;"})
 	= "&lt;tag&gt;"