Clean up text replacement API to use backrefs instead of match_chain()

This commit is contained in:
Bruce Hill 2024-09-03 22:56:53 -04:00
parent 79b48365d6
commit 850fc8fbe1
5 changed files with 183 additions and 204 deletions

View File

@ -39,6 +39,7 @@ typedef struct {
} synthetic_grapheme_t;
#define MAX_SYNTHETIC_GRAPHEMES 1024
#define MAX_BACKREFS 100
static synthetic_grapheme_t synthetic_graphemes[MAX_SYNTHETIC_GRAPHEMES] = {};
static int32_t num_synthetic_graphemes = 0;
@ -338,7 +339,7 @@ public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int)
num_subtexts += 1;
}
if (num_subtexts == 1)
return Text$slice(subtexts[0], Int64_to_Int(first), Int64_to_Int(last));
return Text$slice(subtexts[0], I(first), I(last));
Text_t ret = {
.length=needed_len,
@ -346,7 +347,7 @@ public Text_t Text$slice(Text_t text, Int_t first_int, Int_t last_int)
.subtexts=GC_MALLOC(sizeof(Text_t[num_subtexts])),
};
for (int64_t i = 0; i < num_subtexts; i++) {
ret.subtexts[i] = Text$slice(subtexts[i], Int64_to_Int(first), Int64_to_Int(last));
ret.subtexts[i] = Text$slice(subtexts[i], I(first), I(last));
first = 1;
needed_len -= ret.subtexts[i].length;
last = first + needed_len - 1;
@ -1048,7 +1049,11 @@ int64_t match_uri(Text_t text, int64_t text_index)
return text_index - start_index;
}
int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t pattern_index)
// A span of the searched text recorded while matching, used later to expand
// backreferences in replacement text.
typedef struct {
// `index` is the 0-based offset of the span's first position in the text
// (callers add 1 when converting to Text$slice()'s 1-based indices);
// `length` is how many text positions the span covers.
int64_t index, length;
} capture_t;
int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t pattern_index, capture_t *captures, int64_t capture_index)
{
if (pattern_index >= pattern.length) return 0;
int64_t start_index = text_index;
@ -1061,6 +1066,7 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
// Quotations: "?", '?', etc
int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2);
if (!match_grapheme(text, &text_index, open)) return -1;
int64_t start_of_quoted_text = text_index;
int32_t close = open;
uc_mirror_char(open, (uint32_t*)&close);
if (!match_grapheme(pattern, &pattern_index, close))
@ -1068,6 +1074,14 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
while (text_index < text.length) {
int32_t c = _next_grapheme(text, &text_state, text_index);
if (c == close) {
// Save this as a capture, including only the interior text:
if (captures && capture_index < MAX_BACKREFS) {
captures[capture_index++] = (capture_t){
start_of_quoted_text,
text_index - start_of_quoted_text,
};
}
++text_index;
goto next_part_of_pattern;
}
@ -1085,6 +1099,7 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
// Nested punctuation: (?), [?], etc
int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2);
if (!match_grapheme(text, &text_index, open)) return -1;
int64_t start_of_interior = text_index;
int32_t close = open;
uc_mirror_char(open, (uint32_t*)&close);
if (!match_grapheme(pattern, &pattern_index, close))
@ -1097,6 +1112,15 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
else if (c == close)
depth -= 1;
}
// Save this as a capture, including only the interior text:
if (captures && capture_index < MAX_BACKREFS) {
captures[capture_index++] = (capture_t){
start_of_interior,
text_index - start_of_interior - 1,
};
}
if (depth > 0) return -1;
} else if (EAT1(pattern, &pattern_state, pattern_index,
grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc.
@ -1144,6 +1168,13 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
int64_t before_group = text_index;
#define FAIL() ({ if (min < 1) { text_index = before_group; continue; } else { return -1; } })
#define SUCCESS() ({ \
if (captures && capture_index < MAX_BACKREFS) { \
captures[capture_index++] = (capture_t){ \
before_group, \
(text_index - before_group), \
}; \
}; continue; 0; })
if (prop_name) {
switch (tolower(prop_name[0])) {
case '.':
@ -1163,13 +1194,13 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
if (strcasecmp(prop_name, "end") == 0) {
if (text_index != text.length)
FAIL();
continue;
SUCCESS();
} else if (prop_name && strcasecmp(prop_name, "email") == 0) {
int64_t len = match_email(text, text_index);
if (len < 0)
FAIL();
text_index += len;
continue;
SUCCESS();
} else if (prop_name && strcasecmp(prop_name, "emoji") == 0) {
prop = UC_PROPERTY_EMOJI;
goto got_prop;
@ -1182,26 +1213,26 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
FAIL();
EAT_MANY(text, &text_state, text_index,
uc_is_property(grapheme, UC_PROPERTY_XID_CONTINUE));
continue;
SUCCESS();
} else if (prop_name && strcasecmp(prop_name, "int") == 0) {
EAT1(text, &text_state, text_index, grapheme == '-');
int64_t n = EAT_MANY(text, &text_state, text_index,
uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT));
if (n <= 0)
FAIL();
continue;
SUCCESS();
} else if (prop_name && strcasecmp(prop_name, "ipv4") == 0) {
int64_t len = match_ipv4(text, text_index);
if (len < 0)
FAIL();
text_index += len;
continue;
SUCCESS();
} else if (prop_name && strcasecmp(prop_name, "ipv6") == 0) {
int64_t len = match_ipv6(text, text_index);
if (len < 0)
FAIL();
text_index += len;
continue;
SUCCESS();
} else if (prop_name && strcasecmp(prop_name, "ip") == 0) {
int64_t len = match_ipv6(text, text_index);
if (len < 0)
@ -1209,7 +1240,7 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
if (len < 0)
FAIL();
text_index += len;
continue;
SUCCESS();
}
break;
case 'n':
@ -1222,13 +1253,13 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0;
if (pre_decimal == 0 && post_decimal == 0)
FAIL();
continue;
SUCCESS();
}
break;
case 's':
if (strcasecmp(prop_name, "start") == 0) {
if (text_index != 0) return -1;
continue;
SUCCESS();
}
break;
case 'u':
@ -1237,7 +1268,7 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
if (len < 0)
FAIL();
text_index += len;
continue;
SUCCESS();
} else if (prop_name && strcasecmp(prop_name, "url") == 0) {
int64_t lookahead = text_index;
if (!(match_str(text, &lookahead, "https:")
@ -1251,7 +1282,7 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
if (len < 0)
FAIL();
text_index += len;
continue;
SUCCESS();
}
break;
}
@ -1267,9 +1298,17 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
if (min == 0 && pattern_index < pattern.length) {
// Try matching the rest of the pattern immediately:
int64_t match_len = match(text, pattern, text_index, pattern_index);
if (match_len >= 0)
int64_t match_len = match(text, pattern, text_index, pattern_index, captures, capture_index + 1);
if (match_len >= 0) {
// Save this as a capture, including only the interior text:
if (captures && capture_index < MAX_BACKREFS) {
captures[capture_index++] = (capture_t){
before_group,
(text_index - before_group) + match_len,
};
}
return (text_index - start_index) + match_len;
}
}
for (int64_t count = 0; count < max; ) {
@ -1296,8 +1335,16 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
if (count >= min) {
if (pattern_index < pattern.length) {
// If we have the minimum and we match the rest of the pattern, we're good:
int64_t match_len = match(text, pattern, text_index, pattern_index);
int64_t match_len = match(text, pattern, text_index, pattern_index, captures, capture_index + 1);
if (match_len >= 0) {
// Save this as a capture, including only the interior text:
if (captures && capture_index < MAX_BACKREFS) {
captures[capture_index++] = (capture_t){
before_group,
(text_index - before_group) + match_len,
};
}
return (text_index - start_index) + match_len;
}
} else if (text_index >= text.length) {
@ -1350,7 +1397,7 @@ public Int_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index, int64_t
++i;
}
int64_t m = match(text, pattern, i, 0);
int64_t m = match(text, pattern, i, 0, NULL, 0);
if (m >= 0) {
if (match_length)
*match_length = m;
@ -1470,156 +1517,112 @@ public array_t Text$find_all(Text_t text, Pattern_t pattern)
int64_t len;
Int_t found = Text$find(text, pattern, i, &len);
if (I_is_zero(found)) break;
Text_t match = Text$slice(text, found, Int$plus(found, Int64_to_Int(len-1)));
Text_t match = Text$slice(text, found, Int$plus(found, I(len-1)));
Array$insert(&matches, &match, I_small(0), sizeof(Text_t));
i = Int$plus(found, Int64_to_Int(len <= 0 ? 1 : len));
i = Int$plus(found, I(len <= 0 ? 1 : len));
}
return matches;
}
public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t placeholder)
// Expand backreferences inside `replacement`.
//
// Every place in `replacement` where `backref_pat` matches and is immediately
// followed by an integer N is replaced by the span of `text` recorded in
// `captures[N]`. A single ';' directly after the number is consumed as well,
// so that a literal digit can follow a backreference (e.g. `\1;2`).
//
// Parameters:
//   text        - the text that was searched; capture spans index into it.
//   replacement - the replacement template to expand.
//   backref_pat - the pattern that introduces a backreference. If it is
//                 empty, `replacement` is returned unchanged.
//   captures    - array of MAX_BACKREFS capture spans (0-based index + length).
//
// Returns the expanded replacement text. Calls fail() if a backreference
// number is outside 0..MAX_BACKREFS-1.
static Text_t apply_backrefs(Text_t text, Text_t replacement, Pattern_t backref_pat, capture_t *captures)
{
    if (backref_pat.length == 0)
        return replacement;

    int32_t first_grapheme = get_grapheme(backref_pat, 0);
    bool find_first = (first_grapheme != '{'
                       && !uc_is_property(first_grapheme, UC_PROPERTY_QUOTATION_MARK)
                       && !uc_is_property(first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));

    Text_t ret = Text("");
    iteration_state_t state = {0, 0};
    // Start (0-based) of the replacement text that has not been copied to `ret` yet:
    int64_t nonmatching_pos = 0;
    for (int64_t pos = 0; pos < replacement.length; ) {
        // Optimization: quickly skip ahead to first char in the backref pattern:
        if (find_first) {
            while (pos < replacement.length && _next_grapheme(replacement, &state, pos) != first_grapheme)
                ++pos;
        }

        int64_t backref_len = match(replacement, backref_pat, pos, 0, NULL, 0);
        if (backref_len < 0) {
            pos += 1;
            continue;
        }

        int64_t after_backref = pos + backref_len;
        int64_t backref = parse_int(replacement, &after_backref);
        if (after_backref == pos + backref_len) { // Not actually a backref if there's no number
            pos += 1;
            continue;
        }
        // The valid range is the size of the captures array, which is also
        // what the error message reports:
        if (backref < 0 || backref >= MAX_BACKREFS)
            fail("Invalid backref index: %ld (only 0-%d are allowed)", backref, MAX_BACKREFS-1);
        backref_len = (after_backref - pos);

        // Only look for the optional semicolon if there is text left to inspect:
        if (pos + backref_len < replacement.length
            && _next_grapheme(replacement, &state, pos + backref_len) == ';')
            backref_len += 1; // skip optional semicolon

        // Capture indices are 0-based, Text$slice() takes 1-based inclusive bounds:
        Text_t backref_text = Text$slice(text, I(captures[backref].index+1), I(captures[backref].index + captures[backref].length));

        if (pos > nonmatching_pos) {
            // Copy the literal text that precedes this backreference:
            Text_t before_slice = Text$slice(replacement, I(nonmatching_pos+1), I(pos));
            ret = Text$concat(ret, before_slice, backref_text);
        } else {
            ret = concat2(ret, backref_text);
        }

        pos += backref_len;
        nonmatching_pos = pos;
    }
    // Copy whatever literal text follows the last backreference:
    if (nonmatching_pos < replacement.length) {
        Text_t last_slice = Text$slice(replacement, I(nonmatching_pos+1), I(replacement.length));
        ret = concat2(ret, last_slice);
    }
    return ret;
}
public Text_t Text$replace_chain(Text_t text, array_t patterns, array_t replacements, Text_t placeholder)
// Replace every match of `pattern` in `text` with `replacement`.
//
// The text is scanned left to right. At each position where `pattern`
// matches, the match is replaced and scanning resumes after the match, so
// matches never overlap and replacement text is not rescanned.
//
// Parameters:
//   text        - the text to search.
//   pattern     - the pattern to look for.
//   replacement - the replacement template.
//   backref_pat - the pattern that introduces backreferences in `replacement`
//                 (see apply_backrefs()). Capture 0 is the whole match;
//                 captures 1 and up are filled in by match().
//
// Returns the text with all matches replaced.
public Text_t Text$replace(Text_t text, Pattern_t pattern, Text_t replacement, Pattern_t backref_pat)
{
    Text_t ret = {.length=0};

    int32_t first_grapheme = get_grapheme(pattern, 0);
    bool find_first = (first_grapheme != '{'
                       && !uc_is_property(first_grapheme, UC_PROPERTY_QUOTATION_MARK)
                       && !uc_is_property(first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));

    iteration_state_t text_state = {0, 0};
    // Start (0-based) of the text that has not been copied to `ret` yet:
    int64_t nonmatching_pos = 0;
    for (int64_t pos = 0; pos < text.length; ) {
        // Optimization: quickly skip ahead to first char in pattern:
        if (find_first) {
            while (pos < text.length && _next_grapheme(text, &text_state, pos) != first_grapheme)
                ++pos;
        }

        capture_t captures[MAX_BACKREFS] = {};
        int64_t match_len = match(text, pattern, pos, 0, captures, 1);
        if (match_len < 0) {
            pos += 1;
            continue;
        }
        // Backreference 0 always refers to the entire match:
        captures[0].index = pos;
        captures[0].length = match_len;

        Text_t replacement_text = apply_backrefs(text, replacement, backref_pat, captures);
        if (pos > nonmatching_pos) {
            // Copy the unmatched text that precedes this match:
            Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos));
            ret = Text$concat(ret, before_slice, replacement_text);
        } else {
            ret = concat2(ret, replacement_text);
        }
        nonmatching_pos = pos + match_len;
        // Always make progress: a zero-length match (e.g. from `{start}`)
        // must still advance by one position, otherwise the same position
        // would be matched again forever.
        pos += (match_len > 0) ? match_len : 1;
    }
    // Copy whatever unmatched text follows the last match:
    if (nonmatching_pos < text.length) {
        Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length));
        ret = concat2(ret, last_slice);
    }
    return ret;
}
public Text_t Text$replace_all(Text_t text, table_t replacements, Text_t placeholder)
public Text_t Text$replace_all(Text_t text, table_t replacements, Text_t backref_pat)
{
if (replacements.entries.length == 0) return text;
@ -1630,22 +1633,22 @@ public Text_t Text$replace_all(Text_t text, table_t replacements, Text_t placeho
// Find the first matching pattern at this position:
for (int64_t i = 0; i < replacements.entries.length; i++) {
Pattern_t pattern = *(Pattern_t*)(replacements.entries.data + i*replacements.entries.stride);
int64_t len = match(text, pattern, pos, 0);
capture_t captures[MAX_BACKREFS] = {};
int64_t len = match(text, pattern, pos, 0, captures, 1);
if (len < 0) continue;
captures[0].index = pos;
captures[0].length = len;
// If we skipped over some non-matching text before finding a match, insert it here:
if (pos > nonmatch_pos) {
Text_t before_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(pos));
Text_t before_slice = Text$slice(text, I(nonmatch_pos+1), I(pos));
ret = concat2(ret, before_slice);
}
// Concatenate the replacement:
Text_t replacement = *(Text_t*)(replacements.entries.data + i*replacements.entries.stride + sizeof(Text_t));
if (placeholder.length > 0) {
Text_t matched_text = Text$slice(text, Int64_to_Int(pos+1), Int64_to_Int(pos + len));
replacement = Text$replace(replacement, placeholder, matched_text, Text(""));
}
ret = concat2(ret, replacement);
Text_t replacement_text = apply_backrefs(text, replacement, backref_pat, captures);
ret = concat2(ret, replacement_text);
pos += len > 0 ? len : 1;
nonmatch_pos = pos;
goto next_pos;
@ -1657,7 +1660,7 @@ public Text_t Text$replace_all(Text_t text, table_t replacements, Text_t placeho
}
if (nonmatch_pos <= text.length) {
Text_t last_slice = Text$slice(text, Int64_to_Int(nonmatch_pos+1), Int64_to_Int(text.length));
Text_t last_slice = Text$slice(text, I(nonmatch_pos+1), I(text.length));
ret = concat2(ret, last_slice);
}
return ret;
@ -1680,10 +1683,10 @@ public array_t Text$split(Text_t text, Pattern_t pattern)
if (I_is_zero(found)) break;
Text_t chunk = Text$slice(text, i, Int$minus(found, I_small(1)));
Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t));
i = Int$plus(found, Int64_to_Int(len <= 0 ? 1 : len));
i = Int$plus(found, I(len <= 0 ? 1 : len));
}
Text_t last_chunk = Text$slice(text, i, Int64_to_Int(text.length));
Text_t last_chunk = Text$slice(text, i, I(text.length));
Array$insert(&chunks, &last_chunk, I_small(0), sizeof(Text_t));
return chunks;
@ -1728,7 +1731,7 @@ public array_t Text$clusters(Text_t text)
{
array_t clusters = {.atomic=1};
for (int64_t i = 1; i <= text.length; i++) {
Text_t cluster = Text$slice(text, Int64_to_Int(i), Int64_to_Int(i));
Text_t cluster = Text$slice(text, I(i), I(i));
Array$insert(&clusters, &cluster, I_small(0), sizeof(Text_t));
}
return clusters;
@ -1826,16 +1829,16 @@ public array_t Text$lines(Text_t text)
for (int64_t i = 0, line_start = 0; i < text.length; i++) {
int32_t grapheme = _next_grapheme(text, &state, i);
if (grapheme == '\r' && _next_grapheme(text, &state, i + 1) == '\n') { // CRLF
Text_t line = Text$slice(text, Int64_to_Int(line_start+1), Int64_to_Int(i));
Text_t line = Text$slice(text, I(line_start+1), I(i));
Array$insert(&lines, &line, I_small(0), sizeof(Text_t));
i += 1; // skip one extra for CR
line_start = i + 1;
} else if (grapheme == '\n') { // newline
Text_t line = Text$slice(text, Int64_to_Int(line_start+1), Int64_to_Int(i));
Text_t line = Text$slice(text, I(line_start+1), I(i));
Array$insert(&lines, &line, I_small(0), sizeof(Text_t));
line_start = i + 1;
} else if (i == text.length-1 && line_start != i) { // last line
Text_t line = Text$slice(text, Int64_to_Int(line_start+1), Int64_to_Int(i+1));
Text_t line = Text$slice(text, I(line_start+1), I(i+1));
Array$insert(&lines, &line, I_small(0), sizeof(Text_t));
}
}

View File

@ -31,9 +31,8 @@ Text_t Text$lower(Text_t text);
Text_t Text$title(Text_t text);
Text_t Text$as_text(const void *text, bool colorize, const TypeInfo *info);
Text_t Text$quoted(Text_t str, bool colorize);
Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t placeholder);
Text_t Text$replace_chain(Text_t text, array_t patterns, array_t replacements, Pattern_t placeholder);
Text_t Text$replace_all(Text_t text, table_t replacements, Pattern_t placeholder);
Text_t Text$replace(Text_t str, Pattern_t pat, Text_t replacement, Pattern_t backref_pat);
Text_t Text$replace_all(Text_t text, table_t replacements, Pattern_t backref_pat);
array_t Text$split(Text_t text, Pattern_t pattern);
Int_t Text$find(Text_t text, Pattern_t pattern, Int_t i, int64_t *match_length);
array_t Text$find_all(Text_t text, Pattern_t pattern);

View File

@ -277,7 +277,6 @@ Text.find_all(pattern:Pattern)->[Text]
Text.split(pattern:Pattern)->[Text]
Text.replace(pattern:Pattern, replacement:Text, backref:Pattern=$/\/)->Text
Text.replace_all(replacements:{Pattern:Text}, backref:Pattern=$/\/)->Text
Text.replace_chain(patterns:[Pattern], replacements:[Text], placeholder:Pattern=$//)->[Text]
```
See [Text Functions](#Text-Functions) for the full API documentation.
@ -836,11 +835,12 @@ The text formatted as a quoted string.
**Description:**
Replaces occurrences of a pattern in the text with a replacement string.
See [Patterns](#patterns) for more information about patterns.
**Usage:**
```tomo
replace(text: Text, pattern: Text, replacement: Text, placeholder: Pattern = $//) -> Text
replace(text: Text, pattern: Text, replacement: Text, backref: Pattern = $/\/) -> Text
```
**Parameters:**
@ -848,8 +848,22 @@ replace(text: Text, pattern: Text, replacement: Text, placeholder: Pattern = $//
- `text`: The text in which to perform replacements.
- `pattern`: The pattern to be replaced.
- `replacement`: The text to replace the pattern with.
- `placeholder`: If non-empty, the replacement text will have occurrences of the placeholder
pattern replaced with the matching text.
- `backref`: If non-empty, the replacement text will have occurrences of this
pattern followed by a number replaced with the corresponding backreference.
By default, the backreference pattern is a single backslash, so
backreferences look like `\0`, `\1`, etc.
**Backreferences**
If a backreference pattern is in the replacement, then that backreference is
replaced with the corresponding group from the matching text. Backreference
`0` is the entire matching text, backreference `1` is the first matched group,
and so on. Literal text is not captured for backreferences, only named group
captures (`{foo}`), quoted captures (`"?"`), and nested group captures (`(?)`).
For quoted and nested group captures, the backreference refers to the *inside*
of the capture without the enclosing punctuation.
If you need to insert a digit immediately after a backreference, you can end
the backreference with an optional semicolon: `\1;2` (backreference 1, followed
by the literal text `"2"`).
**Returns:**
The text with occurrences of the pattern replaced.
@ -862,8 +876,14 @@ The text with occurrences of the pattern replaced.
>> "Hello world":replace("{id}", "xxx")
= "xxx xxx"
>> "Hello world":replace("{id}", "(@)", placeholder=$/@/)
>> "Hello world":replace("{id}", "(\0)")
= "(Hello) (world)"
>> "Hello world":replace("{id} {id}", "just \2")
= "just world"
>> "some (parenthesized (with inner parens) text) here":replace("(?)", "non-\1")
= "some non-parenthesized (with inner parens) text here"
```
---
@ -875,11 +895,11 @@ Takes a table mapping patterns to replacement texts and performs all the
replacements in the table on the whole text. At each position, the first
matching pattern's replacement is applied and the pattern matching moves on to
*after* the replacement text, so replacement text is not recursively modified.
See [Patterns](#patterns) for more information about patterns.
See [`replace()`](#replace) for more information about replacement behavior.
**Usage:**
```tomo
replace_all(replacements:{Pattern:Text}, placeholder: Pattern = $//) -> Text
replace_all(replacements:{Pattern:Text}, backref: Pattern = $/\/) -> Text
```
**Parameters:**
@ -887,8 +907,10 @@ replace_all(replacements:{Pattern:Text}, placeholder: Pattern = $//) -> Text
- `text`: The text in which to perform replacements.
- `replacements`: A table mapping from patterns to the replacement text
associated with that pattern.
- `placeholder`: If non-empty, the replacement text will have occurrences of
the placeholder pattern replaced with the matching text.
- `backref`: If non-empty, the replacement text will have occurrences of this
pattern followed by a number replaced with the corresponding backreference.
By default, the backreference pattern is a single backslash, so
backreferences look like `\0`, `\1`, etc.
**Returns:**
The text with all occurrences of the patterns replaced with their corresponding
@ -905,56 +927,12 @@ replacement text.
}
= "A &lt;tag&gt; &amp; an ampersand"
>> "Hello":replace_all({$/{lower}/:"[@]", $/{upper}/:"{@}"}, placeholder=$/@/)
>> "Hello":replace_all({$/{lower}/:"[\0]", $/{upper}/:"{\0}"})
= "{H}[ello]"
```
---
## `replace_chain`
**Description:**
Takes an array of patterns and a corresponding array of replacement texts and
if all patterns in the patterns array match _consecutively_, then the
replacement texts are substituted for the corresponding matches. This is useful
if you want to replace parts of a match with new text while leaving other parts
unchanged.
As as special case, if a pattern `$/?/` is sandwiched between corresponding
matching quotes or matching braces, it will match the inside part of the nested
or quoted pair.
See [Patterns](#patterns) for more information about patterns.
**Usage:**
```tomo
replace_chain(patterns:[Pattern], replacements:[Text], placeholder: Pattern = $//) -> Text
```
**Parameters:**
- `text`: The text in which to perform replacements.
- `patterns`: An array of patterns to be matched consecutively.
- `replacements`: An array of replacement texts corresponding to each pattern.
This must be the same length as `patterns` or an error will be raised.
- `placeholder`: If non-empty, the replacement text will have occurrences of
the placeholder pattern replaced with the matching text.
**Returns:**
The text with all occurrences of the patterns replaced with their corresponding
replacement text.
**Example:**
```tomo
>> " foo(blah(), 2) ":replace_chain([$/foo(/, $/?/, $/)/], ["baz(", "@", ")"], placeholder=$/@/)
= " baz(blah(), 2) "
>> " foo.field_name ":replace_chain([$/{id}/, $/.field_name/], ["@", ".other_field"], placeholder=$/@/)
= " foo.other_field "
```
---
## `split`
**Description:**

View File

@ -248,9 +248,8 @@ env_t *new_compilation_unit(CORD *libname)
{"lines", "Text$lines", "func(text:Text)->[Text]"},
{"lower", "Text$lower", "func(text:Text)->Text"},
{"quoted", "Text$quoted", "func(text:Text, color=no)->Text"},
{"replace", "Text$replace", "func(text:Text, pattern:Pattern, replacement:Text, placeholder=$//)->Text"},
{"replace_chain", "Text$replace_chain", "func(text:Text, patterns:[Pattern], replacements:[Text], placeholder=$//)->Text"},
{"replace_all", "Text$replace_all", "func(text:Text, replacements:{Pattern:Text}, placeholder=$//)->Text"},
{"replace", "Text$replace", "func(text:Text, pattern:Pattern, replacement:Text, placeholder=$/\\/)->Text"},
{"replace_all", "Text$replace_all", "func(text:Text, replacements:{Pattern:Text}, placeholder=$/\\/)->Text"},
{"split", "Text$split", "func(text:Text, pattern=$Pattern'')->[Text]"},
{"slice", "Text$slice", "func(text:Text, from=1, to=-1)->Text"},
{"title", "Text$title", "func(text:Text)->Text"},

View File

@ -227,11 +227,11 @@ func main():
>> $/$malicious/
= $/{1{}xxx}/
>> "Hello":replace($/{lower}/, "(@)", $/@/)
>> "Hello":replace($/{lower}/, "(\0)")
= "H(ello)"
>> " foo(xyz) foo(yyy) foo(z()) ":replace_chain([$/foo(/, $/?/, $/)/], ["baz[", "@", "]"], $/@/)
= " baz[xyz] baz[yyy] baz[z()] "
>> " foo(xyz) foo(yyy) foo(z()) ":replace($/foo(?)/, "baz(\1)")
= " baz(xyz) baz(yyy) baz(z()) "
>> "<tag>":replace_all({$/</:"&lt;", $/>/:"&gt;"})
= "&lt;tag&gt;"