aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--builtins/text.c268
-rw-r--r--docs/text.md73
-rw-r--r--test/text.tm47
3 files changed, 203 insertions, 185 deletions
diff --git a/builtins/text.c b/builtins/text.c
index d9da1248..d23f2dc3 100644
--- a/builtins/text.c
+++ b/builtins/text.c
@@ -814,28 +814,39 @@ const char *get_property_name(Text_t text, int64_t *i)
++dest;
if (dest >= name + UNINAME_MAX - 1)
break;
- } else if (dest == name && grapheme >= 0 && grapheme != ']') {
- // Literal character escape: [..[] --> "LEFT SQUARE BRACKET"
- name = unicode_character_name(grapheme, name);
- *i += 1;
- return name;
} else {
break;
}
*i += 1;
}
+
+ while (dest > name && dest[-1] == ' ')
+ *(dest--) = '\0';
+
if (dest == name) return NULL;
*dest = '\0';
return name;
}
-#define EAT1(state, cond) ({\
- int32_t grapheme = _next_grapheme(text, state, text_index); \
+#define EAT1(text, state, index, cond) ({\
+ int32_t grapheme = _next_grapheme(text, state, index); \
bool success = (cond); \
- if (success) text_index += 1; \
+ if (success) index += 1; \
+ success; })
+
+#define EAT2(text, state, index, cond1, cond2) ({\
+ int32_t grapheme = _next_grapheme(text, state, index); \
+ bool success = (cond1); \
+ if (success) { \
+ grapheme = _next_grapheme(text, state, index + 1); \
+ success = (cond2); \
+ if (success) \
+ index += 2; \
+ } \
success; })
-#define EAT_MANY(state, cond) ({ int64_t _n = 0; while (EAT1(state, cond)) { _n += 1; } _n; })
+
+#define EAT_MANY(text, state, index, cond) ({ int64_t _n = 0; while (EAT1(text, state, index, cond)) { _n += 1; } _n; })
int64_t match_email(Text_t text, int64_t text_index)
{
@@ -858,19 +869,21 @@ int64_t match_email(Text_t text, int64_t text_index)
// Local part:
int64_t local_len = 0;
static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~";
- while (EAT1(&state, (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) {
+ while (EAT1(text, &state, text_index,
+ (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) {
local_len += 1;
if (local_len > 64) return -1;
}
- if (!EAT1(&state, grapheme == '@'))
+ if (!EAT1(text, &state, text_index, grapheme == '@'))
return -1;
// Host
int64_t host_len = 0;
do {
int64_t label_len = 0;
- while (EAT1(&state, (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) {
+ while (EAT1(text, &state, text_index,
+ (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) {
label_len += 1;
if (label_len > 63) return -1;
}
@@ -882,7 +895,7 @@ int64_t match_email(Text_t text, int64_t text_index)
if (host_len > 255)
return -1;
host_len += 1;
- } while (EAT1(&state, grapheme == '.'));
+ } while (EAT1(text, &state, text_index, grapheme == '.'));
return text_index - start_index;
}
@@ -900,21 +913,21 @@ int64_t match_ipv6(Text_t text, int64_t text_index)
bool double_colon_used = false;
for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
for (int digits = 0; digits < 4; digits++) {
- if (!EAT1(&state, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
+ if (!EAT1(text, &state, text_index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
break;
}
- if (EAT1(&state, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
+ if (EAT1(text, &state, text_index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
return -1; // Too many digits
if (cluster == NUM_CLUSTERS-1) {
break;
- } else if (!EAT1(&state, grapheme == ':')) {
+ } else if (!EAT1(text, &state, text_index, grapheme == ':')) {
if (double_colon_used)
break;
return -1;
}
- if (EAT1(&state, grapheme == ':')) {
+ if (EAT1(text, &state, text_index, grapheme == ':')) {
if (double_colon_used)
return -1;
double_colon_used = true;
@@ -936,18 +949,18 @@ static int64_t match_ipv4(Text_t text, int64_t text_index)
const int NUM_CLUSTERS = 4;
for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
for (int digits = 0; digits < 3; digits++) {
- if (!EAT1(&state, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) {
+ if (!EAT1(text, &state, text_index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) {
if (digits == 0) return -1;
break;
}
}
- if (EAT1(&state, ~(grapheme & ~0x7F) && isdigit((char)grapheme)))
+ if (EAT1(text, &state, text_index, ~(grapheme & ~0x7F) && isdigit((char)grapheme)))
return -1; // Too many digits
if (cluster == NUM_CLUSTERS-1)
break;
- else if (!EAT1(&state, grapheme == '.'))
+ else if (!EAT1(text, &state, text_index, grapheme == '.'))
return -1;
}
return (text_index - start_index);
@@ -971,10 +984,11 @@ int64_t match_uri(Text_t text, int64_t text_index)
int64_t start_index = text_index;
// Scheme:
- if (!EAT1(&state, isalpha(grapheme)))
+ if (!EAT1(text, &state, text_index, isalpha(grapheme)))
return -1;
- EAT_MANY(&state, !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-'));
+ EAT_MANY(text, &state, text_index,
+ !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-'));
if (text_index == start_index)
return -1;
@@ -987,12 +1001,12 @@ int64_t match_uri(Text_t text, int64_t text_index)
int64_t authority_start = text_index;
// Username or host:
static const char *forbidden = "#?:@ \t\r\n<>[]{}\\^|\"`/";
- if (EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
+ if (EAT_MANY(text, &state, text_index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
return -1;
- if (EAT1(&state, grapheme == '@')) {
+ if (EAT1(text, &state, text_index, grapheme == '@')) {
// Found a username, now get a host:
- if (EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
+ if (EAT_MANY(text, &state, text_index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
return -1;
} else {
int64_t ip = authority_start;
@@ -1007,29 +1021,29 @@ int64_t match_uri(Text_t text, int64_t text_index)
}
// Port:
- if (EAT1(&state, grapheme == ':')) {
- if (EAT_MANY(&state, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0)
+ if (EAT1(text, &state, text_index, grapheme == ':')) {
+ if (EAT_MANY(text, &state, text_index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0)
return -1;
}
- if (!EAT1(&state, grapheme == '/'))
+ if (!EAT1(text, &state, text_index, grapheme == '/'))
return (text_index - start_index); // No path
} else {
// Optional path root:
- EAT1(&state, grapheme == '/');
+ EAT1(text, &state, text_index, grapheme == '/');
}
// Path:
static const char *non_path = " \"#?<>[]{}\\^`|";
- EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme));
+ EAT_MANY(text, &state, text_index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme));
- if (EAT1(&state, grapheme == '?')) { // Query
+ if (EAT1(text, &state, text_index, grapheme == '?')) { // Query
static const char *non_query = " \"#<>[]{}\\^`|";
- EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme));
+ EAT_MANY(text, &state, text_index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme));
}
- if (EAT1(&state, grapheme == '#')) { // Fragment
+ if (EAT1(text, &state, text_index, grapheme == '#')) { // Fragment
static const char *non_fragment = " \"#<>[]{}\\^`|";
- EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme));
+ EAT_MANY(text, &state, text_index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme));
}
return text_index - start_index;
}
@@ -1041,7 +1055,49 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
iteration_state_t pattern_state = {0, 0}, text_state = {0, 0};
while (pattern_index < pattern.length) {
int64_t old_pat_index = pattern_index;
- if (match_str(pattern, &pattern_index, "[..")) {
+ if (EAT2(pattern, &pattern_state, pattern_index,
+ uc_is_property(grapheme, UC_PROPERTY_QUOTATION_MARK),
+ grapheme == '?')) {
+ // Quotations: "?", '?', etc
+ int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2);
+ if (!match_grapheme(text, &text_index, open)) return -1;
+ int32_t close = open;
+ uc_mirror_char(open, (uint32_t*)&close);
+ if (!match_grapheme(pattern, &pattern_index, close))
+ fail("Pattern's closing brace is missing: %k", &pattern);
+ while (text_index < text.length) {
+ int32_t c = _next_grapheme(text, &text_state, text_index);
+ if (c == close)
+ return (text_index - start_index);
+
+ if (c == '\\' && text_index < text.length) {
+ text_index += 2;
+ } else {
+ text_index += 1;
+ }
+ }
+ return -1;
+ } else if (EAT2(pattern, &pattern_state, pattern_index,
+ uc_is_property(grapheme, UC_PROPERTY_PAIRED_PUNCTUATION),
+ grapheme == '?')) {
+ // Nested punctuation: (?), [?], etc
+ int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2);
+ if (!match_grapheme(text, &text_index, open)) return -1;
+ int32_t close = open;
+ uc_mirror_char(open, (uint32_t*)&close);
+ if (!match_grapheme(pattern, &pattern_index, close))
+ fail("Pattern's closing brace is missing: %k", &pattern);
+ int64_t depth = 1;
+ for (; depth > 0 && text_index < text.length; ++text_index) {
+ int32_t c = _next_grapheme(text, &text_state, text_index);
+ if (c == open)
+ depth += 1;
+ else if (c == close)
+ depth -= 1;
+ }
+ if (depth > 0) return -1;
+ } else if (EAT1(pattern, &pattern_state, pattern_index,
+ grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc.
skip_whitespace(pattern, &pattern_index);
int64_t min, max;
if (uc_is_digit(_next_grapheme(pattern, &pattern_state, pattern_index))) {
@@ -1059,21 +1115,42 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
}
skip_whitespace(pattern, &pattern_index);
- bool want_to_match = !match_grapheme(pattern, &pattern_index, '!');
- const char *prop_name = get_property_name(pattern, &pattern_index);
- skip_whitespace(pattern, &pattern_index);
- if (!match_grapheme(pattern, &pattern_index, ']'))
- fail("Missing closing ']' in pattern: %k", &pattern);
-
- int64_t before_group = text_index;
bool any = false;
uc_property_t prop;
int32_t specific_codepoint = UNINAME_INVALID;
+ bool want_to_match = !match_grapheme(pattern, &pattern_index, '!');
+ const char *prop_name;
+ if (match_str(pattern, &pattern_index, ".."))
+ prop_name = "..";
+ else
+ prop_name = get_property_name(pattern, &pattern_index);
+
+ if (!prop_name) {
+ // Literal character, e.g. {1?}
+ specific_codepoint = _next_grapheme(pattern, &pattern_state, pattern_index);
+ pattern_index += 1;
+ } else if (strlen(prop_name) == 1) {
+ // Single letter names: {1+ A}
+ specific_codepoint = prop_name[0];
+ prop_name = NULL;
+ }
+ skip_whitespace(pattern, &pattern_index);
+ if (!match_grapheme(pattern, &pattern_index, '}'))
+ fail("Missing closing '}' in pattern: %k", &pattern);
+
+ int64_t before_group = text_index;
#define FAIL() ({ if (min < 1) { text_index = before_group; continue; } else { return -1; } })
if (prop_name) {
switch (tolower(prop_name[0])) {
+ case '.':
+ if (prop_name[1] == '.') {
+ any = true;
+ prop = UC_PROPERTY_PRIVATE_USE;
+ break;
+ }
+ break;
case 'd':
if (strcasecmp(prop_name, "digit") == 0) {
prop = UC_PROPERTY_DECIMAL_DIGIT;
@@ -1098,13 +1175,16 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
break;
case 'i':
if (prop_name && strcasecmp(prop_name, "id") == 0) {
- if (!EAT1(&text_state, uc_is_property(grapheme, UC_PROPERTY_XID_START)))
+ if (!EAT1(text, &text_state, text_index,
+ uc_is_property(grapheme, UC_PROPERTY_XID_START)))
FAIL();
- EAT_MANY(&text_state, uc_is_property(grapheme, UC_PROPERTY_XID_CONTINUE));
+ EAT_MANY(text, &text_state, text_index,
+ uc_is_property(grapheme, UC_PROPERTY_XID_CONTINUE));
continue;
} else if (prop_name && strcasecmp(prop_name, "int") == 0) {
- EAT1(&text_state, grapheme == '-');
- int64_t n = EAT_MANY(&text_state, uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT));
+ EAT1(text, &text_state, text_index, grapheme == '-');
+ int64_t n = EAT_MANY(text, &text_state, text_index,
+ uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT));
if (n <= 0)
FAIL();
continue;
@@ -1132,10 +1212,12 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
break;
case 'n':
if (prop_name && strcasecmp(prop_name, "num") == 0) {
- EAT1(&text_state, grapheme == '-');
- int64_t pre_decimal = EAT_MANY(&text_state, uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT));
- bool decimal = (EAT1(&text_state, grapheme == '.') == 1);
- int64_t post_decimal = decimal ? EAT_MANY(&text_state, uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0;
+ EAT1(text, &text_state, text_index, grapheme == '-');
+ int64_t pre_decimal = EAT_MANY(text, &text_state, text_index,
+ uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT));
+ bool decimal = (EAT1(text, &text_state, text_index, grapheme == '.') == 1);
+ int64_t post_decimal = decimal ? EAT_MANY(text, &text_state, text_index,
+ uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0;
if (pre_decimal == 0 && post_decimal == 0)
FAIL();
continue;
@@ -1178,9 +1260,6 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
if (specific_codepoint == UNINAME_INVALID)
fail("Not a valid property or character name: %s", prop_name);
}
- } else {
- any = true;
- prop = UC_PROPERTY_PRIVATE_USE;
}
got_prop:;
@@ -1222,80 +1301,16 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
}
}
}
- } else if (uc_is_property(_next_grapheme(pattern, &pattern_state, pattern_index), UC_PROPERTY_QUOTATION_MARK)
- && (pattern_index += 1, match_grapheme(pattern, &pattern_index, '?'))) {
- // Quotation: "?", '?', etc
- int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2);
- if (!match_grapheme(text, &text_index, open)) return -1;
- int32_t close = open;
- uc_mirror_char(open, (uint32_t*)&close);
- if (!match_grapheme(pattern, &pattern_index, close))
- fail("Pattern's closing brace is missing: %k", &pattern);
- while (text_index < text.length) {
- int32_t c = _next_grapheme(text, &text_state, text_index);
- if (c == close)
- return (text_index - start_index);
-
- if (c == '\\' && text_index < text.length) {
- text_index += 2;
- } else {
- text_index += 1;
- }
- }
- return -1;
- } else if (uc_is_property(_next_grapheme(pattern, &pattern_state, pattern_index), UC_PROPERTY_PAIRED_PUNCTUATION)
- && (pattern_index += 1, match_grapheme(pattern, &pattern_index, '?'))) {
- // Nested punctuation: (?), [?], etc
- int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2);
- if (!match_grapheme(text, &text_index, open)) return -1;
- int32_t close = open;
- uc_mirror_char(open, (uint32_t*)&close);
- if (!match_grapheme(pattern, &pattern_index, close))
- fail("Pattern's closing brace is missing: %k", &pattern);
- int64_t depth = 1;
- for (; depth > 0 && text_index < text.length; ++text_index) {
- int32_t c = _next_grapheme(text, &text_state, text_index);
- if (c == open)
- depth += 1;
- else if (c == close)
- depth -= 1;
- }
- if (depth > 0) return -1;
} else {
// Plain character:
pattern_index = old_pat_index;
int32_t pat_grapheme = _next_grapheme(pattern, &pattern_state, pattern_index);
-
- // if (pattern_index == 0 && text_index > 0) {
- // int32_t pat_codepoint = pat_grapheme;
- // if (pat_codepoint < 0)
- // pat_codepoint = synthetic_graphemes[-pat_codepoint-1].codepoints[0];
-
- // int32_t prev_codepoint = _next_grapheme(text, &text_state, text_index - 1);
- // if (prev_codepoint < 0)
- // prev_codepoint = synthetic_graphemes[-prev_codepoint-1].codepoints[0];
- // if (uc_is_property_alphabetic(pat_codepoint) && uc_is_property_alphabetic(prev_codepoint))
- // return -1;
- // }
-
int32_t text_grapheme = _next_grapheme(text, &text_state, text_index);
if (pat_grapheme != text_grapheme)
return -1;
pattern_index += 1;
text_index += 1;
-
- // if (pattern_index == pattern.length && text_index < text.length) {
- // int32_t pat_codepoint = pat_grapheme;
- // if (pat_codepoint < 0)
- // pat_codepoint = synthetic_graphemes[-pat_codepoint-1].codepoints[0];
-
- // int32_t next_codepoint = _next_grapheme(text, &text_state, text_index);
- // if (next_codepoint < 0)
- // next_codepoint = synthetic_graphemes[-next_codepoint-1].codepoints[0];
- // if (uc_is_property_alphabetic(pat_codepoint) && uc_is_property_alphabetic(next_codepoint))
- // return -1;
- // }
}
}
if (text_index >= text.length && pattern_index < pattern.length)
@@ -1304,6 +1319,7 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
}
#undef EAT1
+#undef EAT2
#undef EAT_MANY
public Int_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index, int64_t *match_length)
@@ -1315,7 +1331,7 @@ public Int_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index, int64_t
return I_small(0);
int32_t first_grapheme = get_grapheme(pattern, 0);
- bool find_first = (first_grapheme != '['
+ bool find_first = (first_grapheme != '{'
&& !uc_is_property(first_grapheme, UC_PROPERTY_QUOTATION_MARK)
&& !uc_is_property(first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
@@ -1677,16 +1693,14 @@ public Pattern_t Pattern$escape_text(Text_t text)
int32_t g = _next_grapheme(text, &state, i);
uint32_t g0 = g < 0 ? synthetic_graphemes[-g-1].codepoints[0] : (uint32_t)g;
- if (g == '[') {
- add_str("[..1[]");
- } else if (uc_is_property_quotation_mark(g0)) {
- add_str("[..1");
- add_char(g);
- add_char(']');
- } else if (uc_is_property_paired_punctuation(g0)) {
- add_str("[..1");
+ if (g == '{') {
+ add_str("{1{}");
+ } else if (uc_is_property_quotation_mark(g0)
+ || (uc_is_property_paired_punctuation(g0) && uc_is_property_left_of_pair(g0))) {
+ add_char('{');
+ add_char('1');
add_char(g);
- add_char(']');
+ add_char('}');
} else {
add_char(g);
}
diff --git a/docs/text.md b/docs/text.md
index 855c3c6c..cf60f6a3 100644
--- a/docs/text.md
+++ b/docs/text.md
@@ -284,9 +284,9 @@ See [Text Functions](#Text-Functions) for the full API documentation.
Patterns have three types of syntax:
-- `[..` followed by an optional count (`n`, `n-m`, or `n+`), followed by an
+- `{` followed by an optional count (`n`, `n-m`, or `n+`), followed by an
optional `!` to negate the pattern, followed by an optional pattern name or
- Unicode character name, followed by a required `]`.
+ Unicode character name, followed by a required `}`.
- Any matching pair of quotes or parentheses or braces with a `?` in the middle
(e.g. `"?"` or `(?)`).
@@ -296,10 +296,11 @@ Patterns have three types of syntax:
## Named Patterns
Named patterns match certain pre-defined patterns that are commonly useful. To
-use a named pattern, use the syntax `[..name]`. Names are case-insensitive and
+use a named pattern, use the syntax `{name}`. Names are case-insensitive and
mostly ignore spaces, underscores, and dashes.
-- ` ` - If no name is given, any character is accepted.
+- `..` - Any character (note that a single `.` would mean the literal period
+ character).
- `digit` - A unicode digit
- `email` - an email address
- `emoji` - an emoji
@@ -315,8 +316,8 @@ mostly ignore spaces, underscores, and dashes.
- `url` - a URL (URI that specifically starts with `http://`, `https://`, `ws://`, `wss://`, or `ftp://`)
For non-alphabetic characters, any single character is treated as matching
-exactly that character. For example, `[..1 []` matches exactly one `[`
-character. Or, `[..1 (]` matches exactly one `(` character.
+exactly that character. For example, `{1{}` matches exactly one `{`
+character. Or, `{1.}` matches exactly one `.` character.
Patterns can also use any Unicode property name. Some helpful ones are:
@@ -326,37 +327,37 @@ Patterns can also use any Unicode property name. Some helpful ones are:
- `upper` - Uppercase letters
- `whitespace` - Whitespace characters
-Patterns may also use exact Unicode codepoint names. For example: `[..1 latin
-small letter A]` matches `a`.
+Patterns may also use exact Unicode codepoint names. For example: `{1 latin
+small letter A}` matches `a`.
## Negating Patterns
If an exclamation mark (`!`) is placed before a pattern's name, then characters
-are matched only when they _don't_ match the pattern. For example, `[..!alpha]`
+are matched only when they _don't_ match the pattern. For example, `{!alpha}`
will match all characters _except_ alphabetic ones.
## Interpolating Text and Escaping
To escape a character in a pattern (e.g. if you want to match the literal
-character `?`), you can use the syntax `[..1 ?]`. This is almost never
-necessary unless you have text that looks like a Tomo text pattern and has
-something like `[..` or `(?)` inside it.
+character `?`), you can use the syntax `{1 ?}`. This is almost never necessary
+unless you have text that looks like a Tomo text pattern and has something like
+`{` or `(?)` inside it.
However, if you're trying to do an exact match of arbitrary text values, you'll
want to have the text automatically escaped. Fortunately, Tomo's injection-safe
DSL text interpolation supports automatic text escaping. This means that if you
use text interpolation with the `$` sign to insert a text value, the value will
-be automatically escaped using the `[..1 ?]` rule described above:
+be automatically escaped using the `{1 ?}` rule described above:
```tomo
# Risk of code injection (would cause an error because 'xxx' is not a valid
# pattern name):
>> user_input := get_user_input()
-= "[..xxx]"
+= "{xxx}"
# Interpolation automatically escapes:
>> $/$user_input/
-= $/[..1 []..xxx]/
+= $/{1{}xxx}/
# No error:
>> some_text:find($/$user_input/)
@@ -366,8 +367,8 @@ be automatically escaped using the `[..1 ?]` rule described above:
If you prefer, you can also use this to insert literal characters:
```tomo
->> $/literal $"[..]"/
-= $/literal [..1]]..]/
+>> $/literal $"{..}"/
+= $/literal {1{}..}/
```
## Repetitions
@@ -378,11 +379,11 @@ many repetitions you want by putting a number or range of numbers first using
(`n` or more repetitions):
```
-[..4-5 alpha]
-0x[..hex]
-[..4 digit]-[..2 digit]-[..2 digit]
-[..2+ space]
-[..0-1 question mark]
+{4-5 alpha}
+0x{hex}
+{4 digit}-{2 digit}-{2 digit}
+{2+ space}
+{0-1 question mark}
```
# Text Functions
@@ -625,17 +626,17 @@ found.
**Example:**
```tomo
->> " one two three ":find("[..id]", start=-999)
+>> " one two three ":find("{id}", start=-999)
= 0
->> " one two three ":find("[..id]", start=999)
+>> " one two three ":find("{id}", start=999)
= 0
->> " one two three ":find("[..id]")
+>> " one two three ":find("{id}")
= 2
->> " one two three ":find("[..id]", start=5)
+>> " one two three ":find("{id}", start=5)
= 8
>> len := 0_i64
->> " one ":find("[..id]", length=&len)
+>> " one ":find("{id}", length=&len)
= 4
>> len
= 3_i64
@@ -665,16 +666,16 @@ Note: if `text` or `pattern` is empty, an empty array will be returned.
**Example:**
```tomo
->> " one two three ":find_all("[..alpha]")
+>> " one two three ":find_all("{alpha}")
= ["one", "two", "three"]
->> " one two three ":find_all("[..!space]")
+>> " one two three ":find_all("{!space}")
= ["one", "two", "three"]
->> " ":find_all("[..alpha]")
+>> " ":find_all("{alpha}")
= []
->> " foo(baz(), 1) doop() ":find_all("[..id](?)")
+>> " foo(baz(), 1) doop() ":find_all("{id}(?)")
= ["foo(baz(), 1)", "doop()"]
>> "":find_all("")
@@ -708,11 +709,11 @@ has(text: Text, pattern: Text) -> Bool
```tomo
>> "hello world":has("wo")
= yes
->> "hello world":has("[..alpha]")
+>> "hello world":has("{alpha}")
= yes
->> "hello world":has("[..digit]")
+>> "hello world":has("{digit}")
= no
->> "hello world":has("[..start]he")
+>> "hello world":has("{start}he")
= yes
```
@@ -854,7 +855,7 @@ The text with occurrences of the pattern replaced.
>> "Hello world":replace("world", "there")
= "Hello there"
->> "Hello world":replace("[..id]", "xxx")
+>> "Hello world":replace("{id}", "xxx")
= "xxx xxx"
```
@@ -888,7 +889,7 @@ An array of substrings resulting from the split.
>> "abc":split()
= ["a", "b", "c"]
->> "a b c":split("[..space]")
+>> "a b c":split("{space}")
= ["a", "b", "c"]
>> "a,b,c,":split(",")
diff --git a/test/text.tm b/test/text.tm
index 3049af99..52c74afd 100644
--- a/test/text.tm
+++ b/test/text.tm
@@ -45,23 +45,23 @@ func main():
>> "Hello":has($/l/)
= yes
- >> "Hello":has($/l[..end]/)
+ >> "Hello":has($/l{end}/)
= no
- >> "Hello":has($/[..start]l/)
+ >> "Hello":has($/{start}l/)
= no
>> "Hello":has($/o/)
= yes
- >> "Hello":has($/o[..end]/)
+ >> "Hello":has($/o{end}/)
= yes
- >> "Hello":has($/[..start]o/)
+ >> "Hello":has($/{start}o/)
= no
>> "Hello":has($/H/)
= yes
- >> "Hello":has($/H[..end]/)
+ >> "Hello":has($/H{end}/)
= no
- >> "Hello":has($/[..start]H/)
+ >> "Hello":has($/{start}H/)
= yes
>> "Hello":replace($/l/, "")
@@ -73,9 +73,9 @@ func main():
>> "One two three four five six":replace($/e /, "")
= "Ontwo threfour fivsix"
- >> " one ":replace($/[..start][..space]/, "")
+ >> " one ":replace($/{start}{space}/, "")
= "one "
- >> " one ":replace($/[..space][..end]/, "")
+ >> " one ":replace($/{space}{end}/, "")
= " one"
>> amelie:has($/$amelie2/)
@@ -104,9 +104,9 @@ func main():
>> $(one (nested) two $(1+2))
= "one (nested) two 3"
- >> "one two three":replace($/[..alpha]/, "")
+ >> "one two three":replace($/{alpha}/, "")
= " "
- >> "one two three":replace($/[..alpha]/, "word")
+ >> "one two three":replace($/{alpha}/, "word")
= "word word word"
>> c := "É̩"
@@ -137,7 +137,7 @@ func main():
>> "one,two,three,":split($/,/)
= ["one", "two", "three", ""]
- >> "one two three":split($/[..space]/)
+ >> "one two three":split($/{space}/)
= ["one", "two", "three"]
>> "abc":split($//)
@@ -159,16 +159,16 @@ func main():
= []
!! Test text:find_all()
- >> " one two three ":find_all($/[..alpha]/)
+ >> " one two three ":find_all($/{alpha}/)
= ["one", "two", "three"]
- >> " one two three ":find_all($/[..!space]/)
+ >> " one two three ":find_all($/{!space}/)
= ["one", "two", "three"]
- >> " ":find_all($/[..alpha]/)
+ >> " ":find_all($/{alpha}/)
= []
- >> " foo(baz(), 1) doop() ":find_all($/[..id](?)/)
+ >> " foo(baz(), 1) doop() ":find_all($/{id}(?)/)
= ["foo(baz(), 1)", "doop()"]
>> "":find_all($Pattern'')
@@ -178,17 +178,17 @@ func main():
= []
!! Test text:find()
- >> " one two three ":find($/[..id]/, start=-999)
+ >> " one two three ":find($/{id}/, start=-999)
= 0
- >> " one two three ":find($/[..id]/, start=999)
+ >> " one two three ":find($/{id}/, start=999)
= 0
- >> " one two three ":find($/[..id]/)
+ >> " one two three ":find($/{id}/)
= 2
- >> " one two three ":find($/[..id]/, start=5)
+ >> " one two three ":find($/{id}/, start=5)
= 8
>> len := 0_i64
- >> " one ":find($/[..id]/, length=&len)
+ >> " one ":find($/{id}/, length=&len)
= 4
>> len
= 3_i64
@@ -220,8 +220,11 @@ func main():
>> Text.from_codepoint_names(["not a valid name here buddy"])
= ""
- >> malicious := "[..xxx"
+ >> "one two; three four":find_all($/; {..}/)
+ = ["; three four"]
+
+ >> malicious := "{xxx}"
>> $/$malicious/
- = $/[..1[]..xxx/
+ = $/{1{}xxx}/