Change pattern syntax from [..pat] to {pat}
This commit is contained in:
parent
64143f0a13
commit
91c5dc61c1
268
builtins/text.c
268
builtins/text.c
@ -814,28 +814,39 @@ const char *get_property_name(Text_t text, int64_t *i)
|
||||
++dest;
|
||||
if (dest >= name + UNINAME_MAX - 1)
|
||||
break;
|
||||
} else if (dest == name && grapheme >= 0 && grapheme != ']') {
|
||||
// Literal character escape: [..[] --> "LEFT SQUARE BRACKET"
|
||||
name = unicode_character_name(grapheme, name);
|
||||
*i += 1;
|
||||
return name;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
*i += 1;
|
||||
}
|
||||
|
||||
while (dest > name && dest[-1] == ' ')
|
||||
*(dest--) = '\0';
|
||||
|
||||
if (dest == name) return NULL;
|
||||
*dest = '\0';
|
||||
return name;
|
||||
}
|
||||
|
||||
#define EAT1(state, cond) ({\
|
||||
int32_t grapheme = _next_grapheme(text, state, text_index); \
|
||||
#define EAT1(text, state, index, cond) ({\
|
||||
int32_t grapheme = _next_grapheme(text, state, index); \
|
||||
bool success = (cond); \
|
||||
if (success) text_index += 1; \
|
||||
if (success) index += 1; \
|
||||
success; })
|
||||
|
||||
#define EAT_MANY(state, cond) ({ int64_t _n = 0; while (EAT1(state, cond)) { _n += 1; } _n; })
|
||||
// EAT2: two-grapheme variant of EAT1. Evaluates `cond1` with `grapheme` bound to
// the grapheme at `index`; only if that holds, rebinds `grapheme` to the grapheme
// at `index + 1` and evaluates `cond2`. `index` is advanced by 2 only when both
// conditions hold, otherwise it is left untouched. The whole expression evaluates
// to the boolean `success`. Note that `index` is assigned to, so it must be an
// lvalue, and `cond1`/`cond2` are expected to refer to the local name `grapheme`.
#define EAT2(text, state, index, cond1, cond2) ({\
|
||||
int32_t grapheme = _next_grapheme(text, state, index); \
|
||||
bool success = (cond1); \
|
||||
if (success) { \
|
||||
grapheme = _next_grapheme(text, state, index + 1); \
|
||||
success = (cond2); \
|
||||
if (success) \
|
||||
index += 2; \
|
||||
} \
|
||||
success; })
|
||||
|
||||
|
||||
// EAT_MANY: applies EAT1 repeatedly for as long as `cond` keeps succeeding and
// evaluates to the number of successful EAT1 steps (0 if the first one fails).
#define EAT_MANY(text, state, index, cond) ({ int64_t _n = 0; while (EAT1(text, state, index, cond)) { _n += 1; } _n; })
|
||||
|
||||
int64_t match_email(Text_t text, int64_t text_index)
|
||||
{
|
||||
@ -858,19 +869,21 @@ int64_t match_email(Text_t text, int64_t text_index)
|
||||
// Local part:
|
||||
int64_t local_len = 0;
|
||||
static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~";
|
||||
while (EAT1(&state, (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) {
|
||||
while (EAT1(text, &state, text_index,
|
||||
(grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) {
|
||||
local_len += 1;
|
||||
if (local_len > 64) return -1;
|
||||
}
|
||||
|
||||
if (!EAT1(&state, grapheme == '@'))
|
||||
if (!EAT1(text, &state, text_index, grapheme == '@'))
|
||||
return -1;
|
||||
|
||||
// Host
|
||||
int64_t host_len = 0;
|
||||
do {
|
||||
int64_t label_len = 0;
|
||||
while (EAT1(&state, (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) {
|
||||
while (EAT1(text, &state, text_index,
|
||||
(grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) {
|
||||
label_len += 1;
|
||||
if (label_len > 63) return -1;
|
||||
}
|
||||
@ -882,7 +895,7 @@ int64_t match_email(Text_t text, int64_t text_index)
|
||||
if (host_len > 255)
|
||||
return -1;
|
||||
host_len += 1;
|
||||
} while (EAT1(&state, grapheme == '.'));
|
||||
} while (EAT1(text, &state, text_index, grapheme == '.'));
|
||||
|
||||
return text_index - start_index;
|
||||
}
|
||||
@ -900,21 +913,21 @@ int64_t match_ipv6(Text_t text, int64_t text_index)
|
||||
bool double_colon_used = false;
|
||||
for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
|
||||
for (int digits = 0; digits < 4; digits++) {
|
||||
if (!EAT1(&state, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
|
||||
if (!EAT1(text, &state, text_index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
|
||||
break;
|
||||
}
|
||||
if (EAT1(&state, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
|
||||
if (EAT1(text, &state, text_index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme)))
|
||||
return -1; // Too many digits
|
||||
|
||||
if (cluster == NUM_CLUSTERS-1) {
|
||||
break;
|
||||
} else if (!EAT1(&state, grapheme == ':')) {
|
||||
} else if (!EAT1(text, &state, text_index, grapheme == ':')) {
|
||||
if (double_colon_used)
|
||||
break;
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (EAT1(&state, grapheme == ':')) {
|
||||
if (EAT1(text, &state, text_index, grapheme == ':')) {
|
||||
if (double_colon_used)
|
||||
return -1;
|
||||
double_colon_used = true;
|
||||
@ -936,18 +949,18 @@ static int64_t match_ipv4(Text_t text, int64_t text_index)
|
||||
const int NUM_CLUSTERS = 4;
|
||||
for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
|
||||
for (int digits = 0; digits < 3; digits++) {
|
||||
if (!EAT1(&state, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) {
|
||||
if (!EAT1(text, &state, text_index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) {
|
||||
if (digits == 0) return -1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (EAT1(&state, ~(grapheme & ~0x7F) && isdigit((char)grapheme)))
|
||||
if (EAT1(text, &state, text_index, ~(grapheme & ~0x7F) && isdigit((char)grapheme)))
|
||||
return -1; // Too many digits
|
||||
|
||||
if (cluster == NUM_CLUSTERS-1)
|
||||
break;
|
||||
else if (!EAT1(&state, grapheme == '.'))
|
||||
else if (!EAT1(text, &state, text_index, grapheme == '.'))
|
||||
return -1;
|
||||
}
|
||||
return (text_index - start_index);
|
||||
@ -971,10 +984,11 @@ int64_t match_uri(Text_t text, int64_t text_index)
|
||||
int64_t start_index = text_index;
|
||||
|
||||
// Scheme:
|
||||
if (!EAT1(&state, isalpha(grapheme)))
|
||||
if (!EAT1(text, &state, text_index, isalpha(grapheme)))
|
||||
return -1;
|
||||
|
||||
EAT_MANY(&state, !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-'));
|
||||
EAT_MANY(text, &state, text_index,
|
||||
!(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-'));
|
||||
|
||||
if (text_index == start_index)
|
||||
return -1;
|
||||
@ -987,12 +1001,12 @@ int64_t match_uri(Text_t text, int64_t text_index)
|
||||
int64_t authority_start = text_index;
|
||||
// Username or host:
|
||||
static const char *forbidden = "#?:@ \t\r\n<>[]{}\\^|\"`/";
|
||||
if (EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
|
||||
if (EAT_MANY(text, &state, text_index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
|
||||
return -1;
|
||||
|
||||
if (EAT1(&state, grapheme == '@')) {
|
||||
if (EAT1(text, &state, text_index, grapheme == '@')) {
|
||||
// Found a username, now get a host:
|
||||
if (EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
|
||||
if (EAT_MANY(text, &state, text_index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0)
|
||||
return -1;
|
||||
} else {
|
||||
int64_t ip = authority_start;
|
||||
@ -1007,29 +1021,29 @@ int64_t match_uri(Text_t text, int64_t text_index)
|
||||
}
|
||||
|
||||
// Port:
|
||||
if (EAT1(&state, grapheme == ':')) {
|
||||
if (EAT_MANY(&state, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0)
|
||||
if (EAT1(text, &state, text_index, grapheme == ':')) {
|
||||
if (EAT_MANY(text, &state, text_index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0)
|
||||
return -1;
|
||||
}
|
||||
if (!EAT1(&state, grapheme == '/'))
|
||||
if (!EAT1(text, &state, text_index, grapheme == '/'))
|
||||
return (text_index - start_index); // No path
|
||||
} else {
|
||||
// Optional path root:
|
||||
EAT1(&state, grapheme == '/');
|
||||
EAT1(text, &state, text_index, grapheme == '/');
|
||||
}
|
||||
|
||||
// Path:
|
||||
static const char *non_path = " \"#?<>[]{}\\^`|";
|
||||
EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme));
|
||||
EAT_MANY(text, &state, text_index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme));
|
||||
|
||||
if (EAT1(&state, grapheme == '?')) { // Query
|
||||
if (EAT1(text, &state, text_index, grapheme == '?')) { // Query
|
||||
static const char *non_query = " \"#<>[]{}\\^`|";
|
||||
EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme));
|
||||
EAT_MANY(text, &state, text_index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme));
|
||||
}
|
||||
|
||||
if (EAT1(&state, grapheme == '#')) { // Fragment
|
||||
if (EAT1(text, &state, text_index, grapheme == '#')) { // Fragment
|
||||
static const char *non_fragment = " \"#<>[]{}\\^`|";
|
||||
EAT_MANY(&state, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme));
|
||||
EAT_MANY(text, &state, text_index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme));
|
||||
}
|
||||
return text_index - start_index;
|
||||
}
|
||||
@ -1041,7 +1055,49 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
|
||||
iteration_state_t pattern_state = {0, 0}, text_state = {0, 0};
|
||||
while (pattern_index < pattern.length) {
|
||||
int64_t old_pat_index = pattern_index;
|
||||
if (match_str(pattern, &pattern_index, "[..")) {
|
||||
if (EAT2(pattern, &pattern_state, pattern_index,
|
||||
uc_is_property(grapheme, UC_PROPERTY_QUOTATION_MARK),
|
||||
grapheme == '?')) {
|
||||
// Quotations: "?", '?', etc
|
||||
int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2);
|
||||
if (!match_grapheme(text, &text_index, open)) return -1;
|
||||
int32_t close = open;
|
||||
uc_mirror_char(open, (uint32_t*)&close);
|
||||
if (!match_grapheme(pattern, &pattern_index, close))
|
||||
fail("Pattern's closing brace is missing: %k", &pattern);
|
||||
while (text_index < text.length) {
|
||||
int32_t c = _next_grapheme(text, &text_state, text_index);
|
||||
if (c == close)
|
||||
return (text_index - start_index);
|
||||
|
||||
if (c == '\\' && text_index < text.length) {
|
||||
text_index += 2;
|
||||
} else {
|
||||
text_index += 1;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
} else if (EAT2(pattern, &pattern_state, pattern_index,
|
||||
uc_is_property(grapheme, UC_PROPERTY_PAIRED_PUNCTUATION),
|
||||
grapheme == '?')) {
|
||||
// Nested punctuation: (?), [?], etc
|
||||
int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2);
|
||||
if (!match_grapheme(text, &text_index, open)) return -1;
|
||||
int32_t close = open;
|
||||
uc_mirror_char(open, (uint32_t*)&close);
|
||||
if (!match_grapheme(pattern, &pattern_index, close))
|
||||
fail("Pattern's closing brace is missing: %k", &pattern);
|
||||
int64_t depth = 1;
|
||||
for (; depth > 0 && text_index < text.length; ++text_index) {
|
||||
int32_t c = _next_grapheme(text, &text_state, text_index);
|
||||
if (c == open)
|
||||
depth += 1;
|
||||
else if (c == close)
|
||||
depth -= 1;
|
||||
}
|
||||
if (depth > 0) return -1;
|
||||
} else if (EAT1(pattern, &pattern_state, pattern_index,
|
||||
grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc.
|
||||
skip_whitespace(pattern, &pattern_index);
|
||||
int64_t min, max;
|
||||
if (uc_is_digit(_next_grapheme(pattern, &pattern_state, pattern_index))) {
|
||||
@ -1059,21 +1115,42 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
|
||||
}
|
||||
|
||||
skip_whitespace(pattern, &pattern_index);
|
||||
bool want_to_match = !match_grapheme(pattern, &pattern_index, '!');
|
||||
const char *prop_name = get_property_name(pattern, &pattern_index);
|
||||
|
||||
skip_whitespace(pattern, &pattern_index);
|
||||
if (!match_grapheme(pattern, &pattern_index, ']'))
|
||||
fail("Missing closing ']' in pattern: %k", &pattern);
|
||||
|
||||
int64_t before_group = text_index;
|
||||
bool any = false;
|
||||
uc_property_t prop;
|
||||
int32_t specific_codepoint = UNINAME_INVALID;
|
||||
bool want_to_match = !match_grapheme(pattern, &pattern_index, '!');
|
||||
const char *prop_name;
|
||||
if (match_str(pattern, &pattern_index, ".."))
|
||||
prop_name = "..";
|
||||
else
|
||||
prop_name = get_property_name(pattern, &pattern_index);
|
||||
|
||||
if (!prop_name) {
|
||||
// Literal character, e.g. {1?}
|
||||
specific_codepoint = _next_grapheme(pattern, &pattern_state, pattern_index);
|
||||
pattern_index += 1;
|
||||
} else if (strlen(prop_name) == 1) {
|
||||
// Single letter names: {1+ A}
|
||||
specific_codepoint = prop_name[0];
|
||||
prop_name = NULL;
|
||||
}
|
||||
|
||||
skip_whitespace(pattern, &pattern_index);
|
||||
if (!match_grapheme(pattern, &pattern_index, '}'))
|
||||
fail("Missing closing '}' in pattern: %k", &pattern);
|
||||
|
||||
int64_t before_group = text_index;
|
||||
#define FAIL() ({ if (min < 1) { text_index = before_group; continue; } else { return -1; } })
|
||||
if (prop_name) {
|
||||
switch (tolower(prop_name[0])) {
|
||||
case '.':
|
||||
if (prop_name[1] == '.') {
|
||||
any = true;
|
||||
prop = UC_PROPERTY_PRIVATE_USE;
|
||||
break;
|
||||
}
|
||||
break;
|
||||
case 'd':
|
||||
if (strcasecmp(prop_name, "digit") == 0) {
|
||||
prop = UC_PROPERTY_DECIMAL_DIGIT;
|
||||
@ -1098,13 +1175,16 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
|
||||
break;
|
||||
case 'i':
|
||||
if (prop_name && strcasecmp(prop_name, "id") == 0) {
|
||||
if (!EAT1(&text_state, uc_is_property(grapheme, UC_PROPERTY_XID_START)))
|
||||
if (!EAT1(text, &text_state, text_index,
|
||||
uc_is_property(grapheme, UC_PROPERTY_XID_START)))
|
||||
FAIL();
|
||||
EAT_MANY(&text_state, uc_is_property(grapheme, UC_PROPERTY_XID_CONTINUE));
|
||||
EAT_MANY(text, &text_state, text_index,
|
||||
uc_is_property(grapheme, UC_PROPERTY_XID_CONTINUE));
|
||||
continue;
|
||||
} else if (prop_name && strcasecmp(prop_name, "int") == 0) {
|
||||
EAT1(&text_state, grapheme == '-');
|
||||
int64_t n = EAT_MANY(&text_state, uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT));
|
||||
EAT1(text, &text_state, text_index, grapheme == '-');
|
||||
int64_t n = EAT_MANY(text, &text_state, text_index,
|
||||
uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT));
|
||||
if (n <= 0)
|
||||
FAIL();
|
||||
continue;
|
||||
@ -1132,10 +1212,12 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
|
||||
break;
|
||||
case 'n':
|
||||
if (prop_name && strcasecmp(prop_name, "num") == 0) {
|
||||
EAT1(&text_state, grapheme == '-');
|
||||
int64_t pre_decimal = EAT_MANY(&text_state, uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT));
|
||||
bool decimal = (EAT1(&text_state, grapheme == '.') == 1);
|
||||
int64_t post_decimal = decimal ? EAT_MANY(&text_state, uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0;
|
||||
EAT1(text, &text_state, text_index, grapheme == '-');
|
||||
int64_t pre_decimal = EAT_MANY(text, &text_state, text_index,
|
||||
uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT));
|
||||
bool decimal = (EAT1(text, &text_state, text_index, grapheme == '.') == 1);
|
||||
int64_t post_decimal = decimal ? EAT_MANY(text, &text_state, text_index,
|
||||
uc_is_property(grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0;
|
||||
if (pre_decimal == 0 && post_decimal == 0)
|
||||
FAIL();
|
||||
continue;
|
||||
@ -1178,9 +1260,6 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
|
||||
if (specific_codepoint == UNINAME_INVALID)
|
||||
fail("Not a valid property or character name: %s", prop_name);
|
||||
}
|
||||
} else {
|
||||
any = true;
|
||||
prop = UC_PROPERTY_PRIVATE_USE;
|
||||
}
|
||||
got_prop:;
|
||||
|
||||
@ -1222,80 +1301,16 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
|
||||
}
|
||||
}
|
||||
}
|
||||
} else if (uc_is_property(_next_grapheme(pattern, &pattern_state, pattern_index), UC_PROPERTY_QUOTATION_MARK)
|
||||
&& (pattern_index += 1, match_grapheme(pattern, &pattern_index, '?'))) {
|
||||
// Quotation: "?", '?', etc
|
||||
int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2);
|
||||
if (!match_grapheme(text, &text_index, open)) return -1;
|
||||
int32_t close = open;
|
||||
uc_mirror_char(open, (uint32_t*)&close);
|
||||
if (!match_grapheme(pattern, &pattern_index, close))
|
||||
fail("Pattern's closing brace is missing: %k", &pattern);
|
||||
while (text_index < text.length) {
|
||||
int32_t c = _next_grapheme(text, &text_state, text_index);
|
||||
if (c == close)
|
||||
return (text_index - start_index);
|
||||
|
||||
if (c == '\\' && text_index < text.length) {
|
||||
text_index += 2;
|
||||
} else {
|
||||
text_index += 1;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
} else if (uc_is_property(_next_grapheme(pattern, &pattern_state, pattern_index), UC_PROPERTY_PAIRED_PUNCTUATION)
|
||||
&& (pattern_index += 1, match_grapheme(pattern, &pattern_index, '?'))) {
|
||||
// Nested punctuation: (?), [?], etc
|
||||
int32_t open = _next_grapheme(pattern, &pattern_state, pattern_index-2);
|
||||
if (!match_grapheme(text, &text_index, open)) return -1;
|
||||
int32_t close = open;
|
||||
uc_mirror_char(open, (uint32_t*)&close);
|
||||
if (!match_grapheme(pattern, &pattern_index, close))
|
||||
fail("Pattern's closing brace is missing: %k", &pattern);
|
||||
int64_t depth = 1;
|
||||
for (; depth > 0 && text_index < text.length; ++text_index) {
|
||||
int32_t c = _next_grapheme(text, &text_state, text_index);
|
||||
if (c == open)
|
||||
depth += 1;
|
||||
else if (c == close)
|
||||
depth -= 1;
|
||||
}
|
||||
if (depth > 0) return -1;
|
||||
} else {
|
||||
// Plain character:
|
||||
pattern_index = old_pat_index;
|
||||
int32_t pat_grapheme = _next_grapheme(pattern, &pattern_state, pattern_index);
|
||||
|
||||
// if (pattern_index == 0 && text_index > 0) {
|
||||
// int32_t pat_codepoint = pat_grapheme;
|
||||
// if (pat_codepoint < 0)
|
||||
// pat_codepoint = synthetic_graphemes[-pat_codepoint-1].codepoints[0];
|
||||
|
||||
// int32_t prev_codepoint = _next_grapheme(text, &text_state, text_index - 1);
|
||||
// if (prev_codepoint < 0)
|
||||
// prev_codepoint = synthetic_graphemes[-prev_codepoint-1].codepoints[0];
|
||||
// if (uc_is_property_alphabetic(pat_codepoint) && uc_is_property_alphabetic(prev_codepoint))
|
||||
// return -1;
|
||||
// }
|
||||
|
||||
int32_t text_grapheme = _next_grapheme(text, &text_state, text_index);
|
||||
if (pat_grapheme != text_grapheme)
|
||||
return -1;
|
||||
|
||||
pattern_index += 1;
|
||||
text_index += 1;
|
||||
|
||||
// if (pattern_index == pattern.length && text_index < text.length) {
|
||||
// int32_t pat_codepoint = pat_grapheme;
|
||||
// if (pat_codepoint < 0)
|
||||
// pat_codepoint = synthetic_graphemes[-pat_codepoint-1].codepoints[0];
|
||||
|
||||
// int32_t next_codepoint = _next_grapheme(text, &text_state, text_index);
|
||||
// if (next_codepoint < 0)
|
||||
// next_codepoint = synthetic_graphemes[-next_codepoint-1].codepoints[0];
|
||||
// if (uc_is_property_alphabetic(pat_codepoint) && uc_is_property_alphabetic(next_codepoint))
|
||||
// return -1;
|
||||
// }
|
||||
}
|
||||
}
|
||||
if (text_index >= text.length && pattern_index < pattern.length)
|
||||
@ -1304,6 +1319,7 @@ int64_t match(Text_t text, Pattern_t pattern, int64_t text_index, int64_t patter
|
||||
}
|
||||
|
||||
#undef EAT1
|
||||
#undef EAT2
|
||||
#undef EAT_MANY
|
||||
|
||||
public Int_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index, int64_t *match_length)
|
||||
@ -1315,7 +1331,7 @@ public Int_t Text$find(Text_t text, Pattern_t pattern, Int_t from_index, int64_t
|
||||
return I_small(0);
|
||||
|
||||
int32_t first_grapheme = get_grapheme(pattern, 0);
|
||||
bool find_first = (first_grapheme != '['
|
||||
bool find_first = (first_grapheme != '{'
|
||||
&& !uc_is_property(first_grapheme, UC_PROPERTY_QUOTATION_MARK)
|
||||
&& !uc_is_property(first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
|
||||
|
||||
@ -1677,16 +1693,14 @@ public Pattern_t Pattern$escape_text(Text_t text)
|
||||
int32_t g = _next_grapheme(text, &state, i);
|
||||
uint32_t g0 = g < 0 ? synthetic_graphemes[-g-1].codepoints[0] : (uint32_t)g;
|
||||
|
||||
if (g == '[') {
|
||||
add_str("[..1[]");
|
||||
} else if (uc_is_property_quotation_mark(g0)) {
|
||||
add_str("[..1");
|
||||
if (g == '{') {
|
||||
add_str("{1{}");
|
||||
} else if (uc_is_property_quotation_mark(g0)
|
||||
|| (uc_is_property_paired_punctuation(g0) && uc_is_property_left_of_pair(g0))) {
|
||||
add_char('{');
|
||||
add_char('1');
|
||||
add_char(g);
|
||||
add_char(']');
|
||||
} else if (uc_is_property_paired_punctuation(g0)) {
|
||||
add_str("[..1");
|
||||
add_char(g);
|
||||
add_char(']');
|
||||
add_char('}');
|
||||
} else {
|
||||
add_char(g);
|
||||
}
|
||||
|
73
docs/text.md
73
docs/text.md
@ -284,9 +284,9 @@ See [Text Functions](#Text-Functions) for the full API documentation.
|
||||
|
||||
Patterns have three types of syntax:
|
||||
|
||||
- `[..` followed by an optional count (`n`, `n-m`, or `n+`), followed by an
|
||||
- `{` followed by an optional count (`n`, `n-m`, or `n+`), followed by an
|
||||
optional `!` to negate the pattern, followed by an optional pattern name or
|
||||
Unicode character name, followed by a required `]`.
|
||||
Unicode character name, followed by a required `}`.
|
||||
|
||||
- Any matching pair of quotes or parentheses or braces with a `?` in the middle
|
||||
(e.g. `"?"` or `(?)`).
|
||||
@ -296,10 +296,11 @@ Patterns have three types of syntax:
|
||||
## Named Patterns
|
||||
|
||||
Named patterns match certain pre-defined patterns that are commonly useful. To
|
||||
use a named pattern, use the syntax `[..name]`. Names are case-insensitive and
|
||||
use a named pattern, use the syntax `{name}`. Names are case-insensitive and
|
||||
mostly ignore spaces, underscores, and dashes.
|
||||
|
||||
- ` ` - If no name is given, any character is accepted.
|
||||
- `..` - Any character (note that a single `.` would mean the literal period
|
||||
character).
|
||||
- `digit` - A unicode digit
|
||||
- `email` - an email address
|
||||
- `emoji` - an emoji
|
||||
@ -315,8 +316,8 @@ mostly ignore spaces, underscores, and dashes.
|
||||
- `url` - a URL (URI that specifically starts with `http://`, `https://`, `ws://`, `wss://`, or `ftp://`)
|
||||
|
||||
For non-alphabetic characters, any single character is treated as matching
|
||||
exactly that character. For example, `[..1 []` matches exactly one `[`
|
||||
character. Or, `[..1 (]` matches exactly one `(` character.
|
||||
exactly that character. For example, `{1{}` matches exactly one `{`
|
||||
character. Or, `{1.}` matches exactly one `.` character.
|
||||
|
||||
Patterns can also use any Unicode property name. Some helpful ones are:
|
||||
|
||||
@ -326,37 +327,37 @@ Patterns can also use any Unicode property name. Some helpful ones are:
|
||||
- `upper` - Uppercase letters
|
||||
- `whitespace` - Whitespace characters
|
||||
|
||||
Patterns may also use exact Unicode codepoint names. For example: `[..1 latin
|
||||
small letter A]` matches `a`.
|
||||
Patterns may also use exact Unicode codepoint names. For example: `{1 latin
|
||||
small letter A}` matches `a`.
|
||||
|
||||
## Negating Patterns
|
||||
|
||||
If an exclamation mark (`!`) is placed before a pattern's name, then characters
|
||||
are matched only when they _don't_ match the pattern. For example, `[..!alpha]`
|
||||
are matched only when they _don't_ match the pattern. For example, `{!alpha}`
|
||||
will match all characters _except_ alphabetic ones.
|
||||
|
||||
## Interpolating Text and Escaping
|
||||
|
||||
To escape a character in a pattern (e.g. if you want to match the literal
|
||||
character `?`), you can use the syntax `[..1 ?]`. This is almost never
|
||||
necessary unless you have text that looks like a Tomo text pattern and has
|
||||
something like `[..` or `(?)` inside it.
|
||||
character `?`), you can use the syntax `{1 ?}`. This is almost never necessary
|
||||
unless you have text that looks like a Tomo text pattern and has something like
|
||||
`{` or `(?)` inside it.
|
||||
|
||||
However, if you're trying to do an exact match of arbitrary text values, you'll
|
||||
want to have the text automatically escaped. Fortunately, Tomo's injection-safe
|
||||
DSL text interpolation supports automatic text escaping. This means that if you
|
||||
use text interpolation with the `$` sign to insert a text value, the value will
|
||||
be automatically escaped using the `[..1 ?]` rule described above:
|
||||
be automatically escaped using the `{1 ?}` rule described above:
|
||||
|
||||
```tomo
|
||||
# Risk of code injection (would cause an error because 'xxx' is not a valid
|
||||
# pattern name):
|
||||
>> user_input := get_user_input()
|
||||
= "[..xxx]"
|
||||
= "{xxx}"
|
||||
|
||||
# Interpolation automatically escapes:
|
||||
>> $/$user_input/
|
||||
= $/[..1 []..xxx]/
|
||||
= $/{1{}xxx}/
|
||||
|
||||
# No error:
|
||||
>> some_text:find($/$user_input/)
|
||||
@ -366,8 +367,8 @@ be automatically escaped using the `[..1 ?]` rule described above:
|
||||
If you prefer, you can also use this to insert literal characters:
|
||||
|
||||
```tomo
|
||||
>> $/literal $"[..]"/
|
||||
= $/literal [..1]]..]/
|
||||
>> $/literal $"{..}"/
|
||||
= $/literal {1{}..}/
|
||||
```
|
||||
|
||||
## Repetitions
|
||||
@ -378,11 +379,11 @@ many repetitions you want by putting a number or range of numbers first using
|
||||
(`n` or more repetitions):
|
||||
|
||||
```
|
||||
[..4-5 alpha]
|
||||
0x[..hex]
|
||||
[..4 digit]-[..2 digit]-[..2 digit]
|
||||
[..2+ space]
|
||||
[..0-1 question mark]
|
||||
{4-5 alpha}
|
||||
0x{hex}
|
||||
{4 digit}-{2 digit}-{2 digit}
|
||||
{2+ space}
|
||||
{0-1 question mark}
|
||||
```
|
||||
|
||||
# Text Functions
|
||||
@ -625,17 +626,17 @@ found.
|
||||
|
||||
**Example:**
|
||||
```tomo
|
||||
>> " one two three ":find("[..id]", start=-999)
|
||||
>> " one two three ":find("{id}", start=-999)
|
||||
= 0
|
||||
>> " one two three ":find("[..id]", start=999)
|
||||
>> " one two three ":find("{id}", start=999)
|
||||
= 0
|
||||
>> " one two three ":find("[..id]")
|
||||
>> " one two three ":find("{id}")
|
||||
= 2
|
||||
>> " one two three ":find("[..id]", start=5)
|
||||
>> " one two three ":find("{id}", start=5)
|
||||
= 8
|
||||
|
||||
>> len := 0_i64
|
||||
>> " one ":find("[..id]", length=&len)
|
||||
>> " one ":find("{id}", length=&len)
|
||||
= 4
|
||||
>> len
|
||||
= 3_i64
|
||||
@ -665,16 +666,16 @@ Note: if `text` or `pattern` is empty, an empty array will be returned.
|
||||
|
||||
**Example:**
|
||||
```tomo
|
||||
>> " one two three ":find_all("[..alpha]")
|
||||
>> " one two three ":find_all("{alpha}")
|
||||
= ["one", "two", "three"]
|
||||
|
||||
>> " one two three ":find_all("[..!space]")
|
||||
>> " one two three ":find_all("{!space}")
|
||||
= ["one", "two", "three"]
|
||||
|
||||
>> " ":find_all("[..alpha]")
|
||||
>> " ":find_all("{alpha}")
|
||||
= []
|
||||
|
||||
>> " foo(baz(), 1) doop() ":find_all("[..id](?)")
|
||||
>> " foo(baz(), 1) doop() ":find_all("{id}(?)")
|
||||
= ["foo(baz(), 1)", "doop()"]
|
||||
|
||||
>> "":find_all("")
|
||||
@ -708,11 +709,11 @@ has(text: Text, pattern: Text) -> Bool
|
||||
```tomo
|
||||
>> "hello world":has("wo")
|
||||
= yes
|
||||
>> "hello world":has("[..alpha]")
|
||||
>> "hello world":has("{alpha}")
|
||||
= yes
|
||||
>> "hello world":has("[..digit]")
|
||||
>> "hello world":has("{digit}")
|
||||
= no
|
||||
>> "hello world":has("[..start]he")
|
||||
>> "hello world":has("{start}he")
|
||||
= yes
|
||||
```
|
||||
|
||||
@ -854,7 +855,7 @@ The text with occurrences of the pattern replaced.
|
||||
>> "Hello world":replace("world", "there")
|
||||
= "Hello there"
|
||||
|
||||
>> "Hello world":replace("[..id]", "xxx")
|
||||
>> "Hello world":replace("{id}", "xxx")
|
||||
= "xxx xxx"
|
||||
```
|
||||
|
||||
@ -888,7 +889,7 @@ An array of substrings resulting from the split.
|
||||
>> "abc":split()
|
||||
= ["a", "b", "c"]
|
||||
|
||||
>> "a b c":split("[..space]")
|
||||
>> "a b c":split("{space}")
|
||||
= ["a", "b", "c"]
|
||||
|
||||
>> "a,b,c,":split(",")
|
||||
|
47
test/text.tm
47
test/text.tm
@ -45,23 +45,23 @@ func main():
|
||||
|
||||
>> "Hello":has($/l/)
|
||||
= yes
|
||||
>> "Hello":has($/l[..end]/)
|
||||
>> "Hello":has($/l{end}/)
|
||||
= no
|
||||
>> "Hello":has($/[..start]l/)
|
||||
>> "Hello":has($/{start}l/)
|
||||
= no
|
||||
|
||||
>> "Hello":has($/o/)
|
||||
= yes
|
||||
>> "Hello":has($/o[..end]/)
|
||||
>> "Hello":has($/o{end}/)
|
||||
= yes
|
||||
>> "Hello":has($/[..start]o/)
|
||||
>> "Hello":has($/{start}o/)
|
||||
= no
|
||||
|
||||
>> "Hello":has($/H/)
|
||||
= yes
|
||||
>> "Hello":has($/H[..end]/)
|
||||
>> "Hello":has($/H{end}/)
|
||||
= no
|
||||
>> "Hello":has($/[..start]H/)
|
||||
>> "Hello":has($/{start}H/)
|
||||
= yes
|
||||
|
||||
>> "Hello":replace($/l/, "")
|
||||
@ -73,9 +73,9 @@ func main():
|
||||
>> "One two three four five six":replace($/e /, "")
|
||||
= "Ontwo threfour fivsix"
|
||||
|
||||
>> " one ":replace($/[..start][..space]/, "")
|
||||
>> " one ":replace($/{start}{space}/, "")
|
||||
= "one "
|
||||
>> " one ":replace($/[..space][..end]/, "")
|
||||
>> " one ":replace($/{space}{end}/, "")
|
||||
= " one"
|
||||
|
||||
>> amelie:has($/$amelie2/)
|
||||
@ -104,9 +104,9 @@ func main():
|
||||
>> $(one (nested) two $(1+2))
|
||||
= "one (nested) two 3"
|
||||
|
||||
>> "one two three":replace($/[..alpha]/, "")
|
||||
>> "one two three":replace($/{alpha}/, "")
|
||||
= " "
|
||||
>> "one two three":replace($/[..alpha]/, "word")
|
||||
>> "one two three":replace($/{alpha}/, "word")
|
||||
= "word word word"
|
||||
|
||||
>> c := "É̩"
|
||||
@ -137,7 +137,7 @@ func main():
|
||||
>> "one,two,three,":split($/,/)
|
||||
= ["one", "two", "three", ""]
|
||||
|
||||
>> "one two three":split($/[..space]/)
|
||||
>> "one two three":split($/{space}/)
|
||||
= ["one", "two", "three"]
|
||||
|
||||
>> "abc":split($//)
|
||||
@ -159,16 +159,16 @@ func main():
|
||||
= []
|
||||
|
||||
!! Test text:find_all()
|
||||
>> " one two three ":find_all($/[..alpha]/)
|
||||
>> " one two three ":find_all($/{alpha}/)
|
||||
= ["one", "two", "three"]
|
||||
|
||||
>> " one two three ":find_all($/[..!space]/)
|
||||
>> " one two three ":find_all($/{!space}/)
|
||||
= ["one", "two", "three"]
|
||||
|
||||
>> " ":find_all($/[..alpha]/)
|
||||
>> " ":find_all($/{alpha}/)
|
||||
= []
|
||||
|
||||
>> " foo(baz(), 1) doop() ":find_all($/[..id](?)/)
|
||||
>> " foo(baz(), 1) doop() ":find_all($/{id}(?)/)
|
||||
= ["foo(baz(), 1)", "doop()"]
|
||||
|
||||
>> "":find_all($Pattern'')
|
||||
@ -178,17 +178,17 @@ func main():
|
||||
= []
|
||||
|
||||
!! Test text:find()
|
||||
>> " one two three ":find($/[..id]/, start=-999)
|
||||
>> " one two three ":find($/{id}/, start=-999)
|
||||
= 0
|
||||
>> " one two three ":find($/[..id]/, start=999)
|
||||
>> " one two three ":find($/{id}/, start=999)
|
||||
= 0
|
||||
>> " one two three ":find($/[..id]/)
|
||||
>> " one two three ":find($/{id}/)
|
||||
= 2
|
||||
>> " one two three ":find($/[..id]/, start=5)
|
||||
>> " one two three ":find($/{id}/, start=5)
|
||||
= 8
|
||||
|
||||
>> len := 0_i64
|
||||
>> " one ":find($/[..id]/, length=&len)
|
||||
>> " one ":find($/{id}/, length=&len)
|
||||
= 4
|
||||
>> len
|
||||
= 3_i64
|
||||
@ -220,8 +220,11 @@ func main():
|
||||
>> Text.from_codepoint_names(["not a valid name here buddy"])
|
||||
= ""
|
||||
|
||||
>> malicious := "[..xxx"
|
||||
>> "one two; three four":find_all($/; {..}/)
|
||||
= ["; three four"]
|
||||
|
||||
>> malicious := "{xxx}"
|
||||
>> $/$malicious/
|
||||
= $/[..1[]..xxx/
|
||||
= $/{1{}xxx}/
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user