diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2024-09-14 00:12:52 -0400 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2024-09-14 00:12:52 -0400 |
| commit | 6012a00763afdd467e71b1657bd9a39a4cba4493 (patch) | |
| tree | 6994a7074b4182211c3beec8553c02f638976b7f /stdlib/patterns.c | |
| parent | 2b0556084919ace0700e4480f7fa2886cf31b3e4 (diff) | |
Clean up pattern code to make better use of TextIter_t and fix up
URI/URL patterns
Diffstat (limited to 'stdlib/patterns.c')
| -rw-r--r-- | stdlib/patterns.c | 418 |
1 file changed, 218 insertions, 200 deletions
diff --git a/stdlib/patterns.c b/stdlib/patterns.c index 81beaffe..2de7fe3a 100644 --- a/stdlib/patterns.c +++ b/stdlib/patterns.c @@ -14,32 +14,48 @@ #define MAX_BACKREFS 100 -static inline void skip_whitespace(Text_t text, int64_t *i) +typedef struct { + int64_t index, length; + bool occupied, recursive; +} capture_t; + +typedef struct { + enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag; + bool negated, non_capturing; + int64_t min, max; + union { + int32_t grapheme; + uc_property_t property; + int64_t (*fn)(TextIter_t *, int64_t); + int32_t quote_graphemes[2]; + int32_t pair_graphemes[2]; + }; +} pat_t; + +static inline void skip_whitespace(TextIter_t *state, int64_t *i) { - TextIter_t state = {0, 0}; - while (*i < text.length) { - int32_t grapheme = Text$get_grapheme_fast(text, &state, *i); + while (*i < state->text.length) { + int32_t grapheme = Text$get_grapheme_fast(state, *i); if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme)) return; *i += 1; } } -static inline bool match_grapheme(Text_t text, int64_t *i, int32_t grapheme) +static inline bool match_grapheme(TextIter_t *state, int64_t *i, int32_t grapheme) { - if (*i < text.length && Text$get_grapheme(text, *i) == grapheme) { + if (*i < state->text.length && Text$get_grapheme_fast(state, *i) == grapheme) { *i += 1; return true; } return false; } -static inline bool match_str(Text_t text, int64_t *i, const char *str) +static inline bool match_str(TextIter_t *state, int64_t *i, const char *str) { - TextIter_t state = {0, 0}; int64_t matched = 0; while (matched[str]) { - if (*i + matched >= text.length || Text$get_grapheme_fast(text, &state, *i + matched) != str[matched]) + if (*i + matched >= state->text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched]) return false; matched += 1; } @@ -47,11 +63,10 @@ static inline bool match_str(Text_t text, int64_t *i, const char *str) return true; } -static inline bool 
match_property(Text_t text, int64_t *i, uc_property_t prop) +static inline bool match_property(TextIter_t *state, int64_t *i, uc_property_t prop) { - if (*i >= text.length) return false; - TextIter_t state = {}; - ucs4_t grapheme = Text$get_main_grapheme_fast(text, &state, *i); + if (*i >= state->text.length) return false; + ucs4_t grapheme = Text$get_main_grapheme_fast(state, *i); // TODO: check every codepoint in the cluster? if (uc_is_property(grapheme, prop)) { *i += 1; @@ -60,12 +75,11 @@ static inline bool match_property(Text_t text, int64_t *i, uc_property_t prop) return false; } -static int64_t parse_int(Text_t text, int64_t *i) +static int64_t parse_int(TextIter_t *state, int64_t *i) { - TextIter_t state = {0, 0}; int64_t value = 0; for (;; *i += 1) { - ucs4_t grapheme = Text$get_main_grapheme_fast(text, &state, *i); + ucs4_t grapheme = Text$get_main_grapheme_fast(state, *i); int digit = uc_digit_value((ucs4_t)grapheme); if (digit < 0) break; if (value >= INT64_MAX/10) break; @@ -74,14 +88,13 @@ static int64_t parse_int(Text_t text, int64_t *i) return value; } -const char *get_property_name(Text_t text, int64_t *i) +static const char *get_property_name(TextIter_t *state, int64_t *i) { - skip_whitespace(text, i); + skip_whitespace(state, i); char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); char *dest = name; - TextIter_t state = {0, 0}; - while (*i < text.length) { - int32_t grapheme = Text$get_grapheme_fast(text, &state, *i); + while (*i < state->text.length) { + int32_t grapheme = Text$get_grapheme_fast(state, *i); if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) { *dest = (char)grapheme; ++dest; @@ -101,17 +114,17 @@ const char *get_property_name(Text_t text, int64_t *i) return name; } -#define EAT1(text, state, index, cond) ({\ - int32_t grapheme = Text$get_grapheme_fast(text, state, index); \ +#define EAT1(state, index, cond) ({\ + int32_t grapheme = Text$get_grapheme_fast(state, index); \ bool success 
= (cond); \ if (success) index += 1; \ success; }) -#define EAT2(text, state, index, cond1, cond2) ({\ - int32_t grapheme = Text$get_grapheme_fast(text, state, index); \ +#define EAT2(state, index, cond1, cond2) ({\ + int32_t grapheme = Text$get_grapheme_fast(state, index); \ bool success = (cond1); \ if (success) { \ - grapheme = Text$get_grapheme_fast(text, state, index + 1); \ + grapheme = Text$get_grapheme_fast(state, index + 1); \ success = (cond2); \ if (success) \ index += 2; \ @@ -119,18 +132,17 @@ const char *get_property_name(Text_t text, int64_t *i) success; }) -#define EAT_MANY(text, state, index, cond) ({ int64_t _n = 0; while (EAT1(text, state, index, cond)) { _n += 1; } _n; }) +#define EAT_MANY(state, index, cond) ({ int64_t _n = 0; while (EAT1(state, index, cond)) { _n += 1; } _n; }) -int64_t match_email(Text_t text, int64_t index) +static int64_t match_email(TextIter_t *state, int64_t index) { // email = local "@" domain // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii) // domain = dns-label ("." 
dns-label)* // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii) - TextIter_t state = {0, 0}; if (index > 0) { - ucs4_t prev_codepoint = Text$get_main_grapheme_fast(text, &state, index - 1); + ucs4_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1); if (uc_is_property_alphabetic((ucs4_t)prev_codepoint)) return -1; } @@ -140,20 +152,20 @@ int64_t match_email(Text_t text, int64_t index) // Local part: int64_t local_len = 0; static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~"; - while (EAT1(text, &state, index, + while (EAT1(state, index, (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) { local_len += 1; if (local_len > 64) return -1; } - if (!EAT1(text, &state, index, grapheme == '@')) + if (!EAT1(state, index, grapheme == '@')) return -1; // Host int64_t host_len = 0; do { int64_t label_len = 0; - while (EAT1(text, &state, index, + while (EAT1(state, index, (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) { label_len += 1; if (label_len > 63) return -1; @@ -166,16 +178,15 @@ int64_t match_email(Text_t text, int64_t index) if (host_len > 255) return -1; host_len += 1; - } while (EAT1(text, &state, index, grapheme == '.')); + } while (EAT1(state, index, grapheme == '.')); return index - start_index; } -int64_t match_ipv6(Text_t text, int64_t index) +static int64_t match_ipv6(TextIter_t *state, int64_t index) { - TextIter_t state = {0, 0}; if (index > 0) { - int32_t prev_codepoint = Text$get_grapheme_fast(text, &state, index - 1); + int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1); if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':')) return -1; } @@ -184,21 +195,21 @@ int64_t match_ipv6(Text_t text, int64_t index) bool double_colon_used = false; for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { for (int digits = 0; digits < 4; digits++) { - if (!EAT1(text, &state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) + if (!EAT1(state, 
index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) break; } - if (EAT1(text, &state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) + if (EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) return -1; // Too many digits if (cluster == NUM_CLUSTERS-1) { break; - } else if (!EAT1(text, &state, index, grapheme == ':')) { + } else if (!EAT1(state, index, grapheme == ':')) { if (double_colon_used) break; return -1; } - if (EAT1(text, &state, index, grapheme == ':')) { + if (EAT1(state, index, grapheme == ':')) { if (double_colon_used) return -1; double_colon_used = true; @@ -207,11 +218,10 @@ int64_t match_ipv6(Text_t text, int64_t index) return index - start_index; } -static int64_t match_ipv4(Text_t text, int64_t index) +static int64_t match_ipv4(TextIter_t *state, int64_t index) { - TextIter_t state = {0, 0}; if (index > 0) { - int32_t prev_codepoint = Text$get_grapheme_fast(text, &state, index - 1); + int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1); if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.')) return -1; } @@ -220,40 +230,86 @@ static int64_t match_ipv4(Text_t text, int64_t index) const int NUM_CLUSTERS = 4; for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { for (int digits = 0; digits < 3; digits++) { - if (!EAT1(text, &state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) { + if (!EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) { if (digits == 0) return -1; break; } } - if (EAT1(text, &state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) + if (EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) return -1; // Too many digits if (cluster == NUM_CLUSTERS-1) break; - else if (!EAT1(text, &state, index, grapheme == '.')) + else if (!EAT1(state, index, grapheme == '.')) return -1; } return (index - start_index); } -int64_t match_ip(Text_t text, int64_t index) +static int64_t match_ip(TextIter_t *state, int64_t index) { 
- int64_t len = match_ipv6(text, index); + int64_t len = match_ipv6(state, index); if (len >= 0) return len; - len = match_ipv4(text, index); + len = match_ipv4(state, index); return (len >= 0) ? len : -1; } -int64_t match_uri(Text_t text, int64_t index) +static int64_t match_host(TextIter_t *state, int64_t index) +{ + int64_t ip_len = match_ip(state, index); + if (ip_len > 0) return ip_len; + + int64_t start_index = index; + if (match_grapheme(state, &index, '[')) { + ip_len = match_ip(state, index); + if (ip_len <= 0) return -1; + index += ip_len; + if (match_grapheme(state, &index, ']')) + return (index - start_index); + return -1; + } + + if (!EAT1(state, index, isalpha(grapheme))) + return -1; + + static const char *non_host_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`"; + EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_host_chars, (char)grapheme)); + return (index - start_index); +} + +static int64_t match_authority(TextIter_t *state, int64_t index) +{ + int64_t authority_start = index; + static const char *non_segment_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`."; + + // Optional user@ prefix: + int64_t username_len = EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_segment_chars, (char)grapheme)); + if (username_len < 1 || !EAT1(state, index, grapheme == '@')) + index = authority_start; // No user@ part + + // Host: + int64_t host_len = match_host(state, index); + if (host_len <= 0) return -1; + index += host_len; + + // Port: + if (EAT1(state, index, grapheme == ':')) { + if (EAT_MANY(state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0) + return -1; + } + return (index - authority_start); +} + +static int64_t match_uri(TextIter_t *state, int64_t index) { // URI = scheme ":" ["//" authority] path ["?" 
query] ["#" fragment] // scheme = [a-zA-Z] [a-zA-Z0-9+.-] // authority = [userinfo "@"] host [":" port] - TextIter_t state = {0, 0}; if (index > 0) { - ucs4_t prev_codepoint = Text$get_main_grapheme_fast(text, &state, index - 1); + // Don't match if we're not at a word edge: + ucs4_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1); if (uc_is_property_alphabetic(prev_codepoint)) return -1; } @@ -261,147 +317,101 @@ int64_t match_uri(Text_t text, int64_t index) int64_t start_index = index; // Scheme: - if (!EAT1(text, &state, index, isalpha(grapheme))) + if (!EAT1(state, index, isalpha(grapheme))) return -1; - - EAT_MANY(text, &state, index, - !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-')); - - if (index == start_index) - return -1; - - if (!match_grapheme(text, &index, ':')) + EAT_MANY(state, index, !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-')); + if (!match_grapheme(state, &index, ':')) return -1; // Authority: - if (match_str(text, &index, "//")) { - int64_t authority_start = index; - // Username or host: - static const char *forbidden = "#?:@ \t\r\n<>[]{}\\^|\"`/"; - if (EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0) - return -1; + int64_t authority_len; + if (match_str(state, &index, "//")) { + authority_len = match_authority(state, index); + if (authority_len > 0) + index += authority_len; + } else { + authority_len = 0; + } - if (EAT1(text, &state, index, grapheme == '@')) { - // Found a username, now get a host: - if (EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(forbidden, (char)grapheme)) == 0) - return -1; - } else { - int64_t ip = authority_start; - int64_t ipv4_len = match_ipv4(text, ip); - if (ipv4_len > 0) { - ip += ipv4_len; - } else if (match_grapheme(text, &ip, '[')) { - ip += match_ipv6(text, ip); - if (ip > authority_start + 1 && match_grapheme(text, &ip, ']')) - 
index = ip; - } + // Path: + int64_t path_start = index; + if (EAT1(state, index, grapheme == '/') || authority_len <= 0) { + static const char *non_path = " \"#?<>[]{}\\^`|"; + EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme)); + + if (EAT1(state, index, grapheme == '?')) { // Query + static const char *non_query = " \"#<>[]{}\\^`|"; + EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme)); } - - // Port: - if (EAT1(text, &state, index, grapheme == ':')) { - if (EAT_MANY(text, &state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0) - return -1; + + if (EAT1(state, index, grapheme == '#')) { // Fragment + static const char *non_fragment = " \"#<>[]{}\\^`|"; + EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme)); } - if (!EAT1(text, &state, index, grapheme == '/')) - return (index - start_index); // No path - } else { - // Optional path root: - EAT1(text, &state, index, grapheme == '/'); } - // Path: - static const char *non_path = " \"#?<>[]{}\\^`|"; - EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme)); + if (authority_len <= 0 && index == path_start) + return -1; - if (EAT1(text, &state, index, grapheme == '?')) { // Query - static const char *non_query = " \"#<>[]{}\\^`|"; - EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme)); - } - - if (EAT1(text, &state, index, grapheme == '#')) { // Fragment - static const char *non_fragment = " \"#<>[]{}\\^`|"; - EAT_MANY(text, &state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme)); - } return index - start_index; } -int64_t match_url(Text_t text, int64_t index) +static int64_t match_url(TextIter_t *state, int64_t index) { int64_t lookahead = index; - if (!(match_str(text, &lookahead, "https:") - || match_str(text, &lookahead, "http:") - || match_str(text, &lookahead, "ftp:") - || match_str(text, &lookahead, "wss:") - || match_str(text, 
&lookahead, "ws:"))) + if (!(match_str(state, &lookahead, "https:") + || match_str(state, &lookahead, "http:") + || match_str(state, &lookahead, "ftp:") + || match_str(state, &lookahead, "wss:") + || match_str(state, &lookahead, "ws:"))) return -1; - return match_uri(text, index); + return match_uri(state, index); } -int64_t match_id(Text_t text, int64_t index) +static int64_t match_id(TextIter_t *state, int64_t index) { - TextIter_t state = {0, 0}; - if (!EAT1(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START))) + if (!EAT1(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START))) return -1; - return 1 + EAT_MANY(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE)); + return 1 + EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE)); } -int64_t match_int(Text_t text, int64_t index) +static int64_t match_int(TextIter_t *state, int64_t index) { - TextIter_t state = {0, 0}; - int64_t len = EAT_MANY(text, &state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); + int64_t len = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); return len >= 0 ? len : -1; } -int64_t match_num(Text_t text, int64_t index) +static int64_t match_num(TextIter_t *state, int64_t index) { - TextIter_t state = {0, 0}; - bool negative = EAT1(text, &state, index, grapheme == '-') ? 1 : 0; - int64_t pre_decimal = EAT_MANY(text, &state, index, + bool negative = EAT1(state, index, grapheme == '-') ? 1 : 0; + int64_t pre_decimal = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); - bool decimal = (EAT1(text, &state, index, grapheme == '.') == 1); - int64_t post_decimal = decimal ? EAT_MANY(text, &state, index, + bool decimal = (EAT1(state, index, grapheme == '.') == 1); + int64_t post_decimal = decimal ? 
EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0; if (pre_decimal == 0 && post_decimal == 0) return -1; return negative + pre_decimal + decimal + post_decimal; } -int64_t match_newline(Text_t text, int64_t index) +static int64_t match_newline(TextIter_t *state, int64_t index) { - if (index >= text.length) + if (index >= state->text.length) return -1; - TextIter_t state = {0, 0}; - ucs4_t grapheme = index >= text.length ? 0 : Text$get_main_grapheme_fast(text, &state, index); + ucs4_t grapheme = index >= state->text.length ? 0 : Text$get_main_grapheme_fast(state, index); if (grapheme == '\n') return 1; - if (grapheme == '\r' && Text$get_grapheme_fast(text, &state, index + 1) == '\n') + if (grapheme == '\r' && Text$get_grapheme_fast(state, index + 1) == '\n') return 2; return -1; } -typedef struct { - int64_t index, length; - bool occupied, recursive; -} capture_t; - -typedef struct { - enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag; - bool negated, non_capturing; - int64_t min, max; - union { - int32_t grapheme; - uc_property_t property; - int64_t (*fn)(Text_t, int64_t); - int32_t quote_graphemes[2]; - int32_t pair_graphemes[2]; - }; -} pat_t; - -int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat) +static int64_t match_pat(TextIter_t *state, int64_t index, pat_t pat) { - int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(text, state, index); + Text_t text = state->text; + int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(state, index); switch (pat.tag) { case PAT_START: { @@ -448,7 +458,7 @@ int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat) if (index + match_len >= text.length) return pat.negated ? 
1 : -1; - int32_t c = Text$get_grapheme_fast(text, state, index + match_len); + int32_t c = Text$get_grapheme_fast(state, index + match_len); if (c == open) depth += 1; else if (c == close) @@ -467,7 +477,7 @@ int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat) int32_t close = pat.quote_graphemes[1]; for (int64_t i = index + 1; i < text.length; i++) { - int32_t c = Text$get_grapheme_fast(text, state, i); + int32_t c = Text$get_grapheme_fast(state, i); if (c == close) { return pat.negated ? -1 : (i - index) + 1; } else if (c == '\\' && index + 1 < text.length) { @@ -477,7 +487,7 @@ int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat) return pat.negated ? 1 : -1; } case PAT_FUNCTION: { - int64_t match_len = pat.fn(text, index); + int64_t match_len = pat.fn(state, index); if (match_len >= 0) return pat.negated ? -1 : match_len; return pat.negated ? 1 : -1; @@ -487,49 +497,48 @@ int64_t match_pat(Text_t text, TextIter_t *state, int64_t index, pat_t pat) errx(1, "Unreachable"); } -pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index) +static pat_t parse_next_pat(TextIter_t *state, int64_t *index) { - if (EAT2(pattern, state, *index, + if (EAT2(state, *index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK), grapheme == '?')) { // Quotations: "?", '?', etc - int32_t open = Text$get_grapheme_fast(pattern, state, *index-2); + int32_t open = Text$get_grapheme_fast(state, *index-2); int32_t close = open; uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); - if (!match_grapheme(pattern, index, close)) - fail("Pattern's closing quote is missing: %k", &pattern); + if (!match_grapheme(state, index, close)) + fail("Pattern's closing quote is missing: %k", &state->text); return (pat_t){ .tag=PAT_QUOTE, .min=1, .max=1, .quote_graphemes={open, close}, }; - } else if (EAT2(pattern, state, *index, + } else if (EAT2(state, *index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION), grapheme == '?')) { 
// Nested punctuation: (?), [?], etc - int32_t open = Text$get_grapheme_fast(pattern, state, *index-2); + int32_t open = Text$get_grapheme_fast(state, *index-2); int32_t close = open; uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); - if (!match_grapheme(pattern, index, close)) - fail("Pattern's closing brace is missing: %k", &pattern); + if (!match_grapheme(state, index, close)) + fail("Pattern's closing brace is missing: %k", &state->text); return (pat_t){ .tag=PAT_PAIR, .min=1, .max=1, .pair_graphemes={open, close}, }; - } else if (EAT1(pattern, state, *index, - grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc. - skip_whitespace(pattern, index); + } else if (EAT1(state, *index, grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc. + skip_whitespace(state, index); int64_t min, max; - if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(pattern, state, *index))) { - min = parse_int(pattern, index); - skip_whitespace(pattern, index); - if (match_grapheme(pattern, index, '+')) { + if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(state, *index))) { + min = parse_int(state, index); + skip_whitespace(state, index); + if (match_grapheme(state, index, '+')) { max = INT64_MAX; - } else if (match_grapheme(pattern, index, '-')) { - max = parse_int(pattern, index); + } else if (match_grapheme(state, index, '-')) { + max = parse_int(state, index); } else { max = min; } @@ -538,34 +547,34 @@ pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index) min = -1, max = -1; } - skip_whitespace(pattern, index); + skip_whitespace(state, index); - bool negated = match_grapheme(pattern, index, '!'); + bool negated = match_grapheme(state, index, '!'); #define PAT(_tag, ...) 
((pat_t){.min=min, .max=max, .negated=negated, .tag=_tag, __VA_ARGS__}) const char *prop_name; - if (match_str(pattern, index, "..")) + if (match_str(state, index, "..")) prop_name = ".."; else - prop_name = get_property_name(pattern, index); + prop_name = get_property_name(state, index); if (!prop_name) { // Literal character, e.g. {1?} - skip_whitespace(pattern, index); - int32_t grapheme = Text$get_grapheme_fast(pattern, state, (*index)++); - if (!match_grapheme(pattern, index, '}')) - fail("Missing closing '}' in pattern: %k", &pattern); + skip_whitespace(state, index); + int32_t grapheme = Text$get_grapheme_fast(state, (*index)++); + if (!match_grapheme(state, index, '}')) + fail("Missing closing '}' in pattern: %k", &state->text); return PAT(PAT_GRAPHEME, .grapheme=grapheme); } else if (strlen(prop_name) == 1) { // Single letter names: {1+ A} - skip_whitespace(pattern, index); - if (!match_grapheme(pattern, index, '}')) - fail("Missing closing '}' in pattern: %k", &pattern); + skip_whitespace(state, index); + if (!match_grapheme(state, index, '}')) + fail("Missing closing '}' in pattern: %k", &state->text); return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]); } - skip_whitespace(pattern, index); - if (!match_grapheme(pattern, index, '}')) - fail("Missing closing '}' in pattern: %k", &pattern); + skip_whitespace(state, index); + if (!match_grapheme(state, index, '}')) + fail("Missing closing '}' in pattern: %k", &state->text); switch (tolower(prop_name[0])) { case '.': @@ -576,6 +585,11 @@ pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index) return PAT(PAT_ANY); } break; + case 'a': + if (strcasecmp(prop_name, "authority") == 0) { + return PAT(PAT_FUNCTION, .fn=match_authority); + } + break; case 'd': if (strcasecmp(prop_name, "digit") == 0) { return PAT(PAT_PROPERTY, .property=UC_PROPERTY_DECIMAL_DIGIT); @@ -590,6 +604,11 @@ pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index) return PAT(PAT_PROPERTY, 
.property=UC_PROPERTY_EMOJI); } break; + case 'h': + if (strcasecmp(prop_name, "host") == 0) { + return PAT(PAT_FUNCTION, .fn=match_host); + } + break; case 'i': if (strcasecmp(prop_name, "id") == 0) { return PAT(PAT_FUNCTION, .fn=match_id); @@ -636,18 +655,18 @@ pat_t parse_next_pat(Text_t pattern, TextIter_t *state, int64_t *index) return PAT(PAT_GRAPHEME, .grapheme=(int32_t)grapheme); #undef PAT } else { - return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=Text$get_grapheme_fast(pattern, state, (*index)++)}; + return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=Text$get_grapheme_fast(state, (*index)++)}; } } -int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index) +static int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index) { if (pattern_index >= pattern.length) // End of the pattern return 0; int64_t start_index = text_index; - TextIter_t pattern_state = {0, 0}, text_state = {0, 0}; - pat_t pat = parse_next_pat(pattern, &pattern_state, &pattern_index); + TextIter_t pattern_state = {pattern, 0, 0}, text_state = {text, 0, 0}; + pat_t pat = parse_next_pat(&pattern_state, &pattern_index); if (pat.min == -1 && pat.max == -1) { if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { @@ -677,7 +696,7 @@ int64_t match(Text_t text, int64_t text_index, Pattern_t pattern, int64_t patter } while (count < pat.max) { - int64_t match_len = match_pat(text, &text_state, text_index, pat); + int64_t match_len = match_pat(&text_state, text_index, pat); if (match_len < 0) break; capture_len += match_len; @@ -747,12 +766,11 @@ static int64_t _find(Text_t text, Pattern_t pattern, int64_t first, int64_t last && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - TextIter_t 
text_state = {0, 0}; - + TextIter_t text_state = {text, 0, 0}; for (int64_t i = first; i <= last; i++) { // Optimization: quickly skip ahead to first char in pattern: if (find_first) { - while (i < text.length && Text$get_grapheme_fast(text, &text_state, i) != first_grapheme) + while (i < text.length && Text$get_grapheme_fast(&text_state, i) != first_grapheme) ++i; } @@ -833,12 +851,12 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); Text_t ret = Text(""); - TextIter_t state = {0, 0}; + TextIter_t replacement_state = {replacement, 0, 0}; int64_t nonmatching_pos = 0; for (int64_t pos = 0; pos < replacement.length; ) { // Optimization: quickly skip ahead to first char in the backref pattern: if (find_first) { - while (pos < replacement.length && Text$get_grapheme_fast(replacement, &state, pos) != first_grapheme) + while (pos < replacement.length && Text$get_grapheme_fast(&replacement_state, pos) != first_grapheme) ++pos; } @@ -849,7 +867,7 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep } int64_t after_backref = pos + backref_len; - int64_t backref = parse_int(replacement, &after_backref); + int64_t backref = parse_int(&replacement_state, &after_backref); if (after_backref == pos + backref_len) { // Not actually a backref if there's no number pos += 1; continue; @@ -857,7 +875,7 @@ static Text_t apply_backrefs(Text_t text, Pattern_t original_pattern, Text_t rep if (backref < 0 || backref > 9) fail("Invalid backref index: %ld (only 0-%d are allowed)", backref, MAX_BACKREFS-1); backref_len = (after_backref - pos); - if (Text$get_grapheme_fast(replacement, &state, pos + backref_len) == ';') + if (Text$get_grapheme_fast(&replacement_state, pos + backref_len) == ';') backref_len += 1; // skip optional semicolon if (!captures[backref].occupied) @@ -894,12 +912,12 @@ public Text_t Text$replace(Text_t text, Pattern_t pattern, 
Text_t replacement, P && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - TextIter_t text_state = {0, 0}; + TextIter_t text_state = {text, 0, 0}; int64_t nonmatching_pos = 0; for (int64_t pos = 0; pos < text.length; ) { // Optimization: quickly skip ahead to first char in pattern: if (find_first) { - while (pos < text.length && Text$get_grapheme_fast(text, &text_state, pos) != first_grapheme) + while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) ++pos; } @@ -959,14 +977,14 @@ public Text_t Text$map(Text_t text, Pattern_t pattern, Closure_t fn) && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); - TextIter_t text_state = {0, 0}; + TextIter_t text_state = {text, 0, 0}; int64_t nonmatching_pos = 0; Text_t (*text_mapper)(Text_t, void*) = fn.fn; for (int64_t pos = 0; pos < text.length; pos++) { // Optimization: quickly skip ahead to first char in pattern: if (find_first) { - while (pos < text.length && Text$get_grapheme_fast(text, &text_state, pos) != first_grapheme) + while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) ++pos; } |
