// Logic for text pattern matching #include #include #include #include #include #include #define MAX_BACKREFS 100 typedef struct { Text_t text; Int_t index; Array_t captures; } PatternMatch; typedef struct { Text_t text; Int_t index; Array_t captures; bool is_none:1; } OptionalPatternMatch; #define NONE_MATCH ((OptionalPatternMatch){.is_none=true}) typedef struct { int64_t index, length; bool occupied, recursive; } capture_t; typedef struct { enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag; bool negated, non_capturing; int64_t min, max; union { int32_t grapheme; uc_property_t property; int64_t (*fn)(TextIter_t *, int64_t); int32_t quote_graphemes[2]; int32_t pair_graphemes[2]; }; } pat_t; static Text_t replace_array(Text_t text, Array_t replacements, Text_t backref_pat, bool recursive); static INLINE void skip_whitespace(TextIter_t *state, int64_t *i) { while (*i < state->stack[0].text.length) { int32_t grapheme = Text$get_grapheme_fast(state, *i); if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme)) return; *i += 1; } } static INLINE bool match_grapheme(TextIter_t *state, int64_t *i, int32_t grapheme) { if (*i < state->stack[0].text.length && Text$get_grapheme_fast(state, *i) == grapheme) { *i += 1; return true; } return false; } static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str) { int64_t matched = 0; while (matched[str]) { if (*i + matched >= state->stack[0].text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched]) return false; matched += 1; } *i += matched; return true; } static int64_t parse_int(TextIter_t *state, int64_t *i) { int64_t value = 0; for (;; *i += 1) { uint32_t grapheme = Text$get_main_grapheme_fast(state, *i); int digit = uc_digit_value(grapheme); if (digit < 0) break; if (value >= INT64_MAX/10) break; value = 10*value + digit; } return value; } static const char *get_property_name(TextIter_t *state, int64_t *i) { skip_whitespace(state, i); char *name = GC_MALLOC_ATOMIC(UNINAME_MAX); char *dest = name; while (*i < state->stack[0].text.length) { int32_t grapheme = Text$get_grapheme_fast(state, *i); if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) { *dest = (char)grapheme; ++dest; if (dest >= name + UNINAME_MAX - 1) break; } else { break; } *i += 1; } while (dest > name && dest[-1] == ' ') *(dest--) = '\0'; if (dest == name) return NULL; *dest = '\0'; return name; } #define EAT1(state, index, cond) ({\ int32_t grapheme = Text$get_grapheme_fast(state, index); \ bool success = (cond); \ if (success) index += 1; \ success; }) #define EAT2(state, index, cond1, cond2) ({\ int32_t grapheme = Text$get_grapheme_fast(state, index); \ bool success = (cond1); \ if (success) { \ grapheme = Text$get_grapheme_fast(state, index + 1); \ success = (cond2); \ if (success) \ index += 2; \ } \ success; }) #define EAT_MANY(state, index, cond) ({ int64_t _n = 0; while (EAT1(state, index, cond)) { _n += 1; } _n; }) static int64_t match_email(TextIter_t *state, int64_t index) { // email = local "@" domain // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii) // domain = dns-label ("." dns-label)* // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii) if (index > 0) { uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1); if (uc_is_property_alphabetic(prev_codepoint)) return -1; } int64_t start_index = index; // Local part: int64_t local_len = 0; static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~"; while (EAT1(state, index, (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) { local_len += 1; if (local_len > 64) return -1; } if (!EAT1(state, index, grapheme == '@')) return -1; // Host int64_t host_len = 0; do { int64_t label_len = 0; while (EAT1(state, index, (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) { label_len += 1; if (label_len > 63) return -1; } if (label_len == 0) return -1; host_len += label_len; if (host_len > 255) return -1; host_len += 1; } while (EAT1(state, index, grapheme == '.')); return index - start_index; } static int64_t match_ipv6(TextIter_t *state, int64_t index) { if (index > 0) { int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1); if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':')) return -1; } int64_t start_index = index; const int NUM_CLUSTERS = 8; bool double_colon_used = false; for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { for (int digits = 0; digits < 4; digits++) { if (!EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) break; } if (EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) return -1; // Too many digits if (cluster == NUM_CLUSTERS-1) { break; } else if (!EAT1(state, index, grapheme == ':')) { if (double_colon_used) break; return -1; } if (EAT1(state, index, grapheme == ':')) { if (double_colon_used) return -1; double_colon_used = true; } } return index - start_index; } static int64_t match_ipv4(TextIter_t *state, int64_t index) { if (index > 0) { int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1); if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.')) return -1; } int64_t start_index = index; const int NUM_CLUSTERS = 4; for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) { for (int digits = 0; digits < 3; digits++) { if (!EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) { if (digits == 0) return -1; break; } } if (EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) return -1; // Too many digits if (cluster == NUM_CLUSTERS-1) break; else if (!EAT1(state, index, grapheme == '.')) return -1; } return (index - start_index); } static int64_t match_ip(TextIter_t *state, int64_t index) { int64_t len = match_ipv6(state, index); if (len >= 0) return len; len = match_ipv4(state, index); return (len >= 0) ? len : -1; } static int64_t match_host(TextIter_t *state, int64_t index) { int64_t ip_len = match_ip(state, index); if (ip_len > 0) return ip_len; int64_t start_index = index; if (match_grapheme(state, &index, '[')) { ip_len = match_ip(state, index); if (ip_len <= 0) return -1; index += ip_len; if (match_grapheme(state, &index, ']')) return (index - start_index); return -1; } if (!EAT1(state, index, isalpha(grapheme))) return -1; static const char *non_host_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`"; EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_host_chars, (char)grapheme)); return (index - start_index); } static int64_t match_authority(TextIter_t *state, int64_t index) { int64_t authority_start = index; static const char *non_segment_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`."; // Optional user@ prefix: int64_t username_len = EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_segment_chars, (char)grapheme)); if (username_len < 1 || !EAT1(state, index, grapheme == '@')) index = authority_start; // No user@ part // Host: int64_t host_len = match_host(state, index); if (host_len <= 0) return -1; index += host_len; // Port: if (EAT1(state, index, grapheme == ':')) { if (EAT_MANY(state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0) return -1; } return (index - authority_start); } static int64_t match_uri(TextIter_t *state, int64_t index) { // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment] // scheme = [a-zA-Z] [a-zA-Z0-9+.-] // authority = [userinfo "@"] host [":" port] if (index > 0) { // Don't match if we're not at a word edge: uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1); if (uc_is_property_alphabetic(prev_codepoint)) return -1; } int64_t start_index = index; // Scheme: if (!EAT1(state, index, isalpha(grapheme))) return -1; EAT_MANY(state, index, !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-')); if (!match_grapheme(state, &index, ':')) return -1; // Authority: int64_t authority_len; if (match_str(state, &index, "//")) { authority_len = match_authority(state, index); if (authority_len > 0) index += authority_len; } else { authority_len = 0; } // Path: int64_t path_start = index; if (EAT1(state, index, grapheme == '/') || authority_len <= 0) { static const char *non_path = " \"#?<>[]{}\\^`|"; EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme)); if (EAT1(state, index, grapheme == '?')) { // Query static const char *non_query = " \"#<>[]{}\\^`|"; EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme)); } if (EAT1(state, index, grapheme == '#')) { // Fragment static const char *non_fragment = " \"#<>[]{}\\^`|"; EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme)); } } if (authority_len <= 0 && index == path_start) return -1; return index - start_index; } static int64_t match_url(TextIter_t *state, int64_t index) { int64_t lookahead = index; if (!(match_str(state, &lookahead, "https:") || match_str(state, &lookahead, "http:") || match_str(state, &lookahead, "ftp:") || match_str(state, &lookahead, "wss:") || match_str(state, &lookahead, "ws:"))) return -1; return match_uri(state, index); } static int64_t match_id(TextIter_t *state, int64_t index) { if (!EAT1(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START))) return -1; return 1 + EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE)); } static int64_t match_int(TextIter_t *state, int64_t index) { int64_t negative = EAT1(state, index, grapheme == '-') ? 1 : 0; int64_t len = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); return len > 0 ? negative + len : -1; } static int64_t match_alphanumeric(TextIter_t *state, int64_t index) { return EAT1(state, index, uc_is_property_alphabetic((ucs4_t)grapheme) || uc_is_property_numeric((ucs4_t)grapheme)) ? 1 : -1; } static int64_t match_num(TextIter_t *state, int64_t index) { bool negative = EAT1(state, index, grapheme == '-') ? 1 : 0; int64_t pre_decimal = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)); bool decimal = (EAT1(state, index, grapheme == '.') == 1); int64_t post_decimal = decimal ? EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0; if (pre_decimal == 0 && post_decimal == 0) return -1; return negative + pre_decimal + decimal + post_decimal; } static int64_t match_newline(TextIter_t *state, int64_t index) { if (index >= state->stack[0].text.length) return -1; uint32_t grapheme = index >= state->stack[0].text.length ? 0 : Text$get_main_grapheme_fast(state, index); if (grapheme == '\n') return 1; if (grapheme == '\r' && Text$get_grapheme_fast(state, index + 1) == '\n') return 2; return -1; } static int64_t match_pat(TextIter_t *state, int64_t index, pat_t pat) { Text_t text = state->stack[0].text; int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(state, index); switch (pat.tag) { case PAT_START: { if (index == 0) return pat.negated ? -1 : 0; return pat.negated ? 0 : -1; } case PAT_END: { if (index >= text.length) return pat.negated ? -1 : 0; return pat.negated ? 0 : -1; } case PAT_ANY: { assert(!pat.negated); return (index < text.length) ? 1 : -1; } case PAT_GRAPHEME: { if (index >= text.length) return -1; else if (grapheme == pat.grapheme) return pat.negated ? -1 : 1; return pat.negated ? 1 : -1; } case PAT_PROPERTY: { if (index >= text.length) return -1; else if (uc_is_property((ucs4_t)grapheme, pat.property)) return pat.negated ? -1 : 1; return pat.negated ? 1 : -1; } case PAT_PAIR: { // Nested punctuation: (?), [?], etc if (index >= text.length) return -1; int32_t open = pat.pair_graphemes[0]; if (grapheme != open) return pat.negated ? 1 : -1; int32_t close = pat.pair_graphemes[1]; int64_t depth = 1; int64_t match_len = 1; for (; depth > 0; match_len++) { if (index + match_len >= text.length) return pat.negated ? 1 : -1; int32_t c = Text$get_grapheme_fast(state, index + match_len); if (c == open) depth += 1; else if (c == close) depth -= 1; } return pat.negated ? -1 : match_len; } case PAT_QUOTE: { // Nested quotes: "?", '?', etc if (index >= text.length) return -1; int32_t open = pat.quote_graphemes[0]; if (grapheme != open) return pat.negated ? 1 : -1; int32_t close = pat.quote_graphemes[1]; for (int64_t i = index + 1; i < text.length; i++) { int32_t c = Text$get_grapheme_fast(state, i); if (c == close) { return pat.negated ? -1 : (i - index) + 1; } else if (c == '\\' && index + 1 < text.length) { i += 1; // Skip ahead an extra step } } return pat.negated ? 1 : -1; } case PAT_FUNCTION: { int64_t match_len = pat.fn(state, index); if (match_len >= 0) return pat.negated ? -1 : match_len; return pat.negated ? 1 : -1; } default: errx(1, "Invalid pattern"); } errx(1, "Unreachable"); } static pat_t parse_next_pat(TextIter_t *state, int64_t *index) { if (EAT2(state, *index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK), grapheme == '?')) { // Quotations: "?", '?', etc int32_t open = Text$get_grapheme_fast(state, *index-2); int32_t close = open; uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); if (!match_grapheme(state, index, close)) fail("Pattern's closing quote is missing: ", state->stack[0].text); return (pat_t){ .tag=PAT_QUOTE, .min=1, .max=1, .quote_graphemes={open, close}, }; } else if (EAT2(state, *index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION), grapheme == '?')) { // Nested punctuation: (?), [?], etc int32_t open = Text$get_grapheme_fast(state, *index-2); int32_t close = open; uc_mirror_char((ucs4_t)open, (ucs4_t*)&close); if (!match_grapheme(state, index, close)) fail("Pattern's closing brace is missing: ", state->stack[0].text); return (pat_t){ .tag=PAT_PAIR, .min=1, .max=1, .pair_graphemes={open, close}, }; } else if (EAT1(state, *index, grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc. skip_whitespace(state, index); int64_t min, max; if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(state, *index))) { min = parse_int(state, index); skip_whitespace(state, index); if (match_grapheme(state, index, '+')) { max = INT64_MAX; } else if (match_grapheme(state, index, '-')) { max = parse_int(state, index); } else { max = min; } if (min > max) fail("Minimum repetitions (", min, ") is less than the maximum (", max, ")"); } else { min = -1, max = -1; } skip_whitespace(state, index); bool negated = match_grapheme(state, index, '!'); #define PAT(_tag, ...) ((pat_t){.min=min, .max=max, .negated=negated, .tag=_tag, __VA_ARGS__}) const char *prop_name; if (match_str(state, index, "..")) prop_name = ".."; else prop_name = get_property_name(state, index); if (!prop_name) { // Literal character, e.g. {1?} skip_whitespace(state, index); int32_t grapheme = Text$get_grapheme_fast(state, (*index)++); if (!match_grapheme(state, index, '}')) fail("Missing closing '}' in pattern: ", state->stack[0].text); return PAT(PAT_GRAPHEME, .grapheme=grapheme); } else if (strlen(prop_name) == 1) { // Single letter names: {1+ A} skip_whitespace(state, index); if (!match_grapheme(state, index, '}')) fail("Missing closing '}' in pattern: ", state->stack[0].text); return PAT(PAT_GRAPHEME, .grapheme=prop_name[0]); } skip_whitespace(state, index); if (!match_grapheme(state, index, '}')) fail("Missing closing '}' in pattern: ", state->stack[0].text); switch (tolower(prop_name[0])) { case '.': if (prop_name[1] == '.') { if (negated) return ((pat_t){.tag=PAT_END, .min=min, .max=max, .non_capturing=true}); else return PAT(PAT_ANY); } break; case 'a': if (strcasecmp(prop_name, "authority") == 0) { return PAT(PAT_FUNCTION, .fn=match_authority); } else if (strcasecmp(prop_name, "alphanum") == 0 || strcasecmp(prop_name, "anum") == 0 || strcasecmp(prop_name, "alphanumeric") == 0) { return PAT(PAT_FUNCTION, .fn=match_alphanumeric); } break; case 'c': if (strcasecmp(prop_name, "crlf") == 0) return PAT(PAT_FUNCTION, .fn=match_newline); break; case 'd': if (strcasecmp(prop_name, "digit") == 0) { return PAT(PAT_PROPERTY, .property=UC_PROPERTY_DECIMAL_DIGIT); } break; case 'e': if (strcasecmp(prop_name, "end") == 0) { return PAT(PAT_END, .non_capturing=!negated); } else if (strcasecmp(prop_name, "email") == 0) { return PAT(PAT_FUNCTION, .fn=match_email); } #if _LIBUNISTRING_VERSION >= 0x0100000 else if (strcasecmp(prop_name, "emoji") == 0) { return PAT(PAT_PROPERTY, .property=UC_PROPERTY_EMOJI); } #endif break; case 'h': if (strcasecmp(prop_name, "host") == 0) { return PAT(PAT_FUNCTION, .fn=match_host); } break; case 'i': if (strcasecmp(prop_name, "id") == 0) { return PAT(PAT_FUNCTION, .fn=match_id); } else if (strcasecmp(prop_name, "int") == 0) { return PAT(PAT_FUNCTION, .fn=match_int); } else if (strcasecmp(prop_name, "ipv4") == 0) { return PAT(PAT_FUNCTION, .fn=match_ipv4); } else if (strcasecmp(prop_name, "ipv6") == 0) { return PAT(PAT_FUNCTION, .fn=match_ipv6); } else if (strcasecmp(prop_name, "ip") == 0) { return PAT(PAT_FUNCTION, .fn=match_ip); } break; case 'n': if (strcasecmp(prop_name, "nl") == 0 || strcasecmp(prop_name, "newline") == 0) { return PAT(PAT_FUNCTION, .fn=match_newline); } else if (strcasecmp(prop_name, "num") == 0) { return PAT(PAT_FUNCTION, .fn=match_num); } break; case 's': if (strcasecmp(prop_name, "start") == 0) { return PAT(PAT_START, .non_capturing=!negated); } break; case 'u': if (strcasecmp(prop_name, "uri") == 0) { return PAT(PAT_FUNCTION, .fn=match_uri); } else if (strcasecmp(prop_name, "url") == 0) { return PAT(PAT_FUNCTION, .fn=match_url); } break; case 'w': if (strcasecmp(prop_name, "word") == 0) { return PAT(PAT_FUNCTION, .fn=match_id); } break; default: break; } uc_property_t prop = uc_property_byname(prop_name); if (uc_property_is_valid(prop)) return PAT(PAT_PROPERTY, .property=prop); ucs4_t grapheme = unicode_name_character(prop_name); if (grapheme == UNINAME_INVALID) fail("Not a valid property or character name: ", prop_name); return PAT(PAT_GRAPHEME, .grapheme=(int32_t)grapheme); #undef PAT } else { return (pat_t){.tag=PAT_GRAPHEME, .non_capturing=true, .min=1, .max=1, .grapheme=Text$get_grapheme_fast(state, (*index)++)}; } } static int64_t match(Text_t text, int64_t text_index, Text_t pattern, int64_t pattern_index, capture_t *captures, int64_t capture_index) { if (pattern_index >= pattern.length) // End of the pattern return 0; int64_t start_index = text_index; TextIter_t pattern_state = NEW_TEXT_ITER_STATE(pattern), text_state = NEW_TEXT_ITER_STATE(text); pat_t pat = parse_next_pat(&pattern_state, &pattern_index); if (pat.min == -1 && pat.max == -1) { if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { pat.min = pat.max = MAX(1, text.length - text_index); } else { pat.min = 1; pat.max = INT64_MAX; } } int64_t capture_start = text_index; int64_t count = 0, capture_len = 0, next_match_len = 0; if (pat.tag == PAT_ANY && pattern_index >= pattern.length) { int64_t remaining = text.length - text_index; capture_len = remaining >= pat.min ? MIN(remaining, pat.max) : -1; text_index += capture_len; goto success; } if (pat.min == 0 && pattern_index < pattern.length) { next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); if (next_match_len >= 0) { capture_len = 0; goto success; } } while (count < pat.max) { int64_t match_len = match_pat(&text_state, text_index, pat); if (match_len < 0) break; capture_len += match_len; text_index += match_len; count += 1; if (pattern_index < pattern.length) { // More stuff after this if (count < pat.min) next_match_len = -1; else next_match_len = match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1)); } else { next_match_len = 0; } if (match_len == 0) { if (next_match_len >= 0) { // If we're good to go, no need to keep re-matching zero-length // matches till we hit max: count = pat.max; break; } else { return -1; } } if (pattern_index < pattern.length && next_match_len >= 0) break; // Next guy exists and wants to stop here if (text_index >= text.length) break; } if (count < pat.min || next_match_len < 0) return -1; success: if (captures && capture_index < MAX_BACKREFS && !pat.non_capturing) { if (pat.tag == PAT_PAIR || pat.tag == PAT_QUOTE) { assert(capture_len > 0); captures[capture_index] = (capture_t){ .index=capture_start + 1, // Skip leading quote/paren .length=capture_len - 2, // Skip open/close .occupied=true, .recursive=(pat.tag == PAT_PAIR), }; } else { captures[capture_index] = (capture_t){ .index=capture_start, .length=capture_len, .occupied=true, .recursive=false, }; } } return (text_index - start_index) + next_match_len; } #undef EAT1 #undef EAT2 #undef EAT_MANY static int64_t _find(Text_t text, Text_t pattern, int64_t first, int64_t last, int64_t *match_length, capture_t *captures) { int32_t first_grapheme = Text$get_grapheme(pattern, 0); bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); TextIter_t text_state = NEW_TEXT_ITER_STATE(text); for (int64_t i = first; i <= last; i++) { // Optimization: quickly skip ahead to first char in pattern: if (find_first) { while (i < text.length && Text$get_grapheme_fast(&text_state, i) != first_grapheme) ++i; } int64_t m = match(text, i, pattern, 0, captures, 0); if (m >= 0) { if (match_length) *match_length = m; return i; } } if (match_length) *match_length = -1; return -1; } static OptionalPatternMatch find(Text_t text, Text_t pattern, Int_t from_index) { int64_t first = Int64$from_int(from_index, false); if (first == 0) fail("Invalid index: 0"); if (first < 0) first = text.length + first + 1; if (first > text.length || first < 1) return NONE_MATCH; capture_t captures[MAX_BACKREFS] = {}; int64_t len = 0; int64_t found = _find(text, pattern, first-1, text.length-1, &len, captures); if (found == -1) return NONE_MATCH; Array_t capture_array = {}; for (int i = 0; captures[i].occupied; i++) { Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); Array$insert(&capture_array, &capture, I(0), sizeof(Text_t)); } return (OptionalPatternMatch){ .text=Text$slice(text, I(found+1), I(found+len)), .index=I(found+1), .captures=capture_array, }; } PUREFUNC static bool Pattern$has(Text_t text, Text_t pattern) { if (Text$starts_with(pattern, Text("{start}"))) { int64_t m = match(text, 0, pattern, 0, NULL, 0); return m >= 0; } else if (Text$ends_with(text, Text("{end}"))) { for (int64_t i = text.length-1; i >= 0; i--) { int64_t match_len = match(text, i, pattern, 0, NULL, 0); if (match_len >= 0 && i + match_len == text.length) return true; } return false; } else { int64_t found = _find(text, pattern, 0, text.length-1, NULL, NULL); return (found >= 0); } } static bool Pattern$matches(Text_t text, Text_t pattern) { capture_t captures[MAX_BACKREFS] = {}; int64_t match_len = match(text, 0, pattern, 0, NULL, 0); return (match_len == text.length); } static OptionalArray_t Pattern$captures(Text_t text, Text_t pattern) { capture_t captures[MAX_BACKREFS] = {}; int64_t match_len = match(text, 0, pattern, 0, captures, 0); if (match_len != text.length) return NONE_ARRAY; Array_t capture_array = {}; for (int i = 0; captures[i].occupied; i++) { Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); Array$insert(&capture_array, &capture, I(0), sizeof(Text_t)); } return capture_array; } static Array_t Pattern$find_all(Text_t text, Text_t pattern) { if (pattern.length == 0) // special case return (Array_t){.length=0}; Array_t matches = {}; for (int64_t i = 1; ; ) { OptionalPatternMatch m = find(text, pattern, I(i)); if (m.is_none) break; i = Int64$from_int(m.index, false) + m.text.length; Array$insert(&matches, &m, I_small(0), sizeof(PatternMatch)); } return matches; } typedef struct { TextIter_t state; Int_t i; Text_t pattern; } match_iter_state_t; static OptionalPatternMatch next_match(match_iter_state_t *state) { if (Int64$from_int(state->i, false) > state->state.stack[0].text.length) return NONE_MATCH; OptionalPatternMatch m = find(state->state.stack[0].text, state->pattern, state->i); if (m.is_none) // No match state->i = I(state->state.stack[0].text.length + 1); else state->i = Int$plus(m.index, I(MAX(1, m.text.length))); return m; } static Closure_t Pattern$by_match(Text_t text, Text_t pattern) { return (Closure_t){ .fn=(void*)next_match, .userdata=new(match_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=I_small(1), .pattern=pattern), }; } static Text_t apply_backrefs(Text_t text, Array_t recursive_replacements, Text_t replacement, Text_t backref_pat, capture_t *captures) { if (backref_pat.length == 0) return replacement; int32_t first_grapheme = Text$get_grapheme(backref_pat, 0); bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); Text_t ret = Text(""); TextIter_t replacement_state = NEW_TEXT_ITER_STATE(replacement); int64_t nonmatching_pos = 0; for (int64_t pos = 0; pos < replacement.length; ) { // Optimization: quickly skip ahead to first char in the backref pattern: if (find_first) { while (pos < replacement.length && Text$get_grapheme_fast(&replacement_state, pos) != first_grapheme) ++pos; } int64_t backref_len = match(replacement, pos, backref_pat, 0, NULL, 0); if (backref_len < 0) { pos += 1; continue; } int64_t after_backref = pos + backref_len; int64_t backref = parse_int(&replacement_state, &after_backref); if (after_backref == pos + backref_len) { // Not actually a backref if there's no number pos += 1; continue; } if (backref < 0 || backref > 9) fail("Invalid backref index: ", backref, " (only 0-", MAX_BACKREFS-1, " are allowed)"); backref_len = (after_backref - pos); if (Text$get_grapheme_fast(&replacement_state, pos + backref_len) == ';') backref_len += 1; // skip optional semicolon if (!captures[backref].occupied) fail("There is no capture number ", backref, "!"); Text_t backref_text = Text$slice(text, I(captures[backref].index+1), I(captures[backref].index + captures[backref].length)); if (captures[backref].recursive && recursive_replacements.length > 0) backref_text = replace_array(backref_text, recursive_replacements, backref_pat, true); if (pos > nonmatching_pos) { Text_t before_slice = Text$slice(replacement, I(nonmatching_pos+1), I(pos)); ret = Text$concat(ret, before_slice, backref_text); } else { ret = Text$concat(ret, backref_text); } pos += backref_len; nonmatching_pos = pos; } if (nonmatching_pos < replacement.length) { Text_t last_slice = Text$slice(replacement, I(nonmatching_pos+1), I(replacement.length)); ret = Text$concat(ret, last_slice); } return ret; } static Text_t Pattern$replace(Text_t text, Text_t pattern, Text_t replacement, Text_t backref_pat, bool recursive) { Text_t ret = EMPTY_TEXT; int32_t first_grapheme = Text$get_grapheme(pattern, 0); bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); Text_t entries[2] = {pattern, replacement}; Array_t replacements = { .data=entries, .length=1, .stride=sizeof(entries), }; TextIter_t text_state = NEW_TEXT_ITER_STATE(text); int64_t nonmatching_pos = 0; for (int64_t pos = 0; pos < text.length; ) { // Optimization: quickly skip ahead to first char in pattern: if (find_first) { while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) ++pos; } capture_t captures[MAX_BACKREFS] = {}; int64_t match_len = match(text, pos, pattern, 0, captures, 1); if (match_len < 0) { pos += 1; continue; } captures[0] = (capture_t){ .index = pos, .length = match_len, .occupied = true, .recursive = false, }; Text_t replacement_text = apply_backrefs(text, recursive ? replacements : (Array_t){}, replacement, backref_pat, captures); if (pos > nonmatching_pos) { Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos)); ret = Text$concat(ret, before_slice, replacement_text); } else { ret = Text$concat(ret, replacement_text); } nonmatching_pos = pos + match_len; pos += MAX(match_len, 1); } if (nonmatching_pos < text.length) { Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length)); ret = Text$concat(ret, last_slice); } return ret; } static Text_t Pattern$trim(Text_t text, Text_t pattern, bool trim_left, bool trim_right) { int64_t first = 0, last = text.length-1; if (trim_left) { int64_t match_len = match(text, 0, pattern, 0, NULL, 0); if (match_len > 0) first = match_len; } if (trim_right) { for (int64_t i = text.length-1; i >= first; i--) { int64_t match_len = match(text, i, pattern, 0, NULL, 0); if (match_len > 0 && i + match_len == text.length) last = i-1; } } return Text$slice(text, I(first+1), I(last+1)); } static Text_t Pattern$map(Text_t text, Text_t pattern, Closure_t fn, bool recursive) { Text_t ret = EMPTY_TEXT; int32_t first_grapheme = Text$get_grapheme(pattern, 0); bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); TextIter_t text_state = NEW_TEXT_ITER_STATE(text); int64_t nonmatching_pos = 0; Text_t (*text_mapper)(PatternMatch, void*) = fn.fn; for (int64_t pos = 0; pos < text.length; pos++) { // Optimization: quickly skip ahead to first char in pattern: if (find_first) { while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) ++pos; } capture_t captures[MAX_BACKREFS] = {}; int64_t match_len = match(text, pos, pattern, 0, captures, 0); if (match_len < 0) continue; PatternMatch m = { .text=Text$slice(text, I(pos+1), I(pos+match_len)), .index=I(pos+1), .captures={}, }; for (int i = 0; captures[i].occupied; i++) { Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); if (recursive) capture = Pattern$map(capture, pattern, fn, recursive); Array$insert(&m.captures, &capture, I(0), sizeof(Text_t)); } Text_t replacement = text_mapper(m, fn.userdata); if (pos > nonmatching_pos) { Text_t before_slice = Text$slice(text, I(nonmatching_pos+1), I(pos)); ret = Text$concat(ret, before_slice, replacement); } else { ret = Text$concat(ret, replacement); } nonmatching_pos = pos + match_len; pos += (match_len - 1); } if (nonmatching_pos < text.length) { Text_t last_slice = Text$slice(text, I(nonmatching_pos+1), I(text.length)); ret = Text$concat(ret, last_slice); } return ret; } static void Pattern$each(Text_t text, Text_t pattern, Closure_t fn, bool recursive) { int32_t first_grapheme = Text$get_grapheme(pattern, 0); bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK) && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION)); TextIter_t text_state = NEW_TEXT_ITER_STATE(text); void (*action)(PatternMatch, void*) = fn.fn; for (int64_t pos = 0; pos < text.length; pos++) { // Optimization: quickly skip ahead to first char in pattern: if (find_first) { while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme) ++pos; } capture_t captures[MAX_BACKREFS] = {}; int64_t match_len = match(text, pos, pattern, 0, captures, 0); if (match_len < 0) continue; PatternMatch m = { .text=Text$slice(text, I(pos+1), I(pos+match_len)), .index=I(pos+1), .captures={}, }; for (int i = 0; captures[i].occupied; i++) { Text_t capture = Text$slice(text, I(captures[i].index+1), I(captures[i].index+captures[i].length)); if (recursive) Pattern$each(capture, pattern, fn, recursive); Array$insert(&m.captures, &capture, I(0), sizeof(Text_t)); } action(m, fn.userdata); pos += (match_len - 1); } } Text_t replace_array(Text_t text, Array_t replacements, Text_t backref_pat, bool recursive) { if (replacements.length == 0) return text; Text_t ret = EMPTY_TEXT; int64_t nonmatch_pos = 0; for (int64_t pos = 0; pos < text.length; ) { // Find the first matching pattern at this position: for (int64_t i = 0; i < replacements.length; i++) { Text_t pattern = *(Text_t*)(replacements.data + i*replacements.stride); capture_t captures[MAX_BACKREFS] = {}; int64_t len = match(text, pos, pattern, 0, captures, 1); if (len < 0) continue; captures[0].index = pos; captures[0].length = len; // If we skipped over some non-matching text before finding a match, insert it here: if (pos > nonmatch_pos) { Text_t before_slice = Text$slice(text, I(nonmatch_pos+1), I(pos)); ret = Text$concat(ret, before_slice); } // Concatenate the replacement: Text_t replacement = *(Text_t*)(replacements.data + i*replacements.stride + sizeof(Text_t)); Text_t replacement_text = apply_backrefs(text, recursive ? replacements : (Array_t){}, replacement, backref_pat, captures); ret = Text$concat(ret, replacement_text); pos += MAX(len, 1); nonmatch_pos = pos; goto next_pos; } pos += 1; next_pos: continue; } if (nonmatch_pos <= text.length) { Text_t last_slice = Text$slice(text, I(nonmatch_pos+1), I(text.length)); ret = Text$concat(ret, last_slice); } return ret; } static Text_t Pattern$replace_all(Text_t text, Table_t replacements, Text_t backref_pat, bool recursive) { return replace_array(text, replacements.entries, backref_pat, recursive); } static Array_t Pattern$split(Text_t text, Text_t pattern) { if (text.length == 0) // special case return (Array_t){.length=0}; if (pattern.length == 0) // special case return Text$clusters(text); Array_t chunks = {}; int64_t i = 0; for (;;) { int64_t len = 0; int64_t found = _find(text, pattern, i, text.length-1, &len, NULL); if (found == i && len == 0) found = _find(text, pattern, i + 1, text.length-1, &len, NULL); if (found < 0) break; Text_t chunk = Text$slice(text, I(i+1), I(found)); Array$insert(&chunks, &chunk, I_small(0), sizeof(Text_t)); i = MAX(found + len, i + 1); } Text_t last_chunk = Text$slice(text, I(i+1), I(text.length)); Array$insert(&chunks, &last_chunk, I_small(0), sizeof(Text_t)); return chunks; } typedef struct { TextIter_t state; int64_t i; Text_t pattern; } split_iter_state_t; static OptionalText_t next_split(split_iter_state_t *state) { Text_t text = state->state.stack[0].text; if (state->i >= text.length) { if (state->pattern.length > 0 && state->i == text.length) { // special case state->i = text.length + 1; return EMPTY_TEXT; } return NONE_TEXT; } if (state->pattern.length == 0) { // special case Text_t ret = Text$cluster(text, I(state->i+1)); state->i += 1; return ret; } int64_t start = state->i; int64_t len = 0; int64_t found = _find(text, state->pattern, start, text.length-1, &len, NULL); if (found == start && len == 0) found = _find(text, state->pattern, start + 1, text.length-1, &len, NULL); if (found >= 0) { state->i = MAX(found + len, state->i + 1); return Text$slice(text, I(start+1), I(found)); } else { state->i = state->state.stack[0].text.length + 1; return Text$slice(text, I(start+1), I(text.length)); } } static Closure_t Pattern$by_split(Text_t text, Text_t pattern) { return (Closure_t){ .fn=(void*)next_split, .userdata=new(split_iter_state_t, .state=NEW_TEXT_ITER_STATE(text), .i=0, .pattern=pattern), }; } static Text_t Pattern$escape_text(Text_t text) { // TODO: optimize for spans of non-escaped text Text_t ret = EMPTY_TEXT; TextIter_t state = NEW_TEXT_ITER_STATE(text); for (int64_t i = 0; i < text.length; i++) { uint32_t g = Text$get_main_grapheme_fast(&state, i); if (g == '{') { ret = Text$concat(ret, Text("{1{}")); } else if (g == '?' || uc_is_property_quotation_mark(g) || (uc_is_property_paired_punctuation(g) && uc_is_property_left_of_pair(g))) { ret = Text$concat(ret, Text("{1"), Text$slice(text, I(i+1), I(i+1)), Text("}")); } else { ret = Text$concat(ret, Text$slice(text, I(i+1), I(i+1))); } } return ret; } static Text_t Pattern$as_text(const void *obj, bool colorize, const TypeInfo_t *info) { (void)info; if (!obj) return Text("Pattern"); Text_t pat = *(Text_t*)obj; Text_t quote = Pattern$has(pat, Text("/")) && !Pattern$has(pat, Text("|")) ? Text("|") : Text("/"); return Text$concat(colorize ? Text("\x1b[1m$\033[m") : Text("$"), Text$quoted(pat, colorize, quote)); } // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0