patterns.c - tomo-patterns

(1.2K lines)
   1 // Logic for text pattern matching
   2 
   3 #include <ctype.h>
   4 #include <string.h>
   5 #include <strings.h>
   6 #include <sys/param.h>
   7 #include <unictype.h>
   8 #include <uniname.h>
   9 #include <unistring/version.h>
  10 
  11 #define MAX_BACKREFS 100
  12 
  13 typedef struct {
  14     Text_t text;
  15     Int_t index;
  16     List_t captures;
  17 } PatternMatch;
  18 
  19 typedef struct {
  20     Text_t text;
  21     Int_t index;
  22     List_t captures;
  23     bool is_none : 1;
  24 } OptionalPatternMatch;
  25 
  26 #define NONE_MATCH ((OptionalPatternMatch){.is_none = true})
  27 
  28 typedef struct {
  29     int64_t index, length;
  30     bool occupied, recursive;
  31 } capture_t;
  32 
  33 typedef struct {
  34     enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag;
  35     bool negated, non_capturing;
  36     int64_t min, max;
  37     union {
  38         int32_t grapheme;
  39         uc_property_t property;
  40         int64_t (*fn)(TextIter_t *, int64_t);
  41         int32_t quote_graphemes[2];
  42         int32_t pair_graphemes[2];
  43     };
  44 } pat_t;
  45 
  46 static Text_t replace_list(Text_t text, List_t replacements, Text_t backref_marker, bool recursive);
  47 
  48 static INLINE void skip_whitespace(TextIter_t *state, int64_t *i) {
  49     while (*i < state->stack[0].text.length) {
  50         int32_t grapheme = Text$get_grapheme_fast(state, *i);
  51         if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme)) return;
  52         *i += 1;
  53     }
  54 }
  55 
  56 static INLINE bool match_grapheme(TextIter_t *state, int64_t *i, int32_t grapheme) {
  57     if (*i < state->stack[0].text.length && Text$get_grapheme_fast(state, *i) == grapheme) {
  58         *i += 1;
  59         return true;
  60     }
  61     return false;
  62 }
  63 
  64 static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str) {
  65     int64_t matched = 0;
  66     while (matched[str]) {
  67         if (*i + matched >= state->stack[0].text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched])
  68             return false;
  69         matched += 1;
  70     }
  71     *i += matched;
  72     return true;
  73 }
  74 
  75 static int64_t parse_int(TextIter_t *state, int64_t *i) {
  76     int64_t value = 0;
  77     for (;; *i += 1) {
  78         uint32_t grapheme = Text$get_main_grapheme_fast(state, *i);
  79         int digit = uc_digit_value(grapheme);
  80         if (digit < 0) break;
  81         if (value >= INT64_MAX / 10) break;
  82         value = 10 * value + digit;
  83     }
  84     return value;
  85 }
  86 
  87 static const char *get_property_name(TextIter_t *state, int64_t *i) {
  88     skip_whitespace(state, i);
  89     char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
  90     char *dest = name;
  91     while (*i < state->stack[0].text.length) {
  92         int32_t grapheme = Text$get_grapheme_fast(state, *i);
  93         if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) {
  94             *dest = (char)grapheme;
  95             ++dest;
  96             if (dest >= name + UNINAME_MAX - 1) break;
  97         } else {
  98             break;
  99         }
 100         *i += 1;
 101     }
 102 
 103     while (dest > name && dest[-1] == ' ')
 104         *(dest--) = '\0';
 105 
 106     if (dest == name) return NULL;
 107     *dest = '\0';
 108     return name;
 109 }
 110 
 111 #define EAT1(state, index, cond)                                                                                       \
 112     ({                                                                                                                 \
 113         int32_t grapheme = Text$get_grapheme_fast(state, index);                                                       \
 114         bool success = (cond);                                                                                         \
 115         if (success) index += 1;                                                                                       \
 116         success;                                                                                                       \
 117     })
 118 
 119 #define EAT2(state, index, cond1, cond2)                                                                               \
 120     ({                                                                                                                 \
 121         int32_t grapheme = Text$get_grapheme_fast(state, index);                                                       \
 122         bool success = (cond1);                                                                                        \
 123         if (success) {                                                                                                 \
 124             grapheme = Text$get_grapheme_fast(state, index + 1);                                                       \
 125             success = (cond2);                                                                                         \
 126             if (success) index += 2;                                                                                   \
 127         }                                                                                                              \
 128         success;                                                                                                       \
 129     })
 130 
 131 #define EAT_MANY(state, index, cond)                                                                                   \
 132     ({                                                                                                                 \
 133         int64_t _n = 0;                                                                                                \
 134         while (EAT1(state, index, cond)) {                                                                             \
 135             _n += 1;                                                                                                   \
 136         }                                                                                                              \
 137         _n;                                                                                                            \
 138     })
 139 
 140 static int64_t match_email(TextIter_t *state, int64_t index) {
 141     // email = local "@" domain
 142     // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii)
 143     // domain = dns-label ("." dns-label)*
 144     // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii)
 145 
 146     if (index > 0) {
 147         uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1);
 148         if (uc_is_property_alphabetic(prev_codepoint)) return -1;
 149     }
 150 
 151     int64_t start_index = index;
 152 
 153     // Local part:
 154     int64_t local_len = 0;
 155     static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~";
 156     while (EAT1(state, index, (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) {
 157         local_len += 1;
 158         if (local_len > 64) return -1;
 159     }
 160 
 161     if (!EAT1(state, index, grapheme == '@')) return -1;
 162 
 163     // Host
 164     int64_t host_len = 0;
 165     do {
 166         int64_t label_len = 0;
 167         while (EAT1(state, index, (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) {
 168             label_len += 1;
 169             if (label_len > 63) return -1;
 170         }
 171 
 172         if (label_len == 0) return -1;
 173 
 174         host_len += label_len;
 175         if (host_len > 255) return -1;
 176         host_len += 1;
 177     } while (EAT1(state, index, grapheme == '.'));
 178 
 179     return index - start_index;
 180 }
 181 
 182 static int64_t match_ipv6(TextIter_t *state, int64_t index) {
 183     if (index > 0) {
 184         int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1);
 185         if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':')) return -1;
 186     }
 187     int64_t start_index = index;
 188     const int NUM_CLUSTERS = 8;
 189     bool double_colon_used = false;
 190     for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
 191         for (int digits = 0; digits < 4; digits++) {
 192             if (!EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) break;
 193         }
 194         if (EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) return -1; // Too many digits
 195 
 196         if (cluster == NUM_CLUSTERS - 1) {
 197             break;
 198         } else if (!EAT1(state, index, grapheme == ':')) {
 199             if (double_colon_used) break;
 200             return -1;
 201         }
 202 
 203         if (EAT1(state, index, grapheme == ':')) {
 204             if (double_colon_used) return -1;
 205             double_colon_used = true;
 206         }
 207     }
 208     return index - start_index;
 209 }
 210 
 211 static int64_t match_ipv4(TextIter_t *state, int64_t index) {
 212     if (index > 0) {
 213         int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1);
 214         if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.')) return -1;
 215     }
 216     int64_t start_index = index;
 217 
 218     const int NUM_CLUSTERS = 4;
 219     for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
 220         for (int digits = 0; digits < 3; digits++) {
 221             if (!EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) {
 222                 if (digits == 0) return -1;
 223                 break;
 224             }
 225         }
 226 
 227         if (EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) return -1; // Too many digits
 228 
 229         if (cluster == NUM_CLUSTERS - 1) break;
 230         else if (!EAT1(state, index, grapheme == '.')) return -1;
 231     }
 232     return (index - start_index);
 233 }
 234 
 235 static int64_t match_ip(TextIter_t *state, int64_t index) {
 236     int64_t len = match_ipv6(state, index);
 237     if (len >= 0) return len;
 238     len = match_ipv4(state, index);
 239     return (len >= 0) ? len : -1;
 240 }
 241 
 242 static int64_t match_host(TextIter_t *state, int64_t index) {
 243     int64_t ip_len = match_ip(state, index);
 244     if (ip_len > 0) return ip_len;
 245 
 246     int64_t start_index = index;
 247     if (match_grapheme(state, &index, '[')) {
 248         ip_len = match_ip(state, index);
 249         if (ip_len <= 0) return -1;
 250         index += ip_len;
 251         if (match_grapheme(state, &index, ']')) return (index - start_index);
 252         return -1;
 253     }
 254 
 255     if (!EAT1(state, index, isalpha(grapheme))) return -1;
 256 
 257     static const char *non_host_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`";
 258     EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_host_chars, (char)grapheme));
 259     return (index - start_index);
 260 }
 261 
 262 static int64_t match_authority(TextIter_t *state, int64_t index) {
 263     int64_t authority_start = index;
 264     static const char *non_segment_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`.";
 265 
 266     // Optional user@ prefix:
 267     int64_t username_len = EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_segment_chars, (char)grapheme));
 268     if (username_len < 1 || !EAT1(state, index, grapheme == '@')) index = authority_start; // No user@ part
 269 
 270     // Host:
 271     int64_t host_len = match_host(state, index);
 272     if (host_len <= 0) return -1;
 273     index += host_len;
 274 
 275     // Port:
 276     if (EAT1(state, index, grapheme == ':')) {
 277         if (EAT_MANY(state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0) return -1;
 278     }
 279     return (index - authority_start);
 280 }
 281 
 282 static int64_t match_uri(TextIter_t *state, int64_t index) {
 283     // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment]
 284     // scheme = [a-zA-Z] [a-zA-Z0-9+.-]
 285     // authority = [userinfo "@"] host [":" port]
 286 
 287     if (index > 0) {
 288         // Don't match if we're not at a word edge:
 289         uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1);
 290         if (uc_is_property_alphabetic(prev_codepoint)) return -1;
 291     }
 292 
 293     int64_t start_index = index;
 294 
 295     // Scheme:
 296     if (!EAT1(state, index, isalpha(grapheme))) return -1;
 297     EAT_MANY(state, index,
 298              !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-'));
 299     if (!match_grapheme(state, &index, ':')) return -1;
 300 
 301     // Authority:
 302     int64_t authority_len;
 303     if (match_str(state, &index, "//")) {
 304         authority_len = match_authority(state, index);
 305         if (authority_len > 0) index += authority_len;
 306     } else {
 307         authority_len = 0;
 308     }
 309 
 310     // Path:
 311     int64_t path_start = index;
 312     if (EAT1(state, index, grapheme == '/') || authority_len <= 0) {
 313         static const char *non_path = " \"#?<>[]{}\\^`|";
 314         EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme));
 315 
 316         if (EAT1(state, index, grapheme == '?')) { // Query
 317             static const char *non_query = " \"#<>[]{}\\^`|";
 318             EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme));
 319         }
 320 
 321         if (EAT1(state, index, grapheme == '#')) { // Fragment
 322             static const char *non_fragment = " \"#<>[]{}\\^`|";
 323             EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme));
 324         }
 325     }
 326 
 327     if (authority_len <= 0 && index == path_start) return -1;
 328 
 329     return index - start_index;
 330 }
 331 
 332 static int64_t match_url(TextIter_t *state, int64_t index) {
 333     int64_t lookahead = index;
 334     if (!(match_str(state, &lookahead, "https:") || match_str(state, &lookahead, "http:")
 335           || match_str(state, &lookahead, "ftp:") || match_str(state, &lookahead, "wss:")
 336           || match_str(state, &lookahead, "ws:")))
 337         return -1;
 338 
 339     return match_uri(state, index);
 340 }
 341 
 342 static int64_t match_id(TextIter_t *state, int64_t index) {
 343     if (!EAT1(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START))) return -1;
 344     return 1 + EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE));
 345 }
 346 
 347 static int64_t match_int(TextIter_t *state, int64_t index) {
 348     int64_t negative = EAT1(state, index, grapheme == '-') ? 1 : 0;
 349     int64_t len = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
 350     return len > 0 ? negative + len : -1;
 351 }
 352 
 353 static int64_t match_alphanumeric(TextIter_t *state, int64_t index) {
 354     return EAT1(state, index, uc_is_property_alphabetic((ucs4_t)grapheme) || uc_is_property_numeric((ucs4_t)grapheme))
 355                ? 1
 356                : -1;
 357 }
 358 
 359 static int64_t match_num(TextIter_t *state, int64_t index) {
 360     bool negative = EAT1(state, index, grapheme == '-') ? 1 : 0;
 361     int64_t pre_decimal = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
 362     bool decimal = (EAT1(state, index, grapheme == '.') == 1);
 363     int64_t post_decimal =
 364         decimal ? EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0;
 365     if (pre_decimal == 0 && post_decimal == 0) return -1;
 366     return negative + pre_decimal + decimal + post_decimal;
 367 }
 368 
 369 static int64_t match_newline(TextIter_t *state, int64_t index) {
 370     if (index >= state->stack[0].text.length) return -1;
 371 
 372     uint32_t grapheme = index >= state->stack[0].text.length ? 0 : Text$get_main_grapheme_fast(state, index);
 373     if (grapheme == '\n') return 1;
 374     if (grapheme == '\r' && Text$get_grapheme_fast(state, index + 1) == '\n') return 2;
 375     return -1;
 376 }
 377 
 378 static int64_t match_pat(TextIter_t *state, int64_t index, pat_t pat) {
 379     Text_t text = state->stack[0].text;
 380     int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(state, index);
 381 
 382     switch (pat.tag) {
 383     case PAT_START: {
 384         if (index == 0) return pat.negated ? -1 : 0;
 385         return pat.negated ? 0 : -1;
 386     }
 387     case PAT_END: {
 388         if (index >= text.length) return pat.negated ? -1 : 0;
 389         return pat.negated ? 0 : -1;
 390     }
 391     case PAT_ANY: {
 392         assert(!pat.negated);
 393         return (index < text.length) ? 1 : -1;
 394     }
 395     case PAT_GRAPHEME: {
 396         if (index >= text.length) return -1;
 397         else if (grapheme == pat.grapheme) return pat.negated ? -1 : 1;
 398         return pat.negated ? 1 : -1;
 399     }
 400     case PAT_PROPERTY: {
 401         if (index >= text.length) return -1;
 402         else if (uc_is_property((ucs4_t)grapheme, pat.property)) return pat.negated ? -1 : 1;
 403         return pat.negated ? 1 : -1;
 404     }
 405     case PAT_PAIR: {
 406         // Nested punctuation: (?), [?], etc
 407         if (index >= text.length) return -1;
 408 
 409         int32_t open = pat.pair_graphemes[0];
 410         if (grapheme != open) return pat.negated ? 1 : -1;
 411 
 412         int32_t close = pat.pair_graphemes[1];
 413         int64_t depth = 1;
 414         int64_t match_len = 1;
 415         for (; depth > 0; match_len++) {
 416             if (index + match_len >= text.length) return pat.negated ? 1 : -1;
 417 
 418             int32_t c = Text$get_grapheme_fast(state, index + match_len);
 419             if (c == open) depth += 1;
 420             else if (c == close) depth -= 1;
 421         }
 422         return pat.negated ? -1 : match_len;
 423     }
 424     case PAT_QUOTE: {
 425         // Nested quotes: "?", '?', etc
 426         if (index >= text.length) return -1;
 427 
 428         int32_t open = pat.quote_graphemes[0];
 429         if (grapheme != open) return pat.negated ? 1 : -1;
 430 
 431         int32_t close = pat.quote_graphemes[1];
 432         for (int64_t i = index + 1; i < text.length; i++) {
 433             int32_t c = Text$get_grapheme_fast(state, i);
 434             if (c == close) {
 435                 return pat.negated ? -1 : (i - index) + 1;
 436             } else if (c == '\\' && index + 1 < text.length) {
 437                 i += 1; // Skip ahead an extra step
 438             }
 439         }
 440         return pat.negated ? 1 : -1;
 441     }
 442     case PAT_FUNCTION: {
 443         int64_t match_len = pat.fn(state, index);
 444         if (match_len >= 0) return pat.negated ? -1 : match_len;
 445         return pat.negated ? 1 : -1;
 446     }
 447     default: errx(1, "Invalid pattern");
 448     }
 449     errx(1, "Unreachable");
 450     return 0;
 451 }
 452 
 453 static pat_t parse_next_pat(TextIter_t *state, int64_t *index) {
 454     if (EAT2(state, *index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK), grapheme == '?')) {
 455         // Quotations: "?", '?', etc
 456         int32_t open = Text$get_grapheme_fast(state, *index - 2);
 457         int32_t close = open;
 458         uc_mirror_char((ucs4_t)open, (ucs4_t *)&close);
 459         if (!match_grapheme(state, index, close)) fail("Pattern's closing quote is missing: ", state->stack[0].text);
 460 
 461         return (pat_t){
 462             .tag = PAT_QUOTE,
 463             .min = 1,
 464             .max = 1,
 465             .quote_graphemes = {open, close},
 466         };
 467     } else if (EAT2(state, *index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION), grapheme == '?')) {
 468         // Nested punctuation: (?), [?], etc
 469         int32_t open = Text$get_grapheme_fast(state, *index - 2);
 470         int32_t close = open;
 471         uc_mirror_char((ucs4_t)open, (ucs4_t *)&close);
 472         if (!match_grapheme(state, index, close)) fail("Pattern's closing brace is missing: ", state->stack[0].text);
 473 
 474         return (pat_t){
 475             .tag = PAT_PAIR,
 476             .min = 1,
 477             .max = 1,
 478             .pair_graphemes = {open, close},
 479         };
 480     } else if (EAT1(state, *index,
 481                     grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc.
 482         skip_whitespace(state, index);
 483         int64_t min, max;
 484         if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(state, *index))) {
 485             min = parse_int(state, index);
 486             skip_whitespace(state, index);
 487             if (match_grapheme(state, index, '+')) {
 488                 max = INT64_MAX;
 489             } else if (match_grapheme(state, index, '-')) {
 490                 max = parse_int(state, index);
 491             } else {
 492                 max = min;
 493             }
 494             if (min > max) fail("Minimum repetitions (", min, ") is less than the maximum (", max, ")");
 495         } else {
 496             min = -1, max = -1;
 497         }
 498 
 499         skip_whitespace(state, index);
 500 
 501         bool negated = match_grapheme(state, index, '!');
 502 #define PAT(_tag, ...) ((pat_t){.min = min, .max = max, .negated = negated, .tag = _tag, __VA_ARGS__})
 503         const char *prop_name;
 504         if (match_str(state, index, "..")) prop_name = "..";
 505         else prop_name = get_property_name(state, index);
 506 
 507         if (!prop_name) {
 508             // Literal character, e.g. {1?}
 509             skip_whitespace(state, index);
 510             int32_t grapheme = Text$get_grapheme_fast(state, (*index)++);
 511             if (!match_grapheme(state, index, '}')) fail("Missing closing '}' in pattern: ", state->stack[0].text);
 512             return PAT(PAT_GRAPHEME, .grapheme = grapheme);
 513         } else if (strlen(prop_name) == 1) {
 514             // Single letter names: {1+ A}
 515             skip_whitespace(state, index);
 516             if (!match_grapheme(state, index, '}')) fail("Missing closing '}' in pattern: ", state->stack[0].text);
 517             return PAT(PAT_GRAPHEME, .grapheme = prop_name[0]);
 518         }
 519 
 520         skip_whitespace(state, index);
 521         if (!match_grapheme(state, index, '}')) fail("Missing closing '}' in pattern: ", state->stack[0].text);
 522 
 523         switch (tolower(prop_name[0])) {
 524         case '.':
 525             if (prop_name[1] == '.') {
 526                 if (negated) return ((pat_t){.tag = PAT_END, .min = min, .max = max, .non_capturing = true});
 527                 else return PAT(PAT_ANY);
 528             }
 529             break;
 530         case 'a':
 531             if (strcasecmp(prop_name, "authority") == 0) {
 532                 return PAT(PAT_FUNCTION, .fn = match_authority);
 533             } else if (strcasecmp(prop_name, "alphanum") == 0 || strcasecmp(prop_name, "anum") == 0
 534                        || strcasecmp(prop_name, "alphanumeric") == 0) {
 535                 return PAT(PAT_FUNCTION, .fn = match_alphanumeric);
 536             }
 537             break;
 538         case 'c':
 539             if (strcasecmp(prop_name, "crlf") == 0) return PAT(PAT_FUNCTION, .fn = match_newline);
 540             break;
 541         case 'd':
 542             if (strcasecmp(prop_name, "digit") == 0) {
 543                 return PAT(PAT_PROPERTY, .property = UC_PROPERTY_DECIMAL_DIGIT);
 544             }
 545             break;
 546         case 'e':
 547             if (strcasecmp(prop_name, "end") == 0) {
 548                 return PAT(PAT_END, .non_capturing = !negated);
 549             } else if (strcasecmp(prop_name, "email") == 0) {
 550                 return PAT(PAT_FUNCTION, .fn = match_email);
 551             }
 552 #if _LIBUNISTRING_VERSION >= 0x0100000
 553             else if (strcasecmp(prop_name, "emoji") == 0) {
 554                 return PAT(PAT_PROPERTY, .property = UC_PROPERTY_EMOJI);
 555             }
 556 #endif
 557             break;
 558         case 'h':
 559             if (strcasecmp(prop_name, "host") == 0) {
 560                 return PAT(PAT_FUNCTION, .fn = match_host);
 561             }
 562             break;
 563         case 'i':
 564             if (strcasecmp(prop_name, "id") == 0) {
 565                 return PAT(PAT_FUNCTION, .fn = match_id);
 566             } else if (strcasecmp(prop_name, "int") == 0) {
 567                 return PAT(PAT_FUNCTION, .fn = match_int);
 568             } else if (strcasecmp(prop_name, "ipv4") == 0) {
 569                 return PAT(PAT_FUNCTION, .fn = match_ipv4);
 570             } else if (strcasecmp(prop_name, "ipv6") == 0) {
 571                 return PAT(PAT_FUNCTION, .fn = match_ipv6);
 572             } else if (strcasecmp(prop_name, "ip") == 0) {
 573                 return PAT(PAT_FUNCTION, .fn = match_ip);
 574             }
 575             break;
 576         case 'l':
 577             if (strcasecmp(prop_name, "letter") == 0) {
 578                 return PAT(PAT_PROPERTY, .property = UC_PROPERTY_ALPHABETIC);
 579             }
 580         case 'n':
 581             if (strcasecmp(prop_name, "nl") == 0 || strcasecmp(prop_name, "newline") == 0) {
 582                 return PAT(PAT_FUNCTION, .fn = match_newline);
 583             } else if (strcasecmp(prop_name, "num") == 0) {
 584                 return PAT(PAT_FUNCTION, .fn = match_num);
 585             }
 586             break;
 587         case 's':
 588             if (strcasecmp(prop_name, "start") == 0) {
 589                 return PAT(PAT_START, .non_capturing = !negated);
 590             }
 591             break;
 592         case 'u':
 593             if (strcasecmp(prop_name, "uri") == 0) {
 594                 return PAT(PAT_FUNCTION, .fn = match_uri);
 595             } else if (strcasecmp(prop_name, "url") == 0) {
 596                 return PAT(PAT_FUNCTION, .fn = match_url);
 597             }
 598             break;
 599         case 'w':
 600             if (strcasecmp(prop_name, "word") == 0) {
 601                 return PAT(PAT_FUNCTION, .fn = match_id);
 602             } else if (strcasecmp(prop_name, "ws") == 0 || strcasecmp(prop_name, "whitespace") == 0) {
 603                 return PAT(PAT_PROPERTY, .property = UC_PROPERTY_WHITE_SPACE);
 604             }
 605             break;
 606         default: break;
 607         }
 608 
 609         uc_property_t prop = uc_property_byname(prop_name);
 610         if (uc_property_is_valid(prop)) return PAT(PAT_PROPERTY, .property = prop);
 611 
 612         ucs4_t grapheme = unicode_name_character(prop_name);
 613         if (grapheme == UNINAME_INVALID) fail("Not a valid property or character name: ", prop_name);
 614         return PAT(PAT_GRAPHEME, .grapheme = (int32_t)grapheme);
 615 #undef PAT
 616     } else {
 617         return (pat_t){.tag = PAT_GRAPHEME,
 618                        .non_capturing = true,
 619                        .min = 1,
 620                        .max = 1,
 621                        .grapheme = Text$get_grapheme_fast(state, (*index)++)};
 622     }
 623 }
 624 
 625 static int64_t match(Text_t text, int64_t text_index, Text_t pattern, int64_t pattern_index, capture_t *captures,
 626                      int64_t capture_index) {
 627     if (pattern_index >= pattern.length) // End of the pattern
 628         return 0;
 629 
 630     int64_t start_index = text_index;
 631     TextIter_t pattern_state = NEW_TEXT_ITER_STATE(pattern), text_state = NEW_TEXT_ITER_STATE(text);
 632     pat_t pat = parse_next_pat(&pattern_state, &pattern_index);
 633 
 634     if (pat.min == -1 && pat.max == -1) {
 635         if (pat.tag == PAT_ANY && pattern_index >= pattern.length) {
 636             pat.min = pat.max = MAX(1, text.length - text_index);
 637         } else {
 638             pat.min = 1;
 639             pat.max = INT64_MAX;
 640         }
 641     }
 642 
 643     int64_t capture_start = text_index;
 644     int64_t count = 0, capture_len = 0, next_match_len = 0;
 645 
 646     if (pat.tag == PAT_ANY && pattern_index >= pattern.length) {
 647         int64_t remaining = text.length - text_index;
 648         capture_len = remaining >= pat.min ? MIN(remaining, pat.max) : -1;
 649         text_index += capture_len;
 650         goto success;
 651     }
 652 
 653     if (pat.min == 0 && pattern_index < pattern.length) {
 654         next_match_len =
 655             match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1));
 656         if (next_match_len >= 0) {
 657             capture_len = 0;
 658             goto success;
 659         }
 660     }
 661 
 662     while (count < pat.max) {
 663         int64_t match_len = match_pat(&text_state, text_index, pat);
 664         if (match_len < 0) break;
 665         capture_len += match_len;
 666         text_index += match_len;
 667         count += 1;
 668 
 669         if (pattern_index < pattern.length) { // More stuff after this
 670             if (count < pat.min) next_match_len = -1;
 671             else
 672                 next_match_len = match(text, text_index, pattern, pattern_index, captures,
 673                                        capture_index + (pat.non_capturing ? 0 : 1));
 674         } else {
 675             next_match_len = 0;
 676         }
 677 
 678         if (match_len == 0) {
 679             if (next_match_len >= 0) {
 680                 // If we're good to go, no need to keep re-matching zero-length
 681                 // matches till we hit max:
 682                 count = pat.max;
 683                 break;
 684             } else {
 685                 return -1;
 686             }
 687         }
 688 
 689         if (pattern_index < pattern.length && next_match_len >= 0) break; // Next guy exists and wants to stop here
 690 
 691         if (text_index >= text.length) break;
 692     }
 693 
 694     if (count < pat.min || next_match_len < 0) return -1;
 695 
 696 success:
 697     if (captures && capture_index < MAX_BACKREFS && !pat.non_capturing) {
 698         if (pat.tag == PAT_PAIR || pat.tag == PAT_QUOTE) {
 699             assert(capture_len > 0);
 700             captures[capture_index] = (capture_t){
 701                 .index = capture_start + 1, // Skip leading quote/paren
 702                 .length = capture_len - 2, // Skip open/close
 703                 .occupied = true,
 704                 .recursive = (pat.tag == PAT_PAIR),
 705             };
 706         } else {
 707             captures[capture_index] = (capture_t){
 708                 .index = capture_start,
 709                 .length = capture_len,
 710                 .occupied = true,
 711                 .recursive = false,
 712             };
 713         }
 714     }
 715     return (text_index - start_index) + next_match_len;
 716 }
 717 
 718 #undef EAT1
 719 #undef EAT2
 720 #undef EAT_MANY
 721 
 722 static int64_t _find(Text_t text, Text_t pattern, int64_t first, int64_t last, int64_t *match_length,
 723                      capture_t *captures) {
 724     int32_t first_grapheme = Text$get_grapheme(pattern, 0);
 725     bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
 726                        && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
 727 
 728     TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
 729     for (int64_t i = first; i <= last; i++) {
 730         // Optimization: quickly skip ahead to first char in pattern:
 731         if (find_first) {
 732             while (i < text.length && Text$get_grapheme_fast(&text_state, i) != first_grapheme)
 733                 ++i;
 734         }
 735 
 736         int64_t m = match(text, i, pattern, 0, captures, 0);
 737         if (m >= 0) {
 738             if (match_length) *match_length = m;
 739             return i;
 740         }
 741     }
 742     if (match_length) *match_length = -1;
 743     return -1;
 744 }
 745 
 746 static OptionalPatternMatch find(Text_t text, Text_t pattern, Int_t from_index) {
 747     int64_t first = Int64$from_int(from_index, false);
 748     if (first == 0) fail("Invalid index: 0");
 749     if (first < 0) first = text.length + first + 1;
 750     if (first > text.length || first < 1) return NONE_MATCH;
 751 
 752     capture_t captures[MAX_BACKREFS] = {};
 753     int64_t len = 0;
 754     int64_t found = _find(text, pattern, first - 1, text.length - 1, &len, captures);
 755     if (found == -1) return NONE_MATCH;
 756 
 757     List_t capture_list = {};
 758     for (int i = 0; captures[i].occupied; i++) {
 759         Text_t capture = Text$slice(text, I(captures[i].index + 1), I(captures[i].index + captures[i].length));
 760         List$insert(&capture_list, &capture, I(0), sizeof(Text_t));
 761     }
 762     return (OptionalPatternMatch){
 763         .text = Text$slice(text, I(found + 1), I(found + len)),
 764         .index = I(found + 1),
 765         .captures = capture_list,
 766     };
 767 }
 768 
 769 PUREFUNC static bool Pattern$has(Text_t text, Text_t pattern) {
 770     if (pattern.length == 0) {
 771         return true;
 772     } else if (Text$starts_with(pattern, Text("{start}"), &pattern)) {
 773         int64_t m = match(text, 0, pattern, 0, NULL, 0);
 774         return m >= 0;
 775     } else if (Text$ends_with(text, Text("{end}"), NULL)) {
 776         for (int64_t i = text.length - 1; i >= 0; i--) {
 777             int64_t match_len = match(text, i, pattern, 0, NULL, 0);
 778             if (match_len >= 0 && i + match_len == text.length) return true;
 779         }
 780         return false;
 781     } else {
 782         int64_t found = _find(text, pattern, 0, text.length - 1, NULL, NULL);
 783         return (found >= 0);
 784     }
 785 }
 786 
 787 static bool Pattern$matches(Text_t text, Text_t pattern) {
 788     if (pattern.length == 0) return true;
 789     capture_t captures[MAX_BACKREFS] = {};
 790     int64_t match_len = match(text, 0, pattern, 0, NULL, 0);
 791     return (match_len == text.length);
 792 }
 793 
 794 static bool Pattern$match_at(Text_t text, Text_t pattern, Int_t pos, PatternMatch *dest) {
 795     if (pattern.length == 0) return true;
 796     int64_t start = Int64$from_int(pos, false) - 1;
 797     capture_t captures[MAX_BACKREFS] = {};
 798     int64_t match_len = match(text, start, pattern, 0, captures, 0);
 799     if (match_len < 0) return false;
 800 
 801     List_t capture_list = {};
 802     for (int i = 0; captures[i].occupied; i++) {
 803         Text_t capture = Text$slice(text, I(captures[i].index + 1), I(captures[i].index + captures[i].length));
 804         List$insert(&capture_list, &capture, I(0), sizeof(Text_t));
 805     }
 806     dest->text = Text$slice(text, I(start + 1), I(start + match_len));
 807     dest->index = I(start + 1);
 808     dest->captures = capture_list;
 809     return true;
 810 }
 811 
 812 static OptionalList_t Pattern$captures(Text_t text, Text_t pattern) {
 813     if (pattern.length == 0) return EMPTY_LIST;
 814     capture_t captures[MAX_BACKREFS] = {};
 815     int64_t match_len = match(text, 0, pattern, 0, captures, 0);
 816     if (match_len != text.length) return NONE_LIST;
 817 
 818     List_t capture_list = {};
 819     for (int i = 0; captures[i].occupied; i++) {
 820         Text_t capture = Text$slice(text, I(captures[i].index + 1), I(captures[i].index + captures[i].length));
 821         List$insert(&capture_list, &capture, I(0), sizeof(Text_t));
 822     }
 823     return capture_list;
 824 }
 825 
 826 static List_t Pattern$find_all(Text_t text, Text_t pattern) {
 827     if (text.length == 0 || pattern.length == 0) // special case
 828         return EMPTY_LIST;
 829 
 830     List_t matches = {};
 831     for (int64_t i = 1;;) {
 832         OptionalPatternMatch m = find(text, pattern, I(i));
 833         if (m.is_none) break;
 834         i = Int64$from_int(m.index, false) + m.text.length;
 835         List$insert(&matches, &m, I_small(0), sizeof(PatternMatch));
 836     }
 837     return matches;
 838 }
 839 
 840 typedef struct {
 841     TextIter_t state;
 842     Int_t i;
 843     Text_t pattern;
 844 } match_iter_state_t;
 845 
 846 static OptionalPatternMatch next_match(match_iter_state_t *state) {
 847     if (Int64$from_int(state->i, false) > state->state.stack[0].text.length) return NONE_MATCH;
 848 
 849     OptionalPatternMatch m = find(state->state.stack[0].text, state->pattern, state->i);
 850     if (m.is_none) // No match
 851         state->i = I(state->state.stack[0].text.length + 1);
 852     else state->i = Int$plus(m.index, I(MAX(1, m.text.length)));
 853     return m;
 854 }
 855 
 856 static Closure_t Pattern$by_match(Text_t text, Text_t pattern) {
 857     return (Closure_t){
 858         .fn = (void *)next_match,
 859         .userdata = new (match_iter_state_t, .state = NEW_TEXT_ITER_STATE(text), .i = I_small(1), .pattern = pattern),
 860     };
 861 }
 862 
 863 static bool substring_match_at(TextIter_t *haystack, TextIter_t *needle, int64_t pos) {
 864     for (int64_t i = 0; i < needle->stack[0].text.length; i++) {
 865         if (Text$get_grapheme_fast(haystack, pos + i) != Text$get_grapheme_fast(needle, i)) return false;
 866     }
 867     return true;
 868 }
 869 
 870 static Text_t apply_backrefs(Text_t text, List_t recursive_replacements, Text_t replacement, Text_t backref_marker,
 871                              capture_t *captures) {
 872     if (backref_marker.length == 0) return replacement;
 873 
 874     Text_t ret = Text("");
 875     TextIter_t replacement_state = NEW_TEXT_ITER_STATE(replacement);
 876     TextIter_t backref_state = NEW_TEXT_ITER_STATE(backref_marker);
 877     int32_t first_grapheme = Text$get_grapheme_fast(&backref_state, 0);
 878     int64_t nonmatching_pos = 0;
 879     for (int64_t pos = 0; pos < replacement.length;) {
 880         // Optimization: quickly skip ahead to first char in the backref pattern:
 881         while (pos + 1 < replacement.length && Text$get_grapheme_fast(&replacement_state, pos) != first_grapheme)
 882             ++pos;
 883 
 884         // If it's not a backref marker, proceed normally
 885         if (!substring_match_at(&replacement_state, &backref_state, pos)) {
 886             pos += 1;
 887             continue;
 888         }
 889 
 890         // For double backrefs like "@@", treat it as an escape
 891         if (substring_match_at(&replacement_state, &backref_state, pos + backref_marker.length)) {
 892             if (pos > nonmatching_pos) {
 893                 Text_t before_slice = Text$slice(replacement, I(nonmatching_pos + 1), I(pos));
 894                 ret = Texts(ret, before_slice);
 895             }
 896             ret = Texts(ret, backref_marker);
 897             pos += 2 * backref_marker.length;
 898             nonmatching_pos = pos;
 899             continue;
 900         }
 901 
 902         int64_t after_backref = pos + backref_marker.length;
 903         int64_t backref = parse_int(&replacement_state, &after_backref);
 904         if (after_backref == pos + backref_marker.length) { // Not actually a backref if there's no number
 905             pos += 1;
 906             continue;
 907         }
 908         if (backref < 0 || backref >= MAX_BACKREFS)
 909             fail("Invalid backref index: ", backref, " (only 0-", MAX_BACKREFS - 1, " are allowed)");
 910 
 911         if (!captures[backref].occupied) fail("There is no capture number ", backref, "!");
 912 
 913         if (Text$get_grapheme_fast(&replacement_state, after_backref) == ';')
 914             after_backref += 1; // skip optional semicolon
 915 
 916         Text_t backref_text =
 917             Text$slice(text, I(captures[backref].index + 1), I(captures[backref].index + captures[backref].length));
 918 
 919         if (captures[backref].recursive && recursive_replacements.length > 0)
 920             backref_text = replace_list(backref_text, recursive_replacements, backref_marker, true);
 921 
 922         if (pos > nonmatching_pos) {
 923             Text_t before_slice = Text$slice(replacement, I(nonmatching_pos + 1), I(pos));
 924             ret = Text$concat(ret, before_slice, backref_text);
 925         } else {
 926             ret = Text$concat(ret, backref_text);
 927         }
 928 
 929         pos = after_backref;
 930         nonmatching_pos = pos;
 931     }
 932     if (nonmatching_pos < replacement.length) {
 933         Text_t last_slice = Text$slice(replacement, I(nonmatching_pos + 1), I(replacement.length));
 934         ret = Text$concat(ret, last_slice);
 935     }
 936     return ret;
 937 }
 938 
 939 static Text_t Pattern$replace(Text_t text, Text_t pattern, Text_t replacement, Text_t backref_marker, bool recursive) {
 940     if (text.length == 0 || pattern.length == 0) return text;
 941     Text_t ret = EMPTY_TEXT;
 942 
 943     int32_t first_grapheme = Text$get_grapheme(pattern, 0);
 944     bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
 945                        && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
 946 
 947     Text_t entries[2] = {pattern, replacement};
 948     List_t replacements = {
 949         .data = entries,
 950         .length = 1,
 951         .stride = sizeof(entries),
 952     };
 953 
 954     TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
 955     int64_t nonmatching_pos = 0;
 956     for (int64_t pos = 0; pos < text.length;) {
 957         // Optimization: quickly skip ahead to first char in pattern:
 958         if (find_first) {
 959             while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme)
 960                 ++pos;
 961         }
 962 
 963         capture_t captures[MAX_BACKREFS] = {};
 964         int64_t match_len = match(text, pos, pattern, 0, captures, 1);
 965         if (match_len < 0) {
 966             pos += 1;
 967             continue;
 968         }
 969         captures[0] = (capture_t){
 970             .index = pos,
 971             .length = match_len,
 972             .occupied = true,
 973             .recursive = false,
 974         };
 975 
 976         Text_t replacement_text =
 977             apply_backrefs(text, recursive ? replacements : (List_t){}, replacement, backref_marker, captures);
 978         if (pos > nonmatching_pos) {
 979             Text_t before_slice = Text$slice(text, I(nonmatching_pos + 1), I(pos));
 980             ret = Text$concat(ret, before_slice, replacement_text);
 981         } else {
 982             ret = Text$concat(ret, replacement_text);
 983         }
 984         nonmatching_pos = pos + match_len;
 985         pos += MAX(match_len, 1);
 986     }
 987     if (nonmatching_pos < text.length) {
 988         Text_t last_slice = Text$slice(text, I(nonmatching_pos + 1), I(text.length));
 989         ret = Text$concat(ret, last_slice);
 990     }
 991     return ret;
 992 }
 993 
 994 static Text_t Pattern$trim(Text_t text, Text_t pattern, bool trim_left, bool trim_right) {
 995     if (text.length == 0 || pattern.length == 0) return text;
 996     int64_t first = 0, last = text.length - 1;
 997     if (trim_left) {
 998         int64_t match_len = match(text, 0, pattern, 0, NULL, 0);
 999         if (match_len > 0) first = match_len;
1000     }
1001 
1002     if (trim_right) {
1003         for (int64_t i = text.length - 1; i >= first; i--) {
1004             int64_t match_len = match(text, i, pattern, 0, NULL, 0);
1005             if (match_len > 0 && i + match_len == text.length) last = i - 1;
1006         }
1007     }
1008     return Text$slice(text, I(first + 1), I(last + 1));
1009 }
1010 
1011 static Text_t Pattern$map(Text_t text, Text_t pattern, Closure_t fn, bool recursive) {
1012     if (text.length == 0 || pattern.length == 0) return text;
1013     Text_t ret = EMPTY_TEXT;
1014 
1015     int32_t first_grapheme = Text$get_grapheme(pattern, 0);
1016     bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
1017                        && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
1018 
1019     TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
1020     int64_t nonmatching_pos = 0;
1021 
1022     Text_t (*text_mapper)(PatternMatch, void *) = fn.fn;
1023     for (int64_t pos = 0; pos < text.length; pos++) {
1024         // Optimization: quickly skip ahead to first char in pattern:
1025         if (find_first) {
1026             while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme)
1027                 ++pos;
1028         }
1029 
1030         capture_t captures[MAX_BACKREFS] = {};
1031         int64_t match_len = match(text, pos, pattern, 0, captures, 0);
1032         if (match_len < 0) continue;
1033 
1034         PatternMatch m = {
1035             .text = Text$slice(text, I(pos + 1), I(pos + match_len)),
1036             .index = I(pos + 1),
1037             .captures = {},
1038         };
1039         for (int i = 0; captures[i].occupied; i++) {
1040             Text_t capture = Text$slice(text, I(captures[i].index + 1), I(captures[i].index + captures[i].length));
1041             if (recursive) capture = Pattern$map(capture, pattern, fn, recursive);
1042             List$insert(&m.captures, &capture, I(0), sizeof(Text_t));
1043         }
1044 
1045         Text_t replacement = text_mapper(m, fn.userdata);
1046         if (pos > nonmatching_pos) {
1047             Text_t before_slice = Text$slice(text, I(nonmatching_pos + 1), I(pos));
1048             ret = Text$concat(ret, before_slice, replacement);
1049         } else {
1050             ret = Text$concat(ret, replacement);
1051         }
1052         nonmatching_pos = pos + match_len;
1053         pos += (match_len - 1);
1054     }
1055     if (nonmatching_pos < text.length) {
1056         Text_t last_slice = Text$slice(text, I(nonmatching_pos + 1), I(text.length));
1057         ret = Text$concat(ret, last_slice);
1058     }
1059     return ret;
1060 }
1061 
1062 static void Pattern$each(Text_t text, Text_t pattern, Closure_t fn, bool recursive) {
1063     if (text.length == 0 || pattern.length == 0) return;
1064     int32_t first_grapheme = Text$get_grapheme(pattern, 0);
1065     bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
1066                        && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
1067 
1068     TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
1069     void (*action)(PatternMatch, void *) = fn.fn;
1070     for (int64_t pos = 0; pos < text.length; pos++) {
1071         // Optimization: quickly skip ahead to first char in pattern:
1072         if (find_first) {
1073             while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme)
1074                 ++pos;
1075         }
1076 
1077         capture_t captures[MAX_BACKREFS] = {};
1078         int64_t match_len = match(text, pos, pattern, 0, captures, 0);
1079         if (match_len < 0) continue;
1080 
1081         PatternMatch m = {
1082             .text = Text$slice(text, I(pos + 1), I(pos + match_len)),
1083             .index = I(pos + 1),
1084             .captures = {},
1085         };
1086         for (int i = 0; captures[i].occupied; i++) {
1087             Text_t capture = Text$slice(text, I(captures[i].index + 1), I(captures[i].index + captures[i].length));
1088             if (recursive) Pattern$each(capture, pattern, fn, recursive);
1089             List$insert(&m.captures, &capture, I(0), sizeof(Text_t));
1090         }
1091 
1092         action(m, fn.userdata);
1093         pos += (match_len - 1);
1094     }
1095 }
1096 
1097 Text_t replace_list(Text_t text, List_t replacements, Text_t backref_marker, bool recursive) {
1098     if (text.length == 0 || replacements.length == 0) return text;
1099 
1100     Text_t ret = EMPTY_TEXT;
1101 
1102     int64_t nonmatch_pos = 0;
1103     for (int64_t pos = 0; pos < text.length;) {
1104         // Find the first matching pattern at this position:
1105         for (int64_t i = 0; i < replacements.length; i++) {
1106             Text_t pattern = *(Text_t *)(replacements.data + i * replacements.stride);
1107             capture_t captures[MAX_BACKREFS] = {};
1108             int64_t len = match(text, pos, pattern, 0, captures, 1);
1109             if (len < 0) continue;
1110             captures[0].index = pos;
1111             captures[0].length = len;
1112 
1113             // If we skipped over some non-matching text before finding a match,
1114             // insert it here:
1115             if (pos > nonmatch_pos) {
1116                 Text_t before_slice = Text$slice(text, I(nonmatch_pos + 1), I(pos));
1117                 ret = Text$concat(ret, before_slice);
1118             }
1119 
1120             // Concatenate the replacement:
1121             Text_t replacement = *(Text_t *)(replacements.data + i * replacements.stride + sizeof(Text_t));
1122             Text_t replacement_text =
1123                 apply_backrefs(text, recursive ? replacements : (List_t){}, replacement, backref_marker, captures);
1124             ret = Text$concat(ret, replacement_text);
1125             pos += MAX(len, 1);
1126             nonmatch_pos = pos;
1127             goto next_pos;
1128         }
1129 
1130         pos += 1;
1131     next_pos:
1132         continue;
1133     }
1134 
1135     if (nonmatch_pos <= text.length) {
1136         Text_t last_slice = Text$slice(text, I(nonmatch_pos + 1), I(text.length));
1137         ret = Text$concat(ret, last_slice);
1138     }
1139     return ret;
1140 }
1141 
1142 static Text_t Pattern$replace_all(Text_t text, Table_t replacements, Text_t backref_marker, bool recursive) {
1143     return replace_list(text, replacements.entries, backref_marker, recursive);
1144 }
1145 
1146 static List_t Pattern$split(Text_t text, Text_t pattern) {
1147     if (text.length == 0) // special case
1148         return EMPTY_LIST;
1149 
1150     if (pattern.length == 0) // special case
1151         return Text$clusters(text);
1152 
1153     List_t chunks = {};
1154 
1155     int64_t i = 0;
1156     for (;;) {
1157         int64_t len = 0;
1158         int64_t found = _find(text, pattern, i, text.length - 1, &len, NULL);
1159         if (found == i && len == 0) found = _find(text, pattern, i + 1, text.length - 1, &len, NULL);
1160         if (found < 0) break;
1161         Text_t chunk = Text$slice(text, I(i + 1), I(found));
1162         List$insert(&chunks, &chunk, I_small(0), sizeof(Text_t));
1163         i = MAX(found + len, i + 1);
1164     }
1165 
1166     Text_t last_chunk = Text$slice(text, I(i + 1), I(text.length));
1167     List$insert(&chunks, &last_chunk, I_small(0), sizeof(Text_t));
1168 
1169     return chunks;
1170 }
1171 
1172 typedef struct {
1173     TextIter_t state;
1174     int64_t i;
1175     Text_t pattern;
1176 } split_iter_state_t;
1177 
1178 static OptionalText_t next_split(split_iter_state_t *state) {
1179     Text_t text = state->state.stack[0].text;
1180     if (state->i >= text.length) {
1181         if (state->pattern.length > 0 && state->i == text.length) { // special case
1182             state->i = text.length + 1;
1183             return EMPTY_TEXT;
1184         }
1185         return NONE_TEXT;
1186     }
1187 
1188     if (state->pattern.length == 0) { // special case
1189         Text_t ret = Text$cluster(text, I(state->i + 1));
1190         state->i += 1;
1191         return ret;
1192     }
1193 
1194     int64_t start = state->i;
1195     int64_t len = 0;
1196     int64_t found = _find(text, state->pattern, start, text.length - 1, &len, NULL);
1197 
1198     if (found == start && len == 0) found = _find(text, state->pattern, start + 1, text.length - 1, &len, NULL);
1199 
1200     if (found >= 0) {
1201         state->i = MAX(found + len, state->i + 1);
1202         return Text$slice(text, I(start + 1), I(found));
1203     } else {
1204         state->i = state->state.stack[0].text.length + 1;
1205         return Text$slice(text, I(start + 1), I(text.length));
1206     }
1207 }
1208 
1209 static Closure_t Pattern$by_split(Text_t text, Text_t pattern) {
1210     return (Closure_t){
1211         .fn = (void *)next_split,
1212         .userdata = new (split_iter_state_t, .state = NEW_TEXT_ITER_STATE(text), .i = 0, .pattern = pattern),
1213     };
1214 }
1215 
1216 static Text_t Pattern$escape_text(Text_t text) {
1217     // TODO: optimize for spans of non-escaped text
1218     Text_t ret = EMPTY_TEXT;
1219     TextIter_t state = NEW_TEXT_ITER_STATE(text);
1220     for (int64_t i = 0; i < text.length; i++) {
1221         uint32_t g = Text$get_main_grapheme_fast(&state, i);
1222         if (g == '{') {
1223             ret = Text$concat(ret, Text("{1{}"));
1224         } else if (g == '?' || uc_is_property_quotation_mark(g)
1225                    || (uc_is_property_paired_punctuation(g) && uc_is_property_left_of_pair(g))) {
1226             ret = Text$concat(ret, Text("{1"), Text$slice(text, I(i + 1), I(i + 1)), Text("}"));
1227         } else {
1228             ret = Text$concat(ret, Text$slice(text, I(i + 1), I(i + 1)));
1229         }
1230     }
1231     return ret;
1232 }
1233 
1234 static Text_t Pattern$as_text(const void *obj, bool colorize, const TypeInfo_t *info) {
1235     (void)info;
1236     if (!obj) return Text("Pattern");
1237 
1238     Text_t pat = *(Text_t *)obj;
1239     Text_t quote = Pattern$has(pat, Text("/")) && !Pattern$has(pat, Text("|")) ? Text("|") : Text("/");
1240     return Text$concat(colorize ? Text("\x1b[1m$\033[m") : Text("$"), Text$quoted(pat, colorize, quote));
1241 }