code / tomo-patterns

Lines1.6K C1.0K Markdown343 Tomo178
(1.2K lines)
1 // Logic for text pattern matching
3 #include <ctype.h>
4 #include <string.h>
5 #include <strings.h>
6 #include <sys/param.h>
7 #include <unictype.h>
8 #include <uniname.h>
9 #include <unistring/version.h>
11 #define MAX_BACKREFS 100
13 typedef struct {
14 Text_t text;
15 Int_t index;
16 List_t captures;
17 } PatternMatch;
19 typedef struct {
20 Text_t text;
21 Int_t index;
22 List_t captures;
23 bool is_none : 1;
24 } OptionalPatternMatch;
26 #define NONE_MATCH ((OptionalPatternMatch){.is_none = true})
28 typedef struct {
29 int64_t index, length;
30 bool occupied, recursive;
31 } capture_t;
33 typedef struct {
34 enum { PAT_START, PAT_END, PAT_ANY, PAT_GRAPHEME, PAT_PROPERTY, PAT_QUOTE, PAT_PAIR, PAT_FUNCTION } tag;
35 bool negated, non_capturing;
36 int64_t min, max;
37 union {
38 int32_t grapheme;
39 uc_property_t property;
40 int64_t (*fn)(TextIter_t *, int64_t);
41 int32_t quote_graphemes[2];
42 int32_t pair_graphemes[2];
43 };
44 } pat_t;
46 static Text_t replace_list(Text_t text, List_t replacements, Text_t backref_marker, bool recursive);
48 static INLINE void skip_whitespace(TextIter_t *state, int64_t *i) {
49 while (*i < state->stack[0].text.length) {
50 int32_t grapheme = Text$get_grapheme_fast(state, *i);
51 if (grapheme > 0 && !uc_is_property_white_space((ucs4_t)grapheme)) return;
52 *i += 1;
56 static INLINE bool match_grapheme(TextIter_t *state, int64_t *i, int32_t grapheme) {
57 if (*i < state->stack[0].text.length && Text$get_grapheme_fast(state, *i) == grapheme) {
58 *i += 1;
59 return true;
61 return false;
64 static INLINE bool match_str(TextIter_t *state, int64_t *i, const char *str) {
65 int64_t matched = 0;
66 while (matched[str]) {
67 if (*i + matched >= state->stack[0].text.length || Text$get_grapheme_fast(state, *i + matched) != str[matched])
68 return false;
69 matched += 1;
71 *i += matched;
72 return true;
75 static int64_t parse_int(TextIter_t *state, int64_t *i) {
76 int64_t value = 0;
77 for (;; *i += 1) {
78 uint32_t grapheme = Text$get_main_grapheme_fast(state, *i);
79 int digit = uc_digit_value(grapheme);
80 if (digit < 0) break;
81 if (value >= INT64_MAX / 10) break;
82 value = 10 * value + digit;
84 return value;
87 static const char *get_property_name(TextIter_t *state, int64_t *i) {
88 skip_whitespace(state, i);
89 char *name = GC_MALLOC_ATOMIC(UNINAME_MAX);
90 char *dest = name;
91 while (*i < state->stack[0].text.length) {
92 int32_t grapheme = Text$get_grapheme_fast(state, *i);
93 if (!(grapheme & ~0xFF) && (isalnum(grapheme) || grapheme == ' ' || grapheme == '_' || grapheme == '-')) {
94 *dest = (char)grapheme;
95 ++dest;
96 if (dest >= name + UNINAME_MAX - 1) break;
97 } else {
98 break;
100 *i += 1;
103 while (dest > name && dest[-1] == ' ')
104 *(dest--) = '\0';
106 if (dest == name) return NULL;
107 *dest = '\0';
108 return name;
111 #define EAT1(state, index, cond) \
112 ({ \
113 int32_t grapheme = Text$get_grapheme_fast(state, index); \
114 bool success = (cond); \
115 if (success) index += 1; \
116 success; \
119 #define EAT2(state, index, cond1, cond2) \
120 ({ \
121 int32_t grapheme = Text$get_grapheme_fast(state, index); \
122 bool success = (cond1); \
123 if (success) { \
124 grapheme = Text$get_grapheme_fast(state, index + 1); \
125 success = (cond2); \
126 if (success) index += 2; \
127 } \
128 success; \
131 #define EAT_MANY(state, index, cond) \
132 ({ \
133 int64_t _n = 0; \
134 while (EAT1(state, index, cond)) { \
135 _n += 1; \
136 } \
137 _n; \
140 static int64_t match_email(TextIter_t *state, int64_t index) {
141 // email = local "@" domain
142 // local = 1-64 ([a-zA-Z0-9!#$%&‘*+–/=?^_`.{|}~] | non-ascii)
143 // domain = dns-label ("." dns-label)*
144 // dns-label = 1-63 ([a-zA-Z0-9-] | non-ascii)
146 if (index > 0) {
147 uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1);
148 if (uc_is_property_alphabetic(prev_codepoint)) return -1;
151 int64_t start_index = index;
153 // Local part:
154 int64_t local_len = 0;
155 static const char *allowed_local = "!#$%&‘*+–/=?^_`.{|}~";
156 while (EAT1(state, index, (grapheme & ~0x7F) || isalnum((char)grapheme) || strchr(allowed_local, (char)grapheme))) {
157 local_len += 1;
158 if (local_len > 64) return -1;
161 if (!EAT1(state, index, grapheme == '@')) return -1;
163 // Host
164 int64_t host_len = 0;
165 do {
166 int64_t label_len = 0;
167 while (EAT1(state, index, (grapheme & ~0x7F) || isalnum((char)grapheme) || grapheme == '-')) {
168 label_len += 1;
169 if (label_len > 63) return -1;
172 if (label_len == 0) return -1;
174 host_len += label_len;
175 if (host_len > 255) return -1;
176 host_len += 1;
177 } while (EAT1(state, index, grapheme == '.'));
179 return index - start_index;
182 static int64_t match_ipv6(TextIter_t *state, int64_t index) {
183 if (index > 0) {
184 int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1);
185 if ((prev_codepoint & ~0x7F) && (isxdigit(prev_codepoint) || prev_codepoint == ':')) return -1;
187 int64_t start_index = index;
188 const int NUM_CLUSTERS = 8;
189 bool double_colon_used = false;
190 for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
191 for (int digits = 0; digits < 4; digits++) {
192 if (!EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) break;
194 if (EAT1(state, index, ~(grapheme & ~0x7F) && isxdigit((char)grapheme))) return -1; // Too many digits
196 if (cluster == NUM_CLUSTERS - 1) {
197 break;
198 } else if (!EAT1(state, index, grapheme == ':')) {
199 if (double_colon_used) break;
200 return -1;
203 if (EAT1(state, index, grapheme == ':')) {
204 if (double_colon_used) return -1;
205 double_colon_used = true;
208 return index - start_index;
211 static int64_t match_ipv4(TextIter_t *state, int64_t index) {
212 if (index > 0) {
213 int32_t prev_codepoint = Text$get_grapheme_fast(state, index - 1);
214 if ((prev_codepoint & ~0x7F) && (isdigit(prev_codepoint) || prev_codepoint == '.')) return -1;
216 int64_t start_index = index;
218 const int NUM_CLUSTERS = 4;
219 for (int cluster = 0; cluster < NUM_CLUSTERS; cluster++) {
220 for (int digits = 0; digits < 3; digits++) {
221 if (!EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) {
222 if (digits == 0) return -1;
223 break;
227 if (EAT1(state, index, ~(grapheme & ~0x7F) && isdigit((char)grapheme))) return -1; // Too many digits
229 if (cluster == NUM_CLUSTERS - 1) break;
230 else if (!EAT1(state, index, grapheme == '.')) return -1;
232 return (index - start_index);
235 static int64_t match_ip(TextIter_t *state, int64_t index) {
236 int64_t len = match_ipv6(state, index);
237 if (len >= 0) return len;
238 len = match_ipv4(state, index);
239 return (len >= 0) ? len : -1;
242 static int64_t match_host(TextIter_t *state, int64_t index) {
243 int64_t ip_len = match_ip(state, index);
244 if (ip_len > 0) return ip_len;
246 int64_t start_index = index;
247 if (match_grapheme(state, &index, '[')) {
248 ip_len = match_ip(state, index);
249 if (ip_len <= 0) return -1;
250 index += ip_len;
251 if (match_grapheme(state, &index, ']')) return (index - start_index);
252 return -1;
255 if (!EAT1(state, index, isalpha(grapheme))) return -1;
257 static const char *non_host_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`";
258 EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_host_chars, (char)grapheme));
259 return (index - start_index);
262 static int64_t match_authority(TextIter_t *state, int64_t index) {
263 int64_t authority_start = index;
264 static const char *non_segment_chars = "/#?:@ \t\r\n<>[]{}\\^|\"`.";
266 // Optional user@ prefix:
267 int64_t username_len = EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_segment_chars, (char)grapheme));
268 if (username_len < 1 || !EAT1(state, index, grapheme == '@')) index = authority_start; // No user@ part
270 // Host:
271 int64_t host_len = match_host(state, index);
272 if (host_len <= 0) return -1;
273 index += host_len;
275 // Port:
276 if (EAT1(state, index, grapheme == ':')) {
277 if (EAT_MANY(state, index, !(grapheme & ~0x7F) && isdigit(grapheme)) == 0) return -1;
279 return (index - authority_start);
282 static int64_t match_uri(TextIter_t *state, int64_t index) {
283 // URI = scheme ":" ["//" authority] path ["?" query] ["#" fragment]
284 // scheme = [a-zA-Z] [a-zA-Z0-9+.-]
285 // authority = [userinfo "@"] host [":" port]
287 if (index > 0) {
288 // Don't match if we're not at a word edge:
289 uint32_t prev_codepoint = Text$get_main_grapheme_fast(state, index - 1);
290 if (uc_is_property_alphabetic(prev_codepoint)) return -1;
293 int64_t start_index = index;
295 // Scheme:
296 if (!EAT1(state, index, isalpha(grapheme))) return -1;
297 EAT_MANY(state, index,
298 !(grapheme & ~0x7F) && (isalnum(grapheme) || grapheme == '+' || grapheme == '.' || grapheme == '-'));
299 if (!match_grapheme(state, &index, ':')) return -1;
301 // Authority:
302 int64_t authority_len;
303 if (match_str(state, &index, "//")) {
304 authority_len = match_authority(state, index);
305 if (authority_len > 0) index += authority_len;
306 } else {
307 authority_len = 0;
310 // Path:
311 int64_t path_start = index;
312 if (EAT1(state, index, grapheme == '/') || authority_len <= 0) {
313 static const char *non_path = " \"#?<>[]{}\\^`|";
314 EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_path, (char)grapheme));
316 if (EAT1(state, index, grapheme == '?')) { // Query
317 static const char *non_query = " \"#<>[]{}\\^`|";
318 EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_query, (char)grapheme));
321 if (EAT1(state, index, grapheme == '#')) { // Fragment
322 static const char *non_fragment = " \"#<>[]{}\\^`|";
323 EAT_MANY(state, index, (grapheme & ~0x7F) || !strchr(non_fragment, (char)grapheme));
327 if (authority_len <= 0 && index == path_start) return -1;
329 return index - start_index;
332 static int64_t match_url(TextIter_t *state, int64_t index) {
333 int64_t lookahead = index;
334 if (!(match_str(state, &lookahead, "https:") || match_str(state, &lookahead, "http:")
335 || match_str(state, &lookahead, "ftp:") || match_str(state, &lookahead, "wss:")
336 || match_str(state, &lookahead, "ws:")))
337 return -1;
339 return match_uri(state, index);
342 static int64_t match_id(TextIter_t *state, int64_t index) {
343 if (!EAT1(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_START))) return -1;
344 return 1 + EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_XID_CONTINUE));
347 static int64_t match_int(TextIter_t *state, int64_t index) {
348 int64_t negative = EAT1(state, index, grapheme == '-') ? 1 : 0;
349 int64_t len = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
350 return len > 0 ? negative + len : -1;
353 static int64_t match_alphanumeric(TextIter_t *state, int64_t index) {
354 return EAT1(state, index, uc_is_property_alphabetic((ucs4_t)grapheme) || uc_is_property_numeric((ucs4_t)grapheme))
355 ? 1
356 : -1;
359 static int64_t match_num(TextIter_t *state, int64_t index) {
360 bool negative = EAT1(state, index, grapheme == '-') ? 1 : 0;
361 int64_t pre_decimal = EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT));
362 bool decimal = (EAT1(state, index, grapheme == '.') == 1);
363 int64_t post_decimal =
364 decimal ? EAT_MANY(state, index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_DECIMAL_DIGIT)) : 0;
365 if (pre_decimal == 0 && post_decimal == 0) return -1;
366 return negative + pre_decimal + decimal + post_decimal;
369 static int64_t match_newline(TextIter_t *state, int64_t index) {
370 if (index >= state->stack[0].text.length) return -1;
372 uint32_t grapheme = index >= state->stack[0].text.length ? 0 : Text$get_main_grapheme_fast(state, index);
373 if (grapheme == '\n') return 1;
374 if (grapheme == '\r' && Text$get_grapheme_fast(state, index + 1) == '\n') return 2;
375 return -1;
378 static int64_t match_pat(TextIter_t *state, int64_t index, pat_t pat) {
379 Text_t text = state->stack[0].text;
380 int32_t grapheme = index >= text.length ? 0 : Text$get_grapheme_fast(state, index);
382 switch (pat.tag) {
383 case PAT_START: {
384 if (index == 0) return pat.negated ? -1 : 0;
385 return pat.negated ? 0 : -1;
387 case PAT_END: {
388 if (index >= text.length) return pat.negated ? -1 : 0;
389 return pat.negated ? 0 : -1;
391 case PAT_ANY: {
392 assert(!pat.negated);
393 return (index < text.length) ? 1 : -1;
395 case PAT_GRAPHEME: {
396 if (index >= text.length) return -1;
397 else if (grapheme == pat.grapheme) return pat.negated ? -1 : 1;
398 return pat.negated ? 1 : -1;
400 case PAT_PROPERTY: {
401 if (index >= text.length) return -1;
402 else if (uc_is_property((ucs4_t)grapheme, pat.property)) return pat.negated ? -1 : 1;
403 return pat.negated ? 1 : -1;
405 case PAT_PAIR: {
406 // Nested punctuation: (?), [?], etc
407 if (index >= text.length) return -1;
409 int32_t open = pat.pair_graphemes[0];
410 if (grapheme != open) return pat.negated ? 1 : -1;
412 int32_t close = pat.pair_graphemes[1];
413 int64_t depth = 1;
414 int64_t match_len = 1;
415 for (; depth > 0; match_len++) {
416 if (index + match_len >= text.length) return pat.negated ? 1 : -1;
418 int32_t c = Text$get_grapheme_fast(state, index + match_len);
419 if (c == open) depth += 1;
420 else if (c == close) depth -= 1;
422 return pat.negated ? -1 : match_len;
424 case PAT_QUOTE: {
425 // Nested quotes: "?", '?', etc
426 if (index >= text.length) return -1;
428 int32_t open = pat.quote_graphemes[0];
429 if (grapheme != open) return pat.negated ? 1 : -1;
431 int32_t close = pat.quote_graphemes[1];
432 for (int64_t i = index + 1; i < text.length; i++) {
433 int32_t c = Text$get_grapheme_fast(state, i);
434 if (c == close) {
435 return pat.negated ? -1 : (i - index) + 1;
436 } else if (c == '\\' && index + 1 < text.length) {
437 i += 1; // Skip ahead an extra step
440 return pat.negated ? 1 : -1;
442 case PAT_FUNCTION: {
443 int64_t match_len = pat.fn(state, index);
444 if (match_len >= 0) return pat.negated ? -1 : match_len;
445 return pat.negated ? 1 : -1;
447 default: errx(1, "Invalid pattern");
449 errx(1, "Unreachable");
450 return 0;
453 static pat_t parse_next_pat(TextIter_t *state, int64_t *index) {
454 if (EAT2(state, *index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_QUOTATION_MARK), grapheme == '?')) {
455 // Quotations: "?", '?', etc
456 int32_t open = Text$get_grapheme_fast(state, *index - 2);
457 int32_t close = open;
458 uc_mirror_char((ucs4_t)open, (ucs4_t *)&close);
459 if (!match_grapheme(state, index, close)) fail("Pattern's closing quote is missing: ", state->stack[0].text);
461 return (pat_t){
462 .tag = PAT_QUOTE,
463 .min = 1,
464 .max = 1,
465 .quote_graphemes = {open, close},
467 } else if (EAT2(state, *index, uc_is_property((ucs4_t)grapheme, UC_PROPERTY_PAIRED_PUNCTUATION), grapheme == '?')) {
468 // Nested punctuation: (?), [?], etc
469 int32_t open = Text$get_grapheme_fast(state, *index - 2);
470 int32_t close = open;
471 uc_mirror_char((ucs4_t)open, (ucs4_t *)&close);
472 if (!match_grapheme(state, index, close)) fail("Pattern's closing brace is missing: ", state->stack[0].text);
474 return (pat_t){
475 .tag = PAT_PAIR,
476 .min = 1,
477 .max = 1,
478 .pair_graphemes = {open, close},
480 } else if (EAT1(state, *index,
481 grapheme == '{')) { // named patterns {id}, {2-3 hex}, etc.
482 skip_whitespace(state, index);
483 int64_t min, max;
484 if (uc_is_digit((ucs4_t)Text$get_grapheme_fast(state, *index))) {
485 min = parse_int(state, index);
486 skip_whitespace(state, index);
487 if (match_grapheme(state, index, '+')) {
488 max = INT64_MAX;
489 } else if (match_grapheme(state, index, '-')) {
490 max = parse_int(state, index);
491 } else {
492 max = min;
494 if (min > max) fail("Minimum repetitions (", min, ") is less than the maximum (", max, ")");
495 } else {
496 min = -1, max = -1;
499 skip_whitespace(state, index);
501 bool negated = match_grapheme(state, index, '!');
502 #define PAT(_tag, ...) ((pat_t){.min = min, .max = max, .negated = negated, .tag = _tag, __VA_ARGS__})
503 const char *prop_name;
504 if (match_str(state, index, "..")) prop_name = "..";
505 else prop_name = get_property_name(state, index);
507 if (!prop_name) {
508 // Literal character, e.g. {1?}
509 skip_whitespace(state, index);
510 int32_t grapheme = Text$get_grapheme_fast(state, (*index)++);
511 if (!match_grapheme(state, index, '}')) fail("Missing closing '}' in pattern: ", state->stack[0].text);
512 return PAT(PAT_GRAPHEME, .grapheme = grapheme);
513 } else if (strlen(prop_name) == 1) {
514 // Single letter names: {1+ A}
515 skip_whitespace(state, index);
516 if (!match_grapheme(state, index, '}')) fail("Missing closing '}' in pattern: ", state->stack[0].text);
517 return PAT(PAT_GRAPHEME, .grapheme = prop_name[0]);
520 skip_whitespace(state, index);
521 if (!match_grapheme(state, index, '}')) fail("Missing closing '}' in pattern: ", state->stack[0].text);
523 switch (tolower(prop_name[0])) {
524 case '.':
525 if (prop_name[1] == '.') {
526 if (negated) return ((pat_t){.tag = PAT_END, .min = min, .max = max, .non_capturing = true});
527 else return PAT(PAT_ANY);
529 break;
530 case 'a':
531 if (strcasecmp(prop_name, "authority") == 0) {
532 return PAT(PAT_FUNCTION, .fn = match_authority);
533 } else if (strcasecmp(prop_name, "alphanum") == 0 || strcasecmp(prop_name, "anum") == 0
534 || strcasecmp(prop_name, "alphanumeric") == 0) {
535 return PAT(PAT_FUNCTION, .fn = match_alphanumeric);
537 break;
538 case 'c':
539 if (strcasecmp(prop_name, "crlf") == 0) return PAT(PAT_FUNCTION, .fn = match_newline);
540 break;
541 case 'd':
542 if (strcasecmp(prop_name, "digit") == 0) {
543 return PAT(PAT_PROPERTY, .property = UC_PROPERTY_DECIMAL_DIGIT);
545 break;
546 case 'e':
547 if (strcasecmp(prop_name, "end") == 0) {
548 return PAT(PAT_END, .non_capturing = !negated);
549 } else if (strcasecmp(prop_name, "email") == 0) {
550 return PAT(PAT_FUNCTION, .fn = match_email);
552 #if _LIBUNISTRING_VERSION >= 0x0100000
553 else if (strcasecmp(prop_name, "emoji") == 0) {
554 return PAT(PAT_PROPERTY, .property = UC_PROPERTY_EMOJI);
556 #endif
557 break;
558 case 'h':
559 if (strcasecmp(prop_name, "host") == 0) {
560 return PAT(PAT_FUNCTION, .fn = match_host);
562 break;
563 case 'i':
564 if (strcasecmp(prop_name, "id") == 0) {
565 return PAT(PAT_FUNCTION, .fn = match_id);
566 } else if (strcasecmp(prop_name, "int") == 0) {
567 return PAT(PAT_FUNCTION, .fn = match_int);
568 } else if (strcasecmp(prop_name, "ipv4") == 0) {
569 return PAT(PAT_FUNCTION, .fn = match_ipv4);
570 } else if (strcasecmp(prop_name, "ipv6") == 0) {
571 return PAT(PAT_FUNCTION, .fn = match_ipv6);
572 } else if (strcasecmp(prop_name, "ip") == 0) {
573 return PAT(PAT_FUNCTION, .fn = match_ip);
575 break;
576 case 'l':
577 if (strcasecmp(prop_name, "letter") == 0) {
578 return PAT(PAT_PROPERTY, .property = UC_PROPERTY_ALPHABETIC);
580 case 'n':
581 if (strcasecmp(prop_name, "nl") == 0 || strcasecmp(prop_name, "newline") == 0) {
582 return PAT(PAT_FUNCTION, .fn = match_newline);
583 } else if (strcasecmp(prop_name, "num") == 0) {
584 return PAT(PAT_FUNCTION, .fn = match_num);
586 break;
587 case 's':
588 if (strcasecmp(prop_name, "start") == 0) {
589 return PAT(PAT_START, .non_capturing = !negated);
591 break;
592 case 'u':
593 if (strcasecmp(prop_name, "uri") == 0) {
594 return PAT(PAT_FUNCTION, .fn = match_uri);
595 } else if (strcasecmp(prop_name, "url") == 0) {
596 return PAT(PAT_FUNCTION, .fn = match_url);
598 break;
599 case 'w':
600 if (strcasecmp(prop_name, "word") == 0) {
601 return PAT(PAT_FUNCTION, .fn = match_id);
602 } else if (strcasecmp(prop_name, "ws") == 0 || strcasecmp(prop_name, "whitespace") == 0) {
603 return PAT(PAT_PROPERTY, .property = UC_PROPERTY_WHITE_SPACE);
605 break;
606 default: break;
609 uc_property_t prop = uc_property_byname(prop_name);
610 if (uc_property_is_valid(prop)) return PAT(PAT_PROPERTY, .property = prop);
612 ucs4_t grapheme = unicode_name_character(prop_name);
613 if (grapheme == UNINAME_INVALID) fail("Not a valid property or character name: ", prop_name);
614 return PAT(PAT_GRAPHEME, .grapheme = (int32_t)grapheme);
615 #undef PAT
616 } else {
617 return (pat_t){.tag = PAT_GRAPHEME,
618 .non_capturing = true,
619 .min = 1,
620 .max = 1,
621 .grapheme = Text$get_grapheme_fast(state, (*index)++)};
625 static int64_t match(Text_t text, int64_t text_index, Text_t pattern, int64_t pattern_index, capture_t *captures,
626 int64_t capture_index) {
627 if (pattern_index >= pattern.length) // End of the pattern
628 return 0;
630 int64_t start_index = text_index;
631 TextIter_t pattern_state = NEW_TEXT_ITER_STATE(pattern), text_state = NEW_TEXT_ITER_STATE(text);
632 pat_t pat = parse_next_pat(&pattern_state, &pattern_index);
634 if (pat.min == -1 && pat.max == -1) {
635 if (pat.tag == PAT_ANY && pattern_index >= pattern.length) {
636 pat.min = pat.max = MAX(1, text.length - text_index);
637 } else {
638 pat.min = 1;
639 pat.max = INT64_MAX;
643 int64_t capture_start = text_index;
644 int64_t count = 0, capture_len = 0, next_match_len = 0;
646 if (pat.tag == PAT_ANY && pattern_index >= pattern.length) {
647 int64_t remaining = text.length - text_index;
648 capture_len = remaining >= pat.min ? MIN(remaining, pat.max) : -1;
649 text_index += capture_len;
650 goto success;
653 if (pat.min == 0 && pattern_index < pattern.length) {
654 next_match_len =
655 match(text, text_index, pattern, pattern_index, captures, capture_index + (pat.non_capturing ? 0 : 1));
656 if (next_match_len >= 0) {
657 capture_len = 0;
658 goto success;
662 while (count < pat.max) {
663 int64_t match_len = match_pat(&text_state, text_index, pat);
664 if (match_len < 0) break;
665 capture_len += match_len;
666 text_index += match_len;
667 count += 1;
669 if (pattern_index < pattern.length) { // More stuff after this
670 if (count < pat.min) next_match_len = -1;
671 else
672 next_match_len = match(text, text_index, pattern, pattern_index, captures,
673 capture_index + (pat.non_capturing ? 0 : 1));
674 } else {
675 next_match_len = 0;
678 if (match_len == 0) {
679 if (next_match_len >= 0) {
680 // If we're good to go, no need to keep re-matching zero-length
681 // matches till we hit max:
682 count = pat.max;
683 break;
684 } else {
685 return -1;
689 if (pattern_index < pattern.length && next_match_len >= 0) break; // Next guy exists and wants to stop here
691 if (text_index >= text.length) break;
694 if (count < pat.min || next_match_len < 0) return -1;
696 success:
697 if (captures && capture_index < MAX_BACKREFS && !pat.non_capturing) {
698 if (pat.tag == PAT_PAIR || pat.tag == PAT_QUOTE) {
699 assert(capture_len > 0);
700 captures[capture_index] = (capture_t){
701 .index = capture_start + 1, // Skip leading quote/paren
702 .length = capture_len - 2, // Skip open/close
703 .occupied = true,
704 .recursive = (pat.tag == PAT_PAIR),
706 } else {
707 captures[capture_index] = (capture_t){
708 .index = capture_start,
709 .length = capture_len,
710 .occupied = true,
711 .recursive = false,
715 return (text_index - start_index) + next_match_len;
718 #undef EAT1
719 #undef EAT2
720 #undef EAT_MANY
722 static int64_t _find(Text_t text, Text_t pattern, int64_t first, int64_t last, int64_t *match_length,
723 capture_t *captures) {
724 int32_t first_grapheme = Text$get_grapheme(pattern, 0);
725 bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
726 && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
728 TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
729 for (int64_t i = first; i <= last; i++) {
730 // Optimization: quickly skip ahead to first char in pattern:
731 if (find_first) {
732 while (i < text.length && Text$get_grapheme_fast(&text_state, i) != first_grapheme)
733 ++i;
736 int64_t m = match(text, i, pattern, 0, captures, 0);
737 if (m >= 0) {
738 if (match_length) *match_length = m;
739 return i;
742 if (match_length) *match_length = -1;
743 return -1;
746 static OptionalPatternMatch find(Text_t text, Text_t pattern, Int_t from_index) {
747 int64_t first = Int64$from_int(from_index, false);
748 if (first == 0) fail("Invalid index: 0");
749 if (first < 0) first = text.length + first + 1;
750 if (first > text.length || first < 1) return NONE_MATCH;
752 capture_t captures[MAX_BACKREFS] = {};
753 int64_t len = 0;
754 int64_t found = _find(text, pattern, first - 1, text.length - 1, &len, captures);
755 if (found == -1) return NONE_MATCH;
757 List_t capture_list = {};
758 for (int i = 0; captures[i].occupied; i++) {
759 Text_t capture = Text$slice(text, I(captures[i].index + 1), I(captures[i].index + captures[i].length));
760 List$insert(&capture_list, &capture, I(0), sizeof(Text_t));
762 return (OptionalPatternMatch){
763 .text = Text$slice(text, I(found + 1), I(found + len)),
764 .index = I(found + 1),
765 .captures = capture_list,
769 PUREFUNC static bool Pattern$has(Text_t text, Text_t pattern) {
770 if (pattern.length == 0) {
771 return true;
772 } else if (Text$starts_with(pattern, Text("{start}"), &pattern)) {
773 int64_t m = match(text, 0, pattern, 0, NULL, 0);
774 return m >= 0;
775 } else if (Text$ends_with(text, Text("{end}"), NULL)) {
776 for (int64_t i = text.length - 1; i >= 0; i--) {
777 int64_t match_len = match(text, i, pattern, 0, NULL, 0);
778 if (match_len >= 0 && i + match_len == text.length) return true;
780 return false;
781 } else {
782 int64_t found = _find(text, pattern, 0, text.length - 1, NULL, NULL);
783 return (found >= 0);
787 static bool Pattern$matches(Text_t text, Text_t pattern) {
788 if (pattern.length == 0) return true;
789 capture_t captures[MAX_BACKREFS] = {};
790 int64_t match_len = match(text, 0, pattern, 0, NULL, 0);
791 return (match_len == text.length);
794 static bool Pattern$match_at(Text_t text, Text_t pattern, Int_t pos, PatternMatch *dest) {
795 if (pattern.length == 0) return true;
796 int64_t start = Int64$from_int(pos, false) - 1;
797 capture_t captures[MAX_BACKREFS] = {};
798 int64_t match_len = match(text, start, pattern, 0, captures, 0);
799 if (match_len < 0) return false;
801 List_t capture_list = {};
802 for (int i = 0; captures[i].occupied; i++) {
803 Text_t capture = Text$slice(text, I(captures[i].index + 1), I(captures[i].index + captures[i].length));
804 List$insert(&capture_list, &capture, I(0), sizeof(Text_t));
806 dest->text = Text$slice(text, I(start + 1), I(start + match_len));
807 dest->index = I(start + 1);
808 dest->captures = capture_list;
809 return true;
812 static OptionalList_t Pattern$captures(Text_t text, Text_t pattern) {
813 if (pattern.length == 0) return EMPTY_LIST;
814 capture_t captures[MAX_BACKREFS] = {};
815 int64_t match_len = match(text, 0, pattern, 0, captures, 0);
816 if (match_len != text.length) return NONE_LIST;
818 List_t capture_list = {};
819 for (int i = 0; captures[i].occupied; i++) {
820 Text_t capture = Text$slice(text, I(captures[i].index + 1), I(captures[i].index + captures[i].length));
821 List$insert(&capture_list, &capture, I(0), sizeof(Text_t));
823 return capture_list;
826 static List_t Pattern$find_all(Text_t text, Text_t pattern) {
827 if (text.length == 0 || pattern.length == 0) // special case
828 return EMPTY_LIST;
830 List_t matches = {};
831 for (int64_t i = 1;;) {
832 OptionalPatternMatch m = find(text, pattern, I(i));
833 if (m.is_none) break;
834 i = Int64$from_int(m.index, false) + m.text.length;
835 List$insert(&matches, &m, I_small(0), sizeof(PatternMatch));
837 return matches;
840 typedef struct {
841 TextIter_t state;
842 Int_t i;
843 Text_t pattern;
844 } match_iter_state_t;
846 static OptionalPatternMatch next_match(match_iter_state_t *state) {
847 if (Int64$from_int(state->i, false) > state->state.stack[0].text.length) return NONE_MATCH;
849 OptionalPatternMatch m = find(state->state.stack[0].text, state->pattern, state->i);
850 if (m.is_none) // No match
851 state->i = I(state->state.stack[0].text.length + 1);
852 else state->i = Int$plus(m.index, I(MAX(1, m.text.length)));
853 return m;
856 static Closure_t Pattern$by_match(Text_t text, Text_t pattern) {
857 return (Closure_t){
858 .fn = (void *)next_match,
859 .userdata = new (match_iter_state_t, .state = NEW_TEXT_ITER_STATE(text), .i = I_small(1), .pattern = pattern),
863 static bool substring_match_at(TextIter_t *haystack, TextIter_t *needle, int64_t pos) {
864 for (int64_t i = 0; i < needle->stack[0].text.length; i++) {
865 if (Text$get_grapheme_fast(haystack, pos + i) != Text$get_grapheme_fast(needle, i)) return false;
867 return true;
870 static Text_t apply_backrefs(Text_t text, List_t recursive_replacements, Text_t replacement, Text_t backref_marker,
871 capture_t *captures) {
872 if (backref_marker.length == 0) return replacement;
874 Text_t ret = Text("");
875 TextIter_t replacement_state = NEW_TEXT_ITER_STATE(replacement);
876 TextIter_t backref_state = NEW_TEXT_ITER_STATE(backref_marker);
877 int32_t first_grapheme = Text$get_grapheme_fast(&backref_state, 0);
878 int64_t nonmatching_pos = 0;
879 for (int64_t pos = 0; pos < replacement.length;) {
880 // Optimization: quickly skip ahead to first char in the backref pattern:
881 while (pos + 1 < replacement.length && Text$get_grapheme_fast(&replacement_state, pos) != first_grapheme)
882 ++pos;
884 // If it's not a backref marker, proceed normally
885 if (!substring_match_at(&replacement_state, &backref_state, pos)) {
886 pos += 1;
887 continue;
890 // For double backrefs like "@@", treat it as an escape
891 if (substring_match_at(&replacement_state, &backref_state, pos + backref_marker.length)) {
892 if (pos > nonmatching_pos) {
893 Text_t before_slice = Text$slice(replacement, I(nonmatching_pos + 1), I(pos));
894 ret = Texts(ret, before_slice);
896 ret = Texts(ret, backref_marker);
897 pos += 2 * backref_marker.length;
898 nonmatching_pos = pos;
899 continue;
902 int64_t after_backref = pos + backref_marker.length;
903 int64_t backref = parse_int(&replacement_state, &after_backref);
904 if (after_backref == pos + backref_marker.length) { // Not actually a backref if there's no number
905 pos += 1;
906 continue;
908 if (backref < 0 || backref >= MAX_BACKREFS)
909 fail("Invalid backref index: ", backref, " (only 0-", MAX_BACKREFS - 1, " are allowed)");
911 if (!captures[backref].occupied) fail("There is no capture number ", backref, "!");
913 if (Text$get_grapheme_fast(&replacement_state, after_backref) == ';')
914 after_backref += 1; // skip optional semicolon
916 Text_t backref_text =
917 Text$slice(text, I(captures[backref].index + 1), I(captures[backref].index + captures[backref].length));
919 if (captures[backref].recursive && recursive_replacements.length > 0)
920 backref_text = replace_list(backref_text, recursive_replacements, backref_marker, true);
922 if (pos > nonmatching_pos) {
923 Text_t before_slice = Text$slice(replacement, I(nonmatching_pos + 1), I(pos));
924 ret = Text$concat(ret, before_slice, backref_text);
925 } else {
926 ret = Text$concat(ret, backref_text);
929 pos = after_backref;
930 nonmatching_pos = pos;
932 if (nonmatching_pos < replacement.length) {
933 Text_t last_slice = Text$slice(replacement, I(nonmatching_pos + 1), I(replacement.length));
934 ret = Text$concat(ret, last_slice);
936 return ret;
939 static Text_t Pattern$replace(Text_t text, Text_t pattern, Text_t replacement, Text_t backref_marker, bool recursive) {
940 if (text.length == 0 || pattern.length == 0) return text;
941 Text_t ret = EMPTY_TEXT;
943 int32_t first_grapheme = Text$get_grapheme(pattern, 0);
944 bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
945 && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
947 Text_t entries[2] = {pattern, replacement};
948 List_t replacements = {
949 .data = entries,
950 .length = 1,
951 .stride = sizeof(entries),
954 TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
955 int64_t nonmatching_pos = 0;
956 for (int64_t pos = 0; pos < text.length;) {
957 // Optimization: quickly skip ahead to first char in pattern:
958 if (find_first) {
959 while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme)
960 ++pos;
963 capture_t captures[MAX_BACKREFS] = {};
964 int64_t match_len = match(text, pos, pattern, 0, captures, 1);
965 if (match_len < 0) {
966 pos += 1;
967 continue;
969 captures[0] = (capture_t){
970 .index = pos,
971 .length = match_len,
972 .occupied = true,
973 .recursive = false,
976 Text_t replacement_text =
977 apply_backrefs(text, recursive ? replacements : (List_t){}, replacement, backref_marker, captures);
978 if (pos > nonmatching_pos) {
979 Text_t before_slice = Text$slice(text, I(nonmatching_pos + 1), I(pos));
980 ret = Text$concat(ret, before_slice, replacement_text);
981 } else {
982 ret = Text$concat(ret, replacement_text);
984 nonmatching_pos = pos + match_len;
985 pos += MAX(match_len, 1);
987 if (nonmatching_pos < text.length) {
988 Text_t last_slice = Text$slice(text, I(nonmatching_pos + 1), I(text.length));
989 ret = Text$concat(ret, last_slice);
991 return ret;
994 static Text_t Pattern$trim(Text_t text, Text_t pattern, bool trim_left, bool trim_right) {
995 if (text.length == 0 || pattern.length == 0) return text;
996 int64_t first = 0, last = text.length - 1;
997 if (trim_left) {
998 int64_t match_len = match(text, 0, pattern, 0, NULL, 0);
999 if (match_len > 0) first = match_len;
1002 if (trim_right) {
1003 for (int64_t i = text.length - 1; i >= first; i--) {
1004 int64_t match_len = match(text, i, pattern, 0, NULL, 0);
1005 if (match_len > 0 && i + match_len == text.length) last = i - 1;
1008 return Text$slice(text, I(first + 1), I(last + 1));
1011 static Text_t Pattern$map(Text_t text, Text_t pattern, Closure_t fn, bool recursive) {
1012 if (text.length == 0 || pattern.length == 0) return text;
1013 Text_t ret = EMPTY_TEXT;
1015 int32_t first_grapheme = Text$get_grapheme(pattern, 0);
1016 bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
1017 && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
1019 TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
1020 int64_t nonmatching_pos = 0;
1022 Text_t (*text_mapper)(PatternMatch, void *) = fn.fn;
1023 for (int64_t pos = 0; pos < text.length; pos++) {
1024 // Optimization: quickly skip ahead to first char in pattern:
1025 if (find_first) {
1026 while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme)
1027 ++pos;
1030 capture_t captures[MAX_BACKREFS] = {};
1031 int64_t match_len = match(text, pos, pattern, 0, captures, 0);
1032 if (match_len < 0) continue;
1034 PatternMatch m = {
1035 .text = Text$slice(text, I(pos + 1), I(pos + match_len)),
1036 .index = I(pos + 1),
1037 .captures = {},
1039 for (int i = 0; captures[i].occupied; i++) {
1040 Text_t capture = Text$slice(text, I(captures[i].index + 1), I(captures[i].index + captures[i].length));
1041 if (recursive) capture = Pattern$map(capture, pattern, fn, recursive);
1042 List$insert(&m.captures, &capture, I(0), sizeof(Text_t));
1045 Text_t replacement = text_mapper(m, fn.userdata);
1046 if (pos > nonmatching_pos) {
1047 Text_t before_slice = Text$slice(text, I(nonmatching_pos + 1), I(pos));
1048 ret = Text$concat(ret, before_slice, replacement);
1049 } else {
1050 ret = Text$concat(ret, replacement);
1052 nonmatching_pos = pos + match_len;
1053 pos += (match_len - 1);
1055 if (nonmatching_pos < text.length) {
1056 Text_t last_slice = Text$slice(text, I(nonmatching_pos + 1), I(text.length));
1057 ret = Text$concat(ret, last_slice);
1059 return ret;
1062 static void Pattern$each(Text_t text, Text_t pattern, Closure_t fn, bool recursive) {
1063 if (text.length == 0 || pattern.length == 0) return;
1064 int32_t first_grapheme = Text$get_grapheme(pattern, 0);
1065 bool find_first = (first_grapheme != '{' && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_QUOTATION_MARK)
1066 && !uc_is_property((ucs4_t)first_grapheme, UC_PROPERTY_PAIRED_PUNCTUATION));
1068 TextIter_t text_state = NEW_TEXT_ITER_STATE(text);
1069 void (*action)(PatternMatch, void *) = fn.fn;
1070 for (int64_t pos = 0; pos < text.length; pos++) {
1071 // Optimization: quickly skip ahead to first char in pattern:
1072 if (find_first) {
1073 while (pos < text.length && Text$get_grapheme_fast(&text_state, pos) != first_grapheme)
1074 ++pos;
1077 capture_t captures[MAX_BACKREFS] = {};
1078 int64_t match_len = match(text, pos, pattern, 0, captures, 0);
1079 if (match_len < 0) continue;
1081 PatternMatch m = {
1082 .text = Text$slice(text, I(pos + 1), I(pos + match_len)),
1083 .index = I(pos + 1),
1084 .captures = {},
1086 for (int i = 0; captures[i].occupied; i++) {
1087 Text_t capture = Text$slice(text, I(captures[i].index + 1), I(captures[i].index + captures[i].length));
1088 if (recursive) Pattern$each(capture, pattern, fn, recursive);
1089 List$insert(&m.captures, &capture, I(0), sizeof(Text_t));
1092 action(m, fn.userdata);
1093 pos += (match_len - 1);
1097 Text_t replace_list(Text_t text, List_t replacements, Text_t backref_marker, bool recursive) {
1098 if (text.length == 0 || replacements.length == 0) return text;
1100 Text_t ret = EMPTY_TEXT;
1102 int64_t nonmatch_pos = 0;
1103 for (int64_t pos = 0; pos < text.length;) {
1104 // Find the first matching pattern at this position:
1105 for (int64_t i = 0; i < replacements.length; i++) {
1106 Text_t pattern = *(Text_t *)(replacements.data + i * replacements.stride);
1107 capture_t captures[MAX_BACKREFS] = {};
1108 int64_t len = match(text, pos, pattern, 0, captures, 1);
1109 if (len < 0) continue;
1110 captures[0].index = pos;
1111 captures[0].length = len;
1113 // If we skipped over some non-matching text before finding a match,
1114 // insert it here:
1115 if (pos > nonmatch_pos) {
1116 Text_t before_slice = Text$slice(text, I(nonmatch_pos + 1), I(pos));
1117 ret = Text$concat(ret, before_slice);
1120 // Concatenate the replacement:
1121 Text_t replacement = *(Text_t *)(replacements.data + i * replacements.stride + sizeof(Text_t));
1122 Text_t replacement_text =
1123 apply_backrefs(text, recursive ? replacements : (List_t){}, replacement, backref_marker, captures);
1124 ret = Text$concat(ret, replacement_text);
1125 pos += MAX(len, 1);
1126 nonmatch_pos = pos;
1127 goto next_pos;
1130 pos += 1;
1131 next_pos:
1132 continue;
1135 if (nonmatch_pos <= text.length) {
1136 Text_t last_slice = Text$slice(text, I(nonmatch_pos + 1), I(text.length));
1137 ret = Text$concat(ret, last_slice);
1139 return ret;
1142 static Text_t Pattern$replace_all(Text_t text, Table_t replacements, Text_t backref_marker, bool recursive) {
1143 return replace_list(text, replacements.entries, backref_marker, recursive);
1146 static List_t Pattern$split(Text_t text, Text_t pattern) {
1147 if (text.length == 0) // special case
1148 return EMPTY_LIST;
1150 if (pattern.length == 0) // special case
1151 return Text$clusters(text);
1153 List_t chunks = {};
1155 int64_t i = 0;
1156 for (;;) {
1157 int64_t len = 0;
1158 int64_t found = _find(text, pattern, i, text.length - 1, &len, NULL);
1159 if (found == i && len == 0) found = _find(text, pattern, i + 1, text.length - 1, &len, NULL);
1160 if (found < 0) break;
1161 Text_t chunk = Text$slice(text, I(i + 1), I(found));
1162 List$insert(&chunks, &chunk, I_small(0), sizeof(Text_t));
1163 i = MAX(found + len, i + 1);
1166 Text_t last_chunk = Text$slice(text, I(i + 1), I(text.length));
1167 List$insert(&chunks, &last_chunk, I_small(0), sizeof(Text_t));
1169 return chunks;
1172 typedef struct {
1173 TextIter_t state;
1174 int64_t i;
1175 Text_t pattern;
1176 } split_iter_state_t;
1178 static OptionalText_t next_split(split_iter_state_t *state) {
1179 Text_t text = state->state.stack[0].text;
1180 if (state->i >= text.length) {
1181 if (state->pattern.length > 0 && state->i == text.length) { // special case
1182 state->i = text.length + 1;
1183 return EMPTY_TEXT;
1185 return NONE_TEXT;
1188 if (state->pattern.length == 0) { // special case
1189 Text_t ret = Text$cluster(text, I(state->i + 1));
1190 state->i += 1;
1191 return ret;
1194 int64_t start = state->i;
1195 int64_t len = 0;
1196 int64_t found = _find(text, state->pattern, start, text.length - 1, &len, NULL);
1198 if (found == start && len == 0) found = _find(text, state->pattern, start + 1, text.length - 1, &len, NULL);
1200 if (found >= 0) {
1201 state->i = MAX(found + len, state->i + 1);
1202 return Text$slice(text, I(start + 1), I(found));
1203 } else {
1204 state->i = state->state.stack[0].text.length + 1;
1205 return Text$slice(text, I(start + 1), I(text.length));
1209 static Closure_t Pattern$by_split(Text_t text, Text_t pattern) {
1210 return (Closure_t){
1211 .fn = (void *)next_split,
1212 .userdata = new (split_iter_state_t, .state = NEW_TEXT_ITER_STATE(text), .i = 0, .pattern = pattern),
1216 static Text_t Pattern$escape_text(Text_t text) {
1217 // TODO: optimize for spans of non-escaped text
1218 Text_t ret = EMPTY_TEXT;
1219 TextIter_t state = NEW_TEXT_ITER_STATE(text);
1220 for (int64_t i = 0; i < text.length; i++) {
1221 uint32_t g = Text$get_main_grapheme_fast(&state, i);
1222 if (g == '{') {
1223 ret = Text$concat(ret, Text("{1{}"));
1224 } else if (g == '?' || uc_is_property_quotation_mark(g)
1225 || (uc_is_property_paired_punctuation(g) && uc_is_property_left_of_pair(g))) {
1226 ret = Text$concat(ret, Text("{1"), Text$slice(text, I(i + 1), I(i + 1)), Text("}"));
1227 } else {
1228 ret = Text$concat(ret, Text$slice(text, I(i + 1), I(i + 1)));
1231 return ret;
1234 static Text_t Pattern$as_text(const void *obj, bool colorize, const TypeInfo_t *info) {
1235 (void)info;
1236 if (!obj) return Text("Pattern");
1238 Text_t pat = *(Text_t *)obj;
1239 Text_t quote = Pattern$has(pat, Text("/")) && !Pattern$has(pat, Text("|")) ? Text("|") : Text("/");
1240 return Text$concat(colorize ? Text("\x1b[1m$\033[m") : Text("$"), Text$quoted(pat, colorize, quote));