code / bp

Lines4.3K C3.3K Markdown541 YAML273 make110 Shell77 Lua54
(972 lines)
1 //
2 // match.c - Code for the BP virtual machine that performs the matching.
3 //
5 #include <err.h>
6 #include <limits.h>
7 #include <setjmp.h>
8 #include <stdbool.h>
9 #include <stdio.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <sys/param.h>
14 #include "match.h"
15 #include "pattern.h"
16 #include "utf8.h"
17 #include "utils.h"
19 #define MAX_CACHE_SIZE (1 << 14)
21 // Cache entries for results of matching a pattern at a string position
22 typedef struct cache_entry_s {
23 bp_pat_t *pat;
24 const char *start;
25 // Cache entries use a chained scatter approach modeled after Lua's tables
26 struct cache_entry_s *next_probe;
27 } cache_entry_t;
29 // Cache uses a hash table to store places where matches will always fail
30 typedef struct {
31 unsigned int size, occupancy, next_free;
32 cache_entry_t *fails;
33 } cache_t;
35 // Data structure for holding ambient state values during matching
36 typedef struct match_ctx_s {
37 struct match_ctx_s *parent_ctx;
38 bp_pat_t *defs;
39 cache_t *cache;
40 const char *start, *end;
41 jmp_buf error_jump;
42 bool ignorecase;
43 } match_ctx_t;
45 // New match objects are either recycled from unused match objects or allocated
46 // from the heap. While it is in use, the match object is stored in the
47 // `in_use_matches` linked list. Once it is no longer needed, it is moved to
48 // the `unused_matches` linked list so it can be reused without the need for
49 // additional calls to malloc/free. Thus, it is an invariant that every match
50 // object is in one of these two lists:
51 static bp_match_t *unused_matches = NULL;
52 static bp_match_t *in_use_matches = NULL;
54 static void default_error_handler(char **msg) { errx(EXIT_FAILURE, "%s", *msg); }
56 static bp_errhand_t error_handler = default_error_handler;
58 public
59 bp_errhand_t bp_set_error_handler(bp_errhand_t new_handler) {
60 bp_errhand_t old_handler = error_handler;
61 error_handler = new_handler;
62 return old_handler;
65 #define MATCHES(...) \
66 (bp_match_t *[]) { __VA_ARGS__, NULL }
68 __attribute__((hot, nonnull(1, 2, 3))) static bp_match_t *match(match_ctx_t *ctx, const char *str, bp_pat_t *pat);
69 __attribute__((returns_nonnull)) static bp_match_t *new_match(bp_pat_t *pat, const char *start, const char *end,
70 bp_match_t *children[]);
72 char *error_message = NULL;
74 __attribute__((format(printf, 2, 3))) static inline void match_error(match_ctx_t *ctx, const char *fmt, ...) {
75 va_list args;
76 va_start(args, fmt);
77 if (error_message) free(error_message);
78 vasprintf(&error_message, fmt, args);
79 va_end(args);
80 longjmp(ctx->error_jump, 1);
83 static bp_match_t *clone_match(bp_match_t *m) {
84 if (!m) return NULL;
85 bp_match_t *ret = new_match(m->pat, m->start, m->end, NULL);
86 if (m->children) {
87 size_t child_cap = 0, nchildren = 0;
88 if (!m->children[0] || !m->children[1] || !m->children[2]) {
89 child_cap = 3;
90 ret->children = ret->_children;
92 for (int i = 0; m->children[i]; i++) {
93 if (nchildren + 1 >= child_cap) {
94 ret->children = grow(ret->children, child_cap += 5);
95 for (size_t j = nchildren; j < child_cap; j++)
96 ret->children[j] = NULL;
98 ret->children[nchildren++] = clone_match(m->children[i]);
101 return ret;
104 // Prepend to a doubly linked list
105 static inline void gc_list_prepend(bp_match_t **head, bp_match_t *m) {
106 if (m->gc.home) errx(1, "Node already has a home");
107 m->gc.home = head;
108 m->gc.next = *head;
109 if (*head) (*head)->gc.home = &m->gc.next;
110 *head = m;
113 // Remove from a doubly linked list
114 static inline void gc_list_remove(bp_match_t *m) {
115 if (!m->gc.home) errx(1, "Attempt to remove something that isn't in a list");
116 *m->gc.home = m->gc.next;
117 if (m->gc.next) m->gc.next->gc.home = m->gc.home;
118 m->gc.home = NULL;
119 m->gc.next = NULL;
123 // Hash a string position/pattern.
125 static inline size_t hash(const char *str, size_t pat_id) { return (size_t)str + 2 * pat_id; }
128 // Check if we have cached a failure to match a given pattern at the given position.
130 static bool has_cached_failure(match_ctx_t *ctx, const char *str, bp_pat_t *pat) {
131 if (!ctx->cache->fails) return false;
132 for (cache_entry_t *fail = &ctx->cache->fails[hash(str, pat->id) & (ctx->cache->size - 1)]; fail;
133 fail = fail->next_probe) {
134 if (fail->pat == pat && fail->start == str) return true;
136 return false;
140 // Insert into the hash table using a chained scatter table approach.
142 static void _hash_insert(cache_t *cache, const char *str, bp_pat_t *pat) {
143 size_t h = hash(str, pat->id) & (cache->size - 1);
144 if (cache->fails[h].pat == NULL) { // No collision
145 cache->fails[h].pat = pat;
146 cache->fails[h].start = str;
147 cache->fails[h].next_probe = NULL;
148 ++cache->occupancy;
149 return;
152 if (cache->fails[h].pat == pat && cache->fails[h].start == str) return; // Duplicate entry, just leave it be
154 // Shuffle the colliding entry along to a free space:
155 while (cache->fails[cache->next_free].pat)
156 ++cache->next_free;
157 cache_entry_t *free_slot = &cache->fails[cache->next_free];
158 *free_slot = cache->fails[h];
159 size_t h_orig = hash(free_slot->start, free_slot->pat->id) & (cache->size - 1);
161 // Put the new entry in its desired slot
162 cache->fails[h].pat = pat;
163 cache->fails[h].start = str;
164 cache->fails[h].next_probe = h_orig == h ? free_slot : NULL;
165 ++cache->occupancy;
167 if (h_orig != h) { // Maintain the chain that points to the colliding entry
168 cache_entry_t *prev = &cache->fails[h_orig]; // Start of the chain
169 while (prev->next_probe != &cache->fails[h])
170 prev = prev->next_probe;
171 prev->next_probe = free_slot;
176 // Save a match in the cache.
178 static void cache_failure(match_ctx_t *ctx, const char *str, bp_pat_t *pat) {
179 cache_t *cache = ctx->cache;
180 // Grow the hash if needed (>99% utilization):
181 if (cache->occupancy + 1 > (cache->size * 99) / 100) {
182 cache_entry_t *old_fails = cache->fails;
183 size_t old_size = cache->size;
184 cache->size = old_size == 0 ? 16 : 2 * old_size;
185 cache->fails = new (cache_entry_t[cache->size]);
186 cache->next_free = 0;
188 // Rehash:
189 for (size_t i = 0; i < old_size; i++) {
190 if (old_fails[i].pat) _hash_insert(cache, old_fails[i].start, old_fails[i].pat);
192 if (old_fails) delete (&old_fails);
195 _hash_insert(cache, str, pat);
199 // Clear and deallocate the cache.
201 void cache_destroy(match_ctx_t *ctx) {
202 cache_t *cache = ctx->cache;
203 if (cache->fails) delete (&cache->fails);
204 memset(cache, 0, sizeof(cache_t));
208 // Look up a pattern definition by name from a definition pattern.
210 static bp_pat_t *_lookup_def(match_ctx_t *ctx, bp_pat_t *defs, const char *name, size_t namelen) {
211 while (defs != NULL) {
212 if (defs->type == BP_CHAIN) {
213 auto chain = When(defs, BP_CHAIN);
214 bp_pat_t *second = _lookup_def(ctx, chain->second, name, namelen);
215 if (second) return second;
216 defs = chain->first;
217 } else if (defs->type == BP_DEFINITIONS) {
218 auto def = When(defs, BP_DEFINITIONS);
219 if (namelen == def->namelen && strncmp(def->name, name, namelen) == 0) return def->meaning;
220 defs = def->next_def;
221 } else {
222 match_error(ctx, "Invalid pattern type in definitions");
223 return NULL;
226 return NULL;
230 // Look up a pattern definition by name from a context.
232 __attribute__((nonnull(2))) bp_pat_t *lookup_ctx(match_ctx_t *ctx, const char *name, size_t namelen) {
233 for (; ctx; ctx = ctx->parent_ctx) {
234 bp_pat_t *def = _lookup_def(ctx, ctx->defs, name, namelen);
235 if (def) return def;
237 return NULL;
241 // If the given pattern is a reference, look it up and return the referenced
242 // pattern. This is used for an optimization to avoid repeated lookups.
244 __attribute__((nonnull(1))) static inline bp_pat_t *deref(match_ctx_t *ctx, bp_pat_t *pat) {
245 if (pat && pat->type == BP_REF) {
246 auto ref = When(pat, BP_REF);
247 bp_pat_t *def = lookup_ctx(ctx, ref->name, ref->len);
248 if (def) return def;
250 return pat;
254 // Find and return the first and simplest pattern that will definitely have to
255 // match for the whole pattern to match (if any). Ideally, this would be a
256 // string literal that can be quickly scanned for.
258 static bp_pat_t *get_prerequisite(match_ctx_t *ctx, bp_pat_t *pat) {
259 int derefs = 0;
260 for (bp_pat_t *p = pat; p;) {
261 switch (p->type) {
262 case BP_BEFORE: p = When(p, BP_BEFORE)->pat; break;
263 case BP_REPEAT:
264 if (When(p, BP_REPEAT)->min == 0) return p;
265 p = When(p, BP_REPEAT)->repeat_pat;
266 break;
267 case BP_CAPTURE: p = When(p, BP_CAPTURE)->pat; break;
268 case BP_TAGGED: p = When(p, BP_TAGGED)->pat; break;
269 case BP_CHAIN: {
270 auto chain = When(p, BP_CHAIN);
271 // If pattern is something like (|"foo"|), then use "foo" as the first thing to scan for
272 p = chain->first->max_matchlen == 0 ? chain->second : chain->first;
273 break;
275 case BP_MATCH: p = When(p, BP_MATCH)->pat; break;
276 case BP_NOT_MATCH: p = When(p, BP_NOT_MATCH)->pat; break;
277 case BP_REPLACE: p = When(p, BP_REPLACE)->pat; break;
278 case BP_REF: {
279 if (++derefs > 10) return p; // In case of left recursion
280 bp_pat_t *p2 = deref(ctx, p);
281 if (p2 == p) return p2;
282 p = p2;
283 break;
285 default: return p;
288 return pat;
292 // Find the next match after prev (or the first match if prev is NULL)
294 __attribute__((nonnull(1, 2, 3))) static bp_match_t *_next_match(match_ctx_t *ctx, const char *str, bp_pat_t *pat,
295 bp_pat_t *skip) {
296 // Clear the cache so it's not full of old cache values from different parts of the file:
297 cache_destroy(ctx);
299 bp_pat_t *first = get_prerequisite(ctx, pat);
301 // Don't bother looping if this can only match at the start/end:
302 if (first->type == BP_START_OF_FILE) return match(ctx, str, pat);
303 else if (first->type == BP_END_OF_FILE) return match(ctx, ctx->end, pat);
305 // Performance optimization: if the pattern starts with a string literal,
306 // we can just rely on the highly optimized memmem() implementation to skip
307 // past areas where we know we won't find a match.
308 if (!skip && first->type == BP_STRING && first->min_matchlen > 0) {
309 char *found = ctx->ignorecase ? strcasestr(str, When(first, BP_STRING)->string)
310 : memmem(str, (size_t)(ctx->end - str), When(first, BP_STRING)->string,
311 strlen(When(first, BP_STRING)->string));
312 str = found ? found : ctx->end;
313 } else if (!skip && str > ctx->start && (first->type == BP_START_OF_LINE || first->type == BP_END_OF_LINE)) {
314 char *found = memchr(str, '\n', (size_t)(ctx->end - str));
315 str = found ? (first->type == BP_START_OF_LINE ? found + 1 : found) : ctx->end;
318 do {
319 bp_match_t *m = match(ctx, str, pat);
320 if (m) return m;
321 bp_match_t *skipped = skip ? match(ctx, str, skip) : NULL;
322 if (skipped) {
323 str = skipped->end > str ? skipped->end : str + 1;
324 recycle_match(&skipped);
325 } else str = next_char(str, ctx->end);
326 } while (str < ctx->end);
327 return NULL;
331 // Attempt to match the given pattern against the input string and return a
332 // match object, or NULL if no match is found.
333 // The returned value should be free()'d to avoid memory leaking.
335 static bp_match_t *match(match_ctx_t *ctx, const char *str, bp_pat_t *pat) {
336 switch (pat->type) {
337 case BP_DEFINITIONS: {
338 match_ctx_t ctx2 = *ctx;
339 ctx2.cache = &(cache_t){0};
340 ctx2.parent_ctx = ctx;
341 ctx2.defs = pat;
342 bp_match_t *m = match(&ctx2, str, When(pat, BP_DEFINITIONS)->meaning);
343 cache_destroy(&ctx2);
344 return m;
346 case BP_LEFTRECURSION: {
347 // Left recursion occurs when a pattern directly or indirectly
348 // invokes itself at the same position in the text. It's handled as
349 // a special case, but if a pattern invokes itself at a later
350 // point, it can be handled with normal recursion.
351 // See: left-recursion.md for more details.
352 auto leftrec = When(pat, BP_LEFTRECURSION);
353 if (str == leftrec->at) {
354 leftrec->visited = true;
355 return clone_match(leftrec->match);
356 } else {
357 return match(leftrec->ctx, str, leftrec->fallback);
360 case BP_ANYCHAR: {
361 return (str < ctx->end && *str != '\n') ? new_match(pat, str, next_char(str, ctx->end), NULL) : NULL;
363 case BP_ID_START: {
364 return (str < ctx->end && isidstart(str, ctx->end)) ? new_match(pat, str, next_char(str, ctx->end), NULL)
365 : NULL;
367 case BP_ID_CONTINUE: {
368 return (str < ctx->end && isidcontinue(str, ctx->end)) ? new_match(pat, str, next_char(str, ctx->end), NULL)
369 : NULL;
371 case BP_START_OF_FILE: {
372 return (str == ctx->start) ? new_match(pat, str, str, NULL) : NULL;
374 case BP_START_OF_LINE: {
375 return (str == ctx->start || str[-1] == '\n') ? new_match(pat, str, str, NULL) : NULL;
377 case BP_END_OF_FILE: {
378 return (str == ctx->end || (str == ctx->end - 1 && *str == '\n')) ? new_match(pat, str, str, NULL) : NULL;
380 case BP_END_OF_LINE: {
381 return (str == ctx->end || *str == '\n') ? new_match(pat, str, str, NULL) : NULL;
383 case BP_WORD_BOUNDARY: {
384 return (str == ctx->start || isidcontinue(str, ctx->end) != isidcontinue(prev_char(ctx->start, str), ctx->end))
385 ? new_match(pat, str, str, NULL)
386 : NULL;
388 case BP_STRING: {
389 if (&str[pat->min_matchlen] > ctx->end) return NULL;
390 if (pat->min_matchlen > 0
391 && (ctx->ignorecase ? strncasecmp : strncmp)(str, When(pat, BP_STRING)->string, pat->min_matchlen) != 0)
392 return NULL;
393 return new_match(pat, str, str + pat->min_matchlen, NULL);
395 case BP_RANGE: {
396 if (str >= ctx->end) return NULL;
397 auto range = When(pat, BP_RANGE);
398 if ((unsigned char)*str < range->low || (unsigned char)*str > range->high) return NULL;
399 return new_match(pat, str, str + 1, NULL);
401 case BP_NOT: {
402 bp_match_t *m = match(ctx, str, When(pat, BP_NOT)->pat);
403 if (m != NULL) {
404 recycle_match(&m);
405 return NULL;
407 return new_match(pat, str, str, NULL);
409 case BP_UPTO:
410 case BP_UPTO_STRICT: {
411 bp_match_t *m = new_match(pat, str, str, NULL);
412 bp_pat_t *target =
413 deref(ctx, pat->type == BP_UPTO ? When(pat, BP_UPTO)->target : When(pat, BP_UPTO_STRICT)->target),
414 *skip = deref(ctx, pat->type == BP_UPTO ? When(pat, BP_UPTO)->skip : When(pat, BP_UPTO_STRICT)->skip);
415 if (!target && !skip) {
416 while (str < ctx->end && *str != '\n')
417 ++str;
418 m->end = str;
419 return m;
422 size_t child_cap = 0, nchildren = 0;
423 for (const char *prev = NULL; prev < str;) {
424 prev = str;
425 if (target) {
426 bp_match_t *p = match(ctx, str, target);
427 if (p != NULL) {
428 recycle_match(&p);
429 m->end = str;
430 return m;
432 } else if (str == ctx->end || *str == '\n') {
433 m->end = str;
434 return m;
436 if (skip) {
437 bp_match_t *s = match(ctx, str, skip);
438 if (s != NULL) {
439 str = s->end;
440 if (nchildren + 2 >= child_cap) {
441 m->children = grow(m->children, child_cap += 5);
442 for (size_t i = nchildren; i < child_cap; i++)
443 m->children[i] = NULL;
445 m->children[nchildren++] = s;
446 continue;
449 // This isn't in the for() structure because there needs to
450 // be at least once chance to match the pattern, even if
451 // we're at the end of the string already (e.g. "..$").
452 if (str < ctx->end && *str != '\n' && pat->type != BP_UPTO_STRICT) str = next_char(str, ctx->end);
454 recycle_match(&m);
455 return NULL;
457 case BP_REPEAT: {
458 bp_match_t *m = new_match(pat, str, str, NULL);
459 size_t reps = 0;
460 auto repeat = When(pat, BP_REPEAT);
461 bp_pat_t *repeating = deref(ctx, repeat->repeat_pat);
462 bp_pat_t *sep = deref(ctx, repeat->sep);
463 size_t child_cap = 0, nchildren = 0;
464 for (reps = 0; repeat->max == -1 || reps < (size_t)repeat->max; ++reps) {
465 const char *start = str;
466 // Separator
467 bp_match_t *msep = NULL;
468 if (sep != NULL && reps > 0) {
469 msep = match(ctx, str, sep);
470 if (msep == NULL) break;
471 str = msep->end;
473 bp_match_t *mp = match(ctx, str, repeating);
474 if (mp == NULL) {
475 str = start;
476 if (msep) recycle_match(&msep);
477 break;
479 if (mp->end == start && reps > 0) {
480 // Since no forward progress was made on either `repeating`
481 // or `sep` and BP does not have mutable state, it's
482 // guaranteed that no progress will be made on the next
483 // loop either. We know that this will continue to loop
484 // until reps==max, so let's just cut to the chase instead
485 // of looping infinitely.
486 if (msep) recycle_match(&msep);
487 recycle_match(&mp);
488 if (repeat->max == -1) reps = ~(size_t)0;
489 else reps = (size_t)repeat->max;
490 break;
492 if (msep) {
493 if (nchildren + 2 >= child_cap) {
494 m->children = grow(m->children, child_cap += 5);
495 for (size_t i = nchildren; i < child_cap; i++)
496 m->children[i] = NULL;
498 m->children[nchildren++] = msep;
501 if (nchildren + 2 >= child_cap) {
502 m->children = grow(m->children, child_cap += 5);
503 for (size_t i = nchildren; i < child_cap; i++)
504 m->children[i] = NULL;
506 m->children[nchildren++] = mp;
507 str = mp->end;
510 if (reps < (size_t)repeat->min) {
511 recycle_match(&m);
512 return NULL;
514 m->end = str;
515 return m;
517 case BP_AFTER: {
518 bp_pat_t *back = deref(ctx, When(pat, BP_AFTER)->pat);
519 if (!back) return NULL;
521 // We only care about the region from the backtrack pos up to the
522 // current pos, so mock it out as a file slice.
523 // TODO: this breaks ^/^^/$/$$, but that can probably be ignored
524 // because you rarely need to check those in a backtrack.
525 match_ctx_t slice_ctx = *ctx;
526 slice_ctx.cache = &(cache_t){0};
527 slice_ctx.start = ctx->start;
528 slice_ctx.end = str;
529 for (const char *pos = &str[-(long)back->min_matchlen];
530 pos >= ctx->start && (back->max_matchlen == -1 || pos >= &str[-(int)back->max_matchlen]);
531 pos = prev_char(ctx->start, pos)) {
532 cache_destroy(&slice_ctx);
533 slice_ctx.start = (char *)pos;
534 bp_match_t *m = match(&slice_ctx, pos, back);
535 // Match should not go past str (i.e. (<"AB" "B") should match "ABB", but not "AB")
536 if (m && m->end != str) recycle_match(&m);
537 else if (m) {
538 cache_destroy(&slice_ctx);
539 return new_match(pat, str, str, MATCHES(m));
541 if (pos == ctx->start) break;
542 // To prevent extreme performance degradation, don't keep
543 // walking backwards endlessly over newlines.
544 if (back->max_matchlen == -1 && *pos == '\n') break;
546 cache_destroy(&slice_ctx);
547 return NULL;
549 case BP_BEFORE: {
550 bp_match_t *after = match(ctx, str, When(pat, BP_BEFORE)->pat);
551 return after ? new_match(pat, str, str, MATCHES(after)) : NULL;
553 case BP_CAPTURE:
554 case BP_TAGGED: {
555 bp_pat_t *to_match = pat->type == BP_CAPTURE ? When(pat, BP_CAPTURE)->pat : When(pat, BP_TAGGED)->pat;
556 if (!to_match) return new_match(pat, str, str, NULL);
557 bp_match_t *p = match(ctx, str, to_match);
558 return p ? new_match(pat, str, p->end, MATCHES(p)) : NULL;
560 case BP_OTHERWISE: {
561 bp_match_t *m = match(ctx, str, When(pat, BP_OTHERWISE)->first);
562 return m ? m : match(ctx, str, When(pat, BP_OTHERWISE)->second);
564 case BP_CHAIN: {
565 auto chain = When(pat, BP_CHAIN);
566 if (chain->first->type == BP_DEFINITIONS) {
567 match_ctx_t ctx2 = *ctx;
568 ctx2.cache = &(cache_t){0};
569 ctx2.parent_ctx = ctx;
570 ctx2.defs = chain->first;
571 bp_match_t *m = match(&ctx2, str, chain->second);
572 cache_destroy(&ctx2);
573 return m;
576 bp_match_t *m1 = match(ctx, str, chain->first);
577 if (m1 == NULL) return NULL;
579 bp_match_t *m2;
580 // Push backrefs and run matching, then cleanup
581 if (m1->pat->type == BP_CAPTURE && When(m1->pat, BP_CAPTURE)->name && When(m1->pat, BP_CAPTURE)->backreffable) {
582 // Temporarily add a rule that the backref name matches the
583 // exact string of the original match (no replacements)
584 bp_pat_t *backref;
585 if (m1->children && m1->children[0]->pat->type == BP_CURDENT) {
586 const char *linestart = m1->start;
587 while (linestart > ctx->start && linestart[-1] != '\n')
588 --linestart;
590 // Current indentation:
591 char denter = *linestart;
592 size_t dents = 0;
593 if (denter == ' ' || denter == '\t') {
594 while (linestart[dents] == denter && &linestart[dents] < ctx->end)
595 ++dents;
597 backref = bp_raw_literal(linestart, dents);
598 } else {
599 backref = bp_raw_literal(m1->start, (size_t)(m1->end - m1->start));
601 match_ctx_t ctx2 = *ctx;
602 ctx2.cache = &(cache_t){0};
603 ctx2.parent_ctx = ctx;
604 ctx2.defs = &(bp_pat_t){
605 .type = BP_DEFINITIONS,
606 .start = m1->pat->start,
607 .end = m1->pat->end,
608 .__tagged.BP_DEFINITIONS =
610 .name = When(m1->pat, BP_CAPTURE)->name,
611 .namelen = When(m1->pat, BP_CAPTURE)->namelen,
612 .meaning = backref,
615 m2 = match(&ctx2, m1->end, chain->second);
616 if (!m2) // No need to keep the backref in memory if it didn't match
617 delete_pat(&backref, false);
618 cache_destroy(&ctx2);
619 } else {
620 m2 = match(ctx, m1->end, chain->second);
623 if (m2 == NULL) {
624 recycle_match(&m1);
625 return NULL;
628 return new_match(pat, str, m2->end, MATCHES(m1, m2));
630 case BP_MATCH:
631 case BP_NOT_MATCH: {
632 bp_pat_t *target = pat->type == BP_MATCH ? When(pat, BP_MATCH)->pat : When(pat, BP_NOT_MATCH)->pat;
633 bp_match_t *m1 = match(ctx, str, target);
634 if (m1 == NULL) return NULL;
636 // <p1>~<p2> matches iff the text of <p1> matches <p2>
637 // <p1>!~<p2> matches iff the text of <p1> does not match <p2>
638 match_ctx_t slice_ctx = *ctx;
639 slice_ctx.cache = &(cache_t){0};
640 slice_ctx.start = m1->start;
641 slice_ctx.end = m1->end;
642 bp_match_t *ret = NULL, *m2 = NULL;
643 if (pat->type == BP_MATCH) {
644 m2 = _next_match(&slice_ctx, slice_ctx.start, When(pat, BP_MATCH)->must_match, NULL);
645 if (m2) ret = new_match(pat, m1->start, m1->end, MATCHES(m1, m2));
646 } else {
647 m2 = _next_match(&slice_ctx, slice_ctx.start, When(pat, BP_NOT_MATCH)->must_not_match, NULL);
648 if (!m2) ret = new_match(pat, m1->start, m1->end, MATCHES(m1));
650 cache_destroy(&slice_ctx);
651 if (!ret) {
652 if (m2) recycle_match(&m2);
653 recycle_match(&m1);
655 return ret;
657 case BP_REPLACE: {
658 bp_match_t *p = NULL;
659 auto replace = When(pat, BP_REPLACE);
660 if (replace->pat) {
661 p = match(ctx, str, replace->pat);
662 if (p == NULL) return NULL;
664 return new_match(pat, str, p ? p->end : str, MATCHES(p));
666 case BP_REF: {
667 if (has_cached_failure(ctx, str, pat)) return NULL;
669 auto ref_pat = When(pat, BP_REF);
670 bp_pat_t *ref = lookup_ctx(ctx, ref_pat->name, ref_pat->len);
671 if (ref == NULL) {
672 match_error(ctx, "Unknown pattern: '%.*s'", (int)ref_pat->len, ref_pat->name);
673 return NULL;
676 if (ref->type == BP_LEFTRECURSION) return match(ctx, str, ref);
678 bp_pat_t rec_op = {
679 .type = BP_LEFTRECURSION,
680 .start = ref->start,
681 .end = ref->end,
682 .min_matchlen = 0,
683 .max_matchlen = -1,
684 .__tagged.BP_LEFTRECURSION =
686 .match = NULL,
687 .visited = false,
688 .at = str,
689 .fallback = pat,
690 .ctx = (void *)ctx,
693 match_ctx_t ctx2 = *ctx;
694 ctx2.parent_ctx = ctx;
695 ctx2.defs = &(bp_pat_t){
696 .type = BP_DEFINITIONS,
697 .start = pat->start,
698 .end = pat->end,
699 .__tagged.BP_DEFINITIONS =
701 .name = ref_pat->name,
702 .namelen = ref_pat->len,
703 .meaning = &rec_op,
707 bp_match_t *m = match(&ctx2, str, ref);
708 // If left recursion was involved, keep retrying while forward progress can be made:
709 if (m && rec_op.__tagged.BP_LEFTRECURSION.visited) {
710 while (1) {
711 const char *prev = m->end;
712 rec_op.__tagged.BP_LEFTRECURSION.match = m;
713 ctx2.cache = &(cache_t){0};
714 bp_match_t *m2 = match(&ctx2, str, ref);
715 cache_destroy(&ctx2);
716 if (!m2) break;
717 if (m2->end <= prev) {
718 recycle_match(&m2);
719 break;
721 recycle_match(&m);
722 m = m2;
726 if (!m) {
727 cache_failure(ctx, str, pat);
728 return NULL;
731 // This match wrapper mainly exists for record-keeping purposes.
732 // It also helps with visualization of match results.
733 // OPTIMIZE: remove this if necessary
734 return new_match(pat, m->start, m->end, MATCHES(m));
736 case BP_NODENT: {
737 if (*str != '\n') return NULL;
738 const char *start = str;
740 const char *p = str;
741 while (p > ctx->start && p[-1] != '\n')
742 --p;
744 // Current indentation:
745 char denter = *p;
746 int dents = 0;
747 if (denter == ' ' || denter == '\t') {
748 for (; *p == denter && p < ctx->end; ++p)
749 ++dents;
752 // Subsequent indentation:
753 while (*str == '\n' || *str == '\n')
754 ++str;
755 for (int i = 0; i < dents; i++)
756 if (&str[i] >= ctx->end || str[i] != denter) return NULL;
758 return new_match(pat, start, &str[dents], NULL);
760 case BP_CURDENT: {
761 return new_match(pat, str, str, NULL);
763 default: {
764 match_error(ctx, "Unknown pattern type: %u", pat->type);
765 return NULL;
771 // Return a match object which can be used (may be allocated or recycled).
773 bp_match_t *new_match(bp_pat_t *pat, const char *start, const char *end, bp_match_t *children[]) {
774 bp_match_t *m;
775 if (unused_matches) {
776 m = unused_matches;
777 gc_list_remove(m);
778 memset(m, 0, sizeof(bp_match_t));
779 } else {
780 m = new (bp_match_t);
782 // Keep track of the object:
783 gc_list_prepend(&in_use_matches, m);
785 m->pat = pat;
786 m->start = start;
787 m->end = end;
789 if (children) {
790 for (int i = 0; children[i]; i++)
791 m->_children[i] = children[i];
792 m->children = m->_children;
794 return m;
798 // If the given match is not currently a child member of another match (or
799 // otherwise reserved) then put it back in the pool of unused match objects.
801 public
802 void recycle_match(bp_match_t **at_m) {
803 bp_match_t *m = *at_m;
804 if (m->children) {
805 for (int i = 0; m->children[i]; i++)
806 recycle_match(&m->children[i]);
807 if (m->children != m->_children) delete (&m->children);
810 gc_list_remove(m);
811 (void)memset(m, 0, sizeof(bp_match_t));
812 gc_list_prepend(&unused_matches, m);
813 *at_m = NULL;
817 // Force all match objects into the pool of unused match objects.
819 public
820 size_t recycle_all_matches(void) {
821 size_t count = 0;
822 for (bp_match_t *m; (m = in_use_matches); ++count) {
823 gc_list_remove(m);
824 if (m->children && m->children != m->_children) delete (&m->children);
825 gc_list_prepend(&unused_matches, m);
827 return count;
831 // Free all match objects in memory.
833 public
834 size_t free_all_matches(void) {
835 size_t count = 0;
836 recycle_all_matches();
837 for (bp_match_t *m; (m = unused_matches); ++count) {
838 gc_list_remove(m);
839 delete (&m);
841 return count;
845 // Iterate over matches.
846 // Usage: for (bp_match_t *m = NULL; next_match(&m, ...); ) {...}
848 public
849 bool next_match(bp_match_t **m, const char *start, const char *end, bp_pat_t *pat, bp_pat_t *defs, bp_pat_t *skip,
850 bool ignorecase) {
851 const char *pos;
852 if (*m) {
853 // Make sure forward progress is occurring, even after zero-width matches:
854 pos = ((*m)->end > (*m)->start) ? (*m)->end : (*m)->end + 1;
855 recycle_match(m);
856 } else {
857 pos = start;
860 if (!pat) {
861 error_handler = default_error_handler;
862 return false;
865 match_ctx_t ctx = {
866 .cache = &(cache_t){0},
867 .start = start,
868 .end = end,
869 .ignorecase = ignorecase,
870 .defs = defs,
872 if (setjmp(ctx.error_jump) == 0) {
873 *m = (pos <= end) ? _next_match(&ctx, pos, pat, skip) : NULL;
874 cache_destroy(&ctx);
875 } else {
876 recycle_all_matches();
877 cache_destroy(&ctx);
878 *m = NULL;
879 if (error_handler) error_handler(&error_message);
881 if (error_message) {
882 free(error_message);
883 error_message = NULL;
886 return *m != NULL;
890 // Helper function to track state while doing a depth-first search.
892 __attribute__((nonnull)) static bp_match_t *_get_numbered_capture(bp_match_t *m, int *n) {
893 if ((m->pat->type == BP_CAPTURE && When(m->pat, BP_CAPTURE)->namelen == 0) || m->pat->type == BP_TAGGED) {
894 if (*n == 1) {
895 return m;
896 } else {
897 --(*n);
898 return NULL;
902 if (m->pat->type == BP_CAPTURE || m->pat->type == BP_TAGGED) return NULL;
904 if (m->children) {
905 for (int i = 0; m->children[i]; i++) {
906 bp_match_t *cap = _get_numbered_capture(m->children[i], n);
907 if (cap) return cap;
910 return NULL;
914 // Get a specific numbered pattern capture.
916 public
917 bp_match_t *get_numbered_capture(bp_match_t *m, int n) {
918 if (n <= 0) return m;
919 if (m->pat->type == BP_TAGGED || m->pat->type == BP_CAPTURE) {
920 if (n == 1 && m->pat->type == BP_CAPTURE && When(m->pat, BP_CAPTURE)->namelen == 0) return m;
921 if (m->children) {
922 for (int i = 0; m->children[i]; i++) {
923 bp_match_t *cap = _get_numbered_capture(m->children[i], &n);
924 if (cap) return cap;
927 return NULL;
928 } else {
929 return _get_numbered_capture(m, &n);
934 // Helper function for get_named_capture()
936 bp_match_t *_get_named_capture(bp_match_t *m, const char *name, size_t namelen) {
937 if (m->pat->type == BP_CAPTURE && When(m->pat, BP_CAPTURE)->name && When(m->pat, BP_CAPTURE)->namelen == namelen
938 && strncmp(When(m->pat, BP_CAPTURE)->name, name, When(m->pat, BP_CAPTURE)->namelen) == 0)
939 return m;
941 if (m->pat->type == BP_TAGGED || m->pat->type == BP_CAPTURE) return NULL;
943 if (m->children) {
944 for (int i = 0; m->children[i]; i++) {
945 bp_match_t *cap = _get_named_capture(m->children[i], name, namelen);
946 if (cap) return cap;
949 return NULL;
953 // Get a capture with a specific name.
955 public
956 bp_match_t *get_named_capture(bp_match_t *m, const char *name, ssize_t _namelen) {
957 size_t namelen = _namelen < 0 ? strlen(name) : (size_t)_namelen;
958 if (m->pat->type == BP_TAGGED) { // || (m->pat->type == BP_CAPTURE && m->pat->args.capture.namelen > 0)) {
959 if (m->children) {
960 for (int i = 0; m->children[i]; i++) {
961 bp_match_t *cap = _get_named_capture(m->children[i], name, namelen);
962 if (cap) return cap;
965 return NULL;
966 } else {
967 return _get_named_capture(m, name, namelen);
969 return NULL;
972 // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0