diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2020-09-13 00:48:39 -0700 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2020-09-13 00:48:39 -0700 |
| commit | 507479be37540806ed32b33d947b145113df99f2 (patch) | |
| tree | 10b97e60d454b4a93cd00f0ee2e560a0a5476c01 | |
| parent | 8f090c68c074d2de46e11e165e76d5e108d918be (diff) | |
Tentative support for left-recursion
| -rw-r--r-- | vm.c | 78 |
1 files changed, 58 insertions, 20 deletions
@@ -62,14 +62,22 @@ static size_t push_backrefs(grammar_t *g, match_t *m) return count; } +typedef struct recursive_ref_s { + const vm_op_t *op; + const char *pos; + struct recursive_ref_s *prev; + int hit; + match_t *result; +} recursive_ref_t; + + /* * Run virtual machine operation against a string and return * a match struct, or NULL if no match is found. * The returned value should be free()'d to avoid memory leaking. */ -match_t *match(grammar_t *g, const char *str, vm_op_t *op) +static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref_t *rec) { - //tailcall: switch (op->op) { case VM_EMPTY: { match_t *m = calloc(sizeof(match_t), 1); @@ -109,7 +117,7 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op) if (op->op == VM_ANYTHING_BUT) if (!*str || (!op->multiline && *str == '\n')) return NULL; - match_t *m = match(g, str, op->args.pat); + match_t *m = _match(g, str, op->args.pat, rec); if (m != NULL) { destroy_match(&m); return NULL; @@ -128,7 +136,7 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op) match_t *p = NULL; for (const char *prev = NULL; p == NULL && prev < str; ) { prev = str; - p = match(g, str, op->args.pat); + p = _match(g, str, op->args.pat, rec); if (*str && (op->multiline || *str != '\n')) ++str; } @@ -155,11 +163,11 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op) // Separator match_t *sep = NULL; if (op->args.repetitions.sep != NULL && reps > 0) { - sep = match(g, str, op->args.repetitions.sep); + sep = _match(g, str, op->args.repetitions.sep, rec); if (sep == NULL) break; str = sep->end; } - match_t *p = match(g, str, op->args.repetitions.repeat_pat); + match_t *p = _match(g, str, op->args.repetitions.repeat_pat, rec); if (p == NULL || (p->end == prev && reps > 0)) { // Prevent infinite loops destroy_match(&sep); destroy_match(&p); @@ -189,7 +197,7 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op) for (int i = 0; i < backtrack; i++) { if (str[-i] == '\0') return NULL; } - match_t *before = match(g, str - backtrack, op->args.pat); + match_t *before = _match(g, str - backtrack, op->args.pat, rec); if (before == NULL) return NULL; destroy_match(&before); match_t *m = calloc(sizeof(match_t), 1); @@ -199,7 +207,7 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op) return m; } case VM_BEFORE: { - match_t *after = match(g, str, op->args.pat); + match_t *after = _match(g, str, op->args.pat, rec); if (after == NULL) return NULL; destroy_match(&after); match_t *m = calloc(sizeof(match_t), 1); @@ -209,7 +217,7 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op) return m; } case VM_CAPTURE: { - match_t *p = match(g, str, op->args.pat); + match_t *p = _match(g, str, op->args.pat, rec); if (p == NULL) return NULL; match_t *m = calloc(sizeof(match_t), 1); m->start = str; @@ -222,16 +230,16 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op) return m; } case VM_OTHERWISE: { - match_t *m = match(g, str, op->args.multiple.first); - if (m == NULL) m = match(g, str, op->args.multiple.second); + match_t *m = _match(g, str, op->args.multiple.first, rec); + if (m == NULL) m = _match(g, str, op->args.multiple.second, rec); return m; } case VM_CHAIN: { - match_t *m1 = match(g, str, op->args.multiple.first); + match_t *m1 = _match(g, str, op->args.multiple.first, rec); if (m1 == NULL) return NULL; size_t nbackrefs = push_backrefs(g, m1); - match_t *m2 = match(g, m1->end, op->args.multiple.second); + match_t *m2 = _match(g, m1->end, op->args.multiple.second, rec); pop_backrefs(g, nbackrefs); if (m2 == NULL) { destroy_match(&m1); @@ -250,7 +258,7 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op) m->start = str; m->op = op; if (op->args.replace.replace_pat) { - match_t *p = match(g, str, op->args.replace.replace_pat); + match_t *p = _match(g, str, op->args.replace.replace_pat, rec); if (p == NULL) return NULL; m->child = p; m->end = p->end; @@ -264,14 +272,39 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op) case VM_REF: { vm_op_t *r = lookup(g, op->args.s); check(r != NULL, "Unknown identifier: '%s'", op->args.s); - // Could use tailcall here, but current have disabled - match_t *p = match(g, str, r); - if (p == NULL) return NULL; + + // Prevent infinite left recursion: + for (recursive_ref_t *p = rec; p; p = p->prev) { + if (p->pos == str && p->op == r) { + ++p->hit; + return p->result; + } + } + + recursive_ref_t wrap = { + .op = r, + .pos = str, + .prev = rec, + .hit = 0, + .result = NULL, + }; + match_t *best = NULL; + left_recursive:; + match_t *p = _match(g, str, r, &wrap); + if (p == NULL) return best; + if (wrap.hit && (best == NULL || p->end > best->end)) { + best = p; + wrap.hit = 0; + wrap.result = p; + goto left_recursive; + } else if (best == NULL) { + best = p; + } match_t *m = calloc(sizeof(match_t), 1); - m->start = p->start; - m->end = p->end; + m->start = best->start; + m->end = best->end; m->op = op; - m->child = p; + m->child = best; m->name_or_replacement = op->args.s; m->is_ref = 1; return m; @@ -593,4 +626,9 @@ static match_t *match_backref(const char *str, vm_op_t *op, match_t *cap) return ret; } +match_t *match(grammar_t *g, const char *str, vm_op_t *op) +{ + return _match(g, str, op, NULL); +} + // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 |
