aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBruce Hill <bruce@bruce-hill.com>2020-09-13 00:48:39 -0700
committerBruce Hill <bruce@bruce-hill.com>2020-09-13 00:48:39 -0700
commit507479be37540806ed32b33d947b145113df99f2 (patch)
tree10b97e60d454b4a93cd00f0ee2e560a0a5476c01
parent8f090c68c074d2de46e11e165e76d5e108d918be (diff)
Tentative support for left-recursion
-rw-r--r--vm.c78
1 files changed, 58 insertions, 20 deletions
diff --git a/vm.c b/vm.c
index 97c0e7c..2182d8d 100644
--- a/vm.c
+++ b/vm.c
@@ -62,14 +62,22 @@ static size_t push_backrefs(grammar_t *g, match_t *m)
return count;
}
+typedef struct recursive_ref_s {
+ const vm_op_t *op;
+ const char *pos;
+ struct recursive_ref_s *prev;
+ int hit;
+ match_t *result;
+} recursive_ref_t;
+
+
/*
* Run virtual machine operation against a string and return
* a match struct, or NULL if no match is found.
* The returned value should be free()'d to avoid memory leaking.
*/
-match_t *match(grammar_t *g, const char *str, vm_op_t *op)
+static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref_t *rec)
{
- //tailcall:
switch (op->op) {
case VM_EMPTY: {
match_t *m = calloc(sizeof(match_t), 1);
@@ -109,7 +117,7 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op)
if (op->op == VM_ANYTHING_BUT)
if (!*str || (!op->multiline && *str == '\n'))
return NULL;
- match_t *m = match(g, str, op->args.pat);
+ match_t *m = _match(g, str, op->args.pat, rec);
if (m != NULL) {
destroy_match(&m);
return NULL;
@@ -128,7 +136,7 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op)
match_t *p = NULL;
for (const char *prev = NULL; p == NULL && prev < str; ) {
prev = str;
- p = match(g, str, op->args.pat);
+ p = _match(g, str, op->args.pat, rec);
if (*str && (op->multiline || *str != '\n'))
++str;
}
@@ -155,11 +163,11 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op)
// Separator
match_t *sep = NULL;
if (op->args.repetitions.sep != NULL && reps > 0) {
- sep = match(g, str, op->args.repetitions.sep);
+ sep = _match(g, str, op->args.repetitions.sep, rec);
if (sep == NULL) break;
str = sep->end;
}
- match_t *p = match(g, str, op->args.repetitions.repeat_pat);
+ match_t *p = _match(g, str, op->args.repetitions.repeat_pat, rec);
if (p == NULL || (p->end == prev && reps > 0)) { // Prevent infinite loops
destroy_match(&sep);
destroy_match(&p);
@@ -189,7 +197,7 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op)
for (int i = 0; i < backtrack; i++) {
if (str[-i] == '\0') return NULL;
}
- match_t *before = match(g, str - backtrack, op->args.pat);
+ match_t *before = _match(g, str - backtrack, op->args.pat, rec);
if (before == NULL) return NULL;
destroy_match(&before);
match_t *m = calloc(sizeof(match_t), 1);
@@ -199,7 +207,7 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op)
return m;
}
case VM_BEFORE: {
- match_t *after = match(g, str, op->args.pat);
+ match_t *after = _match(g, str, op->args.pat, rec);
if (after == NULL) return NULL;
destroy_match(&after);
match_t *m = calloc(sizeof(match_t), 1);
@@ -209,7 +217,7 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op)
return m;
}
case VM_CAPTURE: {
- match_t *p = match(g, str, op->args.pat);
+ match_t *p = _match(g, str, op->args.pat, rec);
if (p == NULL) return NULL;
match_t *m = calloc(sizeof(match_t), 1);
m->start = str;
@@ -222,16 +230,16 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op)
return m;
}
case VM_OTHERWISE: {
- match_t *m = match(g, str, op->args.multiple.first);
- if (m == NULL) m = match(g, str, op->args.multiple.second);
+ match_t *m = _match(g, str, op->args.multiple.first, rec);
+ if (m == NULL) m = _match(g, str, op->args.multiple.second, rec);
return m;
}
case VM_CHAIN: {
- match_t *m1 = match(g, str, op->args.multiple.first);
+ match_t *m1 = _match(g, str, op->args.multiple.first, rec);
if (m1 == NULL) return NULL;
size_t nbackrefs = push_backrefs(g, m1);
- match_t *m2 = match(g, m1->end, op->args.multiple.second);
+ match_t *m2 = _match(g, m1->end, op->args.multiple.second, rec);
pop_backrefs(g, nbackrefs);
if (m2 == NULL) {
destroy_match(&m1);
@@ -250,7 +258,7 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op)
m->start = str;
m->op = op;
if (op->args.replace.replace_pat) {
- match_t *p = match(g, str, op->args.replace.replace_pat);
+ match_t *p = _match(g, str, op->args.replace.replace_pat, rec);
if (p == NULL) return NULL;
m->child = p;
m->end = p->end;
@@ -264,14 +272,39 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op)
case VM_REF: {
vm_op_t *r = lookup(g, op->args.s);
check(r != NULL, "Unknown identifier: '%s'", op->args.s);
- // Could use tailcall here, but current have disabled
- match_t *p = match(g, str, r);
- if (p == NULL) return NULL;
+
+ // Prevent infinite left recursion:
+ for (recursive_ref_t *p = rec; p; p = p->prev) {
+ if (p->pos == str && p->op == r) {
+ ++p->hit;
+ return p->result;
+ }
+ }
+
+ recursive_ref_t wrap = {
+ .op = r,
+ .pos = str,
+ .prev = rec,
+ .hit = 0,
+ .result = NULL,
+ };
+ match_t *best = NULL;
+ left_recursive:;
+ match_t *p = _match(g, str, r, &wrap);
+ if (p == NULL) return best;
+ if (wrap.hit && (best == NULL || p->end > best->end)) {
+ best = p;
+ wrap.hit = 0;
+ wrap.result = p;
+ goto left_recursive;
+ } else if (best == NULL) {
+ best = p;
+ }
match_t *m = calloc(sizeof(match_t), 1);
- m->start = p->start;
- m->end = p->end;
+ m->start = best->start;
+ m->end = best->end;
m->op = op;
- m->child = p;
+ m->child = best;
m->name_or_replacement = op->args.s;
m->is_ref = 1;
return m;
@@ -593,4 +626,9 @@ static match_t *match_backref(const char *str, vm_op_t *op, match_t *cap)
return ret;
}
+match_t *match(grammar_t *g, const char *str, vm_op_t *op)
+{
+ return _match(g, str, op, NULL);
+}
+
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1