From 304d5c35a8718b2b0f190e53dece6362d14df1b4 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Sat, 2 Oct 2021 11:33:40 -0700 Subject: [PATCH] Left recursion correctness fixes --- grammars/bp.bp | 14 +++++++------ match.c | 53 ++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 49 insertions(+), 18 deletions(-) diff --git a/grammars/bp.bp b/grammars/bp.bp index a6e6f0b..68d1ec8 100644 --- a/grammars/bp.bp +++ b/grammars/bp.bp @@ -17,7 +17,7 @@ String-pattern: ..%(\n / Nodent / Identifier-char / Identifier-start / Escape / pat: simple-pat !(__("!~"/"~")) / suffixed-pat simple-pat: (Upto-and / Dot / Word-boundary/ String / Chars / Nodent / Identifier-char / Identifier-start / Escape-range - / Escape / Repeat / Optional / No / After / Before / Capture / Empty-replacement + / Escape / Repeat / Optional / No / After / Before / Capture / Start-of-File / Start-of-Line / End-of-File / End-of-Line / Ref / parens) suffixed-pat: ( @@ -63,10 +63,11 @@ After: `< __ pat Before: `> __ pat Capture: `@ [__ @capture-name=(id/`!) __ !"=>" `=] __ (@capture=pat / @error=(=>"Expected pattern to capture")) Replace: ( - @replace-pat=(Replace / Chain / pat) __ "=>" (__ @replacement=String / @error=(=>"Expected replacement string")) + @replace-pat=[Chain-noreplace / pat] __ "=>" (__ @replacement=String / @error=(=>"Expected replacement string")) ) +Replace-chain: Replace-chain __ "=>" (__ @replacement=String / @error=(=>"Expected replacement string")) Empty-replacement: ( - @replace-pat=(=>"''") "=>" (__ @replacement=String / @error=(=>"Expected replacement string")) + (=>"EMPTY") @replace-pat=(=>"''") "=>" (__ @replacement=String / @error=(=>"Expected replacement string")) ) Ref: @name=id !(__`:) Start-of-File: "^^" @@ -76,9 +77,10 @@ End-of-Line: "$" parens: `( __ extended-pat (__ `) / @error=(=>"Expected closing parenthesis here")) -Chain: 2+@(pat !(__"=>") / Replace)%__ -Otherwise: 2+@(Replace / Chain / pat)%(__`/__) -extended-pat: Otherwise / Replace / Chain / pat +Chain: @(Replace/pat) __ @(Chain/Replace/pat) +Chain-noreplace: @pat __ @(Chain-noreplace/pat) +Otherwise: 2+@(Chain / Replace / pat)%(__`/__) +extended-pat: Otherwise / Chain / Replace / pat # Special-symbol rules: _: *(` / \t) diff --git a/match.c b/match.c index 2156e05..17f88bb 100644 --- a/match.c +++ b/match.c @@ -56,6 +56,27 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat); __attribute__((returns_nonnull)) static match_t *new_match(pat_t *pat, const char *start, const char *end, match_t *children[]); +static match_t *clone_match(match_t *m) +{ + if (!m) return NULL; + match_t *ret = new_match(m->pat, m->start, m->end, NULL); + if (m->children) { + size_t child_cap = 0, nchildren = 0; + if (!m->children[0] || !m->children[1] || !m->children[2]) { + child_cap = 3; + ret->children = ret->_children; + } + for (int i = 0; m->children[i]; i++) { + if (nchildren+1 >= child_cap) { + ret->children = grow(ret->children, child_cap += 5); + for (size_t i = nchildren; i < child_cap; i++) ret->children[i] = NULL; + } + ret->children[nchildren++] = clone_match(m->children[i]); + } + } + return ret; +} + // Prepend to a doubly linked list static inline void gc_list_prepend(match_t **head, match_t *m) { @@ -227,7 +248,7 @@ static pat_t *get_prerequisite(match_ctx_t *ctx, pat_t *pat) case BP_REPLACE: p = p->args.replace.pat; break; case BP_REF: { - if (++derefs > 10) return p; + if (++derefs > 10) return p; // In case of left recursion pat_t *p2 = deref(ctx, p); if (p2 == p) return p2; p = p2; @@ -259,7 +280,6 @@ static match_t *_next_match(match_ctx_t *ctx, const char *str, pat_t *pat, pat_t // Clear the cache so it's not full of old cache values from different parts of the file: cache_destroy(ctx); - pat = deref(ctx, pat); // Avoid repeated lookups pat_t *first = get_prerequisite(ctx, pat); // Don't bother looping if this can only match at the start/end: @@ -316,7 +336,7 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) // See: left-recursion.md for more details. if (str == pat->args.leftrec.at) { ++pat->args.leftrec.visits; - return pat->args.leftrec.match; + return clone_match(pat->args.leftrec.match); } else { return match(ctx, str, pat->args.leftrec.fallback); } @@ -602,6 +622,9 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) if (ref == NULL) errx(EXIT_FAILURE, "Unknown identifier: '%.*s'", (int)pat->args.ref.len, pat->args.ref.name); + if (ref->type == BP_LEFTRECURSION) + return match(ctx, str, ref); + pat_t rec_op = { .type = BP_LEFTRECURSION, .start = ref->start, .end = ref->end, @@ -627,15 +650,21 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) }, }; - match_t *m = NULL; - // Keep matching as long as left recursion keeps happening and forward progress is made - for (const char *prev = str; m == NULL || (rec_op.args.leftrec.visits > 0 && m->end > prev); prev = m->end) { - rec_op.args.leftrec.match = m; - rec_op.args.leftrec.visits = 0; - match_t *m2 = match(&ctx2, str, ref); - if (!m2) break; - if (m) recycle_match(&m); - m = m2; + match_t *m = match(&ctx2, str, ref); + // If left recursion was involved, keep retrying while forward progress can be made: + if (m && rec_op.args.leftrec.visits > 0) { + while (1) { + const char *prev = m->end; + rec_op.args.leftrec.match = m; + match_t *m2 = match(&ctx2, str, ref); + if (!m2) break; + if (m2->end <= prev) { + recycle_match(&m2); + break; + } + recycle_match(&m); + m = m2; + } } if (!m) {