From 56da250d69615d70774dbc361fa2693cca1e3df4 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Thu, 12 May 2022 12:11:28 -0400 Subject: Split backref/named captures into separate concepts for performance reasons. --- README.md | 5 +++-- bp.1 | 14 ++++++++++---- bp.1.md | 12 ++++++++---- grammars/bp.bp | 4 ++-- grammars/html.bp | 4 ++-- grammars/lua.bp | 4 ++-- grammars/shell.bp | 2 +- match.c | 2 +- pattern.c | 9 ++++++++- pattern.h | 1 + tests/24-backref.sh | 2 +- 11 files changed, 39 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index 5e7d716..7e7d714 100644 --- a/README.md +++ b/README.md @@ -74,8 +74,9 @@ Pattern | Meaning `+pat` | 1 or more occurrences of `pat` (shorthand for `1+pat`) `pat` | `pat` matches just in front of the current position (lookahead) -`@pat` | Capture `pat` (used for text replacement and backreferences) -`@foo=pat` | Let `foo` be the text of `pat` (used for text replacement and backreferences) +`@pat` | Capture `pat` (used for text replacement) +`@foo=pat` | Capture `pat` with the name `foo` attached (used for text replacement) +`@foo:pat` | Let `foo` be the text of `pat` (used for backreferences) `pat => "replacement"` | Match `pat` and replace it with `replacement` `(pat1 @keep=pat2) => "@keep"` | Match `pat1` followed by `pat2` and replace it with the text of `pat2` `pat1~pat2` | `pat1` when `pat2` can be found within the result diff --git a/bp.1 b/bp.1 index 2524840..2c53ca6 100644 --- a/bp.1 +++ b/bp.1 @@ -305,7 +305,8 @@ against the edges file or line. Matches \f[I]pat\f[R], but does not consume any input (lookahead). .TP \f[B]\[at]\f[R] \f[I]pat\f[R] -Capture \f[I]pat\f[R] +Capture \f[I]pat\f[R]. +Captured patterns can be used in replacements. .TP \f[B]foo\f[R] The named pattern whose name is \f[B]\[lq]foo\[rq]\f[R]. @@ -315,11 +316,16 @@ Pattern names may contain dashes (\f[B]-\f[R]), but not underscores (\f[B]_\f[R]), since the underscore is used to match whitespace. See the \f[B]GRAMMAR FILES\f[R] section for more info. .TP +\f[B]\[at]\f[R] \f[I]name\f[R] \f[B]:\f[R] \f[I]pat\f[R] +For the rest of the current chain, define \f[I]name\f[R] to match +whatever \f[I]pat\f[R] matches, i.e.\ a backreference. +For example, \f[B]\[at]foo:word \[ga]( foo \[ga])\f[R] (matches +\f[B]\[lq]asdf(asdf)\[rq]\f[R] or \f[B]\[lq]baz(baz)\[rq]\f[R], but not +\f[B]\[lq]foo(baz)\[rq]\f[R]) +.TP \f[B]\[at]\f[R] \f[I]name\f[R] \f[B]=\f[R] \f[I]pat\f[R] Let \f[I]name\f[R] equal \f[I]pat\f[R] (named capture). -Named captures can be used as backreferences like so: \f[B]\[at]foo=word -\[ga]( foo \[ga])\f[R] (matches \f[B]\[lq]asdf(asdf)\[rq]\f[R] or -\f[B]\[lq]baz(baz)\[rq]\f[R], but not \f[B]\[lq]foo(baz)\[rq]\f[R]) +Named captures can be used in text replacements. .TP \f[I]pat\f[R] \f[B]=>\f[R] \f[B]\[dq]\f[R]\f[I]replacement\f[R]\f[B]\[dq]\f[R] Replace \f[I]pat\f[R] with \f[I]replacement\f[R]. diff --git a/bp.1.md b/bp.1.md index 869ea42..9473d24 100644 --- a/bp.1.md +++ b/bp.1.md @@ -252,7 +252,7 @@ against the edges file or line. : Matches *pat*, but does not consume any input (lookahead). `@` *pat* -: Capture *pat* +: Capture *pat*. Captured patterns can be used in replacements. `foo` : The named pattern whose name is **"foo"**. Pattern names come from @@ -260,10 +260,14 @@ definitions in grammar files or from named captures. Pattern names may contain dashes (`-`), but not underscores (`_`), since the underscore is used to match whitespace. See the **GRAMMAR FILES** section for more info. +`@` *name* `:` *pat* +: For the rest of the current chain, define *name* to match whatever *pat* +matches, i.e. a backreference. For example, `` @foo:word `( foo `) `` (matches +**"asdf(asdf)"** or **"baz(baz)"**, but not **"foo(baz)"**) + `@` *name* `=` *pat* -: Let *name* equal *pat* (named capture). Named captures can be used as -backreferences like so: `` @foo=word `( foo `) `` (matches **"asdf(asdf)"** or -**"baz(baz)"**, but not **"foo(baz)"**) +: Let *name* equal *pat* (named capture). Named captures can be used in +text replacements. *pat* `=>` `"`*replacement*`"` : Replace *pat* with *replacement*. Note: *replacement* should be a string diff --git a/grammars/bp.bp b/grammars/bp.bp index 2782934..a77881b 100644 --- a/grammars/bp.bp +++ b/grammars/bp.bp @@ -6,7 +6,7 @@ # language grammars. Grammar: __ *(Def [__`;])%__ __ [@error=(+(./\n) => "Could not parse this code: @0")] -Def: @name=id __ `: __ ( +Def: @name=id __ 1-2`: __ ( @definition=extended-pat / $$ @error=(=>"No definition for rule") / @error=(..%\n>(`;/id_`:/$) => "Invalid definition: @0")) @@ -62,7 +62,7 @@ Repeat: ( Optional: `[ __ extended-pat (__`] / @error=(=>"Expected closing square bracket here")) After: `< __ pat Before: `> __ pat -Capture: `@ [__ @capture-name=(id/`!) __ !"=>" `=] __ (@capture=pat / @error=(=>"Expected pattern to capture")) +Capture: `@ [__ @capture-name=(id/`!) __ !"=>" `=,:] __ (@capture=pat / @error=(=>"Expected pattern to capture")) Replace: ( @replace-pat=[Chain-noreplace / pat] __ "=>" (__ @replacement=String / @error=(=>"Expected replacement string")) ) diff --git a/grammars/html.bp b/grammars/html.bp index 1ed83df..1000670 100644 --- a/grammars/html.bp +++ b/grammars/html.bp @@ -14,9 +14,9 @@ void-element: `< ("area"/"base"/"br"/"col"/"embed"/"hr"/"img"/"input"/"link"/"me template-element: "" -raw-element: `< @tag=("script"/"style"/"textarea"/"title") __attributes__ `> ..%\n (") +raw-element: `< @tag:("script"/"style"/"textarea"/"title") __attributes__ `> ..%\n (") -normal-element: `< @tag=id __attributes__ `> ..%(\n / comment / element) (") +normal-element: `< @tag:id __attributes__ `> ..%(\n / comment / element) (") comment: "" diff --git a/grammars/lua.bp b/grammars/lua.bp index 3d3b862..4f417d2 100644 --- a/grammars/lua.bp +++ b/grammars/lua.bp @@ -6,8 +6,8 @@ # full parse tree, and having one makes the task considerably more complicated. # See the accompanying README.md for more info. -comment: "--" (`[ @eqs=*`= `[ ..%\n (`]eqs`]) / ..$) -string: `"..%string-escape `" / `' ..%string-escape `' / `[ @eqs=*`= `[ ..%\n (`]eqs`]) +comment: "--" (`[ @eqs:*`= `[ ..%\n (`]eqs`]) / ..$) +string: `"..%string-escape `" / `' ..%string-escape `' / `[ @eqs:*`= `[ ..%\n (`]eqs`]) table: `{ ..%(table/string/comment/\n) `} keyword: ("and" / "break" / "do" / "else" / "elseif" / "end" / "false" / "for" / "function" / "goto" / "if" / "in" / "local" / "nil" / "not" / "or" / diff --git a/grammars/shell.bp b/grammars/shell.bp index 76cdcfa..d305d6c 100644 --- a/grammars/shell.bp +++ b/grammars/shell.bp @@ -7,7 +7,7 @@ # See the accompanying README.md for more info. comment: `#..$ -string: `" ..%(string-escape / subcommand / \n) `" / `' ..%\n `' / "<<" _ @delim=id _$ ..%\n (^delim$) +string: `" ..%(string-escape / subcommand / \n) `" / `' ..%\n `' / "<<" _ @delim:id _$ ..%\n (^delim$) string-escape: `\ `",` subcommand: `` ..%\n `` / "$" (parens/braces) keyword: ("echo" / "read" / "set" / "unset" / "readonly" / "shift" / "export" / "if" / "fi" / diff --git a/match.c b/match.c index 474bd85..a9f78ec 100644 --- a/match.c +++ b/match.c @@ -562,7 +562,7 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) match_t *m2; // Push backrefs and run matching, then cleanup - if (m1->pat->type == BP_CAPTURE && m1->pat->args.capture.name) { + if (m1->pat->type == BP_CAPTURE && m1->pat->args.capture.name && m1->pat->args.capture.backreffable) { // Temporarily add a rule that the backref name matches the // exact string of the original match (no replacements) pat_t *backref; diff --git a/pattern.c b/pattern.c index 53970ac..edd007d 100644 --- a/pattern.c +++ b/pattern.c @@ -479,7 +479,13 @@ static pat_t *_bp_simplepattern(const char *str, const char *end, bool inside_st size_t namelen = 0; const char *a = after_name(str, end); const char *eq = a; - if (a > str && !matchstr(&eq, "=>", false, end) && matchchar(&eq, '=', false, end)) { + bool backreffable = false; + if (a > str && matchchar(&eq, ':', false, end)) { + name = str; + namelen = (size_t)(a-str); + str = eq; + backreffable = true; + } else if (a > str && !matchstr(&eq, "=>", false, end) && matchchar(&eq, '=', false, end)) { name = str; namelen = (size_t)(a-str); str = eq; @@ -492,6 +498,7 @@ static pat_t *_bp_simplepattern(const char *str, const char *end, bool inside_st capture->args.capture.capture_pat = pat; capture->args.capture.name = name; capture->args.capture.namelen = namelen; + capture->args.capture.backreffable = backreffable; return capture; } // Start of file/line diff --git a/pattern.h b/pattern.h index 89d57b8..16b9654 100644 --- a/pattern.h +++ b/pattern.h @@ -82,6 +82,7 @@ typedef struct pat_s { struct pat_s *capture_pat; const char *name; size_t namelen; + bool backreffable; } capture; struct { struct match_s *match; diff --git a/tests/24-backref.sh b/tests/24-backref.sh index 01a6280..28a203b 100644 --- a/tests/24-backref.sh +++ b/tests/24-backref.sh @@ -1,3 +1,3 @@ # With @-capturing, you can reference previous captures # Example: bp -p '@first=`a-z .. first' matches "aba" and "xyzx", but not "abc" -bp -p '@first=+Abc _ +Abc _ first' +bp -p '@first:+Abc _ +Abc _ first' -- cgit v1.2.3