From e6e482054de77f3fe5d65344da86065373cf5f23 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Sat, 25 Nov 2023 14:57:19 -0500 Subject: Deprecate '-p' flag and replace backslash interpolation with curly brace interpolation --- README.md | 9 +- bp.1 | 154 ++++++++++++++++++--------------- bp.1.md | 150 ++++++++++++++++++-------------- bp.c | 6 +- pattern.c | 53 ++++-------- tests/01-literal.sh | 4 +- tests/02-char.sh | 4 +- tests/03-char-range.sh | 4 +- tests/04-sequence.sh | 4 +- tests/05-dot.sh | 4 +- tests/06-start-of-line.sh | 4 +- tests/07-end-of-line.sh | 4 +- tests/08-spaces.sh | 4 +- tests/09-ellipsis.sh | 4 +- tests/10-words.sh | 4 +- tests/11-ordered-choice.sh | 6 +- tests/12-star.sh | 4 +- tests/13-plus.sh | 4 +- tests/14-repeat-sep.sh | 4 +- tests/15-repeating.sh | 4 +- tests/16-lookahead.sh | 4 +- tests/17-lookbehind.sh | 4 +- tests/18-lookbehind-variable-length.sh | 4 +- tests/19-negation.sh | 4 +- tests/20-submatch.sh | 4 +- tests/21-no-submatch.sh | 4 +- tests/22-replace.sh | 4 +- tests/23-nested-parens.sh | 4 +- tests/24-backref.sh | 4 +- tests/25-replace-capture.sh | 6 +- tests/26-null-byte.sh | 2 +- tests/27-left-recursion.sh | 2 +- tests/28-left-recursion2.sh | 2 +- tests/29-left-recursion3.sh | 2 +- tutorial.sh | 6 +- 35 files changed, 251 insertions(+), 239 deletions(-) diff --git a/README.md b/README.md index 7e7d714..e8747d5 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,14 @@ Run `make tutorial` to run through the tutorial. It walks through some basic pat ## Usage -`bp [flags] [...]` +``` +bp [flags] [...] +``` +BP is optimized for matching literal strings, so the main pattern argument is +interpreted as a string literal. BP pattern syntax is inserted using curly +brace interpolations like `bp 'foo{..}baz'` (match the string literal "foo" up +to and including the next occurrence of "baz" on the same line). ### Flags @@ -25,7 +31,6 @@ Run `make tutorial` to run through the tutorial. 
It walks through some basic pat * `-e` `--explain` print an explanation of the matches * `-j` `--json` print matches as JSON objects * `-l` `--list-files` print only filenames containing matches -* `-p` `--pattern ` provide a pattern (equivalent to `bp '\()'`) * `-r` `--replace ` replace the input pattern with the given replacement * `-s` `--skip ` skip over the given pattern when looking for matches * `-B` `--context-before ` change how many lines of context are printed before each match diff --git a/bp.1 b/bp.1 index 3e585a4..1e7f46e 100644 --- a/bp.1 +++ b/bp.1 @@ -1,41 +1,25 @@ -.\" Automatically generated by Pandoc 2.18 +.\" Automatically generated by Pandoc 3.1.8 .\" -.\" Define V font for inline verbatim, using C font in formats -.\" that render this, and otherwise B font. -.ie "\f[CB]x\f[]"x" \{\ -. ftr V B -. ftr VI BI -. ftr VB B -. ftr VBI BI -.\} -.el \{\ -. ftr V CR -. ftr VI CI -. ftr VB CB -. ftr VBI CBI -.\} .TH "BP" "1" "May 17 2021" "" "" -.hy .SH NAME -.PP bp - Bruce\[aq]s Parsing Expression Grammar tool .SH SYNOPSIS -.PP \f[B]bp\f[R] [\f[I]options\&...\f[R]] \f[I]pattern\f[R] [[\f[B]--\f[R]] \f[I]files\&...\f[R]] .SH DESCRIPTION -.PP \f[B]bp\f[R] is a tool that matches parsing expression grammars using a custom syntax. .SH OPTIONS .TP -\f[B]-p\f[R], \f[B]--pattern\f[R] \f[I]pat\f[R] -Give a pattern in BP syntax instead of string syntax (equivalent to -\f[B]bp \[aq]\[rs](pat)\[aq]\f[R] +\f[I]pattern\f[R] +The text to search for. +The main argument for \f[B]bp\f[R] is a string literal which may +contain BP syntax patterns. +See the \f[B]STRING PATTERNS\f[R] section below. .TP \f[B]-w\f[R], \f[B]--word\f[R] \f[I]word\f[R] Surround a string pattern with word boundaries (equivalent to \f[B]bp -\[aq]\[rs]|word\[rs]|\[aq]\f[R]) +\[aq]{|}word{|}\[aq]\f[R]) .TP \f[B]-e\f[R], \f[B]--explain\f[R] Print a visual explanation of the matches. @@ -101,11 +85,6 @@ formatting otherwise. \f[B]-h\f[R], \f[B]--help\f[R] Print the usage and exit.
.TP -\f[I]pattern\f[R] -The main pattern for bp to match. -By default, this pattern is a string pattern (see the \f[B]STRING -PATTERNS\f[R] section below). -.TP \f[I]files\&...\f[R] The input files to search. If no input files are provided and data was piped in, that data will be @@ -113,19 +92,22 @@ used instead. If neither are provided, \f[B]bp\f[R] will search through all files in the current directory and its subdirectories (recursively). .SH STRING PATTERNS -.PP One of the most common use cases for pattern matching tools is matching plain, literal strings, or strings that are primarily plain strings, with one or two patterns. \f[B]bp\f[R] is designed around this fact. The default mode for bp patterns is \[lq]string pattern mode\[rq]. In string pattern mode, all characters are interpreted literally except -for the backslash (\f[B]\[rs]\f[R]), which may be followed by an escape -or a bp pattern (see the \f[B]PATTERNS\f[R] section below). -Optionally, the bp pattern may be terminated by a semicolon -(\f[B];\f[R]). +for curly braces \f[B]{}\f[R], which mark a region of BP syntax patterns +(see the \f[B]PATTERNS\f[R] section below). +In other words, when passing a search query to \f[B]bp\f[R], you do not +need to escape periods, quotation marks, backslashes, or any other +character, as long as it fits inside a shell string literal. +In order to match a literal \f[B]{\f[R], you can either search for the +character literal: \f[B]{\[ga]{}\f[R], the string literal: +\f[B]{\[dq]{\[dq]}\f[R], or a pair of matching curly braces using the +\f[B]braces\f[R] rule: \f[B]{braces}\f[R]. .SH PATTERNS -.PP \f[B]bp\f[R] patterns are based off of a combination of Parsing Expression Grammars and regular expression syntax. 
The syntax is designed to map closely to verbal descriptions of the @@ -146,7 +128,7 @@ A choice: \f[I]pat1\f[R], or if it doesn\[aq]t match, then \f[I]pat2\f[R] .TP \f[B].\f[R] -Any character (excluding newline) +The period pattern matches a single character (excluding newline) .TP \f[B]\[ha]\f[R] Start of a line @@ -227,11 +209,14 @@ A word boundary (i.e.\ the edge of a word). \f[B]\[rs]b\f[R] Alias for \f[B]|\f[R] (word boundary) .TP +\f[B](\f[R] \f[I]pat\f[R] \f[B])\f[R] +Parentheses can be used to delineate patterns, as in most languages. +.TP \f[B]!\f[R] \f[I]pat\f[R] -Not \f[I]pat\f[R] +Not \f[I]pat\f[R] (don\[cq]t match if \f[I]pat\f[R] matches here) .TP \f[B][\f[R] \f[I]pat\f[R] \f[B]]\f[R] -Maybe \f[I]pat\f[R] +Maybe \f[I]pat\f[R] (match zero or one occurrences of \f[I]pat\f[R]) .TP \f[I]N\f[R] \f[I]pat\f[R] Exactly \f[I]N\f[R] repetitions of \f[I]pat\f[R] (e.g.\ \f[B]5 @@ -253,7 +238,7 @@ Any \f[I]pat\f[R]s (zero or more, e.g.\ \f[B]* \[dq]x\[dq]\f[R] matches etc.) .TP \f[B]+\f[R] \f[I]pat\f[R] -Some \f[I]pat\f[R]s (e.g.\ \f[B]+ \[dq]x\[dq]\f[R] matches +Some \f[I]pat\f[R]s (one or more, e.g.\ \f[B]+ \[dq]x\[dq]\f[R] matches \f[B]\[lq]x\[rq]\f[R], \f[B]\[lq]xx\[rq]\f[R], \f[B]\[lq]xxx\[rq]\f[R], etc.) .TP @@ -263,14 +248,18 @@ etc.) comma-separated words) .TP \f[B]..\f[R] \f[I]pat\f[R] -Any text (except newlines) up to and including \f[I]pat\f[R] +Any text (except newlines) up to and including \f[I]pat\f[R]. +This is a non-greedy match and does not span newlines. .TP \f[B].. %\f[R] \f[I]skip\f[R] \f[I]pat\f[R] Any text (except newlines) up to and including \f[I]pat\f[R], skipping over instances of \f[I]skip\f[R] (e.g.\ \f[B]\[aq]\[dq]\[aq] \&..%(\[aq]\[rs]\[aq] .) \[aq]\[dq]\[aq]\f[R] opening quote, up to closing quote, skipping over -backslash followed by a single character) +backslash followed by a single character).
+A useful application of the \f[B]%\f[R] operator is to skip over +newlines to perform multi-line matches, e.g.\ \f[B]pat1 ..%\[rs]n +pat2\f[R] .TP \f[B].. =\f[R] \f[I]only\f[R] \f[I]pat\f[R] Any number of repetitions of the pattern \f[I]only\f[R] up to and @@ -285,21 +274,14 @@ pat\f[R] \f[B]<\f[R] \f[I]pat\f[R] Matches at the current position if \f[I]pat\f[R] matches immediately before the current position (lookbehind). -Conceptually, you can think of this as creating a file containing only -the \f[I]N\f[R] characters immediately before the current position and -attempting to match \f[I]pat\f[R] on that file, for all values of -\f[I]N\f[R] from the minimum number of characters \f[I]pat\f[R] can -match up to maximum number of characters \f[I]pat\f[R] can match (or the -length of the current line upto the current position, whichever is -smaller). \f[B]Note:\f[R] For fixed-length lookbehinds, this is quite efficient -(e.g.\ \f[B]<(100 \[dq]x\[dq])\f[R]), however this could cause -performance problems with variable-length lookbehinds -(e.g.\ \f[B]<(\[dq]x\[dq] 0-100\[dq]y\[dq])\f[R]). -Also, it is worth noting that \f[B]\[ha]\f[R], \f[B]\[ha]\[ha]\f[R], -\f[B]$\f[R], and \f[B]$$\f[R] all match against the edges of the slice, -which may give false positives if you were expecting them to match only -against the edges file or line. +(e.g.\ \f[B]<(100 \[dq]x\[dq])\f[R]), however this can cause performance +problems with variable-length lookbehinds (e.g.\ \f[B]<(\[dq]x\[dq] +0-100\[dq]y\[dq])\f[R]). +Also, patterns like \f[B]\[ha]\f[R], \f[B]\[ha]\[ha]\f[R], \f[B]$\f[R], +and \f[B]$$\f[R] that match against line/file edges will match against +the edge of the lookbehind window, so they should generally be avoided +in lookbehinds. .TP \f[B]>\f[R] \f[I]pat\f[R] Matches \f[I]pat\f[R], but does not consume any input (lookahead). @@ -319,7 +301,7 @@ See the \f[B]GRAMMAR FILES\f[R] section for more info. 
\f[B]\[at]\f[R] \f[I]name\f[R] \f[B]:\f[R] \f[I]pat\f[R] For the rest of the current chain, define \f[I]name\f[R] to match whatever \f[I]pat\f[R] matches, i.e.\ a backreference. -For example, \f[B]\[at]foo:word \[ga]( foo \[ga])\f[R] (matches +For example, \f[B]\[at]my-word:word \[ga]( my-word \[ga])\f[R] (matches \f[B]\[lq]asdf(asdf)\[rq]\f[R] or \f[B]\[lq]baz(baz)\[rq]\f[R], but not \f[B]\[lq]foo(baz)\[rq]\f[R]) .TP @@ -343,17 +325,21 @@ series of words, a colon, a newline, a tab, and then the first word. \f[I]pat1\f[R] \f[B]\[ti]\f[R] \f[I]pat2\f[R] Matches when \f[I]pat1\f[R] matches and \f[I]pat2\f[R] can be found within the text of that match. -(e.g.\ \f[B]comment \[ti] {TODO}\f[R] matches comments that contain the -word \f[B]\[lq]TODO\[rq]\f[R]) +(e.g.\ \f[B]comment \[ti] \[dq]TODO\[dq]\f[R] matches comments that +contain \f[B]\[lq]TODO\[rq]\f[R]) .TP \f[I]pat1\f[R] \f[B]!\[ti]\f[R] \f[I]pat2\f[R] Matches when \f[I]pat1\f[R] matches, but \f[I]pat2\f[R] can not be found within the text of that match. -(e.g.\ \f[B]comment \[ti] {IGNORE}\f[R] matches only comments that do -not contain the word \f[B]\[lq]IGNORE\[rq]\f[R]) +(e.g.\ \f[B]comment !\[ti] \[dq]IGNORE\[dq]\f[R] matches only comments +that do not contain \f[B]\[lq]IGNORE\[rq]\f[R]) .TP -\f[I]name\f[R]\f[B]:\f[R] \f[I]pat\f[R] -Define \f[I]name\f[R] to mean \f[I]pat\f[R] (pattern definition) +\f[I]name\f[R]\f[B]:\f[R] \f[I]pat1\f[R]; \f[I]pat2\f[R] +Define \f[I]name\f[R] to mean \f[I]pat1\f[R] (pattern definition) inside +the pattern \f[I]pat2\f[R]. +For example, a recursive pattern can be defined and used like this: +\f[B]paren-comment: \[dq](*\[dq] ..%paren-comment \[dq]*)\[dq]; +paren-comment\f[R] .TP \f[B]\[at]:\f[R]\f[I]name\f[R] \f[B]=\f[R] \f[I]pat\f[R] Match \f[I]pat\f[R] and tag it with the given name as metadata.
@@ -364,9 +350,8 @@ Syntactic sugar for \f[I]name\f[R]\f[B]:\f[R] that also attaches a metadata tag of the same name) .TP \f[B]#\f[R] \f[I]comment\f[R] -A line comment +A line comment, ignored by BP .SH GRAMMAR FILES -.PP \f[B]bp\f[R] allows loading extra grammar files, which define patterns which may be used for matching. The \f[B]builtins\f[R] grammar file is loaded by default, and it defines @@ -375,9 +360,36 @@ For example, it defines the \f[B]parens\f[R] rule, which matches pairs of matching parentheses, accounting for nested inner parentheses: .RS .PP -\f[B]bp -p \[aq]\[dq]my_func\[dq] parens\[aq]\f[R] +\f[B]bp \[aq]my_func{parens}\[aq]\f[R] .RE .PP +BP\[cq]s builtin grammar file defines a few other commonly used patterns +such as: +.IP \[bu] 2 +\f[B]braces\f[R] (matching \f[B]{}\f[R] pairs), \f[B]brackets\f[R] +(matching \f[B][]\f[R] pairs), \f[B]anglebraces\f[R] (matching +\f[B]<>\f[R] pairs) +.IP \[bu] 2 +\f[B]string\f[R]: a single- or double-quote delimited string, including +standard escape sequences +.IP \[bu] 2 +\f[B]id\f[R] or \f[B]var\f[R]: an identifier (full UTF-8 support) +.IP \[bu] 2 +\f[B]word\f[R]: similar to \f[B]id\f[R]/\f[B]var\f[R], but can start +with a number +.IP \[bu] 2 +\f[B]Hex\f[R], \f[B]hex\f[R], \f[B]HEX\f[R]: a mixed-case, lowercase, or +uppercase hex digit +.IP \[bu] 2 +\f[B]digit\f[R]: a digit from 0-9 +.IP \[bu] 2 +\f[B]int\f[R]: one or more digits +.IP \[bu] 2 +\f[B]number\f[R]: an int or floating point literal +.IP \[bu] 2 +\f[B]esc\f[R], \f[B]tab\f[R], \f[B]nl\f[R], \f[B]cr\f[R], +\f[B]crlf\f[R], \f[B]lf\f[R]: Shorthand for escape sequences +.PP \f[B]bp\f[R] also comes with a few grammar files for common programming languages, which may be loaded on demand. 
These grammar files are not comprehensive syntax definitions, but only @@ -389,35 +401,35 @@ Thus, you can find all comments with the word \[lq]TODO\[rq] with the following command: .RS .PP -\f[B]bp -g c++ -p \[aq]comment \[ti] {TODO}\[aq] *.cpp\f[R] +\f[B]bp -g c++ \[aq]{comment \[ti] \[dq]TODO\[dq]}\[aq] *.cpp\f[R] .RE .SH EXAMPLES -.PP -Find files containing the string \[lq]foo\[rq] (a string pattern): +Find files containing the literal string \[lq]foo.baz\[rq] (a string +pattern): .RS .PP -\f[B]ls | bp foo\f[R] +\f[B]ls | bp foo.baz\f[R] .RE .PP Find files ending with \[lq].c\[rq] and print the name with the \[lq].c\[rq] replaced with \[lq].h\[rq]: .RS .PP -\f[B]ls | bp \[aq].c\[rs]$\[aq] -r \[aq].h\[aq]\f[R] +\f[B]ls | bp \[aq].c{$}\[aq] -r \[aq].h\[aq]\f[R] .RE .PP Find the word \[lq]foobar\[rq], followed by a pair of matching parentheses in the file \f[I]my_file.py\f[R]: .RS .PP -\f[B]bp -p \[aq]{foobar} parens\[aq] my_file.py\f[R] +\f[B]bp \[aq]foobar{parens}\[aq] my_file.py\f[R] .RE .PP Using the \f[I]html\f[R] grammar, find all \f[I]element\f[R]s matching the tag \f[I]a\f[R] in the file \f[I]foo.html\f[R]: .RS .PP -\f[B]bp -g html -p \[aq]element \[ti] (\[ha]\[ha]\[dq]` *pat* : Matches *pat*, but does not consume any input (lookahead). @@ -256,50 +257,51 @@ against the edges file or line. `foo` : The named pattern whose name is **"foo"**. Pattern names come from -definitions in grammar files or from named captures. Pattern names may contain -dashes (`-`), but not underscores (`_`), since the underscore is used to match -whitespace. See the **GRAMMAR FILES** section for more info. + definitions in grammar files or from named captures. Pattern names may + contain dashes (`-`), but not underscores (`_`), since the underscore is used + to match whitespace. See the **GRAMMAR FILES** section for more info. `@` *name* `:` *pat* : For the rest of the current chain, define *name* to match whatever *pat* -matches, i.e. a backreference. 
For example, `` @foo:word `( foo `) `` (matches -**"asdf(asdf)"** or **"baz(baz)"**, but not **"foo(baz)"**) + matches, i.e. a backreference. For example, `` @my-word:word `( my-word `) `` + (matches **"asdf(asdf)"** or **"baz(baz)"**, but not **"foo(baz)"**) `@` *name* `=` *pat* -: Let *name* equal *pat* (named capture). Named captures can be used in -text replacements. +: Let *name* equal *pat* (named capture). Named captures can be used in text + replacements. *pat* `=>` `"`*replacement*`"` : Replace *pat* with *replacement*. Note: *replacement* should be a string -(single or double quoted), and it may contain escape sequences (e.g. `\n`) or -references to captured values: `@0` (the whole of *pat*), `@1` (the first -capture in *pat*), `@`*foo* (the capture named *foo* in *pat*), etc. For -example, `@word _ @rest=(*word % _) => "@rest:\n\t@1"` matches a word followed -by whitespace, followed by a series of words and replaces it with the series -of words, a colon, a newline, a tab, and then the first word. + (single or double quoted), and it may contain escape sequences (e.g. `\n`) or + references to captured values: `@0` (the whole of *pat*), `@1` (the first + capture in *pat*), `@`*foo* (the capture named *foo* in *pat*), etc. For + example, `@word _ @rest=(*word % _) => "@rest:\n\t@1"` matches a word + followed by whitespace, followed by a series of words and replaces it with + the series of words, a colon, a newline, a tab, and then the first word. *pat1* `~` *pat2* : Matches when *pat1* matches and *pat2* can be found within the text of that -match. (e.g. `comment ~ {TODO}` matches comments that contain the word -**"TODO"**) + match. (e.g. `comment ~ "TODO"` matches comments that contain **"TODO"**) *pat1* `!~` *pat2* : Matches when *pat1* matches, but *pat2* can not be found within the text of -that match. (e.g. `comment ~ {IGNORE}` matches only comments that do not -contain the word **"IGNORE"**) + that match. (e.g. 
`comment !~ "IGNORE"` matches only comments that do not + contain **"IGNORE"**) -*name*`:` *pat* -: Define *name* to mean *pat* (pattern definition) +*name*`:` *pat1*; *pat2* +: Define *name* to mean *pat1* (pattern definition) inside the pattern *pat2*. + For example, a recursive pattern can be defined and used like this: + `paren-comment: "(*" ..%paren-comment "*)"; paren-comment` `@:`*name* `=` *pat* : Match *pat* and tag it with the given name as metadata. *name*`::` *pat* : Syntactic sugar for *name*`:` `@:`*name*`=`*pat* (define a pattern that also -attaches a metadata tag of the same name) + attaches a metadata tag of the same name) `#` *comment* -: A line comment +: A line comment, ignored by BP # GRAMMAR FILES @@ -311,9 +313,23 @@ defines a few useful general-purpose patterns. For example, it defines the nested inner parentheses: ``` -bp -p '"my_func" parens' +bp 'my_func{parens}' ``` +BP's builtin grammar file defines a few other commonly used patterns such as: + +- `braces` (matching `{}` pairs), `brackets` (matching `[]` pairs), + `anglebraces` (matching `<>` pairs) +- `string`: a single- or double-quote delimited string, including standard + escape sequences +- `id` or `var`: an identifier (full UTF-8 support) +- `word`: similar to `id`/`var`, but can start with a number +- `Hex`, `hex`, `HEX`: a mixed-case, lowercase, or uppercase hex digit +- `digit`: a digit from 0-9 +- `int`: one or more digits +- `number`: an int or floating point literal +- `esc`, `tab`, `nl`, `cr`, `crlf`, `lf`: Shorthand for escape sequences + **bp** also comes with a few grammar files for common programming languages, which may be loaded on demand. These grammar files are not comprehensive syntax definitions, but only some common patterns. For example, the c++ grammar file @@ -322,31 +338,31 @@ block comments.
Thus, you can find all comments with the word "TODO" with the following command: ``` -bp -g c++ -p 'comment ~ {TODO}' *.cpp +bp -g c++ '{comment ~ "TODO"}' *.cpp ``` # EXAMPLES -Find files containing the string "foo" (a string pattern): +Find files containing the literal string "foo.baz" (a string pattern): ``` -ls | bp foo +ls | bp foo.baz ``` Find files ending with ".c" and print the name with the ".c" replaced with ".h": ``` -ls | bp '.c\$' -r '.h' +ls | bp '.c{$}' -r '.h' ``` Find the word "foobar", followed by a pair of matching parentheses in the file *my_file.py*: ``` -bp -p '{foobar} parens' my_file.py +bp 'foobar{parens}' my_file.py ``` Using the *html* grammar, find all *element*s matching the tag *a* in the file *foo.html*: ``` -bp -g html -p 'element ~ (^^" provide a pattern (equivalent to bp '\\()')\n" " -w --word find words matching the given string pattern\n" " -r --replace replace the input pattern with the given replacement\n" " -s --skip skip over the given pattern when looking for matches\n" @@ -383,7 +382,7 @@ static int print_matches(FILE *out, file_t *f, pat_t *pattern, pat_t *defs) // Print trailing context if needed: if (matches > 0) { fprint_context(out, f, prev, NULL); - if (last_line_num < 0) { // Hacky fix to ensure line number gets printed for `bp -p '$$'` + if (last_line_num < 0) { // Hacky fix to ensure line number gets printed for `bp '{$$}'` fprint_linenum(out, f, f->nlines, print_opts.normal_color); fputc('\n', out); } @@ -599,9 +598,6 @@ int main(int argc, char *argv[]) if (f == NULL) errx(EXIT_FAILURE, "Couldn't find grammar: %s", flag); defs = load_grammar(defs, f); // Keep in memory for debug output - } else if (FLAG("-p") || FLAG("--pattern")) { - pat_t *p = assert_pat(flag, NULL, bp_pattern(flag, flag+strlen(flag))); - pattern = chain_together(pattern, p); } else if (FLAG("-w") || FLAG("--word")) { require(asprintf(&flag, "\\|%s\\|", flag), "Could not allocate memory"); file_t *arg_file = spoof_file(&loaded_files, "", flag, 
-1); diff --git a/pattern.c b/pattern.c index 75b21b3..150b638 100644 --- a/pattern.c +++ b/pattern.c @@ -97,8 +97,8 @@ static pat_t *expand_replacements(pat_t *replace_pat, const char *end, bool allo const char *repstr; size_t replen; if (matchchar(&str, '"', allow_nl, end) || matchchar(&str, '\'', allow_nl, end) - || matchchar(&str, '{', allow_nl, end) || matchchar(&str, '\002', allow_nl, end)) { - char closequote = str[-1] == '{' ? '}' : (str[-1] == '\002' ? '\003' : str[-1]); + || matchchar(&str, '}', allow_nl, end) || matchchar(&str, '\002', allow_nl, end)) { + char closequote = str[-1] == '}' ? '{' : (str[-1] == '\002' ? '\003' : str[-1]); repstr = str; for (; str < end && *str != closequote; str = next_char(str, end)) { if (*str == '\\') { @@ -235,8 +235,7 @@ static pat_t *_bp_simplepattern(const char *str, const char *end, bool inside_st } pat_t *target; if (inside_stringpattern) { - maybe_pat_t maybe_target = bp_stringpattern(str, end); - target = maybe_target.success ? maybe_target.value.pat : NULL; + target = NULL; } else { target = bp_simplepattern(str, end); } @@ -332,8 +331,8 @@ static pat_t *_bp_simplepattern(const char *str, const char *end, bool inside_st return Pattern(BP_WORD_BOUNDARY, start, str, 0, 0); } // String literal - case '"': case '\'': case '\002': case '{': { - char endquote = c == '\002' ? '\003' : (c == '{' ? '}' : c); + case '"': case '\'': case '\002': case '}': { + char endquote = c == '\002' ? '\003' : (c == '}' ? '{' : c); char *litstart = (char*)str; while (str < end && *str != endquote) str = next_char(str, end); @@ -494,42 +493,26 @@ static pat_t *_bp_simplepattern(const char *str, const char *end, bool inside_st } // -// Similar to bp_simplepattern, except that the pattern begins with an implicit, unclosable quote. 
+// Similar to bp_simplepattern, except that the pattern begins with an implicit +// '}' open quote that can be closed with '{' // maybe_pat_t bp_stringpattern(const char *str, const char *end) { __TRY_PATTERN__ if (!end) end = str + strlen(str); - pat_t *ret = NULL; - while (str < end) { - char *start = (char*)str; - pat_t *interp = NULL; - for (; str < end; str = next_char(str, end)) { - if (*str == '\\' && str+1 < end) { - if (str[1] == '\\' || isalnum(str[1])) - interp = _bp_simplepattern(str, end, true); - else - interp = _bp_simplepattern(str+1, end, true); - if (interp) break; - // If there is no interpolated value, this is just a plain ol' regular backslash - } - } - // End of string - size_t len = (size_t)(str - start); - if (len > 0) { - pat_t *str_chunk = Pattern(BP_STRING, start, str, len, (ssize_t)len, .string=start); - ret = chain_together(ret, str_chunk); - } - if (interp) { - ret = chain_together(ret, interp); - str = interp->end; - // allow terminating seq - (void)matchchar(&str, ';', false, end); - } + char *start = (char*)str; + while (str < end && *str != '{') + str = next_char(str, end); + size_t len = (size_t)(str - start); + pat_t *pat = Pattern(BP_STRING, start, str, len, (ssize_t)len, .string=start); + str += 1; + if (str < end) { + pat_t *interp = bp_pattern_nl(str, end, true); + if (interp) + pat = chain_together(pat, interp); } - if (!ret) ret = Pattern(BP_STRING, str, str, 0, 0); __END_TRY_PATTERN__ - return (maybe_pat_t){.success = true, .value.pat = ret}; + return (maybe_pat_t){.success = true, .value.pat = pat}; } // diff --git a/tests/01-literal.sh b/tests/01-literal.sh index 618dc4b..ddfe1a9 100644 --- a/tests/01-literal.sh +++ b/tests/01-literal.sh @@ -1,3 +1,3 @@ # Use double quotation marks to match literal strings -# Example: bp -p '"baz"' -bp -p '"foo"' +# Example: bp '{ "baz" }' +bp '{ "foo" }' diff --git a/tests/02-char.sh b/tests/02-char.sh index e630e45..1493fa6 100644 --- a/tests/02-char.sh +++ b/tests/02-char.sh @@ -1,3 
+1,3 @@ # Match a single character with backtick: -# Example: bp -p '`A' matches the letter "A" -bp -p '`x' +# Example: bp '{`A}' matches the letter "A" +bp '{`x}' diff --git a/tests/03-char-range.sh b/tests/03-char-range.sh index 7904466..36ae483 100644 --- a/tests/03-char-range.sh +++ b/tests/03-char-range.sh @@ -1,3 +1,3 @@ # Character sets and ranges work with backticks -# Example: bp -p '`a-z,A-Z' matches all lowercase and uppercase letters -bp -p '`0-9,a-f' +# Example: bp '{`a-z,A-Z}' matches all lowercase and uppercase letters +bp '{`0-9,a-f}' diff --git a/tests/04-sequence.sh b/tests/04-sequence.sh index dc61eeb..3d8df71 100644 --- a/tests/04-sequence.sh +++ b/tests/04-sequence.sh @@ -1,5 +1,5 @@ # Multiple patterns in a row represent a sequence. # bp pattern syntax mostly doesn't care about whitespace, so you can have # spaces between patterns if you want, but it's not required. -# Example: bp -p '"foo" `0-9' matches "foo1", "foo2", etc. -bp -p '"one" "two"' +# Example: bp '{"foo" `0-9}' matches "foo1", "foo2", etc. +bp '{"one" "two"}' diff --git a/tests/05-dot.sh b/tests/05-dot.sh index ec5c9e7..c963aed 100644 --- a/tests/05-dot.sh +++ b/tests/05-dot.sh @@ -1,3 +1,3 @@ # The dot matches a single character -# Example: bp -p '.' -bp -p '`a .' 
+# Example: bp '{.}' +bp '{`a .}' diff --git a/tests/06-start-of-line.sh b/tests/06-start-of-line.sh index c44000e..c8e5504 100644 --- a/tests/06-start-of-line.sh +++ b/tests/06-start-of-line.sh @@ -1,3 +1,3 @@ # ^ matches start of a line -# Example: bp -p '^ "x"' matches lines starting with "x" -bp -p '^ "foo"' +# Example: bp '{^ "x"}' matches lines starting with "x" +bp '{^ "foo"}' diff --git a/tests/07-end-of-line.sh b/tests/07-end-of-line.sh index 4a1e05d..5954c10 100644 --- a/tests/07-end-of-line.sh +++ b/tests/07-end-of-line.sh @@ -1,3 +1,3 @@ # $ matches end of line -# Example: bp -p '"x" $' matches lines ending with "x" -bp -p '"foo" $' +# Example: bp '{"x" $}' matches lines ending with "x" +bp '{"foo" $}' diff --git a/tests/08-spaces.sh b/tests/08-spaces.sh index cc16d96..a6a62c9 100644 --- a/tests/08-spaces.sh +++ b/tests/08-spaces.sh @@ -1,3 +1,3 @@ # The _ pattern matches zero or more spaces/tabs -# Example: bp -p '`= _ "foo"' matches "=foo", "= foo", "= foo", etc. -bp -p '"one" _ "two"' +# Example: bp '{`= _ "foo"}' matches "=foo", "= foo", "= foo", etc. +bp '{"one" _ "two"}' diff --git a/tests/09-ellipsis.sh b/tests/09-ellipsis.sh index 0763fd1..c755ab6 100644 --- a/tests/09-ellipsis.sh +++ b/tests/09-ellipsis.sh @@ -1,3 +1,3 @@ # The ellipsis matches text upto the following pattern, not counting newlines -# Example: bp -p '"/*" .. "*/"' matches "/* blah blah */" or "/**/" -bp -p '"hello" .. "world"' +# Example: bp '{"/*" .. "*/"}' matches "/* blah blah */" or "/**/" +bp '{"hello" .. 
"world"}' diff --git a/tests/10-words.sh b/tests/10-words.sh index c5f5193..afb064d 100644 --- a/tests/10-words.sh +++ b/tests/10-words.sh @@ -1,3 +1,3 @@ # The | operator matches word edges -# Example: bp -p '|"baz"|' matches the word "baz" -bp '\|foo\|' +# Example: bp '{|"baz"|}' matches the word "baz" +bp '{|}foo{|}' diff --git a/tests/11-ordered-choice.sh b/tests/11-ordered-choice.sh index 277741d..4334643 100644 --- a/tests/11-ordered-choice.sh +++ b/tests/11-ordered-choice.sh @@ -1,5 +1,5 @@ # The ordered choice operator (/) picks the first choice that matches -# Example: bp -p '"cabaret"/"cab"' matches either "cabaret" or "cab" +# Example: bp '{"cabaret"/"cab"}' matches either "cabaret" or "cab" # Note: if a match occurs, the options to the right will *never* be attempted, -# so bp -p '"cab"/"cabaret"' will always match "cab" instead of "cabaret" -bp -p '"foo" / "bar"' +# so bp '{"cab"/"cabaret"}' will always match "cab" instead of "cabaret" +bp '{"foo" / "bar"}' diff --git a/tests/12-star.sh b/tests/12-star.sh index bd2d578..f0a9de9 100644 --- a/tests/12-star.sh +++ b/tests/12-star.sh @@ -1,3 +1,3 @@ # The star (*) prefix operator matches zero or more repetitions -# Example: bp -p '"Ha" *"ha"' will match "Ha", "Haha", "Hahaha", etc. -bp -p '`( *`x `)' +# Example: bp '{"Ha" *"ha"}' will match "Ha", "Haha", "Hahaha", etc. +bp '{`( *`x `)}' diff --git a/tests/13-plus.sh b/tests/13-plus.sh index c2aa7cc..7ac39e7 100644 --- a/tests/13-plus.sh +++ b/tests/13-plus.sh @@ -1,3 +1,3 @@ # The plus (+) prefix operator matches one or more of a pattern -# Example: bp -p '"l" +"ol"' will match "lol", "lolol", "lololol", etc. -bp -p '`( +`x `)' +# Example: bp '{"l" +"ol"}' will match "lol", "lolol", "lololol", etc. 
+bp '{`( +`x `)}' diff --git a/tests/14-repeat-sep.sh b/tests/14-repeat-sep.sh index 879ec4b..f61a4bc 100644 --- a/tests/14-repeat-sep.sh +++ b/tests/14-repeat-sep.sh @@ -1,3 +1,3 @@ # The '%' operator modifies repeating patterns, allowing you to give a separator between matches -# Example: bp -p '+"x" % ":"' will match "x", "x:x", "x:x:x", etc. -bp -p '`( +int % `, `)' +# Example: bp '{+"x" % ":"}' will match "x", "x:x", "x:x:x", etc. +bp '{`( +int % `, `)}' diff --git a/tests/15-repeating.sh b/tests/15-repeating.sh index 115e791..7672195 100644 --- a/tests/15-repeating.sh +++ b/tests/15-repeating.sh @@ -1,3 +1,3 @@ # Numbers allow you to specify repetitions of a pattern -# Example: bp -p '3 "x"' matches "xxx" -bp -p '`( 4 . `)' +# Example: bp '{3 "x"}' matches "xxx" +bp '{`( 4 . `)}' diff --git a/tests/16-lookahead.sh b/tests/16-lookahead.sh index 18761bd..4b8d5f8 100644 --- a/tests/16-lookahead.sh +++ b/tests/16-lookahead.sh @@ -1,3 +1,3 @@ # >pat is a lookahead -# Example: bp -p '"foo" >`(' will match "foo" only when it is followed by a parenthesis -bp -p '>`t word' +# Example: bp '{"foo" >`(}' will match "foo" only when it is followed by a parenthesis +bp '{>`t word}' diff --git a/tests/17-lookbehind.sh b/tests/17-lookbehind.sh index 0b5136a..fe14678 100644 --- a/tests/17-lookbehind.sh +++ b/tests/17-lookbehind.sh @@ -1,3 +1,3 @@ # "replacement" -# Example: bp -p '"foo" => "baz"' matches "foobar" and replaces it with "bazbar" -bp -p '"s" => "$"' +# Example: bp '{"foo" => "baz"}' matches "foobar" and replaces it with "bazbar" +bp '{"s" => "$"}' diff --git a/tests/23-nested-parens.sh b/tests/23-nested-parens.sh index 1bf872e..6b3c01b 100644 --- a/tests/23-nested-parens.sh +++ b/tests/23-nested-parens.sh @@ -1,3 +1,3 @@ # parens is a pattern matching nested parentheses -# Example: bp -p '"foo" parens' matches "foo()" or "foo(baz(), qux())", but not "foo(()" -bp -p 'id parens `;' +# Example: bp '{"foo" parens}' matches "foo()" or "foo(baz(), qux())", but not 
"foo(()" +bp '{id parens `;}' diff --git a/tests/24-backref.sh b/tests/24-backref.sh index 28a203b..2a7d89b 100644 --- a/tests/24-backref.sh +++ b/tests/24-backref.sh @@ -1,3 +1,3 @@ # With @-capturing, you can reference previous captures -# Example: bp -p '@first=`a-z .. first' matches "aba" and "xyzx", but not "abc" -bp -p '@first:+Abc _ +Abc _ first' +# Example: bp '{@first=`a-z .. first}' matches "aba" and "xyzx", but not "abc" +bp '{@first:+Abc _ +Abc _ first}' diff --git a/tests/25-replace-capture.sh b/tests/25-replace-capture.sh index f401605..70fc889 100644 --- a/tests/25-replace-capture.sh +++ b/tests/25-replace-capture.sh @@ -1,4 +1,4 @@ # Captures with @ can be referenced in a replacement by @1, @2, etc. -# Example: bp -p '"=" _ @+`0-9 => "= -@1"' replaces "x = 5" with "x = -5" -# Note: @0 refers to the entire match, e.g. bp -p '"foo" => "xx@0xx"' replaces "foo" with "xxfooxx" -bp -p '@`a,e,i,o,u => "{@1}" / .' +# Example: bp '{"=" _ @+`0-9 => "= -@1"}' replaces "x = 5" with "x = -5" +# Note: @0 refers to the entire match, e.g. 
bp '{"foo" => "xx@0xx"}' replaces "foo" with "xxfooxx" +bp '{@`a,e,i,o,u => "{@1}" / .}' diff --git a/tests/26-null-byte.sh b/tests/26-null-byte.sh index 2c825c3..84996a8 100644 --- a/tests/26-null-byte.sh +++ b/tests/26-null-byte.sh @@ -1,2 +1,2 @@ # Null bytes can be matched with \x00 -bp -p '\x00' +bp '{\x00}' diff --git a/tests/27-left-recursion.sh b/tests/27-left-recursion.sh index ec11cab..3472fb6 100644 --- a/tests/27-left-recursion.sh +++ b/tests/27-left-recursion.sh @@ -1,2 +1,2 @@ # Left recursion should work -bp -p 'xys: (xys / `x) `y; xys => "{@0}"' +bp '{xys: (xys / `x) `y; xys => "{@0}"}' diff --git a/tests/28-left-recursion2.sh b/tests/28-left-recursion2.sh index 995f82c..e348340 100644 --- a/tests/28-left-recursion2.sh +++ b/tests/28-left-recursion2.sh @@ -1,2 +1,2 @@ # Left recursion has some tricky edge cases like this: -bp -p 'foo: (foo / `a-z) (foo / `a-z) `!; foo => "{@0}"' +bp '{foo: (foo / `a-z) (foo / `a-z) `!; foo => "{@0}"}' diff --git a/tests/29-left-recursion3.sh b/tests/29-left-recursion3.sh index 7de7379..936ca78 100644 --- a/tests/29-left-recursion3.sh +++ b/tests/29-left-recursion3.sh @@ -1,2 +1,2 @@ # Left recursion has some tricky edge cases like this: -bp -p 'shout: phrase `!; phrase: shout / id; phrase => "{@0}"' +bp '{shout: phrase `!; phrase: shout / id; phrase => "{@0}"}' diff --git a/tutorial.sh b/tutorial.sh index 70f87dd..7e66132 100755 --- a/tutorial.sh +++ b/tutorial.sh @@ -12,12 +12,12 @@ for t in $([ $# -gt 0 ] && echo "$@" || ls -v tests/*.sh); do printf "\033[33;1mGiven these lines: Give this output:\033[m\n" diff -y -W60 --color=always "${t/.sh/.in}" "${t/.sh/.out}" while true; do - printf "\n\033[1m$ bp --pattern ''\033[D\033[m" + printf "\n\033[1m$ bp \033[m" read -r pat pat="${pat%\'}" - printf "\033[0;2mRunning: \033[32m%s\033[m\n\n" "bp -p '$pat'" + printf "\033[0;2mRunning: \033[32m%s\033[m\n\n" "bp '$pat'" printf "\033[33;1mExpected output: Your pattern's output:\033[m\n" - bp -p "$pat" < "${t/.sh/.in}" 
2>"$tmpfile" | diff -y -W60 --color=always "${t/.sh/.out}" - && break + bp "$pat" < "${t/.sh/.in}" 2>"$tmpfile" | diff -y -W60 --color=always "${t/.sh/.out}" - && break cat "$tmpfile" printf "\n\033[0;1;31mSorry, try again!\033[m\n" done -- cgit v1.2.3