From 4135115229d27c54b70cd945e2211e652ab58d2f Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Sun, 13 Sep 2020 23:31:38 -0700 Subject: Spruced up a bunch of stuff, tweaked the grammar, added docs --- bpeg.1 | 162 +++++++++++++++++++++++++++++++++++++++++++------ bpeg.bpeg | 5 +- bpeg.c | 46 +++----------- compiler.c | 11 ---- grammars/builtins.bpeg | 10 +-- grammars/html.bpeg | 8 +-- types.h | 3 +- vm.c | 25 +++----- 8 files changed, 175 insertions(+), 95 deletions(-) diff --git a/bpeg.1 b/bpeg.1 index 3dfb806..4f2c18c 100644 --- a/bpeg.1 +++ b/bpeg.1 @@ -7,43 +7,164 @@ bpeg \- Bruce's Parsing Expression Grammar tool .B bpeg [\fI-h\fR|\fI--help\fR] [\fI-v\fR|\fI--verbose\fR] +[\fI-p\fR|\fI--pattern\fR \fI\fR] +[\fI-P\fR|\fI--pattern-string\fR \fI\fR] [\fI-d\fR|\fI--define\fR \fI\fR=\fI\fR] +[\fI-D\fR|\fI--define-string\fR \fI\fR=\fI\fR] [\fI-r\fR|\fI--replace\fR \fI\fR] [\fI-g\fR|\fI--grammar\fR \fI\fR] +[\fI-m\fR|\fI--mode\fR \fI\fR] \fI\fR] +[[--] \fI\fR] .SH DESCRIPTION \fBbpeg\fR is a tool that matches parsing expression grammars using a custom syntax. .SH OPTIONS -.B \--verbose +.B \-v\fR, \fB--verbose Print debugging information. -.B \--define = -Define a grammar rule. +.B \-d\fR, \fB--define \fI\fR=\fI\fR +Define a grammar rule using a bpeg pattern. -.B \--replace +.B \-D\fR, \fB--define-string \fI\fR=\fI\fR +Define a grammar rule using a bpeg string pattern. + +.B \-r\fR, \fB--replace \fI\fR Replace all occurrences of the main pattern with the given string. -.B \--grammar +.B \-g\fR, \fB--grammar \fI\fR Load the grammar from the given file. +.B \-m\fR, \fB--mode \fI\fR +The mode to operate in. Options are: \fIfind-all\fR (the default), +\fIonly-matches\fR, \fIpattern\fR, \fIreplacement\fR, \fIreplace-all\fR +(implied by \fB--replace\fR), or any other grammar rule name. + .B \--help Print the usage and exit. -.B -The main pattern for bpeg to match. By default, this pattern -is in "string literal" mode (i.e. a backslash is requres for -non-literal patterns). 
The default mode is to find \fBall\fR -occurrences of the pattern and highlight them. +.B +The main pattern for bpeg to match. By default, this pattern is a string +pattern (see the \fBSTRING PATTERNS\fR section below). + +.B +The input files to search. If no input files are provided and data was +piped in, that data will be used instead. If neither are provided, +\fBbpeg\fR will search through all files in the current directory and +its subdirectories (recursively). + +.SH PATTERNS +Bpeg patterns are based off of a combination of Parsing Expression Grammars +and regular expression syntax. The syntax is designed to map closely to +verbal descriptions of the patterns, and prefix operators are preferred over +suffix operators (as is common in regex syntax). + +Some patterns additionally have "multi-line" variants, which means that they +include the newline character. + +.I +A chain of patterns, pronounced \fI\fB-then-\fI\fR + +.I \fB/\fI \fR +A series of ordered choices (if one pattern matches, the following patterns +will not be attempted), pronounced \fI\fB-or-\fI\fR + +.B .. +Any text \fBup-to\fR the following pattern, if any (multiline: \fB...\fR) + +.B . +\fBAny\fR character (multiline: $.) + +.B ^ +\fBStart-of-a-line\fR + +.B ^^ +\fBStart-of-the-text\fR + +.B $ +\fBEnd-of-a-line\fR (does not include newline character) + +.B $$ +\fBEnd-of-the-text\fR + +.B _ +Zero or more \fBwhitespace\fR characters (specifically, spaces and tabs) + +.B __ +Zero or more \fBwhitespace-or-newline\fR characters + +.B `\fI\fR +The literal \fBcharacter-\fI\fR + +.B `\fI\fB-\fI\fR +The \fBcharacter-range-\fI\fB-to-\fI\fR + +.B \\\fI\fR +The \fBescape-sequence-\fI\fR (\fB\\n\fR, \fB\\x1F\fR, \fB\\033\fR, etc.) -.B -The input file to search (default: stdin). 
+.B \\\fI<esc1>\fB-\fI<esc2>\fR +The \fBescape-sequence-range-\fI<esc1>\fB-to-\fI<esc2>\fR + +.B !\fI<pat>\fR +\fBNot-\fI<pat>\fR + +.B \fI<N> <pat>\fR +.B \fI<MIN>\fB-\fI<MAX> <pat>\fR +.B \fI<MIN>\fB+ \fI<pat>\fR +.B \fI<MAX>\fB- \fI<pat>\fR +\fI<MIN>\fB-to-\fI<MAX>\fB-\fI<pat>\fBs\fR (repetitions of a pattern) + +.B *\fI<pat>\fR +\fBAny-\fI<pat>\fBs\fR (zero or more) + +.B +\fI<pat>\fR +\fBSome-\fI<pat>\fBs\fR (one or more) + +.B \fI<pat>\fR \fB%\fI <sep>\fR +\fI<pat>\fB-separated-by-\fI<sep>\fR (equivalent to \fI<pat> +\fB*(\fI<sep><pat>\fB)\fR) + +.B <\fI<pat>\fR +\fBJust-after-\fI<pat>\fR (lookbehind) + +.B >\fI<pat>\fR +\fBJust-before-\fI<pat>\fR (lookahead) + +.B @\fI<pat>\fR +\fBCapture-\fI<pat>\fR + +.B @[\fI<name>\fB]\fI<pat>\fR +\fBLet-\fI<name>\fB-equal-\fI<pat>\fR (named capture) + +.B {\fI<pat>\fB => "\fI<replacement>\fB"} +\fBReplace-\fI<pat>\fB-with-\fI<replacement>\fR. Note: \fI<replacement>\fR should +be a string, and it may contain references to captured values: \fB@0\fR +(the whole of \fI<pat>\fR), \fB@1\fR (the first capture in \fI<pat>\fR), +\fB@[\fIfoo\fR]\fR (the capture named \fIfoo\fR in \fI<pat>\fR), etc. + +.B \fI<pat1>\fB == \fI<pat2>\fR +Will match only if \fI<pat1>\fR and \fI<pat2>\fR both match and have the exact +same length. Pronounced \fI<pat1>\fB-assuming-it-equals-\fI<pat2>\fR + +.B (/) +The empty string (a pattern that always matches). + +.B # \fI<comment>\fR +A comment + +.SH STRING PATTERNS +One of the most common use cases for pattern matching tools is matching plain, +literal strings, or strings that are primarily plain strings, with one or two +patterns. \fBbpeg\fR is designed around this fact. The default mode for bpeg +patterns is "string pattern mode". In string pattern mode, all characters +are interpreted literally except for the backslash (\fB\\\fR), which may be +followed by a bpeg pattern (see the \fBPATTERNS\fR section above). Optionally, +the bpeg pattern may be terminated by a semicolon (\fB;\fR). 
.SH EXAMPLES .TP .B ls | bpeg foo -Find files containing the string "foo" +Find files containing the string "foo" (a string pattern) .TP .B @@ -52,9 +173,16 @@ Find files ending with ".c" and replace the extension with ".h" .TP .B -bpeg -g grammar.bpeg '\\myThing' my_file.txt -Find ocurrences of the grammar rule "myThing" in the file \fBmy_file.txt\fR -using the grammar rules defined in \fBgrammar.bpeg\fR +bpeg -p '"foobar"==id parens' my_file.py +Find the literal string \fB"foobar"\fR, assuming it's a complete identifier, +followed by a pair of matching parentheses in the file \fImy_file.py\fR + +.TP +.B +bpeg -g html -p html-element -D matching-tag=a foo.html +Using the \fIhtml\fR grammar, find all \fIhtml-element\fRs matching +the tag \fIa\fR in the file \fIfoo.html\fR + .SH AUTHOR Bruce Hill (bruce@bruce-hill.com) diff --git a/bpeg.bpeg b/bpeg.bpeg index 39e0f3f..a0bbf19 100644 --- a/bpeg.bpeg +++ b/bpeg.bpeg @@ -16,8 +16,8 @@ Eq-pat = @[first]simple-pat "==" @[second]pat; Empty = `/ >(__ (`)/`})); Dot = `. !`.; String = ( - `" @[s]*(Escape / ~`") `" - / `' @[s]*(Escape / ~`') `' + `" @[s]*(Escape / !`"$.) `" + / `' @[s]*(Escape / !`'$.) `' ); Char-range = `` @[low]. `- @[high].; Char = `` @[s].; @@ -29,7 +29,6 @@ escape-sequence = ( /`a/`b/`e/`n/`r/`t/`v / . / \n ); No = `! _ @pat; -Anything-but = `~ ?`~ _ @pat; Upto = 2-3`. ?>(_@pat); Repeat = ( @[min]int _ `- _ @[max]int diff --git a/bpeg.c b/bpeg.c index 2245ff8..e78c9f6 100644 --- a/bpeg.c +++ b/bpeg.c @@ -1,36 +1,7 @@ /* * bpeg.c - Source code for the bpeg parser * - * Grammar: - * # comment - * .. any text up to the following pattern (if any); (multiline: ...) - * . any character (multiline: $.) - * ^ beginning of a line (^^: beginning of file) - * $ end of a line ($$: end of file) - * _ 0 or more spaces or tabs (__: include newlines and comments) - * ` character - * `- character between and - * \ escape sequence (e.g. \n, \033) - * \- escape sequence range (e.g. \x00-\xF0) - * ! 
no - * ~ any character as long as it doesn't match (multiline: ~~) - * + [% ] or more s (separated by ) - * * [% ] sugar for "0+ [% ]" - * - [% ] or fewer s (separated by ) - * ? sugar for "1- " - * - to (inclusive) s - * < after , ... - * > before , ... - * ( ) - * @ capture - * @ [ ] named - * { => } replaced with - * == iff matches at the same spot for the same length - * "@1" or "@[1]" first capture - * "@foo" or "@[foo]" capture named "foo" - * followed by - * / otherwise - * ; = is defined to be + * See `man ./bpeg.1` for more details */ #include #include @@ -53,8 +24,8 @@ static const char *usage = ( " -v --verbose\t print verbose debugging info\n" " -d --define =\t define a grammar rule\n" " -D --define-string =\t define a grammar rule (string-pattern)\n" - " -e --escaped \t provide an escaped pattern (equivalent to bpeg '\\()')\n" - " -s --string \t provide a string pattern (equivalent to bpeg '', but may be useful if '' begins with a '-')\n" + " -p --pattern \t provide a pattern (equivalent to bpeg '\\()')\n" + " -P --pattern-string \t provide a string pattern (equivalent to bpeg '', but may be useful if '' begins with a '-')\n" " -r --replace replace the input pattern with the given replacement\n" " -m --mode \t set the behavior mode (defult: find-all)\n" " -g --grammar use the specified file as a grammar\n"); @@ -104,14 +75,15 @@ int main(int argc, char *argv[]) { int verbose = 0; char *flag = NULL; + char path[PATH_MAX] = {0}; const char *rule = "find-all"; grammar_t *g = new_grammar(); + // Load builtins: int fd; if ((fd=open("/etc/xdg/bpeg/builtins.bpeg", O_RDONLY)) >= 0) load_grammar(g, readfile(fd)); // Keep in memory for debugging output - char path[PATH_MAX] = {0}; sprintf(path, "%s/.config/bpeg/builtins.bpeg", getenv("HOME")); if ((fd=open(path, O_RDONLY)) >= 0) load_grammar(g, readfile(fd)); // Keep in memory for debugging output @@ -121,10 +93,10 @@ int main(int argc, char *argv[]) if (streq(argv[i], "--")) { ++i; break; - } else if 
(FLAG("--help") || FLAG("-h")) { + } else if (streq(argv[i], "--help") || streq(argv[i], "-h")) { printf("%s\n", usage); return 0; - } else if (FLAG("--verbose") || FLAG("-v")) { + } else if (streq(argv[i], "--verbose") || streq(argv[i], "-v")) { verbose = 1; } else if (FLAG("--replace") || FLAG("-r")) { vm_op_t *p = bpeg_replacement(bpeg_pattern("pattern"), flag); @@ -166,13 +138,13 @@ int main(int argc, char *argv[]) vm_op_t *pat = bpeg_stringpattern(src); check(pat, "Failed to compile pattern"); add_def(g, src, def, pat); - } else if (FLAG("--escaped") || FLAG("-e")) { + } else if (FLAG("--pattern") || FLAG("-p")) { check(npatterns == 0, "Cannot define multiple patterns"); vm_op_t *p = bpeg_pattern(flag); check(p, "Pattern failed to compile: '%s'", flag); add_def(g, flag, "pattern", p); ++npatterns; - } else if (FLAG("--string") || FLAG("-s")) { + } else if (FLAG("--pattern-string") || FLAG("-P")) { vm_op_t *p = bpeg_stringpattern(flag); check(p, "Pattern failed to compile"); add_def(g, flag, "pattern", p); diff --git a/compiler.c b/compiler.c index df34e44..267ea27 100644 --- a/compiler.c +++ b/compiler.c @@ -184,17 +184,6 @@ vm_op_t *bpeg_simplepattern(const char *str) op->args.pat = p; break; } - // Anything but - case '~': { - if (matchchar(&str, '~')) op->multiline = 1; - vm_op_t *p = bpeg_simplepattern(str); - check(p, "Expected pattern after '~'\n"); - str = p->end; - op->op = VM_ANYTHING_BUT; - op->len = -1; - op->args.pat = p; - break; - } // Number of repetitions: (- / - / + / "") case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { diff --git a/grammars/builtins.bpeg b/grammars/builtins.bpeg index c558c21..c871408 100644 --- a/grammars/builtins.bpeg +++ b/grammars/builtins.bpeg @@ -1,6 +1,6 @@ # Meta-rules for acting on everything pattern = !(/); # Not defined by default -replacement = {!(/)=>}; # Not defined by default +replacement = !(/); # Not defined by default replace-all = +(...@replacement) ...; 
find-all = {... >matching-line =>} +(matching-line/non-matching-line) ?{!<\n => "\n"}; only-matches = +{...@pattern=>'@1\n'}; @@ -10,10 +10,10 @@ non-matching-line = {..$=>}; # Helper definitions (commonly used) crlf = \r\n; cr = \r; r = \r; -anglebraces = `< *(anglebraces / ~~`>) `>; -brackets = `[ *(brackets / ~~`]) `]; -braces = `{ *(braces / ~~`}) `}; -parens = `( *(parens / ~~`)) `); +anglebraces = `< *(anglebraces / !`>.) `>; +brackets = `[ *(brackets / !`].) `]; +braces = `{ *(braces / !`}.) `}; +parens = `( *(parens / !`).) `); id = !<(`a-z/`A-Z/`_/`0-9) (`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9); word = !<(`a-z/`A-Z/`_/`0-9) +(`a-z/`A-Z) !>(`0-9/`_); HEX = `0-9/`A-F; diff --git a/grammars/html.bpeg b/grammars/html.bpeg index 7f8976a..9d74f5e 100644 --- a/grammars/html.bpeg +++ b/grammars/html.bpeg @@ -11,17 +11,17 @@ html-element = ( void-element = `< @[tag](id==match-tag) __attributes__ `/? __ `>; -template-element = `< @[tag](id==match-tag) __`> __ >match-body @[body]0+(~~`< / comment / html-element / ~~(")) ("); +template-element = `< @[tag](id==match-tag) __`> __ >match-body @[body]0+(!`<$. / comment / html-element / !(")$.) ("); -raw-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*~~(") ("); +raw-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body].. ("); -normal-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*(~~`< / comment / html-element / ~~(")) "; +normal-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*(!`<$. / comment / html-element / !(")$.) "; comment = ""; attributes = *(!(attribute==match-attribute))%__ __(attribute==match-attribute)__ *attribute%__; attribute = (+id%`:)__`=__ (id / `" &`" / `' &`'); -attribute = (+id%`:)__`=__ (id / `" *~`" `" / `' *~`' `'); +attribute = (+id%`:)__`=__ (id / `" .. `" / `' .. 
`'); match-attribute = attribute; match-tag = id; match-body = (/); diff --git a/types.h b/types.h index 6346342..c8f7a17 100644 --- a/types.h +++ b/types.h @@ -12,7 +12,6 @@ enum VMOpcode { VM_EMPTY = 0, VM_ANYCHAR = 1, - VM_ANYTHING_BUT, VM_STRING, VM_RANGE, VM_NOT, @@ -34,7 +33,7 @@ enum VMOpcode { */ typedef struct vm_op_s { enum VMOpcode op; - unsigned int multiline:1; + unsigned int multiline:1, negate:1; const char *start, *end; // Length of the match, if constant, otherwise -1 ssize_t len; diff --git a/vm.c b/vm.c index b69b3cb..2a44dd7 100644 --- a/vm.c +++ b/vm.c @@ -16,7 +16,6 @@ static match_t *get_capture_named(match_t *m, const char *name); static const char *opcode_names[] = { [VM_EMPTY] = "EMPTY", [VM_ANYCHAR] = "ANYCHAR", - [VM_ANYTHING_BUT] = "ANYTHING_BUT", [VM_STRING] = "STRING", [VM_RANGE] = "RANGE", [VM_NOT] = "NOT", @@ -114,10 +113,7 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref m->end = str + 1; return m; } - case VM_NOT: case VM_ANYTHING_BUT: { - if (op->op == VM_ANYTHING_BUT) - if (!*str || (!op->multiline && *str == '\n')) - return NULL; + case VM_NOT: { match_t *m = _match(g, str, op->args.pat, rec); if (m != NULL) { destroy_match(&m); @@ -126,7 +122,6 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref m = calloc(sizeof(match_t), 1); m->op = op; m->start = str; - if (op->op == VM_ANYTHING_BUT) ++str; m->end = str; return m; } @@ -160,13 +155,11 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref m->start = str; m->end = str; m->op = op; - if (op->args.repetitions.max == 0) return m; match_t **dest = &m->child; - const char *prev = str; - size_t reps; - for (reps = 0; reps < (size_t)op->args.repetitions.max; ++reps) { + size_t reps = 0; + for (;;) { // Separator match_t *sep = NULL; if (op->args.repetitions.sep != NULL && reps > 0) { @@ -188,6 +181,12 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref 
dest = &p->nextsibling; str = p->end; prev = str; + + ++reps; + if (op->args.repetitions.max != -1 && reps > (size_t)op->args.repetitions.max) { + destroy_match(&m); + return NULL; + } } if ((ssize_t)reps < op->args.repetitions.min) { @@ -386,12 +385,6 @@ void print_pattern(vm_op_t *op) fprintf(stderr, ")"); break; } - case VM_ANYTHING_BUT: { - fprintf(stderr, "anything but ("); - print_pattern(op->args.pat); - fprintf(stderr, ")"); - break; - } case VM_AFTER: { fprintf(stderr, "after ("); print_pattern(op->args.pat); -- cgit v1.2.3