diff options
| -rw-r--r-- | bpeg.1 | 162 | ||||
| -rw-r--r-- | bpeg.bpeg | 5 | ||||
| -rw-r--r-- | bpeg.c | 46 | ||||
| -rw-r--r-- | compiler.c | 11 | ||||
| -rw-r--r-- | grammars/builtins.bpeg | 10 | ||||
| -rw-r--r-- | grammars/html.bpeg | 8 | ||||
| -rw-r--r-- | types.h | 3 | ||||
| -rw-r--r-- | vm.c | 25 |
8 files changed, 175 insertions, 95 deletions
@@ -7,43 +7,164 @@ bpeg \- Bruce's Parsing Expression Grammar tool .B bpeg [\fI-h\fR|\fI--help\fR] [\fI-v\fR|\fI--verbose\fR] +[\fI-p\fR|\fI--pattern\fR \fI<pattern>\fR] +[\fI-P\fR|\fI--pattern-string\fR \fI<string-pattern>\fR] [\fI-d\fR|\fI--define\fR \fI<name>\fR=\fI<pattern>\fR] +[\fI-D\fR|\fI--define-string\fR \fI<name>\fR=\fI<string-pattern>\fR] [\fI-r\fR|\fI--replace\fR \fI<replacement>\fR] [\fI-g\fR|\fI--grammar\fR \fI<grammar file>\fR] +[\fI-m\fR|\fI--mode\fR \fI<mode>\fR] \fI<pattern\fR -[[--] \fI<input file>\fR] +[[--] \fI<input files...>\fR] .SH DESCRIPTION \fBbpeg\fR is a tool that matches parsing expression grammars using a custom syntax. .SH OPTIONS -.B \--verbose +.B \-v\fR, \fB--verbose Print debugging information. -.B \--define <name>=<pattern> -Define a grammar rule. +.B \-d\fR, \fB--define \fI<name>\fR=\fI<pattern>\fR +Define a grammar rule using a bpeg pattern. -.B \--replace <replacement> +.B \-D\fR, \fB--define-string \fI<name>\fR=\fI<string-pattern>\fR +Define a grammar rule using a bpeg string pattern. + +.B \-r\fR, \fB--replace \fI<replacement>\fR Replace all occurrences of the main pattern with the given string. -.B \--grammar <grammar file> +.B \-g\fR, \fB--grammar \fI<grammar file>\fR Load the grammar from the given file. +.B \-m\fR, \fB--mode \fI<mode>\fR +The mode to operate in. Options are: \fIfind-all\fR (the default), +\fIonly-matches\fR, \fIpattern\fR, \fIreplacement\fR, \fIreplace-all\fR +(implied by \fB--replace\fR), or any other grammar rule name. + .B \--help Print the usage and exit. -.B <pattern> -The main pattern for bpeg to match. By default, this pattern -is in "string literal" mode (i.e. a backslash is requres for -non-literal patterns). The default mode is to find \fBall\fR -occurrences of the pattern and highlight them. +.B <string-pattern> +The main pattern for bpeg to match. By default, this pattern is a string +pattern (see the \fBSTRING PATTERNS\fR section below). + +.B <input files...> +The input files to search. 
If no input files are provided and data was +piped in, that data will be used instead. If neither is provided, +\fBbpeg\fR will search through all files in the current directory and +its subdirectories (recursively). + +.SH PATTERNS +Bpeg patterns are based on a combination of Parsing Expression Grammars +and regular expression syntax. The syntax is designed to map closely to +verbal descriptions of the patterns, and prefix operators are preferred over +the suffix operators that are common in regex syntax. + +Some patterns additionally have "multi-line" variants, which means that they +include the newline character. + +.I <pat1> <pat2> +A chain of patterns, pronounced \fI<pat1>\fB-then-\fI<pat2>\fR + +.I <pat1> \fB/\fI <pat2>\fR +A series of ordered choices (if one pattern matches, the following patterns +will not be attempted), pronounced \fI<pat1>\fB-or-\fI<pat2>\fR + +.B .. +Any text \fBup-to\fR the following pattern, if any (multiline: \fB...\fR) + +.B . +\fBAny\fR character (multiline: $.) + +.B ^ +\fBStart-of-a-line\fR + +.B ^^ +\fBStart-of-the-text\fR + +.B $ +\fBEnd-of-a-line\fR (does not include newline character) + +.B $$ +\fBEnd-of-the-text\fR + +.B _ +Zero or more \fBwhitespace\fR characters (specifically, spaces and tabs) + +.B __ +Zero or more \fBwhitespace-or-newline\fR characters + +.B `\fI<c>\fR +The literal \fBcharacter-\fI<c>\fR + +.B `\fI<c1>\fB-\fI<c2>\fR +The \fBcharacter-range-\fI<c1>\fB-to-\fI<c2>\fR + +.B \\\fI<esc>\fR +The \fBescape-sequence-\fI<esc>\fR (\fB\\n\fR, \fB\\x1F\fR, \fB\\033\fR, etc.) -.B <input file> -The input file to search (default: stdin). 
+.B \\\fI<esc1>\fB-\fI<esc2>\fR +The \fBescape-sequence-range-\fI<esc1>\fB-to-\fI<esc2>\fR + +.B !\fI<pat>\fR +\fBNot-\fI<pat>\fR + +.B \fI<N> <pat>\fR +.B \fI<MIN>\fB-\fI<MAX> <pat>\fR +.B \fI<MIN>\fB+ \fI<pat>\fR +.B \fI<MAX>\fB- \fI<pat>\fR +\fI<MIN>\fB-to-\fI<MAX>\fB-\fI<pat>\fBs\fR (repetitions of a pattern) + +.B *\fI<pat>\fR +\fBAny-\fI<pat>\fBs\fR (zero or more) + +.B +\fI<pat>\fR +\fBSome-\fI<pat>\fBs\fR (one or more) + +.B \fI<repeating-pat>\fR \fB%\fI <sep>\fR +\fI<repeating-pat>\fB-separated-by-\fI<sep>\fR (equivalent to \fI<pat> +\fB*(\fI<sep><pat>\fB)\fR) + +.B <\fI<pat>\fR +\fBJust-after-\fI<pat>\fR (lookbehind) + +.B >\fI<pat>\fR +\fBJust-before-\fI<pat>\fR (lookahead) + +.B @\fI<pat>\fR +\fBCapture-\fI<pat>\fR + +.B @[\fI<name>\fB]\fI<pat>\fR +\fBLet-\fI<name>\fB-equal-\fI<pat>\fR (named capture) + +.B {\fI<pat>\fB => "\fI<replacement>\fB"} +\fBReplace-\fI<pat>\fB-with-\fI<replacement>\fR. Note: \fI<replacement>\fR should +be a string, and it may contain references to captured values: \fB@0\fR +(the whole of \fI<pat>\fR), \fB@1\fR (the first capture in \fI<pat>\fR), +\fB@[\fIfoo\fR]\fR (the capture named \fIfoo\fR in \fI<pat>\fR), etc. + +.B \fI<pat1>\fB == \fI<pat2>\fR +Will match only if \fI<pat1>\fR and \fI<pat2>\fR both match and have the exact +same length. Pronounced \fI<pat1>\fB-assuming-it-equals-\fI<pat2>\fR + +.B (/) +The empty string (a pattern that always matches). + +.B # \fI<comment>\fR +A comment + +.SH STRING PATTERNS +One of the most common use cases for pattern matching tools is matching plain, +literal strings, or strings that are primarily plain strings, with one or two +patterns. \fBbpeg\fR is designed around this fact. The default mode for bpeg +patterns is "string pattern mode". In string pattern mode, all characters +are interpreted literally except for the backslash (\fB\\\fR), which may be +followed by a bpeg pattern (see the \fBPATTERNS\fR section above). 
Optionally, +the bpeg pattern may be terminated by a semicolon (\fB;\fR). .SH EXAMPLES .TP .B ls | bpeg foo -Find files containing the string "foo" +Find files containing the string "foo" (a string pattern) .TP .B @@ -52,9 +173,16 @@ Find files ending with ".c" and replace the extension with ".h" .TP .B -bpeg -g grammar.bpeg '\\myThing' my_file.txt -Find ocurrences of the grammar rule "myThing" in the file \fBmy_file.txt\fR -using the grammar rules defined in \fBgrammar.bpeg\fR +bpeg -p '"foobar"==id parens' my_file.py +Find the literal string \fB"foobar"\fR, assuming it's a complete identifier, +followed by a pair of matching parentheses in the file \fImy_file.py\fR + +.TP +.B +bpeg -g html -p html-element -D matching-tag=a foo.html +Using the \fIhtml\fR grammar, find all \fIhtml-element\fRs matching +the tag \fIa\fR in the file \fIfoo.html\fR + .SH AUTHOR Bruce Hill (bruce@bruce-hill.com) @@ -16,8 +16,8 @@ Eq-pat = @[first]simple-pat "==" @[second]pat; Empty = `/ >(__ (`)/`})); Dot = `. !`.; String = ( - `" @[s]*(Escape / ~`") `" - / `' @[s]*(Escape / ~`') `' + `" @[s]*(Escape / !`"$.) `" + / `' @[s]*(Escape / !`'$.) `' ); Char-range = `` @[low]. `- @[high].; Char = `` @[s].; @@ -29,7 +29,6 @@ escape-sequence = ( /`a/`b/`e/`n/`r/`t/`v / . / \n ); No = `! _ @pat; -Anything-but = `~ ?`~ _ @pat; Upto = 2-3`. ?>(_@pat); Repeat = ( @[min]int _ `- _ @[max]int @@ -1,36 +1,7 @@ /* * bpeg.c - Source code for the bpeg parser * - * Grammar: - * # <comment> comment - * .. any text up to the following pattern (if any); (multiline: ...) - * . any character (multiline: $.) - * ^ beginning of a line (^^: beginning of file) - * $ end of a line ($$: end of file) - * _ 0 or more spaces or tabs (__: include newlines and comments) - * `<c> character <c> - * `<a>-<z> character between <a> and <z> - * \<e> escape sequence (e.g. \n, \033) - * \<e1>-<e2> escape sequence range (e.g. \x00-\xF0) - * ! 
<pat> no <pat> - * ~ <pat> any character as long as it doesn't match <pat> (multiline: ~~<pat>) - * <N=1> + <pat> [% <sep="">] <N> or more <pat>s (separated by <sep>) - * * <pat> [% <sep="">] sugar for "0+ <pat> [% <sep>]" - * <N=1> - <pat> [% <sep="">] <N> or fewer <pat>s (separated by <sep>) - * ? <pat> sugar for "1- <pat>" - * <N> - <M> <pat> <N> to <M> (inclusive) <pat>s - * < <pat> after <pat>, ... - * > <pat> before <pat>, ... - * ( <pat> ) <pat> - * @ <pat> capture <pat> - * @ [ <name> ] <pat> <pat> named <name> - * { <pat> => <str> } <pat> replaced with <str> - * <pat1> == <pat2> <pat1> iff <pat2> matches at the same spot for the same length - * "@1" or "@[1]" first capture - * "@foo" or "@[foo]" capture named "foo" - * <pat1> <pat2> <pat1> followed by <pat2> - * <pat> / <alt> <pat> otherwise <alt> - * ; <name> = <pat> <name> is defined to be <pat> + * See `man ./bpeg.1` for more details */ #include <fcntl.h> #include <glob.h> @@ -53,8 +24,8 @@ static const char *usage = ( " -v --verbose\t print verbose debugging info\n" " -d --define <name>=<def>\t define a grammar rule\n" " -D --define-string <name>=<def>\t define a grammar rule (string-pattern)\n" - " -e --escaped <pat>\t provide an escaped pattern (equivalent to bpeg '\\(<pat>)')\n" - " -s --string <pat>\t provide a string pattern (equivalent to bpeg '<pat>', but may be useful if '<pat>' begins with a '-')\n" + " -p --pattern <pat>\t provide a pattern (equivalent to bpeg '\\(<pat>)')\n" + " -P --pattern-string <pat>\t provide a string pattern (equivalent to bpeg '<pat>', but may be useful if '<pat>' begins with a '-')\n" " -r --replace <replacement> replace the input pattern with the given replacement\n" " -m --mode <mode>\t set the behavior mode (defult: find-all)\n" " -g --grammar <grammar file> use the specified file as a grammar\n"); @@ -104,14 +75,15 @@ int main(int argc, char *argv[]) { int verbose = 0; char *flag = NULL; + char path[PATH_MAX] = {0}; const char *rule = "find-all"; grammar_t *g = 
new_grammar(); + // Load builtins: int fd; if ((fd=open("/etc/xdg/bpeg/builtins.bpeg", O_RDONLY)) >= 0) load_grammar(g, readfile(fd)); // Keep in memory for debugging output - char path[PATH_MAX] = {0}; sprintf(path, "%s/.config/bpeg/builtins.bpeg", getenv("HOME")); if ((fd=open(path, O_RDONLY)) >= 0) load_grammar(g, readfile(fd)); // Keep in memory for debugging output @@ -121,10 +93,10 @@ int main(int argc, char *argv[]) if (streq(argv[i], "--")) { ++i; break; - } else if (FLAG("--help") || FLAG("-h")) { + } else if (streq(argv[i], "--help") || streq(argv[i], "-h")) { printf("%s\n", usage); return 0; - } else if (FLAG("--verbose") || FLAG("-v")) { + } else if (streq(argv[i], "--verbose") || streq(argv[i], "-v")) { verbose = 1; } else if (FLAG("--replace") || FLAG("-r")) { vm_op_t *p = bpeg_replacement(bpeg_pattern("pattern"), flag); @@ -166,13 +138,13 @@ int main(int argc, char *argv[]) vm_op_t *pat = bpeg_stringpattern(src); check(pat, "Failed to compile pattern"); add_def(g, src, def, pat); - } else if (FLAG("--escaped") || FLAG("-e")) { + } else if (FLAG("--pattern") || FLAG("-p")) { check(npatterns == 0, "Cannot define multiple patterns"); vm_op_t *p = bpeg_pattern(flag); check(p, "Pattern failed to compile: '%s'", flag); add_def(g, flag, "pattern", p); ++npatterns; - } else if (FLAG("--string") || FLAG("-s")) { + } else if (FLAG("--pattern-string") || FLAG("-P")) { vm_op_t *p = bpeg_stringpattern(flag); check(p, "Pattern failed to compile"); add_def(g, flag, "pattern", p); @@ -184,17 +184,6 @@ vm_op_t *bpeg_simplepattern(const char *str) op->args.pat = p; break; } - // Anything but <pat> - case '~': { - if (matchchar(&str, '~')) op->multiline = 1; - vm_op_t *p = bpeg_simplepattern(str); - check(p, "Expected pattern after '~'\n"); - str = p->end; - op->op = VM_ANYTHING_BUT; - op->len = -1; - op->args.pat = p; - break; - } // Number of repetitions: <N>(-<N> / - / + / "") case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': 
case '9': { diff --git a/grammars/builtins.bpeg b/grammars/builtins.bpeg index c558c21..c871408 100644 --- a/grammars/builtins.bpeg +++ b/grammars/builtins.bpeg @@ -1,6 +1,6 @@ # Meta-rules for acting on everything pattern = !(/); # Not defined by default -replacement = {!(/)=>}; # Not defined by default +replacement = !(/); # Not defined by default replace-all = +(...@replacement) ...; find-all = {... >matching-line =>} +(matching-line/non-matching-line) ?{!<\n => "\n"}; only-matches = +{...@pattern=>'@1\n'}; @@ -10,10 +10,10 @@ non-matching-line = {..$=>}; # Helper definitions (commonly used) crlf = \r\n; cr = \r; r = \r; -anglebraces = `< *(anglebraces / ~~`>) `>; -brackets = `[ *(brackets / ~~`]) `]; -braces = `{ *(braces / ~~`}) `}; -parens = `( *(parens / ~~`)) `); +anglebraces = `< *(anglebraces / !`>.) `>; +brackets = `[ *(brackets / !`].) `]; +braces = `{ *(braces / !`}.) `}; +parens = `( *(parens / !`).) `); id = !<(`a-z/`A-Z/`_/`0-9) (`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9); word = !<(`a-z/`A-Z/`_/`0-9) +(`a-z/`A-Z) !>(`0-9/`_); HEX = `0-9/`A-F; diff --git a/grammars/html.bpeg b/grammars/html.bpeg index 7f8976a..9d74f5e 100644 --- a/grammars/html.bpeg +++ b/grammars/html.bpeg @@ -11,17 +11,17 @@ html-element = ( void-element = `< @[tag](id==match-tag) __attributes__ `/? __ `>; -template-element = `< @[tag](id==match-tag) __`> __ >match-body @[body]0+(~~`< / comment / html-element / ~~("</"tag__`>)) ("</"tag__`>); +template-element = `< @[tag](id==match-tag) __`> __ >match-body @[body]0+(!`<$. / comment / html-element / !("</"tag__`>)$.) ("</"tag__`>); -raw-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*~~("</"tag__`>) ("</"tag__`>); +raw-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body].. 
("</"tag__`>); -normal-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*(~~`< / comment / html-element / ~~("</"tag__`>)) "</"tag__`>; +normal-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*(!`<$. / comment / html-element / !("</"tag__`>)$.) "</"tag__`>; comment = "<!--" &&"-->"; attributes = *(!(attribute==match-attribute))%__ __(attribute==match-attribute)__ *attribute%__; attribute = (+id%`:)__`=__ (id / `" &`" / `' &`'); -attribute = (+id%`:)__`=__ (id / `" *~`" `" / `' *~`' `'); +attribute = (+id%`:)__`=__ (id / `" .. `" / `' .. `'); match-attribute = attribute; match-tag = id; match-body = (/); @@ -12,7 +12,6 @@ enum VMOpcode { VM_EMPTY = 0, VM_ANYCHAR = 1, - VM_ANYTHING_BUT, VM_STRING, VM_RANGE, VM_NOT, @@ -34,7 +33,7 @@ enum VMOpcode { */ typedef struct vm_op_s { enum VMOpcode op; - unsigned int multiline:1; + unsigned int multiline:1, negate:1; const char *start, *end; // Length of the match, if constant, otherwise -1 ssize_t len; @@ -16,7 +16,6 @@ static match_t *get_capture_named(match_t *m, const char *name); static const char *opcode_names[] = { [VM_EMPTY] = "EMPTY", [VM_ANYCHAR] = "ANYCHAR", - [VM_ANYTHING_BUT] = "ANYTHING_BUT", [VM_STRING] = "STRING", [VM_RANGE] = "RANGE", [VM_NOT] = "NOT", @@ -114,10 +113,7 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref m->end = str + 1; return m; } - case VM_NOT: case VM_ANYTHING_BUT: { - if (op->op == VM_ANYTHING_BUT) - if (!*str || (!op->multiline && *str == '\n')) - return NULL; + case VM_NOT: { match_t *m = _match(g, str, op->args.pat, rec); if (m != NULL) { destroy_match(&m); @@ -126,7 +122,6 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref m = calloc(sizeof(match_t), 1); m->op = op; m->start = str; - if (op->op == VM_ANYTHING_BUT) ++str; m->end = str; return m; } @@ -160,13 +155,11 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref m->start = str; m->end = 
str; m->op = op; - if (op->args.repetitions.max == 0) return m; match_t **dest = &m->child; - const char *prev = str; - size_t reps; - for (reps = 0; reps < (size_t)op->args.repetitions.max; ++reps) { + size_t reps = 0; + for (;;) { // Separator match_t *sep = NULL; if (op->args.repetitions.sep != NULL && reps > 0) { @@ -188,6 +181,12 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref dest = &p->nextsibling; str = p->end; prev = str; + + ++reps; + if (op->args.repetitions.max != -1 && reps > (size_t)op->args.repetitions.max) { + destroy_match(&m); + return NULL; + } } if ((ssize_t)reps < op->args.repetitions.min) { @@ -386,12 +385,6 @@ void print_pattern(vm_op_t *op) fprintf(stderr, ")"); break; } - case VM_ANYTHING_BUT: { - fprintf(stderr, "anything but ("); - print_pattern(op->args.pat); - fprintf(stderr, ")"); - break; - } case VM_AFTER: { fprintf(stderr, "after ("); print_pattern(op->args.pat); |
