From 4135115229d27c54b70cd945e2211e652ab58d2f Mon Sep 17 00:00:00 2001
From: Bruce Hill <bruce@bruce-hill.com>
Date: Sun, 13 Sep 2020 23:31:38 -0700
Subject: Spruced up a bunch of stuff, tweaked the grammar, added docs

---
 bpeg.1                 | 162 +++++++++++++++++++++++++++++++++++++++++++------
 bpeg.bpeg              |   5 +-
 bpeg.c                 |  46 +++-----------
 compiler.c             |  11 ----
 grammars/builtins.bpeg |  10 +--
 grammars/html.bpeg     |   8 +--
 types.h                |   3 +-
 vm.c                   |  25 +++-----
 8 files changed, 175 insertions(+), 95 deletions(-)
diff --git a/bpeg.1 b/bpeg.1
index 3dfb806..4f2c18c 100644
--- a/bpeg.1
+++ b/bpeg.1
@@ -7,43 +7,164 @@ bpeg \- Bruce's Parsing Expression Grammar tool
 .B bpeg
 [\fI-h\fR|\fI--help\fR]
 [\fI-v\fR|\fI--verbose\fR]
+[\fI-p\fR|\fI--pattern\fR \fI<pattern>\fR]
+[\fI-P\fR|\fI--pattern-string\fR \fI<string-pattern>\fR]
 [\fI-d\fR|\fI--define\fR \fI<name>\fR=\fI<pattern>\fR]
+[\fI-D\fR|\fI--define-string\fR \fI<name>\fR=\fI<string-pattern>\fR]
 [\fI-r\fR|\fI--replace\fR \fI<replacement>\fR]
 [\fI-g\fR|\fI--grammar\fR \fI<grammar file>\fR]
+[\fI-m\fR|\fI--mode\fR \fI<mode>\fR]
 \fI<pattern\fR
-[[--] \fI<input file>\fR]
+[[--] \fI<input files...>\fR]
 .SH DESCRIPTION
 \fBbpeg\fR is a tool that matches parsing expression grammars using a custom syntax.
 .SH OPTIONS
-.B \--verbose
+.B \-v\fR, \fB--verbose
 Print debugging information.
 
-.B \--define <name>=<pattern>
-Define a grammar rule.
+.B \-d\fR, \fB--define \fI<name>\fR=\fI<pattern>\fR
+Define a grammar rule using a bpeg pattern.
 
-.B \--replace <replacement>
+.B \-D\fR, \fB--define-string \fI<name>\fR=\fI<string-pattern>\fR
+Define a grammar rule using a bpeg string pattern.
+
+.B \-r\fR, \fB--replace \fI<replacement>\fR
 Replace all occurrences of the main pattern with the given string.
 
-.B \--grammar <grammar file>
+.B \-g\fR, \fB--grammar \fI<grammar file>\fR
 Load the grammar from the given file.
 
+.B \-m\fR, \fB--mode \fI<mode>\fR
+The mode to operate in. Options are: \fIfind-all\fR (the default),
+\fIonly-matches\fR, \fIpattern\fR, \fIreplacement\fR, \fIreplace-all\fR
+(implied by \fB--replace\fR), or any other grammar rule name.
+
 .B \--help
 Print the usage and exit.
 
-.B <pattern>
-The main pattern for bpeg to match. By default, this pattern
-is in "string literal" mode (i.e. a backslash is requres for
-non-literal patterns). The default mode is to find \fBall\fR
-occurrences of the pattern and highlight them.
+.B <string-pattern>
+The main pattern for bpeg to match. By default, this pattern is a string
+pattern (see the \fBSTRING PATTERNS\fR section below).
+
+.B <input files...>
+The input files to search. If no input files are provided and data was
+piped in, that data will be used instead. If neither is provided,
+\fBbpeg\fR will search through all files in the current directory and
+its subdirectories (recursively).
+
+.SH PATTERNS
+Bpeg patterns are based on a combination of Parsing Expression Grammars
+and regular expression syntax. The syntax is designed to map closely to
+verbal descriptions of the patterns, and prefix operators are preferred over
+the suffix operators that are common in regex syntax.
+
+Some patterns additionally have "multi-line" variants, which means that they
+can also match the newline character.
+
+.I <pat1> <pat2>
+A chain of patterns, pronounced \fI<pat1>\fB-then-\fI<pat2>\fR
+
+.I <pat1> \fB/\fI <pat2>\fR
+A series of ordered choices (if one pattern matches, the following patterns
+will not be attempted), pronounced \fI<pat1>\fB-or-\fI<pat2>\fR
+
+.B ..
+Any text \fBup-to\fR the following pattern, if any (multiline: \fB...\fR)
+
+.B .
+\fBAny\fR character (multiline: \fB$.\fR)
+
+.B ^
+\fBStart-of-a-line\fR
+
+.B ^^
+\fBStart-of-the-text\fR
+
+.B $
+\fBEnd-of-a-line\fR (does not include newline character)
+
+.B $$
+\fBEnd-of-the-text\fR
+
+.B _
+Zero or more \fBwhitespace\fR characters (specifically, spaces and tabs)
+
+.B __
+Zero or more \fBwhitespace-or-newline\fR characters
+
+.B `\fI<c>\fR
+The literal \fBcharacter-\fI<c>\fR
+
+.B `\fI<c1>\fB-\fI<c2>\fR
+The \fBcharacter-range-\fI<c1>\fB-to-\fI<c2>\fR
+
+.B \\\fI<esc>\fR
+The \fBescape-sequence-\fI<esc>\fR (\fB\\n\fR, \fB\\x1F\fR, \fB\\033\fR, etc.)
 
-.B <input file>
-The input file to search (default: stdin).
+.B \\\fI<esc1>\fB-\fI<esc2>\fR
+The \fBescape-sequence-range-\fI<esc1>\fB-to-\fI<esc2>\fR
+
+.B !\fI<pat>\fR
+\fBNot-\fI<pat>\fR
+
+.B \fI<N> <pat>\fR
+.B \fI<MIN>\fB-\fI<MAX> <pat>\fR
+.B \fI<MIN>\fB+ \fI<pat>\fR
+.B \fI<MAX>\fB- \fI<pat>\fR
+\fI<MIN>\fB-to-\fI<MAX>\fB-\fI<pat>\fBs\fR (repetitions of a pattern)
+
+.B *\fI<pat>\fR
+\fBAny-\fI<pat>\fBs\fR (zero or more)
+
+.B +\fI<pat>\fR
+\fBSome-\fI<pat>\fBs\fR (one or more)
+
+.B \fI<repeating-pat>\fR \fB%\fI <sep>\fR
+\fI<repeating-pat>\fB-separated-by-\fI<sep>\fR (equivalent to \fI<pat>
+\fB*(\fI<sep><pat>\fB)\fR)
+
+.B <\fI<pat>\fR
+\fBJust-after-\fI<pat>\fR (lookbehind)
+
+.B >\fI<pat>\fR
+\fBJust-before-\fI<pat>\fR (lookahead)
+
+.B @\fI<pat>\fR
+\fBCapture-\fI<pat>\fR
+
+.B @[\fI<name>\fB]\fI<pat>\fR
+\fBLet-\fI<name>\fB-equal-\fI<pat>\fR (named capture)
+
+.B {\fI<pat>\fB => "\fI<replacement>\fB"}
+\fBReplace-\fI<pat>\fB-with-\fI<replacement>\fR. Note: \fI<replacement>\fR should
+be a string, and it may contain references to captured values: \fB@0\fR
+(the whole of \fI<pat>\fR), \fB@1\fR (the first capture in \fI<pat>\fR),
+\fB@[\fIfoo\fB]\fR (the capture named \fIfoo\fR in \fI<pat>\fR), etc.
+
+.B \fI<pat1>\fB == \fI<pat2>\fR
+Will match only if \fI<pat1>\fR and \fI<pat2>\fR both match and have the exact
+same length. Pronounced \fI<pat1>\fB-assuming-it-equals-\fI<pat2>\fR
+
+.B (/)
+The empty string (a pattern that always matches).
+
+.B # \fI<comment>\fR
+A comment
+
+.SH STRING PATTERNS
+One of the most common use cases for pattern matching tools is matching plain,
+literal strings, or strings that are primarily plain strings, with one or two
+patterns. \fBbpeg\fR is designed around this fact. The default mode for bpeg
+patterns is "string pattern mode". In string pattern mode, all characters
+are interpreted literally except for the backslash (\fB\\\fR), which may be
+followed by a bpeg pattern (see the \fBPATTERNS\fR section above). Optionally,
+the bpeg pattern may be terminated by a semicolon (\fB;\fR).
 
 .SH EXAMPLES
 .TP
 .B
 ls | bpeg foo
-Find files containing the string "foo"
+Find files containing the string "foo" (a string pattern)
 
 .TP
 .B
@@ -52,9 +173,16 @@ Find files ending with ".c" and replace the extension with ".h"
 
 .TP
 .B
-bpeg -g grammar.bpeg '\\myThing' my_file.txt
-Find ocurrences of the grammar rule "myThing" in the file \fBmy_file.txt\fR
-using the grammar rules defined in \fBgrammar.bpeg\fR
+bpeg -p '"foobar"==id parens' my_file.py
+Find the literal string \fB"foobar"\fR, assuming it's a complete identifier,
+followed by a pair of matching parentheses in the file \fImy_file.py\fR
+
+.TP
+.B
+bpeg -g html -p html-element -D match-tag=a foo.html
+Using the \fIhtml\fR grammar, find all \fIhtml-element\fRs matching
+the tag \fIa\fR in the file \fIfoo.html\fR
+
 
 .SH AUTHOR
 Bruce Hill (bruce@bruce-hill.com)
diff --git a/bpeg.bpeg b/bpeg.bpeg
index 39e0f3f..a0bbf19 100644
--- a/bpeg.bpeg
+++ b/bpeg.bpeg
@@ -16,8 +16,8 @@ Eq-pat = @[first]simple-pat "==" @[second]pat;
 Empty = `/ >(__ (`)/`}));
 Dot = `. !`.;
 String = (
-        `" @[s]*(Escape / ~`") `"
-      / `' @[s]*(Escape / ~`') `'
+        `" @[s]*(Escape / !`"$.) `"
+      / `' @[s]*(Escape / !`'$.) `'
     );
 Char-range = `` @[low]. `- @[high].;
 Char = `` @[s].;
@@ -29,7 +29,6 @@ escape-sequence = (
       /`a/`b/`e/`n/`r/`t/`v / . / \n
     );
 No = `! _ @pat;
-Anything-but = `~ ?`~ _ @pat;
 Upto = 2-3`. ?>(_@pat);
 Repeat = (
         @[min]int _ `- _ @[max]int
diff --git a/bpeg.c b/bpeg.c
index 2245ff8..e78c9f6 100644
--- a/bpeg.c
+++ b/bpeg.c
@@ -1,36 +1,7 @@
 /*
  * bpeg.c - Source code for the bpeg parser
  *
- * Grammar:
- *     # <comment>                 comment
- *     ..                          any text up to the following pattern (if any); (multiline: ...)
- *     .                           any character (multiline: $.)
- *     ^                           beginning of a line (^^: beginning of file)
- *     $                           end of a line ($$: end of file)
- *     _                           0 or more spaces or tabs (__: include newlines and comments)
- *     `<c>                        character <c>
- *     `<a>-<z>                    character between <a> and <z>
- *     \<e>                        escape sequence (e.g. \n, \033)
- *     \<e1>-<e2>                  escape sequence range (e.g. \x00-\xF0)
- *     ! <pat>                     no <pat>
- *     ~ <pat>                     any character as long as it doesn't match <pat> (multiline: ~~<pat>)
- *     <N=1> + <pat> [% <sep="">]  <N> or more <pat>s (separated by <sep>)
- *     * <pat> [% <sep="">]        sugar for "0+ <pat> [% <sep>]"
- *     <N=1> - <pat> [% <sep="">]  <N> or fewer <pat>s (separated by <sep>)
- *     ? <pat>                     sugar for "1- <pat>"
- *     <N> - <M> <pat>             <N> to <M> (inclusive) <pat>s
- *     < <pat>                     after <pat>, ...
- *     > <pat>                     before <pat>, ...
- *     ( <pat> )                   <pat>
- *     @ <pat>                     capture <pat>
- *     @ [ <name> ] <pat>          <pat> named <name>
- *     { <pat> => <str> }          <pat> replaced with <str>
- *     <pat1> == <pat2>            <pat1> iff <pat2> matches at the same spot for the same length
- *     "@1" or "@[1]"              first capture
- *     "@foo" or "@[foo]"          capture named "foo"
- *     <pat1> <pat2>               <pat1> followed by <pat2>
- *     <pat> / <alt>               <pat> otherwise <alt>
- *     ; <name> = <pat>            <name> is defined to be <pat>
+ * See `man ./bpeg.1` for more details
  */
 #include <fcntl.h>
 #include <glob.h>
@@ -53,8 +24,8 @@ static const char *usage = (
     "  -v --verbose\t print verbose debugging info\n"
     "  -d --define <name>=<def>\t define a grammar rule\n"
     "  -D --define-string <name>=<def>\t define a grammar rule (string-pattern)\n"
-    "  -e --escaped <pat>\t provide an escaped pattern (equivalent to bpeg '\\(<pat>)')\n"
-    "  -s --string <pat>\t provide a string pattern (equivalent to bpeg '<pat>', but may be useful if '<pat>' begins with a '-')\n"
+    "  -p --pattern <pat>\t provide a pattern (equivalent to bpeg '\\(<pat>)')\n"
+    "  -P --pattern-string <pat>\t provide a string pattern (equivalent to bpeg '<pat>', but may be useful if '<pat>' begins with a '-')\n"
     "  -r --replace <replacement>   replace the input pattern with the given replacement\n"
     "  -m --mode <mode>\t set the behavior mode (defult: find-all)\n"
     "  -g --grammar <grammar file>  use the specified file as a grammar\n");
@@ -104,14 +75,15 @@ int main(int argc, char *argv[])
 {
     int verbose = 0;
     char *flag = NULL;
+    char path[PATH_MAX] = {0};
     const char *rule = "find-all";
 
     grammar_t *g = new_grammar();
 
+    // Load builtins:
     int fd;
     if ((fd=open("/etc/xdg/bpeg/builtins.bpeg", O_RDONLY)) >= 0)
         load_grammar(g, readfile(fd)); // Keep in memory for debugging output
-    char path[PATH_MAX] = {0};
     sprintf(path, "%s/.config/bpeg/builtins.bpeg", getenv("HOME"));
     if ((fd=open(path, O_RDONLY)) >= 0)
         load_grammar(g, readfile(fd)); // Keep in memory for debugging output
@@ -121,10 +93,10 @@ int main(int argc, char *argv[])
         if (streq(argv[i], "--")) {
             ++i;
             break;
-        } else if (FLAG("--help") || FLAG("-h")) {
+        } else if (streq(argv[i], "--help") || streq(argv[i], "-h")) {
             printf("%s\n", usage);
             return 0;
-        } else if (FLAG("--verbose") || FLAG("-v")) {
+        } else if (streq(argv[i], "--verbose") || streq(argv[i], "-v")) {
             verbose = 1;
         } else if (FLAG("--replace") || FLAG("-r")) {
             vm_op_t *p = bpeg_replacement(bpeg_pattern("pattern"), flag);
@@ -166,13 +138,13 @@ int main(int argc, char *argv[])
             vm_op_t *pat = bpeg_stringpattern(src);
             check(pat, "Failed to compile pattern");
             add_def(g, src, def, pat);
-        } else if (FLAG("--escaped") || FLAG("-e")) {
+        } else if (FLAG("--pattern") || FLAG("-p")) {
             check(npatterns == 0, "Cannot define multiple patterns");
             vm_op_t *p = bpeg_pattern(flag);
             check(p, "Pattern failed to compile: '%s'", flag);
             add_def(g, flag, "pattern", p);
             ++npatterns;
-        } else if (FLAG("--string") || FLAG("-s")) {
+        } else if (FLAG("--pattern-string") || FLAG("-P")) {
             vm_op_t *p = bpeg_stringpattern(flag);
             check(p, "Pattern failed to compile");
             add_def(g, flag, "pattern", p);
diff --git a/compiler.c b/compiler.c
index df34e44..267ea27 100644
--- a/compiler.c
+++ b/compiler.c
@@ -184,17 +184,6 @@ vm_op_t *bpeg_simplepattern(const char *str)
             op->args.pat = p;
             break;
         }
-        // Anything but <pat>
-        case '~': {
-            if (matchchar(&str, '~')) op->multiline = 1;
-            vm_op_t *p = bpeg_simplepattern(str);
-            check(p, "Expected pattern after '~'\n");
-            str = p->end;
-            op->op = VM_ANYTHING_BUT;
-            op->len = -1;
-            op->args.pat = p;
-            break;
-        }
         // Number of repetitions: <N>(-<N> / - / + / "")
         case '0': case '1': case '2': case '3': case '4': case '5':
         case '6': case '7': case '8': case '9': {
diff --git a/grammars/builtins.bpeg b/grammars/builtins.bpeg
index c558c21..c871408 100644
--- a/grammars/builtins.bpeg
+++ b/grammars/builtins.bpeg
@@ -1,6 +1,6 @@
 # Meta-rules for acting on everything
 pattern = !(/); # Not defined by default
-replacement = {!(/)=>}; # Not defined by default
+replacement = !(/); # Not defined by default
 replace-all = +(...@replacement) ...;
 find-all = {... >matching-line =>} +(matching-line/non-matching-line) ?{!<\n => "\n"};
 only-matches = +{...@pattern=>'@1\n'};
@@ -10,10 +10,10 @@ non-matching-line = {..$=>};
 # Helper definitions (commonly used)
 crlf = \r\n;
 cr = \r; r = \r;
-anglebraces = `< *(anglebraces / ~~`>) `>;
-brackets = `[ *(brackets / ~~`]) `];
-braces = `{ *(braces / ~~`}) `};
-parens = `( *(parens / ~~`)) `);
+anglebraces = `< *(anglebraces / !`>.) `>;
+brackets = `[ *(brackets / !`].) `];
+braces = `{ *(braces / !`}.) `};
+parens = `( *(parens / !`).) `);
 id = !<(`a-z/`A-Z/`_/`0-9) (`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9);
 word = !<(`a-z/`A-Z/`_/`0-9) +(`a-z/`A-Z) !>(`0-9/`_);
 HEX = `0-9/`A-F;
diff --git a/grammars/html.bpeg b/grammars/html.bpeg
index 7f8976a..9d74f5e 100644
--- a/grammars/html.bpeg
+++ b/grammars/html.bpeg
@@ -11,17 +11,17 @@ html-element = (
 
 void-element = `< @[tag](id==match-tag) __attributes__ `/? __ `>;
 
-template-element = `< @[tag](id==match-tag) __`> __ >match-body @[body]0+(~~`< / comment / html-element / ~~("</"tag__`>)) ("</"tag__`>);
+template-element = `< @[tag](id==match-tag) __`> __ >match-body @[body]0+(!`<$. / comment / html-element / !("</"tag__`>)$.) ("</"tag__`>);
 
-raw-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*~~("</"tag__`>) ("</"tag__`>);
+raw-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body].. ("</"tag__`>);
 
-normal-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*(~~`< / comment / html-element / ~~("</"tag__`>)) "</"tag__`>;
+normal-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*(!`<$. / comment / html-element / !("</"tag__`>)$.) "</"tag__`>;
 
 comment = "<!--" &&"-->";
 
 attributes = *(!(attribute==match-attribute))%__ __(attribute==match-attribute)__ *attribute%__;
 attribute = (+id%`:)__`=__ (id / `" &`" / `' &`');
-attribute = (+id%`:)__`=__ (id / `" *~`" `" / `' *~`' `');
+attribute = (+id%`:)__`=__ (id / `" .. `" / `' .. `');
 match-attribute = attribute;
 match-tag = id;
 match-body = (/);
diff --git a/types.h b/types.h
index 6346342..c8f7a17 100644
--- a/types.h
+++ b/types.h
@@ -12,7 +12,6 @@
 enum VMOpcode {
     VM_EMPTY = 0,
     VM_ANYCHAR = 1,
-    VM_ANYTHING_BUT,
     VM_STRING,
     VM_RANGE,
     VM_NOT,
@@ -34,7 +33,7 @@ enum VMOpcode {
  */
 typedef struct vm_op_s {
     enum VMOpcode op;
-    unsigned int multiline:1;
+    unsigned int multiline:1, negate:1;
     const char *start, *end;
     // Length of the match, if constant, otherwise -1
     ssize_t len;
diff --git a/vm.c b/vm.c
index b69b3cb..2a44dd7 100644
--- a/vm.c
+++ b/vm.c
@@ -16,7 +16,6 @@ static match_t *get_capture_named(match_t *m, const char *name);
 static const char *opcode_names[] = {
     [VM_EMPTY] = "EMPTY",
     [VM_ANYCHAR] = "ANYCHAR",
-    [VM_ANYTHING_BUT] = "ANYTHING_BUT",
     [VM_STRING] = "STRING",
     [VM_RANGE] = "RANGE",
     [VM_NOT] = "NOT",
@@ -114,10 +113,7 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref
             m->end = str + 1;
             return m;
         }
-        case VM_NOT: case VM_ANYTHING_BUT: {
-            if (op->op == VM_ANYTHING_BUT)
-                if (!*str || (!op->multiline && *str == '\n'))
-                    return NULL;
+        case VM_NOT: {
             match_t *m = _match(g, str, op->args.pat, rec);
             if (m != NULL) {
                 destroy_match(&m);
@@ -126,7 +122,6 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref
             m = calloc(sizeof(match_t), 1);
             m->op = op;
             m->start = str;
-            if (op->op == VM_ANYTHING_BUT) ++str;
             m->end = str;
             return m;
         }
@@ -160,13 +155,11 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref
             m->start = str;
             m->end = str;
             m->op = op;
-            if (op->args.repetitions.max == 0) return m;
 
             match_t **dest = &m->child;
-
             const char *prev = str;
-            size_t reps;
-            for (reps = 0; reps < (size_t)op->args.repetitions.max; ++reps) {
+            size_t reps = 0;
+            for (;;) {
                 // Separator
                 match_t *sep = NULL;
                 if (op->args.repetitions.sep != NULL && reps > 0) {
@@ -188,6 +181,12 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref
                 dest = &p->nextsibling;
                 str = p->end;
                 prev = str;
+
+                ++reps;
+                if (op->args.repetitions.max != -1 && reps > (size_t)op->args.repetitions.max) {
+                    destroy_match(&m);
+                    return NULL;
+                }
             }
 
             if ((ssize_t)reps < op->args.repetitions.min) {
@@ -386,12 +385,6 @@ void print_pattern(vm_op_t *op)
             fprintf(stderr, ")");
             break;
         }
-        case VM_ANYTHING_BUT: {
-            fprintf(stderr, "anything but (");
-            print_pattern(op->args.pat);
-            fprintf(stderr, ")");
-            break;
-        }
         case VM_AFTER: {
             fprintf(stderr, "after (");
             print_pattern(op->args.pat);
-- 
cgit v1.2.3