Some bug fixes, syntax tweaks, added ^^/$$/__, and bpeg meta grammar.

2020-09-10 02:02:40 -07:00 · 2020-09-10 02:02:40 -07:00 · 8126489f81
commit 8126489f81
parent 384fb6293a
2 changed files with 119 additions and 25 deletions
--- a/bpeg.bpeg
+++ b/bpeg.bpeg
@ -0,0 +1,59 @@
+# This file defines the BPEG grammar itself, written in BPEG syntax (a meta grammar)
+
+grammar;
+grammar = __ @[mainPattern]extendedPat __ (*def % (__`;__)) ?(__ `;) __;
+def = @[name]ref __ `= __ @[definition]extendedPat;
+
+# This is used for command line arguments:
+stringGrammar = *(`\ pat ?`; / .);
+
+pat = empty / dot / string / charRange / char / escape / no / anythingBut
+    / uptoAnd / repeat / after / before / capture / replace / ref / parens;
+# The empty pattern is a bare "/" directly before a closing ")" or "}", as in (/) or {/}
+empty = `/ >(__ (`)/`}));
+dot = `.;
+string = (
+        `" @[s]*(escape / ~`") `"
+      / `' @[s]*(escape / ~`') `'
+    );
+charRange = `` @[low]. `- @[high].;
+char = `` @[s].;
+escape = `\ @[s](
+        1-3 `0-7
+      / `x 2 (`0-9/`a-f/`A-F)
+      /`a/`b/`e/`n/`r/`t/`v / . / \n
+    );
+no = `! _ @pat;
+anythingBut = `~ _ @pat;
+uptoAnd = `& _ @pat;
+repeat = (
+           @[min]int _ `- _ @[max]int
+      /    @[min]{=>"0"}    @[max]int _ `-
+      /    @[min]int _ `+   @[max](/)
+      /    @[min]           @[max]int
+      / `+ @[min]{=>"1"}    @[max](/)
+      / `* @[min]{=>"0"}    @[max](/)
+      / `? @[min]{=>"0"}    @[max]{=>"1"}
+    ) _ @[repeatPat]pat ?( __ `% __ @[sep]pat);
+after = `< _ pat;
+before = `> _ pat;
+capture = `@ ?(_ `[ @[captureName]ref `]) _ @[capture]pat;
+replace = `{ __ (
+      ?(@[replacePat]extendedPat __) "=>" ?(__ @[replacement]string)
+    ) __ `};
+ref = @[name](
+        "^^" / "^" / "__" / "_" / "$$" / "$" /
+        (`a-z/`A-Z) *(`a-z/`A-Z/`0-9));
+
+parens = `( __ extendedPat __ `);
+# A chain is a whitespace-separated sequence of patterns; "otherwise" is a "/"-separated list of alternatives
+chain = +@pat % (__);
+otherwise = +@(chain/pat) % (__`/__);
+extendedPat = otherwise / chain / pat;
+# Whitespace: _ covers spaces and tabs only; __ also covers \r, \n and comments
+_ = *(`  / \t);
+__ = *(`  / \t / \r / \n / comment);
+hashComment = `# *.;
+
+# Note: comments are undefined by default in regular BPEG; this grammar uses hash comments
+comment = hashComment;
--- a/bpeg.c
+++ b/bpeg.c
@ -1,5 +1,5 @@
 /*
- * bpeg.h - Source code for the bpeg parser
+ * bpeg.c - Source code for the bpeg parser
 *
 * Grammar:
 *     # <comment>                 comment
@ -9,10 +9,7 @@
 *     _                           0 or more spaces or tabs
 *     `<c>                        character <c>
 *     `<a>-<z>                    character between <a> and <z>
- *     `<a>,<b>,...                character <a> or <b> (WIP)
 *     \<e>                        escape sequence (e.g. \n, \033)
- *     \<e1>-<e2>                  escape sequence range (WIP)
- *     \<e1>,<e2>,...              one of multiple escape sequences (WIP)
 *     ! <pat>                     no <pat>
 *     ~ <pat>                     any character as long as it doesn't match <pat>
 *     & <pat>                     upto and including <pat> (aka *~<pat> <pat>)
@ -23,14 +20,15 @@
 *     <N> - <M> <pat>             <N> to <M> (inclusive) <pat>s
 *     < <pat>                     after <pat>, ...
 *     > <pat>                     before <pat>, ...
- *     <pat> / <alt>               <pat> otherwise <alt>
 *     ( <pat> )                   <pat>
 *     @ <pat>                     capture <pat>
 *     @ [ <name> ] <pat>          <pat> named <name>
+ *     { <pat> => <str> }          <pat> replaced with <str>
+ *     "@1" or "@[1]"              first capture
+ *     "@foo" or "@[foo]"          capture named "foo"
+ *     <pat1> <pat2>               <pat1> followed by <pat2>
+ *     <pat> / <alt>               <pat> otherwise <alt>
 *     ; <name> = <pat>            <name> is defined to be <pat>
- *     { <pat> -> <str> }           <pat> replaced with <str>
- *     "@1" or "@{1}"              first capture
- *     "@foo" or "@{foo}"          capture named "foo"
 */

 #include "bpeg.h"
@ -103,9 +101,11 @@ static match_t *match(const char *str, vm_op_t *op)
        case VM_UPTO_AND: {
            match_t *m = calloc(sizeof(match_t), 1);
            m->start = str;
-            while (*str && (multiline_dot || (*str != '\n' && *str != '\r'))) {
+            while (*str) {
                match_t *p = match(str, op->args.pat);
                if (p == NULL) {
+                    if (!multiline_dot && (*str == '\n' || *str == '\r'))
+                        break;
                    ++str;
                } else {
                    m->end = p->end;
@ -135,7 +135,7 @@ static match_t *match(const char *str, vm_op_t *op)
                    str = sep->end;
                }
                match_t *p = match(str, op->args.repetitions.repeat_pat);
-                if (p == NULL || p->end == prev) { // Prevent infinite loops
+                if (p == NULL || (p->end == prev && reps > 0)) { // Prevent infinite loops
                    if (sep) sep = free_match(sep);
                    if (p) p = free_match(p);
                    break;
@ -612,16 +612,17 @@ static vm_op_t *compile_bpeg(const char *source, const char *str)
            visualize(source, str, "Replacement");
            str = after_spaces(str);
            vm_op_t *pat = NULL;
-            if (strncmp(str, "->", 2) == 0) {
-                str += strlen("->");
+            if (strncmp(str, "=>", 2) == 0) {
+                str += strlen("=>");
            } else {
                pat = compile_bpeg(source, str);
                check(pat, "Invalid pattern after '{'");
                pat = expand_choices(source, pat);
                str = pat->end;
                str = after_spaces(str);
-                check(matchchar(&str, '-') && matchchar(&str, '>'),
-                      "Expected '->' after pattern in replacement");
+                printf("at: '%s'\n", str);
+                check(matchchar(&str, '=') && matchchar(&str, '>'),
+                      "Expected '=>' after pattern in replacement");
            }
            visualize(source, str, NULL);
            str = after_spaces(str);
@ -630,6 +631,7 @@ static vm_op_t *compile_bpeg(const char *source, const char *str)
            const char *replacement;
            if (matchchar(&str, '}')) {
                replacement = strdup("");
+                visualize(source, str, NULL);
            } else {
                check(matchchar(&str, '"') || matchchar(&str, '\''),
                      "Expected string literal for replacement");
@ -643,9 +645,8 @@ static vm_op_t *compile_bpeg(const char *source, const char *str)
                }
                replacement = strndup(replacement, (size_t)(str-replacement));
                check(matchchar(&str, quote), "Expected closing quote");
+                check(matchchar(&str, '}'), "Expected a closing '}'");
            }
-            visualize(source, str, NULL);
-            check(matchchar(&str, '}'), "Expected a closing '}'");
            op->op = VM_REPLACE;
            op->args.replace.replace_pat = pat;
            op->args.replace.replacement = replacement;
@ -657,9 +658,24 @@ static vm_op_t *compile_bpeg(const char *source, const char *str)
        }
        // Special rules:
        case '_': case '^': case '$': {
-            visualize(source, str, NULL);
            op->op = VM_REF;
-            op->args.s = strndup(&c, 1);
+            if (matchchar(&str, c)) { // double __, ^^, $$
+                char tmp[3] = {c, c, '\0'};
+                op->args.s = strdup(tmp);
+            } else 
+                op->args.s = strndup(&c, 1);
+            visualize(source, str, op->args.s);
+            break;
+        }
+        // Empty choice (/) or {/}
+        case '/': {
+            str = after_spaces(str);
+            if (*str == ')' || *str == '}') {
+                op->op = VM_EMPTY;
+            } else {
+                free(op);
+                return NULL;
+            }
            break;
        }
        default: {
@ -795,10 +811,19 @@ static void load_defs(void)
    load_def("esc", "\\e"); load_def("e", "\\e");
    load_def("tab", "\\t"); load_def("t", "\\t");
    load_def("nl", "\\n"); load_def("lf", "\\n"); load_def("n", "\\n");
-    load_def("ws", "` /\\t/\\n/\\r");
-    load_def("_", "*(` /\\t/\\n/\\r)");
-    load_def("$", ">\\n / !. / >\\r\\n");
+    load_def("cBlockComment", "'/*' &'*/'");
+    load_def("cLineComment", "'//' &$");
+    load_def("cComment", "cLineComment / cBlockComment");
+    load_def("hashComment", "`# &$");
+    load_def("comment", "!(/)"); // undefined by default
+    load_def("WS", "` /\\t/\\n/\\r/comment");
+    load_def("ws", "` /\\t");
+    load_def("$$", "!(. / \\n)");
+    load_def("$", "!. / >\\n");
+    load_def("^^", "!<(. / \\n)");
    load_def("^", "<\\n / !<.");
+    load_def("__", "*(` /\\t/\\n/\\r/comment)");
+    load_def("_", "*(` /\\t)");
 }

 static match_t *get_capture_n(match_t *m, int *n)
@ -905,7 +930,7 @@ static void print_grammar(vm_op_t *op)
 {
    switch (op->op) {
        case VM_REF: fprintf(stderr, "a $%s", op->args.s); break;
-        case VM_EMPTY: fprintf(stderr, "empty"); break;
+        case VM_EMPTY: fprintf(stderr, "the empty string"); break;
        case VM_ANYCHAR: fprintf(stderr, "any char"); break;
        case VM_STRING: fprintf(stderr, "string \"%s\"", op->args.s); break;
        case VM_RANGE: {
@ -1014,9 +1039,19 @@ static vm_op_t *load_grammar(const char *grammar)
        if (verbose) fprintf(stderr, "\n");
        defs = after_spaces(defs);
        const char *name = defs;
-        check(isalpha(*name), "Definition must begin with a name");
-        while (isalpha(*defs)) ++defs;
-        name = strndup(name, (size_t)(defs-name));
+        if (strncmp(name, "^^", 2) == 0 ||
+            strncmp(name, "__", 2) == 0 ||
+            strncmp(name, "$$", 2) == 0) {
+            name = strndup(name, 2);
+            defs += 2;
+        } else if (*name == '^' || *name == '_' || *name == '$') {
+            name = strndup(name, 1);
+            defs += 1;
+        } else {
+            check(isalpha(*name), "Definition must begin with a name");
+            while (isalnum(*defs)) ++defs;
+            name = strndup(name, (size_t)(defs-name));
+        }
        defs = after_spaces(defs);
        check(matchchar(&defs, '='), "Expected '=' in definition");
        vm_op_t *def = load_def(name, defs);