From 384fb6293a85502bbc9423b8b098337e77bb3738 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Thu, 10 Sep 2020 00:27:51 -0700 Subject: Added "~x" (anything but x) and dropped "^x" (up to x), added "^" (start of line), added "$" (end of line), and changed "`a,z" to "`a-z" --- bpeg.c | 138 ++++++++++++++++++++++++++++++++++++----------------------------- bpeg.h | 15 ++++++- 2 files changed, 90 insertions(+), 63 deletions(-) diff --git a/bpeg.c b/bpeg.c index d100af0..95c28be 100644 --- a/bpeg.c +++ b/bpeg.c @@ -3,10 +3,19 @@ * * Grammar: * # comment - * ` character + * . any character + * ^ beginning of a line + * $ end of a line + * _ 0 or more spaces or tabs + * ` character + * `- character between and + * `,,... character or (WIP) + * \ escape sequence (e.g. \n, \033) + * \- escape sequence range (WIP) + * \,,... one of multiple escape sequences (WIP) * ! no - * ^ upto - * & upto and including + * ~ any character as long as it doesn't match + * & upto and including (aka *~ ) * + [% ] or more s (separated by ) * * [% ] sugar for "0+ [% ]" * - [% ] or fewer s (separated by ) @@ -14,13 +23,12 @@ * - to (inclusive) s * < after , ... * > before , ... - * . any character * / otherwise * ( ) * @ capture * @ [ ] named * ; = is defined to be - * { ~ } replaced with + * { -> } replaced with * "@1" or "@{1}" first capture * "@foo" or "@{foo}" capture named "foo" */ @@ -79,7 +87,8 @@ static match_t *match(const char *str, vm_op_t *op) m->end = str + 1; return m; } - case VM_NOT: { + case VM_NOT: case VM_ANYTHING_BUT: { + if (op->op == VM_ANYTHING_BUT && !*str && *str != '\n') return NULL; match_t *m = match(str, op->args.pat); if (m != NULL) { m = free_match(m); @@ -87,20 +96,17 @@ static match_t *match(const char *str, vm_op_t *op) } m = calloc(sizeof(match_t), 1); m->start = str; + if (op->op == VM_ANYTHING_BUT) ++str; m->end = str; return m; } - case VM_UPTO: case VM_UPTO_AND: { + case VM_UPTO_AND: { match_t *m = calloc(sizeof(match_t), 1); m->start = str; while (*str && (multiline_dot || (*str != '\n' && *str != '\r'))) { match_t *p = match(str, op->args.pat); if (p == NULL) { ++str; - } else if (op->op == VM_UPTO) { - p = free_match(p); - m->end = str; - return m; } else { m->end = p->end; m->child = p; @@ -222,7 +228,8 @@ static match_t *match(const char *str, vm_op_t *op) return m; } case VM_REF: { - for (size_t i = 0; i < ndefs; i++) { + // Search backwards so newer defs take precedence + for (int i = ndefs-1; i >= 0; i--) { if (streq(defs[i].name, op->args.s)) { // Bingo! op = defs[i].op; @@ -386,15 +393,17 @@ static vm_op_t *compile_bpeg(const char *source, const char *str) visualize(source, str, "Char literal"); check(literal[0], "Expected character after '`'\n"); op->len = 1; - if (matchchar(&str, ',')) { // Range + if (matchchar(&str, '-')) { // Range visualize(source, str, "Char range"); - //debug("Char range\n"); char c2 = *str; - check(c2, "Expected character after ','"); + check(c2, "Expected character after '-'"); op->op = VM_RANGE; op->args.range.low = literal[0]; op->args.range.high = c2; ++str; + } else if (matchchar(&str, ',')) { // Set + // TODO: implement + check(0, "Sorry, character sets are not yet implemented!"); } else { //debug("Char literal\n"); op->op = VM_STRING; @@ -448,21 +457,19 @@ static vm_op_t *compile_bpeg(const char *source, const char *str) op->args.pat = p; break; } - // Upto - case '^': { - visualize(source, str, "Upto "); - //debug("Upto pattern\n"); + // Anything but + case '~': { + visualize(source, str, "Anything but "); vm_op_t *p = compile_bpeg(source, str); - check(p, "Expected pattern after '^'\n"); + check(p, "Expected pattern after '~'\n"); str = p->end; - op->op = VM_UPTO; + op->op = VM_ANYTHING_BUT; op->len = -1; op->args.pat = p; break; } // Upto and including case '&': { - //debug("Upto-and pattern\n"); visualize(source, str, "Upto and including "); vm_op_t *p = compile_bpeg(source, str); check(p, "Expected pattern after '&'\n"); @@ -605,13 +612,16 @@ static vm_op_t *compile_bpeg(const char *source, const char *str) visualize(source, str, "Replacement"); str = after_spaces(str); vm_op_t *pat = NULL; - if (!matchchar(&str, '~')) { + if (strncmp(str, "->", 2) == 0) { + str += strlen("->"); + } else { pat = compile_bpeg(source, str); - check(pat, "Expected pattern after '{'"); + check(pat, "Invalid pattern after '{'"); pat = expand_choices(source, pat); str = pat->end; str = after_spaces(str); - check(matchchar(&str, '~'), "Expected '~' after pattern in replacement"); + check(matchchar(&str, '-') && matchchar(&str, '>'), + "Expected '->' after pattern in replacement"); } visualize(source, str, NULL); str = after_spaces(str); @@ -645,12 +655,11 @@ static vm_op_t *compile_bpeg(const char *source, const char *str) visualize(source, str, NULL); break; } - // Whitespace - case '_': { - //debug("Whitespace\n"); + // Special rules: + case '_': case '^': case '$': { visualize(source, str, NULL); op->op = VM_REF; - op->args.s = strdup("_"); + op->args.s = strndup(&c, 1); break; } default: { @@ -745,41 +754,51 @@ static vm_op_t *compile_bpeg_replacement(vm_op_t *pat, const char *str) return op; } -static vm_op_t *load_def(const char *name, const char *source) +static vm_op_t *add_def(const char *name, const char *source, vm_op_t *op) { + check(ndefs < (sizeof(defs)/sizeof(defs[0])), "Too many definitions!"); defs[ndefs].name = name; - vm_op_t *op = compile_bpeg(source, source); - op = expand_choices(source, op); defs[ndefs].op = op; - defs[ndefs].source = strndup((char*)op->start, (int)(op->end - op->start)); + defs[ndefs].source = source; ++ndefs; return op; } +static vm_op_t *load_def(const char *name, const char *source) +{ + vm_op_t *op = compile_bpeg(source, source); + source = strndup((char*)op->start, (int)(op->end - op->start)); + op = expand_choices(source, op); + return add_def(name, source, op); +} + static void load_defs(void) { - load_def("_", "*(` /\\t/\\n/\\r)"); - load_def("__", "+(` /\\t/\\n/\\r)"); - load_def("nl", "\\n"); load_def("n", "\\n"); - load_def("cr", "\\r"); load_def("r", "\\r"); - load_def("tab", "\\t"); load_def("t", "\\t"); - load_def("tab", "\\t"); load_def("t", "\\t"); - load_def("esc", "\\e"); load_def("e", "\\e"); + // Approximately these are in least-to-most used order so they will be + // found most efficiently by the lookup, which goes in reverse order. load_def("crlf", "\\r\\n"); - load_def("abc", "`a,z"); - load_def("ABC", "`A,Z"); - load_def("Abc", "`a,z/`A,Z"); - load_def("digit", "`0,9"); - load_def("number", "+`0,9 ?(`. *`0,9) / `. +`0,9"); - load_def("hex", "`0,9/`a,f"); - load_def("Hex", "`0,9/`a,f/`A,F"); - load_def("HEX", "`0,9/`A,F"); - load_def("id", "(`a,z/`A,Z/`_) *(`a,z/`A,Z/`_/`0,9)"); - load_def("line", "^(?\\r\\n / !.)"); - load_def("parens", "`( *(parens / !`) .) `)"); - load_def("braces", "`{ *(braces / !`} .) `}"); - load_def("brackets", "`[ *(brackets / !`] .) `]"); + load_def("cr", "\\r"); load_def("r", "\\r"); load_def("anglebraces", "`< *(anglebraces / !`> .) `>"); + load_def("brackets", "`[ *(brackets / !`] .) `]"); + load_def("braces", "`{ *(braces / !`} .) `}"); + load_def("parens", "`( *(parens / !`) .) `)"); + load_def("line", "^(?\\r\\n / !.)"); + load_def("id", "(`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9)"); + load_def("HEX", "`0-9/`A-F"); + load_def("Hex", "`0-9/`a-f/`A-F"); + load_def("hex", "`0-9/`a-f"); + load_def("number", "+`0-9 ?(`. *`0-9) / `. +`0-9"); + load_def("digit", "`0-9"); + load_def("Abc", "`a-z/`A-Z"); + load_def("ABC", "`A-Z"); + load_def("abc", "`a-z"); + load_def("esc", "\\e"); load_def("e", "\\e"); + load_def("tab", "\\t"); load_def("t", "\\t"); + load_def("nl", "\\n"); load_def("lf", "\\n"); load_def("n", "\\n"); + load_def("ws", "` /\\t/\\n/\\r"); + load_def("_", "*(` /\\t/\\n/\\r)"); + load_def("$", ">\\n / !. / >\\r\\n"); + load_def("^", "<\\n / !<."); } static match_t *get_capture_n(match_t *m, int *n) @@ -915,14 +934,14 @@ static void print_grammar(vm_op_t *op) fprintf(stderr, ")"); break; } - case VM_UPTO: { - fprintf(stderr, "text up to ("); + case VM_UPTO_AND: { + fprintf(stderr, "text up to and including ("); print_grammar(op->args.pat); fprintf(stderr, ")"); break; } - case VM_UPTO_AND: { - fprintf(stderr, "text up to and including ("); + case VM_ANYTHING_BUT: { + fprintf(stderr, "anything but ("); print_grammar(op->args.pat); fprintf(stderr, ")"); break; @@ -1064,10 +1083,7 @@ int main(int argc, char *argv[]) pat = compile_bpeg_replacement(pat, replacement); } - defs[ndefs].name = "pattern"; - defs[ndefs].op = pat; - defs[ndefs].source = pattern; - ++ndefs; + add_def("pattern", pattern, pat); const char *grammar = "*(@pattern / \\n / .)"; op = compile_bpeg(grammar, grammar); diff --git a/bpeg.h b/bpeg.h index 3ca21c2..0109f8d 100644 --- a/bpeg.h +++ b/bpeg.h @@ -10,7 +10,16 @@ #include "utils.h" -const char *usage = "Usage: bpeg [-m|--multiline] [-v|--verbose] [-h|--help] [-s|--slow] []"; +const char *usage = ( + "Usage:\n" + " bpeg [flags] [...]\n\n" + "Flags:\n" + " -m --multiline\t dot and similar patterns include newlines\n" + " -v --verbose\t print verbose debugging info\n" + " -h --help\t print the usage and quit\n" + " -s --slow\t run in slow mode for debugging\n" + " -r --replace replace the input pattern with the given replacement\n" + " -g --grammar use the specified file as a grammar\n"); /* * Pattern matching result object */ @@ -28,10 +37,10 @@ typedef struct match_s { enum VMOpcode { VM_EMPTY = 0, VM_ANYCHAR = 1, + VM_ANYTHING_BUT, VM_STRING, VM_RANGE, VM_NOT, - VM_UPTO, VM_UPTO_AND, VM_REPEAT, VM_BEFORE, @@ -82,6 +91,8 @@ static match_t *free_match(match_t *m); static match_t *match(const char *str, vm_op_t *op); static vm_op_t *compile_bpeg(const char *source, const char *str); static vm_op_t *load_grammar(const char *grammar); +static vm_op_t *add_def(const char *name, const char *source, vm_op_t *op); +static vm_op_t *load_def(const char *name, const char *source); static vm_op_t *chain_together(vm_op_t *first, vm_op_t *second); static vm_op_t *compile_bpeg_string(const char *source, const char *str); static vm_op_t *expand_chain(const char *source, vm_op_t *first); -- cgit v1.2.3