From 1d1c3d35aae0e060a6527d6e83575dd7ff71328e Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Sun, 13 Sep 2020 20:33:11 -0700 Subject: Added == operator --- bpeg.bpeg | 8 ++++++-- bpeg.c | 21 +++++++++++++++++---- compiler.c | 26 ++++++++++++++++++++++++-- grammars/builtins.bpeg | 5 +++-- grammars/html.bpeg | 21 +++++++++++++++------ types.h | 1 + vm.c | 25 +++++++++++++++++++++++-- 7 files changed, 89 insertions(+), 18 deletions(-) diff --git a/bpeg.bpeg b/bpeg.bpeg index 0a03aa8..3301155 100644 --- a/bpeg.bpeg +++ b/bpeg.bpeg @@ -4,10 +4,14 @@ Grammar = __ *Def%(__`;__) ?(`;__); Def = @[name]Ref __ `= __ @[definition]extended-pat; # This is used for command line arguments: -String-grammar = *(`\ pat ?`; / .); +String-pattern = *(`\ pat ?`; / .); -pat = Empty / Dot / String / Char-range / Char / Escape-range / Escape / No / Anything-but +pat = suffixed-pat / simple-pat; +simple-pat = Empty / Dot / String / Char-range / Char / Escape-range / Escape / No / Anything-but / Upto-and / Repeat / After / Before / Capture / Replace / Ref / parens; +suffixed-pat = Eq-pat; + +Eq-pat = @[first]simple-pat "==" @[second]pat; Empty = `/ >(__ (`)/`})); Dot = `.; diff --git a/bpeg.c b/bpeg.c index 95f133c..89f35e7 100644 --- a/bpeg.c +++ b/bpeg.c @@ -24,7 +24,8 @@ * ( ) * @ capture * @ [ ] named - * { => } replaced with + * { => } replaced with + * == iff matches at the same spot for the same length * "@1" or "@[1]" first capture * "@foo" or "@[foo]" capture named "foo" * followed by @@ -50,7 +51,8 @@ static const char *usage = ( "Flags:\n" " -h --help\t print the usage and quit\n" " -v --verbose\t print verbose debugging info\n" - " -d --define = define a grammar rule\n" + " -d --define =\t define a grammar rule\n" + " -D --define-string =\t define a grammar rule (string-pattern)\n" " -e --escaped \t provide an escaped pattern (equivalent to bpeg '\\()')\n" " -s --string \t provide a string pattern (equivalent to bpeg '', but may be useful if '' begins with a '-')\n" " -r --replace replace the input pattern with the given replacement\n" @@ -83,8 +85,10 @@ static int run_match(grammar_t *g, const char *filename, vm_op_t *pattern, int v } match_t *m = match(g, input, pattern); if (m != NULL && m->end > m->start + 1) { - if (isatty(STDOUT_FILENO)) printf("\033[1;4;33m%s\033[0m\n", filename); - else printf("%s\n", filename); + if (filename != NULL) { + if (isatty(STDOUT_FILENO)) printf("\033[1;4;33m%s\033[0m\n", filename); + else printf("%s\n", filename); + } print_match(m, isatty(STDOUT_FILENO) ? "\033[0m" : NULL, verbose); freefile(input); return 0; @@ -153,6 +157,15 @@ int main(int argc, char *argv[]) vm_op_t *pat = bpeg_pattern(src); check(pat, "Failed to compile pattern"); add_def(g, src, def, pat); + } else if (FLAG("--define-string") || FLAG("-D")) { + char *def = flag; + char *eq = strchr(def, '='); + check(eq, usage); + *eq = '\0'; + char *src = ++eq; + vm_op_t *pat = bpeg_stringpattern(src); + check(pat, "Failed to compile pattern"); + add_def(g, src, def, pat); } else if (FLAG("--escaped") || FLAG("-e")) { check(npatterns == 0, "Cannot define multiple patterns"); vm_op_t *p = bpeg_pattern(flag); diff --git a/compiler.c b/compiler.c index 7c5220d..4e2b185 100644 --- a/compiler.c +++ b/compiler.c @@ -372,8 +372,7 @@ vm_op_t *bpeg_simplepattern(const char *str) const char *refname = str; str = after_name(str); op->op = VM_REF; - op->len = (ssize_t)(str - refname); - op->args.s = strndup(refname, (size_t)op->len); + op->args.s = strndup(refname, (size_t)(str - refname)); break; } else { free(op); @@ -382,6 +381,29 @@ vm_op_t *bpeg_simplepattern(const char *str) } } op->end = str; + + // Postfix operators: + postfix: + str = after_spaces(str); + if (strncmp(str, "==", 2) == 0) { + str += 2; + vm_op_t *first = op; + vm_op_t *second = bpeg_simplepattern(str); + check(second, "Expected pattern after '=='"); + check(first->len == -1 || second->len == -1 || first->len == second->len, + "Two patterns cannot possibly match the same (different lengths: %ld != %ld)", + first->len, second->len); + op = calloc(sizeof(vm_op_t), 1); + op->op = VM_EQUAL; + op->start = str; + op->end = second->end; + op->len = (first->len == -1 || second->len == -1) ? -1 : first->len; + op->args.multiple.first = first; + op->args.multiple.second = second; + str = op->end; + goto postfix; + } + return op; } diff --git a/grammars/builtins.bpeg b/grammars/builtins.bpeg index 0741bfc..f92e0aa 100644 --- a/grammars/builtins.bpeg +++ b/grammars/builtins.bpeg @@ -2,7 +2,7 @@ pattern = !(/); # Not defined by default replacement = {!(/)=>}; # Not defined by default replace-all = +&&@replacement &&$$; -find-all = {&&>matching-line=>} +(matching-line/non-matching-line); +find-all = {&&>matching-line=>} +(matching-line/non-matching-line) ?{!<\n => "\n"}; only-matches = +{&&@pattern=>'@1\n'}; matching-line = +&@pattern *. $ ?\n; non-matching-line = {&&(\n/$$)=>}; @@ -14,7 +14,8 @@ anglebraces = `< *(anglebraces / ~~`>) `>; brackets = `[ *(brackets / ~~`]) `]; braces = `{ *(braces / ~~`}) `}; parens = `( *(parens / ~~`)) `); -id = (`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9); +id = !<(`a-z/`A-Z/`_/`0-9) (`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9); +word = !<(`a-z/`A-Z/`_/`0-9) +(`a-z/`A-Z) !>(`0-9/`_); HEX = `0-9/`A-F; Hex = `0-9/`a-f/`A-F; hex = `0-9/`a-f; diff --git a/grammars/html.bpeg b/grammars/html.bpeg index f6812bd..7f8976a 100644 --- a/grammars/html.bpeg +++ b/grammars/html.bpeg @@ -3,16 +3,25 @@ HTML = __ ?(doctype __) *html-element%__ __; doctype = "; -html-element = void-element / template-element / raw-text-element / normal-element; +html-element = ( + >(`<("area"/"base"/"br"/"col"/"embed"/"hr"/"img"/"input"/"link"/"meta"/"param"/"source"/"track"/"wbr")) void-element + / >(`<("script"/"style"/"textarea"/"title")) raw-element + / >(`<("template")) template-element + / normal-element); -void-element = `< ("area"/"base"/"br"/"col"/"embed"/"hr"/"img"/"input"/"link"/"meta"/"param"/"source"/"track"/"wbr") *(__attribute) __ ?`/ __ `>; +void-element = `< @[tag](id==match-tag) __attributes__ `/? __ `>; -template-element = " __ *(~~`< / comment / html-element / ~~(")) ("); +template-element = `< @[tag](id==match-tag) __`> __ >match-body @[body]0+(~~`< / comment / html-element / ~~(")) ("); -raw-text-element = `<@[tag]("script"/"style"/"textarea"/"title") *(__attribute) __ `> &("); +raw-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*~~(") ("); -normal-element = !raw-text-element `<@[tag]id *(__attribute) __ `> *(~~`< / comment / html-element / ~~(")) "; +normal-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*(~~`< / comment / html-element / ~~(")) "; comment = ""; -attribute = +id%`:__`=__(id / `" &`" / `' &`'); +attributes = *(!(attribute==match-attribute))%__ __(attribute==match-attribute)__ *attribute%__; +attribute = (+id%`:)__`=__ (id / `" &`" / `' &`'); +attribute = (+id%`:)__`=__ (id / `" *~`" `" / `' *~`' `'); +match-attribute = attribute; +match-tag = id; +match-body = (/); diff --git a/types.h b/types.h index 82c282c..f335285 100644 --- a/types.h +++ b/types.h @@ -23,6 +23,7 @@ enum VMOpcode { VM_CAPTURE, VM_OTHERWISE, VM_CHAIN, + VM_EQUAL, VM_REPLACE, VM_REF, VM_BACKREF, diff --git a/vm.c b/vm.c index c1427a5..8e0957d 100644 --- a/vm.c +++ b/vm.c @@ -28,6 +28,7 @@ static const char *opcode_names[] = { [VM_OTHERWISE] = "OTHERWISE", [VM_CHAIN] = "CHAIN", [VM_REPLACE] = "REPLACE", + [VM_EQUAL] = "EQUAL", [VM_REF] = "REF", [VM_BACKREF] = "BACKREF", }; @@ -253,6 +254,25 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref m1->nextsibling = m2; return m; } + case VM_EQUAL: { + match_t *m1 = _match(g, str, op->args.multiple.first, rec); + if (m1 == NULL) return NULL; + + // == matches iff both have the same start and end point: + match_t *m2 = _match(g, str, op->args.multiple.second, rec); + if (m2 == NULL || m2->end != m1->end) { + destroy_match(&m1); + destroy_match(&m2); + return NULL; + } + match_t *m = calloc(sizeof(match_t), 1); + m->start = str; + m->end = m2->end; + m->op = op; + m->child = m1; + m1->nextsibling = m2; + return m; + } case VM_REPLACE: { match_t *m = calloc(sizeof(match_t), 1); m->start = str; @@ -512,8 +532,8 @@ void print_match(match_t *m, const char *color, int verbose) const char *prev = m->start; for (match_t *child = m->child; child; child = child->nextsibling) { // Skip children from e.g. zero-width matches like >@foo - if (!(m->start <= child->start && child->start <= m->end && - m->start <= child->end && child->end <= m->end)) + if (!(prev <= child->start && child->start <= m->end && + prev <= child->end && child->end <= m->end)) continue; if (child->start > prev) printf("%s%.*s", color ? color : "", (int)(child->start - prev), prev); @@ -608,6 +628,7 @@ static match_t *match_backref(const char *str, vm_op_t *op, match_t *cap) str += len; prev = child->start; } + if (child->start < prev) continue; *dest = match_backref(str, op, child); if (*dest == NULL) { destroy_match(&ret); -- cgit v1.2.3