diff options
| -rw-r--r-- | bpeg.bpeg | 8 | ||||
| -rw-r--r-- | bpeg.c | 21 | ||||
| -rw-r--r-- | compiler.c | 26 | ||||
| -rw-r--r-- | grammars/builtins.bpeg | 5 | ||||
| -rw-r--r-- | grammars/html.bpeg | 21 | ||||
| -rw-r--r-- | types.h | 1 | ||||
| -rw-r--r-- | vm.c | 25 |
7 files changed, 89 insertions, 18 deletions
@@ -4,10 +4,14 @@ Grammar = __ *Def%(__`;__) ?(`;__); Def = @[name]Ref __ `= __ @[definition]extended-pat; # This is used for command line arguments: -String-grammar = *(`\ pat ?`; / .); +String-pattern = *(`\ pat ?`; / .); -pat = Empty / Dot / String / Char-range / Char / Escape-range / Escape / No / Anything-but +pat = suffixed-pat / simple-pat; +simple-pat = Empty / Dot / String / Char-range / Char / Escape-range / Escape / No / Anything-but / Upto-and / Repeat / After / Before / Capture / Replace / Ref / parens; +suffixed-pat = Eq-pat; + +Eq-pat = @[first]simple-pat "==" @[second]pat; Empty = `/ >(__ (`)/`})); Dot = `.; @@ -24,7 +24,8 @@ * ( <pat> ) <pat> * @ <pat> capture <pat> * @ [ <name> ] <pat> <pat> named <name> - * { <pat> => <str> } <pat> replaced with <str> + * { <pat> => <str> } <pat> replaced with <str> + * <pat1> == <pat2> <pat1> iff <pat2> matches at the same spot for the same length * "@1" or "@[1]" first capture * "@foo" or "@[foo]" capture named "foo" * <pat1> <pat2> <pat1> followed by <pat2> @@ -50,7 +51,8 @@ static const char *usage = ( "Flags:\n" " -h --help\t print the usage and quit\n" " -v --verbose\t print verbose debugging info\n" - " -d --define <name>=<def> define a grammar rule\n" + " -d --define <name>=<def>\t define a grammar rule\n" + " -D --define-string <name>=<def>\t define a grammar rule (string-pattern)\n" " -e --escaped <pat>\t provide an escaped pattern (equivalent to bpeg '\\(<pat>)')\n" " -s --string <pat>\t provide a string pattern (equivalent to bpeg '<pat>', but may be useful if '<pat>' begins with a '-')\n" " -r --replace <replacement> replace the input pattern with the given replacement\n" @@ -83,8 +85,10 @@ static int run_match(grammar_t *g, const char *filename, vm_op_t *pattern, int v } match_t *m = match(g, input, pattern); if (m != NULL && m->end > m->start + 1) { - if (isatty(STDOUT_FILENO)) printf("\033[1;4;33m%s\033[0m\n", filename); - else printf("%s\n", filename); + if (filename != NULL) { + if (isatty(STDOUT_FILENO)) printf("\033[1;4;33m%s\033[0m\n", filename); + else printf("%s\n", filename); + } print_match(m, isatty(STDOUT_FILENO) ? "\033[0m" : NULL, verbose); freefile(input); return 0; @@ -153,6 +157,15 @@ int main(int argc, char *argv[]) vm_op_t *pat = bpeg_pattern(src); check(pat, "Failed to compile pattern"); add_def(g, src, def, pat); + } else if (FLAG("--define-string") || FLAG("-D")) { + char *def = flag; + char *eq = strchr(def, '='); + check(eq, usage); + *eq = '\0'; + char *src = ++eq; + vm_op_t *pat = bpeg_stringpattern(src); + check(pat, "Failed to compile pattern"); + add_def(g, src, def, pat); } else if (FLAG("--escaped") || FLAG("-e")) { check(npatterns == 0, "Cannot define multiple patterns"); vm_op_t *p = bpeg_pattern(flag); @@ -372,8 +372,7 @@ vm_op_t *bpeg_simplepattern(const char *str) const char *refname = str; str = after_name(str); op->op = VM_REF; - op->len = (ssize_t)(str - refname); - op->args.s = strndup(refname, (size_t)op->len); + op->args.s = strndup(refname, (size_t)(str - refname)); break; } else { free(op); @@ -382,6 +381,29 @@ vm_op_t *bpeg_simplepattern(const char *str) } } op->end = str; + + // Postfix operators: + postfix: + str = after_spaces(str); + if (strncmp(str, "==", 2) == 0) { + str += 2; + vm_op_t *first = op; + vm_op_t *second = bpeg_simplepattern(str); + check(second, "Expected pattern after '=='"); + check(first->len == -1 || second->len == -1 || first->len == second->len, + "Two patterns cannot possibly match the same (different lengths: %ld != %ld)", + first->len, second->len); + op = calloc(sizeof(vm_op_t), 1); + op->op = VM_EQUAL; + op->start = str; + op->end = second->end; + op->len = (first->len == -1 || second->len == -1) ? -1 : first->len; + op->args.multiple.first = first; + op->args.multiple.second = second; + str = op->end; + goto postfix; + } + return op; } diff --git a/grammars/builtins.bpeg b/grammars/builtins.bpeg index 0741bfc..f92e0aa 100644 --- a/grammars/builtins.bpeg +++ b/grammars/builtins.bpeg @@ -2,7 +2,7 @@ pattern = !(/); # Not defined by default replacement = {!(/)=>}; # Not defined by default replace-all = +&&@replacement &&$$; -find-all = {&&>matching-line=>} +(matching-line/non-matching-line); +find-all = {&&>matching-line=>} +(matching-line/non-matching-line) ?{!<\n => "\n"}; only-matches = +{&&@pattern=>'@1\n'}; matching-line = +&@pattern *. $ ?\n; non-matching-line = {&&(\n/$$)=>}; @@ -14,7 +14,8 @@ anglebraces = `< *(anglebraces / ~~`>) `>; brackets = `[ *(brackets / ~~`]) `]; braces = `{ *(braces / ~~`}) `}; parens = `( *(parens / ~~`)) `); -id = (`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9); +id = !<(`a-z/`A-Z/`_/`0-9) (`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9); +word = !<(`a-z/`A-Z/`_/`0-9) +(`a-z/`A-Z) !>(`0-9/`_); HEX = `0-9/`A-F; Hex = `0-9/`a-f/`A-F; hex = `0-9/`a-f; diff --git a/grammars/html.bpeg b/grammars/html.bpeg index f6812bd..7f8976a 100644 --- a/grammars/html.bpeg +++ b/grammars/html.bpeg @@ -3,16 +3,25 @@ HTML = __ ?(doctype __) *html-element%__ __; doctype = "<!DOCTYPE" &`>; -html-element = void-element / template-element / raw-text-element / normal-element; +html-element = ( + >(`<("area"/"base"/"br"/"col"/"embed"/"hr"/"img"/"input"/"link"/"meta"/"param"/"source"/"track"/"wbr")) void-element + / >(`<("script"/"style"/"textarea"/"title")) raw-element + / >(`<("template")) template-element + / normal-element); -void-element = `< ("area"/"base"/"br"/"col"/"embed"/"hr"/"img"/"input"/"link"/"meta"/"param"/"source"/"track"/"wbr") *(__attribute) __ ?`/ __ `>; +void-element = `< @[tag](id==match-tag) __attributes__ `/? __ `>; -template-element = "<template" __`> __ *(~~`< / comment / html-element / ~~("</template"__`>)) ("</template"__`>); +template-element = `< @[tag](id==match-tag) __`> __ >match-body @[body]0+(~~`< / comment / html-element / ~~("</"tag__`>)) ("</"tag__`>); -raw-text-element = `<@[tag]("script"/"style"/"textarea"/"title") *(__attribute) __ `> &("</"tag__`>); +raw-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*~~("</"tag__`>) ("</"tag__`>); -normal-element = !raw-text-element `<@[tag]id *(__attribute) __ `> *(~~`< / comment / html-element / ~~("</"tag__`>)) "</"tag__`>; +normal-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*(~~`< / comment / html-element / ~~("</"tag__`>)) "</"tag__`>; comment = "<!--" &&"-->"; -attribute = +id%`:__`=__(id / `" &`" / `' &`'); +attributes = *(!(attribute==match-attribute))%__ __(attribute==match-attribute)__ *attribute%__; +attribute = (+id%`:)__`=__ (id / `" &`" / `' &`'); +attribute = (+id%`:)__`=__ (id / `" *~`" `" / `' *~`' `'); +match-attribute = attribute; +match-tag = id; +match-body = (/); @@ -23,6 +23,7 @@ enum VMOpcode { VM_CAPTURE, VM_OTHERWISE, VM_CHAIN, + VM_EQUAL, VM_REPLACE, VM_REF, VM_BACKREF, @@ -28,6 +28,7 @@ static const char *opcode_names[] = { [VM_OTHERWISE] = "OTHERWISE", [VM_CHAIN] = "CHAIN", [VM_REPLACE] = "REPLACE", + [VM_EQUAL] = "EQUAL", [VM_REF] = "REF", [VM_BACKREF] = "BACKREF", }; @@ -253,6 +254,25 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref m1->nextsibling = m2; return m; } + case VM_EQUAL: { + match_t *m1 = _match(g, str, op->args.multiple.first, rec); + if (m1 == NULL) return NULL; + + // <p1>==<p2> matches iff both have the same start and end point: + match_t *m2 = _match(g, str, op->args.multiple.second, rec); + if (m2 == NULL || m2->end != m1->end) { + destroy_match(&m1); + destroy_match(&m2); + return NULL; + } + match_t *m = calloc(sizeof(match_t), 1); + m->start = str; + m->end = m2->end; + m->op = op; + m->child = m1; + m1->nextsibling = m2; + return m; + } case VM_REPLACE: { match_t *m = calloc(sizeof(match_t), 1); m->start = str; @@ -512,8 +532,8 @@ void print_match(match_t *m, const char *color, int verbose) const char *prev = m->start; for (match_t *child = m->child; child; child = child->nextsibling) { // Skip children from e.g. zero-width matches like >@foo - if (!(m->start <= child->start && child->start <= m->end && - m->start <= child->end && child->end <= m->end)) + if (!(prev <= child->start && child->start <= m->end && + prev <= child->end && child->end <= m->end)) continue; if (child->start > prev) printf("%s%.*s", color ? color : "", (int)(child->start - prev), prev); @@ -608,6 +628,7 @@ static match_t *match_backref(const char *str, vm_op_t *op, match_t *cap) str += len; prev = child->start; } + if (child->start < prev) continue; *dest = match_backref(str, op, child); if (*dest == NULL) { destroy_match(&ret); |
