From 21807a663d0ab1fc934e1bb3ad485fe1c3e9c821 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Wed, 16 Sep 2020 17:57:56 -0700 Subject: Consolidated repetition ops (instead of '+', '*', '?', etc. now it's all number based: '1+', '0+', '0-1') and reverted to UPTO_AND behavior instead of UPTO --- bpeg.bpeg | 70 -------------------------------------------------- compiler.c | 36 +++++--------------------- grammars/bpeg.bpeg | 66 +++++++++++++++++++++++++++++++++++++++++++++++ grammars/builtins.bpeg | 32 +++++++++++------------ grammars/html.bpeg | 12 ++++----- types.h | 2 +- vm.c | 11 ++++---- 7 files changed, 102 insertions(+), 127 deletions(-) delete mode 100644 bpeg.bpeg create mode 100644 grammars/bpeg.bpeg diff --git a/bpeg.bpeg b/bpeg.bpeg deleted file mode 100644 index de0c676..0000000 --- a/bpeg.bpeg +++ /dev/null @@ -1,70 +0,0 @@ -# This is a file defining the BPEG grammar using BPEG syntax - -Grammar = __ *Def%(__`;__) ?(`;__); -Def = @[name]Ref __ `= __ @[definition]extended-pat; - -# This is used for command line arguments: -String-pattern = *(`\ pat ?`; / .); - -pat = suffixed-pat / simple-pat; -simple-pat = Empty / Upto / Dot / String / Char-range / Char / Escape-range / Escape / No - / Nodent / Repeat / After / Before / Capture / Replace / Ref / parens; -suffixed-pat = Eq-pat; - -Eq-pat = @[first]simple-pat "==" @[second]pat; - -Empty = `/ >(__ (`)/`})); -Dot = `. !`.; -String = ( - `" @[s]*(Escape / !`"$.) `" - / `' @[s]*(Escape / !`'$.) `' - ); -Char-range = `` @[low]. `- @[high].; -Char = `` @[s].; -Escape-range = `\ @[low]escape-sequence `- @[high]escape-sequence; -Escape = `\ @[s]escape-sequence; -escape-sequence = ( - 1-3 `0-7 - / `x 2 (`0-9/`a-f/`A-F) - /`a/`b/`e/`n/`r/`t/`v / . / \n - ); -No = `! _ @pat; -Nodent = `|; -Upto = 2-3`. ?>(_@pat); -Repeat = ( - @[min]int _ `- _ @[max]int - /{@[min]{=>"0"}=>} @[max]int _ `- - / @[min]int _ `+ @[max](/) - / @[min]@[max]int - /{@[min]{=>"1"}=>} `+ @[max](/) - /{@[min]{=>"0"}=>} `* @[max](/) - /{@[min]{=>"0"}=>} `? {@[max]{=>"1"}=>} - ) _ @[repeat-pat]pat ?( __ `% __ @[sep]pat); -After = `< _ pat; -Before = `> _ pat; -Capture = `@ ?(_ `[ @[capture-name]Ref `]) _ @[capture]pat; -Replace = `{ __ ( - ?(@[replace-pat]extended-pat __) "=>" ?(__ @[replacement]String) - ) __ `}; -Ref = @[name]( - "^^" / "^" / "__" / "_" / "$$" / "$" / - (`a-z/`A-Z) *(`a-z/`A-Z/`0-9/`-)); - -parens = `( __ extended-pat __ `); - -Chain = +@pat % (__); -Otherwise = +@(Chain/pat) % (__`/__); -extended-pat = Otherwise / Chain / pat; - -# Special-symbol rules: -_ = *(` / \t); -__ = *(` / \t / \r / \n / comment); -$$ = !$.; -$ = !.; -^^ = !<$.; -^ = !<.; - -hash-comment = `# .. $; - -# Note: comments are undefined by default in regular BPEG -comment = hash-comment; diff --git a/compiler.c b/compiler.c index 1c0a817..b27717b 100644 --- a/compiler.c +++ b/compiler.c @@ -101,10 +101,11 @@ vm_op_t *bpeg_simplepattern(const char *str) if (matchchar(&str, '.')) // "..." op->multiline = 1; vm_op_t *till = bpeg_simplepattern(str); - // Don't advance str, the following pattern will be re-matched. - op->op = VM_UPTO; + op->op = VM_UPTO_AND; op->len = -1; op->args.pat = till; + if (till) + str = till->end; break; } else { anychar: @@ -205,38 +206,15 @@ vm_op_t *bpeg_simplepattern(const char *str) check(pat, "Expected pattern after repetition count"); str = pat->end; str = after_spaces(str); + vm_op_t *sep = NULL; if (matchchar(&str, '%')) { - vm_op_t *sep = bpeg_simplepattern(str); + sep = bpeg_simplepattern(str); check(sep, "Expected pattern for separator after '%%'"); str = sep->end; - set_range(op, min, max, pat, sep); } else { str = pat->end; - set_range(op, min, max, pat, NULL); - } - break; - } - // Special repetitions: - case '+': case '*': case '?': { - ssize_t min = -1, max = -1; - switch (c) { - case '+': min = 1, max = -1; break; - case '*': min = 0, max = -1; break; - case '?': min = 0, max = 1; break; - } - vm_op_t *pat = bpeg_simplepattern(str); - check(pat, "Expected pattern after '%c'", c); - str = pat->end; - str = after_spaces(str); - if (matchchar(&str, '%')) { - vm_op_t *sep = bpeg_simplepattern(str); - check(sep, "Expected pattern for separator after '%%'"); - str = sep->end; - set_range(op, min, max, pat, sep); - } else { - str = pat->end; - set_range(op, min, max, pat, NULL); } + set_range(op, min, max, pat, sep); break; } // Lookbehind @@ -269,7 +247,7 @@ vm_op_t *bpeg_simplepattern(const char *str) op = expand_choices(op); str = op->end; str = after_spaces(str); - check(matchchar(&str, ')'), "Expected closing parenthesis"); + check(matchchar(&str, ')'), "Expected closing ')' instead of \"%s\"", str); break; } // Capture diff --git a/grammars/bpeg.bpeg b/grammars/bpeg.bpeg new file mode 100644 index 0000000..dd3a3b4 --- /dev/null +++ b/grammars/bpeg.bpeg @@ -0,0 +1,66 @@ +# This is a file defining the BPEG grammar using BPEG syntax + +Grammar = __ 0+Def%(__`;__) 0-1(`;__); +Def = @[name]Ref __ `= __ @[definition]extended-pat; + +# This is used for command line arguments: +String-pattern = 0+(`\ pat 0-1`; / .); + +pat = suffixed-pat / simple-pat; +simple-pat = Empty / Upto-and / Dot / String / Char-range / Char / Escape-range / Escape / No + / Nodent / Repeat / After / Before / Capture / Replace / Ref / parens; +suffixed-pat = Eq-pat; + +Eq-pat = @[first]simple-pat "==" @[second]pat; + +Empty = `/ >(__ (`)/`})); +Dot = `. !`.; +String = ( + `" @[s]0+(Escape / !`"$.) `" + / `' @[s]0+(Escape / !`'$.) `' + ); +Char-range = `` @[low]. `- @[high].; +Char = `` @[s].; +Escape-range = `\ @[low]escape-sequence `- @[high]escape-sequence; +Escape = `\ @[s]escape-sequence; +escape-sequence = ( + 1-3 `0-7 + / `x 2 (`0-9/`a-f/`A-F) + /`a/`b/`e/`n/`r/`t/`v / . / \n + ); +No = `! _ @pat; +Nodent = `|; +Upto-and = 2-3`. 0-1(_@pat); +Repeat = ( + @[min]int _ `- _ @[max]int + / @[min]int _ `+ @[max](/) + / @[min]@[max]int + ) _ @[repeat-pat]pat 0-1( __ `% __ @[sep]pat); +After = `< _ pat; +Before = `> _ pat; +Capture = `@ 0-1(_ `[ @[capture-name]Ref `]) _ @[capture]pat; +Replace = `{ __ ( + 0-1(@[replace-pat]extended-pat __) "=>" 0-1(__ @[replacement]String) + ) __ `}; +Ref = @[name]( + "^^" / "^" / "__" / "_" / "$$" / "$" / + (`a-z/`A-Z) 0+(`a-z/`A-Z/`0-9/`-)); + +parens = `( __ extended-pat __ `); + +Chain = 2+@pat % (__); +Otherwise = 2+@(Chain/pat) % (__`/__); +extended-pat = Otherwise / Chain / pat; + +# Special-symbol rules: +_ = 0+(` / \t); +__ = 0+(` / \t / \r / \n / comment); +$$ = !$.; +$ = !.; +^^ = !<$.; +^ = !<.; + +hash-comment = `# .. $; + +# Note: comments are undefined by default in regular BPEG +comment = hash-comment; diff --git a/grammars/builtins.bpeg b/grammars/builtins.bpeg index b90ac12..6ba31df 100644 --- a/grammars/builtins.bpeg +++ b/grammars/builtins.bpeg @@ -1,29 +1,29 @@ # Meta-rules for acting on everything pattern = !(/); # Not defined by default replacement = !(/); # Not defined by default -replace-all = +(...@replacement) ...; -find-all = +find-next%\n ?{!<\n => "\n"}; +replace-all = 1+(...@replacement) ...; +find-all = 1+find-next%\n 0-1{!<\n => "\n"}; find-next = matching-line / {..\n =>} find-next; -only-matches = +{...@pattern=>'@1\n'}; -matching-line = +(..@pattern) ..$; +only-matches = 1+{...@pattern=>'@1\n'}; +matching-line = 1+(..@pattern) ..$; # Helper definitions (commonly used) -indent = \n|+(\t/' '); +indent = \n|1+(\t/' '); dedent = $ !(\n|); -indented-block = |` ..$ *(\n|..$); +indented-block = |` ..$ 0+(\n|..$); crlf = \r\n; cr = \r; r = \r; -anglebraces = `< *(anglebraces / !`>.) `>; -brackets = `[ *(brackets / !`].) `]; -braces = `{ *(braces / !`}.) `}; -parens = `( *(parens / !`).) `); -id = !<(`a-z/`A-Z/`_/`0-9) (`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9); -word = !<(`a-z/`A-Z/`_/`0-9) +(`a-z/`A-Z) !>(`0-9/`_); +anglebraces = `< 0+(anglebraces / !`>.) `>; +brackets = `[ 0+(brackets / !`].) `]; +braces = `{ 0+(braces / !`}.) `}; +parens = `( 0+(parens / !`).) `); +id = !<(`a-z/`A-Z/`_/`0-9) (`a-z/`A-Z/`_) 0+(`a-z/`A-Z/`_/`0-9); +word = !<(`a-z/`A-Z/`_/`0-9) 1+(`a-z/`A-Z) !>(`0-9/`_); HEX = `0-9/`A-F; Hex = `0-9/`a-f/`A-F; hex = `0-9/`a-f; -number = +`0-9 ?(`. *`0-9) / `. +`0-9; -int = +`0-9; +number = 1+`0-9 0-1(`. 0+`0-9) / `. 1+`0-9; +int = 1+`0-9; digit = `0-9; Abc = `a-z/`A-Z; ABC = `A-Z; @@ -42,5 +42,5 @@ $$ = !$.; $ = !.; ^^ = !<$.; ^ = !<.; -__ = *(` /\t/\n/\r/comment); -_ = *(` /\t); +__ = 0+(` /\t/\n/\r/comment); +_ = 0+(` /\t); diff --git a/grammars/html.bpeg b/grammars/html.bpeg index 7af1f63..451e61c 100644 --- a/grammars/html.bpeg +++ b/grammars/html.bpeg @@ -1,5 +1,5 @@ # HTML grammar -HTML = __ ?(doctype __) *html-element%__ __; +HTML = __ 0-1(doctype __) 0+html-element%__ __; doctype = "; @@ -9,19 +9,19 @@ html-element = ( / >(`<("template")) template-element / normal-element); -void-element = `< @[tag](id==match-tag) __attributes__ ?`/ __ `>; +void-element = `< @[tag](id==match-tag) __attributes__ 0-1`/ __ `>; template-element = `< @[tag](id==match-tag) __`> __ >match-body @[body]0+(!`<$. / comment / html-element / !(")$.) ("); raw-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body].. ("); -normal-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*(!`<$. / comment / html-element / !(")$.) "; +normal-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]0+(!`<$. / comment / html-element / !(")$.) "; comment = ""; -attributes = *(!(attribute==match-attribute))%__ __(attribute==match-attribute)__ *attribute%__; -attribute = (+id%`:)__`=__ (id / `" ..`" / `' ..`'); -attribute = (+id%`:)__`=__ (id / `" ..`" / `' ..`'); +attributes = 0+(!(attribute==match-attribute))%__ __(attribute==match-attribute)__ 0+attribute%__; +attribute = (1+id%`:)__`=__ (id / `" ..`" / `' ..`'); +attribute = (1+id%`:)__`=__ (id / `" ..`" / `' ..`'); match-attribute = attribute; match-tag = id; match-body = (/); diff --git a/types.h b/types.h index b2461a2..3749156 100644 --- a/types.h +++ b/types.h @@ -20,7 +20,7 @@ enum VMOpcode { VM_STRING, VM_RANGE, VM_NOT, - VM_UPTO, + VM_UPTO_AND, VM_REPEAT, VM_BEFORE, VM_AFTER, diff --git a/vm.c b/vm.c index 77d6d69..a245efb 100644 --- a/vm.c +++ b/vm.c @@ -19,7 +19,7 @@ static const char *opcode_names[] = { [VM_STRING] = "STRING", [VM_RANGE] = "RANGE", [VM_NOT] = "NOT", - [VM_UPTO] = "UPTO", + [VM_UPTO_AND] = "UPTO_AND", [VM_REPEAT] = "REPEAT", [VM_BEFORE] = "BEFORE", [VM_AFTER] = "AFTER", @@ -127,7 +127,7 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, unsigned int m->end = str; return m; } - case VM_UPTO: { + case VM_UPTO_AND: { match_t *m = calloc(sizeof(match_t), 1); m->start = str; m->op = op; @@ -136,7 +136,8 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, unsigned int prev = str; match_t *p = _match(g, str, op->args.pat, flags, rec); if (p) { - destroy_match(&p); + m->child = p; + str = p->end; break; } // This isn't in the for() structure because there needs to @@ -410,8 +411,8 @@ void print_pattern(vm_op_t *op) fprintf(stderr, ")"); break; } - case VM_UPTO: { - fprintf(stderr, "text up to ("); + case VM_UPTO_AND: { + fprintf(stderr, "text up to and including ("); print_pattern(op->args.pat); fprintf(stderr, ")"); break; -- cgit v1.2.3