From 90b8db84a48ca9ea1311abd202a546a4f697f4e6 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Mon, 28 Sep 2020 18:08:23 -0700 Subject: Moved */+ back to prefix, and dropped ? --- README.md | 4 ++-- bpeg.1 | 8 ++++---- compiler.c | 34 +++++++++++++++++----------------- grammars/bpeg.bpeg | 26 ++++++++++---------------- grammars/builtins.bpeg | 40 ++++++++++++++++++++-------------------- grammars/html.bpeg | 12 ++++++------ grammars/utf8-id.bpeg | 2 +- 7 files changed, 60 insertions(+), 66 deletions(-) diff --git a/README.md b/README.md index 5167d4d..9eb3b03 100644 --- a/README.md +++ b/README.md @@ -49,8 +49,8 @@ Pattern | Meaning `2-4 pat` | Between 2 and 4 occurrences of `pat` (inclusive) `5+ pat` | 5 or more occurrences of `pat` `5+ pat % sep` | 5 or more occurrences of `pat`, separated by `sep` (e.g. `0+ int % ","` matches `1,2,3`) -`pat*` `pat* % sep`| 0 or more occurrences of `pat` (optionally separated by `sep`) -`pat+` `pat+ % sep`| 1 or more occurrences of `pat` (optionally separated by `sep`) +`*pat` | 0 or more occurrences of `pat` (shorthand for `0+pat`) +`+pat` | 1 or more occurrences of `pat` (shorthand for `1+pat`) `pat` | `pat` matches just in front of the current position (lookahead) `@pat` | Capture `pat` (used for text replacement and backreferences) diff --git a/bpeg.1 b/bpeg.1 index 833be35..9f36e0b 100644 --- a/bpeg.1 +++ b/bpeg.1 @@ -122,11 +122,11 @@ The \fBescape-sequence-range-\fI\fB-to-\fI\fR .B \fI\fB+ \fI\fR \fI\fB-to-\fI\fB-\fI\fBs\fR (repetitions of a pattern) -.B \fI\fR* -\fI\fB-zero-or-more-times\fR +.B *\fI\fR +\fBsome-\fI\fBs\fR -.B \fI\fR+ -\fI\fB-one-or-more-times\fR +.B +\fI\fR +\fBat-least-one-\fI\fBs\fR .B \fI\fR \fB%\fI \fR \fI\fB-separated-by-\fI\fR (equivalent to \fI diff --git a/compiler.c b/compiler.c index 39a12b1..b7a2458 100644 --- a/compiler.c +++ b/compiler.c @@ -269,6 +269,22 @@ vm_op_t *bpeg_simplepattern(file_t *f, const char *str) set_range(op, 0, 1, pat, NULL); break; } + // Repeating + case '*': case '+': { + ssize_t min = c == '*' ? 0 : 1; + vm_op_t *pat = bpeg_simplepattern(f, str); + check(pat, "Expected pattern after '%c'", *str); + str = pat->end; + str = after_spaces(str); + vm_op_t *sep = NULL; + if (matchchar(&str, '%')) { + sep = bpeg_simplepattern(f, str); + check(sep, "Expected pattern for separator after '%%'"); + str = sep->end; + } + set_range(op, min, -1, pat, sep); + break; + } // Capture case '@': { op->op = VM_CAPTURE; @@ -373,23 +389,7 @@ vm_op_t *bpeg_simplepattern(file_t *f, const char *str) postfix: if (f ? str >= f->end : !*str) return op; str = after_spaces(str); - if (*str == '*' || *str == '+' || *str == '?') { // Repetitions: *, +, ? - char operator = *str; - ++str; - vm_op_t *pat = op; - vm_op_t *sep = NULL; - if (operator != '?' && matchchar(&str, '%')) { - sep = bpeg_simplepattern(f, str); - check(sep, "Expected pattern for separator after '%%'"); - str = sep->end; - } - op = calloc(sizeof(vm_op_t), 1); - set_range(op, operator == '+' ? 1 : 0, operator == '?' ? 1 : -1, pat, sep); - op->start = pat->start; - op->end = str; - op->len = -1; - goto postfix; - } else if ((str[0] == '=' || str[0] == '!') && str[1] == '=') { // Equality == and inequality != + if ((str[0] == '=' || str[0] == '!') && str[1] == '=') { // Equality == and inequality != int equal = str[0] == '='; str = after_spaces(str+2); vm_op_t *first = op; diff --git a/grammars/bpeg.bpeg b/grammars/bpeg.bpeg index 0c43c0e..288ceee 100644 --- a/grammars/bpeg.bpeg +++ b/grammars/bpeg.bpeg @@ -1,43 +1,37 @@ # This is a file defining the BPEG grammar using BPEG syntax -Grammar: __ 0+(Def [__`;])%__ __ ($$ / @!={... => "Could not parse this code"}) +Grammar: __ *(Def [__`;])%__ __ ($$ / @!={... => "Could not parse this code"}) Def: @name=id _ `: __ ( @definition=extended-pat / $$ @!={=>"No definition for rule"} / @!={...>(`;/id_`:/$) => "Invalid definition: @0"}) # This is used for command line arguments: -String-pattern: 0+(`\ (escape-sequence / pat [`;]) / .) +String-pattern: *(`\ (escape-sequence / pat [`;]) / .) -pat: simple-pat !(__("!="/"=="/`*/`+/`?)) / suffixed-pat +pat: simple-pat !(__("!="/"==")) / suffixed-pat simple-pat: Upto-and / Dot / String / Char-range / Char / Escape-range / Escape / No / Nodent / Repeat / Optional / After / Before / Capture / Replace / Ref / parens suffixed-pat: ( Eq-pat / Not-eq-pat - / Star-pat - / Plus-pat - / Question-pat ) Eq-pat: @first=pat__"=="__@second=pat Not-eq-pat: @first=pat__"!="__@second=pat -Star-pat: pat __ `* @min={=>"0"} @max="" [__`%__@sep=pat] -Plus-pat: pat __ `+ @min={=>"1"} @max="" [__`%__@sep=pat] -Question-pat: pat __ `? Dot: `. !`. String: ( - `" @s=0+(Escape / !`".) (`" / @!={=> "Expected closing quote here"}) - / `' @s=0+(Escape / !`'.) (`' / @!={=> "Expected closing quote here"}) + `" @s=*(Escape / !`".) (`" / @!={=> "Expected closing quote here"}) + / `' @s=*(Escape / !`'.) (`' / @!={=> "Expected closing quote here"}) ) Char-range: `` @low=. `- (@high=. / @!={=> "Expected a second character to form a character range"}) Char: `` (@s=. / @!={=> "Expected a character following the '`'"}) Escape-range: `\ @low=escape-sequence `- @high=escape-sequence Escape: `\ (@s=escape-sequence / $ @!={=>"Backslashes are used for escape sequences, not splitting lines"} - / @!={. 0+(Abc/`0-9) => "Invalid escape sequence: '@0'"} + / @!={. *(Abc/`0-9) => "Invalid escape sequence: '@0'"} ) escape-sequence: ( `n/`t/`r/`e/`b/`a/`v @@ -68,17 +62,17 @@ Otherwise: 2+@(Chain/pat)%(__`/__) extended-pat: Otherwise / Chain / pat # Special-symbol rules: -_: 0+(` / \t) -__: 0+(` / \t / \r / \n / comment) +_: *(` / \t) +__: *(` / \t / \r / \n / comment) $$: !$. $: !. ^^: !<$. ^: !<. -id: "^^" / "^" / "__" / "_" / "$$" / "$" / (`a-z/`A-Z) 0+(`a-z/`A-Z/`0-9/`-) +id: "^^" / "^" / "__" / "_" / "$$" / "$" / (`a-z/`A-Z) *(`a-z/`A-Z/`0-9/`-) line-comment: `# .. $ -block-comment: "#(" 0+(block-comment / !")#" .) ")#" +block-comment: "#(" *(block-comment / !")#" .) ")#" # Note: comments are undefined by default in regular BPEG comment: block-comment / line-comment diff --git a/grammars/builtins.bpeg b/grammars/builtins.bpeg index c05b4c7..be380d3 100644 --- a/grammars/builtins.bpeg +++ b/grammars/builtins.bpeg @@ -16,21 +16,21 @@ replace-all: ( (include-binary-files / is-text-file) define-highlights add-filename - 0+(...(>pattern hl-replacement)) ... + *(...(>pattern hl-replacement)) ... ) find-all: ( (include-binary-files / is-text-file) define-highlights add-filename - 0+ (!..pattern {..\n=>}) - 1+ (>..pattern add-line-number 1+(..hl-pattern) ..\n / {..\n=>}) + *(!..pattern {..\n=>}) + +(>..pattern add-line-number +(..hl-pattern) ..\n / {..\n=>}) [{!<\n => "\n"}] ) only-matches: ( (include-binary-files / is-text-file) define-highlights add-filename - 1+{...@hl-pattern =>'@1\n'} + +{...@hl-pattern =>'@1\n'} ) add-filename: [print-filenames (is-tty {=>"\033[33;1;4m@&:\033[0m\n"} / {=>"@&:\n"})] add-line-number: [print-line-numbers (is-tty {=>"\033[2m@#\033[5G|\033[0m "} / {=>"@#| "})] @@ -41,21 +41,21 @@ define-highlights: highlight @hl-start={=>"\033[31;1m"} @hl-end={=>"\033[0m"} / # Helper definitions (commonly used) #( url: ( - "file://" 1+(`/ 0+url-char) + "file://" +(`/ *url-char) / "mailto:" email - / ("https"/"http"/"ftp") "://" [1+url-char [`: 1+url-char] `@] (ipv4/ipv6/domain) [`: int] [url-path] + / ("https"/"http"/"ftp") "://" [+url-char [`: +url-char] `@] (ipv4/ipv6/domain) [`: int] [url-path] ) -url-path: 1+(`/ 0+url-char) [`? 1+(1+url-char`=1+url-char] +url-path: +(`/ *url-char) [`? +(+url-char`=+url-char] ipv4: 4 int % `. ipv6: 8 (4 Hex) % `: -domain: 1+(Abc/digit/`-)%`. +domain: +(Abc/digit/`-)%`. url-char: Abc/digit/`$/`-/`_/`./`+/`!/`*/`'/`(/`)/`,/`% url: @(https?|ftp)://(-\.)?([^\s/?\.#-]+\.?)+(/[^\s]*)?$@iS )# -indent: \n|1+(\t/' ') +indent: \n|+(\t/' ') dedent: $ !(\n|) -indented-block: |` ..$ 0+(\n|..$) +indented-block: |` ..$ *(\n|..$) utf8-codepoint: ( \x00-x7f / \xc0-xdf 1\x80-xbf @@ -64,18 +64,18 @@ utf8-codepoint: ( ) crlf: \r\n cr: \r -anglebraces: `< 0+(anglebraces / !`>$.) `> -brackets: `[ 0+(brackets / !`]$.) `] -braces: `{ 0+(braces / !`}$.) `} -parens: `( 0+(parens / !`)$.) `) -id: !<(`a-z/`A-Z/`_/`0-9) (`a-z/`A-Z/`_) 0+(`a-z/`A-Z/`_/`0-9) +anglebraces: `< *(anglebraces / !`>$.) `> +brackets: `[ *(brackets / !`]$.) `] +braces: `{ *(braces / !`}$.) `} +parens: `( *(parens / !`)$.) `) +id: !<(`a-z/`A-Z/`_/`0-9) (`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9) id-char: `a-z/`A-Z/`_/`0-9 -word: !<(`a-z/`A-Z/`_/`0-9) 1+(`a-z/`A-Z) !>(`0-9/`_) +word: !<(`a-z/`A-Z/`_/`0-9) +(`a-z/`A-Z) !>(`0-9/`_) HEX: `0-9/`A-F Hex: `0-9/`a-f/`A-F hex: `0-9/`a-f -number: 1+`0-9 [`. 0+`0-9] / `. 1+`0-9 -int: 1+`0-9 +number: +`0-9 [`. *`0-9] / `. +`0-9 +int: +`0-9 digit: `0-9 Abc: `a-z/`A-Z ABC: `A-Z @@ -94,5 +94,5 @@ $$: !$. $: !. ^^: !<$. ^: !<. -__: 0+(` /\t/\n/\r/comment) -_: 0+(` /\t) +__: *(` /\t/\n/\r/comment) +_: *(` /\t) diff --git a/grammars/html.bpeg b/grammars/html.bpeg index 5dd93cd..cea19c1 100644 --- a/grammars/html.bpeg +++ b/grammars/html.bpeg @@ -1,5 +1,5 @@ # HTML grammar -HTML: __ [doctype __] 0+html-element%__ __ +HTML: __ [doctype __] *html-element%__ __ doctype: " @@ -11,16 +11,16 @@ html-element: ( void-element: `< @tag=(id==match-tag) __attributes__ [`/] __ `> -template-element: `< @tag=(id==match-tag) __`> __ >match-body @body=0+(!`<$. / comment / html-element / !(")$.) (") +template-element: `< @tag=(id==match-tag) __`> __ >match-body @body=*(!`<$. / comment / html-element / !(")$.) (") raw-element: `< @tag=(id==match-tag) __attributes__ `> >match-body @body=.. (") -normal-element: `< @tag=(id==match-tag) __attributes__ `> >match-body @body=0+(!`<$. / comment / html-element / !(")$.) " +normal-element: `< @tag=(id==match-tag) __attributes__ `> >match-body @body=*(!`<$. / comment / html-element / !(")$.) " comment: "" -attributes: 0+attribute%__ -attribute: (1+id%`:)__`=__ (id / `" ..`" / `' ..`') -attribute: (1+id%`:)__`=__ (id / `" ..`" / `' ..`') +attributes: *attribute%__ +attribute: (+id%`:)__`=__ (id / `" ..`" / `' ..`') +attribute: (+id%`:)__`=__ (id / `" ..`" / `' ..`') match-tag: id match-body: '' diff --git a/grammars/utf8-id.bpeg b/grammars/utf8-id.bpeg index a1c0bc4..26e98ba 100644 --- a/grammars/utf8-id.bpeg +++ b/grammars/utf8-id.bpeg @@ -1,5 +1,5 @@ # Definitions of UTF8-compliant identifiers -utf8-id: utf8-id-start 0+utf8-id-cont +utf8-id: utf8-id-start *utf8-id-cont utf8-id-start: `A-Z / `a-z / !\x00-x7F ( \xc2 (\xaa / \xb5 / \xba) -- cgit v1.2.3