diff options
| -rw-r--r-- | bpeg.bpeg | 10 | ||||
| -rw-r--r-- | bpeg.c | 2 | ||||
| -rw-r--r-- | compiler.c | 30 | ||||
| -rw-r--r-- | grammars/builtins.bpeg | 16 | ||||
| -rw-r--r-- | types.h | 2 | ||||
| -rw-r--r-- | vm.c | 40 |
6 files changed, 53 insertions, 47 deletions
@@ -7,14 +7,14 @@ Def = @[name]Ref __ `= __ @[definition]extended-pat; String-pattern = *(`\ pat ?`; / .); pat = suffixed-pat / simple-pat; -simple-pat = Empty / Dot / String / Char-range / Char / Escape-range / Escape / No / Anything-but - / Upto-and / Repeat / After / Before / Capture / Replace / Ref / parens; +simple-pat = Empty / Upto / Dot / String / Char-range / Char / Escape-range / Escape / No / Anything-but + / Repeat / After / Before / Capture / Replace / Ref / parens; suffixed-pat = Eq-pat; Eq-pat = @[first]simple-pat "==" @[second]pat; Empty = `/ >(__ (`)/`})); -Dot = `.; +Dot = `. !`.; String = ( `" @[s]*(Escape / ~`") `" / `' @[s]*(Escape / ~`') `' @@ -30,7 +30,7 @@ escape-sequence = ( ); No = `! _ @pat; Anything-but = `~ ?`~ _ @pat; -Upto-and = `& ?`& _ @pat; +Upto = 2-3`. ?>(_@pat); Repeat = ( @[min]int _ `- _ @[max]int /{@[min]{=>"0"}=>} @[max]int _ `- @@ -64,7 +64,7 @@ $ = !.; ^^ = !<$.; ^ = !<.; -hash-comment = `# *.; +hash-comment = `# .. $; # Note: comments are undefined by default in regular BPEG comment = hash-comment; @@ -3,6 +3,7 @@ * * Grammar: * # <comment> comment + * .. any text up to the following pattern (if any); (multiline: ...) * . any character (multiline: $.) * ^ beginning of a line (^^: beginning of file) * $ end of a line ($$: end of file) @@ -13,7 +14,6 @@ * \<e1>-<e2> escape sequence range (e.g. \x00-\xF0) * ! <pat> no <pat> * ~ <pat> any character as long as it doesn't match <pat> (multiline: ~~<pat>) - * & <pat> upto and including <pat> (aka *~<pat> <pat>) (multiline: &&<pat>) * <N=1> + <pat> [% <sep="">] <N> or more <pat>s (separated by <sep>) * * <pat> [% <sep="">] sugar for "0+ <pat> [% <sep>]" * <N=1> - <pat> [% <sep="">] <N> or fewer <pat>s (separated by <sep>) @@ -97,10 +97,21 @@ vm_op_t *bpeg_simplepattern(const char *str) switch (c) { // Any char (dot) ($. is multiline anychar) case '.': { - anychar: - op->op = VM_ANYCHAR; - op->len = 1; - break; + if (matchchar(&str, '.')) { // ".." + if (matchchar(&str, '.')) // "..." + op->multiline = 1; + vm_op_t *till = bpeg_simplepattern(str); + str = str; // Don't advance str, the following pattern will be re-matched. + op->op = VM_UPTO; + op->len = -1; + op->args.pat = till; + break; + } else { + anychar: + op->op = VM_ANYCHAR; + op->len = 1; + break; + } } // Char literals case '`': { @@ -184,17 +195,6 @@ vm_op_t *bpeg_simplepattern(const char *str) op->args.pat = p; break; } - // Upto and including <pat> - case '&': { - if (matchchar(&str, '&')) op->multiline = 1; - vm_op_t *p = bpeg_simplepattern(str); - check(p, "Expected pattern after '&'\n"); - str = p->end; - op->op = VM_UPTO_AND; - op->len = -1; - op->args.pat = p; - break; - } // Number of repetitions: <N>(-<N> / - / + / "") case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { diff --git a/grammars/builtins.bpeg b/grammars/builtins.bpeg index f92e0aa..c558c21 100644 --- a/grammars/builtins.bpeg +++ b/grammars/builtins.bpeg @@ -1,11 +1,11 @@ # Meta-rules for acting on everything pattern = !(/); # Not defined by default replacement = {!(/)=>}; # Not defined by default -replace-all = +&&@replacement &&$$; -find-all = {&&>matching-line=>} +(matching-line/non-matching-line) ?{!<\n => "\n"}; -only-matches = +{&&@pattern=>'@1\n'}; -matching-line = +&@pattern *. $ ?\n; -non-matching-line = {&&(\n/$$)=>}; +replace-all = +(...@replacement) ...; +find-all = {... >matching-line =>} +(matching-line/non-matching-line) ?{!<\n => "\n"}; +only-matches = +{...@pattern=>'@1\n'}; +matching-line = +(..@pattern)..$ ?\n; +non-matching-line = {..$=>}; # Helper definitions (commonly used) crlf = \r\n; @@ -28,10 +28,10 @@ abc = `a-z; esc = \e; e = \e; tab = \t; t = \t; nl = \n; lf = \n; n = \n; -c-block-comment = '/*' &&'*/'; -c-line-comment = '//' &$; +c-block-comment = '/*' ... '*/'; +c-line-comment = '//' ..$; c-comment = c-line-comment / c-block-comment; -hash-comment = `# &$; +hash-comment = `# ..$; comment = !(/); # No default definition, can be overridden WS = ` /\t/\n/\r/comment; ws = ` /\t; @@ -16,7 +16,7 @@ enum VMOpcode { VM_STRING, VM_RANGE, VM_NOT, - VM_UPTO_AND, + VM_UPTO, VM_REPEAT, VM_BEFORE, VM_AFTER, @@ -20,7 +20,7 @@ static const char *opcode_names[] = { [VM_STRING] = "STRING", [VM_RANGE] = "RANGE", [VM_NOT] = "NOT", - [VM_UPTO_AND] = "UPTO_AND", + [VM_UPTO] = "UPTO", [VM_REPEAT] = "REPEAT", [VM_BEFORE] = "BEFORE", [VM_AFTER] = "AFTER", @@ -130,24 +130,30 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref m->end = str; return m; } - case VM_UPTO_AND: { + case VM_UPTO: { match_t *m = calloc(sizeof(match_t), 1); m->start = str; m->op = op; - match_t *p = NULL; - for (const char *prev = NULL; p == NULL && prev < str; ) { - prev = str; - p = _match(g, str, op->args.pat, rec); - if (*str && (op->multiline || *str != '\n')) - ++str; - } - if (p) { - m->end = p->end; - m->child = p; - return m; + if (op->args.pat) { + for (const char *prev = NULL; prev < str; ) { + prev = str; + match_t *p = _match(g, str, op->args.pat, rec); + if (p) { + destroy_match(&p); + break; + } + // This isn't in the for() structure because there needs to + // be at least once chance to match the pattern, even if + // we're at the end of the string already (e.g. "..$"). + if (*str && (op->multiline || *str != '\n')) ++str; + } + } else if (op->multiline) { + while (*str) ++str; + } else { + while (*str && *str != '\n') ++str; } - destroy_match(&m); - return NULL; + m->end = str; + return m; } case VM_REPEAT: { match_t *m = calloc(sizeof(match_t), 1); @@ -374,8 +380,8 @@ void print_pattern(vm_op_t *op) fprintf(stderr, ")"); break; } - case VM_UPTO_AND: { - fprintf(stderr, "text up to and including ("); + case VM_UPTO: { + fprintf(stderr, "text up to ("); print_pattern(op->args.pat); fprintf(stderr, ")"); break; |
