diff --git a/bpeg.bpeg b/bpeg.bpeg index 3301155..39e0f3f 100644 --- a/bpeg.bpeg +++ b/bpeg.bpeg @@ -7,14 +7,14 @@ Def = @[name]Ref __ `= __ @[definition]extended-pat; String-pattern = *(`\ pat ?`; / .); pat = suffixed-pat / simple-pat; -simple-pat = Empty / Dot / String / Char-range / Char / Escape-range / Escape / No / Anything-but - / Upto-and / Repeat / After / Before / Capture / Replace / Ref / parens; +simple-pat = Empty / Upto / Dot / String / Char-range / Char / Escape-range / Escape / No / Anything-but + / Repeat / After / Before / Capture / Replace / Ref / parens; suffixed-pat = Eq-pat; Eq-pat = @[first]simple-pat "==" @[second]pat; Empty = `/ >(__ (`)/`})); -Dot = `.; +Dot = `. !`.; String = ( `" @[s]*(Escape / ~`") `" / `' @[s]*(Escape / ~`') `' @@ -30,7 +30,7 @@ escape-sequence = ( ); No = `! _ @pat; Anything-but = `~ ?`~ _ @pat; -Upto-and = `& ?`& _ @pat; +Upto = 2-3`. ?>(_@pat); Repeat = ( @[min]int _ `- _ @[max]int /{@[min]{=>"0"}=>} @[max]int _ `- @@ -64,7 +64,7 @@ $ = !.; ^^ = !<$.; ^ = !<.; -hash-comment = `# *.; +hash-comment = `# .. $; # Note: comments are undefined by default in regular BPEG comment = hash-comment; diff --git a/bpeg.c b/bpeg.c index 89f35e7..2245ff8 100644 --- a/bpeg.c +++ b/bpeg.c @@ -3,6 +3,7 @@ * * Grammar: * # comment + * .. any text up to the following pattern (if any); (multiline: ...) * . any character (multiline: $.) * ^ beginning of a line (^^: beginning of file) * $ end of a line ($$: end of file) @@ -13,7 +14,6 @@ * \- escape sequence range (e.g. \x00-\xF0) * ! no * ~ any character as long as it doesn't match (multiline: ~~) - * & upto and including (aka *~ ) (multiline: &&) * + [% ] or more s (separated by ) * * [% ] sugar for "0+ [% ]" * - [% ] or fewer s (separated by ) diff --git a/compiler.c b/compiler.c index 4e2b185..df34e44 100644 --- a/compiler.c +++ b/compiler.c @@ -97,10 +97,21 @@ vm_op_t *bpeg_simplepattern(const char *str) switch (c) { // Any char (dot) ($. 
is multiline anychar) case '.': { - anychar: - op->op = VM_ANYCHAR; - op->len = 1; - break; + if (matchchar(&str, '.')) { // ".." + if (matchchar(&str, '.')) // "..." + op->multiline = 1; + vm_op_t *till = bpeg_simplepattern(str); + str = str; // Don't advance str, the following pattern will be re-matched. + op->op = VM_UPTO; + op->len = -1; + op->args.pat = till; + break; + } else { + anychar: + op->op = VM_ANYCHAR; + op->len = 1; + break; + } } // Char literals case '`': { @@ -184,17 +195,6 @@ vm_op_t *bpeg_simplepattern(const char *str) op->args.pat = p; break; } - // Upto and including - case '&': { - if (matchchar(&str, '&')) op->multiline = 1; - vm_op_t *p = bpeg_simplepattern(str); - check(p, "Expected pattern after '&'\n"); - str = p->end; - op->op = VM_UPTO_AND; - op->len = -1; - op->args.pat = p; - break; - } // Number of repetitions: (- / - / + / "") case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { diff --git a/grammars/builtins.bpeg b/grammars/builtins.bpeg index f92e0aa..c558c21 100644 --- a/grammars/builtins.bpeg +++ b/grammars/builtins.bpeg @@ -1,11 +1,11 @@ # Meta-rules for acting on everything pattern = !(/); # Not defined by default replacement = {!(/)=>}; # Not defined by default -replace-all = +&&@replacement &&$$; -find-all = {&&>matching-line=>} +(matching-line/non-matching-line) ?{!<\n => "\n"}; -only-matches = +{&&@pattern=>'@1\n'}; -matching-line = +&@pattern *. $ ?\n; -non-matching-line = {&&(\n/$$)=>}; +replace-all = +(...@replacement) ...; +find-all = {... >matching-line =>} +(matching-line/non-matching-line) ?{!<\n => "\n"}; +only-matches = +{...@pattern=>'@1\n'}; +matching-line = +(..@pattern)..$ ?\n; +non-matching-line = {..$=>}; # Helper definitions (commonly used) crlf = \r\n; @@ -28,10 +28,10 @@ abc = `a-z; esc = \e; e = \e; tab = \t; t = \t; nl = \n; lf = \n; n = \n; -c-block-comment = '/*' &&'*/'; -c-line-comment = '//' &$; +c-block-comment = '/*' ... 
'*/'; +c-line-comment = '//' ..$; c-comment = c-line-comment / c-block-comment; -hash-comment = `# &$; +hash-comment = `# ..$; comment = !(/); # No default definition, can be overridden WS = ` /\t/\n/\r/comment; ws = ` /\t; diff --git a/types.h b/types.h index f335285..6346342 100644 --- a/types.h +++ b/types.h @@ -16,7 +16,7 @@ enum VMOpcode { VM_STRING, VM_RANGE, VM_NOT, - VM_UPTO_AND, + VM_UPTO, VM_REPEAT, VM_BEFORE, VM_AFTER, diff --git a/vm.c b/vm.c index 8e0957d..b69b3cb 100644 --- a/vm.c +++ b/vm.c @@ -20,7 +20,7 @@ static const char *opcode_names[] = { [VM_STRING] = "STRING", [VM_RANGE] = "RANGE", [VM_NOT] = "NOT", - [VM_UPTO_AND] = "UPTO_AND", + [VM_UPTO] = "UPTO", [VM_REPEAT] = "REPEAT", [VM_BEFORE] = "BEFORE", [VM_AFTER] = "AFTER", @@ -130,24 +130,30 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref m->end = str; return m; } - case VM_UPTO_AND: { + case VM_UPTO: { match_t *m = calloc(sizeof(match_t), 1); m->start = str; m->op = op; - match_t *p = NULL; - for (const char *prev = NULL; p == NULL && prev < str; ) { - prev = str; - p = _match(g, str, op->args.pat, rec); - if (*str && (op->multiline || *str != '\n')) - ++str; + if (op->args.pat) { + for (const char *prev = NULL; prev < str; ) { + prev = str; + match_t *p = _match(g, str, op->args.pat, rec); + if (p) { + destroy_match(&p); + break; + } + // This isn't in the for() structure because there needs to + // be at least one chance to match the pattern, even if + // we're at the end of the string already (e.g. "..$"). 
+ if (*str && (op->multiline || *str != '\n')) ++str; + } + } else if (op->multiline) { + while (*str) ++str; + } else { + while (*str && *str != '\n') ++str; } - if (p) { - m->end = p->end; - m->child = p; - return m; - } - destroy_match(&m); - return NULL; + m->end = str; + return m; } case VM_REPEAT: { match_t *m = calloc(sizeof(match_t), 1); @@ -374,8 +380,8 @@ void print_pattern(vm_op_t *op) fprintf(stderr, ")"); break; } - case VM_UPTO_AND: { - fprintf(stderr, "text up to and including ("); + case VM_UPTO: { + fprintf(stderr, "text up to ("); print_pattern(op->args.pat); fprintf(stderr, ")"); break;