diff options
| -rw-r--r-- | README.md | 8 | ||||
| -rw-r--r-- | bp.1 | 5 | ||||
| -rw-r--r-- | grammars/bp.bp | 2 | ||||
| -rw-r--r-- | grammars/builtins.bp | 12 | ||||
| -rw-r--r-- | grammars/c++.bp | 5 | ||||
| -rw-r--r-- | grammars/c.bp | 5 | ||||
| -rw-r--r-- | grammars/go.bp | 3 | ||||
| -rw-r--r-- | grammars/javascript.bp | 3 | ||||
| -rw-r--r-- | grammars/lisp.bp | 1 | ||||
| -rw-r--r-- | grammars/lua.bp | 3 | ||||
| -rw-r--r-- | grammars/python.bp | 4 | ||||
| -rw-r--r-- | grammars/rust.bp | 3 | ||||
| -rw-r--r-- | grammars/shell.bp | 3 | ||||
| -rw-r--r-- | grammars/utf8-id.bp | 8 | ||||
| -rw-r--r-- | match.c | 36 | ||||
| -rw-r--r-- | pattern.c | 14 |
16 files changed, 58 insertions, 57 deletions
@@ -49,8 +49,7 @@ Pattern | Meaning `$` | The end of a line `__` | Zero or more whitespace characters (including newlines) `_` | Zero or more whitespace characters (excluding newlines) -`|` | A word boundary (i.e. the left or right edge of a word) -`{foo}` | The literal string `foo` with word boundaries on both ends (shorthand for `|"foo"|`) +`{foo}` | The literal string `foo` with word boundaries on both ends `` `c `` | The literal character `c` `` `a-z `` | The character range `a` through `z` `` `a,b `` | The character `a` or the character `b` @@ -99,11 +98,6 @@ Name | Meaning `number` | An integer or floating point number `Hex` | A hexadecimal character `id` | An identifier -`|` | A word boundary -`^` | Start of a line -`^^` | Start of a file -`$` | End of a line -`$$` | End of a file As well as these common definitions, BP also comes with a set of language-specific or domain-specific grammars. These are not full language @@ -116,9 +116,6 @@ Zero or more whitespace characters (specifically, spaces and tabs) .B __ Zero or more whitespace or newline characters -.B | -A word boundary - .B "foo" .B 'foo' The literal string \fIstring\fR. Escape sequences are not allowed. @@ -237,7 +234,7 @@ Find files ending with ".c" and replace the extension with ".h" .TP .B -bp -p '|"foobar"| parens' my_file.py +bp -p '{foobar} parens' my_file.py Find the literal string \fB"foobar"\fR, assuming it's a complete word, followed by a pair of matching parentheses in the file \fImy_file.py\fR diff --git a/grammars/bp.bp b/grammars/bp.bp index 5e4dcc8..ed7eb80 100644 --- a/grammars/bp.bp +++ b/grammars/bp.bp @@ -78,6 +78,6 @@ extended-pat: Otherwise / Replace / Chain / pat _: *(` / \t) __: *(` / \t / \r / \n / comment) -id: "__" / "_" / "|" / `a-z,A-Z *`a-z,A-Z,0-9,- +id: "__" / "_" / `a-z,A-Z *`a-z,A-Z,0-9,- comment: `# .. $ diff --git a/grammars/builtins.bp b/grammars/builtins.bp index 354ac50..526498e 100644 --- a/grammars/builtins.bp +++ b/grammars/builtins.bp @@ -16,12 +16,18 @@ brackets: `[ ..%(\n/brackets/string) `] braces: `{ ..%(\n/braces/string) `} parens: `( ..%(\n/parens/string) `) string: `" ..%(`\.) `" / `' ..%(`\.) `' -id: !<`a-z,A-Z,_,0-9 (`a-z,A-Z,_ *`a-z,A-Z,_,0-9)!=keyword | +left-id-boundary: ^ / <(\x00-x7f!=id-char) / <((\xc0-xdf \x80-xbf)!=id-char) + / <((\xe0-xef 2\x80-xbf)!=id-char) / <((\xf0-xf7 3\x80-xbf)!=id-char) +right-id-boundary: !id-char +id: left-id-boundary !`0-9 (+id-char)!=keyword id-char: `a-z,A-Z,_,0-9 -|: !<`a-z,A-Z,_,0-9 / !`a-z,A-Z,_,0-9 var: id keyword: !"" # No keywords defined by default -word: |+`a-z,A-Z !`0-9,_ +left-word-boundary: ^ / <(\x00-x7f!=word-char) / <((\xc0-xdf \x80-xbf)!=word-char) + / <((\xe0-xef 2\x80-xbf)!=word-char) / <((\xf0-xf7 3\x80-xbf)!=word-char) +right-word-boundary: !word-char +word-char: `a-z,A-Z,_,0-9,-,' +word: left-word-boundary +word-char HEX: `0-9,A-F Hex: `0-9,a-f,A-F hex: `0-9,a-f diff --git a/grammars/c++.bp b/grammars/c++.bp index 33e3fb7..a48d5a2 100644 --- a/grammars/c++.bp +++ b/grammars/c++.bp @@ -8,7 +8,7 @@ comment: "//" .. $ / "/*" ..%\n "*/" string: `" ..%(`\.) `" -keyword: |( +keyword: "alignas" / "alignof" / "and" / "and_eq" / "asm" / "atomic_cancel" / "atomic_commit" / "atomic_noexcept" / "auto" / "bitand" / "bitor" / "bool" / "break" / "case" / "catch" / "char" / "char8_t" / "char16_t" / "char32_t" / "class" / "compl" / "concept" / "const" / @@ -22,8 +22,7 @@ keyword: |( "static_cast" / "struct" / "switch" / "synchronized" / "template" / "this" / "thread_local" / "throw" / "true" / "try" / "typedef" / "typeid" / "typename" / "union" / "unsigned" / "using" / "virtual" / "void" / "volatile" / "wchar_t" / "while" / "xor" / "xor_eq" -)| -function-def: ^_ 2+(keyword / id / anglebraces / `*) % __ parens (__`; / >(__`{)) +function-def: ^_ 2+(id / keyword / anglebraces / `*) % __ parens (__`; / >(__`{)) function: function-def __ braces macro: ^{#define} ..$ *(<`\ \n..$) import: ^({#include}/{#import}) __ (string / `<..`>) diff --git a/grammars/c.bp b/grammars/c.bp index be91cbe..2c74c3d 100644 --- a/grammars/c.bp +++ b/grammars/c.bp @@ -8,14 +8,13 @@ comment: "//" .. $ / "/*" ..%\n "*/" string: `" ..%(`\.) `" -keyword: |( +keyword: "auto" / "break" / "case" / "char" / "const" / "continue" / "default" / "do" / "double" / "else" / "enum" / "extern" / "float" / "for" / "goto" / "if" / "int" / "long" / "register" / "return" / "short" / "signed" / "sizeof" / "static" / "struct" / "switch" / "typedef" / "union" / "unsigned" / "void" / "volatile" / "while" -)| -function-def: ^_ 2+(keyword / id / `*) % __ parens (__`; / >(__`{)) +function-def: ^_ 2+(id / keyword / `*) % __ parens (__`; / >(__`{)) function: function-def __ braces macro: ^{#define} ..$ *(<`\ \n..$) import: ^{#include} __ (string / `<..`>) diff --git a/grammars/go.bp b/grammars/go.bp index 425325a..c9f7377 100644 --- a/grammars/go.bp +++ b/grammars/go.bp @@ -8,11 +8,10 @@ comment: "//" .. $ / "/*" ..%\n "*/" string: `" ..%(`\.) `" -keyword: |( +keyword: "break" / "default" / "func" / "interface" / "select" / "case" / "defer" / "go" / "map" / "struct" / "chan" / "else" / "goto" / "package" / "switch" / "const" / "fallthrough" / "if" / "range" / "type" / "continue" / "for" / "import" / "return" / "var" -)| function-def: {func} __ id __ parens __ [id / parens] >(__`{) function: function-def __ braces import: {import} __ (parens / string) diff --git a/grammars/javascript.bp b/grammars/javascript.bp index b663050..fdd76a1 100644 --- a/grammars/javascript.bp +++ b/grammars/javascript.bp @@ -8,7 +8,7 @@ comment: "//" .. $ / "/*" ..%\n "*/" string: `" ..%(`\.) `" / `' ..%(`\.) `' / `/ ..%(`\.) `/ -keyword: |( +keyword: "abstract" / "arguments" / "await" / "boolean" / "break" / "byte" / "case" / "catch" / "char" / "class" / "const" / "continue" / "debugger" / "default" / "delete" / "do" / "double" / "else" / "enum" / "eval" / "export" / "extends" / @@ -18,7 +18,6 @@ keyword: |( "public" / "return" / "short" / "static" / "super" / "switch" / "synchronized" / "this" / "throw" / "throws" / "transient" / "true" / "try" / "typeof" / "var" / "void" / "volatile" / "while" / "with" / "yield" -)| function-def: {function} __ [id__] parens / (id / parens) __ "=>" function: function-def __ braces import: {import} ..%braces (`; / $) diff --git a/grammars/lisp.bp b/grammars/lisp.bp index 3e1f4dc..d02a458 100644 --- a/grammars/lisp.bp +++ b/grammars/lisp.bp @@ -13,4 +13,3 @@ function-def: `(__{defun}__id function: function-def ..%parens `) id-char: `A-Z,a-z,0-9,!,$,%,&,*,+,-,.,/,:,<,=,>,?,@,^,_,~ id: !<`A-Z,a-z,0-9,!,$,%,&,*,+,-,.,/,:,<,=,>,?,@,^,_,~ +`A-Z,a-z,0-9,!,$,%,&,*,+,-,.,/,:,<,=,>,?,@,^,_,~ -|: !<`A-Z,a-z,0-9,!,$,%,&,*,+,-,.,/,:,<,=,>,?,@,^,_,~ / !`A-Z,a-z,0-9,!,$,%,&,*,+,-,.,/,:,<,=,>,?,@,^,_,~ diff --git a/grammars/lua.bp b/grammars/lua.bp index 5554167..6967f1e 100644 --- a/grammars/lua.bp +++ b/grammars/lua.bp @@ -9,11 +9,10 @@ comment: "--" (`[ @eqs=*`= `[ ..%\n (`]eqs`]) / ..$) string: `"..%(`\.) `" / `' ..%(`\.) `' / `[ @eqs=*`= `[ ..%\n (`]eqs`]) table: `{ ..%(table/string/comment/\n) `} -keyword: |( +keyword: "and" / "break" / "do" / "else" / "elseif" / "end" / "false" / "for" / "function" / "goto" / "if" / "in" / "local" / "nil" / "not" / "or" / "repeat" / "return" / "then" / "true" / "until" / "while" -)| function-def: {function}[_id (*(`.id)[`:id])]_ parens block: function / ({do}/{then}) ..%(comment/string/block/\n) {end} function: function-def ..%(comment/string/block/\n) {end} diff --git a/grammars/python.bp b/grammars/python.bp index daab6a5..37d6a88 100644 --- a/grammars/python.bp +++ b/grammars/python.bp @@ -8,11 +8,11 @@ comment: `# ..$ string: "'''" ..%\n "'''" / '"""' ..%\n '"""' / `" ..%(`\.) `" / `' ..%(`\.) `' -keyword: |("and" / "as" / "assert" / "break" / "class" / "continue" / "def" / +keyword: "and" / "as" / "assert" / "break" / "class" / "continue" / "def" / "del" / "elif" / "else" / "except" / "finally" / "for" / "from" / "global" / "if" / "import" / "in" / "is" / "lambda" / "None" / "nonlocal" / "not" / "or" / "pass" / "raise" / "return" / "try" / "while" / - "with" / "yield")| + "with" / "yield" class: class-def +(\N ..$) class-def: ^_{class}_id[_parens]_`: function: function-def +(\N ..$) diff --git a/grammars/rust.bp b/grammars/rust.bp index bb58a61..97a1e73 100644 --- a/grammars/rust.bp +++ b/grammars/rust.bp @@ -8,12 +8,11 @@ comment: "//" .. $ / "/*" ..%(comment / \n) "*/" string: `" ..%(`\.) `" -keyword: |( +keyword: "as" / "break" / "const" / "continue" / "crate" / "else" / "enum" / "extern" / "false" / "fn" / "for" / "if" / "impl" / "in" / "let" / "loop" / "match" / "mod" / "move" / "mut" / "pub" / "ref" / "return" / "self" / "Self" / "static" / "struct" / "super" / "trait" / "true" / "type" / "unsafe" / "use" / "where" / "while" -)| function-def: {fn} __ id __ parens __ ["->"__(id / parens)] >(__`{) function: function-def __ braces import: {use} _ *(id / braces) % "::" _ `; diff --git a/grammars/shell.bp b/grammars/shell.bp index 1ad182d..ff13d0b 100644 --- a/grammars/shell.bp +++ b/grammars/shell.bp @@ -9,11 +9,10 @@ comment: `#..$ string: `" ..%(`\./subcommand/\n) `" / `' ..%\n `' / "<<" _ @delim=id _$ ..%\n (^delim$) subcommand: `` ..%\n `` / "$" parens -keyword: |( +keyword: "echo" / "read" / "set" / "unset" / "readonly" / "shift" / "export" / "if" / "fi" / "else" / "while" / "do" / "done" / "for" / "until" / "case" / "esac" / "break" / "continue" / "exit" / "return" / "trap" / "wait" / "eval" / "exec" / "ulimit" / "umask" -)| function-def: ^_ ["function"_] id _ `(_`) >(__`{) function: function-def __ braces var: `$ (id / braces) diff --git a/grammars/utf8-id.bp b/grammars/utf8-id.bp index 9d5381d..b8ba2d3 100644 --- a/grammars/utf8-id.bp +++ b/grammars/utf8-id.bp @@ -1,11 +1,7 @@ # Definitions of UTF8-compliant identifiers -id: | utf8-id-start *utf8-id-cont +id: left-word-boundary (utf8-id-start *utf8-id-cont)!=keyword id-char: utf8-id-cont / utf8-id-start -|: !id-char / ( - !<(\x00-x7f==id-char) - !<((\xc0-xdf \x80-xbf)==id-char) - !<((\xe0-xef 2\x80-xbf)==id-char) - !<((\xf0-xf7 3\x80-xbf)==id-char)) +word-char: utf8-id-cont / utf8-id-start utf8-id-start: `A-Z / `a-z / !\x00-x7F ( \xc2 (\xaa / \xb5 / \xba) @@ -35,6 +35,7 @@ static match_t *unused_matches = NULL; static match_t *in_use_matches = NULL; #endif +static inline pat_t *deref(def_t *defs, pat_t *pat); __attribute__((returns_nonnull)) static match_t *new_match(pat_t *pat, const char *start, const char *end, match_t *child); __attribute__((nonnull, pure)) @@ -49,6 +50,18 @@ __attribute__((hot, nonnull(2,3,4))) static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ignorecase); // +// If the given pattern is a reference, look it up and return the referenced +// pattern. This is used for an optimization to avoid repeated lookups. +// +static inline pat_t *deref(def_t *defs, pat_t *pat) +{ + if (pat && pat->type == BP_REF) { + def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name); + if (def) pat = def->pat; + } + return pat; +} +// // Return the location of the next character or UTF8 codepoint. // (i.e. skip forward one codepoint at a time, not one byte at a time) // @@ -209,7 +222,8 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool } case BP_UPTO: { match_t *m = new_match(pat, str, str, NULL); - pat_t *target = pat->args.multiple.first, *skip = pat->args.multiple.second; + pat_t *target = deref(defs, pat->args.multiple.first), + *skip = deref(defs, pat->args.multiple.second); if (!target && !skip) { while (str < f->end && *str != '\n') ++str; m->end = str; @@ -253,28 +267,30 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool match_t **dest = &m->child; size_t reps = 0; ssize_t max = pat->args.repetitions.max; + pat_t *repeating = deref(defs, pat->args.repetitions.repeat_pat); + pat_t *sep = deref(defs, pat->args.repetitions.sep); for (reps = 0; max == -1 || reps < (size_t)max; ++reps) { const char *start = str; // Separator match_t *msep = NULL; - if (pat->args.repetitions.sep != NULL && reps > 0) { - msep = match(defs, f, str, pat->args.repetitions.sep, ignorecase); + if (sep != NULL && reps > 0) { + msep = match(defs, f, str, sep, ignorecase); if (msep == NULL) break; str = msep->end; } - match_t *mp = match(defs, f, str, pat->args.repetitions.repeat_pat, ignorecase); + match_t *mp = match(defs, f, str, repeating, ignorecase); if (mp == NULL) { str = start; if (msep) recycle_if_unused(&msep); break; } if (mp->end == start && reps > 0) { - // Since no forward progress was made on either - // `repeat_pat` or `sep` and BP does not have mutable - // state, it's guaranteed that no progress will be made on - // the next loop either. We know that this will continue to - // loop until reps==max, so let's just cut to the chase - // instead of looping infinitely. + // Since no forward progress was made on either `repeating` + // or `sep` and BP does not have mutable state, it's + // guaranteed that no progress will be made on the next + // loop either. We know that this will continue to loop + // until reps==max, so let's just cut to the chase instead + // of looping infinitely. if (msep) recycle_if_unused(&msep); recycle_if_unused(&mp); if (pat->args.repetitions.max == -1) @@ -305,12 +305,12 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str) if (c == '{') { // Surround with `|` word boundaries pat_t *left = new_pat(f, start, start+1, -1, BP_REF); - left->args.ref.name = "|"; - left->args.ref.len = 1; + left->args.ref.name = "left-word-boundary"; + left->args.ref.len = strlen(left->args.ref.name); pat_t *right = new_pat(f, str, str+1, -1, BP_REF); - right->args.ref.name = "|"; - right->args.ref.len = 1; + right->args.ref.name = "right-word-boundary"; + right->args.ref.len = strlen(right->args.ref.name); pat = chain_together(f, left, chain_together(f, pat, right)); } @@ -454,10 +454,10 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str) return new_pat(f, start, str, 0, BP_END_OF_FILE); return new_pat(f, start, str, 0, BP_END_OF_LINE); } - // Special rules: - case '_': case '|': { + // Whitespace: + case '_': { size_t namelen = 1; - if (c == '_' && matchchar(&str, c)) // double __, ^^, $$ + if (matchchar(&str, '_')) // double __ (whitespace with newlines) ++namelen; if (matchchar(&str, ':')) return NULL; // Don't match definitions pat_t *ref = new_pat(f, start, str, -1, BP_REF); |
