Overhaul of |-word boundaries (| is deprecated), performance
improvements for repeating matches, tweaks to the logic of word vs. id
This commit is contained in:
parent
3359a804c8
commit
a93220972f
@ -49,8 +49,7 @@ Pattern | Meaning
|
||||
`$` | The end of a line
|
||||
`__` | Zero or more whitespace characters (including newlines)
|
||||
`_` | Zero or more whitespace characters (excluding newlines)
|
||||
`|` | A word boundary (i.e. the left or right edge of a word)
|
||||
`{foo}` | The literal string `foo` with word boundaries on both ends (shorthand for `|"foo"|`)
|
||||
`{foo}` | The literal string `foo` with word boundaries on both ends
|
||||
`` `c `` | The literal character `c`
|
||||
`` `a-z `` | The character range `a` through `z`
|
||||
`` `a,b `` | The character `a` or the character `b`
|
||||
@ -99,11 +98,6 @@ Name | Meaning
|
||||
`number` | An integer or floating point number
|
||||
`Hex` | A hexadecimal character
|
||||
`id` | An identifier
|
||||
`|` | A word boundary
|
||||
`^` | Start of a line
|
||||
`^^` | Start of a file
|
||||
`$` | End of a line
|
||||
`$$` | End of a file
|
||||
|
||||
As well as these common definitions, BP also comes with a set of
|
||||
language-specific or domain-specific grammars. These are not full language
|
||||
|
5
bp.1
5
bp.1
@ -116,9 +116,6 @@ Zero or more whitespace characters (specifically, spaces and tabs)
|
||||
.B __
|
||||
Zero or more whitespace or newline characters
|
||||
|
||||
.B |
|
||||
A word boundary
|
||||
|
||||
.B "foo"
|
||||
.B 'foo'
|
||||
The literal string \fIfoo\fR. Escape sequences are not allowed.
|
||||
@ -237,7 +234,7 @@ Find files ending with ".c" and replace the extension with ".h"
|
||||
|
||||
.TP
|
||||
.B
|
||||
bp -p '|"foobar"| parens' my_file.py
|
||||
bp -p '{foobar} parens' my_file.py
|
||||
Find the literal string \fB"foobar"\fR, assuming it's a complete word, followed
|
||||
by a pair of matching parentheses in the file \fImy_file.py\fR
|
||||
|
||||
|
@ -78,6 +78,6 @@ extended-pat: Otherwise / Replace / Chain / pat
|
||||
_: *(` / \t)
|
||||
__: *(` / \t / \r / \n / comment)
|
||||
|
||||
id: "__" / "_" / "|" / `a-z,A-Z *`a-z,A-Z,0-9,-
|
||||
id: "__" / "_" / `a-z,A-Z *`a-z,A-Z,0-9,-
|
||||
|
||||
comment: `# .. $
|
||||
|
@ -16,12 +16,18 @@ brackets: `[ ..%(\n/brackets/string) `]
|
||||
braces: `{ ..%(\n/braces/string) `}
|
||||
parens: `( ..%(\n/parens/string) `)
|
||||
string: `" ..%(`\.) `" / `' ..%(`\.) `'
|
||||
id: !<`a-z,A-Z,_,0-9 (`a-z,A-Z,_ *`a-z,A-Z,_,0-9)!=keyword |
|
||||
left-id-boundary: ^ / <(\x00-x7f!=id-char) / <((\xc0-xdf \x80-xbf)!=id-char)
|
||||
/ <((\xe0-xef 2\x80-xbf)!=id-char) / <((\xf0-xf7 3\x80-xbf)!=id-char)
|
||||
right-id-boundary: !id-char
|
||||
id: left-id-boundary !`0-9 (+id-char)!=keyword
|
||||
id-char: `a-z,A-Z,_,0-9
|
||||
|: !<`a-z,A-Z,_,0-9 / !`a-z,A-Z,_,0-9
|
||||
var: id
|
||||
keyword: !"" # No keywords defined by default
|
||||
word: |+`a-z,A-Z !`0-9,_
|
||||
left-word-boundary: ^ / <(\x00-x7f!=word-char) / <((\xc0-xdf \x80-xbf)!=word-char)
|
||||
/ <((\xe0-xef 2\x80-xbf)!=word-char) / <((\xf0-xf7 3\x80-xbf)!=word-char)
|
||||
right-word-boundary: !word-char
|
||||
word-char: `a-z,A-Z,_,0-9,-,'
|
||||
word: left-word-boundary +word-char
|
||||
HEX: `0-9,A-F
|
||||
Hex: `0-9,a-f,A-F
|
||||
hex: `0-9,a-f
|
||||
|
@ -8,7 +8,7 @@
|
||||
|
||||
comment: "//" .. $ / "/*" ..%\n "*/"
|
||||
string: `" ..%(`\.) `"
|
||||
keyword: |(
|
||||
keyword:
|
||||
"alignas" / "alignof" / "and" / "and_eq" / "asm" / "atomic_cancel" / "atomic_commit" /
|
||||
"atomic_noexcept" / "auto" / "bitand" / "bitor" / "bool" / "break" / "case" / "catch" /
|
||||
"char" / "char8_t" / "char16_t" / "char32_t" / "class" / "compl" / "concept" / "const" /
|
||||
@ -22,8 +22,7 @@ keyword: |(
|
||||
"static_cast" / "struct" / "switch" / "synchronized" / "template" / "this" /
|
||||
"thread_local" / "throw" / "true" / "try" / "typedef" / "typeid" / "typename" / "union" /
|
||||
"unsigned" / "using" / "virtual" / "void" / "volatile" / "wchar_t" / "while" / "xor" / "xor_eq"
|
||||
)|
|
||||
function-def: ^_ 2+(keyword / id / anglebraces / `*) % __ parens (__`; / >(__`{))
|
||||
function-def: ^_ 2+(id / keyword / anglebraces / `*) % __ parens (__`; / >(__`{))
|
||||
function: function-def __ braces
|
||||
macro: ^{#define} ..$ *(<`\ \n..$)
|
||||
import: ^({#include}/{#import}) __ (string / `<..`>)
|
||||
|
@ -8,14 +8,13 @@
|
||||
|
||||
comment: "//" .. $ / "/*" ..%\n "*/"
|
||||
string: `" ..%(`\.) `"
|
||||
keyword: |(
|
||||
keyword:
|
||||
"auto" / "break" / "case" / "char" / "const" / "continue" / "default" / "do" /
|
||||
"double" / "else" / "enum" / "extern" / "float" / "for" / "goto" / "if" /
|
||||
"int" / "long" / "register" / "return" / "short" / "signed" / "sizeof" /
|
||||
"static" / "struct" / "switch" / "typedef" / "union" / "unsigned" / "void" /
|
||||
"volatile" / "while"
|
||||
)|
|
||||
function-def: ^_ 2+(keyword / id / `*) % __ parens (__`; / >(__`{))
|
||||
function-def: ^_ 2+(id / keyword / `*) % __ parens (__`; / >(__`{))
|
||||
function: function-def __ braces
|
||||
macro: ^{#define} ..$ *(<`\ \n..$)
|
||||
import: ^{#include} __ (string / `<..`>)
|
||||
|
@ -8,11 +8,10 @@
|
||||
|
||||
comment: "//" .. $ / "/*" ..%\n "*/"
|
||||
string: `" ..%(`\.) `"
|
||||
keyword: |(
|
||||
keyword:
|
||||
"break" / "default" / "func" / "interface" / "select" / "case" / "defer" / "go" /
|
||||
"map" / "struct" / "chan" / "else" / "goto" / "package" / "switch" / "const" /
|
||||
"fallthrough" / "if" / "range" / "type" / "continue" / "for" / "import" / "return" / "var"
|
||||
)|
|
||||
function-def: {func} __ id __ parens __ [id / parens] >(__`{)
|
||||
function: function-def __ braces
|
||||
import: {import} __ (parens / string)
|
||||
|
@ -8,7 +8,7 @@
|
||||
|
||||
comment: "//" .. $ / "/*" ..%\n "*/"
|
||||
string: `" ..%(`\.) `" / `' ..%(`\.) `' / `/ ..%(`\.) `/
|
||||
keyword: |(
|
||||
keyword:
|
||||
"abstract" / "arguments" / "await" / "boolean" / "break" / "byte" / "case" /
|
||||
"catch" / "char" / "class" / "const" / "continue" / "debugger" / "default" /
|
||||
"delete" / "do" / "double" / "else" / "enum" / "eval" / "export" / "extends" /
|
||||
@ -18,7 +18,6 @@ keyword: |(
|
||||
"public" / "return" / "short" / "static" / "super" / "switch" / "synchronized" /
|
||||
"this" / "throw" / "throws" / "transient" / "true" / "try" / "typeof" / "var" /
|
||||
"void" / "volatile" / "while" / "with" / "yield"
|
||||
)|
|
||||
function-def: {function} __ [id__] parens / (id / parens) __ "=>"
|
||||
function: function-def __ braces
|
||||
import: {import} ..%braces (`; / $)
|
||||
|
@ -13,4 +13,3 @@ function-def: `(__{defun}__id
|
||||
function: function-def ..%parens `)
|
||||
id-char: `A-Z,a-z,0-9,!,$,%,&,*,+,-,.,/,:,<,=,>,?,@,^,_,~
|
||||
id: !<`A-Z,a-z,0-9,!,$,%,&,*,+,-,.,/,:,<,=,>,?,@,^,_,~ +`A-Z,a-z,0-9,!,$,%,&,*,+,-,.,/,:,<,=,>,?,@,^,_,~
|
||||
|: !<`A-Z,a-z,0-9,!,$,%,&,*,+,-,.,/,:,<,=,>,?,@,^,_,~ / !`A-Z,a-z,0-9,!,$,%,&,*,+,-,.,/,:,<,=,>,?,@,^,_,~
|
||||
|
@ -9,11 +9,10 @@
|
||||
comment: "--" (`[ @eqs=*`= `[ ..%\n (`]eqs`]) / ..$)
|
||||
string: `"..%(`\.) `" / `' ..%(`\.) `' / `[ @eqs=*`= `[ ..%\n (`]eqs`])
|
||||
table: `{ ..%(table/string/comment/\n) `}
|
||||
keyword: |(
|
||||
keyword:
|
||||
"and" / "break" / "do" / "else" / "elseif" / "end" / "false" / "for" /
|
||||
"function" / "goto" / "if" / "in" / "local" / "nil" / "not" / "or" /
|
||||
"repeat" / "return" / "then" / "true" / "until" / "while"
|
||||
)|
|
||||
function-def: {function}[_id (*(`.id)[`:id])]_ parens
|
||||
block: function / ({do}/{then}) ..%(comment/string/block/\n) {end}
|
||||
function: function-def ..%(comment/string/block/\n) {end}
|
||||
|
@ -8,11 +8,11 @@
|
||||
|
||||
comment: `# ..$
|
||||
string: "'''" ..%\n "'''" / '"""' ..%\n '"""' / `" ..%(`\.) `" / `' ..%(`\.) `'
|
||||
keyword: |("and" / "as" / "assert" / "break" / "class" / "continue" / "def" /
|
||||
keyword: "and" / "as" / "assert" / "break" / "class" / "continue" / "def" /
|
||||
"del" / "elif" / "else" / "except" / "finally" / "for" / "from" /
|
||||
"global" / "if" / "import" / "in" / "is" / "lambda" / "None" / "nonlocal" /
|
||||
"not" / "or" / "pass" / "raise" / "return" / "try" / "while" /
|
||||
"with" / "yield")|
|
||||
"with" / "yield"
|
||||
class: class-def +(\N ..$)
|
||||
class-def: ^_{class}_id[_parens]_`:
|
||||
function: function-def +(\N ..$)
|
||||
|
@ -8,12 +8,11 @@
|
||||
|
||||
comment: "//" .. $ / "/*" ..%(comment / \n) "*/"
|
||||
string: `" ..%(`\.) `"
|
||||
keyword: |(
|
||||
keyword:
|
||||
"as" / "break" / "const" / "continue" / "crate" / "else" / "enum" / "extern" /
|
||||
"false" / "fn" / "for" / "if" / "impl" / "in" / "let" / "loop" / "match" /
|
||||
"mod" / "move" / "mut" / "pub" / "ref" / "return" / "self" / "Self" / "static" /
|
||||
"struct" / "super" / "trait" / "true" / "type" / "unsafe" / "use" / "where" / "while"
|
||||
)|
|
||||
function-def: {fn} __ id __ parens __ ["->"__(id / parens)] >(__`{)
|
||||
function: function-def __ braces
|
||||
import: {use} _ *(id / braces) % "::" _ `;
|
||||
|
@ -9,11 +9,10 @@
|
||||
comment: `#..$
|
||||
string: `" ..%(`\./subcommand/\n) `" / `' ..%\n `' / "<<" _ @delim=id _$ ..%\n (^delim$)
|
||||
subcommand: `` ..%\n `` / "$" parens
|
||||
keyword: |(
|
||||
keyword:
|
||||
"echo" / "read" / "set" / "unset" / "readonly" / "shift" / "export" / "if" / "fi" /
|
||||
"else" / "while" / "do" / "done" / "for" / "until" / "case" / "esac" / "break" /
|
||||
"continue" / "exit" / "return" / "trap" / "wait" / "eval" / "exec" / "ulimit" / "umask"
|
||||
)|
|
||||
function-def: ^_ ["function"_] id _ `(_`) >(__`{)
|
||||
function: function-def __ braces
|
||||
var: `$ (id / braces)
|
||||
|
@ -1,11 +1,7 @@
|
||||
# Definitions of UTF8-compliant identifiers
|
||||
id: | utf8-id-start *utf8-id-cont
|
||||
id: left-word-boundary (utf8-id-start *utf8-id-cont)!=keyword
|
||||
id-char: utf8-id-cont / utf8-id-start
|
||||
|: !id-char / (
|
||||
!<(\x00-x7f==id-char)
|
||||
!<((\xc0-xdf \x80-xbf)==id-char)
|
||||
!<((\xe0-xef 2\x80-xbf)==id-char)
|
||||
!<((\xf0-xf7 3\x80-xbf)==id-char))
|
||||
word-char: utf8-id-cont / utf8-id-start
|
||||
|
||||
utf8-id-start: `A-Z / `a-z / !\x00-x7F (
|
||||
\xc2 (\xaa / \xb5 / \xba)
|
||||
|
36
match.c
36
match.c
@ -35,6 +35,7 @@ static match_t *unused_matches = NULL;
|
||||
static match_t *in_use_matches = NULL;
|
||||
#endif
|
||||
|
||||
static inline pat_t *deref(def_t *defs, pat_t *pat);
|
||||
__attribute__((returns_nonnull))
|
||||
static match_t *new_match(pat_t *pat, const char *start, const char *end, match_t *child);
|
||||
__attribute__((nonnull, pure))
|
||||
@ -48,6 +49,18 @@ static match_t *get_capture_by_name(match_t *m, const char *name);
|
||||
__attribute__((hot, nonnull(2,3,4)))
|
||||
static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ignorecase);
|
||||
|
||||
//
|
||||
// If the given pattern is a reference, look it up and return the referenced
|
||||
// pattern. This is used for an optimization to avoid repeated lookups.
|
||||
//
|
||||
static inline pat_t *deref(def_t *defs, pat_t *pat)
|
||||
{
|
||||
if (pat && pat->type == BP_REF) {
|
||||
def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name);
|
||||
if (def) pat = def->pat;
|
||||
}
|
||||
return pat;
|
||||
}
|
||||
//
|
||||
// Return the location of the next character or UTF8 codepoint.
|
||||
// (i.e. skip forward one codepoint at a time, not one byte at a time)
|
||||
@ -209,7 +222,8 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
}
|
||||
case BP_UPTO: {
|
||||
match_t *m = new_match(pat, str, str, NULL);
|
||||
pat_t *target = pat->args.multiple.first, *skip = pat->args.multiple.second;
|
||||
pat_t *target = deref(defs, pat->args.multiple.first),
|
||||
*skip = deref(defs, pat->args.multiple.second);
|
||||
if (!target && !skip) {
|
||||
while (str < f->end && *str != '\n') ++str;
|
||||
m->end = str;
|
||||
@ -253,28 +267,30 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
|
||||
match_t **dest = &m->child;
|
||||
size_t reps = 0;
|
||||
ssize_t max = pat->args.repetitions.max;
|
||||
pat_t *repeating = deref(defs, pat->args.repetitions.repeat_pat);
|
||||
pat_t *sep = deref(defs, pat->args.repetitions.sep);
|
||||
for (reps = 0; max == -1 || reps < (size_t)max; ++reps) {
|
||||
const char *start = str;
|
||||
// Separator
|
||||
match_t *msep = NULL;
|
||||
if (pat->args.repetitions.sep != NULL && reps > 0) {
|
||||
msep = match(defs, f, str, pat->args.repetitions.sep, ignorecase);
|
||||
if (sep != NULL && reps > 0) {
|
||||
msep = match(defs, f, str, sep, ignorecase);
|
||||
if (msep == NULL) break;
|
||||
str = msep->end;
|
||||
}
|
||||
match_t *mp = match(defs, f, str, pat->args.repetitions.repeat_pat, ignorecase);
|
||||
match_t *mp = match(defs, f, str, repeating, ignorecase);
|
||||
if (mp == NULL) {
|
||||
str = start;
|
||||
if (msep) recycle_if_unused(&msep);
|
||||
break;
|
||||
}
|
||||
if (mp->end == start && reps > 0) {
|
||||
// Since no forward progress was made on either
|
||||
// `repeat_pat` or `sep` and BP does not have mutable
|
||||
// state, it's guaranteed that no progress will be made on
|
||||
// the next loop either. We know that this will continue to
|
||||
// loop until reps==max, so let's just cut to the chase
|
||||
// instead of looping infinitely.
|
||||
// Since no forward progress was made on either `repeating`
|
||||
// or `sep` and BP does not have mutable state, it's
|
||||
// guaranteed that no progress will be made on the next
|
||||
// loop either. We know that this will continue to loop
|
||||
// until reps==max, so let's just cut to the chase instead
|
||||
// of looping infinitely.
|
||||
if (msep) recycle_if_unused(&msep);
|
||||
recycle_if_unused(&mp);
|
||||
if (pat->args.repetitions.max == -1)
|
||||
|
14
pattern.c
14
pattern.c
@ -305,12 +305,12 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
|
||||
if (c == '{') { // Surround with `|` word boundaries
|
||||
pat_t *left = new_pat(f, start, start+1, -1, BP_REF);
|
||||
left->args.ref.name = "|";
|
||||
left->args.ref.len = 1;
|
||||
left->args.ref.name = "left-word-boundary";
|
||||
left->args.ref.len = strlen(left->args.ref.name);
|
||||
|
||||
pat_t *right = new_pat(f, str, str+1, -1, BP_REF);
|
||||
right->args.ref.name = "|";
|
||||
right->args.ref.len = 1;
|
||||
right->args.ref.name = "right-word-boundary";
|
||||
right->args.ref.len = strlen(right->args.ref.name);
|
||||
|
||||
pat = chain_together(f, left, chain_together(f, pat, right));
|
||||
}
|
||||
@ -454,10 +454,10 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
|
||||
return new_pat(f, start, str, 0, BP_END_OF_FILE);
|
||||
return new_pat(f, start, str, 0, BP_END_OF_LINE);
|
||||
}
|
||||
// Special rules:
|
||||
case '_': case '|': {
|
||||
// Whitespace:
|
||||
case '_': {
|
||||
size_t namelen = 1;
|
||||
if (c == '_' && matchchar(&str, c)) // double __, ^^, $$
|
||||
if (matchchar(&str, '_')) // double __ (whitespace with newlines)
|
||||
++namelen;
|
||||
if (matchchar(&str, ':')) return NULL; // Don't match definitions
|
||||
pat_t *ref = new_pat(f, start, str, -1, BP_REF);
|
||||
|
Loading…
Reference in New Issue
Block a user