diff options
| -rw-r--r-- | README.md | 4 | ||||
| -rw-r--r-- | bp.1 | 41 | ||||
| -rw-r--r-- | bp.1.md | 33 | ||||
| -rw-r--r-- | grammars/bp.bp | 10 | ||||
| -rw-r--r-- | grammars/builtins.bp | 10 | ||||
| -rw-r--r-- | grammars/html.bp | 6 | ||||
| -rw-r--r-- | match.c | 25 | ||||
| -rw-r--r-- | pattern.c | 23 | ||||
| -rw-r--r-- | types.h | 4 |
9 files changed, 79 insertions, 77 deletions
@@ -70,8 +70,8 @@ Pattern | Meaning `@foo=pat` | Let `foo` be the text of `pat` (used for text replacement and backreferences) `pat => "replacement"` | Match `pat` and replace it with `replacement` `(pat1 @keep=pat2) => "@keep"` | Match `pat1` followed by `pat2` and replace it with the text of `pat2` -`pat1==pat2` | `pat1`, assuming `pat2` also matches with the same length -`pat1!=pat2` | `pat1`, unless `pat2` also matches with the same length +`pat1~pat2` | `pat1` when `pat2` can be found within the result +`pat1!~pat2` | `pat1` when `pat2` can not be found within the result `name:pat2` | `name` is defined to mean `pat` `# line comment` | A line comment @@ -256,17 +256,17 @@ references to captured values: \f[B]\[at]0\f[R] (the whole of For example, \f[B]\[at]word _ \[at]rest=(*word % _) => \[dq]\[at]rest \[at]1\[dq]\f[R] .TP -\f[I]pat1\f[R] \f[B]==\f[R] \f[I]pat2\f[R] -Matches \f[I]pat1\f[R], if and only if \f[I]pat2\f[R] also matches the -text of \f[I]pat1\f[R]\[aq]s match. -(e.g.\ \f[B]word == (\[dq]foo_\[dq] *.)\f[R] matches words that start -with \f[B]\[lq]foo_\[rq]\f[R]) -.TP -\f[I]pat1\f[R] \f[B]!=\f[R] \f[I]pat2\f[R] -Matches \f[I]pat1\f[R], if and only if \f[I]pat2\f[R] does not match the -text of \f[I]pat1\f[R]\[aq]s match. -(e.g.\ \f[B]word == (\[dq]foo_\[dq] *.)\f[R] matches words that do not -start with \f[B]\[lq]foo_\[rq]\f[R]) +\f[I]pat1\f[R] \f[B]\[ti]\f[R] \f[I]pat2\f[R] +Matches when \f[I]pat1\f[R] matches and \f[I]pat2\f[R] can be found +within the text of that match. +(e.g.\ \f[B]comment \[ti] {TODO}\f[R] matches comments that contain the +word \f[B]\[lq]TODO\[rq]\f[R]) +.TP +\f[I]pat1\f[R] \f[B]!\[ti]\f[R] \f[I]pat2\f[R] +Matches when \f[I]pat1\f[R] matches, but \f[I]pat2\f[R] can not be found +within the text of that match. 
+(e.g.\ \f[B]comment !\[ti] {IGNORE}\f[R] matches only comments that do +not contain the word \f[B]\[lq]IGNORE\[rq]\f[R]) .TP \f[I]name\f[R]\f[B]:\f[R] \f[I]pat\f[R] Define \f[I]name\f[R] to mean \f[I]pat\f[R] (pattern definition) @@ -295,12 +295,12 @@ some common patterns. For example, the c++ grammar file contains definitions for \f[B]//\f[R]-style line comments as well as \f[B]/*\&...*/\f[R]-style block comments. -Thus, you can find all comments with the string \[lq]TODO\[rq] with the +Thus, you can find all comments with the word \[lq]TODO\[rq] with the following command: .IP .nf \f[C] -bp -g c++ -p \[aq]comment==(..%\[rs]n \[dq]TODO\[dq] ..%\[rs]n$$)\[aq] *.cpp +bp -g c++ -p \[aq]comment\[ti]{TODO}\[aq] *.cpp \f[R] .fi .SH EXAMPLES @@ -313,12 +313,15 @@ Find files ending with \[dq].c\[dq] and replace the extension with \[dq].h\[dq] .TP \f[B]bp -p \[aq]{foobar} parens\[aq] my_file.py\f[R] -Find the literal string \f[B]\[dq]foobar\[dq]\f[R], assuming it\[aq]s a -complete word, followed by a pair of matching parentheses in the file -\f[I]my_file.py\f[R] +Find the word \f[B]\[dq]foobar\[dq]\f[R], followed by a pair of matching +parentheses in the file \f[I]my_file.py\f[R] .TP -\f[B]bp -g html -p `html-element==(\[dq]<a \[dq]..%\[rs]n$$)' foo.html\f[R] -Using the \f[I]html\f[R] grammar, find all \f[I]html-element\f[R]s -matching the tag \f[I]a\f[R] in the file \f[I]foo.html\f[R] +\f[B]bp -g html -p \[aq]element \[ti] (\[ha]\[ha]\[dq]<a \[dq])\[aq] foo.html\f[R] +Using the \f[I]html\f[R] grammar, find all \f[I]element\f[R]s matching +the tag \f[I]a\f[R] in the file \f[I]foo.html\f[R] +.TP +\f[B]bp -g python -p \[aq]comment\[ti]{TODO}\[aq] *.py\f[R] +Find all comments with the word \f[B]\[lq]TODO\[rq]\f[R] in local python +files. .SH AUTHORS Bruce Hill (\f[I]bruce\[at]bruce-hill.com\f[R]). @@ -228,15 +228,15 @@ string, and it may contain references to captured values: **\@0** (the whole of named *foo* in *pat*), etc. 
For example, **\@word \_ \@rest=(\*word % \_) =\> \"\@rest \@1\"** -*pat1* **==** *pat2* -: Matches *pat1*, if and only if *pat2* also matches the text of -*pat1*\'s match. (e.g. **word == (\"foo\_\" \*.)** matches words that start -with **"foo\_"**) +*pat1* **~** *pat2* +: Matches when *pat1* matches and *pat2* can be found within the text of that +match. (e.g. **comment ~ {TODO}** matches comments that contain the word +**"TODO"**) -*pat1* **!=** *pat2* -: Matches *pat1*, if and only if *pat2* does not match the text of -*pat1*\'s match. (e.g. **word == (\"foo\_\" \*.)** matches words that do -not start with **"foo\_"**) +*pat1* **!~** *pat2* +: Matches when *pat1* matches, but *pat2* can not be found within the text of +that match. (e.g. **comment !~ {IGNORE}** matches only comments that do not +contain the word **"IGNORE"**) *name***:** *pat* : Define *name* to mean *pat* (pattern definition) @@ -262,10 +262,10 @@ which may be loaded on demand. These grammar files are not comprehensive syntax definitions, but only some common patterns. For example, the c++ grammar file contains definitions for **//**-style line comments as well as **/\*...\*/**-style block comments. 
Thus, you can find all comments with the -string "TODO" with the following command: +word "TODO" with the following command: ``` -bp -g c++ -p 'comment==(..%\n "TODO" ..%\n$$)' *.cpp +bp -g c++ -p 'comment~{TODO}' *.cpp ``` @@ -278,9 +278,12 @@ bp -g c++ -p 'comment==(..%\n "TODO" ..%\n$$)' *.cpp : Find files ending with \".c\" and replace the extension with \".h\" **bp -p \'{foobar} parens\' my_file.py** -: Find the literal string **\"foobar\"**, assuming it\'s a complete word, -followed by a pair of matching parentheses in the file *my_file.py* +: Find the word **\"foobar\"**, followed by a pair of matching parentheses in +the file *my_file.py* -**bp -g html -p 'html-element==(\"\<a \"..%\\n\$\$)' foo.html** -: Using the *html* grammar, find all *html-element*s matching the tag *a* in -the file *foo.html* +**bp -g html -p \'element ~ (^^\"\<a \")\' foo.html** +: Using the *html* grammar, find all *element*s matching the tag *a* in the +file *foo.html* + +**bp -g python -p \'comment~{TODO}\' \*.py** +: Find all comments with the word **"TODO"** in local python files. 
diff --git a/grammars/bp.bp b/grammars/bp.bp index ed7eb80..7240729 100644 --- a/grammars/bp.bp +++ b/grammars/bp.bp @@ -14,18 +14,18 @@ Def: @name=id __ `: __ ( # This is used for command line arguments: String-pattern: ..%(\n / Nodent / Escape / `\ pat [`;])$$ -pat: simple-pat !(__("!="/"==")) / suffixed-pat +pat: simple-pat !(__("!~"/"~")) / suffixed-pat simple-pat: Upto-and / Dot / String / Chars / Nodent / Escape-range / Escape / Repeat / Optional / No / After / Before / Capture / Start-of-File / Start-of-Line / End-of-File / End-of-Line / Ref / parens suffixed-pat: ( - Eq-pat - / Not-eq-pat + Match-pat + / Not-match-pat ) -Eq-pat: @first=(suffixed-pat / simple-pat)__"=="__@second=pat -Not-eq-pat: @first=(suffixed-pat / simple-pat)__"!="__@second=pat +Match-pat: @first=(suffixed-pat / simple-pat)__"~"__@second=(pat / @!=(''=> "Expected pattern after '~'")) +Not-match-pat: @first=(suffixed-pat / simple-pat)__"!~"__@second=(pat / @!=(''=> "Expected pattern after '!~'")) Dot: `. !`. String: ( diff --git a/grammars/builtins.bp b/grammars/builtins.bp index 6c0fc75..42a41ac 100644 --- a/grammars/builtins.bp +++ b/grammars/builtins.bp @@ -17,15 +17,15 @@ braces: `{ ..%(\n/braces/string) `} parens: `( ..%(\n/parens/string) `) string: `" ..%string-escape `" / `' ..%string-escape `' string-escape: `\ (`x 2 Hex / 1-3 `0-7 / `u 1-4 Hex / .) 
-left-id-edge: ^ / <(\x00-x7f!=id-char) / <((\xc0-xdf \x80-xbf)!=id-char) - / <((\xe0-xef 2\x80-xbf)!=id-char) / <((\xf0-xf7 3\x80-xbf)!=id-char) +left-id-edge: ^ / <(\x00-x7f!~(^^id-char)) / <((\xc0-xdf \x80-xbf)!~(^^id-char)) + / <((\xe0-xef 2\x80-xbf)!~(^^id-char)) / <((\xf0-xf7 3\x80-xbf)!~(^^id-char)) right-id-edge: !id-char -id: left-id-edge !`0-9 (+id-char)!=keyword +id: left-id-edge !`0-9 !(keyword left-id-edge) +id-char id-char: `a-z,A-Z,_,0-9 var: id keyword: !"" # No keywords defined by default -left-word-edge: ^ / <(\x00-x7f!=word-char) / <((\xc0-xdf \x80-xbf)!=word-char) - / <((\xe0-xef 2\x80-xbf)!=word-char) / <((\xf0-xf7 3\x80-xbf)!=word-char) +left-word-edge: ^ / <(\x00-x7f!~(^^word-char)) / <((\xc0-xdf \x80-xbf)!~(^^word-char)) + / <((\xe0-xef 2\x80-xbf)!~(^^word-char)) / <((\xf0-xf7 3\x80-xbf)!~(^^word-char)) right-word-edge: !word-char word-char: `a-z,A-Z,_,0-9,-,' word: left-word-edge +word-char diff --git a/grammars/html.bp b/grammars/html.bp index 483db6a..457a183 100644 --- a/grammars/html.bp +++ b/grammars/html.bp @@ -8,15 +8,15 @@ doctype: "<!DOCTYPE" ..%\n `> -html-element: void-element / raw-element / template-element / normal-element +element: void-element / raw-element / template-element / normal-element void-element: `< ("area"/"base"/"br"/"col"/"embed"/"hr"/"img"/"input"/"link"/"meta"/"param"/"source"/"track"/"wbr") __attributes__ [`/] __ `> -template-element: "<template>" ..%(\n / comment / html-element) "</template>" +template-element: "<template>" ..%(\n / comment / element) "</template>" raw-element: `< @tag=("script"/"style"/"textarea"/"title") __attributes__ `> ..%\n ("</"tag__`>) -normal-element: `< @tag=id __attributes__ `> ..%(\n / comment / html-element) ("</"tag`>) +normal-element: `< @tag=id __attributes__ `> ..%(\n / comment / element) ("</"tag`>) comment: "<!--" ..%\n "-->" @@ -114,8 +114,7 @@ static const char *match_backref(const char *str, match_t *cap, bool ignorecase) for (match_t *child = cap->child; child; child = 
child->nextsibling) { if (child->start > prev) { size_t len = (size_t)(child->start - prev); - if (ignorecase ? memicmp(str, prev, len) != 0 - : memcmp(str, prev, len) != 0) { + if ((ignorecase ? memicmp : memcmp)(str, prev, len) != 0) { return NULL; } str += len; @@ -128,8 +127,7 @@ static const char *match_backref(const char *str, match_t *cap, bool ignorecase) } if (cap->end > prev) { size_t len = (size_t)(cap->end - prev); - if (ignorecase ? memicmp(str, prev, len) != 0 - : memcmp(str, prev, len) != 0) { + if ((ignorecase ? memicmp : memcmp)(str, prev, len) != 0) { return NULL; } str += len; @@ -151,9 +149,11 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk } else { str = f->contents; } + bool only_start = pat->type == BP_START_OF_FILE || (pat->type == BP_CHAIN && pat->args.multiple.first->type == BP_START_OF_FILE); while (str <= f->end) { match_t *m = match(defs, f, str, pat, ignorecase); if (m) return m; + if (only_start) return NULL; match_t *s; if (skip && (s = match(defs, f, str, skip, ignorecase))) { str = s->end > str ? s->end : str + 1; @@ -201,8 +201,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool } case BP_STRING: { if (&str[pat->len] > f->end) return NULL; - if (ignorecase ? memicmp(str, pat->args.string, (size_t)pat->len) != 0 - : memcmp(str, pat->args.string, (size_t)pat->len) != 0) + if ((ignorecase ? 
memicmp : memcmp)(str, pat->args.string, (size_t)pat->len) != 0) return NULL; return new_match(pat, str, str + pat->len, NULL); } @@ -360,7 +359,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ADD_OWNER(m1->nextsibling, m2); return new_match(pat, str, m2->end, m1); } - case BP_EQUAL: case BP_NOT_EQUAL: { + case BP_MATCH: case BP_NOT_MATCH: { match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase); if (m1 == NULL) return NULL; @@ -374,17 +373,13 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool .mmapped=f->mmapped, .pats = NULL, .next = NULL, }; - match_t *m2 = match(defs, &inner, str, pat->args.multiple.second, ignorecase); - if ((m2 == NULL || m2->end != m1->end) == (pat->type == BP_EQUAL)) { + match_t *m2 = next_match(defs, &inner, NULL, pat->args.multiple.second, NULL, ignorecase); + if ((!m2 && pat->type == BP_MATCH) || (m2 && pat->type == BP_NOT_MATCH)) { + recycle_if_unused(&m2); recycle_if_unused(&m1); - if (m2 != NULL) recycle_if_unused(&m2); return NULL; } - if (pat->type == BP_EQUAL) { - ADD_OWNER(m1->nextsibling, m2); - } else { - recycle_if_unused(&m2); - } + if (pat->type == BP_MATCH) ADD_OWNER(m1->nextsibling, m2); return new_match(pat, m1->start, m1->end, m1); } case BP_REPLACE: { @@ -139,7 +139,7 @@ pat_t *chain_together(file_t *f, pat_t *first, pat_t *second) p = p->args.capture.capture_pat; } else if (p->type == BP_CHAIN) { p = p->args.multiple.second; - } else if (p->type == BP_EQUAL || p->type == BP_NOT_EQUAL) { + } else if (p->type == BP_MATCH || p->type == BP_NOT_MATCH) { p = p->args.pat; } else break; } @@ -174,19 +174,20 @@ static pat_t *bp_simplepattern(file_t *f, const char *str) // Expand postfix operators (if any) str = after_spaces(pat->end); - while (str+2 < f->end && (matchstr(&str, "!=") || matchstr(&str, "=="))) { // Equality <pat1>==<pat2> and inequality <pat1>!=<pat2> - bool equal = str[-2] == '='; + while (str+2 < f->end) { + enum pattype_e 
type; + if (matchchar(&str, '~')) + type = BP_MATCH; + else if (matchstr(&str, "!~")) + type = BP_NOT_MATCH; + else break; + pat_t *first = pat; pat_t *second = bp_simplepattern(f, str); if (!second) - file_err(f, str, str, "The '%c=' operator expects a pattern before and after.", equal?'=':'!'); - if (equal) { - if (!(first->len == -1 || second->len == -1 || first->len == second->len)) - file_err(f, pat->start, second->end, - "These two patterns cannot possibly give the same result (different lengths: %ld != %ld)", - first->len, second->len); - } - pat = new_pat(f, str, second->end, first->len, equal ? BP_EQUAL : BP_NOT_EQUAL); + file_err(f, str, str, "The '%s' operator expects a pattern before and after.", type == BP_MATCH ? "~" : "!~"); + + pat = new_pat(f, str, second->end, first->len, type); pat->args.multiple.first = first; pat->args.multiple.second = second; str = pat->end; @@ -22,8 +22,8 @@ enum pattype_e { BP_CAPTURE, BP_OTHERWISE, BP_CHAIN, - BP_EQUAL, - BP_NOT_EQUAL, + BP_MATCH, + BP_NOT_MATCH, BP_REPLACE, BP_REF, BP_BACKREF, |
