diff --git a/README.md b/README.md index 18e4fed..9e1d4d3 100644 --- a/README.md +++ b/README.md @@ -70,8 +70,8 @@ Pattern | Meaning `@foo=pat` | Let `foo` be the text of `pat` (used for text replacement and backreferences) `pat => "replacement"` | Match `pat` and replace it with `replacement` `(pat1 @keep=pat2) => "@keep"` | Match `pat1` followed by `pat2` and replace it with the text of `pat2` -`pat1==pat2` | `pat1`, assuming `pat2` also matches with the same length -`pat1!=pat2` | `pat1`, unless `pat2` also matches with the same length +`pat1~pat2` | `pat1` when `pat2` can be found within the result +`pat1!~pat2` | `pat1` when `pat2` can not be found within the result `name:pat2` | `name` is defined to mean `pat` `# line comment` | A line comment diff --git a/bp.1 b/bp.1 index 2ff84dc..4921be7 100644 --- a/bp.1 +++ b/bp.1 @@ -256,17 +256,17 @@ references to captured values: \f[B]\[at]0\f[R] (the whole of For example, \f[B]\[at]word _ \[at]rest=(*word % _) => \[dq]\[at]rest \[at]1\[dq]\f[R] .TP -\f[I]pat1\f[R] \f[B]==\f[R] \f[I]pat2\f[R] -Matches \f[I]pat1\f[R], if and only if \f[I]pat2\f[R] also matches the -text of \f[I]pat1\f[R]\[aq]s match. -(e.g.\ \f[B]word == (\[dq]foo_\[dq] *.)\f[R] matches words that start -with \f[B]\[lq]foo_\[rq]\f[R]) +\f[I]pat1\f[R] \f[B]\[ti]\f[R] \f[I]pat2\f[R] +Matches when \f[I]pat1\f[R] matches and \f[I]pat2\f[R] can be found +within the text of that match. +(e.g.\ \f[B]comment \[ti] {TODO}\f[R] matches comments that contain the +word \f[B]\[lq]TODO\[rq]\f[R]) .TP -\f[I]pat1\f[R] \f[B]!=\f[R] \f[I]pat2\f[R] -Matches \f[I]pat1\f[R], if and only if \f[I]pat2\f[R] does not match the -text of \f[I]pat1\f[R]\[aq]s match. -(e.g.\ \f[B]word == (\[dq]foo_\[dq] *.)\f[R] matches words that do not -start with \f[B]\[lq]foo_\[rq]\f[R]) +\f[I]pat1\f[R] \f[B]!\[ti]\f[R] \f[I]pat2\f[R] +Matches when \f[I]pat1\f[R] matches, but \f[I]pat2\f[R] can not be found +within the text of that match. 
+(e.g.\ \f[B]comment !\[ti] {IGNORE}\f[R] matches only comments that do +not contain the word \f[B]\[lq]IGNORE\[rq]\f[R]) .TP \f[I]name\f[R]\f[B]:\f[R] \f[I]pat\f[R] Define \f[I]name\f[R] to mean \f[I]pat\f[R] (pattern definition) @@ -295,12 +295,12 @@ some common patterns. For example, the c++ grammar file contains definitions for \f[B]//\f[R]-style line comments as well as \f[B]/*\&...*/\f[R]-style block comments. -Thus, you can find all comments with the string \[lq]TODO\[rq] with the +Thus, you can find all comments with the word \[lq]TODO\[rq] with the following command: .IP .nf \f[C] -bp -g c++ -p \[aq]comment==(..%\[rs]n \[dq]TODO\[dq] ..%\[rs]n$$)\[aq] *.cpp +bp -g c++ -p \[aq]comment\[ti]{TODO}\[aq] *.cpp \f[R] .fi .SH EXAMPLES @@ -313,12 +313,15 @@ Find files ending with \[dq].c\[dq] and replace the extension with \[dq].h\[dq] .TP \f[B]bp -p \[aq]{foobar} parens\[aq] my_file.py\f[R] -Find the literal string \f[B]\[dq]foobar\[dq]\f[R], assuming it\[aq]s a -complete word, followed by a pair of matching parentheses in the file -\f[I]my_file.py\f[R] +Find the word \f[B]\[dq]foobar\[dq]\f[R], followed by a pair of matching +parentheses in the file \f[I]my_file.py\f[R] .TP -\f[B]bp -g html -p `html-element==(\[dq] \"\@rest \@1\"** -*pat1* **==** *pat2* -: Matches *pat1*, if and only if *pat2* also matches the text of -*pat1*\'s match. (e.g. **word == (\"foo\_\" \*.)** matches words that start -with **"foo\_"**) +*pat1* **~** *pat2* +: Matches when *pat1* matches and *pat2* can be found within the text of that +match. (e.g. **comment ~ {TODO}** matches comments that contain the word +**"TODO"**) -*pat1* **!=** *pat2* -: Matches *pat1*, if and only if *pat2* does not match the text of -*pat1*\'s match. (e.g. **word == (\"foo\_\" \*.)** matches words that do -not start with **"foo\_"**) +*pat1* **!~** *pat2* +: Matches when *pat1* matches, but *pat2* can not be found within the text of +that match. (e.g. 
**comment !~ {IGNORE}** matches only comments that do not +contain the word **"IGNORE"**) *name***:** *pat* : Define *name* to mean *pat* (pattern definition) @@ -262,10 +262,10 @@ which may be loaded on demand. These grammar files are not comprehensive syntax definitions, but only some common patterns. For example, the c++ grammar file contains definitions for **//**-style line comments as well as **/\*...\*/**-style block comments. Thus, you can find all comments with the -string "TODO" with the following command: +word "TODO" with the following command: ``` -bp -g c++ -p 'comment==(..%\n "TODO" ..%\n$$)' *.cpp +bp -g c++ -p 'comment~{TODO}' *.cpp ``` @@ -278,9 +278,12 @@ bp -g c++ -p 'comment==(..%\n "TODO" ..%\n$$)' *.cpp : Find files ending with \".c\" and replace the extension with \".h\" **bp -p \'{foobar} parens\' my_file.py** -: Find the literal string **\"foobar\"**, assuming it\'s a complete word, -followed by a pair of matching parentheses in the file *my_file.py* +: Find the word **\"foobar\"**, followed by a pair of matching parentheses in +the file *my_file.py* -**bp -g html -p 'html-element==(\"\ "Expected pattern after '~'")) +Not-match-pat: @first=(suffixed-pat / simple-pat)__"!~"__@second=(pat / @!=(''=> "Expected pattern after '!~'")) Dot: `. !`. String: ( diff --git a/grammars/builtins.bp b/grammars/builtins.bp index 6c0fc75..42a41ac 100644 --- a/grammars/builtins.bp +++ b/grammars/builtins.bp @@ -17,15 +17,15 @@ braces: `{ ..%(\n/braces/string) `} parens: `( ..%(\n/parens/string) `) string: `" ..%string-escape `" / `' ..%string-escape `' string-escape: `\ (`x 2 Hex / 1-3 `0-7 / `u 1-4 Hex / .) 
-left-id-edge: ^ / <(\x00-x7f!=id-char) / <((\xc0-xdf \x80-xbf)!=id-char) - / <((\xe0-xef 2\x80-xbf)!=id-char) / <((\xf0-xf7 3\x80-xbf)!=id-char) +left-id-edge: ^ / <(\x00-x7f!~(^^id-char)) / <((\xc0-xdf \x80-xbf)!~(^^id-char)) + / <((\xe0-xef 2\x80-xbf)!~(^^id-char)) / <((\xf0-xf7 3\x80-xbf)!~(^^id-char)) right-id-edge: !id-char -id: left-id-edge !`0-9 (+id-char)!=keyword +id: left-id-edge !`0-9 !(keyword left-id-edge) +id-char id-char: `a-z,A-Z,_,0-9 var: id keyword: !"" # No keywords defined by default -left-word-edge: ^ / <(\x00-x7f!=word-char) / <((\xc0-xdf \x80-xbf)!=word-char) - / <((\xe0-xef 2\x80-xbf)!=word-char) / <((\xf0-xf7 3\x80-xbf)!=word-char) +left-word-edge: ^ / <(\x00-x7f!~(^^word-char)) / <((\xc0-xdf \x80-xbf)!~(^^word-char)) + / <((\xe0-xef 2\x80-xbf)!~(^^word-char)) / <((\xf0-xf7 3\x80-xbf)!~(^^word-char)) right-word-edge: !word-char word-char: `a-z,A-Z,_,0-9,-,' word: left-word-edge +word-char diff --git a/grammars/html.bp b/grammars/html.bp index 483db6a..457a183 100644 --- a/grammars/html.bp +++ b/grammars/html.bp @@ -8,15 +8,15 @@ doctype: " -html-element: void-element / raw-element / template-element / normal-element +element: void-element / raw-element / template-element / normal-element void-element: `< ("area"/"base"/"br"/"col"/"embed"/"hr"/"img"/"input"/"link"/"meta"/"param"/"source"/"track"/"wbr") __attributes__ [`/] __ `> -template-element: "" +template-element: "" raw-element: `< @tag=("script"/"style"/"textarea"/"title") __attributes__ `> ..%\n (") -normal-element: `< @tag=id __attributes__ `> ..%(\n / comment / html-element) (") +normal-element: `< @tag=id __attributes__ `> ..%(\n / comment / element) (") comment: "" diff --git a/match.c b/match.c index d4d6d27..f8f0ab4 100644 --- a/match.c +++ b/match.c @@ -114,8 +114,7 @@ static const char *match_backref(const char *str, match_t *cap, bool ignorecase) for (match_t *child = cap->child; child; child = child->nextsibling) { if (child->start > prev) { size_t len = 
(size_t)(child->start - prev); - if (ignorecase ? memicmp(str, prev, len) != 0 - : memcmp(str, prev, len) != 0) { + if ((ignorecase ? memicmp : memcmp)(str, prev, len) != 0) { return NULL; } str += len; @@ -128,8 +127,7 @@ static const char *match_backref(const char *str, match_t *cap, bool ignorecase) } if (cap->end > prev) { size_t len = (size_t)(cap->end - prev); - if (ignorecase ? memicmp(str, prev, len) != 0 - : memcmp(str, prev, len) != 0) { + if ((ignorecase ? memicmp : memcmp)(str, prev, len) != 0) { return NULL; } str += len; @@ -151,9 +149,11 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk } else { str = f->contents; } + bool only_start = pat->type == BP_START_OF_FILE || (pat->type == BP_CHAIN && pat->args.multiple.first->type == BP_START_OF_FILE); while (str <= f->end) { match_t *m = match(defs, f, str, pat, ignorecase); if (m) return m; + if (only_start) return NULL; match_t *s; if (skip && (s = match(defs, f, str, skip, ignorecase))) { str = s->end > str ? s->end : str + 1; @@ -201,8 +201,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool } case BP_STRING: { if (&str[pat->len] > f->end) return NULL; - if (ignorecase ? memicmp(str, pat->args.string, (size_t)pat->len) != 0 - : memcmp(str, pat->args.string, (size_t)pat->len) != 0) + if ((ignorecase ? 
memicmp : memcmp)(str, pat->args.string, (size_t)pat->len) != 0) return NULL; return new_match(pat, str, str + pat->len, NULL); } @@ -360,7 +359,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ADD_OWNER(m1->nextsibling, m2); return new_match(pat, str, m2->end, m1); } - case BP_EQUAL: case BP_NOT_EQUAL: { + case BP_MATCH: case BP_NOT_MATCH: { match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase); if (m1 == NULL) return NULL; @@ -374,17 +373,13 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool .mmapped=f->mmapped, .pats = NULL, .next = NULL, }; - match_t *m2 = match(defs, &inner, str, pat->args.multiple.second, ignorecase); - if ((m2 == NULL || m2->end != m1->end) == (pat->type == BP_EQUAL)) { + match_t *m2 = next_match(defs, &inner, NULL, pat->args.multiple.second, NULL, ignorecase); + if ((!m2 && pat->type == BP_MATCH) || (m2 && pat->type == BP_NOT_MATCH)) { + recycle_if_unused(&m2); recycle_if_unused(&m1); - if (m2 != NULL) recycle_if_unused(&m2); return NULL; } - if (pat->type == BP_EQUAL) { - ADD_OWNER(m1->nextsibling, m2); - } else { - recycle_if_unused(&m2); - } + if (pat->type == BP_MATCH) ADD_OWNER(m1->nextsibling, m2); return new_match(pat, m1->start, m1->end, m1); } case BP_REPLACE: { diff --git a/pattern.c b/pattern.c index 7c4c669..bc34e4b 100644 --- a/pattern.c +++ b/pattern.c @@ -139,7 +139,7 @@ pat_t *chain_together(file_t *f, pat_t *first, pat_t *second) p = p->args.capture.capture_pat; } else if (p->type == BP_CHAIN) { p = p->args.multiple.second; - } else if (p->type == BP_EQUAL || p->type == BP_NOT_EQUAL) { + } else if (p->type == BP_MATCH || p->type == BP_NOT_MATCH) { p = p->args.pat; } else break; } @@ -174,19 +174,20 @@ static pat_t *bp_simplepattern(file_t *f, const char *str) // Expand postfix operators (if any) str = after_spaces(pat->end); - while (str+2 < f->end && (matchstr(&str, "!=") || matchstr(&str, "=="))) { // Equality == and inequality != - bool 
equal = str[-2] == '='; + while (str+2 < f->end) { + enum pattype_e type; + if (matchchar(&str, '~')) + type = BP_MATCH; + else if (matchstr(&str, "!~")) + type = BP_NOT_MATCH; + else break; + pat_t *first = pat; pat_t *second = bp_simplepattern(f, str); if (!second) - file_err(f, str, str, "The '%c=' operator expects a pattern before and after.", equal?'=':'!'); - if (equal) { - if (!(first->len == -1 || second->len == -1 || first->len == second->len)) - file_err(f, pat->start, second->end, - "These two patterns cannot possibly give the same result (different lengths: %ld != %ld)", - first->len, second->len); - } - pat = new_pat(f, str, second->end, first->len, equal ? BP_EQUAL : BP_NOT_EQUAL); + file_err(f, str, str, "The '%s' operator expects a pattern before and after.", type == BP_MATCH ? "~" : "!~"); + + pat = new_pat(f, str, second->end, first->len, type); pat->args.multiple.first = first; pat->args.multiple.second = second; str = pat->end; diff --git a/types.h b/types.h index 19c6829..ae41d4a 100644 --- a/types.h +++ b/types.h @@ -22,8 +22,8 @@ enum pattype_e { BP_CAPTURE, BP_OTHERWISE, BP_CHAIN, - BP_EQUAL, - BP_NOT_EQUAL, + BP_MATCH, + BP_NOT_MATCH, BP_REPLACE, BP_REF, BP_BACKREF,