Added ~ and !~ operators as replacements for == and !=

2021-05-19 23:41:57 -07:00 · 2021-05-19 23:41:57 -07:00 · 355e06a58e
commit 355e06a58e
parent 3f0ab96f7f
9 changed files with 78 additions and 76 deletions
--- a/README.md
+++ b/README.md
@ -70,8 +70,8 @@ Pattern            | Meaning
 `@foo=pat`         | Let `foo` be the text of `pat` (used for text replacement and backreferences)
 `pat => "replacement"` | Match `pat` and replace it with `replacement`
 `(pat1 @keep=pat2) => "@keep"` | Match `pat1` followed by `pat2` and replace it with the text of `pat2`
-`pat1==pat2`       | `pat1`, assuming `pat2` also matches with the same length
-`pat1!=pat2`       | `pat1`, unless `pat2` also matches with the same length
+`pat1~pat2`        | `pat1` when `pat2` can be found within the result
+`pat1!~pat2`       | `pat1` when `pat2` can not be found within the result
 `name:pat2`        | `name` is defined to mean `pat`
 `# line comment`   | A line comment

--- a/bp.1
+++ b/bp.1
@ -256,17 +256,17 @@ references to captured values: \f[B]\[at]0\f[R] (the whole of
 For example, \f[B]\[at]word _ \[at]rest=(*word % _) => \[dq]\[at]rest
 \[at]1\[dq]\f[R]
 .TP
-\f[I]pat1\f[R] \f[B]==\f[R] \f[I]pat2\f[R]
-Matches \f[I]pat1\f[R], if and only if \f[I]pat2\f[R] also matches the
-text of \f[I]pat1\f[R]\[aq]s match.
-(e.g.\ \f[B]word == (\[dq]foo_\[dq] *.)\f[R] matches words that start
-with \f[B]\[lq]foo_\[rq]\f[R])
+\f[I]pat1\f[R] \f[B]\[ti]\f[R] \f[I]pat2\f[R]
+Matches when \f[I]pat1\f[R] matches and \f[I]pat2\f[R] can be found
+within the text of that match.
+(e.g.\ \f[B]comment \[ti] {TODO}\f[R] matches comments that contain the
+word \f[B]\[lq]TODO\[rq]\f[R])
 .TP
-\f[I]pat1\f[R] \f[B]!=\f[R] \f[I]pat2\f[R]
-Matches \f[I]pat1\f[R], if and only if \f[I]pat2\f[R] does not match the
-text of \f[I]pat1\f[R]\[aq]s match.
-(e.g.\ \f[B]word == (\[dq]foo_\[dq] *.)\f[R] matches words that do not
-start with \f[B]\[lq]foo_\[rq]\f[R])
+\f[I]pat1\f[R] \f[B]!\[ti]\f[R] \f[I]pat2\f[R]
+Matches when \f[I]pat1\f[R] matches, but \f[I]pat2\f[R] can not be found
+within the text of that match.
+(e.g.\ \f[B]comment !\[ti] {IGNORE}\f[R] matches only comments that do
+not contain the word \f[B]\[lq]IGNORE\[rq]\f[R])
 .TP
 \f[I]name\f[R]\f[B]:\f[R] \f[I]pat\f[R]
 Define \f[I]name\f[R] to mean \f[I]pat\f[R] (pattern definition)
@ -295,12 +295,12 @@ some common patterns.
 For example, the c++ grammar file contains definitions for
 \f[B]//\f[R]-style line comments as well as \f[B]/*\&...*/\f[R]-style
 block comments.
-Thus, you can find all comments with the string \[lq]TODO\[rq] with the
+Thus, you can find all comments with the word \[lq]TODO\[rq] with the
 following command:
 .IP
 .nf
 \f[C]
-bp -g c++ -p \[aq]comment==(..%\[rs]n \[dq]TODO\[dq] ..%\[rs]n$$)\[aq] *.cpp
+bp -g c++ -p \[aq]comment\[ti]{TODO}\[aq] *.cpp
 \f[R]
 .fi
 .SH EXAMPLES
@ -313,12 +313,15 @@ Find files ending with \[dq].c\[dq] and replace the extension with
 \[dq].h\[dq]
 .TP
 \f[B]bp -p \[aq]{foobar} parens\[aq] my_file.py\f[R]
-Find the literal string \f[B]\[dq]foobar\[dq]\f[R], assuming it\[aq]s a
-complete word, followed by a pair of matching parentheses in the file
-\f[I]my_file.py\f[R]
+Find the word \f[B]\[dq]foobar\[dq]\f[R], followed by a pair of matching
+parentheses in the file \f[I]my_file.py\f[R]
 .TP
-\f[B]bp -g html -p `html-element==(\[dq]<a \[dq]..%\[rs]n$$)' foo.html\f[R]
-Using the \f[I]html\f[R] grammar, find all \f[I]html-element\f[R]s
-matching the tag \f[I]a\f[R] in the file \f[I]foo.html\f[R]
+\f[B]bp -g html -p \[aq]element \[ti] (\[ha]\[ha]\[dq]<a \[dq])\[aq] foo.html\f[R]
+Using the \f[I]html\f[R] grammar, find all \f[I]element\f[R]s matching
+the tag \f[I]a\f[R] in the file \f[I]foo.html\f[R]
+.TP
+\f[B]bp -g python -p \[aq]comment\[ti]{TODO}\[aq] *.py\f[R]
+Find all comments with the word \f[B]\[lq]TODO\[rq]\f[R] in local python
+files.
 .SH AUTHORS
 Bruce Hill (\f[I]bruce\[at]bruce-hill.com\f[R]).
--- a/bp.1.md
+++ b/bp.1.md
@ -228,15 +228,15 @@ string, and it may contain references to captured values: **\@0** (the whole of
 named *foo* in *pat*), etc. For example, **\@word \_ \@rest=(\*word % \_)
 =\> \"\@rest \@1\"**

-*pat1* **==** *pat2*
-: Matches *pat1*, if and only if *pat2* also matches the text of
-*pat1*\'s match. (e.g. **word == (\"foo\_\" \*.)** matches words that start
-with **"foo\_"**)
+*pat1* **~** *pat2*
+: Matches when *pat1* matches and *pat2* can be found within the text of that
+match. (e.g. **comment ~ {TODO}** matches comments that contain the word
+**"TODO"**)

-*pat1* **!=** *pat2*
-: Matches *pat1*, if and only if *pat2* does not match the text of
-*pat1*\'s match. (e.g. **word == (\"foo\_\" \*.)** matches words that do
-not start with **"foo\_"**)
+*pat1* **!~** *pat2*
+: Matches when *pat1* matches, but *pat2* can not be found within the text of
+that match. (e.g. **comment !~ {IGNORE}** matches only comments that do not
+contain the word **"IGNORE"**)

 *name***:** *pat*
 : Define *name* to mean *pat* (pattern definition)
@ -262,10 +262,10 @@ which may be loaded on demand. These grammar files are not comprehensive syntax
 definitions, but only some common patterns. For example, the c++ grammar file
 contains definitions for **//**-style line comments as well as
 **/\*...\*/**-style block comments. Thus, you can find all comments with the
-string "TODO" with the following command:
+word "TODO" with the following command:

 ```
-bp -g c++ -p 'comment==(..%\n "TODO" ..%\n$$)' *.cpp
+bp -g c++ -p 'comment~{TODO}' *.cpp
 ```


@ -278,9 +278,12 @@ bp -g c++ -p 'comment==(..%\n "TODO" ..%\n$$)' *.cpp
 : Find files ending with \".c\" and replace the extension with \".h\"

 **bp -p \'{foobar} parens\' my_file.py**
-: Find the literal string **\"foobar\"**, assuming it\'s a complete word,
-followed by a pair of matching parentheses in the file *my_file.py*
+: Find the word **\"foobar\"**, followed by a pair of matching parentheses in
+the file *my_file.py*

-**bp -g html -p 'html-element==(\"\<a \"..%\\n\$\$)' foo.html**
-: Using the *html* grammar, find all *html-element*s matching the tag *a* in
-the file *foo.html*
+**bp -g html -p \'element ~ (^^\"\<a \")\' foo.html**
+: Using the *html* grammar, find all *element*s matching the tag *a* in the
+file *foo.html*
+
+**bp -g python -p \'comment~{TODO}\' \*.py**
+: Find all comments with the word **"TODO"** in local python files.
--- a/grammars/bp.bp
+++ b/grammars/bp.bp
@ -14,18 +14,18 @@ Def: @name=id __ `: __ (
 # This is used for command line arguments:
 String-pattern: ..%(\n / Nodent / Escape / `\ pat [`;])$$

-pat: simple-pat !(__("!="/"==")) / suffixed-pat
+pat: simple-pat !(__("!~"/"~")) / suffixed-pat
 simple-pat: Upto-and / Dot / String / Chars / Nodent / Escape-range
    / Escape / Repeat / Optional / No / After / Before / Capture
    / Start-of-File / Start-of-Line / End-of-File / End-of-Line / Ref / parens

 suffixed-pat: (
-      Eq-pat
-    / Not-eq-pat
+      Match-pat
+    / Not-match-pat
 )

-Eq-pat: @first=(suffixed-pat / simple-pat)__"=="__@second=pat
-Not-eq-pat: @first=(suffixed-pat / simple-pat)__"!="__@second=pat
+Match-pat: @first=(suffixed-pat / simple-pat)__"~"__@second=(pat / @!=(''=> "Expected pattern after '~'"))
+Not-match-pat: @first=(suffixed-pat / simple-pat)__"!~"__@second=(pat / @!=(''=> "Expected pattern after '!~'"))

 Dot: `. !`.
 String: (
--- a/grammars/builtins.bp
+++ b/grammars/builtins.bp
@ -17,15 +17,15 @@ braces: `{ ..%(\n/braces/string) `}
 parens: `( ..%(\n/parens/string) `)
 string: `" ..%string-escape `" / `' ..%string-escape `'
 string-escape: `\ (`x 2 Hex / 1-3 `0-7 / `u 1-4 Hex / .)
-left-id-edge: ^ / <(\x00-x7f!=id-char) / <((\xc0-xdf \x80-xbf)!=id-char)
-    / <((\xe0-xef 2\x80-xbf)!=id-char) / <((\xf0-xf7 3\x80-xbf)!=id-char)
+left-id-edge: ^ / <(\x00-x7f!~(^^id-char)) / <((\xc0-xdf \x80-xbf)!~(^^id-char))
+    / <((\xe0-xef 2\x80-xbf)!~(^^id-char)) / <((\xf0-xf7 3\x80-xbf)!~(^^id-char))
 right-id-edge: !id-char
-id: left-id-edge !`0-9 (+id-char)!=keyword
+id: left-id-edge !`0-9 !(keyword right-id-edge) +id-char # Identifier chars, unless they spell exactly a keyword
 id-char: `a-z,A-Z,_,0-9
 var: id
 keyword: !"" # No keywords defined by default
-left-word-edge: ^ / <(\x00-x7f!=word-char) / <((\xc0-xdf \x80-xbf)!=word-char)
-    / <((\xe0-xef 2\x80-xbf)!=word-char) / <((\xf0-xf7 3\x80-xbf)!=word-char)
+left-word-edge: ^ / <(\x00-x7f!~(^^word-char)) / <((\xc0-xdf \x80-xbf)!~(^^word-char))
+    / <((\xe0-xef 2\x80-xbf)!~(^^word-char)) / <((\xf0-xf7 3\x80-xbf)!~(^^word-char))
 right-word-edge: !word-char
 word-char: `a-z,A-Z,_,0-9,-,'
 word: left-word-edge +word-char
--- a/grammars/html.bp
+++ b/grammars/html.bp
@ -8,15 +8,15 @@

 doctype: "<!DOCTYPE" ..%\n `>

-html-element: void-element / raw-element / template-element / normal-element
+element: void-element / raw-element / template-element / normal-element

 void-element: `< ("area"/"base"/"br"/"col"/"embed"/"hr"/"img"/"input"/"link"/"meta"/"param"/"source"/"track"/"wbr") __attributes__ [`/] __ `>

-template-element: "<template>" ..%(\n / comment / html-element) "</template>"
+template-element: "<template>" ..%(\n / comment / element) "</template>"

 raw-element: `< @tag=("script"/"style"/"textarea"/"title") __attributes__ `> ..%\n ("</"tag__`>)

-normal-element: `< @tag=id __attributes__ `> ..%(\n / comment / html-element) ("</"tag`>)
+normal-element: `< @tag=id __attributes__ `> ..%(\n / comment / element) ("</"tag`>)

 comment: "<!--" ..%\n "-->"

--- a/match.c
+++ b/match.c
@ -114,8 +114,7 @@ static const char *match_backref(const char *str, match_t *cap, bool ignorecase)
        for (match_t *child = cap->child; child; child = child->nextsibling) {
            if (child->start > prev) {
                size_t len = (size_t)(child->start - prev);
-                if (ignorecase ? memicmp(str, prev, len) != 0
-                               : memcmp(str, prev, len) != 0) {
+                if ((ignorecase ? memicmp : memcmp)(str, prev, len) != 0) {
                    return NULL;
                }
                str += len;
@ -128,8 +127,7 @@ static const char *match_backref(const char *str, match_t *cap, bool ignorecase)
        }
        if (cap->end > prev) {
            size_t len = (size_t)(cap->end - prev);
-            if (ignorecase ? memicmp(str, prev, len) != 0
-                           : memcmp(str, prev, len) != 0) {
+            if ((ignorecase ? memicmp : memcmp)(str, prev, len) != 0) {
                return NULL;
            }
            str += len;
@ -151,9 +149,11 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk
    } else {
        str = f->contents;
    }
+    bool only_start = pat->type == BP_START_OF_FILE || (pat->type == BP_CHAIN && pat->args.multiple.first->type == BP_START_OF_FILE);
    while (str <= f->end) {
        match_t *m = match(defs, f, str, pat, ignorecase);
        if (m) return m;
+        if (only_start) return NULL;
        match_t *s;
        if (skip && (s = match(defs, f, str, skip, ignorecase))) {
            str = s->end > str ? s->end : str + 1;
@ -201,8 +201,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
        }
        case BP_STRING: {
            if (&str[pat->len] > f->end) return NULL;
-            if (ignorecase ? memicmp(str, pat->args.string, (size_t)pat->len) != 0
-                           : memcmp(str, pat->args.string, (size_t)pat->len) != 0)
+            if ((ignorecase ? memicmp : memcmp)(str, pat->args.string, (size_t)pat->len) != 0)
                return NULL;
            return new_match(pat, str, str + pat->len, NULL);
        }
@ -360,7 +359,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
            ADD_OWNER(m1->nextsibling, m2);
            return new_match(pat, str, m2->end, m1);
        }
-        case BP_EQUAL: case BP_NOT_EQUAL: {
+        case BP_MATCH: case BP_NOT_MATCH: {
            match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase);
            if (m1 == NULL) return NULL;

@ -374,17 +373,13 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
                .mmapped=f->mmapped,
                .pats = NULL, .next = NULL,
            };
-            match_t *m2 = match(defs, &inner, str, pat->args.multiple.second, ignorecase);
-            if ((m2 == NULL || m2->end != m1->end) == (pat->type == BP_EQUAL)) {
+            match_t *m2 = next_match(defs, &inner, NULL, pat->args.multiple.second, NULL, ignorecase);
+            if ((!m2 && pat->type == BP_MATCH) || (m2 && pat->type == BP_NOT_MATCH)) {
+                recycle_if_unused(&m2);
                recycle_if_unused(&m1);
-                if (m2 != NULL) recycle_if_unused(&m2);
                return NULL;
            }
-            if (pat->type == BP_EQUAL) {
-                ADD_OWNER(m1->nextsibling, m2);
-            } else {
-                recycle_if_unused(&m2);
-            }
+            if (pat->type == BP_MATCH) ADD_OWNER(m1->nextsibling, m2);
            return new_match(pat, m1->start, m1->end, m1);
        }
        case BP_REPLACE: {
--- a/pattern.c
+++ b/pattern.c
@ -139,7 +139,7 @@ pat_t *chain_together(file_t *f, pat_t *first, pat_t *second)
            p = p->args.capture.capture_pat;
        } else if (p->type == BP_CHAIN) {
            p = p->args.multiple.second;
-        } else if (p->type == BP_EQUAL || p->type == BP_NOT_EQUAL) {
+        } else if (p->type == BP_MATCH || p->type == BP_NOT_MATCH) {
            p = p->args.pat;
        } else break;
    }
@ -174,19 +174,20 @@ static pat_t *bp_simplepattern(file_t *f, const char *str)

    // Expand postfix operators (if any)
    str = after_spaces(pat->end);
-    while (str+2 < f->end && (matchstr(&str, "!=") || matchstr(&str, "=="))) { // Equality <pat1>==<pat2> and inequality <pat1>!=<pat2>
-        bool equal = str[-2] == '=';
+    while (str+2 < f->end) {
+        enum pattype_e type;
+        if (matchchar(&str, '~'))
+            type = BP_MATCH;
+        else if (matchstr(&str, "!~"))
+            type = BP_NOT_MATCH;
+        else break;
+
        pat_t *first = pat;
        pat_t *second = bp_simplepattern(f, str);
        if (!second)
-            file_err(f, str, str, "The '%c=' operator expects a pattern before and after.", equal?'=':'!');
-        if (equal) {
-            if (!(first->len == -1 || second->len == -1 || first->len == second->len))
-                file_err(f, pat->start, second->end,
-                  "These two patterns cannot possibly give the same result (different lengths: %ld != %ld)",
-                  first->len, second->len);
-        }
-        pat = new_pat(f, str, second->end, first->len, equal ? BP_EQUAL : BP_NOT_EQUAL);
+            file_err(f, str, str, "The '%s' operator expects a pattern before and after.", type == BP_MATCH ? "~" : "!~");
+
+        pat = new_pat(f, str, second->end, first->len, type);
        pat->args.multiple.first = first;
        pat->args.multiple.second = second;
        str = pat->end;
--- a/types.h
+++ b/types.h
@ -22,8 +22,8 @@ enum pattype_e {
    BP_CAPTURE,
    BP_OTHERWISE,
    BP_CHAIN,
-    BP_EQUAL,
-    BP_NOT_EQUAL,
+    BP_MATCH,
+    BP_NOT_MATCH,
    BP_REPLACE,
    BP_REF,
    BP_BACKREF,