aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Bruce Hill <bruce@bruce-hill.com> 2020-09-13 20:33:11 -0700
committer Bruce Hill <bruce@bruce-hill.com> 2020-09-13 20:33:11 -0700
commit 1d1c3d35aae0e060a6527d6e83575dd7ff71328e (patch)
tree bc9e486906d47fac29d8d0f1e8c8915d2bf5ab9f
parent ab5ef5a77af9f2fc7c3353f05bf716b1a6b93f73 (diff)
Added == operator
-rw-r--r-- bpeg.bpeg 8
-rw-r--r-- bpeg.c 21
-rw-r--r-- compiler.c 26
-rw-r--r-- grammars/builtins.bpeg 5
-rw-r--r-- grammars/html.bpeg 21
-rw-r--r-- types.h 1
-rw-r--r-- vm.c 25
7 files changed, 89 insertions, 18 deletions
diff --git a/bpeg.bpeg b/bpeg.bpeg
index 0a03aa8..3301155 100644
--- a/bpeg.bpeg
+++ b/bpeg.bpeg
@@ -4,10 +4,14 @@ Grammar = __ *Def%(__`;__) ?(`;__);
Def = @[name]Ref __ `= __ @[definition]extended-pat;
# This is used for command line arguments:
-String-grammar = *(`\ pat ?`; / .);
+String-pattern = *(`\ pat ?`; / .);
-pat = Empty / Dot / String / Char-range / Char / Escape-range / Escape / No / Anything-but
+pat = suffixed-pat / simple-pat;
+simple-pat = Empty / Dot / String / Char-range / Char / Escape-range / Escape / No / Anything-but
/ Upto-and / Repeat / After / Before / Capture / Replace / Ref / parens;
+suffixed-pat = Eq-pat;
+
+Eq-pat = @[first]simple-pat "==" @[second]pat;
Empty = `/ >(__ (`)/`}));
Dot = `.;
diff --git a/bpeg.c b/bpeg.c
index 95f133c..89f35e7 100644
--- a/bpeg.c
+++ b/bpeg.c
@@ -24,7 +24,8 @@
* ( <pat> ) <pat>
* @ <pat> capture <pat>
* @ [ <name> ] <pat> <pat> named <name>
- * { <pat> => <str> } <pat> replaced with <str>
+ * { <pat> => <str> } <pat> replaced with <str>
+ * <pat1> == <pat2> <pat1> iff <pat2> matches at the same spot for the same length
* "@1" or "@[1]" first capture
* "@foo" or "@[foo]" capture named "foo"
* <pat1> <pat2> <pat1> followed by <pat2>
@@ -50,7 +51,8 @@ static const char *usage = (
"Flags:\n"
" -h --help\t print the usage and quit\n"
" -v --verbose\t print verbose debugging info\n"
- " -d --define <name>=<def> define a grammar rule\n"
+ " -d --define <name>=<def>\t define a grammar rule\n"
+ " -D --define-string <name>=<def>\t define a grammar rule (string-pattern)\n"
" -e --escaped <pat>\t provide an escaped pattern (equivalent to bpeg '\\(<pat>)')\n"
" -s --string <pat>\t provide a string pattern (equivalent to bpeg '<pat>', but may be useful if '<pat>' begins with a '-')\n"
" -r --replace <replacement> replace the input pattern with the given replacement\n"
@@ -83,8 +85,10 @@ static int run_match(grammar_t *g, const char *filename, vm_op_t *pattern, int v
}
match_t *m = match(g, input, pattern);
if (m != NULL && m->end > m->start + 1) {
- if (isatty(STDOUT_FILENO)) printf("\033[1;4;33m%s\033[0m\n", filename);
- else printf("%s\n", filename);
+ if (filename != NULL) {
+ if (isatty(STDOUT_FILENO)) printf("\033[1;4;33m%s\033[0m\n", filename);
+ else printf("%s\n", filename);
+ }
print_match(m, isatty(STDOUT_FILENO) ? "\033[0m" : NULL, verbose);
freefile(input);
return 0;
@@ -153,6 +157,15 @@ int main(int argc, char *argv[])
vm_op_t *pat = bpeg_pattern(src);
check(pat, "Failed to compile pattern");
add_def(g, src, def, pat);
+ } else if (FLAG("--define-string") || FLAG("-D")) {
+ char *def = flag;
+ char *eq = strchr(def, '=');
+ check(eq, usage);
+ *eq = '\0';
+ char *src = ++eq;
+ vm_op_t *pat = bpeg_stringpattern(src);
+ check(pat, "Failed to compile pattern");
+ add_def(g, src, def, pat);
} else if (FLAG("--escaped") || FLAG("-e")) {
check(npatterns == 0, "Cannot define multiple patterns");
vm_op_t *p = bpeg_pattern(flag);
diff --git a/compiler.c b/compiler.c
index 7c5220d..4e2b185 100644
--- a/compiler.c
+++ b/compiler.c
@@ -372,8 +372,7 @@ vm_op_t *bpeg_simplepattern(const char *str)
const char *refname = str;
str = after_name(str);
op->op = VM_REF;
- op->len = (ssize_t)(str - refname);
- op->args.s = strndup(refname, (size_t)op->len);
+ op->args.s = strndup(refname, (size_t)(str - refname));
break;
} else {
free(op);
@@ -382,6 +381,29 @@ vm_op_t *bpeg_simplepattern(const char *str)
}
}
op->end = str;
+
+ // Postfix operators:
+ postfix:
+ str = after_spaces(str);
+ if (strncmp(str, "==", 2) == 0) {
+ str += 2;
+ vm_op_t *first = op;
+ vm_op_t *second = bpeg_simplepattern(str);
+ check(second, "Expected pattern after '=='");
+ check(first->len == -1 || second->len == -1 || first->len == second->len,
+ "Two patterns cannot possibly match the same (different lengths: %ld != %ld)",
+ first->len, second->len);
+ op = calloc(sizeof(vm_op_t), 1);
+ op->op = VM_EQUAL;
+ op->start = str;
+ op->end = second->end;
+ op->len = (first->len == -1 || second->len == -1) ? -1 : first->len;
+ op->args.multiple.first = first;
+ op->args.multiple.second = second;
+ str = op->end;
+ goto postfix;
+ }
+
return op;
}
diff --git a/grammars/builtins.bpeg b/grammars/builtins.bpeg
index 0741bfc..f92e0aa 100644
--- a/grammars/builtins.bpeg
+++ b/grammars/builtins.bpeg
@@ -2,7 +2,7 @@
pattern = !(/); # Not defined by default
replacement = {!(/)=>}; # Not defined by default
replace-all = +&&@replacement &&$$;
-find-all = {&&>matching-line=>} +(matching-line/non-matching-line);
+find-all = {&&>matching-line=>} +(matching-line/non-matching-line) ?{!<\n => "\n"};
only-matches = +{&&@pattern=>'@1\n'};
matching-line = +&@pattern *. $ ?\n;
non-matching-line = {&&(\n/$$)=>};
@@ -14,7 +14,8 @@ anglebraces = `< *(anglebraces / ~~`>) `>;
brackets = `[ *(brackets / ~~`]) `];
braces = `{ *(braces / ~~`}) `};
parens = `( *(parens / ~~`)) `);
-id = (`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9);
+id = !<(`a-z/`A-Z/`_/`0-9) (`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9);
+word = !<(`a-z/`A-Z/`_/`0-9) +(`a-z/`A-Z) !>(`0-9/`_);
HEX = `0-9/`A-F;
Hex = `0-9/`a-f/`A-F;
hex = `0-9/`a-f;
diff --git a/grammars/html.bpeg b/grammars/html.bpeg
index f6812bd..7f8976a 100644
--- a/grammars/html.bpeg
+++ b/grammars/html.bpeg
@@ -3,16 +3,25 @@ HTML = __ ?(doctype __) *html-element%__ __;
doctype = "<!DOCTYPE" &`>;
-html-element = void-element / template-element / raw-text-element / normal-element;
+html-element = (
+ >(`<("area"/"base"/"br"/"col"/"embed"/"hr"/"img"/"input"/"link"/"meta"/"param"/"source"/"track"/"wbr")) void-element
+ / >(`<("script"/"style"/"textarea"/"title")) raw-element
+ / >(`<("template")) template-element
+ / normal-element);
-void-element = `< ("area"/"base"/"br"/"col"/"embed"/"hr"/"img"/"input"/"link"/"meta"/"param"/"source"/"track"/"wbr") *(__attribute) __ ?`/ __ `>;
+void-element = `< @[tag](id==match-tag) __attributes__ `/? __ `>;
-template-element = "<template" __`> __ *(~~`< / comment / html-element / ~~("</template"__`>)) ("</template"__`>);
+template-element = `< @[tag](id==match-tag) __`> __ >match-body @[body]0+(~~`< / comment / html-element / ~~("</"tag__`>)) ("</"tag__`>);
-raw-text-element = `<@[tag]("script"/"style"/"textarea"/"title") *(__attribute) __ `> &("</"tag__`>);
+raw-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*~~("</"tag__`>) ("</"tag__`>);
-normal-element = !raw-text-element `<@[tag]id *(__attribute) __ `> *(~~`< / comment / html-element / ~~("</"tag__`>)) "</"tag__`>;
+normal-element = `< @[tag](id==match-tag) __attributes__ `> >match-body @[body]*(~~`< / comment / html-element / ~~("</"tag__`>)) "</"tag__`>;
comment = "<!--" &&"-->";
-attribute = +id%`:__`=__(id / `" &`" / `' &`');
+attributes = *(!(attribute==match-attribute))%__ __(attribute==match-attribute)__ *attribute%__;
+attribute = (+id%`:)__`=__ (id / `" &`" / `' &`');
+attribute = (+id%`:)__`=__ (id / `" *~`" `" / `' *~`' `');
+match-attribute = attribute;
+match-tag = id;
+match-body = (/);
diff --git a/types.h b/types.h
index 82c282c..f335285 100644
--- a/types.h
+++ b/types.h
@@ -23,6 +23,7 @@ enum VMOpcode {
VM_CAPTURE,
VM_OTHERWISE,
VM_CHAIN,
+ VM_EQUAL,
VM_REPLACE,
VM_REF,
VM_BACKREF,
diff --git a/vm.c b/vm.c
index c1427a5..8e0957d 100644
--- a/vm.c
+++ b/vm.c
@@ -28,6 +28,7 @@ static const char *opcode_names[] = {
[VM_OTHERWISE] = "OTHERWISE",
[VM_CHAIN] = "CHAIN",
[VM_REPLACE] = "REPLACE",
+ [VM_EQUAL] = "EQUAL",
[VM_REF] = "REF",
[VM_BACKREF] = "BACKREF",
};
@@ -253,6 +254,25 @@ static match_t *_match(grammar_t *g, const char *str, vm_op_t *op, recursive_ref
m1->nextsibling = m2;
return m;
}
+ case VM_EQUAL: {
+ match_t *m1 = _match(g, str, op->args.multiple.first, rec);
+ if (m1 == NULL) return NULL;
+
+ // <p1>==<p2> matches iff both have the same start and end point:
+ match_t *m2 = _match(g, str, op->args.multiple.second, rec);
+ if (m2 == NULL || m2->end != m1->end) {
+ destroy_match(&m1);
+ destroy_match(&m2);
+ return NULL;
+ }
+ match_t *m = calloc(sizeof(match_t), 1);
+ m->start = str;
+ m->end = m2->end;
+ m->op = op;
+ m->child = m1;
+ m1->nextsibling = m2;
+ return m;
+ }
case VM_REPLACE: {
match_t *m = calloc(sizeof(match_t), 1);
m->start = str;
@@ -512,8 +532,8 @@ void print_match(match_t *m, const char *color, int verbose)
const char *prev = m->start;
for (match_t *child = m->child; child; child = child->nextsibling) {
// Skip children from e.g. zero-width matches like >@foo
- if (!(m->start <= child->start && child->start <= m->end &&
- m->start <= child->end && child->end <= m->end))
+ if (!(prev <= child->start && child->start <= m->end &&
+ prev <= child->end && child->end <= m->end))
continue;
if (child->start > prev)
printf("%s%.*s", color ? color : "", (int)(child->start - prev), prev);
@@ -608,6 +628,7 @@ static match_t *match_backref(const char *str, vm_op_t *op, match_t *cap)
str += len;
prev = child->start;
}
+ if (child->start < prev) continue;
*dest = match_backref(str, op, child);
if (*dest == NULL) {
destroy_match(&ret);