aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBruce Hill <bruce@bruce-hill.com>2020-09-12 18:20:13 -0700
committerBruce Hill <bruce@bruce-hill.com>2020-09-12 18:20:13 -0700
commitc18eb4c9968289c4808d70f7124c0b6bed5eb022 (patch)
tree97fdbee687e2e83f83403161365b90c99c102ee6
parentc0125378b9ec96149aed3107bff719cd8a01b16d (diff)
Added backrefs
-rw-r--r--bpeg.c6
-rw-r--r--grammar.c48
-rw-r--r--grammar.h2
-rw-r--r--grammars/builtins.bpeg41
-rw-r--r--grammars/html.bpeg18
-rw-r--r--types.h13
-rw-r--r--utils.c2
-rw-r--r--vm.c153
-rw-r--r--vm.h1
9 files changed, 274 insertions, 10 deletions
diff --git a/bpeg.c b/bpeg.c
index eb5e7a7..824977e 100644
--- a/bpeg.c
+++ b/bpeg.c
@@ -127,6 +127,12 @@ int main(int argc, char *argv[])
vm_op_t *pat = bpeg_pattern(src);
check(pat, "Failed to compile pattern");
add_def(g, src, def, pat);
+ } else if (FLAG("--escaped") || FLAG("-e")) {
+ check(npatterns == 0, "Cannot define multiple patterns");
+ vm_op_t *p = bpeg_pattern(argv[i]);
+ check(p, "Pattern failed to compile");
+ add_def(g, argv[i], "pattern", p);
+ ++npatterns;
} else if (argv[i][0] != '-') {
if (npatterns > 0) break;
vm_op_t *p = bpeg_stringpattern(argv[i]);
diff --git a/grammar.c b/grammar.c
index 2be7cd5..61c82c9 100644
--- a/grammar.c
+++ b/grammar.c
@@ -9,20 +9,20 @@
grammar_t *new_grammar(void)
{
grammar_t *g = calloc(sizeof(grammar_t), 1);
- g->definitions = calloc(sizeof(def_t), (g->capacity = 128));
+ g->definitions = calloc(sizeof(def_t), (g->defcapacity = 128));
return g;
}
void add_def(grammar_t *g, const char *src, const char *name, vm_op_t *op)
{
- if (g->size >= g->capacity) {
- g->definitions = realloc(g->definitions, (g->capacity += 32));
+ if (g->defcount >= g->defcapacity) {
+ g->definitions = realloc(g->definitions, sizeof(&g->definitions[0])*(g->defcapacity += 32));
}
- int i = g->size;
+ int i = g->defcount;
g->definitions[i].source = src;
g->definitions[i].name = name;
g->definitions[i].op = op;
- ++g->size;
+ ++g->defcount;
}
/*
@@ -54,11 +54,45 @@ vm_op_t *load_grammar(grammar_t *g, const char *src)
vm_op_t *lookup(grammar_t *g, const char *name)
{
- // Search backwards so newer defs take precedence
- for (int i = g->size-1; i >= 0; i--) {
+ // Search backwards so newer backrefs/defs take precedence
+ for (int i = (int)g->backrefcount-1; i >= 0; i--) {
+ if (streq(g->backrefs[i].name, name)) {
+ return g->backrefs[i].op;
+ }
+ }
+ for (int i = g->defcount-1; i >= 0; i--) {
if (streq(g->definitions[i].name, name)) {
return g->definitions[i].op;
}
}
return NULL;
}
+
+void push_backref(grammar_t *g, const char *name, match_t *capture)
+{
+ check(capture, "No capture provided");
+ if (g->backrefcount >= g->backrefcapacity) {
+ g->backrefs = realloc(g->backrefs, sizeof(g->backrefs[0])*(g->backrefcapacity += 32));
+ }
+ size_t i = g->backrefcount++;
+ g->backrefs[i].name = name;
+ g->backrefs[i].capture = capture;
+ vm_op_t *op = calloc(sizeof(vm_op_t), 1);
+ op->op = VM_BACKREF;
+ op->start = capture->start;
+ op->end = capture->end;
+ op->len = -1; // TODO: maybe calculate this?
+ op->args.backref = (void*)capture;
+ g->backrefs[i].op = op;
+}
+
+void pop_backrefs(grammar_t *g, size_t count)
+{
+ check(count <= g->backrefcount, "Attempt to pop %ld backrefs when there are only %ld", count, g->backrefcount);
+ for ( ; count > 0; count--) {
+ //free(g->backrefs[i].op); // TODO: memory leak problem??
+ int i = (int)g->backrefcount - 1;
+ memset(&g->backrefs[i], 0, sizeof(g->backrefs[i]));
+ --g->backrefcount;
+ }
+}
diff --git a/grammar.h b/grammar.h
index 56d7ef6..f75b111 100644
--- a/grammar.h
+++ b/grammar.h
@@ -11,6 +11,8 @@
grammar_t *new_grammar(void);
void add_def(grammar_t *g, const char *src, const char *name, vm_op_t *op);
+void push_backref(grammar_t *g, const char *name, match_t *capture);
+void pop_backrefs(grammar_t *g, size_t count);
vm_op_t *load_grammar(grammar_t *g, const char *source);
vm_op_t *lookup(grammar_t *g, const char *name);
diff --git a/grammars/builtins.bpeg b/grammars/builtins.bpeg
new file mode 100644
index 0000000..0de9ae3
--- /dev/null
+++ b/grammars/builtins.bpeg
@@ -0,0 +1,41 @@
+# Meta-rules for acting on everything
+pattern = !(/); # Not defined by default
+replacement = {!(/)=>}; # Not defined by default
+replace-all = *&&@replacement &&$$;
+find-all = *(matching-line / {&&(\n/$$)=>});
+only-matches = *{&&@pattern=>'@1\n'};
+matching-line = +&@pattern *. $ ?\n;
+
+# Helper definitions (commonly used)
+crlf = \r\n;
+cr = \r; r = \r;
+anglebraces = `< *(anglebraces / ~~`>) `>;
+brackets = `[ *(brackets / ~~`]) `];
+braces = `{ *(braces / ~~`}) `};
+parens = `( *(parens / ~~`)) `);
+id = (`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9);
+HEX = `0-9/`A-F;
+Hex = `0-9/`a-f/`A-F;
+hex = `0-9/`a-f;
+number = +`0-9 ?(`. *`0-9) / `. +`0-9;
+int = +`0-9;
+digit = `0-9;
+Abc = `a-z/`A-Z;
+ABC = `A-Z;
+abc = `a-z;
+esc = \e; e = \e;
+tab = \t; t = \t;
+nl = \n; lf = \n; n = \n;
+c-block-comment = '/*' &&'*/';
+c-line-comment = '//' &$;
+c-comment = c-line-comment / c-block-comment;
+hash-comment = `# &$;
+comment = !(/); # No default definition, can be overridden
+WS = ` /\t/\n/\r/comment;
+ws = ` /\t;
+$$ = !$.;
+$ = !.;
+^^ = !<$.;
+^ = !<.;
+__ = *(` /\t/\n/\r/comment);
+_ = *(` /\t);
diff --git a/grammars/html.bpeg b/grammars/html.bpeg
new file mode 100644
index 0000000..f6812bd
--- /dev/null
+++ b/grammars/html.bpeg
@@ -0,0 +1,18 @@
+# HTML grammar
+HTML = __ ?(doctype __) *html-element%__ __;
+
+doctype = "<!DOCTYPE" &`>;
+
+html-element = void-element / template-element / raw-text-element / normal-element;
+
+void-element = `< ("area"/"base"/"br"/"col"/"embed"/"hr"/"img"/"input"/"link"/"meta"/"param"/"source"/"track"/"wbr") *(__attribute) __ ?`/ __ `>;
+
+template-element = "<template" __`> __ *(~~`< / comment / html-element / ~~("</template"__`>)) ("</template"__`>);
+
+raw-text-element = `<@[tag]("script"/"style"/"textarea"/"title") *(__attribute) __ `> &("</"tag__`>);
+
+normal-element = !raw-text-element `<@[tag]id *(__attribute) __ `> *(~~`< / comment / html-element / ~~("</"tag__`>)) "</"tag__`>;
+
+comment = "<!--" &&"-->";
+
+attribute = +id%`:__`=__(id / `" &`" / `' &`');
diff --git a/types.h b/types.h
index f8899b3..82c282c 100644
--- a/types.h
+++ b/types.h
@@ -7,7 +7,7 @@
#include <sys/types.h>
/*
- * BPEG virtual machine opcodes
+ * BPEG virtual machine opcodes (these must be kept in sync with the names in vm.c)
*/
enum VMOpcode {
VM_EMPTY = 0,
@@ -25,6 +25,7 @@ enum VMOpcode {
VM_CHAIN,
VM_REPLACE,
VM_REF,
+ VM_BACKREF,
};
/*
@@ -57,6 +58,7 @@ typedef struct vm_op_s {
struct vm_op_s *capture_pat;
char *name;
} capture;
+ void *backref;
struct vm_op_s *pat;
} args;
} vm_op_t;
@@ -81,8 +83,15 @@ typedef struct {
} def_t;
typedef struct {
+ size_t defcount, defcapacity;
def_t *definitions;
- size_t size, capacity;
+
+ size_t backrefcount, backrefcapacity;
+ struct {
+ const char *name;
+ match_t *capture;
+ vm_op_t *op;
+ } *backrefs;
} grammar_t;
#endif
diff --git a/utils.c b/utils.c
index d70d3fc..3a50abd 100644
--- a/utils.c
+++ b/utils.c
@@ -164,7 +164,7 @@ char *readfile(int fd)
while ((just_read=read(fd, &buf[len], capacity-len)) > 0) {
len += (size_t)just_read;
if (len >= capacity)
- buf = realloc(buf, (capacity *= 2));
+ buf = realloc(buf, sizeof(char)*(capacity *= 2));
}
return buf;
}
diff --git a/vm.c b/vm.c
index c82832a..1e2cb76 100644
--- a/vm.c
+++ b/vm.c
@@ -5,6 +5,38 @@
#include "grammar.h"
#include "utils.h"
+static match_t *match_backref(const char *str, vm_op_t *op, match_t *m);
+static size_t push_backrefs(grammar_t *g, match_t *m);
+static match_t *get_capture_n(match_t *m, int *n);
+static match_t *get_capture_named(match_t *m, const char *name);
+
+/*
+ * The names of the opcodes (keep in sync with the enum definition above)
+ */
+static const char *opcode_names[] = {
+ [VM_EMPTY] = "EMPTY",
+ [VM_ANYCHAR] = "ANYCHAR",
+ [VM_ANYTHING_BUT] = "ANYTHING_BUT",
+ [VM_STRING] = "STRING",
+ [VM_RANGE] = "RANGE",
+ [VM_NOT] = "NOT",
+ [VM_UPTO_AND] = "UPTO_AND",
+ [VM_REPEAT] = "REPEAT",
+ [VM_BEFORE] = "BEFORE",
+ [VM_AFTER] = "AFTER",
+ [VM_CAPTURE] = "CAPTURE",
+ [VM_OTHERWISE] = "OTHERWISE",
+ [VM_CHAIN] = "CHAIN",
+ [VM_REPLACE] = "REPLACE",
+ [VM_REF] = "REF",
+ [VM_BACKREF] = "BACKREF",
+};
+
+const char *opcode_name(enum VMOpcode o)
+{
+ return opcode_names[o];
+}
+
/*
* Recursively deallocate a match object and set to NULL
*/
@@ -16,6 +48,20 @@ void destroy_match(match_t **m)
*m = NULL;
}
+static size_t push_backrefs(grammar_t *g, match_t *m)
+{
+ if (m == NULL) return 0;
+ if (m->op->op == VM_REF) return 0;
+ size_t count = 0;
+ if (m->is_capture && m->name_or_replacement) {
+ ++count;
+ push_backref(g, m->name_or_replacement, m->child);
+ }
+ if (m->child) count += push_backrefs(g, m->child);
+ if (m->nextsibling) count += push_backrefs(g, m->nextsibling);
+ return count;
+}
+
/*
* Run virtual machine operation against a string and return
* a match struct, or NULL if no match is found.
@@ -183,7 +229,10 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op)
case VM_CHAIN: {
match_t *m1 = match(g, str, op->args.multiple.first);
if (m1 == NULL) return NULL;
+
+ size_t nbackrefs = push_backrefs(g, m1);
match_t *m2 = match(g, m1->end, op->args.multiple.second);
+ pop_backrefs(g, nbackrefs);
if (m2 == NULL) {
destroy_match(&m1);
return NULL;
@@ -227,6 +276,9 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op)
m->is_ref = 1;
return m;
}
+ case VM_BACKREF: {
+ return match_backref(str, op, (match_t*)op->args.backref);
+ }
default: {
fprintf(stderr, "Unknown opcode: %d", op->op);
_exit(1);
@@ -440,4 +492,105 @@ void print_match(match_t *m, const char *color, int verbose)
}
}
+static match_t *match_backref(const char *str, vm_op_t *op, match_t *cap)
+{
+ check(op->op == VM_BACKREF, "Attempt to match backref against something that's not a backref");
+ match_t *ret = calloc(sizeof(match_t), 1);
+ ret->start = str;
+ ret->op = op;
+ match_t **dest = &ret->child;
+
+ if (cap->is_replacement) {
+ for (const char *r = cap->name_or_replacement; *r; ) {
+ if (*r == '\\') {
+ ++r;
+ if (*(str++) != unescapechar(r, &r)) {
+ destroy_match(&ret);
+ return NULL;
+ }
+ } else if (*r != '@') {
+ if (*(str++) != *r) {
+ destroy_match(&ret);
+ return NULL;
+ }
+ ++r;
+ continue;
+ }
+
+ ++r;
+ match_t *cap = NULL;
+ switch (*r) {
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9': {
+ int n = (int)strtol(r, (char**)&r, 10);
+ cap = get_capture_n(cap->child, &n);
+ break;
+ }
+ case '[': {
+ char *closing = strchr(r+1, ']');
+ if (!closing) {
+ if (*(str++) != '@') {
+ destroy_match(&ret);
+ return NULL;
+ }
+ }
+ ++r;
+ char *name = strndup(r, (size_t)(closing-r));
+ cap = get_capture_named(cap, name);
+ free(name);
+ r = closing + 1;
+ break;
+ }
+ default: {
+ if (*(str++) != '@') {
+ destroy_match(&ret);
+ return NULL;
+ }
+ break;
+ }
+ }
+ if (cap != NULL) {
+ *dest = match_backref(str, op, cap);
+ if (*dest == NULL) {
+ destroy_match(&ret);
+ return NULL;
+ }
+ str = (*dest)->end;
+ dest = &(*dest)->nextsibling;
+ }
+ }
+ } else {
+ const char *prev = cap->start;
+ for (match_t *child = cap->child; child; child = child->nextsibling) {
+ if (child->start > prev) {
+ size_t len = (size_t)(child->start - prev);
+ if (strncmp(str, prev, len) != 0) {
+ destroy_match(&ret);
+ return NULL;
+ }
+ str += len;
+ prev = child->start;
+ }
+ *dest = match_backref(str, op, child);
+ if (*dest == NULL) {
+ destroy_match(&ret);
+ return NULL;
+ }
+ str = (*dest)->end;
+ dest = &(*dest)->nextsibling;
+ prev = child->end;
+ }
+ if (cap->end > prev) {
+ size_t len = (size_t)(cap->end - prev);
+ if (strncmp(str, prev, len) != 0) {
+ destroy_match(&ret);
+ return NULL;
+ }
+ str += len;
+ }
+ }
+ ret->end = str;
+ return ret;
+}
+
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1
diff --git a/vm.h b/vm.h
index 9225461..db1e9f5 100644
--- a/vm.h
+++ b/vm.h
@@ -10,6 +10,7 @@
#include "types.h"
+const char *opcode_name(enum VMOpcode o);
match_t *match(grammar_t *g, const char *str, vm_op_t *op);
void destroy_match(match_t **m);
void print_pattern(vm_op_t *op);