diff options
| -rw-r--r-- | bpeg.c | 6 | ||||
| -rw-r--r-- | grammar.c | 48 | ||||
| -rw-r--r-- | grammar.h | 2 | ||||
| -rw-r--r-- | grammars/builtins.bpeg | 41 | ||||
| -rw-r--r-- | grammars/html.bpeg | 18 | ||||
| -rw-r--r-- | types.h | 13 | ||||
| -rw-r--r-- | utils.c | 2 | ||||
| -rw-r--r-- | vm.c | 153 | ||||
| -rw-r--r-- | vm.h | 1 |
9 files changed, 274 insertions, 10 deletions
@@ -127,6 +127,12 @@ int main(int argc, char *argv[]) vm_op_t *pat = bpeg_pattern(src); check(pat, "Failed to compile pattern"); add_def(g, src, def, pat); + } else if (FLAG("--escaped") || FLAG("-e")) { + check(npatterns == 0, "Cannot define multiple patterns"); + vm_op_t *p = bpeg_pattern(argv[i]); + check(p, "Pattern failed to compile"); + add_def(g, argv[i], "pattern", p); + ++npatterns; } else if (argv[i][0] != '-') { if (npatterns > 0) break; vm_op_t *p = bpeg_stringpattern(argv[i]); @@ -9,20 +9,20 @@ grammar_t *new_grammar(void) { grammar_t *g = calloc(sizeof(grammar_t), 1); - g->definitions = calloc(sizeof(def_t), (g->capacity = 128)); + g->definitions = calloc(sizeof(def_t), (g->defcapacity = 128)); return g; } void add_def(grammar_t *g, const char *src, const char *name, vm_op_t *op) { - if (g->size >= g->capacity) { - g->definitions = realloc(g->definitions, (g->capacity += 32)); + if (g->defcount >= g->defcapacity) { + g->definitions = realloc(g->definitions, sizeof(&g->definitions[0])*(g->defcapacity += 32)); } - int i = g->size; + int i = g->defcount; g->definitions[i].source = src; g->definitions[i].name = name; g->definitions[i].op = op; - ++g->size; + ++g->defcount; } /* @@ -54,11 +54,45 @@ vm_op_t *load_grammar(grammar_t *g, const char *src) vm_op_t *lookup(grammar_t *g, const char *name) { - // Search backwards so newer defs take precedence - for (int i = g->size-1; i >= 0; i--) { + // Search backwards so newer backrefs/defs take precedence + for (int i = (int)g->backrefcount-1; i >= 0; i--) { + if (streq(g->backrefs[i].name, name)) { + return g->backrefs[i].op; + } + } + for (int i = g->defcount-1; i >= 0; i--) { if (streq(g->definitions[i].name, name)) { return g->definitions[i].op; } } return NULL; } + +void push_backref(grammar_t *g, const char *name, match_t *capture) +{ + check(capture, "No capture provided"); + if (g->backrefcount >= g->backrefcapacity) { + g->backrefs = realloc(g->backrefs, sizeof(g->backrefs[0])*(g->backrefcapacity += 32)); + } + size_t i = g->backrefcount++; + g->backrefs[i].name = name; + g->backrefs[i].capture = capture; + vm_op_t *op = calloc(sizeof(vm_op_t), 1); + op->op = VM_BACKREF; + op->start = capture->start; + op->end = capture->end; + op->len = -1; // TODO: maybe calculate this? + op->args.backref = (void*)capture; + g->backrefs[i].op = op; +} + +void pop_backrefs(grammar_t *g, size_t count) +{ + check(count <= g->backrefcount, "Attempt to pop %ld backrefs when there are only %ld", count, g->backrefcount); + for ( ; count > 0; count--) { + //free(g->backrefs[i].op); // TODO: memory leak problem?? + int i = (int)g->backrefcount - 1; + memset(&g->backrefs[i], 0, sizeof(g->backrefs[i])); + --g->backrefcount; + } +} @@ -11,6 +11,8 @@ grammar_t *new_grammar(void); void add_def(grammar_t *g, const char *src, const char *name, vm_op_t *op); +void push_backref(grammar_t *g, const char *name, match_t *capture); +void pop_backrefs(grammar_t *g, size_t count); vm_op_t *load_grammar(grammar_t *g, const char *source); vm_op_t *lookup(grammar_t *g, const char *name); diff --git a/grammars/builtins.bpeg b/grammars/builtins.bpeg new file mode 100644 index 0000000..0de9ae3 --- /dev/null +++ b/grammars/builtins.bpeg @@ -0,0 +1,41 @@ +# Meta-rules for acting on everything +pattern = !(/); # Not defined by default +replacement = {!(/)=>}; # Not defined by default +replace-all = *&&@replacement &&$$; +find-all = *(matching-line / {&&(\n/$$)=>}); +only-matches = *{&&@pattern=>'@1\n'}; +matching-line = +&@pattern *. $ ?\n; + +# Helper definitions (commonly used) +crlf = \r\n; +cr = \r; r = \r; +anglebraces = `< *(anglebraces / ~~`>) `>; +brackets = `[ *(brackets / ~~`]) `]; +braces = `{ *(braces / ~~`}) `}; +parens = `( *(parens / ~~`)) `); +id = (`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9); +HEX = `0-9/`A-F; +Hex = `0-9/`a-f/`A-F; +hex = `0-9/`a-f; +number = +`0-9 ?(`. *`0-9) / `. +`0-9; +int = +`0-9; +digit = `0-9; +Abc = `a-z/`A-Z; +ABC = `A-Z; +abc = `a-z; +esc = \e; e = \e; +tab = \t; t = \t; +nl = \n; lf = \n; n = \n; +c-block-comment = '/*' &&'*/'; +c-line-comment = '//' &$; +c-comment = c-line-comment / c-block-comment; +hash-comment = `# &$; +comment = !(/); # No default definition, can be overridden +WS = ` /\t/\n/\r/comment; +ws = ` /\t; +$$ = !$.; +$ = !.; +^^ = !<$.; +^ = !<.; +__ = *(` /\t/\n/\r/comment); +_ = *(` /\t); diff --git a/grammars/html.bpeg b/grammars/html.bpeg new file mode 100644 index 0000000..f6812bd --- /dev/null +++ b/grammars/html.bpeg @@ -0,0 +1,18 @@ +# HTML grammar +HTML = __ ?(doctype __) *html-element%__ __; + +doctype = "<!DOCTYPE" &`>; + +html-element = void-element / template-element / raw-text-element / normal-element; + +void-element = `< ("area"/"base"/"br"/"col"/"embed"/"hr"/"img"/"input"/"link"/"meta"/"param"/"source"/"track"/"wbr") *(__attribute) __ ?`/ __ `>; + +template-element = "<template" __`> __ *(~~`< / comment / html-element / ~~("</template"__`>)) ("</template"__`>); + +raw-text-element = `<@[tag]("script"/"style"/"textarea"/"title") *(__attribute) __ `> &("</"tag__`>); + +normal-element = !raw-text-element `<@[tag]id *(__attribute) __ `> *(~~`< / comment / html-element / ~~("</"tag__`>)) "</"tag__`>; + +comment = "<!--" &&"-->"; + +attribute = +id%`:__`=__(id / `" &`" / `' &`'); @@ -7,7 +7,7 @@ #include <sys/types.h> /* - * BPEG virtual machine opcodes + * BPEG virtual machine opcodes (these must be kept in sync with the names in vm.c) */ enum VMOpcode { VM_EMPTY = 0, @@ -25,6 +25,7 @@ enum VMOpcode { VM_CHAIN, VM_REPLACE, VM_REF, + VM_BACKREF, }; /* @@ -57,6 +58,7 @@ typedef struct vm_op_s { struct vm_op_s *capture_pat; char *name; } capture; + void *backref; struct vm_op_s *pat; } args; } vm_op_t; @@ -81,8 +83,15 @@ typedef struct { } def_t; typedef struct { + size_t defcount, defcapacity; def_t *definitions; - size_t size, capacity; + + size_t backrefcount, backrefcapacity; + struct { + const char *name; + match_t *capture; + vm_op_t *op; + } *backrefs; } grammar_t; #endif @@ -164,7 +164,7 @@ char *readfile(int fd) while ((just_read=read(fd, &buf[len], capacity-len)) > 0) { len += (size_t)just_read; if (len >= capacity) - buf = realloc(buf, (capacity *= 2)); + buf = realloc(buf, sizeof(char)*(capacity *= 2)); } return buf; } @@ -5,6 +5,38 @@ #include "grammar.h" #include "utils.h" +static match_t *match_backref(const char *str, vm_op_t *op, match_t *m); +static size_t push_backrefs(grammar_t *g, match_t *m); +static match_t *get_capture_n(match_t *m, int *n); +static match_t *get_capture_named(match_t *m, const char *name); + +/* + * The names of the opcodes (keep in sync with the enum definition above) + */ +static const char *opcode_names[] = { + [VM_EMPTY] = "EMPTY", + [VM_ANYCHAR] = "ANYCHAR", + [VM_ANYTHING_BUT] = "ANYTHING_BUT", + [VM_STRING] = "STRING", + [VM_RANGE] = "RANGE", + [VM_NOT] = "NOT", + [VM_UPTO_AND] = "UPTO_AND", + [VM_REPEAT] = "REPEAT", + [VM_BEFORE] = "BEFORE", + [VM_AFTER] = "AFTER", + [VM_CAPTURE] = "CAPTURE", + [VM_OTHERWISE] = "OTHERWISE", + [VM_CHAIN] = "CHAIN", + [VM_REPLACE] = "REPLACE", + [VM_REF] = "REF", + [VM_BACKREF] = "BACKREF", +}; + +const char *opcode_name(enum VMOpcode o) +{ + return opcode_names[o]; +} + /* * Recursively deallocate a match object and set to NULL */ @@ -16,6 +48,20 @@ void destroy_match(match_t **m) *m = NULL; } +static size_t push_backrefs(grammar_t *g, match_t *m) +{ + if (m == NULL) return 0; + if (m->op->op == VM_REF) return 0; + size_t count = 0; + if (m->is_capture && m->name_or_replacement) { + ++count; + push_backref(g, m->name_or_replacement, m->child); + } + if (m->child) count += push_backrefs(g, m->child); + if (m->nextsibling) count += push_backrefs(g, m->nextsibling); + return count; +} + /* * Run virtual machine operation against a string and return * a match struct, or NULL if no match is found. @@ -183,7 +229,10 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op) case VM_CHAIN: { match_t *m1 = match(g, str, op->args.multiple.first); if (m1 == NULL) return NULL; + + size_t nbackrefs = push_backrefs(g, m1); match_t *m2 = match(g, m1->end, op->args.multiple.second); + pop_backrefs(g, nbackrefs); if (m2 == NULL) { destroy_match(&m1); return NULL; @@ -227,6 +276,9 @@ match_t *match(grammar_t *g, const char *str, vm_op_t *op) m->is_ref = 1; return m; } + case VM_BACKREF: { + return match_backref(str, op, (match_t*)op->args.backref); + } default: { fprintf(stderr, "Unknown opcode: %d", op->op); _exit(1); @@ -440,4 +492,105 @@ void print_match(match_t *m, const char *color, int verbose) } } +static match_t *match_backref(const char *str, vm_op_t *op, match_t *cap) +{ + check(op->op == VM_BACKREF, "Attempt to match backref against something that's not a backref"); + match_t *ret = calloc(sizeof(match_t), 1); + ret->start = str; + ret->op = op; + match_t **dest = &ret->child; + + if (cap->is_replacement) { + for (const char *r = cap->name_or_replacement; *r; ) { + if (*r == '\\') { + ++r; + if (*(str++) != unescapechar(r, &r)) { + destroy_match(&ret); + return NULL; + } + } else if (*r != '@') { + if (*(str++) != *r) { + destroy_match(&ret); + return NULL; + } + ++r; + continue; + } + + ++r; + match_t *cap = NULL; + switch (*r) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': { + int n = (int)strtol(r, (char**)&r, 10); + cap = get_capture_n(cap->child, &n); + break; + } + case '[': { + char *closing = strchr(r+1, ']'); + if (!closing) { + if (*(str++) != '@') { + destroy_match(&ret); + return NULL; + } + } + ++r; + char *name = strndup(r, (size_t)(closing-r)); + cap = get_capture_named(cap, name); + free(name); + r = closing + 1; + break; + } + default: { + if (*(str++) != '@') { + destroy_match(&ret); + return NULL; + } + break; + } + } + if (cap != NULL) { + *dest = match_backref(str, op, cap); + if (*dest == NULL) { + destroy_match(&ret); + return NULL; + } + str = (*dest)->end; + dest = &(*dest)->nextsibling; + } + } + } else { + const char *prev = cap->start; + for (match_t *child = cap->child; child; child = child->nextsibling) { + if (child->start > prev) { + size_t len = (size_t)(child->start - prev); + if (strncmp(str, prev, len) != 0) { + destroy_match(&ret); + return NULL; + } + str += len; + prev = child->start; + } + *dest = match_backref(str, op, child); + if (*dest == NULL) { + destroy_match(&ret); + return NULL; + } + str = (*dest)->end; + dest = &(*dest)->nextsibling; + prev = child->end; + } + if (cap->end > prev) { + size_t len = (size_t)(cap->end - prev); + if (strncmp(str, prev, len) != 0) { + destroy_match(&ret); + return NULL; + } + str += len; + } + } + ret->end = str; + return ret; +} + // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 @@ -10,6 +10,7 @@ #include "types.h" +const char *opcode_name(enum VMOpcode o); match_t *match(grammar_t *g, const char *str, vm_op_t *op); void destroy_match(match_t **m); void print_pattern(vm_op_t *op); |
