From 2a11acc66738d6300bfa90a22adcd540371060f3 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Fri, 11 Sep 2020 01:28:06 -0700 Subject: Major overhaul refactor restructuring --- Makefile | 17 +- bpeg.bpeg | 3 +- bpeg.c | 1136 ++---------------------------------------------------------- bpeg.h | 112 ------ compiler.c | 461 ++++++++++++++++++++++++ compiler.h | 16 + grammar.c | 93 +++++ grammar.h | 18 + types.h | 89 +++++ utils.c | 171 +++++++++ utils.h | 129 +------ vm.c | 449 ++++++++++++++++++++++++ vm.h | 18 + 13 files changed, 1383 insertions(+), 1329 deletions(-) delete mode 100644 bpeg.h create mode 100644 compiler.c create mode 100644 compiler.h create mode 100644 grammar.c create mode 100644 grammar.h create mode 100644 types.h create mode 100644 utils.c create mode 100644 vm.c create mode 100644 vm.h diff --git a/Makefile b/Makefile index 205b8c2..986ecdc 100644 --- a/Makefile +++ b/Makefile @@ -1,14 +1,21 @@ -PREFIX= +PREFIX=/usr/local CFLAGS=-Wall -Wextra -pedantic -Wmissing-prototypes -Wstrict-prototypes +LDFLAGS= G ?= O ?= -O3 +CFILES=compiler.c grammar.c utils.c vm.c +OBJFILES=$(CFILES:.c=.o) + all: bpeg -clean: - rm -f bpeg +.c.o: + cc -c $(CFLAGS) $(G) $(O) -o $@ $< -bpeg: bpeg.c bpeg.h utils.h - cc $(CFLAGS) $(G) $(O) $< -o $@ +bpeg: $(OBJFILES) bpeg.c + cc $(CFLAGS) $(G) $(O) -o $@ $^ + +clean: + rm -f bpeg $(OBJFILES) .PHONY: all clean diff --git a/bpeg.bpeg b/bpeg.bpeg index f5d490d..0a03aa8 100644 --- a/bpeg.bpeg +++ b/bpeg.bpeg @@ -1,7 +1,6 @@ # This is a file defining the BPEG grammar using BPEG syntax -Grammar; -Grammar = __ @[main-pattern]extended-pat __ *((__`;__) Def) ?(`;__); +Grammar = __ *Def%(__`;__) ?(`;__); Def = @[name]Ref __ `= __ @[definition]extended-pat; # This is used for command line arguments: diff --git a/bpeg.c b/bpeg.c index 41f7238..23fa6fc 100644 --- a/bpeg.c +++ b/bpeg.c @@ -31,1089 +31,38 @@ * / otherwise * ; = is defined to be */ +#include +#include +#include +#include +#include + +#include "compiler.h" +#include "grammar.h" +#include "utils.h" +#include "vm.h" + +static const char *usage = ( + "Usage:\n" + " bpeg [flags] [...]\n\n" + "Flags:\n" + " -h --help\t print the usage and quit\n" + " -v --verbose\t print verbose debugging info\n" + " -s --slow\t run in slow mode for debugging\n" + " -r --replace replace the input pattern with the given replacement\n" + " -g --grammar use the specified file as a grammar\n"); -#include "bpeg.h" - -/* - * Recursively deallocate a match object and return NULL - */ -static match_t *free_match(match_t *m) -{ - if (m->child) m->child = free_match(m->child); - if (m->nextsibling) m->nextsibling = free_match(m->nextsibling); - free(m); - return NULL; -} - -/* - * Run virtual machine operation against a string and return - * a match struct, or NULL if no match is found. - * The returned value should be free()'d to avoid memory leaking. - */ -static match_t *match(const char *str, vm_op_t *op) -{ - //tailcall: - switch (op->op) { - case VM_EMPTY: { - match_t *m = calloc(sizeof(match_t), 1); - m->op = op; - m->start = str; - m->end = str; - return m; - } - case VM_ANYCHAR: { - if (!*str || (!op->multiline && *str == '\n')) - return NULL; - match_t *m = calloc(sizeof(match_t), 1); - m->op = op; - m->start = str; - m->end = str+1; - return m; - } - case VM_STRING: { - if (strncmp(str, op->args.s, op->len) != 0) - return NULL; - match_t *m = calloc(sizeof(match_t), 1); - m->op = op; - m->start = str; - m->end = str + op->len; - return m; - } - case VM_RANGE: { - if (*str < op->args.range.low || *str > op->args.range.high) - return NULL; - match_t *m = calloc(sizeof(match_t), 1); - m->op = op; - m->start = str; - m->end = str + 1; - return m; - } - case VM_NOT: case VM_ANYTHING_BUT: { - if (op->op == VM_ANYTHING_BUT) - if (!*str || (!op->multiline && *str == '\n')) - return NULL; - match_t *m = match(str, op->args.pat); - if (m != NULL) { - m = free_match(m); - return NULL; - } - m = calloc(sizeof(match_t), 1); - m->op = op; - m->start = str; - if (op->op == VM_ANYTHING_BUT) ++str; - m->end = str; - return m; - } - case VM_UPTO_AND: { - match_t *m = calloc(sizeof(match_t), 1); - m->start = str; - m->op = op; - match_t *p = NULL; - for (const char *prev = NULL; p == NULL && prev < str; ) { - prev = str; - p = match(str, op->args.pat); - if (*str && (op->multiline || *str != '\n')) - ++str; - } - if (p) { - m->end = p->end; - m->child = p; - return m; - } - m = free_match(m); - return NULL; - } - case VM_REPEAT: { - match_t *m = calloc(sizeof(match_t), 1); - m->start = str; - m->end = str; - m->op = op; - if (op->args.repetitions.max == 0) return m; - - match_t **dest = &m->child; - - const char *prev = str; - size_t reps; - for (reps = 0; reps < (size_t)op->args.repetitions.max; ++reps) { - // Separator - match_t *sep = NULL; - if (op->args.repetitions.sep != NULL && reps > 0) { - sep = match(str, op->args.repetitions.sep); - if (sep == NULL) break; - str = sep->end; - } - match_t *p = match(str, op->args.repetitions.repeat_pat); - if (p == NULL || (p->end == prev && reps > 0)) { // Prevent infinite loops - if (sep) sep = free_match(sep); - if (p) p = free_match(p); - break; - } - if (sep) { - *dest = sep; - dest = &sep->nextsibling; - } - *dest = p; - dest = &p->nextsibling; - str = p->end; - prev = str; - } - - if ((ssize_t)reps < op->args.repetitions.min) { - m = free_match(m); - return NULL; - } - m->end = str; - return m; - } - case VM_AFTER: { - ssize_t backtrack = op->args.pat->len; - check(backtrack != -1, "'<' is only allowed for fixed-length operations"); - // Check for necessary space: - for (int i = 0; i < backtrack; i++) { - if (str[-i] == '\0') return NULL; - } - match_t *before = match(str - backtrack, op->args.pat); - if (before == NULL) return NULL; - before = free_match(before); - match_t *m = calloc(sizeof(match_t), 1); - m->start = str; - m->end = str; - m->op = op; - return m; - } - case VM_BEFORE: { - match_t *after = match(str, op->args.pat); - if (after == NULL) return NULL; - after = free_match(after); - match_t *m = calloc(sizeof(match_t), 1); - m->start = str; - m->end = str; - m->op = op; - return m; - } - case VM_CAPTURE: { - match_t *p = match(str, op->args.pat); - if (p == NULL) return NULL; - match_t *m = calloc(sizeof(match_t), 1); - m->start = str; - m->end = p->end; - m->op = op; - m->child = p; - m->is_capture = 1; - if (op->args.capture.name) - m->name_or_replacement = op->args.capture.name; - return m; - } - case VM_OTHERWISE: { - match_t *m = match(str, op->args.multiple.first); - if (m == NULL) m = match(str, op->args.multiple.second); - return m; - } - case VM_CHAIN: { - match_t *m1 = match(str, op->args.multiple.first); - if (m1 == NULL) return NULL; - match_t *m2 = match(m1->end, op->args.multiple.second); - if (m2 == NULL) { - m1 = free_match(m1); - return NULL; - } - match_t *m = calloc(sizeof(match_t), 1); - m->start = str; - m->end = m2->end; - m->op = op; - m->child = m1; - m1->nextsibling = m2; - return m; - } - case VM_REPLACE: { - match_t *m = calloc(sizeof(match_t), 1); - m->start = str; - m->op = op; - if (op->args.replace.replace_pat) { - match_t *p = match(str, op->args.replace.replace_pat); - if (p == NULL) return NULL; - m->child = p; - m->end = p->end; - } else { - m->end = m->start; - } - m->is_replacement = 1; - m->name_or_replacement = op->args.replace.replacement; - return m; - } - case VM_REF: { - // Search backwards so newer defs take precedence - for (int i = ndefs-1; i >= 0; i--) { - if (streq(defs[i].name, op->args.s)) { - // Bingo! - /* - op = defs[i].op; - goto tailcall; - */ - match_t *p = match(str, defs[i].op); - if (p == NULL) return NULL; - match_t *m = calloc(sizeof(match_t), 1); - m->start = p->start; - m->end = p->end; - m->op = op; - m->child = p; - m->name_or_replacement = defs[i].name; - m->is_ref = 1; - return m; - } - } - check(0, "Unknown identifier: '%s'", op->args.s); - return NULL; - } - default: { - fprintf(stderr, "Unknown opcode: %d", op->op); - _exit(1); - return NULL; - } - } -} - -/* - * Helper function to initialize a range object. - */ -static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep) -{ - op->op = VM_REPEAT; - if (pat->len >= 0 && (sep == NULL || sep->len >= 0) && min == max && min >= 0) - op->len = pat->len * min + (sep == NULL || min == 0 ? 0 : sep->len * (min-1)); - else - op->len = -1; - op->args.repetitions.min = min; - op->args.repetitions.max = max; - op->args.repetitions.repeat_pat = pat; - op->args.repetitions.sep = sep; -} - -/* - * Take an opcode and expand it into a chain of patterns if it's - * followed by any patterns (e.g. "`x `y"), otherwise return - * the original input. - */ -static vm_op_t *expand_chain(const char *source, vm_op_t *first) -{ - visualize(source, first->end, "Expanding chain..."); - vm_op_t *second = compile_bpeg(source, first->end); - if (second == NULL) return first; - second = expand_chain(source, second); - check(second->end > first->end, "No forward progress in chain!"); - visualize(source, second->end, "Got chained pair."); - return chain_together(first, second); -} - -/* - * Take an opcode and expand it into a chain of choices if it's - * followed by any "/"-separated patterns (e.g. "`x/`y"), otherwise - * return the original input. - */ -static vm_op_t *expand_choices(const char *source, vm_op_t *first) -{ - first = expand_chain(source, first); - const char *str = first->end; - if (!matchchar(&str, '/')) return first; - visualize(source, str, "Expanding choices..."); - //debug("Otherwise:\n"); - vm_op_t *second = compile_bpeg(source, str); - check(second, "Expected pattern after '/'"); - second = expand_choices(source, second); - vm_op_t *choice = calloc(sizeof(vm_op_t), 1); - choice->op = VM_OTHERWISE; - choice->start = first->start; - if (first->len == second->len) - choice->len = first->len; - else choice->len = -1; - choice->end = second->end; - choice->args.multiple.first = first; - choice->args.multiple.second = second; - visualize(source, choice->end, "Got two choices"); - return choice; -} - -static char unescapechar(const char *escaped, const char **end) -{ - size_t len = 1; - char ret = *escaped; - switch (*escaped) { - case 'a': ret = '\a'; break; case 'b': ret = '\b'; break; - case 'n': ret = '\n'; break; case 'r': ret = '\r'; break; - case 't': ret = '\t'; break; case 'v': ret = '\v'; break; - case 'e': ret = '\033'; break; - case 'x': { // Hex - static const char hextable[255] = { - ['0']=0x10, ['1']=0x1, ['2']=0x2, ['3']=0x3, ['4']=0x4, - ['5']=0x5, ['6']=0x6, ['7']=0x7, ['8']=0x8, ['9']=0x9, - ['a']=0xa, ['b']=0xb, ['c']=0xc, ['d']=0xd, ['e']=0xe, ['f']=0xf, - ['A']=0xa, ['B']=0xb, ['C']=0xc, ['D']=0xd, ['E']=0xe, ['F']=0xf, - }; - if (hextable[(int)escaped[1]] && hextable[(int)escaped[2]]) { - ret = (hextable[(int)escaped[1]] << 4) | (hextable[(int)escaped[2]] & 0xF); - len = 3; - } - break; - } - case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // Octal - ret = escaped[0] - '0'; - if ('0' <= escaped[1] && escaped[1] <= '7') { - ++len; - ret = (ret << 3) | (escaped[1] - '0'); - if ('0' <= escaped[2] && escaped[2] <= '7') { - ++len; - ret = (ret << 3) | (escaped[2] - '0'); - } - } - break; - } - default: break; - } - *end = &escaped[len]; - return ret; -} - -static vm_op_t *chain_together(vm_op_t *first, vm_op_t *second) -{ - if (first == NULL) return second; - if (second == NULL) return first; - vm_op_t *chain = calloc(sizeof(vm_op_t), 1); - chain->op = VM_CHAIN; - chain->start = first->start; - if (first->len >= 0 && second->len >= 0) - chain->len = first->len + second->len; - else chain->len = -1; - chain->end = second->end; - chain->args.multiple.first = first; - chain->args.multiple.second = second; - return chain; -} - -/* - * Compile a string of BPEG code into virtual machine opcodes - */ -static vm_op_t *compile_bpeg(const char *source, const char *str) -{ - if (!*str) return NULL; - visualize(source, str, "Compiling..."); - //debug("Parsing \"%s\"...\n", str); - str = after_spaces(str); - check(*str, "Expected a pattern"); - vm_op_t *op = calloc(sizeof(vm_op_t), 1); - op->start = str; - op->len = -1; - char c = *str; - ++str; - switch (c) { - // Any char (dot) ($. is multiline anychar) - case '.': { - anychar: - visualize(source, str, "Dot"); - //debug("Dot\n"); - op->op = VM_ANYCHAR; - op->len = 1; - break; - } - // Char literals - case '`': { - char literal[2] = {*str, '\0'}; - ++str; - visualize(source, str, "Char literal"); - check(literal[0], "Expected character after '`'\n"); - op->len = 1; - if (matchchar(&str, '-')) { // Range - visualize(source, str, "Char range"); - char c2 = *str; - check(c2, "Expected character after '-'"); - check(c2 >= literal[0], "Character range must be low-to-high"); - op->op = VM_RANGE; - op->args.range.low = literal[0]; - op->args.range.high = c2; - ++str; - } else { - //debug("Char literal\n"); - op->op = VM_STRING; - op->args.s = strdup(literal); - } - break; - } - // Escapes - case '\\': { - //debug("Escape sequence\n"); - visualize(source, str, "Escape sequence"); - check(*str, "Expected escape after '\\'"); - op->len = 1; - char e = unescapechar(str, &str); - if (*str == '-') { // Escape range (e.g. \x00-\xFF) - ++str; - char e2 = unescapechar(str, &str); - check(e2, "Expected character after '-'"); - check(e2 >= e, "Character range must be low-to-high"); - op->op = VM_RANGE; - op->args.range.low = e; - op->args.range.high = e2; - } else { - char literal[2] = {e, '\0'}; - op->op = VM_STRING; - op->args.s = strdup(literal); - } - break; - } - // String literal - case '"': case '\'': case '\002': { - visualize(source, str, "String literal"); - char endquote = c == '\002' ? '\003' : c; - char *literal = (char*)str; - for (; *str && *str != endquote; str++) { - if (*str == '\\') { - check(str[1], "Expected more string contents after backslash"); - ++str; - } - visualize(source, str, "String literal"); - } - size_t len = (size_t)(str - literal); - literal = strndup(literal, len); - len = unescape_string(literal, literal, len); - - op->op = VM_STRING; - op->len = len; - op->args.s = literal; - - check(matchchar(&str, endquote), "Missing closing quote"); - break; - } - // Not - case '!': { - // debug("Not pattern\n"); - visualize(source, str, "Not "); - vm_op_t *p = compile_bpeg(source, str); - check(p, "Expected pattern after '!'\n"); - str = p->end; - op->op = VM_NOT; - op->len = 0; - op->args.pat = p; - break; - } - // Anything but - case '~': { - if (matchchar(&str, '~')) op->multiline = 1; - visualize(source, str, "Anything but "); - vm_op_t *p = compile_bpeg(source, str); - check(p, "Expected pattern after '~'\n"); - str = p->end; - op->op = VM_ANYTHING_BUT; - op->len = -1; - op->args.pat = p; - break; - } - // Upto and including - case '&': { - if (matchchar(&str, '&')) op->multiline = 1; - visualize(source, str, "Upto and including "); - vm_op_t *p = compile_bpeg(source, str); - check(p, "Expected pattern after '&'\n"); - str = p->end; - op->op = VM_UPTO_AND; - op->len = -1; - op->args.pat = p; - break; - } - // Number of repetitions: (- / - / + / "") - case '0': case '1': case '2': case '3': case '4': case '5': - case '6': case '7': case '8': case '9': { - visualize(source, str, "Repeat "); - ssize_t min = -1, max = -1; - --str; - long n1 = strtol(str, (char**)&str, 10); - if (matchchar(&str, '-')) { - str = after_spaces(str); - const char *start = str; - long n2 = strtol(str, (char**)&str, 10); - if (str == start) min = 0, max = n1; - else min = n1, max = n2; - } else if (matchchar(&str, '+')) { - min = n1, max = -1; - } else { - min = n1, max = n1; - } - visualize(source, str, NULL); - vm_op_t *pat = compile_bpeg(source, str); - check(pat, "Expected pattern after repetition count"); - str = pat->end; - str = after_spaces(str); - if (matchchar(&str, '%')) { - visualize(source, str, "Repeat with separator"); - vm_op_t *sep = compile_bpeg(source, str); - check(sep, "Expected pattern for separator after '%%'"); - str = sep->end; - set_range(op, min, max, pat, sep); - } else { - set_range(op, min, max, pat, NULL); - } - visualize(source, str, NULL); - //debug("min = %lld max = %lld\n", (long long)op->args.repetitions.min, (long long)op->args.repetitions.max); - break; - } - // Special repetitions: - case '+': case '*': case '?': { - //debug("Special repetitions\n"); - visualize(source, str, "Repeat "); - ssize_t min = -1, max = -1; - switch (c) { - case '+': min = 1, max = -1; break; - case '*': min = 0, max = -1; break; - case '?': min = 0, max = 1; break; - } - vm_op_t *pat = compile_bpeg(source, str); - check(pat, "Expected pattern after +"); - str = pat->end; - str = after_spaces(str); - if (matchchar(&str, '%')) { - visualize(source, str, "Repeat with separator"); - vm_op_t *sep = compile_bpeg(source, str); - check(sep, "Expected pattern for separator after '%%'"); - str = sep->end; - set_range(op, min, max, pat, sep); - } else { - set_range(op, min, max, pat, NULL); - } - visualize(source, str, NULL); - //debug("min = %lld max = %lld\n", (long long)op->args.repetitions.min, (long long)op->args.repetitions.max); - break; - } - // Lookbehind - case '<': { - visualize(source, str, "After "); - //debug("Lookbehind\n"); - vm_op_t *pat = compile_bpeg(source, str); - check(pat, "Expected pattern after <"); - str = pat->end; - check(pat->len != -1, "Lookbehind patterns must have a fixed length"); - str = pat->end; - op->op = VM_AFTER; - op->len = 0; - op->args.pat = pat; - break; - } - // Lookahead - case '>': { - visualize(source, str, "Before "); - //debug("Lookahead\n"); - vm_op_t *pat = compile_bpeg(source, str); - check(pat, "Expected pattern after >"); - str = pat->end; - op->op = VM_BEFORE; - op->len = 0; - op->args.pat = pat; - break; - } - // Parentheses - case '(': { - visualize(source, str, NULL); - // debug("Open paren (\n"); - free(op); - op = compile_bpeg(source, str); - check(op, "Expected pattern inside parentheses"); - op = expand_choices(source, op); - str = op->end; - str = after_spaces(str); - check(matchchar(&str, ')'), "Expected closing parenthesis"); - visualize(source, str, NULL); - // debug(")\n"); - break; - } - // Capture - case '@': { - //debug("Capture\n"); - visualize(source, str, "Capture"); - op->op = VM_CAPTURE; - str = after_spaces(str); - if (matchchar(&str, '[')) { - char *closing = strchr(str, ']'); - check(closing, "Expected closing ']'"); - op->args.capture.name = strndup(str, (size_t)(closing-str)); - visualize(source, str, "Named capture"); - //debug("named \"%s\"\n", op->args.capture.name); - str = closing; - check(matchchar(&str, ']'), "Expected closing ']'"); - } - vm_op_t *pat = compile_bpeg(source, str); - check(pat, "Expected pattern after @"); - str = pat->end; - op->args.capture.capture_pat = pat; - op->len = pat->len; - visualize(source, str, NULL); - break; - } - // Replacement - case '{': { - //debug("Replacement {\n"); - visualize(source, str, "Replacement"); - str = after_spaces(str); - vm_op_t *pat = NULL; - if (strncmp(str, "=>", 2) == 0) { - str += strlen("=>"); - } else { - pat = compile_bpeg(source, str); - check(pat, "Invalid pattern after '{'"); - pat = expand_choices(source, pat); - str = pat->end; - str = after_spaces(str); - check(matchchar(&str, '=') && matchchar(&str, '>'), - "Expected '=>' after pattern in replacement"); - } - visualize(source, str, NULL); - str = after_spaces(str); - - char quote = *str; - const char *replacement; - if (matchchar(&str, '}')) { - replacement = strdup(""); - visualize(source, str, NULL); - } else { - check(matchchar(&str, '"') || matchchar(&str, '\''), - "Expected string literal for replacement"); - replacement = str; - for (; *str && *str != quote; str++) { - if (*str == '\\') { - check(str[1], "Expected more string contents after backslash"); - ++str; - } - visualize(source, str, NULL); - } - replacement = strndup(replacement, (size_t)(str-replacement)); - check(matchchar(&str, quote), "Expected closing quote"); - check(matchchar(&str, '}'), "Expected a closing '}'"); - } - op->op = VM_REPLACE; - op->args.replace.replace_pat = pat; - op->args.replace.replacement = replacement; - //debug(" rep = \"%s\"\n", replacement); - //debug("}\n"); - if (pat != NULL) op->len = pat->len; - visualize(source, str, NULL); - break; - } - // Special rules: - case '_': case '^': case '$': { - if (matchchar(&str, c)) { // double __, ^^, $$ - char tmp[3] = {c, c, '\0'}; - op->args.s = strdup(tmp); - } else if (c == '$' && matchchar(&str, '.')) { // $. (multi-line anychar) - op->multiline = 1; - goto anychar; - } else { - op->args.s = strndup(&c, 1); - } - op->op = VM_REF; - visualize(source, str, op->args.s); - break; - } - // Empty choice (/) or {/} - case '/': { - str = after_spaces(str); - if (*str == ')' || *str == '}') { - op->op = VM_EMPTY; - } else { - free(op); - return NULL; - } - break; - } - default: { - // Reference - if (isalpha(c)) { - visualize(source, str, "Ref"); - --str; - const char *refname = str; - str = after_name(str); - op->op = VM_REF; - op->len = (size_t)(str - refname); - op->args.s = strndup(refname, op->len); - break; - } else { - visualize(source, str, "Finished"); - free(op); - return NULL; - } - } - } - op->end = str; - return op; -} - -/* - * Similar to compile_bpeg, except that the pattern begins with an implicit, unclosable quote. - */ -static vm_op_t *compile_bpeg_string(const char *source, const char *str) -{ - visualize(source, str, "Compiling string..."); - vm_op_t *ret = NULL; - while (*str) { - vm_op_t *strop = calloc(sizeof(vm_op_t), 1); - strop->start = str; - strop->len = 0; - strop->op = VM_STRING; - char *literal = (char*)str; - vm_op_t *interp = NULL; - for (; *str; str++) { - if (*str == '\\') { - check(str[1], "Expected more string contents after backslash"); - interp = compile_bpeg(source, str + 1); - check(interp != NULL, "No valid BPEG pattern detected after backslash"); - break; - } - visualize(source, str, "String literal"); - } - // End of string - size_t len = (size_t)(str - literal); - literal = strndup(literal, len); - len = unescape_string(literal, literal, len); - strop->len = len; - strop->args.s = literal; - strop->end = str; - - if (strop->len == 0) { - free(strop); - strop = NULL; - } else { - ret = chain_together(ret, strop); - } - if (interp) { - ret = chain_together(ret, interp); - str = interp->end; - // allow terminating seq - matchchar(&str, ';'); - } - } - return ret; -} - -static vm_op_t *compile_bpeg_replacement(vm_op_t *pat, const char *str) -{ - vm_op_t *op = calloc(sizeof(vm_op_t), 1); - op->op = VM_REPLACE; - op->start = pat->start; - op->len = pat->len; - op->args.replace.replace_pat = pat; - const char *replacement = str; - for (; *str; str++) { - if (*str == '\\') { - check(str[1], "Expected more string contents after backslash"); - ++str; - } - } - replacement = strndup(replacement, (size_t)(str-replacement)); - op->args.replace.replacement = replacement; - return op; -} - -static vm_op_t *add_def(const char *name, const char *source, vm_op_t *op) -{ - check(ndefs < (sizeof(defs)/sizeof(defs[0])), "Too many definitions!"); - defs[ndefs].name = name; - defs[ndefs].op = op; - defs[ndefs].source = source; - ++ndefs; - return op; -} - -static vm_op_t *load_def(const char *name, const char *source) -{ - vm_op_t *op = compile_bpeg(source, source); - source = strndup((char*)op->start, (int)(op->end - op->start)); - op = expand_choices(source, op); - return add_def(name, source, op); -} - -static void load_defs(void) -{ - // Approximately these are in least-to-most used order so they will be - // found most efficiently by the lookup, which goes in reverse order. - load_def("crlf", "\\r\\n"); - load_def("cr", "\\r"); load_def("r", "\\r"); - load_def("anglebraces", "`< *(anglebraces / ~~`>) `>"); - load_def("brackets", "`[ *(brackets / ~~`]) `]"); - load_def("braces", "`{ *(braces / ~~`}) `}"); - load_def("parens", "`( *(parens / ~~`)) `)"); - load_def("id", "(`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9)"); - load_def("HEX", "`0-9/`A-F"); - load_def("Hex", "`0-9/`a-f/`A-F"); - load_def("hex", "`0-9/`a-f"); - load_def("number", "+`0-9 ?(`. *`0-9) / `. +`0-9"); - load_def("int", "+`0-9"); - load_def("digit", "`0-9"); - load_def("Abc", "`a-z/`A-Z"); - load_def("ABC", "`A-Z"); - load_def("abc", "`a-z"); - load_def("esc", "\\e"); load_def("e", "\\e"); - load_def("tab", "\\t"); load_def("t", "\\t"); - load_def("nl", "\\n"); load_def("lf", "\\n"); load_def("n", "\\n"); - load_def("c-block-comment", "'/*' &&'*/'"); - load_def("c-line-comment", "'//' &$"); - load_def("c-comment", "c-line-comment / c-block-comment"); - load_def("hash-comment", "`# &$"); - load_def("comment", "!(/)"); // undefined by default - load_def("WS", "` /\\t/\\n/\\r/comment"); - load_def("ws", "` /\\t"); - load_def("$$", "!$."); - load_def("$", "!."); - load_def("^^", "!<$."); - load_def("^", "!<."); - load_def("__", "*(` /\\t/\\n/\\r/comment)"); - load_def("_", "*(` /\\t)"); -} - -static match_t *get_capture_n(match_t *m, int *n) -{ - if (!m) return NULL; - if (*n == 0) return m; - if (m->is_capture && *n == 1) return m; - if (m->is_capture) --(*n); - for (match_t *c = m->child; c; c = c->nextsibling) { - match_t *cap = get_capture_n(c, n); - if (cap) return cap; - } - return NULL; -} - -static match_t *get_capture_named(match_t *m, const char *name) -{ - if (m->is_capture && m->name_or_replacement && streq(m->name_or_replacement, name)) - return m; - for (match_t *c = m->child; c; c = c->nextsibling) { - match_t *cap = get_capture_named(c, name); - if (cap) return cap; - } - return NULL; -} - -static void print_match(match_t *m, const char *color) -{ - if (m->is_replacement) { - printf("\033[0;34m"); - for (const char *r = m->name_or_replacement; *r; ) { - if (*r == '\\') { - fputc(unescapechar(r, &r), stdout); - continue; - } else if (*r != '@') { - fputc(*r, stdout); - ++r; - continue; - } - - ++r; - match_t *cap = NULL; - switch (*r) { - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': { - int n = (int)strtol(r, (char**)&r, 10); - cap = get_capture_n(m->child, &n); - break; - } - case '[': { - char *closing = strchr(r+1, ']'); - if (!closing) { - fputc('@', stdout); - break; - } - ++r; - char *name = strndup(r, (size_t)(closing-r)); - cap = get_capture_named(m, name); - free(name); - r = closing + 1; - break; - } - default: { - fputc('@', stdout); - break; - } - } - if (cap != NULL) { - print_match(cap, "\033[0;35m"); - printf("\033[0;34m"); - } - } - } else { - const char *name = m->name_or_replacement; - if (verbose && m->is_ref && name && isupper(name[0])) - printf("\033[0;2;35m{%s:", name); - //if (m->is_capture && name) - // printf("\033[0;2;33m[%s:", name); - const char *prev = m->start; - for (match_t *child = m->child; child; child = child->nextsibling) { - if (child->start > prev) - printf("%s%.*s", color, (int)(child->start - prev), prev); - print_match(child, m->is_capture ? "\033[0;1m" : color); - prev = child->end; - } - if (m->end > prev) - printf("%s%.*s", color, (int)(m->end - prev), prev); - if (verbose && m->is_ref && name && isupper(name[0])) - printf("\033[0;2;35m}"); - //if (m->is_capture && name) - // printf("\033[0;2;33m]"); - } -} - -/* - * Read an entire file into memory. - */ -static char *readfile(int fd) -{ - size_t capacity = 1000, len = 0; - char *buf = calloc(sizeof(char), capacity+1); - ssize_t just_read; - while ((just_read=read(fd, &buf[len], capacity-len)) > 0) { - len += (size_t)just_read; - if (len >= capacity) - buf = realloc(buf, (capacity *= 2)); - } - return buf; -} - -static void print_grammar(vm_op_t *op) -{ - switch (op->op) { - case VM_REF: fprintf(stderr, "a $%s", op->args.s); break; - case VM_EMPTY: fprintf(stderr, "the empty string"); break; - case VM_ANYCHAR: fprintf(stderr, "any char"); break; - case VM_STRING: fprintf(stderr, "string \"%s\"", op->args.s); break; - case VM_RANGE: { - fprintf(stderr, "char from %c-%c", op->args.range.low, op->args.range.high); - break; - } - case VM_REPEAT: { - if (op->args.repetitions.max == -1) - fprintf(stderr, "%ld or more (", op->args.repetitions.min); - else - fprintf(stderr, "%ld-%ld of (", - op->args.repetitions.min, - op->args.repetitions.max); - print_grammar(op->args.repetitions.repeat_pat); - fprintf(stderr, ")"); - if (op->args.repetitions.sep) { - fprintf(stderr, " separated by ("); - print_grammar(op->args.repetitions.sep); - fprintf(stderr, ")"); - } - break; - } - case VM_NOT: { - fprintf(stderr, "not ("); - print_grammar(op->args.pat); - fprintf(stderr, ")"); - break; - } - case VM_UPTO_AND: { - fprintf(stderr, "text up to and including ("); - print_grammar(op->args.pat); - fprintf(stderr, ")"); - break; - } - case VM_ANYTHING_BUT: { - fprintf(stderr, "anything but ("); - print_grammar(op->args.pat); - fprintf(stderr, ")"); - break; - } - case VM_AFTER: { - fprintf(stderr, "after ("); - print_grammar(op->args.pat); - fprintf(stderr, ")"); - break; - } - case VM_BEFORE: { - fprintf(stderr, "before ("); - print_grammar(op->args.pat); - fprintf(stderr, ")"); - break; - } - case VM_CAPTURE: { - fprintf(stderr, "capture ("); - print_grammar(op->args.pat); - fprintf(stderr, ")"); - if (op->args.capture.name) - fprintf(stderr, " and call it %s", op->args.capture.name); - break; - } - case VM_OTHERWISE: { - fprintf(stderr, "("); - print_grammar(op->args.multiple.first); - fprintf(stderr, ") or else "); - if (op->args.multiple.second->op != VM_OTHERWISE) - fprintf(stderr, "("); - print_grammar(op->args.multiple.second); - if (op->args.multiple.second->op != VM_OTHERWISE) - fprintf(stderr, ")"); - break; - } - case VM_CHAIN: { - fprintf(stderr, "("); - print_grammar(op->args.multiple.first); - fprintf(stderr, ") then "); - if (op->args.multiple.second->op != VM_CHAIN) - fprintf(stderr, "("); - print_grammar(op->args.multiple.second); - if (op->args.multiple.second->op != VM_CHAIN) - fprintf(stderr, ")"); - break; - } - case VM_REPLACE: { - fprintf(stderr, "replace "); - if (op->args.replace.replace_pat) { - fprintf(stderr, "("); - print_grammar(op->args.replace.replace_pat); - fprintf(stderr, ")"); - } else - fprintf(stderr, "\"\""); - fprintf(stderr, " with \"%s\"", op->args.replace.replacement); - break; - } - default: break; - } -} - -static vm_op_t *load_grammar(const char *grammar) -{ - vm_op_t *op = compile_bpeg(grammar, grammar); - check(op, "Failed to compile_bpeg input"); - op = expand_choices(grammar, op); - - const char *defs = op->end; - while (matchchar(&defs, ';')) { - if (verbose) fprintf(stderr, "\n"); - defs = after_spaces(defs); - const char *name = defs; - if (strncmp(name, "^^", 2) == 0 || - strncmp(name, "__", 2) == 0 || - strncmp(name, "$$", 2) == 0) { - name = strndup(name, 2); - defs += 2; - } else if (*name == '^' || *name == '_' || *name == '$') { - name = strndup(name, 1); - defs += 1; - } else { - defs = after_name(defs); - if (defs == NULL) break; - name = strndup(name, (size_t)(defs-name)); - } - defs = after_spaces(defs); - check(matchchar(&defs, '='), "Expected '=' in definition"); - vm_op_t *def = load_def(name, defs); - check(def, "Couldn't load definition"); - defs = def->end; - } - return op; -} int main(int argc, char *argv[]) { + int verbose = 0; const char *pattern = NULL, *replacement = NULL, *grammarfile = NULL, *infile = NULL; + grammar_t *g = new_grammar(); + for (int i = 1; i < argc; i++) { if (streq(argv[i], "--help") || streq(argv[i], "-h")) { printf("%s\n", usage); @@ -1122,8 +71,6 @@ int main(int argc, char *argv[]) verbose = 1; } else if (streq(argv[i], "--replace") || streq(argv[i], "-r")) { replacement = argv[++i]; - } else if (streq(argv[i], "--slow") || streq(argv[i], "-s")) { - visualize_delay = 100000; } else if (streq(argv[i], "--grammar") || streq(argv[i], "-g")) { grammarfile = argv[++i]; } else if (streq(argv[i], "--define") || streq(argv[i], "-d")) { @@ -1131,7 +78,10 @@ int main(int argc, char *argv[]) char *eq = strchr(def, '='); check(eq, usage); *eq = '\0'; - load_def(def, ++eq); + char *src = ++eq; + vm_op_t *pat = bpeg_pattern(src); + check(pat, "Failed to compile pattern"); + add_def(g, src, def, pat); } else if (pattern == NULL) { pattern = argv[i]; } else if (infile == NULL) { @@ -1140,16 +90,7 @@ int main(int argc, char *argv[]) } check(pattern != NULL || grammarfile != NULL, usage); - if (verbose) fprintf(stderr, "====== Loading definitions ======\n\n\n\n"); - { - int tmp1 = visualize_delay, tmp2 = verbose; - visualize_delay = -1, verbose = 0; - load_defs(); - visualize_delay = tmp1, verbose = tmp2; - } - if (verbose) fprintf(stderr, "========== Compiling ===========\n\n\n\n"); - vm_op_t *op; if (grammarfile) { // load grammar from a file (semicolon mode) char *grammar; @@ -1160,24 +101,21 @@ int main(int argc, char *argv[]) check(fd >= 0, "Couldn't open file: %s", argv[2]); grammar = readfile(fd); } - op = load_grammar(grammar); + load_grammar(g, grammar); } else { // load grammar in start-with-string mode: - vm_op_t *pat = compile_bpeg_string(pattern, pattern); - if (replacement) { - pat = compile_bpeg_replacement(pat, replacement); - } + vm_op_t *pat = bpeg_stringpattern(pattern); + if (replacement) + pat = bpeg_replacement(pat, replacement); - add_def("pattern", pattern, pat); + add_def(g, pattern, "pattern", pat); - const char *grammar = "*(@pattern / \\n / .)"; - op = compile_bpeg(grammar, grammar); + const char *grammar = "find = *(@pattern / \\n / .);"; + load_grammar(g, grammar); } if (verbose) { - fprintf(stderr, "\n\n"); - print_grammar(op); - fprintf(stderr, "\n\n"); + print_pattern(g->pattern); } char *input; @@ -1194,12 +132,12 @@ int main(int argc, char *argv[]) stpcpy(&lpadded[1], input); input = &lpadded[1]; - match_t *m = match(input, op); + match_t *m = match(g, input, g->pattern); if (m == NULL) { printf("No match\n"); return 1; } else { - print_match(m, "\033[0m"); + print_match(m, "\033[0m", verbose); printf("\033[0;2m%s\n", m->end); } diff --git a/bpeg.h b/bpeg.h deleted file mode 100644 index 459bd65..0000000 --- a/bpeg.h +++ /dev/null @@ -1,112 +0,0 @@ -/* - * bpeg.h - Header file for the bpeg parser - */ -#include -#include -#include -#include -#include -#include - -#include "utils.h" - -const char *usage = ( - "Usage:\n" - " bpeg [flags] [...]\n\n" - "Flags:\n" - " -h --help\t print the usage and quit\n" - " -v --verbose\t print verbose debugging info\n" - " -s --slow\t run in slow mode for debugging\n" - " -r --replace replace the input pattern with the given replacement\n" - " -g --grammar use the specified file as a grammar\n"); - -/* - * BPEG virtual machine opcodes - */ -enum VMOpcode { - VM_EMPTY = 0, - VM_ANYCHAR = 1, - VM_ANYTHING_BUT, - VM_STRING, - VM_RANGE, - VM_NOT, - VM_UPTO_AND, - VM_REPEAT, - VM_BEFORE, - VM_AFTER, - VM_CAPTURE, - VM_OTHERWISE, - VM_CHAIN, - VM_REPLACE, - VM_REF, -}; - -/* - * A struct reperesenting a BPEG virtual machine operation - */ -typedef struct vm_op_s { - enum VMOpcode op; - unsigned int multiline:1; - const char *start, *end; - // Length of the match, if constant, otherwise -1 - ssize_t len; - union { - const char *s; - struct { - char low, high; - } range; - struct { - ssize_t min, max; - struct vm_op_s *sep, *repeat_pat; - } repetitions; - // TODO: use a linked list instead of a binary tree - struct { - struct vm_op_s *first, *second; - } multiple; - struct { - struct vm_op_s *replace_pat; - const char *replacement; - } replace; - struct { - struct vm_op_s *capture_pat; - char *name; - } capture; - struct vm_op_s *pat; - } args; -} vm_op_t; - -/* - * Pattern matching result object - */ -typedef struct match_s { - // Where the match starts and ends (end is after the last character) - const char *start, *end; - unsigned int is_capture:1, is_replacement:1, is_ref:1; - const char *name_or_replacement; - struct match_s *child, *nextsibling; - vm_op_t *op; -} match_t; - -static inline const char *after_spaces(const char *str); -static match_t *free_match(match_t *m); -static match_t *match(const char *str, vm_op_t *op); -static vm_op_t *compile_bpeg(const char *source, const char *str); -static vm_op_t *load_grammar(const char *grammar); -static vm_op_t *add_def(const char *name, const char *source, vm_op_t *op); -static vm_op_t *load_def(const char *name, const char *source); -static vm_op_t *chain_together(vm_op_t *first, vm_op_t *second); -static vm_op_t *compile_bpeg_string(const char *source, const char *str); -static vm_op_t *expand_chain(const char *source, vm_op_t *first); -static vm_op_t *expand_choices(const char *source, vm_op_t *op); -static void print_match(match_t *m, const char *color); -static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep); - - -typedef struct { - const char *name; - const char *source; - vm_op_t *op; -} def_t; - -static def_t defs[1024] = {{NULL, NULL, NULL}}; -size_t ndefs = 0; diff --git a/compiler.c b/compiler.c new file mode 100644 index 0000000..88919cf --- /dev/null +++ b/compiler.c @@ -0,0 +1,461 @@ +/* + * compiler.c - Compile strings into BPEG virtual machine code. + */ + +#include "compiler.h" +#include "utils.h" + +static vm_op_t *expand_chain(vm_op_t *first); +static vm_op_t *expand_choices(vm_op_t *first); +static vm_op_t *chain_together(vm_op_t *first, vm_op_t *second); +static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep); + +/* + * Helper function to initialize a range object. + */ +static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep) +{ + op->op = VM_REPEAT; + if (pat->len >= 0 && (sep == NULL || sep->len >= 0) && min == max && min >= 0) + op->len = pat->len * min + (sep == NULL || min == 0 ? 0 : sep->len * (min-1)); + else + op->len = -1; + op->args.repetitions.min = min; + op->args.repetitions.max = max; + op->args.repetitions.repeat_pat = pat; + op->args.repetitions.sep = sep; +} + +/* + * Take an opcode and expand it into a chain of patterns if it's + * followed by any patterns (e.g. "`x `y"), otherwise return + * the original input. + */ +static vm_op_t *expand_chain(vm_op_t *first) +{ + vm_op_t *second = bpeg_simplepattern(first->end); + if (second == NULL) return first; + second = expand_chain(second); + check(second->end > first->end, "No forward progress in chain!"); + return chain_together(first, second); +} + +/* + * Take an opcode and expand it into a chain of choices if it's + * followed by any "/"-separated patterns (e.g. "`x/`y"), otherwise + * return the original input. + */ +static vm_op_t *expand_choices(vm_op_t *first) +{ + first = expand_chain(first); + const char *str = first->end; + if (!matchchar(&str, '/')) return first; + vm_op_t *second = bpeg_simplepattern(str); + check(second, "Expected pattern after '/'"); + second = expand_choices(second); + vm_op_t *choice = calloc(sizeof(vm_op_t), 1); + choice->op = VM_OTHERWISE; + choice->start = first->start; + if (first->len == second->len) + choice->len = first->len; + else choice->len = -1; + choice->end = second->end; + choice->args.multiple.first = first; + choice->args.multiple.second = second; + return choice; +} + +static vm_op_t *chain_together(vm_op_t *first, vm_op_t *second) +{ + if (first == NULL) return second; + if (second == NULL) return first; + vm_op_t *chain = calloc(sizeof(vm_op_t), 1); + chain->op = VM_CHAIN; + chain->start = first->start; + if (first->len >= 0 && second->len >= 0) + chain->len = first->len + second->len; + else chain->len = -1; + chain->end = second->end; + chain->args.multiple.first = first; + chain->args.multiple.second = second; + return chain; +} + +/* + * Compile a string of BPEG code into virtual machine opcodes + */ +vm_op_t *bpeg_simplepattern(const char *str) +{ + if (!*str) return NULL; + str = after_spaces(str); + check(*str, "Expected a pattern"); + vm_op_t *op = calloc(sizeof(vm_op_t), 1); + op->start = str; + op->len = -1; + char c = *str; + ++str; + switch (c) { + // Any char (dot) ($. is multiline anychar) + case '.': { + anychar: + op->op = VM_ANYCHAR; + op->len = 1; + break; + } + // Char literals + case '`': { + char literal[2] = {*str, '\0'}; + ++str; + check(literal[0], "Expected character after '`'\n"); + op->len = 1; + if (matchchar(&str, '-')) { // Range + char c2 = *str; + check(c2, "Expected character after '-'"); + check(c2 >= literal[0], "Character range must be low-to-high"); + op->op = VM_RANGE; + op->args.range.low = literal[0]; + op->args.range.high = c2; + ++str; + } else { + op->op = VM_STRING; + op->args.s = strdup(literal); + } + break; + } + // Escapes + case '\\': { + check(*str, "Expected escape after '\\'"); + op->len = 1; + char e = unescapechar(str, &str); + if (*str == '-') { // Escape range (e.g. \x00-\xFF) + ++str; + char e2 = unescapechar(str, &str); + check(e2, "Expected character after '-'"); + check(e2 >= e, "Character range must be low-to-high"); + op->op = VM_RANGE; + op->args.range.low = e; + op->args.range.high = e2; + } else { + char literal[2] = {e, '\0'}; + op->op = VM_STRING; + op->args.s = strdup(literal); + } + break; + } + // String literal + case '"': case '\'': case '\002': { + char endquote = c == '\002' ? '\003' : c; + char *literal = (char*)str; + for (; *str && *str != endquote; str++) { + if (*str == '\\') { + check(str[1], "Expected more string contents after backslash"); + ++str; + } + } + size_t len = (size_t)(str - literal); + literal = strndup(literal, len); + len = unescape_string(literal, literal, len); + + op->op = VM_STRING; + op->len = len; + op->args.s = literal; + + check(matchchar(&str, endquote), "Missing closing quote"); + break; + } + // Not + case '!': { + vm_op_t *p = bpeg_simplepattern(str); + check(p, "Expected pattern after '!'\n"); + str = p->end; + op->op = VM_NOT; + op->len = 0; + op->args.pat = p; + break; + } + // Anything but + case '~': { + if (matchchar(&str, '~')) op->multiline = 1; + vm_op_t *p = bpeg_simplepattern(str); + check(p, "Expected pattern after '~'\n"); + str = p->end; + op->op = VM_ANYTHING_BUT; + op->len = -1; + op->args.pat = p; + break; + } + // Upto and including + case '&': { + if (matchchar(&str, '&')) op->multiline = 1; + vm_op_t *p = bpeg_simplepattern(str); + check(p, "Expected pattern after '&'\n"); + str = p->end; + op->op = VM_UPTO_AND; + op->len = -1; + op->args.pat = p; + break; + } + // Number of repetitions: (- / - / + / "") + case '0': case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': { + ssize_t min = -1, max = -1; + --str; + long n1 = strtol(str, (char**)&str, 10); + if (matchchar(&str, '-')) { + str = after_spaces(str); + const char *start = str; + long n2 = strtol(str, (char**)&str, 10); + if (str == start) min = 0, max = n1; + else min = n1, max = n2; + } else if (matchchar(&str, '+')) { + min = n1, max = -1; + } else { + min = n1, max = n1; + } + vm_op_t *pat = bpeg_simplepattern(str); + check(pat, "Expected pattern after repetition count"); + str = pat->end; + str = after_spaces(str); + if (matchchar(&str, '%')) { + vm_op_t *sep = bpeg_simplepattern(str); + check(sep, "Expected pattern for separator after '%%'"); + str = sep->end; + set_range(op, min, max, pat, sep); + } else { + set_range(op, min, max, pat, NULL); + } + break; + } + // Special repetitions: + case '+': case '*': case '?': { + ssize_t min = -1, max = -1; + switch (c) { + case '+': min = 1, max = -1; break; + case '*': min = 0, max = -1; break; + case '?': min = 0, max = 1; break; + } + vm_op_t *pat = bpeg_simplepattern(str); + check(pat, "Expected pattern after +"); + str = pat->end; + str = after_spaces(str); + if (matchchar(&str, '%')) { + vm_op_t *sep = bpeg_simplepattern(str); + check(sep, "Expected pattern for separator after '%%'"); + str = sep->end; + set_range(op, min, max, pat, sep); + } else { + set_range(op, min, max, pat, NULL); + } + break; + } + // Lookbehind + case '<': { + vm_op_t *pat = bpeg_simplepattern(str); + check(pat, "Expected pattern after <"); + str = pat->end; + check(pat->len != -1, "Lookbehind patterns must have a fixed length"); + str = pat->end; + op->op = VM_AFTER; + op->len = 0; + op->args.pat = pat; + break; + } + // Lookahead + case '>': { + vm_op_t *pat = bpeg_simplepattern(str); + check(pat, "Expected pattern after >"); + str = pat->end; + op->op = VM_BEFORE; + op->len = 0; + op->args.pat = pat; + break; + } + // Parentheses + case '(': { + free(op); + op = bpeg_simplepattern(str); + check(op, "Expected pattern inside parentheses"); + op = expand_choices(op); + str = op->end; + str = after_spaces(str); + check(matchchar(&str, ')'), "Expected closing parenthesis"); + break; + } + // Capture + case '@': { + op->op = VM_CAPTURE; + str = after_spaces(str); + if (matchchar(&str, '[')) { + char *closing = strchr(str, ']'); + check(closing, "Expected closing ']'"); + op->args.capture.name = strndup(str, (size_t)(closing-str)); + str = closing; + check(matchchar(&str, ']'), "Expected closing ']'"); + } + vm_op_t *pat = bpeg_simplepattern(str); + check(pat, "Expected pattern after @"); + str = pat->end; + op->args.capture.capture_pat = pat; + op->len = pat->len; + break; + } + // Replacement + case '{': { + str = after_spaces(str); + vm_op_t *pat = NULL; + if (strncmp(str, "=>", 2) == 0) { + str += strlen("=>"); + } else { + pat = bpeg_simplepattern(str); + check(pat, "Invalid pattern after '{'"); + pat = expand_choices(pat); + str = pat->end; + str = after_spaces(str); + check(matchchar(&str, '=') && matchchar(&str, '>'), + "Expected '=>' after pattern in replacement"); + } + str = after_spaces(str); + + char quote = *str; + const char *replacement; + if (matchchar(&str, '}')) { + replacement = strdup(""); + } else { + check(matchchar(&str, '"') || matchchar(&str, '\''), + "Expected string literal for replacement"); + replacement = str; + for (; *str && *str != quote; str++) { + if (*str == '\\') { + check(str[1], "Expected more string contents after backslash"); + ++str; + } + } + replacement = strndup(replacement, (size_t)(str-replacement)); + check(matchchar(&str, quote), "Expected closing quote"); + check(matchchar(&str, '}'), "Expected a closing '}'"); + } + op->op = VM_REPLACE; + op->args.replace.replace_pat = pat; + op->args.replace.replacement = replacement; + if (pat != NULL) op->len = pat->len; + break; + } + // Special rules: + case '_': case '^': case '$': { + if (matchchar(&str, c)) { // double __, ^^, $$ + char tmp[3] = {c, c, '\0'}; + op->args.s = strdup(tmp); + } else if (c == '$' && matchchar(&str, '.')) { // $. (multi-line anychar) + op->multiline = 1; + goto anychar; + } else { + op->args.s = strndup(&c, 1); + } + op->op = VM_REF; + break; + } + // Empty choice (/) or {/} + case '/': { + str = after_spaces(str); + if (*str == ')' || *str == '}') { + op->op = VM_EMPTY; + } else { + free(op); + return NULL; + } + break; + } + default: { + // Reference + if (isalpha(c)) { + --str; + const char *refname = str; + str = after_name(str); + op->op = VM_REF; + op->len = (size_t)(str - refname); + op->args.s = strndup(refname, op->len); + break; + } else { + free(op); + return NULL; + } + } + } + op->end = str; + return op; +} + +/* + * Similar to bpeg_simplepattern, except that the pattern begins with an implicit, unclosable quote. + */ +vm_op_t *bpeg_stringpattern(const char *str) +{ + vm_op_t *ret = NULL; + while (*str) { + vm_op_t *strop = calloc(sizeof(vm_op_t), 1); + strop->start = str; + strop->len = 0; + strop->op = VM_STRING; + char *literal = (char*)str; + vm_op_t *interp = NULL; + for (; *str; str++) { + if (*str == '\\') { + check(str[1], "Expected more string contents after backslash"); + interp = bpeg_simplepattern(str + 1); + check(interp != NULL, "No valid BPEG pattern detected after backslash"); + break; + } + } + // End of string + size_t len = (size_t)(str - literal); + literal = strndup(literal, len); + len = unescape_string(literal, literal, len); + strop->len = len; + strop->args.s = literal; + strop->end = str; + + if (strop->len == 0) { + free(strop); + strop = NULL; + } else { + ret = chain_together(ret, strop); + } + if (interp) { + ret = chain_together(ret, interp); + str = interp->end; + // allow terminating seq + matchchar(&str, ';'); + } + } + return ret; +} + +/* + * Given a pattern and a replacement string, compile the two into a replacement + * VM opcode. + */ +vm_op_t *bpeg_replacement(vm_op_t *pat, const char *replacement) +{ + vm_op_t *op = calloc(sizeof(vm_op_t), 1); + op->op = VM_REPLACE; + op->start = pat->start; + op->len = pat->len; + op->args.replace.replace_pat = pat; + const char *p = replacement; + for (; *p; p++) { + if (*p == '\\') { + check(p[1], "Expected more string contents after backslash"); + ++p; + } + } + replacement = strndup(replacement, (size_t)(p-replacement)); + op->args.replace.replacement = replacement; + return op; +} + +vm_op_t *bpeg_pattern(const char *str) +{ + vm_op_t *op = bpeg_simplepattern(str); + if (op != NULL) op = expand_choices(op); + return op; +} diff --git a/compiler.h b/compiler.h new file mode 100644 index 0000000..916d8fe --- /dev/null +++ b/compiler.h @@ -0,0 +1,16 @@ +/* + * compiler.h - Header file for BPEG compiler. + */ +#ifndef COMPILER__H +#define COMPILER__H + +#include + +#include "types.h" + +vm_op_t *bpeg_simplepattern(const char *str); +vm_op_t *bpeg_stringpattern(const char *str); +vm_op_t *bpeg_replacement(vm_op_t *pat, const char *replacement); +vm_op_t *bpeg_pattern(const char *str); + +#endif diff --git a/grammar.c b/grammar.c new file mode 100644 index 0000000..c04587b --- /dev/null +++ b/grammar.c @@ -0,0 +1,93 @@ +/* + * grammar.c - Code for defining grammars (sets of rules) + */ + +#include "grammar.h" +#include "compiler.h" +#include "utils.h" + +const char *BPEG_BUILTIN_GRAMMAR = ( + "crlf=\\r\\n;\n" + "cr=\\r;\n" "r=\\r;\n" + "anglebraces=`< *(anglebraces / ~~`>) `>;\n" + "brackets=`[ *(brackets / ~~`]) `];\n" + "braces=`{ *(braces / ~~`}) `};\n" + "parens=`( *(parens / ~~`)) `);\n" + "id=(`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9);\n" + "HEX=`0-9/`A-F;\n" + "Hex=`0-9/`a-f/`A-F;\n" + "hex=`0-9/`a-f;\n" + "number=+`0-9 ?(`. *`0-9) / `. +`0-9;\n" + "int=+`0-9;\n" + "digit=`0-9;\n" + "Abc=`a-z/`A-Z;\n" + "ABC=`A-Z;\n" + "abc=`a-z;\n" + "esc=\\e;\n" "e=\\e;\n" + "tab=\\t;\n" "t=\\t;\n" + "nl=\\n;\n" "lf=\\n;\n" "n=\\n;\n" + "c-block-comment='/*' &&'*/';\n" + "c-line-comment='//' &$;\n" + "c-comment=c-line-comment / c-block-comment;\n" + "hash-comment=`# &$;\n" + "comment=!(/);\n" // No default definition, can be overridden + "WS=` /\\t/\\n/\\r/comment;\n" + "ws=` /\\t;\n" + "$$=!$.;\n" + "$=!.;\n" + "^^=!<$.;\n" + "^=!<.;\n" + "__=*(` /\\t/\\n/\\r/comment);\n" + "_=*(` /\\t);\n" + ); + +grammar_t *new_grammar(void) +{ + grammar_t *g = calloc(sizeof(grammar_t), 1); + g->definitions = calloc(sizeof(def_t), (g->capacity = 128)); + load_grammar(g, BPEG_BUILTIN_GRAMMAR); + return g; +} + +void add_def(grammar_t *g, const char *src, const char *name, vm_op_t *op) +{ + if (g->size >= g->capacity) { + g->definitions = realloc(g->definitions, (g->capacity += 32)); + } + int i = g->size; + g->definitions[i].source = src; + g->definitions[i].name = name; + g->definitions[i].op = op; + ++g->size; +} + +void load_grammar(grammar_t *g, const char *src) +{ + vm_op_t *mainpat = NULL; + do { + src = after_spaces(src); + if (!*src) break; + const char *name = src; + const char *name_end = after_name(name); + check(name_end > name, "Invalid name for definition"); + name = strndup(name, (size_t)(name_end-name)); + src = after_spaces(name_end); + check(matchchar(&src, '='), "Expected '=' in definition"); + vm_op_t *op = bpeg_pattern(src); + check(op, "Couldn't load definition"); + add_def(g, src, name, op); + if (mainpat == NULL) { + mainpat = op; + g->pattern = op; + } + src = op->end; + } while (*src && matchchar(&src, ';')); +} + +/* + * Print a BPEG grammar in human-readable form. + */ +void print_grammar(grammar_t *g) +{ + if (g->pattern) print_pattern(g->pattern); +} diff --git a/grammar.h b/grammar.h new file mode 100644 index 0000000..90cc3f2 --- /dev/null +++ b/grammar.h @@ -0,0 +1,18 @@ +/* + * grammar.h - Header file defining grammars (sets of rule definitions) + */ +#ifndef GRAMMAR__H +#define GRAMMAR__H + +#include +#include + +#include "types.h" + +grammar_t *new_grammar(void); +void add_def(grammar_t *g, const char *src, const char *name, vm_op_t *op); +void load_grammar(grammar_t *g, const char *source); +void print_grammar(grammar_t *g); + + +#endif diff --git a/types.h b/types.h new file mode 100644 index 0000000..ef62cdc --- /dev/null +++ b/types.h @@ -0,0 +1,89 @@ +/* + * types.h - Datatypes used by BPEG + */ +#ifndef TYPES__H +#define TYPES__H + +#include + +/* + * BPEG virtual machine opcodes + */ +enum VMOpcode { + VM_EMPTY = 0, + VM_ANYCHAR = 1, + VM_ANYTHING_BUT, + VM_STRING, + VM_RANGE, + VM_NOT, + VM_UPTO_AND, + VM_REPEAT, + VM_BEFORE, + VM_AFTER, + VM_CAPTURE, + VM_OTHERWISE, + VM_CHAIN, + VM_REPLACE, + VM_REF, +}; + +/* + * A struct reperesenting a BPEG virtual machine operation + */ +typedef struct vm_op_s { + enum VMOpcode op; + unsigned int multiline:1; + const char *start, *end; + // Length of the match, if constant, otherwise -1 + ssize_t len; + union { + const char *s; + struct { + char low, high; + } range; + struct { + ssize_t min, max; + struct vm_op_s *sep, *repeat_pat; + } repetitions; + // TODO: use a linked list instead of a binary tree + struct { + struct vm_op_s *first, *second; + } multiple; + struct { + struct vm_op_s *replace_pat; + const char *replacement; + } replace; + struct { + struct vm_op_s *capture_pat; + char *name; + } capture; + struct vm_op_s *pat; + } args; +} vm_op_t; + +/* + * Pattern matching result object + */ +typedef struct match_s { + // Where the match starts and ends (end is after the last character) + const char *start, *end; + unsigned int is_capture:1, is_replacement:1, is_ref:1; + const char *name_or_replacement; + struct match_s *child, *nextsibling; + vm_op_t *op; +} match_t; + + +typedef struct { + const char *name; + const char *source; + vm_op_t *op; +} def_t; + +typedef struct { + vm_op_t *pattern; + def_t *definitions; + size_t size, capacity; +} grammar_t; + +#endif diff --git a/utils.c b/utils.c new file mode 100644 index 0000000..4fdec79 --- /dev/null +++ b/utils.c @@ -0,0 +1,171 @@ +/* + * utils.c - Some helper code for debugging and error logging. + */ +#include "utils.h" + +/* + * Helper function to skip past all spaces (and comments) + * Returns a pointer to the first non-space character. + */ +const char *after_spaces(const char *str) +{ + // Skip whitespace and comments: + skip_whitespace: + switch (*str) { + case ' ': case '\r': case '\n': case '\t': { + ++str; + goto skip_whitespace; + } + case '#': { + while (*str && *str != '\n') ++str; + goto skip_whitespace; + } + } + return str; +} + +/* + * Return the first character after a valid BPEG name, or NULL if none is + * found. + */ +const char *after_name(const char *str) +{ + if (*str == '^' || *str == '_' || *str == '$') { + return (str[1] == *str) ? &str[2] : &str[1]; + } + if (!isalpha(*str)) return NULL; + for (++str; *str; ++str) { + if (!(isalnum(*str) || *str == '-')) + break; + } + return str; +} + +/* + * Check if a character is found and if so, move past it. + */ +int matchchar(const char **str, char c) +{ + *str = after_spaces(*str); + if (**str == c) { + ++(*str); + return 1; + } else { + return 0; + } +} + +/* + * Process a string escape sequence for a character and return the + * character that was escaped. + * Set *end = the first character past the end of the escape sequence. + */ +char unescapechar(const char *escaped, const char **end) +{ + size_t len = 1; + char ret = *escaped; + switch (*escaped) { + case 'a': ret = '\a'; break; case 'b': ret = '\b'; break; + case 'n': ret = '\n'; break; case 'r': ret = '\r'; break; + case 't': ret = '\t'; break; case 'v': ret = '\v'; break; + case 'e': ret = '\033'; break; + case 'x': { // Hex + static const char hextable[255] = { + ['0']=0x10, ['1']=0x1, ['2']=0x2, ['3']=0x3, ['4']=0x4, + ['5']=0x5, ['6']=0x6, ['7']=0x7, ['8']=0x8, ['9']=0x9, + ['a']=0xa, ['b']=0xb, ['c']=0xc, ['d']=0xd, ['e']=0xe, ['f']=0xf, + ['A']=0xa, ['B']=0xb, ['C']=0xc, ['D']=0xd, ['E']=0xe, ['F']=0xf, + }; + if (hextable[(int)escaped[1]] && hextable[(int)escaped[2]]) { + ret = (hextable[(int)escaped[1]] << 4) | (hextable[(int)escaped[2]] & 0xF); + len = 3; + } + break; + } + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // Octal + ret = escaped[0] - '0'; + if ('0' <= escaped[1] && escaped[1] <= '7') { + ++len; + ret = (ret << 3) | (escaped[1] - '0'); + if ('0' <= escaped[2] && escaped[2] <= '7') { + ++len; + ret = (ret << 3) | (escaped[2] - '0'); + } + } + break; + } + default: break; + } + *end = &escaped[len]; + return ret; +} + +/* + * Write an unescaped version of `src` to `dest` (at most bufsize-1 chars, + * terminated by a null byte) + */ +size_t unescape_string(char *dest, const char *src, size_t bufsize) +{ + size_t len = 0; +#define PUT(c) do { *(dest++) = (char)(c); ++len; } while (0) + for ( ; *src && len < bufsize; ++src) { + if (*src != '\\') { + PUT(*src); + continue; + } + ++src; + switch (*src) { + case 'a': PUT('\a'); break; case 'b': PUT('\b'); break; + case 'n': PUT('\n'); break; case 'r': PUT('\r'); break; + case 't': PUT('\t'); break; case 'v': PUT('\v'); break; + case 'e': PUT('\033'); break; + case 'x': { // Hex + static const char hextable[255] = { + ['0']=0x10, ['1']=0x1, ['2']=0x2, ['3']=0x3, ['4']=0x4, + ['5']=0x5, ['6']=0x6, ['7']=0x7, ['8']=0x8, ['9']=0x9, + ['a']=0xa, ['b']=0xb, ['c']=0xc, ['d']=0xd, ['e']=0xe, ['f']=0xf, + ['A']=0xa, ['B']=0xb, ['C']=0xc, ['D']=0xd, ['E']=0xe, ['F']=0xf, + }; + if (hextable[(int)src[1]] && hextable[(int)src[2]]) { + PUT((hextable[(int)src[1]] << 4) | (hextable[(int)src[2]] & 0xF)); + src += 2; + } + break; + } + case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // Octal + int c = *src - '0'; + if ('0' <= src[1] && src[1] <= '7') { + ++src; + c = (c << 3) | (*src - '0'); + if ('0' <= src[1] && src[1] <= '7' && (c << 3) < 256) { + ++src; + c = (c << 3) | (*src - '0'); + } + } + PUT(c); + break; + } + default: PUT(*src); break; + } + } + *dest = '\0'; + return len; +#undef PUT +} + +/* + * Read an entire file into memory. + */ +char *readfile(int fd) +{ + size_t capacity = 1000, len = 0; + char *buf = calloc(sizeof(char), capacity+1); + ssize_t just_read; + while ((just_read=read(fd, &buf[len], capacity-len)) > 0) { + len += (size_t)just_read; + if (len >= capacity) + buf = realloc(buf, (capacity *= 2)); + } + return buf; +} + diff --git a/utils.h b/utils.h index 714f17e..c2dd04a 100644 --- a/utils.h +++ b/utils.h @@ -1,120 +1,27 @@ /* - * utils.h - Some helper code for debugging and error logging. + * utils.h - Some utility and printing functions. */ +#ifndef UTILS__H +#define UTILS__H + +#include +#include +#include +#include +#include + +#include "vm.h" #define streq(a, b) (strcmp(a, b) == 0) // TODO: better error reporting #define check(cond, ...) do { if (!(cond)) { fprintf(stderr, __VA_ARGS__); fwrite("\n", 1, 1, stderr); _exit(1); } } while(0) #define debug(...) do { if (verbose) fprintf(stderr, __VA_ARGS__); } while(0) -static int verbose = 0; -static int visualize_delay = -1; - -/* - * Helper function to skip past all spaces (and comments) - * Returns a pointer to the first non-space character. - */ -static inline const char *after_spaces(const char *str) -{ - // Skip whitespace and comments: - skip_whitespace: - switch (*str) { - case ' ': case '\r': case '\n': case '\t': { - ++str; - goto skip_whitespace; - } - case '#': { - while (*str && *str != '\n') ++str; - goto skip_whitespace; - } - } - return str; -} - -static inline const char *after_name(const char *str) -{ - if (!isalpha(*str)) return NULL; - for (++str; *str; ++str) { - if (!(isalnum(*str) || *str == '-')) - break; - } - return str; -} +char *readfile(int fd); +char unescapechar(const char *escaped, const char **end); +const char *after_name(const char *str); +const char *after_spaces(const char *str); +int matchchar(const char **str, char c); +size_t unescape_string(char *dest, const char *src, size_t bufsize); -static inline int matchchar(const char **str, char c) -{ - *str = after_spaces(*str); - if (**str == c) { - ++(*str); - return 1; - } else { - return 0; - } -} - -static void visualize(const char *source, const char *ptr, const char *msg) -{ - if (!verbose) return; - fprintf(stderr, "\033[0;1m\r\033[2A\033[K%.*s\033[0;2m%s\033[0m\n", - (int)(ptr-source), source, ptr); - fprintf(stderr, "\033[0;1m"); - for (--ptr ; ptr > source; --ptr) putc(' ', stderr); - fprintf(stderr, "^\033[K\n"); - if (msg) - fprintf(stderr, "\033[K\033[33;1m%s\033[0m", msg); - if (visualize_delay > 0) - usleep(visualize_delay); -} - -/* - * Write an unescaped version of `src` to `dest` (at most bufsize-1 chars, - * terminated by a null byte) - */ -static size_t unescape_string(char *dest, const char *src, size_t bufsize) -{ - size_t len = 0; -#define PUT(c) do { *(dest++) = (char)(c); ++len; } while (0) - for ( ; *src && len < bufsize; ++src) { - if (*src != '\\') { - PUT(*src); - continue; - } - ++src; - switch (*src) { - case 'a': PUT('\a'); break; case 'b': PUT('\b'); break; - case 'n': PUT('\n'); break; case 'r': PUT('\r'); break; - case 't': PUT('\t'); break; case 'v': PUT('\v'); break; - case 'e': PUT('\033'); break; - case 'x': { // Hex - static const char hextable[255] = { - ['0']=0x10, ['1']=0x1, ['2']=0x2, ['3']=0x3, ['4']=0x4, - ['5']=0x5, ['6']=0x6, ['7']=0x7, ['8']=0x8, ['9']=0x9, - ['a']=0xa, ['b']=0xb, ['c']=0xc, ['d']=0xd, ['e']=0xe, ['f']=0xf, - ['A']=0xa, ['B']=0xb, ['C']=0xc, ['D']=0xd, ['E']=0xe, ['F']=0xf, - }; - if (hextable[(int)src[1]] && hextable[(int)src[2]]) { - PUT((hextable[(int)src[1]] << 4) | (hextable[(int)src[2]] & 0xF)); - src += 2; - } - break; - } - case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // Octal - int c = *src - '0'; - if ('0' <= src[1] && src[1] <= '7') { - ++src; - c = (c << 3) | (*src - '0'); - if ('0' <= src[1] && src[1] <= '7' && (c << 3) < 256) { - ++src; - c = (c << 3) | (*src - '0'); - } - } - PUT(c); - break; - } - default: PUT(*src); break; - } - } - *dest = '\0'; - return len; -#undef PUT -} +#endif diff --git a/vm.c b/vm.c new file mode 100644 index 0000000..50b136a --- /dev/null +++ b/vm.c @@ -0,0 +1,449 @@ +/* + * vm.c - Code for the BPEG virtual machine that performs the matching. + */ +#include "vm.h" +#include "utils.h" + +/* + * Recursively deallocate a match object and set to NULL + */ +void destroy_match(match_t **m) +{ + if (!m || !*m) return; + destroy_match(&((*m)->child)); + destroy_match(&((*m)->nextsibling)); + *m = NULL; +} + +/* + * Run virtual machine operation against a string and return + * a match struct, or NULL if no match is found. + * The returned value should be free()'d to avoid memory leaking. + */ +match_t *match(grammar_t *g, const char *str, vm_op_t *op) +{ + //tailcall: + switch (op->op) { + case VM_EMPTY: { + match_t *m = calloc(sizeof(match_t), 1); + m->op = op; + m->start = str; + m->end = str; + return m; + } + case VM_ANYCHAR: { + if (!*str || (!op->multiline && *str == '\n')) + return NULL; + match_t *m = calloc(sizeof(match_t), 1); + m->op = op; + m->start = str; + m->end = str+1; + return m; + } + case VM_STRING: { + if (strncmp(str, op->args.s, op->len) != 0) + return NULL; + match_t *m = calloc(sizeof(match_t), 1); + m->op = op; + m->start = str; + m->end = str + op->len; + return m; + } + case VM_RANGE: { + if (*str < op->args.range.low || *str > op->args.range.high) + return NULL; + match_t *m = calloc(sizeof(match_t), 1); + m->op = op; + m->start = str; + m->end = str + 1; + return m; + } + case VM_NOT: case VM_ANYTHING_BUT: { + if (op->op == VM_ANYTHING_BUT) + if (!*str || (!op->multiline && *str == '\n')) + return NULL; + match_t *m = match(g, str, op->args.pat); + if (m != NULL) { + destroy_match(&m); + return NULL; + } + m = calloc(sizeof(match_t), 1); + m->op = op; + m->start = str; + if (op->op == VM_ANYTHING_BUT) ++str; + m->end = str; + return m; + } + case VM_UPTO_AND: { + match_t *m = calloc(sizeof(match_t), 1); + m->start = str; + m->op = op; + match_t *p = NULL; + for (const char *prev = NULL; p == NULL && prev < str; ) { + prev = str; + p = match(g, str, op->args.pat); + if (*str && (op->multiline || *str != '\n')) + ++str; + } + if (p) { + m->end = p->end; + m->child = p; + return m; + } + destroy_match(&m); + return NULL; + } + case VM_REPEAT: { + match_t *m = calloc(sizeof(match_t), 1); + m->start = str; + m->end = str; + m->op = op; + if (op->args.repetitions.max == 0) return m; + + match_t **dest = &m->child; + + const char *prev = str; + size_t reps; + for (reps = 0; reps < (size_t)op->args.repetitions.max; ++reps) { + // Separator + match_t *sep = NULL; + if (op->args.repetitions.sep != NULL && reps > 0) { + sep = match(g, str, op->args.repetitions.sep); + if (sep == NULL) break; + str = sep->end; + } + match_t *p = match(g, str, op->args.repetitions.repeat_pat); + if (p == NULL || (p->end == prev && reps > 0)) { // Prevent infinite loops + destroy_match(&sep); + destroy_match(&p); + break; + } + if (sep) { + *dest = sep; + dest = &sep->nextsibling; + } + *dest = p; + dest = &p->nextsibling; + str = p->end; + prev = str; + } + + if ((ssize_t)reps < op->args.repetitions.min) { + destroy_match(&m); + return NULL; + } + m->end = str; + return m; + } + case VM_AFTER: { + ssize_t backtrack = op->args.pat->len; + check(backtrack != -1, "'<' is only allowed for fixed-length operations"); + // Check for necessary space: + for (int i = 0; i < backtrack; i++) { + if (str[-i] == '\0') return NULL; + } + match_t *before = match(g, str - backtrack, op->args.pat); + if (before == NULL) return NULL; + destroy_match(&before); + match_t *m = calloc(sizeof(match_t), 1); + m->start = str; + m->end = str; + m->op = op; + return m; + } + case VM_BEFORE: { + match_t *after = match(g, str, op->args.pat); + if (after == NULL) return NULL; + destroy_match(&after); + match_t *m = calloc(sizeof(match_t), 1); + m->start = str; + m->end = str; + m->op = op; + return m; + } + case VM_CAPTURE: { + match_t *p = match(g, str, op->args.pat); + if (p == NULL) return NULL; + match_t *m = calloc(sizeof(match_t), 1); + m->start = str; + m->end = p->end; + m->op = op; + m->child = p; + m->is_capture = 1; + if (op->args.capture.name) + m->name_or_replacement = op->args.capture.name; + return m; + } + case VM_OTHERWISE: { + match_t *m = match(g, str, op->args.multiple.first); + if (m == NULL) m = match(g, str, op->args.multiple.second); + return m; + } + case VM_CHAIN: { + match_t *m1 = match(g, str, op->args.multiple.first); + if (m1 == NULL) return NULL; + match_t *m2 = match(g, m1->end, op->args.multiple.second); + if (m2 == NULL) { + destroy_match(&m1); + return NULL; + } + match_t *m = calloc(sizeof(match_t), 1); + m->start = str; + m->end = m2->end; + m->op = op; + m->child = m1; + m1->nextsibling = m2; + return m; + } + case VM_REPLACE: { + match_t *m = calloc(sizeof(match_t), 1); + m->start = str; + m->op = op; + if (op->args.replace.replace_pat) { + match_t *p = match(g, str, op->args.replace.replace_pat); + if (p == NULL) return NULL; + m->child = p; + m->end = p->end; + } else { + m->end = m->start; + } + m->is_replacement = 1; + m->name_or_replacement = op->args.replace.replacement; + return m; + } + case VM_REF: { + // Search backwards so newer defs take precedence + for (int i = g->size-1; i >= 0; i--) { + if (streq(g->definitions[i].name, op->args.s)) { + // Bingo! + /* + op = g->definitions[i].op; + goto tailcall; + */ + match_t *p = match(g, str, g->definitions[i].op); + if (p == NULL) return NULL; + match_t *m = calloc(sizeof(match_t), 1); + m->start = p->start; + m->end = p->end; + m->op = op; + m->child = p; + m->name_or_replacement = g->definitions[i].name; + m->is_ref = 1; + return m; + } + } + check(0, "Unknown identifier: '%s'", op->args.s); + return NULL; + } + default: { + fprintf(stderr, "Unknown opcode: %d", op->op); + _exit(1); + return NULL; + } + } +} + + +void print_pattern(vm_op_t *op) +{ + switch (op->op) { + case VM_REF: fprintf(stderr, "a $%s", op->args.s); break; + case VM_EMPTY: fprintf(stderr, "the empty string"); break; + case VM_ANYCHAR: fprintf(stderr, "any char"); break; + case VM_STRING: fprintf(stderr, "string \"%s\"", op->args.s); break; + case VM_RANGE: { + fprintf(stderr, "char from %c-%c", op->args.range.low, op->args.range.high); + break; + } + case VM_REPEAT: { + if (op->args.repetitions.max == -1) + fprintf(stderr, "%ld or more (", op->args.repetitions.min); + else + fprintf(stderr, "%ld-%ld of (", + op->args.repetitions.min, + op->args.repetitions.max); + print_pattern(op->args.repetitions.repeat_pat); + fprintf(stderr, ")"); + if (op->args.repetitions.sep) { + fprintf(stderr, " separated by ("); + print_pattern(op->args.repetitions.sep); + fprintf(stderr, ")"); + } + break; + } + case VM_NOT: { + fprintf(stderr, "not ("); + print_pattern(op->args.pat); + fprintf(stderr, ")"); + break; + } + case VM_UPTO_AND: { + fprintf(stderr, "text up to and including ("); + print_pattern(op->args.pat); + fprintf(stderr, ")"); + break; + } + case VM_ANYTHING_BUT: { + fprintf(stderr, "anything but ("); + print_pattern(op->args.pat); + fprintf(stderr, ")"); + break; + } + case VM_AFTER: { + fprintf(stderr, "after ("); + print_pattern(op->args.pat); + fprintf(stderr, ")"); + break; + } + case VM_BEFORE: { + fprintf(stderr, "before ("); + print_pattern(op->args.pat); + fprintf(stderr, ")"); + break; + } + case VM_CAPTURE: { + fprintf(stderr, "capture ("); + print_pattern(op->args.pat); + fprintf(stderr, ")"); + if (op->args.capture.name) + fprintf(stderr, " and call it %s", op->args.capture.name); + break; + } + case VM_OTHERWISE: { + fprintf(stderr, "("); + print_pattern(op->args.multiple.first); + fprintf(stderr, ") or else "); + if (op->args.multiple.second->op != VM_OTHERWISE) + fprintf(stderr, "("); + print_pattern(op->args.multiple.second); + if (op->args.multiple.second->op != VM_OTHERWISE) + fprintf(stderr, ")"); + break; + } + case VM_CHAIN: { + fprintf(stderr, "("); + print_pattern(op->args.multiple.first); + fprintf(stderr, ") then "); + if (op->args.multiple.second->op != VM_CHAIN) + fprintf(stderr, "("); + print_pattern(op->args.multiple.second); + if (op->args.multiple.second->op != VM_CHAIN) + fprintf(stderr, ")"); + break; + } + case VM_REPLACE: { + fprintf(stderr, "replace "); + if (op->args.replace.replace_pat) { + fprintf(stderr, "("); + print_pattern(op->args.replace.replace_pat); + fprintf(stderr, ")"); + } else + fprintf(stderr, "\"\""); + fprintf(stderr, " with \"%s\"", op->args.replace.replacement); + break; + } + default: break; + } +} + +/* + * Get a specific numbered pattern capture. + */ +static match_t *get_capture_n(match_t *m, int *n) +{ + if (!m) return NULL; + if (*n == 0) return m; + if (m->is_capture && *n == 1) return m; + if (m->is_capture) --(*n); + for (match_t *c = m->child; c; c = c->nextsibling) { + match_t *cap = get_capture_n(c, n); + if (cap) return cap; + } + return NULL; +} + +/* + * Get a named capture. + */ +static match_t *get_capture_named(match_t *m, const char *name) +{ + if (m->is_capture && m->name_or_replacement && streq(m->name_or_replacement, name)) + return m; + for (match_t *c = m->child; c; c = c->nextsibling) { + match_t *cap = get_capture_named(c, name); + if (cap) return cap; + } + return NULL; +} + +/* + * Print a match with replacements and highlighting. + */ +void print_match(match_t *m, const char *color, int verbose) +{ + if (m->is_replacement) { + printf("\033[0;34m"); + for (const char *r = m->name_or_replacement; *r; ) { + if (*r == '\\') { + fputc(unescapechar(r, &r), stdout); + continue; + } else if (*r != '@') { + fputc(*r, stdout); + ++r; + continue; + } + + ++r; + match_t *cap = NULL; + switch (*r) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': { + int n = (int)strtol(r, (char**)&r, 10); + cap = get_capture_n(m->child, &n); + break; + } + case '[': { + char *closing = strchr(r+1, ']'); + if (!closing) { + fputc('@', stdout); + break; + } + ++r; + char *name = strndup(r, (size_t)(closing-r)); + cap = get_capture_named(m, name); + free(name); + r = closing + 1; + break; + } + default: { + fputc('@', stdout); + break; + } + } + if (cap != NULL) { + print_match(cap, "\033[0;35m", verbose); + printf("\033[0;34m"); + } + } + } else { + const char *name = m->name_or_replacement; + if (verbose && m->is_ref && name && isupper(name[0])) + printf("\033[0;2;35m{%s:", name); + //if (m->is_capture && name) + // printf("\033[0;2;33m[%s:", name); + const char *prev = m->start; + for (match_t *child = m->child; child; child = child->nextsibling) { + if (child->start > prev) + printf("%s%.*s", color, (int)(child->start - prev), prev); + print_match(child, m->is_capture ? "\033[0;1m" : color, verbose); + prev = child->end; + } + if (m->end > prev) + printf("%s%.*s", color, (int)(m->end - prev), prev); + if (verbose && m->is_ref && name && isupper(name[0])) + printf("\033[0;2;35m}"); + //if (m->is_capture && name) + // printf("\033[0;2;33m]"); + } +} + diff --git a/vm.h b/vm.h new file mode 100644 index 0000000..965a040 --- /dev/null +++ b/vm.h @@ -0,0 +1,18 @@ +/* + * vm.h - Header file for BPEG virtual machine. + */ +#ifndef VM__H +#define VM__H + +#include +#include +#include + +#include "types.h" + +match_t *match(grammar_t *g, const char *str, vm_op_t *op); +void destroy_match(match_t **m); +void print_pattern(vm_op_t *op); +void print_match(match_t *m, const char *color, int verbose); + +#endif -- cgit v1.2.3