aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile17
-rw-r--r--bpeg.bpeg3
-rw-r--r--bpeg.c1136
-rw-r--r--compiler.c461
-rw-r--r--compiler.h16
-rw-r--r--grammar.c93
-rw-r--r--grammar.h18
-rw-r--r--types.h (renamed from bpeg.h)45
-rw-r--r--utils.c171
-rw-r--r--utils.h129
-rw-r--r--vm.c449
-rw-r--r--vm.h18
12 files changed, 1305 insertions, 1251 deletions
diff --git a/Makefile b/Makefile
index 205b8c2..986ecdc 100644
--- a/Makefile
+++ b/Makefile
@@ -1,14 +1,21 @@
-PREFIX=
+PREFIX=/usr/local
CFLAGS=-Wall -Wextra -pedantic -Wmissing-prototypes -Wstrict-prototypes
+LDFLAGS=
G ?=
O ?= -O3
+CFILES=compiler.c grammar.c utils.c vm.c
+OBJFILES=$(CFILES:.c=.o)
+
all: bpeg
-clean:
- rm -f bpeg
+.c.o:
+ cc -c $(CFLAGS) $(G) $(O) -o $@ $<
-bpeg: bpeg.c bpeg.h utils.h
- cc $(CFLAGS) $(G) $(O) $< -o $@
+bpeg: $(OBJFILES) bpeg.c
+ cc $(CFLAGS) $(G) $(O) -o $@ $^
+
+clean:
+ rm -f bpeg $(OBJFILES)
.PHONY: all clean
diff --git a/bpeg.bpeg b/bpeg.bpeg
index f5d490d..0a03aa8 100644
--- a/bpeg.bpeg
+++ b/bpeg.bpeg
@@ -1,7 +1,6 @@
# This is a file defining the BPEG grammar using BPEG syntax
-Grammar;
-Grammar = __ @[main-pattern]extended-pat __ *((__`;__) Def) ?(`;__);
+Grammar = __ *Def%(__`;__) ?(`;__);
Def = @[name]Ref __ `= __ @[definition]extended-pat;
# This is used for command line arguments:
diff --git a/bpeg.c b/bpeg.c
index 41f7238..23fa6fc 100644
--- a/bpeg.c
+++ b/bpeg.c
@@ -31,1089 +31,38 @@
* <pat> / <alt> <pat> otherwise <alt>
* ; <name> = <pat> <name> is defined to be <pat>
*/
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "compiler.h"
+#include "grammar.h"
+#include "utils.h"
+#include "vm.h"
+
+static const char *usage = (
+ "Usage:\n"
+ " bpeg [flags] <pattern> [<input files>...]\n\n"
+ "Flags:\n"
+ " -h --help\t print the usage and quit\n"
+ " -v --verbose\t print verbose debugging info\n"
+ " -s --slow\t run in slow mode for debugging\n"
+ " -r --replace <replacement> replace the input pattern with the given replacement\n"
+ " -g --grammar <grammar file> use the specified file as a grammar\n");
-#include "bpeg.h"
-
-/*
- * Recursively deallocate a match object and return NULL
- */
-static match_t *free_match(match_t *m)
-{
- if (m->child) m->child = free_match(m->child);
- if (m->nextsibling) m->nextsibling = free_match(m->nextsibling);
- free(m);
- return NULL;
-}
-
-/*
- * Run virtual machine operation against a string and return
- * a match struct, or NULL if no match is found.
- * The returned value should be free()'d to avoid memory leaking.
- */
-static match_t *match(const char *str, vm_op_t *op)
-{
- //tailcall:
- switch (op->op) {
- case VM_EMPTY: {
- match_t *m = calloc(sizeof(match_t), 1);
- m->op = op;
- m->start = str;
- m->end = str;
- return m;
- }
- case VM_ANYCHAR: {
- if (!*str || (!op->multiline && *str == '\n'))
- return NULL;
- match_t *m = calloc(sizeof(match_t), 1);
- m->op = op;
- m->start = str;
- m->end = str+1;
- return m;
- }
- case VM_STRING: {
- if (strncmp(str, op->args.s, op->len) != 0)
- return NULL;
- match_t *m = calloc(sizeof(match_t), 1);
- m->op = op;
- m->start = str;
- m->end = str + op->len;
- return m;
- }
- case VM_RANGE: {
- if (*str < op->args.range.low || *str > op->args.range.high)
- return NULL;
- match_t *m = calloc(sizeof(match_t), 1);
- m->op = op;
- m->start = str;
- m->end = str + 1;
- return m;
- }
- case VM_NOT: case VM_ANYTHING_BUT: {
- if (op->op == VM_ANYTHING_BUT)
- if (!*str || (!op->multiline && *str == '\n'))
- return NULL;
- match_t *m = match(str, op->args.pat);
- if (m != NULL) {
- m = free_match(m);
- return NULL;
- }
- m = calloc(sizeof(match_t), 1);
- m->op = op;
- m->start = str;
- if (op->op == VM_ANYTHING_BUT) ++str;
- m->end = str;
- return m;
- }
- case VM_UPTO_AND: {
- match_t *m = calloc(sizeof(match_t), 1);
- m->start = str;
- m->op = op;
- match_t *p = NULL;
- for (const char *prev = NULL; p == NULL && prev < str; ) {
- prev = str;
- p = match(str, op->args.pat);
- if (*str && (op->multiline || *str != '\n'))
- ++str;
- }
- if (p) {
- m->end = p->end;
- m->child = p;
- return m;
- }
- m = free_match(m);
- return NULL;
- }
- case VM_REPEAT: {
- match_t *m = calloc(sizeof(match_t), 1);
- m->start = str;
- m->end = str;
- m->op = op;
- if (op->args.repetitions.max == 0) return m;
-
- match_t **dest = &m->child;
-
- const char *prev = str;
- size_t reps;
- for (reps = 0; reps < (size_t)op->args.repetitions.max; ++reps) {
- // Separator
- match_t *sep = NULL;
- if (op->args.repetitions.sep != NULL && reps > 0) {
- sep = match(str, op->args.repetitions.sep);
- if (sep == NULL) break;
- str = sep->end;
- }
- match_t *p = match(str, op->args.repetitions.repeat_pat);
- if (p == NULL || (p->end == prev && reps > 0)) { // Prevent infinite loops
- if (sep) sep = free_match(sep);
- if (p) p = free_match(p);
- break;
- }
- if (sep) {
- *dest = sep;
- dest = &sep->nextsibling;
- }
- *dest = p;
- dest = &p->nextsibling;
- str = p->end;
- prev = str;
- }
-
- if ((ssize_t)reps < op->args.repetitions.min) {
- m = free_match(m);
- return NULL;
- }
- m->end = str;
- return m;
- }
- case VM_AFTER: {
- ssize_t backtrack = op->args.pat->len;
- check(backtrack != -1, "'<' is only allowed for fixed-length operations");
- // Check for necessary space:
- for (int i = 0; i < backtrack; i++) {
- if (str[-i] == '\0') return NULL;
- }
- match_t *before = match(str - backtrack, op->args.pat);
- if (before == NULL) return NULL;
- before = free_match(before);
- match_t *m = calloc(sizeof(match_t), 1);
- m->start = str;
- m->end = str;
- m->op = op;
- return m;
- }
- case VM_BEFORE: {
- match_t *after = match(str, op->args.pat);
- if (after == NULL) return NULL;
- after = free_match(after);
- match_t *m = calloc(sizeof(match_t), 1);
- m->start = str;
- m->end = str;
- m->op = op;
- return m;
- }
- case VM_CAPTURE: {
- match_t *p = match(str, op->args.pat);
- if (p == NULL) return NULL;
- match_t *m = calloc(sizeof(match_t), 1);
- m->start = str;
- m->end = p->end;
- m->op = op;
- m->child = p;
- m->is_capture = 1;
- if (op->args.capture.name)
- m->name_or_replacement = op->args.capture.name;
- return m;
- }
- case VM_OTHERWISE: {
- match_t *m = match(str, op->args.multiple.first);
- if (m == NULL) m = match(str, op->args.multiple.second);
- return m;
- }
- case VM_CHAIN: {
- match_t *m1 = match(str, op->args.multiple.first);
- if (m1 == NULL) return NULL;
- match_t *m2 = match(m1->end, op->args.multiple.second);
- if (m2 == NULL) {
- m1 = free_match(m1);
- return NULL;
- }
- match_t *m = calloc(sizeof(match_t), 1);
- m->start = str;
- m->end = m2->end;
- m->op = op;
- m->child = m1;
- m1->nextsibling = m2;
- return m;
- }
- case VM_REPLACE: {
- match_t *m = calloc(sizeof(match_t), 1);
- m->start = str;
- m->op = op;
- if (op->args.replace.replace_pat) {
- match_t *p = match(str, op->args.replace.replace_pat);
- if (p == NULL) return NULL;
- m->child = p;
- m->end = p->end;
- } else {
- m->end = m->start;
- }
- m->is_replacement = 1;
- m->name_or_replacement = op->args.replace.replacement;
- return m;
- }
- case VM_REF: {
- // Search backwards so newer defs take precedence
- for (int i = ndefs-1; i >= 0; i--) {
- if (streq(defs[i].name, op->args.s)) {
- // Bingo!
- /*
- op = defs[i].op;
- goto tailcall;
- */
- match_t *p = match(str, defs[i].op);
- if (p == NULL) return NULL;
- match_t *m = calloc(sizeof(match_t), 1);
- m->start = p->start;
- m->end = p->end;
- m->op = op;
- m->child = p;
- m->name_or_replacement = defs[i].name;
- m->is_ref = 1;
- return m;
- }
- }
- check(0, "Unknown identifier: '%s'", op->args.s);
- return NULL;
- }
- default: {
- fprintf(stderr, "Unknown opcode: %d", op->op);
- _exit(1);
- return NULL;
- }
- }
-}
-
-/*
- * Helper function to initialize a range object.
- */
-static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep)
-{
- op->op = VM_REPEAT;
- if (pat->len >= 0 && (sep == NULL || sep->len >= 0) && min == max && min >= 0)
- op->len = pat->len * min + (sep == NULL || min == 0 ? 0 : sep->len * (min-1));
- else
- op->len = -1;
- op->args.repetitions.min = min;
- op->args.repetitions.max = max;
- op->args.repetitions.repeat_pat = pat;
- op->args.repetitions.sep = sep;
-}
-
-/*
- * Take an opcode and expand it into a chain of patterns if it's
- * followed by any patterns (e.g. "`x `y"), otherwise return
- * the original input.
- */
-static vm_op_t *expand_chain(const char *source, vm_op_t *first)
-{
- visualize(source, first->end, "Expanding chain...");
- vm_op_t *second = compile_bpeg(source, first->end);
- if (second == NULL) return first;
- second = expand_chain(source, second);
- check(second->end > first->end, "No forward progress in chain!");
- visualize(source, second->end, "Got chained pair.");
- return chain_together(first, second);
-}
-
-/*
- * Take an opcode and expand it into a chain of choices if it's
- * followed by any "/"-separated patterns (e.g. "`x/`y"), otherwise
- * return the original input.
- */
-static vm_op_t *expand_choices(const char *source, vm_op_t *first)
-{
- first = expand_chain(source, first);
- const char *str = first->end;
- if (!matchchar(&str, '/')) return first;
- visualize(source, str, "Expanding choices...");
- //debug("Otherwise:\n");
- vm_op_t *second = compile_bpeg(source, str);
- check(second, "Expected pattern after '/'");
- second = expand_choices(source, second);
- vm_op_t *choice = calloc(sizeof(vm_op_t), 1);
- choice->op = VM_OTHERWISE;
- choice->start = first->start;
- if (first->len == second->len)
- choice->len = first->len;
- else choice->len = -1;
- choice->end = second->end;
- choice->args.multiple.first = first;
- choice->args.multiple.second = second;
- visualize(source, choice->end, "Got two choices");
- return choice;
-}
-
-static char unescapechar(const char *escaped, const char **end)
-{
- size_t len = 1;
- char ret = *escaped;
- switch (*escaped) {
- case 'a': ret = '\a'; break; case 'b': ret = '\b'; break;
- case 'n': ret = '\n'; break; case 'r': ret = '\r'; break;
- case 't': ret = '\t'; break; case 'v': ret = '\v'; break;
- case 'e': ret = '\033'; break;
- case 'x': { // Hex
- static const char hextable[255] = {
- ['0']=0x10, ['1']=0x1, ['2']=0x2, ['3']=0x3, ['4']=0x4,
- ['5']=0x5, ['6']=0x6, ['7']=0x7, ['8']=0x8, ['9']=0x9,
- ['a']=0xa, ['b']=0xb, ['c']=0xc, ['d']=0xd, ['e']=0xe, ['f']=0xf,
- ['A']=0xa, ['B']=0xb, ['C']=0xc, ['D']=0xd, ['E']=0xe, ['F']=0xf,
- };
- if (hextable[(int)escaped[1]] && hextable[(int)escaped[2]]) {
- ret = (hextable[(int)escaped[1]] << 4) | (hextable[(int)escaped[2]] & 0xF);
- len = 3;
- }
- break;
- }
- case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // Octal
- ret = escaped[0] - '0';
- if ('0' <= escaped[1] && escaped[1] <= '7') {
- ++len;
- ret = (ret << 3) | (escaped[1] - '0');
- if ('0' <= escaped[2] && escaped[2] <= '7') {
- ++len;
- ret = (ret << 3) | (escaped[2] - '0');
- }
- }
- break;
- }
- default: break;
- }
- *end = &escaped[len];
- return ret;
-}
-
-static vm_op_t *chain_together(vm_op_t *first, vm_op_t *second)
-{
- if (first == NULL) return second;
- if (second == NULL) return first;
- vm_op_t *chain = calloc(sizeof(vm_op_t), 1);
- chain->op = VM_CHAIN;
- chain->start = first->start;
- if (first->len >= 0 && second->len >= 0)
- chain->len = first->len + second->len;
- else chain->len = -1;
- chain->end = second->end;
- chain->args.multiple.first = first;
- chain->args.multiple.second = second;
- return chain;
-}
-
-/*
- * Compile a string of BPEG code into virtual machine opcodes
- */
-static vm_op_t *compile_bpeg(const char *source, const char *str)
-{
- if (!*str) return NULL;
- visualize(source, str, "Compiling...");
- //debug("Parsing \"%s\"...\n", str);
- str = after_spaces(str);
- check(*str, "Expected a pattern");
- vm_op_t *op = calloc(sizeof(vm_op_t), 1);
- op->start = str;
- op->len = -1;
- char c = *str;
- ++str;
- switch (c) {
- // Any char (dot) ($. is multiline anychar)
- case '.': {
- anychar:
- visualize(source, str, "Dot");
- //debug("Dot\n");
- op->op = VM_ANYCHAR;
- op->len = 1;
- break;
- }
- // Char literals
- case '`': {
- char literal[2] = {*str, '\0'};
- ++str;
- visualize(source, str, "Char literal");
- check(literal[0], "Expected character after '`'\n");
- op->len = 1;
- if (matchchar(&str, '-')) { // Range
- visualize(source, str, "Char range");
- char c2 = *str;
- check(c2, "Expected character after '-'");
- check(c2 >= literal[0], "Character range must be low-to-high");
- op->op = VM_RANGE;
- op->args.range.low = literal[0];
- op->args.range.high = c2;
- ++str;
- } else {
- //debug("Char literal\n");
- op->op = VM_STRING;
- op->args.s = strdup(literal);
- }
- break;
- }
- // Escapes
- case '\\': {
- //debug("Escape sequence\n");
- visualize(source, str, "Escape sequence");
- check(*str, "Expected escape after '\\'");
- op->len = 1;
- char e = unescapechar(str, &str);
- if (*str == '-') { // Escape range (e.g. \x00-\xFF)
- ++str;
- char e2 = unescapechar(str, &str);
- check(e2, "Expected character after '-'");
- check(e2 >= e, "Character range must be low-to-high");
- op->op = VM_RANGE;
- op->args.range.low = e;
- op->args.range.high = e2;
- } else {
- char literal[2] = {e, '\0'};
- op->op = VM_STRING;
- op->args.s = strdup(literal);
- }
- break;
- }
- // String literal
- case '"': case '\'': case '\002': {
- visualize(source, str, "String literal");
- char endquote = c == '\002' ? '\003' : c;
- char *literal = (char*)str;
- for (; *str && *str != endquote; str++) {
- if (*str == '\\') {
- check(str[1], "Expected more string contents after backslash");
- ++str;
- }
- visualize(source, str, "String literal");
- }
- size_t len = (size_t)(str - literal);
- literal = strndup(literal, len);
- len = unescape_string(literal, literal, len);
-
- op->op = VM_STRING;
- op->len = len;
- op->args.s = literal;
-
- check(matchchar(&str, endquote), "Missing closing quote");
- break;
- }
- // Not <pat>
- case '!': {
- // debug("Not pattern\n");
- visualize(source, str, "Not <pat>");
- vm_op_t *p = compile_bpeg(source, str);
- check(p, "Expected pattern after '!'\n");
- str = p->end;
- op->op = VM_NOT;
- op->len = 0;
- op->args.pat = p;
- break;
- }
- // Anything but <pat>
- case '~': {
- if (matchchar(&str, '~')) op->multiline = 1;
- visualize(source, str, "Anything but <pat>");
- vm_op_t *p = compile_bpeg(source, str);
- check(p, "Expected pattern after '~'\n");
- str = p->end;
- op->op = VM_ANYTHING_BUT;
- op->len = -1;
- op->args.pat = p;
- break;
- }
- // Upto and including <pat>
- case '&': {
- if (matchchar(&str, '&')) op->multiline = 1;
- visualize(source, str, "Upto and including <pat>");
- vm_op_t *p = compile_bpeg(source, str);
- check(p, "Expected pattern after '&'\n");
- str = p->end;
- op->op = VM_UPTO_AND;
- op->len = -1;
- op->args.pat = p;
- break;
- }
- // Number of repetitions: <N>(-<N> / - / + / "")
- case '0': case '1': case '2': case '3': case '4': case '5':
- case '6': case '7': case '8': case '9': {
- visualize(source, str, "Repeat <pat>");
- ssize_t min = -1, max = -1;
- --str;
- long n1 = strtol(str, (char**)&str, 10);
- if (matchchar(&str, '-')) {
- str = after_spaces(str);
- const char *start = str;
- long n2 = strtol(str, (char**)&str, 10);
- if (str == start) min = 0, max = n1;
- else min = n1, max = n2;
- } else if (matchchar(&str, '+')) {
- min = n1, max = -1;
- } else {
- min = n1, max = n1;
- }
- visualize(source, str, NULL);
- vm_op_t *pat = compile_bpeg(source, str);
- check(pat, "Expected pattern after repetition count");
- str = pat->end;
- str = after_spaces(str);
- if (matchchar(&str, '%')) {
- visualize(source, str, "Repeat <pat> with separator");
- vm_op_t *sep = compile_bpeg(source, str);
- check(sep, "Expected pattern for separator after '%%'");
- str = sep->end;
- set_range(op, min, max, pat, sep);
- } else {
- set_range(op, min, max, pat, NULL);
- }
- visualize(source, str, NULL);
- //debug("min = %lld max = %lld\n", (long long)op->args.repetitions.min, (long long)op->args.repetitions.max);
- break;
- }
- // Special repetitions:
- case '+': case '*': case '?': {
- //debug("Special repetitions\n");
- visualize(source, str, "Repeat <pat>");
- ssize_t min = -1, max = -1;
- switch (c) {
- case '+': min = 1, max = -1; break;
- case '*': min = 0, max = -1; break;
- case '?': min = 0, max = 1; break;
- }
- vm_op_t *pat = compile_bpeg(source, str);
- check(pat, "Expected pattern after +");
- str = pat->end;
- str = after_spaces(str);
- if (matchchar(&str, '%')) {
- visualize(source, str, "Repeat <pat> with separator");
- vm_op_t *sep = compile_bpeg(source, str);
- check(sep, "Expected pattern for separator after '%%'");
- str = sep->end;
- set_range(op, min, max, pat, sep);
- } else {
- set_range(op, min, max, pat, NULL);
- }
- visualize(source, str, NULL);
- //debug("min = %lld max = %lld\n", (long long)op->args.repetitions.min, (long long)op->args.repetitions.max);
- break;
- }
- // Lookbehind
- case '<': {
- visualize(source, str, "After <pat>");
- //debug("Lookbehind\n");
- vm_op_t *pat = compile_bpeg(source, str);
- check(pat, "Expected pattern after <");
- str = pat->end;
- check(pat->len != -1, "Lookbehind patterns must have a fixed length");
- str = pat->end;
- op->op = VM_AFTER;
- op->len = 0;
- op->args.pat = pat;
- break;
- }
- // Lookahead
- case '>': {
- visualize(source, str, "Before <pat>");
- //debug("Lookahead\n");
- vm_op_t *pat = compile_bpeg(source, str);
- check(pat, "Expected pattern after >");
- str = pat->end;
- op->op = VM_BEFORE;
- op->len = 0;
- op->args.pat = pat;
- break;
- }
- // Parentheses
- case '(': {
- visualize(source, str, NULL);
- // debug("Open paren (\n");
- free(op);
- op = compile_bpeg(source, str);
- check(op, "Expected pattern inside parentheses");
- op = expand_choices(source, op);
- str = op->end;
- str = after_spaces(str);
- check(matchchar(&str, ')'), "Expected closing parenthesis");
- visualize(source, str, NULL);
- // debug(")\n");
- break;
- }
- // Capture
- case '@': {
- //debug("Capture\n");
- visualize(source, str, "Capture");
- op->op = VM_CAPTURE;
- str = after_spaces(str);
- if (matchchar(&str, '[')) {
- char *closing = strchr(str, ']');
- check(closing, "Expected closing ']'");
- op->args.capture.name = strndup(str, (size_t)(closing-str));
- visualize(source, str, "Named capture");
- //debug("named \"%s\"\n", op->args.capture.name);
- str = closing;
- check(matchchar(&str, ']'), "Expected closing ']'");
- }
- vm_op_t *pat = compile_bpeg(source, str);
- check(pat, "Expected pattern after @");
- str = pat->end;
- op->args.capture.capture_pat = pat;
- op->len = pat->len;
- visualize(source, str, NULL);
- break;
- }
- // Replacement
- case '{': {
- //debug("Replacement {\n");
- visualize(source, str, "Replacement");
- str = after_spaces(str);
- vm_op_t *pat = NULL;
- if (strncmp(str, "=>", 2) == 0) {
- str += strlen("=>");
- } else {
- pat = compile_bpeg(source, str);
- check(pat, "Invalid pattern after '{'");
- pat = expand_choices(source, pat);
- str = pat->end;
- str = after_spaces(str);
- check(matchchar(&str, '=') && matchchar(&str, '>'),
- "Expected '=>' after pattern in replacement");
- }
- visualize(source, str, NULL);
- str = after_spaces(str);
-
- char quote = *str;
- const char *replacement;
- if (matchchar(&str, '}')) {
- replacement = strdup("");
- visualize(source, str, NULL);
- } else {
- check(matchchar(&str, '"') || matchchar(&str, '\''),
- "Expected string literal for replacement");
- replacement = str;
- for (; *str && *str != quote; str++) {
- if (*str == '\\') {
- check(str[1], "Expected more string contents after backslash");
- ++str;
- }
- visualize(source, str, NULL);
- }
- replacement = strndup(replacement, (size_t)(str-replacement));
- check(matchchar(&str, quote), "Expected closing quote");
- check(matchchar(&str, '}'), "Expected a closing '}'");
- }
- op->op = VM_REPLACE;
- op->args.replace.replace_pat = pat;
- op->args.replace.replacement = replacement;
- //debug(" rep = \"%s\"\n", replacement);
- //debug("}\n");
- if (pat != NULL) op->len = pat->len;
- visualize(source, str, NULL);
- break;
- }
- // Special rules:
- case '_': case '^': case '$': {
- if (matchchar(&str, c)) { // double __, ^^, $$
- char tmp[3] = {c, c, '\0'};
- op->args.s = strdup(tmp);
- } else if (c == '$' && matchchar(&str, '.')) { // $. (multi-line anychar)
- op->multiline = 1;
- goto anychar;
- } else {
- op->args.s = strndup(&c, 1);
- }
- op->op = VM_REF;
- visualize(source, str, op->args.s);
- break;
- }
- // Empty choice (/) or {/}
- case '/': {
- str = after_spaces(str);
- if (*str == ')' || *str == '}') {
- op->op = VM_EMPTY;
- } else {
- free(op);
- return NULL;
- }
- break;
- }
- default: {
- // Reference
- if (isalpha(c)) {
- visualize(source, str, "Ref");
- --str;
- const char *refname = str;
- str = after_name(str);
- op->op = VM_REF;
- op->len = (size_t)(str - refname);
- op->args.s = strndup(refname, op->len);
- break;
- } else {
- visualize(source, str, "Finished");
- free(op);
- return NULL;
- }
- }
- }
- op->end = str;
- return op;
-}
-
-/*
- * Similar to compile_bpeg, except that the pattern begins with an implicit, unclosable quote.
- */
-static vm_op_t *compile_bpeg_string(const char *source, const char *str)
-{
- visualize(source, str, "Compiling string...");
- vm_op_t *ret = NULL;
- while (*str) {
- vm_op_t *strop = calloc(sizeof(vm_op_t), 1);
- strop->start = str;
- strop->len = 0;
- strop->op = VM_STRING;
- char *literal = (char*)str;
- vm_op_t *interp = NULL;
- for (; *str; str++) {
- if (*str == '\\') {
- check(str[1], "Expected more string contents after backslash");
- interp = compile_bpeg(source, str + 1);
- check(interp != NULL, "No valid BPEG pattern detected after backslash");
- break;
- }
- visualize(source, str, "String literal");
- }
- // End of string
- size_t len = (size_t)(str - literal);
- literal = strndup(literal, len);
- len = unescape_string(literal, literal, len);
- strop->len = len;
- strop->args.s = literal;
- strop->end = str;
-
- if (strop->len == 0) {
- free(strop);
- strop = NULL;
- } else {
- ret = chain_together(ret, strop);
- }
- if (interp) {
- ret = chain_together(ret, interp);
- str = interp->end;
- // allow terminating seq
- matchchar(&str, ';');
- }
- }
- return ret;
-}
-
-static vm_op_t *compile_bpeg_replacement(vm_op_t *pat, const char *str)
-{
- vm_op_t *op = calloc(sizeof(vm_op_t), 1);
- op->op = VM_REPLACE;
- op->start = pat->start;
- op->len = pat->len;
- op->args.replace.replace_pat = pat;
- const char *replacement = str;
- for (; *str; str++) {
- if (*str == '\\') {
- check(str[1], "Expected more string contents after backslash");
- ++str;
- }
- }
- replacement = strndup(replacement, (size_t)(str-replacement));
- op->args.replace.replacement = replacement;
- return op;
-}
-
-static vm_op_t *add_def(const char *name, const char *source, vm_op_t *op)
-{
- check(ndefs < (sizeof(defs)/sizeof(defs[0])), "Too many definitions!");
- defs[ndefs].name = name;
- defs[ndefs].op = op;
- defs[ndefs].source = source;
- ++ndefs;
- return op;
-}
-
-static vm_op_t *load_def(const char *name, const char *source)
-{
- vm_op_t *op = compile_bpeg(source, source);
- source = strndup((char*)op->start, (int)(op->end - op->start));
- op = expand_choices(source, op);
- return add_def(name, source, op);
-}
-
-static void load_defs(void)
-{
- // Approximately these are in least-to-most used order so they will be
- // found most efficiently by the lookup, which goes in reverse order.
- load_def("crlf", "\\r\\n");
- load_def("cr", "\\r"); load_def("r", "\\r");
- load_def("anglebraces", "`< *(anglebraces / ~~`>) `>");
- load_def("brackets", "`[ *(brackets / ~~`]) `]");
- load_def("braces", "`{ *(braces / ~~`}) `}");
- load_def("parens", "`( *(parens / ~~`)) `)");
- load_def("id", "(`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9)");
- load_def("HEX", "`0-9/`A-F");
- load_def("Hex", "`0-9/`a-f/`A-F");
- load_def("hex", "`0-9/`a-f");
- load_def("number", "+`0-9 ?(`. *`0-9) / `. +`0-9");
- load_def("int", "+`0-9");
- load_def("digit", "`0-9");
- load_def("Abc", "`a-z/`A-Z");
- load_def("ABC", "`A-Z");
- load_def("abc", "`a-z");
- load_def("esc", "\\e"); load_def("e", "\\e");
- load_def("tab", "\\t"); load_def("t", "\\t");
- load_def("nl", "\\n"); load_def("lf", "\\n"); load_def("n", "\\n");
- load_def("c-block-comment", "'/*' &&'*/'");
- load_def("c-line-comment", "'//' &$");
- load_def("c-comment", "c-line-comment / c-block-comment");
- load_def("hash-comment", "`# &$");
- load_def("comment", "!(/)"); // undefined by default
- load_def("WS", "` /\\t/\\n/\\r/comment");
- load_def("ws", "` /\\t");
- load_def("$$", "!$.");
- load_def("$", "!.");
- load_def("^^", "!<$.");
- load_def("^", "!<.");
- load_def("__", "*(` /\\t/\\n/\\r/comment)");
- load_def("_", "*(` /\\t)");
-}
-
-static match_t *get_capture_n(match_t *m, int *n)
-{
- if (!m) return NULL;
- if (*n == 0) return m;
- if (m->is_capture && *n == 1) return m;
- if (m->is_capture) --(*n);
- for (match_t *c = m->child; c; c = c->nextsibling) {
- match_t *cap = get_capture_n(c, n);
- if (cap) return cap;
- }
- return NULL;
-}
-
-static match_t *get_capture_named(match_t *m, const char *name)
-{
- if (m->is_capture && m->name_or_replacement && streq(m->name_or_replacement, name))
- return m;
- for (match_t *c = m->child; c; c = c->nextsibling) {
- match_t *cap = get_capture_named(c, name);
- if (cap) return cap;
- }
- return NULL;
-}
-
-static void print_match(match_t *m, const char *color)
-{
- if (m->is_replacement) {
- printf("\033[0;34m");
- for (const char *r = m->name_or_replacement; *r; ) {
- if (*r == '\\') {
- fputc(unescapechar(r, &r), stdout);
- continue;
- } else if (*r != '@') {
- fputc(*r, stdout);
- ++r;
- continue;
- }
-
- ++r;
- match_t *cap = NULL;
- switch (*r) {
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9': {
- int n = (int)strtol(r, (char**)&r, 10);
- cap = get_capture_n(m->child, &n);
- break;
- }
- case '[': {
- char *closing = strchr(r+1, ']');
- if (!closing) {
- fputc('@', stdout);
- break;
- }
- ++r;
- char *name = strndup(r, (size_t)(closing-r));
- cap = get_capture_named(m, name);
- free(name);
- r = closing + 1;
- break;
- }
- default: {
- fputc('@', stdout);
- break;
- }
- }
- if (cap != NULL) {
- print_match(cap, "\033[0;35m");
- printf("\033[0;34m");
- }
- }
- } else {
- const char *name = m->name_or_replacement;
- if (verbose && m->is_ref && name && isupper(name[0]))
- printf("\033[0;2;35m{%s:", name);
- //if (m->is_capture && name)
- // printf("\033[0;2;33m[%s:", name);
- const char *prev = m->start;
- for (match_t *child = m->child; child; child = child->nextsibling) {
- if (child->start > prev)
- printf("%s%.*s", color, (int)(child->start - prev), prev);
- print_match(child, m->is_capture ? "\033[0;1m" : color);
- prev = child->end;
- }
- if (m->end > prev)
- printf("%s%.*s", color, (int)(m->end - prev), prev);
- if (verbose && m->is_ref && name && isupper(name[0]))
- printf("\033[0;2;35m}");
- //if (m->is_capture && name)
- // printf("\033[0;2;33m]");
- }
-}
-
-/*
- * Read an entire file into memory.
- */
-static char *readfile(int fd)
-{
- size_t capacity = 1000, len = 0;
- char *buf = calloc(sizeof(char), capacity+1);
- ssize_t just_read;
- while ((just_read=read(fd, &buf[len], capacity-len)) > 0) {
- len += (size_t)just_read;
- if (len >= capacity)
- buf = realloc(buf, (capacity *= 2));
- }
- return buf;
-}
-
-static void print_grammar(vm_op_t *op)
-{
- switch (op->op) {
- case VM_REF: fprintf(stderr, "a $%s", op->args.s); break;
- case VM_EMPTY: fprintf(stderr, "the empty string"); break;
- case VM_ANYCHAR: fprintf(stderr, "any char"); break;
- case VM_STRING: fprintf(stderr, "string \"%s\"", op->args.s); break;
- case VM_RANGE: {
- fprintf(stderr, "char from %c-%c", op->args.range.low, op->args.range.high);
- break;
- }
- case VM_REPEAT: {
- if (op->args.repetitions.max == -1)
- fprintf(stderr, "%ld or more (", op->args.repetitions.min);
- else
- fprintf(stderr, "%ld-%ld of (",
- op->args.repetitions.min,
- op->args.repetitions.max);
- print_grammar(op->args.repetitions.repeat_pat);
- fprintf(stderr, ")");
- if (op->args.repetitions.sep) {
- fprintf(stderr, " separated by (");
- print_grammar(op->args.repetitions.sep);
- fprintf(stderr, ")");
- }
- break;
- }
- case VM_NOT: {
- fprintf(stderr, "not (");
- print_grammar(op->args.pat);
- fprintf(stderr, ")");
- break;
- }
- case VM_UPTO_AND: {
- fprintf(stderr, "text up to and including (");
- print_grammar(op->args.pat);
- fprintf(stderr, ")");
- break;
- }
- case VM_ANYTHING_BUT: {
- fprintf(stderr, "anything but (");
- print_grammar(op->args.pat);
- fprintf(stderr, ")");
- break;
- }
- case VM_AFTER: {
- fprintf(stderr, "after (");
- print_grammar(op->args.pat);
- fprintf(stderr, ")");
- break;
- }
- case VM_BEFORE: {
- fprintf(stderr, "before (");
- print_grammar(op->args.pat);
- fprintf(stderr, ")");
- break;
- }
- case VM_CAPTURE: {
- fprintf(stderr, "capture (");
- print_grammar(op->args.pat);
- fprintf(stderr, ")");
- if (op->args.capture.name)
- fprintf(stderr, " and call it %s", op->args.capture.name);
- break;
- }
- case VM_OTHERWISE: {
- fprintf(stderr, "(");
- print_grammar(op->args.multiple.first);
- fprintf(stderr, ") or else ");
- if (op->args.multiple.second->op != VM_OTHERWISE)
- fprintf(stderr, "(");
- print_grammar(op->args.multiple.second);
- if (op->args.multiple.second->op != VM_OTHERWISE)
- fprintf(stderr, ")");
- break;
- }
- case VM_CHAIN: {
- fprintf(stderr, "(");
- print_grammar(op->args.multiple.first);
- fprintf(stderr, ") then ");
- if (op->args.multiple.second->op != VM_CHAIN)
- fprintf(stderr, "(");
- print_grammar(op->args.multiple.second);
- if (op->args.multiple.second->op != VM_CHAIN)
- fprintf(stderr, ")");
- break;
- }
- case VM_REPLACE: {
- fprintf(stderr, "replace ");
- if (op->args.replace.replace_pat) {
- fprintf(stderr, "(");
- print_grammar(op->args.replace.replace_pat);
- fprintf(stderr, ")");
- } else
- fprintf(stderr, "\"\"");
- fprintf(stderr, " with \"%s\"", op->args.replace.replacement);
- break;
- }
- default: break;
- }
-}
-
-static vm_op_t *load_grammar(const char *grammar)
-{
- vm_op_t *op = compile_bpeg(grammar, grammar);
- check(op, "Failed to compile_bpeg input");
- op = expand_choices(grammar, op);
-
- const char *defs = op->end;
- while (matchchar(&defs, ';')) {
- if (verbose) fprintf(stderr, "\n");
- defs = after_spaces(defs);
- const char *name = defs;
- if (strncmp(name, "^^", 2) == 0 ||
- strncmp(name, "__", 2) == 0 ||
- strncmp(name, "$$", 2) == 0) {
- name = strndup(name, 2);
- defs += 2;
- } else if (*name == '^' || *name == '_' || *name == '$') {
- name = strndup(name, 1);
- defs += 1;
- } else {
- defs = after_name(defs);
- if (defs == NULL) break;
- name = strndup(name, (size_t)(defs-name));
- }
- defs = after_spaces(defs);
- check(matchchar(&defs, '='), "Expected '=' in definition");
- vm_op_t *def = load_def(name, defs);
- check(def, "Couldn't load definition");
- defs = def->end;
- }
- return op;
-}
int main(int argc, char *argv[])
{
+ int verbose = 0;
const char *pattern = NULL,
*replacement = NULL,
*grammarfile = NULL,
*infile = NULL;
+ grammar_t *g = new_grammar();
+
for (int i = 1; i < argc; i++) {
if (streq(argv[i], "--help") || streq(argv[i], "-h")) {
printf("%s\n", usage);
@@ -1122,8 +71,6 @@ int main(int argc, char *argv[])
verbose = 1;
} else if (streq(argv[i], "--replace") || streq(argv[i], "-r")) {
replacement = argv[++i];
- } else if (streq(argv[i], "--slow") || streq(argv[i], "-s")) {
- visualize_delay = 100000;
} else if (streq(argv[i], "--grammar") || streq(argv[i], "-g")) {
grammarfile = argv[++i];
} else if (streq(argv[i], "--define") || streq(argv[i], "-d")) {
@@ -1131,7 +78,10 @@ int main(int argc, char *argv[])
char *eq = strchr(def, '=');
check(eq, usage);
*eq = '\0';
- load_def(def, ++eq);
+ char *src = ++eq;
+ vm_op_t *pat = bpeg_pattern(src);
+ check(pat, "Failed to compile pattern");
+ add_def(g, src, def, pat);
} else if (pattern == NULL) {
pattern = argv[i];
} else if (infile == NULL) {
@@ -1140,16 +90,7 @@ int main(int argc, char *argv[])
}
check(pattern != NULL || grammarfile != NULL, usage);
- if (verbose) fprintf(stderr, "====== Loading definitions ======\n\n\n\n");
- {
- int tmp1 = visualize_delay, tmp2 = verbose;
- visualize_delay = -1, verbose = 0;
- load_defs();
- visualize_delay = tmp1, verbose = tmp2;
- }
- if (verbose) fprintf(stderr, "========== Compiling ===========\n\n\n\n");
- vm_op_t *op;
if (grammarfile) {
// load grammar from a file (semicolon mode)
char *grammar;
@@ -1160,24 +101,21 @@ int main(int argc, char *argv[])
check(fd >= 0, "Couldn't open file: %s", argv[2]);
grammar = readfile(fd);
}
- op = load_grammar(grammar);
+ load_grammar(g, grammar);
} else {
// load grammar in start-with-string mode:
- vm_op_t *pat = compile_bpeg_string(pattern, pattern);
- if (replacement) {
- pat = compile_bpeg_replacement(pat, replacement);
- }
+ vm_op_t *pat = bpeg_stringpattern(pattern);
+ if (replacement)
+ pat = bpeg_replacement(pat, replacement);
- add_def("pattern", pattern, pat);
+ add_def(g, pattern, "pattern", pat);
- const char *grammar = "*(@pattern / \\n / .)";
- op = compile_bpeg(grammar, grammar);
+ const char *grammar = "find = *(@pattern / \\n / .);";
+ load_grammar(g, grammar);
}
if (verbose) {
- fprintf(stderr, "\n\n");
- print_grammar(op);
- fprintf(stderr, "\n\n");
+ print_pattern(g->pattern);
}
char *input;
@@ -1194,12 +132,12 @@ int main(int argc, char *argv[])
stpcpy(&lpadded[1], input);
input = &lpadded[1];
- match_t *m = match(input, op);
+ match_t *m = match(g, input, g->pattern);
if (m == NULL) {
printf("No match\n");
return 1;
} else {
- print_match(m, "\033[0m");
+ print_match(m, "\033[0m", verbose);
printf("\033[0;2m%s\n", m->end);
}
diff --git a/compiler.c b/compiler.c
new file mode 100644
index 0000000..88919cf
--- /dev/null
+++ b/compiler.c
@@ -0,0 +1,461 @@
+/*
+ * compiler.c - Compile strings into BPEG virtual machine code.
+ */
+
+#include "compiler.h"
+#include "utils.h"
+
+static vm_op_t *expand_chain(vm_op_t *first);
+static vm_op_t *expand_choices(vm_op_t *first);
+static vm_op_t *chain_together(vm_op_t *first, vm_op_t *second);
+static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep);
+
+/*
+ * Helper function to initialize a range object.
+ */
+static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep)
+{
+ op->op = VM_REPEAT;
+ if (pat->len >= 0 && (sep == NULL || sep->len >= 0) && min == max && min >= 0)
+ op->len = pat->len * min + (sep == NULL || min == 0 ? 0 : sep->len * (min-1));
+ else
+ op->len = -1;
+ op->args.repetitions.min = min;
+ op->args.repetitions.max = max;
+ op->args.repetitions.repeat_pat = pat;
+ op->args.repetitions.sep = sep;
+}
+
+/*
+ * Take an opcode and expand it into a chain of patterns if it's
+ * followed by any patterns (e.g. "`x `y"), otherwise return
+ * the original input.
+ */
+static vm_op_t *expand_chain(vm_op_t *first)
+{
+ vm_op_t *second = bpeg_simplepattern(first->end);
+ if (second == NULL) return first;
+ second = expand_chain(second);
+ check(second->end > first->end, "No forward progress in chain!");
+ return chain_together(first, second);
+}
+
+/*
+ * Take an opcode and expand it into a chain of choices if it's
+ * followed by any "/"-separated patterns (e.g. "`x/`y"), otherwise
+ * return the original input.
+ */
+static vm_op_t *expand_choices(vm_op_t *first)
+{
+ first = expand_chain(first);
+ const char *str = first->end;
+ if (!matchchar(&str, '/')) return first;
+ vm_op_t *second = bpeg_simplepattern(str);
+ check(second, "Expected pattern after '/'");
+ second = expand_choices(second);
+ vm_op_t *choice = calloc(sizeof(vm_op_t), 1);
+ choice->op = VM_OTHERWISE;
+ choice->start = first->start;
+ if (first->len == second->len)
+ choice->len = first->len;
+ else choice->len = -1;
+ choice->end = second->end;
+ choice->args.multiple.first = first;
+ choice->args.multiple.second = second;
+ return choice;
+}
+
+static vm_op_t *chain_together(vm_op_t *first, vm_op_t *second)
+{
+ if (first == NULL) return second;
+ if (second == NULL) return first;
+ vm_op_t *chain = calloc(sizeof(vm_op_t), 1);
+ chain->op = VM_CHAIN;
+ chain->start = first->start;
+ if (first->len >= 0 && second->len >= 0)
+ chain->len = first->len + second->len;
+ else chain->len = -1;
+ chain->end = second->end;
+ chain->args.multiple.first = first;
+ chain->args.multiple.second = second;
+ return chain;
+}
+
+/*
+ * Compile a string of BPEG code into virtual machine opcodes
+ */
+vm_op_t *bpeg_simplepattern(const char *str)
+{
+ if (!*str) return NULL;
+ str = after_spaces(str);
+ check(*str, "Expected a pattern");
+ vm_op_t *op = calloc(sizeof(vm_op_t), 1);
+ op->start = str;
+ op->len = -1;
+ char c = *str;
+ ++str;
+ switch (c) {
+ // Any char (dot) ($. is multiline anychar)
+ case '.': {
+ anychar:
+ op->op = VM_ANYCHAR;
+ op->len = 1;
+ break;
+ }
+ // Char literals
+ case '`': {
+ char literal[2] = {*str, '\0'};
+ ++str;
+ check(literal[0], "Expected character after '`'\n");
+ op->len = 1;
+ if (matchchar(&str, '-')) { // Range
+ char c2 = *str;
+ check(c2, "Expected character after '-'");
+ check(c2 >= literal[0], "Character range must be low-to-high");
+ op->op = VM_RANGE;
+ op->args.range.low = literal[0];
+ op->args.range.high = c2;
+ ++str;
+ } else {
+ op->op = VM_STRING;
+ op->args.s = strdup(literal);
+ }
+ break;
+ }
+ // Escapes
+ case '\\': {
+ check(*str, "Expected escape after '\\'");
+ op->len = 1;
+ char e = unescapechar(str, &str);
+ if (*str == '-') { // Escape range (e.g. \x00-\xFF)
+ ++str;
+ char e2 = unescapechar(str, &str);
+ check(e2, "Expected character after '-'");
+ check(e2 >= e, "Character range must be low-to-high");
+ op->op = VM_RANGE;
+ op->args.range.low = e;
+ op->args.range.high = e2;
+ } else {
+ char literal[2] = {e, '\0'};
+ op->op = VM_STRING;
+ op->args.s = strdup(literal);
+ }
+ break;
+ }
+ // String literal
+ case '"': case '\'': case '\002': {
+ char endquote = c == '\002' ? '\003' : c;
+ char *literal = (char*)str;
+ for (; *str && *str != endquote; str++) {
+ if (*str == '\\') {
+ check(str[1], "Expected more string contents after backslash");
+ ++str;
+ }
+ }
+ size_t len = (size_t)(str - literal);
+ literal = strndup(literal, len);
+ len = unescape_string(literal, literal, len);
+
+ op->op = VM_STRING;
+ op->len = len;
+ op->args.s = literal;
+
+ check(matchchar(&str, endquote), "Missing closing quote");
+ break;
+ }
+ // Not <pat>
+ case '!': {
+ vm_op_t *p = bpeg_simplepattern(str);
+ check(p, "Expected pattern after '!'\n");
+ str = p->end;
+ op->op = VM_NOT;
+ op->len = 0;
+ op->args.pat = p;
+ break;
+ }
+ // Anything but <pat>
+ case '~': {
+ if (matchchar(&str, '~')) op->multiline = 1;
+ vm_op_t *p = bpeg_simplepattern(str);
+ check(p, "Expected pattern after '~'\n");
+ str = p->end;
+ op->op = VM_ANYTHING_BUT;
+ op->len = -1;
+ op->args.pat = p;
+ break;
+ }
+ // Upto and including <pat>
+ case '&': {
+ if (matchchar(&str, '&')) op->multiline = 1;
+ vm_op_t *p = bpeg_simplepattern(str);
+ check(p, "Expected pattern after '&'\n");
+ str = p->end;
+ op->op = VM_UPTO_AND;
+ op->len = -1;
+ op->args.pat = p;
+ break;
+ }
+ // Number of repetitions: <N>(-<N> / - / + / "")
+ case '0': case '1': case '2': case '3': case '4': case '5':
+ case '6': case '7': case '8': case '9': {
+ ssize_t min = -1, max = -1;
+ --str;
+ long n1 = strtol(str, (char**)&str, 10);
+ if (matchchar(&str, '-')) {
+ str = after_spaces(str);
+ const char *start = str;
+ long n2 = strtol(str, (char**)&str, 10);
+ if (str == start) min = 0, max = n1;
+ else min = n1, max = n2;
+ } else if (matchchar(&str, '+')) {
+ min = n1, max = -1;
+ } else {
+ min = n1, max = n1;
+ }
+ vm_op_t *pat = bpeg_simplepattern(str);
+ check(pat, "Expected pattern after repetition count");
+ str = pat->end;
+ str = after_spaces(str);
+ if (matchchar(&str, '%')) {
+ vm_op_t *sep = bpeg_simplepattern(str);
+ check(sep, "Expected pattern for separator after '%%'");
+ str = sep->end;
+ set_range(op, min, max, pat, sep);
+ } else {
+ set_range(op, min, max, pat, NULL);
+ }
+ break;
+ }
+ // Special repetitions:
+ case '+': case '*': case '?': {
+ ssize_t min = -1, max = -1;
+ switch (c) {
+ case '+': min = 1, max = -1; break;
+ case '*': min = 0, max = -1; break;
+ case '?': min = 0, max = 1; break;
+ }
+ vm_op_t *pat = bpeg_simplepattern(str);
+ check(pat, "Expected pattern after +");
+ str = pat->end;
+ str = after_spaces(str);
+ if (matchchar(&str, '%')) {
+ vm_op_t *sep = bpeg_simplepattern(str);
+ check(sep, "Expected pattern for separator after '%%'");
+ str = sep->end;
+ set_range(op, min, max, pat, sep);
+ } else {
+ set_range(op, min, max, pat, NULL);
+ }
+ break;
+ }
+ // Lookbehind
+ case '<': {
+ vm_op_t *pat = bpeg_simplepattern(str);
+ check(pat, "Expected pattern after <");
+ str = pat->end;
+ check(pat->len != -1, "Lookbehind patterns must have a fixed length");
+ str = pat->end;
+ op->op = VM_AFTER;
+ op->len = 0;
+ op->args.pat = pat;
+ break;
+ }
+ // Lookahead
+ case '>': {
+ vm_op_t *pat = bpeg_simplepattern(str);
+ check(pat, "Expected pattern after >");
+ str = pat->end;
+ op->op = VM_BEFORE;
+ op->len = 0;
+ op->args.pat = pat;
+ break;
+ }
+ // Parentheses
+ case '(': {
+ free(op);
+ op = bpeg_simplepattern(str);
+ check(op, "Expected pattern inside parentheses");
+ op = expand_choices(op);
+ str = op->end;
+ str = after_spaces(str);
+ check(matchchar(&str, ')'), "Expected closing parenthesis");
+ break;
+ }
+ // Capture
+ case '@': {
+ op->op = VM_CAPTURE;
+ str = after_spaces(str);
+ if (matchchar(&str, '[')) {
+ char *closing = strchr(str, ']');
+ check(closing, "Expected closing ']'");
+ op->args.capture.name = strndup(str, (size_t)(closing-str));
+ str = closing;
+ check(matchchar(&str, ']'), "Expected closing ']'");
+ }
+ vm_op_t *pat = bpeg_simplepattern(str);
+ check(pat, "Expected pattern after @");
+ str = pat->end;
+ op->args.capture.capture_pat = pat;
+ op->len = pat->len;
+ break;
+ }
+ // Replacement
+ case '{': {
+ str = after_spaces(str);
+ vm_op_t *pat = NULL;
+ if (strncmp(str, "=>", 2) == 0) {
+ str += strlen("=>");
+ } else {
+ pat = bpeg_simplepattern(str);
+ check(pat, "Invalid pattern after '{'");
+ pat = expand_choices(pat);
+ str = pat->end;
+ str = after_spaces(str);
+ check(matchchar(&str, '=') && matchchar(&str, '>'),
+ "Expected '=>' after pattern in replacement");
+ }
+ str = after_spaces(str);
+
+ char quote = *str;
+ const char *replacement;
+ if (matchchar(&str, '}')) {
+ replacement = strdup("");
+ } else {
+ check(matchchar(&str, '"') || matchchar(&str, '\''),
+ "Expected string literal for replacement");
+ replacement = str;
+ for (; *str && *str != quote; str++) {
+ if (*str == '\\') {
+ check(str[1], "Expected more string contents after backslash");
+ ++str;
+ }
+ }
+ replacement = strndup(replacement, (size_t)(str-replacement));
+ check(matchchar(&str, quote), "Expected closing quote");
+ check(matchchar(&str, '}'), "Expected a closing '}'");
+ }
+ op->op = VM_REPLACE;
+ op->args.replace.replace_pat = pat;
+ op->args.replace.replacement = replacement;
+ if (pat != NULL) op->len = pat->len;
+ break;
+ }
+ // Special rules:
+ case '_': case '^': case '$': {
+ if (matchchar(&str, c)) { // double __, ^^, $$
+ char tmp[3] = {c, c, '\0'};
+ op->args.s = strdup(tmp);
+ } else if (c == '$' && matchchar(&str, '.')) { // $. (multi-line anychar)
+ op->multiline = 1;
+ goto anychar;
+ } else {
+ op->args.s = strndup(&c, 1);
+ }
+ op->op = VM_REF;
+ break;
+ }
+ // Empty choice (/) or {/}
+ case '/': {
+ str = after_spaces(str);
+ if (*str == ')' || *str == '}') {
+ op->op = VM_EMPTY;
+ } else {
+ free(op);
+ return NULL;
+ }
+ break;
+ }
+ default: {
+ // Reference
+ if (isalpha(c)) {
+ --str;
+ const char *refname = str;
+ str = after_name(str);
+ op->op = VM_REF;
+ op->len = (size_t)(str - refname);
+ op->args.s = strndup(refname, op->len);
+ break;
+ } else {
+ free(op);
+ return NULL;
+ }
+ }
+ }
+ op->end = str;
+ return op;
+}
+
+/*
+ * Similar to bpeg_simplepattern, except that the pattern begins with an implicit, unclosable quote.
+ */
+vm_op_t *bpeg_stringpattern(const char *str)
+{
+ vm_op_t *ret = NULL;
+ while (*str) {
+ vm_op_t *strop = calloc(sizeof(vm_op_t), 1);
+ strop->start = str;
+ strop->len = 0;
+ strop->op = VM_STRING;
+ char *literal = (char*)str;
+ vm_op_t *interp = NULL;
+ for (; *str; str++) {
+ if (*str == '\\') {
+ check(str[1], "Expected more string contents after backslash");
+ interp = bpeg_simplepattern(str + 1);
+ check(interp != NULL, "No valid BPEG pattern detected after backslash");
+ break;
+ }
+ }
+ // End of string
+ size_t len = (size_t)(str - literal);
+ literal = strndup(literal, len);
+ len = unescape_string(literal, literal, len);
+ strop->len = len;
+ strop->args.s = literal;
+ strop->end = str;
+
+ if (strop->len == 0) {
+ free(strop);
+ strop = NULL;
+ } else {
+ ret = chain_together(ret, strop);
+ }
+ if (interp) {
+ ret = chain_together(ret, interp);
+ str = interp->end;
+ // allow terminating seq
+ matchchar(&str, ';');
+ }
+ }
+ return ret;
+}
+
+/*
+ * Given a pattern and a replacement string, compile the two into a replacement
+ * VM opcode.
+ */
+vm_op_t *bpeg_replacement(vm_op_t *pat, const char *replacement)
+{
+ vm_op_t *op = calloc(sizeof(vm_op_t), 1);
+ op->op = VM_REPLACE;
+ op->start = pat->start;
+ op->len = pat->len;
+ op->args.replace.replace_pat = pat;
+ const char *p = replacement;
+ for (; *p; p++) {
+ if (*p == '\\') {
+ check(p[1], "Expected more string contents after backslash");
+ ++p;
+ }
+ }
+ replacement = strndup(replacement, (size_t)(p-replacement));
+ op->args.replace.replacement = replacement;
+ return op;
+}
+
+vm_op_t *bpeg_pattern(const char *str)
+{
+ vm_op_t *op = bpeg_simplepattern(str);
+ if (op != NULL) op = expand_choices(op);
+ return op;
+}
diff --git a/compiler.h b/compiler.h
new file mode 100644
index 0000000..916d8fe
--- /dev/null
+++ b/compiler.h
@@ -0,0 +1,16 @@
+/*
+ * compiler.h - Header file for BPEG compiler.
+ */
+#ifndef COMPILER__H
+#define COMPILER__H
+
+#include <stdlib.h>
+
+#include "types.h"
+
+vm_op_t *bpeg_simplepattern(const char *str);
+vm_op_t *bpeg_stringpattern(const char *str);
+vm_op_t *bpeg_replacement(vm_op_t *pat, const char *replacement);
+vm_op_t *bpeg_pattern(const char *str);
+
+#endif
diff --git a/grammar.c b/grammar.c
new file mode 100644
index 0000000..c04587b
--- /dev/null
+++ b/grammar.c
@@ -0,0 +1,93 @@
+/*
+ * grammar.c - Code for defining grammars (sets of rules)
+ */
+
+#include "grammar.h"
+#include "compiler.h"
+#include "utils.h"
+
+const char *BPEG_BUILTIN_GRAMMAR = (
+ "crlf=\\r\\n;\n"
+ "cr=\\r;\n" "r=\\r;\n"
+ "anglebraces=`< *(anglebraces / ~~`>) `>;\n"
+ "brackets=`[ *(brackets / ~~`]) `];\n"
+ "braces=`{ *(braces / ~~`}) `};\n"
+ "parens=`( *(parens / ~~`)) `);\n"
+ "id=(`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9);\n"
+ "HEX=`0-9/`A-F;\n"
+ "Hex=`0-9/`a-f/`A-F;\n"
+ "hex=`0-9/`a-f;\n"
+ "number=+`0-9 ?(`. *`0-9) / `. +`0-9;\n"
+ "int=+`0-9;\n"
+ "digit=`0-9;\n"
+ "Abc=`a-z/`A-Z;\n"
+ "ABC=`A-Z;\n"
+ "abc=`a-z;\n"
+ "esc=\\e;\n" "e=\\e;\n"
+ "tab=\\t;\n" "t=\\t;\n"
+ "nl=\\n;\n" "lf=\\n;\n" "n=\\n;\n"
+ "c-block-comment='/*' &&'*/';\n"
+ "c-line-comment='//' &$;\n"
+ "c-comment=c-line-comment / c-block-comment;\n"
+ "hash-comment=`# &$;\n"
+ "comment=!(/);\n" // No default definition, can be overridden
+ "WS=` /\\t/\\n/\\r/comment;\n"
+ "ws=` /\\t;\n"
+ "$$=!$.;\n"
+ "$=!.;\n"
+ "^^=!<$.;\n"
+ "^=!<.;\n"
+ "__=*(` /\\t/\\n/\\r/comment);\n"
+ "_=*(` /\\t);\n"
+ );
+
+grammar_t *new_grammar(void)
+{
+ grammar_t *g = calloc(sizeof(grammar_t), 1);
+ g->definitions = calloc(sizeof(def_t), (g->capacity = 128));
+ load_grammar(g, BPEG_BUILTIN_GRAMMAR);
+ return g;
+}
+
+void add_def(grammar_t *g, const char *src, const char *name, vm_op_t *op)
+{
+ if (g->size >= g->capacity) {
+ g->definitions = realloc(g->definitions, (g->capacity += 32));
+ }
+ int i = g->size;
+ g->definitions[i].source = src;
+ g->definitions[i].name = name;
+ g->definitions[i].op = op;
+ ++g->size;
+}
+
+void load_grammar(grammar_t *g, const char *src)
+{
+ vm_op_t *mainpat = NULL;
+ do {
+ src = after_spaces(src);
+ if (!*src) break;
+ const char *name = src;
+ const char *name_end = after_name(name);
+ check(name_end > name, "Invalid name for definition");
+ name = strndup(name, (size_t)(name_end-name));
+ src = after_spaces(name_end);
+ check(matchchar(&src, '='), "Expected '=' in definition");
+ vm_op_t *op = bpeg_pattern(src);
+ check(op, "Couldn't load definition");
+ add_def(g, src, name, op);
+ if (mainpat == NULL) {
+ mainpat = op;
+ g->pattern = op;
+ }
+ src = op->end;
+ } while (*src && matchchar(&src, ';'));
+}
+
+/*
+ * Print a BPEG grammar in human-readable form.
+ */
+void print_grammar(grammar_t *g)
+{
+ if (g->pattern) print_pattern(g->pattern);
+}
diff --git a/grammar.h b/grammar.h
new file mode 100644
index 0000000..90cc3f2
--- /dev/null
+++ b/grammar.h
@@ -0,0 +1,18 @@
+/*
+ * grammar.h - Header file defining grammars (sets of rule definitions)
+ */
+#ifndef GRAMMAR__H
+#define GRAMMAR__H
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "types.h"
+
+grammar_t *new_grammar(void);
+void add_def(grammar_t *g, const char *src, const char *name, vm_op_t *op);
+void load_grammar(grammar_t *g, const char *source);
+void print_grammar(grammar_t *g);
+
+
+#endif
diff --git a/bpeg.h b/types.h
index 459bd65..ef62cdc 100644
--- a/bpeg.h
+++ b/types.h
@@ -1,24 +1,10 @@
/*
- * bpeg.h - Header file for the bpeg parser
+ * types.h - Datatypes used by BPEG
*/
-#include <ctype.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
+#ifndef TYPES__H
+#define TYPES__H
-#include "utils.h"
-
-const char *usage = (
- "Usage:\n"
- " bpeg [flags] <pattern> [<input files>...]\n\n"
- "Flags:\n"
- " -h --help\t print the usage and quit\n"
- " -v --verbose\t print verbose debugging info\n"
- " -s --slow\t run in slow mode for debugging\n"
- " -r --replace <replacement> replace the input pattern with the given replacement\n"
- " -g --grammar <grammar file> use the specified file as a grammar\n");
+#include <sys/types.h>
/*
* BPEG virtual machine opcodes
@@ -87,20 +73,6 @@ typedef struct match_s {
vm_op_t *op;
} match_t;
-static inline const char *after_spaces(const char *str);
-static match_t *free_match(match_t *m);
-static match_t *match(const char *str, vm_op_t *op);
-static vm_op_t *compile_bpeg(const char *source, const char *str);
-static vm_op_t *load_grammar(const char *grammar);
-static vm_op_t *add_def(const char *name, const char *source, vm_op_t *op);
-static vm_op_t *load_def(const char *name, const char *source);
-static vm_op_t *chain_together(vm_op_t *first, vm_op_t *second);
-static vm_op_t *compile_bpeg_string(const char *source, const char *str);
-static vm_op_t *expand_chain(const char *source, vm_op_t *first);
-static vm_op_t *expand_choices(const char *source, vm_op_t *op);
-static void print_match(match_t *m, const char *color);
-static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep);
-
typedef struct {
const char *name;
@@ -108,5 +80,10 @@ typedef struct {
vm_op_t *op;
} def_t;
-static def_t defs[1024] = {{NULL, NULL, NULL}};
-size_t ndefs = 0;
+typedef struct {
+ vm_op_t *pattern;
+ def_t *definitions;
+ size_t size, capacity;
+} grammar_t;
+
+#endif
diff --git a/utils.c b/utils.c
new file mode 100644
index 0000000..4fdec79
--- /dev/null
+++ b/utils.c
@@ -0,0 +1,171 @@
+/*
+ * utils.c - Some helper code for debugging and error logging.
+ */
+#include "utils.h"
+
+/*
+ * Helper function to skip past all spaces (and comments)
+ * Returns a pointer to the first non-space character.
+ */
+const char *after_spaces(const char *str)
+{
+ // Skip whitespace and comments:
+ skip_whitespace:
+ switch (*str) {
+ case ' ': case '\r': case '\n': case '\t': {
+ ++str;
+ goto skip_whitespace;
+ }
+ case '#': {
+ while (*str && *str != '\n') ++str;
+ goto skip_whitespace;
+ }
+ }
+ return str;
+}
+
+/*
+ * Return the first character after a valid BPEG name, or NULL if none is
+ * found.
+ */
+const char *after_name(const char *str)
+{
+ if (*str == '^' || *str == '_' || *str == '$') {
+ return (str[1] == *str) ? &str[2] : &str[1];
+ }
+ if (!isalpha(*str)) return NULL;
+ for (++str; *str; ++str) {
+ if (!(isalnum(*str) || *str == '-'))
+ break;
+ }
+ return str;
+}
+
+/*
+ * Check if a character is found and if so, move past it.
+ */
+int matchchar(const char **str, char c)
+{
+ *str = after_spaces(*str);
+ if (**str == c) {
+ ++(*str);
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
+/*
+ * Process a string escape sequence for a character and return the
+ * character that was escaped.
+ * Set *end = the first character past the end of the escape sequence.
+ */
+char unescapechar(const char *escaped, const char **end)
+{
+ size_t len = 1;
+ char ret = *escaped;
+ switch (*escaped) {
+ case 'a': ret = '\a'; break; case 'b': ret = '\b'; break;
+ case 'n': ret = '\n'; break; case 'r': ret = '\r'; break;
+ case 't': ret = '\t'; break; case 'v': ret = '\v'; break;
+ case 'e': ret = '\033'; break;
+ case 'x': { // Hex
+ static const char hextable[255] = {
+ ['0']=0x10, ['1']=0x1, ['2']=0x2, ['3']=0x3, ['4']=0x4,
+ ['5']=0x5, ['6']=0x6, ['7']=0x7, ['8']=0x8, ['9']=0x9,
+ ['a']=0xa, ['b']=0xb, ['c']=0xc, ['d']=0xd, ['e']=0xe, ['f']=0xf,
+ ['A']=0xa, ['B']=0xb, ['C']=0xc, ['D']=0xd, ['E']=0xe, ['F']=0xf,
+ };
+ if (hextable[(int)escaped[1]] && hextable[(int)escaped[2]]) {
+ ret = (hextable[(int)escaped[1]] << 4) | (hextable[(int)escaped[2]] & 0xF);
+ len = 3;
+ }
+ break;
+ }
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // Octal
+ ret = escaped[0] - '0';
+ if ('0' <= escaped[1] && escaped[1] <= '7') {
+ ++len;
+ ret = (ret << 3) | (escaped[1] - '0');
+ if ('0' <= escaped[2] && escaped[2] <= '7') {
+ ++len;
+ ret = (ret << 3) | (escaped[2] - '0');
+ }
+ }
+ break;
+ }
+ default: break;
+ }
+ *end = &escaped[len];
+ return ret;
+}
+
+/*
+ * Write an unescaped version of `src` to `dest` (at most bufsize-1 chars,
+ * terminated by a null byte)
+ */
+size_t unescape_string(char *dest, const char *src, size_t bufsize)
+{
+ size_t len = 0;
+#define PUT(c) do { *(dest++) = (char)(c); ++len; } while (0)
+ for ( ; *src && len < bufsize; ++src) {
+ if (*src != '\\') {
+ PUT(*src);
+ continue;
+ }
+ ++src;
+ switch (*src) {
+ case 'a': PUT('\a'); break; case 'b': PUT('\b'); break;
+ case 'n': PUT('\n'); break; case 'r': PUT('\r'); break;
+ case 't': PUT('\t'); break; case 'v': PUT('\v'); break;
+ case 'e': PUT('\033'); break;
+ case 'x': { // Hex
+ static const char hextable[255] = {
+ ['0']=0x10, ['1']=0x1, ['2']=0x2, ['3']=0x3, ['4']=0x4,
+ ['5']=0x5, ['6']=0x6, ['7']=0x7, ['8']=0x8, ['9']=0x9,
+ ['a']=0xa, ['b']=0xb, ['c']=0xc, ['d']=0xd, ['e']=0xe, ['f']=0xf,
+ ['A']=0xa, ['B']=0xb, ['C']=0xc, ['D']=0xd, ['E']=0xe, ['F']=0xf,
+ };
+ if (hextable[(int)src[1]] && hextable[(int)src[2]]) {
+ PUT((hextable[(int)src[1]] << 4) | (hextable[(int)src[2]] & 0xF));
+ src += 2;
+ }
+ break;
+ }
+ case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // Octal
+ int c = *src - '0';
+ if ('0' <= src[1] && src[1] <= '7') {
+ ++src;
+ c = (c << 3) | (*src - '0');
+ if ('0' <= src[1] && src[1] <= '7' && (c << 3) < 256) {
+ ++src;
+ c = (c << 3) | (*src - '0');
+ }
+ }
+ PUT(c);
+ break;
+ }
+ default: PUT(*src); break;
+ }
+ }
+ *dest = '\0';
+ return len;
+#undef PUT
+}
+
+/*
+ * Read an entire file into memory.
+ */
+char *readfile(int fd)
+{
+ size_t capacity = 1000, len = 0;
+ char *buf = calloc(sizeof(char), capacity+1);
+ ssize_t just_read;
+ while ((just_read=read(fd, &buf[len], capacity-len)) > 0) {
+ len += (size_t)just_read;
+ if (len >= capacity)
+ buf = realloc(buf, (capacity *= 2));
+ }
+ return buf;
+}
+
diff --git a/utils.h b/utils.h
index 714f17e..c2dd04a 100644
--- a/utils.h
+++ b/utils.h
@@ -1,120 +1,27 @@
/*
- * utils.h - Some helper code for debugging and error logging.
+ * utils.h - Some utility and printing functions.
*/
+#ifndef UTILS__H
+#define UTILS__H
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "vm.h"
#define streq(a, b) (strcmp(a, b) == 0)
// TODO: better error reporting
#define check(cond, ...) do { if (!(cond)) { fprintf(stderr, __VA_ARGS__); fwrite("\n", 1, 1, stderr); _exit(1); } } while(0)
#define debug(...) do { if (verbose) fprintf(stderr, __VA_ARGS__); } while(0)
-static int verbose = 0;
-static int visualize_delay = -1;
-
-/*
- * Helper function to skip past all spaces (and comments)
- * Returns a pointer to the first non-space character.
- */
-static inline const char *after_spaces(const char *str)
-{
- // Skip whitespace and comments:
- skip_whitespace:
- switch (*str) {
- case ' ': case '\r': case '\n': case '\t': {
- ++str;
- goto skip_whitespace;
- }
- case '#': {
- while (*str && *str != '\n') ++str;
- goto skip_whitespace;
- }
- }
- return str;
-}
-
-static inline const char *after_name(const char *str)
-{
- if (!isalpha(*str)) return NULL;
- for (++str; *str; ++str) {
- if (!(isalnum(*str) || *str == '-'))
- break;
- }
- return str;
-}
+char *readfile(int fd);
+char unescapechar(const char *escaped, const char **end);
+const char *after_name(const char *str);
+const char *after_spaces(const char *str);
+int matchchar(const char **str, char c);
+size_t unescape_string(char *dest, const char *src, size_t bufsize);
-static inline int matchchar(const char **str, char c)
-{
- *str = after_spaces(*str);
- if (**str == c) {
- ++(*str);
- return 1;
- } else {
- return 0;
- }
-}
-
-static void visualize(const char *source, const char *ptr, const char *msg)
-{
- if (!verbose) return;
- fprintf(stderr, "\033[0;1m\r\033[2A\033[K%.*s\033[0;2m%s\033[0m\n",
- (int)(ptr-source), source, ptr);
- fprintf(stderr, "\033[0;1m");
- for (--ptr ; ptr > source; --ptr) putc(' ', stderr);
- fprintf(stderr, "^\033[K\n");
- if (msg)
- fprintf(stderr, "\033[K\033[33;1m%s\033[0m", msg);
- if (visualize_delay > 0)
- usleep(visualize_delay);
-}
-
-/*
- * Write an unescaped version of `src` to `dest` (at most bufsize-1 chars,
- * terminated by a null byte)
- */
-static size_t unescape_string(char *dest, const char *src, size_t bufsize)
-{
- size_t len = 0;
-#define PUT(c) do { *(dest++) = (char)(c); ++len; } while (0)
- for ( ; *src && len < bufsize; ++src) {
- if (*src != '\\') {
- PUT(*src);
- continue;
- }
- ++src;
- switch (*src) {
- case 'a': PUT('\a'); break; case 'b': PUT('\b'); break;
- case 'n': PUT('\n'); break; case 'r': PUT('\r'); break;
- case 't': PUT('\t'); break; case 'v': PUT('\v'); break;
- case 'e': PUT('\033'); break;
- case 'x': { // Hex
- static const char hextable[255] = {
- ['0']=0x10, ['1']=0x1, ['2']=0x2, ['3']=0x3, ['4']=0x4,
- ['5']=0x5, ['6']=0x6, ['7']=0x7, ['8']=0x8, ['9']=0x9,
- ['a']=0xa, ['b']=0xb, ['c']=0xc, ['d']=0xd, ['e']=0xe, ['f']=0xf,
- ['A']=0xa, ['B']=0xb, ['C']=0xc, ['D']=0xd, ['E']=0xe, ['F']=0xf,
- };
- if (hextable[(int)src[1]] && hextable[(int)src[2]]) {
- PUT((hextable[(int)src[1]] << 4) | (hextable[(int)src[2]] & 0xF));
- src += 2;
- }
- break;
- }
- case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // Octal
- int c = *src - '0';
- if ('0' <= src[1] && src[1] <= '7') {
- ++src;
- c = (c << 3) | (*src - '0');
- if ('0' <= src[1] && src[1] <= '7' && (c << 3) < 256) {
- ++src;
- c = (c << 3) | (*src - '0');
- }
- }
- PUT(c);
- break;
- }
- default: PUT(*src); break;
- }
- }
- *dest = '\0';
- return len;
-#undef PUT
-}
+#endif
diff --git a/vm.c b/vm.c
new file mode 100644
index 0000000..50b136a
--- /dev/null
+++ b/vm.c
@@ -0,0 +1,449 @@
+/*
+ * vm.c - Code for the BPEG virtual machine that performs the matching.
+ */
+#include "vm.h"
+#include "utils.h"
+
+/*
+ * Recursively deallocate a match object and set to NULL
+ */
+void destroy_match(match_t **m)
+{
+ if (!m || !*m) return;
+ destroy_match(&((*m)->child));
+ destroy_match(&((*m)->nextsibling));
+ *m = NULL;
+}
+
+/*
+ * Run virtual machine operation against a string and return
+ * a match struct, or NULL if no match is found.
+ * The returned value should be free()'d to avoid memory leaking.
+ */
+match_t *match(grammar_t *g, const char *str, vm_op_t *op)
+{
+ //tailcall:
+ switch (op->op) {
+ case VM_EMPTY: {
+ match_t *m = calloc(sizeof(match_t), 1);
+ m->op = op;
+ m->start = str;
+ m->end = str;
+ return m;
+ }
+ case VM_ANYCHAR: {
+ if (!*str || (!op->multiline && *str == '\n'))
+ return NULL;
+ match_t *m = calloc(sizeof(match_t), 1);
+ m->op = op;
+ m->start = str;
+ m->end = str+1;
+ return m;
+ }
+ case VM_STRING: {
+ if (strncmp(str, op->args.s, op->len) != 0)
+ return NULL;
+ match_t *m = calloc(sizeof(match_t), 1);
+ m->op = op;
+ m->start = str;
+ m->end = str + op->len;
+ return m;
+ }
+ case VM_RANGE: {
+ if (*str < op->args.range.low || *str > op->args.range.high)
+ return NULL;
+ match_t *m = calloc(sizeof(match_t), 1);
+ m->op = op;
+ m->start = str;
+ m->end = str + 1;
+ return m;
+ }
+ case VM_NOT: case VM_ANYTHING_BUT: {
+ if (op->op == VM_ANYTHING_BUT)
+ if (!*str || (!op->multiline && *str == '\n'))
+ return NULL;
+ match_t *m = match(g, str, op->args.pat);
+ if (m != NULL) {
+ destroy_match(&m);
+ return NULL;
+ }
+ m = calloc(sizeof(match_t), 1);
+ m->op = op;
+ m->start = str;
+ if (op->op == VM_ANYTHING_BUT) ++str;
+ m->end = str;
+ return m;
+ }
+ case VM_UPTO_AND: {
+ match_t *m = calloc(sizeof(match_t), 1);
+ m->start = str;
+ m->op = op;
+ match_t *p = NULL;
+ for (const char *prev = NULL; p == NULL && prev < str; ) {
+ prev = str;
+ p = match(g, str, op->args.pat);
+ if (*str && (op->multiline || *str != '\n'))
+ ++str;
+ }
+ if (p) {
+ m->end = p->end;
+ m->child = p;
+ return m;
+ }
+ destroy_match(&m);
+ return NULL;
+ }
+ case VM_REPEAT: {
+ match_t *m = calloc(sizeof(match_t), 1);
+ m->start = str;
+ m->end = str;
+ m->op = op;
+ if (op->args.repetitions.max == 0) return m;
+
+ match_t **dest = &m->child;
+
+ const char *prev = str;
+ size_t reps;
+ for (reps = 0; reps < (size_t)op->args.repetitions.max; ++reps) {
+ // Separator
+ match_t *sep = NULL;
+ if (op->args.repetitions.sep != NULL && reps > 0) {
+ sep = match(g, str, op->args.repetitions.sep);
+ if (sep == NULL) break;
+ str = sep->end;
+ }
+ match_t *p = match(g, str, op->args.repetitions.repeat_pat);
+ if (p == NULL || (p->end == prev && reps > 0)) { // Prevent infinite loops
+ destroy_match(&sep);
+ destroy_match(&p);
+ break;
+ }
+ if (sep) {
+ *dest = sep;
+ dest = &sep->nextsibling;
+ }
+ *dest = p;
+ dest = &p->nextsibling;
+ str = p->end;
+ prev = str;
+ }
+
+ if ((ssize_t)reps < op->args.repetitions.min) {
+ destroy_match(&m);
+ return NULL;
+ }
+ m->end = str;
+ return m;
+ }
+ case VM_AFTER: {
+ ssize_t backtrack = op->args.pat->len;
+ check(backtrack != -1, "'<' is only allowed for fixed-length operations");
+ // Check for necessary space:
+ for (int i = 0; i < backtrack; i++) {
+ if (str[-i] == '\0') return NULL;
+ }
+ match_t *before = match(g, str - backtrack, op->args.pat);
+ if (before == NULL) return NULL;
+ destroy_match(&before);
+ match_t *m = calloc(sizeof(match_t), 1);
+ m->start = str;
+ m->end = str;
+ m->op = op;
+ return m;
+ }
+ case VM_BEFORE: {
+ match_t *after = match(g, str, op->args.pat);
+ if (after == NULL) return NULL;
+ destroy_match(&after);
+ match_t *m = calloc(sizeof(match_t), 1);
+ m->start = str;
+ m->end = str;
+ m->op = op;
+ return m;
+ }
+ case VM_CAPTURE: {
+ match_t *p = match(g, str, op->args.pat);
+ if (p == NULL) return NULL;
+ match_t *m = calloc(sizeof(match_t), 1);
+ m->start = str;
+ m->end = p->end;
+ m->op = op;
+ m->child = p;
+ m->is_capture = 1;
+ if (op->args.capture.name)
+ m->name_or_replacement = op->args.capture.name;
+ return m;
+ }
+ case VM_OTHERWISE: {
+ match_t *m = match(g, str, op->args.multiple.first);
+ if (m == NULL) m = match(g, str, op->args.multiple.second);
+ return m;
+ }
+ case VM_CHAIN: {
+ match_t *m1 = match(g, str, op->args.multiple.first);
+ if (m1 == NULL) return NULL;
+ match_t *m2 = match(g, m1->end, op->args.multiple.second);
+ if (m2 == NULL) {
+ destroy_match(&m1);
+ return NULL;
+ }
+ match_t *m = calloc(sizeof(match_t), 1);
+ m->start = str;
+ m->end = m2->end;
+ m->op = op;
+ m->child = m1;
+ m1->nextsibling = m2;
+ return m;
+ }
+ case VM_REPLACE: {
+ match_t *m = calloc(sizeof(match_t), 1);
+ m->start = str;
+ m->op = op;
+ if (op->args.replace.replace_pat) {
+ match_t *p = match(g, str, op->args.replace.replace_pat);
+ if (p == NULL) return NULL;
+ m->child = p;
+ m->end = p->end;
+ } else {
+ m->end = m->start;
+ }
+ m->is_replacement = 1;
+ m->name_or_replacement = op->args.replace.replacement;
+ return m;
+ }
+ case VM_REF: {
+ // Search backwards so newer defs take precedence
+ for (int i = g->size-1; i >= 0; i--) {
+ if (streq(g->definitions[i].name, op->args.s)) {
+ // Bingo!
+ /*
+ op = g->definitions[i].op;
+ goto tailcall;
+ */
+ match_t *p = match(g, str, g->definitions[i].op);
+ if (p == NULL) return NULL;
+ match_t *m = calloc(sizeof(match_t), 1);
+ m->start = p->start;
+ m->end = p->end;
+ m->op = op;
+ m->child = p;
+ m->name_or_replacement = g->definitions[i].name;
+ m->is_ref = 1;
+ return m;
+ }
+ }
+ check(0, "Unknown identifier: '%s'", op->args.s);
+ return NULL;
+ }
+ default: {
+ fprintf(stderr, "Unknown opcode: %d", op->op);
+ _exit(1);
+ return NULL;
+ }
+ }
+}
+
+
+void print_pattern(vm_op_t *op)
+{
+ switch (op->op) {
+ case VM_REF: fprintf(stderr, "a $%s", op->args.s); break;
+ case VM_EMPTY: fprintf(stderr, "the empty string"); break;
+ case VM_ANYCHAR: fprintf(stderr, "any char"); break;
+ case VM_STRING: fprintf(stderr, "string \"%s\"", op->args.s); break;
+ case VM_RANGE: {
+ fprintf(stderr, "char from %c-%c", op->args.range.low, op->args.range.high);
+ break;
+ }
+ case VM_REPEAT: {
+ if (op->args.repetitions.max == -1)
+ fprintf(stderr, "%ld or more (", op->args.repetitions.min);
+ else
+ fprintf(stderr, "%ld-%ld of (",
+ op->args.repetitions.min,
+ op->args.repetitions.max);
+ print_pattern(op->args.repetitions.repeat_pat);
+ fprintf(stderr, ")");
+ if (op->args.repetitions.sep) {
+ fprintf(stderr, " separated by (");
+ print_pattern(op->args.repetitions.sep);
+ fprintf(stderr, ")");
+ }
+ break;
+ }
+ case VM_NOT: {
+ fprintf(stderr, "not (");
+ print_pattern(op->args.pat);
+ fprintf(stderr, ")");
+ break;
+ }
+ case VM_UPTO_AND: {
+ fprintf(stderr, "text up to and including (");
+ print_pattern(op->args.pat);
+ fprintf(stderr, ")");
+ break;
+ }
+ case VM_ANYTHING_BUT: {
+ fprintf(stderr, "anything but (");
+ print_pattern(op->args.pat);
+ fprintf(stderr, ")");
+ break;
+ }
+ case VM_AFTER: {
+ fprintf(stderr, "after (");
+ print_pattern(op->args.pat);
+ fprintf(stderr, ")");
+ break;
+ }
+ case VM_BEFORE: {
+ fprintf(stderr, "before (");
+ print_pattern(op->args.pat);
+ fprintf(stderr, ")");
+ break;
+ }
+ case VM_CAPTURE: {
+ fprintf(stderr, "capture (");
+ print_pattern(op->args.pat);
+ fprintf(stderr, ")");
+ if (op->args.capture.name)
+ fprintf(stderr, " and call it %s", op->args.capture.name);
+ break;
+ }
+ case VM_OTHERWISE: {
+ fprintf(stderr, "(");
+ print_pattern(op->args.multiple.first);
+ fprintf(stderr, ") or else ");
+ if (op->args.multiple.second->op != VM_OTHERWISE)
+ fprintf(stderr, "(");
+ print_pattern(op->args.multiple.second);
+ if (op->args.multiple.second->op != VM_OTHERWISE)
+ fprintf(stderr, ")");
+ break;
+ }
+ case VM_CHAIN: {
+ fprintf(stderr, "(");
+ print_pattern(op->args.multiple.first);
+ fprintf(stderr, ") then ");
+ if (op->args.multiple.second->op != VM_CHAIN)
+ fprintf(stderr, "(");
+ print_pattern(op->args.multiple.second);
+ if (op->args.multiple.second->op != VM_CHAIN)
+ fprintf(stderr, ")");
+ break;
+ }
+ case VM_REPLACE: {
+ fprintf(stderr, "replace ");
+ if (op->args.replace.replace_pat) {
+ fprintf(stderr, "(");
+ print_pattern(op->args.replace.replace_pat);
+ fprintf(stderr, ")");
+ } else
+ fprintf(stderr, "\"\"");
+ fprintf(stderr, " with \"%s\"", op->args.replace.replacement);
+ break;
+ }
+ default: break;
+ }
+}
+
+/*
+ * Get a specific numbered pattern capture.
+ */
+static match_t *get_capture_n(match_t *m, int *n)
+{
+ if (!m) return NULL;
+ if (*n == 0) return m;
+ if (m->is_capture && *n == 1) return m;
+ if (m->is_capture) --(*n);
+ for (match_t *c = m->child; c; c = c->nextsibling) {
+ match_t *cap = get_capture_n(c, n);
+ if (cap) return cap;
+ }
+ return NULL;
+}
+
+/*
+ * Get a named capture.
+ */
+static match_t *get_capture_named(match_t *m, const char *name)
+{
+ if (m->is_capture && m->name_or_replacement && streq(m->name_or_replacement, name))
+ return m;
+ for (match_t *c = m->child; c; c = c->nextsibling) {
+ match_t *cap = get_capture_named(c, name);
+ if (cap) return cap;
+ }
+ return NULL;
+}
+
+/*
+ * Print a match with replacements and highlighting.
+ */
+void print_match(match_t *m, const char *color, int verbose)
+{
+ if (m->is_replacement) {
+ printf("\033[0;34m");
+ for (const char *r = m->name_or_replacement; *r; ) {
+ if (*r == '\\') {
+ fputc(unescapechar(r, &r), stdout);
+ continue;
+ } else if (*r != '@') {
+ fputc(*r, stdout);
+ ++r;
+ continue;
+ }
+
+ ++r;
+ match_t *cap = NULL;
+ switch (*r) {
+ case '0': case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9': {
+ int n = (int)strtol(r, (char**)&r, 10);
+ cap = get_capture_n(m->child, &n);
+ break;
+ }
+ case '[': {
+ char *closing = strchr(r+1, ']');
+ if (!closing) {
+ fputc('@', stdout);
+ break;
+ }
+ ++r;
+ char *name = strndup(r, (size_t)(closing-r));
+ cap = get_capture_named(m, name);
+ free(name);
+ r = closing + 1;
+ break;
+ }
+ default: {
+ fputc('@', stdout);
+ break;
+ }
+ }
+ if (cap != NULL) {
+ print_match(cap, "\033[0;35m", verbose);
+ printf("\033[0;34m");
+ }
+ }
+ } else {
+ const char *name = m->name_or_replacement;
+ if (verbose && m->is_ref && name && isupper(name[0]))
+ printf("\033[0;2;35m{%s:", name);
+ //if (m->is_capture && name)
+ // printf("\033[0;2;33m[%s:", name);
+ const char *prev = m->start;
+ for (match_t *child = m->child; child; child = child->nextsibling) {
+ if (child->start > prev)
+ printf("%s%.*s", color, (int)(child->start - prev), prev);
+ print_match(child, m->is_capture ? "\033[0;1m" : color, verbose);
+ prev = child->end;
+ }
+ if (m->end > prev)
+ printf("%s%.*s", color, (int)(m->end - prev), prev);
+ if (verbose && m->is_ref && name && isupper(name[0]))
+ printf("\033[0;2;35m}");
+ //if (m->is_capture && name)
+ // printf("\033[0;2;33m]");
+ }
+}
+
diff --git a/vm.h b/vm.h
new file mode 100644
index 0000000..965a040
--- /dev/null
+++ b/vm.h
@@ -0,0 +1,18 @@
+/*
+ * vm.h - Header file for BPEG virtual machine.
+ */
+#ifndef VM__H
+#define VM__H
+
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+
+#include "types.h"
+
+match_t *match(grammar_t *g, const char *str, vm_op_t *op);
+void destroy_match(match_t **m);
+void print_pattern(vm_op_t *op);
+void print_match(match_t *m, const char *color, int verbose);
+
+#endif