aboutsummaryrefslogtreecommitdiff
path: root/bpeg.c
diff options
context:
space:
mode:
authorBruce Hill <bruce@bruce-hill.com>2020-09-11 01:28:06 -0700
committerBruce Hill <bruce@bruce-hill.com>2020-09-11 01:28:06 -0700
commit2a11acc66738d6300bfa90a22adcd540371060f3 (patch)
tree4966c9864434da3e43594f0d86dd9362f367fd8e /bpeg.c
parent8a846230f7b4269af08a6b6496ebd28c4ff459ba (diff)
Major overhaul refactor restructuring
Diffstat (limited to 'bpeg.c')
-rw-r--r--bpeg.c1136
1 files changed, 37 insertions, 1099 deletions
diff --git a/bpeg.c b/bpeg.c
index 41f7238..23fa6fc 100644
--- a/bpeg.c
+++ b/bpeg.c
@@ -31,1089 +31,38 @@
* <pat> / <alt> <pat> otherwise <alt>
* ; <name> = <pat> <name> is defined to be <pat>
*/
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "compiler.h"
+#include "grammar.h"
+#include "utils.h"
+#include "vm.h"
+
+static const char *usage = (
+ "Usage:\n"
+ " bpeg [flags] <pattern> [<input files>...]\n\n"
+ "Flags:\n"
+ " -h --help\t print the usage and quit\n"
+ " -v --verbose\t print verbose debugging info\n"
+ " -s --slow\t run in slow mode for debugging\n"
+ " -r --replace <replacement> replace the input pattern with the given replacement\n"
+ " -g --grammar <grammar file> use the specified file as a grammar\n");
-#include "bpeg.h"
-
-/*
- * Recursively deallocate a match object and return NULL
- */
-static match_t *free_match(match_t *m)
-{
- if (m->child) m->child = free_match(m->child);
- if (m->nextsibling) m->nextsibling = free_match(m->nextsibling);
- free(m);
- return NULL;
-}
-
-/*
- * Run virtual machine operation against a string and return
- * a match struct, or NULL if no match is found.
- * The returned value should be free()'d to avoid memory leaking.
- */
-static match_t *match(const char *str, vm_op_t *op)
-{
- //tailcall:
- switch (op->op) {
- case VM_EMPTY: {
- match_t *m = calloc(sizeof(match_t), 1);
- m->op = op;
- m->start = str;
- m->end = str;
- return m;
- }
- case VM_ANYCHAR: {
- if (!*str || (!op->multiline && *str == '\n'))
- return NULL;
- match_t *m = calloc(sizeof(match_t), 1);
- m->op = op;
- m->start = str;
- m->end = str+1;
- return m;
- }
- case VM_STRING: {
- if (strncmp(str, op->args.s, op->len) != 0)
- return NULL;
- match_t *m = calloc(sizeof(match_t), 1);
- m->op = op;
- m->start = str;
- m->end = str + op->len;
- return m;
- }
- case VM_RANGE: {
- if (*str < op->args.range.low || *str > op->args.range.high)
- return NULL;
- match_t *m = calloc(sizeof(match_t), 1);
- m->op = op;
- m->start = str;
- m->end = str + 1;
- return m;
- }
- case VM_NOT: case VM_ANYTHING_BUT: {
- if (op->op == VM_ANYTHING_BUT)
- if (!*str || (!op->multiline && *str == '\n'))
- return NULL;
- match_t *m = match(str, op->args.pat);
- if (m != NULL) {
- m = free_match(m);
- return NULL;
- }
- m = calloc(sizeof(match_t), 1);
- m->op = op;
- m->start = str;
- if (op->op == VM_ANYTHING_BUT) ++str;
- m->end = str;
- return m;
- }
- case VM_UPTO_AND: {
- match_t *m = calloc(sizeof(match_t), 1);
- m->start = str;
- m->op = op;
- match_t *p = NULL;
- for (const char *prev = NULL; p == NULL && prev < str; ) {
- prev = str;
- p = match(str, op->args.pat);
- if (*str && (op->multiline || *str != '\n'))
- ++str;
- }
- if (p) {
- m->end = p->end;
- m->child = p;
- return m;
- }
- m = free_match(m);
- return NULL;
- }
- case VM_REPEAT: {
- match_t *m = calloc(sizeof(match_t), 1);
- m->start = str;
- m->end = str;
- m->op = op;
- if (op->args.repetitions.max == 0) return m;
-
- match_t **dest = &m->child;
-
- const char *prev = str;
- size_t reps;
- for (reps = 0; reps < (size_t)op->args.repetitions.max; ++reps) {
- // Separator
- match_t *sep = NULL;
- if (op->args.repetitions.sep != NULL && reps > 0) {
- sep = match(str, op->args.repetitions.sep);
- if (sep == NULL) break;
- str = sep->end;
- }
- match_t *p = match(str, op->args.repetitions.repeat_pat);
- if (p == NULL || (p->end == prev && reps > 0)) { // Prevent infinite loops
- if (sep) sep = free_match(sep);
- if (p) p = free_match(p);
- break;
- }
- if (sep) {
- *dest = sep;
- dest = &sep->nextsibling;
- }
- *dest = p;
- dest = &p->nextsibling;
- str = p->end;
- prev = str;
- }
-
- if ((ssize_t)reps < op->args.repetitions.min) {
- m = free_match(m);
- return NULL;
- }
- m->end = str;
- return m;
- }
- case VM_AFTER: {
- ssize_t backtrack = op->args.pat->len;
- check(backtrack != -1, "'<' is only allowed for fixed-length operations");
- // Check for necessary space:
- for (int i = 0; i < backtrack; i++) {
- if (str[-i] == '\0') return NULL;
- }
- match_t *before = match(str - backtrack, op->args.pat);
- if (before == NULL) return NULL;
- before = free_match(before);
- match_t *m = calloc(sizeof(match_t), 1);
- m->start = str;
- m->end = str;
- m->op = op;
- return m;
- }
- case VM_BEFORE: {
- match_t *after = match(str, op->args.pat);
- if (after == NULL) return NULL;
- after = free_match(after);
- match_t *m = calloc(sizeof(match_t), 1);
- m->start = str;
- m->end = str;
- m->op = op;
- return m;
- }
- case VM_CAPTURE: {
- match_t *p = match(str, op->args.pat);
- if (p == NULL) return NULL;
- match_t *m = calloc(sizeof(match_t), 1);
- m->start = str;
- m->end = p->end;
- m->op = op;
- m->child = p;
- m->is_capture = 1;
- if (op->args.capture.name)
- m->name_or_replacement = op->args.capture.name;
- return m;
- }
- case VM_OTHERWISE: {
- match_t *m = match(str, op->args.multiple.first);
- if (m == NULL) m = match(str, op->args.multiple.second);
- return m;
- }
- case VM_CHAIN: {
- match_t *m1 = match(str, op->args.multiple.first);
- if (m1 == NULL) return NULL;
- match_t *m2 = match(m1->end, op->args.multiple.second);
- if (m2 == NULL) {
- m1 = free_match(m1);
- return NULL;
- }
- match_t *m = calloc(sizeof(match_t), 1);
- m->start = str;
- m->end = m2->end;
- m->op = op;
- m->child = m1;
- m1->nextsibling = m2;
- return m;
- }
- case VM_REPLACE: {
- match_t *m = calloc(sizeof(match_t), 1);
- m->start = str;
- m->op = op;
- if (op->args.replace.replace_pat) {
- match_t *p = match(str, op->args.replace.replace_pat);
- if (p == NULL) return NULL;
- m->child = p;
- m->end = p->end;
- } else {
- m->end = m->start;
- }
- m->is_replacement = 1;
- m->name_or_replacement = op->args.replace.replacement;
- return m;
- }
- case VM_REF: {
- // Search backwards so newer defs take precedence
- for (int i = ndefs-1; i >= 0; i--) {
- if (streq(defs[i].name, op->args.s)) {
- // Bingo!
- /*
- op = defs[i].op;
- goto tailcall;
- */
- match_t *p = match(str, defs[i].op);
- if (p == NULL) return NULL;
- match_t *m = calloc(sizeof(match_t), 1);
- m->start = p->start;
- m->end = p->end;
- m->op = op;
- m->child = p;
- m->name_or_replacement = defs[i].name;
- m->is_ref = 1;
- return m;
- }
- }
- check(0, "Unknown identifier: '%s'", op->args.s);
- return NULL;
- }
- default: {
- fprintf(stderr, "Unknown opcode: %d", op->op);
- _exit(1);
- return NULL;
- }
- }
-}
-
-/*
- * Helper function to initialize a range object.
- */
-static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep)
-{
- op->op = VM_REPEAT;
- if (pat->len >= 0 && (sep == NULL || sep->len >= 0) && min == max && min >= 0)
- op->len = pat->len * min + (sep == NULL || min == 0 ? 0 : sep->len * (min-1));
- else
- op->len = -1;
- op->args.repetitions.min = min;
- op->args.repetitions.max = max;
- op->args.repetitions.repeat_pat = pat;
- op->args.repetitions.sep = sep;
-}
-
-/*
- * Take an opcode and expand it into a chain of patterns if it's
- * followed by any patterns (e.g. "`x `y"), otherwise return
- * the original input.
- */
-static vm_op_t *expand_chain(const char *source, vm_op_t *first)
-{
- visualize(source, first->end, "Expanding chain...");
- vm_op_t *second = compile_bpeg(source, first->end);
- if (second == NULL) return first;
- second = expand_chain(source, second);
- check(second->end > first->end, "No forward progress in chain!");
- visualize(source, second->end, "Got chained pair.");
- return chain_together(first, second);
-}
-
-/*
- * Take an opcode and expand it into a chain of choices if it's
- * followed by any "/"-separated patterns (e.g. "`x/`y"), otherwise
- * return the original input.
- */
-static vm_op_t *expand_choices(const char *source, vm_op_t *first)
-{
- first = expand_chain(source, first);
- const char *str = first->end;
- if (!matchchar(&str, '/')) return first;
- visualize(source, str, "Expanding choices...");
- //debug("Otherwise:\n");
- vm_op_t *second = compile_bpeg(source, str);
- check(second, "Expected pattern after '/'");
- second = expand_choices(source, second);
- vm_op_t *choice = calloc(sizeof(vm_op_t), 1);
- choice->op = VM_OTHERWISE;
- choice->start = first->start;
- if (first->len == second->len)
- choice->len = first->len;
- else choice->len = -1;
- choice->end = second->end;
- choice->args.multiple.first = first;
- choice->args.multiple.second = second;
- visualize(source, choice->end, "Got two choices");
- return choice;
-}
-
-static char unescapechar(const char *escaped, const char **end)
-{
- size_t len = 1;
- char ret = *escaped;
- switch (*escaped) {
- case 'a': ret = '\a'; break; case 'b': ret = '\b'; break;
- case 'n': ret = '\n'; break; case 'r': ret = '\r'; break;
- case 't': ret = '\t'; break; case 'v': ret = '\v'; break;
- case 'e': ret = '\033'; break;
- case 'x': { // Hex
- static const char hextable[255] = {
- ['0']=0x10, ['1']=0x1, ['2']=0x2, ['3']=0x3, ['4']=0x4,
- ['5']=0x5, ['6']=0x6, ['7']=0x7, ['8']=0x8, ['9']=0x9,
- ['a']=0xa, ['b']=0xb, ['c']=0xc, ['d']=0xd, ['e']=0xe, ['f']=0xf,
- ['A']=0xa, ['B']=0xb, ['C']=0xc, ['D']=0xd, ['E']=0xe, ['F']=0xf,
- };
- if (hextable[(int)escaped[1]] && hextable[(int)escaped[2]]) {
- ret = (hextable[(int)escaped[1]] << 4) | (hextable[(int)escaped[2]] & 0xF);
- len = 3;
- }
- break;
- }
- case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // Octal
- ret = escaped[0] - '0';
- if ('0' <= escaped[1] && escaped[1] <= '7') {
- ++len;
- ret = (ret << 3) | (escaped[1] - '0');
- if ('0' <= escaped[2] && escaped[2] <= '7') {
- ++len;
- ret = (ret << 3) | (escaped[2] - '0');
- }
- }
- break;
- }
- default: break;
- }
- *end = &escaped[len];
- return ret;
-}
-
-static vm_op_t *chain_together(vm_op_t *first, vm_op_t *second)
-{
- if (first == NULL) return second;
- if (second == NULL) return first;
- vm_op_t *chain = calloc(sizeof(vm_op_t), 1);
- chain->op = VM_CHAIN;
- chain->start = first->start;
- if (first->len >= 0 && second->len >= 0)
- chain->len = first->len + second->len;
- else chain->len = -1;
- chain->end = second->end;
- chain->args.multiple.first = first;
- chain->args.multiple.second = second;
- return chain;
-}
-
-/*
- * Compile a string of BPEG code into virtual machine opcodes
- */
-static vm_op_t *compile_bpeg(const char *source, const char *str)
-{
- if (!*str) return NULL;
- visualize(source, str, "Compiling...");
- //debug("Parsing \"%s\"...\n", str);
- str = after_spaces(str);
- check(*str, "Expected a pattern");
- vm_op_t *op = calloc(sizeof(vm_op_t), 1);
- op->start = str;
- op->len = -1;
- char c = *str;
- ++str;
- switch (c) {
- // Any char (dot) ($. is multiline anychar)
- case '.': {
- anychar:
- visualize(source, str, "Dot");
- //debug("Dot\n");
- op->op = VM_ANYCHAR;
- op->len = 1;
- break;
- }
- // Char literals
- case '`': {
- char literal[2] = {*str, '\0'};
- ++str;
- visualize(source, str, "Char literal");
- check(literal[0], "Expected character after '`'\n");
- op->len = 1;
- if (matchchar(&str, '-')) { // Range
- visualize(source, str, "Char range");
- char c2 = *str;
- check(c2, "Expected character after '-'");
- check(c2 >= literal[0], "Character range must be low-to-high");
- op->op = VM_RANGE;
- op->args.range.low = literal[0];
- op->args.range.high = c2;
- ++str;
- } else {
- //debug("Char literal\n");
- op->op = VM_STRING;
- op->args.s = strdup(literal);
- }
- break;
- }
- // Escapes
- case '\\': {
- //debug("Escape sequence\n");
- visualize(source, str, "Escape sequence");
- check(*str, "Expected escape after '\\'");
- op->len = 1;
- char e = unescapechar(str, &str);
- if (*str == '-') { // Escape range (e.g. \x00-\xFF)
- ++str;
- char e2 = unescapechar(str, &str);
- check(e2, "Expected character after '-'");
- check(e2 >= e, "Character range must be low-to-high");
- op->op = VM_RANGE;
- op->args.range.low = e;
- op->args.range.high = e2;
- } else {
- char literal[2] = {e, '\0'};
- op->op = VM_STRING;
- op->args.s = strdup(literal);
- }
- break;
- }
- // String literal
- case '"': case '\'': case '\002': {
- visualize(source, str, "String literal");
- char endquote = c == '\002' ? '\003' : c;
- char *literal = (char*)str;
- for (; *str && *str != endquote; str++) {
- if (*str == '\\') {
- check(str[1], "Expected more string contents after backslash");
- ++str;
- }
- visualize(source, str, "String literal");
- }
- size_t len = (size_t)(str - literal);
- literal = strndup(literal, len);
- len = unescape_string(literal, literal, len);
-
- op->op = VM_STRING;
- op->len = len;
- op->args.s = literal;
-
- check(matchchar(&str, endquote), "Missing closing quote");
- break;
- }
- // Not <pat>
- case '!': {
- // debug("Not pattern\n");
- visualize(source, str, "Not <pat>");
- vm_op_t *p = compile_bpeg(source, str);
- check(p, "Expected pattern after '!'\n");
- str = p->end;
- op->op = VM_NOT;
- op->len = 0;
- op->args.pat = p;
- break;
- }
- // Anything but <pat>
- case '~': {
- if (matchchar(&str, '~')) op->multiline = 1;
- visualize(source, str, "Anything but <pat>");
- vm_op_t *p = compile_bpeg(source, str);
- check(p, "Expected pattern after '~'\n");
- str = p->end;
- op->op = VM_ANYTHING_BUT;
- op->len = -1;
- op->args.pat = p;
- break;
- }
- // Upto and including <pat>
- case '&': {
- if (matchchar(&str, '&')) op->multiline = 1;
- visualize(source, str, "Upto and including <pat>");
- vm_op_t *p = compile_bpeg(source, str);
- check(p, "Expected pattern after '&'\n");
- str = p->end;
- op->op = VM_UPTO_AND;
- op->len = -1;
- op->args.pat = p;
- break;
- }
- // Number of repetitions: <N>(-<N> / - / + / "")
- case '0': case '1': case '2': case '3': case '4': case '5':
- case '6': case '7': case '8': case '9': {
- visualize(source, str, "Repeat <pat>");
- ssize_t min = -1, max = -1;
- --str;
- long n1 = strtol(str, (char**)&str, 10);
- if (matchchar(&str, '-')) {
- str = after_spaces(str);
- const char *start = str;
- long n2 = strtol(str, (char**)&str, 10);
- if (str == start) min = 0, max = n1;
- else min = n1, max = n2;
- } else if (matchchar(&str, '+')) {
- min = n1, max = -1;
- } else {
- min = n1, max = n1;
- }
- visualize(source, str, NULL);
- vm_op_t *pat = compile_bpeg(source, str);
- check(pat, "Expected pattern after repetition count");
- str = pat->end;
- str = after_spaces(str);
- if (matchchar(&str, '%')) {
- visualize(source, str, "Repeat <pat> with separator");
- vm_op_t *sep = compile_bpeg(source, str);
- check(sep, "Expected pattern for separator after '%%'");
- str = sep->end;
- set_range(op, min, max, pat, sep);
- } else {
- set_range(op, min, max, pat, NULL);
- }
- visualize(source, str, NULL);
- //debug("min = %lld max = %lld\n", (long long)op->args.repetitions.min, (long long)op->args.repetitions.max);
- break;
- }
- // Special repetitions:
- case '+': case '*': case '?': {
- //debug("Special repetitions\n");
- visualize(source, str, "Repeat <pat>");
- ssize_t min = -1, max = -1;
- switch (c) {
- case '+': min = 1, max = -1; break;
- case '*': min = 0, max = -1; break;
- case '?': min = 0, max = 1; break;
- }
- vm_op_t *pat = compile_bpeg(source, str);
- check(pat, "Expected pattern after +");
- str = pat->end;
- str = after_spaces(str);
- if (matchchar(&str, '%')) {
- visualize(source, str, "Repeat <pat> with separator");
- vm_op_t *sep = compile_bpeg(source, str);
- check(sep, "Expected pattern for separator after '%%'");
- str = sep->end;
- set_range(op, min, max, pat, sep);
- } else {
- set_range(op, min, max, pat, NULL);
- }
- visualize(source, str, NULL);
- //debug("min = %lld max = %lld\n", (long long)op->args.repetitions.min, (long long)op->args.repetitions.max);
- break;
- }
- // Lookbehind
- case '<': {
- visualize(source, str, "After <pat>");
- //debug("Lookbehind\n");
- vm_op_t *pat = compile_bpeg(source, str);
- check(pat, "Expected pattern after <");
- str = pat->end;
- check(pat->len != -1, "Lookbehind patterns must have a fixed length");
- str = pat->end;
- op->op = VM_AFTER;
- op->len = 0;
- op->args.pat = pat;
- break;
- }
- // Lookahead
- case '>': {
- visualize(source, str, "Before <pat>");
- //debug("Lookahead\n");
- vm_op_t *pat = compile_bpeg(source, str);
- check(pat, "Expected pattern after >");
- str = pat->end;
- op->op = VM_BEFORE;
- op->len = 0;
- op->args.pat = pat;
- break;
- }
- // Parentheses
- case '(': {
- visualize(source, str, NULL);
- // debug("Open paren (\n");
- free(op);
- op = compile_bpeg(source, str);
- check(op, "Expected pattern inside parentheses");
- op = expand_choices(source, op);
- str = op->end;
- str = after_spaces(str);
- check(matchchar(&str, ')'), "Expected closing parenthesis");
- visualize(source, str, NULL);
- // debug(")\n");
- break;
- }
- // Capture
- case '@': {
- //debug("Capture\n");
- visualize(source, str, "Capture");
- op->op = VM_CAPTURE;
- str = after_spaces(str);
- if (matchchar(&str, '[')) {
- char *closing = strchr(str, ']');
- check(closing, "Expected closing ']'");
- op->args.capture.name = strndup(str, (size_t)(closing-str));
- visualize(source, str, "Named capture");
- //debug("named \"%s\"\n", op->args.capture.name);
- str = closing;
- check(matchchar(&str, ']'), "Expected closing ']'");
- }
- vm_op_t *pat = compile_bpeg(source, str);
- check(pat, "Expected pattern after @");
- str = pat->end;
- op->args.capture.capture_pat = pat;
- op->len = pat->len;
- visualize(source, str, NULL);
- break;
- }
- // Replacement
- case '{': {
- //debug("Replacement {\n");
- visualize(source, str, "Replacement");
- str = after_spaces(str);
- vm_op_t *pat = NULL;
- if (strncmp(str, "=>", 2) == 0) {
- str += strlen("=>");
- } else {
- pat = compile_bpeg(source, str);
- check(pat, "Invalid pattern after '{'");
- pat = expand_choices(source, pat);
- str = pat->end;
- str = after_spaces(str);
- check(matchchar(&str, '=') && matchchar(&str, '>'),
- "Expected '=>' after pattern in replacement");
- }
- visualize(source, str, NULL);
- str = after_spaces(str);
-
- char quote = *str;
- const char *replacement;
- if (matchchar(&str, '}')) {
- replacement = strdup("");
- visualize(source, str, NULL);
- } else {
- check(matchchar(&str, '"') || matchchar(&str, '\''),
- "Expected string literal for replacement");
- replacement = str;
- for (; *str && *str != quote; str++) {
- if (*str == '\\') {
- check(str[1], "Expected more string contents after backslash");
- ++str;
- }
- visualize(source, str, NULL);
- }
- replacement = strndup(replacement, (size_t)(str-replacement));
- check(matchchar(&str, quote), "Expected closing quote");
- check(matchchar(&str, '}'), "Expected a closing '}'");
- }
- op->op = VM_REPLACE;
- op->args.replace.replace_pat = pat;
- op->args.replace.replacement = replacement;
- //debug(" rep = \"%s\"\n", replacement);
- //debug("}\n");
- if (pat != NULL) op->len = pat->len;
- visualize(source, str, NULL);
- break;
- }
- // Special rules:
- case '_': case '^': case '$': {
- if (matchchar(&str, c)) { // double __, ^^, $$
- char tmp[3] = {c, c, '\0'};
- op->args.s = strdup(tmp);
- } else if (c == '$' && matchchar(&str, '.')) { // $. (multi-line anychar)
- op->multiline = 1;
- goto anychar;
- } else {
- op->args.s = strndup(&c, 1);
- }
- op->op = VM_REF;
- visualize(source, str, op->args.s);
- break;
- }
- // Empty choice (/) or {/}
- case '/': {
- str = after_spaces(str);
- if (*str == ')' || *str == '}') {
- op->op = VM_EMPTY;
- } else {
- free(op);
- return NULL;
- }
- break;
- }
- default: {
- // Reference
- if (isalpha(c)) {
- visualize(source, str, "Ref");
- --str;
- const char *refname = str;
- str = after_name(str);
- op->op = VM_REF;
- op->len = (size_t)(str - refname);
- op->args.s = strndup(refname, op->len);
- break;
- } else {
- visualize(source, str, "Finished");
- free(op);
- return NULL;
- }
- }
- }
- op->end = str;
- return op;
-}
-
-/*
- * Similar to compile_bpeg, except that the pattern begins with an implicit, unclosable quote.
- */
-static vm_op_t *compile_bpeg_string(const char *source, const char *str)
-{
- visualize(source, str, "Compiling string...");
- vm_op_t *ret = NULL;
- while (*str) {
- vm_op_t *strop = calloc(sizeof(vm_op_t), 1);
- strop->start = str;
- strop->len = 0;
- strop->op = VM_STRING;
- char *literal = (char*)str;
- vm_op_t *interp = NULL;
- for (; *str; str++) {
- if (*str == '\\') {
- check(str[1], "Expected more string contents after backslash");
- interp = compile_bpeg(source, str + 1);
- check(interp != NULL, "No valid BPEG pattern detected after backslash");
- break;
- }
- visualize(source, str, "String literal");
- }
- // End of string
- size_t len = (size_t)(str - literal);
- literal = strndup(literal, len);
- len = unescape_string(literal, literal, len);
- strop->len = len;
- strop->args.s = literal;
- strop->end = str;
-
- if (strop->len == 0) {
- free(strop);
- strop = NULL;
- } else {
- ret = chain_together(ret, strop);
- }
- if (interp) {
- ret = chain_together(ret, interp);
- str = interp->end;
- // allow terminating seq
- matchchar(&str, ';');
- }
- }
- return ret;
-}
-
-static vm_op_t *compile_bpeg_replacement(vm_op_t *pat, const char *str)
-{
- vm_op_t *op = calloc(sizeof(vm_op_t), 1);
- op->op = VM_REPLACE;
- op->start = pat->start;
- op->len = pat->len;
- op->args.replace.replace_pat = pat;
- const char *replacement = str;
- for (; *str; str++) {
- if (*str == '\\') {
- check(str[1], "Expected more string contents after backslash");
- ++str;
- }
- }
- replacement = strndup(replacement, (size_t)(str-replacement));
- op->args.replace.replacement = replacement;
- return op;
-}
-
-static vm_op_t *add_def(const char *name, const char *source, vm_op_t *op)
-{
- check(ndefs < (sizeof(defs)/sizeof(defs[0])), "Too many definitions!");
- defs[ndefs].name = name;
- defs[ndefs].op = op;
- defs[ndefs].source = source;
- ++ndefs;
- return op;
-}
-
-static vm_op_t *load_def(const char *name, const char *source)
-{
- vm_op_t *op = compile_bpeg(source, source);
- source = strndup((char*)op->start, (int)(op->end - op->start));
- op = expand_choices(source, op);
- return add_def(name, source, op);
-}
-
-static void load_defs(void)
-{
- // Approximately these are in least-to-most used order so they will be
- // found most efficiently by the lookup, which goes in reverse order.
- load_def("crlf", "\\r\\n");
- load_def("cr", "\\r"); load_def("r", "\\r");
- load_def("anglebraces", "`< *(anglebraces / ~~`>) `>");
- load_def("brackets", "`[ *(brackets / ~~`]) `]");
- load_def("braces", "`{ *(braces / ~~`}) `}");
- load_def("parens", "`( *(parens / ~~`)) `)");
- load_def("id", "(`a-z/`A-Z/`_) *(`a-z/`A-Z/`_/`0-9)");
- load_def("HEX", "`0-9/`A-F");
- load_def("Hex", "`0-9/`a-f/`A-F");
- load_def("hex", "`0-9/`a-f");
- load_def("number", "+`0-9 ?(`. *`0-9) / `. +`0-9");
- load_def("int", "+`0-9");
- load_def("digit", "`0-9");
- load_def("Abc", "`a-z/`A-Z");
- load_def("ABC", "`A-Z");
- load_def("abc", "`a-z");
- load_def("esc", "\\e"); load_def("e", "\\e");
- load_def("tab", "\\t"); load_def("t", "\\t");
- load_def("nl", "\\n"); load_def("lf", "\\n"); load_def("n", "\\n");
- load_def("c-block-comment", "'/*' &&'*/'");
- load_def("c-line-comment", "'//' &$");
- load_def("c-comment", "c-line-comment / c-block-comment");
- load_def("hash-comment", "`# &$");
- load_def("comment", "!(/)"); // undefined by default
- load_def("WS", "` /\\t/\\n/\\r/comment");
- load_def("ws", "` /\\t");
- load_def("$$", "!$.");
- load_def("$", "!.");
- load_def("^^", "!<$.");
- load_def("^", "!<.");
- load_def("__", "*(` /\\t/\\n/\\r/comment)");
- load_def("_", "*(` /\\t)");
-}
-
-static match_t *get_capture_n(match_t *m, int *n)
-{
- if (!m) return NULL;
- if (*n == 0) return m;
- if (m->is_capture && *n == 1) return m;
- if (m->is_capture) --(*n);
- for (match_t *c = m->child; c; c = c->nextsibling) {
- match_t *cap = get_capture_n(c, n);
- if (cap) return cap;
- }
- return NULL;
-}
-
-static match_t *get_capture_named(match_t *m, const char *name)
-{
- if (m->is_capture && m->name_or_replacement && streq(m->name_or_replacement, name))
- return m;
- for (match_t *c = m->child; c; c = c->nextsibling) {
- match_t *cap = get_capture_named(c, name);
- if (cap) return cap;
- }
- return NULL;
-}
-
-static void print_match(match_t *m, const char *color)
-{
- if (m->is_replacement) {
- printf("\033[0;34m");
- for (const char *r = m->name_or_replacement; *r; ) {
- if (*r == '\\') {
- fputc(unescapechar(r, &r), stdout);
- continue;
- } else if (*r != '@') {
- fputc(*r, stdout);
- ++r;
- continue;
- }
-
- ++r;
- match_t *cap = NULL;
- switch (*r) {
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9': {
- int n = (int)strtol(r, (char**)&r, 10);
- cap = get_capture_n(m->child, &n);
- break;
- }
- case '[': {
- char *closing = strchr(r+1, ']');
- if (!closing) {
- fputc('@', stdout);
- break;
- }
- ++r;
- char *name = strndup(r, (size_t)(closing-r));
- cap = get_capture_named(m, name);
- free(name);
- r = closing + 1;
- break;
- }
- default: {
- fputc('@', stdout);
- break;
- }
- }
- if (cap != NULL) {
- print_match(cap, "\033[0;35m");
- printf("\033[0;34m");
- }
- }
- } else {
- const char *name = m->name_or_replacement;
- if (verbose && m->is_ref && name && isupper(name[0]))
- printf("\033[0;2;35m{%s:", name);
- //if (m->is_capture && name)
- // printf("\033[0;2;33m[%s:", name);
- const char *prev = m->start;
- for (match_t *child = m->child; child; child = child->nextsibling) {
- if (child->start > prev)
- printf("%s%.*s", color, (int)(child->start - prev), prev);
- print_match(child, m->is_capture ? "\033[0;1m" : color);
- prev = child->end;
- }
- if (m->end > prev)
- printf("%s%.*s", color, (int)(m->end - prev), prev);
- if (verbose && m->is_ref && name && isupper(name[0]))
- printf("\033[0;2;35m}");
- //if (m->is_capture && name)
- // printf("\033[0;2;33m]");
- }
-}
-
-/*
- * Read an entire file into memory.
- */
-static char *readfile(int fd)
-{
- size_t capacity = 1000, len = 0;
- char *buf = calloc(sizeof(char), capacity+1);
- ssize_t just_read;
- while ((just_read=read(fd, &buf[len], capacity-len)) > 0) {
- len += (size_t)just_read;
- if (len >= capacity)
- buf = realloc(buf, (capacity *= 2));
- }
- return buf;
-}
-
-static void print_grammar(vm_op_t *op)
-{
- switch (op->op) {
- case VM_REF: fprintf(stderr, "a $%s", op->args.s); break;
- case VM_EMPTY: fprintf(stderr, "the empty string"); break;
- case VM_ANYCHAR: fprintf(stderr, "any char"); break;
- case VM_STRING: fprintf(stderr, "string \"%s\"", op->args.s); break;
- case VM_RANGE: {
- fprintf(stderr, "char from %c-%c", op->args.range.low, op->args.range.high);
- break;
- }
- case VM_REPEAT: {
- if (op->args.repetitions.max == -1)
- fprintf(stderr, "%ld or more (", op->args.repetitions.min);
- else
- fprintf(stderr, "%ld-%ld of (",
- op->args.repetitions.min,
- op->args.repetitions.max);
- print_grammar(op->args.repetitions.repeat_pat);
- fprintf(stderr, ")");
- if (op->args.repetitions.sep) {
- fprintf(stderr, " separated by (");
- print_grammar(op->args.repetitions.sep);
- fprintf(stderr, ")");
- }
- break;
- }
- case VM_NOT: {
- fprintf(stderr, "not (");
- print_grammar(op->args.pat);
- fprintf(stderr, ")");
- break;
- }
- case VM_UPTO_AND: {
- fprintf(stderr, "text up to and including (");
- print_grammar(op->args.pat);
- fprintf(stderr, ")");
- break;
- }
- case VM_ANYTHING_BUT: {
- fprintf(stderr, "anything but (");
- print_grammar(op->args.pat);
- fprintf(stderr, ")");
- break;
- }
- case VM_AFTER: {
- fprintf(stderr, "after (");
- print_grammar(op->args.pat);
- fprintf(stderr, ")");
- break;
- }
- case VM_BEFORE: {
- fprintf(stderr, "before (");
- print_grammar(op->args.pat);
- fprintf(stderr, ")");
- break;
- }
- case VM_CAPTURE: {
- fprintf(stderr, "capture (");
- print_grammar(op->args.pat);
- fprintf(stderr, ")");
- if (op->args.capture.name)
- fprintf(stderr, " and call it %s", op->args.capture.name);
- break;
- }
- case VM_OTHERWISE: {
- fprintf(stderr, "(");
- print_grammar(op->args.multiple.first);
- fprintf(stderr, ") or else ");
- if (op->args.multiple.second->op != VM_OTHERWISE)
- fprintf(stderr, "(");
- print_grammar(op->args.multiple.second);
- if (op->args.multiple.second->op != VM_OTHERWISE)
- fprintf(stderr, ")");
- break;
- }
- case VM_CHAIN: {
- fprintf(stderr, "(");
- print_grammar(op->args.multiple.first);
- fprintf(stderr, ") then ");
- if (op->args.multiple.second->op != VM_CHAIN)
- fprintf(stderr, "(");
- print_grammar(op->args.multiple.second);
- if (op->args.multiple.second->op != VM_CHAIN)
- fprintf(stderr, ")");
- break;
- }
- case VM_REPLACE: {
- fprintf(stderr, "replace ");
- if (op->args.replace.replace_pat) {
- fprintf(stderr, "(");
- print_grammar(op->args.replace.replace_pat);
- fprintf(stderr, ")");
- } else
- fprintf(stderr, "\"\"");
- fprintf(stderr, " with \"%s\"", op->args.replace.replacement);
- break;
- }
- default: break;
- }
-}
-
-static vm_op_t *load_grammar(const char *grammar)
-{
- vm_op_t *op = compile_bpeg(grammar, grammar);
- check(op, "Failed to compile_bpeg input");
- op = expand_choices(grammar, op);
-
- const char *defs = op->end;
- while (matchchar(&defs, ';')) {
- if (verbose) fprintf(stderr, "\n");
- defs = after_spaces(defs);
- const char *name = defs;
- if (strncmp(name, "^^", 2) == 0 ||
- strncmp(name, "__", 2) == 0 ||
- strncmp(name, "$$", 2) == 0) {
- name = strndup(name, 2);
- defs += 2;
- } else if (*name == '^' || *name == '_' || *name == '$') {
- name = strndup(name, 1);
- defs += 1;
- } else {
- defs = after_name(defs);
- if (defs == NULL) break;
- name = strndup(name, (size_t)(defs-name));
- }
- defs = after_spaces(defs);
- check(matchchar(&defs, '='), "Expected '=' in definition");
- vm_op_t *def = load_def(name, defs);
- check(def, "Couldn't load definition");
- defs = def->end;
- }
- return op;
-}
int main(int argc, char *argv[])
{
+ int verbose = 0;
const char *pattern = NULL,
*replacement = NULL,
*grammarfile = NULL,
*infile = NULL;
+ grammar_t *g = new_grammar();
+
for (int i = 1; i < argc; i++) {
if (streq(argv[i], "--help") || streq(argv[i], "-h")) {
printf("%s\n", usage);
@@ -1122,8 +71,6 @@ int main(int argc, char *argv[])
verbose = 1;
} else if (streq(argv[i], "--replace") || streq(argv[i], "-r")) {
replacement = argv[++i];
- } else if (streq(argv[i], "--slow") || streq(argv[i], "-s")) {
- visualize_delay = 100000;
} else if (streq(argv[i], "--grammar") || streq(argv[i], "-g")) {
grammarfile = argv[++i];
} else if (streq(argv[i], "--define") || streq(argv[i], "-d")) {
@@ -1131,7 +78,10 @@ int main(int argc, char *argv[])
char *eq = strchr(def, '=');
check(eq, usage);
*eq = '\0';
- load_def(def, ++eq);
+ char *src = ++eq;
+ vm_op_t *pat = bpeg_pattern(src);
+ check(pat, "Failed to compile pattern");
+ add_def(g, src, def, pat);
} else if (pattern == NULL) {
pattern = argv[i];
} else if (infile == NULL) {
@@ -1140,16 +90,7 @@ int main(int argc, char *argv[])
}
check(pattern != NULL || grammarfile != NULL, usage);
- if (verbose) fprintf(stderr, "====== Loading definitions ======\n\n\n\n");
- {
- int tmp1 = visualize_delay, tmp2 = verbose;
- visualize_delay = -1, verbose = 0;
- load_defs();
- visualize_delay = tmp1, verbose = tmp2;
- }
- if (verbose) fprintf(stderr, "========== Compiling ===========\n\n\n\n");
- vm_op_t *op;
if (grammarfile) {
// load grammar from a file (semicolon mode)
char *grammar;
@@ -1160,24 +101,21 @@ int main(int argc, char *argv[])
check(fd >= 0, "Couldn't open file: %s", argv[2]);
grammar = readfile(fd);
}
- op = load_grammar(grammar);
+ load_grammar(g, grammar);
} else {
// load grammar in start-with-string mode:
- vm_op_t *pat = compile_bpeg_string(pattern, pattern);
- if (replacement) {
- pat = compile_bpeg_replacement(pat, replacement);
- }
+ vm_op_t *pat = bpeg_stringpattern(pattern);
+ if (replacement)
+ pat = bpeg_replacement(pat, replacement);
- add_def("pattern", pattern, pat);
+ add_def(g, pattern, "pattern", pat);
- const char *grammar = "*(@pattern / \\n / .)";
- op = compile_bpeg(grammar, grammar);
+ const char *grammar = "find = *(@pattern / \\n / .);";
+ load_grammar(g, grammar);
}
if (verbose) {
- fprintf(stderr, "\n\n");
- print_grammar(op);
- fprintf(stderr, "\n\n");
+ print_pattern(g->pattern);
}
char *input;
@@ -1194,12 +132,12 @@ int main(int argc, char *argv[])
stpcpy(&lpadded[1], input);
input = &lpadded[1];
- match_t *m = match(input, op);
+ match_t *m = match(g, input, g->pattern);
if (m == NULL) {
printf("No match\n");
return 1;
} else {
- print_match(m, "\033[0m");
+ print_match(m, "\033[0m", verbose);
printf("\033[0;2m%s\n", m->end);
}