aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBruce Hill <bruce@bruce-hill.com>2020-09-07 23:22:43 -0700
committerBruce Hill <bruce@bruce-hill.com>2020-09-07 23:22:43 -0700
commitf7cdd6d4d2c492683f0b5f65fa2fe788772df7c8 (patch)
tree75e7ab9b26a38c3ca7ba4337ec4bc991e8a9d2fa
parentcead7e5b2626e80f826236320e63f2e8570e7fb6 (diff)
Cleanup, splitting into files
-rw-r--r--Makefile2
-rw-r--r--bpeg.c219
-rw-r--r--bpeg.h40
-rw-r--r--utils.h7
-rw-r--r--vm.h50
5 files changed, 181 insertions, 137 deletions
diff --git a/Makefile b/Makefile
index eae2d13..e1ef268 100644
--- a/Makefile
+++ b/Makefile
@@ -7,7 +7,7 @@ all: bpeg
clean:
rm -f bpeg
-bpeg: bpeg.c
+bpeg: bpeg.c bpeg.h vm.h utils.h
cc $(CFLAGS) $(OFLAGS) $< -o $@
.PHONY: all clean
diff --git a/bpeg.c b/bpeg.c
index 0a4c551..d09dba3 100644
--- a/bpeg.c
+++ b/bpeg.c
@@ -1,110 +1,35 @@
-// # <comment> comment
-// ` <c> character <c>
-// ! <pat> no <pat>
-// ^ <pat> upto <pat>
-// & <pat> upto and including <pat>
-// <N=1> + <pat> [% <sep="">] <N> or more <pat>s (separated by <sep>)
-// * <pat> [% <sep="">] sugar for "0+ <pat> [% <sep>]"
-// <N=1> - <pat> [% <sep="">] <N> or fewer <pat>s (separated by <sep>)
-// ? <pat> sugar for "1- <pat>"
-// <N> - <M> <pat> <N> to <M> (inclusive) <pat>s
-// < <pat> after <pat>, ...
-// > <pat> before <pat>, ...
-// . any character
-// <pat> / <alt> <pat> otherwise <alt>
-// ( <pat> ) <pat>
-// @ <pat> capture <pat>
-// @ [ <name> ] <pat> <pat> named <name>
-// ; <name> = <pat> <name> is defined to be <pat>
-// { <pat> ~ <str> } <pat> replaced with <str>
-// "@1" or "@{1}" first capture
-// "@foo" or "@{foo}" capture named "foo"
-
-#include <ctype.h>
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <unistd.h>
-
-#define check(cond, ...) do { if (!(cond)) { fprintf(stderr, __VA_ARGS__); _exit(1); } } while(0)
-
-typedef struct match_s {
- const char *start, *end;
- union {
- unsigned int is_capture:1;
- const char *name;
- } capture;
- const char *replacement;
- struct match_s *child, *nextsibling;
-} match_t;
-
-enum VM_OPTYPE {
- VM_EMPTY = 0,
- VM_ANYCHAR = 1,
- VM_STRING,
- VM_RANGE,
- VM_NOT,
- VM_UPTO,
- VM_UPTO_AND,
- VM_REPEAT,
- VM_BEFORE,
- VM_AFTER,
- VM_CAPTURE,
- VM_OTHERWISE,
- VM_CHAIN,
- VM_REPLACE,
- VM_REF,
-};
-
-typedef struct vm_op_s {
- enum VM_OPTYPE op;
- const char *start, *end;
- ssize_t len;
- union {
- const char *s;
- struct {
- char low, high;
- } range;
- struct {
- ssize_t min, max;
- struct vm_op_s *sep, *repeat_pat;
- } repetitions;
- struct {
- struct vm_op_s *first, *second;
- } multiple;
- struct {
- struct vm_op_s *replace_pat;
- const char *replacement;
- } replace;
- struct {
- struct vm_op_s *capture_pat;
- char *name;
- } capture;
- struct vm_op_s *pat;
- } args;
-} vm_op_t;
-
-static match_t *free_match(match_t *m);
-static match_t *match(const char *str, vm_op_t *op);
-static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep);
-static inline const char *skip_spaces(const char *str);
-static vm_op_t *expand_choices(vm_op_t *op);
-static vm_op_t *expand_chain(vm_op_t *first);
-static vm_op_t *parse(const char *str);
-
-
-typedef struct {
- const char *name;
- vm_op_t *op;
-} def_t;
-
-static def_t defs[1024] = {{NULL, NULL}};
-size_t ndefs = 0;
-static int verbose = 1;
-
-#define debug(...) do { if (verbose) fprintf(stderr, __VA_ARGS__); } while(0)
+/*
+ * bpeg.h - Source code for the bpeg parser
+ *
+ * Grammar:
+ * # <comment> comment
+ * ` <c> character <c>
+ * ! <pat> no <pat>
+ * ^ <pat> upto <pat>
+ * & <pat> upto and including <pat>
+ * <N=1> + <pat> [% <sep="">] <N> or more <pat>s (separated by <sep>)
+ * * <pat> [% <sep="">] sugar for "0+ <pat> [% <sep>]"
+ * <N=1> - <pat> [% <sep="">] <N> or fewer <pat>s (separated by <sep>)
+ * ? <pat> sugar for "1- <pat>"
+ * <N> - <M> <pat> <N> to <M> (inclusive) <pat>s
+ * < <pat> after <pat>, ...
+ * > <pat> before <pat>, ...
+ * . any character
+ * <pat> / <alt> <pat> otherwise <alt>
+ * ( <pat> ) <pat>
+ * @ <pat> capture <pat>
+ * @ [ <name> ] <pat> <pat> named <name>
+ * ; <name> = <pat> <name> is defined to be <pat>
+ * { <pat> ~ <str> } <pat> replaced with <str>
+ * "@1" or "@{1}" first capture
+ * "@foo" or "@{foo}" capture named "foo"
+ */
+#include "bpeg.h"
+/*
+ * Recursively deallocate a match object and return NULL
+ */
static match_t *free_match(match_t *m)
{
if (m->child) m->child = free_match(m->child);
@@ -113,6 +38,11 @@ static match_t *free_match(match_t *m)
return NULL;
}
+/*
+ * Run virtual machine operation against a string and return
+ * a match struct, or NULL if no match is found.
+ * The returned value should be free()'d to avoid memory leaking.
+ */
static match_t *match(const char *str, vm_op_t *op)
{
tailcall:
@@ -311,6 +241,9 @@ static match_t *match(const char *str, vm_op_t *op)
}
}
+/*
+ * Helper function to initialize a range object.
+ */
static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep)
{
op->op = VM_REPEAT;
@@ -324,7 +257,11 @@ static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op
op->args.repetitions.sep = sep;
}
-static inline const char *skip_spaces(const char *str)
+/*
+ * Helper function to skip past all spaces (and comments)
+ * Returns a pointer to the first non-space character.
+ */
+static inline const char *after_spaces(const char *str)
{
// Skip whitespace and comments:
skip_whitespace:
@@ -341,9 +278,14 @@ static inline const char *skip_spaces(const char *str)
return str;
}
+/*
+ * Take an opcode and expand it into a chain of patterns if it's
+ * followed by any patterns (e.g. "`x `y"), otherwise return
+ * the original input.
+ */
static vm_op_t *expand_chain(vm_op_t *first)
{
- vm_op_t *second = parse(first->end);
+ vm_op_t *second = compile_bpeg(first->end);
if (second == NULL) return first;
check(second->end > first->end, "No forward progress in chain!");
second = expand_chain(second);
@@ -359,13 +301,18 @@ static vm_op_t *expand_chain(vm_op_t *first)
return chain;
}
+/*
+ * Take an opcode and expand it into a chain of choices if it's
+ * followed by any "/"-separated patterns (e.g. "`x/`y"), otherwise
+ * return the original input.
+ */
static vm_op_t *expand_choices(vm_op_t *first)
{
first = expand_chain(first);
- const char *str = skip_spaces(first->end);
+ const char *str = after_spaces(first->end);
if (*str != '/') return first;
++str;
- vm_op_t *second = parse(str);
+ vm_op_t *second = compile_bpeg(str);
check(second, "Expected pattern after '/'");
second = expand_chain(second);
vm_op_t *choice = calloc(sizeof(vm_op_t), 1);
@@ -380,11 +327,11 @@ static vm_op_t *expand_choices(vm_op_t *first)
return expand_choices(choice);
}
-static vm_op_t *parse(const char *str)
+static vm_op_t *compile_bpeg(const char *str)
{
if (!*str) return NULL;
debug("Parsing \"%s\"...\n", str);
- str = skip_spaces(str);
+ str = after_spaces(str);
check(*str, "Expected a pattern");
vm_op_t *op = calloc(sizeof(vm_op_t), 1);
op->start = str;
@@ -489,7 +436,7 @@ static vm_op_t *parse(const char *str)
case '!': {
++str;
debug("Not pattern\n");
- vm_op_t *p = parse(str);
+ vm_op_t *p = compile_bpeg(str);
check(p, "Expected pattern after '!'\n");
str = p->end;
op->op = VM_NOT;
@@ -501,7 +448,7 @@ static vm_op_t *parse(const char *str)
case '^': {
++str;
debug("Upto pattern\n");
- vm_op_t *p = parse(str);
+ vm_op_t *p = compile_bpeg(str);
check(p, "Expected pattern after '^'\n");
str = p->end;
op->op = VM_UPTO;
@@ -513,7 +460,7 @@ static vm_op_t *parse(const char *str)
case '&': {
++str;
debug("Upto-and pattern\n");
- vm_op_t *p = parse(str);
+ vm_op_t *p = compile_bpeg(str);
check(p, "Expected pattern after '&'\n");
str = p->end;
op->op = VM_UPTO_AND;
@@ -527,11 +474,11 @@ static vm_op_t *parse(const char *str)
debug("Repetitions\n");
ssize_t min = -1, max = -1;
long n1 = strtol(str, (char**)&str, 10);
- str = skip_spaces(str);
+ str = after_spaces(str);
switch (*str) {
case '-': {
++str;
- str = skip_spaces(str);
+ str = after_spaces(str);
const char *start = str;
long n2 = strtol(str, (char**)&str, 10);
if (str == start) min = 0, max = n1;
@@ -548,13 +495,13 @@ static vm_op_t *parse(const char *str)
break;
}
}
- vm_op_t *pat = parse(str);
+ vm_op_t *pat = compile_bpeg(str);
check(pat, "Expected pattern after repetition count");
str = pat->end;
- str = skip_spaces(str);
+ str = after_spaces(str);
if (*str == '%') {
++str;
- vm_op_t *sep = parse(str);
+ vm_op_t *sep = compile_bpeg(str);
check(sep, "Expected pattern for separator after '%%'");
str = sep->end;
set_range(op, min, max, pat, sep);
@@ -574,13 +521,13 @@ static vm_op_t *parse(const char *str)
case '?': min = 0, max = 1; break;
}
++str;
- vm_op_t *pat = parse(str);
+ vm_op_t *pat = compile_bpeg(str);
check(pat, "Expected pattern after +");
str = pat->end;
- str = skip_spaces(str);
+ str = after_spaces(str);
if (*str == '%') {
++str;
- vm_op_t *sep = parse(str);
+ vm_op_t *sep = compile_bpeg(str);
check(sep, "Expected pattern for separator after '%%'");
str = sep->end;
set_range(op, min, max, pat, sep);
@@ -594,7 +541,7 @@ static vm_op_t *parse(const char *str)
case '<': {
++str;
debug("Lookbehind\n");
- vm_op_t *pat = parse(str);
+ vm_op_t *pat = compile_bpeg(str);
check(pat, "Expected pattern after <");
str = pat->end;
check(pat->len != -1, "Lookbehind patterns must have a fixed length");
@@ -608,7 +555,7 @@ static vm_op_t *parse(const char *str)
case '>': {
++str;
debug("Lookahead\n");
- vm_op_t *pat = parse(str);
+ vm_op_t *pat = compile_bpeg(str);
check(pat, "Expected pattern after >");
str = pat->end;
op->op = VM_BEFORE;
@@ -621,11 +568,11 @@ static vm_op_t *parse(const char *str)
debug("Open paren (\n");
++str;
free(op);
- op = parse(str);
+ op = compile_bpeg(str);
check(op, "Expected pattern inside parentheses");
op = expand_choices(op);
str = op->end;
- str = skip_spaces(str);
+ str = after_spaces(str);
check(*str == ')', "Expected closing parenthesis");
++str;
debug("Close paren (\n");
@@ -636,7 +583,7 @@ static vm_op_t *parse(const char *str)
debug("Capture\n");
++str;
op->op = VM_CAPTURE;
- str = skip_spaces(str);
+ str = after_spaces(str);
if (*str == '[') {
++str;
char *closing = strchr(str, ']');
@@ -646,7 +593,7 @@ static vm_op_t *parse(const char *str)
str = closing;
++str;
}
- vm_op_t *pat = parse(str);
+ vm_op_t *pat = compile_bpeg(str);
check(pat, "Expected pattern after @");
str = pat->end;
op->args.capture.capture_pat = pat;
@@ -657,16 +604,16 @@ static vm_op_t *parse(const char *str)
case '{': {
debug("Replacement {\n");
++str;
- str = skip_spaces(str);
+ str = after_spaces(str);
vm_op_t *pat = NULL;
if (*str != '~') {
- pat = parse(str);
+ pat = compile_bpeg(str);
check(pat, "Expected pattern after '{'");
pat = expand_choices(pat);
str = pat->end;
- str = skip_spaces(str+1);
+ str = after_spaces(str+1);
}
- str = skip_spaces(str+1);
+ str = after_spaces(str+1);
char quote = *(str++);
check(quote == '\'' || quote == '"',
"Expected string literal for replacement");
@@ -679,7 +626,7 @@ static vm_op_t *parse(const char *str)
}
replacement = strndup(replacement, (size_t)(str-replacement));
++str;
- str = skip_spaces(str);
+ str = after_spaces(str);
check(*str == '}', "Expected a closing '}'");
++str;
op->op = VM_REPLACE;
@@ -722,7 +669,7 @@ static vm_op_t *parse(const char *str)
static void load_def(const char *name, const char *def)
{
defs[ndefs].name = name;
- defs[ndefs].op = parse(def);
+ defs[ndefs].op = compile_bpeg(def);
++ndefs;
}
@@ -752,8 +699,8 @@ int main(int argc, char *argv[])
load_defs();
char *lang = argc > 1 ? argv[1] : "'x''y'";
- vm_op_t *op = parse(lang);
- check(op, "Failed to parse input");
+ vm_op_t *op = compile_bpeg(lang);
+ check(op, "Failed to compile_bpeg input");
op = expand_choices(op);
// TODO: check for semicolon and more rules
diff --git a/bpeg.h b/bpeg.h
new file mode 100644
index 0000000..8bc813a
--- /dev/null
+++ b/bpeg.h
@@ -0,0 +1,40 @@
+/*
+ * bpeg.h - Header file for the bpeg parser
+ */
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "utils.h"
+#include "vm.h"
+
+typedef struct match_s {
+ const char *start, *end;
+ union {
+ unsigned int is_capture:1;
+ const char *name;
+ } capture;
+ const char *replacement;
+ struct match_s *child, *nextsibling;
+} match_t;
+
+static match_t *free_match(match_t *m);
+static match_t *match(const char *str, vm_op_t *op);
+static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep);
+static inline const char *after_spaces(const char *str);
+static vm_op_t *expand_choices(vm_op_t *op);
+static vm_op_t *expand_chain(vm_op_t *first);
+static vm_op_t *compile_bpeg(const char *str);
+
+
+typedef struct {
+ const char *name;
+ vm_op_t *op;
+} def_t;
+
+static def_t defs[1024] = {{NULL, NULL}};
+size_t ndefs = 0;
+static int verbose = 1;
+
diff --git a/utils.h b/utils.h
new file mode 100644
index 0000000..8723516
--- /dev/null
+++ b/utils.h
@@ -0,0 +1,7 @@
+/*
+ * utils.h - Some helper code for debugging and error logging.
+ */
+
+// TODO: better error reporting
+#define check(cond, ...) do { if (!(cond)) { fprintf(stderr, __VA_ARGS__); fwrite("\n", 1, 1, stderr); _exit(1); } } while(0)
+#define debug(...) do { if (verbose) fprintf(stderr, __VA_ARGS__); } while(0)
diff --git a/vm.h b/vm.h
new file mode 100644
index 0000000..2123c35
--- /dev/null
+++ b/vm.h
@@ -0,0 +1,50 @@
+/*
+ * vm.h - Source code for the BPEG virtual machine datatypes
+ */
+
+enum VMOpcode {
+ VM_EMPTY = 0,
+ VM_ANYCHAR = 1,
+ VM_STRING,
+ VM_RANGE,
+ VM_NOT,
+ VM_UPTO,
+ VM_UPTO_AND,
+ VM_REPEAT,
+ VM_BEFORE,
+ VM_AFTER,
+ VM_CAPTURE,
+ VM_OTHERWISE,
+ VM_CHAIN,
+ VM_REPLACE,
+ VM_REF,
+};
+
+typedef struct vm_op_s {
+ enum VMOpcode op;
+ const char *start, *end;
+ ssize_t len;
+ union {
+ const char *s;
+ struct {
+ char low, high;
+ } range;
+ struct {
+ ssize_t min, max;
+ struct vm_op_s *sep, *repeat_pat;
+ } repetitions;
+ struct {
+ struct vm_op_s *first, *second;
+ } multiple;
+ struct {
+ struct vm_op_s *replace_pat;
+ const char *replacement;
+ } replace;
+ struct {
+ struct vm_op_s *capture_pat;
+ char *name;
+ } capture;
+ struct vm_op_s *pat;
+ } args;
+} vm_op_t;
+