//
// bpeg.c - a small matcher for a PEG-like pattern language.
//
// Provenance: commit cead7e5b2626e80f826236320e63f2e8570e7fb6 ("Initial commit"),
// Author: Bruce Hill, Date: Mon Sep 7 23:05:38 2020 -0700.
// The Makefile added by the same commit builds this file with:
//   cc -Wall -Wextra -pedantic -Wmissing-prototypes -Wstrict-prototypes -O3 bpeg.c -o bpeg
//
// Usage: bpeg [pattern [text]]
//
// Pattern syntax accepted by parse():
//   # ...                      comment (runs to end of line)
//   `<c>                       the character <c>
//   `<c>,<d>                   any character in the inclusive range <c>..<d>
//   \<esc>                     escape: \a \b \n \r \t \v, \xHH (hex), \N..\NNN (octal)
//   "<str>" or '<str>'         string literal
//   !<pat>                     no <pat> here (zero-width)
//   ^<pat>                     everything up to <pat>
//   &<pat>                     everything up to and including <pat>
//   <N>+ <pat> [% <sep>]       <N> or more <pat>s (separated by <sep>)
//   + <pat> [% <sep>]          sugar for "1+ <pat> [% <sep>]"
//   * <pat> [% <sep>]          sugar for "0+ <pat> [% <sep>]"
//   <N>- <pat> [% <sep>]       <N> or fewer <pat>s (separated by <sep>)
//   ? <pat>                    sugar for "1- <pat>"
//   <N>-<M> <pat>              <N> to <M> (inclusive) <pat>s
//   <N> <pat>                  exactly <N> <pat>s
//   < <pat>                    after <pat> (fixed-length lookbehind, zero-width)
//   > <pat>                    before <pat> (lookahead, zero-width)
//   .                          any character
//   <pat> / <alt>              <pat>, otherwise <alt>
//   ( <pat> )                  grouping
//   @ <pat>                    capture <pat>
//   @ [<name>] <pat>           capture <pat> under the name <name>
//   { <pat> ~ <str> }          <pat> replaced with <str> (<pat> may be omitted)
//   _                          reference to the builtin whitespace rule
//   <name>                     reference to a named rule (see load_defs)
//
// Not implemented yet (marked TODO in the code): "; <name> = <pat>" rule
// definitions in the pattern text, and capture references such as "@1" or
// "@{foo}" inside replacement strings.
//

#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

// Print a printf-style message to stderr and terminate with status 1 unless
// `cond` holds.
#define check(cond, ...) \
    do { if (!(cond)) { fprintf(stderr, __VA_ARGS__); exit(1); } } while(0)

// One node of a match tree.  [start, end) is the matched text; `child` is the
// first sub-match and `nextsibling` links sub-matches of the same parent.
typedef struct match_s {
    const char *start, *end;
    // NOTE(review): `is_capture` and `name` share storage, so only the member
    // that was written last is meaningful.  Nothing in this file reads them yet.
    union {
        unsigned int is_capture:1;
        const char *name;
    } capture;
    const char *replacement;
    struct match_s *child, *nextsibling;
} match_t;

enum VM_OPTYPE {
    VM_EMPTY = 0,
    VM_ANYCHAR = 1,
    VM_STRING,
    VM_RANGE,
    VM_NOT,
    VM_UPTO,
    VM_UPTO_AND,
    VM_REPEAT,
    VM_BEFORE,
    VM_AFTER,
    VM_CAPTURE,
    VM_OTHERWISE,
    VM_CHAIN,
    VM_REPLACE,
    VM_REF,
};

// One node of a parsed pattern.  [start, end) is the pattern source text the
// node was parsed from; `len` is the number of bytes the node always matches,
// or -1 when that is not fixed.
typedef struct vm_op_s {
    enum VM_OPTYPE op;
    const char *start, *end;
    ssize_t len;
    union {
        const char *s;
        struct {
            char low, high;
        } range;
        struct {
            ssize_t min, max;
            struct vm_op_s *sep, *repeat_pat;
        } repetitions;
        struct {
            struct vm_op_s *first, *second;
        } multiple;
        struct {
            struct vm_op_s *replace_pat;
            const char *replacement;
        } replace;
        struct {
            struct vm_op_s *capture_pat;
            char *name;
        } capture;
        struct vm_op_s *pat;
    } args;
} vm_op_t;

static match_t *new_match(const char *start, const char *end);
static vm_op_t *new_op(const char *start);
static char *copy_string(const char *s, size_t len);
static int hex_value(char c);
static match_t *free_match(match_t *m);
static match_t *match(const char *str, vm_op_t *op);
static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep);
static inline const char *skip_spaces(const char *str);
static const char *parse_repeated(vm_op_t *op, const char *str, ssize_t min, ssize_t max, const char *missing_msg);
static vm_op_t *expand_choices(vm_op_t *op);
static vm_op_t *expand_chain(vm_op_t *first);
static vm_op_t *parse(const char *str);
static void load_def(const char *name, const char *def);
static void load_defs(void);


// A named rule that VM_REF ops resolve against at match time.
typedef struct {
    const char *name;
    vm_op_t *op;
} def_t;

static def_t defs[1024] = {{NULL, NULL}};
size_t ndefs = 0;
static int verbose = 1;

#define debug(...) \
    do { if (verbose) fprintf(stderr, __VA_ARGS__); } while(0)


// Allocate a zeroed match covering [start, end).  Exits on allocation failure.
static match_t *new_match(const char *start, const char *end)
{
    match_t *m = calloc(1, sizeof *m);
    check(m, "Out of memory\n");
    m->start = start;
    m->end = end;
    return m;
}

// Allocate a zeroed pattern op whose source text begins at `start` and whose
// length is not yet known (-1).  Exits on allocation failure.
static vm_op_t *new_op(const char *start)
{
    vm_op_t *op = calloc(1, sizeof *op);
    check(op, "Out of memory\n");
    op->start = start;
    op->len = -1;
    return op;
}

// Return a freshly allocated, NUL-terminated copy of the `len` bytes at `s`.
static char *copy_string(const char *s, size_t len)
{
    char *copy = malloc(len + 1);
    check(copy, "Out of memory\n");
    memcpy(copy, s, len);
    copy[len] = '\0';
    return copy;
}

// Value of a hexadecimal digit, or -1 if `c` is not one.
static int hex_value(char c)
{
    if ('0' <= c && c <= '9') return c - '0';
    if ('a' <= c && c <= 'f') return c - 'a' + 10;
    if ('A' <= c && c <= 'F') return c - 'A' + 10;
    return -1;
}

// Free a match together with its children and following siblings.
// Accepts NULL.  Always returns NULL so callers can write `m = free_match(m)`.
static match_t *free_match(match_t *m)
{
    // Walk the sibling chain iteratively; only recurse into children.
    while (m != NULL) {
        match_t *next = m->nextsibling;
        free_match(m->child);
        free(m);
        m = next;
    }
    return NULL;
}

// Try to match `op` against the text starting exactly at `str`.
// Returns a newly allocated match tree (release it with free_match), or NULL
// if the pattern does not match at this position.  Exits the program on an
// unknown rule name or opcode.
static match_t *match(const char *str, vm_op_t *op)
{
  tailcall:
    switch (op->op) {
        case VM_EMPTY: {
            return new_match(str, str);
        }
        case VM_ANYCHAR: {
            if (!*str) return NULL;
            return new_match(str, str + 1);
        }
        case VM_STRING: {
            if (strncmp(str, op->args.s, (size_t)op->len) != 0)
                return NULL;
            return new_match(str, str + op->len);
        }
        case VM_RANGE: {
            if (*str < op->args.range.low || *str > op->args.range.high)
                return NULL;
            return new_match(str, str + 1);
        }
        case VM_NOT: {
            match_t *inner = match(str, op->args.pat);
            if (inner != NULL) {
                free_match(inner);
                return NULL;
            }
            return new_match(str, str);
        }
        case VM_UPTO: case VM_UPTO_AND: {
            const char *start = str;
            // The end-of-input position is tried as well, so that patterns
            // such as `!.` (used by the builtin "line" rule) can terminate
            // the scan.
            for (;; ++str) {
                match_t *p = match(str, op->args.pat);
                if (p != NULL) {
                    match_t *m = new_match(start, str);
                    if (op->op == VM_UPTO) {
                        free_match(p);
                    } else {
                        m->end = p->end;
                        m->child = p;
                    }
                    return m;
                }
                if (!*str) return NULL;
            }
        }
        case VM_REPEAT: {
            const ssize_t min = op->args.repetitions.min;
            const ssize_t max = op->args.repetitions.max;
            vm_op_t *pat = op->args.repetitions.repeat_pat;
            vm_op_t *sep = op->args.repetitions.sep;

            match_t *m = new_match(str, str);
            if (max == 0) return m;

            // A negative max means "no upper bound"; the cast makes it SIZE_MAX.
            const size_t limit = (size_t)max;
            match_t **dest = &m->child;
            size_t reps = 0;
            while (reps < limit) {
                const char *pos = str;
                match_t *s = NULL;
                if (reps > 0 && sep != NULL) {
                    s = match(pos, sep);
                    if (s == NULL) break;
                    pos = s->end;
                }
                match_t *p = match(pos, pat);
                if (p == NULL) {
                    // A separator that is not followed by the pattern is not
                    // part of the repetition: drop it and leave `str` alone.
                    free_match(s);
                    break;
                }
                if (s != NULL) {
                    *dest = s;
                    dest = &s->nextsibling;
                }
                *dest = p;
                dest = &p->nextsibling;
                ++reps;

                const int progressed = (p->end != str);
                str = p->end;
                if (!progressed && max < 0) {
                    // A zero-width iteration would repeat forever when there
                    // is no upper bound; it trivially satisfies any minimum.
                    if ((ssize_t)reps < min) reps = (size_t)min;
                    break;
                }
            }
            if ((ssize_t)reps < min) {
                free_match(m);
                return NULL;
            }
            m->end = str;
            return m;
        }
        case VM_AFTER: {
            // The lookbehind op itself is zero-width; the distance to look
            // back is the fixed length of the inner pattern.
            const ssize_t back = op->args.pat->len;
            check(back != -1, "'<' is only allowed for fixed-length operations");
            // The subject text is NUL-padded on the left (see main), so a NUL
            // among the `back` bytes before `str` means there is not enough
            // text to look behind.
            for (ssize_t i = 1; i <= back; i++) {
                if (str[-i] == '\0') return NULL;
            }
            match_t *before = match(str - back, op->args.pat);
            if (before == NULL) return NULL;
            free_match(before);
            return new_match(str, str);
        }
        case VM_BEFORE: {
            match_t *after = match(str, op->args.pat);
            if (after == NULL) return NULL;
            free_match(after);
            return new_match(str, str);
        }
        case VM_CAPTURE: {
            match_t *p = match(str, op->args.capture.capture_pat);
            if (p == NULL) return NULL;
            match_t *m = new_match(str, p->end);
            m->child = p;
            if (op->args.capture.name)
                m->capture.name = op->args.capture.name;
            else
                m->capture.is_capture = 1;
            return m;
        }
        case VM_OTHERWISE: {
            match_t *m = match(str, op->args.multiple.first);
            if (m == NULL) m = match(str, op->args.multiple.second);
            return m;
        }
        case VM_CHAIN: {
            match_t *m1 = match(str, op->args.multiple.first);
            if (m1 == NULL) return NULL;
            match_t *m2 = match(m1->end, op->args.multiple.second);
            if (m2 == NULL) {
                free_match(m1);
                return NULL;
            }
            match_t *m = new_match(str, m2->end);
            m->child = m1;
            m1->nextsibling = m2;
            return m;
        }
        case VM_REPLACE: {
            match_t *p = NULL;
            if (op->args.replace.replace_pat) {
                p = match(str, op->args.replace.replace_pat);
                if (p == NULL) return NULL;
            }
            match_t *m = new_match(str, p ? p->end : str);
            m->child = p;
            // TODO: handle captures
            m->replacement = op->args.replace.replacement;
            return m;
        }
        case VM_REF: {
            for (size_t i = 0; i < ndefs; i++) {
                if (strcmp(defs[i].name, op->args.s) == 0) {
                    // Found the rule: continue matching with its pattern.
                    op = defs[i].op;
                    goto tailcall;
                }
            }
            check(0, "Unknown identifier: '%s'", op->args.s);
            return NULL;
        }
        default: {
            fprintf(stderr, "Unknown opcode: %d", op->op);
            exit(1);
            return NULL;
        }
    }
}

// Turn `op` into a repetition of `pat` between `min` and `max` times
// (max < 0: unbounded), optionally separated by `sep` (may be NULL).
// The op gets a fixed length only when the count and both sub-patterns
// are fixed.
static void set_range(vm_op_t *op, ssize_t min, ssize_t max, vm_op_t *pat, vm_op_t *sep)
{
    const int fixed = pat->len >= 0 && (sep == NULL || sep->len >= 0)
        && min == max && min >= 0;
    op->op = VM_REPEAT;
    if (fixed) {
        const ssize_t sep_total = (sep == NULL || min == 0) ? 0 : sep->len * (min - 1);
        op->len = pat->len * min + sep_total;
    } else {
        op->len = -1;
    }
    op->args.repetitions.min = min;
    op->args.repetitions.max = max;
    op->args.repetitions.repeat_pat = pat;
    op->args.repetitions.sep = sep;
}

// Return the first position at or after `str` that is neither whitespace nor
// inside a '#' comment (comments run to the end of the line).
static inline const char *skip_spaces(const char *str)
{
    for (;;) {
        switch (*str) {
            case ' ': case '\r': case '\n': case '\t':
                ++str;
                break;
            case '#':
                while (*str && *str != '\n') ++str;
                break;
            default:
                return str;
        }
    }
}

// Parse the part shared by every repetition form: the repeated pattern and
// an optional "% <sep>" clause.  Fills in `op` via set_range and returns the
// position after the consumed text.  `missing_msg` is printed (and the
// program exits) if no pattern follows.
static const char *parse_repeated(vm_op_t *op, const char *str, ssize_t min, ssize_t max, const char *missing_msg)
{
    vm_op_t *pat = parse(str);
    check(pat, "%s", missing_msg);
    str = skip_spaces(pat->end);
    vm_op_t *sep = NULL;
    if (*str == '%') {
        sep = parse(str + 1);
        check(sep, "Expected pattern for separator after '%%'");
        str = sep->end;
    }
    set_range(op, min, max, pat, sep);
    debug("min = %lld max = %lld\n", (long long)op->args.repetitions.min, (long long)op->args.repetitions.max);
    return str;
}

// Given an already parsed pattern, keep parsing the patterns that follow it
// and fold them into a right-nested VM_CHAIN.  Returns `first` unchanged when
// nothing follows.
static vm_op_t *expand_chain(vm_op_t *first)
{
    vm_op_t *second = parse(first->end);
    if (second == NULL) return first;
    check(second->end > first->end, "No forward progress in chain!");
    second = expand_chain(second);

    vm_op_t *chain = new_op(first->start);
    chain->op = VM_CHAIN;
    chain->len = (first->len >= 0 && second->len >= 0) ? first->len + second->len : -1;
    chain->end = second->end;
    chain->args.multiple.first = first;
    chain->args.multiple.second = second;
    return chain;
}

// Given an already parsed pattern, parse the rest of its sequence and any
// "/ <alternative>" clauses, producing nested VM_OTHERWISE ops.
static vm_op_t *expand_choices(vm_op_t *first)
{
    first = expand_chain(first);
    const char *str = skip_spaces(first->end);
    if (*str != '/') return first;
    ++str;
    vm_op_t *second = parse(str);
    check(second, "Expected pattern after '/'");
    second = expand_chain(second);

    vm_op_t *choice = new_op(first->start);
    choice->op = VM_OTHERWISE;
    choice->len = (first->len == second->len) ? first->len : -1;
    choice->end = second->end;
    choice->args.multiple.first = first;
    choice->args.multiple.second = second;
    return expand_choices(choice);
}

// Parse a single pattern (one atom, or one prefix operator with its operand)
// from `str`.  Returns a newly allocated op whose `end` points just past the
// consumed text, or NULL if no pattern starts here (end of input, or a
// character such as '/' or ')' that belongs to an enclosing construct).
// Exits the program on malformed input.
static vm_op_t *parse(const char *str)
{
    // Trailing whitespace or comments simply end the pattern.
    str = skip_spaces(str);
    if (!*str) return NULL;
    debug("Parsing \"%s\"...\n", str);
    vm_op_t *op = new_op(str);
    switch (*str) {
        // Any char (dot)
        case '.': {
            ++str;
            debug("Dot\n");
            op->op = VM_ANYCHAR;
            op->len = 1;
            break;
        }
        // Char literals and char ranges
        case '`': {
            ++str;
            const char c = *str;
            check(c, "Expected character after '`'\n");
            ++str;
            op->len = 1;
            if (*str == ',') { // Range
                debug("Char range\n");
                ++str;
                const char high = *str;
                check(high, "Expected character after ','");
                ++str; // step past the upper bound
                op->op = VM_RANGE;
                op->args.range.low = c;
                op->args.range.high = high;
            } else {
                debug("Char literal\n");
                op->op = VM_STRING;
                op->args.s = copy_string(&c, 1);
            }
            break;
        }
        // Escapes
        case '\\': {
            ++str;
            debug("Escape sequence\n");
            check(*str, "Expected escape after '\\'");
            op->op = VM_STRING;
            op->len = 1;
            char c = *str;
            switch (c) {
                case 'a': c = '\a'; ++str; break;
                case 'b': c = '\b'; ++str; break;
                case 'n': c = '\n'; ++str; break;
                case 'r': c = '\r'; ++str; break;
                case 't': c = '\t'; ++str; break;
                case 'v': c = '\v'; ++str; break;
                case 'x': { // Hex
                    const int hi = hex_value(str[1]);
                    const int lo = hi < 0 ? -1 : hex_value(str[2]);
                    if (lo >= 0) {
                        c = (char)((hi << 4) | lo);
                        str += 3;
                    } else {
                        // Not followed by two hex digits: a literal 'x'.
                        ++str;
                    }
                    break;
                }
                case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { // Octal
                    unsigned int value = (unsigned int)(c - '0');
                    ++str;
                    // Up to two more octal digits.
                    for (int digits = 1; digits < 3 && '0' <= *str && *str <= '7'; ++digits, ++str)
                        value = (value << 3) | (unsigned int)(*str - '0');
                    c = (char)value;
                    break;
                }
                default: {
                    check(0, "Invalid escape sequence");
                }
            }
            op->args.s = copy_string(&c, 1);
            break;
        }
        // String literal
        case '"': case '\'': {
            const char quote = *str;
            ++str;
            const char *literal = str;
            for (; *str && *str != quote; str++) {
                if (*str == '\\') {
                    // TODO: handle escape chars like \n
                    check(str[1], "Expected more string contents after backslash");
                    ++str;
                }
            }
            op->op = VM_STRING;
            op->len = (ssize_t)(str - literal);
            op->args.s = copy_string(literal, (size_t)op->len);
            debug("String literal: %c%s%c\n", quote, op->args.s, quote);
            check(*str == quote, "Missing closing quote");
            ++str;
            break;
        }
        // Not
        case '!': {
            ++str;
            debug("Not pattern\n");
            vm_op_t *p = parse(str);
            check(p, "Expected pattern after '!'\n");
            str = p->end;
            op->op = VM_NOT;
            op->len = 0;
            op->args.pat = p;
            break;
        }
        // Upto
        case '^': {
            ++str;
            debug("Upto pattern\n");
            vm_op_t *p = parse(str);
            check(p, "Expected pattern after '^'\n");
            str = p->end;
            op->op = VM_UPTO;
            op->len = -1;
            op->args.pat = p;
            break;
        }
        // Upto and including
        case '&': {
            ++str;
            debug("Upto-and pattern\n");
            vm_op_t *p = parse(str);
            check(p, "Expected pattern after '&'\n");
            str = p->end;
            op->op = VM_UPTO_AND;
            op->len = -1;
            op->args.pat = p;
            break;
        }
        // Number of repetitions: <N>-<M> / <N>- / <N>+ / <N>
        case '0': case '1': case '2': case '3': case '4': case '5':
        case '6': case '7': case '8': case '9': {
            debug("Repetitions\n");
            ssize_t min = -1, max = -1;
            char *endptr = NULL;
            const long n1 = strtol(str, &endptr, 10);
            str = skip_spaces(endptr);
            switch (*str) {
                case '-': {
                    str = skip_spaces(str + 1);
                    const long n2 = strtol(str, &endptr, 10);
                    if (endptr == str) {
                        // "<N>-" with no upper number: N or fewer.
                        min = 0, max = n1;
                    } else {
                        min = n1, max = n2;
                    }
                    str = endptr;
                    break;
                }
                case '+': {
                    ++str;
                    min = n1, max = -1;
                    break;
                }
                default: {
                    min = n1, max = n1;
                    break;
                }
            }
            str = parse_repeated(op, str, min, max, "Expected pattern after repetition count");
            break;
        }
        // Special repetitions:
        case '+': case '*': case '?': {
            debug("Special repetitions\n");
            ssize_t min = -1, max = -1;
            switch (*str) {
                case '+': min = 1, max = -1; break;
                case '*': min = 0, max = -1; break;
                case '?': min = 0, max = 1; break;
            }
            ++str;
            str = parse_repeated(op, str, min, max, "Expected pattern after +");
            break;
        }
        // Lookbehind
        case '<': {
            ++str;
            debug("Lookbehind\n");
            vm_op_t *pat = parse(str);
            check(pat, "Expected pattern after <");
            check(pat->len != -1, "Lookbehind patterns must have a fixed length");
            str = pat->end;
            op->op = VM_AFTER;
            op->len = 0;
            op->args.pat = pat;
            break;
        }
        // Lookahead
        case '>': {
            ++str;
            debug("Lookahead\n");
            vm_op_t *pat = parse(str);
            check(pat, "Expected pattern after >");
            str = pat->end;
            op->op = VM_BEFORE;
            op->len = 0;
            op->args.pat = pat;
            break;
        }
        // Parentheses
        case '(': {
            debug("Open paren (\n");
            ++str;
            // The group is represented by its contents, not by its own op.
            free(op);
            op = parse(str);
            check(op, "Expected pattern inside parentheses");
            op = expand_choices(op);
            str = skip_spaces(op->end);
            check(*str == ')', "Expected closing parenthesis");
            ++str;
            debug("Close paren )\n");
            break;
        }
        // Capture
        case '@': {
            debug("Capture\n");
            ++str;
            op->op = VM_CAPTURE;
            str = skip_spaces(str);
            if (*str == '[') {
                ++str;
                const char *closing = strchr(str, ']');
                check(closing, "Expected closing ']'");
                op->args.capture.name = copy_string(str, (size_t)(closing - str));
                debug("named \"%s\"\n", op->args.capture.name);
                str = closing + 1;
            }
            vm_op_t *pat = parse(str);
            check(pat, "Expected pattern after @");
            str = pat->end;
            op->args.capture.capture_pat = pat;
            op->len = pat->len;
            break;
        }
        // Replacement
        case '{': {
            debug("Replacement {\n");
            str = skip_spaces(str + 1);
            vm_op_t *pat = NULL;
            if (*str != '~') {
                pat = parse(str);
                check(pat, "Expected pattern after '{'");
                pat = expand_choices(pat);
                str = skip_spaces(pat->end);
            }
            check(*str == '~', "Expected '~' in replacement");
            str = skip_spaces(str + 1);
            const char quote = *str;
            check(quote == '\'' || quote == '"',
                  "Expected string literal for replacement");
            ++str;
            const char *replacement = str;
            for (; *str && *str != quote; str++) {
                if (*str == '\\') {
                    check(str[1], "Expected more string contents after backslash");
                    ++str;
                }
            }
            check(*str == quote, "Missing closing quote");
            replacement = copy_string(replacement, (size_t)(str - replacement));
            str = skip_spaces(str + 1);
            check(*str == '}', "Expected a closing '}'");
            ++str;
            op->op = VM_REPLACE;
            op->args.replace.replace_pat = pat;
            op->args.replace.replacement = replacement;
            debug(" rep = \"%s\"", replacement);
            debug("}");
            if (pat != NULL) op->len = pat->len;
            break;
        }
        // Whitespace
        case '_': {
            debug("Whitespace\n");
            ++str;
            op->op = VM_REF;
            op->args.s = copy_string("_", 1);
            break;
        }
        default: {
            // Reference to a named rule
            if (!isalpha((unsigned char)*str)) {
                free(op);
                return NULL;
            }
            const char *refname = str;
            size_t len = 1;
            for (++str; isalnum((unsigned char)*str); ++str)
                ++len;
            op->op = VM_REF;
            op->args.s = copy_string(refname, len);
            debug("Ref: %s\n", op->args.s);
            break;
        }
    }
    op->end = str;
    return op;
}

// Register the rule `name`, defined by the pattern text `def`.
// `name` is stored by pointer, so it must outlive the table.
static void load_def(const char *name, const char *def)
{
    check(ndefs < sizeof(defs) / sizeof(defs[0]), "Too many definitions\n");
    vm_op_t *op = parse(def);
    check(op, "Failed to parse definition of '%s'\n", name);
    defs[ndefs].name = name;
    // Parse the whole definition (sequences and alternatives), not just its
    // first atom.
    defs[ndefs].op = expand_choices(op);
    ++ndefs;
}

// Register the builtin rules.
static void load_defs(void)
{
    load_def("_", "` /\\t/\\n/\\r");
    load_def("nl", "\\n");
    load_def("crlf", "\\r\\n");
    load_def("abc", "`a,z");
    load_def("ABC", "`A,Z");
    load_def("Abc", "`a,z/`A,Z");
    load_def("digit", "`0,9");
    load_def("number", "+`0,9 ?(`. *`0,9) / `. +`0,9");
    load_def("hex", "`0,9/`a,f");
    load_def("Hex", "`0,9/`a,f/`A,F");
    load_def("HEX", "`0,9/`A,F");
    load_def("id", "(`a,z/`A,Z/`_) *(`a,z/`A,Z/`_/`0,9)");
    load_def("line", "&(?\\r\\n / !.)");
    // NOTE(review): the four bracket rules below all recurse into "parens",
    // and their greedy `.` alternative also consumes the closing delimiter.
    // They are kept exactly as written; confirm the intended grammar.
    load_def("parens", "`( *(parens / .) `)");
    load_def("braces", "`{ *(parens / .) `}");
    load_def("brackets", "`[ *(parens / .) `]");
    load_def("anglebraces", "`< *(parens / .) `>");
}

// bpeg [pattern [text]]: match `pattern` at the start of `text` and print the
// text with the matched part shown in reverse video, or "No match".
int main(int argc, char *argv[])
{
    load_defs();

    const char *lang = argc > 1 ? argv[1] : "'x''y'";
    vm_op_t *op = parse(lang);
    check(op, "Failed to parse input");
    op = expand_choices(op);

    // TODO: check for semicolon and more rules

    const char *input = argc > 2 ? argv[2] : "xyz";

    // Ensure the string has a NUL byte to its left (lookbehind relies on it):
    const size_t input_len = strlen(input);
    char *lpadded = calloc(input_len + 2, sizeof(char));
    check(lpadded, "Out of memory\n");
    memcpy(&lpadded[1], input, input_len);
    const char *str = &lpadded[1];

    match_t *m = match(str, op);
    if (m == NULL) {
        printf("No match\n");
    } else {
        printf("%.*s\033[7m%.*s\033[0m%s\n",
               (int)(m->start - str), str,
               (int)(m->end - m->start), m->start,
               m->end);
        free_match(m);
    }
    free(lpadded);

    return 0;
}