diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2020-09-11 01:28:06 -0700 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2020-09-11 01:28:06 -0700 |
| commit | 2a11acc66738d6300bfa90a22adcd540371060f3 (patch) | |
| tree | 4966c9864434da3e43594f0d86dd9362f367fd8e /vm.c | |
| parent | 8a846230f7b4269af08a6b6496ebd28c4ff459ba (diff) | |
Major overhaul refactor restructuring
Diffstat (limited to 'vm.c')
| -rw-r--r-- | vm.c | 449 |
1 files changed, 449 insertions, 0 deletions
@@ -0,0 +1,449 @@ +/* + * vm.c - Code for the BPEG virtual machine that performs the matching. + */ +#include "vm.h" +#include "utils.h" + +/* + * Recursively deallocate a match object and set to NULL + */ +void destroy_match(match_t **m) +{ + if (!m || !*m) return; + destroy_match(&((*m)->child)); + destroy_match(&((*m)->nextsibling)); + *m = NULL; +} + +/* + * Run virtual machine operation against a string and return + * a match struct, or NULL if no match is found. + * The returned value should be free()'d to avoid memory leaking. + */ +match_t *match(grammar_t *g, const char *str, vm_op_t *op) +{ + //tailcall: + switch (op->op) { + case VM_EMPTY: { + match_t *m = calloc(sizeof(match_t), 1); + m->op = op; + m->start = str; + m->end = str; + return m; + } + case VM_ANYCHAR: { + if (!*str || (!op->multiline && *str == '\n')) + return NULL; + match_t *m = calloc(sizeof(match_t), 1); + m->op = op; + m->start = str; + m->end = str+1; + return m; + } + case VM_STRING: { + if (strncmp(str, op->args.s, op->len) != 0) + return NULL; + match_t *m = calloc(sizeof(match_t), 1); + m->op = op; + m->start = str; + m->end = str + op->len; + return m; + } + case VM_RANGE: { + if (*str < op->args.range.low || *str > op->args.range.high) + return NULL; + match_t *m = calloc(sizeof(match_t), 1); + m->op = op; + m->start = str; + m->end = str + 1; + return m; + } + case VM_NOT: case VM_ANYTHING_BUT: { + if (op->op == VM_ANYTHING_BUT) + if (!*str || (!op->multiline && *str == '\n')) + return NULL; + match_t *m = match(g, str, op->args.pat); + if (m != NULL) { + destroy_match(&m); + return NULL; + } + m = calloc(sizeof(match_t), 1); + m->op = op; + m->start = str; + if (op->op == VM_ANYTHING_BUT) ++str; + m->end = str; + return m; + } + case VM_UPTO_AND: { + match_t *m = calloc(sizeof(match_t), 1); + m->start = str; + m->op = op; + match_t *p = NULL; + for (const char *prev = NULL; p == NULL && prev < str; ) { + prev = str; + p = match(g, str, op->args.pat); + if (*str && (op->multiline || *str != '\n')) + ++str; + } + if (p) { + m->end = p->end; + m->child = p; + return m; + } + destroy_match(&m); + return NULL; + } + case VM_REPEAT: { + match_t *m = calloc(sizeof(match_t), 1); + m->start = str; + m->end = str; + m->op = op; + if (op->args.repetitions.max == 0) return m; + + match_t **dest = &m->child; + + const char *prev = str; + size_t reps; + for (reps = 0; reps < (size_t)op->args.repetitions.max; ++reps) { + // Separator + match_t *sep = NULL; + if (op->args.repetitions.sep != NULL && reps > 0) { + sep = match(g, str, op->args.repetitions.sep); + if (sep == NULL) break; + str = sep->end; + } + match_t *p = match(g, str, op->args.repetitions.repeat_pat); + if (p == NULL || (p->end == prev && reps > 0)) { // Prevent infinite loops + destroy_match(&sep); + destroy_match(&p); + break; + } + if (sep) { + *dest = sep; + dest = &sep->nextsibling; + } + *dest = p; + dest = &p->nextsibling; + str = p->end; + prev = str; + } + + if ((ssize_t)reps < op->args.repetitions.min) { + destroy_match(&m); + return NULL; + } + m->end = str; + return m; + } + case VM_AFTER: { + ssize_t backtrack = op->args.pat->len; + check(backtrack != -1, "'<' is only allowed for fixed-length operations"); + // Check for necessary space: + for (int i = 0; i < backtrack; i++) { + if (str[-i] == '\0') return NULL; + } + match_t *before = match(g, str - backtrack, op->args.pat); + if (before == NULL) return NULL; + destroy_match(&before); + match_t *m = calloc(sizeof(match_t), 1); + m->start = str; + m->end = str; + m->op = op; + return m; + } + case VM_BEFORE: { + match_t *after = match(g, str, op->args.pat); + if (after == NULL) return NULL; + destroy_match(&after); + match_t *m = calloc(sizeof(match_t), 1); + m->start = str; + m->end = str; + m->op = op; + return m; + } + case VM_CAPTURE: { + match_t *p = match(g, str, op->args.pat); + if (p == NULL) return NULL; + match_t *m = calloc(sizeof(match_t), 1); + m->start = str; + m->end = p->end; + m->op = op; + m->child = p; + m->is_capture = 1; + if (op->args.capture.name) + m->name_or_replacement = op->args.capture.name; + return m; + } + case VM_OTHERWISE: { + match_t *m = match(g, str, op->args.multiple.first); + if (m == NULL) m = match(g, str, op->args.multiple.second); + return m; + } + case VM_CHAIN: { + match_t *m1 = match(g, str, op->args.multiple.first); + if (m1 == NULL) return NULL; + match_t *m2 = match(g, m1->end, op->args.multiple.second); + if (m2 == NULL) { + destroy_match(&m1); + return NULL; + } + match_t *m = calloc(sizeof(match_t), 1); + m->start = str; + m->end = m2->end; + m->op = op; + m->child = m1; + m1->nextsibling = m2; + return m; + } + case VM_REPLACE: { + match_t *m = calloc(sizeof(match_t), 1); + m->start = str; + m->op = op; + if (op->args.replace.replace_pat) { + match_t *p = match(g, str, op->args.replace.replace_pat); + if (p == NULL) return NULL; + m->child = p; + m->end = p->end; + } else { + m->end = m->start; + } + m->is_replacement = 1; + m->name_or_replacement = op->args.replace.replacement; + return m; + } + case VM_REF: { + // Search backwards so newer defs take precedence + for (int i = g->size-1; i >= 0; i--) { + if (streq(g->definitions[i].name, op->args.s)) { + // Bingo! + /* + op = g->definitions[i].op; + goto tailcall; + */ + match_t *p = match(g, str, g->definitions[i].op); + if (p == NULL) return NULL; + match_t *m = calloc(sizeof(match_t), 1); + m->start = p->start; + m->end = p->end; + m->op = op; + m->child = p; + m->name_or_replacement = g->definitions[i].name; + m->is_ref = 1; + return m; + } + } + check(0, "Unknown identifier: '%s'", op->args.s); + return NULL; + } + default: { + fprintf(stderr, "Unknown opcode: %d", op->op); + _exit(1); + return NULL; + } + } +} + + +void print_pattern(vm_op_t *op) +{ + switch (op->op) { + case VM_REF: fprintf(stderr, "a $%s", op->args.s); break; + case VM_EMPTY: fprintf(stderr, "the empty string"); break; + case VM_ANYCHAR: fprintf(stderr, "any char"); break; + case VM_STRING: fprintf(stderr, "string \"%s\"", op->args.s); break; + case VM_RANGE: { + fprintf(stderr, "char from %c-%c", op->args.range.low, op->args.range.high); + break; + } + case VM_REPEAT: { + if (op->args.repetitions.max == -1) + fprintf(stderr, "%ld or more (", op->args.repetitions.min); + else + fprintf(stderr, "%ld-%ld of (", + op->args.repetitions.min, + op->args.repetitions.max); + print_pattern(op->args.repetitions.repeat_pat); + fprintf(stderr, ")"); + if (op->args.repetitions.sep) { + fprintf(stderr, " separated by ("); + print_pattern(op->args.repetitions.sep); + fprintf(stderr, ")"); + } + break; + } + case VM_NOT: { + fprintf(stderr, "not ("); + print_pattern(op->args.pat); + fprintf(stderr, ")"); + break; + } + case VM_UPTO_AND: { + fprintf(stderr, "text up to and including ("); + print_pattern(op->args.pat); + fprintf(stderr, ")"); + break; + } + case VM_ANYTHING_BUT: { + fprintf(stderr, "anything but ("); + print_pattern(op->args.pat); + fprintf(stderr, ")"); + break; + } + case VM_AFTER: { + fprintf(stderr, "after ("); + print_pattern(op->args.pat); + fprintf(stderr, ")"); + break; + } + case VM_BEFORE: { + fprintf(stderr, "before ("); + print_pattern(op->args.pat); + fprintf(stderr, ")"); + break; + } + case VM_CAPTURE: { + fprintf(stderr, "capture ("); + print_pattern(op->args.pat); + fprintf(stderr, ")"); + if (op->args.capture.name) + fprintf(stderr, " and call it %s", op->args.capture.name); + break; + } + case VM_OTHERWISE: { + fprintf(stderr, "("); + print_pattern(op->args.multiple.first); + fprintf(stderr, ") or else "); + if (op->args.multiple.second->op != VM_OTHERWISE) + fprintf(stderr, "("); + print_pattern(op->args.multiple.second); + if (op->args.multiple.second->op != VM_OTHERWISE) + fprintf(stderr, ")"); + break; + } + case VM_CHAIN: { + fprintf(stderr, "("); + print_pattern(op->args.multiple.first); + fprintf(stderr, ") then "); + if (op->args.multiple.second->op != VM_CHAIN) + fprintf(stderr, "("); + print_pattern(op->args.multiple.second); + if (op->args.multiple.second->op != VM_CHAIN) + fprintf(stderr, ")"); + break; + } + case VM_REPLACE: { + fprintf(stderr, "replace "); + if (op->args.replace.replace_pat) { + fprintf(stderr, "("); + print_pattern(op->args.replace.replace_pat); + fprintf(stderr, ")"); + } else + fprintf(stderr, "\"\""); + fprintf(stderr, " with \"%s\"", op->args.replace.replacement); + break; + } + default: break; + } +} + +/* + * Get a specific numbered pattern capture. + */ +static match_t *get_capture_n(match_t *m, int *n) +{ + if (!m) return NULL; + if (*n == 0) return m; + if (m->is_capture && *n == 1) return m; + if (m->is_capture) --(*n); + for (match_t *c = m->child; c; c = c->nextsibling) { + match_t *cap = get_capture_n(c, n); + if (cap) return cap; + } + return NULL; +} + +/* + * Get a named capture. + */ +static match_t *get_capture_named(match_t *m, const char *name) +{ + if (m->is_capture && m->name_or_replacement && streq(m->name_or_replacement, name)) + return m; + for (match_t *c = m->child; c; c = c->nextsibling) { + match_t *cap = get_capture_named(c, name); + if (cap) return cap; + } + return NULL; +} + +/* + * Print a match with replacements and highlighting. + */ +void print_match(match_t *m, const char *color, int verbose) +{ + if (m->is_replacement) { + printf("\033[0;34m"); + for (const char *r = m->name_or_replacement; *r; ) { + if (*r == '\\') { + fputc(unescapechar(r, &r), stdout); + continue; + } else if (*r != '@') { + fputc(*r, stdout); + ++r; + continue; + } + + ++r; + match_t *cap = NULL; + switch (*r) { + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': { + int n = (int)strtol(r, (char**)&r, 10); + cap = get_capture_n(m->child, &n); + break; + } + case '[': { + char *closing = strchr(r+1, ']'); + if (!closing) { + fputc('@', stdout); + break; + } + ++r; + char *name = strndup(r, (size_t)(closing-r)); + cap = get_capture_named(m, name); + free(name); + r = closing + 1; + break; + } + default: { + fputc('@', stdout); + break; + } + } + if (cap != NULL) { + print_match(cap, "\033[0;35m", verbose); + printf("\033[0;34m"); + } + } + } else { + const char *name = m->name_or_replacement; + if (verbose && m->is_ref && name && isupper(name[0])) + printf("\033[0;2;35m{%s:", name); + //if (m->is_capture && name) + // printf("\033[0;2;33m[%s:", name); + const char *prev = m->start; + for (match_t *child = m->child; child; child = child->nextsibling) { + if (child->start > prev) + printf("%s%.*s", color, (int)(child->start - prev), prev); + print_match(child, m->is_capture ? "\033[0;1m" : color, verbose); + prev = child->end; + } + if (m->end > prev) + printf("%s%.*s", color, (int)(m->end - prev), prev); + if (verbose && m->is_ref && name && isupper(name[0])) + printf("\033[0;2;35m}"); + //if (m->is_capture && name) + // printf("\033[0;2;33m]"); + } +} + |
