450 lines
14 KiB
C
450 lines
14 KiB
C
/*
|
|
* vm.c - Code for the BPEG virtual machine that performs the matching.
|
|
*/
|
|
#include "vm.h"
|
|
#include "utils.h"
|
|
|
|
/*
|
|
* Recursively deallocate a match object and set to NULL
|
|
*/
|
|
void destroy_match(match_t **m)
|
|
{
|
|
if (!m || !*m) return;
|
|
destroy_match(&((*m)->child));
|
|
destroy_match(&((*m)->nextsibling));
|
|
*m = NULL;
|
|
}
|
|
|
|
/*
|
|
* Run virtual machine operation against a string and return
|
|
* a match struct, or NULL if no match is found.
|
|
* The returned value should be free()'d to avoid memory leaking.
|
|
*/
|
|
match_t *match(grammar_t *g, const char *str, vm_op_t *op)
|
|
{
|
|
//tailcall:
|
|
switch (op->op) {
|
|
case VM_EMPTY: {
|
|
match_t *m = calloc(sizeof(match_t), 1);
|
|
m->op = op;
|
|
m->start = str;
|
|
m->end = str;
|
|
return m;
|
|
}
|
|
case VM_ANYCHAR: {
|
|
if (!*str || (!op->multiline && *str == '\n'))
|
|
return NULL;
|
|
match_t *m = calloc(sizeof(match_t), 1);
|
|
m->op = op;
|
|
m->start = str;
|
|
m->end = str+1;
|
|
return m;
|
|
}
|
|
case VM_STRING: {
|
|
if (strncmp(str, op->args.s, op->len) != 0)
|
|
return NULL;
|
|
match_t *m = calloc(sizeof(match_t), 1);
|
|
m->op = op;
|
|
m->start = str;
|
|
m->end = str + op->len;
|
|
return m;
|
|
}
|
|
case VM_RANGE: {
|
|
if (*str < op->args.range.low || *str > op->args.range.high)
|
|
return NULL;
|
|
match_t *m = calloc(sizeof(match_t), 1);
|
|
m->op = op;
|
|
m->start = str;
|
|
m->end = str + 1;
|
|
return m;
|
|
}
|
|
case VM_NOT: case VM_ANYTHING_BUT: {
|
|
if (op->op == VM_ANYTHING_BUT)
|
|
if (!*str || (!op->multiline && *str == '\n'))
|
|
return NULL;
|
|
match_t *m = match(g, str, op->args.pat);
|
|
if (m != NULL) {
|
|
destroy_match(&m);
|
|
return NULL;
|
|
}
|
|
m = calloc(sizeof(match_t), 1);
|
|
m->op = op;
|
|
m->start = str;
|
|
if (op->op == VM_ANYTHING_BUT) ++str;
|
|
m->end = str;
|
|
return m;
|
|
}
|
|
case VM_UPTO_AND: {
|
|
match_t *m = calloc(sizeof(match_t), 1);
|
|
m->start = str;
|
|
m->op = op;
|
|
match_t *p = NULL;
|
|
for (const char *prev = NULL; p == NULL && prev < str; ) {
|
|
prev = str;
|
|
p = match(g, str, op->args.pat);
|
|
if (*str && (op->multiline || *str != '\n'))
|
|
++str;
|
|
}
|
|
if (p) {
|
|
m->end = p->end;
|
|
m->child = p;
|
|
return m;
|
|
}
|
|
destroy_match(&m);
|
|
return NULL;
|
|
}
|
|
case VM_REPEAT: {
|
|
match_t *m = calloc(sizeof(match_t), 1);
|
|
m->start = str;
|
|
m->end = str;
|
|
m->op = op;
|
|
if (op->args.repetitions.max == 0) return m;
|
|
|
|
match_t **dest = &m->child;
|
|
|
|
const char *prev = str;
|
|
size_t reps;
|
|
for (reps = 0; reps < (size_t)op->args.repetitions.max; ++reps) {
|
|
// Separator
|
|
match_t *sep = NULL;
|
|
if (op->args.repetitions.sep != NULL && reps > 0) {
|
|
sep = match(g, str, op->args.repetitions.sep);
|
|
if (sep == NULL) break;
|
|
str = sep->end;
|
|
}
|
|
match_t *p = match(g, str, op->args.repetitions.repeat_pat);
|
|
if (p == NULL || (p->end == prev && reps > 0)) { // Prevent infinite loops
|
|
destroy_match(&sep);
|
|
destroy_match(&p);
|
|
break;
|
|
}
|
|
if (sep) {
|
|
*dest = sep;
|
|
dest = &sep->nextsibling;
|
|
}
|
|
*dest = p;
|
|
dest = &p->nextsibling;
|
|
str = p->end;
|
|
prev = str;
|
|
}
|
|
|
|
if ((ssize_t)reps < op->args.repetitions.min) {
|
|
destroy_match(&m);
|
|
return NULL;
|
|
}
|
|
m->end = str;
|
|
return m;
|
|
}
|
|
case VM_AFTER: {
|
|
ssize_t backtrack = op->args.pat->len;
|
|
check(backtrack != -1, "'<' is only allowed for fixed-length operations");
|
|
// Check for necessary space:
|
|
for (int i = 0; i < backtrack; i++) {
|
|
if (str[-i] == '\0') return NULL;
|
|
}
|
|
match_t *before = match(g, str - backtrack, op->args.pat);
|
|
if (before == NULL) return NULL;
|
|
destroy_match(&before);
|
|
match_t *m = calloc(sizeof(match_t), 1);
|
|
m->start = str;
|
|
m->end = str;
|
|
m->op = op;
|
|
return m;
|
|
}
|
|
case VM_BEFORE: {
|
|
match_t *after = match(g, str, op->args.pat);
|
|
if (after == NULL) return NULL;
|
|
destroy_match(&after);
|
|
match_t *m = calloc(sizeof(match_t), 1);
|
|
m->start = str;
|
|
m->end = str;
|
|
m->op = op;
|
|
return m;
|
|
}
|
|
case VM_CAPTURE: {
|
|
match_t *p = match(g, str, op->args.pat);
|
|
if (p == NULL) return NULL;
|
|
match_t *m = calloc(sizeof(match_t), 1);
|
|
m->start = str;
|
|
m->end = p->end;
|
|
m->op = op;
|
|
m->child = p;
|
|
m->is_capture = 1;
|
|
if (op->args.capture.name)
|
|
m->name_or_replacement = op->args.capture.name;
|
|
return m;
|
|
}
|
|
case VM_OTHERWISE: {
|
|
match_t *m = match(g, str, op->args.multiple.first);
|
|
if (m == NULL) m = match(g, str, op->args.multiple.second);
|
|
return m;
|
|
}
|
|
case VM_CHAIN: {
|
|
match_t *m1 = match(g, str, op->args.multiple.first);
|
|
if (m1 == NULL) return NULL;
|
|
match_t *m2 = match(g, m1->end, op->args.multiple.second);
|
|
if (m2 == NULL) {
|
|
destroy_match(&m1);
|
|
return NULL;
|
|
}
|
|
match_t *m = calloc(sizeof(match_t), 1);
|
|
m->start = str;
|
|
m->end = m2->end;
|
|
m->op = op;
|
|
m->child = m1;
|
|
m1->nextsibling = m2;
|
|
return m;
|
|
}
|
|
case VM_REPLACE: {
|
|
match_t *m = calloc(sizeof(match_t), 1);
|
|
m->start = str;
|
|
m->op = op;
|
|
if (op->args.replace.replace_pat) {
|
|
match_t *p = match(g, str, op->args.replace.replace_pat);
|
|
if (p == NULL) return NULL;
|
|
m->child = p;
|
|
m->end = p->end;
|
|
} else {
|
|
m->end = m->start;
|
|
}
|
|
m->is_replacement = 1;
|
|
m->name_or_replacement = op->args.replace.replacement;
|
|
return m;
|
|
}
|
|
case VM_REF: {
|
|
// Search backwards so newer defs take precedence
|
|
for (int i = g->size-1; i >= 0; i--) {
|
|
if (streq(g->definitions[i].name, op->args.s)) {
|
|
// Bingo!
|
|
/*
|
|
op = g->definitions[i].op;
|
|
goto tailcall;
|
|
*/
|
|
match_t *p = match(g, str, g->definitions[i].op);
|
|
if (p == NULL) return NULL;
|
|
match_t *m = calloc(sizeof(match_t), 1);
|
|
m->start = p->start;
|
|
m->end = p->end;
|
|
m->op = op;
|
|
m->child = p;
|
|
m->name_or_replacement = g->definitions[i].name;
|
|
m->is_ref = 1;
|
|
return m;
|
|
}
|
|
}
|
|
check(0, "Unknown identifier: '%s'", op->args.s);
|
|
return NULL;
|
|
}
|
|
default: {
|
|
fprintf(stderr, "Unknown opcode: %d", op->op);
|
|
_exit(1);
|
|
return NULL;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
void print_pattern(vm_op_t *op)
|
|
{
|
|
switch (op->op) {
|
|
case VM_REF: fprintf(stderr, "a $%s", op->args.s); break;
|
|
case VM_EMPTY: fprintf(stderr, "the empty string"); break;
|
|
case VM_ANYCHAR: fprintf(stderr, "any char"); break;
|
|
case VM_STRING: fprintf(stderr, "string \"%s\"", op->args.s); break;
|
|
case VM_RANGE: {
|
|
fprintf(stderr, "char from %c-%c", op->args.range.low, op->args.range.high);
|
|
break;
|
|
}
|
|
case VM_REPEAT: {
|
|
if (op->args.repetitions.max == -1)
|
|
fprintf(stderr, "%ld or more (", op->args.repetitions.min);
|
|
else
|
|
fprintf(stderr, "%ld-%ld of (",
|
|
op->args.repetitions.min,
|
|
op->args.repetitions.max);
|
|
print_pattern(op->args.repetitions.repeat_pat);
|
|
fprintf(stderr, ")");
|
|
if (op->args.repetitions.sep) {
|
|
fprintf(stderr, " separated by (");
|
|
print_pattern(op->args.repetitions.sep);
|
|
fprintf(stderr, ")");
|
|
}
|
|
break;
|
|
}
|
|
case VM_NOT: {
|
|
fprintf(stderr, "not (");
|
|
print_pattern(op->args.pat);
|
|
fprintf(stderr, ")");
|
|
break;
|
|
}
|
|
case VM_UPTO_AND: {
|
|
fprintf(stderr, "text up to and including (");
|
|
print_pattern(op->args.pat);
|
|
fprintf(stderr, ")");
|
|
break;
|
|
}
|
|
case VM_ANYTHING_BUT: {
|
|
fprintf(stderr, "anything but (");
|
|
print_pattern(op->args.pat);
|
|
fprintf(stderr, ")");
|
|
break;
|
|
}
|
|
case VM_AFTER: {
|
|
fprintf(stderr, "after (");
|
|
print_pattern(op->args.pat);
|
|
fprintf(stderr, ")");
|
|
break;
|
|
}
|
|
case VM_BEFORE: {
|
|
fprintf(stderr, "before (");
|
|
print_pattern(op->args.pat);
|
|
fprintf(stderr, ")");
|
|
break;
|
|
}
|
|
case VM_CAPTURE: {
|
|
fprintf(stderr, "capture (");
|
|
print_pattern(op->args.pat);
|
|
fprintf(stderr, ")");
|
|
if (op->args.capture.name)
|
|
fprintf(stderr, " and call it %s", op->args.capture.name);
|
|
break;
|
|
}
|
|
case VM_OTHERWISE: {
|
|
fprintf(stderr, "(");
|
|
print_pattern(op->args.multiple.first);
|
|
fprintf(stderr, ") or else ");
|
|
if (op->args.multiple.second->op != VM_OTHERWISE)
|
|
fprintf(stderr, "(");
|
|
print_pattern(op->args.multiple.second);
|
|
if (op->args.multiple.second->op != VM_OTHERWISE)
|
|
fprintf(stderr, ")");
|
|
break;
|
|
}
|
|
case VM_CHAIN: {
|
|
fprintf(stderr, "(");
|
|
print_pattern(op->args.multiple.first);
|
|
fprintf(stderr, ") then ");
|
|
if (op->args.multiple.second->op != VM_CHAIN)
|
|
fprintf(stderr, "(");
|
|
print_pattern(op->args.multiple.second);
|
|
if (op->args.multiple.second->op != VM_CHAIN)
|
|
fprintf(stderr, ")");
|
|
break;
|
|
}
|
|
case VM_REPLACE: {
|
|
fprintf(stderr, "replace ");
|
|
if (op->args.replace.replace_pat) {
|
|
fprintf(stderr, "(");
|
|
print_pattern(op->args.replace.replace_pat);
|
|
fprintf(stderr, ")");
|
|
} else
|
|
fprintf(stderr, "\"\"");
|
|
fprintf(stderr, " with \"%s\"", op->args.replace.replacement);
|
|
break;
|
|
}
|
|
default: break;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Get a specific numbered pattern capture.
|
|
*/
|
|
static match_t *get_capture_n(match_t *m, int *n)
|
|
{
|
|
if (!m) return NULL;
|
|
if (*n == 0) return m;
|
|
if (m->is_capture && *n == 1) return m;
|
|
if (m->is_capture) --(*n);
|
|
for (match_t *c = m->child; c; c = c->nextsibling) {
|
|
match_t *cap = get_capture_n(c, n);
|
|
if (cap) return cap;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* Get a named capture.
|
|
*/
|
|
static match_t *get_capture_named(match_t *m, const char *name)
|
|
{
|
|
if (m->is_capture && m->name_or_replacement && streq(m->name_or_replacement, name))
|
|
return m;
|
|
for (match_t *c = m->child; c; c = c->nextsibling) {
|
|
match_t *cap = get_capture_named(c, name);
|
|
if (cap) return cap;
|
|
}
|
|
return NULL;
|
|
}
|
|
|
|
/*
|
|
* Print a match with replacements and highlighting.
|
|
*/
|
|
void print_match(match_t *m, const char *color, int verbose)
|
|
{
|
|
if (m->is_replacement) {
|
|
printf("\033[0;34m");
|
|
for (const char *r = m->name_or_replacement; *r; ) {
|
|
if (*r == '\\') {
|
|
fputc(unescapechar(r, &r), stdout);
|
|
continue;
|
|
} else if (*r != '@') {
|
|
fputc(*r, stdout);
|
|
++r;
|
|
continue;
|
|
}
|
|
|
|
++r;
|
|
match_t *cap = NULL;
|
|
switch (*r) {
|
|
case '0': case '1': case '2': case '3': case '4':
|
|
case '5': case '6': case '7': case '8': case '9': {
|
|
int n = (int)strtol(r, (char**)&r, 10);
|
|
cap = get_capture_n(m->child, &n);
|
|
break;
|
|
}
|
|
case '[': {
|
|
char *closing = strchr(r+1, ']');
|
|
if (!closing) {
|
|
fputc('@', stdout);
|
|
break;
|
|
}
|
|
++r;
|
|
char *name = strndup(r, (size_t)(closing-r));
|
|
cap = get_capture_named(m, name);
|
|
free(name);
|
|
r = closing + 1;
|
|
break;
|
|
}
|
|
default: {
|
|
fputc('@', stdout);
|
|
break;
|
|
}
|
|
}
|
|
if (cap != NULL) {
|
|
print_match(cap, "\033[0;35m", verbose);
|
|
printf("\033[0;34m");
|
|
}
|
|
}
|
|
} else {
|
|
const char *name = m->name_or_replacement;
|
|
if (verbose && m->is_ref && name && isupper(name[0]))
|
|
printf("\033[0;2;35m{%s:", name);
|
|
//if (m->is_capture && name)
|
|
// printf("\033[0;2;33m[%s:", name);
|
|
const char *prev = m->start;
|
|
for (match_t *child = m->child; child; child = child->nextsibling) {
|
|
if (child->start > prev)
|
|
printf("%s%.*s", color, (int)(child->start - prev), prev);
|
|
print_match(child, m->is_capture ? "\033[0;1m" : color, verbose);
|
|
prev = child->end;
|
|
}
|
|
if (m->end > prev)
|
|
printf("%s%.*s", color, (int)(m->end - prev), prev);
|
|
if (verbose && m->is_ref && name && isupper(name[0]))
|
|
printf("\033[0;2;35m}");
|
|
//if (m->is_capture && name)
|
|
// printf("\033[0;2;33m]");
|
|
}
|
|
}
|
|
|