2021-01-12 21:04:43 -08:00
|
|
|
//
// vm.c - Code for the BP virtual machine that performs the matching.
//
|
2020-09-16 20:38:58 -07:00
|
|
|
|
|
|
|
#include <ctype.h>
|
2020-12-27 19:48:52 -08:00
|
|
|
#include <stdio.h>
|
2020-12-14 22:13:47 -08:00
|
|
|
#include <stdlib.h>
|
|
|
|
#include <string.h>
|
2020-09-16 20:38:58 -07:00
|
|
|
|
2020-09-11 02:55:15 -07:00
|
|
|
#include "grammar.h"
|
2020-12-14 22:01:50 -08:00
|
|
|
#include "types.h"
|
2020-09-11 01:28:06 -07:00
|
|
|
#include "utils.h"
|
2020-12-14 22:01:50 -08:00
|
|
|
#include "vm.h"
|
2020-09-11 01:28:06 -07:00
|
|
|
|
2021-01-12 22:33:28 -08:00
|
|
|
__attribute__((nonnull, pure))
|
|
|
|
static inline const char *next_char(file_t *f, const char *str);
|
|
|
|
__attribute__((nonnull))
|
|
|
|
static const char *match_backref(const char *str, vm_op_t *op, match_t *cap, unsigned int flags);
|
|
|
|
__attribute__((hot, nonnull(2,3,4)))
|
2021-01-13 18:56:22 -08:00
|
|
|
static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags);
|
2021-01-12 22:33:28 -08:00
|
|
|
__attribute__((nonnull))
|
|
|
|
static match_t *get_capture_by_num(match_t *m, int *n);
|
|
|
|
__attribute__((nonnull, pure))
|
|
|
|
static match_t *get_capture_by_name(match_t *m, const char *name);
|
2020-09-12 18:20:13 -07:00
|
|
|
|
2021-01-13 18:56:22 -08:00
|
|
|
//
// Store `m` into the slot `*owner` and count the new reference.
// A match may not be stored into one of its own link fields.
//
static inline void set_owner(match_t *m, match_t **owner)
{
    match_t **own_child = &m->child;
    match_t **own_sibling = &m->nextsibling;
    check(owner != own_child, "Circular ownership");
    check(owner != own_sibling, "Circular ownership");
    *owner = m;
    m->refcount += 1;
}
|
|
|
|
|
2021-01-12 21:04:43 -08:00
|
|
|
//
|
2021-01-12 23:08:32 -08:00
|
|
|
// Return the location of the next character or UTF8 codepoint.
|
|
|
|
// (i.e. skip forward one codepoint at a time, not one byte at a time)
|
2021-01-12 21:04:43 -08:00
|
|
|
//
|
2020-09-25 08:41:29 -07:00
|
|
|
static inline const char *next_char(file_t *f, const char *str)
|
|
|
|
{
|
|
|
|
char c = *str;
|
|
|
|
++str;
|
|
|
|
if (__builtin_expect(!(c & 0x80), 1))
|
|
|
|
return str;
|
|
|
|
|
|
|
|
if (__builtin_expect(str < f->end && !!(*str & 0x80), 1))
|
|
|
|
++str;
|
|
|
|
if (c > '\xDF' && __builtin_expect(str < f->end && !!(*str & 0x80), 1))
|
|
|
|
++str;
|
|
|
|
if (c > '\xEF' && __builtin_expect(str < f->end && !!(*str & 0x80), 1))
|
|
|
|
++str;
|
|
|
|
return str;
|
|
|
|
}
|
|
|
|
|
2021-01-12 21:04:43 -08:00
|
|
|
//
|
|
|
|
// Recursively deallocate a match object and set to NULL
|
|
|
|
//
|
2020-09-11 01:28:06 -07:00
|
|
|
void destroy_match(match_t **m)
|
|
|
|
{
|
2021-01-13 18:56:22 -08:00
|
|
|
if (*m == NULL) return;
|
|
|
|
if (--(*m)->refcount > 0) {
|
|
|
|
*m = NULL;
|
|
|
|
return;
|
|
|
|
}
|
2020-09-11 01:28:06 -07:00
|
|
|
destroy_match(&((*m)->child));
|
|
|
|
destroy_match(&((*m)->nextsibling));
|
2021-01-13 18:56:22 -08:00
|
|
|
free(*m);
|
2020-09-11 01:28:06 -07:00
|
|
|
*m = NULL;
|
|
|
|
}
|
|
|
|
|
2021-01-12 21:04:43 -08:00
|
|
|
//
|
|
|
|
// Attempt to match text against a previously captured value.
|
|
|
|
// Return the character position after the backref has matched, or NULL if no match has occurred.
|
|
|
|
//
|
2021-01-12 19:23:38 -08:00
|
|
|
static const char *match_backref(const char *str, vm_op_t *op, match_t *cap, unsigned int flags)
|
2021-01-08 01:00:27 -08:00
|
|
|
{
|
2021-01-12 19:27:57 -08:00
|
|
|
check(op->type == VM_BACKREF, "Attempt to match backref against something that's not a backref");
|
|
|
|
if (cap->op->type == VM_REPLACE) {
|
2021-01-08 01:00:27 -08:00
|
|
|
const char *text = cap->op->args.replace.text;
|
|
|
|
const char *end = &text[cap->op->args.replace.len];
|
|
|
|
for (const char *r = text; r < end; ) {
|
|
|
|
if (*r == '\\') {
|
|
|
|
++r;
|
2021-01-12 19:23:38 -08:00
|
|
|
if (*(str++) != unescapechar(r, &r))
|
2021-01-08 01:00:27 -08:00
|
|
|
return NULL;
|
|
|
|
} else if (*r != '@') {
|
2021-01-12 19:23:38 -08:00
|
|
|
if (*(str++) != *r)
|
2021-01-08 01:00:27 -08:00
|
|
|
return NULL;
|
|
|
|
++r;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
++r;
|
|
|
|
match_t *value = get_capture(cap, &r);
|
|
|
|
if (value != NULL) {
|
2021-01-12 19:23:38 -08:00
|
|
|
str = match_backref(str, op, value, flags);
|
|
|
|
if (str == NULL) return NULL;
|
2021-01-08 01:00:27 -08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
const char *prev = cap->start;
|
|
|
|
for (match_t *child = cap->child; child; child = child->nextsibling) {
|
|
|
|
if (child->start > prev) {
|
|
|
|
size_t len = (size_t)(child->start - prev);
|
|
|
|
if ((flags & BP_IGNORECASE) ? memicmp(str, prev, len) != 0
|
2021-01-12 19:23:38 -08:00
|
|
|
: memcmp(str, prev, len) != 0) {
|
2021-01-08 01:00:27 -08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
str += len;
|
|
|
|
prev = child->start;
|
|
|
|
}
|
|
|
|
if (child->start < prev) continue;
|
2021-01-12 19:23:38 -08:00
|
|
|
str = match_backref(str, op, child, flags);
|
|
|
|
if (str == NULL) return NULL;
|
2021-01-08 01:00:27 -08:00
|
|
|
prev = child->end;
|
|
|
|
}
|
|
|
|
if (cap->end > prev) {
|
|
|
|
size_t len = (size_t)(cap->end - prev);
|
|
|
|
if ((flags & BP_IGNORECASE) ? memicmp(str, prev, len) != 0
|
2021-01-12 22:22:38 -08:00
|
|
|
: memcmp(str, prev, len) != 0) {
|
2021-01-08 01:00:27 -08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
str += len;
|
|
|
|
}
|
|
|
|
}
|
2021-01-12 19:23:38 -08:00
|
|
|
return str;
|
2021-01-08 01:00:27 -08:00
|
|
|
}
|
|
|
|
|
2020-09-13 00:48:39 -07:00
|
|
|
|
2021-01-12 21:04:43 -08:00
|
|
|
//
|
|
|
|
// Run virtual machine operation against a string and return
|
|
|
|
// a match struct, or NULL if no match is found.
|
|
|
|
// The returned value should be free()'d to avoid memory leaking.
|
|
|
|
//
|
2021-01-13 18:56:22 -08:00
|
|
|
static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags)
|
2020-09-11 01:28:06 -07:00
|
|
|
{
|
2021-01-12 19:27:57 -08:00
|
|
|
switch (op->type) {
|
2021-01-13 18:56:22 -08:00
|
|
|
case VM_CANNED: {
|
|
|
|
if (str == op->args.canned.at) {
|
|
|
|
++op->args.canned.visits;
|
|
|
|
// TODO: deep copy?
|
|
|
|
return op->args.canned.match;
|
|
|
|
} else {
|
|
|
|
return _match(defs, f, str, op->args.canned.fallback, flags);
|
|
|
|
}
|
|
|
|
}
|
2020-09-11 01:28:06 -07:00
|
|
|
case VM_ANYCHAR: {
|
2021-01-05 00:09:30 -08:00
|
|
|
if (str >= f->end || *str == '\n')
|
2020-09-11 01:28:06 -07:00
|
|
|
return NULL;
|
2020-12-17 16:27:23 -08:00
|
|
|
match_t *m = new(match_t);
|
2020-09-11 01:28:06 -07:00
|
|
|
m->op = op;
|
|
|
|
m->start = str;
|
2020-09-25 08:41:29 -07:00
|
|
|
m->end = next_char(f, str);
|
2020-09-11 01:28:06 -07:00
|
|
|
return m;
|
|
|
|
}
|
|
|
|
case VM_STRING: {
|
2020-12-14 21:35:49 -08:00
|
|
|
if (&str[op->len] > f->end) return NULL;
|
2020-12-30 19:42:47 -08:00
|
|
|
if ((flags & BP_IGNORECASE) ? memicmp(str, op->args.s, (size_t)op->len) != 0
|
2021-01-13 18:56:22 -08:00
|
|
|
: memcmp(str, op->args.s, (size_t)op->len) != 0)
|
2020-09-11 01:28:06 -07:00
|
|
|
return NULL;
|
2020-12-17 16:27:23 -08:00
|
|
|
match_t *m = new(match_t);
|
2020-09-11 01:28:06 -07:00
|
|
|
m->op = op;
|
|
|
|
m->start = str;
|
|
|
|
m->end = str + op->len;
|
|
|
|
return m;
|
|
|
|
}
|
|
|
|
case VM_RANGE: {
|
2020-12-19 18:51:30 -08:00
|
|
|
if (str >= f->end) return NULL;
|
2020-09-28 21:30:43 -07:00
|
|
|
if ((unsigned char)*str < op->args.range.low || (unsigned char)*str > op->args.range.high)
|
2020-09-11 01:28:06 -07:00
|
|
|
return NULL;
|
2020-12-17 16:27:23 -08:00
|
|
|
match_t *m = new(match_t);
|
2020-09-11 01:28:06 -07:00
|
|
|
m->op = op;
|
|
|
|
m->start = str;
|
|
|
|
m->end = str + 1;
|
|
|
|
return m;
|
|
|
|
}
|
2020-09-13 23:31:38 -07:00
|
|
|
case VM_NOT: {
|
2021-01-13 18:56:22 -08:00
|
|
|
match_t *m = _match(defs, f, str, op->args.pat, flags);
|
2020-09-11 01:28:06 -07:00
|
|
|
if (m != NULL) {
|
|
|
|
destroy_match(&m);
|
|
|
|
return NULL;
|
|
|
|
}
|
2020-12-17 16:27:23 -08:00
|
|
|
m = new(match_t);
|
2020-09-11 01:28:06 -07:00
|
|
|
m->op = op;
|
|
|
|
m->start = str;
|
|
|
|
m->end = str;
|
|
|
|
return m;
|
|
|
|
}
|
2020-09-16 17:57:56 -07:00
|
|
|
case VM_UPTO_AND: {
|
2020-12-17 16:27:23 -08:00
|
|
|
match_t *m = new(match_t);
|
2020-09-11 01:28:06 -07:00
|
|
|
m->start = str;
|
|
|
|
m->op = op;
|
2021-01-13 18:56:22 -08:00
|
|
|
|
|
|
|
vm_op_t *pat = op->args.multiple.first, *skip = op->args.multiple.second;
|
|
|
|
if (!pat && !skip) {
|
2021-01-05 00:09:30 -08:00
|
|
|
while (str < f->end && *str != '\n') ++str;
|
2021-01-13 18:56:22 -08:00
|
|
|
m->end = str;
|
|
|
|
return m;
|
|
|
|
}
|
|
|
|
|
|
|
|
match_t **dest = &m->child;
|
|
|
|
for (const char *prev = NULL; prev < str; ) {
|
|
|
|
prev = str;
|
|
|
|
if (pat) {
|
|
|
|
match_t *p = _match(defs, f, str, pat, flags);
|
|
|
|
if (p != NULL) {
|
|
|
|
set_owner(p, dest);
|
|
|
|
m->end = p->end;
|
2021-01-08 00:46:47 -08:00
|
|
|
return m;
|
2020-09-13 22:04:51 -07:00
|
|
|
}
|
2021-01-13 18:56:22 -08:00
|
|
|
} else if (str == f->end) {
|
|
|
|
m->end = str;
|
|
|
|
return m;
|
|
|
|
}
|
|
|
|
if (skip) {
|
|
|
|
match_t *s = _match(defs, f, str, skip, flags);
|
|
|
|
if (s != NULL) {
|
|
|
|
set_owner(s, dest);
|
|
|
|
dest = &s->nextsibling;
|
|
|
|
str = s->end;
|
|
|
|
continue;
|
2020-12-14 18:11:33 -08:00
|
|
|
}
|
2020-09-13 22:04:51 -07:00
|
|
|
}
|
2021-01-13 18:56:22 -08:00
|
|
|
// This isn't in the for() structure because there needs to
|
|
|
|
// be at least once chance to match the pattern, even if
|
|
|
|
// we're at the end of the string already (e.g. "..$").
|
|
|
|
if (str < f->end && *str != '\n')
|
|
|
|
str = next_char(f, str);
|
2020-09-11 01:28:06 -07:00
|
|
|
}
|
2021-01-13 18:56:22 -08:00
|
|
|
destroy_match(&m);
|
|
|
|
return NULL;
|
2020-09-11 01:28:06 -07:00
|
|
|
}
|
|
|
|
case VM_REPEAT: {
|
2020-12-17 16:27:23 -08:00
|
|
|
match_t *m = new(match_t);
|
2020-09-11 01:28:06 -07:00
|
|
|
m->start = str;
|
|
|
|
m->end = str;
|
|
|
|
m->op = op;
|
|
|
|
|
|
|
|
match_t **dest = &m->child;
|
2020-09-13 23:31:38 -07:00
|
|
|
size_t reps = 0;
|
2020-09-18 22:32:36 -07:00
|
|
|
ssize_t max = op->args.repetitions.max;
|
|
|
|
for (reps = 0; max == -1 || reps < (size_t)max; ++reps) {
|
|
|
|
const char *start = str;
|
2020-09-11 01:28:06 -07:00
|
|
|
// Separator
|
|
|
|
match_t *sep = NULL;
|
|
|
|
if (op->args.repetitions.sep != NULL && reps > 0) {
|
2021-01-13 18:56:22 -08:00
|
|
|
sep = _match(defs, f, str, op->args.repetitions.sep, flags);
|
2020-09-11 01:28:06 -07:00
|
|
|
if (sep == NULL) break;
|
|
|
|
str = sep->end;
|
|
|
|
}
|
2021-01-13 18:56:22 -08:00
|
|
|
match_t *p = _match(defs, f, str, op->args.repetitions.repeat_pat, flags);
|
2020-09-18 22:32:36 -07:00
|
|
|
if (p == NULL) {
|
2021-01-12 20:01:46 -08:00
|
|
|
str = start;
|
2020-09-18 22:32:36 -07:00
|
|
|
destroy_match(&sep);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (p->end == start && reps > 0) {
|
|
|
|
// Since no forward progress was made on either `pat` or
|
2020-12-30 19:42:47 -08:00
|
|
|
// `sep` and BP does not have mutable state, it's
|
2020-09-18 22:32:36 -07:00
|
|
|
// guaranteed that no progress will be made on the next
|
|
|
|
// loop either. We know that this will continue to loop
|
|
|
|
// until reps==max, so let's just cut to the chase instead
|
|
|
|
// of looping infinitely.
|
2020-09-11 01:28:06 -07:00
|
|
|
destroy_match(&sep);
|
|
|
|
destroy_match(&p);
|
2020-09-18 22:32:36 -07:00
|
|
|
if (op->args.repetitions.max == -1)
|
|
|
|
reps = ~(size_t)0;
|
|
|
|
else
|
|
|
|
reps = (size_t)op->args.repetitions.max;
|
2020-09-11 01:28:06 -07:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (sep) {
|
2021-01-13 18:56:22 -08:00
|
|
|
set_owner(sep, dest);
|
2020-09-11 01:28:06 -07:00
|
|
|
dest = &sep->nextsibling;
|
|
|
|
}
|
2021-01-13 18:56:22 -08:00
|
|
|
set_owner(p, dest);
|
2020-09-11 01:28:06 -07:00
|
|
|
dest = &p->nextsibling;
|
|
|
|
str = p->end;
|
|
|
|
}
|
|
|
|
|
2020-09-18 22:32:36 -07:00
|
|
|
if (reps < (size_t)op->args.repetitions.min) {
|
2020-09-11 01:28:06 -07:00
|
|
|
destroy_match(&m);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
m->end = str;
|
|
|
|
return m;
|
|
|
|
}
|
|
|
|
case VM_AFTER: {
|
|
|
|
ssize_t backtrack = op->args.pat->len;
|
|
|
|
check(backtrack != -1, "'<' is only allowed for fixed-length operations");
|
2020-09-16 19:35:43 -07:00
|
|
|
if (str - backtrack < f->contents) return NULL;
|
2021-01-13 18:56:22 -08:00
|
|
|
match_t *before = _match(defs, f, str - backtrack, op->args.pat, flags);
|
2020-09-11 01:28:06 -07:00
|
|
|
if (before == NULL) return NULL;
|
2020-12-17 16:27:23 -08:00
|
|
|
match_t *m = new(match_t);
|
2020-09-11 01:28:06 -07:00
|
|
|
m->start = str;
|
|
|
|
m->end = str;
|
|
|
|
m->op = op;
|
2021-01-13 18:56:22 -08:00
|
|
|
set_owner(before, &m->child);
|
2020-09-11 01:28:06 -07:00
|
|
|
return m;
|
|
|
|
}
|
|
|
|
case VM_BEFORE: {
|
2021-01-13 18:56:22 -08:00
|
|
|
match_t *after = _match(defs, f, str, op->args.pat, flags);
|
2020-09-11 01:28:06 -07:00
|
|
|
if (after == NULL) return NULL;
|
2020-12-17 16:27:23 -08:00
|
|
|
match_t *m = new(match_t);
|
2020-09-11 01:28:06 -07:00
|
|
|
m->start = str;
|
|
|
|
m->end = str;
|
|
|
|
m->op = op;
|
2021-01-13 18:56:22 -08:00
|
|
|
set_owner(after, &m->child);
|
2020-09-11 01:28:06 -07:00
|
|
|
return m;
|
|
|
|
}
|
|
|
|
case VM_CAPTURE: {
|
2021-01-13 18:56:22 -08:00
|
|
|
match_t *p = _match(defs, f, str, op->args.pat, flags);
|
2020-09-11 01:28:06 -07:00
|
|
|
if (p == NULL) return NULL;
|
2020-12-17 16:27:23 -08:00
|
|
|
match_t *m = new(match_t);
|
2020-09-11 01:28:06 -07:00
|
|
|
m->start = str;
|
|
|
|
m->end = p->end;
|
|
|
|
m->op = op;
|
2021-01-13 18:56:22 -08:00
|
|
|
set_owner(p, &m->child);
|
2020-09-11 01:28:06 -07:00
|
|
|
return m;
|
|
|
|
}
|
2020-12-12 16:31:53 -08:00
|
|
|
case VM_HIDE: {
|
2021-01-13 18:56:22 -08:00
|
|
|
match_t *p = _match(defs, f, str, op->args.pat, flags);
|
2020-12-12 16:31:53 -08:00
|
|
|
if (p == NULL) return NULL;
|
2020-12-17 16:27:23 -08:00
|
|
|
match_t *m = new(match_t);
|
2020-12-12 16:31:53 -08:00
|
|
|
m->start = str;
|
|
|
|
m->end = p->end;
|
|
|
|
m->op = op;
|
2021-01-13 18:56:22 -08:00
|
|
|
set_owner(p, &m->child);
|
2020-12-12 16:31:53 -08:00
|
|
|
return m;
|
|
|
|
}
|
2020-09-11 01:28:06 -07:00
|
|
|
case VM_OTHERWISE: {
|
2021-01-13 18:56:22 -08:00
|
|
|
match_t *m = _match(defs, f, str, op->args.multiple.first, flags);
|
|
|
|
if (m == NULL) m = _match(defs, f, str, op->args.multiple.second, flags);
|
2020-09-11 01:28:06 -07:00
|
|
|
return m;
|
|
|
|
}
|
|
|
|
case VM_CHAIN: {
|
2021-01-13 18:56:22 -08:00
|
|
|
match_t *m1 = _match(defs, f, str, op->args.multiple.first, flags);
|
2020-09-11 01:28:06 -07:00
|
|
|
if (m1 == NULL) return NULL;
|
2020-09-12 18:20:13 -07:00
|
|
|
|
2021-01-10 01:45:40 -08:00
|
|
|
match_t *m2;
|
|
|
|
{ // Push backrefs and run matching, then cleanup
|
|
|
|
def_t *defs2 = with_backrefs(defs, f, m1);
|
2021-01-13 18:56:22 -08:00
|
|
|
m2 = _match(defs2, f, m1->end, op->args.multiple.second, flags);
|
2021-01-13 01:48:36 -08:00
|
|
|
free_defs(&defs2, defs);
|
2021-01-10 01:45:40 -08:00
|
|
|
}
|
|
|
|
|
2020-09-11 01:28:06 -07:00
|
|
|
if (m2 == NULL) {
|
|
|
|
destroy_match(&m1);
|
|
|
|
return NULL;
|
|
|
|
}
|
2020-12-17 16:27:23 -08:00
|
|
|
match_t *m = new(match_t);
|
2020-09-11 01:28:06 -07:00
|
|
|
m->start = str;
|
|
|
|
m->end = m2->end;
|
|
|
|
m->op = op;
|
2021-01-13 18:56:22 -08:00
|
|
|
set_owner(m1, &m->child);
|
|
|
|
set_owner(m2, &m1->nextsibling);
|
2020-09-11 01:28:06 -07:00
|
|
|
return m;
|
|
|
|
}
|
2020-09-28 17:56:02 -07:00
|
|
|
case VM_EQUAL: case VM_NOT_EQUAL: {
|
2021-01-13 18:56:22 -08:00
|
|
|
match_t *m1 = _match(defs, f, str, op->args.multiple.first, flags);
|
2020-09-13 20:33:11 -07:00
|
|
|
if (m1 == NULL) return NULL;
|
|
|
|
|
2020-12-14 17:51:01 -08:00
|
|
|
// <p1>==<p2> matches iff the text of <p1> matches <p2>
|
|
|
|
// <p1>!=<p2> matches iff the text of <p1> does not match <p2>
|
|
|
|
file_t inner = {
|
|
|
|
.filename=f->filename,
|
|
|
|
.contents=(char*)m1->start, .end=(char*)m1->end,
|
|
|
|
.lines=f->lines, // I think this works, but am not 100% sure
|
|
|
|
.nlines=1 + get_line_number(f, m1->end)-get_line_number(f, m1->start),
|
|
|
|
.mmapped=f->mmapped,
|
|
|
|
};
|
2021-01-13 18:56:22 -08:00
|
|
|
match_t *m2 = _match(defs, &inner, str, op->args.multiple.second, flags);
|
2021-01-12 19:27:57 -08:00
|
|
|
if ((m2 == NULL) == (op->type == VM_EQUAL)) {
|
2020-09-13 20:33:11 -07:00
|
|
|
destroy_match(&m1);
|
2021-01-13 18:56:22 -08:00
|
|
|
if (m2 != NULL) destroy_match(&m2);
|
2020-09-13 20:33:11 -07:00
|
|
|
return NULL;
|
|
|
|
}
|
2020-12-17 16:27:23 -08:00
|
|
|
match_t *m = new(match_t);
|
2020-12-14 17:51:01 -08:00
|
|
|
m->start = m1->start;
|
2020-09-28 17:56:02 -07:00
|
|
|
m->end = m1->end;
|
2020-09-13 20:33:11 -07:00
|
|
|
m->op = op;
|
2021-01-13 18:56:22 -08:00
|
|
|
set_owner(m1, &m->child);
|
2021-01-12 19:27:57 -08:00
|
|
|
if (op->type == VM_EQUAL) {
|
2021-01-13 18:56:22 -08:00
|
|
|
set_owner(m2, &m1->nextsibling);
|
2020-09-28 17:56:02 -07:00
|
|
|
} else {
|
|
|
|
destroy_match(&m2);
|
|
|
|
}
|
2020-09-13 20:33:11 -07:00
|
|
|
return m;
|
|
|
|
}
|
2020-09-11 01:28:06 -07:00
|
|
|
case VM_REPLACE: {
|
2020-09-17 01:00:06 -07:00
|
|
|
match_t *p = NULL;
|
2020-12-17 19:49:56 -08:00
|
|
|
if (op->args.replace.pat) {
|
2021-01-13 18:56:22 -08:00
|
|
|
p = _match(defs, f, str, op->args.replace.pat, flags);
|
2020-09-17 01:00:06 -07:00
|
|
|
if (p == NULL) return NULL;
|
|
|
|
}
|
2020-12-17 16:27:23 -08:00
|
|
|
match_t *m = new(match_t);
|
2020-09-11 01:28:06 -07:00
|
|
|
m->start = str;
|
|
|
|
m->op = op;
|
2020-09-17 01:00:06 -07:00
|
|
|
if (p) {
|
2021-01-13 18:56:22 -08:00
|
|
|
set_owner(p, &m->child);
|
2020-09-11 01:28:06 -07:00
|
|
|
m->end = p->end;
|
|
|
|
} else {
|
|
|
|
m->end = m->start;
|
|
|
|
}
|
|
|
|
return m;
|
|
|
|
}
|
|
|
|
case VM_REF: {
|
2021-01-13 18:56:22 -08:00
|
|
|
def_t *def = lookup(defs, op->args.s);
|
|
|
|
check(def != NULL, "Unknown identifier: '%s'", op->args.s);
|
|
|
|
vm_op_t *ref = def->op;
|
2020-09-13 00:48:39 -07:00
|
|
|
|
2021-01-13 18:56:22 -08:00
|
|
|
vm_op_t fail_op = {
|
|
|
|
.type = VM_CANNED,
|
|
|
|
.start = ref->start,
|
|
|
|
.end = ref->end,
|
|
|
|
.len = 0,
|
|
|
|
.args.canned = {
|
|
|
|
.match = NULL,
|
|
|
|
.visits = 0,
|
|
|
|
.at = str,
|
|
|
|
.fallback = ref,
|
|
|
|
},
|
|
|
|
};
|
|
|
|
def_t defs2 = {
|
|
|
|
.namelen = def->namelen,
|
|
|
|
.name = def->name,
|
|
|
|
.file = def->file,
|
|
|
|
.op = &fail_op,
|
|
|
|
.next = defs,
|
2020-09-13 00:48:39 -07:00
|
|
|
};
|
2021-01-13 18:56:22 -08:00
|
|
|
|
|
|
|
const char *prev = str;
|
|
|
|
match_t *m = _match(&defs2, f, str, ref, flags);
|
|
|
|
if (m == NULL) return NULL;
|
|
|
|
|
|
|
|
while (fail_op.args.canned.visits > 0 && m->end > prev) {
|
|
|
|
fail_op.args.canned.visits = 0;
|
|
|
|
fail_op.args.canned.match = m;
|
|
|
|
prev = m->end;
|
|
|
|
match_t *m2 = _match(&defs2, f, str, ref, flags);
|
|
|
|
if (m2 == NULL) break;
|
|
|
|
m = m2;
|
2020-09-13 00:48:39 -07:00
|
|
|
}
|
2021-01-13 18:56:22 -08:00
|
|
|
|
2020-09-11 02:55:15 -07:00
|
|
|
return m;
|
2020-09-11 01:28:06 -07:00
|
|
|
}
|
2020-09-12 18:20:13 -07:00
|
|
|
case VM_BACKREF: {
|
2021-01-12 19:23:38 -08:00
|
|
|
const char *end = match_backref(str, op, op->args.backref, flags);
|
|
|
|
if (end == NULL) return NULL;
|
|
|
|
match_t *m = new(match_t);
|
|
|
|
m->op = op;
|
|
|
|
m->start = str;
|
|
|
|
m->end = end;
|
|
|
|
return m;
|
2020-09-12 18:20:13 -07:00
|
|
|
}
|
2020-09-14 01:21:49 -07:00
|
|
|
case VM_NODENT: {
|
2020-10-13 17:12:37 -07:00
|
|
|
if (*str != '\n') return NULL;
|
|
|
|
const char *start = str;
|
|
|
|
|
2020-09-16 19:35:43 -07:00
|
|
|
size_t linenum = get_line_number(f, str);
|
2020-10-13 17:12:37 -07:00
|
|
|
const char *p = get_line(f, linenum);
|
2020-12-14 22:18:04 -08:00
|
|
|
if (p < f->contents) p=f->contents; // Can happen with recursive matching
|
2020-10-13 17:12:37 -07:00
|
|
|
|
|
|
|
// Current indentation:
|
2020-09-14 01:21:49 -07:00
|
|
|
char denter = *p;
|
|
|
|
int dents = 0;
|
|
|
|
if (denter == ' ' || denter == '\t') {
|
2020-12-14 22:18:04 -08:00
|
|
|
for (; *p == denter && p < f->end; ++p) ++dents;
|
2020-09-14 01:21:49 -07:00
|
|
|
}
|
2020-10-13 17:12:37 -07:00
|
|
|
|
|
|
|
// Subsequent indentation:
|
|
|
|
while (*str == '\n') ++str;
|
2020-09-14 01:21:49 -07:00
|
|
|
for (int i = 0; i < dents; i++) {
|
2020-12-14 22:18:04 -08:00
|
|
|
if (str[i] != denter || &str[i] >= f->end) return NULL;
|
2020-09-14 01:21:49 -07:00
|
|
|
}
|
|
|
|
|
2020-12-17 16:27:23 -08:00
|
|
|
match_t *m = new(match_t);
|
2020-10-13 17:12:37 -07:00
|
|
|
m->start = start;
|
2020-09-14 01:21:49 -07:00
|
|
|
m->end = &str[dents];
|
|
|
|
m->op = op;
|
|
|
|
return m;
|
|
|
|
}
|
2020-09-11 01:28:06 -07:00
|
|
|
default: {
|
2021-01-12 19:27:57 -08:00
|
|
|
fprintf(stderr, "Unknown opcode: %d", op->type);
|
2021-01-13 19:01:49 -08:00
|
|
|
exit(1);
|
2020-09-11 01:28:06 -07:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-01-12 21:04:43 -08:00
|
|
|
//
|
|
|
|
// Get a specific numbered pattern capture.
|
|
|
|
//
|
2021-01-08 01:00:27 -08:00
|
|
|
static match_t *get_capture_by_num(match_t *m, int *n)
|
2020-09-11 01:28:06 -07:00
|
|
|
{
|
|
|
|
if (*n == 0) return m;
|
2021-01-12 19:27:57 -08:00
|
|
|
if (m->op->type == VM_CAPTURE && *n == 1) return m;
|
|
|
|
if (m->op->type == VM_CAPTURE) --(*n);
|
2020-09-11 01:28:06 -07:00
|
|
|
for (match_t *c = m->child; c; c = c->nextsibling) {
|
2021-01-08 01:00:27 -08:00
|
|
|
match_t *cap = get_capture_by_num(c, n);
|
2020-09-11 01:28:06 -07:00
|
|
|
if (cap) return cap;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2021-01-12 21:04:43 -08:00
|
|
|
//
|
|
|
|
// Get a capture with a specific name.
|
|
|
|
//
|
2021-01-08 01:00:27 -08:00
|
|
|
static match_t *get_capture_by_name(match_t *m, const char *name)
|
2020-09-11 01:28:06 -07:00
|
|
|
{
|
2021-01-12 19:27:57 -08:00
|
|
|
if (m->op->type == VM_CAPTURE && m->op->args.capture.name
|
2020-12-17 19:49:56 -08:00
|
|
|
&& streq(m->op->args.capture.name, name))
|
2020-09-11 01:28:06 -07:00
|
|
|
return m;
|
|
|
|
for (match_t *c = m->child; c; c = c->nextsibling) {
|
2021-01-08 01:00:27 -08:00
|
|
|
match_t *cap = get_capture_by_name(c, name);
|
2020-09-11 01:28:06 -07:00
|
|
|
if (cap) return cap;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2021-01-12 21:04:43 -08:00
|
|
|
//
|
2021-01-12 22:22:38 -08:00
|
|
|
// Get a capture by identifier (name or number).
|
|
|
|
// Update *id to point to after the identifier (if found).
|
2021-01-12 21:04:43 -08:00
|
|
|
//
|
2021-01-12 22:22:38 -08:00
|
|
|
match_t *get_capture(match_t *m, const char **id)
|
2020-09-16 20:38:58 -07:00
|
|
|
{
|
2021-01-12 22:22:38 -08:00
|
|
|
if (isdigit(**id)) {
|
|
|
|
int n = (int)strtol(*id, (char**)id, 10);
|
2021-01-08 01:00:27 -08:00
|
|
|
return get_capture_by_num(m->child, &n);
|
2020-09-28 16:35:22 -07:00
|
|
|
} else {
|
2021-01-12 22:22:38 -08:00
|
|
|
const char *end = after_name(*id);
|
|
|
|
if (end == *id) return NULL;
|
|
|
|
char *name = strndup(*id, (size_t)(end-*id));
|
2021-01-08 01:00:27 -08:00
|
|
|
match_t *cap = get_capture_by_name(m, name);
|
2021-01-10 00:24:24 -08:00
|
|
|
xfree(&name);
|
2021-01-12 22:22:38 -08:00
|
|
|
*id = end;
|
|
|
|
if (**id == ';') ++(*id);
|
2020-09-16 20:38:58 -07:00
|
|
|
return cap;
|
|
|
|
}
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2021-01-12 21:28:44 -08:00
|
|
|
//
|
|
|
|
// Wrapper function for _match() to kickstart the recursion info.
|
|
|
|
//
|
2021-01-10 01:45:40 -08:00
|
|
|
match_t *match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags)
|
2020-09-13 00:48:39 -07:00
|
|
|
{
|
2021-01-13 18:56:22 -08:00
|
|
|
return _match(defs, f, str, op, flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Deallocate memory associated with an op
|
|
|
|
void destroy_op(vm_op_t *op)
|
|
|
|
{
|
|
|
|
switch (op->type) {
|
|
|
|
case VM_STRING: case VM_REF:
|
|
|
|
xfree(&op->args.s);
|
|
|
|
break;
|
|
|
|
case VM_CAPTURE:
|
|
|
|
if (op->args.capture.name)
|
|
|
|
xfree(&op->args.capture.name);
|
|
|
|
break;
|
|
|
|
case VM_REPLACE:
|
|
|
|
if (op->args.replace.text)
|
|
|
|
xfree(&op->args.replace.text);
|
|
|
|
break;
|
|
|
|
default: break;
|
|
|
|
}
|
2020-09-13 00:48:39 -07:00
|
|
|
}
|
|
|
|
|
2020-09-11 01:38:44 -07:00
|
|
|
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1
|