Overhaul of memory tracking and left recursion. Added explanation doc
for left recursion and fixed all visible memory leaks.
This commit is contained in:
parent
9238ffea88
commit
3cd61e3abc
5
Makefile
5
Makefile
@ -4,6 +4,7 @@ PREFIX=/usr/local
|
||||
SYSCONFDIR=/etc
|
||||
CFLAGS=-std=c99 -Werror -D_XOPEN_SOURCE=700 -D_GNU_SOURCE -D_POSIX_C_SOURCE=200809L
|
||||
CWARN=-Wall -Wpedantic -Wextra -Wsign-conversion -Wtype-limits -Wunused-result
|
||||
EXTRA_FLAGS=
|
||||
G=
|
||||
O=-O3
|
||||
|
||||
@ -13,10 +14,10 @@ OBJFILES=$(CFILES:.c=.o)
|
||||
all: $(NAME)
|
||||
|
||||
%.o: %.c %.h types.h
|
||||
$(CC) -c $(CFLAGS) $(CWARN) $(G) $(O) -o $@ $<
|
||||
$(CC) -c $(CFLAGS) $(EXTRA_CFLAGS) $(CWARN) $(G) $(O) -o $@ $<
|
||||
|
||||
$(NAME): $(OBJFILES) $(NAME).c
|
||||
$(CC) $(CFLAGS) $(CWARN) $(G) $(O) -o $@ $(OBJFILES) $(NAME).c
|
||||
$(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(CWARN) $(G) $(O) -o $@ $(OBJFILES) $(NAME).c
|
||||
|
||||
clean:
|
||||
rm -f $(NAME) $(OBJFILES)
|
||||
|
41
bp.c
41
bp.c
@ -119,9 +119,8 @@ static int process_file(def_t *defs, const char *filename, vm_op_t *pattern, uns
|
||||
}
|
||||
}
|
||||
|
||||
if (m != NULL)
|
||||
destroy_match(&m);
|
||||
|
||||
recycle_if_unused(&m);
|
||||
check(recycle_all_matches() == 0, "Memory leak: there should no longer be any matches in use at this point.");
|
||||
destroy_file(&f);
|
||||
|
||||
return success;
|
||||
@ -134,12 +133,16 @@ int main(int argc, char *argv[])
|
||||
unsigned int flags = 0;
|
||||
char *flag = NULL;
|
||||
char path[PATH_MAX] = {0};
|
||||
const char *rule = "find-all";
|
||||
const char *mode = "find-all";
|
||||
|
||||
def_t *defs = NULL;
|
||||
|
||||
file_t *loaded_files = NULL;
|
||||
|
||||
// Define an opcode that is just a reference to the rule `pattern`
|
||||
file_t *pat_file = spoof_file(&loaded_files, "<pattern>", "pattern");
|
||||
vm_op_t *pattern = bp_pattern(pat_file, pat_file->contents);
|
||||
|
||||
// Load builtins:
|
||||
if (access("/etc/xdg/bp/builtins.bp", R_OK) != -1) {
|
||||
file_t *f = load_file(&loaded_files, "/etc/xdg/bp/builtins.bp");
|
||||
@ -176,13 +179,11 @@ int main(int argc, char *argv[])
|
||||
} else if (FLAG("--replace") || FLAG("-r")) {
|
||||
// TODO: spoof file as sprintf("pattern => '%s'", flag)
|
||||
// except that would require handling edge cases like quotation marks etc.
|
||||
file_t *pat_file = spoof_file(&loaded_files, "<pattern>", "pattern");
|
||||
vm_op_t *patref = bp_pattern(pat_file, pat_file->contents);
|
||||
file_t *replace_file = spoof_file(&loaded_files, "<replace argument>", flag);
|
||||
vm_op_t *rep = bp_replacement(replace_file, patref, replace_file->contents);
|
||||
vm_op_t *rep = bp_replacement(replace_file, pattern, replace_file->contents);
|
||||
check(rep, "Replacement failed to compile: %s", flag);
|
||||
defs = with_def(defs, replace_file, strlen("replacement"), "replacement", rep);
|
||||
rule = "replace-all";
|
||||
mode = "replace-all";
|
||||
} else if (FLAG("--grammar") || FLAG("-g")) {
|
||||
file_t *f = load_file(&loaded_files, flag);
|
||||
if (f == NULL) {
|
||||
@ -229,7 +230,7 @@ int main(int argc, char *argv[])
|
||||
defs = with_def(defs, arg_file, strlen("pattern"), "pattern", p);
|
||||
++npatterns;
|
||||
} else if (FLAG("--mode") || FLAG("-m")) {
|
||||
rule = flag;
|
||||
mode = flag;
|
||||
} else if (argv[i][0] == '-' && argv[i][1] && argv[i][1] != '-') { // single-char flags
|
||||
for (char *c = &argv[i][1]; *c; ++c) {
|
||||
switch (*c) {
|
||||
@ -269,16 +270,20 @@ int main(int argc, char *argv[])
|
||||
print_options |= PRINT_COLOR | PRINT_LINE_NUMBERS;
|
||||
}
|
||||
|
||||
def_t *pattern_def = lookup(defs, rule);
|
||||
check(pattern_def != NULL, "No such rule: '%s'", rule);
|
||||
vm_op_t *pattern = pattern_def->op;
|
||||
// Define an opcode that is just a reference to the overarching mode (e.g. find-all)
|
||||
if (lookup(defs, mode) == NULL) {
|
||||
printf("The mode '%s' is not defined.\n", mode);
|
||||
return 1;
|
||||
}
|
||||
file_t *mode_file = spoof_file(&loaded_files, "<mode>", mode);
|
||||
vm_op_t *mode_op = bp_pattern(mode_file, mode_file->contents);
|
||||
|
||||
int found = 0;
|
||||
if (flags & BP_JSON) printf("[");
|
||||
if (i < argc) {
|
||||
// Files pass in as command line args:
|
||||
for (int nfiles = 0; i < argc; nfiles++, i++) {
|
||||
found += process_file(defs, argv[i], pattern, flags);
|
||||
found += process_file(defs, argv[i], mode_op, flags);
|
||||
}
|
||||
} else if (isatty(STDIN_FILENO)) {
|
||||
// No files, no piped in input, so use * **/*:
|
||||
@ -286,21 +291,27 @@ int main(int argc, char *argv[])
|
||||
glob("*", 0, NULL, &globbuf);
|
||||
glob("**/*", GLOB_APPEND, NULL, &globbuf);
|
||||
for (size_t i = 0; i < globbuf.gl_pathc; i++) {
|
||||
found += process_file(defs, globbuf.gl_pathv[i], pattern, flags);
|
||||
found += process_file(defs, globbuf.gl_pathv[i], mode_op, flags);
|
||||
}
|
||||
globfree(&globbuf);
|
||||
} else {
|
||||
// Piped in input:
|
||||
found += process_file(defs, NULL, pattern, flags);
|
||||
found += process_file(defs, NULL, mode_op, flags);
|
||||
}
|
||||
if (flags & BP_JSON) printf("]\n");
|
||||
|
||||
#ifdef FREE_ON_EXIT
|
||||
// This code frees up all residual heap-allocated memory. Since the program
|
||||
// is about to exit, this step is unnecessary. However, it is useful for
|
||||
// tracking down memory leaks.
|
||||
free_defs(&defs, NULL);
|
||||
while (loaded_files) {
|
||||
file_t *next = loaded_files->next;
|
||||
destroy_file(&loaded_files);
|
||||
loaded_files = next;
|
||||
}
|
||||
free_all_matches();
|
||||
#endif
|
||||
|
||||
return (found > 0) ? 0 : 1;
|
||||
}
|
||||
|
@ -89,7 +89,7 @@ file_t *load_file(file_t **files, const char *filename)
|
||||
//
|
||||
// Create a virtual file from a string.
|
||||
//
|
||||
file_t *spoof_file(file_t **files, const char *filename, char *text)
|
||||
file_t *spoof_file(file_t **files, const char *filename, const char *text)
|
||||
{
|
||||
if (filename == NULL) filename = "";
|
||||
file_t *f = new(file_t);
|
||||
|
@ -19,7 +19,7 @@ typedef struct file_s {
|
||||
|
||||
file_t *load_file(file_t **files, const char *filename);
|
||||
__attribute__((nonnull(3), returns_nonnull))
|
||||
file_t *spoof_file(file_t **files, const char *filename, char *text);
|
||||
file_t *spoof_file(file_t **files, const char *filename, const char *text);
|
||||
__attribute__((nonnull))
|
||||
void intern_file(file_t *f);
|
||||
__attribute__((nonnull))
|
||||
|
68
left-recursion.md
Normal file
68
left-recursion.md
Normal file
@ -0,0 +1,68 @@
|
||||
# Left Recursion in BP
|
||||
|
||||
Left recursion presents a special difficulty for parsing expression grammars.
|
||||
A regular recursive rule looks like this:
|
||||
|
||||
```
|
||||
rec: "{" *rec "}"
|
||||
```
|
||||
|
||||
This is a relatively simple case to handle, because we're always making forward
|
||||
progress. In other words, calling `match(<rec>, str)` might call
|
||||
`match(<rec>, str + 1)` but it will never recursively call the same function
|
||||
with the same arguments: `match(<rec>, str)`.
|
||||
|
||||
Left-recursive rules, on the other hand, are rules that may attempt to match
|
||||
themselves at the same position. For example:
|
||||
|
||||
```
|
||||
laugh: (laugh "ha") / "Ha"
|
||||
```
|
||||
|
||||
The rule above is functionally equivalent to `laugh: "Ha" (*"ha")`, but it's
|
||||
written in a recursive style. It should be obvious by inspection that the
|
||||
string `Hahaha` matches both versions of the rule, but a naive implementation
|
||||
of `match()` would cause `match(<laugh>, str)` to call `match(<laugh>, str)`
|
||||
before doing anything else, which results in infinite recursion and a stack
|
||||
overflow. One option is to simply detect left recursion and cause a compilation
|
||||
error (this is what LPEG does). However, it is possible to actually match left
|
||||
recursive rules without overflowing the stack.
|
||||
|
||||
## Matching Left Recursion
|
||||
|
||||
The solution used in BP is the following:
|
||||
|
||||
1. Whenever matching a named rule, `foo`, against string `str`, first get the
|
||||
originally defined pattern for that rule. Call this
|
||||
`foo-original-definition`.
|
||||
2. Temporarily define `match(<foo>, str) := signal left recursion and return
|
||||
Fail` (In other words, `<foo>` is temporarily defined to signal left
|
||||
recursion and then fail to match if matching against the string `str`.)
|
||||
3. Run `result := match(<foo-original-definition>, str)`.
|
||||
4. If the match failed, then return failure.
|
||||
5. If the match was successful and did not trigger left recursion, return `result`.
|
||||
6. If the match was successful, but did trigger left recursion, then:
|
||||
|
||||
a. If `result` is not longer than any previous `result`, stop looping and
|
||||
return the longest `result` to avoid an infinite loop.
|
||||
b. Otherwise: temporarily define `match(<foo>, str) := signal left recursion and return <result>`
|
||||
c. Go to step 3.
|
||||
|
||||
This handles left recursion as a simple loop and successfully matches left
|
||||
recursive patterns, even ones that are indirectly or nontrivially left
|
||||
recursive.
|
||||
|
||||
## Example
|
||||
|
||||
Consider the rule `laugh: laugh "ha" / "Ha"` being applied to the input text `"Hahaha!"`:
|
||||
|
||||
|Definition of `match(<laugh>, "Hahaha!")` | Result of `match(<laugh "ha" / "Ha">, "Hahaha!")` |
|
||||
|------------------------------------------|---------------------------------------------------|
|
||||
|`Fail` | `Match{"Ha"}` |
|
||||
|`Match{"Ha"}` | `Match{"Haha"}` |
|
||||
|`Match{"Haha"}` | `Match{"Hahaha"}` |
|
||||
|`Match{"Hahaha"}` | `Match{"Ha"}` (Forward progress stops being made) |
|
||||
|
||||
As you can see in this example, each successive iteration builds up a final
|
||||
match incrementally until progress is no longer being made. At that point,
|
||||
the longest match is returned: `Match{"Hahaha"}`.
|
8
types.h
8
types.h
@ -39,7 +39,7 @@ enum VMOpcode {
|
||||
VM_REF,
|
||||
VM_BACKREF,
|
||||
VM_NODENT,
|
||||
VM_CANNED,
|
||||
VM_LEFTRECURSION,
|
||||
};
|
||||
|
||||
struct match_s; // forward declared to resolve circular struct defs
|
||||
@ -80,7 +80,7 @@ typedef struct vm_op_s {
|
||||
unsigned int visits;
|
||||
const char *at;
|
||||
struct vm_op_s *fallback;
|
||||
} canned;
|
||||
} leftrec;
|
||||
struct vm_op_s *pat;
|
||||
} args;
|
||||
} vm_op_t;
|
||||
@ -93,7 +93,9 @@ typedef struct match_s {
|
||||
const char *start, *end;
|
||||
struct match_s *child, *nextsibling;
|
||||
vm_op_t *op;
|
||||
unsigned int refcount;
|
||||
// Intrusive linked list nodes for garbage collection:
|
||||
struct match_s **atme, *next;
|
||||
int refcount;
|
||||
} match_t;
|
||||
|
||||
//
|
||||
|
270
vm.c
270
vm.c
@ -12,25 +12,31 @@
|
||||
#include "utils.h"
|
||||
#include "vm.h"
|
||||
|
||||
// Linked list operations:
|
||||
#define LL_PREPEND(head, node) do { (node)->atme = &(head); (node)->next = head; if (head) (head)->atme = &(node)->next; head = node; } while(0)
|
||||
#define LL_REMOVE(node) do { *(node)->atme = (node)->next; if ((node)->next) (node)->next->atme = (node)->atme; } while(0)
|
||||
|
||||
// Refcounting ownership-setting macros:
|
||||
#define ADD_OWNER(owner, m) do { owner = m; ++(m)->refcount; } while(0)
|
||||
#define REMOVE_OWNERSHIP(owner) do { if (owner) { --(owner)->refcount; recycle_if_unused(&(owner)); owner = NULL; } } while(0)
|
||||
|
||||
// New match objects are either recycled from unused match objects or allocated
|
||||
// from the heap. While it is in use, the match object is stored in the
|
||||
// `in_use_matches` linked list. Once it is no longer needed, it is moved to
|
||||
// the `unused_matches` linked list so it can be reused without the need for
|
||||
// additional calls to malloc/free. Thus, it is an invariant that every match
|
||||
// object is in one of these two lists:
|
||||
static match_t *in_use_matches = NULL, *unused_matches = NULL;
|
||||
|
||||
__attribute__((nonnull, pure))
|
||||
static inline const char *next_char(file_t *f, const char *str);
|
||||
__attribute__((nonnull))
|
||||
static const char *match_backref(const char *str, vm_op_t *op, match_t *cap, unsigned int flags);
|
||||
__attribute__((hot, nonnull(2,3,4)))
|
||||
static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags);
|
||||
__attribute__((nonnull))
|
||||
static match_t *get_capture_by_num(match_t *m, int *n);
|
||||
__attribute__((nonnull, pure))
|
||||
static match_t *get_capture_by_name(match_t *m, const char *name);
|
||||
|
||||
static inline void set_owner(match_t *m, match_t **owner)
|
||||
{
|
||||
check(owner != &m->child, "Circular ownership");
|
||||
check(owner != &m->nextsibling, "Circular ownership");
|
||||
*owner = m;
|
||||
++m->refcount;
|
||||
}
|
||||
|
||||
//
|
||||
// Return the location of the next character or UTF8 codepoint.
|
||||
// (i.e. skip forward one codepoint at a time, not one byte at a time)
|
||||
@ -51,22 +57,6 @@ static inline const char *next_char(file_t *f, const char *str)
|
||||
return str;
|
||||
}
|
||||
|
||||
//
|
||||
// Recursively deallocate a match object and set to NULL
|
||||
//
|
||||
void destroy_match(match_t **m)
|
||||
{
|
||||
if (*m == NULL) return;
|
||||
if (--(*m)->refcount > 0) {
|
||||
*m = NULL;
|
||||
return;
|
||||
}
|
||||
destroy_match(&((*m)->child));
|
||||
destroy_match(&((*m)->nextsibling));
|
||||
free(*m);
|
||||
*m = NULL;
|
||||
}
|
||||
|
||||
//
|
||||
// Attempt to match text against a previously captured value.
|
||||
// Return the character position after the backref has matched, or NULL if no match has occurred.
|
||||
@ -131,22 +121,26 @@ static const char *match_backref(const char *str, vm_op_t *op, match_t *cap, uns
|
||||
// a match struct, or NULL if no match is found.
|
||||
// The returned value should be free()'d to avoid memory leaking.
|
||||
//
|
||||
static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags)
|
||||
match_t *match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags)
|
||||
{
|
||||
switch (op->type) {
|
||||
case VM_CANNED: {
|
||||
if (str == op->args.canned.at) {
|
||||
++op->args.canned.visits;
|
||||
// TODO: deep copy?
|
||||
return op->args.canned.match;
|
||||
case VM_LEFTRECURSION: {
|
||||
// Left recursion occurs when a pattern directly or indirectly
|
||||
// invokes itself at the same position in the text. It's handled as
|
||||
// a special case, but if a pattern invokes itself at a later
|
||||
// point, it can be handled with normal recursion.
|
||||
// See: left-recursion.md for more details.
|
||||
if (str == op->args.leftrec.at) {
|
||||
++op->args.leftrec.visits;
|
||||
return op->args.leftrec.match;
|
||||
} else {
|
||||
return _match(defs, f, str, op->args.canned.fallback, flags);
|
||||
return match(defs, f, str, op->args.leftrec.fallback, flags);
|
||||
}
|
||||
}
|
||||
case VM_ANYCHAR: {
|
||||
if (str >= f->end || *str == '\n')
|
||||
return NULL;
|
||||
match_t *m = new(match_t);
|
||||
match_t *m = new_match();
|
||||
m->op = op;
|
||||
m->start = str;
|
||||
m->end = next_char(f, str);
|
||||
@ -157,7 +151,7 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
|
||||
if ((flags & BP_IGNORECASE) ? memicmp(str, op->args.s, (size_t)op->len) != 0
|
||||
: memcmp(str, op->args.s, (size_t)op->len) != 0)
|
||||
return NULL;
|
||||
match_t *m = new(match_t);
|
||||
match_t *m = new_match();
|
||||
m->op = op;
|
||||
m->start = str;
|
||||
m->end = str + op->len;
|
||||
@ -167,26 +161,26 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
|
||||
if (str >= f->end) return NULL;
|
||||
if ((unsigned char)*str < op->args.range.low || (unsigned char)*str > op->args.range.high)
|
||||
return NULL;
|
||||
match_t *m = new(match_t);
|
||||
match_t *m = new_match();
|
||||
m->op = op;
|
||||
m->start = str;
|
||||
m->end = str + 1;
|
||||
return m;
|
||||
}
|
||||
case VM_NOT: {
|
||||
match_t *m = _match(defs, f, str, op->args.pat, flags);
|
||||
match_t *m = match(defs, f, str, op->args.pat, flags);
|
||||
if (m != NULL) {
|
||||
destroy_match(&m);
|
||||
recycle_if_unused(&m);
|
||||
return NULL;
|
||||
}
|
||||
m = new(match_t);
|
||||
m = new_match();
|
||||
m->op = op;
|
||||
m->start = str;
|
||||
m->end = str;
|
||||
return m;
|
||||
}
|
||||
case VM_UPTO_AND: {
|
||||
match_t *m = new(match_t);
|
||||
match_t *m = new_match();
|
||||
m->start = str;
|
||||
m->op = op;
|
||||
|
||||
@ -201,9 +195,9 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
|
||||
for (const char *prev = NULL; prev < str; ) {
|
||||
prev = str;
|
||||
if (pat) {
|
||||
match_t *p = _match(defs, f, str, pat, flags);
|
||||
match_t *p = match(defs, f, str, pat, flags);
|
||||
if (p != NULL) {
|
||||
set_owner(p, dest);
|
||||
ADD_OWNER(*dest, p);
|
||||
m->end = p->end;
|
||||
return m;
|
||||
}
|
||||
@ -212,9 +206,9 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
|
||||
return m;
|
||||
}
|
||||
if (skip) {
|
||||
match_t *s = _match(defs, f, str, skip, flags);
|
||||
match_t *s = match(defs, f, str, skip, flags);
|
||||
if (s != NULL) {
|
||||
set_owner(s, dest);
|
||||
ADD_OWNER(*dest, s);
|
||||
dest = &s->nextsibling;
|
||||
str = s->end;
|
||||
continue;
|
||||
@ -226,11 +220,11 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
|
||||
if (str < f->end && *str != '\n')
|
||||
str = next_char(f, str);
|
||||
}
|
||||
destroy_match(&m);
|
||||
recycle_if_unused(&m);
|
||||
return NULL;
|
||||
}
|
||||
case VM_REPEAT: {
|
||||
match_t *m = new(match_t);
|
||||
match_t *m = new_match();
|
||||
m->start = str;
|
||||
m->end = str;
|
||||
m->op = op;
|
||||
@ -243,14 +237,14 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
|
||||
// Separator
|
||||
match_t *sep = NULL;
|
||||
if (op->args.repetitions.sep != NULL && reps > 0) {
|
||||
sep = _match(defs, f, str, op->args.repetitions.sep, flags);
|
||||
sep = match(defs, f, str, op->args.repetitions.sep, flags);
|
||||
if (sep == NULL) break;
|
||||
str = sep->end;
|
||||
}
|
||||
match_t *p = _match(defs, f, str, op->args.repetitions.repeat_pat, flags);
|
||||
match_t *p = match(defs, f, str, op->args.repetitions.repeat_pat, flags);
|
||||
if (p == NULL) {
|
||||
str = start;
|
||||
destroy_match(&sep);
|
||||
recycle_if_unused(&sep);
|
||||
break;
|
||||
}
|
||||
if (p->end == start && reps > 0) {
|
||||
@ -260,8 +254,8 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
|
||||
// loop either. We know that this will continue to loop
|
||||
// until reps==max, so let's just cut to the chase instead
|
||||
// of looping infinitely.
|
||||
destroy_match(&sep);
|
||||
destroy_match(&p);
|
||||
recycle_if_unused(&sep);
|
||||
recycle_if_unused(&p);
|
||||
if (op->args.repetitions.max == -1)
|
||||
reps = ~(size_t)0;
|
||||
else
|
||||
@ -269,16 +263,16 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
|
||||
break;
|
||||
}
|
||||
if (sep) {
|
||||
set_owner(sep, dest);
|
||||
ADD_OWNER(*dest, sep);
|
||||
dest = &sep->nextsibling;
|
||||
}
|
||||
set_owner(p, dest);
|
||||
ADD_OWNER(*dest, p);
|
||||
dest = &p->nextsibling;
|
||||
str = p->end;
|
||||
}
|
||||
|
||||
if (reps < (size_t)op->args.repetitions.min) {
|
||||
destroy_match(&m);
|
||||
recycle_if_unused(&m);
|
||||
return NULL;
|
||||
}
|
||||
m->end = str;
|
||||
@ -288,75 +282,75 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
|
||||
ssize_t backtrack = op->args.pat->len;
|
||||
check(backtrack != -1, "'<' is only allowed for fixed-length operations");
|
||||
if (str - backtrack < f->contents) return NULL;
|
||||
match_t *before = _match(defs, f, str - backtrack, op->args.pat, flags);
|
||||
match_t *before = match(defs, f, str - backtrack, op->args.pat, flags);
|
||||
if (before == NULL) return NULL;
|
||||
match_t *m = new(match_t);
|
||||
match_t *m = new_match();
|
||||
m->start = str;
|
||||
m->end = str;
|
||||
m->op = op;
|
||||
set_owner(before, &m->child);
|
||||
ADD_OWNER(m->child, before);
|
||||
return m;
|
||||
}
|
||||
case VM_BEFORE: {
|
||||
match_t *after = _match(defs, f, str, op->args.pat, flags);
|
||||
match_t *after = match(defs, f, str, op->args.pat, flags);
|
||||
if (after == NULL) return NULL;
|
||||
match_t *m = new(match_t);
|
||||
match_t *m = new_match();
|
||||
m->start = str;
|
||||
m->end = str;
|
||||
m->op = op;
|
||||
set_owner(after, &m->child);
|
||||
ADD_OWNER(m->child, after);
|
||||
return m;
|
||||
}
|
||||
case VM_CAPTURE: {
|
||||
match_t *p = _match(defs, f, str, op->args.pat, flags);
|
||||
match_t *p = match(defs, f, str, op->args.pat, flags);
|
||||
if (p == NULL) return NULL;
|
||||
match_t *m = new(match_t);
|
||||
match_t *m = new_match();
|
||||
m->start = str;
|
||||
m->end = p->end;
|
||||
m->op = op;
|
||||
set_owner(p, &m->child);
|
||||
ADD_OWNER(m->child, p);
|
||||
return m;
|
||||
}
|
||||
case VM_HIDE: {
|
||||
match_t *p = _match(defs, f, str, op->args.pat, flags);
|
||||
match_t *p = match(defs, f, str, op->args.pat, flags);
|
||||
if (p == NULL) return NULL;
|
||||
match_t *m = new(match_t);
|
||||
match_t *m = new_match();
|
||||
m->start = str;
|
||||
m->end = p->end;
|
||||
m->op = op;
|
||||
set_owner(p, &m->child);
|
||||
ADD_OWNER(m->child, p);
|
||||
return m;
|
||||
}
|
||||
case VM_OTHERWISE: {
|
||||
match_t *m = _match(defs, f, str, op->args.multiple.first, flags);
|
||||
if (m == NULL) m = _match(defs, f, str, op->args.multiple.second, flags);
|
||||
match_t *m = match(defs, f, str, op->args.multiple.first, flags);
|
||||
if (m == NULL) m = match(defs, f, str, op->args.multiple.second, flags);
|
||||
return m;
|
||||
}
|
||||
case VM_CHAIN: {
|
||||
match_t *m1 = _match(defs, f, str, op->args.multiple.first, flags);
|
||||
match_t *m1 = match(defs, f, str, op->args.multiple.first, flags);
|
||||
if (m1 == NULL) return NULL;
|
||||
|
||||
match_t *m2;
|
||||
{ // Push backrefs and run matching, then cleanup
|
||||
def_t *defs2 = with_backrefs(defs, f, m1);
|
||||
m2 = _match(defs2, f, m1->end, op->args.multiple.second, flags);
|
||||
m2 = match(defs2, f, m1->end, op->args.multiple.second, flags);
|
||||
free_defs(&defs2, defs);
|
||||
}
|
||||
|
||||
if (m2 == NULL) {
|
||||
destroy_match(&m1);
|
||||
recycle_if_unused(&m1);
|
||||
return NULL;
|
||||
}
|
||||
match_t *m = new(match_t);
|
||||
match_t *m = new_match();
|
||||
m->start = str;
|
||||
m->end = m2->end;
|
||||
m->op = op;
|
||||
set_owner(m1, &m->child);
|
||||
set_owner(m2, &m1->nextsibling);
|
||||
ADD_OWNER(m->child, m1);
|
||||
ADD_OWNER(m1->nextsibling, m2);
|
||||
return m;
|
||||
}
|
||||
case VM_EQUAL: case VM_NOT_EQUAL: {
|
||||
match_t *m1 = _match(defs, f, str, op->args.multiple.first, flags);
|
||||
match_t *m1 = match(defs, f, str, op->args.multiple.first, flags);
|
||||
if (m1 == NULL) return NULL;
|
||||
|
||||
// <p1>==<p2> matches iff the text of <p1> matches <p2>
|
||||
@ -368,35 +362,35 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
|
||||
.nlines=1 + get_line_number(f, m1->end)-get_line_number(f, m1->start),
|
||||
.mmapped=f->mmapped,
|
||||
};
|
||||
match_t *m2 = _match(defs, &inner, str, op->args.multiple.second, flags);
|
||||
match_t *m2 = match(defs, &inner, str, op->args.multiple.second, flags);
|
||||
if ((m2 == NULL) == (op->type == VM_EQUAL)) {
|
||||
destroy_match(&m1);
|
||||
if (m2 != NULL) destroy_match(&m2);
|
||||
recycle_if_unused(&m1);
|
||||
if (m2 != NULL) recycle_if_unused(&m2);
|
||||
return NULL;
|
||||
}
|
||||
match_t *m = new(match_t);
|
||||
match_t *m = new_match();
|
||||
m->start = m1->start;
|
||||
m->end = m1->end;
|
||||
m->op = op;
|
||||
set_owner(m1, &m->child);
|
||||
ADD_OWNER(m->child, m1);
|
||||
if (op->type == VM_EQUAL) {
|
||||
set_owner(m2, &m1->nextsibling);
|
||||
ADD_OWNER(m1->nextsibling, m2);
|
||||
} else {
|
||||
destroy_match(&m2);
|
||||
recycle_if_unused(&m2);
|
||||
}
|
||||
return m;
|
||||
}
|
||||
case VM_REPLACE: {
|
||||
match_t *p = NULL;
|
||||
if (op->args.replace.pat) {
|
||||
p = _match(defs, f, str, op->args.replace.pat, flags);
|
||||
p = match(defs, f, str, op->args.replace.pat, flags);
|
||||
if (p == NULL) return NULL;
|
||||
}
|
||||
match_t *m = new(match_t);
|
||||
match_t *m = new_match();
|
||||
m->start = str;
|
||||
m->op = op;
|
||||
if (p) {
|
||||
set_owner(p, &m->child);
|
||||
ADD_OWNER(m->child, p);
|
||||
m->end = p->end;
|
||||
} else {
|
||||
m->end = m->start;
|
||||
@ -408,12 +402,12 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
|
||||
check(def != NULL, "Unknown identifier: '%s'", op->args.s);
|
||||
vm_op_t *ref = def->op;
|
||||
|
||||
vm_op_t fail_op = {
|
||||
.type = VM_CANNED,
|
||||
vm_op_t rec_op = {
|
||||
.type = VM_LEFTRECURSION,
|
||||
.start = ref->start,
|
||||
.end = ref->end,
|
||||
.len = 0,
|
||||
.args.canned = {
|
||||
.args.leftrec = {
|
||||
.match = NULL,
|
||||
.visits = 0,
|
||||
.at = str,
|
||||
@ -424,29 +418,42 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
|
||||
.namelen = def->namelen,
|
||||
.name = def->name,
|
||||
.file = def->file,
|
||||
.op = &fail_op,
|
||||
.op = &rec_op,
|
||||
.next = defs,
|
||||
};
|
||||
|
||||
const char *prev = str;
|
||||
match_t *m = _match(&defs2, f, str, ref, flags);
|
||||
match_t *m = match(&defs2, f, str, ref, flags);
|
||||
if (m == NULL) return NULL;
|
||||
|
||||
while (fail_op.args.canned.visits > 0 && m->end > prev) {
|
||||
fail_op.args.canned.visits = 0;
|
||||
fail_op.args.canned.match = m;
|
||||
while (rec_op.args.leftrec.visits > 0) {
|
||||
rec_op.args.leftrec.visits = 0;
|
||||
REMOVE_OWNERSHIP(rec_op.args.leftrec.match);
|
||||
ADD_OWNER(rec_op.args.leftrec.match, m);
|
||||
prev = m->end;
|
||||
match_t *m2 = _match(&defs2, f, str, ref, flags);
|
||||
match_t *m2 = match(&defs2, f, str, ref, flags);
|
||||
if (m2 == NULL) break;
|
||||
if (m2->end <= prev) {
|
||||
recycle_if_unused(&m2);
|
||||
break;
|
||||
}
|
||||
m = m2;
|
||||
}
|
||||
|
||||
if (rec_op.args.leftrec.match) {
|
||||
// Ensure that `m` isn't garbage collected right now, but do
|
||||
// clean up the recursive match result if it's not needed.
|
||||
++m->refcount;
|
||||
REMOVE_OWNERSHIP(rec_op.args.leftrec.match);
|
||||
--m->refcount;
|
||||
}
|
||||
|
||||
return m;
|
||||
}
|
||||
case VM_BACKREF: {
|
||||
const char *end = match_backref(str, op, op->args.backref, flags);
|
||||
if (end == NULL) return NULL;
|
||||
match_t *m = new(match_t);
|
||||
match_t *m = new_match();
|
||||
m->op = op;
|
||||
m->start = str;
|
||||
m->end = end;
|
||||
@ -473,7 +480,7 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
|
||||
if (str[i] != denter || &str[i] >= f->end) return NULL;
|
||||
}
|
||||
|
||||
match_t *m = new(match_t);
|
||||
match_t *m = new_match();
|
||||
m->start = start;
|
||||
m->end = &str[dents];
|
||||
m->op = op;
|
||||
@ -540,14 +547,79 @@ match_t *get_capture(match_t *m, const char **id)
|
||||
}
|
||||
|
||||
//
|
||||
// Wrapper function for _match() to kickstart the recursion info.
|
||||
// Return a match object which can be used (may be allocated or recycled).
|
||||
//
|
||||
match_t *match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags)
|
||||
match_t *new_match(void)
|
||||
{
|
||||
return _match(defs, f, str, op, flags);
|
||||
match_t *m;
|
||||
if (unused_matches) {
|
||||
m = unused_matches;
|
||||
LL_REMOVE(m);
|
||||
memset(m, 0, sizeof(match_t));
|
||||
} else {
|
||||
m = new(match_t);
|
||||
}
|
||||
// Keep track of the object:
|
||||
LL_PREPEND(in_use_matches, m);
|
||||
return m;
|
||||
}
|
||||
|
||||
//
|
||||
// If the given match is not currently a child member of another match (or
|
||||
// otherwise reserved) then put it back in the pool of unused match objects.
|
||||
//
|
||||
void recycle_if_unused(match_t **at_m)
|
||||
{
|
||||
match_t *m = *at_m;
|
||||
if (m == NULL) return;
|
||||
if (m->refcount > 0) {
|
||||
*at_m = NULL;
|
||||
return;
|
||||
}
|
||||
|
||||
REMOVE_OWNERSHIP(m->child);
|
||||
REMOVE_OWNERSHIP(m->nextsibling);
|
||||
|
||||
LL_REMOVE(m);
|
||||
memset(m, 0, sizeof(match_t));
|
||||
LL_PREPEND(unused_matches, m);
|
||||
*at_m = NULL;
|
||||
}
|
||||
|
||||
//
|
||||
// Force all match objects into the pool of unused match objects.
|
||||
//
|
||||
size_t recycle_all_matches(void)
|
||||
{
|
||||
size_t count = 0;
|
||||
while (in_use_matches) {
|
||||
match_t *m = in_use_matches;
|
||||
LL_REMOVE(m);
|
||||
LL_PREPEND(unused_matches, m);
|
||||
++count;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
//
|
||||
// Free all match objects in memory.
|
||||
//
|
||||
size_t free_all_matches(void)
|
||||
{
|
||||
size_t count = 0;
|
||||
recycle_all_matches();
|
||||
while (unused_matches) {
|
||||
match_t *m = unused_matches;
|
||||
LL_REMOVE(m);
|
||||
free(m);
|
||||
++count;
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
//
|
||||
// Deallocate memory associated with an op
|
||||
//
|
||||
void destroy_op(vm_op_t *op)
|
||||
{
|
||||
switch (op->type) {
|
||||
|
9
vm.h
9
vm.h
@ -8,14 +8,17 @@
|
||||
|
||||
#include "types.h"
|
||||
|
||||
__attribute__((nonnull(2,3,4)))
|
||||
__attribute__((hot, nonnull(2,3,4)))
|
||||
match_t *match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags);
|
||||
__attribute__((nonnull))
|
||||
void destroy_match(match_t **m);
|
||||
__attribute__((nonnull))
|
||||
match_t *get_capture(match_t *m, const char **id);
|
||||
__attribute__((nonnull))
|
||||
void destroy_op(vm_op_t *op);
|
||||
match_t *new_match(void);
|
||||
size_t free_all_matches(void);
|
||||
__attribute__((nonnull))
|
||||
void recycle_if_unused(match_t **at_m);
|
||||
size_t recycle_all_matches(void);
|
||||
|
||||
#endif
|
||||
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1
|
||||
|
Loading…
Reference in New Issue
Block a user