aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBruce Hill <bruce@bruce-hill.com>2021-01-14 19:21:31 -0800
committerBruce Hill <bruce@bruce-hill.com>2021-01-14 19:21:31 -0800
commit3cd61e3abc79a80d38afa9c633c59585aeb0311a (patch)
treeebc97bfb416d0e9a0a486f754221b21a5a05fbc2
parent9238ffea8819461687e24ea45e466403faa68681 (diff)
Overhaul of memory tracking and left recursion. Added explanation doc
for left recursion and fixed all visible memory leaks.
-rw-r--r--Makefile5
-rw-r--r--bp.c41
-rw-r--r--file_loader.c2
-rw-r--r--file_loader.h2
-rw-r--r--left-recursion.md68
-rw-r--r--types.h8
-rw-r--r--vm.c270
-rw-r--r--vm.h9
8 files changed, 281 insertions, 124 deletions
diff --git a/Makefile b/Makefile
index 40a57ea..b73d68d 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,7 @@ PREFIX=/usr/local
SYSCONFDIR=/etc
CFLAGS=-std=c99 -Werror -D_XOPEN_SOURCE=700 -D_GNU_SOURCE -D_POSIX_C_SOURCE=200809L
CWARN=-Wall -Wpedantic -Wextra -Wsign-conversion -Wtype-limits -Wunused-result
+EXTRA_FLAGS=
G=
O=-O3
@@ -13,10 +14,10 @@ OBJFILES=$(CFILES:.c=.o)
all: $(NAME)
%.o: %.c %.h types.h
- $(CC) -c $(CFLAGS) $(CWARN) $(G) $(O) -o $@ $<
+ $(CC) -c $(CFLAGS) $(EXTRA_CFLAGS) $(CWARN) $(G) $(O) -o $@ $<
$(NAME): $(OBJFILES) $(NAME).c
- $(CC) $(CFLAGS) $(CWARN) $(G) $(O) -o $@ $(OBJFILES) $(NAME).c
+ $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(CWARN) $(G) $(O) -o $@ $(OBJFILES) $(NAME).c
clean:
rm -f $(NAME) $(OBJFILES)
diff --git a/bp.c b/bp.c
index 57c8135..51d34ca 100644
--- a/bp.c
+++ b/bp.c
@@ -119,9 +119,8 @@ static int process_file(def_t *defs, const char *filename, vm_op_t *pattern, uns
}
}
- if (m != NULL)
- destroy_match(&m);
-
+ recycle_if_unused(&m);
+ check(recycle_all_matches() == 0, "Memory leak: there should no longer be any matches in use at this point.");
destroy_file(&f);
return success;
@@ -134,12 +133,16 @@ int main(int argc, char *argv[])
unsigned int flags = 0;
char *flag = NULL;
char path[PATH_MAX] = {0};
- const char *rule = "find-all";
+ const char *mode = "find-all";
def_t *defs = NULL;
file_t *loaded_files = NULL;
+ // Define an opcode that is just a reference to the rule `pattern`
+ file_t *pat_file = spoof_file(&loaded_files, "<pattern>", "pattern");
+ vm_op_t *pattern = bp_pattern(pat_file, pat_file->contents);
+
// Load builtins:
if (access("/etc/xdg/bp/builtins.bp", R_OK) != -1) {
file_t *f = load_file(&loaded_files, "/etc/xdg/bp/builtins.bp");
@@ -176,13 +179,11 @@ int main(int argc, char *argv[])
} else if (FLAG("--replace") || FLAG("-r")) {
// TODO: spoof file as sprintf("pattern => '%s'", flag)
// except that would require handling edge cases like quotation marks etc.
- file_t *pat_file = spoof_file(&loaded_files, "<pattern>", "pattern");
- vm_op_t *patref = bp_pattern(pat_file, pat_file->contents);
file_t *replace_file = spoof_file(&loaded_files, "<replace argument>", flag);
- vm_op_t *rep = bp_replacement(replace_file, patref, replace_file->contents);
+ vm_op_t *rep = bp_replacement(replace_file, pattern, replace_file->contents);
check(rep, "Replacement failed to compile: %s", flag);
defs = with_def(defs, replace_file, strlen("replacement"), "replacement", rep);
- rule = "replace-all";
+ mode = "replace-all";
} else if (FLAG("--grammar") || FLAG("-g")) {
file_t *f = load_file(&loaded_files, flag);
if (f == NULL) {
@@ -229,7 +230,7 @@ int main(int argc, char *argv[])
defs = with_def(defs, arg_file, strlen("pattern"), "pattern", p);
++npatterns;
} else if (FLAG("--mode") || FLAG("-m")) {
- rule = flag;
+ mode = flag;
} else if (argv[i][0] == '-' && argv[i][1] && argv[i][1] != '-') { // single-char flags
for (char *c = &argv[i][1]; *c; ++c) {
switch (*c) {
@@ -269,16 +270,20 @@ int main(int argc, char *argv[])
print_options |= PRINT_COLOR | PRINT_LINE_NUMBERS;
}
- def_t *pattern_def = lookup(defs, rule);
- check(pattern_def != NULL, "No such rule: '%s'", rule);
- vm_op_t *pattern = pattern_def->op;
+ // Define an opcode that is just a reference to the overarching mode (e.g. find-all)
+ if (lookup(defs, mode) == NULL) {
+ printf("The mode '%s' is not defined.\n", mode);
+ return 1;
+ }
+ file_t *mode_file = spoof_file(&loaded_files, "<mode>", mode);
+ vm_op_t *mode_op = bp_pattern(mode_file, mode_file->contents);
int found = 0;
if (flags & BP_JSON) printf("[");
if (i < argc) {
// Files pass in as command line args:
for (int nfiles = 0; i < argc; nfiles++, i++) {
- found += process_file(defs, argv[i], pattern, flags);
+ found += process_file(defs, argv[i], mode_op, flags);
}
} else if (isatty(STDIN_FILENO)) {
// No files, no piped in input, so use * **/*:
@@ -286,21 +291,27 @@ int main(int argc, char *argv[])
glob("*", 0, NULL, &globbuf);
glob("**/*", GLOB_APPEND, NULL, &globbuf);
for (size_t i = 0; i < globbuf.gl_pathc; i++) {
- found += process_file(defs, globbuf.gl_pathv[i], pattern, flags);
+ found += process_file(defs, globbuf.gl_pathv[i], mode_op, flags);
}
globfree(&globbuf);
} else {
// Piped in input:
- found += process_file(defs, NULL, pattern, flags);
+ found += process_file(defs, NULL, mode_op, flags);
}
if (flags & BP_JSON) printf("]\n");
+#ifdef FREE_ON_EXIT
+ // This code frees up all residual heap-allocated memory. Since the program
+ // is about to exit, this step is unnecessary. However, it is useful for
+ // tracking down memory leaks.
free_defs(&defs, NULL);
while (loaded_files) {
file_t *next = loaded_files->next;
destroy_file(&loaded_files);
loaded_files = next;
}
+ free_all_matches();
+#endif
return (found > 0) ? 0 : 1;
}
diff --git a/file_loader.c b/file_loader.c
index 560a340..48e81e4 100644
--- a/file_loader.c
+++ b/file_loader.c
@@ -89,7 +89,7 @@ file_t *load_file(file_t **files, const char *filename)
//
// Create a virtual file from a string.
//
-file_t *spoof_file(file_t **files, const char *filename, char *text)
+file_t *spoof_file(file_t **files, const char *filename, const char *text)
{
if (filename == NULL) filename = "";
file_t *f = new(file_t);
diff --git a/file_loader.h b/file_loader.h
index 85bb920..91480b2 100644
--- a/file_loader.h
+++ b/file_loader.h
@@ -19,7 +19,7 @@ typedef struct file_s {
file_t *load_file(file_t **files, const char *filename);
__attribute__((nonnull(3), returns_nonnull))
-file_t *spoof_file(file_t **files, const char *filename, char *text);
+file_t *spoof_file(file_t **files, const char *filename, const char *text);
__attribute__((nonnull))
void intern_file(file_t *f);
__attribute__((nonnull))
diff --git a/left-recursion.md b/left-recursion.md
new file mode 100644
index 0000000..3e65e1c
--- /dev/null
+++ b/left-recursion.md
@@ -0,0 +1,68 @@
+# Left Recursion in BP
+
+Left recursion presents a special difficulty for parsing expression grammars.
+A regular recursive rule looks like this:
+
+```
+rec: "{" *rec "}"
+```
+
+This is a relatively simple case to handle, because we're always making forward
+progress. In other words, calling `match(<rec>, str)` might call
+`match(<rec>, str + 1)` but it will never recursively call the same function
+with the same arguments: `match(<rec>, str)`.
+
+Left-recursive rules, on the other hand, are rules that may attempt to match
+themselves at the same position. For example:
+
+```
+laugh: (laugh "ha") / "Ha"
+```
+
+The rule above is functionally equivalent to `laugh: "Ha" (*"ha")`, but it's
+written in a recursive style. It should be obvious by inspection that the
+string `Hahaha` matches both versions of the rule, but a naive implementation
+of `match()` would cause `match(<laugh>, str)` to call `match(<laugh>, str)`
+before doing anything else, which results in infinite recursion and a stack
+overflow. One option is to simply detect left recursion and cause a compilation
+error (this is what LPEG does). However, it is possible to actually match left
+recursive rules without overflowing the stack.
+
+## Matching Left Recursion
+
+The solution used in BP is the following:
+
+1. Whenever matching a named rule, `foo`, against string `str`, first get the
+ originally defined pattern for that rule. Call this
+ `foo-original-definition`.
+2. Temporarily define `match(<foo>, str) := signal left recursion and return
+ Fail` (In other words, `<foo>` is temporarily defined to signal left
+ recursion and then fail to match if matching against the string `str`.)
+3. Run `result := match(<foo-original-definition>, str)`.
+4. If the match failed, then return failure.
+5. If the match was successful and did not trigger left recursion, return `result`.
+6. If the match was successful, but did trigger left recursion, then:
+
+ a. If `result` is not longer than any previous `result`, stop looping and
+ return the longest `result` to avoid an infinite loop.
+ b. Otherwise: temporarily define `match(<foo>, str) := signal left recursion and return <result>`
+ c. Go to step 3.
+
+This handles left recursion as a simple loop and successfully matches left
+recursive patterns, even ones that are indirectly or nontrivially left
+recursive.
+
+## Example
+
+Consider the rule `laugh: laugh "ha" / "Ha"` being applied to the input text `"Hahaha!"`:
+
+|Definition of `match(<laugh>, "Hahaha!")` | Result of `match(<laugh "ha" / "Ha">, "Hahaha!")` |
+|------------------------------------------|---------------------------------------------------|
+|`Fail` | `Match{"Ha"}` |
+|`Match{"Ha"}` | `Match{"Haha"}` |
+|`Match{"Haha"}` | `Match{"Hahaha"}` |
+|`Match{"Hahaha"}` | `Match{"Ha"}` (Forward progress stops being made) |
+
+As you can see in this example, each successive iteration builds up a final
+match incrementally until progress is no longer being made. At that point,
+the longest match is returned: `Match{"Hahaha"}`.
diff --git a/types.h b/types.h
index b8c2b68..30a0e9e 100644
--- a/types.h
+++ b/types.h
@@ -39,7 +39,7 @@ enum VMOpcode {
VM_REF,
VM_BACKREF,
VM_NODENT,
- VM_CANNED,
+ VM_LEFTRECURSION,
};
struct match_s; // forward declared to resolve circular struct defs
@@ -80,7 +80,7 @@ typedef struct vm_op_s {
unsigned int visits;
const char *at;
struct vm_op_s *fallback;
- } canned;
+ } leftrec;
struct vm_op_s *pat;
} args;
} vm_op_t;
@@ -93,7 +93,9 @@ typedef struct match_s {
const char *start, *end;
struct match_s *child, *nextsibling;
vm_op_t *op;
- unsigned int refcount;
+ // Intrusive linked list nodes for garbage collection:
+ struct match_s **atme, *next;
+ int refcount;
} match_t;
//
diff --git a/vm.c b/vm.c
index 68c793a..ff62b4d 100644
--- a/vm.c
+++ b/vm.c
@@ -12,25 +12,31 @@
#include "utils.h"
#include "vm.h"
+// Linked list operations:
+#define LL_PREPEND(head, node) do { (node)->atme = &(head); (node)->next = head; if (head) (head)->atme = &(node)->next; head = node; } while(0)
+#define LL_REMOVE(node) do { *(node)->atme = (node)->next; if ((node)->next) (node)->next->atme = (node)->atme; } while(0)
+
+// Refcounting ownership-setting macros:
+#define ADD_OWNER(owner, m) do { owner = m; ++(m)->refcount; } while(0)
+#define REMOVE_OWNERSHIP(owner) do { if (owner) { --(owner)->refcount; recycle_if_unused(&(owner)); owner = NULL; } } while(0)
+
+// New match objects are either recycled from unused match objects or allocated
+// from the heap. While it is in use, the match object is stored in the
+// `in_use_matches` linked list. Once it is no longer needed, it is moved to
+// the `unused_matches` linked list so it can be reused without the need for
+// additional calls to malloc/free. Thus, it is an invariant that every match
+// object is in one of these two lists:
+static match_t *in_use_matches = NULL, *unused_matches = NULL;
+
__attribute__((nonnull, pure))
static inline const char *next_char(file_t *f, const char *str);
__attribute__((nonnull))
static const char *match_backref(const char *str, vm_op_t *op, match_t *cap, unsigned int flags);
-__attribute__((hot, nonnull(2,3,4)))
-static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags);
__attribute__((nonnull))
static match_t *get_capture_by_num(match_t *m, int *n);
__attribute__((nonnull, pure))
static match_t *get_capture_by_name(match_t *m, const char *name);
-static inline void set_owner(match_t *m, match_t **owner)
-{
- check(owner != &m->child, "Circular ownership");
- check(owner != &m->nextsibling, "Circular ownership");
- *owner = m;
- ++m->refcount;
-}
-
//
// Return the location of the next character or UTF8 codepoint.
// (i.e. skip forward one codepoint at a time, not one byte at a time)
@@ -52,22 +58,6 @@ static inline const char *next_char(file_t *f, const char *str)
}
//
-// Recursively deallocate a match object and set to NULL
-//
-void destroy_match(match_t **m)
-{
- if (*m == NULL) return;
- if (--(*m)->refcount > 0) {
- *m = NULL;
- return;
- }
- destroy_match(&((*m)->child));
- destroy_match(&((*m)->nextsibling));
- free(*m);
- *m = NULL;
-}
-
-//
// Attempt to match text against a previously captured value.
// Return the character position after the backref has matched, or NULL if no match has occurred.
//
@@ -131,22 +121,26 @@ static const char *match_backref(const char *str, vm_op_t *op, match_t *cap, uns
// a match struct, or NULL if no match is found.
// The returned value should be free()'d to avoid memory leaking.
//
-static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags)
+match_t *match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags)
{
switch (op->type) {
- case VM_CANNED: {
- if (str == op->args.canned.at) {
- ++op->args.canned.visits;
- // TODO: deep copy?
- return op->args.canned.match;
+ case VM_LEFTRECURSION: {
+ // Left recursion occurs when a pattern directly or indirectly
+ // invokes itself at the same position in the text. It's handled as
+ // a special case, but if a pattern invokes itself at a later
+ // point, it can be handled with normal recursion.
+ // See: left-recursion.md for more details.
+ if (str == op->args.leftrec.at) {
+ ++op->args.leftrec.visits;
+ return op->args.leftrec.match;
} else {
- return _match(defs, f, str, op->args.canned.fallback, flags);
+ return match(defs, f, str, op->args.leftrec.fallback, flags);
}
}
case VM_ANYCHAR: {
if (str >= f->end || *str == '\n')
return NULL;
- match_t *m = new(match_t);
+ match_t *m = new_match();
m->op = op;
m->start = str;
m->end = next_char(f, str);
@@ -157,7 +151,7 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
if ((flags & BP_IGNORECASE) ? memicmp(str, op->args.s, (size_t)op->len) != 0
: memcmp(str, op->args.s, (size_t)op->len) != 0)
return NULL;
- match_t *m = new(match_t);
+ match_t *m = new_match();
m->op = op;
m->start = str;
m->end = str + op->len;
@@ -167,26 +161,26 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
if (str >= f->end) return NULL;
if ((unsigned char)*str < op->args.range.low || (unsigned char)*str > op->args.range.high)
return NULL;
- match_t *m = new(match_t);
+ match_t *m = new_match();
m->op = op;
m->start = str;
m->end = str + 1;
return m;
}
case VM_NOT: {
- match_t *m = _match(defs, f, str, op->args.pat, flags);
+ match_t *m = match(defs, f, str, op->args.pat, flags);
if (m != NULL) {
- destroy_match(&m);
+ recycle_if_unused(&m);
return NULL;
}
- m = new(match_t);
+ m = new_match();
m->op = op;
m->start = str;
m->end = str;
return m;
}
case VM_UPTO_AND: {
- match_t *m = new(match_t);
+ match_t *m = new_match();
m->start = str;
m->op = op;
@@ -201,9 +195,9 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
for (const char *prev = NULL; prev < str; ) {
prev = str;
if (pat) {
- match_t *p = _match(defs, f, str, pat, flags);
+ match_t *p = match(defs, f, str, pat, flags);
if (p != NULL) {
- set_owner(p, dest);
+ ADD_OWNER(*dest, p);
m->end = p->end;
return m;
}
@@ -212,9 +206,9 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
return m;
}
if (skip) {
- match_t *s = _match(defs, f, str, skip, flags);
+ match_t *s = match(defs, f, str, skip, flags);
if (s != NULL) {
- set_owner(s, dest);
+ ADD_OWNER(*dest, s);
dest = &s->nextsibling;
str = s->end;
continue;
@@ -226,11 +220,11 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
if (str < f->end && *str != '\n')
str = next_char(f, str);
}
- destroy_match(&m);
+ recycle_if_unused(&m);
return NULL;
}
case VM_REPEAT: {
- match_t *m = new(match_t);
+ match_t *m = new_match();
m->start = str;
m->end = str;
m->op = op;
@@ -243,14 +237,14 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
// Separator
match_t *sep = NULL;
if (op->args.repetitions.sep != NULL && reps > 0) {
- sep = _match(defs, f, str, op->args.repetitions.sep, flags);
+ sep = match(defs, f, str, op->args.repetitions.sep, flags);
if (sep == NULL) break;
str = sep->end;
}
- match_t *p = _match(defs, f, str, op->args.repetitions.repeat_pat, flags);
+ match_t *p = match(defs, f, str, op->args.repetitions.repeat_pat, flags);
if (p == NULL) {
str = start;
- destroy_match(&sep);
+ recycle_if_unused(&sep);
break;
}
if (p->end == start && reps > 0) {
@@ -260,8 +254,8 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
// loop either. We know that this will continue to loop
// until reps==max, so let's just cut to the chase instead
// of looping infinitely.
- destroy_match(&sep);
- destroy_match(&p);
+ recycle_if_unused(&sep);
+ recycle_if_unused(&p);
if (op->args.repetitions.max == -1)
reps = ~(size_t)0;
else
@@ -269,16 +263,16 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
break;
}
if (sep) {
- set_owner(sep, dest);
+ ADD_OWNER(*dest, sep);
dest = &sep->nextsibling;
}
- set_owner(p, dest);
+ ADD_OWNER(*dest, p);
dest = &p->nextsibling;
str = p->end;
}
if (reps < (size_t)op->args.repetitions.min) {
- destroy_match(&m);
+ recycle_if_unused(&m);
return NULL;
}
m->end = str;
@@ -288,75 +282,75 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
ssize_t backtrack = op->args.pat->len;
check(backtrack != -1, "'<' is only allowed for fixed-length operations");
if (str - backtrack < f->contents) return NULL;
- match_t *before = _match(defs, f, str - backtrack, op->args.pat, flags);
+ match_t *before = match(defs, f, str - backtrack, op->args.pat, flags);
if (before == NULL) return NULL;
- match_t *m = new(match_t);
+ match_t *m = new_match();
m->start = str;
m->end = str;
m->op = op;
- set_owner(before, &m->child);
+ ADD_OWNER(m->child, before);
return m;
}
case VM_BEFORE: {
- match_t *after = _match(defs, f, str, op->args.pat, flags);
+ match_t *after = match(defs, f, str, op->args.pat, flags);
if (after == NULL) return NULL;
- match_t *m = new(match_t);
+ match_t *m = new_match();
m->start = str;
m->end = str;
m->op = op;
- set_owner(after, &m->child);
+ ADD_OWNER(m->child, after);
return m;
}
case VM_CAPTURE: {
- match_t *p = _match(defs, f, str, op->args.pat, flags);
+ match_t *p = match(defs, f, str, op->args.pat, flags);
if (p == NULL) return NULL;
- match_t *m = new(match_t);
+ match_t *m = new_match();
m->start = str;
m->end = p->end;
m->op = op;
- set_owner(p, &m->child);
+ ADD_OWNER(m->child, p);
return m;
}
case VM_HIDE: {
- match_t *p = _match(defs, f, str, op->args.pat, flags);
+ match_t *p = match(defs, f, str, op->args.pat, flags);
if (p == NULL) return NULL;
- match_t *m = new(match_t);
+ match_t *m = new_match();
m->start = str;
m->end = p->end;
m->op = op;
- set_owner(p, &m->child);
+ ADD_OWNER(m->child, p);
return m;
}
case VM_OTHERWISE: {
- match_t *m = _match(defs, f, str, op->args.multiple.first, flags);
- if (m == NULL) m = _match(defs, f, str, op->args.multiple.second, flags);
+ match_t *m = match(defs, f, str, op->args.multiple.first, flags);
+ if (m == NULL) m = match(defs, f, str, op->args.multiple.second, flags);
return m;
}
case VM_CHAIN: {
- match_t *m1 = _match(defs, f, str, op->args.multiple.first, flags);
+ match_t *m1 = match(defs, f, str, op->args.multiple.first, flags);
if (m1 == NULL) return NULL;
match_t *m2;
{ // Push backrefs and run matching, then cleanup
def_t *defs2 = with_backrefs(defs, f, m1);
- m2 = _match(defs2, f, m1->end, op->args.multiple.second, flags);
+ m2 = match(defs2, f, m1->end, op->args.multiple.second, flags);
free_defs(&defs2, defs);
}
if (m2 == NULL) {
- destroy_match(&m1);
+ recycle_if_unused(&m1);
return NULL;
}
- match_t *m = new(match_t);
+ match_t *m = new_match();
m->start = str;
m->end = m2->end;
m->op = op;
- set_owner(m1, &m->child);
- set_owner(m2, &m1->nextsibling);
+ ADD_OWNER(m->child, m1);
+ ADD_OWNER(m1->nextsibling, m2);
return m;
}
case VM_EQUAL: case VM_NOT_EQUAL: {
- match_t *m1 = _match(defs, f, str, op->args.multiple.first, flags);
+ match_t *m1 = match(defs, f, str, op->args.multiple.first, flags);
if (m1 == NULL) return NULL;
// <p1>==<p2> matches iff the text of <p1> matches <p2>
@@ -368,35 +362,35 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
.nlines=1 + get_line_number(f, m1->end)-get_line_number(f, m1->start),
.mmapped=f->mmapped,
};
- match_t *m2 = _match(defs, &inner, str, op->args.multiple.second, flags);
+ match_t *m2 = match(defs, &inner, str, op->args.multiple.second, flags);
if ((m2 == NULL) == (op->type == VM_EQUAL)) {
- destroy_match(&m1);
- if (m2 != NULL) destroy_match(&m2);
+ recycle_if_unused(&m1);
+ if (m2 != NULL) recycle_if_unused(&m2);
return NULL;
}
- match_t *m = new(match_t);
+ match_t *m = new_match();
m->start = m1->start;
m->end = m1->end;
m->op = op;
- set_owner(m1, &m->child);
+ ADD_OWNER(m->child, m1);
if (op->type == VM_EQUAL) {
- set_owner(m2, &m1->nextsibling);
+ ADD_OWNER(m1->nextsibling, m2);
} else {
- destroy_match(&m2);
+ recycle_if_unused(&m2);
}
return m;
}
case VM_REPLACE: {
match_t *p = NULL;
if (op->args.replace.pat) {
- p = _match(defs, f, str, op->args.replace.pat, flags);
+ p = match(defs, f, str, op->args.replace.pat, flags);
if (p == NULL) return NULL;
}
- match_t *m = new(match_t);
+ match_t *m = new_match();
m->start = str;
m->op = op;
if (p) {
- set_owner(p, &m->child);
+ ADD_OWNER(m->child, p);
m->end = p->end;
} else {
m->end = m->start;
@@ -408,12 +402,12 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
check(def != NULL, "Unknown identifier: '%s'", op->args.s);
vm_op_t *ref = def->op;
- vm_op_t fail_op = {
- .type = VM_CANNED,
+ vm_op_t rec_op = {
+ .type = VM_LEFTRECURSION,
.start = ref->start,
.end = ref->end,
.len = 0,
- .args.canned = {
+ .args.leftrec = {
.match = NULL,
.visits = 0,
.at = str,
@@ -424,29 +418,42 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
.namelen = def->namelen,
.name = def->name,
.file = def->file,
- .op = &fail_op,
+ .op = &rec_op,
.next = defs,
};
const char *prev = str;
- match_t *m = _match(&defs2, f, str, ref, flags);
+ match_t *m = match(&defs2, f, str, ref, flags);
if (m == NULL) return NULL;
- while (fail_op.args.canned.visits > 0 && m->end > prev) {
- fail_op.args.canned.visits = 0;
- fail_op.args.canned.match = m;
+ while (rec_op.args.leftrec.visits > 0) {
+ rec_op.args.leftrec.visits = 0;
+ REMOVE_OWNERSHIP(rec_op.args.leftrec.match);
+ ADD_OWNER(rec_op.args.leftrec.match, m);
prev = m->end;
- match_t *m2 = _match(&defs2, f, str, ref, flags);
+ match_t *m2 = match(&defs2, f, str, ref, flags);
if (m2 == NULL) break;
+ if (m2->end <= prev) {
+ recycle_if_unused(&m2);
+ break;
+ }
m = m2;
}
+ if (rec_op.args.leftrec.match) {
+ // Ensure that `m` isn't garbage collected right now, but do
+ // clean up the recursive match result if it's not needed.
+ ++m->refcount;
+ REMOVE_OWNERSHIP(rec_op.args.leftrec.match);
+ --m->refcount;
+ }
+
return m;
}
case VM_BACKREF: {
const char *end = match_backref(str, op, op->args.backref, flags);
if (end == NULL) return NULL;
- match_t *m = new(match_t);
+ match_t *m = new_match();
m->op = op;
m->start = str;
m->end = end;
@@ -473,7 +480,7 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns
if (str[i] != denter || &str[i] >= f->end) return NULL;
}
- match_t *m = new(match_t);
+ match_t *m = new_match();
m->start = start;
m->end = &str[dents];
m->op = op;
@@ -540,14 +547,79 @@ match_t *get_capture(match_t *m, const char **id)
}
//
-// Wrapper function for _match() to kickstart the recursion info.
+// Return a match object which can be used (may be allocated or recycled).
//
-match_t *match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags)
+match_t *new_match(void)
+{
+ match_t *m;
+ if (unused_matches) {
+ m = unused_matches;
+ LL_REMOVE(m);
+ memset(m, 0, sizeof(match_t));
+ } else {
+ m = new(match_t);
+ }
+ // Keep track of the object:
+ LL_PREPEND(in_use_matches, m);
+ return m;
+}
+
+//
+// If the given match is not currently a child member of another match (or
+// otherwise reserved) then put it back in the pool of unused match objects.
+//
+void recycle_if_unused(match_t **at_m)
{
- return _match(defs, f, str, op, flags);
+ match_t *m = *at_m;
+ if (m == NULL) return;
+ if (m->refcount > 0) {
+ *at_m = NULL;
+ return;
+ }
+
+ REMOVE_OWNERSHIP(m->child);
+ REMOVE_OWNERSHIP(m->nextsibling);
+
+ LL_REMOVE(m);
+ memset(m, 0, sizeof(match_t));
+ LL_PREPEND(unused_matches, m);
+ *at_m = NULL;
}
+//
+// Force all match objects into the pool of unused match objects.
+//
+size_t recycle_all_matches(void)
+{
+ size_t count = 0;
+ while (in_use_matches) {
+ match_t *m = in_use_matches;
+ LL_REMOVE(m);
+ LL_PREPEND(unused_matches, m);
+ ++count;
+ }
+ return count;
+}
+
+//
+// Free all match objects in memory.
+//
+size_t free_all_matches(void)
+{
+ size_t count = 0;
+ recycle_all_matches();
+ while (unused_matches) {
+ match_t *m = unused_matches;
+ LL_REMOVE(m);
+ free(m);
+ ++count;
+ }
+ return count;
+}
+
+//
// Deallocate memory associated with an op
+//
void destroy_op(vm_op_t *op)
{
switch (op->type) {
diff --git a/vm.h b/vm.h
index 2127eca..3e7526b 100644
--- a/vm.h
+++ b/vm.h
@@ -8,14 +8,17 @@
#include "types.h"
-__attribute__((nonnull(2,3,4)))
+__attribute__((hot, nonnull(2,3,4)))
match_t *match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags);
__attribute__((nonnull))
-void destroy_match(match_t **m);
-__attribute__((nonnull))
match_t *get_capture(match_t *m, const char **id);
__attribute__((nonnull))
void destroy_op(vm_op_t *op);
+match_t *new_match(void);
+size_t free_all_matches(void);
+__attribute__((nonnull))
+void recycle_if_unused(match_t **at_m);
+size_t recycle_all_matches(void);
#endif
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1