From 3cd61e3abc79a80d38afa9c633c59585aeb0311a Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Thu, 14 Jan 2021 19:21:31 -0800 Subject: Overhaul of memory tracking and left recursion. Added explanation doc for left recursion and fixed all visible memory leaks. --- Makefile | 5 +- bp.c | 41 ++++++--- file_loader.c | 2 +- file_loader.h | 2 +- left-recursion.md | 68 ++++++++++++++ types.h | 8 +- vm.c | 270 ++++++++++++++++++++++++++++++++++-------------------- vm.h | 9 +- 8 files changed, 281 insertions(+), 124 deletions(-) create mode 100644 left-recursion.md diff --git a/Makefile b/Makefile index 40a57ea..b73d68d 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,7 @@ PREFIX=/usr/local SYSCONFDIR=/etc CFLAGS=-std=c99 -Werror -D_XOPEN_SOURCE=700 -D_GNU_SOURCE -D_POSIX_C_SOURCE=200809L CWARN=-Wall -Wpedantic -Wextra -Wsign-conversion -Wtype-limits -Wunused-result +EXTRA_FLAGS= G= O=-O3 @@ -13,10 +14,10 @@ OBJFILES=$(CFILES:.c=.o) all: $(NAME) %.o: %.c %.h types.h - $(CC) -c $(CFLAGS) $(CWARN) $(G) $(O) -o $@ $< + $(CC) -c $(CFLAGS) $(EXTRA_CFLAGS) $(CWARN) $(G) $(O) -o $@ $< $(NAME): $(OBJFILES) $(NAME).c - $(CC) $(CFLAGS) $(CWARN) $(G) $(O) -o $@ $(OBJFILES) $(NAME).c + $(CC) $(CFLAGS) $(EXTRA_CFLAGS) $(CWARN) $(G) $(O) -o $@ $(OBJFILES) $(NAME).c clean: rm -f $(NAME) $(OBJFILES) diff --git a/bp.c b/bp.c index 57c8135..51d34ca 100644 --- a/bp.c +++ b/bp.c @@ -119,9 +119,8 @@ static int process_file(def_t *defs, const char *filename, vm_op_t *pattern, uns } } - if (m != NULL) - destroy_match(&m); - + recycle_if_unused(&m); + check(recycle_all_matches() == 0, "Memory leak: there should no longer be any matches in use at this point."); destroy_file(&f); return success; @@ -134,12 +133,16 @@ int main(int argc, char *argv[]) unsigned int flags = 0; char *flag = NULL; char path[PATH_MAX] = {0}; - const char *rule = "find-all"; + const char *mode = "find-all"; def_t *defs = NULL; file_t *loaded_files = NULL; + // Define an opcode that is just a reference to the rule `pattern` + file_t *pat_file = spoof_file(&loaded_files, "", "pattern"); + vm_op_t *pattern = bp_pattern(pat_file, pat_file->contents); + // Load builtins: if (access("/etc/xdg/bp/builtins.bp", R_OK) != -1) { file_t *f = load_file(&loaded_files, "/etc/xdg/bp/builtins.bp"); @@ -176,13 +179,11 @@ int main(int argc, char *argv[]) } else if (FLAG("--replace") || FLAG("-r")) { // TODO: spoof file as sprintf("pattern => '%s'", flag) // except that would require handling edge cases like quotation marks etc. - file_t *pat_file = spoof_file(&loaded_files, "", "pattern"); - vm_op_t *patref = bp_pattern(pat_file, pat_file->contents); file_t *replace_file = spoof_file(&loaded_files, "", flag); - vm_op_t *rep = bp_replacement(replace_file, patref, replace_file->contents); + vm_op_t *rep = bp_replacement(replace_file, pattern, replace_file->contents); check(rep, "Replacement failed to compile: %s", flag); defs = with_def(defs, replace_file, strlen("replacement"), "replacement", rep); - rule = "replace-all"; + mode = "replace-all"; } else if (FLAG("--grammar") || FLAG("-g")) { file_t *f = load_file(&loaded_files, flag); if (f == NULL) { @@ -229,7 +230,7 @@ int main(int argc, char *argv[]) defs = with_def(defs, arg_file, strlen("pattern"), "pattern", p); ++npatterns; } else if (FLAG("--mode") || FLAG("-m")) { - rule = flag; + mode = flag; } else if (argv[i][0] == '-' && argv[i][1] && argv[i][1] != '-') { // single-char flags for (char *c = &argv[i][1]; *c; ++c) { switch (*c) { @@ -269,16 +270,20 @@ int main(int argc, char *argv[]) print_options |= PRINT_COLOR | PRINT_LINE_NUMBERS; } - def_t *pattern_def = lookup(defs, rule); - check(pattern_def != NULL, "No such rule: '%s'", rule); - vm_op_t *pattern = pattern_def->op; + // Define an opcode that is just a reference to the overarching mode (e.g. find-all) + if (lookup(defs, mode) == NULL) { + printf("The mode '%s' is not defined.\n", mode); + return 1; + } + file_t *mode_file = spoof_file(&loaded_files, "", mode); + vm_op_t *mode_op = bp_pattern(mode_file, mode_file->contents); int found = 0; if (flags & BP_JSON) printf("["); if (i < argc) { // Files pass in as command line args: for (int nfiles = 0; i < argc; nfiles++, i++) { - found += process_file(defs, argv[i], pattern, flags); + found += process_file(defs, argv[i], mode_op, flags); } } else if (isatty(STDIN_FILENO)) { // No files, no piped in input, so use * **/*: @@ -286,21 +291,27 @@ int main(int argc, char *argv[]) glob("*", 0, NULL, &globbuf); glob("**/*", GLOB_APPEND, NULL, &globbuf); for (size_t i = 0; i < globbuf.gl_pathc; i++) { - found += process_file(defs, globbuf.gl_pathv[i], pattern, flags); + found += process_file(defs, globbuf.gl_pathv[i], mode_op, flags); } globfree(&globbuf); } else { // Piped in input: - found += process_file(defs, NULL, pattern, flags); + found += process_file(defs, NULL, mode_op, flags); } if (flags & BP_JSON) printf("]\n"); +#ifdef FREE_ON_EXIT + // This code frees up all residual heap-allocated memory. Since the program + // is about to exit, this step is unnecessary. However, it is useful for + // tracking down memory leaks. free_defs(&defs, NULL); while (loaded_files) { file_t *next = loaded_files->next; destroy_file(&loaded_files); loaded_files = next; } + free_all_matches(); +#endif return (found > 0) ? 0 : 1; } diff --git a/file_loader.c b/file_loader.c index 560a340..48e81e4 100644 --- a/file_loader.c +++ b/file_loader.c @@ -89,7 +89,7 @@ file_t *load_file(file_t **files, const char *filename) // // Create a virtual file from a string. // -file_t *spoof_file(file_t **files, const char *filename, char *text) +file_t *spoof_file(file_t **files, const char *filename, const char *text) { if (filename == NULL) filename = ""; file_t *f = new(file_t); diff --git a/file_loader.h b/file_loader.h index 85bb920..91480b2 100644 --- a/file_loader.h +++ b/file_loader.h @@ -19,7 +19,7 @@ typedef struct file_s { file_t *load_file(file_t **files, const char *filename); __attribute__((nonnull(3), returns_nonnull)) -file_t *spoof_file(file_t **files, const char *filename, char *text); +file_t *spoof_file(file_t **files, const char *filename, const char *text); __attribute__((nonnull)) void intern_file(file_t *f); __attribute__((nonnull)) diff --git a/left-recursion.md b/left-recursion.md new file mode 100644 index 0000000..3e65e1c --- /dev/null +++ b/left-recursion.md @@ -0,0 +1,68 @@ +# Left Recursion in BP + +Left recursion presents a special difficulty for parsing expression grammars. +A regular recursive rule looks like this: + +``` +rec: "{" *rec "}" +``` + +This is a relatively simple case to handle, because we're always making forward +progress. In other words, calling `match(, str)` might call +`match(, str + 1)` but it will never recursively call the same function +with the same arguments: `match(, str)`. + +Left-recursive rules, on the other hand, are rules that may attempt to match +themselves at the same position. For example: + +``` +laugh: (laugh "ha") / "Ha" +``` + +The rule above is functionally equivalent to `laugh: "Ha" (*"ha")`, but it's +written in a recursive style. It should be obvious by inspection that the +string `Hahaha` matches both versions of the rule, but a naive implementation +of `match()` would cause `match(, str)` to call `match(, str)` +before doing anything else, which results in infinite recursion and a stack +overflow. One option is to simply detect left recursion and cause a compilation +error (this is what LPEG does). However, it is possible to actually match left +recursive rules without overflowing the stack. + +## Matching Left Recursion + +The solution used in BP is the following: + +1. Whenever matching a named rule, `foo`, against string `str`, first get the + originally defined pattern for that rule. Call this + `foo-original-definition`. +2. Temporarily define `match(, str) := signal left recursion and return + Fail` (In other words, `` is temporarily defined to signal left + recursion and then fail to match if matching against the string `str`.) +3. Run `result := match(, str)`. +4. If the match failed, then return failure. +5. If the match was successful and did not trigger left recursion, return `result`. +6. If the match was successful, but did trigger left recursion, then: + + a. If `result` is not longer than any previous `result`, stop looping and + return the longest `result` to avoid an infinite loop. + b. Otherwise: temporarily define `match(, str) := signal left recursion and return ` + c. Go to step 3. + +This handles left recursion as a simple loop and successfully matches left +recursive patterns, even ones that are indirectly or nontrivially left +recursive. + +## Example + +Consider the rule `laugh: laugh "ha" / "Ha"` being applied to the input text `"Hahaha!"`: + +|Definition of `match(, "Hahaha!")` | Result of `match(, "Hahaha!")` | +|------------------------------------------|---------------------------------------------------| +|`Fail` | `Match{"Ha"}` | +|`Match{"Ha"}` | `Match{"Haha"}` | +|`Match{"Haha"}` | `Match{"Hahaha"}` | +|`Match{"Hahaha"}` | `Match{"Ha"}` (Forward progress stops being made) | + +As you can see in this example, each successive iteration builds up a final +match incrementally until progress is no longer being made. At that point, +the longest match is returned: `Match{"Hahaha"}`. diff --git a/types.h b/types.h index b8c2b68..30a0e9e 100644 --- a/types.h +++ b/types.h @@ -39,7 +39,7 @@ enum VMOpcode { VM_REF, VM_BACKREF, VM_NODENT, - VM_CANNED, + VM_LEFTRECURSION, }; struct match_s; // forward declared to resolve circular struct defs @@ -80,7 +80,7 @@ typedef struct vm_op_s { unsigned int visits; const char *at; struct vm_op_s *fallback; - } canned; + } leftrec; struct vm_op_s *pat; } args; } vm_op_t; @@ -93,7 +93,9 @@ typedef struct match_s { const char *start, *end; struct match_s *child, *nextsibling; vm_op_t *op; - unsigned int refcount; + // Intrusive linked list nodes for garbage collection: + struct match_s **atme, *next; + int refcount; } match_t; // diff --git a/vm.c b/vm.c index 68c793a..ff62b4d 100644 --- a/vm.c +++ b/vm.c @@ -12,25 +12,31 @@ #include "utils.h" #include "vm.h" +// Linked list operations: +#define LL_PREPEND(head, node) do { (node)->atme = &(head); (node)->next = head; if (head) (head)->atme = &(node)->next; head = node; } while(0) +#define LL_REMOVE(node) do { *(node)->atme = (node)->next; if ((node)->next) (node)->next->atme = (node)->atme; } while(0) + +// Refcounting ownership-setting macros: +#define ADD_OWNER(owner, m) do { owner = m; ++(m)->refcount; } while(0) +#define REMOVE_OWNERSHIP(owner) do { if (owner) { --(owner)->refcount; recycle_if_unused(&(owner)); owner = NULL; } } while(0) + +// New match objects are either recycled from unused match objects or allocated +// from the heap. While it is in use, the match object is stored in the +// `in_use_matches` linked list. Once it is no longer needed, it is moved to +// the `unused_matches` linked list so it can be reused without the need for +// additional calls to malloc/free. Thus, it is an invariant that every match +// object is in one of these two lists: +static match_t *in_use_matches = NULL, *unused_matches = NULL; + __attribute__((nonnull, pure)) static inline const char *next_char(file_t *f, const char *str); __attribute__((nonnull)) static const char *match_backref(const char *str, vm_op_t *op, match_t *cap, unsigned int flags); -__attribute__((hot, nonnull(2,3,4))) -static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags); __attribute__((nonnull)) static match_t *get_capture_by_num(match_t *m, int *n); __attribute__((nonnull, pure)) static match_t *get_capture_by_name(match_t *m, const char *name); -static inline void set_owner(match_t *m, match_t **owner) -{ - check(owner != &m->child, "Circular ownership"); - check(owner != &m->nextsibling, "Circular ownership"); - *owner = m; - ++m->refcount; -} - // // Return the location of the next character or UTF8 codepoint. // (i.e. skip forward one codepoint at a time, not one byte at a time) @@ -51,22 +57,6 @@ static inline const char *next_char(file_t *f, const char *str) return str; } -// -// Recursively deallocate a match object and set to NULL -// -void destroy_match(match_t **m) -{ - if (*m == NULL) return; - if (--(*m)->refcount > 0) { - *m = NULL; - return; - } - destroy_match(&((*m)->child)); - destroy_match(&((*m)->nextsibling)); - free(*m); - *m = NULL; -} - // // Attempt to match text against a previously captured value. // Return the character position after the backref has matched, or NULL if no match has occurred. @@ -131,22 +121,26 @@ static const char *match_backref(const char *str, vm_op_t *op, match_t *cap, uns // a match struct, or NULL if no match is found. // The returned value should be free()'d to avoid memory leaking. // -static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags) +match_t *match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags) { switch (op->type) { - case VM_CANNED: { - if (str == op->args.canned.at) { - ++op->args.canned.visits; - // TODO: deep copy? - return op->args.canned.match; + case VM_LEFTRECURSION: { + // Left recursion occurs when a pattern directly or indirectly + // invokes itself at the same position in the text. It's handled as + // a special case, but if a pattern invokes itself at a later + // point, it can be handled with normal recursion. + // See: left-recursion.md for more details. + if (str == op->args.leftrec.at) { + ++op->args.leftrec.visits; + return op->args.leftrec.match; } else { - return _match(defs, f, str, op->args.canned.fallback, flags); + return match(defs, f, str, op->args.leftrec.fallback, flags); } } case VM_ANYCHAR: { if (str >= f->end || *str == '\n') return NULL; - match_t *m = new(match_t); + match_t *m = new_match(); m->op = op; m->start = str; m->end = next_char(f, str); @@ -157,7 +151,7 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns if ((flags & BP_IGNORECASE) ? memicmp(str, op->args.s, (size_t)op->len) != 0 : memcmp(str, op->args.s, (size_t)op->len) != 0) return NULL; - match_t *m = new(match_t); + match_t *m = new_match(); m->op = op; m->start = str; m->end = str + op->len; @@ -167,26 +161,26 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns if (str >= f->end) return NULL; if ((unsigned char)*str < op->args.range.low || (unsigned char)*str > op->args.range.high) return NULL; - match_t *m = new(match_t); + match_t *m = new_match(); m->op = op; m->start = str; m->end = str + 1; return m; } case VM_NOT: { - match_t *m = _match(defs, f, str, op->args.pat, flags); + match_t *m = match(defs, f, str, op->args.pat, flags); if (m != NULL) { - destroy_match(&m); + recycle_if_unused(&m); return NULL; } - m = new(match_t); + m = new_match(); m->op = op; m->start = str; m->end = str; return m; } case VM_UPTO_AND: { - match_t *m = new(match_t); + match_t *m = new_match(); m->start = str; m->op = op; @@ -201,9 +195,9 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns for (const char *prev = NULL; prev < str; ) { prev = str; if (pat) { - match_t *p = _match(defs, f, str, pat, flags); + match_t *p = match(defs, f, str, pat, flags); if (p != NULL) { - set_owner(p, dest); + ADD_OWNER(*dest, p); m->end = p->end; return m; } @@ -212,9 +206,9 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns return m; } if (skip) { - match_t *s = _match(defs, f, str, skip, flags); + match_t *s = match(defs, f, str, skip, flags); if (s != NULL) { - set_owner(s, dest); + ADD_OWNER(*dest, s); dest = &s->nextsibling; str = s->end; continue; @@ -226,11 +220,11 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns if (str < f->end && *str != '\n') str = next_char(f, str); } - destroy_match(&m); + recycle_if_unused(&m); return NULL; } case VM_REPEAT: { - match_t *m = new(match_t); + match_t *m = new_match(); m->start = str; m->end = str; m->op = op; @@ -243,14 +237,14 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns // Separator match_t *sep = NULL; if (op->args.repetitions.sep != NULL && reps > 0) { - sep = _match(defs, f, str, op->args.repetitions.sep, flags); + sep = match(defs, f, str, op->args.repetitions.sep, flags); if (sep == NULL) break; str = sep->end; } - match_t *p = _match(defs, f, str, op->args.repetitions.repeat_pat, flags); + match_t *p = match(defs, f, str, op->args.repetitions.repeat_pat, flags); if (p == NULL) { str = start; - destroy_match(&sep); + recycle_if_unused(&sep); break; } if (p->end == start && reps > 0) { @@ -260,8 +254,8 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns // loop either. We know that this will continue to loop // until reps==max, so let's just cut to the chase instead // of looping infinitely. - destroy_match(&sep); - destroy_match(&p); + recycle_if_unused(&sep); + recycle_if_unused(&p); if (op->args.repetitions.max == -1) reps = ~(size_t)0; else @@ -269,16 +263,16 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns break; } if (sep) { - set_owner(sep, dest); + ADD_OWNER(*dest, sep); dest = &sep->nextsibling; } - set_owner(p, dest); + ADD_OWNER(*dest, p); dest = &p->nextsibling; str = p->end; } if (reps < (size_t)op->args.repetitions.min) { - destroy_match(&m); + recycle_if_unused(&m); return NULL; } m->end = str; @@ -288,75 +282,75 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns ssize_t backtrack = op->args.pat->len; check(backtrack != -1, "'<' is only allowed for fixed-length operations"); if (str - backtrack < f->contents) return NULL; - match_t *before = _match(defs, f, str - backtrack, op->args.pat, flags); + match_t *before = match(defs, f, str - backtrack, op->args.pat, flags); if (before == NULL) return NULL; - match_t *m = new(match_t); + match_t *m = new_match(); m->start = str; m->end = str; m->op = op; - set_owner(before, &m->child); + ADD_OWNER(m->child, before); return m; } case VM_BEFORE: { - match_t *after = _match(defs, f, str, op->args.pat, flags); + match_t *after = match(defs, f, str, op->args.pat, flags); if (after == NULL) return NULL; - match_t *m = new(match_t); + match_t *m = new_match(); m->start = str; m->end = str; m->op = op; - set_owner(after, &m->child); + ADD_OWNER(m->child, after); return m; } case VM_CAPTURE: { - match_t *p = _match(defs, f, str, op->args.pat, flags); + match_t *p = match(defs, f, str, op->args.pat, flags); if (p == NULL) return NULL; - match_t *m = new(match_t); + match_t *m = new_match(); m->start = str; m->end = p->end; m->op = op; - set_owner(p, &m->child); + ADD_OWNER(m->child, p); return m; } case VM_HIDE: { - match_t *p = _match(defs, f, str, op->args.pat, flags); + match_t *p = match(defs, f, str, op->args.pat, flags); if (p == NULL) return NULL; - match_t *m = new(match_t); + match_t *m = new_match(); m->start = str; m->end = p->end; m->op = op; - set_owner(p, &m->child); + ADD_OWNER(m->child, p); return m; } case VM_OTHERWISE: { - match_t *m = _match(defs, f, str, op->args.multiple.first, flags); - if (m == NULL) m = _match(defs, f, str, op->args.multiple.second, flags); + match_t *m = match(defs, f, str, op->args.multiple.first, flags); + if (m == NULL) m = match(defs, f, str, op->args.multiple.second, flags); return m; } case VM_CHAIN: { - match_t *m1 = _match(defs, f, str, op->args.multiple.first, flags); + match_t *m1 = match(defs, f, str, op->args.multiple.first, flags); if (m1 == NULL) return NULL; match_t *m2; { // Push backrefs and run matching, then cleanup def_t *defs2 = with_backrefs(defs, f, m1); - m2 = _match(defs2, f, m1->end, op->args.multiple.second, flags); + m2 = match(defs2, f, m1->end, op->args.multiple.second, flags); free_defs(&defs2, defs); } if (m2 == NULL) { - destroy_match(&m1); + recycle_if_unused(&m1); return NULL; } - match_t *m = new(match_t); + match_t *m = new_match(); m->start = str; m->end = m2->end; m->op = op; - set_owner(m1, &m->child); - set_owner(m2, &m1->nextsibling); + ADD_OWNER(m->child, m1); + ADD_OWNER(m1->nextsibling, m2); return m; } case VM_EQUAL: case VM_NOT_EQUAL: { - match_t *m1 = _match(defs, f, str, op->args.multiple.first, flags); + match_t *m1 = match(defs, f, str, op->args.multiple.first, flags); if (m1 == NULL) return NULL; // == matches iff the text of matches @@ -368,35 +362,35 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns .nlines=1 + get_line_number(f, m1->end)-get_line_number(f, m1->start), .mmapped=f->mmapped, }; - match_t *m2 = _match(defs, &inner, str, op->args.multiple.second, flags); + match_t *m2 = match(defs, &inner, str, op->args.multiple.second, flags); if ((m2 == NULL) == (op->type == VM_EQUAL)) { - destroy_match(&m1); - if (m2 != NULL) destroy_match(&m2); + recycle_if_unused(&m1); + if (m2 != NULL) recycle_if_unused(&m2); return NULL; } - match_t *m = new(match_t); + match_t *m = new_match(); m->start = m1->start; m->end = m1->end; m->op = op; - set_owner(m1, &m->child); + ADD_OWNER(m->child, m1); if (op->type == VM_EQUAL) { - set_owner(m2, &m1->nextsibling); + ADD_OWNER(m1->nextsibling, m2); } else { - destroy_match(&m2); + recycle_if_unused(&m2); } return m; } case VM_REPLACE: { match_t *p = NULL; if (op->args.replace.pat) { - p = _match(defs, f, str, op->args.replace.pat, flags); + p = match(defs, f, str, op->args.replace.pat, flags); if (p == NULL) return NULL; } - match_t *m = new(match_t); + match_t *m = new_match(); m->start = str; m->op = op; if (p) { - set_owner(p, &m->child); + ADD_OWNER(m->child, p); m->end = p->end; } else { m->end = m->start; @@ -408,12 +402,12 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns check(def != NULL, "Unknown identifier: '%s'", op->args.s); vm_op_t *ref = def->op; - vm_op_t fail_op = { - .type = VM_CANNED, + vm_op_t rec_op = { + .type = VM_LEFTRECURSION, .start = ref->start, .end = ref->end, .len = 0, - .args.canned = { + .args.leftrec = { .match = NULL, .visits = 0, .at = str, @@ -424,29 +418,42 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns .namelen = def->namelen, .name = def->name, .file = def->file, - .op = &fail_op, + .op = &rec_op, .next = defs, }; const char *prev = str; - match_t *m = _match(&defs2, f, str, ref, flags); + match_t *m = match(&defs2, f, str, ref, flags); if (m == NULL) return NULL; - while (fail_op.args.canned.visits > 0 && m->end > prev) { - fail_op.args.canned.visits = 0; - fail_op.args.canned.match = m; + while (rec_op.args.leftrec.visits > 0) { + rec_op.args.leftrec.visits = 0; + REMOVE_OWNERSHIP(rec_op.args.leftrec.match); + ADD_OWNER(rec_op.args.leftrec.match, m); prev = m->end; - match_t *m2 = _match(&defs2, f, str, ref, flags); + match_t *m2 = match(&defs2, f, str, ref, flags); if (m2 == NULL) break; + if (m2->end <= prev) { + recycle_if_unused(&m2); + break; + } m = m2; } + if (rec_op.args.leftrec.match) { + // Ensure that `m` isn't garbage collected right now, but do + // clean up the recursive match result if it's not needed. + ++m->refcount; + REMOVE_OWNERSHIP(rec_op.args.leftrec.match); + --m->refcount; + } + return m; } case VM_BACKREF: { const char *end = match_backref(str, op, op->args.backref, flags); if (end == NULL) return NULL; - match_t *m = new(match_t); + match_t *m = new_match(); m->op = op; m->start = str; m->end = end; @@ -473,7 +480,7 @@ static match_t *_match(def_t *defs, file_t *f, const char *str, vm_op_t *op, uns if (str[i] != denter || &str[i] >= f->end) return NULL; } - match_t *m = new(match_t); + match_t *m = new_match(); m->start = start; m->end = &str[dents]; m->op = op; @@ -540,14 +547,79 @@ match_t *get_capture(match_t *m, const char **id) } // -// Wrapper function for _match() to kickstart the recursion info. +// Return a match object which can be used (may be allocated or recycled). // -match_t *match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags) +match_t *new_match(void) +{ + match_t *m; + if (unused_matches) { + m = unused_matches; + LL_REMOVE(m); + memset(m, 0, sizeof(match_t)); + } else { + m = new(match_t); + } + // Keep track of the object: + LL_PREPEND(in_use_matches, m); + return m; +} + +// +// If the given match is not currently a child member of another match (or +// otherwise reserved) then put it back in the pool of unused match objects. +// +void recycle_if_unused(match_t **at_m) { - return _match(defs, f, str, op, flags); + match_t *m = *at_m; + if (m == NULL) return; + if (m->refcount > 0) { + *at_m = NULL; + return; + } + + REMOVE_OWNERSHIP(m->child); + REMOVE_OWNERSHIP(m->nextsibling); + + LL_REMOVE(m); + memset(m, 0, sizeof(match_t)); + LL_PREPEND(unused_matches, m); + *at_m = NULL; } +// +// Force all match objects into the pool of unused match objects. +// +size_t recycle_all_matches(void) +{ + size_t count = 0; + while (in_use_matches) { + match_t *m = in_use_matches; + LL_REMOVE(m); + LL_PREPEND(unused_matches, m); + ++count; + } + return count; +} + +// +// Free all match objects in memory. +// +size_t free_all_matches(void) +{ + size_t count = 0; + recycle_all_matches(); + while (unused_matches) { + match_t *m = unused_matches; + LL_REMOVE(m); + free(m); + ++count; + } + return count; +} + +// // Deallocate memory associated with an op +// void destroy_op(vm_op_t *op) { switch (op->type) { diff --git a/vm.h b/vm.h index 2127eca..3e7526b 100644 --- a/vm.h +++ b/vm.h @@ -8,14 +8,17 @@ #include "types.h" -__attribute__((nonnull(2,3,4))) +__attribute__((hot, nonnull(2,3,4))) match_t *match(def_t *defs, file_t *f, const char *str, vm_op_t *op, unsigned int flags); __attribute__((nonnull)) -void destroy_match(match_t **m); -__attribute__((nonnull)) match_t *get_capture(match_t *m, const char **id); __attribute__((nonnull)) void destroy_op(vm_op_t *op); +match_t *new_match(void); +size_t free_all_matches(void); +__attribute__((nonnull)) +void recycle_if_unused(match_t **at_m); +size_t recycle_all_matches(void); #endif // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 -- cgit v1.2.3