diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2021-07-26 20:59:45 -0700 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2021-07-26 20:59:45 -0700 |
| commit | f23b9bc6375797d03dee54a31fcaa634f8376975 (patch) | |
| tree | 624128655eeb20d68098e8c772d9d3ac77f1ee1e | |
| parent | d7030709801cde01739850a85f156d181554f520 (diff) | |
Introduced cache to greatly speed up many use cases
| -rw-r--r-- | bp.c | 35 | ||||
| -rw-r--r-- | definitions.c | 16 | ||||
| -rw-r--r-- | definitions.h | 2 | ||||
| -rw-r--r-- | grammars/c++.bp | 4 | ||||
| -rw-r--r-- | grammars/c.bp | 4 | ||||
| -rw-r--r-- | json.c | 10 | ||||
| -rw-r--r-- | match.c | 337 | ||||
| -rw-r--r-- | match.h | 1 | ||||
| -rw-r--r-- | matchviz.c | 9 | ||||
| -rw-r--r-- | pattern.c | 29 | ||||
| -rw-r--r-- | print.c | 9 | ||||
| -rw-r--r-- | types.h | 59 |
12 files changed, 349 insertions, 166 deletions
@@ -55,7 +55,7 @@ static const char *usage = ( " -g --grammar <grammar-file> use the specified file as a grammar"); // Used as a heuristic to check if a file is binary or text: -#define CHECK_FIRST_N_BYTES 128 +#define CHECK_FIRST_N_BYTES 256 // Flag-configurable options: typedef enum { CONFIRM_ASK, CONFIRM_ALL, CONFIRM_NONE } confirm_t; @@ -167,13 +167,15 @@ static int is_text_file(const char *filename) static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern) { static int matches = 0; - for (match_t *m = NULL; (m = next_match(defs, f, m, pattern, options.skip, options.ignorecase)); ) { + match_t *m = NULL; + while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) { if (++matches > 1) printf(",\n"); printf("{\"filename\":\"%s\",\"match\":", f->filename); json_match(f->start, m, options.verbose); printf("}"); } + if (m) recycle_if_unused(&m); return matches; } @@ -183,7 +185,8 @@ static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern) static int explain_matches(def_t *defs, file_t *f, pat_t *pattern) { int matches = 0; - for (match_t *m = NULL; (m = next_match(defs, f, m, pattern, options.skip, options.ignorecase)); ) { + match_t *m = NULL; + while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) { if (++matches == 1) { fprint_filename(stdout, f->filename); } else { @@ -191,6 +194,7 @@ static int explain_matches(def_t *defs, file_t *f, pat_t *pattern) } visualize_match(m); } + if (m) recycle_if_unused(&m); return matches; } @@ -238,7 +242,7 @@ static void confirm_replacements(file_t *f, match_t *m, confirm_t *confirm) { // Print the original printer_t pr = {.file = f, .context_lines = options.context_lines, .use_color = true, .print_line_numbers = true}; - print_match(tty_out, &pr, m->child); + print_match(tty_out, &pr, m->children[0]); // Print trailing context lines: print_match(tty_out, &pr, NULL); } @@ -272,10 +276,8 @@ static void confirm_replacements(file_t *f, match_t *m, confirm_t *confirm) } check_children: - if (m->child) - confirm_replacements(f, m->child, confirm); - if (m->nextsibling) - confirm_replacements(f, m->nextsibling, confirm); + for (int i = 0; m->children && m->children[i]; i++) + confirm_replacements(f, m->children[i], confirm); } // @@ -301,7 +303,8 @@ static int inplace_modify_file(def_t *defs, file_t *f, pat_t *pattern) FILE *dest = NULL; // Lazy-open this on the first match int matches = 0; confirm_t confirm_file = options.confirm; - for (match_t *m = NULL; (m = next_match(defs, f, m, pattern, options.skip, options.ignorecase)); ) { + match_t *m = NULL; + while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) { ++matches; printer_t err_pr = {.file = f, .context_lines = 1, .use_color = true, .print_line_numbers = true}; if (print_errors(&err_pr, m) > 0) @@ -319,6 +322,7 @@ static int inplace_modify_file(def_t *defs, file_t *f, pat_t *pattern) confirm_replacements(f, m, &confirm_file); print_match(dest, &pr, m); } + if (m) recycle_if_unused(&m); if (dest) { // Print trailing context lines: @@ -349,7 +353,8 @@ static int print_matches(def_t *defs, file_t *f, pat_t *pattern) .print_line_numbers = options.format == FORMAT_FANCY, }; - for (match_t *m = NULL; (m = next_match(defs, f, m, pattern, options.skip, options.ignorecase)); ) { + match_t *m = NULL; + while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) { printer_t err_pr = {.file = f, .context_lines = 1, .use_color = true, .print_line_numbers = true}; if (print_errors(&err_pr, m) > 0) exit(EXIT_FAILURE); @@ -360,6 +365,7 @@ static int print_matches(def_t *defs, file_t *f, pat_t *pattern) } print_match(stdout, &pr, m); } + if (m) recycle_if_unused(&m); if (matches > 0) { // Print trailing context lines: @@ -399,10 +405,12 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern) } else { matches += print_matches(defs, f, pattern); } + fflush(stdout); + cache_destroy(); #ifdef DEBUG_HEAP if (recycle_all_matches() != 0) - errx(EXIT_FAILURE, "Memory leak: there should no longer be any matches in use at this point."); + fprintf(stderr, "\033[33;1mMemory leak: there should no longer be any matches in use at this point.\033[0m\n"); #endif destroy_file(&f); (void)fflush(stdout); @@ -664,13 +672,14 @@ int main(int argc, char *argv[]) // This code frees up all residual heap-allocated memory. Since the program // is about to exit, this step is unnecessary. However, it is useful for // tracking down memory leaks. - free_defs(&defs, NULL); + cache_destroy(); + free_all_matches(); + defs = free_defs(defs, NULL); while (loaded_files) { file_t *next = loaded_files->next; destroy_file(&loaded_files); loaded_files = next; } - free_all_matches(); #endif exit(found > 0 ? EXIT_SUCCESS : EXIT_FAILURE); diff --git a/definitions.c b/definitions.c index 18bd612..c85e227 100644 --- a/definitions.c +++ b/definitions.c @@ -11,12 +11,15 @@ #include "pattern.h" #include "utils.h" +static size_t next_id = 0; + // // Return a new list of definitions with one added to the front // def_t *with_def(def_t *defs, size_t namelen, const char *name, pat_t *pat) { def_t *def = new(def_t); + def->id = next_id++; def->next = defs; def->namelen = namelen; def->name = name; @@ -68,14 +71,15 @@ def_t *lookup(def_t *defs, size_t namelen, const char *name) // // Free all the given definitions up till (but not including) `stop` // -void free_defs(def_t **defs, def_t *stop) +def_t *free_defs(def_t *defs, def_t *stop) { - while (*defs != stop && *defs != NULL) { - def_t *next = (*defs)->next; - (*defs)->next = NULL; - free(*defs); - (*defs) = next; + while (defs != stop && defs != NULL) { + def_t *next = defs->next; + defs->next = NULL; + free(defs); + defs = next; } + return defs; } // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 diff --git a/definitions.h b/definitions.h index d5b108d..5788ac0 100644 --- a/definitions.h +++ b/definitions.h @@ -14,7 +14,7 @@ def_t *load_grammar(def_t *defs, file_t *f); __attribute__((pure, nonnull(3))) def_t *lookup(def_t *defs, size_t namelen, const char *name); __attribute__((nonnull(1))) -void free_defs(def_t **defs, def_t *stop); +def_t *free_defs(def_t *defs, def_t *stop); #endif // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 diff --git a/grammars/c++.bp b/grammars/c++.bp index 30f47aa..333e9f3 100644 --- a/grammars/c++.bp +++ b/grammars/c++.bp @@ -24,5 +24,5 @@ keyword: "unsigned" / "using" / "virtual" / "void" / "volatile" / "wchar_t" / "while" / "xor" / "xor_eq" function-def: ^_ 2+(id / keyword / anglebraces / `*) % __ parens (__`; / >(__`{)) function: function-def __ braces -macro: ^"#define"} ..$ *(<`\ \n..$) -import: ^("#include"}/"#import"}) __ (string / `<..`>) +macro: ^"#define"\b ..$ *(<`\ \n..$) +import: ^`#("include"/"import")\b __ (string / `<..`>) diff --git a/grammars/c.bp b/grammars/c.bp index 36f580b..015e5ed 100644 --- a/grammars/c.bp +++ b/grammars/c.bp @@ -17,5 +17,5 @@ keyword: "volatile" / "while" function-def: ^_ 2+(id / keyword / `*) % __ parens (__`; / >(__`{)) function: function-def __ braces -macro: ^"#define"} ..$ *(<`\ \n..$) -import: ^"#include"} __ (string / `<..`>) +macro: ^"#define"\b ..$ *(<`\ \n..$) +import: ^"#include"\b __ (string / `<..`>) @@ -18,9 +18,8 @@ static int _json_match(const char *text, match_t *m, int comma, bool verbose) { if (!verbose) { if (m->pat->type != BP_REF && m->pat->type != BP_ERROR) { - for (match_t *child = m->child; child; child = child->nextsibling) { - comma |= _json_match(text, child, comma, verbose); - } + for (int i = 0; m->children && m->children[i]; i++) + comma |= _json_match(text, m->children[i], comma, verbose); return comma; } } @@ -39,9 +38,8 @@ static int _json_match(const char *text, match_t *m, int comma, bool verbose) } printf("\",\"start\":%ld,\"end\":%ld,\"children\":[", m->start - text, m->end - text); - for (match_t *child = m->child; child; child = child->nextsibling) { - comma |= _json_match(text, child, comma, verbose); - } + for (int i = 0; m->children && m->children[i]; i++) + comma |= _json_match(text, m->children[i], comma, verbose); printf("]}"); return 1; } @@ -4,6 +4,7 @@ #include <ctype.h> #include <err.h> +#include <limits.h> #include <stdbool.h> #include <stdio.h> #include <stdlib.h> @@ -18,14 +19,10 @@ #ifdef DEBUG_HEAP // Doubly-linked list operations: -#define DLL_PREPEND(head, node) do { (node)->atme = &(head); (node)->next = head; if (head) (head)->atme = &(node)->next; head = node; } while(false) -#define DLL_REMOVE(node) do { *(node)->atme = (node)->next; if ((node)->next) (node)->next->atme = (node)->atme; } while(false) +#define DLL_PREPEND(head, node) do { (node)->home = &(head); (node)->next = head; if (head) (head)->home = &(node)->next; head = node; } while(false) +#define DLL_REMOVE(node) do { *(node)->home = (node)->next; if ((node)->next) (node)->next->home = (node)->home; } while(false) #endif -// Refcounting ownership-setting macros: -#define ADD_OWNER(owner, m) do { owner = m; ++(m)->refcount; } while(false) -#define REMOVE_OWNERSHIP(owner) do { if (owner) { --(owner)->refcount; recycle_if_unused(&(owner)); owner = NULL; } } while(false) - // New match objects are either recycled from unused match objects or allocated // from the heap. While it is in use, the match object is stored in the // `in_use_matches` linked list. Once it is no longer needed, it is moved to @@ -37,10 +34,21 @@ static match_t *unused_matches = NULL; static match_t *in_use_matches = NULL; #endif +typedef struct { + size_t size, occupancy; + match_t **matches; +} cache_t; + +static const size_t cache_sizes[] = {7, 17, 31, 61, 127, 509, 1021, 2039, 4093, 8191, 16529, 0}; // Prime numbers, roughly doubling +static const short int CACHE_SUCKS = -50; +static const double CACHE_MAX_OCCUPANCY = (double)2.0f; + +cache_t cache = {0, 0, NULL}; + __attribute__((nonnull(1))) static inline pat_t *deref(def_t *defs, pat_t *pat); __attribute__((returns_nonnull)) -static match_t *new_match(pat_t *pat, const char *start, const char *end, match_t *child); +static match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end, match_t *children[]); __attribute__((nonnull)) static match_t *get_capture_by_num(match_t *m, int *n); __attribute__((nonnull, pure)) @@ -48,6 +56,131 @@ static match_t *get_capture_by_name(match_t *m, const char *name); __attribute__((hot, nonnull(2,3,4))) static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ignorecase); +// Store a value and update its refcount +static inline void add_owner(match_t** owner, match_t* owned) +{ + if (*owner != NULL) + errx(EXIT_FAILURE, "Ownership is being overwritten"); + *owner = owned; + ++owned->refcount; +} + +// Unstore a value and update its refcount +static inline void remove_ownership(match_t** owner) +{ + if (*owner) { + --(*owner)->refcount; + if ((*owner)->refcount == 0) + recycle_if_unused(owner); + *owner = NULL; + } +} + +// Helper method for concisely allocating children matches +// static match_t** _alloc_children(size_t n, match_t* matches[]) +// { +// if (n == 0) return NULL; +// match_t **ret = xcalloc(n+1, sizeof(match_t*)); +// for (size_t i = 0; i < n; i++) +// add_owner(&ret[i], matches[i]); +// return ret; +// } + +// #define MATCHES(...) _alloc_children((sizeof((match_t*[]){__VA_ARGS__}))/sizeof(match_t*), (match_t*[]){__VA_ARGS__}) + +#define MATCHES(...) (match_t*[]){__VA_ARGS__, NULL} + +static size_t hash(const char *str, pat_t *pat) +{ + return (size_t)((((unsigned long)pat)>>3) ^ ((unsigned long)str * 37)); +} + +static match_t *cache_lookup(def_t *defs, const char *str, pat_t *pat) +{ + if (!cache.matches || pat->cache_balance <= CACHE_SUCKS) return NULL; + size_t h = hash(str, pat) % cache.size; + for (match_t *c = cache.matches[h]; c; c = c->cache_next) { + if (c->start == str && c->pat == pat && c->defs_id == defs->id) { + if (pat->cache_balance < SHRT_MAX - 4) pat->cache_balance += 4; + return c; + } + } + if (pat->cache_balance > SHRT_MIN) --pat->cache_balance; + return NULL; +} + +static void cache_remove(match_t *m) +{ + if (!m->cache_home) return; + *m->cache_home = m->cache_next; + if (m->cache_next) m->cache_next->cache_home = m->cache_home; + m->cache_next = NULL; + m->cache_home = NULL; + remove_ownership(&m); + --cache.occupancy; +} + +static void cache_save(match_t *m) +{ + if ((double)(cache.occupancy + 1) >= ((double)cache.size) * CACHE_MAX_OCCUPANCY) { + cache_t old_cache = cache; + for (int s = 0; cache_sizes[s]; s++) { + if (cache_sizes[s] > cache.size) { + cache.size = cache_sizes[s]; + cache.matches = xcalloc(cache.size, sizeof(match_t*)); + + // Rehash: + if (old_cache.matches) { + for (size_t i = 0; i < old_cache.size; i++) { + for (match_t *o; (o = old_cache.matches[i]); ) { + *o->cache_home = o->cache_next; + if (o->cache_next) o->cache_next->cache_home = o->cache_home; + size_t h = hash(o->start, o->pat) % cache.size; + o->cache_home = &(cache.matches[h]); + o->cache_next = cache.matches[h]; + if (cache.matches[h]) cache.matches[h]->cache_home = &o->cache_next; + cache.matches[h] = o; + } + } + xfree(&old_cache.matches); + } + break; + } + } + } + size_t h = hash(m->start, m->pat) % cache.size; + m->cache_home = &(cache.matches[h]); + m->cache_next = cache.matches[h]; + if (cache.matches[h]) cache.matches[h]->cache_home = &m->cache_next; + cache.matches[h] = NULL; + add_owner(&cache.matches[h], m); + ++cache.occupancy; +} + +static void cache_prune(const char *start, const char *end) +{ + if (!cache.matches) return; + for (size_t i = 0; i < cache.size; i++) { + for (match_t *m = cache.matches[i], *next = NULL; m; m = next) { + next = m->cache_next; + if (m->start < start || (m->end ? m->end : m->start) > end) + cache_remove(m); + } + } +} + +void cache_destroy(void) +{ + if (!cache.matches) return; + for (size_t i = 0; i < cache.size; i++) { + while (cache.matches[i]) + cache_remove(cache.matches[i]); + } + cache.occupancy = 0; + xfree(&cache.matches); + cache.size = 0; +} + // // If the given pattern is a reference, look it up and return the referenced // pattern. This is used for an optimization to avoid repeated lookups. @@ -101,9 +234,11 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk const char *str; if (prev) { str = prev->end > prev->start ? prev->end : prev->end + 1; - recycle_if_unused(&prev); + if (prev->refcount == 0) recycle_if_unused(&prev); + cache_prune(str, f->end); } else { str = f->start; + cache_destroy(); } pat = deref(defs, pat); @@ -163,40 +298,40 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool } } case BP_ANYCHAR: { - return (str < f->end && *str != '\n') ? new_match(pat, str, next_char(f, str), NULL) : NULL; + return (str < f->end && *str != '\n') ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL; } case BP_ID_START: { - return (str < f->end && isidstart(f, str)) ? new_match(pat, str, next_char(f, str), NULL) : NULL; + return (str < f->end && isidstart(f, str)) ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL; } case BP_ID_CONTINUE: { - return (str < f->end && isidcontinue(f, str)) ? new_match(pat, str, next_char(f, str), NULL) : NULL; + return (str < f->end && isidcontinue(f, str)) ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL; } case BP_START_OF_FILE: { - return (str == f->start) ? new_match(pat, str, str, NULL) : NULL; + return (str == f->start) ? new_match(defs, pat, str, str, NULL) : NULL; } case BP_START_OF_LINE: { - return (str == f->start || str[-1] == '\n') ? new_match(pat, str, str, NULL) : NULL; + return (str == f->start || str[-1] == '\n') ? new_match(defs, pat, str, str, NULL) : NULL; } case BP_END_OF_FILE: { - return (str == f->end) ? new_match(pat, str, str, NULL) : NULL; + return (str == f->end) ? new_match(defs, pat, str, str, NULL) : NULL; } case BP_END_OF_LINE: { - return (str == f->end || *str == '\n') ? new_match(pat, str, str, NULL) : NULL; + return (str == f->end || *str == '\n') ? new_match(defs, pat, str, str, NULL) : NULL; } case BP_WORD_BOUNDARY: { - return (isidcontinue(f, str) != isidcontinue(f, prev_char(f, str))) ? new_match(pat, str, str, NULL) : NULL; + return (isidcontinue(f, str) != isidcontinue(f, prev_char(f, str))) ? new_match(defs, pat, str, str, NULL) : NULL; } case BP_STRING: { if (&str[pat->min_matchlen] > f->end) return NULL; if (pat->min_matchlen > 0 && (ignorecase ? memicmp : memcmp)(str, pat->args.string, pat->min_matchlen) != 0) return NULL; - return new_match(pat, str, str + pat->min_matchlen, NULL); + return new_match(defs, pat, str, str + pat->min_matchlen, NULL); } case BP_RANGE: { if (str >= f->end) return NULL; if ((unsigned char)*str < pat->args.range.low || (unsigned char)*str > pat->args.range.high) return NULL; - return new_match(pat, str, str+1, NULL); + return new_match(defs, pat, str, str+1, NULL); } case BP_NOT: { match_t *m = match(defs, f, str, pat->args.pat, ignorecase); @@ -204,10 +339,10 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool recycle_if_unused(&m); return NULL; } - return new_match(pat, str, str, NULL); + return new_match(defs, pat, str, str, NULL); } case BP_UPTO: { - match_t *m = new_match(pat, str, str, NULL); + match_t *m = new_match(defs, pat, str, str, NULL); pat_t *target = deref(defs, pat->args.multiple.first), *skip = deref(defs, pat->args.multiple.second); if (!target && !skip) { @@ -216,7 +351,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool return m; } - match_t **dest = &m->child; + size_t child_cap = 0, nchildren = 0; for (const char *prev = NULL; prev < str; ) { prev = str; if (target) { @@ -233,9 +368,12 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool if (skip) { match_t *s = match(defs, f, str, skip, ignorecase); if (s != NULL) { - ADD_OWNER(*dest, s); - dest = &s->nextsibling; str = s->end; + if (nchildren+2 >= child_cap) { + m->children = xrealloc(m->children, sizeof(match_t*)*(child_cap += 5)); + for (size_t i = nchildren; i < child_cap; i++) m->children[i] = NULL; + } + add_owner(&m->children[nchildren++], s); continue; } } @@ -249,12 +387,12 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool return NULL; } case BP_REPEAT: { - match_t *m = new_match(pat, str, str, NULL); - match_t **dest = &m->child; + match_t *m = new_match(defs, pat, str, str, NULL); size_t reps = 0; ssize_t max = pat->args.repetitions.max; pat_t *repeating = deref(defs, pat->args.repetitions.repeat_pat); pat_t *sep = deref(defs, pat->args.repetitions.sep); + size_t child_cap = 0, nchildren = 0; for (reps = 0; max == -1 || reps < (size_t)max; ++reps) { const char *start = str; // Separator @@ -286,11 +424,18 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool break; } if (msep) { - ADD_OWNER(*dest, msep); - dest = &msep->nextsibling; + if (nchildren+2 >= child_cap) { + m->children = xrealloc(m->children, sizeof(match_t*)*(child_cap += 5)); + for (size_t i = nchildren; i < child_cap; i++) m->children[i] = NULL; + } + add_owner(&m->children[nchildren++], msep); + } + + if (nchildren+2 >= child_cap) { + m->children = xrealloc(m->children, sizeof(match_t*)*(child_cap += 5)); + for (size_t i = nchildren; i < child_cap; i++) m->children[i] = NULL; } - ADD_OWNER(*dest, mp); - dest = &mp->nextsibling; + add_owner(&m->children[nchildren++], mp); str = mp->end; } @@ -320,7 +465,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool if (m && m->end != str) recycle_if_unused(&m); else if (m) - return new_match(pat, str, str, m); + return new_match(defs, pat, str, str, MATCHES(m)); if (pos == f->start) break; // To prevent extreme performance degradation, don't keep // walking backwards endlessly over newlines. @@ -330,11 +475,11 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool } case BP_BEFORE: { match_t *after = match(defs, f, str, pat->args.pat, ignorecase); - return after ? new_match(pat, str, str, after) : NULL; + return after ? new_match(defs, pat, str, str, MATCHES(after)) : NULL; } case BP_CAPTURE: { match_t *p = match(defs, f, str, pat->args.pat, ignorecase); - return p ? new_match(pat, str, p->end, p) : NULL; + return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL; } case BP_OTHERWISE: { match_t *m = match(defs, f, str, pat->args.multiple.first, ignorecase); @@ -345,26 +490,39 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool if (m1 == NULL) return NULL; match_t *m2; - { // Push backrefs and run matching, then cleanup - def_t *defs2 = defs; - if (m1->pat->type == BP_CAPTURE && m1->pat->args.capture.name) { - // Temporarily add a rule that the backref name matches the - // exact string of the original match (no replacements) - ssize_t len = (ssize_t)(m1->end - m1->start); - pat_t *backref = new_pat(f, m1->start, m1->end, (size_t)len, len, BP_STRING); - backref->args.string = m1->start; - defs2 = with_def(defs, m1->pat->args.capture.namelen, m1->pat->args.capture.name, backref); - } - m2 = match(defs2, f, m1->end, pat->args.multiple.second, ignorecase); - free_defs(&defs2, defs); + // Push backrefs and run matching, then cleanup + if (m1->pat->type == BP_CAPTURE && m1->pat->args.capture.name) { + // Temporarily add a rule that the backref name matches the + // exact string of the original match (no replacements) + size_t len = (size_t)(m1->end - m1->start); + pat_t *backref = new_pat(f, m1->start, m1->end, len, (ssize_t)len, BP_STRING); + backref->args.string = m1->start; + def_t *defs2 = with_def(defs, m1->pat->args.capture.namelen, m1->pat->args.capture.name, backref); + + ++m1->refcount; { + m2 = match(defs2, f, m1->end, pat->args.multiple.second, ignorecase); + if (!m2) { // No need to keep the backref in memory if it didn't match + for (struct allocated_pat_s **rem = &f->pats; *rem; rem = &(*rem)->next) { + if (&(*rem)->pat == backref) { + struct allocated_pat_s *tmp = *rem; + *rem = (*rem)->next; + free(tmp); + break; + } + } + } + defs = free_defs(defs2, defs); + } --m1->refcount; + } else { + m2 = match(defs, f, m1->end, pat->args.multiple.second, ignorecase); } if (m2 == NULL) { recycle_if_unused(&m1); return NULL; } - ADD_OWNER(m1->nextsibling, m2); - return new_match(pat, str, m2->end, m1); + + return new_match(defs, pat, str, m2->end, MATCHES(m1, m2)); } case BP_MATCH: case BP_NOT_MATCH: { match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase); @@ -376,12 +534,11 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool slice_file(&slice, f, m1->start, m1->end); match_t *m2 = next_match(defs, &slice, NULL, pat->args.multiple.second, NULL, ignorecase); if ((!m2 && pat->type == BP_MATCH) || (m2 && pat->type == BP_NOT_MATCH)) { - recycle_if_unused(&m2); + if (m2) recycle_if_unused(&m2); recycle_if_unused(&m1); return NULL; } - if (pat->type == BP_MATCH) ADD_OWNER(m1->nextsibling, m2); - return new_match(pat, m1->start, m1->end, m1); + return new_match(defs, pat, m1->start, m1->end, (pat->type == BP_MATCH) ? MATCHES(m2) : NULL); } case BP_REPLACE: { match_t *p = NULL; @@ -389,9 +546,12 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool p = match(defs, f, str, pat->args.replace.pat, ignorecase); if (p == NULL) return NULL; } - return new_match(pat, str, p ? p->end : str, p); + return new_match(defs, pat, str, p ? p->end : str, MATCHES(p)); } case BP_REF: { + match_t *cached = cache_lookup(defs, str, pat); + if (cached) return cached->end == NULL ? NULL : cached; + def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name); if (def == NULL) errx(EXIT_FAILURE, "Unknown identifier: '%.*s'", (int)pat->args.ref.len, pat->args.ref.name); @@ -419,12 +579,16 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool const char *prev = str; match_t *m = match(&defs2, f, str, ref, ignorecase); - if (m == NULL) return NULL; + if (m == NULL) { + // Store placeholder: + cache_save(new_match(defs, pat, str, NULL, NULL)); + return NULL; + } while (rec_op.args.leftrec.visits > 0) { rec_op.args.leftrec.visits = 0; - REMOVE_OWNERSHIP(rec_op.args.leftrec.match); - ADD_OWNER(rec_op.args.leftrec.match, m); + remove_ownership(&rec_op.args.leftrec.match); + add_owner(&rec_op.args.leftrec.match, m); prev = m->end; match_t *m2 = match(&defs2, f, str, ref, ignorecase); if (m2 == NULL) break; @@ -435,21 +599,18 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool m = m2; } - if (rec_op.args.leftrec.match) { - // Ensure that `m` isn't garbage collected right now, but do - // clean up the recursive match result if it's not needed. - ++m->refcount; - REMOVE_OWNERSHIP(rec_op.args.leftrec.match); - --m->refcount; - } - - if (!m) - errx(EXIT_FAILURE, "Match should be non-null at this point"); - // This match wrapper mainly exists for record-keeping purposes and - // does not affect correctness. It also helps with visualization of - // match results. + // This match wrapper mainly exists for record-keeping purposes. + // However, it also keeps `m` from getting garbage collected with + // leftrec.match is GC'd. It also helps with visualization of match + // results. // OPTIMIZE: remove this if necessary - return new_match(pat, m->start, m->end, m); + match_t *wrap = new_match(defs, pat, m->start, m->end, MATCHES(m)); + cache_save(wrap); + + if (rec_op.args.leftrec.match) + remove_ownership(&rec_op.args.leftrec.match); + + return wrap; } case BP_NODENT: { if (*str != '\n') return NULL; @@ -472,11 +633,11 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool if (str[i] != denter || &str[i] >= f->end) return NULL; } - return new_match(pat, start, &str[dents], NULL); + return new_match(defs, pat, start, &str[dents], NULL); } case BP_ERROR: { match_t *p = pat->args.pat ? match(defs, f, str, pat->args.pat, ignorecase) : NULL; - return p ? new_match(pat, str, p->end, p) : NULL; + return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL; } default: { errx(EXIT_FAILURE, "Unknown pattern type: %u", pat->type); @@ -493,9 +654,11 @@ static match_t *get_capture_by_num(match_t *m, int *n) if (*n == 0) return m; if (m->pat->type == BP_CAPTURE && *n == 1) return m; if (m->pat->type == BP_CAPTURE) --(*n); - for (match_t *c = m->child; c; c = c->nextsibling) { - match_t *cap = get_capture_by_num(c, n); - if (cap) return cap; + if (m->children) { + for (int i = 0; m->children[i]; i++) { + match_t *cap = get_capture_by_num(m->children[i], n); + if (cap) return cap; + } } return NULL; } @@ -508,9 +671,11 @@ static match_t *get_capture_by_name(match_t *m, const char *name) if (m->pat->type == BP_CAPTURE && m->pat->args.capture.name && strncmp(m->pat->args.capture.name, name, m->pat->args.capture.namelen) == 0) return m; - for (match_t *c = m->child; c; c = c->nextsibling) { - match_t *cap = get_capture_by_name(c, name); - if (cap) return cap; + if (m->children) { + for (int i = 0; m->children[i]; i++) { + match_t *cap = get_capture_by_name(m->children[i], name); + if (cap) return cap; + } } return NULL; } @@ -523,7 +688,7 @@ match_t *get_capture(match_t *m, const char **id) { if (isdigit(**id)) { int n = (int)strtol(*id, (char**)id, 10); - return get_capture_by_num(m->child, &n); + return get_capture_by_num(m, &n); } else { const char *end = after_name(*id); if (end == *id) return NULL; @@ -539,7 +704,7 @@ match_t *get_capture(match_t *m, const char **id) // // Return a match object which can be used (may be allocated or recycled). // -static match_t *new_match(pat_t *pat, const char *start, const char *end, match_t *child) +static match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end, match_t *children[]) { match_t *m; @@ -566,7 +731,13 @@ static match_t *new_match(pat_t *pat, const char *start, const char *end, match_ m->pat = pat; m->start = start; m->end = end; - if (child) ADD_OWNER(m->child, child); + m->defs_id = defs->id; + + if (children) { + for (int i = 0; children[i]; i++) + add_owner(&m->_children[i], children[i]); + m->children = m->_children; + } return m; } @@ -583,8 +754,12 @@ void recycle_if_unused(match_t **at_m) return; } - REMOVE_OWNERSHIP(m->child); - REMOVE_OWNERSHIP(m->nextsibling); + if (m->children) { + for (int i = 0; m->children[i]; i++) + remove_ownership(&m->children[i]); + if (m->children != m->_children) + xfree(&m->children); + } #ifdef DEBUG_HEAP DLL_REMOVE(m); // Remove from in_use_matches @@ -609,6 +784,8 @@ size_t recycle_all_matches(void) while (in_use_matches) { match_t *m = in_use_matches; DLL_REMOVE(m); + if (m->children && m->children != m->_children) + xfree(&m->children); DLL_PREPEND(unused_matches, m); ++count; } @@ -19,6 +19,7 @@ void recycle_if_unused(match_t **at_m); size_t free_all_matches(void); size_t recycle_all_matches(void); #endif +void cache_destroy(void); #endif // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 @@ -26,8 +26,9 @@ static void _visualize_matches(match_node_t *firstmatch, int depth, const char * static int height_of_match(match_t *m) { int height = 0; - for (match_t *c = m->child; c; c = c->nextsibling) { - int childheight = height_of_match(c); + for (int i = 0; m->children && m->children[i]; i++) { + match_t *child = m->children[i]; + int childheight = height_of_match(child); if (childheight > height) height = childheight; } return 1 + height; @@ -83,9 +84,9 @@ static void _visualize_matches(match_node_t *firstmatch, int depth, const char * // Print nonzero-width first: for (match_node_t *m = firstmatch; m; m = m->next) { if (RIGHT_TYPE(m)) { - for (match_t *c = m->m->child; c; c = c->nextsibling) { + for (int i = 0; m->m->children && m->m->children[i]; i++) { *nextchild = new(match_node_t); - (*nextchild)->m = c; + (*nextchild)->m = m->m->children[i]; nextchild = &((*nextchild)->next); } if (m->m->end == m->m->start) continue; @@ -305,37 +305,24 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str) } const char *opstart = str; - unsigned char e = (unsigned char)unescapechar(str, &str); + unsigned char e_low = (unsigned char)unescapechar(str, &str); if (str == opstart) file_err(f, start, str+1, "This isn't a valid escape sequence."); + unsigned char e_high = e_low; if (matchchar(&str, '-')) { // Escape range (e.g. \x00-\xFF) if (next_char(f, str) != str+1) file_err(f, start, next_char(f, str), "Sorry, UTF8 escape sequences are not supported in ranges."); const char *seqstart = str; - unsigned char e2 = (unsigned char)unescapechar(str, &str); + e_high = (unsigned char)unescapechar(str, &str); if (str == seqstart) file_err(f, seqstart, str+1, "This value isn't a valid escape sequence"); - if (e2 < e) + if (e_high < e_low) file_err(f, start, str, "Escape ranges should be low-to-high, but this is high-to-low."); - pat_t *esc = new_pat(f, opstart, str, 1, 1, BP_RANGE); - esc->args.range.low = e; - esc->args.range.high = e2; - all = either_pat(f, all, esc); - } else if (str > opstart) { - pat_t *esc = new_pat(f, start, str, 1, 1, BP_STRING); - char *s = xcalloc(sizeof(char), 2); - s[0] = (char)e; - esc->args.string = s; - all = either_pat(f, all, esc); - } else { - const char *next = next_char(f, opstart); - size_t len = (size_t)(next-opstart); - pat_t *esc = new_pat(f, start, next, len, (ssize_t)len, BP_STRING); - char *s = xcalloc(sizeof(char), 1+len); - memcpy(s, opstart, len); - esc->args.string = s; - all = either_pat(f, all, esc); } + pat_t *esc = new_pat(f, start, str, 1, 1, BP_RANGE); + esc->args.range.low = e_low; + esc->args.range.high = e_high; + all = either_pat(f, all, esc); } while (matchchar(&str, ',')); return all; @@ -109,7 +109,7 @@ static void _print_match(FILE *out, printer_t *pr, match_t *m) pr->pos = m->start; if (m->pat->type == BP_REPLACE) { if (m->skip_replacement) { - _print_match(out, pr, m->child); + _print_match(out, pr, m->children[0]); return; } size_t line = get_line_number(pr->file, m->start); @@ -187,7 +187,8 @@ static void _print_match(FILE *out, printer_t *pr, match_t *m) print_line_number(out, pr, line > line_end ? 0 : line, pr->use_color ? color_normal : NULL); } else { const char *prev = m->start; - for (match_t *child = m->child; child; child = child->nextsibling) { + for (int i = 0; m->children && m->children[i]; i++) { + match_t *child = m->children[i]; // Skip children from e.g. zero-width matches like >@foo if (!(prev <= child->start && child->start <= m->end && prev <= child->end && child->end <= m->end)) @@ -262,8 +263,8 @@ int print_errors(printer_t *pr, match_t *m) fprint_line(stderr, pr->file, m->start, m->end, " "); return 1; } - if (m->child) ret += print_errors(pr, m->child); - if (m->nextsibling) ret += print_errors(pr, m->nextsibling); + for (int i = 0; m->children && m->children[i]; i++) + ret += print_errors(pr, m->children[i]); return ret; } @@ -13,31 +13,31 @@ // BP virtual machine pattern types enum pattype_e { - BP_ANYCHAR = 1, - BP_ID_START, - BP_ID_CONTINUE, - BP_STRING, - BP_RANGE, - BP_NOT, - BP_UPTO, - BP_REPEAT, - BP_BEFORE, - BP_AFTER, - BP_CAPTURE, - BP_OTHERWISE, - BP_CHAIN, - BP_MATCH, - BP_NOT_MATCH, - BP_REPLACE, - BP_REF, - BP_NODENT, - BP_START_OF_FILE, - BP_START_OF_LINE, - BP_END_OF_FILE, - BP_END_OF_LINE, - BP_WORD_BOUNDARY, - BP_LEFTRECURSION, - BP_ERROR, + BP_ANYCHAR = 1, + BP_ID_START = 2, + BP_ID_CONTINUE = 3, + BP_STRING = 4, + BP_RANGE = 5, + BP_NOT = 6, + BP_UPTO = 7, + BP_REPEAT = 8, + BP_BEFORE = 9, + BP_AFTER = 10, + BP_CAPTURE = 11, + BP_OTHERWISE = 12, + BP_CHAIN = 13, + BP_MATCH = 14, + BP_NOT_MATCH = 15, + BP_REPLACE = 16, + BP_REF = 17, + BP_NODENT = 18, + BP_START_OF_FILE = 19, + BP_START_OF_LINE = 20, + BP_END_OF_FILE = 21, + BP_END_OF_LINE = 22, + BP_WORD_BOUNDARY = 23, + BP_LEFTRECURSION = 24, + BP_ERROR = 25, }; struct match_s; // forward declared to resolve circular struct defs @@ -88,6 +88,7 @@ typedef struct pat_s { } leftrec; struct pat_s *pat; } args; + short int cache_balance; } pat_t; // @@ -96,25 +97,29 @@ typedef struct pat_s { typedef struct match_s { // Where the match starts and ends (end is after the last character) const char *start, *end; - struct match_s *child, *nextsibling; pat_t *pat; // Intrusive linked list nodes for garbage collection: struct match_s *next; #ifdef DEBUG_HEAP - struct match_s **atme; + struct match_s **home; #endif + struct match_s *cache_next, **cache_home; + size_t defs_id; int refcount; // If skip_replacement is set to 1, that means the user wants to not print // the replaced text when printing this match: // TODO: this is a bit hacky, there is probably a better way to go about // this but it's less hacky that mutating the match objects more drastically bool skip_replacement:1; + struct match_s **children; + struct match_s *_children[3]; } match_t; // // Pattern matching rule definition(s) // typedef struct def_s { + size_t id; size_t namelen; const char *name; pat_t *pat; |
