diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2021-08-01 15:36:53 -0700 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2021-08-01 15:36:53 -0700 |
| commit | 0b2de4721f3dcf95d6d0af54c78e197df10f6666 (patch) | |
| tree | 7b64cefb5c2897510fffdb6c1aa36c792a4c61c4 | |
| parent | 8268e67875abeaae99d0793e424514662a84628d (diff) | |
Moved caching code onto the file, which fixed an issue with file slicing
having stale cache values.
| -rw-r--r-- | bp.c | 7 | ||||
| -rw-r--r-- | definitions.c | 5 | ||||
| -rw-r--r-- | files.c | 125 | ||||
| -rw-r--r-- | files.h | 16 | ||||
| -rw-r--r-- | match.c | 128 | ||||
| -rw-r--r-- | match.h | 4 | ||||
| -rw-r--r-- | types.h | 2 |
7 files changed, 164 insertions, 123 deletions
@@ -406,7 +406,7 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern) } fflush(stdout); - cache_destroy(); + cache_destroy(f); if (recycle_all_matches() != 0) fprintf(stderr, "\033[33;1mMemory leak: there should no longer be any matches in use at this point.\033[0m\n"); destroy_file(&f); @@ -624,6 +624,10 @@ int main(int argc, char *argv[]) tty_out = fopen("/dev/tty", "w"); } + // No need for these caches anymore: + for (file_t *f = loaded_files; f; f = f->next) + cache_destroy(f); + int found = 0; if (options.mode == MODE_JSON) printf("["); if (options.git_mode) { // Get the list of files from `git --ls-files ...` @@ -652,7 +656,6 @@ int main(int argc, char *argv[]) // This code frees up all residual heap-allocated memory. Since the program // is about to exit, this step is unnecessary. However, it is useful for // tracking down memory leaks. - cache_destroy(); free_all_matches(); defs = free_defs(defs, NULL); while (loaded_files) { diff --git a/definitions.c b/definitions.c index 43bfe3c..717e1a5 100644 --- a/definitions.c +++ b/definitions.c @@ -37,11 +37,8 @@ def_t *load_grammar(def_t *defs, file_t *f) pat_t *pat = bp_pattern(f, str); if (!pat) file_err(f, str, f->end, "Could not parse this file"); if (pat->end < f->end) file_err(f, pat->end, f->end, "Could not parse this part of the file"); - for (pat_t *p = pat; p && p->type == BP_DEFINITION; p = p->args.def.pat) { - // printf("Def '%.*s': %.*s\n", (int)p->args.def.namelen, p->args.def.name, - // (int)(p->args.def.def->end - p->args.def.def->start), p->args.def.def->start); + for (pat_t *p = pat; p && p->type == BP_DEFINITION; p = p->args.def.pat) defs = with_def(defs, p->args.def.namelen, p->args.def.name, p->args.def.def); - } return defs; } @@ -14,6 +14,7 @@ #include <unistd.h> #include "files.h" +#include "match.h" #include "pattern.h" #include "utils.h" @@ -183,6 +184,8 @@ void destroy_file(file_t **f) } } + cache_destroy(*f); + for (pat_t *next; (*f)->pats; (*f)->pats = next) { next = (*f)->pats->next; delete(&(*f)->pats); @@ -270,4 +273,126 @@ void fprint_line(FILE *dest, file_t *f, const char *start, const char *end, cons fprintf(dest, "\033[0m\n"); } +// +// Hash a string position/pattern. +// +static inline size_t hash(const char *str, pat_t *pat) +{ + return (size_t)str + 2*pat->id; +} + +// +// Check if we have memoized a pattern match at the given position for the +// given definitions. If a result has been memoized, set *result to the +// memoized value and return true, otherwise return false. +// +bool cache_get(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t **result) +{ + if (!f->cache.matches) return NULL; + size_t h = hash(str, pat) & (f->cache.size-1); + for (match_t *c = f->cache.matches[h]; c; c = c->cache.next) { + if (c->pat == pat && c->defs_id == (defs?defs->id:0) && c->start == str) { + // If c->end == NULL, that means no match occurs here + *result = c->end == NULL ? NULL : c; + return true; + } + } + return false; +} + +// +// Remove an item from the cache. +// +static void cache_remove(file_t *f, match_t *m) +{ + if (!m->cache.home) return; + *m->cache.home = m->cache.next; + if (m->cache.next) m->cache.next->cache.home = m->cache.home; + m->cache.next = NULL; + m->cache.home = NULL; + if (--m->refcount == 0) recycle_if_unused(&m); + --f->cache.occupancy; +} + +// +// Save a match in the cache. +// +void cache_save(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t *m) +{ + // As a convention, a match with {.pat=pat, .start=str, .end==NULL} is used + // to memoize the fact that `pat` will *not* match at `str`. + if (m == NULL) m = new_match(defs, pat, str, NULL, NULL); + + if (f->cache.occupancy+1 > 3*f->cache.size) { + if (f->cache.size == MAX_CACHE_SIZE) { + size_t h = hash(m->start, m->pat) & (f->cache.size-1); + for (int quota = 2; f->cache.matches[h] && quota > 0; quota--) { + match_t *last = f->cache.matches[h]; + while (last->cache.next) last = last->cache.next; + cache_remove(f, last); + } + } else { + match_t **old_matches = f->cache.matches; + size_t old_size = f->cache.size; + f->cache.size = old_size == 0 ? 16 : 2*old_size; + f->cache.matches = new(match_t*[f->cache.size]); + + // Rehash: + if (old_matches) { + for (size_t i = 0; i < old_size; i++) { + for (match_t *o; (o = old_matches[i]); ) { + *o->cache.home = o->cache.next; + if (o->cache.next) o->cache.next->cache.home = o->cache.home; + size_t h = hash(o->start, o->pat) & (f->cache.size-1); + o->cache.home = &(f->cache.matches[h]); + o->cache.next = f->cache.matches[h]; + if (f->cache.matches[h]) f->cache.matches[h]->cache.home = &o->cache.next; + f->cache.matches[h] = o; + } + } + free(old_matches); + } + } + } + + size_t h = hash(m->start, m->pat) & (f->cache.size-1); + m->cache.home = &(f->cache.matches[h]); + m->cache.next = f->cache.matches[h]; + if (f->cache.matches[h]) f->cache.matches[h]->cache.home = &m->cache.next; + f->cache.matches[h] = m; + ++m->refcount; + ++f->cache.occupancy; +} + +// +// Remove all items from the cache that do not overlap `start` and `end`. +// (This is used to remove useless items from the cache) +// +void cache_prune(file_t *f, const char *start, const char *end) +{ + if (!f->cache.matches) return; + for (size_t i = 0; i < f->cache.size; i++) { + for (match_t *m = f->cache.matches[i], *next = NULL; m; m = next) { + next = m->cache.next; + if (m->start < start || (m->end ? m->end : m->start) > end) + cache_remove(f, m); + } + } +} + +// +// Clear and deallocate the cache. +// +void cache_destroy(file_t *f) +{ + if (!f->cache.matches) return; + for (size_t i = 0; i < f->cache.size; i++) { + while (f->cache.matches[i]) + cache_remove(f, f->cache.matches[i]); + } + f->cache.occupancy = 0; + delete(&f->cache.matches); + f->cache.size = 0; +} + // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 @@ -4,13 +4,15 @@ #ifndef FILES__H #define FILES__H +#include "types.h" + #include <stdbool.h> #include <stdio.h> #include <unistd.h> #define file_err(f, ...) do { fprint_line(stderr, f, __VA_ARGS__); exit(EXIT_FAILURE); } while(false) -struct pat_s; // declared in types.h +#define MAX_CACHE_SIZE (1<<14) typedef struct file_s { struct file_s *next; @@ -18,6 +20,10 @@ typedef struct file_s { char *memory, **lines, *start, *end; size_t nlines; struct pat_s *pats; + struct { + size_t size, occupancy; + match_t **matches; + } cache; bool mmapped:1; } file_t; @@ -37,6 +43,14 @@ __attribute__((pure, nonnull)) const char *get_line(file_t *f, size_t line_number); __attribute__((nonnull(1,2,3), format(printf,5,6))) void fprint_line(FILE *dest, file_t *f, const char *start, const char *end, const char *fmt, ...); +__attribute__((nonnull(1,3,4,5))) +bool cache_get(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t **result); +__attribute__((nonnull(1,3,4))) +void cache_save(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t *m); +__attribute__((nonnull)) +void cache_prune(file_t *f, const char *start, const char *end); +__attribute__((nonnull)) +void cache_destroy(file_t *f); #endif // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 @@ -26,21 +26,10 @@ static match_t *unused_matches = NULL; static match_t *in_use_matches = NULL; -typedef struct { - size_t size, occupancy; - match_t **matches; -} cache_t; - -#define MAX_CACHE_SIZE (1<<14) - #define MATCHES(...) (match_t*[]){__VA_ARGS__, NULL} -cache_t cache = {0, 0, NULL}; - __attribute__((nonnull(1))) static inline pat_t *deref(def_t *defs, pat_t *pat); -__attribute__((returns_nonnull)) -static match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end, match_t *children[]); __attribute__((nonnull)) static match_t *get_capture_by_num(match_t *m, int *n); __attribute__((nonnull, pure)) @@ -96,99 +85,6 @@ static inline void list_remove(match_t *m, match_dll_t *node) node->next = NULL; } -static inline size_t hash(const char *str, pat_t *pat) -{ - return (size_t)str + 2*pat->id; -} - -static match_t *cache_lookup(def_t *defs, const char *str, pat_t *pat) -{ - if (!cache.matches) return NULL; - size_t h = hash(str, pat) & (cache.size-1); - for (match_t *c = cache.matches[h]; c; c = c->cache.next) { - if (c->pat == pat && c->defs_id == (defs?defs->id:0) && c->start == str) - return c; - } - return NULL; -} - -static void cache_remove(match_t *m) -{ - if (!m->cache.home) return; - *m->cache.home = m->cache.next; - if (m->cache.next) m->cache.next->cache.home = m->cache.home; - m->cache.next = NULL; - m->cache.home = NULL; - remove_ownership(&m); - --cache.occupancy; -} - -static void cache_save(match_t *m) -{ - if (cache.occupancy+1 > 3*cache.size) { - if (cache.size == MAX_CACHE_SIZE) { - size_t h = hash(m->start, m->pat) & (cache.size-1); - for (int quota = 2; cache.matches[h] && quota > 0; quota--) { - match_t *last = cache.matches[h]; - while (last->cache.next) last = last->cache.next; - cache_remove(last); - } - } else { - cache_t old_cache = cache; - cache.size = old_cache.size == 0 ? 16 : 2*old_cache.size; - cache.matches = new(match_t*[cache.size]); - - // Rehash: - if (old_cache.matches) { - for (size_t i = 0; i < old_cache.size; i++) { - for (match_t *o; (o = old_cache.matches[i]); ) { - *o->cache.home = o->cache.next; - if (o->cache.next) o->cache.next->cache.home = o->cache.home; - size_t h = hash(o->start, o->pat) & (cache.size-1); - o->cache.home = &(cache.matches[h]); - o->cache.next = cache.matches[h]; - if (cache.matches[h]) cache.matches[h]->cache.home = &o->cache.next; - cache.matches[h] = o; - } - } - free(old_cache.matches); - } - } - } - - size_t h = hash(m->start, m->pat) & (cache.size-1); - m->cache.home = &(cache.matches[h]); - m->cache.next = cache.matches[h]; - if (cache.matches[h]) cache.matches[h]->cache.home = &m->cache.next; - cache.matches[h] = NULL; - add_owner(&cache.matches[h], m); - ++cache.occupancy; -} - -static void cache_prune(const char *start, const char *end) -{ - if (!cache.matches) return; - for (size_t i = 0; i < cache.size; i++) { - for (match_t *m = cache.matches[i], *next = NULL; m; m = next) { - next = m->cache.next; - if (m->start < start || (m->end ? m->end : m->start) > end) - cache_remove(m); - } - } -} - -void cache_destroy(void) -{ - if (!cache.matches) return; - for (size_t i = 0; i < cache.size; i++) { - while (cache.matches[i]) - cache_remove(cache.matches[i]); - } - cache.occupancy = 0; - delete(&cache.matches); - cache.size = 0; -} - // // If the given pattern is a reference, look it up and return the referenced // pattern. This is used for an optimization to avoid repeated lookups. @@ -243,7 +139,7 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk if (prev) { str = prev->end > prev->start ? prev->end : prev->end + 1; if (prev->refcount == 0) recycle_if_unused(&prev); - cache_prune(str, f->end); + cache_prune(f, str, f->end); } else { str = f->start; } @@ -470,20 +366,24 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool file_t slice; slice_file(&slice, f, f->start, str); for (const char *pos = &str[-(long)back->min_matchlen]; - pos >= f->start && (back->max_matchlen == -1 || pos >= &str[-(long)back->max_matchlen]); + pos >= f->start && (back->max_matchlen == -1 || pos >= &str[-(int)back->max_matchlen]); pos = prev_char(f, pos)) { + cache_destroy(&slice); slice.start = (char*)pos; match_t *m = match(defs, &slice, pos, back, ignorecase); // Match should not go past str (i.e. (<"AB" "B") should match "ABB", but not "AB") if (m && m->end != str) recycle_if_unused(&m); - else if (m) + else if (m) { + cache_destroy(&slice); return new_match(defs, pat, str, str, MATCHES(m)); + } if (pos == f->start) break; // To prevent extreme performance degradation, don't keep // walking backwards endlessly over newlines. if (back->max_matchlen == -1 && *pos == '\n') break; } + cache_destroy(&slice); return NULL; } case BP_BEFORE: { @@ -548,9 +448,11 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool match_t *m2 = next_match(defs, &slice, NULL, pat->args.multiple.second, NULL, ignorecase); if ((!m2 && pat->type == BP_MATCH) || (m2 && pat->type == BP_NOT_MATCH)) { if (m2) recycle_if_unused(&m2); + cache_destroy(&slice); recycle_if_unused(&m1); return NULL; } + cache_destroy(&slice); return new_match(defs, pat, m1->start, m1->end, (pat->type == BP_MATCH) ? MATCHES(m2) : NULL); } case BP_REPLACE: { @@ -562,8 +464,9 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool return new_match(defs, pat, str, p ? p->end : str, MATCHES(p)); } case BP_REF: { - match_t *cached = cache_lookup(defs, str, pat); - if (cached) return cached->end == NULL ? NULL : cached; + match_t *cached; + if (cache_get(f, defs, str, pat, &cached)) + return cached; def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name); if (def == NULL) @@ -593,8 +496,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool const char *prev = str; match_t *m = match(&defs2, f, str, ref, ignorecase); if (m == NULL) { - // Store placeholder: - cache_save(new_match(defs, pat, str, NULL, NULL)); + cache_save(f, defs, str, pat, NULL); return NULL; } @@ -618,7 +520,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool // results. // OPTIMIZE: remove this if necessary match_t *wrap = new_match(defs, pat, m->start, m->end, MATCHES(m)); - cache_save(wrap); + cache_save(f, defs, str, pat, wrap); if (rec_op.args.leftrec.match) remove_ownership(&rec_op.args.leftrec.match); @@ -717,7 +619,7 @@ match_t *get_capture(match_t *m, const char **id) // // Return a match object which can be used (may be allocated or recycled). // -static match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end, match_t *children[]) +match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end, match_t *children[]) { match_t *m; if (unused_matches) { @@ -7,8 +7,11 @@ #include <stdbool.h> #include <stdio.h> +#include "files.h" #include "types.h" +__attribute__((returns_nonnull)) +match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end, match_t *children[]); __attribute__((nonnull(2,4))) match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *skip, bool ignorecase); __attribute__((nonnull)) @@ -17,7 +20,6 @@ __attribute__((nonnull)) void recycle_if_unused(match_t **at_m); size_t free_all_matches(void); size_t recycle_all_matches(void); -void cache_destroy(void); #endif // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1 @@ -7,8 +7,6 @@ #include <stdbool.h> #include <sys/types.h> -#include "files.h" - #define UNBOUNDED(pat) ((pat)->max_matchlen == -1) // BP virtual machine pattern types |
