From 90c3c13a02e501d3bea839dceb00f09c89bfb5fe Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Tue, 21 Sep 2021 18:45:43 -0700 Subject: [PATCH] Moving cache logic into match, cleaner next_match() API, and slightly less tightly coupled UTF8 API --- bp.c | 46 ++++------ files.c | 124 --------------------------- files.h | 11 --- match.c | 252 ++++++++++++++++++++++++++++++++++++++++++------------ match.h | 5 +- pattern.c | 111 ++++++++++++------------ pattern.h | 2 +- utf8.c | 45 +++++----- utf8.h | 12 +-- 9 files changed, 302 insertions(+), 306 deletions(-) diff --git a/bp.c b/bp.c index 1697a55..49e2317 100644 --- a/bp.c +++ b/bp.c @@ -169,17 +169,15 @@ static int is_text_file(const char *filename) // static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern) { - static int matches = 0; - match_t *m = NULL; - while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) { - if (++matches > 1) + int nmatches = 0; + for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) { + if (++nmatches > 1) printf(",\n"); printf("{\"filename\":\"%s\",\"match\":", f->filename); json_match(f->start, m, options.verbose); printf("}"); } - if (m) recycle_if_unused(&m); - return matches; + return nmatches; } // @@ -187,18 +185,16 @@ static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern) // static int explain_matches(def_t *defs, file_t *f, pat_t *pattern) { - int matches = 0; - match_t *m = NULL; - while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) { - if (++matches == 1) { + int nmatches = 0; + for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) { + if (++nmatches == 1) { if (options.print_filenames) fprint_filename(stdout, f->filename); } else printf("\n\n"); explain_match(m); } - if (m) recycle_if_unused(&m); - return matches; + return nmatches; } // @@ -243,8 +239,7 @@ static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern) .lineformat = LINE_FORMATS[options.format], }; - match_t *m = NULL; - while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) { + for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) { if (print_errors(f, m) > 0) exit(EXIT_FAILURE); @@ -254,7 +249,6 @@ static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern) } print_match(out, &pr, m); } - if (m) recycle_if_unused(&m); if (matches > 0 || (f->filename[0] == '\0' && options.context_before == ALL_CONTEXT)) { // Print trailing context lines: @@ -281,18 +275,19 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern) if (options.mode == MODE_EXPLAIN) { matches += explain_matches(defs, f, pattern); } else if (options.mode == MODE_LISTFILES) { - match_t *m = next_match(defs, f, NULL, pattern, options.skip, options.ignorecase); - if (m) { - recycle_if_unused(&m); + match_t *m = NULL; + if (next_match(&m, defs, f, pattern, options.skip, options.ignorecase)) { printf("%s\n", f->filename); matches += 1; } + stop_matching(&m); } else if (options.mode == MODE_JSON) { matches += print_matches_as_json(defs, f, pattern); } else if (options.mode == MODE_INPLACE) { - match_t *m = next_match(defs, f, NULL, pattern, options.skip, options.ignorecase); - if (m) recycle_if_unused(&m); - else return 0; + match_t *m = NULL; + bool found = next_match(&m, defs, f, pattern, options.skip, options.ignorecase); + stop_matching(&m); + if (!found) return 0; // Ensure the file is resident in memory: if (f->mmapped) { @@ -315,7 +310,6 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern) } fflush(stdout); - cache_destroy(f); if (recycle_all_matches() != 0) fprintf(stderr, "\033[33;1mMemory leak: there should no longer be any matches in use at this point.\033[m\n"); destroy_file(&f); @@ -480,10 +474,10 @@ int main(int argc, char *argv[]) file_t *arg_file = spoof_file(&loaded_files, "", flag, -1); pat_t *s = bp_pattern(arg_file, arg_file->start); if (!s) { - fprint_line(stdout, arg_file, arg_file->start, arg_file->end, + file_err(arg_file, arg_file->start, arg_file->end, "Failed to compile the skip argument"); } else if (after_spaces(s->end, true) < arg_file->end) { - fprint_line(stdout, arg_file, s->end, arg_file->end, + file_err(arg_file, s->end, arg_file->end, "Failed to compile part of the skip argument"); } options.skip = either_pat(arg_file, options.skip, s); @@ -537,10 +531,6 @@ int main(int argc, char *argv[]) // Handle exit() calls gracefully: require(atexit(&cleanup), "Failed to set cleanup handler at exit"); - // No need for these caches anymore: - for (file_t *f = loaded_files; f; f = f->next) - cache_destroy(f); - int found = 0; if (options.mode == MODE_JSON) printf("["); if (options.git_mode) { // Get the list of files from `git --ls-files ...` diff --git a/files.c b/files.c index 774f830..5e9b40e 100644 --- a/files.c +++ b/files.c @@ -182,8 +182,6 @@ void destroy_file(file_t **at_f) f->mmapped = NULL; } - cache_destroy(f); - for (pat_t *next; f->pats; f->pats = next) { next = f->pats->next; delete(&f->pats); @@ -261,126 +259,4 @@ void fprint_line(FILE *dest, file_t *f, const char *start, const char *end, cons fprintf(dest, "\033[m\n"); } -// -// Hash a string position/pattern. -// -static inline size_t hash(const char *str, pat_t *pat) -{ - return (size_t)str + 2*pat->id; -} - -// -// Check if we have memoized a pattern match at the given position for the -// given definitions. If a result has been memoized, set *result to the -// memoized value and return true, otherwise return false. -// -bool cache_get(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t **result) -{ - if (!f->cache.matches) return NULL; - size_t h = hash(str, pat) & (f->cache.size-1); - for (match_t *c = f->cache.matches[h]; c; c = c->cache.next) { - if (c->pat == pat && c->defs_id == (defs?defs->id:0) && c->start == str) { - // If c->end == NULL, that means no match occurs here - *result = c->end == NULL ? NULL : c; - return true; - } - } - return false; -} - -// -// Remove an item from the cache. -// -static void cache_remove(file_t *f, match_t *m) -{ - if (!m->cache.home) return; - *m->cache.home = m->cache.next; - if (m->cache.next) m->cache.next->cache.home = m->cache.home; - m->cache.next = NULL; - m->cache.home = NULL; - if (--m->refcount == 0) recycle_if_unused(&m); - --f->cache.occupancy; -} - -// -// Save a match in the cache. -// -void cache_save(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t *m) -{ - // As a convention, a match with {.pat=pat, .start=str, .end==NULL} is used - // to memoize the fact that `pat` will *not* match at `str`. - if (m == NULL) m = new_match(defs, pat, str, NULL, NULL); - - if (f->cache.occupancy+1 > 3*f->cache.size) { - if (f->cache.size == MAX_CACHE_SIZE) { - size_t h = hash(m->start, m->pat) & (f->cache.size-1); - for (int quota = 2; f->cache.matches[h] && quota > 0; quota--) { - match_t *last = f->cache.matches[h]; - while (last->cache.next) last = last->cache.next; - cache_remove(f, last); - } - } else { - match_t **old_matches = f->cache.matches; - size_t old_size = f->cache.size; - f->cache.size = old_size == 0 ? 16 : 2*old_size; - f->cache.matches = new(match_t*[f->cache.size]); - - // Rehash: - if (old_matches) { - for (size_t i = 0; i < old_size; i++) { - for (match_t *o; (o = old_matches[i]); ) { - *o->cache.home = o->cache.next; - if (o->cache.next) o->cache.next->cache.home = o->cache.home; - size_t h = hash(o->start, o->pat) & (f->cache.size-1); - o->cache.home = &(f->cache.matches[h]); - o->cache.next = f->cache.matches[h]; - if (f->cache.matches[h]) f->cache.matches[h]->cache.home = &o->cache.next; - f->cache.matches[h] = o; - } - } - free(old_matches); - } - } - } - - size_t h = hash(m->start, m->pat) & (f->cache.size-1); - m->cache.home = &(f->cache.matches[h]); - m->cache.next = f->cache.matches[h]; - if (f->cache.matches[h]) f->cache.matches[h]->cache.home = &m->cache.next; - f->cache.matches[h] = m; - ++m->refcount; - ++f->cache.occupancy; -} - -// -// Remove all items from the cache that do not overlap `start` and `end`. -// (This is used to remove useless items from the cache) -// -void cache_prune(file_t *f, const char *start, const char *end) -{ - if (!f->cache.matches) return; - for (size_t i = 0; i < f->cache.size; i++) { - for (match_t *m = f->cache.matches[i], *next = NULL; m; m = next) { - next = m->cache.next; - if (m->start < start || (m->end ? m->end : m->start) > end) - cache_remove(f, m); - } - } -} - -// -// Clear and deallocate the cache. -// -void cache_destroy(file_t *f) -{ - if (!f->cache.matches) return; - for (size_t i = 0; i < f->cache.size; i++) { - while (f->cache.matches[i]) - cache_remove(f, f->cache.matches[i]); - } - f->cache.occupancy = 0; - delete(&f->cache.matches); - f->cache.size = 0; -} - // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/files.h b/files.h index 840412b..b51b9c7 100644 --- a/files.h +++ b/files.h @@ -6,14 +6,11 @@ #include "types.h" -#include #include #include #define file_err(f, ...) do { fprint_line(stderr, f, __VA_ARGS__); exit(EXIT_FAILURE); } while(false) -#define MAX_CACHE_SIZE (1<<14) - typedef struct file_s { struct file_s *next; const char *filename; @@ -43,14 +40,6 @@ __attribute__((pure, nonnull)) const char *get_line(file_t *f, size_t line_number); __attribute__((nonnull(1,2,3), format(printf,5,6))) void fprint_line(FILE *dest, file_t *f, const char *start, const char *end, const char *fmt, ...); -__attribute__((nonnull(1,3,4,5))) -bool cache_get(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t **result); -__attribute__((nonnull(1,3,4))) -void cache_save(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t *m); -__attribute__((nonnull)) -void cache_prune(file_t *f, const char *start, const char *end); -__attribute__((nonnull)) -void cache_destroy(file_t *f); #endif // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/match.c b/match.c index 99ef145..2a40472 100644 --- a/match.c +++ b/match.c @@ -16,6 +16,13 @@ #include "utils.h" #include "utf8.h" +#define MAX_CACHE_SIZE (1<<14) + +typedef struct { + size_t size, occupancy; + match_t **matches; +} cache_t; + // New match objects are either recycled from unused match objects or allocated // from the heap. While it is in use, the match object is stored in the // `in_use_matches` linked list. Once it is no longer needed, it is moved to @@ -27,10 +34,8 @@ static match_t *in_use_matches = NULL; #define MATCHES(...) (match_t*[]){__VA_ARGS__, NULL} -__attribute__((nonnull(1))) -static inline pat_t *deref(def_t *defs, pat_t *pat); -__attribute__((hot, nonnull(2,3,4))) -static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ignorecase); +__attribute__((hot, nonnull(2,3,4,5))) +static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, bool ignorecase); // Store a value and update its refcount static inline void add_owner(match_t** owner, match_t* owned) @@ -80,10 +85,117 @@ static inline void list_remove(match_t *m, match_dll_t *node) node->next = NULL; } +// +// Hash a string position/pattern. +// +static inline size_t hash(const char *str, pat_t *pat) +{ + return (size_t)str + 2*pat->id; +} + +// +// Check if we have memoized a pattern match at the given position for the +// given definitions. If a result has been memoized, set *result to the +// memoized value and return true, otherwise return false. +// +static bool cache_get(cache_t *cache, def_t *defs, const char *str, pat_t *pat, match_t **result) +{ + if (!cache->matches) return NULL; + size_t h = hash(str, pat) & (cache->size-1); + for (match_t *c = cache->matches[h]; c; c = c->cache.next) { + if (c->pat == pat && c->defs_id == (defs?defs->id:0) && c->start == str) { + // If c->end == NULL, that means no match occurs here + *result = c->end == NULL ? NULL : c; + return true; + } + } + return false; +} + +// +// Remove an item from the cache. +// +static void cache_remove(cache_t *cache, match_t *m) +{ + if (!m->cache.home) return; + *m->cache.home = m->cache.next; + if (m->cache.next) m->cache.next->cache.home = m->cache.home; + m->cache.next = NULL; + m->cache.home = NULL; + if (--m->refcount == 0) recycle_if_unused(&m); + --cache->occupancy; +} + +// +// Save a match in the cache. +// +static void cache_save(cache_t *cache, def_t *defs, const char *str, pat_t *pat, match_t *m) +{ + // As a convention, a match with {.pat=pat, .start=str, .end==NULL} is used + // to memoize the fact that `pat` will *not* match at `str`. + if (m == NULL) m = new_match(defs, pat, str, NULL, NULL); + + if (cache->occupancy+1 > 3*cache->size) { + if (cache->size == MAX_CACHE_SIZE) { + size_t h = hash(m->start, m->pat) & (cache->size-1); + for (int quota = 2; cache->matches[h] && quota > 0; quota--) { + match_t *last = cache->matches[h]; + while (last->cache.next) last = last->cache.next; + cache_remove(cache, last); + } + } else { + match_t **old_matches = cache->matches; + size_t old_size = cache->size; + cache->size = old_size == 0 ? 16 : 2*old_size; + cache->matches = new(match_t*[cache->size]); + + // Rehash: + if (old_matches) { + for (size_t i = 0; i < old_size; i++) { + for (match_t *o; (o = old_matches[i]); ) { + *o->cache.home = o->cache.next; + if (o->cache.next) o->cache.next->cache.home = o->cache.home; + size_t h = hash(o->start, o->pat) & (cache->size-1); + o->cache.home = &(cache->matches[h]); + o->cache.next = cache->matches[h]; + if (cache->matches[h]) cache->matches[h]->cache.home = &o->cache.next; + cache->matches[h] = o; + } + } + free(old_matches); + } + } + } + + size_t h = hash(m->start, m->pat) & (cache->size-1); + m->cache.home = &(cache->matches[h]); + m->cache.next = cache->matches[h]; + if (cache->matches[h]) cache->matches[h]->cache.home = &m->cache.next; + cache->matches[h] = m; + ++m->refcount; + ++cache->occupancy; +} + +// +// Clear and deallocate the cache. +// +void cache_destroy(cache_t *cache) +{ + if (!cache->matches) return; + for (size_t i = 0; i < cache->size; i++) { + while (cache->matches[i]) + cache_remove(cache, cache->matches[i]); + } + cache->occupancy = 0; + delete(&cache->matches); + cache->size = 0; +} + // // If the given pattern is a reference, look it up and return the referenced // pattern. This is used for an optimization to avoid repeated lookups. // +__attribute__((nonnull(1))) static inline pat_t *deref(def_t *defs, pat_t *pat) { if (pat && pat->type == BP_REF) { @@ -128,15 +240,18 @@ static pat_t *first_pat(def_t *defs, pat_t *pat) // // Find the next match after prev (or the first match if prev is NULL) // -match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *skip, bool ignorecase) +__attribute__((nonnull(3,5))) +static match_t *_next_match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, pat_t *skip, bool ignorecase) { - const char *str; - if (prev) { - str = prev->end > prev->start ? prev->end : prev->end + 1; - if (prev->refcount == 0) recycle_if_unused(&prev); - cache_prune(f, str, f->end); - } else { - str = f->start; + // Prune the unnecessary entries from the cache (those not between start/end) + if (cache->matches) { + for (size_t i = 0; i < cache->size; i++) { + for (match_t *m = cache->matches[i], *next = NULL; m; m = next) { + next = m->cache.next; + if (m->start < f->start || (m->end ? m->end : m->start) > f->end) + cache_remove(cache, m); + } + } } pat = deref(defs, pat); @@ -162,14 +277,14 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk if (str > f->end) return NULL; do { - match_t *m = match(defs, f, str, pat, ignorecase); + match_t *m = match(defs, cache, f, str, pat, ignorecase); if (m) return m; if (first->type == BP_START_OF_FILE) return NULL; match_t *s; - if (skip && (s = match(defs, f, str, skip, ignorecase))) { + if (skip && (s = match(defs, cache, f, str, skip, ignorecase))) { str = s->end > str ? s->end : str + 1; recycle_if_unused(&s); - } else str = next_char(f, str); + } else str = next_char(str, f->end); } while (str < f->end); return NULL; } @@ -179,12 +294,12 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk // match object, or NULL if no match is found. // The returned value should be free()'d to avoid memory leaking. // -static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ignorecase) +static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, bool ignorecase) { switch (pat->type) { case BP_DEFINITION: { def_t *defs2 = with_def(defs, pat->args.def.namelen, pat->args.def.name, pat->args.def.def); - match_t *m = match(defs2, f, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def, ignorecase); + match_t *m = match(defs2, cache, f, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def, ignorecase); defs = free_defs(defs2, defs); return m; } @@ -198,17 +313,17 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ++pat->args.leftrec.visits; return pat->args.leftrec.match; } else { - return match(defs, f, str, pat->args.leftrec.fallback, ignorecase); + return match(defs, cache, f, str, pat->args.leftrec.fallback, ignorecase); } } case BP_ANYCHAR: { - return (str < f->end && *str != '\n') ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL; + return (str < f->end && *str != '\n') ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL; } case BP_ID_START: { - return (str < f->end && isidstart(f, str)) ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL; + return (str < f->end && isidstart(str, f->end)) ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL; } case BP_ID_CONTINUE: { - return (str < f->end && isidcontinue(f, str)) ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL; + return (str < f->end && isidcontinue(str, f->end)) ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL; } case BP_START_OF_FILE: { return (str == f->start) ? new_match(defs, pat, str, str, NULL) : NULL; @@ -223,7 +338,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool return (str == f->end || *str == '\n') ? new_match(defs, pat, str, str, NULL) : NULL; } case BP_WORD_BOUNDARY: { - return (str == f->start || isidcontinue(f, str) != isidcontinue(f, prev_char(f, str))) ? new_match(defs, pat, str, str, NULL) : NULL; + return (str == f->start || isidcontinue(str, f->end) != isidcontinue(prev_char(f->start, str), f->end)) ? new_match(defs, pat, str, str, NULL) : NULL; } case BP_STRING: { if (&str[pat->min_matchlen] > f->end) return NULL; @@ -238,7 +353,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool return new_match(defs, pat, str, str+1, NULL); } case BP_NOT: { - match_t *m = match(defs, f, str, pat->args.pat, ignorecase); + match_t *m = match(defs, cache, f, str, pat->args.pat, ignorecase); if (m != NULL) { recycle_if_unused(&m); return NULL; @@ -259,7 +374,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool for (const char *prev = NULL; prev < str; ) { prev = str; if (target) { - match_t *p = match(defs, f, str, target, ignorecase); + match_t *p = match(defs, cache, f, str, target, ignorecase); if (p != NULL) { recycle_if_unused(&p); m->end = str; @@ -270,7 +385,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool return m; } if (skip) { - match_t *s = match(defs, f, str, skip, ignorecase); + match_t *s = match(defs, cache, f, str, skip, ignorecase); if (s != NULL) { str = s->end; if (nchildren+2 >= child_cap) { @@ -285,7 +400,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool // be at least once chance to match the pattern, even if // we're at the end of the string already (e.g. "..$"). if (str < f->end && *str != '\n' && pat->type != BP_UPTO_STRICT) - str = next_char(f, str); + str = next_char(str, f->end); } recycle_if_unused(&m); return NULL; @@ -302,11 +417,11 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool // Separator match_t *msep = NULL; if (sep != NULL && reps > 0) { - msep = match(defs, f, str, sep, ignorecase); + msep = match(defs, cache, f, str, sep, ignorecase); if (msep == NULL) break; str = msep->end; } - match_t *mp = match(defs, f, str, repeating, ignorecase); + match_t *mp = match(defs, cache, f, str, repeating, ignorecase); if (mp == NULL) { str = start; if (msep) recycle_if_unused(&msep); @@ -358,19 +473,20 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool // current pos, so mock it out as a file slice. // TODO: this breaks ^/^^/$/$$, but that can probably be ignored // because you rarely need to check those in a backtrack. + cache_t slice_cache = {0}; file_t slice; slice_file(&slice, f, f->start, str); for (const char *pos = &str[-(long)back->min_matchlen]; pos >= f->start && (back->max_matchlen == -1 || pos >= &str[-(int)back->max_matchlen]); - pos = prev_char(f, pos)) { - cache_destroy(&slice); + pos = prev_char(f->start, pos)) { + cache_destroy(&slice_cache); slice.start = (char*)pos; - match_t *m = match(defs, &slice, pos, back, ignorecase); + match_t *m = match(defs, &slice_cache, &slice, pos, back, ignorecase); // Match should not go past str (i.e. (<"AB" "B") should match "ABB", but not "AB") if (m && m->end != str) recycle_if_unused(&m); else if (m) { - cache_destroy(&slice); + cache_destroy(&slice_cache); return new_match(defs, pat, str, str, MATCHES(m)); } if (pos == f->start) break; @@ -378,23 +494,23 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool // walking backwards endlessly over newlines. if (back->max_matchlen == -1 && *pos == '\n') break; } - cache_destroy(&slice); + cache_destroy(&slice_cache); return NULL; } case BP_BEFORE: { - match_t *after = match(defs, f, str, pat->args.pat, ignorecase); + match_t *after = match(defs, cache, f, str, pat->args.pat, ignorecase); return after ? new_match(defs, pat, str, str, MATCHES(after)) : NULL; } case BP_CAPTURE: { - match_t *p = match(defs, f, str, pat->args.pat, ignorecase); + match_t *p = match(defs, cache, f, str, pat->args.pat, ignorecase); return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL; } case BP_OTHERWISE: { - match_t *m = match(defs, f, str, pat->args.multiple.first, ignorecase); - return m ? m : match(defs, f, str, pat->args.multiple.second, ignorecase); + match_t *m = match(defs, cache, f, str, pat->args.multiple.first, ignorecase); + return m ? m : match(defs, cache, f, str, pat->args.multiple.second, ignorecase); } case BP_CHAIN: { - match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase); + match_t *m1 = match(defs, cache, f, str, pat->args.multiple.first, ignorecase); if (m1 == NULL) return NULL; match_t *m2; @@ -408,7 +524,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool def_t *defs2 = with_def(defs, m1->pat->args.capture.namelen, m1->pat->args.capture.name, backref); ++m1->refcount; { - m2 = match(defs2, f, m1->end, pat->args.multiple.second, ignorecase); + m2 = match(defs2, cache, f, m1->end, pat->args.multiple.second, ignorecase); if (!m2) { // No need to keep the backref in memory if it didn't match for (pat_t **rem = &f->pats; *rem; rem = &(*rem)->next) { if ((*rem) == backref) { @@ -422,7 +538,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool defs = free_defs(defs2, defs); } --m1->refcount; } else { - m2 = match(defs, f, m1->end, pat->args.multiple.second, ignorecase); + m2 = match(defs, cache, f, m1->end, pat->args.multiple.second, ignorecase); } if (m2 == NULL) { @@ -433,35 +549,36 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool return new_match(defs, pat, str, m2->end, MATCHES(m1, m2)); } case BP_MATCH: case BP_NOT_MATCH: { - match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase); + match_t *m1 = match(defs, cache, f, str, pat->args.multiple.first, ignorecase); if (m1 == NULL) return NULL; // ~ matches iff the text of matches // !~ matches iff the text of does not match + cache_t slice_cache = {0}; file_t slice; slice_file(&slice, f, m1->start, m1->end); - match_t *m2 = next_match(defs, &slice, NULL, pat->args.multiple.second, NULL, ignorecase); + match_t *m2 = _next_match(defs, &slice_cache, &slice, slice.start, pat->args.multiple.second, NULL, ignorecase); if ((!m2 && pat->type == BP_MATCH) || (m2 && pat->type == BP_NOT_MATCH)) { - cache_destroy(&slice); + cache_destroy(&slice_cache); if (m2) recycle_if_unused(&m2); recycle_if_unused(&m1); return NULL; } match_t *ret = new_match(defs, pat, m1->start, m1->end, (pat->type == BP_MATCH) ? MATCHES(m1, m2) : MATCHES(m1)); - cache_destroy(&slice); + cache_destroy(&slice_cache); return ret; } case BP_REPLACE: { match_t *p = NULL; if (pat->args.replace.pat) { - p = match(defs, f, str, pat->args.replace.pat, ignorecase); + p = match(defs, cache, f, str, pat->args.replace.pat, ignorecase); if (p == NULL) return NULL; } return new_match(defs, pat, str, p ? p->end : str, MATCHES(p)); } case BP_REF: { match_t *cached; - if (cache_get(f, defs, str, pat, &cached)) + if (cache_get(cache, defs, str, pat, &cached)) return cached; def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name); @@ -490,9 +607,9 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool }; const char *prev = str; - match_t *m = match(&defs2, f, str, ref, ignorecase); + match_t *m = match(&defs2, cache, f, str, ref, ignorecase); if (m == NULL) { - cache_save(f, defs, str, pat, NULL); + cache_save(cache, defs, str, pat, NULL); return NULL; } @@ -501,7 +618,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool remove_ownership(&rec_op.args.leftrec.match); add_owner(&rec_op.args.leftrec.match, m); prev = m->end; - match_t *m2 = match(&defs2, f, str, ref, ignorecase); + match_t *m2 = match(&defs2, cache, f, str, ref, ignorecase); if (m2 == NULL) break; if (m2->end <= prev) { recycle_if_unused(&m2); @@ -516,7 +633,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool // results. // OPTIMIZE: remove this if necessary match_t *wrap = new_match(defs, pat, m->start, m->end, MATCHES(m)); - cache_save(f, defs, str, pat, wrap); + cache_save(cache, defs, str, pat, wrap); if (rec_op.args.leftrec.match) remove_ownership(&rec_op.args.leftrec.match); @@ -527,9 +644,8 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool if (*str != '\n') return NULL; const char *start = str; - size_t linenum = get_line_number(f, str); - const char *p = get_line(f, linenum); - if (p < f->start) p = f->start; // Can happen with recursive matching + const char *p = str; + while (p > f->start && p[-1] != '\n') --p; // Current indentation: char denter = *p; @@ -546,7 +662,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool return new_match(defs, pat, start, &str[dents], NULL); } case BP_ERROR: { - match_t *p = pat->args.pat ? match(defs, f, str, pat->args.pat, ignorecase) : NULL; + match_t *p = pat->args.pat ? match(defs, cache, f, str, pat->args.pat, ignorecase) : NULL; return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL; } default: { @@ -644,4 +760,32 @@ size_t free_all_matches(void) return count; } +// +// Iterate over matches. +// Usage: for (match_t *m = NULL; next_match(&m, ...); ) {...} +// +bool next_match(match_t **m, def_t *defs, file_t *f, pat_t *pat, pat_t *skip, bool ignorecase) +{ + static cache_t cache = {0}; + if (!f || !pat) { // Cleanup for stop_matching() + recycle_if_unused(m); + cache_destroy(&cache); + return false; + } + + const char *start; + if (*m) { + // Make sure forward progress is occurring, even after zero-width matches: + start = ((*m)->end > (*m)->start) ? (*m)->end : (*m)->end+1; + recycle_if_unused(m); + } else { + start = f->start; + cache_destroy(&cache); + } + + *m = (start <= f->end) ? _next_match(defs, &cache, f, start, pat, skip, ignorecase) : NULL; + if (!*m) cache_destroy(&cache); + return *m != NULL; +} + // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/match.h b/match.h index cdd6592..535e20e 100644 --- a/match.h +++ b/match.h @@ -12,12 +12,13 @@ __attribute__((returns_nonnull)) match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end, match_t *children[]); -__attribute__((nonnull(2,4))) -match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *skip, bool ignorecase); __attribute__((nonnull)) void recycle_if_unused(match_t **at_m); size_t free_all_matches(void); size_t recycle_all_matches(void); +bool next_match(match_t **m, def_t *defs, file_t *f, pat_t *pat, pat_t *skip, bool ignorecase); +#define stop_matching(m) next_match(m, NULL, NULL, NULL, NULL, 0) + #endif // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/pattern.c b/pattern.c index 7e31bfc..28a4ecf 100644 --- a/pattern.c +++ b/pattern.c @@ -16,16 +16,6 @@ __attribute__((nonnull)) static pat_t *bp_pattern_nl(file_t *f, const char *str, bool allow_nl); __attribute__((nonnull)) -static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl); -__attribute__((nonnull)) -static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl); -__attribute__((nonnull)) -static pat_t *expand_choices(file_t *f, pat_t *first, bool allow_nl); -__attribute__((nonnull)) -static pat_t *_bp_simplepattern(file_t *f, const char *str); -__attribute__((nonnull(1,2,3,6))) -static pat_t *new_range(file_t *f, const char *start, const char *end, size_t min, ssize_t max, pat_t *repeating, pat_t *sep); -__attribute__((nonnull(1,2))) static pat_t *bp_simplepattern(file_t *f, const char *str); // @@ -52,6 +42,7 @@ pat_t *new_pat(file_t *f, const char *start, const char *end, size_t minlen, ssi // // Helper function to initialize a range object. // +__attribute__((nonnull(1,2,3,6))) static pat_t *new_range(file_t *f, const char *start, const char *end, size_t min, ssize_t max, pat_t *repeating, pat_t *sep) { size_t minlen = min*repeating->min_matchlen + (min > 0 ? min-1 : 0)*(sep ? sep->min_matchlen : 0); @@ -69,6 +60,7 @@ static pat_t *new_range(file_t *f, const char *start, const char *end, size_t mi // Take a pattern and expand it into a chain of patterns if it's followed by // any patterns (e.g. "`x `y"), otherwise return the original input. // +__attribute__((nonnull)) static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl) { const char *str = after_spaces(first->end, allow_nl); @@ -84,6 +76,7 @@ static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl) // // Match trailing => replacements (with optional pattern beforehand) // +__attribute__((nonnull)) static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl) { const char *str = replace_pat->end; @@ -94,12 +87,12 @@ static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl) || matchchar(&str, '{', allow_nl) || matchchar(&str, '\002', allow_nl)) { char closequote = str[-1] == '{' ? '}' : (str[-1] == '\002' ? '\003' : str[-1]); repstr = str; - for (; *str && *str != closequote; str = next_char(f, str)) { + for (; str < f->end && *str != closequote; str = next_char(str, f->end)) { if (*str == '\\') { if (!str[1] || str[1] == '\n') file_err(f, str, str+1, "There should be an escape sequence after this backslash."); - str = next_char(f, str); + str = next_char(str, f->end); } } replen = (size_t)(str-repstr); @@ -124,6 +117,7 @@ static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl) // chain of choices if it's followed by any "/"-separated patterns (e.g. // "`x/`y"), otherwise return the original input. // +__attribute__((nonnull)) static pat_t *expand_choices(file_t *f, pat_t *first, bool allow_nl) { first = expand_chain(f, first, allow_nl); @@ -191,54 +185,23 @@ pat_t *either_pat(file_t *f, pat_t *first, pat_t *second) return either; } -// -// Wrapper for _bp_simplepattern() that expands any postfix operators (~, !~) -// -static pat_t *bp_simplepattern(file_t *f, const char *str) -{ - pat_t *pat = _bp_simplepattern(f, str); - if (pat == NULL) return pat; - str = pat->end; - - // Expand postfix operators (if any) - while (str < f->end) { - enum pattype_e type; - if (matchchar(&str, '~', false)) - type = BP_MATCH; - else if (matchstr(&str, "!~", false)) - type = BP_NOT_MATCH; - else break; - - pat_t *first = pat; - pat_t *second = bp_simplepattern(f, str); - if (!second) - file_err(f, str, str, "The '%s' operator expects a pattern before and after.", type == BP_MATCH ? "~" : "!~"); - - pat = new_pat(f, str, second->end, first->min_matchlen, first->max_matchlen, type); - pat->args.multiple.first = first; - pat->args.multiple.second = second; - str = pat->end; - } - - return pat; -} - // // Compile a string of BP code into a BP pattern object. // +__attribute__((nonnull)) static pat_t *_bp_simplepattern(file_t *f, const char *str) { str = after_spaces(str, false); if (!*str) return NULL; const char *start = str; char c = *str; - str = next_char(f, str); + str = next_char(str, f->end); switch (c) { // Any char (dot) case '.': { if (*str == '.') { // ".." pat_t *skip = NULL; - str = next_char(f, str); + str = next_char(str, f->end); char skipper = *str; if (matchchar(&str, '%', false) || matchchar(&str, '=', false)) { skip = bp_simplepattern(f, str); @@ -261,11 +224,11 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str) file_err(f, str, str, "There should be a character here after the '`'"); const char *c1_loc = str; - str = next_char(f, c1_loc); + str = next_char(c1_loc, f->end); if (*str == '-') { // Range const char *c2_loc = ++str; - if (next_char(f, c1_loc) > c1_loc+1 || next_char(f, c2_loc) > c2_loc+1) - file_err(f, start, next_char(f, c2_loc), "Sorry, UTF-8 character ranges are not yet supported."); + if (next_char(c1_loc, f->end) > c1_loc+1 || next_char(c2_loc, f->end) > c2_loc+1) + file_err(f, start, next_char(c2_loc, f->end), "Sorry, UTF-8 character ranges are not yet supported."); char c1 = *c1_loc, c2 = *c2_loc; if (!c2 || c2 == '\n') file_err(f, str, str, "There should be a character here to complete the character range."); @@ -274,7 +237,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str) c1 = c2; c2 = tmp; } - str = next_char(f, c2_loc); + str = next_char(c2_loc, f->end); pat_t *pat = new_pat(f, start == c1_loc - 1 ? start : c1_loc, str, 1, 1, BP_RANGE); pat->args.range.low = (unsigned char)c1; pat->args.range.high = (unsigned char)c2; @@ -318,8 +281,8 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str) unsigned char e_high = e_low; if (*str == '-') { // Escape range (e.g. \x00-\xFF) ++str; - if (next_char(f, str) != str+1) - file_err(f, start, next_char(f, str), "Sorry, UTF8 escape sequences are not supported in ranges."); + if (next_char(str, f->end) != str+1) + file_err(f, start, next_char(str, f->end), "Sorry, UTF8 escape sequences are not supported in ranges."); const char *seqstart = str; e_high = (unsigned char)unescapechar(str, &str); if (str == seqstart) @@ -331,7 +294,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str) esc->args.range.low = e_low; esc->args.range.high = e_high; all = either_pat(f, all, esc); - } while (*str++ == ','); + } while (*str == ',' && str++ < f->end); return all; } @@ -344,9 +307,9 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str) char endquote = c == '\002' ? '\003' : (c == '{' ? '}' : c); char *litstart = (char*)str; while (str < f->end && *str != endquote) - str = next_char(f, str); + str = next_char(str, f->end); size_t len = (size_t)(str - litstart); - str = next_char(f, str); + str = next_char(str, f->end); pat_t *pat = new_pat(f, start, str, len, (ssize_t)len, BP_STRING); pat->args.string = litstart; @@ -528,10 +491,10 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str) pat_t *bp_stringpattern(file_t *f, const char *str) { pat_t *ret = NULL; - while (*str) { + while (str < f->end) { char *start = (char*)str; pat_t *interp = NULL; - for (; str < f->end; str = next_char(f, str)) { + for (; str < f->end; str = next_char(str, f->end)) { if (*str == '\\' && str+1 < f->end) { if (str[1] == '\\' || isalnum(str[1])) interp = bp_simplepattern(f, str); @@ -558,6 +521,38 @@ pat_t *bp_stringpattern(file_t *f, const char *str) return ret; } +// +// Wrapper for _bp_simplepattern() that expands any postfix operators (~, !~) +// +static pat_t *bp_simplepattern(file_t *f, const char *str) +{ + pat_t *pat = _bp_simplepattern(f, str); + if (pat == NULL) return pat; + str = pat->end; + + // Expand postfix operators (if any) + while (str < f->end) { + enum pattype_e type; + if (matchchar(&str, '~', false)) + type = BP_MATCH; + else if (matchstr(&str, "!~", false)) + type = BP_NOT_MATCH; + else break; + + pat_t *first = pat; + pat_t *second = bp_simplepattern(f, str); + if (!second) + file_err(f, str, str, "The '%s' operator expects a pattern before and after.", type == BP_MATCH ? "~" : "!~"); + + pat = new_pat(f, str, second->end, first->min_matchlen, first->max_matchlen, type); + pat->args.multiple.first = first; + pat->args.multiple.second = second; + str = pat->end; + } + + return pat; +} + // // Given a pattern and a replacement string, compile the two into a BP // replace pattern. @@ -567,7 +562,7 @@ pat_t *bp_replacement(file_t *f, pat_t *replacepat, const char *replacement) pat_t *pat = new_pat(f, replacepat->start, replacepat->end, replacepat->min_matchlen, replacepat->max_matchlen, BP_REPLACE); pat->args.replace.pat = replacepat; const char *p = replacement; - for (; *p; p++) { + for (; p < f->end; p++) { if (*p == '\\') { if (!p[1] || p[1] == '\n') file_err(f, p, p, "There should be an escape sequence or pattern here after this backslash."); diff --git a/pattern.h b/pattern.h index 39aba63..b903d5b 100644 --- a/pattern.h +++ b/pattern.h @@ -9,7 +9,7 @@ __attribute__((returns_nonnull, nonnull(1,2))) pat_t *new_pat(file_t *f, const char *start, const char *end, size_t minlen, ssize_t maxlen, enum pattype_e type); -__attribute__((nonnull(1,2))) +__attribute__((nonnull)) pat_t *bp_stringpattern(file_t *f, const char *str); __attribute__((nonnull(1,2))) pat_t *bp_replacement(file_t *f, pat_t *replacepat, const char *replacement); diff --git a/utf8.c b/utf8.c index 6180ffe..08e8932 100644 --- a/utf8.c +++ b/utf8.c @@ -3,8 +3,9 @@ // #include #include +#include +#include -#include "files.h" #include "utf8.h" #define ARRAY_LEN(a) (sizeof(a)/sizeof((a)[0])) @@ -181,39 +182,39 @@ static const uint32_t XID_Continue_only[][2] = { // Return the location of the next character or UTF8 codepoint. // (i.e. skip forward one codepoint at a time, not one byte at a time) // -const char *next_char(file_t *f, const char *str) +const char *next_char(const char *str, const char *end) { - if (likely(str+1 <= f->end) && likely((str[0] & 0x80) == 0x0)) + if (likely(str+1 <= end) && likely((str[0] & 0x80) == 0x0)) return str+1; - if (likely(str+2 <= f->end) && (str[0] & 0xe0) == 0xc0) + if (likely(str+2 <= end) && (str[0] & 0xe0) == 0xc0) return str+2; - if (likely(str+3 <= f->end) && (str[0] & 0xf0) == 0xe0) + if (likely(str+3 <= end) && (str[0] & 0xf0) == 0xe0) return str+3; - if (likely(str+4 <= f->end) && (str[0] & 0xf8) == 0xf0) + if (likely(str+4 <= end) && (str[0] & 0xf8) == 0xf0) return str+4; - return likely(str+1 <= f->end) ? str+1 : f->end; + return likely(str+1 <= end) ? str+1 : end; } // // Return the location of the previous character or UTF8 codepoint. // (i.e. skip backwards one codepoint at a time, not one byte at a time) // -const char *prev_char(file_t *f, const char *str) +const char *prev_char(const char *start, const char *str) { - if (likely(str-1 >= f->start) && likely((str[-1] & 0x80) == 0x0)) + if (likely(str-1 >= start) && likely((str[-1] & 0x80) == 0x0)) return str-1; - if (likely(str-2 >= f->start) && (str[-2] & 0xe0) == 0xc0) + if (likely(str-2 >= start) && (str[-2] & 0xe0) == 0xc0) return str-2; - if (likely(str-3 >= f->start) && (str[-3] & 0xf0) == 0xe0) + if (likely(str-3 >= start) && (str[-3] & 0xf0) == 0xe0) return str-3; - if (likely(str-4 >= f->start) && (str[-4] & 0xf8) == 0xf0) + if (likely(str-4 >= start) && (str[-4] & 0xf8) == 0xf0) return str-4; - return likely(str-1 >= f->start) ? str-1 : f->start; + return likely(str-1 >= start) ? str-1 : start; } -static uint32_t get_codepoint(file_t *f, const char *str) +static uint32_t get_codepoint(const char *str, const char *end) { - if (str >= f->end) + if (unlikely(str >= end)) return (uint32_t)-1; unsigned char c1 = (unsigned char)str[0]; @@ -235,7 +236,7 @@ static uint32_t get_codepoint(file_t *f, const char *str) } for (int i = 1; i < seqlen; ++i) { - if (unlikely(&str[i] >= f->end || (str[i] & 0xC0) != 0x80)) + if (unlikely((&str[i] >= end) || (str[i] & 0xC0) != 0x80)) return (uint32_t)-1; codepoint = ((codepoint << 6) | (uint32_t)(str[i] & 0x3F)); } @@ -259,22 +260,22 @@ static bool find_in_ranges(uint32_t codepoint, const uint32_t ranges[][2], size_ return false; } -bool isidstart(file_t *f, const char *str) +bool isidstart(const char *str, const char *end) { - if (unlikely(str >= f->end)) return false; + if (unlikely(str >= end)) return false; else if (isalpha(*str) || *str == '_') return true; else if (likely((*str & 0x80) == 0)) return false; - uint32_t codepoint = get_codepoint(f, str); + uint32_t codepoint = get_codepoint(str, end); return codepoint != (uint32_t)-1 && find_in_ranges(codepoint, XID_Start, ARRAY_LEN(XID_Start)); } -bool isidcontinue(file_t *f, const char *str) +bool isidcontinue(const char *str, const char *end) { - if (unlikely(str >= f->end)) return false; + if (unlikely(str >= end)) return false; else if (isalnum(*str) || *str == '_') return true; else if (likely((*str & 0x80) == 0)) return false; - uint32_t codepoint = get_codepoint(f, str); + uint32_t codepoint = get_codepoint(str, end); return codepoint != (uint32_t)-1 && (find_in_ranges(codepoint, XID_Start, ARRAY_LEN(XID_Start)) || find_in_ranges(codepoint, XID_Continue_only, ARRAY_LEN(XID_Continue_only))); diff --git a/utf8.h b/utf8.h index 97e259e..243acd3 100644 --- a/utf8.h +++ b/utf8.h @@ -1,21 +1,21 @@ // // utf8.h - UTF8 helper functions // -#include "files.h" - #ifndef UTF8__H #define UTF8__H +#include + #define UTF8_MAXCHARLEN 4 __attribute__((nonnull, pure)) -const char *next_char(file_t *f, const char *str); +const char *next_char(const char *str, const char *end); __attribute__((nonnull, pure)) -const char *prev_char(file_t *f, const char *str); +const char *prev_char(const char *start, const char *str); __attribute__((nonnull, pure)) -bool isidstart(file_t *f, const char *str); +bool isidstart(const char *str, const char *end); __attribute__((nonnull, pure)) -bool isidcontinue(file_t *f, const char *str); +bool isidcontinue(const char *str, const char *end); #endif // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0