Moving cache logic into match, cleaner next_match() API, and slightly

less tightly coupled UTF8 API
2021-09-21 18:45:43 -07:00 · 2021-09-21 18:45:43 -07:00 · 90c3c13a02
commit 90c3c13a02
parent 9401facbe7
9 changed files with 302 additions and 306 deletions
--- a/bp.c
+++ b/bp.c
@ -169,17 +169,15 @@ static int is_text_file(const char *filename)
 //
 static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern)
 {
-    static int matches = 0;
-    match_t *m = NULL;
-    while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) {
-        if (++matches > 1)
+    // Objects printed over the whole run, not just this file. It has to outlive
+    // a single call because main() opens one "[" for all files, so an object
+    // needs a leading comma whenever *any* earlier file already printed one.
+    static int total_printed = 0;
+    int nmatches = 0; // Matches found in this file only; this is the return value
+    for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ++nmatches) {
+        if (++total_printed > 1)
            printf(",\n");
        printf("{\"filename\":\"%s\",\"match\":", f->filename);
        json_match(f->start, m, options.verbose);
        printf("}");
    }
-    if (m) recycle_if_unused(&m);
-    return matches;
+    return nmatches;
 }

 //
@ -187,18 +185,16 @@ static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern)
 //
 static int explain_matches(def_t *defs, file_t *f, pat_t *pattern)
 {
-    int matches = 0;
-    match_t *m = NULL;
-    while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) {
-        if (++matches == 1) {
+    int nmatches = 0; // Number of matches explained so far in this file
+    for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) { // Runs until next_match() reports no more matches
+        if (++nmatches == 1) { // First match: print the filename once (if enabled); later matches get a blank-line separator instead
            if (options.print_filenames)
                fprint_filename(stdout, f->filename);
        } else
            printf("\n\n");
        explain_match(m);
    }
-    if (m) recycle_if_unused(&m);
-    return matches;
+    return nmatches; // Per-file count
 }

 //
@ -243,8 +239,7 @@ static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern)
        .lineformat = LINE_FORMATS[options.format],
    };

-    match_t *m = NULL;
-    while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) {
+    for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) {
        if (print_errors(f, m) > 0)
            exit(EXIT_FAILURE);

@ -254,7 +249,6 @@ static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern)
        }
        print_match(out, &pr, m);
    }
-    if (m) recycle_if_unused(&m);

    if (matches > 0 || (f->filename[0] == '\0' && options.context_before == ALL_CONTEXT)) {
        // Print trailing context lines:
@ -281,18 +275,19 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern)
    if (options.mode == MODE_EXPLAIN) {
        matches += explain_matches(defs, f, pattern);
    } else if (options.mode == MODE_LISTFILES) {
-        match_t *m = next_match(defs, f, NULL, pattern, options.skip, options.ignorecase);
-        if (m) {
-            recycle_if_unused(&m);
+        match_t *m = NULL;
+        if (next_match(&m, defs, f, pattern, options.skip, options.ignorecase)) {
            printf("%s\n", f->filename);
            matches += 1;
        }
+        stop_matching(&m);
    } else if (options.mode == MODE_JSON) {
        matches += print_matches_as_json(defs, f, pattern);
    } else if (options.mode == MODE_INPLACE) {
-        match_t *m = next_match(defs, f, NULL, pattern, options.skip, options.ignorecase);
-        if (m) recycle_if_unused(&m);
-        else return 0;
+        match_t *m = NULL;
+        bool found = next_match(&m, defs, f, pattern, options.skip, options.ignorecase);
+        stop_matching(&m);
+        if (!found) return 0;

        // Ensure the file is resident in memory:
        if (f->mmapped) {
@ -315,7 +310,6 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern)
    }
    fflush(stdout);

-    cache_destroy(f);
    if (recycle_all_matches() != 0)
        fprintf(stderr, "\033[33;1mMemory leak: there should no longer be any matches in use at this point.\033[m\n");
    destroy_file(&f);
@ -480,10 +474,10 @@ int main(int argc, char *argv[])
            file_t *arg_file = spoof_file(&loaded_files, "<skip argument>", flag, -1);
            pat_t *s = bp_pattern(arg_file, arg_file->start);
            if (!s) {
-                fprint_line(stdout, arg_file, arg_file->start, arg_file->end,
+                file_err(arg_file, arg_file->start, arg_file->end,
                            "Failed to compile the skip argument");
            } else if (after_spaces(s->end, true) < arg_file->end) {
-                fprint_line(stdout, arg_file, s->end, arg_file->end,
+                file_err(arg_file, s->end, arg_file->end,
                            "Failed to compile part of the skip argument");
            }
            options.skip = either_pat(arg_file, options.skip, s);
@ -537,10 +531,6 @@ int main(int argc, char *argv[])
    // Handle exit() calls gracefully:
    require(atexit(&cleanup), "Failed to set cleanup handler at exit");

-    // No need for these caches anymore:
-    for (file_t *f = loaded_files; f; f = f->next)
-        cache_destroy(f);
-
    int found = 0;
    if (options.mode == MODE_JSON) printf("[");
    if (options.git_mode) { // Get the list of files from `git --ls-files ...`
--- a/files.c
+++ b/files.c
@ -182,8 +182,6 @@ void destroy_file(file_t **at_f)
        f->mmapped = NULL;
    }

-    cache_destroy(f);
-
    for (pat_t *next; f->pats; f->pats = next) {
        next = f->pats->next;
        delete(&f->pats);
@ -261,126 +259,4 @@ void fprint_line(FILE *dest, file_t *f, const char *start, const char *end, cons
    fprintf(dest, "\033[m\n");
 }

-//
-// Hash a string position/pattern.
-//
-static inline size_t hash(const char *str, pat_t *pat)
-{
-    return (size_t)str + 2*pat->id;
-}
-
-//
-// Check if we have memoized a pattern match at the given position for the
-// given definitions. If a result has been memoized, set *result to the
-// memoized value and return true, otherwise return false.
-//
-bool cache_get(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t **result)
-{
-    if (!f->cache.matches) return NULL;
-    size_t h = hash(str, pat) & (f->cache.size-1);
-    for (match_t *c = f->cache.matches[h]; c; c = c->cache.next) {
-        if (c->pat == pat && c->defs_id == (defs?defs->id:0) && c->start == str) {
-            // If c->end == NULL, that means no match occurs here
-            *result = c->end == NULL ? NULL : c;
-            return true;
-        }
-    }
-    return false;
-}
-
-//
-// Remove an item from the cache.
-//
-static void cache_remove(file_t *f, match_t *m)
-{
-    if (!m->cache.home) return;
-    *m->cache.home = m->cache.next;
-    if (m->cache.next) m->cache.next->cache.home = m->cache.home;
-    m->cache.next = NULL;
-    m->cache.home = NULL;
-    if (--m->refcount == 0) recycle_if_unused(&m);
-    --f->cache.occupancy;
-}
-
-//
-// Save a match in the cache.
-//
-void cache_save(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t *m)
-{
-    // As a convention, a match with {.pat=pat, .start=str, .end==NULL} is used
-    // to memoize the fact that `pat` will *not* match at `str`.
-    if (m == NULL) m = new_match(defs, pat, str, NULL, NULL);
-
-    if (f->cache.occupancy+1 > 3*f->cache.size) {
-        if (f->cache.size == MAX_CACHE_SIZE) {
-            size_t h = hash(m->start, m->pat) & (f->cache.size-1);
-            for (int quota = 2; f->cache.matches[h] && quota > 0; quota--) {
-                match_t *last = f->cache.matches[h];
-                while (last->cache.next) last = last->cache.next;
-                cache_remove(f, last);
-            }
-        } else {
-            match_t **old_matches = f->cache.matches;
-            size_t old_size = f->cache.size;
-            f->cache.size = old_size == 0 ? 16 : 2*old_size;
-            f->cache.matches = new(match_t*[f->cache.size]);
-
-            // Rehash:
-            if (old_matches) {
-                for (size_t i = 0; i < old_size; i++) {
-                    for (match_t *o; (o = old_matches[i]); ) {
-                        *o->cache.home = o->cache.next;
-                        if (o->cache.next) o->cache.next->cache.home = o->cache.home;
-                        size_t h = hash(o->start, o->pat) & (f->cache.size-1);
-                        o->cache.home = &(f->cache.matches[h]);
-                        o->cache.next = f->cache.matches[h];
-                        if (f->cache.matches[h]) f->cache.matches[h]->cache.home = &o->cache.next;
-                        f->cache.matches[h] = o;
-                    }
-                }
-                free(old_matches);
-            }
-        }
-    }
-
-    size_t h = hash(m->start, m->pat) & (f->cache.size-1);
-    m->cache.home = &(f->cache.matches[h]);
-    m->cache.next = f->cache.matches[h];
-    if (f->cache.matches[h]) f->cache.matches[h]->cache.home = &m->cache.next;
-    f->cache.matches[h] = m;
-    ++m->refcount;
-    ++f->cache.occupancy;
-}
-
-//
-// Remove all items from the cache that do not overlap `start` and `end`.
-// (This is used to remove useless items from the cache)
-//
-void cache_prune(file_t *f, const char *start, const char *end)
-{
-    if (!f->cache.matches) return;
-    for (size_t i = 0; i < f->cache.size; i++) {
-        for (match_t *m = f->cache.matches[i], *next = NULL; m; m = next) {
-            next = m->cache.next;
-            if (m->start < start || (m->end ? m->end : m->start) > end)
-                cache_remove(f, m);
-        }
-    }
-}
-
-//
-// Clear and deallocate the cache.
-//
-void cache_destroy(file_t *f)
-{
-    if (!f->cache.matches) return;
-    for (size_t i = 0; i < f->cache.size; i++) {
-        while (f->cache.matches[i])
-            cache_remove(f, f->cache.matches[i]);
-    }
-    f->cache.occupancy = 0;
-    delete(&f->cache.matches);
-    f->cache.size = 0;
-}
-
 // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
--- a/files.h
+++ b/files.h
@ -6,14 +6,11 @@

 #include "types.h"

-#include <stdbool.h>
 #include <stdio.h>
 #include <unistd.h>

 #define file_err(f, ...) do { fprint_line(stderr, f, __VA_ARGS__); exit(EXIT_FAILURE); } while(false)

-#define MAX_CACHE_SIZE (1<<14)
-
 typedef struct file_s {
    struct file_s *next;
    const char *filename;
@ -43,14 +40,6 @@ __attribute__((pure, nonnull))
 const char *get_line(file_t *f, size_t line_number);
 __attribute__((nonnull(1,2,3), format(printf,5,6)))
 void fprint_line(FILE *dest, file_t *f, const char *start, const char *end, const char *fmt, ...);
-__attribute__((nonnull(1,3,4,5)))
-bool cache_get(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t **result);
-__attribute__((nonnull(1,3,4)))
-void cache_save(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t *m);
-__attribute__((nonnull))
-void cache_prune(file_t *f, const char *start, const char *end);
-__attribute__((nonnull))
-void cache_destroy(file_t *f);

 #endif
 // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
--- a/match.c
+++ b/match.c
@ -16,6 +16,13 @@
 #include "utils.h"
 #include "utf8.h"

+#define MAX_CACHE_SIZE (1<<14)
+
+typedef struct {
+    size_t size, occupancy; // size: number of buckets (0 until first use, then 16 and doubling up to MAX_CACHE_SIZE); occupancy: number of stored entries
+    match_t **matches; // Bucket array; each slot heads a chain linked through the entries' cache.next (NULL until first use)
+} cache_t;
+
 // New match objects are either recycled from unused match objects or allocated
 // from the heap. While it is in use, the match object is stored in the
 // `in_use_matches` linked list. Once it is no longer needed, it is moved to
@ -27,10 +34,8 @@ static match_t *in_use_matches = NULL;

 #define MATCHES(...) (match_t*[]){__VA_ARGS__, NULL}

-__attribute__((nonnull(1)))
-static inline pat_t *deref(def_t *defs, pat_t *pat);
-__attribute__((hot, nonnull(2,3,4)))
-static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ignorecase);
+__attribute__((hot, nonnull(2,3,4,5)))
+static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, bool ignorecase);

 // Store a value and update its refcount
 static inline void add_owner(match_t** owner, match_t* owned)
@ -80,10 +85,117 @@ static inline void list_remove(match_t *m, match_dll_t *node)
    node->next = NULL;
 }

+//
+// Mix a text position and a pattern's ID into a hash value.
+//
+static inline size_t hash(const char *str, pat_t *pat)
+{
+    const size_t position = (size_t)str;
+    return position + 2*pat->id;
+}
+
+//
+// Check if we have memoized a pattern match at the given position for the
+// given definitions (a NULL `defs` is looked up as definitions ID 0).
+// If a result has been memoized, set *result to the memoized value and return
+// true: *result is the cached match, or NULL when what was memoized is that
+// `pat` does *not* match at `str`. Otherwise return false and leave *result
+// untouched.
+//
+static bool cache_get(cache_t *cache, def_t *defs, const char *str, pat_t *pat, match_t **result)
+{
+    if (!cache->matches) return false; // No bucket table yet, so nothing can be memoized
+    size_t h = hash(str, pat) & (cache->size-1);
+    for (match_t *c = cache->matches[h]; c; c = c->cache.next) {
+        if (c->pat == pat && c->defs_id == (defs?defs->id:0) && c->start == str) {
+            // If c->end == NULL, that means no match occurs here
+            *result = c->end == NULL ? NULL : c;
+            return true;
+        }
+    }
+    return false;
+}
+
+//
+// Remove an item from the cache: unlink it from its bucket chain, release the cache's reference to it, and decrement the occupancy.
+//
+static void cache_remove(cache_t *cache, match_t *m)
+{
+    if (!m->cache.home) return; // Not linked into any bucket, so there is nothing to remove
+    *m->cache.home = m->cache.next; // Make whatever pointed at `m` (bucket slot or previous entry) point past it
+    if (m->cache.next) m->cache.next->cache.home = m->cache.home;
+    m->cache.next = NULL;
+    m->cache.home = NULL;
+    if (--m->refcount == 0) recycle_if_unused(&m); // Drops the reference that cache_save() added
+    --cache->occupancy;
+}
+
+//
+// Save a match in the cache, growing the table or evicting old entries when it gets too full. The cache takes a reference to the saved match.
+//
+static void cache_save(cache_t *cache, def_t *defs, const char *str, pat_t *pat, match_t *m)
+{
+    // As a convention, a match with {.pat=pat, .start=str, .end==NULL} is used
+    // to memoize the fact that `pat` will *not* match at `str`.
+    if (m == NULL) m = new_match(defs, pat, str, NULL, NULL);
+
+    if (cache->occupancy+1 > 3*cache->size) { // Adding one more would exceed an average of 3 entries per bucket (always true for an empty, size-0 cache)
+        if (cache->size == MAX_CACHE_SIZE) { // The table may not grow any further, so make room by evicting
+            size_t h = hash(m->start, m->pat) & (cache->size-1);
+            for (int quota = 2; cache->matches[h] && quota > 0; quota--) { // Evict up to 2 entries from the bucket `m` will land in
+                match_t *last = cache->matches[h];
+                while (last->cache.next) last = last->cache.next; // Walk to the tail; entries are pushed at the head, so the tail is the oldest
+                cache_remove(cache, last);
+            }
+        } else {
+            match_t **old_matches = cache->matches;
+            size_t old_size = cache->size;
+            cache->size = old_size == 0 ? 16 : 2*old_size; // Sizes stay powers of two, which the `& (size-1)` bucket masking relies on
+            cache->matches = new(match_t*[cache->size]); // NOTE(review): the code below reads these slots before writing them, so new() presumably zero-fills -- confirm
+
+            // Rehash:
+            if (old_matches) {
+                for (size_t i = 0; i < old_size; i++) {
+                    for (match_t *o; (o = old_matches[i]); ) { // Keep popping the head of the old bucket until it is empty
+                        *o->cache.home = o->cache.next; // Unlink `o` from the old chain
+                        if (o->cache.next) o->cache.next->cache.home = o->cache.home;
+                        size_t h = hash(o->start, o->pat) & (cache->size-1);
+                        o->cache.home = &(cache->matches[h]); // Push `o` onto the head of its new bucket
+                        o->cache.next = cache->matches[h];
+                        if (cache->matches[h]) cache->matches[h]->cache.home = &o->cache.next;
+                        cache->matches[h] = o;
+                    }
+                }
+                free(old_matches);
+            }
+        }
+    }
+
+    size_t h = hash(m->start, m->pat) & (cache->size-1);
+    m->cache.home = &(cache->matches[h]); // `home` is the address of the pointer that points at this entry
+    m->cache.next = cache->matches[h];
+    if (cache->matches[h]) cache->matches[h]->cache.home = &m->cache.next;
+    cache->matches[h] = m;
+    ++m->refcount; // The cache's own reference; cache_remove() gives it back
+    ++cache->occupancy;
+}
+
+//
+// Clear and deallocate the cache: every entry is removed (releasing the
+// reference the cache held on it) and the bucket table is freed. Calling this
+// on a cache with no table is a no-op. Afterwards the size and occupancy are
+// 0, so the cache can be reused; cache_save() allocates a new table on the
+// next insertion.
+// This is file-local because cache_t is private to this file.
+//
+static void cache_destroy(cache_t *cache)
+{
+    if (!cache->matches) return;
+    for (size_t i = 0; i < cache->size; i++) {
+        // cache_remove() unlinks the head each time, so this drains the bucket
+        while (cache->matches[i])
+            cache_remove(cache, cache->matches[i]);
+    }
+    cache->occupancy = 0;
+    delete(&cache->matches); // Presumably frees the table and resets the pointer -- confirm against delete()'s definition
+    cache->size = 0;
+}
+
 //
 // If the given pattern is a reference, look it up and return the referenced
 // pattern. This is used for an optimization to avoid repeated lookups.
 //
+__attribute__((nonnull(1)))
 static inline pat_t *deref(def_t *defs, pat_t *pat)
 {
    if (pat && pat->type == BP_REF) {
@ -128,15 +240,18 @@ static pat_t *first_pat(def_t *defs, pat_t *pat)
 //
 // Find the next match after prev (or the first match if prev is NULL)
 //
-match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *skip, bool ignorecase)
+__attribute__((nonnull(3,5)))
+static match_t *_next_match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, pat_t *skip, bool ignorecase)
 {
-    const char *str;
-    if (prev) {
-        str = prev->end > prev->start ? prev->end : prev->end + 1;
-        if (prev->refcount == 0) recycle_if_unused(&prev);
-        cache_prune(f, str, f->end);
-    } else {
-        str = f->start;
+    // Prune the unnecessary entries from the cache (those not between start/end)
+    if (cache->matches) {
+        for (size_t i = 0; i < cache->size; i++) {
+            for (match_t *m = cache->matches[i], *next = NULL; m; m = next) {
+                next = m->cache.next;
+                if (m->start < f->start || (m->end ? m->end : m->start) > f->end)
+                    cache_remove(cache, m);
+            }
+        }
    }

    pat = deref(defs, pat);
@ -162,14 +277,14 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk
    if (str > f->end) return NULL;

    do {
-        match_t *m = match(defs, f, str, pat, ignorecase);
+        match_t *m = match(defs, cache, f, str, pat, ignorecase);
        if (m) return m;
        if (first->type == BP_START_OF_FILE) return NULL;
        match_t *s;
-        if (skip && (s = match(defs, f, str, skip, ignorecase))) {
+        if (skip && (s = match(defs, cache, f, str, skip, ignorecase))) {
            str = s->end > str ? s->end : str + 1;
            recycle_if_unused(&s);
-        } else str = next_char(f, str);
+        } else str = next_char(str, f->end);
    } while (str < f->end);
    return NULL;
 }
@ -179,12 +294,12 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk
 // match object, or NULL if no match is found.
 // The returned value should be free()'d to avoid memory leaking.
 //
-static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ignorecase)
+static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, bool ignorecase)
 {
    switch (pat->type) {
    case BP_DEFINITION: {
        def_t *defs2 = with_def(defs, pat->args.def.namelen, pat->args.def.name, pat->args.def.def);
-        match_t *m = match(defs2, f, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def, ignorecase);
+        match_t *m = match(defs2, cache, f, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def, ignorecase);
        defs = free_defs(defs2, defs);
        return m;
    }
@ -198,17 +313,17 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
            ++pat->args.leftrec.visits;
            return pat->args.leftrec.match;
        } else {
-            return match(defs, f, str, pat->args.leftrec.fallback, ignorecase);
+            return match(defs, cache, f, str, pat->args.leftrec.fallback, ignorecase);
        }
    }
    case BP_ANYCHAR: {
-        return (str < f->end && *str != '\n') ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL;
+        return (str < f->end && *str != '\n') ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL;
    }
    case BP_ID_START: {
-        return (str < f->end && isidstart(f, str)) ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL;
+        return (str < f->end && isidstart(str, f->end)) ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL;
    }
    case BP_ID_CONTINUE: {
-        return (str < f->end && isidcontinue(f, str)) ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL;
+        return (str < f->end && isidcontinue(str, f->end)) ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL;
    }
    case BP_START_OF_FILE: {
        return (str == f->start) ? new_match(defs, pat, str, str, NULL) : NULL;
@ -223,7 +338,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
        return (str == f->end || *str == '\n') ? new_match(defs, pat, str, str, NULL) : NULL;
    }
    case BP_WORD_BOUNDARY: {
-        return (str == f->start || isidcontinue(f, str) != isidcontinue(f, prev_char(f, str))) ? new_match(defs, pat, str, str, NULL) : NULL;
+        return (str == f->start || isidcontinue(str, f->end) != isidcontinue(prev_char(f->start, str), f->end)) ? new_match(defs, pat, str, str, NULL) : NULL;
    }
    case BP_STRING: {
        if (&str[pat->min_matchlen] > f->end) return NULL;
@ -238,7 +353,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
        return new_match(defs, pat, str, str+1, NULL);
    }
    case BP_NOT: {
-        match_t *m = match(defs, f, str, pat->args.pat, ignorecase);
+        match_t *m = match(defs, cache, f, str, pat->args.pat, ignorecase);
        if (m != NULL) {
            recycle_if_unused(&m);
            return NULL;
@ -259,7 +374,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
        for (const char *prev = NULL; prev < str; ) {
            prev = str;
            if (target) {
-                match_t *p = match(defs, f, str, target, ignorecase);
+                match_t *p = match(defs, cache, f, str, target, ignorecase);
                if (p != NULL) {
                    recycle_if_unused(&p);
                    m->end = str;
@ -270,7 +385,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
                return m;
            }
            if (skip) {
-                match_t *s = match(defs, f, str, skip, ignorecase);
+                match_t *s = match(defs, cache, f, str, skip, ignorecase);
                if (s != NULL) {
                    str = s->end;
                    if (nchildren+2 >= child_cap) {
@ -285,7 +400,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
            // be at least once chance to match the pattern, even if
            // we're at the end of the string already (e.g. "..$").
            if (str < f->end && *str != '\n' && pat->type != BP_UPTO_STRICT)
-                str = next_char(f, str);
+                str = next_char(str, f->end);
        }
        recycle_if_unused(&m);
        return NULL;
@ -302,11 +417,11 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
            // Separator
            match_t *msep = NULL;
            if (sep != NULL && reps > 0) {
-                msep = match(defs, f, str, sep, ignorecase);
+                msep = match(defs, cache, f, str, sep, ignorecase);
                if (msep == NULL) break;
                str = msep->end;
            }
-            match_t *mp = match(defs, f, str, repeating, ignorecase);
+            match_t *mp = match(defs, cache, f, str, repeating, ignorecase);
            if (mp == NULL) {
                str = start;
                if (msep) recycle_if_unused(&msep);
@ -358,19 +473,20 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
        // current pos, so mock it out as a file slice.
        // TODO: this breaks ^/^^/$/$$, but that can probably be ignored
        // because you rarely need to check those in a backtrack.
+        cache_t slice_cache = {0};
        file_t slice;
        slice_file(&slice, f, f->start, str);
        for (const char *pos = &str[-(long)back->min_matchlen];
             pos >= f->start && (back->max_matchlen == -1 || pos >= &str[-(int)back->max_matchlen]);
-             pos = prev_char(f, pos)) {
-            cache_destroy(&slice);
+             pos = prev_char(f->start, pos)) {
+            cache_destroy(&slice_cache);
            slice.start = (char*)pos;
-            match_t *m = match(defs, &slice, pos, back, ignorecase);
+            match_t *m = match(defs, &slice_cache, &slice, pos, back, ignorecase);
            // Match should not go past str (i.e. (<"AB" "B") should match "ABB", but not "AB")
            if (m && m->end != str)
                recycle_if_unused(&m);
            else if (m) {
-                cache_destroy(&slice);
+                cache_destroy(&slice_cache);
                return new_match(defs, pat, str, str, MATCHES(m));
            }
            if (pos == f->start) break;
@ -378,23 +494,23 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
            // walking backwards endlessly over newlines.
            if (back->max_matchlen == -1 && *pos == '\n') break;
        }
-        cache_destroy(&slice);
+        cache_destroy(&slice_cache);
        return NULL;
    }
    case BP_BEFORE: {
-        match_t *after = match(defs, f, str, pat->args.pat, ignorecase);
+        match_t *after = match(defs, cache, f, str, pat->args.pat, ignorecase);
        return after ? new_match(defs, pat, str, str, MATCHES(after)) : NULL;
    }
    case BP_CAPTURE: {
-        match_t *p = match(defs, f, str, pat->args.pat, ignorecase);
+        match_t *p = match(defs, cache, f, str, pat->args.pat, ignorecase);
        return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL;
    }
    case BP_OTHERWISE: {
-        match_t *m = match(defs, f, str, pat->args.multiple.first, ignorecase);
-        return m ? m : match(defs, f, str, pat->args.multiple.second, ignorecase);
+        match_t *m = match(defs, cache, f, str, pat->args.multiple.first, ignorecase);
+        return m ? m : match(defs, cache, f, str, pat->args.multiple.second, ignorecase);
    }
    case BP_CHAIN: {
-        match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase);
+        match_t *m1 = match(defs, cache, f, str, pat->args.multiple.first, ignorecase);
        if (m1 == NULL) return NULL;

        match_t *m2;
@ -408,7 +524,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool

            def_t *defs2 = with_def(defs, m1->pat->args.capture.namelen, m1->pat->args.capture.name, backref);
            ++m1->refcount; {
-                m2 = match(defs2, f, m1->end, pat->args.multiple.second, ignorecase);
+                m2 = match(defs2, cache, f, m1->end, pat->args.multiple.second, ignorecase);
                if (!m2) { // No need to keep the backref in memory if it didn't match
                    for (pat_t **rem = &f->pats; *rem; rem = &(*rem)->next) {
                        if ((*rem) == backref) {
@ -422,7 +538,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
                defs = free_defs(defs2, defs);
            } --m1->refcount;
        } else {
-            m2 = match(defs, f, m1->end, pat->args.multiple.second, ignorecase);
+            m2 = match(defs, cache, f, m1->end, pat->args.multiple.second, ignorecase);
        }

        if (m2 == NULL) {
@ -433,35 +549,36 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
        return new_match(defs, pat, str, m2->end, MATCHES(m1, m2));
    }
    case BP_MATCH: case BP_NOT_MATCH: {
-        match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase);
+        match_t *m1 = match(defs, cache, f, str, pat->args.multiple.first, ignorecase);
        if (m1 == NULL) return NULL;

        // <p1>~<p2> matches iff the text of <p1> matches <p2>
        // <p1>!~<p2> matches iff the text of <p1> does not match <p2>
+        cache_t slice_cache = {0};
        file_t slice;
        slice_file(&slice, f, m1->start, m1->end);
-        match_t *m2 = next_match(defs, &slice, NULL, pat->args.multiple.second, NULL, ignorecase);
+        match_t *m2 = _next_match(defs, &slice_cache, &slice, slice.start, pat->args.multiple.second, NULL, ignorecase);
        if ((!m2 && pat->type == BP_MATCH) || (m2 && pat->type == BP_NOT_MATCH)) {
-            cache_destroy(&slice);
+            cache_destroy(&slice_cache);
            if (m2) recycle_if_unused(&m2);
            recycle_if_unused(&m1);
            return NULL;
        }
        match_t *ret = new_match(defs, pat, m1->start, m1->end, (pat->type == BP_MATCH) ? MATCHES(m1, m2) : MATCHES(m1));
-        cache_destroy(&slice);
+        cache_destroy(&slice_cache);
        return ret;
    }
    case BP_REPLACE: {
        match_t *p = NULL;
        if (pat->args.replace.pat) {
-            p = match(defs, f, str, pat->args.replace.pat, ignorecase);
+            p = match(defs, cache, f, str, pat->args.replace.pat, ignorecase);
            if (p == NULL) return NULL;
        }
        return new_match(defs, pat, str, p ? p->end : str, MATCHES(p));
    }
    case BP_REF: {
        match_t *cached;
-        if (cache_get(f, defs, str, pat, &cached))
+        if (cache_get(cache, defs, str, pat, &cached))
            return cached;

        def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name);
@ -490,9 +607,9 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
        };

        const char *prev = str;
-        match_t *m = match(&defs2, f, str, ref, ignorecase);
+        match_t *m = match(&defs2, cache, f, str, ref, ignorecase);
        if (m == NULL) {
-            cache_save(f, defs, str, pat, NULL);
+            cache_save(cache, defs, str, pat, NULL);
            return NULL;
        }

@ -501,7 +618,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
            remove_ownership(&rec_op.args.leftrec.match);
            add_owner(&rec_op.args.leftrec.match, m);
            prev = m->end;
-            match_t *m2 = match(&defs2, f, str, ref, ignorecase);
+            match_t *m2 = match(&defs2, cache, f, str, ref, ignorecase);
            if (m2 == NULL) break;
            if (m2->end <= prev) {
                recycle_if_unused(&m2);
@ -516,7 +633,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
        // results.
        // OPTIMIZE: remove this if necessary
        match_t *wrap = new_match(defs, pat, m->start, m->end, MATCHES(m));
-        cache_save(f, defs, str, pat, wrap);
+        cache_save(cache, defs, str, pat, wrap);

        if (rec_op.args.leftrec.match)
            remove_ownership(&rec_op.args.leftrec.match);
@ -527,9 +644,8 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
        if (*str != '\n') return NULL;
        const char *start = str;

-        size_t linenum = get_line_number(f, str);
-        const char *p = get_line(f, linenum);
-        if (p < f->start) p = f->start; // Can happen with recursive matching
+        const char *p = str;
+        while (p > f->start && p[-1] != '\n') --p;

        // Current indentation:
        char denter = *p;
@ -546,7 +662,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
        return new_match(defs, pat, start, &str[dents], NULL);
    }
    case BP_ERROR: {
-        match_t *p = pat->args.pat ? match(defs, f, str, pat->args.pat, ignorecase) : NULL;
+        match_t *p = pat->args.pat ? match(defs, cache, f, str, pat->args.pat, ignorecase) : NULL;
        return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL;
    }
    default: {
@ -644,4 +760,32 @@ size_t free_all_matches(void)
    return count;
 }

+//
+// Iterate over matches: *m must be NULL on the first call, and each call replaces it with the next match (or NULL when there are none left). Returns whether a match was found.
+// Usage: for (match_t *m = NULL; next_match(&m, ...); ) {...}
+// A caller that stops before this returns false should call stop_matching(&m) so the pending match and the cache get released.
+bool next_match(match_t **m, def_t *defs, file_t *f, pat_t *pat, pat_t *skip, bool ignorecase)
+{
+    static cache_t cache = {0}; // A single cache shared by every call; interleaving two iterations would make them share (and reset) it
+    if (!f || !pat) { // Cleanup for stop_matching()
+        recycle_if_unused(m);
+        cache_destroy(&cache);
+        return false;
+    }
+
+    const char *start;
+    if (*m) {
+        // Make sure forward progress is occurring, even after zero-width matches:
+        start = ((*m)->end > (*m)->start) ? (*m)->end : (*m)->end+1;
+        recycle_if_unused(m);
+    } else {
+        start = f->start;
+        cache_destroy(&cache); // *m == NULL marks a fresh iteration: discard anything left over from an earlier one
+    }
+
+    *m = (start <= f->end) ? _next_match(defs, &cache, f, start, pat, skip, ignorecase) : NULL;
+    if (!*m) cache_destroy(&cache); // No more matches, so the iteration is over
+    return *m != NULL;
+}
+
 // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
--- a/match.h
+++ b/match.h
@ -12,12 +12,13 @@

 __attribute__((returns_nonnull))
 match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end, match_t *children[]);
-__attribute__((nonnull(2,4)))
-match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *skip, bool ignorecase);
 __attribute__((nonnull))
 void recycle_if_unused(match_t **at_m);
 size_t free_all_matches(void);
 size_t recycle_all_matches(void);

+bool next_match(match_t **m, def_t *defs, file_t *f, pat_t *pat, pat_t *skip, bool ignorecase); // Match iterator: returns true while *m holds a new match
+#define stop_matching(m) next_match(m, NULL, NULL, NULL, NULL, 0) // Abandon an iteration early: the NULL file/pattern select next_match()'s cleanup path
+
 #endif
 // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
--- a/pattern.c
+++ b/pattern.c
@ -16,16 +16,6 @@
 __attribute__((nonnull))
 static pat_t *bp_pattern_nl(file_t *f, const char *str, bool allow_nl);
 __attribute__((nonnull))
-static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl);
-__attribute__((nonnull))
-static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl);
-__attribute__((nonnull))
-static pat_t *expand_choices(file_t *f, pat_t *first, bool allow_nl);
-__attribute__((nonnull))
-static pat_t *_bp_simplepattern(file_t *f, const char *str);
-__attribute__((nonnull(1,2,3,6)))
-static pat_t *new_range(file_t *f, const char *start, const char *end, size_t min, ssize_t max, pat_t *repeating, pat_t *sep);
-__attribute__((nonnull(1,2)))
 static pat_t *bp_simplepattern(file_t *f, const char *str);

 //
@ -52,6 +42,7 @@ pat_t *new_pat(file_t *f, const char *start, const char *end, size_t minlen, ssi
 //
 // Helper function to initialize a range object.
 //
+__attribute__((nonnull(1,2,3,6)))
 static pat_t *new_range(file_t *f, const char *start, const char *end, size_t min, ssize_t max, pat_t *repeating, pat_t *sep)
 {
    size_t minlen = min*repeating->min_matchlen + (min > 0 ? min-1 : 0)*(sep ? sep->min_matchlen : 0);
@ -69,6 +60,7 @@ static pat_t *new_range(file_t *f, const char *start, const char *end, size_t mi
 // Take a pattern and expand it into a chain of patterns if it's followed by
 // any patterns (e.g. "`x `y"), otherwise return the original input.
 //
+__attribute__((nonnull))
 static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl)
 {
    const char *str = after_spaces(first->end, allow_nl);
@ -84,6 +76,7 @@ static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl)
 //
 // Match trailing => replacements (with optional pattern beforehand)
 //
+__attribute__((nonnull))
 static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl)
 {
    const char *str = replace_pat->end;
@ -94,12 +87,12 @@ static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl)
            || matchchar(&str, '{', allow_nl) || matchchar(&str, '\002', allow_nl)) {
            char closequote = str[-1] == '{' ? '}' : (str[-1] == '\002' ? '\003' : str[-1]);
            repstr = str;
-            for (; *str && *str != closequote; str = next_char(f, str)) {
+            for (; str < f->end && *str != closequote; str = next_char(str, f->end)) {
                if (*str == '\\') {
                    if (!str[1] || str[1] == '\n')
                        file_err(f, str, str+1,
                                 "There should be an escape sequence after this backslash.");
-                    str = next_char(f, str);
+                    str = next_char(str, f->end);
                }
            }
            replen = (size_t)(str-repstr);
@ -124,6 +117,7 @@ static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl)
 // chain of choices if it's followed by any "/"-separated patterns (e.g.
 // "`x/`y"), otherwise return the original input.
 //
+__attribute__((nonnull))
 static pat_t *expand_choices(file_t *f, pat_t *first, bool allow_nl)
 {
    first = expand_chain(f, first, allow_nl);
@ -191,54 +185,23 @@ pat_t *either_pat(file_t *f, pat_t *first, pat_t *second)
    return either;
 }

-//
-// Wrapper for _bp_simplepattern() that expands any postfix operators (~, !~)
-//
-static pat_t *bp_simplepattern(file_t *f, const char *str)
-{
-    pat_t *pat = _bp_simplepattern(f, str);
-    if (pat == NULL) return pat;
-    str = pat->end;
-
-    // Expand postfix operators (if any)
-    while (str < f->end) {
-        enum pattype_e type;
-        if (matchchar(&str, '~', false))
-            type = BP_MATCH;
-        else if (matchstr(&str, "!~", false))
-            type = BP_NOT_MATCH;
-        else break;
-
-        pat_t *first = pat;
-        pat_t *second = bp_simplepattern(f, str);
-        if (!second)
-            file_err(f, str, str, "The '%s' operator expects a pattern before and after.", type == BP_MATCH ? "~" : "!~");
-
-        pat = new_pat(f, str, second->end, first->min_matchlen, first->max_matchlen, type);
-        pat->args.multiple.first = first;
-        pat->args.multiple.second = second;
-        str = pat->end;
-    }
-
-    return pat;
-}
-
 //
 // Compile a string of BP code into a BP pattern object.
 //
+__attribute__((nonnull))
 static pat_t *_bp_simplepattern(file_t *f, const char *str)
 {
    str = after_spaces(str, false);
    if (!*str) return NULL;
    const char *start = str;
    char c = *str;
-    str = next_char(f, str);
+    str = next_char(str, f->end);
    switch (c) {
    // Any char (dot)
    case '.': {
        if (*str == '.') { // ".."
            pat_t *skip = NULL;
-            str = next_char(f, str);
+            str = next_char(str, f->end);
            char skipper = *str;
            if (matchchar(&str, '%', false) || matchchar(&str, '=', false)) {
                skip = bp_simplepattern(f, str);
@ -261,11 +224,11 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
                file_err(f, str, str, "There should be a character here after the '`'");

            const char *c1_loc = str;
-            str = next_char(f, c1_loc);
+            str = next_char(c1_loc, f->end);
            if (*str == '-') { // Range
                const char *c2_loc = ++str;
-                if (next_char(f, c1_loc) > c1_loc+1 || next_char(f, c2_loc) > c2_loc+1)
-                    file_err(f, start, next_char(f, c2_loc), "Sorry, UTF-8 character ranges are not yet supported.");
+                if (next_char(c1_loc, f->end) > c1_loc+1 || next_char(c2_loc, f->end) > c2_loc+1)
+                    file_err(f, start, next_char(c2_loc, f->end), "Sorry, UTF-8 character ranges are not yet supported.");
                char c1 = *c1_loc, c2 = *c2_loc;
                if (!c2 || c2 == '\n')
                    file_err(f, str, str, "There should be a character here to complete the character range.");
@ -274,7 +237,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
                    c1 = c2;
                    c2 = tmp;
                }
-                str = next_char(f, c2_loc);
+                str = next_char(c2_loc, f->end);
                pat_t *pat = new_pat(f, start == c1_loc - 1 ? start : c1_loc, str, 1, 1, BP_RANGE);
                pat->args.range.low = (unsigned char)c1;
                pat->args.range.high = (unsigned char)c2;
@ -318,8 +281,8 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
            unsigned char e_high = e_low;
            if (*str == '-') { // Escape range (e.g. \x00-\xFF)
                ++str;
-                if (next_char(f, str) != str+1)
-                    file_err(f, start, next_char(f, str), "Sorry, UTF8 escape sequences are not supported in ranges.");
+                if (next_char(str, f->end) != str+1)
+                    file_err(f, start, next_char(str, f->end), "Sorry, UTF8 escape sequences are not supported in ranges.");
                const char *seqstart = str;
                e_high = (unsigned char)unescapechar(str, &str);
                if (str == seqstart)
@ -331,7 +294,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
            esc->args.range.low = e_low;
            esc->args.range.high = e_high;
            all = either_pat(f, all, esc);
-        } while (*str++ == ',');
+        } while (*str == ',' && str++ < f->end);

        return all;
    }
@ -344,9 +307,9 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
        char endquote = c == '\002' ? '\003' : (c == '{' ? '}' : c);
        char *litstart = (char*)str;
        while (str < f->end && *str != endquote)
-            str = next_char(f, str);
+            str = next_char(str, f->end);
        size_t len = (size_t)(str - litstart);
-        str = next_char(f, str);
+        str = next_char(str, f->end);

        pat_t *pat = new_pat(f, start, str, len, (ssize_t)len, BP_STRING);
        pat->args.string = litstart;
@ -528,10 +491,10 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
 pat_t *bp_stringpattern(file_t *f, const char *str)
 {
    pat_t *ret = NULL;
-    while (*str) {
+    while (str < f->end) {
        char *start = (char*)str;
        pat_t *interp = NULL;
-        for (; str < f->end; str = next_char(f, str)) {
+        for (; str < f->end; str = next_char(str, f->end)) {
            if (*str == '\\' && str+1 < f->end) {
                if (str[1] == '\\' || isalnum(str[1]))
                    interp = bp_simplepattern(f, str);
@ -558,6 +521,38 @@ pat_t *bp_stringpattern(file_t *f, const char *str)
    return ret;
 }

+//
+// Wrapper for _bp_simplepattern() that expands any postfix operators (~, !~) into BP_MATCH / BP_NOT_MATCH patterns; returns NULL if no simple pattern was parsed.
+//
+static pat_t *bp_simplepattern(file_t *f, const char *str)
+{
+    pat_t *pat = _bp_simplepattern(f, str);
+    if (pat == NULL) return pat;
+    str = pat->end; // Continue scanning right after the pattern that was just parsed
+
+    // Expand postfix operators (if any); each one wraps the pattern built so far
+    while (str < f->end) {
+        enum pattype_e type;
+        if (matchchar(&str, '~', false))
+            type = BP_MATCH;
+        else if (matchstr(&str, "!~", false))
+            type = BP_NOT_MATCH;
+        else break;
+
+        pat_t *first = pat;
+        pat_t *second = bp_simplepattern(f, str); // Recursive, so the right-hand side takes any further ~ / !~ operators with it
+        if (!second) // NOTE(review): second->end is read below, so file_err() presumably never returns -- confirm
+            file_err(f, str, str, "The '%s' operator expects a pattern before and after.", type == BP_MATCH ? "~" : "!~");
+
+        pat = new_pat(f, str, second->end, first->min_matchlen, first->max_matchlen, type); // Length bounds come from the left operand; the start passed is `str`, not first->start
+        pat->args.multiple.first = first;
+        pat->args.multiple.second = second;
+        str = pat->end;
+    }
+
+    return pat;
+}
+
 //
 // Given a pattern and a replacement string, compile the two into a BP
 // replace pattern.
@ -567,7 +562,7 @@ pat_t *bp_replacement(file_t *f, pat_t *replacepat, const char *replacement)
    pat_t *pat = new_pat(f, replacepat->start, replacepat->end, replacepat->min_matchlen, replacepat->max_matchlen, BP_REPLACE);
    pat->args.replace.pat = replacepat;
    const char *p = replacement;
-    for (; *p; p++) {
+    for (; p < f->end; p++) {
        if (*p == '\\') {
            if (!p[1] || p[1] == '\n')
                file_err(f, p, p, "There should be an escape sequence or pattern here after this backslash.");
--- a/pattern.h
+++ b/pattern.h
@ -9,7 +9,7 @@

 __attribute__((returns_nonnull, nonnull(1,2)))
 pat_t *new_pat(file_t *f, const char *start, const char *end, size_t minlen, ssize_t maxlen, enum pattype_e type);
-__attribute__((nonnull(1,2)))
+__attribute__((nonnull))
 pat_t *bp_stringpattern(file_t *f, const char *str);
 __attribute__((nonnull(1,2)))
 pat_t *bp_replacement(file_t *f, pat_t *replacepat, const char *replacement);
--- a/utf8.c
+++ b/utf8.c
@ -3,8 +3,9 @@
 //
 #include <ctype.h>
 #include <stdint.h>
+#include <stdbool.h>
+#include <unistd.h>

-#include "files.h"
 #include "utf8.h"

 #define ARRAY_LEN(a) (sizeof(a)/sizeof((a)[0]))
@ -181,39 +182,39 @@ static const uint32_t XID_Continue_only[][2] = {
 // Return the location of the next character or UTF8 codepoint.
 // (i.e. skip forward one codepoint at a time, not one byte at a time)
 //
-const char *next_char(file_t *f, const char *str)
+const char *next_char(const char *str, const char *end) // `end` bounds the scan (callers previously passed the file and f->end was used)
 {
-    if (likely(str+1 <= f->end) && likely((str[0] & 0x80) == 0x0))
+    if (likely(str+1 <= end) && likely((str[0] & 0x80) == 0x0)) // 0xxxxxxx: single-byte character
        return str+1;
-    if (likely(str+2 <= f->end) && (str[0] & 0xe0) == 0xc0)
+    if (likely(str+2 <= end) && (str[0] & 0xe0) == 0xc0) // 110xxxxx lead byte: 2-byte sequence (only the lead byte is inspected)
        return str+2;
-    if (likely(str+3 <= f->end) && (str[0] & 0xf0) == 0xe0)
+    if (likely(str+3 <= end) && (str[0] & 0xf0) == 0xe0) // 1110xxxx lead byte: 3-byte sequence
        return str+3;
-    if (likely(str+4 <= f->end) && (str[0] & 0xf8) == 0xf0)
+    if (likely(str+4 <= end) && (str[0] & 0xf8) == 0xf0) // 11110xxx lead byte: 4-byte sequence
        return str+4;
-    return likely(str+1 <= f->end) ? str+1 : f->end;
+    return likely(str+1 <= end) ? str+1 : end; // Unrecognized lead byte or sequence cut off by end: advance one byte, never past end
 }

 //
 // Return the location of the previous character or UTF8 codepoint.
 // (i.e. skip backwards one codepoint at a time, not one byte at a time)
 //
-const char *prev_char(file_t *f, const char *str)
+const char *prev_char(const char *start, const char *str) // Note the argument order: the lower bound `start` comes first
 {
-    if (likely(str-1 >= f->start) && likely((str[-1] & 0x80) == 0x0))
+    if (likely(str-1 >= start) && likely((str[-1] & 0x80) == 0x0)) // Preceding byte is a single-byte (0xxxxxxx) character
        return str-1;
-    if (likely(str-2 >= f->start) && (str[-2] & 0xe0) == 0xc0)
+    if (likely(str-2 >= start) && (str[-2] & 0xe0) == 0xc0) // 110xxxxx lead byte two bytes back: 2-byte sequence
        return str-2;
-    if (likely(str-3 >= f->start) && (str[-3] & 0xf0) == 0xe0)
+    if (likely(str-3 >= start) && (str[-3] & 0xf0) == 0xe0) // 1110xxxx lead byte three bytes back: 3-byte sequence
        return str-3;
-    if (likely(str-4 >= f->start) && (str[-4] & 0xf8) == 0xf0)
+    if (likely(str-4 >= start) && (str[-4] & 0xf8) == 0xf0) // 11110xxx lead byte four bytes back: 4-byte sequence
        return str-4;
-    return likely(str-1 >= f->start) ? str-1 : f->start;
+    return likely(str-1 >= start) ? str-1 : start; // No matching lead byte found: step back one byte, never before start
 }

-static uint32_t get_codepoint(file_t *f, const char *str)
+static uint32_t get_codepoint(const char *str, const char *end)
 {
-    if (str >= f->end)
+    if (unlikely(str >= end))
        return (uint32_t)-1;

    unsigned char c1 = (unsigned char)str[0];
@ -235,7 +236,7 @@ static uint32_t get_codepoint(file_t *f, const char *str)
    }

    for (int i = 1; i < seqlen; ++i) {
-        if (unlikely(&str[i] >= f->end || (str[i] & 0xC0) != 0x80))
+        if (unlikely((&str[i] >= end) || (str[i] & 0xC0) != 0x80))
            return (uint32_t)-1;
        codepoint = ((codepoint << 6) | (uint32_t)(str[i] & 0x3F));
    }
@ -259,22 +260,22 @@ static bool find_in_ranges(uint32_t codepoint, const uint32_t ranges[][2], size_
    return false;
 }

-bool isidstart(file_t *f, const char *str)
+bool isidstart(const char *str, const char *end) // True if str points at '_', an ASCII letter, or a codepoint listed in XID_Start
 {
-    if (unlikely(str >= f->end)) return false;
+    if (unlikely(str >= end)) return false; // Nothing to read at or past end. NOTE(review): the next line passes a plain char to isalpha(); consider an unsigned char cast in case char is signed
    else if (isalpha(*str) || *str == '_') return true;
    else if (likely((*str & 0x80) == 0)) return false;
-    uint32_t codepoint = get_codepoint(f, str);
+    uint32_t codepoint = get_codepoint(str, end); // Only reached when the high bit is set; (uint32_t)-1 means the bytes could not be decoded
    return codepoint != (uint32_t)-1
        && find_in_ranges(codepoint, XID_Start, ARRAY_LEN(XID_Start));
 }

-bool isidcontinue(file_t *f, const char *str)
+bool isidcontinue(const char *str, const char *end)
 {
-    if (unlikely(str >= f->end)) return false;
+    if (unlikely(str >= end)) return false;
    else if (isalnum(*str) || *str == '_') return true;
    else if (likely((*str & 0x80) == 0)) return false;
-    uint32_t codepoint = get_codepoint(f, str);
+    uint32_t codepoint = get_codepoint(str, end);
    return codepoint != (uint32_t)-1
        && (find_in_ranges(codepoint, XID_Start, ARRAY_LEN(XID_Start))
            || find_in_ranges(codepoint, XID_Continue_only, ARRAY_LEN(XID_Continue_only)));
--- a/utf8.h
+++ b/utf8.h
@ -1,21 +1,21 @@
 //
 // utf8.h - UTF8 helper functions
 //
-#include "files.h"
-
 #ifndef UTF8__H
 #define UTF8__H

+#include <stdbool.h>
+
 #define UTF8_MAXCHARLEN 4

 __attribute__((nonnull, pure))
-const char *next_char(file_t *f, const char *str);
+const char *next_char(const char *str, const char *end); // Position of the next character/codepoint after str, clamped to end
 __attribute__((nonnull, pure))
-const char *prev_char(file_t *f, const char *str);
+const char *prev_char(const char *start, const char *str); // Position of the previous character/codepoint before str, clamped to start
 __attribute__((nonnull, pure))
-bool isidstart(file_t *f, const char *str);
+bool isidstart(const char *str, const char *end); // Whether str points at an identifier-start character (ASCII letter, '_', or XID_Start)
 __attribute__((nonnull, pure))
-bool isidcontinue(file_t *f, const char *str);
+bool isidcontinue(const char *str, const char *end); // Whether str points at an identifier-continue character (ASCII alphanumeric, '_', XID_Start or XID_Continue_only)

 #endif
 // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0