From 90c3c13a02e501d3bea839dceb00f09c89bfb5fe Mon Sep 17 00:00:00 2001
From: Bruce Hill <bruce@bruce-hill.com>
Date: Tue, 21 Sep 2021 18:45:43 -0700
Subject: [PATCH] Moving cache logic into match, cleaner next_match() API, and
 slightly less tightly coupled UTF8 API

---
 bp.c      |  46 ++++------
 files.c   | 124 ---------------------------
 files.h   |  11 ---
 match.c   | 252 ++++++++++++++++++++++++++++++++++++++++++------------
 match.h   |   5 +-
 pattern.c | 111 ++++++++++++------------
 pattern.h |   2 +-
 utf8.c    |  45 +++++-----
 utf8.h    |  12 +--
 9 files changed, 302 insertions(+), 306 deletions(-)

diff --git a/bp.c b/bp.c
index 1697a55..49e2317 100644
--- a/bp.c
+++ b/bp.c
@@ -169,17 +169,15 @@ static int is_text_file(const char *filename)
 //
 static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern)
 {
-    static int matches = 0;
-    match_t *m = NULL;
-    while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) {
-        if (++matches > 1)
+    int nmatches = 0;
+    for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) {
+        if (++nmatches > 1)
             printf(",\n");
         printf("{\"filename\":\"%s\",\"match\":", f->filename);
         json_match(f->start, m, options.verbose);
         printf("}");
     }
-    if (m) recycle_if_unused(&m);
-    return matches;
+    return nmatches;
 }
 
 //
@@ -187,18 +185,16 @@ static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern)
 //
 static int explain_matches(def_t *defs, file_t *f, pat_t *pattern)
 {
-    int matches = 0;
-    match_t *m = NULL;
-    while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) {
-        if (++matches == 1) {
+    int nmatches = 0;
+    for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) {
+        if (++nmatches == 1) {
             if (options.print_filenames)
                 fprint_filename(stdout, f->filename);
         } else
             printf("\n\n");
         explain_match(m);
     }
-    if (m) recycle_if_unused(&m);
-    return matches;
+    return nmatches;
 }
 
 //
@@ -243,8 +239,7 @@ static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern)
         .lineformat = LINE_FORMATS[options.format],
     };
 
-    match_t *m = NULL;
-    while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) {
+    for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) {
         if (print_errors(f, m) > 0)
             exit(EXIT_FAILURE);
 
@@ -254,7 +249,6 @@ static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern)
         }
         print_match(out, &pr, m);
     }
-    if (m) recycle_if_unused(&m);
 
     if (matches > 0 || (f->filename[0] == '\0' && options.context_before == ALL_CONTEXT)) {
         // Print trailing context lines:
@@ -281,18 +275,19 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern)
     if (options.mode == MODE_EXPLAIN) {
         matches += explain_matches(defs, f, pattern);
     } else if (options.mode == MODE_LISTFILES) {
-        match_t *m = next_match(defs, f, NULL, pattern, options.skip, options.ignorecase);
-        if (m) {
-            recycle_if_unused(&m);
+        match_t *m = NULL;
+        if (next_match(&m, defs, f, pattern, options.skip, options.ignorecase)) {
             printf("%s\n", f->filename);
             matches += 1;
         }
+        stop_matching(&m);
     } else if (options.mode == MODE_JSON) {
         matches += print_matches_as_json(defs, f, pattern);
     } else if (options.mode == MODE_INPLACE) {
-        match_t *m = next_match(defs, f, NULL, pattern, options.skip, options.ignorecase);
-        if (m) recycle_if_unused(&m);
-        else return 0;
+        match_t *m = NULL;
+        bool found = next_match(&m, defs, f, pattern, options.skip, options.ignorecase);
+        stop_matching(&m);
+        if (!found) return 0;
 
         // Ensure the file is resident in memory:
         if (f->mmapped) {
@@ -315,7 +310,6 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern)
     }
     fflush(stdout);
 
-    cache_destroy(f);
     if (recycle_all_matches() != 0)
         fprintf(stderr, "\033[33;1mMemory leak: there should no longer be any matches in use at this point.\033[m\n");
     destroy_file(&f);
@@ -480,10 +474,10 @@ int main(int argc, char *argv[])
             file_t *arg_file = spoof_file(&loaded_files, "<skip argument>", flag, -1);
             pat_t *s = bp_pattern(arg_file, arg_file->start);
             if (!s) {
-                fprint_line(stdout, arg_file, arg_file->start, arg_file->end,
+                file_err(arg_file, arg_file->start, arg_file->end,
                             "Failed to compile the skip argument");
             } else if (after_spaces(s->end, true) < arg_file->end) {
-                fprint_line(stdout, arg_file, s->end, arg_file->end,
+                file_err(arg_file, s->end, arg_file->end,
                             "Failed to compile part of the skip argument");
             }
             options.skip = either_pat(arg_file, options.skip, s);
@@ -537,10 +531,6 @@ int main(int argc, char *argv[])
     // Handle exit() calls gracefully:
     require(atexit(&cleanup), "Failed to set cleanup handler at exit");
 
-    // No need for these caches anymore:
-    for (file_t *f = loaded_files; f; f = f->next)
-        cache_destroy(f);
-
     int found = 0;
     if (options.mode == MODE_JSON) printf("[");
     if (options.git_mode) { // Get the list of files from `git --ls-files ...`
diff --git a/files.c b/files.c
index 774f830..5e9b40e 100644
--- a/files.c
+++ b/files.c
@@ -182,8 +182,6 @@ void destroy_file(file_t **at_f)
         f->mmapped = NULL;
     }
 
-    cache_destroy(f);
-
     for (pat_t *next; f->pats; f->pats = next) {
         next = f->pats->next;
         delete(&f->pats);
@@ -261,126 +259,4 @@ void fprint_line(FILE *dest, file_t *f, const char *start, const char *end, cons
     fprintf(dest, "\033[m\n");
 }
 
-//
-// Hash a string position/pattern.
-//
-static inline size_t hash(const char *str, pat_t *pat)
-{
-    return (size_t)str + 2*pat->id;
-}
-
-//
-// Check if we have memoized a pattern match at the given position for the
-// given definitions. If a result has been memoized, set *result to the
-// memoized value and return true, otherwise return false.
-//
-bool cache_get(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t **result)
-{
-    if (!f->cache.matches) return NULL;
-    size_t h = hash(str, pat) & (f->cache.size-1);
-    for (match_t *c = f->cache.matches[h]; c; c = c->cache.next) {
-        if (c->pat == pat && c->defs_id == (defs?defs->id:0) && c->start == str) {
-            // If c->end == NULL, that means no match occurs here
-            *result = c->end == NULL ? NULL : c;
-            return true;
-        }
-    }
-    return false;
-}
-
-//
-// Remove an item from the cache.
-//
-static void cache_remove(file_t *f, match_t *m)
-{
-    if (!m->cache.home) return;
-    *m->cache.home = m->cache.next;
-    if (m->cache.next) m->cache.next->cache.home = m->cache.home;
-    m->cache.next = NULL;
-    m->cache.home = NULL;
-    if (--m->refcount == 0) recycle_if_unused(&m);
-    --f->cache.occupancy;
-}
-
-//
-// Save a match in the cache.
-//
-void cache_save(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t *m)
-{
-    // As a convention, a match with {.pat=pat, .start=str, .end==NULL} is used
-    // to memoize the fact that `pat` will *not* match at `str`.
-    if (m == NULL) m = new_match(defs, pat, str, NULL, NULL);
-
-    if (f->cache.occupancy+1 > 3*f->cache.size) {
-        if (f->cache.size == MAX_CACHE_SIZE) {
-            size_t h = hash(m->start, m->pat) & (f->cache.size-1);
-            for (int quota = 2; f->cache.matches[h] && quota > 0; quota--) {
-                match_t *last = f->cache.matches[h];
-                while (last->cache.next) last = last->cache.next;
-                cache_remove(f, last);
-            }
-        } else {
-            match_t **old_matches = f->cache.matches;
-            size_t old_size = f->cache.size;
-            f->cache.size = old_size == 0 ? 16 : 2*old_size;
-            f->cache.matches = new(match_t*[f->cache.size]);
-
-            // Rehash:
-            if (old_matches) {
-                for (size_t i = 0; i < old_size; i++) {
-                    for (match_t *o; (o = old_matches[i]); ) {
-                        *o->cache.home = o->cache.next;
-                        if (o->cache.next) o->cache.next->cache.home = o->cache.home;
-                        size_t h = hash(o->start, o->pat) & (f->cache.size-1);
-                        o->cache.home = &(f->cache.matches[h]);
-                        o->cache.next = f->cache.matches[h];
-                        if (f->cache.matches[h]) f->cache.matches[h]->cache.home = &o->cache.next;
-                        f->cache.matches[h] = o;
-                    }
-                }
-                free(old_matches);
-            }
-        }
-    }
-
-    size_t h = hash(m->start, m->pat) & (f->cache.size-1);
-    m->cache.home = &(f->cache.matches[h]);
-    m->cache.next = f->cache.matches[h];
-    if (f->cache.matches[h]) f->cache.matches[h]->cache.home = &m->cache.next;
-    f->cache.matches[h] = m;
-    ++m->refcount;
-    ++f->cache.occupancy;
-}
-
-//
-// Remove all items from the cache that do not overlap `start` and `end`.
-// (This is used to remove useless items from the cache)
-//
-void cache_prune(file_t *f, const char *start, const char *end)
-{
-    if (!f->cache.matches) return;
-    for (size_t i = 0; i < f->cache.size; i++) {
-        for (match_t *m = f->cache.matches[i], *next = NULL; m; m = next) {
-            next = m->cache.next;
-            if (m->start < start || (m->end ? m->end : m->start) > end)
-                cache_remove(f, m);
-        }
-    }
-}
-
-//
-// Clear and deallocate the cache.
-//
-void cache_destroy(file_t *f)
-{
-    if (!f->cache.matches) return;
-    for (size_t i = 0; i < f->cache.size; i++) {
-        while (f->cache.matches[i])
-            cache_remove(f, f->cache.matches[i]);
-    }
-    f->cache.occupancy = 0;
-    delete(&f->cache.matches);
-    f->cache.size = 0;
-}
-
 // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/files.h b/files.h
index 840412b..b51b9c7 100644
--- a/files.h
+++ b/files.h
@@ -6,14 +6,11 @@
 
 #include "types.h"
 
-#include <stdbool.h>
 #include <stdio.h>
 #include <unistd.h>
 
 #define file_err(f, ...) do { fprint_line(stderr, f, __VA_ARGS__); exit(EXIT_FAILURE); } while(false)
 
-#define MAX_CACHE_SIZE (1<<14)
-
 typedef struct file_s {
     struct file_s *next;
     const char *filename;
@@ -43,14 +40,6 @@ __attribute__((pure, nonnull))
 const char *get_line(file_t *f, size_t line_number);
 __attribute__((nonnull(1,2,3), format(printf,5,6)))
 void fprint_line(FILE *dest, file_t *f, const char *start, const char *end, const char *fmt, ...);
-__attribute__((nonnull(1,3,4,5)))
-bool cache_get(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t **result);
-__attribute__((nonnull(1,3,4)))
-void cache_save(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t *m);
-__attribute__((nonnull))
-void cache_prune(file_t *f, const char *start, const char *end);
-__attribute__((nonnull))
-void cache_destroy(file_t *f);
 
 #endif
 // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/match.c b/match.c
index 99ef145..2a40472 100644
--- a/match.c
+++ b/match.c
@@ -16,6 +16,13 @@
 #include "utils.h"
 #include "utf8.h"
 
+#define MAX_CACHE_SIZE (1<<14)
+
+typedef struct { // Memoization table for match results (hash table with chained buckets)
+    size_t size, occupancy; // size: number of buckets; occupancy: number of entries stored
+    match_t **matches; // Bucket array; entries in a bucket are chained via match_t.cache.next
+} cache_t;
+
 // New match objects are either recycled from unused match objects or allocated
 // from the heap. While it is in use, the match object is stored in the
 // `in_use_matches` linked list. Once it is no longer needed, it is moved to
@@ -27,10 +34,8 @@ static match_t *in_use_matches = NULL;
 
 #define MATCHES(...) (match_t*[]){__VA_ARGS__, NULL}
 
-__attribute__((nonnull(1)))
-static inline pat_t *deref(def_t *defs, pat_t *pat);
-__attribute__((hot, nonnull(2,3,4)))
-static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ignorecase);
+__attribute__((hot, nonnull(2,3,4,5)))
+static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, bool ignorecase);
 
 // Store a value and update its refcount
 static inline void add_owner(match_t** owner, match_t* owned)
@@ -80,10 +85,117 @@ static inline void list_remove(match_t *m, match_dll_t *node)
     node->next = NULL;
 }
 
+//
+// Hash a (string position, pattern) pair. Callers mask the result with (size-1) to pick a bucket.
+//
+static inline size_t hash(const char *str, pat_t *pat)
+{
+    return (size_t)str + 2*pat->id; // Combines the address of `str` with the pattern's id
+}
+
+//
+// Check if we have memoized a pattern match at the given position for the
+// given definitions. If so, set *result to the memoized match (NULL for a
+// memoized failure to match) and return true; otherwise return false.
+//
+static bool cache_get(cache_t *cache, def_t *defs, const char *str, pat_t *pat, match_t **result)
+{
+    if (!cache->matches) return false; // Nothing has been cached yet
+    size_t h = hash(str, pat) & (cache->size-1);
+    for (match_t *c = cache->matches[h]; c; c = c->cache.next) {
+        if (c->pat == pat && c->defs_id == (defs?defs->id:0) && c->start == str) {
+            // If c->end == NULL, that means no match occurs here
+            *result = c->end == NULL ? NULL : c;
+            return true;
+        }
+    }
+    return false;
+}
+
+//
+// Remove an item from the cache, dropping the reference that cache_save() took on it.
+//
+static void cache_remove(cache_t *cache, match_t *m)
+{
+    if (!m->cache.home) return; // Not currently stored in a cache bucket
+    *m->cache.home = m->cache.next; // Whatever pointed at `m` now points at its successor
+    if (m->cache.next) m->cache.next->cache.home = m->cache.home;
+    m->cache.next = NULL;
+    m->cache.home = NULL;
+    if (--m->refcount == 0) recycle_if_unused(&m); // The cache held the last reference
+    --cache->occupancy;
+}
+
+//
+// Save a match in the cache (m == NULL records that `pat` does not match at `str`).
+//
+static void cache_save(cache_t *cache, def_t *defs, const char *str, pat_t *pat, match_t *m)
+{
+    // As a convention, a match with {.pat=pat, .start=str, .end==NULL} is used
+    // to memoize the fact that `pat` will *not* match at `str`.
+    if (m == NULL) m = new_match(defs, pat, str, NULL, NULL);
+
+    if (cache->occupancy+1 > 3*cache->size) { // More than 3 entries per bucket on average (or no table yet)
+        if (cache->size == MAX_CACHE_SIZE) { // Can't grow any further, so evict instead
+            size_t h = hash(m->start, m->pat) & (cache->size-1);
+            for (int quota = 2; cache->matches[h] && quota > 0; quota--) { // Evict at most 2 entries
+                match_t *last = cache->matches[h];
+                while (last->cache.next) last = last->cache.next; // Walk to the tail of the chain
+                cache_remove(cache, last);
+            }
+        } else {
+            match_t **old_matches = cache->matches;
+            size_t old_size = cache->size;
+            cache->size = old_size == 0 ? 16 : 2*old_size; // Start at 16 buckets, then double
+            cache->matches = new(match_t*[cache->size]); // NOTE(review): buckets are read below as if zeroed -- confirm new() zero-fills
+
+            // Rehash: move every entry from the old bucket array into the new one
+            if (old_matches) {
+                for (size_t i = 0; i < old_size; i++) {
+                    for (match_t *o; (o = old_matches[i]); ) { // Pop entries until the old bucket is empty
+                        *o->cache.home = o->cache.next; // Unlink `o` from its old chain
+                        if (o->cache.next) o->cache.next->cache.home = o->cache.home;
+                        size_t h = hash(o->start, o->pat) & (cache->size-1);
+                        o->cache.home = &(cache->matches[h]); // Push `o` onto the head of its new bucket
+                        o->cache.next = cache->matches[h];
+                        if (cache->matches[h]) cache->matches[h]->cache.home = &o->cache.next;
+                        cache->matches[h] = o;
+                    }
+                }
+                free(old_matches);
+            }
+        }
+    }
+
+    size_t h = hash(m->start, m->pat) & (cache->size-1);
+    m->cache.home = &(cache->matches[h]); // Insert `m` at the head of its bucket
+    m->cache.next = cache->matches[h];
+    if (cache->matches[h]) cache->matches[h]->cache.home = &m->cache.next;
+    cache->matches[h] = m;
+    ++m->refcount; // The cache holds a reference until cache_remove()
+    ++cache->occupancy;
+}
+
+//
+// Clear and deallocate the cache. Does nothing if the cache has no bucket array.
+//
+static void cache_destroy(cache_t *cache)
+{
+    if (!cache->matches) return;
+    for (size_t i = 0; i < cache->size; i++) {
+        while (cache->matches[i]) // cache_remove() unlinks the head, so this drains the bucket
+            cache_remove(cache, cache->matches[i]);
+    }
+    cache->occupancy = 0;
+    delete(&cache->matches); // NOTE(review): presumably frees the array and NULLs the pointer -- confirm
+    cache->size = 0;
+}
+
 //
 // If the given pattern is a reference, look it up and return the referenced
 // pattern. This is used for an optimization to avoid repeated lookups.
 //
+__attribute__((nonnull(1)))
 static inline pat_t *deref(def_t *defs, pat_t *pat)
 {
     if (pat && pat->type == BP_REF) {
@@ -128,15 +240,18 @@ static pat_t *first_pat(def_t *defs, pat_t *pat)
 //
 // Find the next match after prev (or the first match if prev is NULL)
 //
-match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *skip, bool ignorecase)
+__attribute__((nonnull(3,5)))
+static match_t *_next_match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, pat_t *skip, bool ignorecase)
 {
-    const char *str;
-    if (prev) {
-        str = prev->end > prev->start ? prev->end : prev->end + 1;
-        if (prev->refcount == 0) recycle_if_unused(&prev);
-        cache_prune(f, str, f->end);
-    } else {
-        str = f->start;
+    // Prune the unnecessary entries from the cache (those not between start/end)
+    if (cache->matches) {
+        for (size_t i = 0; i < cache->size; i++) {
+            for (match_t *m = cache->matches[i], *next = NULL; m; m = next) {
+                next = m->cache.next;
+                if (m->start < f->start || (m->end ? m->end : m->start) > f->end)
+                    cache_remove(cache, m);
+            }
+        }
     }
 
     pat = deref(defs, pat);
@@ -162,14 +277,14 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk
     if (str > f->end) return NULL;
 
     do {
-        match_t *m = match(defs, f, str, pat, ignorecase);
+        match_t *m = match(defs, cache, f, str, pat, ignorecase);
         if (m) return m;
         if (first->type == BP_START_OF_FILE) return NULL;
         match_t *s;
-        if (skip && (s = match(defs, f, str, skip, ignorecase))) {
+        if (skip && (s = match(defs, cache, f, str, skip, ignorecase))) {
             str = s->end > str ? s->end : str + 1;
             recycle_if_unused(&s);
-        } else str = next_char(f, str);
+        } else str = next_char(str, f->end);
     } while (str < f->end);
     return NULL;
 }
@@ -179,12 +294,12 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk
 // match object, or NULL if no match is found.
 // The returned value should be free()'d to avoid memory leaking.
 //
-static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ignorecase)
+static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, bool ignorecase)
 {
     switch (pat->type) {
     case BP_DEFINITION: {
         def_t *defs2 = with_def(defs, pat->args.def.namelen, pat->args.def.name, pat->args.def.def);
-        match_t *m = match(defs2, f, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def, ignorecase);
+        match_t *m = match(defs2, cache, f, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def, ignorecase);
         defs = free_defs(defs2, defs);
         return m;
     }
@@ -198,17 +313,17 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
             ++pat->args.leftrec.visits;
             return pat->args.leftrec.match;
         } else {
-            return match(defs, f, str, pat->args.leftrec.fallback, ignorecase);
+            return match(defs, cache, f, str, pat->args.leftrec.fallback, ignorecase);
         }
     }
     case BP_ANYCHAR: {
-        return (str < f->end && *str != '\n') ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL;
+        return (str < f->end && *str != '\n') ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL;
     }
     case BP_ID_START: {
-        return (str < f->end && isidstart(f, str)) ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL;
+        return (str < f->end && isidstart(str, f->end)) ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL;
     }
     case BP_ID_CONTINUE: {
-        return (str < f->end && isidcontinue(f, str)) ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL;
+        return (str < f->end && isidcontinue(str, f->end)) ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL;
     }
     case BP_START_OF_FILE: {
         return (str == f->start) ? new_match(defs, pat, str, str, NULL) : NULL;
@@ -223,7 +338,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
         return (str == f->end || *str == '\n') ? new_match(defs, pat, str, str, NULL) : NULL;
     }
     case BP_WORD_BOUNDARY: {
-        return (str == f->start || isidcontinue(f, str) != isidcontinue(f, prev_char(f, str))) ? new_match(defs, pat, str, str, NULL) : NULL;
+        return (str == f->start || isidcontinue(str, f->end) != isidcontinue(prev_char(f->start, str), f->end)) ? new_match(defs, pat, str, str, NULL) : NULL;
     }
     case BP_STRING: {
         if (&str[pat->min_matchlen] > f->end) return NULL;
@@ -238,7 +353,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
         return new_match(defs, pat, str, str+1, NULL);
     }
     case BP_NOT: {
-        match_t *m = match(defs, f, str, pat->args.pat, ignorecase);
+        match_t *m = match(defs, cache, f, str, pat->args.pat, ignorecase);
         if (m != NULL) {
             recycle_if_unused(&m);
             return NULL;
@@ -259,7 +374,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
         for (const char *prev = NULL; prev < str; ) {
             prev = str;
             if (target) {
-                match_t *p = match(defs, f, str, target, ignorecase);
+                match_t *p = match(defs, cache, f, str, target, ignorecase);
                 if (p != NULL) {
                     recycle_if_unused(&p);
                     m->end = str;
@@ -270,7 +385,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
                 return m;
             }
             if (skip) {
-                match_t *s = match(defs, f, str, skip, ignorecase);
+                match_t *s = match(defs, cache, f, str, skip, ignorecase);
                 if (s != NULL) {
                     str = s->end;
                     if (nchildren+2 >= child_cap) {
@@ -285,7 +400,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
             // be at least once chance to match the pattern, even if
             // we're at the end of the string already (e.g. "..$").
             if (str < f->end && *str != '\n' && pat->type != BP_UPTO_STRICT)
-                str = next_char(f, str);
+                str = next_char(str, f->end);
         }
         recycle_if_unused(&m);
         return NULL;
@@ -302,11 +417,11 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
             // Separator
             match_t *msep = NULL;
             if (sep != NULL && reps > 0) {
-                msep = match(defs, f, str, sep, ignorecase);
+                msep = match(defs, cache, f, str, sep, ignorecase);
                 if (msep == NULL) break;
                 str = msep->end;
             }
-            match_t *mp = match(defs, f, str, repeating, ignorecase);
+            match_t *mp = match(defs, cache, f, str, repeating, ignorecase);
             if (mp == NULL) {
                 str = start;
                 if (msep) recycle_if_unused(&msep);
@@ -358,19 +473,20 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
         // current pos, so mock it out as a file slice.
         // TODO: this breaks ^/^^/$/$$, but that can probably be ignored
         // because you rarely need to check those in a backtrack.
+        cache_t slice_cache = {0};
         file_t slice;
         slice_file(&slice, f, f->start, str);
         for (const char *pos = &str[-(long)back->min_matchlen];
              pos >= f->start && (back->max_matchlen == -1 || pos >= &str[-(int)back->max_matchlen]);
-             pos = prev_char(f, pos)) {
-            cache_destroy(&slice);
+             pos = prev_char(f->start, pos)) {
+            cache_destroy(&slice_cache);
             slice.start = (char*)pos;
-            match_t *m = match(defs, &slice, pos, back, ignorecase);
+            match_t *m = match(defs, &slice_cache, &slice, pos, back, ignorecase);
             // Match should not go past str (i.e. (<"AB" "B") should match "ABB", but not "AB")
             if (m && m->end != str)
                 recycle_if_unused(&m);
             else if (m) {
-                cache_destroy(&slice);
+                cache_destroy(&slice_cache);
                 return new_match(defs, pat, str, str, MATCHES(m));
             }
             if (pos == f->start) break;
@@ -378,23 +494,23 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
             // walking backwards endlessly over newlines.
             if (back->max_matchlen == -1 && *pos == '\n') break;
         }
-        cache_destroy(&slice);
+        cache_destroy(&slice_cache);
         return NULL;
     }
     case BP_BEFORE: {
-        match_t *after = match(defs, f, str, pat->args.pat, ignorecase);
+        match_t *after = match(defs, cache, f, str, pat->args.pat, ignorecase);
         return after ? new_match(defs, pat, str, str, MATCHES(after)) : NULL;
     }
     case BP_CAPTURE: {
-        match_t *p = match(defs, f, str, pat->args.pat, ignorecase);
+        match_t *p = match(defs, cache, f, str, pat->args.pat, ignorecase);
         return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL;
     }
     case BP_OTHERWISE: {
-        match_t *m = match(defs, f, str, pat->args.multiple.first, ignorecase);
-        return m ? m : match(defs, f, str, pat->args.multiple.second, ignorecase);
+        match_t *m = match(defs, cache, f, str, pat->args.multiple.first, ignorecase);
+        return m ? m : match(defs, cache, f, str, pat->args.multiple.second, ignorecase);
     }
     case BP_CHAIN: {
-        match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase);
+        match_t *m1 = match(defs, cache, f, str, pat->args.multiple.first, ignorecase);
         if (m1 == NULL) return NULL;
 
         match_t *m2;
@@ -408,7 +524,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
 
             def_t *defs2 = with_def(defs, m1->pat->args.capture.namelen, m1->pat->args.capture.name, backref);
             ++m1->refcount; {
-                m2 = match(defs2, f, m1->end, pat->args.multiple.second, ignorecase);
+                m2 = match(defs2, cache, f, m1->end, pat->args.multiple.second, ignorecase);
                 if (!m2) { // No need to keep the backref in memory if it didn't match
                     for (pat_t **rem = &f->pats; *rem; rem = &(*rem)->next) {
                         if ((*rem) == backref) {
@@ -422,7 +538,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
                 defs = free_defs(defs2, defs);
             } --m1->refcount;
         } else {
-            m2 = match(defs, f, m1->end, pat->args.multiple.second, ignorecase);
+            m2 = match(defs, cache, f, m1->end, pat->args.multiple.second, ignorecase);
         }
 
         if (m2 == NULL) {
@@ -433,35 +549,36 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
         return new_match(defs, pat, str, m2->end, MATCHES(m1, m2));
     }
     case BP_MATCH: case BP_NOT_MATCH: {
-        match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase);
+        match_t *m1 = match(defs, cache, f, str, pat->args.multiple.first, ignorecase);
         if (m1 == NULL) return NULL;
 
         // <p1>~<p2> matches iff the text of <p1> matches <p2>
         // <p1>!~<p2> matches iff the text of <p1> does not match <p2>
+        cache_t slice_cache = {0};
         file_t slice;
         slice_file(&slice, f, m1->start, m1->end);
-        match_t *m2 = next_match(defs, &slice, NULL, pat->args.multiple.second, NULL, ignorecase);
+        match_t *m2 = _next_match(defs, &slice_cache, &slice, slice.start, pat->args.multiple.second, NULL, ignorecase);
         if ((!m2 && pat->type == BP_MATCH) || (m2 && pat->type == BP_NOT_MATCH)) {
-            cache_destroy(&slice);
+            cache_destroy(&slice_cache);
             if (m2) recycle_if_unused(&m2);
             recycle_if_unused(&m1);
             return NULL;
         }
         match_t *ret = new_match(defs, pat, m1->start, m1->end, (pat->type == BP_MATCH) ? MATCHES(m1, m2) : MATCHES(m1));
-        cache_destroy(&slice);
+        cache_destroy(&slice_cache);
         return ret;
     }
     case BP_REPLACE: {
         match_t *p = NULL;
         if (pat->args.replace.pat) {
-            p = match(defs, f, str, pat->args.replace.pat, ignorecase);
+            p = match(defs, cache, f, str, pat->args.replace.pat, ignorecase);
             if (p == NULL) return NULL;
         }
         return new_match(defs, pat, str, p ? p->end : str, MATCHES(p));
     }
     case BP_REF: {
         match_t *cached;
-        if (cache_get(f, defs, str, pat, &cached))
+        if (cache_get(cache, defs, str, pat, &cached))
             return cached;
 
         def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name);
@@ -490,9 +607,9 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
         };
 
         const char *prev = str;
-        match_t *m = match(&defs2, f, str, ref, ignorecase);
+        match_t *m = match(&defs2, cache, f, str, ref, ignorecase);
         if (m == NULL) {
-            cache_save(f, defs, str, pat, NULL);
+            cache_save(cache, defs, str, pat, NULL);
             return NULL;
         }
 
@@ -501,7 +618,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
             remove_ownership(&rec_op.args.leftrec.match);
             add_owner(&rec_op.args.leftrec.match, m);
             prev = m->end;
-            match_t *m2 = match(&defs2, f, str, ref, ignorecase);
+            match_t *m2 = match(&defs2, cache, f, str, ref, ignorecase);
             if (m2 == NULL) break;
             if (m2->end <= prev) {
                 recycle_if_unused(&m2);
@@ -516,7 +633,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
         // results.
         // OPTIMIZE: remove this if necessary
         match_t *wrap = new_match(defs, pat, m->start, m->end, MATCHES(m));
-        cache_save(f, defs, str, pat, wrap);
+        cache_save(cache, defs, str, pat, wrap);
 
         if (rec_op.args.leftrec.match)
             remove_ownership(&rec_op.args.leftrec.match);
@@ -527,9 +644,8 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
         if (*str != '\n') return NULL;
         const char *start = str;
 
-        size_t linenum = get_line_number(f, str);
-        const char *p = get_line(f, linenum);
-        if (p < f->start) p = f->start; // Can happen with recursive matching
+        const char *p = str;
+        while (p > f->start && p[-1] != '\n') --p;
 
         // Current indentation:
         char denter = *p;
@@ -546,7 +662,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
         return new_match(defs, pat, start, &str[dents], NULL);
     }
     case BP_ERROR: {
-        match_t *p = pat->args.pat ? match(defs, f, str, pat->args.pat, ignorecase) : NULL;
+        match_t *p = pat->args.pat ? match(defs, cache, f, str, pat->args.pat, ignorecase) : NULL;
         return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL;
     }
     default: {
@@ -644,4 +760,32 @@ size_t free_all_matches(void)
     return count;
 }
 
+//
+// Iterate over matches. Each call replaces *m (NULL to begin) with the next match and returns whether one was found.
+// Usage: for (match_t *m = NULL; next_match(&m, ...); ) {...}
+//
+bool next_match(match_t **m, def_t *defs, file_t *f, pat_t *pat, pat_t *skip, bool ignorecase)
+{
+    static cache_t cache = {0}; // One cache shared by all calls; NOTE(review): looks non-reentrant, confirm iterations never overlap
+    if (!f || !pat) { // Cleanup for stop_matching()
+        recycle_if_unused(m);
+        cache_destroy(&cache);
+        return false;
+    }
+
+    const char *start;
+    if (*m) { // Continuing an iteration: resume after the previous match
+        // Make sure forward progress is occurring, even after zero-width matches:
+        start = ((*m)->end > (*m)->start) ? (*m)->end : (*m)->end+1;
+        recycle_if_unused(m);
+    } else { // First call of an iteration: search from f->start
+        start = f->start;
+        cache_destroy(&cache); // Presumably discards entries left by an earlier iteration (TODO confirm)
+    }
+
+    *m = (start <= f->end) ? _next_match(defs, &cache, f, start, pat, skip, ignorecase) : NULL; // Past f->end: no search at all
+    if (!*m) cache_destroy(&cache); // No more matches, so the cache is torn down right away
+    return *m != NULL;
+}
+
 // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/match.h b/match.h
index cdd6592..535e20e 100644
--- a/match.h
+++ b/match.h
@@ -12,12 +12,13 @@
 
 __attribute__((returns_nonnull))
 match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end, match_t *children[]);
-__attribute__((nonnull(2,4)))
-match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *skip, bool ignorecase);
 __attribute__((nonnull))
 void recycle_if_unused(match_t **at_m);
 size_t free_all_matches(void);
 size_t recycle_all_matches(void);
 
+bool next_match(match_t **m, def_t *defs, file_t *f, pat_t *pat, pat_t *skip, bool ignorecase); // *m: previous match in, next match out
+#define stop_matching(m) next_match(m, NULL, NULL, NULL, NULL, 0) // NULL file/pattern makes next_match() take its cleanup branch
+
 #endif
 // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/pattern.c b/pattern.c
index 7e31bfc..28a4ecf 100644
--- a/pattern.c
+++ b/pattern.c
@@ -16,16 +16,6 @@
 __attribute__((nonnull))
 static pat_t *bp_pattern_nl(file_t *f, const char *str, bool allow_nl);
 __attribute__((nonnull))
-static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl);
-__attribute__((nonnull))
-static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl);
-__attribute__((nonnull))
-static pat_t *expand_choices(file_t *f, pat_t *first, bool allow_nl);
-__attribute__((nonnull))
-static pat_t *_bp_simplepattern(file_t *f, const char *str);
-__attribute__((nonnull(1,2,3,6)))
-static pat_t *new_range(file_t *f, const char *start, const char *end, size_t min, ssize_t max, pat_t *repeating, pat_t *sep);
-__attribute__((nonnull(1,2)))
 static pat_t *bp_simplepattern(file_t *f, const char *str);
 
 //
@@ -52,6 +42,7 @@ pat_t *new_pat(file_t *f, const char *start, const char *end, size_t minlen, ssi
 //
 // Helper function to initialize a range object.
 //
+__attribute__((nonnull(1,2,3,6)))
 static pat_t *new_range(file_t *f, const char *start, const char *end, size_t min, ssize_t max, pat_t *repeating, pat_t *sep)
 {
     size_t minlen = min*repeating->min_matchlen + (min > 0 ? min-1 : 0)*(sep ? sep->min_matchlen : 0);
@@ -69,6 +60,7 @@ static pat_t *new_range(file_t *f, const char *start, const char *end, size_t mi
 // Take a pattern and expand it into a chain of patterns if it's followed by
 // any patterns (e.g. "`x `y"), otherwise return the original input.
 //
+__attribute__((nonnull))
 static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl)
 {
     const char *str = after_spaces(first->end, allow_nl);
@@ -84,6 +76,7 @@ static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl)
 //
 // Match trailing => replacements (with optional pattern beforehand)
 //
+__attribute__((nonnull))
 static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl)
 {
     const char *str = replace_pat->end;
@@ -94,12 +87,12 @@ static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl)
             || matchchar(&str, '{', allow_nl) || matchchar(&str, '\002', allow_nl)) {
             char closequote = str[-1] == '{' ? '}' : (str[-1] == '\002' ? '\003' : str[-1]);
             repstr = str;
-            for (; *str && *str != closequote; str = next_char(f, str)) {
+            for (; str < f->end && *str != closequote; str = next_char(str, f->end)) {
                 if (*str == '\\') {
                     if (!str[1] || str[1] == '\n')
                         file_err(f, str, str+1,
                                  "There should be an escape sequence after this backslash.");
-                    str = next_char(f, str);
+                    str = next_char(str, f->end);
                 }
             }
             replen = (size_t)(str-repstr);
@@ -124,6 +117,7 @@ static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl)
 // chain of choices if it's followed by any "/"-separated patterns (e.g.
 // "`x/`y"), otherwise return the original input.
 //
+__attribute__((nonnull))
 static pat_t *expand_choices(file_t *f, pat_t *first, bool allow_nl)
 {
     first = expand_chain(f, first, allow_nl);
@@ -191,54 +185,23 @@ pat_t *either_pat(file_t *f, pat_t *first, pat_t *second)
     return either;
 }
 
-//
-// Wrapper for _bp_simplepattern() that expands any postfix operators (~, !~)
-//
-static pat_t *bp_simplepattern(file_t *f, const char *str)
-{
-    pat_t *pat = _bp_simplepattern(f, str);
-    if (pat == NULL) return pat;
-    str = pat->end;
-
-    // Expand postfix operators (if any)
-    while (str < f->end) {
-        enum pattype_e type;
-        if (matchchar(&str, '~', false))
-            type = BP_MATCH;
-        else if (matchstr(&str, "!~", false))
-            type = BP_NOT_MATCH;
-        else break;
-
-        pat_t *first = pat;
-        pat_t *second = bp_simplepattern(f, str);
-        if (!second)
-            file_err(f, str, str, "The '%s' operator expects a pattern before and after.", type == BP_MATCH ? "~" : "!~");
-
-        pat = new_pat(f, str, second->end, first->min_matchlen, first->max_matchlen, type);
-        pat->args.multiple.first = first;
-        pat->args.multiple.second = second;
-        str = pat->end;
-    }
-
-    return pat;
-}
-
 //
 // Compile a string of BP code into a BP pattern object.
 //
+__attribute__((nonnull))
 static pat_t *_bp_simplepattern(file_t *f, const char *str)
 {
     str = after_spaces(str, false);
     if (!*str) return NULL;
     const char *start = str;
     char c = *str;
-    str = next_char(f, str);
+    str = next_char(str, f->end);
     switch (c) {
     // Any char (dot)
     case '.': {
         if (*str == '.') { // ".."
             pat_t *skip = NULL;
-            str = next_char(f, str);
+            str = next_char(str, f->end);
             char skipper = *str;
             if (matchchar(&str, '%', false) || matchchar(&str, '=', false)) {
                 skip = bp_simplepattern(f, str);
@@ -261,11 +224,11 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
                 file_err(f, str, str, "There should be a character here after the '`'");
 
             const char *c1_loc = str;
-            str = next_char(f, c1_loc);
+            str = next_char(c1_loc, f->end);
             if (*str == '-') { // Range
                 const char *c2_loc = ++str;
-                if (next_char(f, c1_loc) > c1_loc+1 || next_char(f, c2_loc) > c2_loc+1)
-                    file_err(f, start, next_char(f, c2_loc), "Sorry, UTF-8 character ranges are not yet supported.");
+                if (next_char(c1_loc, f->end) > c1_loc+1 || next_char(c2_loc, f->end) > c2_loc+1)
+                    file_err(f, start, next_char(c2_loc, f->end), "Sorry, UTF-8 character ranges are not yet supported.");
                 char c1 = *c1_loc, c2 = *c2_loc;
                 if (!c2 || c2 == '\n')
                     file_err(f, str, str, "There should be a character here to complete the character range.");
@@ -274,7 +237,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
                     c1 = c2;
                     c2 = tmp;
                 }
-                str = next_char(f, c2_loc);
+                str = next_char(c2_loc, f->end);
                 pat_t *pat = new_pat(f, start == c1_loc - 1 ? start : c1_loc, str, 1, 1, BP_RANGE);
                 pat->args.range.low = (unsigned char)c1;
                 pat->args.range.high = (unsigned char)c2;
@@ -318,8 +281,8 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
             unsigned char e_high = e_low;
             if (*str == '-') { // Escape range (e.g. \x00-\xFF)
                 ++str;
-                if (next_char(f, str) != str+1)
-                    file_err(f, start, next_char(f, str), "Sorry, UTF8 escape sequences are not supported in ranges.");
+                if (next_char(str, f->end) != str+1)
+                    file_err(f, start, next_char(str, f->end), "Sorry, UTF8 escape sequences are not supported in ranges.");
                 const char *seqstart = str;
                 e_high = (unsigned char)unescapechar(str, &str);
                 if (str == seqstart)
@@ -331,7 +294,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
             esc->args.range.low = e_low;
             esc->args.range.high = e_high;
             all = either_pat(f, all, esc);
-        } while (*str++ == ',');
+        } while (*str == ',' && str++ < f->end);
 
         return all;
     }
@@ -344,9 +307,9 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
         char endquote = c == '\002' ? '\003' : (c == '{' ? '}' : c);
         char *litstart = (char*)str;
         while (str < f->end && *str != endquote)
-            str = next_char(f, str);
+            str = next_char(str, f->end);
         size_t len = (size_t)(str - litstart);
-        str = next_char(f, str);
+        str = next_char(str, f->end);
 
         pat_t *pat = new_pat(f, start, str, len, (ssize_t)len, BP_STRING);
         pat->args.string = litstart;
@@ -528,10 +491,10 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
 pat_t *bp_stringpattern(file_t *f, const char *str)
 {
     pat_t *ret = NULL;
-    while (*str) {
+    while (str < f->end) {
         char *start = (char*)str;
         pat_t *interp = NULL;
-        for (; str < f->end; str = next_char(f, str)) {
+        for (; str < f->end; str = next_char(str, f->end)) {
             if (*str == '\\' && str+1 < f->end) {
                 if (str[1] == '\\' || isalnum(str[1]))
                     interp = bp_simplepattern(f, str);
@@ -558,6 +521,38 @@ pat_t *bp_stringpattern(file_t *f, const char *str)
     return ret;
 }
 
+//
+// Wrapper for _bp_simplepattern() that expands any postfix operators (~, !~); returns NULL if no pattern was parsed.
+//
+static pat_t *bp_simplepattern(file_t *f, const char *str)
+{
+    pat_t *pat = _bp_simplepattern(f, str);
+    if (pat == NULL) return pat;
+    str = pat->end; // Keep scanning from the end of the pattern just parsed
+
+    // Expand postfix operators (if any)
+    while (str < f->end) {
+        enum pattype_e type;
+        if (matchchar(&str, '~', false))
+            type = BP_MATCH;
+        else if (matchstr(&str, "!~", false))
+            type = BP_NOT_MATCH;
+        else break; // No operator here, so the pattern is complete
+
+        pat_t *first = pat;
+        pat_t *second = bp_simplepattern(f, str); // Recursive call, so the right operand absorbs any operators after it
+        if (!second)
+            file_err(f, str, str, "The '%s' operator expects a pattern before and after.", type == BP_MATCH ? "~" : "!~");
+
+        pat = new_pat(f, str, second->end, first->min_matchlen, first->max_matchlen, type); // Length bounds come from the left operand
+        pat->args.multiple.first = first;
+        pat->args.multiple.second = second;
+        str = pat->end;
+    }
+
+    return pat;
+}
+
 //
 // Given a pattern and a replacement string, compile the two into a BP
 // replace pattern.
@@ -567,7 +562,7 @@ pat_t *bp_replacement(file_t *f, pat_t *replacepat, const char *replacement)
     pat_t *pat = new_pat(f, replacepat->start, replacepat->end, replacepat->min_matchlen, replacepat->max_matchlen, BP_REPLACE);
     pat->args.replace.pat = replacepat;
     const char *p = replacement;
-    for (; *p; p++) {
+    for (; p < f->end; p++) {
         if (*p == '\\') {
             if (!p[1] || p[1] == '\n')
                 file_err(f, p, p, "There should be an escape sequence or pattern here after this backslash.");
diff --git a/pattern.h b/pattern.h
index 39aba63..b903d5b 100644
--- a/pattern.h
+++ b/pattern.h
@@ -9,7 +9,7 @@
 
 __attribute__((returns_nonnull, nonnull(1,2)))
 pat_t *new_pat(file_t *f, const char *start, const char *end, size_t minlen, ssize_t maxlen, enum pattype_e type);
-__attribute__((nonnull(1,2)))
+__attribute__((nonnull))
 pat_t *bp_stringpattern(file_t *f, const char *str);
 __attribute__((nonnull(1,2)))
 pat_t *bp_replacement(file_t *f, pat_t *replacepat, const char *replacement);
diff --git a/utf8.c b/utf8.c
index 6180ffe..08e8932 100644
--- a/utf8.c
+++ b/utf8.c
@@ -3,8 +3,9 @@
 //
 #include <ctype.h>
 #include <stdint.h>
+#include <stdbool.h>
+#include <unistd.h>
 
-#include "files.h"
 #include "utf8.h"
 
 #define ARRAY_LEN(a) (sizeof(a)/sizeof((a)[0]))
@@ -181,39 +182,39 @@ static const uint32_t XID_Continue_only[][2] = {
 // Return the location of the next character or UTF8 codepoint.
 // (i.e. skip forward one codepoint at a time, not one byte at a time)
 //
-const char *next_char(file_t *f, const char *str)
+const char *next_char(const char *str, const char *end) // The result never exceeds `end`
 {
-    if (likely(str+1 <= f->end) && likely((str[0] & 0x80) == 0x0))
+    if (likely(str+1 <= end) && likely((str[0] & 0x80) == 0x0)) // 0xxxxxxx: one-byte character
         return str+1;
-    if (likely(str+2 <= f->end) && (str[0] & 0xe0) == 0xc0)
+    if (likely(str+2 <= end) && (str[0] & 0xe0) == 0xc0) // 110xxxxx: lead byte of a two-byte sequence
         return str+2;
-    if (likely(str+3 <= f->end) && (str[0] & 0xf0) == 0xe0)
+    if (likely(str+3 <= end) && (str[0] & 0xf0) == 0xe0) // 1110xxxx: lead byte of a three-byte sequence
         return str+3;
-    if (likely(str+4 <= f->end) && (str[0] & 0xf8) == 0xf0)
+    if (likely(str+4 <= end) && (str[0] & 0xf8) == 0xf0) // 11110xxx: lead byte of a four-byte sequence
         return str+4;
-    return likely(str+1 <= f->end) ? str+1 : f->end;
+    return likely(str+1 <= end) ? str+1 : end; // Unrecognized lead byte or sequence cut off by `end`: advance one byte, clamped to `end`
 }
 
 //
 // Return the location of the previous character or UTF8 codepoint.
 // (i.e. skip backwards one codepoint at a time, not one byte at a time)
 //
-const char *prev_char(file_t *f, const char *str)
+const char *prev_char(const char *start, const char *str) // The result never precedes `start`
 {
-    if (likely(str-1 >= f->start) && likely((str[-1] & 0x80) == 0x0))
+    if (likely(str-1 >= start) && likely((str[-1] & 0x80) == 0x0)) // Previous byte is 0xxxxxxx: one-byte character
         return str-1;
-    if (likely(str-2 >= f->start) && (str[-2] & 0xe0) == 0xc0)
+    if (likely(str-2 >= start) && (str[-2] & 0xe0) == 0xc0) // 110xxxxx two bytes back: two-byte sequence
         return str-2;
-    if (likely(str-3 >= f->start) && (str[-3] & 0xf0) == 0xe0)
+    if (likely(str-3 >= start) && (str[-3] & 0xf0) == 0xe0) // 1110xxxx three bytes back: three-byte sequence
         return str-3;
-    if (likely(str-4 >= f->start) && (str[-4] & 0xf8) == 0xf0)
+    if (likely(str-4 >= start) && (str[-4] & 0xf8) == 0xf0) // 11110xxx four bytes back: four-byte sequence
         return str-4;
-    return likely(str-1 >= f->start) ? str-1 : f->start;
+    return likely(str-1 >= start) ? str-1 : start; // No lead byte found: step back one byte, clamped to `start`
 }
 
-static uint32_t get_codepoint(file_t *f, const char *str)
+static uint32_t get_codepoint(const char *str, const char *end)
 {
-    if (str >= f->end)
+    if (unlikely(str >= end))
         return (uint32_t)-1;
 
     unsigned char c1 = (unsigned char)str[0];
@@ -235,7 +236,7 @@ static uint32_t get_codepoint(file_t *f, const char *str)
     }
 
     for (int i = 1; i < seqlen; ++i) {
-        if (unlikely(&str[i] >= f->end || (str[i] & 0xC0) != 0x80))
+        if (unlikely((&str[i] >= end) || (str[i] & 0xC0) != 0x80))
             return (uint32_t)-1;
         codepoint = ((codepoint << 6) | (uint32_t)(str[i] & 0x3F));
     }
@@ -259,22 +260,22 @@ static bool find_in_ranges(uint32_t codepoint, const uint32_t ranges[][2], size_
     return false;
 }
 
-bool isidstart(file_t *f, const char *str)
+bool isidstart(const char *str, const char *end) // True if the character at `str` passes isalpha(), is '_', or is in XID_Start
 {
-    if (unlikely(str >= f->end)) return false;
+    if (unlikely(str >= end)) return false; // No character left before `end`
     else if (isalpha(*str) || *str == '_') return true;
     else if (likely((*str & 0x80) == 0)) return false;
-    uint32_t codepoint = get_codepoint(f, str);
+    uint32_t codepoint = get_codepoint(str, end); // High bit set: decode the sequence; (uint32_t)-1 signals failure
     return codepoint != (uint32_t)-1
         && find_in_ranges(codepoint, XID_Start, ARRAY_LEN(XID_Start));
 }
 
-bool isidcontinue(file_t *f, const char *str)
+bool isidcontinue(const char *str, const char *end)
 {
-    if (unlikely(str >= f->end)) return false;
+    if (unlikely(str >= end)) return false;
     else if (isalnum(*str) || *str == '_') return true;
     else if (likely((*str & 0x80) == 0)) return false;
-    uint32_t codepoint = get_codepoint(f, str);
+    uint32_t codepoint = get_codepoint(str, end);
     return codepoint != (uint32_t)-1
         && (find_in_ranges(codepoint, XID_Start, ARRAY_LEN(XID_Start))
             || find_in_ranges(codepoint, XID_Continue_only, ARRAY_LEN(XID_Continue_only)));
diff --git a/utf8.h b/utf8.h
index 97e259e..243acd3 100644
--- a/utf8.h
+++ b/utf8.h
@@ -1,21 +1,21 @@
 //
 // utf8.h - UTF8 helper functions
 //
-#include "files.h"
-
 #ifndef UTF8__H
 #define UTF8__H
 
+#include <stdbool.h>
+
 #define UTF8_MAXCHARLEN 4
 
 __attribute__((nonnull, pure))
-const char *next_char(file_t *f, const char *str);
+const char *next_char(const char *str, const char *end); // Next character/codepoint after `str`, never beyond `end`
 __attribute__((nonnull, pure))
-const char *prev_char(file_t *f, const char *str);
+const char *prev_char(const char *start, const char *str); // Previous character/codepoint, never before `start` (note: bound comes first here)
 __attribute__((nonnull, pure))
-bool isidstart(file_t *f, const char *str);
+bool isidstart(const char *str, const char *end); // isalpha(), '_' or XID_Start; false when str >= end
 __attribute__((nonnull, pure))
-bool isidcontinue(file_t *f, const char *str);
+bool isidcontinue(const char *str, const char *end); // isalnum(), '_', XID_Start or XID_Continue_only; false when str >= end
 
 #endif
 // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0