aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBruce Hill <bruce@bruce-hill.com>2021-09-21 18:45:43 -0700
committerBruce Hill <bruce@bruce-hill.com>2021-09-21 18:45:43 -0700
commit90c3c13a02e501d3bea839dceb00f09c89bfb5fe (patch)
treec084e3d34fb5fd83a6cc207a78cc8826cafdc031
parent9401facbe7f768c7f4574a7f101d57f7075c42b7 (diff)
Moving cache logic into match, cleaner next_match() API, and slightly
less tightly coupled UTF8 API
-rw-r--r--bp.c46
-rw-r--r--files.c124
-rw-r--r--files.h11
-rw-r--r--match.c252
-rw-r--r--match.h5
-rw-r--r--pattern.c111
-rw-r--r--pattern.h2
-rw-r--r--utf8.c45
-rw-r--r--utf8.h12
9 files changed, 302 insertions, 306 deletions
diff --git a/bp.c b/bp.c
index 1697a55..49e2317 100644
--- a/bp.c
+++ b/bp.c
@@ -169,17 +169,15 @@ static int is_text_file(const char *filename)
//
static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern)
{
- static int matches = 0;
- match_t *m = NULL;
- while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) {
- if (++matches > 1)
+ int nmatches = 0;
+ for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) {
+ if (++nmatches > 1)
printf(",\n");
printf("{\"filename\":\"%s\",\"match\":", f->filename);
json_match(f->start, m, options.verbose);
printf("}");
}
- if (m) recycle_if_unused(&m);
- return matches;
+ return nmatches;
}
//
@@ -187,18 +185,16 @@ static int print_matches_as_json(def_t *defs, file_t *f, pat_t *pattern)
//
static int explain_matches(def_t *defs, file_t *f, pat_t *pattern)
{
- int matches = 0;
- match_t *m = NULL;
- while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) {
- if (++matches == 1) {
+ int nmatches = 0;
+ for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) {
+ if (++nmatches == 1) {
if (options.print_filenames)
fprint_filename(stdout, f->filename);
} else
printf("\n\n");
explain_match(m);
}
- if (m) recycle_if_unused(&m);
- return matches;
+ return nmatches;
}
//
@@ -243,8 +239,7 @@ static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern)
.lineformat = LINE_FORMATS[options.format],
};
- match_t *m = NULL;
- while ((m = next_match(defs, f, m, pattern, options.skip, options.ignorecase))) {
+ for (match_t *m = NULL; next_match(&m, defs, f, pattern, options.skip, options.ignorecase); ) {
if (print_errors(f, m) > 0)
exit(EXIT_FAILURE);
@@ -254,7 +249,6 @@ static int print_matches(FILE *out, def_t *defs, file_t *f, pat_t *pattern)
}
print_match(out, &pr, m);
}
- if (m) recycle_if_unused(&m);
if (matches > 0 || (f->filename[0] == '\0' && options.context_before == ALL_CONTEXT)) {
// Print trailing context lines:
@@ -281,18 +275,19 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern)
if (options.mode == MODE_EXPLAIN) {
matches += explain_matches(defs, f, pattern);
} else if (options.mode == MODE_LISTFILES) {
- match_t *m = next_match(defs, f, NULL, pattern, options.skip, options.ignorecase);
- if (m) {
- recycle_if_unused(&m);
+ match_t *m = NULL;
+ if (next_match(&m, defs, f, pattern, options.skip, options.ignorecase)) {
printf("%s\n", f->filename);
matches += 1;
}
+ stop_matching(&m);
} else if (options.mode == MODE_JSON) {
matches += print_matches_as_json(defs, f, pattern);
} else if (options.mode == MODE_INPLACE) {
- match_t *m = next_match(defs, f, NULL, pattern, options.skip, options.ignorecase);
- if (m) recycle_if_unused(&m);
- else return 0;
+ match_t *m = NULL;
+ bool found = next_match(&m, defs, f, pattern, options.skip, options.ignorecase);
+ stop_matching(&m);
+ if (!found) return 0;
// Ensure the file is resident in memory:
if (f->mmapped) {
@@ -315,7 +310,6 @@ static int process_file(def_t *defs, const char *filename, pat_t *pattern)
}
fflush(stdout);
- cache_destroy(f);
if (recycle_all_matches() != 0)
fprintf(stderr, "\033[33;1mMemory leak: there should no longer be any matches in use at this point.\033[m\n");
destroy_file(&f);
@@ -480,10 +474,10 @@ int main(int argc, char *argv[])
file_t *arg_file = spoof_file(&loaded_files, "<skip argument>", flag, -1);
pat_t *s = bp_pattern(arg_file, arg_file->start);
if (!s) {
- fprint_line(stdout, arg_file, arg_file->start, arg_file->end,
+ file_err(arg_file, arg_file->start, arg_file->end,
"Failed to compile the skip argument");
} else if (after_spaces(s->end, true) < arg_file->end) {
- fprint_line(stdout, arg_file, s->end, arg_file->end,
+ file_err(arg_file, s->end, arg_file->end,
"Failed to compile part of the skip argument");
}
options.skip = either_pat(arg_file, options.skip, s);
@@ -537,10 +531,6 @@ int main(int argc, char *argv[])
// Handle exit() calls gracefully:
require(atexit(&cleanup), "Failed to set cleanup handler at exit");
- // No need for these caches anymore:
- for (file_t *f = loaded_files; f; f = f->next)
- cache_destroy(f);
-
int found = 0;
if (options.mode == MODE_JSON) printf("[");
if (options.git_mode) { // Get the list of files from `git --ls-files ...`
diff --git a/files.c b/files.c
index 774f830..5e9b40e 100644
--- a/files.c
+++ b/files.c
@@ -182,8 +182,6 @@ void destroy_file(file_t **at_f)
f->mmapped = NULL;
}
- cache_destroy(f);
-
for (pat_t *next; f->pats; f->pats = next) {
next = f->pats->next;
delete(&f->pats);
@@ -261,126 +259,4 @@ void fprint_line(FILE *dest, file_t *f, const char *start, const char *end, cons
fprintf(dest, "\033[m\n");
}
-//
-// Hash a string position/pattern.
-//
-static inline size_t hash(const char *str, pat_t *pat)
-{
- return (size_t)str + 2*pat->id;
-}
-
-//
-// Check if we have memoized a pattern match at the given position for the
-// given definitions. If a result has been memoized, set *result to the
-// memoized value and return true, otherwise return false.
-//
-bool cache_get(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t **result)
-{
- if (!f->cache.matches) return NULL;
- size_t h = hash(str, pat) & (f->cache.size-1);
- for (match_t *c = f->cache.matches[h]; c; c = c->cache.next) {
- if (c->pat == pat && c->defs_id == (defs?defs->id:0) && c->start == str) {
- // If c->end == NULL, that means no match occurs here
- *result = c->end == NULL ? NULL : c;
- return true;
- }
- }
- return false;
-}
-
-//
-// Remove an item from the cache.
-//
-static void cache_remove(file_t *f, match_t *m)
-{
- if (!m->cache.home) return;
- *m->cache.home = m->cache.next;
- if (m->cache.next) m->cache.next->cache.home = m->cache.home;
- m->cache.next = NULL;
- m->cache.home = NULL;
- if (--m->refcount == 0) recycle_if_unused(&m);
- --f->cache.occupancy;
-}
-
-//
-// Save a match in the cache.
-//
-void cache_save(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t *m)
-{
- // As a convention, a match with {.pat=pat, .start=str, .end==NULL} is used
- // to memoize the fact that `pat` will *not* match at `str`.
- if (m == NULL) m = new_match(defs, pat, str, NULL, NULL);
-
- if (f->cache.occupancy+1 > 3*f->cache.size) {
- if (f->cache.size == MAX_CACHE_SIZE) {
- size_t h = hash(m->start, m->pat) & (f->cache.size-1);
- for (int quota = 2; f->cache.matches[h] && quota > 0; quota--) {
- match_t *last = f->cache.matches[h];
- while (last->cache.next) last = last->cache.next;
- cache_remove(f, last);
- }
- } else {
- match_t **old_matches = f->cache.matches;
- size_t old_size = f->cache.size;
- f->cache.size = old_size == 0 ? 16 : 2*old_size;
- f->cache.matches = new(match_t*[f->cache.size]);
-
- // Rehash:
- if (old_matches) {
- for (size_t i = 0; i < old_size; i++) {
- for (match_t *o; (o = old_matches[i]); ) {
- *o->cache.home = o->cache.next;
- if (o->cache.next) o->cache.next->cache.home = o->cache.home;
- size_t h = hash(o->start, o->pat) & (f->cache.size-1);
- o->cache.home = &(f->cache.matches[h]);
- o->cache.next = f->cache.matches[h];
- if (f->cache.matches[h]) f->cache.matches[h]->cache.home = &o->cache.next;
- f->cache.matches[h] = o;
- }
- }
- free(old_matches);
- }
- }
- }
-
- size_t h = hash(m->start, m->pat) & (f->cache.size-1);
- m->cache.home = &(f->cache.matches[h]);
- m->cache.next = f->cache.matches[h];
- if (f->cache.matches[h]) f->cache.matches[h]->cache.home = &m->cache.next;
- f->cache.matches[h] = m;
- ++m->refcount;
- ++f->cache.occupancy;
-}
-
-//
-// Remove all items from the cache that do not overlap `start` and `end`.
-// (This is used to remove useless items from the cache)
-//
-void cache_prune(file_t *f, const char *start, const char *end)
-{
- if (!f->cache.matches) return;
- for (size_t i = 0; i < f->cache.size; i++) {
- for (match_t *m = f->cache.matches[i], *next = NULL; m; m = next) {
- next = m->cache.next;
- if (m->start < start || (m->end ? m->end : m->start) > end)
- cache_remove(f, m);
- }
- }
-}
-
-//
-// Clear and deallocate the cache.
-//
-void cache_destroy(file_t *f)
-{
- if (!f->cache.matches) return;
- for (size_t i = 0; i < f->cache.size; i++) {
- while (f->cache.matches[i])
- cache_remove(f, f->cache.matches[i]);
- }
- f->cache.occupancy = 0;
- delete(&f->cache.matches);
- f->cache.size = 0;
-}
-
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/files.h b/files.h
index 840412b..b51b9c7 100644
--- a/files.h
+++ b/files.h
@@ -6,14 +6,11 @@
#include "types.h"
-#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>
#define file_err(f, ...) do { fprint_line(stderr, f, __VA_ARGS__); exit(EXIT_FAILURE); } while(false)
-#define MAX_CACHE_SIZE (1<<14)
-
typedef struct file_s {
struct file_s *next;
const char *filename;
@@ -43,14 +40,6 @@ __attribute__((pure, nonnull))
const char *get_line(file_t *f, size_t line_number);
__attribute__((nonnull(1,2,3), format(printf,5,6)))
void fprint_line(FILE *dest, file_t *f, const char *start, const char *end, const char *fmt, ...);
-__attribute__((nonnull(1,3,4,5)))
-bool cache_get(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t **result);
-__attribute__((nonnull(1,3,4)))
-void cache_save(file_t *f, def_t *defs, const char *str, pat_t *pat, match_t *m);
-__attribute__((nonnull))
-void cache_prune(file_t *f, const char *start, const char *end);
-__attribute__((nonnull))
-void cache_destroy(file_t *f);
#endif
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/match.c b/match.c
index 99ef145..2a40472 100644
--- a/match.c
+++ b/match.c
@@ -16,6 +16,13 @@
#include "utils.h"
#include "utf8.h"
+#define MAX_CACHE_SIZE (1<<14)
+
+typedef struct {
+ size_t size, occupancy;
+ match_t **matches;
+} cache_t;
+
// New match objects are either recycled from unused match objects or allocated
// from the heap. While it is in use, the match object is stored in the
// `in_use_matches` linked list. Once it is no longer needed, it is moved to
@@ -27,10 +34,8 @@ static match_t *in_use_matches = NULL;
#define MATCHES(...) (match_t*[]){__VA_ARGS__, NULL}
-__attribute__((nonnull(1)))
-static inline pat_t *deref(def_t *defs, pat_t *pat);
-__attribute__((hot, nonnull(2,3,4)))
-static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ignorecase);
+__attribute__((hot, nonnull(2,3,4,5)))
+static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, bool ignorecase);
// Store a value and update its refcount
static inline void add_owner(match_t** owner, match_t* owned)
@@ -81,9 +86,116 @@ static inline void list_remove(match_t *m, match_dll_t *node)
}
//
+// Hash a string position/pattern.
+//
+static inline size_t hash(const char *str, pat_t *pat)
+{
+ return (size_t)str + 2*pat->id;
+}
+
+//
+// Check if we have memoized a pattern match at the given position for the
+// given definitions. If a result has been memoized, set *result to the
+// memoized value and return true, otherwise return false.
+//
+static bool cache_get(cache_t *cache, def_t *defs, const char *str, pat_t *pat, match_t **result)
+{
+ if (!cache->matches) return NULL;
+ size_t h = hash(str, pat) & (cache->size-1);
+ for (match_t *c = cache->matches[h]; c; c = c->cache.next) {
+ if (c->pat == pat && c->defs_id == (defs?defs->id:0) && c->start == str) {
+ // If c->end == NULL, that means no match occurs here
+ *result = c->end == NULL ? NULL : c;
+ return true;
+ }
+ }
+ return false;
+}
+
+//
+// Remove an item from the cache.
+//
+static void cache_remove(cache_t *cache, match_t *m)
+{
+ if (!m->cache.home) return;
+ *m->cache.home = m->cache.next;
+ if (m->cache.next) m->cache.next->cache.home = m->cache.home;
+ m->cache.next = NULL;
+ m->cache.home = NULL;
+ if (--m->refcount == 0) recycle_if_unused(&m);
+ --cache->occupancy;
+}
+
+//
+// Save a match in the cache.
+//
+static void cache_save(cache_t *cache, def_t *defs, const char *str, pat_t *pat, match_t *m)
+{
+ // As a convention, a match with {.pat=pat, .start=str, .end==NULL} is used
+ // to memoize the fact that `pat` will *not* match at `str`.
+ if (m == NULL) m = new_match(defs, pat, str, NULL, NULL);
+
+ if (cache->occupancy+1 > 3*cache->size) {
+ if (cache->size == MAX_CACHE_SIZE) {
+ size_t h = hash(m->start, m->pat) & (cache->size-1);
+ for (int quota = 2; cache->matches[h] && quota > 0; quota--) {
+ match_t *last = cache->matches[h];
+ while (last->cache.next) last = last->cache.next;
+ cache_remove(cache, last);
+ }
+ } else {
+ match_t **old_matches = cache->matches;
+ size_t old_size = cache->size;
+ cache->size = old_size == 0 ? 16 : 2*old_size;
+ cache->matches = new(match_t*[cache->size]);
+
+ // Rehash:
+ if (old_matches) {
+ for (size_t i = 0; i < old_size; i++) {
+ for (match_t *o; (o = old_matches[i]); ) {
+ *o->cache.home = o->cache.next;
+ if (o->cache.next) o->cache.next->cache.home = o->cache.home;
+ size_t h = hash(o->start, o->pat) & (cache->size-1);
+ o->cache.home = &(cache->matches[h]);
+ o->cache.next = cache->matches[h];
+ if (cache->matches[h]) cache->matches[h]->cache.home = &o->cache.next;
+ cache->matches[h] = o;
+ }
+ }
+ free(old_matches);
+ }
+ }
+ }
+
+ size_t h = hash(m->start, m->pat) & (cache->size-1);
+ m->cache.home = &(cache->matches[h]);
+ m->cache.next = cache->matches[h];
+ if (cache->matches[h]) cache->matches[h]->cache.home = &m->cache.next;
+ cache->matches[h] = m;
+ ++m->refcount;
+ ++cache->occupancy;
+}
+
+//
+// Clear and deallocate the cache.
+//
+void cache_destroy(cache_t *cache)
+{
+ if (!cache->matches) return;
+ for (size_t i = 0; i < cache->size; i++) {
+ while (cache->matches[i])
+ cache_remove(cache, cache->matches[i]);
+ }
+ cache->occupancy = 0;
+ delete(&cache->matches);
+ cache->size = 0;
+}
+
+//
// If the given pattern is a reference, look it up and return the referenced
// pattern. This is used for an optimization to avoid repeated lookups.
//
+__attribute__((nonnull(1)))
static inline pat_t *deref(def_t *defs, pat_t *pat)
{
if (pat && pat->type == BP_REF) {
@@ -128,15 +240,18 @@ static pat_t *first_pat(def_t *defs, pat_t *pat)
//
// Find the next match after prev (or the first match if prev is NULL)
//
-match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *skip, bool ignorecase)
+__attribute__((nonnull(3,5)))
+static match_t *_next_match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, pat_t *skip, bool ignorecase)
{
- const char *str;
- if (prev) {
- str = prev->end > prev->start ? prev->end : prev->end + 1;
- if (prev->refcount == 0) recycle_if_unused(&prev);
- cache_prune(f, str, f->end);
- } else {
- str = f->start;
+ // Prune the unnecessary entries from the cache (those not between start/end)
+ if (cache->matches) {
+ for (size_t i = 0; i < cache->size; i++) {
+ for (match_t *m = cache->matches[i], *next = NULL; m; m = next) {
+ next = m->cache.next;
+ if (m->start < f->start || (m->end ? m->end : m->start) > f->end)
+ cache_remove(cache, m);
+ }
+ }
}
pat = deref(defs, pat);
@@ -162,14 +277,14 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk
if (str > f->end) return NULL;
do {
- match_t *m = match(defs, f, str, pat, ignorecase);
+ match_t *m = match(defs, cache, f, str, pat, ignorecase);
if (m) return m;
if (first->type == BP_START_OF_FILE) return NULL;
match_t *s;
- if (skip && (s = match(defs, f, str, skip, ignorecase))) {
+ if (skip && (s = match(defs, cache, f, str, skip, ignorecase))) {
str = s->end > str ? s->end : str + 1;
recycle_if_unused(&s);
- } else str = next_char(f, str);
+ } else str = next_char(str, f->end);
} while (str < f->end);
return NULL;
}
@@ -179,12 +294,12 @@ match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *sk
// match object, or NULL if no match is found.
// The returned value should be free()'d to avoid memory leaking.
//
-static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool ignorecase)
+static match_t *match(def_t *defs, cache_t *cache, file_t *f, const char *str, pat_t *pat, bool ignorecase)
{
switch (pat->type) {
case BP_DEFINITION: {
def_t *defs2 = with_def(defs, pat->args.def.namelen, pat->args.def.name, pat->args.def.def);
- match_t *m = match(defs2, f, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def, ignorecase);
+ match_t *m = match(defs2, cache, f, str, pat->args.def.pat ? pat->args.def.pat : pat->args.def.def, ignorecase);
defs = free_defs(defs2, defs);
return m;
}
@@ -198,17 +313,17 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
++pat->args.leftrec.visits;
return pat->args.leftrec.match;
} else {
- return match(defs, f, str, pat->args.leftrec.fallback, ignorecase);
+ return match(defs, cache, f, str, pat->args.leftrec.fallback, ignorecase);
}
}
case BP_ANYCHAR: {
- return (str < f->end && *str != '\n') ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL;
+ return (str < f->end && *str != '\n') ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL;
}
case BP_ID_START: {
- return (str < f->end && isidstart(f, str)) ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL;
+ return (str < f->end && isidstart(str, f->end)) ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL;
}
case BP_ID_CONTINUE: {
- return (str < f->end && isidcontinue(f, str)) ? new_match(defs, pat, str, next_char(f, str), NULL) : NULL;
+ return (str < f->end && isidcontinue(str, f->end)) ? new_match(defs, pat, str, next_char(str, f->end), NULL) : NULL;
}
case BP_START_OF_FILE: {
return (str == f->start) ? new_match(defs, pat, str, str, NULL) : NULL;
@@ -223,7 +338,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
return (str == f->end || *str == '\n') ? new_match(defs, pat, str, str, NULL) : NULL;
}
case BP_WORD_BOUNDARY: {
- return (str == f->start || isidcontinue(f, str) != isidcontinue(f, prev_char(f, str))) ? new_match(defs, pat, str, str, NULL) : NULL;
+ return (str == f->start || isidcontinue(str, f->end) != isidcontinue(prev_char(f->start, str), f->end)) ? new_match(defs, pat, str, str, NULL) : NULL;
}
case BP_STRING: {
if (&str[pat->min_matchlen] > f->end) return NULL;
@@ -238,7 +353,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
return new_match(defs, pat, str, str+1, NULL);
}
case BP_NOT: {
- match_t *m = match(defs, f, str, pat->args.pat, ignorecase);
+ match_t *m = match(defs, cache, f, str, pat->args.pat, ignorecase);
if (m != NULL) {
recycle_if_unused(&m);
return NULL;
@@ -259,7 +374,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
for (const char *prev = NULL; prev < str; ) {
prev = str;
if (target) {
- match_t *p = match(defs, f, str, target, ignorecase);
+ match_t *p = match(defs, cache, f, str, target, ignorecase);
if (p != NULL) {
recycle_if_unused(&p);
m->end = str;
@@ -270,7 +385,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
return m;
}
if (skip) {
- match_t *s = match(defs, f, str, skip, ignorecase);
+ match_t *s = match(defs, cache, f, str, skip, ignorecase);
if (s != NULL) {
str = s->end;
if (nchildren+2 >= child_cap) {
@@ -285,7 +400,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
// be at least once chance to match the pattern, even if
// we're at the end of the string already (e.g. "..$").
if (str < f->end && *str != '\n' && pat->type != BP_UPTO_STRICT)
- str = next_char(f, str);
+ str = next_char(str, f->end);
}
recycle_if_unused(&m);
return NULL;
@@ -302,11 +417,11 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
// Separator
match_t *msep = NULL;
if (sep != NULL && reps > 0) {
- msep = match(defs, f, str, sep, ignorecase);
+ msep = match(defs, cache, f, str, sep, ignorecase);
if (msep == NULL) break;
str = msep->end;
}
- match_t *mp = match(defs, f, str, repeating, ignorecase);
+ match_t *mp = match(defs, cache, f, str, repeating, ignorecase);
if (mp == NULL) {
str = start;
if (msep) recycle_if_unused(&msep);
@@ -358,19 +473,20 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
// current pos, so mock it out as a file slice.
// TODO: this breaks ^/^^/$/$$, but that can probably be ignored
// because you rarely need to check those in a backtrack.
+ cache_t slice_cache = {0};
file_t slice;
slice_file(&slice, f, f->start, str);
for (const char *pos = &str[-(long)back->min_matchlen];
pos >= f->start && (back->max_matchlen == -1 || pos >= &str[-(int)back->max_matchlen]);
- pos = prev_char(f, pos)) {
- cache_destroy(&slice);
+ pos = prev_char(f->start, pos)) {
+ cache_destroy(&slice_cache);
slice.start = (char*)pos;
- match_t *m = match(defs, &slice, pos, back, ignorecase);
+ match_t *m = match(defs, &slice_cache, &slice, pos, back, ignorecase);
// Match should not go past str (i.e. (<"AB" "B") should match "ABB", but not "AB")
if (m && m->end != str)
recycle_if_unused(&m);
else if (m) {
- cache_destroy(&slice);
+ cache_destroy(&slice_cache);
return new_match(defs, pat, str, str, MATCHES(m));
}
if (pos == f->start) break;
@@ -378,23 +494,23 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
// walking backwards endlessly over newlines.
if (back->max_matchlen == -1 && *pos == '\n') break;
}
- cache_destroy(&slice);
+ cache_destroy(&slice_cache);
return NULL;
}
case BP_BEFORE: {
- match_t *after = match(defs, f, str, pat->args.pat, ignorecase);
+ match_t *after = match(defs, cache, f, str, pat->args.pat, ignorecase);
return after ? new_match(defs, pat, str, str, MATCHES(after)) : NULL;
}
case BP_CAPTURE: {
- match_t *p = match(defs, f, str, pat->args.pat, ignorecase);
+ match_t *p = match(defs, cache, f, str, pat->args.pat, ignorecase);
return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL;
}
case BP_OTHERWISE: {
- match_t *m = match(defs, f, str, pat->args.multiple.first, ignorecase);
- return m ? m : match(defs, f, str, pat->args.multiple.second, ignorecase);
+ match_t *m = match(defs, cache, f, str, pat->args.multiple.first, ignorecase);
+ return m ? m : match(defs, cache, f, str, pat->args.multiple.second, ignorecase);
}
case BP_CHAIN: {
- match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase);
+ match_t *m1 = match(defs, cache, f, str, pat->args.multiple.first, ignorecase);
if (m1 == NULL) return NULL;
match_t *m2;
@@ -408,7 +524,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
def_t *defs2 = with_def(defs, m1->pat->args.capture.namelen, m1->pat->args.capture.name, backref);
++m1->refcount; {
- m2 = match(defs2, f, m1->end, pat->args.multiple.second, ignorecase);
+ m2 = match(defs2, cache, f, m1->end, pat->args.multiple.second, ignorecase);
if (!m2) { // No need to keep the backref in memory if it didn't match
for (pat_t **rem = &f->pats; *rem; rem = &(*rem)->next) {
if ((*rem) == backref) {
@@ -422,7 +538,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
defs = free_defs(defs2, defs);
} --m1->refcount;
} else {
- m2 = match(defs, f, m1->end, pat->args.multiple.second, ignorecase);
+ m2 = match(defs, cache, f, m1->end, pat->args.multiple.second, ignorecase);
}
if (m2 == NULL) {
@@ -433,35 +549,36 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
return new_match(defs, pat, str, m2->end, MATCHES(m1, m2));
}
case BP_MATCH: case BP_NOT_MATCH: {
- match_t *m1 = match(defs, f, str, pat->args.multiple.first, ignorecase);
+ match_t *m1 = match(defs, cache, f, str, pat->args.multiple.first, ignorecase);
if (m1 == NULL) return NULL;
// <p1>~<p2> matches iff the text of <p1> matches <p2>
// <p1>!~<p2> matches iff the text of <p1> does not match <p2>
+ cache_t slice_cache = {0};
file_t slice;
slice_file(&slice, f, m1->start, m1->end);
- match_t *m2 = next_match(defs, &slice, NULL, pat->args.multiple.second, NULL, ignorecase);
+ match_t *m2 = _next_match(defs, &slice_cache, &slice, slice.start, pat->args.multiple.second, NULL, ignorecase);
if ((!m2 && pat->type == BP_MATCH) || (m2 && pat->type == BP_NOT_MATCH)) {
- cache_destroy(&slice);
+ cache_destroy(&slice_cache);
if (m2) recycle_if_unused(&m2);
recycle_if_unused(&m1);
return NULL;
}
match_t *ret = new_match(defs, pat, m1->start, m1->end, (pat->type == BP_MATCH) ? MATCHES(m1, m2) : MATCHES(m1));
- cache_destroy(&slice);
+ cache_destroy(&slice_cache);
return ret;
}
case BP_REPLACE: {
match_t *p = NULL;
if (pat->args.replace.pat) {
- p = match(defs, f, str, pat->args.replace.pat, ignorecase);
+ p = match(defs, cache, f, str, pat->args.replace.pat, ignorecase);
if (p == NULL) return NULL;
}
return new_match(defs, pat, str, p ? p->end : str, MATCHES(p));
}
case BP_REF: {
match_t *cached;
- if (cache_get(f, defs, str, pat, &cached))
+ if (cache_get(cache, defs, str, pat, &cached))
return cached;
def_t *def = lookup(defs, pat->args.ref.len, pat->args.ref.name);
@@ -490,9 +607,9 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
};
const char *prev = str;
- match_t *m = match(&defs2, f, str, ref, ignorecase);
+ match_t *m = match(&defs2, cache, f, str, ref, ignorecase);
if (m == NULL) {
- cache_save(f, defs, str, pat, NULL);
+ cache_save(cache, defs, str, pat, NULL);
return NULL;
}
@@ -501,7 +618,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
remove_ownership(&rec_op.args.leftrec.match);
add_owner(&rec_op.args.leftrec.match, m);
prev = m->end;
- match_t *m2 = match(&defs2, f, str, ref, ignorecase);
+ match_t *m2 = match(&defs2, cache, f, str, ref, ignorecase);
if (m2 == NULL) break;
if (m2->end <= prev) {
recycle_if_unused(&m2);
@@ -516,7 +633,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
// results.
// OPTIMIZE: remove this if necessary
match_t *wrap = new_match(defs, pat, m->start, m->end, MATCHES(m));
- cache_save(f, defs, str, pat, wrap);
+ cache_save(cache, defs, str, pat, wrap);
if (rec_op.args.leftrec.match)
remove_ownership(&rec_op.args.leftrec.match);
@@ -527,9 +644,8 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
if (*str != '\n') return NULL;
const char *start = str;
- size_t linenum = get_line_number(f, str);
- const char *p = get_line(f, linenum);
- if (p < f->start) p = f->start; // Can happen with recursive matching
+ const char *p = str;
+ while (p > f->start && p[-1] != '\n') --p;
// Current indentation:
char denter = *p;
@@ -546,7 +662,7 @@ static match_t *match(def_t *defs, file_t *f, const char *str, pat_t *pat, bool
return new_match(defs, pat, start, &str[dents], NULL);
}
case BP_ERROR: {
- match_t *p = pat->args.pat ? match(defs, f, str, pat->args.pat, ignorecase) : NULL;
+ match_t *p = pat->args.pat ? match(defs, cache, f, str, pat->args.pat, ignorecase) : NULL;
return p ? new_match(defs, pat, str, p->end, MATCHES(p)) : NULL;
}
default: {
@@ -644,4 +760,32 @@ size_t free_all_matches(void)
return count;
}
+//
+// Iterate over matches.
+// Usage: for (match_t *m = NULL; next_match(&m, ...); ) {...}
+//
+bool next_match(match_t **m, def_t *defs, file_t *f, pat_t *pat, pat_t *skip, bool ignorecase)
+{
+ static cache_t cache = {0};
+ if (!f || !pat) { // Cleanup for stop_matching()
+ recycle_if_unused(m);
+ cache_destroy(&cache);
+ return false;
+ }
+
+ const char *start;
+ if (*m) {
+ // Make sure forward progress is occurring, even after zero-width matches:
+ start = ((*m)->end > (*m)->start) ? (*m)->end : (*m)->end+1;
+ recycle_if_unused(m);
+ } else {
+ start = f->start;
+ cache_destroy(&cache);
+ }
+
+ *m = (start <= f->end) ? _next_match(defs, &cache, f, start, pat, skip, ignorecase) : NULL;
+ if (!*m) cache_destroy(&cache);
+ return *m != NULL;
+}
+
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/match.h b/match.h
index cdd6592..535e20e 100644
--- a/match.h
+++ b/match.h
@@ -12,12 +12,13 @@
__attribute__((returns_nonnull))
match_t *new_match(def_t *defs, pat_t *pat, const char *start, const char *end, match_t *children[]);
-__attribute__((nonnull(2,4)))
-match_t *next_match(def_t *defs, file_t *f, match_t *prev, pat_t *pat, pat_t *skip, bool ignorecase);
__attribute__((nonnull))
void recycle_if_unused(match_t **at_m);
size_t free_all_matches(void);
size_t recycle_all_matches(void);
+bool next_match(match_t **m, def_t *defs, file_t *f, pat_t *pat, pat_t *skip, bool ignorecase);
+#define stop_matching(m) next_match(m, NULL, NULL, NULL, NULL, 0)
+
#endif
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0
diff --git a/pattern.c b/pattern.c
index 7e31bfc..28a4ecf 100644
--- a/pattern.c
+++ b/pattern.c
@@ -16,16 +16,6 @@
__attribute__((nonnull))
static pat_t *bp_pattern_nl(file_t *f, const char *str, bool allow_nl);
__attribute__((nonnull))
-static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl);
-__attribute__((nonnull))
-static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl);
-__attribute__((nonnull))
-static pat_t *expand_choices(file_t *f, pat_t *first, bool allow_nl);
-__attribute__((nonnull))
-static pat_t *_bp_simplepattern(file_t *f, const char *str);
-__attribute__((nonnull(1,2,3,6)))
-static pat_t *new_range(file_t *f, const char *start, const char *end, size_t min, ssize_t max, pat_t *repeating, pat_t *sep);
-__attribute__((nonnull(1,2)))
static pat_t *bp_simplepattern(file_t *f, const char *str);
//
@@ -52,6 +42,7 @@ pat_t *new_pat(file_t *f, const char *start, const char *end, size_t minlen, ssi
//
// Helper function to initialize a range object.
//
+__attribute__((nonnull(1,2,3,6)))
static pat_t *new_range(file_t *f, const char *start, const char *end, size_t min, ssize_t max, pat_t *repeating, pat_t *sep)
{
size_t minlen = min*repeating->min_matchlen + (min > 0 ? min-1 : 0)*(sep ? sep->min_matchlen : 0);
@@ -69,6 +60,7 @@ static pat_t *new_range(file_t *f, const char *start, const char *end, size_t mi
// Take a pattern and expand it into a chain of patterns if it's followed by
// any patterns (e.g. "`x `y"), otherwise return the original input.
//
+__attribute__((nonnull))
static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl)
{
const char *str = after_spaces(first->end, allow_nl);
@@ -84,6 +76,7 @@ static pat_t *expand_chain(file_t *f, pat_t *first, bool allow_nl)
//
// Match trailing => replacements (with optional pattern beforehand)
//
+__attribute__((nonnull))
static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl)
{
const char *str = replace_pat->end;
@@ -94,12 +87,12 @@ static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl)
|| matchchar(&str, '{', allow_nl) || matchchar(&str, '\002', allow_nl)) {
char closequote = str[-1] == '{' ? '}' : (str[-1] == '\002' ? '\003' : str[-1]);
repstr = str;
- for (; *str && *str != closequote; str = next_char(f, str)) {
+ for (; str < f->end && *str != closequote; str = next_char(str, f->end)) {
if (*str == '\\') {
if (!str[1] || str[1] == '\n')
file_err(f, str, str+1,
"There should be an escape sequence after this backslash.");
- str = next_char(f, str);
+ str = next_char(str, f->end);
}
}
replen = (size_t)(str-repstr);
@@ -124,6 +117,7 @@ static pat_t *expand_replacements(file_t *f, pat_t *replace_pat, bool allow_nl)
// chain of choices if it's followed by any "/"-separated patterns (e.g.
// "`x/`y"), otherwise return the original input.
//
+__attribute__((nonnull))
static pat_t *expand_choices(file_t *f, pat_t *first, bool allow_nl)
{
first = expand_chain(f, first, allow_nl);
@@ -192,53 +186,22 @@ pat_t *either_pat(file_t *f, pat_t *first, pat_t *second)
}
//
-// Wrapper for _bp_simplepattern() that expands any postfix operators (~, !~)
-//
-static pat_t *bp_simplepattern(file_t *f, const char *str)
-{
- pat_t *pat = _bp_simplepattern(f, str);
- if (pat == NULL) return pat;
- str = pat->end;
-
- // Expand postfix operators (if any)
- while (str < f->end) {
- enum pattype_e type;
- if (matchchar(&str, '~', false))
- type = BP_MATCH;
- else if (matchstr(&str, "!~", false))
- type = BP_NOT_MATCH;
- else break;
-
- pat_t *first = pat;
- pat_t *second = bp_simplepattern(f, str);
- if (!second)
- file_err(f, str, str, "The '%s' operator expects a pattern before and after.", type == BP_MATCH ? "~" : "!~");
-
- pat = new_pat(f, str, second->end, first->min_matchlen, first->max_matchlen, type);
- pat->args.multiple.first = first;
- pat->args.multiple.second = second;
- str = pat->end;
- }
-
- return pat;
-}
-
-//
// Compile a string of BP code into a BP pattern object.
//
+__attribute__((nonnull))
static pat_t *_bp_simplepattern(file_t *f, const char *str)
{
str = after_spaces(str, false);
if (!*str) return NULL;
const char *start = str;
char c = *str;
- str = next_char(f, str);
+ str = next_char(str, f->end);
switch (c) {
// Any char (dot)
case '.': {
if (*str == '.') { // ".."
pat_t *skip = NULL;
- str = next_char(f, str);
+ str = next_char(str, f->end);
char skipper = *str;
if (matchchar(&str, '%', false) || matchchar(&str, '=', false)) {
skip = bp_simplepattern(f, str);
@@ -261,11 +224,11 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
file_err(f, str, str, "There should be a character here after the '`'");
const char *c1_loc = str;
- str = next_char(f, c1_loc);
+ str = next_char(c1_loc, f->end);
if (*str == '-') { // Range
const char *c2_loc = ++str;
- if (next_char(f, c1_loc) > c1_loc+1 || next_char(f, c2_loc) > c2_loc+1)
- file_err(f, start, next_char(f, c2_loc), "Sorry, UTF-8 character ranges are not yet supported.");
+ if (next_char(c1_loc, f->end) > c1_loc+1 || next_char(c2_loc, f->end) > c2_loc+1)
+ file_err(f, start, next_char(c2_loc, f->end), "Sorry, UTF-8 character ranges are not yet supported.");
char c1 = *c1_loc, c2 = *c2_loc;
if (!c2 || c2 == '\n')
file_err(f, str, str, "There should be a character here to complete the character range.");
@@ -274,7 +237,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
c1 = c2;
c2 = tmp;
}
- str = next_char(f, c2_loc);
+ str = next_char(c2_loc, f->end);
pat_t *pat = new_pat(f, start == c1_loc - 1 ? start : c1_loc, str, 1, 1, BP_RANGE);
pat->args.range.low = (unsigned char)c1;
pat->args.range.high = (unsigned char)c2;
@@ -318,8 +281,8 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
unsigned char e_high = e_low;
if (*str == '-') { // Escape range (e.g. \x00-\xFF)
++str;
- if (next_char(f, str) != str+1)
- file_err(f, start, next_char(f, str), "Sorry, UTF8 escape sequences are not supported in ranges.");
+ if (next_char(str, f->end) != str+1)
+ file_err(f, start, next_char(str, f->end), "Sorry, UTF8 escape sequences are not supported in ranges.");
const char *seqstart = str;
e_high = (unsigned char)unescapechar(str, &str);
if (str == seqstart)
@@ -331,7 +294,7 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
esc->args.range.low = e_low;
esc->args.range.high = e_high;
all = either_pat(f, all, esc);
- } while (*str++ == ',');
+ } while (*str == ',' && str++ < f->end);
return all;
}
@@ -344,9 +307,9 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
char endquote = c == '\002' ? '\003' : (c == '{' ? '}' : c);
char *litstart = (char*)str;
while (str < f->end && *str != endquote)
- str = next_char(f, str);
+ str = next_char(str, f->end);
size_t len = (size_t)(str - litstart);
- str = next_char(f, str);
+ str = next_char(str, f->end);
pat_t *pat = new_pat(f, start, str, len, (ssize_t)len, BP_STRING);
pat->args.string = litstart;
@@ -528,10 +491,10 @@ static pat_t *_bp_simplepattern(file_t *f, const char *str)
pat_t *bp_stringpattern(file_t *f, const char *str)
{
pat_t *ret = NULL;
- while (*str) {
+ while (str < f->end) {
char *start = (char*)str;
pat_t *interp = NULL;
- for (; str < f->end; str = next_char(f, str)) {
+ for (; str < f->end; str = next_char(str, f->end)) {
if (*str == '\\' && str+1 < f->end) {
if (str[1] == '\\' || isalnum(str[1]))
interp = bp_simplepattern(f, str);
@@ -559,6 +522,38 @@ pat_t *bp_stringpattern(file_t *f, const char *str)
}
//
+// Wrapper for _bp_simplepattern() that expands any postfix operators (~, !~)
+//
+static pat_t *bp_simplepattern(file_t *f, const char *str)
+{
+ pat_t *pat = _bp_simplepattern(f, str);
+ if (pat == NULL) return pat;
+ str = pat->end;
+
+ // Expand postfix operators (if any)
+ while (str < f->end) {
+ enum pattype_e type;
+ if (matchchar(&str, '~', false))
+ type = BP_MATCH;
+ else if (matchstr(&str, "!~", false))
+ type = BP_NOT_MATCH;
+ else break;
+
+ pat_t *first = pat;
+ pat_t *second = bp_simplepattern(f, str);
+ if (!second)
+ file_err(f, str, str, "The '%s' operator expects a pattern before and after.", type == BP_MATCH ? "~" : "!~");
+
+ pat = new_pat(f, str, second->end, first->min_matchlen, first->max_matchlen, type);
+ pat->args.multiple.first = first;
+ pat->args.multiple.second = second;
+ str = pat->end;
+ }
+
+ return pat;
+}
+
+//
// Given a pattern and a replacement string, compile the two into a BP
// replace pattern.
//
@@ -567,7 +562,7 @@ pat_t *bp_replacement(file_t *f, pat_t *replacepat, const char *replacement)
pat_t *pat = new_pat(f, replacepat->start, replacepat->end, replacepat->min_matchlen, replacepat->max_matchlen, BP_REPLACE);
pat->args.replace.pat = replacepat;
const char *p = replacement;
- for (; *p; p++) {
+ for (; p < f->end; p++) {
if (*p == '\\') {
if (!p[1] || p[1] == '\n')
file_err(f, p, p, "There should be an escape sequence or pattern here after this backslash.");
diff --git a/pattern.h b/pattern.h
index 39aba63..b903d5b 100644
--- a/pattern.h
+++ b/pattern.h
@@ -9,7 +9,7 @@
__attribute__((returns_nonnull, nonnull(1,2)))
pat_t *new_pat(file_t *f, const char *start, const char *end, size_t minlen, ssize_t maxlen, enum pattype_e type);
-__attribute__((nonnull(1,2)))
+__attribute__((nonnull))
pat_t *bp_stringpattern(file_t *f, const char *str);
__attribute__((nonnull(1,2)))
pat_t *bp_replacement(file_t *f, pat_t *replacepat, const char *replacement);
diff --git a/utf8.c b/utf8.c
index 6180ffe..08e8932 100644
--- a/utf8.c
+++ b/utf8.c
@@ -3,8 +3,9 @@
//
#include <ctype.h>
#include <stdint.h>
+#include <stdbool.h>
+#include <unistd.h>
-#include "files.h"
#include "utf8.h"
#define ARRAY_LEN(a) (sizeof(a)/sizeof((a)[0]))
@@ -181,39 +182,39 @@ static const uint32_t XID_Continue_only[][2] = {
// Return the location of the next character or UTF8 codepoint.
// (i.e. skip forward one codepoint at a time, not one byte at a time)
//
-const char *next_char(file_t *f, const char *str)
+const char *next_char(const char *str, const char *end)
{
- if (likely(str+1 <= f->end) && likely((str[0] & 0x80) == 0x0))
+ if (likely(str+1 <= end) && likely((str[0] & 0x80) == 0x0))
return str+1;
- if (likely(str+2 <= f->end) && (str[0] & 0xe0) == 0xc0)
+ if (likely(str+2 <= end) && (str[0] & 0xe0) == 0xc0)
return str+2;
- if (likely(str+3 <= f->end) && (str[0] & 0xf0) == 0xe0)
+ if (likely(str+3 <= end) && (str[0] & 0xf0) == 0xe0)
return str+3;
- if (likely(str+4 <= f->end) && (str[0] & 0xf8) == 0xf0)
+ if (likely(str+4 <= end) && (str[0] & 0xf8) == 0xf0)
return str+4;
- return likely(str+1 <= f->end) ? str+1 : f->end;
+ return likely(str+1 <= end) ? str+1 : end;
}
//
// Return the location of the previous character or UTF8 codepoint.
// (i.e. skip backwards one codepoint at a time, not one byte at a time)
//
-const char *prev_char(file_t *f, const char *str)
+const char *prev_char(const char *start, const char *str)
{
- if (likely(str-1 >= f->start) && likely((str[-1] & 0x80) == 0x0))
+ if (likely(str-1 >= start) && likely((str[-1] & 0x80) == 0x0))
return str-1;
- if (likely(str-2 >= f->start) && (str[-2] & 0xe0) == 0xc0)
+ if (likely(str-2 >= start) && (str[-2] & 0xe0) == 0xc0)
return str-2;
- if (likely(str-3 >= f->start) && (str[-3] & 0xf0) == 0xe0)
+ if (likely(str-3 >= start) && (str[-3] & 0xf0) == 0xe0)
return str-3;
- if (likely(str-4 >= f->start) && (str[-4] & 0xf8) == 0xf0)
+ if (likely(str-4 >= start) && (str[-4] & 0xf8) == 0xf0)
return str-4;
- return likely(str-1 >= f->start) ? str-1 : f->start;
+ return likely(str-1 >= start) ? str-1 : start;
}
-static uint32_t get_codepoint(file_t *f, const char *str)
+static uint32_t get_codepoint(const char *str, const char *end)
{
- if (str >= f->end)
+ if (unlikely(str >= end))
return (uint32_t)-1;
unsigned char c1 = (unsigned char)str[0];
@@ -235,7 +236,7 @@ static uint32_t get_codepoint(file_t *f, const char *str)
}
for (int i = 1; i < seqlen; ++i) {
- if (unlikely(&str[i] >= f->end || (str[i] & 0xC0) != 0x80))
+ if (unlikely((&str[i] >= end) || (str[i] & 0xC0) != 0x80))
return (uint32_t)-1;
codepoint = ((codepoint << 6) | (uint32_t)(str[i] & 0x3F));
}
@@ -259,22 +260,22 @@ static bool find_in_ranges(uint32_t codepoint, const uint32_t ranges[][2], size_
return false;
}
-bool isidstart(file_t *f, const char *str)
+bool isidstart(const char *str, const char *end)
{
- if (unlikely(str >= f->end)) return false;
+ if (unlikely(str >= end)) return false;
else if (isalpha(*str) || *str == '_') return true;
else if (likely((*str & 0x80) == 0)) return false;
- uint32_t codepoint = get_codepoint(f, str);
+ uint32_t codepoint = get_codepoint(str, end);
return codepoint != (uint32_t)-1
&& find_in_ranges(codepoint, XID_Start, ARRAY_LEN(XID_Start));
}
-bool isidcontinue(file_t *f, const char *str)
+bool isidcontinue(const char *str, const char *end)
{
- if (unlikely(str >= f->end)) return false;
+ if (unlikely(str >= end)) return false;
else if (isalnum(*str) || *str == '_') return true;
else if (likely((*str & 0x80) == 0)) return false;
- uint32_t codepoint = get_codepoint(f, str);
+ uint32_t codepoint = get_codepoint(str, end);
return codepoint != (uint32_t)-1
&& (find_in_ranges(codepoint, XID_Start, ARRAY_LEN(XID_Start))
|| find_in_ranges(codepoint, XID_Continue_only, ARRAY_LEN(XID_Continue_only)));
diff --git a/utf8.h b/utf8.h
index 97e259e..243acd3 100644
--- a/utf8.h
+++ b/utf8.h
@@ -1,21 +1,21 @@
//
// utf8.h - UTF8 helper functions
//
-#include "files.h"
-
#ifndef UTF8__H
#define UTF8__H
+#include <stdbool.h>
+
#define UTF8_MAXCHARLEN 4
__attribute__((nonnull, pure))
-const char *next_char(file_t *f, const char *str);
+const char *next_char(const char *str, const char *end);
__attribute__((nonnull, pure))
-const char *prev_char(file_t *f, const char *str);
+const char *prev_char(const char *start, const char *str);
__attribute__((nonnull, pure))
-bool isidstart(file_t *f, const char *str);
+bool isidstart(const char *str, const char *end);
__attribute__((nonnull, pure))
-bool isidcontinue(file_t *f, const char *str);
+bool isidcontinue(const char *str, const char *end);
#endif
// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0