aboutsummaryrefslogtreecommitdiff
path: root/match.c
diff options
context:
space:
mode:
authorBruce Hill <bruce@bruce-hill.com>2021-10-01 14:19:21 -0700
committerBruce Hill <bruce@bruce-hill.com>2021-10-01 14:19:21 -0700
commit9812de0c58e6d21a7f04bd68893c2caef1bb3213 (patch)
tree344c00d64c7465e4be85d72976e075394ffb54df /match.c
parentc9948248049bd7f60948fcc158f083cedabfc8f6 (diff)
Initial working version
Diffstat (limited to 'match.c')
-rw-r--r--match.c121
1 files changed, 54 insertions, 67 deletions
diff --git a/match.c b/match.c
index 2053a4f..15d808d 100644
--- a/match.c
+++ b/match.c
@@ -17,10 +17,17 @@
#define MAX_CACHE_SIZE (1<<14)
+typedef struct cache_hit_s {
+ size_t pat_id;
+ const char *start;
+ bool does_match;
+ struct cache_hit_s *next;
+} cache_hit_t;
+
// Cache datastructure
typedef struct {
size_t size, occupancy;
- match_t **matches;
+ cache_hit_t **matches;
} cache_t;
// Data structure for various ambient state for matching
@@ -99,9 +106,9 @@ static inline void list_remove(match_t *m, match_dll_t *node)
//
// Hash a string position/pattern.
//
-static inline size_t hash(const char *str, pat_t *pat)
+static inline size_t hash(const char *str, size_t pat_id)
{
- return (size_t)str + 2*pat->id;
+ return (size_t)str + 2*pat_id;
}
//
@@ -109,32 +116,15 @@ static inline size_t hash(const char *str, pat_t *pat)
// given definitions. If a result has been memoized, set *result to the
// memoized value and return true, otherwise return false.
//
-static bool cache_get(match_ctx_t *ctx, const char *str, pat_t *pat, match_t **result)
+static enum {CACHE_MISSING, CACHE_MATCH, CACHE_NOMATCH} cache_get(match_ctx_t *ctx, const char *str, pat_t *pat)
{
- if (!ctx->cache->matches) return false;
- size_t h = hash(str, pat) & (ctx->cache->size-1);
- for (match_t *c = ctx->cache->matches[h]; c; c = c->cache.next) {
- if (c->pat == pat && c->start == str) {
- // If c->end == NULL, that means no match occurs here
- *result = c->end == NULL ? NULL : c;
- return true;
- }
+ if (!ctx->cache->matches) return CACHE_MISSING;
+ size_t h = hash(str, pat->id) & (ctx->cache->size-1);
+ for (cache_hit_t *c = ctx->cache->matches[h]; c; c = c->next) {
+ if (c->pat_id == pat->id && c->start == str)
+ return c->does_match ? CACHE_MATCH : CACHE_NOMATCH;
}
- return false;
-}
-
-//
-// Remove an item from the cache.
-//
-static void cache_remove(match_ctx_t *ctx, match_t *m, bool do_recycle)
-{
- if (!m->cache.home) return;
- *m->cache.home = m->cache.next;
- if (m->cache.next) m->cache.next->cache.home = m->cache.home;
- m->cache.next = NULL;
- m->cache.home = NULL;
- if (--m->refcount == 0 && do_recycle) recycle_if_unused(&m);
- --ctx->cache->occupancy;
+ return CACHE_MISSING;
}
//
@@ -142,34 +132,28 @@ static void cache_remove(match_ctx_t *ctx, match_t *m, bool do_recycle)
//
static void cache_save(match_ctx_t *ctx, const char *str, pat_t *pat, match_t *m)
{
- // As a convention, a match with {.pat=pat, .start=str, .end==NULL} is used
- // to memoize the fact that `pat` will *not* match at `str`.
- if (m == NULL) m = new_match(pat, str, NULL, NULL);
-
cache_t *cache = ctx->cache;
if (cache->occupancy+1 > 3*cache->size) {
if (cache->size == MAX_CACHE_SIZE) {
- size_t h = hash(m->start, m->pat) & (cache->size-1);
+ size_t h = hash(m->start, m->pat->id) & (cache->size-1);
for (int quota = 2; cache->matches[h] && quota > 0; quota--) {
- match_t *last = cache->matches[h];
- while (last->cache.next) last = last->cache.next;
- cache_remove(ctx, last, true);
+ cache_hit_t **last_home = &cache->matches[h];
+ while ((*last_home)->next)
+ last_home = &(*last_home)->next;
+ delete(last_home);
}
} else {
- match_t **old_matches = cache->matches;
+ cache_hit_t **old_matches = cache->matches;
size_t old_size = cache->size;
cache->size = old_size == 0 ? 16 : 2*old_size;
- cache->matches = new(match_t*[cache->size]);
+ cache->matches = new(cache_hit_t*[cache->size]);
// Rehash:
for (size_t i = 0; i < old_size; i++) {
- for (match_t *o; (o = old_matches[i]); ) {
- *o->cache.home = o->cache.next;
- if (o->cache.next) o->cache.next->cache.home = o->cache.home;
- size_t h = hash(o->start, o->pat) & (cache->size-1);
- o->cache.home = &(cache->matches[h]);
- o->cache.next = cache->matches[h];
- if (cache->matches[h]) cache->matches[h]->cache.home = &o->cache.next;
+ for (cache_hit_t *o; (o = old_matches[i]); ) {
+ old_matches[i] = o->next;
+ size_t h = hash(o->start, o->pat_id) & (cache->size-1);
+ o->next = cache->matches[h];
cache->matches[h] = o;
}
}
@@ -177,24 +161,28 @@ static void cache_save(match_ctx_t *ctx, const char *str, pat_t *pat, match_t *m
}
}
- size_t h = hash(m->start, m->pat) & (cache->size-1);
- m->cache.home = &(cache->matches[h]);
- m->cache.next = cache->matches[h];
- if (cache->matches[h]) cache->matches[h]->cache.home = &m->cache.next;
- cache->matches[h] = m;
- ++m->refcount;
+ size_t h = hash(str, pat->id) & (cache->size-1);
+ cache_hit_t *hit = new(cache_hit_t);
+ hit->pat_id = pat->id;
+ hit->start = str;
+ hit->does_match = (m != NULL);
+ hit->next = cache->matches[h];
+ cache->matches[h] = hit;
++cache->occupancy;
}
//
// Clear and deallocate the cache.
//
-void cache_destroy(match_ctx_t *ctx, match_t *preserve)
+void cache_destroy(match_ctx_t *ctx)
{
cache_t *cache = ctx->cache;
for (size_t i = 0; cache->occupancy > 0 && i < cache->size; i++) {
- while (cache->matches[i])
- cache_remove(ctx, cache->matches[i], cache->matches[i] != preserve);
+ while (cache->matches[i]) {
+ cache_hit_t *next = cache->matches[i]->next;
+ delete(&cache->matches[i]);
+ cache->matches[i] = next;
+ }
}
cache->occupancy = 0;
if (cache->matches) delete(&cache->matches);
@@ -281,12 +269,12 @@ static match_t *_next_match(match_ctx_t *ctx, const char *str, pat_t *pat, pat_t
ctx2.defs = pat->type == BP_DEFINITIONS ? pat : pat->args.multiple.first;
pat_t *match_pat = pat->type == BP_DEFINITIONS ? pat->args.def.meaning : pat->args.multiple.second;
match_t *m = _next_match(&ctx2, str, match_pat, skip);
- cache_destroy(&ctx2, m);
+ cache_destroy(&ctx2);
return m;
}
// Clear the cache so it's not full of old cache values from different parts of the file:
- cache_destroy(ctx, NULL);
+ cache_destroy(ctx);
pat = deref(ctx, pat); // Avoid repeated lookups
pat_t *first = get_prerequisite(ctx, pat);
@@ -334,7 +322,7 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
ctx2.parent_ctx = ctx;
ctx2.defs = pat;
match_t *m = match(&ctx2, str, pat->args.def.meaning);
- cache_destroy(&ctx2, m);
+ cache_destroy(&ctx2);
return m;
}
case BP_LEFTRECURSION: {
@@ -515,14 +503,14 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
for (const char *pos = &str[-(long)back->min_matchlen];
pos >= ctx->start && (back->max_matchlen == -1 || pos >= &str[-(int)back->max_matchlen]);
pos = prev_char(ctx->start, pos)) {
- cache_destroy(&slice_ctx, NULL);
+ cache_destroy(&slice_ctx);
slice_ctx.start = (char*)pos;
match_t *m = match(&slice_ctx, pos, back);
// Match should not go past str (i.e. (<"AB" "B") should match "ABB", but not "AB")
if (m && m->end != str)
recycle_if_unused(&m);
else if (m) {
- cache_destroy(&slice_ctx, NULL);
+ cache_destroy(&slice_ctx);
return new_match(pat, str, str, MATCHES(m));
}
if (pos == ctx->start) break;
@@ -530,7 +518,7 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
// walking backwards endlessly over newlines.
if (back->max_matchlen == -1 && *pos == '\n') break;
}
- cache_destroy(&slice_ctx, NULL);
+ cache_destroy(&slice_ctx);
return NULL;
}
case BP_BEFORE: {
@@ -552,7 +540,7 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
ctx2.parent_ctx = ctx;
ctx2.defs = pat->args.multiple.first;
match_t *m = match(&ctx2, str, pat->args.multiple.second);
- cache_destroy(&ctx2, m);
+ cache_destroy(&ctx2);
return m;
}
@@ -584,7 +572,7 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
if (!m2) // No need to keep the backref in memory if it didn't match
delete_pat(&backref, false);
} --m1->refcount;
- cache_destroy(&ctx2, m2);
+ cache_destroy(&ctx2);
} else {
m2 = match(ctx, m1->end, pat->args.multiple.second);
}
@@ -608,13 +596,13 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
slice_ctx.end = m1->end;
match_t *m2 = _next_match(&slice_ctx, slice_ctx.start, pat->args.multiple.second, NULL);
if ((!m2 && pat->type == BP_MATCH) || (m2 && pat->type == BP_NOT_MATCH)) {
- cache_destroy(&slice_ctx, NULL);
+ cache_destroy(&slice_ctx);
if (m2) recycle_if_unused(&m2);
recycle_if_unused(&m1);
return NULL;
}
match_t *ret = new_match(pat, m1->start, m1->end, (pat->type == BP_MATCH) ? MATCHES(m1, m2) : MATCHES(m1));
- cache_destroy(&slice_ctx, ret);
+ cache_destroy(&slice_ctx);
return ret;
}
case BP_REPLACE: {
@@ -626,9 +614,8 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat)
return new_match(pat, str, p ? p->end : str, MATCHES(p));
}
case BP_REF: {
- match_t *cached;
- if (cache_get(ctx, str, pat, &cached))
- return cached;
+ if (cache_get(ctx, str, pat) == CACHE_NOMATCH)
+ return NULL;
pat_t *ref = lookup(ctx, pat->args.ref.name, pat->args.ref.len);
if (ref == NULL)
@@ -834,7 +821,7 @@ bool next_match(match_t **m, const char *start, const char *end, pat_t *pat, pat
.ignorecase = ignorecase,
};
*m = (pos <= end) ? _next_match(&ctx, pos, pat, skip) : NULL;
- cache_destroy(&ctx, *m);
+ cache_destroy(&ctx);
return *m != NULL;
}