From b67cb4643cc837728f5c527dbc5bf59958039735 Mon Sep 17 00:00:00 2001 From: Bruce Hill Date: Fri, 1 Oct 2021 18:18:22 -0700 Subject: Use chained scatter table --- match.c | 77 +++++++++++++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 24 deletions(-) diff --git a/match.c b/match.c index aa00592..cd284a2 100644 --- a/match.c +++ b/match.c @@ -17,14 +17,16 @@ #define MAX_CACHE_SIZE (1<<14) -typedef struct { +// Cache datastructures +typedef struct cache_hit_s { pat_t *pat; const char *start, *end; + // Cache entries use a chained scatter approach modeled after Lua's tables + struct cache_hit_s *next_probe; } cache_hit_t; -// Cache datastructure typedef struct { - size_t size, occupancy; + unsigned int size, occupancy, next_free; cache_hit_t *hits; } cache_t; @@ -88,14 +90,46 @@ static inline size_t hash(const char *str, size_t pat_id) // given definitions. If a result has been memoized, set *result to the // memoized value and return true, otherwise return false. // -static cache_hit_t cache_get(match_ctx_t *ctx, const char *str, pat_t *pat) +static cache_hit_t *cache_get(match_ctx_t *ctx, const char *str, pat_t *pat) { - if (!ctx->cache->hits) return (cache_hit_t){0}; - size_t h = hash(str, pat->id) & (ctx->cache->size-1); - cache_hit_t c = ctx->cache->hits[h]; - if (c.pat == pat && c.start == str) - return c; - return (cache_hit_t){0}; + if (!ctx->cache->hits) return NULL; + for (cache_hit_t *hit = &ctx->cache->hits[hash(str, pat->id) & (ctx->cache->size-1)]; hit; hit = hit->next_probe) { + if (hit->pat == pat && hit->start == str) + return hit; + } + return NULL; +} + +static void _hash_insert(cache_t *cache, cache_hit_t hit) +{ + size_t h = hash(hit.start, hit.pat->id) & (cache->size-1); + cache_hit_t collision = cache->hits[h]; + if (collision.pat == NULL) { // No collision + hit.next_probe = NULL; + cache->hits[h] = hit; + ++cache->occupancy; + return; + } + + if (collision.pat == hit.pat && collision.start == hit.start) + return; // Duplicate entry, just leave it be + + // Shuffle the collision along to a free space: + while (cache->hits[cache->next_free].pat) ++cache->next_free; + cache->hits[cache->next_free] = collision; + + size_t hcol = hash(collision.start, collision.pat->id) & (cache->size-1); + if (hcol == h) { // Chain `collision` after `hit` + hit.next_probe = &cache->hits[cache->next_free]; + } else { // Keep `collision` in its own chain + cache_hit_t *prev = &cache->hits[hcol]; // Where `collision` wanted to be originally + while (prev->next_probe != &cache->hits[h]) prev = prev->next_probe; + prev->next_probe = &cache->hits[cache->next_free]; + hit.next_probe = NULL; + } + cache->hits[h] = hit; + ++cache->next_free; + ++cache->occupancy; } // @@ -104,27 +138,23 @@ static cache_hit_t cache_get(match_ctx_t *ctx, const char *str, pat_t *pat) static void cache_save(match_ctx_t *ctx, const char *str, pat_t *pat, match_t *m) { cache_t *cache = ctx->cache; - if (cache->occupancy+1 > (cache->size*1)/5) { + // Grow the hash if needed (>99% utilization): + if (cache->occupancy+1 > (cache->size*99)/100) { cache_hit_t *old_hits = cache->hits; size_t old_size = cache->size; cache->size = old_size == 0 ? 16 : 2*old_size; cache->hits = new(cache_hit_t[cache->size]); + cache->next_free = 0; // Rehash: for (size_t i = 0; i < old_size; i++) { - if (old_hits[i].pat) { - size_t h = hash(old_hits[i].start, old_hits[i].pat->id) & (cache->size-1); - cache->hits[h] = old_hits[i]; - } + if (old_hits[i].pat) + _hash_insert(cache, old_hits[i]); } if (old_hits) delete(&old_hits); } - size_t h = hash(str, pat->id) & (cache->size-1); - if (!cache->hits[h].start) ++cache->occupancy; - cache->hits[h].start = str; - cache->hits[h].end = m ? m->end : NULL; - cache->hits[h].pat = pat; + _hash_insert(cache, (cache_hit_t){.pat = pat, .start = str, .end = m ? m->end : NULL}); } // @@ -134,8 +164,7 @@ void cache_destroy(match_ctx_t *ctx) { cache_t *cache = ctx->cache; if (cache->hits) delete(&cache->hits); - cache->occupancy = 0; - cache->size = 0; + memset(cache, 0, sizeof(cache_t)); } // @@ -561,8 +590,8 @@ static match_t *match(match_ctx_t *ctx, const char *str, pat_t *pat) return new_match(pat, str, p ? p->end : str, MATCHES(p)); } case BP_REF: { - cache_hit_t hit = cache_get(ctx, str, pat); - if (hit.start && !hit.end) + cache_hit_t *hit = cache_get(ctx, str, pat); + if (hit && !hit->end) return NULL; pat_t *ref = lookup(ctx, pat->args.ref.name, pat->args.ref.len); -- cgit v1.2.3