diff options
| author | Bruce Hill <bruce@bruce-hill.com> | 2024-09-05 16:23:05 -0400 |
|---|---|---|
| committer | Bruce Hill <bruce@bruce-hill.com> | 2024-09-05 16:23:05 -0400 |
| commit | bac14fa6c79b6abea50fd5f188799ae8e83cb20d (patch) | |
| tree | 416a7b9ff39075ab33fb93ab4e86003df4d25aca /builtins | |
| parent | 47e8972427c152891c4c6c4b458f78cd7ceb8aee (diff) | |
Fully clean up siphash code and fix some issues
Diffstat (limited to 'builtins')
| -rw-r--r-- | builtins/array.c | 6 | ||||
| -rw-r--r-- | builtins/c_string.c | 9 | ||||
| -rw-r--r-- | builtins/channel.c | 8 | ||||
| -rw-r--r-- | builtins/functions.c | 7 | ||||
| -rw-r--r-- | builtins/functions.h | 3 | ||||
| -rw-r--r-- | builtins/halfsiphash.h | 22 | ||||
| -rw-r--r-- | builtins/integers.c | 7 | ||||
| -rw-r--r-- | builtins/memory.c | 1 | ||||
| -rw-r--r-- | builtins/pointer.c | 1 | ||||
| -rw-r--r-- | builtins/siphash-internals.h | 125 | ||||
| -rw-r--r-- | builtins/siphash.h | 9 | ||||
| -rw-r--r-- | builtins/table.c | 28 | ||||
| -rw-r--r-- | builtins/table.h | 2 | ||||
| -rw-r--r-- | builtins/text.c | 8 | ||||
| -rw-r--r-- | builtins/thread.c | 1 | ||||
| -rw-r--r-- | builtins/tomo.h | 5 | ||||
| -rw-r--r-- | builtins/types.h | 2 |
17 files changed, 174 insertions, 70 deletions
diff --git a/builtins/array.c b/builtins/array.c index 174bcbdd..f09af917 100644 --- a/builtins/array.c +++ b/builtins/array.c @@ -18,7 +18,9 @@ #include "types.h" #include "util.h" -#include "siphash.c" +// Use inline version of siphash code: +#include "siphash.h" +#include "siphash-internals.h" static inline int64_t get_padded_item_size(const TypeInfo *info) { @@ -555,7 +557,7 @@ public uint64_t Array$hash(const Array_t *arr, const TypeInfo *type) { const TypeInfo *item = type->ArrayInfo.item; siphash sh; - siphashinit(&sh, sizeof(uint64_t[arr->length]), (uint64_t*)TOMO_HASH_KEY); + siphashinit(&sh, sizeof(uint64_t[arr->length])); if (item->tag == PointerInfo || (item->tag == CustomInfo && item->CustomInfo.hash == NULL && item->size == sizeof(void*))) { // Raw data hash for (int64_t i = 0; i < arr->length; i++) siphashadd64bits(&sh, (uint64_t)(arr->data + i*arr->stride)); diff --git a/builtins/c_string.c b/builtins/c_string.c index b0c70f0a..d916bfa9 100644 --- a/builtins/c_string.c +++ b/builtins/c_string.c @@ -8,8 +8,8 @@ #include <stdlib.h> #include "functions.h" -#include "halfsiphash.h" #include "text.h" +#include "siphash.h" #include "types.h" #include "util.h" @@ -37,13 +37,10 @@ public bool CString$equal(const char **x, const char **y) return CString$compare(x, y) == 0; } -public uint32_t CString$hash(const char **c_str) +public uint64_t CString$hash(const char **c_str) { if (!*c_str) return 0; - - uint32_t hash; - halfsiphash(*c_str, strlen(*c_str), TOMO_HASH_KEY, (uint8_t*)&hash, sizeof(hash)); - return hash; + return siphash24((void*)*c_str, strlen(*c_str)); } public const TypeInfo CString$info = { diff --git a/builtins/channel.c b/builtins/channel.c index 9b1754af..e877b6b5 100644 --- a/builtins/channel.c +++ b/builtins/channel.c @@ -13,8 +13,8 @@ #include "array.h" #include "functions.h" -#include "halfsiphash.h" #include "integers.h" +#include "siphash.h" #include "text.h" #include "types.h" #include "util.h" @@ -100,12 +100,10 @@ public void Channel$clear(channel_t *channel) (void)pthread_cond_signal(&channel->cond); } -public uint32_t Channel$hash(const channel_t **channel, const TypeInfo *type) +public uint64_t Channel$hash(const channel_t **channel, const TypeInfo *type) { (void)type; - uint32_t hash; - halfsiphash(*channel, sizeof(channel_t*), TOMO_HASH_KEY, (uint8_t*)&hash, sizeof(hash)); - return hash; + return siphash24((void*)*channel, sizeof(channel_t*)); } public int32_t Channel$compare(const channel_t **x, const channel_t **y, const TypeInfo *type) diff --git a/builtins/functions.c b/builtins/functions.c index 0ccc44cb..b59885bc 100644 --- a/builtins/functions.c +++ b/builtins/functions.c @@ -17,15 +17,14 @@ #include "functions.h" #include "integers.h" #include "pointer.h" +#include "siphash.h" #include "string.h" #include "table.h" #include "text.h" #include "types.h" #include "util.h" -#include "siphash.c" - -public uint8_t TOMO_HASH_KEY[16] = {0}; +public uint64_t TOMO_HASH_KEY[2] = {23, 42}; // Randomized in tomo_init() public void tomo_init(void) { @@ -115,7 +114,7 @@ public uint64_t generic_hash(const void *obj, const TypeInfo *type) return type->CustomInfo.hash(obj, type); default: { hash_data:; - return siphash24((void*)obj, type->size, (uint64_t*)TOMO_HASH_KEY); + return siphash24((void*)obj, type->size); } } } diff --git a/builtins/functions.h b/builtins/functions.h index c9db6ea8..6445400a 100644 --- a/builtins/functions.h +++ b/builtins/functions.h @@ -9,10 +9,7 @@ #include "datatypes.h" #include "types.h" -extern uint8_t TOMO_HASH_KEY[16]; - void tomo_init(void); - void fail(const char *fmt, ...); void fail_source(const char *filename, int64_t start, int64_t end, const char *fmt, ...); Text_t builtin_last_err(); diff --git a/builtins/halfsiphash.h b/builtins/halfsiphash.h deleted file mode 100644 index a1af8cd2..00000000 --- a/builtins/halfsiphash.h +++ /dev/null @@ -1,22 +0,0 @@ -/* - SipHash reference C implementation - - Copyright (c) 2012-2021 Jean-Philippe Aumasson - <jeanphilippe.aumasson@gmail.com> - Copyright (c) 2012-2014 Daniel J. Bernstein <djb@cr.yp.to> - - To the extent possible under law, the author(s) have dedicated all copyright - and related and neighboring rights to this software to the public domain - worldwide. This software is distributed without any warranty. - - You should have received a copy of the CC0 Public Domain Dedication along - with - this software. If not, see - <http://creativecommons.org/publicdomain/zero/1.0/>. - */ - -#include <inttypes.h> -#include <string.h> - -int halfsiphash(const void *in, const size_t inlen, const void *k, uint8_t *out, - const size_t outlen); diff --git a/builtins/integers.c b/builtins/integers.c index aad4bf33..523ec4d1 100644 --- a/builtins/integers.c +++ b/builtins/integers.c @@ -10,11 +10,10 @@ #include "array.h" #include "datatypes.h" #include "integers.h" +#include "siphash.h" #include "text.h" #include "types.h" -#include "siphash.c" - static gmp_randstate_t Int_rng = {}; public void Int$init_random(long seed) @@ -63,10 +62,10 @@ public uint64_t Int$hash(const Int_t *x, const TypeInfo *type) { (void)type; if (__builtin_expect(x->small & 1, 1)) { int64_t i = (x->small>>2); - return siphash24((void*)&i, sizeof(i), (uint64_t*)TOMO_HASH_KEY); + return siphash24((void*)&i, sizeof(i)); } else { char *str = mpz_get_str(NULL, 16, *x->big); - return siphash24((void*)str, strlen(str), (uint64_t*)TOMO_HASH_KEY); + return siphash24((void*)str, strlen(str)); } } diff --git a/builtins/memory.c b/builtins/memory.c index 47ff081b..9d7dbc80 100644 --- a/builtins/memory.c +++ b/builtins/memory.c @@ -7,7 +7,6 @@ #include <sys/param.h> #include <err.h> -#include "halfsiphash.h" #include "memory.h" #include "text.h" #include "types.h" diff --git a/builtins/pointer.c b/builtins/pointer.c index bd18a1e4..66cd308a 100644 --- a/builtins/pointer.c +++ b/builtins/pointer.c @@ -9,7 +9,6 @@ #include <sys/param.h> #include "functions.h" -#include "halfsiphash.h" #include "text.h" #include "types.h" #include "util.h" diff --git a/builtins/siphash-internals.h b/builtins/siphash-internals.h new file mode 100644 index 00000000..95b00b53 --- /dev/null +++ b/builtins/siphash-internals.h @@ -0,0 +1,125 @@ +#pragma once + +// This file holds the internals for the SipHash implementation. For a few +// cases, we want to include this for incrementally computing hashes. +// Otherwise, it suffices to just use the siphash24() function from siphash.h + +#include <stddef.h> +#include <stdint.h> +#include <string.h> + +#include "tomo.h" + +/* <MIT License> + Copyright (c) 2013 Marek Majkowski <marek@popcount.org> + Copyright (c) 2018 Samantha McVey <samantham@posteo.net> + Copyright (c) 2024 Bruce Hill <bruce@bruce-hill.com> + + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. + </MIT License> + + Original location: + https://github.com/majek/csiphash/ + + Original solution inspired by code from: + Samuel Neves (supercop/crypto_auth/siphash24/little) + djb (supercop/crypto_auth/siphash24/little2) + Jean-Philippe Aumasson (https://131002.net/siphash/siphash24.c) + + Extensive modifications for MoarVM by Samantha McVey + + Further modifications for Tomo by Bruce Hill +*/ +struct siphash { + uint64_t v0; + uint64_t v1; + uint64_t v2; + uint64_t v3; + uint64_t b; +}; +typedef struct siphash siphash; +#define ROTATE(x, b) (uint64_t)( ((x) << (b)) | ( (x) >> (64 - (b))) ) + +#define HALF_ROUND(a,b,c,d,s,t) \ + a += b; c += d; \ + b = ROTATE(b, s) ^ a; \ + d = ROTATE(d, t) ^ c; \ + a = ROTATE(a, 32); + +#define DOUBLE_ROUND(v0,v1,v2,v3) \ + HALF_ROUND(v0,v1,v2,v3,13,16); \ + HALF_ROUND(v2,v1,v0,v3,17,21); \ + HALF_ROUND(v0,v1,v2,v3,13,16); \ + HALF_ROUND(v2,v1,v0,v3,17,21); + +static inline void siphashinit (siphash *sh, size_t src_sz) { + const uint64_t k0 = TOMO_HASH_KEY[0]; + const uint64_t k1 = TOMO_HASH_KEY[1]; + sh->b = (uint64_t)src_sz << 56; + sh->v0 = k0 ^ 0x736f6d6570736575ULL; + sh->v1 = k1 ^ 0x646f72616e646f6dULL; + sh->v2 = k0 ^ 0x6c7967656e657261ULL; + sh->v3 = k1 ^ 0x7465646279746573ULL; +} +static inline void siphashadd64bits (siphash *sh, const uint64_t in) { + const uint64_t mi = in; + sh->v3 ^= mi; + DOUBLE_ROUND(sh->v0,sh->v1,sh->v2,sh->v3); + sh->v0 ^= mi; +} +static inline uint64_t siphashfinish_last_part (siphash *sh, uint64_t t) { + sh->b |= t; + sh->v3 ^= sh->b; + DOUBLE_ROUND(sh->v0,sh->v1,sh->v2,sh->v3); + sh->v0 ^= sh->b; + sh->v2 ^= 0xff; + DOUBLE_ROUND(sh->v0,sh->v1,sh->v2,sh->v3); + DOUBLE_ROUND(sh->v0,sh->v1,sh->v2,sh->v3); + return (sh->v0 ^ sh->v1) ^ (sh->v2 ^ sh->v3); +} +/* This union helps us avoid doing weird things with pointers that can cause old + * compilers like GCC 4 to generate bad code. In addition it is nicely more C + * standards compliant to keep type punning to a minimum. */ +union SipHash64_union { + uint64_t u64; + uint32_t u32; + uint8_t u8[8]; +}; +static inline uint64_t siphashfinish (siphash *sh, const uint8_t *src, size_t src_sz) { + union SipHash64_union t = { 0 }; + switch (src_sz) { + /* Falls through */ + case 7: t.u8[6] = src[6]; + /* Falls through */ + case 6: t.u8[5] = src[5]; + /* Falls through */ + case 5: t.u8[4] = src[4]; + /* Falls through */ + case 4: t.u8[3] = src[3]; + /* Falls through */ + case 3: t.u8[2] = src[2]; + /* Falls through */ + case 2: t.u8[1] = src[1]; + /* Falls through */ + case 1: t.u8[0] = src[0]; + } + return siphashfinish_last_part(sh, t.u64); +} + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/siphash.h b/builtins/siphash.h new file mode 100644 index 00000000..a466200c --- /dev/null +++ b/builtins/siphash.h @@ -0,0 +1,9 @@ +#pragma once + +// An implementation of the SipHash algorithm. + +#include <stdint.h> + +uint64_t siphash24(const uint8_t *src, size_t src_sz); + +// vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/table.c b/builtins/table.c index 3a672e0f..a5dafd48 100644 --- a/builtins/table.c +++ b/builtins/table.c @@ -19,8 +19,8 @@ #include "array.h" #include "c_string.h" #include "datatypes.h" -#include "halfsiphash.h" #include "memory.h" +#include "siphash.h" #include "table.h" #include "text.h" #include "types.h" @@ -114,11 +114,11 @@ public void *Table$get_raw(Table_t t, const void *key, const TypeInfo *type) assert(type->tag == TableInfo); if (!key || !t.bucket_info) return NULL; - uint32_t hash = HASH_KEY(t, key); + uint64_t hash = HASH_KEY(t, key); hshow(&t); hdebug("Getting value with initial probe at %u\n", hash); bucket_t *buckets = t.bucket_info->buckets; - for (uint32_t i = hash; buckets[i].occupied; i = buckets[i].next_bucket) { + for (uint64_t i = hash; buckets[i].occupied; i = buckets[i].next_bucket) { hdebug("Checking against key in bucket %u\n", i); void *entry = GET_ENTRY(t, buckets[i].index); if (EQUAL_KEYS(entry, key)) { @@ -147,7 +147,7 @@ static void Table$set_bucket(Table_t *t, const void *entry, int32_t index, const hshow(t); const void *key = entry; bucket_t *buckets = t->bucket_info->buckets; - uint32_t hash = HASH_KEY(*t, key); + uint64_t hash = HASH_KEY(*t, key); hdebug("Hash value (mod %u) = %u\n", t->bucket_info->count, hash); bucket_t *bucket = &buckets[hash]; if (!bucket->occupied) { @@ -167,11 +167,11 @@ static void Table$set_bucket(Table_t *t, const void *entry, int32_t index, const --t->bucket_info->last_free; } - uint32_t collided_hash = HASH_KEY(*t, GET_ENTRY(*t, bucket->index)); + uint64_t collided_hash = HASH_KEY(*t, GET_ENTRY(*t, bucket->index)); if (collided_hash != hash) { // Collided with a mid-chain entry hdebug("Hit a mid-chain entry at bucket %u (chain starting at %u)\n", hash, collided_hash); // Find chain predecessor - uint32_t predecessor = collided_hash; + uint64_t predecessor = collided_hash; while (buckets[predecessor].next_bucket != hash) predecessor = buckets[predecessor].next_bucket; @@ -180,7 +180,7 @@ static void Table$set_bucket(Table_t *t, const void *entry, int32_t index, const buckets[t->bucket_info->last_free] = *bucket; } else { // Collided with the start of a chain hdebug("Hit start of a chain\n"); - uint32_t end_of_chain = hash; + uint64_t end_of_chain = hash; while (buckets[end_of_chain].next_bucket != END_OF_CHAIN) end_of_chain = buckets[end_of_chain].next_bucket; hdebug("Appending to chain\n"); @@ -309,10 +309,10 @@ public void Table$remove(Table_t *t, const void *key, const TypeInfo *type) // zero out bucket // maybe update lastfree_index1 to removed bucket's index - uint32_t hash = HASH_KEY(*t, key); + uint64_t hash = HASH_KEY(*t, key); hdebug("Removing key with hash %u\n", hash); bucket_t *bucket, *prev = NULL; - for (uint32_t i = hash; t->bucket_info->buckets[i].occupied; i = t->bucket_info->buckets[i].next_bucket) { + for (uint64_t i = hash; t->bucket_info->buckets[i].occupied; i = t->bucket_info->buckets[i].next_bucket) { if (EQUAL_KEYS(GET_ENTRY(*t, t->bucket_info->buckets[i].index), key)) { bucket = &t->bucket_info->buckets[i]; hdebug("Found key to delete in bucket %u\n", i); @@ -336,7 +336,7 @@ public void Table$remove(Table_t *t, const void *key, const TypeInfo *type) hdebug("Removing key/value from the middle of the entries array\n"); // Find the bucket that points to the last entry's index: - uint32_t i = HASH_KEY(*t, GET_ENTRY(*t, last_entry)); + uint64_t i = HASH_KEY(*t, GET_ENTRY(*t, last_entry)); while (t->bucket_info->buckets[i].index != last_entry) i = t->bucket_info->buckets[i].next_bucket; // Update the bucket to point to the last entry's new home (the space @@ -438,21 +438,19 @@ public int32_t Table$compare(const Table_t *x, const Table_t *y, const TypeInfo return 0; } -public uint32_t Table$hash(const Table_t *t, const TypeInfo *type) +public uint64_t Table$hash(const Table_t *t, const TypeInfo *type) { assert(type->tag == TableInfo); // Table hashes are computed as: // hash(hash(t.keys), hash(t.values), hash(t.fallback), hash(t.default)) // Where fallback and default hash to zero if absent auto table = type->TableInfo; - uint32_t components[] = { + uint64_t components[] = { Array$hash(&t->entries, Array$info(table.key)), Array$hash(&t->entries + value_offset(type), Array$info(table.value)), t->fallback ? Table$hash(t->fallback, type) : 0, }; - uint32_t hash; - halfsiphash(&components, sizeof(components), TOMO_HASH_KEY, (uint8_t*)&hash, sizeof(hash)); - return hash; + return siphash24((void*)&components, sizeof(components)); } public Text_t Table$as_text(const Table_t *t, bool colorize, const TypeInfo *type) diff --git a/builtins/table.h b/builtins/table.h index cd267912..12f20f29 100644 --- a/builtins/table.h +++ b/builtins/table.h @@ -73,7 +73,7 @@ void Table$mark_copy_on_write(Table_t *t); #define TABLE_COPY(t) ({ TABLE_INCREF(t); t; }) int32_t Table$compare(const Table_t *x, const Table_t *y, const TypeInfo *type); bool Table$equal(const Table_t *x, const Table_t *y, const TypeInfo *type); -uint32_t Table$hash(const Table_t *t, const TypeInfo *type); +uint64_t Table$hash(const Table_t *t, const TypeInfo *type); Text_t Table$as_text(const Table_t *t, bool colorize, const TypeInfo *type); void *Table$str_entry(Table_t t, int64_t n); diff --git a/builtins/text.c b/builtins/text.c index 39222ed4..3beda82f 100644 --- a/builtins/text.c +++ b/builtins/text.c @@ -74,7 +74,9 @@ #include "text.h" #include "types.h" -#include "siphash.c" +// Use inline version of the siphash code for performance: +#include "siphash.h" +#include "siphash-internals.h" typedef struct { uint32_t main_codepoint; @@ -114,7 +116,7 @@ static bool graphemes_equal(uint32_t **a, uint32_t **b) { static uint64_t grapheme_hash(uint32_t **g) { uint32_t *cluster = *g; - return siphash24((void*)&cluster[1], sizeof(uint32_t[cluster[0]]), (uint64_t*)TOMO_HASH_KEY); + return siphash24((void*)&cluster[1], sizeof(uint32_t[cluster[0]])); } static const TypeInfo GraphemeClusterInfo = { @@ -723,7 +725,7 @@ public uint64_t Text$hash(Text_t *text) { if (text->hash != 0) return text->hash; siphash sh; - siphashinit(&sh, sizeof(int32_t[text->length]), (uint64_t*)TOMO_HASH_KEY); + siphashinit(&sh, sizeof(int32_t[text->length])); union { int32_t chunks[2]; diff --git a/builtins/thread.c b/builtins/thread.c index 4f422e5d..b9193787 100644 --- a/builtins/thread.c +++ b/builtins/thread.c @@ -12,7 +12,6 @@ #include "array.h" #include "functions.h" -#include "halfsiphash.h" #include "text.h" #include "types.h" #include "util.h" diff --git a/builtins/tomo.h b/builtins/tomo.h index 637975f4..8d6661ca 100644 --- a/builtins/tomo.h +++ b/builtins/tomo.h @@ -16,16 +16,19 @@ #include "channel.h" #include "datatypes.h" #include "functions.h" -#include "halfsiphash.h" #include "integers.h" #include "macros.h" #include "memory.h" #include "nums.h" #include "pointer.h" #include "range.h" +#include "siphash.h" #include "table.h" #include "text.h" #include "thread.h" #include "types.h" +// This value will be randomized on startup in tomo_init(): +extern uint64_t TOMO_HASH_KEY[2]; + // vim: ts=4 sw=0 et cino=L2,l1,(0,W4,m1,\:0 diff --git a/builtins/types.h b/builtins/types.h index 70cf89c3..0b48ab36 100644 --- a/builtins/types.h +++ b/builtins/types.h @@ -9,7 +9,7 @@ struct TypeInfo; -typedef uint32_t (*hash_fn_t)(const void*, const struct TypeInfo*); +typedef uint64_t (*hash_fn_t)(const void*, const struct TypeInfo*); typedef int32_t (*compare_fn_t)(const void*, const void*, const struct TypeInfo*); typedef bool (*equal_fn_t)(const void*, const void*, const struct TypeInfo*); typedef Text_t (*text_fn_t)(const void*, bool, const struct TypeInfo*); |
