shared: switch our hash table implementation over to SipHash

SipHash appears to be the new gold standard for hashing smaller strings for hashtables these days, so let's make use of it.
author: Lennart Poettering <lennart@poettering.net> 2013-12-22 19:59:12 +0100
committer: Lennart Poettering <lennart@poettering.net> 2013-12-22 21:12:25 +0100
commit: 9bf3b53533cdc9b95c921b71da755401f223f765 (patch)
tree: 812e99b25cc09f5d5d3b130d25a02754283ff7a7 /src/shared
parent: 14f862a508ee64466fa8b3f036797d472f4d03ed (diff)
7 files changed, 245 insertions, 87 deletions
diff --git a/src/shared/ask-password-api.c b/src/shared/ask-password-api.c
index 755abf0b5e..c9c82b2520 100644
--- a/src/shared/ask-password-api.c
+++ b/src/shared/ask-password-api.c
@@ -262,7 +262,7 @@ static int create_socket(char **name) {
                 return -errno;
         }
 
-        snprintf(sa.un.sun_path, sizeof(sa.un.sun_path)-1, "/run/systemd/ask-password/sck.%llu", random_ull());
+        snprintf(sa.un.sun_path, sizeof(sa.un.sun_path)-1, "/run/systemd/ask-password/sck.%" PRIx64, random_u64());
 
         RUN_WITH_UMASK(0177) {
                 r = bind(fd, &sa.sa, offsetof(struct sockaddr_un, sun_path) + strlen(sa.un.sun_path));
diff --git a/src/shared/hashmap.c b/src/shared/hashmap.c
index 3762e3ab0d..b1dccaf4e7 100644
--- a/src/shared/hashmap.c
+++ b/src/shared/hashmap.c
@@ -24,13 +24,10 @@
 #include <string.h>
 #include <errno.h>
 
-#ifdef HAVE_SYS_AUXV_H
-#include <sys/auxv.h>
-#endif
-
 #include "util.h"
 #include "hashmap.h"
 #include "macro.h"
+#include "siphash24.h"
 
 #define INITIAL_N_BUCKETS 31
 
@@ -50,8 +47,8 @@ struct Hashmap {
         struct hashmap_entry ** buckets;
         unsigned n_buckets, n_entries;
 
-        unsigned random_xor;
-        bool from_pool;
+        uint8_t hash_key[HASH_KEY_SIZE];
+        bool from_pool:1;
 };
 
 struct pool {
@@ -134,51 +131,60 @@ __attribute__((destructor)) static void cleanup_pool(void) {
 
 #endif
 
-unsigned string_hash_func(const void *p) {
-        unsigned hash = 5381;
-        const signed char *c;
-
-        /* DJB's hash function */
-
-        for (c = p; *c; c++)
-                hash = (hash << 5) + hash + (unsigned) *c;
-
-        return hash;
+unsigned long string_hash_func(const void *p, const uint8_t hash_key[HASH_KEY_SIZE]) {
+        uint64_t u;
+        siphash24((uint8_t*) &u, p, strlen(p), hash_key);
+        return (unsigned long) u;
 }
 
 int string_compare_func(const void *a, const void *b) {
         return strcmp(a, b);
 }
 
-unsigned trivial_hash_func(const void *p) {
-        return PTR_TO_UINT(p);
+unsigned long trivial_hash_func(const void *p, const uint8_t hash_key[HASH_KEY_SIZE]) {
+        uint64_t u;
+        siphash24((uint8_t*) &u, &p, sizeof(p), hash_key);
+        return (unsigned long) u;
 }
 
 int trivial_compare_func(const void *a, const void *b) {
         return a < b ? -1 : (a > b ? 1 : 0);
 }
 
-unsigned uint64_hash_func(const void *p) {
+unsigned long uint64_hash_func(const void *p, const uint8_t hash_key[HASH_KEY_SIZE]) {
         uint64_t u;
-
-        assert_cc(sizeof(uint64_t) == 2*sizeof(unsigned));
-
-        u = *(const uint64_t*) p;
-
-        return (unsigned) ((u >> 32) ^ u);
+        siphash24((uint8_t*) &u, p, sizeof(uint64_t), hash_key);
+        return (unsigned long) u;
 }
 
 int uint64_compare_func(const void *_a, const void *_b) {
         uint64_t a, b;
-
         a = *(const uint64_t*) _a;
         b = *(const uint64_t*) _b;
-
         return a < b ? -1 : (a > b ? 1 : 0);
 }
 
 static unsigned bucket_hash(Hashmap *h, const void *p) {
-        return (h->hash_func(p) ^ h->random_xor) % h->n_buckets;
+        return (unsigned) (h->hash_func(p, h->hash_key) % h->n_buckets);
+}
+
+static void get_hash_key(uint8_t hash_key[HASH_KEY_SIZE], bool reuse_is_ok) {
+        static uint8_t current[HASH_KEY_SIZE];
+        static bool current_initialized = false;
+
+        /* Returns a hash function key to use. In order to keep things
+         * fast we will not generate a new key each time we allocate a
+         * new hash table. Instead, we'll just reuse the most recently
+         * generated one, except if we never generated one or when we
+         * are rehashing an entire hash table because we reached a
+         * fill level */
+
+        if (!current_initialized || !reuse_is_ok) {
+                random_bytes(current, sizeof(current));
+                current_initialized = true;
+        }
+
+        memcpy(hash_key, current, sizeof(current));
 }
 
 Hashmap *hashmap_new(hash_func_t hash_func, compare_func_t compare_func) {
@@ -214,21 +220,7 @@ Hashmap *hashmap_new(hash_func_t hash_func, compare_func_t compare_func) {
 
         h->from_pool = b;
 
-        /* Let's randomize our hash functions a bit so that they are
-         * harder to guess for clients. For this, start out by cheaply
-         * using some bits the kernel passed into the process using
-         * the auxiliary vector. If the hashmap grows later on we will
-         * rehash everything using a new random XOR mask from
-         * /dev/random. */
-#ifdef HAVE_SYS_AUXV_H
-        {
-                void *auxv;
-                auxv = (void*) getauxval(AT_RANDOM);
-                h->random_xor = auxv ? *(unsigned*) auxv : random_u();
-        }
-#else
-        h->random_xor = random_u();
-#endif
+        get_hash_key(h->hash_key, true);
 
         return h;
 }
@@ -407,7 +399,8 @@ static struct hashmap_entry *hash_scan(Hashmap *h, unsigned hash, const void *ke
 
 static bool resize_buckets(Hashmap *h) {
         struct hashmap_entry **n, *i;
-        unsigned m, nxor;
+        unsigned m;
+        uint8_t nkey[HASH_KEY_SIZE];
 
         assert(h);
 
@@ -422,15 +415,15 @@ static bool resize_buckets(Hashmap *h) {
         if (!n)
                 return false;
 
-        /* Let's use a different randomized xor value for the
+        /* Let's use a different randomized hash key for the
          * extension, so that people cannot guess what we are using
          * here forever */
-        nxor = random_u();
+        get_hash_key(nkey, false);
 
         for (i = h->iterate_list_head; i; i = i->iterate_next) {
-                unsigned hash, x;
+                unsigned long old_bucket, new_bucket;
 
-                hash = h->hash_func(i->key);
+                old_bucket = h->hash_func(i->key, h->hash_key) % h->n_buckets;
 
                 /* First, drop from old bucket table */
                 if (i->bucket_next)
@@ -439,16 +432,16 @@ static bool resize_buckets(Hashmap *h) {
                 if (i->bucket_previous)
                         i->bucket_previous->bucket_next = i->bucket_next;
                 else
-                        h->buckets[(hash ^ h->random_xor) % h->n_buckets] = i->bucket_next;
+                        h->buckets[old_bucket] = i->bucket_next;
 
                 /* Then, add to new backet table */
-                x = (hash ^ nxor) % m;
+                new_bucket = h->hash_func(i->key, nkey)  % m;
 
-                i->bucket_next = n[x];
+                i->bucket_next = n[new_bucket];
                 i->bucket_previous = NULL;
-                if (n[x])
-                        n[x]->bucket_previous = i;
-                n[x] = i;
+                if (n[new_bucket])
+                        n[new_bucket]->bucket_previous = i;
+                n[new_bucket] = i;
         }
 
         if (h->buckets != (struct hashmap_entry**) ((uint8_t*) h + ALIGN(sizeof(Hashmap))))
@@ -456,7 +449,8 @@ static bool resize_buckets(Hashmap *h) {
 
         h->buckets = n;
         h->n_buckets = m;
-        h->random_xor = nxor;
+
+        memcpy(h->hash_key, nkey, HASH_KEY_SIZE);
 
         return true;
 }
diff --git a/src/shared/hashmap.h b/src/shared/hashmap.h
index b912af8d8f..154f68eaf0 100644
--- a/src/shared/hashmap.h
+++ b/src/shared/hashmap.h
@@ -31,6 +31,8 @@
  * for all read operations. That way it is not necessary to
  * instantiate an object for each Hashmap use. */
 
+#define HASH_KEY_SIZE 16
+
 typedef struct Hashmap Hashmap;
 typedef struct _IteratorStruct _IteratorStruct;
 typedef _IteratorStruct* Iterator;
@@ -38,19 +40,19 @@ typedef _IteratorStruct* Iterator;
 #define ITERATOR_FIRST ((Iterator) 0)
 #define ITERATOR_LAST ((Iterator) -1)
 
-typedef unsigned (*hash_func_t)(const void *p);
+typedef unsigned long (*hash_func_t)(const void *p, const uint8_t hash_key[HASH_KEY_SIZE]);
 typedef int (*compare_func_t)(const void *a, const void *b);
 
-unsigned string_hash_func(const void *p) _pure_;
+unsigned long string_hash_func(const void *p, const uint8_t hash_key[HASH_KEY_SIZE]) _pure_;
 int string_compare_func(const void *a, const void *b) _pure_;
 
 /* This will compare the passed pointers directly, and will not
  * dereference them. This is hence not useful for strings or
  * suchlike. */
-unsigned trivial_hash_func(const void *p) _const_;
+unsigned long trivial_hash_func(const void *p, const uint8_t hash_key[HASH_KEY_SIZE]) _pure_;
 int trivial_compare_func(const void *a, const void *b) _const_;
 
-unsigned uint64_hash_func(const void *p) _pure_;
+unsigned long uint64_hash_func(const void *p, const uint8_t hash_key[HASH_KEY_SIZE]) _pure_;
 int uint64_compare_func(const void *a, const void *b) _pure_;
 
 Hashmap *hashmap_new(hash_func_t hash_func, compare_func_t compare_func);
diff --git a/src/shared/siphash24.c b/src/shared/siphash24.c
new file mode 100644
index 0000000000..f68bd283a1
--- /dev/null
+++ b/src/shared/siphash24.c
@@ -0,0 +1,135 @@
+/*
+   SipHash reference C implementation
+
+   Written in 2012 by
+   Jean-Philippe Aumasson <jeanphilippe.aumasson@gmail.com>
+   Daniel J. Bernstein <djb@cr.yp.to>
+
+   To the extent possible under law, the author(s) have dedicated all copyright
+   and related and neighboring rights to this software to the public domain
+   worldwide. This software is distributed without any warranty.
+
+   You should have received a copy of the CC0 Public Domain Dedication along with
+   this software. If not, see <http://creativecommons.org/publicdomain/zero/1.0/>.
+
+   (Minimal changes made by Lennart Poettering, to make clean for inclusion in systemd)
+*/
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "siphash24.h"
+
+typedef uint64_t u64;
+typedef uint32_t u32;
+typedef uint8_t u8;
+
+#define ROTL(x,b) (u64)( ((x) << (b)) | ( (x) >> (64 - (b))) )
+
+#define U32TO8_LE(p, v)         \
+    (p)[0] = (u8)((v)      ); (p)[1] = (u8)((v) >>  8); \
+    (p)[2] = (u8)((v) >> 16); (p)[3] = (u8)((v) >> 24);
+
+#define U64TO8_LE(p, v)         \
+  U32TO8_LE((p),     (u32)((v)      ));   \
+  U32TO8_LE((p) + 4, (u32)((v) >> 32));
+
+#define U8TO64_LE(p) \
+  (((u64)((p)[0])      ) | \
+   ((u64)((p)[1]) <<  8) | \
+   ((u64)((p)[2]) << 16) | \
+   ((u64)((p)[3]) << 24) | \
+   ((u64)((p)[4]) << 32) | \
+   ((u64)((p)[5]) << 40) | \
+   ((u64)((p)[6]) << 48) | \
+   ((u64)((p)[7]) << 56))
+
+#define SIPROUND            \
+  do {              \
+    v0 += v1; v1=ROTL(v1,13); v1 ^= v0; v0=ROTL(v0,32); \
+    v2 += v3; v3=ROTL(v3,16); v3 ^= v2;     \
+    v0 += v3; v3=ROTL(v3,21); v3 ^= v0;     \
+    v2 += v1; v1=ROTL(v1,17); v1 ^= v2; v2=ROTL(v2,32); \
+  } while(0)
+
+/* SipHash-2-4 */
+void siphash24(uint8_t out[8], const void *_in, size_t inlen, const uint8_t k[16])
+{
+  /* "somepseudorandomlygeneratedbytes" */
+  u64 v0 = 0x736f6d6570736575ULL;
+  u64 v1 = 0x646f72616e646f6dULL;
+  u64 v2 = 0x6c7967656e657261ULL;
+  u64 v3 = 0x7465646279746573ULL;
+  u64 b;
+  u64 k0 = U8TO64_LE( k );
+  u64 k1 = U8TO64_LE( k + 8 );
+  u64 m;
+  const u8 *in = _in;
+  const u8 *end = in + inlen - ( inlen % sizeof( u64 ) );
+  const int left = inlen & 7;
+  b = ( ( u64 )inlen ) << 56;
+  v3 ^= k1;
+  v2 ^= k0;
+  v1 ^= k1;
+  v0 ^= k0;
+
+  for ( ; in != end; in += 8 )
+  {
+    m = U8TO64_LE( in );
+#ifdef DEBUG
+    printf( "(%3d) v0 %08x %08x\n", ( int )inlen, ( u32 )( v0 >> 32 ), ( u32 )v0 );
+    printf( "(%3d) v1 %08x %08x\n", ( int )inlen, ( u32 )( v1 >> 32 ), ( u32 )v1 );
+    printf( "(%3d) v2 %08x %08x\n", ( int )inlen, ( u32 )( v2 >> 32 ), ( u32 )v2 );
+    printf( "(%3d) v3 %08x %08x\n", ( int )inlen, ( u32 )( v3 >> 32 ), ( u32 )v3 );
+    printf( "(%3d) compress %08x %08x\n", ( int )inlen, ( u32 )( m >> 32 ), ( u32 )m );
+#endif
+    v3 ^= m;
+    SIPROUND;
+    SIPROUND;
+    v0 ^= m;
+  }
+
+  switch( left )
+  {
+  case 7: b |= ( ( u64 )in[ 6] )  << 48;
+
+  case 6: b |= ( ( u64 )in[ 5] )  << 40;
+
+  case 5: b |= ( ( u64 )in[ 4] )  << 32;
+
+  case 4: b |= ( ( u64 )in[ 3] )  << 24;
+
+  case 3: b |= ( ( u64 )in[ 2] )  << 16;
+
+  case 2: b |= ( ( u64 )in[ 1] )  <<  8;
+
+  case 1: b |= ( ( u64 )in[ 0] ); break;
+
+  case 0: break;
+  }
+
+#ifdef DEBUG
+  printf( "(%3d) v0 %08x %08x\n", ( int )inlen, ( u32 )( v0 >> 32 ), ( u32 )v0 );
+  printf( "(%3d) v1 %08x %08x\n", ( int )inlen, ( u32 )( v1 >> 32 ), ( u32 )v1 );
+  printf( "(%3d) v2 %08x %08x\n", ( int )inlen, ( u32 )( v2 >> 32 ), ( u32 )v2 );
+  printf( "(%3d) v3 %08x %08x\n", ( int )inlen, ( u32 )( v3 >> 32 ), ( u32 )v3 );
+  printf( "(%3d) padding   %08x %08x\n", ( int )inlen, ( u32 )( b >> 32 ), ( u32 )b );
+#endif
+  v3 ^= b;
+  SIPROUND;
+  SIPROUND;
+  v0 ^= b;
+#ifdef DEBUG
+  printf( "(%3d) v0 %08x %08x\n", ( int )inlen, ( u32 )( v0 >> 32 ), ( u32 )v0 );
+  printf( "(%3d) v1 %08x %08x\n", ( int )inlen, ( u32 )( v1 >> 32 ), ( u32 )v1 );
+  printf( "(%3d) v2 %08x %08x\n", ( int )inlen, ( u32 )( v2 >> 32 ), ( u32 )v2 );
+  printf( "(%3d) v3 %08x %08x\n", ( int )inlen, ( u32 )( v3 >> 32 ), ( u32 )v3 );
+#endif
+  v2 ^= 0xff;
+  SIPROUND;
+  SIPROUND;
+  SIPROUND;
+  SIPROUND;
+  b = v0 ^ v1 ^ v2  ^ v3;
+  U64TO8_LE( out, b );
+}
diff --git a/src/shared/siphash24.h b/src/shared/siphash24.h
new file mode 100644
index 0000000000..62e1168a79
--- /dev/null
+++ b/src/shared/siphash24.h
@@ -0,0 +1,6 @@
+#pragma once
+
+#include <inttypes.h>
+#include <sys/types.h>
+
+void siphash24(uint8_t out[8], const void *in, size_t inlen, const uint8_t k[16]);
diff --git a/src/shared/util.c b/src/shared/util.c
index 481c17245d..5c9d0bb730 100644
--- a/src/shared/util.c
+++ b/src/shared/util.c
@@ -61,6 +61,10 @@
 #include <libgen.h>
 #undef basename
 
+#ifdef HAVE_SYS_AUXV_H
+#include <sys/auxv.h>
+#endif
+
 #include "macro.h"
 #include "util.h"
 #include "ioprio.h"
@@ -2345,42 +2349,48 @@ char* dirname_malloc(const char *path) {
         return dir;
 }
 
-unsigned long long random_ull(void) {
+void random_bytes(void *p, size_t n) {
+        static bool srand_called = false;
         _cleanup_close_ int fd;
-        uint64_t ull;
-        ssize_t r;
+        ssize_t k;
+        uint8_t *q;
 
         fd = open("/dev/urandom", O_RDONLY|O_CLOEXEC|O_NOCTTY);
         if (fd < 0)
                 goto fallback;
 
-        r = loop_read(fd, &ull, sizeof(ull), true);
-        if (r != sizeof(ull))
+        k = loop_read(fd, p, n, true);
+        if (k < 0 || (size_t) k != n)
                 goto fallback;
 
-        return ull;
+        return;
 
 fallback:
-        return random() * RAND_MAX + random();
-}
 
-unsigned random_u(void) {
-        _cleanup_close_ int fd;
-        unsigned u;
-        ssize_t r;
+        if (!srand_called) {
 
-        fd = open("/dev/urandom", O_RDONLY|O_CLOEXEC|O_NOCTTY);
-        if (fd < 0)
-                goto fallback;
+#ifdef HAVE_SYS_AUXV_H
+                /* The kernel provides us with a bit of entropy in
+                 * auxv, so let's try to make use of that to seed the
+                 * pseudo-random generator. It's better than
+                 * nothing... */
 
-        r = loop_read(fd, &u, sizeof(u), true);
-        if (r != sizeof(u))
-                goto fallback;
+                void *auxv;
 
-        return u;
+                auxv = (void*) getauxval(AT_RANDOM);
+                if (auxv)
+                        srand(*(unsigned*) auxv);
+                else
+#endif
+                        srand(time(NULL) + gettid());
 
-fallback:
-        return random() * RAND_MAX + random();
+                srand_called = true;
+        }
+
+        /* If some idiot made /dev/urandom unavailable to us, he'll
+         * get a PRNG instead. */
+        for (q = p; q < (uint8_t*) p + n; q ++)
+                *q = rand();
 }
 
 void rename_process(const char name[8]) {
@@ -4137,7 +4147,7 @@ int symlink_atomic(const char *from, const char *to) {
         _cleanup_free_ char *t;
         const char *fn;
         size_t k;
-        unsigned long long ull;
+        uint64_t u;
         unsigned i;
         int r;
 
@@ -4154,10 +4164,10 @@ int symlink_atomic(const char *from, const char *to) {
         t[k] = '.';
         x = stpcpy(t+k+1, fn);
 
-        ull = random_ull();
+        u = random_u64();
         for (i = 0; i < 16; i++) {
-                *(x++) = hexchar(ull & 0xF);
-                ull >>= 4;
+                *(x++) = hexchar(u & 0xF);
+                u >>= 4;
         }
 
         *x = 0;
diff --git a/src/shared/util.h b/src/shared/util.h
index 488ce3ba6d..338d79c7ac 100644
--- a/src/shared/util.h
+++ b/src/shared/util.h
@@ -249,8 +249,19 @@ int make_stdio(int fd);
 int make_null_stdio(void);
 int make_console_stdio(void);
 
-unsigned long long random_ull(void);
-unsigned random_u(void);
+void random_bytes(void *p, size_t n);
+
+static inline uint64_t random_u64(void) {
+        uint64_t u;
+        random_bytes(&u, sizeof(u));
+        return u;
+}
+
+static inline uint32_t random_u32(void) {
+        uint32_t u;
+        random_bytes(&u, sizeof(u));
+        return u;
+}
 
 /* For basic lookup tables with strictly enumerated entries */
 #define __DEFINE_STRING_TABLE_LOOKUP(name,type,scope)                   \
author	Lennart Poettering <lennart@poettering.net>	2013-12-22 19:59:12 +0100
committer	Lennart Poettering <lennart@poettering.net>	2013-12-22 21:12:25 +0100
commit	9bf3b53533cdc9b95c921b71da755401f223f765 (patch)
tree	812e99b25cc09f5d5d3b130d25a02754283ff7a7 /src/shared
parent	14f862a508ee64466fa8b3f036797d472f4d03ed (diff)