├── .package ├── LICENSE ├── README.md ├── rhbloom.c ├── rhbloom.h └── test.c /.package: -------------------------------------------------------------------------------- 1 | file rhbloom.h 2 | file rhbloom.c 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2024 Joshua J Baker 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Robin hood bloom filter 2 | 3 | A [bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) in C that supports dynamic growth using a [robin hood hashmap](https://en.wikipedia.org/wiki/Hash_table#Robin_Hood_hashing). 
4 | 5 | Basically it starts out as a hashmap and grows until the hashmap would need as much memory as the bloom filter's bit array, at which point it becomes a full-on bloom filter. 6 | 7 | ## Example 8 | 9 | ```c 10 | // Create a filter that has a capacity of 10 million and a false positive 11 | // rate of 0.1% 12 | struct rhbloom *filter = rhbloom_new(10000000, 0.001); 13 | 14 | // Add the key 12031 15 | rhbloom_add(filter, 12031); 16 | 17 | // Check if the key exists 18 | if (rhbloom_test(filter, 12031)) { 19 | // Yes, of course it does. 20 | } 21 | 22 | rhbloom_free(filter); 23 | ``` 24 | 25 | ## API 26 | 27 | ```c 28 | rhbloom_new(size_t n, double p); // create a new filter 29 | rhbloom_add(struct rhbloom*, uint64_t key); // add a key (typically a hash) 30 | rhbloom_test(struct rhbloom*, uint64_t key); // test if key probably exists 31 | rhbloom_free(struct rhbloom*); // free the filter 32 | rhbloom_clear(struct rhbloom*); // clear entries without freeing 33 | ``` 34 | 35 | ## Performance 36 | 37 | Here we'll benchmark a filter with a capacity of 10,000,000 and a false positive rate of 1%. 38 | 39 | ```sh 40 | cc -O3 rhbloom.c test.c -lm && ./a.out bench 10000000 0.01 41 | ``` 42 | 43 | ``` 44 | add 10,000,000 ops in 0.221 secs 22.1 ns/op 45,167,118 op/sec 45 | test (yes) 10,000,000 ops in 0.150 secs 15.0 ns/op 66,673,778 op/sec 46 | test (no) 10,000,000 ops in 0.136 secs 13.6 ns/op 73,576,478 op/sec 47 | Misses 28943 (0.2894% false-positive) 48 | Memory 16.00 MB 49 | ``` 50 | 51 | ## License 52 | 53 | Source code is available under the MIT License. 54 | -------------------------------------------------------------------------------- /rhbloom.c: -------------------------------------------------------------------------------- 1 | // https://github.com/tidwall/rhbloom 2 | // 3 | // Copyright 2024 Joshua J Baker. All rights reserved. 4 | // Use of this source code is governed by an MIT-style 5 | // license that can be found in the LICENSE file. 
6 | // 7 | // rhbloom: Robin hood bloom filter 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "rhbloom.h" 16 | 17 | struct rhbloom { 18 | // allocator 19 | void*(*malloc)(size_t); 20 | void(*free)(void*); 21 | 22 | // robinhood fields 23 | size_t count; // number of keys in hashtable 24 | size_t nbuckets; // number of buckets 25 | uint64_t *buckets; // hashtable buckets 26 | 27 | // bloom fields 28 | size_t k; // number of bits per key 29 | size_t m; // number of bits total 30 | uint8_t *bits; // bloom bits 31 | }; 32 | 33 | // dib/key entry as a uint64 34 | #define RHBLOOM_KEY(x) ((uint64_t)(x)<<8>>8) 35 | #define RHBLOOM_DIB(x) (int)((uint64_t)(x)>>56) 36 | #define RHBLOOM_SETKEYDIB(key,dib) (RHBLOOM_KEY((key))|((uint64_t)(dib)<<56)) 37 | 38 | struct rhbloom *rhbloom_new_with_allocator(size_t n, double p, 39 | void*(*_malloc)(size_t), void(*_free)(void*)) 40 | { 41 | if (n < 16) n = 16; 42 | 43 | // Calculate the total number of bits needed 44 | size_t m = n * log(p) / log(1 / pow(2, log(2))); 45 | 46 | // Calculate the bits per key 47 | size_t k = round(((double)m / (double)(n)) * log(2)); 48 | 49 | // Adjust the number of bit to power of two 50 | size_t m0 = 2; 51 | while (m0 < m) { 52 | m0 *= 2; 53 | } 54 | size_t k0 = round((double)m / (double)m0 * (double)k); 55 | 56 | _malloc = _malloc ? _malloc : malloc; 57 | _free = _free ? 
_free : free; 58 | 59 | struct rhbloom *rhbloom = _malloc(sizeof(struct rhbloom)); 60 | if (!_malloc) { 61 | return 0; 62 | } 63 | rhbloom->malloc = _malloc; 64 | rhbloom->free = _free; 65 | rhbloom->count = 0; 66 | rhbloom->nbuckets = 0; 67 | rhbloom->buckets = 0; 68 | rhbloom->k = k0; 69 | rhbloom->m = m0; 70 | rhbloom->bits = 0; 71 | return rhbloom; 72 | } 73 | 74 | /// Create a new filter 75 | /// @param n maximum number of keys that can exist in filter to 76 | /// @return NULL if out of memory 77 | struct rhbloom *rhbloom_new(size_t n, double p) { 78 | return rhbloom_new_with_allocator(n, p, malloc, free); 79 | } 80 | 81 | /// Free the filter 82 | void rhbloom_free(struct rhbloom *rhbloom) { 83 | if (rhbloom->bits) { 84 | rhbloom->free(rhbloom->bits); 85 | } 86 | if (rhbloom->buckets) { 87 | rhbloom->free(rhbloom->buckets); 88 | } 89 | rhbloom->count = 0; 90 | rhbloom->nbuckets = 0; 91 | rhbloom->buckets = 0; 92 | rhbloom->bits = 0; 93 | rhbloom->free(rhbloom); 94 | } 95 | 96 | static uint64_t rhbloom_mix(uint64_t key) { 97 | // https://zimbry.blogspot.com/2011/09/better-bit-mixing-improving-on.html 98 | // hash u64 using mix13 99 | key ^= key >> 30; 100 | key *= UINT64_C(0xbf58476d1ce4e5b9); 101 | key ^= key >> 27; 102 | key *= UINT64_C(0x94d049bb133111eb); 103 | key ^= key >> 31; 104 | return key; 105 | } 106 | 107 | static bool rhbloom_testadd(struct rhbloom *rhbloom, uint64_t key, bool add) { 108 | // We only want the 56-bit key in order to match correcly with the 109 | // robinhood entries, upon upgrade. 110 | key = RHBLOOM_KEY(key); 111 | 112 | // Add or check each bit 113 | size_t i = 0; 114 | size_t j = key & (rhbloom->m-1); 115 | while (1) { 116 | if (add) { 117 | rhbloom->bits[j>>3] |= add<<(j&7); 118 | } else if (!((rhbloom->bits[j>>3]>>(j&7))&1)) { 119 | return false; 120 | } 121 | if (i == rhbloom->k-1) { 122 | break; 123 | } 124 | // Pick the next bit. Use part of the mix13 forumula to help get a more 125 | // randomized value. 
126 | key *= UINT64_C(0x94d049bb133111eb); 127 | key ^= key >> 31; 128 | j = key & (rhbloom->m-1); 129 | i++; 130 | } 131 | 132 | return true; 133 | } 134 | 135 | static bool rhbloom_addkey(struct rhbloom *rhbloom, uint64_t key); 136 | 137 | static bool rhbloom_grow(struct rhbloom *rhbloom) { 138 | size_t nbuckets_old = rhbloom->nbuckets; 139 | uint64_t *buckets_old = rhbloom->buckets; 140 | size_t nbuckets_new = nbuckets_old == 0 ? 16 : nbuckets_old * 2; 141 | if (nbuckets_new * 8 >= rhbloom->m >> 3) { 142 | // Upgrade to bloom filter 143 | rhbloom->bits = rhbloom->malloc(rhbloom->m >> 3); 144 | if (!rhbloom->bits) { 145 | return 0; 146 | } 147 | memset(rhbloom->bits, 0, rhbloom->m >> 3); 148 | rhbloom->count = 0; 149 | rhbloom->nbuckets = 0; 150 | rhbloom->buckets = 0; 151 | for (size_t i = 0; i < nbuckets_old; i++) { 152 | if (RHBLOOM_DIB(buckets_old[i])) { 153 | rhbloom_testadd(rhbloom, buckets_old[i], true); 154 | } 155 | } 156 | } else { 157 | uint64_t *buckets = rhbloom->malloc(nbuckets_new << 3); 158 | if (!buckets) { 159 | return 0; 160 | } 161 | memset(buckets, 0, nbuckets_new << 3); 162 | rhbloom->count = 0; 163 | rhbloom->nbuckets = nbuckets_new; 164 | rhbloom->buckets = buckets; 165 | for (size_t i = 0; i < nbuckets_old; i++) { 166 | if (RHBLOOM_DIB(buckets_old[i])) { 167 | rhbloom_addkey(rhbloom, buckets_old[i]); 168 | } 169 | } 170 | } 171 | if (buckets_old) { 172 | rhbloom->free(buckets_old); 173 | } 174 | return true; 175 | } 176 | 177 | static bool rhbloom_addkey(struct rhbloom *rhbloom, uint64_t key) { 178 | key = RHBLOOM_KEY(key); 179 | int dib = 1; 180 | size_t i = key & (rhbloom->nbuckets-1); 181 | while (1) { 182 | if (RHBLOOM_DIB(rhbloom->buckets[i]) == 0) { 183 | rhbloom->buckets[i] = RHBLOOM_SETKEYDIB(key, dib); 184 | rhbloom->count++; 185 | return true; 186 | } 187 | if (RHBLOOM_KEY(rhbloom->buckets[i]) == key) { 188 | return true; 189 | } 190 | if (RHBLOOM_DIB(rhbloom->buckets[i]) < dib) { 191 | uint64_t tmp = rhbloom->buckets[i]; 192 | 
rhbloom->buckets[i] = RHBLOOM_SETKEYDIB(key, dib); 193 | key = RHBLOOM_KEY(tmp); 194 | dib = RHBLOOM_DIB(tmp); 195 | } 196 | dib++; 197 | i = (i + 1) & (rhbloom->nbuckets-1); 198 | } 199 | } 200 | 201 | /// Adds a key to the filter. 202 | /// @return true if key was added or false if out of memory 203 | bool rhbloom_add(struct rhbloom *rhbloom, uint64_t key) { 204 | key = rhbloom_mix(key); 205 | while (1) { 206 | if (rhbloom->bits) { 207 | rhbloom_testadd(rhbloom, key, true); 208 | return true; 209 | } 210 | if (rhbloom->count == rhbloom->nbuckets >> 1) { 211 | if (!rhbloom_grow(rhbloom)) { 212 | return false; 213 | } 214 | continue; 215 | } 216 | break; 217 | } 218 | return rhbloom_addkey(rhbloom, key); 219 | } 220 | 221 | /// Check if key probably exists in filter. 222 | /// @return true if probably exists or false if not exists 223 | bool rhbloom_test(struct rhbloom *rhbloom, uint64_t key) { 224 | key = rhbloom_mix(key); 225 | if (rhbloom->bits) { 226 | return rhbloom_testadd(rhbloom, key, false); 227 | } 228 | if (!rhbloom->buckets) { 229 | return false; 230 | } 231 | key = RHBLOOM_KEY(key); 232 | int dib = 1; 233 | size_t i = key & (rhbloom->nbuckets-1); 234 | while (1) { 235 | bool yes = RHBLOOM_KEY(rhbloom->buckets[i]) == key; 236 | bool no = RHBLOOM_DIB(rhbloom->buckets[i]) < dib; 237 | if (yes || no) { 238 | return yes; 239 | } 240 | dib++; 241 | i = (i + 1) & (rhbloom->nbuckets-1); 242 | } 243 | } 244 | 245 | /// Get the memory size in bytes of this filter. 246 | size_t rhbloom_memsize(struct rhbloom *rhbloom) { 247 | size_t size = sizeof(struct rhbloom); 248 | size += rhbloom->bits ? rhbloom->m >> 3 : rhbloom->nbuckets << 3; 249 | return size; 250 | } 251 | 252 | /// Returns true if been upgraded to a bloom filter 253 | bool rhbloom_upgraded(struct rhbloom *rhbloom) { 254 | return !!rhbloom->bits; 255 | } 256 | 257 | /// Clear all entries in the filter without freeing resources. 
258 | void rhbloom_clear(struct rhbloom *rhbloom) { 259 | if (rhbloom->bits) { 260 | memset(rhbloom->bits, 0, rhbloom->m >> 3); 261 | } else if (rhbloom->buckets) { 262 | memset(rhbloom->buckets, 0, rhbloom->nbuckets << 3); 263 | rhbloom->count = 0; 264 | } 265 | } 266 | -------------------------------------------------------------------------------- /rhbloom.h: -------------------------------------------------------------------------------- 1 | // https://github.com/tidwall/rhbloom 2 | // 3 | // Copyright 2024 Joshua J Baker. All rights reserved. 4 | // Use of this source code is governed by an MIT-style 5 | // license that can be found in the LICENSE file. 6 | // 7 | // rhbloom: Robin hood bloom filter 8 | 9 | #ifndef RHBLOOM_H 10 | #define RHBLOOM_H 11 | 12 | #include 13 | #include 14 | #include 15 | 16 | struct rhbloom; 17 | 18 | struct rhbloom *rhbloom_new(size_t n, double p); 19 | struct rhbloom *rhbloom_new_with_allocator(size_t n, double p, void*(*malloc)(size_t), void(*free)(void*)); 20 | void rhbloom_free(struct rhbloom *rhbloom); 21 | void rhbloom_clear(struct rhbloom *rhbloom); 22 | bool rhbloom_add(struct rhbloom *rhbloom, uint64_t key); 23 | bool rhbloom_test(struct rhbloom *rhbloom, uint64_t key); 24 | size_t rhbloom_memsize(struct rhbloom *rhbloom); 25 | bool rhbloom_upgraded(struct rhbloom *rhbloom); 26 | 27 | #endif // RHBLOOM_H 28 | -------------------------------------------------------------------------------- /test.c: -------------------------------------------------------------------------------- 1 | // # Run tests 2 | // $ cc *.c && ./a.out 3 | // 4 | // # run benchmarks 5 | // $ cc *.c && ./a.out bench 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "rhbloom.h" 13 | 14 | unsigned int murmurhash2(const void * key, int len, const unsigned int seed) { 15 | const unsigned int m = 0x5bd1e995; 16 | const int r = 24; 17 | unsigned int h = seed ^ len; 18 | const unsigned char * data = (const unsigned char *)key; 
19 | while(len >= 4) { 20 | unsigned int k = *(unsigned int *)data; 21 | k *= m; 22 | k ^= k >> r; 23 | k *= m; 24 | h *= m; 25 | h ^= k; 26 | data += 4; 27 | len -= 4; 28 | } 29 | switch(len) { 30 | case 3: h ^= data[2] << 16; 31 | case 2: h ^= data[1] << 8; 32 | case 1: h ^= data[0]; 33 | h *= m; 34 | }; 35 | h ^= h >> 13; 36 | h *= m; 37 | h ^= h >> 15; 38 | return h; 39 | } 40 | 41 | uint64_t hash(int x) { 42 | return murmurhash2(&x, sizeof(int), 0); 43 | } 44 | 45 | double now(void) { 46 | struct timespec now; 47 | clock_gettime(CLOCK_MONOTONIC, &now); 48 | return (now.tv_sec*1e9 + now.tv_nsec) / 1e9; 49 | } 50 | 51 | char *commaize(unsigned int n) { 52 | char s1[64]; 53 | char *s2 = malloc(64); 54 | assert(s2); 55 | memset(s2, 0, sizeof(64)); 56 | snprintf(s1, sizeof(s1), "%d", n); 57 | int i = strlen(s1)-1; 58 | int j = 0; 59 | while (i >= 0) { 60 | if (j%3 == 0 && j != 0) { 61 | memmove(s2+1, s2, strlen(s2)+1); 62 | s2[0] = ','; 63 | } 64 | memmove(s2+1, s2, strlen(s2)+1); 65 | s2[0] = s1[i]; 66 | i--; 67 | j++; 68 | } 69 | return s2; 70 | } 71 | 72 | #define bench_print(n, start, end) { \ 73 | double elapsed = end - start; \ 74 | double nsop = elapsed/(double)(n)*1e9; \ 75 | char *pops = commaize((n)); \ 76 | char *psec = commaize((double)(n)/elapsed); \ 77 | printf("%s ops in %.3f secs %6.1f ns/op %13s op/sec\n", \ 78 | pops, elapsed, nsop, psec); \ 79 | } 80 | 81 | void test_step(struct rhbloom *rhbloom, int n, double p) { 82 | int nn = n+1; 83 | for (int i = 0; i < nn; i++) { 84 | if (!rhbloom_upgraded(rhbloom)) { 85 | assert(!rhbloom_test(rhbloom, hash(i))); 86 | } 87 | rhbloom_add(rhbloom, hash(i)); 88 | if (!rhbloom_upgraded(rhbloom)) { 89 | assert(rhbloom_test(rhbloom, hash(i))); 90 | } 91 | } 92 | assert(rhbloom_upgraded(rhbloom)); 93 | int hits = 0; 94 | for (int i = 0; i < nn; i++) { 95 | if (rhbloom_test(rhbloom, hash(i))) { 96 | hits++; 97 | } 98 | } 99 | assert(hits == nn); 100 | hits = 0; 101 | 102 | for (int i = nn; i < nn*2; i++) { 103 | if 
(rhbloom_test(rhbloom, hash(i))) { 104 | hits++; 105 | } 106 | } 107 | if ((double)hits/(double)n - p > 0.1 && n > 0) { 108 | printf("n=%d p=%f hits=%d \t(%f)", 109 | n, p, hits, (double)hits/(double)n); 110 | printf(" (%f)", (double)hits/(double)n - p); 111 | printf("\n"); 112 | assert(!"bad probability"); 113 | } 114 | } 115 | 116 | void test(void) { 117 | for (int n = 0; n < 100000; n += 1000) { 118 | for (double p = 0.01; p < 0.70; p += 0.05) { 119 | struct rhbloom *rhbloom = rhbloom_new(n, p); 120 | assert(rhbloom); 121 | test_step(rhbloom, n, p); 122 | // test after clear 123 | rhbloom_clear(rhbloom); 124 | test_step(rhbloom, n, p); 125 | rhbloom_free(rhbloom); 126 | } 127 | } 128 | printf("PASSED\n"); 129 | } 130 | 131 | void bench(int argc, char *argv[]) { 132 | 133 | int N = 1000000; 134 | double P = 0.01; 135 | 136 | 137 | if (argc > 2) { 138 | N = atoi(argv[2]); 139 | } 140 | if (argc > 3) { 141 | P = atof(argv[3]); 142 | } 143 | 144 | uint64_t *hashes = malloc(N*8*2); 145 | assert(hashes); 146 | for (int i = 0; i < N*2; i++) { 147 | hashes[i] = hash(i); 148 | } 149 | 150 | struct rhbloom *rhbloom = rhbloom_new(N, P); 151 | assert(rhbloom); 152 | // exit(1); 153 | double start; 154 | 155 | size_t misses = 0; 156 | for (int j = 0; j < 2; j++) { 157 | if (j > 0) { 158 | printf("-- clear --\n"); 159 | rhbloom_clear(rhbloom); 160 | } 161 | printf("add "); 162 | start = now(); 163 | for (int i = 0; i < N; i++) { 164 | // printf("insert %d (%llu)\n", i, hash(i)); 165 | rhbloom_add(rhbloom, hashes[i]); 166 | } 167 | bench_print(N, start, now()); 168 | 169 | // rhbloom_print(rhbloom); 170 | 171 | printf("test (yes) "); 172 | start = now(); 173 | for (int i = 0; i < N; i++) { 174 | // printf("contains %d (%llu)\n", i, hash(i)); 175 | assert(rhbloom_test(rhbloom, hashes[i])); 176 | } 177 | bench_print(N, start, now()); 178 | 179 | 180 | printf("test (no) "); 181 | misses = 0; 182 | start = now(); 183 | for (int i = N; i < N*2; i++) { 184 | misses += 
rhbloom_test(rhbloom, hashes[i]); 185 | } 186 | bench_print(N, start, now()); 187 | 188 | } 189 | 190 | 191 | printf("Misses %zu (%0.4f%% false-positive)\n", misses, misses / (double)N * 100); 192 | printf("Memory %.2f MB\n", rhbloom_memsize(rhbloom)/1024.0/1024.0); 193 | 194 | rhbloom_free(rhbloom); 195 | free(hashes); 196 | } 197 | 198 | int main(int argc, char *argv[]) { 199 | if (argc > 1 && strcmp(argv[1], "bench") == 0) { 200 | bench(argc, argv); 201 | } else { 202 | test(); 203 | } 204 | return 0; 205 | } --------------------------------------------------------------------------------