├── .gitignore ├── Makefile ├── xf8.h ├── README.md ├── UNLICENSE ├── tests ├── test.c └── example.c └── xf8.c /.gitignore: -------------------------------------------------------------------------------- 1 | tests/example 2 | tests/test 3 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CC = cc -std=c99 2 | CFLAGS = -O3 -Wall -Wextra 3 | LDFLAGS = 4 | LDLIBS = 5 | 6 | all: tests/example tests/test 7 | 8 | tests/example: tests/example.c xf8.c xf8.h 9 | $(CC) $(LDFLAGS) $(CFLAGS) -o $@ tests/example.c xf8.c $(LDLIBS) 10 | 11 | tests/test: tests/test.c xf8.c xf8.h 12 | $(CC) $(LDFLAGS) $(CFLAGS) -o $@ tests/test.c xf8.c $(LDLIBS) 13 | 14 | check: tests/test 15 | tests/test 16 | 17 | clean: 18 | rm -f tests/example tests/test 19 | -------------------------------------------------------------------------------- /xf8.h: -------------------------------------------------------------------------------- 1 | /* 8-bit Xor filter 2 | * Filters have a fixed false positive rate of 1/256 (~0.39%). 3 | * Ref: https://arxiv.org/abs/1912.08258 4 | * 5 | * This is free and unencumbered software released into the public domain. 6 | */ 7 | #ifndef XF8_H 8 | #define XF8_H 9 | 10 | #include <stddef.h> 11 | #include <stdint.h> 12 | 13 | #if !defined(XF8_BITS) || XF8_BITS == 8 14 | typedef uint8_t xf8slot; 15 | #elif XF8_BITS == 16 16 | typedef uint16_t xf8slot; 17 | #endif 18 | 19 | struct xf8 { 20 | size_t len; 21 | int seed; 22 | xf8slot slots[]; 23 | }; 24 | 25 | /** 26 | * Allocate a new XOR filter for COUNT elements, returns NULL on OOM. 27 | * This is a simple, flat allocation that is freed using free(). 28 | */ 29 | struct xf8 *xf8_create(size_t count); 30 | 31 | /** 32 | * Set the filter for the given KEYS, returns 0 on OOM. 33 | * There MUST NOT be repeated keys. 
34 | */ 35 | int xf8_populate(struct xf8 *, uint64_t *keys, size_t count); 36 | 37 | /** 38 | * Return non-zero if KEY is probably a member of the XOR filter. 39 | */ 40 | int xf8_member(const struct xf8 *, uint64_t key); 41 | 42 | #endif 43 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 8-bit Xor Filter 2 | 3 | This is a C99 implementation of an 8-bit xor filter. It has a fixed 4 | error rate of 1/256 (~0.39%) and uses ~9.84 bits of storage per element. 5 | See [Xor Filters: Faster and Smaller Than Bloom Filters][ref]. 6 | 7 | ## Memory Usage 8 | 9 | By default only up to 2^32 elements are supported, which will require 10 | 123GB of memory to process. Much larger sets are supported by using 11 | `-DXF8_64BIT` at compile time, at the cost of roughly doubling memory 12 | usage. For example, that same set of 2^32 elements will then require a 13 | total of 211GB of memory to process. 14 | 15 | Despite the library's name, the error rate can be reduced to 1/65,536 16 | (~0.0015%) by using `-DXF8_BITS=16` at compile time. This doubles the 17 | size of the filter to ~19.68 bits per element. 18 | 19 | ## Example 20 | 21 | The example program (`tests/example.c`) is a probabilistic spell checker: 22 | 23 | ``` 24 | $ make 25 | $ tests/example spelling.db 26 | $ printf 'hello\nfoobarbaz' | tests/example spelling.db 27 | Y hello 28 | N foobarbaz 29 | ``` 30 | 31 | [ref]: https://arxiv.org/abs/1912.08258 32 | -------------------------------------------------------------------------------- /UNLICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 
2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 
23 | 24 | For more information, please refer to <https://unlicense.org> 25 | -------------------------------------------------------------------------------- /tests/test.c: -------------------------------------------------------------------------------- 1 | #include "../xf8.h" 2 | #include <stdio.h> 3 | #include <stdlib.h> 4 | 5 | static uint64_t 6 | hash(uint64_t x) 7 | { 8 | x ^= x >> 32; 9 | x *= 0x78a1303cf234a045; 10 | x ^= x >> 32; 11 | x *= 0x098a71082462c3d9; 12 | x ^= x >> 32; 13 | return x; 14 | } 15 | 16 | /* Build a filter from n keys derived from seed, then check that every inserted key is found and that the false positive rate is acceptable. Returns 0 on success or a message describing the failure. */ static char * 17 | test(uint64_t seed, long n) 18 | { 19 | struct xf8 *xf = xf8_create(n); 20 | if (!xf) return "OOM"; /* a failed allocation must not be reported as a pass */ 21 | 22 | uint64_t *keys = malloc(sizeof(keys[0])*n); 23 | if (!keys) { 24 | free(xf); 25 | return "OOM"; 26 | } 27 | for (long i = 0; i < n; i++) { 28 | keys[i] = hash(seed + i); 29 | } 30 | 31 | if (!xf8_populate(xf, keys, n)) { 32 | free(keys); 33 | free(xf); 34 | return "OOM"; 35 | } 36 | free(keys); 37 | 38 | for (long i = 0; i < n; i++) { 39 | uint64_t key = hash(seed + i); 40 | if (!xf8_member(xf, key)) { 41 | free(xf); 42 | return "xf8_member"; 43 | } 44 | } 45 | 46 | long hit = 0; /* probe exactly n inputs outside the inserted range: seed-1 down to seed-n */ 47 | for (long i = -1; i >= -n; i--) { 48 | uint64_t key = hash(seed + i); 49 | hit += xf8_member(xf, key); 50 | } 51 | 52 | double rate = hit / (double)n; 53 | if (rate > 1.5/(sizeof(xf8slot)*8)) { 54 | static char tmp[256]; 55 | sprintf(tmp, "too many false positives: %.17g%%\n", rate*100); 56 | free(xf); /* release the filter on this failure path too */ return tmp; 57 | } 58 | 59 | free(xf); 60 | return 0; 61 | } 62 | 63 | int 64 | main(void) 65 | { 66 | int fails = 0; 67 | 68 | #ifdef _WIN32 69 | /* Best effort enable ANSI escape processing. 
*/ 70 | void *GetStdHandle(unsigned); 71 | int GetConsoleMode(void *, unsigned *); 72 | int SetConsoleMode(void *, unsigned); 73 | void *handle; 74 | unsigned mode; 75 | handle = GetStdHandle(-11); /* STD_OUTPUT_HANDLE */ 76 | if (GetConsoleMode(handle, &mode)) { 77 | mode |= 0x0004; /* ENABLE_VIRTUAL_TERMINAL_PROCESSING */ 78 | SetConsoleMode(handle, mode); /* ignore errors */ 79 | } 80 | #endif 81 | 82 | for (int i = 13; i <= 22; i++) { 83 | int level = 0; 84 | for (int j = 0; j < 8; j++) { 85 | uint64_t seed = hash(i) + hash(-j - 1); 86 | const char *result = test(seed, 1L< 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #define MAXKEYS (1L<<24) 11 | 12 | static uint64_t 13 | hash(const void *buf, size_t len, uint64_t key) 14 | { 15 | const unsigned char *p = buf; 16 | uint64_t h = key; 17 | for (size_t i = 0; i < len; i++) { 18 | h ^= p[i]; 19 | h *= UINT64_C(0x25b751109e05be63); 20 | } 21 | h ^= h >> 32; 22 | h *= UINT64_C(0x2330e1453ed4b9b9); 23 | return h; 24 | } 25 | 26 | int 27 | main(int argc, char *argv[]) 28 | { 29 | uint64_t hashkey = 0x648aaecaca11a629; 30 | 31 | if (argc == 1) { 32 | #ifdef _WIN32 33 | /* Set stdout to binary mode. 
*/ 34 | int _setmode(int, int); 35 | _setmode(1, 0x8000); 36 | #endif 37 | char buf[4096]; 38 | long count = 0; 39 | static uint64_t keys[MAXKEYS]; 40 | while (fgets(buf, sizeof(buf), stdin) && count < MAXKEYS) { 41 | size_t len = strcspn(buf, "\r\n"); 42 | keys[count++] = hash(buf, len, hashkey); 43 | } 44 | 45 | struct xf8 *xf = xf8_create(count); 46 | if (!xf) { 47 | fprintf(stderr, "fatal: out of memory (xf8_create)\n"); 48 | exit(EXIT_FAILURE); 49 | } 50 | if (!xf8_populate(xf, keys, count)) { 51 | fprintf(stderr, "fatal: out of memory (xf8_populate)\n"); 52 | exit(EXIT_FAILURE); 53 | } 54 | fputc(count >> 0 & 0xff, stdout); 55 | fputc(count >> 8 & 0xff, stdout); 56 | fputc(count >> 16 & 0xff, stdout); 57 | fputc(count >> 24 & 0xff, stdout); 58 | fputc(xf->seed, stdout); 59 | fwrite(xf->slots, xf->len, sizeof(xf->slots[0]), stdout); 60 | free(xf); 61 | 62 | if (fflush(stdout)) { // note: is not a 100% sufficient check 63 | fprintf(stderr, "fatal: %s, <stdout>\n", strerror(errno)); 64 | exit(EXIT_FAILURE); 65 | } 66 | 67 | } else { 68 | FILE *f = fopen(argv[1], "rb"); 69 | if (!f) { 70 | fprintf(stderr, "fatal: %s, %s\n", strerror(errno), argv[1]); 71 | exit(EXIT_FAILURE); 72 | } 73 | 74 | unsigned char header[5]; 75 | if (!fread(header, sizeof(header), 1, f)) { 76 | fprintf(stderr, "fatal: cannot read %s\n", argv[1]); 77 | exit(EXIT_FAILURE); 78 | } 79 | unsigned long count = (unsigned long)header[0] << 0 | 80 | (unsigned long)header[1] << 8 | 81 | (unsigned long)header[2] << 16 | 82 | (unsigned long)header[3] << 24; 83 | struct xf8 *xf = xf8_create(count); /* count comes from the file, so creation can fail */ if (!xf) { fprintf(stderr, "fatal: out of memory (xf8_create)\n"); exit(EXIT_FAILURE); } 84 | xf->seed = header[4]; 85 | if (fread(xf->slots, sizeof(xf->slots[0]), xf->len, f) != xf->len) { 86 | fprintf(stderr, "fatal: cannot read %s\n", argv[1]); 87 | exit(EXIT_FAILURE); 88 | } 89 | fclose(f); 90 | 91 | char buf[4096]; 92 | while (fgets(buf, sizeof(buf), stdin)) { 93 | size_t len = strcspn(buf, "\r\n"); 94 | uint64_t key = hash(buf, len, hashkey); 95 | putchar(xf8_member(xf, key) ? 
'Y' : 'N'); 96 | putchar(' '); 97 | fwrite(buf, len, 1, stdout); 98 | putchar('\n'); 99 | } 100 | 101 | free(xf); 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /xf8.c: -------------------------------------------------------------------------------- 1 | /* This is free and unencumbered software released into the public domain. */ 2 | #include 3 | #include 4 | #include "xf8.h" 5 | 6 | static uint64_t 7 | xf8_hash(uint64_t x) 8 | { 9 | x ^= x >> 32; 10 | x *= UINT64_C(0xc6629b183fbdc9a7); 11 | x ^= x >> 32; 12 | x *= UINT64_C(0xc029435c0845c0b3); 13 | x ^= x >> 32; 14 | return x; 15 | } 16 | 17 | /* Adjust this typedef as needed to control the maximum allowed number 18 | * of keys at the cost of higher memory usage. As a 32-bit integer, up 19 | * to 2^32 keys are supported. Using a uint64_t allows for a larger 20 | * number of keys, but roughly doubles total memory usage. 21 | */ 22 | #ifndef XF8_64BIT 23 | typedef uint32_t fx8uint; 24 | static void 25 | xf8_index(const struct xf8 *xf, fx8uint c[3], uint64_t key) 26 | { 27 | size_t len = xf->len / 3; 28 | for (int i = 0; i < 3; i++) { 29 | uint64_t x = (uint32_t)xf8_hash(key + xf->seed*3 + i); 30 | c[i] = ((x*len)>>32) + len*i; 31 | } 32 | } 33 | 34 | #else /* XF8_64BIT */ 35 | typedef uint64_t fx8uint; 36 | static void 37 | xf8_index(const struct xf8 *xf, fx8uint c[3], uint64_t key) 38 | { 39 | size_t len = xf->len / 3; 40 | // TODO: avoid division by unknown denominator 41 | c[0] = xf8_hash(key + xf->seed*3 + 0)%len + len*0; 42 | c[1] = xf8_hash(key + xf->seed*3 + 1)%len + len*1; 43 | c[2] = xf8_hash(key + xf->seed*3 + 2)%len + len*2; 44 | } 45 | #endif 46 | 47 | #define FX8NULL ((fx8uint)-1) 48 | 49 | /* Compute the 3-tuple of indices for KEY. 
*/ 50 | static void xf8_index(const struct xf8 *xf, fx8uint c[3], uint64_t key); 51 | 52 | struct xf8 * 53 | xf8_create(size_t count) 54 | { 55 | struct xf8 *xf = 0; 56 | unsigned long long len = 123ULL*count/100 + 32; 57 | len += (3 - len % 3) % 3; // round up to divisible by 3 58 | if (count < -1ULL/123 && len*sizeof(xf8slot) < (size_t)-1) { 59 | xf = malloc(sizeof(struct xf8) + len*sizeof(xf8slot)); 60 | if (xf) { 61 | xf->len = len; 62 | } 63 | } 64 | return xf; 65 | } 66 | 67 | int 68 | xf8_populate(struct xf8 *xf, uint64_t *keys, size_t count) 69 | { 70 | /* Sets are represented using linked lists with all nodes allocated 71 | * contiguously up front. There are no pointers, just indices into 72 | * the array of linked list nodes, so each third of the set array 73 | * needs its own allocation of linked list nodes. The special index 74 | * value of -1 is like a NULL pointer. 75 | * 76 | * The queue and stack use the same storage: As the queue is 77 | * consumed, the stack overwrites it. The worst case allocation is 78 | * used for the queue so it never needs to be reallocated. 79 | * 80 | * Since all values are fx8uint indices, just allocate everything up 81 | * front as a giant fx8uint buffer, then dice it up. (There's some 82 | * potential here for allowing the caller to make this allocation.) 83 | */ 84 | if (xf->len > (size_t)-1/(5*sizeof(fx8uint))) { 85 | return 0; // overflow 86 | } 87 | fx8uint *buf = malloc((2*xf->len + 3*count)*sizeof(*buf)); 88 | if (!buf) { 89 | return 0; 90 | } 91 | fx8uint *sets = buf; 92 | fx8uint *queue = buf + xf->len; 93 | fx8uint *stack = queue; 94 | fx8uint *nodes[3] = { 95 | buf + 2*xf->len + count*0, 96 | buf + 2*xf->len + count*1, 97 | buf + 2*xf->len + count*2 98 | }; 99 | 100 | for (xf->seed = 0; ; xf->seed++) { 101 | /* Initialize all sets to empty. */ 102 | for (size_t i = 0; i < xf->len; i++) { 103 | sets[i] = FX8NULL; 104 | } 105 | 106 | /* Fills sets with the keys. 
*/ 107 | for (size_t i = 0; i < count; i++) { 108 | fx8uint c[3]; 109 | xf8_index(xf, c, keys[i]); 110 | for (int j = 0; j < 3; j++) { 111 | nodes[j][i] = sets[c[j]]; 112 | sets[c[j]] = i; 113 | } 114 | } 115 | 116 | /* Queue all sets with exactly one element. */ 117 | fx8uint head = 0; 118 | fx8uint tail = 0; 119 | for (size_t i = 0; i < xf->len; i++) { 120 | int j = i / (xf->len / 3); 121 | if (sets[i] != FX8NULL && nodes[j][sets[i]] == FX8NULL) { 122 | queue[head++] = i; 123 | } 124 | } 125 | 126 | /* Process the queue until empty. */ 127 | fx8uint top = 0; 128 | while (head != tail) { 129 | fx8uint i = queue[tail++]; 130 | if (sets[i] != FX8NULL) { 131 | fx8uint k = sets[i]; 132 | fx8uint c[3]; 133 | xf8_index(xf, c, keys[k]); 134 | for (int j = 0; j < 3; j++) { 135 | fx8uint *p = sets + c[j]; 136 | while (*p != k) { 137 | p = nodes[j] + *p; 138 | } 139 | *p = nodes[j][k]; 140 | fx8uint h = sets[c[j]]; 141 | if (h != FX8NULL && nodes[j][h] == FX8NULL) { 142 | queue[head++] = c[j]; 143 | } 144 | } 145 | 146 | /* Push this key index onto the stack. The first set of 147 | * linked list nodes is re-purposed to track the set index 148 | * that belongs to this key. 149 | */ 150 | stack[top++] = k; 151 | nodes[0][k] = i; 152 | } 153 | } 154 | 155 | if (top == count) { 156 | /* Success! Fill out the XOR filter with the results. */ 157 | for (size_t i = 0; i < xf->len; i++) { 158 | xf->slots[i] = 0; 159 | } 160 | while (top) { 161 | fx8uint k = stack[--top]; 162 | fx8uint i = nodes[0][k]; 163 | fx8uint c[3]; 164 | xf8_index(xf, c, keys[k]); 165 | xf8slot *b = xf->slots; 166 | xf->slots[i] = b[c[0]] ^ b[c[1]] ^ b[c[2]] ^ keys[k]; 167 | } 168 | break; 169 | } 170 | 171 | /* Failure. Increment the seed and try again with a new set of 172 | * hash functions. This is very unlikely. 
173 | */ 174 | } 175 | 176 | free(buf); 177 | return 1; 178 | } 179 | 180 | int 181 | xf8_member(const struct xf8 *xf, uint64_t key) 182 | { 183 | fx8uint c[3]; 184 | xf8_index(xf, c, key); 185 | const xf8slot *b = xf->slots; 186 | return (xf8slot)key == (b[c[0]] ^ b[c[1]] ^ b[c[2]]); 187 | } 188 | --------------------------------------------------------------------------------