├── .gitignore ├── misc ├── .gitignore ├── collisions │ ├── collision_data_v1.2.gz │ ├── collision_data_v1.3.gz │ ├── collision_data_v1.2.gz.png │ ├── collision_data_v1.3.gz.png │ └── dograph ├── 1.1 │ ├── linux.10000.1 │ ├── solaris.10000.1 │ ├── linux.100000.1 │ ├── solaris.100000.1 │ ├── linux.10000000.001 │ ├── solaris.10000000.001 │ ├── solaris.10000000.01 │ ├── linux.10000000.01 │ ├── linux.10000000.1 │ └── solaris.10000000.1 ├── 1.0 │ ├── linux.10000.1 │ ├── solaris.10000.1 │ ├── linux.100000.1 │ ├── linux.10000000.001 │ ├── solaris.100000.1 │ ├── solaris.10000000.001 │ ├── linux.10000000.01 │ ├── solaris.10000000.01 │ ├── linux.10000000.1 │ └── solaris.10000000.1 ├── colgraph ├── test │ ├── basic.c │ └── test.c └── colcmp ├── murmur2 ├── murmurhash2.h ├── README └── MurmurHash2.c ├── Makefile.am ├── README ├── LICENSE ├── bloom.c └── bloom.h /.gitignore: -------------------------------------------------------------------------------- 1 | build 2 | -------------------------------------------------------------------------------- /misc/.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | data.cmp 3 | -------------------------------------------------------------------------------- /misc/collisions/collision_data_v1.2.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shadowsocks/libbloom/HEAD/misc/collisions/collision_data_v1.2.gz -------------------------------------------------------------------------------- /misc/collisions/collision_data_v1.3.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shadowsocks/libbloom/HEAD/misc/collisions/collision_data_v1.3.gz -------------------------------------------------------------------------------- /misc/collisions/collision_data_v1.2.gz.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/shadowsocks/libbloom/HEAD/misc/collisions/collision_data_v1.2.gz.png -------------------------------------------------------------------------------- /misc/collisions/collision_data_v1.3.gz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shadowsocks/libbloom/HEAD/misc/collisions/collision_data_v1.3.gz.png -------------------------------------------------------------------------------- /misc/1.1/linux.10000.1: -------------------------------------------------------------------------------- 1 | 1000, 0 2 | 2000, 0 3 | 3000, 0 4 | 4000, 0 5 | 5000, 0.12 6 | 6000, 0 7 | 7000, 0.28 8 | 8000, 0.2 9 | 9000, 0.32 10 | 10000, 1.04 11 | -------------------------------------------------------------------------------- /misc/1.1/solaris.10000.1: -------------------------------------------------------------------------------- 1 | 1000, 0 2 | 2000, 0 3 | 3000, 0.48 4 | 4000, 1.88 5 | 5000, 4.68 6 | 6000, 11.12 7 | 7000, 25.12 8 | 8000, 42.32 9 | 9000, 69 10 | 10000, 104.8 11 | -------------------------------------------------------------------------------- /misc/1.0/linux.10000.1: -------------------------------------------------------------------------------- 1 | 1000, 0 2 | 2000, 0.24 3 | 3000, 1.44 4 | 4000, 5.16 5 | 5000, 15.24 6 | 6000, 34.2 7 | 7000, 63.96 8 | 8000, 113.92 9 | 9000, 175.72 10 | 10000, 269.24 11 | -------------------------------------------------------------------------------- /misc/1.0/solaris.10000.1: -------------------------------------------------------------------------------- 1 | 1000, 0 2 | 2000, 0.48 3 | 3000, 1.4 4 | 4000, 5.24 5 | 5000, 15.84 6 | 6000, 33.68 7 | 7000, 68.12 8 | 8000, 110.6 9 | 9000, 178.96 10 | 10000, 268.84 11 | -------------------------------------------------------------------------------- /murmur2/murmurhash2.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef 
_BLOOM_MURMURHASH2 3 | #define _BLOOM_MURMURHASH2 4 | 5 | unsigned int murmurhash2(const void * key, int len, const unsigned int seed); 6 | 7 | #endif 8 | -------------------------------------------------------------------------------- /misc/1.1/linux.100000.1: -------------------------------------------------------------------------------- 1 | 10000, 0 2 | 20000, 4.48 3 | 30000, 26.96 4 | 40000, 95.52 5 | 50000, 251.96 6 | 60000, 522.12 7 | 70000, 970.16 8 | 80000, 1622.8 9 | 90000, 2529.24 10 | 100000, 3691.6 11 | -------------------------------------------------------------------------------- /misc/1.0/linux.100000.1: -------------------------------------------------------------------------------- 1 | 10000, 0.16 2 | 20000, 2.64 3 | 30000, 15.32 4 | 40000, 60.36 5 | 50000, 158.32 6 | 60000, 336.48 7 | 70000, 649.32 8 | 80000, 1109.2 9 | 90000, 1784.48 10 | 100000, 2691.52 11 | -------------------------------------------------------------------------------- /misc/1.0/linux.10000000.001: -------------------------------------------------------------------------------- 1 | 1000000, 0 2 | 2000000, 0 3 | 3000000, 0.04 4 | 4000000, 0.32 5 | 5000000, 2.6 6 | 6000000, 13.96 7 | 7000000, 57 8 | 8000000, 184.36 9 | 9000000, 509.52 10 | 10000000, 1210.56 11 | -------------------------------------------------------------------------------- /misc/1.0/solaris.100000.1: -------------------------------------------------------------------------------- 1 | 10000, 0.08 2 | 20000, 2.44 3 | 30000, 18.16 4 | 40000, 59.88 5 | 50000, 151.84 6 | 60000, 342.52 7 | 70000, 656.04 8 | 80000, 1113.56 9 | 90000, 1795.76 10 | 100000, 2690.04 11 | -------------------------------------------------------------------------------- /misc/1.1/solaris.100000.1: -------------------------------------------------------------------------------- 1 | 10000, 0.88 2 | 20000, 13.64 3 | 30000, 78.84 4 | 40000, 243.12 5 | 50000, 569.36 6 | 60000, 1088.84 7 | 70000, 1844.68 8 | 80000, 2843.56 9 | 90000, 4118.68 10 
| 100000, 5672.08 11 | -------------------------------------------------------------------------------- /misc/1.0/solaris.10000000.001: -------------------------------------------------------------------------------- 1 | 1000000, 0 2 | 2000000, 0 3 | 3000000, 0.04 4 | 4000000, 0.24 5 | 5000000, 3.44 6 | 6000000, 12.84 7 | 7000000, 55.8 8 | 8000000, 183.88 9 | 9000000, 493.92 10 | 10000000, 1224.08 11 | -------------------------------------------------------------------------------- /misc/1.1/linux.10000000.001: -------------------------------------------------------------------------------- 1 | 1000000, 0.28 2 | 2000000, 1.04 3 | 3000000, 1.16 4 | 4000000, 3.48 5 | 5000000, 6.76 6 | 6000000, 22.6 7 | 7000000, 65.76 8 | 8000000, 197.8 9 | 9000000, 525.12 10 | 10000000, 1234.48 11 | -------------------------------------------------------------------------------- /Makefile.am: -------------------------------------------------------------------------------- 1 | noinst_LTLIBRARIES = libbloom.la 2 | 3 | libbloom_la_SOURCES = bloom.c murmur2/MurmurHash2.c 4 | libbloom_la_CFLAGS = -I$(top_srcdir)/libbloom -I$(top_srcdir)/libbloom/murmur2 5 | 6 | libbloom_la_LDFLAGS = -static 7 | -------------------------------------------------------------------------------- /misc/1.0/linux.10000000.01: -------------------------------------------------------------------------------- 1 | 1000000, 0 2 | 2000000, 0.12 3 | 3000000, 4.8 4 | 4000000, 39.12 5 | 5000000, 180.68 6 | 6000000, 620.88 7 | 7000000, 1749.44 8 | 8000000, 4168.48 9 | 9000000, 8716.72 10 | 10000000, 16666.44 11 | -------------------------------------------------------------------------------- /misc/1.0/solaris.10000000.01: -------------------------------------------------------------------------------- 1 | 1000000, 0 2 | 2000000, 0.36 3 | 3000000, 4.04 4 | 4000000, 38.52 5 | 5000000, 175.08 6 | 6000000, 631.96 7 | 7000000, 1764.8 8 | 8000000, 4164.16 9 | 9000000, 8780.44 10 | 10000000, 16635.36 11 | 
-------------------------------------------------------------------------------- /misc/1.1/solaris.10000000.001: -------------------------------------------------------------------------------- 1 | 1000000, 1.24 2 | 2000000, 4.24 3 | 3000000, 10.2 4 | 4000000, 21.04 5 | 5000000, 68.72 6 | 6000000, 216.28 7 | 7000000, 689.24 8 | 8000000, 1878.12 9 | 9000000, 4440.2 10 | 10000000, 9494.76 11 | -------------------------------------------------------------------------------- /misc/1.1/solaris.10000000.01: -------------------------------------------------------------------------------- 1 | 1000000, 0.76 2 | 2000000, 3.12 3 | 3000000, 12.24 4 | 4000000, 48.44 5 | 5000000, 203.44 6 | 6000000, 662.32 7 | 7000000, 1793.6 8 | 8000000, 4239.4 9 | 9000000, 8839.64 10 | 10000000, 16732.08 11 | -------------------------------------------------------------------------------- /misc/1.1/linux.10000000.01: -------------------------------------------------------------------------------- 1 | 1000000, 0.52 2 | 2000000, 2.76 3 | 3000000, 42.84 4 | 4000000, 291.12 5 | 5000000, 1222.52 6 | 6000000, 3851.12 7 | 7000000, 9718.96 8 | 8000000, 20957.12 9 | 9000000, 40128.24 10 | 10000000, 70279.2 11 | -------------------------------------------------------------------------------- /misc/1.1/linux.10000000.1: -------------------------------------------------------------------------------- 1 | 1000000, 8.68 2 | 2000000, 236.6 3 | 3000000, 1581.76 4 | 4000000, 5780.84 5 | 5000000, 15509.6 6 | 6000000, 34036.8 7 | 7000000, 64884.52 8 | 8000000, 111586.72 9 | 9000000, 178323.6 10 | 10000000, 267998 11 | -------------------------------------------------------------------------------- /misc/1.0/linux.10000000.1: -------------------------------------------------------------------------------- 1 | 1000000, 8.96 2 | 2000000, 240.44 3 | 3000000, 1561.84 4 | 4000000, 5801.16 5 | 5000000, 15577.6 6 | 6000000, 34093.12 7 | 7000000, 64954.48 8 | 8000000, 112030.32 9 | 9000000, 178613.72 10 | 10000000, 
268721.6 11 | -------------------------------------------------------------------------------- /misc/1.0/solaris.10000000.1: -------------------------------------------------------------------------------- 1 | 1000000, 7.44 2 | 2000000, 231.64 3 | 3000000, 1567.76 4 | 4000000, 5794.88 5 | 5000000, 15564.04 6 | 6000000, 34060.92 7 | 7000000, 64926.88 8 | 8000000, 111990.88 9 | 9000000, 178743 10 | 10000000, 268803.76 11 | -------------------------------------------------------------------------------- /misc/1.1/solaris.10000000.1: -------------------------------------------------------------------------------- 1 | 1000000, 40.68 2 | 2000000, 1007.92 3 | 3000000, 5946.2 4 | 4000000, 19937.72 5 | 5000000, 48851.76 6 | 6000000, 98408.04 7 | 7000000, 173404.48 8 | 8000000, 278122.6 9 | 9000000, 414522.92 10 | 10000000, 585045.72 11 | -------------------------------------------------------------------------------- /murmur2/README: -------------------------------------------------------------------------------- 1 | 2 | MurmurHash2.c is taken from 3 | 4 | http://sites.google.com/site/murmurhash/ 5 | 6 | According to the above document: 7 | 8 | All code is released to the public domain. For business purposes, 9 | Murmurhash is under the MIT license. 10 | -------------------------------------------------------------------------------- /misc/collisions/dograph: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # 4 | # Generate graph from collision test data (requires ploticus). 5 | # See top Makefile target 'collision_test' for details. 
6 | # 7 | # Invocation: 8 | # 9 | # ./dograph DATAFILE 10 | # 11 | 12 | zcat $1 > uncompressed 13 | ploticus -prefab lines data=uncompressed x=1 y=5 -maxfields 8000000 -maxrows 1000000 -maxvector 1000000 pointsym=none -png -o $1.png 14 | rm -f uncompressed 15 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | 2 | Introduction 3 | ------------ 4 | This is libbloom, a simple and small bloom filter implementation in C. 5 | 6 | If you are reading this you probably already know about bloom filters 7 | and why you might use one. If not, the wikipedia article is a good intro: 8 | http://en.wikipedia.org/wiki/Bloom_filter 9 | 10 | 11 | Building 12 | -------- 13 | The Makefile assumes GNU Make, so run 'make' or 'gmake' as appropriate 14 | on your system. 15 | 16 | By default it builds an optimized 64 bit libbloom. See Makefile comments 17 | for other build options. 18 | 19 | The shared library will be in ./build/libbloom.so 20 | 21 | 22 | Sample Usage 23 | ------------ 24 | 25 | #include "bloom.h" 26 | 27 | struct bloom bloom; 28 | bloom_init(&bloom, 1000000, 0.01); 29 | bloom_add(&bloom, buffer, buflen); 30 | 31 | if (bloom_check(&bloom, buffer, buflen)) { 32 | printf("It may be there!\n"); 33 | } 34 | 35 | 36 | Documentation 37 | ------------- 38 | Read bloom.h for more detailed documentation on the public interfaces. 39 | 40 | 41 | License 42 | ------- 43 | This code (except MurmurHash2) is under BSD license. See LICENSE file. 44 | 45 | See murmur2/README for info on MurmurHash2. 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Copyright (c) 2012, Jyri J. Virkki 3 | All rights reserved. 
4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are 7 | met: 8 | 9 | 1. Redistributions of source code must retain the above copyright 10 | notice, this list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /murmur2/MurmurHash2.c: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash2, by Austin Appleby 3 | 4 | // Note - This code makes a few assumptions about how your machine behaves - 5 | 6 | // 1. We can read a 4-byte value from any address without crashing 7 | // 2. sizeof(int) == 4 8 | 9 | // And it has a few limitations - 10 | 11 | // 1. It will not work incrementally. 12 | // 2. 
//-----------------------------------------------------------------------------
// MurmurHash2, by Austin Appleby
//
// Hashes the 'len' bytes starting at 'key', mixed with 'seed', into a
// 32-bit value.
//
// Assumptions and limitations:
//
// 1. sizeof(int) == 4
// 2. It will not work incrementally.
// 3. It will not produce the same results on little-endian and big-endian
//    machines (input words are taken in native byte order).
//
// 'key' may have any alignment: input words are copied byte-by-byte into a
// local variable instead of being read through a cast pointer.
// A 'len' below 1 hashes no input bytes at all.

unsigned int murmurhash2(const void * key, int len, const unsigned int seed)
{
  // 'm' and 'r' are mixing constants generated offline.
  // They're not really 'magic', they just happen to work well.

  const unsigned int m = 0x5bd1e995;
  const int r = 24;

  // Initialize the hash to a 'random' value

  unsigned int h = seed ^ (unsigned int)len;

  // Mix 4 bytes at a time into the hash

  const unsigned char * data = (const unsigned char *)key;

  while (len >= 4)
  {
    // Build the word through an unsigned char view of 'k'. This keeps the
    // native byte order (so hash values are unchanged) while avoiding the
    // misaligned, type-punned load that *(unsigned int *)data performed.
    unsigned int k = 0;
    unsigned char * kbytes = (unsigned char *)&k;

    kbytes[0] = data[0];
    kbytes[1] = data[1];
    kbytes[2] = data[2];
    kbytes[3] = data[3];

    k *= m;
    k ^= k >> r;
    k *= m;

    h *= m;
    h ^= k;

    data += 4;
    len -= 4;
  }

  // Handle the last few bytes of the input array

  switch (len)
  {
  case 3: h ^= (unsigned int)data[2] << 16;
          /* fallthrough */
  case 2: h ^= (unsigned int)data[1] << 8;
          /* fallthrough */
  case 1: h ^= (unsigned int)data[0];
          h *= m;
          break;
  default:
          // 0 trailing bytes (or a non-positive length): nothing to mix in.
          break;
  }

  // Do a few final mixes of the hash to ensure the last few
  // bytes are well-incorporated.

  h ^= h >> 13;
  h *= m;
  h ^= h >> 15;

  return h;
}
Or you can use 22 | # any other graphing app or tool to process the 'data' file. 23 | # 24 | 25 | $rounds = 25; 26 | 27 | $size = shift(@ARGV); 28 | if (!$size) { 29 | die "provide a size\n"; 30 | } 31 | 32 | $error = shift(@ARGV); 33 | if (!$error) { 34 | die "provide expected error\n"; 35 | } 36 | 37 | open(OUT, ">data"); 38 | for ($tenth = 1; $tenth < 11; $tenth++) { 39 | $count = ($size / 10) * $tenth; 40 | 41 | $avg = 0; 42 | for ($n = 0; $n < $rounds; $n++) { 43 | open(RES, "../build/test-libbloom -c $size $error $count |"); 44 | while() {$got = $_;} 45 | close(RES); 46 | ($added, $coll) = $got =~ /count: (\d+), coll: (\d+)/; 47 | $avg += $coll; 48 | } 49 | $avg /= $rounds; 50 | print "ADDED $added, AVG.COLL $avg\n"; 51 | print OUT "$added, $avg\n"; 52 | } 53 | close(OUT); 54 | 55 | $cmd = "ploticus -prefab lines data=data x=1 y=2 \"xrange=0 $size\" " . 56 | "\"title=size = $size\" \"ylbl=collisions\" \"xlbl=count\" "; 57 | print "$cmd\n"; 58 | system($cmd); 59 | -------------------------------------------------------------------------------- /misc/test/basic.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016, Jyri J. Virkki 3 | * All rights reserved. 4 | * 5 | * This file is under BSD license. See LICENSE file. 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | 18 | #include "bloom.h" 19 | 20 | #ifdef __linux 21 | #include 22 | #include 23 | #endif 24 | 25 | 26 | /** *************************************************************************** 27 | * A few simple tests to check if it works at all. 28 | * 29 | * These are covered in the main test, repeated here just to create a test 30 | * executable using the static libbloom library to exercise it as well. 
/** ***************************************************************************
 * A few simple tests to check if it works at all, built as a standalone
 * executable.
 *
 * The command-line arguments are accepted but never read.
 *
 * NOTE: every bloom_init/bloom_add/bloom_check call below lives inside an
 * assert(). Compiling with NDEBUG removes those calls entirely, so this
 * program is only meaningful with assertions enabled.
 *
 * There is no explicit return statement; a failed assert aborts the program.
 *
 */
int main(int argc, char **argv)
{
  struct bloom bloom;

  printf("----- Basic tests with static library -----\n");

  // Invalid parameters (zero entries, zero error) must be rejected with 1
  // and leave the struct marked as not ready.
  assert(bloom_init(&bloom, 0, 1.0) == 1);
  assert(bloom_init(&bloom, 10, 0) == 1);
  assert(bloom.ready == 0);

  // Operations on a filter that is not ready report -1.
  assert(bloom_add(&bloom, "hello world", 11) == -1);
  assert(bloom_check(&bloom, "hello world", 11) == -1);

  // Freeing a filter that never became ready must be harmless.
  bloom_free(&bloom);

  // Valid parameters: initialization succeeds and marks the struct ready.
  assert(bloom_init(&bloom, 102, 0.1) == 0);
  assert(bloom.ready == 1);
  bloom_print(&bloom);

  // An element is absent before it is added, present afterwards, and a
  // repeated add reports that it was already there (return value > 0).
  assert(bloom_check(&bloom, "hello world", 11) == 0);
  assert(bloom_add(&bloom, "hello world", 11) == 0);
  assert(bloom_check(&bloom, "hello world", 11) == 1);
  assert(bloom_add(&bloom, "hello world", 11) > 0);

  // "hello" is a prefix of the previous key but a distinct element.
  assert(bloom_add(&bloom, "hello", 5) == 0);
  assert(bloom_add(&bloom, "hello", 5) > 0);
  assert(bloom_check(&bloom, "hello", 5) == 1);
  bloom_free(&bloom);
  printf("----- DONE Basic tests with static library -----\n");
}
23 | # 24 | 25 | $sets = 0; 26 | while (@ARGV) { 27 | $files[$sets] = shift(@ARGV); 28 | if (! -e $files[$sets]) { 29 | die "$files[$sets] does not exist!\n"; 30 | } 31 | $sets++; 32 | } 33 | 34 | if ($sets < 1) { 35 | die "Need at least some data!\n"; 36 | } 37 | 38 | for ($n = 0; $n < $sets; $n++) { 39 | local *FILE; 40 | open(FILE, $files[$n]); 41 | $fh[$n] = *FILE; 42 | } 43 | 44 | open(OUT, ">data.cmp"); 45 | 46 | # Read one line at a time from each input file. 47 | # All files must be comparable, meaning the count column 48 | # for each line must be the same, else give up. 49 | 50 | $points = 0; 51 | 52 | while(readline $fh[0]) { 53 | chomp; 54 | ($count, $avg[0]) = /(\d+), (\S+)/; 55 | 56 | for ($n = 1; $n < $sets; $n++) { 57 | $_ = readline $fh[$n]; chomp; 58 | ($nc, $avg[$n]) = /(\d+), (\S+)/; 59 | if ($nc ne $count) { 60 | die "Mismatch in file $files[$n]: $nc instead of $count\n"; 61 | } 62 | } 63 | 64 | print OUT $count; 65 | for ($n = 0; $n < $sets; $n++) { 66 | print OUT " $avg[$n]"; 67 | } 68 | print OUT "\n"; 69 | 70 | $points++; 71 | } 72 | 73 | # Close all files... 74 | for ($n = 0; $n < $sets; $n++) { 75 | close($fh[$n]); 76 | } 77 | close(OUT); 78 | 79 | if (!$points) { 80 | die "Nothing to show!\n"; 81 | } 82 | 83 | $cmd = "ploticus -prefab lines data=data.cmp " . 84 | " \"xrange=0 $count\" " . 85 | " \"title=size = $count\" " . 86 | "\"ylbl=collisions\" \"xlbl=count\" " . 87 | "x=1 y=2 "; 88 | 89 | for ($n = 1; $n < $sets; $n++) { 90 | $cmd .= " y" . ($n+1) . "=" . ($n+2); 91 | } 92 | 93 | $cmd .= " \"name=$files[0]\" "; 94 | for ($n = 1; $n < $sets; $n++) { 95 | $cmd .= " \"name" . ($n+1) . "=$files[$n]\" "; 96 | } 97 | 98 | print "$cmd\n"; 99 | system($cmd); 100 | -------------------------------------------------------------------------------- /bloom.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2016, Jyri J. Virkki 3 | * All rights reserved. 
/*
 * Test bit number 'x' of the bit array 'buf' and optionally set it.
 *
 * Returns 1 if the bit was already set (buf is left untouched).
 * Returns 0 if the bit was clear; in that case the bit is set when
 * 'set_bit' is non-zero and left clear otherwise.
 *
 * The caller must guarantee that buf holds at least (x / 8) + 1 bytes.
 */
inline static int test_bit_set_bit(unsigned char * buf,
                                   unsigned int x, int set_bit)
{
  const unsigned int index = x / 8;          // byte holding the bit
  const unsigned int bit = 1u << (x & 7u);   // position inside that byte
  const unsigned char current = buf[index];  // single read of the byte

  if ((current & bit) != 0) {
    return 1;
  }

  if (set_bit) {
    buf[index] = (unsigned char)(current | bit);
  }
  return 0;
}
68 | return 0; 69 | } 70 | } 71 | 72 | if (hits == bloom->hashes) { 73 | return 1; // 1 == element already in (or collision) 74 | } 75 | 76 | return 0; 77 | } 78 | 79 | 80 | int bloom_init_size(struct bloom * bloom, int entries, double error, 81 | unsigned int cache_size) 82 | { 83 | return bloom_init(bloom, entries, error); 84 | } 85 | 86 | 87 | int bloom_init(struct bloom * bloom, int entries, double error) 88 | { 89 | bloom->ready = 0; 90 | 91 | if (entries < 1 || error == 0) { 92 | return 1; 93 | } 94 | 95 | bloom->entries = entries; 96 | bloom->error = error; 97 | 98 | double num = log(bloom->error); 99 | double denom = 0.480453013918201; // ln(2)^2 100 | bloom->bpe = -(num / denom); 101 | 102 | double dentries = (double)entries; 103 | bloom->bits = (int)(dentries * bloom->bpe); 104 | 105 | if (bloom->bits % 8) { 106 | bloom->bytes = (bloom->bits / 8) + 1; 107 | } else { 108 | bloom->bytes = bloom->bits / 8; 109 | } 110 | 111 | bloom->hashes = (int)ceil(0.693147180559945 * bloom->bpe); // ln(2) 112 | 113 | bloom->bf = (unsigned char *)calloc(bloom->bytes, sizeof(unsigned char)); 114 | if (bloom->bf == NULL) { 115 | return 1; 116 | } 117 | 118 | bloom->ready = 1; 119 | return 0; 120 | } 121 | 122 | 123 | int bloom_check(struct bloom * bloom, const void * buffer, int len) 124 | { 125 | return bloom_check_add(bloom, buffer, len, 0); 126 | } 127 | 128 | 129 | int bloom_add(struct bloom * bloom, const void * buffer, int len) 130 | { 131 | return bloom_check_add(bloom, buffer, len, 1); 132 | } 133 | 134 | 135 | void bloom_print(struct bloom * bloom) 136 | { 137 | printf("bloom at %p\n", (void *)bloom); 138 | printf(" ->entries = %d\n", bloom->entries); 139 | printf(" ->error = %f\n", bloom->error); 140 | printf(" ->bits = %d\n", bloom->bits); 141 | printf(" ->bits per elem = %f\n", bloom->bpe); 142 | printf(" ->bytes = %d\n", bloom->bytes); 143 | printf(" ->hash functions = %d\n", bloom->hashes); 144 | } 145 | 146 | 147 | void bloom_free(struct bloom * bloom) 148 | 
{ 149 | if (bloom->ready) { 150 | free(bloom->bf); 151 | } 152 | bloom->ready = 0; 153 | } 154 | 155 | 156 | int bloom_reset(struct bloom * bloom) 157 | { 158 | if (!bloom->ready) return 1; 159 | memset(bloom->bf, 0, bloom->bytes); 160 | return 0; 161 | } 162 | 163 | 164 | const char * bloom_version() 165 | { 166 | return MAKESTRING(BLOOM_VERSION); 167 | } 168 | -------------------------------------------------------------------------------- /bloom.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2016, Jyri J. Virkki 3 | * All rights reserved. 4 | * 5 | * This file is under BSD license. See LICENSE file. 6 | */ 7 | 8 | #ifndef _BLOOM_H 9 | #define _BLOOM_H 10 | 11 | #ifdef __cplusplus 12 | extern "C" { 13 | #endif 14 | 15 | 16 | /** *************************************************************************** 17 | * Structure to keep track of one bloom filter. Caller needs to 18 | * allocate this and pass it to the functions below. First call for 19 | * every struct must be to bloom_init(). 20 | * 21 | */ 22 | struct bloom 23 | { 24 | // These fields are part of the public interface of this structure. 25 | // Client code may read these values if desired. Client code MUST NOT 26 | // modify any of these. 27 | int entries; 28 | double error; 29 | int bits; 30 | int bytes; 31 | int hashes; 32 | 33 | // Fields below are private to the implementation. These may go away or 34 | // change incompatibly at any moment. Client code MUST NOT access or rely 35 | // on these. 36 | double bpe; 37 | unsigned char * bf; 38 | int ready; 39 | }; 40 | 41 | 42 | /** *************************************************************************** 43 | * Initialize the bloom filter for use. 
44 | * 45 | * The filter is initialized with a bit field and number of hash functions 46 | * according to the computations from the wikipedia entry: 47 | * http://en.wikipedia.org/wiki/Bloom_filter 48 | * 49 | * Optimal number of bits is: 50 | * bits = -(entries * ln(error)) / ln(2)^2 51 | * 52 | * Optimal number of hash functions is: 53 | * hashes = bpe * ln(2) (bpe = bits per element = bits / entries) 54 | * 55 | * Parameters: 56 | * ----------- 57 | * bloom - Pointer to an allocated struct bloom (see above). 58 | * entries - The expected number of entries which will be inserted. 59 | * error - Probability of collision (as long as entries are not 60 | * exceeded). 61 | * 62 | * Return: 63 | * ------- 64 | * 0 - on success 65 | * 1 - on failure 66 | * 67 | */ 68 | int bloom_init(struct bloom * bloom, int entries, double error); 69 | 70 | 71 | /** *************************************************************************** 72 | * Deprecated, use bloom_init() 73 | * 74 | */ 75 | int bloom_init_size(struct bloom * bloom, int entries, double error, 76 | unsigned int cache_size); 77 | 78 | 79 | /** *************************************************************************** 80 | * Check if the given element is in the bloom filter. Remember this may 81 | * return a false positive if a collision occurred. 82 | * 83 | * Parameters: 84 | * ----------- 85 | * bloom - Pointer to an allocated struct bloom (see above). 86 | * buffer - Pointer to buffer containing element to check. 87 | * len - Size of 'buffer'. 88 | * 89 | * Return: 90 | * ------- 91 | * 0 - element is not present 92 | * 1 - element is present (or false positive due to collision) 93 | * -1 - bloom not initialized 94 | * 95 | */ 96 | int bloom_check(struct bloom * bloom, const void * buffer, int len); 97 | 98 | 99 | /** *************************************************************************** 100 | * Add the given element to the bloom filter. 
101 | * The return code indicates if the element (or a collision) was already in, 102 | * so for the common check+add use case, no need to call check separately. 103 | * 104 | * Parameters: 105 | * ----------- 106 | * bloom - Pointer to an allocated struct bloom (see above). 107 | * buffer - Pointer to buffer containing element to add. 108 | * len - Size of 'buffer'. 109 | * 110 | * Return: 111 | * ------- 112 | * 0 - element was not present and was added 113 | * 1 - element (or a collision) had already been added previously 114 | * -1 - bloom not initialized 115 | * 116 | */ 117 | int bloom_add(struct bloom * bloom, const void * buffer, int len); 118 | 119 | 120 | /** *************************************************************************** 121 | * Print (to stdout) info about this bloom filter. Debugging aid. 122 | * 123 | */ 124 | void bloom_print(struct bloom * bloom); 125 | 126 | 127 | /** *************************************************************************** 128 | * Deallocate internal storage. 129 | * 130 | * Upon return, the bloom struct is no longer usable. You may call bloom_init 131 | * again on the same struct to reinitialize it again. 132 | * 133 | * Parameters: 134 | * ----------- 135 | * bloom - Pointer to an allocated struct bloom (see above). 136 | * 137 | * Return: none 138 | * 139 | */ 140 | void bloom_free(struct bloom * bloom); 141 | 142 | /** *************************************************************************** 143 | * Erase internal storage. 144 | * 145 | * Erases all elements. Upon return, the bloom struct returns to its initial 146 | * (initialized) state. 147 | * 148 | * Parameters: 149 | * ----------- 150 | * bloom - Pointer to an allocated struct bloom (see above). 
151 | * 152 | * Return: 153 | * 0 - on success 154 | * 1 - on failure 155 | * 156 | */ 157 | int bloom_reset(struct bloom * bloom); 158 | 159 | 160 | /** *************************************************************************** 161 | * Returns version string compiled into library. 162 | * 163 | * Return: version string 164 | * 165 | */ 166 | const char * bloom_version(); 167 | 168 | #ifdef __cplusplus 169 | } 170 | #endif 171 | 172 | #endif 173 | -------------------------------------------------------------------------------- /misc/test/test.c: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2012-2016, Jyri J. Virkki 3 | * All rights reserved. 4 | * 5 | * This file is under BSD license. See LICENSE file. 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #include "bloom.h" 20 | 21 | #ifdef __linux 22 | #include 23 | #include 24 | #endif 25 | 26 | 27 | /** *************************************************************************** 28 | * A few simple tests to check if it works at all. 
29 | * 30 | */ 31 | static int basic() 32 | { 33 | printf("----- basic -----\n"); 34 | 35 | struct bloom bloom; 36 | 37 | assert(bloom_init(&bloom, 0, 1.0) == 1); 38 | assert(bloom_init(&bloom, 10, 0) == 1); 39 | assert(bloom.ready == 0); 40 | assert(bloom_add(&bloom, "hello world", 11) == -1); 41 | assert(bloom_check(&bloom, "hello world", 11) == -1); 42 | bloom_free(&bloom); 43 | 44 | assert(bloom_init(&bloom, 102, 0.1) == 0); 45 | assert(bloom.ready == 1); 46 | bloom_print(&bloom); 47 | 48 | assert(bloom_check(&bloom, "hello world", 11) == 0); 49 | assert(bloom_add(&bloom, "hello world", 11) == 0); 50 | assert(bloom_check(&bloom, "hello world", 11) == 1); 51 | assert(bloom_add(&bloom, "hello world", 11) > 0); 52 | assert(bloom_add(&bloom, "hello", 5) == 0); 53 | assert(bloom_add(&bloom, "hello", 5) > 0); 54 | assert(bloom_check(&bloom, "hello", 5) == 1); 55 | bloom_free(&bloom); 56 | 57 | return 0; 58 | } 59 | 60 | 61 | /** *************************************************************************** 62 | * Create a bloom filter with given parameters and add 'count' random elements 63 | * into it to see if collission rates are within expectations. 
64 | * 65 | */ 66 | static int add_random(int entries, double error, int count, 67 | int quiet, int check_error, uint8_t elem_size, int validate) 68 | { 69 | if (!quiet) { 70 | printf("----- add_random(%d, %f, %d, %d, %d, %d, %d) -----\n", 71 | entries, error, count, quiet, check_error, elem_size, validate); 72 | } 73 | 74 | struct bloom bloom; 75 | assert(bloom_init(&bloom, entries, error) == 0); 76 | if (!quiet) { bloom_print(&bloom); } 77 | 78 | char block[elem_size]; 79 | uint8_t * saved = NULL; 80 | uint8_t * savedp = NULL; 81 | int collisions = 0; 82 | int n; 83 | 84 | int fd = open("/dev/urandom", O_RDONLY); 85 | if (fd < 0) { 86 | printf("error: unable to open /dev/random\n"); 87 | exit(1); 88 | } 89 | 90 | if (validate) { 91 | saved = (uint8_t *)malloc(elem_size * count); 92 | if (!saved) { 93 | printf("error: unable to allocate buffer for validation\n"); 94 | exit(1); 95 | } 96 | savedp = saved; 97 | } 98 | 99 | for (n = 0; n < count; n++) { 100 | assert(read(fd, block, elem_size) == elem_size); 101 | memcpy(savedp, block, elem_size); 102 | savedp += elem_size; 103 | if (bloom_add(&bloom, (void *)block, elem_size)) { collisions++; } 104 | } 105 | close(fd); 106 | 107 | double er = (double)collisions / (double)count; 108 | 109 | if (!quiet) { 110 | printf("entries: %d, error: %f, count: %d, coll: %d, error: %f, " 111 | "bytes: %d\n", 112 | entries, error, count, collisions, er, bloom.bytes); 113 | } else { 114 | printf("%d %f %d %d %f %d\n", 115 | entries, error, count, collisions, er, bloom.bytes); 116 | } 117 | 118 | if (check_error && er > error) { 119 | printf("error: expected error %f but observed %f\n", error, er); 120 | exit(1); 121 | } 122 | 123 | if (validate) { 124 | for (n = 0; n < count; n++) { 125 | if (!bloom_check(&bloom, saved + (n * elem_size), elem_size)) { 126 | printf("error: data saved in filter is not there!\n"); 127 | exit(1); 128 | } 129 | } 130 | } 131 | 132 | bloom_free(&bloom); 133 | if (saved) { free(saved); } 134 | return 0; 
135 | } 136 | 137 | 138 | /** *************************************************************************** 139 | * Simple loop to compare performance. 140 | * 141 | */ 142 | static int perf_loop(int entries, int count) 143 | { 144 | printf("----- perf_loop -----\n"); 145 | 146 | struct bloom bloom; 147 | assert(bloom_init(&bloom, entries, 0.001) == 0); 148 | bloom_print(&bloom); 149 | 150 | int i; 151 | int collisions = 0; 152 | 153 | struct timeval tp; 154 | gettimeofday(&tp, NULL); 155 | long before = (tp.tv_sec * 1000L) + (tp.tv_usec / 1000L); 156 | 157 | for (i = 0; i < count; i++) { 158 | if (bloom_add(&bloom, (void *)&i, sizeof(int))) { collisions++; } 159 | } 160 | 161 | gettimeofday(&tp, NULL); 162 | long after = (tp.tv_sec * 1000L) + (tp.tv_usec / 1000L); 163 | 164 | printf("Added %d elements of size %d, took %d ms (collisions=%d)\n", 165 | count, (int)sizeof(int), (int)(after - before), collisions); 166 | 167 | printf("%d,%d,%ld\n", entries, bloom.bytes, after - before); 168 | 169 | bloom_print(&bloom); 170 | bloom_free(&bloom); 171 | 172 | return 0; 173 | } 174 | 175 | 176 | /** *************************************************************************** 177 | * Default set of basic tests. 178 | * 179 | * These should run reasonably quick so they can be run all the time. 180 | * 181 | */ 182 | static int basic_tests() 183 | { 184 | int rv = 0; 185 | 186 | rv += basic(); 187 | rv += add_random(10, 0.1, 10, 0, 1, 32, 1); 188 | rv += add_random(10000, 0.1, 10000, 0, 1, 32, 1); 189 | rv += add_random(10000, 0.01, 10000, 0, 1, 32, 1); 190 | rv += add_random(10000, 0.001, 10000, 0, 1, 32, 1); 191 | rv += add_random(10000, 0.0001, 10000, 0, 1, 32, 1); 192 | rv += add_random(1000000, 0.0001, 1000000, 0, 1, 32, 1); 193 | 194 | printf("\nBrought to you by libbloom-%s\n", bloom_version()); 195 | 196 | return 0; 197 | } 198 | 199 | 200 | /** *************************************************************************** 201 | * Some longer-running tests. 
202 | * 203 | */ 204 | static int larger_tests() 205 | { 206 | int rv = 0; 207 | int e; 208 | 209 | printf("\nAdd 10M elements and verify (0.00001)\n"); 210 | rv += add_random(10000000, 0.00001, 10000000, 0, 1, 32, 1); 211 | 212 | printf("\nChecking collision rates with filters from 100K to 1M (0.001)\n"); 213 | for (e = 100000; e <= 1000000; e+= 100) { 214 | rv += add_random(e, 0.001, e, 1, 1, 8, 1); 215 | } 216 | 217 | return rv; 218 | } 219 | 220 | 221 | /** *************************************************************************** 222 | * With no options, runs brief default tests. 223 | * 224 | * With -L, runs some longer-running tests. 225 | * 226 | * To test collisions over a range of sizes: -G START END INCREMENT ERROR 227 | * This produces output that can be graphed with collisions/dograph 228 | * See also collision_test make target. 229 | * 230 | * To test collisions, run with options: -c ENTRIES ERROR COUNT 231 | * Where 'ENTRIES' is the expected number of entries used to initialize the 232 | * bloom filter and 'ERROR' is the acceptable probability of collision 233 | * used to initialize the bloom filter. 'COUNT' is the actual number of 234 | * entries inserted. 235 | * 236 | * To test performance only, run with options: -p ENTRIES COUNT 237 | * Where 'ENTRIES' is the expected number of entries used to initialize the 238 | * bloom filter and 'COUNT' is the actual number of entries inserted. 239 | * 240 | */ 241 | int main(int argc, char **argv) 242 | { 243 | // Calls return() instead of exit() just to make valgrind mark as 244 | // an error any reachable allocations. That makes them show up 245 | // when running the tests. 
246 | 247 | int rv = 0; 248 | 249 | if (argc == 1) { 250 | printf("----- Running basic tests -----\n"); 251 | rv = basic_tests(); 252 | printf("----- DONE Running basic tests -----\n"); 253 | return rv; 254 | } 255 | 256 | if (!strncmp(argv[1], "-L", 2)) { 257 | return larger_tests(); 258 | } 259 | 260 | if (!strncmp(argv[1], "-G", 2)) { 261 | if (argc != 6) { 262 | printf("-G START END INCREMENT ERROR\n"); 263 | return 1; 264 | } 265 | int e; 266 | for (e = atoi(argv[2]); e <= atoi(argv[3]); e+= atoi(argv[4])) { 267 | rv += add_random(e, atof(argv[5]), e, 1, 0, 32, 1); 268 | } 269 | return rv; 270 | } 271 | 272 | if (!strncmp(argv[1], "-c", 2)) { 273 | if (argc != 5) { 274 | printf("-c ENTRIES ERROR COUNT\n"); 275 | return 1; 276 | } 277 | 278 | return add_random(atoi(argv[2]), atof(argv[3]), atoi(argv[4]), 0, 1, 32, 1); 279 | } 280 | 281 | if (!strncmp(argv[1], "-p", 2)) { 282 | if (argc != 4) { 283 | printf("-p ENTRIES COUNT\n"); 284 | } 285 | return perf_loop(atoi(argv[2]), atoi(argv[3])); 286 | } 287 | 288 | return rv; 289 | } 290 | --------------------------------------------------------------------------------