├── .travis.yml ├── LICENSE ├── sipHash.go ├── words.txt ├── README.md ├── bbloom_test.go └── bbloom.go /.travis.yml: -------------------------------------------------------------------------------- 1 | language: go 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | bbloom.go 2 | 3 | // The MIT License (MIT) 4 | // Copyright (c) 2014 Andreas Briese, eduToolbox@Bri-C GmbH, Sarstedt 5 | 6 | // Permission is hereby granted, free of charge, to any person obtaining a copy of 7 | // this software and associated documentation files (the "Software"), to deal in 8 | // the Software without restriction, including without limitation the rights to 9 | // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 10 | // the Software, and to permit persons to whom the Software is furnished to do so, 11 | // subject to the following conditions: 12 | 13 | // The above copyright notice and this permission notice shall be included in all 14 | // copies or substantial portions of the Software. 15 | 16 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 18 | // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 19 | // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 20 | // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 | // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 22 | 23 | siphash.go 24 | 25 | // https://github.com/dchest/siphash 26 | // 27 | // Written in 2012 by Dmitry Chestnykh. 28 | // 29 | // To the extent possible under law, the author have dedicated all copyright 30 | // and related and neighboring rights to this software to the public domain 31 | // worldwide. 
// sipHash hashes p with the SipHash-2-4 construction (two rounds per 8-byte
// block, four finalization rounds) using a key that is baked into the
// initial state below, and returns the 64-bit digest split into two parts:
//
//	h = digest >> bl.shift               (the upper bits)
//	l = digest << bl.shift >> bl.shift   (the digest with its top bl.shift bits cleared)
//
// The rounds are unrolled by hand; the statement order inside each round is
// significant and must not be changed.
func (bl Bloom) sipHash(p []byte) (l, h uint64) {
	// Initialization: the four state words already have the fixed key mixed
	// in; the trailing comments name the SipHash constant each was XORed with.
	v0 := uint64(8317987320269560794) // k0 ^ 0x736f6d6570736575
	v1 := uint64(7237128889637516672) // k1 ^ 0x646f72616e646f6d
	v2 := uint64(7816392314733513934) // k0 ^ 0x6c7967656e657261
	v3 := uint64(8387220255325274014) // k1 ^ 0x7465646279746573
	// The input length goes into the top byte of the final block.
	t := uint64(len(p)) << 56

	// Compression: consume the input 8 bytes at a time.
	for len(p) >= 8 {

		// Load the next 8 bytes as a little-endian word.
		m := uint64(p[0]) | uint64(p[1])<<8 | uint64(p[2])<<16 | uint64(p[3])<<24 |
			uint64(p[4])<<32 | uint64(p[5])<<40 | uint64(p[6])<<48 | uint64(p[7])<<56

		v3 ^= m

		// Round 1.
		v0 += v1
		v1 = v1<<13 | v1>>51
		v1 ^= v0
		v0 = v0<<32 | v0>>32

		v2 += v3
		v3 = v3<<16 | v3>>48
		v3 ^= v2

		v0 += v3
		v3 = v3<<21 | v3>>43
		v3 ^= v0

		v2 += v1
		v1 = v1<<17 | v1>>47
		v1 ^= v2
		v2 = v2<<32 | v2>>32

		// Round 2.
		v0 += v1
		v1 = v1<<13 | v1>>51
		v1 ^= v0
		v0 = v0<<32 | v0>>32

		v2 += v3
		v3 = v3<<16 | v3>>48
		v3 ^= v2

		v0 += v3
		v3 = v3<<21 | v3>>43
		v3 ^= v0

		v2 += v1
		v1 = v1<<17 | v1>>47
		v1 ^= v2
		v2 = v2<<32 | v2>>32

		v0 ^= m
		p = p[8:]
	}

	// Compress last block: fold the remaining 0-7 bytes (little-endian) into
	// t below the length byte. Each case falls through to the next lower one.
	switch len(p) {
	case 7:
		t |= uint64(p[6]) << 48
		fallthrough
	case 6:
		t |= uint64(p[5]) << 40
		fallthrough
	case 5:
		t |= uint64(p[4]) << 32
		fallthrough
	case 4:
		t |= uint64(p[3]) << 24
		fallthrough
	case 3:
		t |= uint64(p[2]) << 16
		fallthrough
	case 2:
		t |= uint64(p[1]) << 8
		fallthrough
	case 1:
		t |= uint64(p[0])
	}

	v3 ^= t

	// Round 1.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	// Round 2.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	v0 ^= t

	// Finalization: flip the low byte of v2, then run four more rounds.
	v2 ^= 0xff

	// Round 1.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	// Round 2.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	// Round 3.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	// Round 4.
	v0 += v1
	v1 = v1<<13 | v1>>51
	v1 ^= v0
	v0 = v0<<32 | v0>>32

	v2 += v3
	v3 = v3<<16 | v3>>48
	v3 ^= v2

	v0 += v3
	v3 = v3<<21 | v3>>43
	v3 ^= v0

	v2 += v1
	v1 = v1<<17 | v1>>47
	v1 ^= v2
	v2 = v2<<32 | v2>>32

	// The plain SipHash result would be: return v0 ^ v1 ^ v2 ^ v3

	// Split the digest into the two values the filter combines per location.
	hash := v0 ^ v1 ^ v2 ^ v3
	h = hash >> bl.shift
	l = hash << bl.shift >> bl.shift
	return l, h

}
00:00:09 /info.html 12 | 2014/01/01 00:00:10 /info.html 13 | 2014/01/01 00:00:11 /info.html 14 | 2014/01/01 00:00:12 /info.html 15 | 2014/01/01 00:00:13 /info.html 16 | 2014/01/01 00:00:14 /info.html 17 | 2014/01/01 00:00:15 /info.html 18 | 2014/01/01 00:00:16 /info.html 19 | 2014/01/01 00:00:17 /info.html 20 | 2014/01/01 00:00:18 /info.html 21 | 2014/01/01 00:00:19 /info.html 22 | 2014/01/01 00:00:20 /info.html 23 | 2014/01/01 00:00:21 /info.html 24 | 2014/01/01 00:00:22 /info.html 25 | 2014/01/01 00:00:23 /info.html 26 | 2014/01/01 00:00:24 /info.html 27 | 2014/01/01 00:00:25 /info.html 28 | 2014/01/01 00:00:26 /info.html 29 | 2014/01/01 00:00:27 /info.html 30 | 2014/01/01 00:00:28 /info.html 31 | 2014/01/01 00:00:29 /info.html 32 | 2014/01/01 00:00:30 /info.html 33 | 2014/01/01 00:00:31 /info.html 34 | 2014/01/01 00:00:32 /info.html 35 | 2014/01/01 00:00:33 /info.html 36 | 2014/01/01 00:00:34 /info.html 37 | 2014/01/01 00:00:35 /info.html 38 | 2014/01/01 00:00:36 /info.html 39 | 2014/01/01 00:00:37 /info.html 40 | 2014/01/01 00:00:38 /info.html 41 | 2014/01/01 00:00:39 /info.html 42 | 2014/01/01 00:00:40 /info.html 43 | 2014/01/01 00:00:41 /info.html 44 | 2014/01/01 00:00:42 /info.html 45 | 2014/01/01 00:00:43 /info.html 46 | 2014/01/01 00:00:44 /info.html 47 | 2014/01/01 00:00:45 /info.html 48 | 2014/01/01 00:00:46 /info.html 49 | 2014/01/01 00:00:47 /info.html 50 | 2014/01/01 00:00:48 /info.html 51 | 2014/01/01 00:00:49 /info.html 52 | 2014/01/01 00:00:50 /info.html 53 | 2014/01/01 00:00:51 /info.html 54 | 2014/01/01 00:00:52 /info.html 55 | 2014/01/01 00:00:53 /info.html 56 | 2014/01/01 00:00:54 /info.html 57 | 2014/01/01 00:00:55 /info.html 58 | 2014/01/01 00:00:56 /info.html 59 | 2014/01/01 00:00:57 /info.html 60 | 2014/01/01 00:00:58 /info.html 61 | 2014/01/01 00:00:59 /info.html 62 | 2014/01/01 00:01:00 /info.html 63 | 2014/01/01 00:01:01 /info.html 64 | 2014/01/01 00:01:02 /info.html 65 | 2014/01/01 00:01:03 /info.html 66 | 2014/01/01 00:01:04 /info.html 
67 | 2014/01/01 00:01:05 /info.html 68 | 2014/01/01 00:01:06 /info.html 69 | 2014/01/01 00:01:07 /info.html 70 | 2014/01/01 00:01:08 /info.html 71 | 2014/01/01 00:01:09 /info.html 72 | 2014/01/01 00:01:10 /info.html 73 | 2014/01/01 00:01:11 /info.html 74 | 2014/01/01 00:01:12 /info.html 75 | 2014/01/01 00:01:13 /info.html 76 | 2014/01/01 00:01:14 /info.html 77 | 2014/01/01 00:01:15 /info.html 78 | 2014/01/01 00:01:16 /info.html 79 | 2014/01/01 00:01:17 /info.html 80 | 2014/01/01 00:01:18 /info.html 81 | 2014/01/01 00:01:19 /info.html 82 | 2014/01/01 00:01:20 /info.html 83 | 2014/01/01 00:01:21 /info.html 84 | 2014/01/01 00:01:22 /info.html 85 | 2014/01/01 00:01:23 /info.html 86 | 2014/01/01 00:01:24 /info.html 87 | 2014/01/01 00:01:25 /info.html 88 | 2014/01/01 00:01:26 /info.html 89 | 2014/01/01 00:01:27 /info.html 90 | 2014/01/01 00:01:28 /info.html 91 | 2014/01/01 00:01:29 /info.html 92 | 2014/01/01 00:01:30 /info.html 93 | 2014/01/01 00:01:31 /info.html 94 | 2014/01/01 00:01:32 /info.html 95 | 2014/01/01 00:01:33 /info.html 96 | 2014/01/01 00:01:34 /info.html 97 | 2014/01/01 00:01:35 /info.html 98 | 2014/01/01 00:01:36 /info.html 99 | 2014/01/01 00:01:37 /info.html 100 | 2014/01/01 00:01:38 /info.html 101 | 2014/01/01 00:01:39 /info.html 102 | 2014/01/01 00:01:40 /info.html 103 | 2014/01/01 00:01:41 /info.html 104 | 2014/01/01 00:01:42 /info.html 105 | 2014/01/01 00:01:43 /info.html 106 | 2014/01/01 00:01:44 /info.html 107 | 2014/01/01 00:01:45 /info.html 108 | 2014/01/01 00:01:46 /info.html 109 | 2014/01/01 00:01:47 /info.html 110 | 2014/01/01 00:01:48 /info.html 111 | 2014/01/01 00:01:49 /info.html 112 | 2014/01/01 00:01:50 /info.html 113 | 2014/01/01 00:01:51 /info.html 114 | 2014/01/01 00:01:52 /info.html 115 | 2014/01/01 00:01:53 /info.html 116 | 2014/01/01 00:01:54 /info.html 117 | 2014/01/01 00:01:55 /info.html 118 | 2014/01/01 00:01:56 /info.html 119 | 2014/01/01 00:01:57 /info.html 120 | 2014/01/01 00:01:58 /info.html 121 | 2014/01/01 00:01:59 
/info.html 122 | 2014/01/01 00:02:00 /info.html 123 | 2014/01/01 00:02:01 /info.html 124 | 2014/01/01 00:02:02 /info.html 125 | 2014/01/01 00:02:03 /info.html 126 | 2014/01/01 00:02:04 /info.html 127 | 2014/01/01 00:02:05 /info.html 128 | 2014/01/01 00:02:06 /info.html 129 | 2014/01/01 00:02:07 /info.html 130 | 2014/01/01 00:02:08 /info.html 131 | 2014/01/01 00:02:09 /info.html 132 | 2014/01/01 00:02:10 /info.html 133 | 2014/01/01 00:02:11 /info.html 134 | 2014/01/01 00:02:12 /info.html 135 | 2014/01/01 00:02:13 /info.html 136 | 2014/01/01 00:02:14 /info.html 137 | 2014/01/01 00:02:15 /info.html 138 | 2014/01/01 00:02:16 /info.html 139 | 2014/01/01 00:02:17 /info.html 140 | 2014/01/01 00:02:18 /info.html 141 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## bbloom: a bitset Bloom filter for go/golang 2 | === 3 | 4 | [![Build Status](https://travis-ci.org/AndreasBriese/bbloom.png?branch=master)](http://travis-ci.org/AndreasBriese/bbloom) 5 | 6 | package implements a fast bloom filter with real 'bitset' and JSONMarshal/JSONUnmarshal to store/reload the Bloom filter. 7 | 8 | NOTE: the package uses unsafe.Pointer to set and read the bits from the bitset. If you're uncomfortable with using the unsafe package, please consider using my bloom filter package at github.com/AndreasBriese/bloom 9 | 10 | === 11 | 12 | changelog 11/2015: new thread safe methods AddTS(), HasTS(), AddIfNotHasTS() following a suggestion from Srdjan Marinovic (github @a-little-srdjan), who used this to code a bloomfilter cache. 13 | 14 | This bloom filter was developed to strengthen a website-log database and was tested and optimized for this log-entry mask: "2014/%02i/%02i %02i:%02i:%02i /info.html". 15 | Nonetheless bbloom should work with any other form of entries. 16 | 17 | ~~Hash function is a modified Berkeley DB sdbm hash (to optimize for smaller strings). 
sdbm http://www.cse.yorku.ca/~oz/hash.html~~ 18 | 19 | Found sipHash (SipHash-2-4, a fast short-input PRF created by Jean-Philippe Aumasson and Daniel J. Bernstein.) to be about as fast. sipHash was ported to Go by Dmitry Chestnykh (github.com/dchest/siphash ) 20 | 21 | Minimum hashset size is: 512 ([8]uint64; will be set automatically). 22 | 23 | ### install 24 | 25 | ```sh 26 | go get github.com/AndreasBriese/bbloom 27 | ``` 28 | 29 | ### test 30 | + change to folder ../bbloom 31 | + create wordlist in file "words.txt" (you might use `python permut.py`) 32 | + run 'go test -bench=.' within the folder 33 | 34 | ```sh 35 | go test -bench=. 36 | ``` 37 | 38 | ~~If you've installed the GOCONVEY TDD-framework http://goconvey.co/ you can run the tests automatically.~~ 39 | 40 | using go's testing framework now (keep in mind that the op timing is related to 65536 operations of Add, Has, AddIfNotHas respectively) 41 | 42 | ### usage 43 | 44 | after installation add 45 | 46 | ```go 47 | import ( 48 | ... 49 | "github.com/AndreasBriese/bbloom" 50 | ... 51 | ) 52 | ``` 53 | 54 | at your header. 
In the program use 55 | 56 | ```go 57 | // create a bloom filter for 65536 items and 1 % false-positive ratio 58 | bf := bbloom.New(float64(1<<16), float64(0.01)) 59 | 60 | // or 61 | // create a bloom filter with 650000 bits for 65536 items and 7 locs per hash explicitly 62 | // bf = bbloom.New(float64(650000), float64(7)) 63 | // or 64 | bf = bbloom.New(650000.0, 7.0) 65 | 66 | // add one item 67 | bf.Add([]byte("butter")) 68 | 69 | // Number of elements added is exposed now 70 | // Note: ElemNum will not be included in JSON export (for compatibility with older versions) 71 | nOfElementsInFilter := bf.ElemNum 72 | 73 | // check if item is in the filter 74 | isIn := bf.Has([]byte("butter")) // should be true 75 | isNotIn := bf.Has([]byte("Butter")) // should be false 76 | 77 | // 'add only if item is new' to the bloomfilter 78 | added := bf.AddIfNotHas([]byte("butter")) // should be false because 'butter' is already in the set 79 | added = bf.AddIfNotHas([]byte("buTTer")) // should be true because 'buTTer' is new 80 | 81 | // thread safe versions for concurrent use: AddTS, HasTS, AddIfNotHasTS 82 | // add one item 83 | bf.AddTS([]byte("peanutbutter")) 84 | // check if item is in the filter 85 | isIn = bf.HasTS([]byte("peanutbutter")) // should be true 86 | isNotIn = bf.HasTS([]byte("peanutButter")) // should be false 87 | // 'add only if item is new' to the bloomfilter 88 | added = bf.AddIfNotHasTS([]byte("butter")) // should be false because 'butter' is already in the set 89 | added = bf.AddIfNotHasTS([]byte("peanutbuTTer")) // should be true because 'peanutbuTTer' is new 90 | 91 | // convert to JSON ([]byte) 92 | Json := bf.JSONMarshal() 93 | 94 | // the bloomfilter's Mutex is exposed for external un-/locking 95 | // e.g. 
mutex lock while doing JSON conversion 96 | bf.Mtx.Lock() 97 | Json = bf.JSONMarshal() 98 | bf.Mtx.Unlock() 99 | 100 | // restore a bloom filter from storage 101 | bfNew := bbloom.JSONUnmarshal(Json) 102 | 103 | isInNew := bfNew.Has([]byte("butter")) // should be true 104 | isNotInNew := bfNew.Has([]byte("Butter")) // should be false 105 | 106 | ``` 107 | 108 | to work with the bloom filter. 109 | 110 | ### why 'fast'? 111 | 112 | It's about 3 times faster than William Fitzgerald's bitset bloom filter https://github.com/willf/bloom . And it is about as fast as my []bool set variant for Bloom filters (see https://github.com/AndreasBriese/bloom ) but has an 8 times smaller memory footprint: 113 | 114 | 115 | Bloom filter (filter size 524288, 7 hashlocs) 116 | github.com/AndreasBriese/bbloom 'Add' 65536 items (10 repetitions): 6595800 ns (100 ns/op) 117 | github.com/AndreasBriese/bbloom 'Has' 65536 items (10 repetitions): 5986600 ns (91 ns/op) 118 | github.com/AndreasBriese/bloom 'Add' 65536 items (10 repetitions): 6304684 ns (96 ns/op) 119 | github.com/AndreasBriese/bloom 'Has' 65536 items (10 repetitions): 6568663 ns (100 ns/op) 120 | 121 | github.com/willf/bloom 'Add' 65536 items (10 repetitions): 24367224 ns (371 ns/op) 122 | github.com/willf/bloom 'Test' 65536 items (10 repetitions): 21881142 ns (333 ns/op) 123 | github.com/dataence/bloom/standard 'Add' 65536 items (10 repetitions): 23041644 ns (351 ns/op) 124 | github.com/dataence/bloom/standard 'Check' 65536 items (10 repetitions): 19153133 ns (292 ns/op) 125 | github.com/cabello/bloom 'Add' 65536 items (10 repetitions): 131921507 ns (2012 ns/op) 126 | github.com/cabello/bloom 'Contains' 65536 items (10 repetitions): 131108962 ns (2000 ns/op) 127 | 128 | (on MBPro15 OSX10.8.5 i7 4Core 2.4Ghz) 129 | 130 | 131 | With 32bit bloom filters (bloom32) using modified sdbm, bloom32 does hashing with only 2 bit shifts, one xor and one subtraction per byte. 
smdb is about as fast as fnv64a but gives less collisions with the dataset (see mask above). bloom.New(float64(10 * 1<<16),float64(7)) populated with 1<<16 random items from the dataset (see above) and tested against the rest results in less than 0.05% collisions. 132 | -------------------------------------------------------------------------------- /bbloom_test.go: -------------------------------------------------------------------------------- 1 | package bbloom 2 | 3 | import ( 4 | "bufio" 5 | "fmt" 6 | "log" 7 | "os" 8 | "testing" 9 | ) 10 | 11 | var ( 12 | wordlist1 [][]byte 13 | n = 1 << 16 14 | bf Bloom 15 | ) 16 | 17 | func TestMain(m *testing.M) { 18 | file, err := os.Open("words.txt") 19 | if err != nil { 20 | log.Fatal(err) 21 | } 22 | defer file.Close() 23 | scanner := bufio.NewScanner(file) 24 | wordlist1 = make([][]byte, n) 25 | for i := range wordlist1 { 26 | if scanner.Scan() { 27 | wordlist1[i] = []byte(scanner.Text()) 28 | } 29 | } 30 | if err := scanner.Err(); err != nil { 31 | log.Fatal(err) 32 | } 33 | fmt.Println("\n###############\nbbloom_test.go") 34 | fmt.Print("Benchmarks relate to 2**16 OP. 
--> output/65536 op/ns\n###############\n\n") 35 | 36 | m.Run() 37 | 38 | } 39 | 40 | func TestM_NumberOfWrongs(t *testing.T) { 41 | bf = New(float64(n*10), float64(7)) 42 | 43 | cnt := 0 44 | for i := range wordlist1 { 45 | if !bf.AddIfNotHas(wordlist1[i]) { 46 | cnt++ 47 | } 48 | } 49 | fmt.Printf("Bloomfilter New(7* 2**16, 7) (-> size=%v bit): \n Check for 'false positives': %v wrong positive 'Has' results on 2**16 entries => %v %%\n", len(bf.bitset)<<6, cnt, float64(cnt)/float64(n)) 50 | 51 | } 52 | 53 | func TestM_JSON(t *testing.T) { 54 | const shallBe = int(1 << 16) 55 | 56 | bf = New(float64(n*10), float64(7)) 57 | 58 | cnt := 0 59 | for i := range wordlist1 { 60 | if !bf.AddIfNotHas(wordlist1[i]) { 61 | cnt++ 62 | } 63 | } 64 | 65 | Json := bf.JSONMarshal() 66 | 67 | // create new bloomfilter from bloomfilter's JSON representation 68 | bf2 := JSONUnmarshal(Json) 69 | 70 | cnt2 := 0 71 | for i := range wordlist1 { 72 | if !bf2.AddIfNotHas(wordlist1[i]) { 73 | cnt2++ 74 | } 75 | } 76 | 77 | if cnt2 != shallBe { 78 | t.Errorf("FAILED !AddIfNotHas = %v; want %v", cnt2, shallBe) 79 | } 80 | 81 | } 82 | 83 | func ExampleM_NewAddHasAddIfNotHas() { 84 | bf := New(float64(512), float64(1)) 85 | 86 | fmt.Printf("%v %v %v %v\n", bf.sizeExp, bf.size, bf.setLocs, bf.shift) 87 | 88 | bf.Add([]byte("Manfred")) 89 | fmt.Println("bf.Add([]byte(\"Manfred\"))") 90 | fmt.Printf("bf.Has([]byte(\"Manfred\")) -> %v - should be true\n", bf.Has([]byte("Manfred"))) 91 | fmt.Printf("bf.Add([]byte(\"manfred\")) -> %v - should be false\n", bf.Has([]byte("manfred"))) 92 | fmt.Printf("bf.AddIfNotHas([]byte(\"Manfred\")) -> %v - should be false\n", bf.AddIfNotHas([]byte("Manfred"))) 93 | fmt.Printf("bf.AddIfNotHas([]byte(\"manfred\")) -> %v - should be true\n", bf.AddIfNotHas([]byte("manfred"))) 94 | 95 | bf.AddTS([]byte("Hans")) 96 | fmt.Println("bf.AddTS([]byte(\"Hans\")") 97 | fmt.Printf("bf.HasTS([]byte(\"Hans\")) -> %v - should be true\n", bf.HasTS([]byte("Hans"))) 98 | 
fmt.Printf("bf.AddTS([]byte(\"hans\")) -> %v - should be false\n", bf.HasTS([]byte("hans"))) 99 | fmt.Printf("bf.AddIfNotHasTS([]byte(\"Hans\")) -> %v - should be false\n", bf.AddIfNotHasTS([]byte("Hans"))) 100 | fmt.Printf("bf.AddIfNotHasTS([]byte(\"hans\")) -> %v - should be true\n", bf.AddIfNotHasTS([]byte("hans"))) 101 | 102 | // Output: 9 511 1 55 103 | // bf.Add([]byte("Manfred")) 104 | // bf.Has([]byte("Manfred")) -> true - should be true 105 | // bf.Add([]byte("manfred")) -> false - should be false 106 | // bf.AddIfNotHas([]byte("Manfred")) -> false - should be false 107 | // bf.AddIfNotHas([]byte("manfred")) -> true - should be true 108 | // bf.AddTS([]byte("Hans") 109 | // bf.HasTS([]byte("Hans")) -> true - should be true 110 | // bf.AddTS([]byte("hans")) -> false - should be false 111 | // bf.AddIfNotHasTS([]byte("Hans")) -> false - should be false 112 | // bf.AddIfNotHasTS([]byte("hans")) -> true - should be true 113 | } 114 | 115 | func BenchmarkM_New(b *testing.B) { 116 | for r := 0; r < b.N; r++ { 117 | _ = New(float64(n*10), float64(7)) 118 | } 119 | } 120 | 121 | func BenchmarkM_Clear(b *testing.B) { 122 | bf = New(float64(n*10), float64(7)) 123 | for i := range wordlist1 { 124 | bf.Add(wordlist1[i]) 125 | } 126 | b.ResetTimer() 127 | for r := 0; r < b.N; r++ { 128 | bf.Clear() 129 | } 130 | } 131 | 132 | func BenchmarkM_Add(b *testing.B) { 133 | bf = New(float64(n*10), float64(7)) 134 | b.ResetTimer() 135 | for r := 0; r < b.N; r++ { 136 | for i := range wordlist1 { 137 | bf.Add(wordlist1[i]) 138 | } 139 | } 140 | 141 | } 142 | 143 | func BenchmarkM_Has(b *testing.B) { 144 | b.ResetTimer() 145 | for r := 0; r < b.N; r++ { 146 | for i := range wordlist1 { 147 | bf.Has(wordlist1[i]) 148 | } 149 | } 150 | 151 | } 152 | 153 | func BenchmarkM_AddIfNotHasFALSE(b *testing.B) { 154 | bf = New(float64(n*10), float64(7)) 155 | for i := range wordlist1 { 156 | bf.Has(wordlist1[i]) 157 | } 158 | b.ResetTimer() 159 | for r := 0; r < b.N; r++ { 160 | for i := 
range wordlist1 { 161 | bf.AddIfNotHas(wordlist1[i]) 162 | } 163 | } 164 | } 165 | 166 | func BenchmarkM_AddIfNotHasClearTRUE(b *testing.B) { 167 | bf = New(float64(n*10), float64(7)) 168 | 169 | b.ResetTimer() 170 | for r := 0; r < b.N; r++ { 171 | for i := range wordlist1 { 172 | bf.AddIfNotHas(wordlist1[i]) 173 | } 174 | bf.Clear() 175 | } 176 | } 177 | 178 | func BenchmarkM_AddTS(b *testing.B) { 179 | bf = New(float64(n*10), float64(7)) 180 | 181 | b.ResetTimer() 182 | for r := 0; r < b.N; r++ { 183 | for i := range wordlist1 { 184 | bf.AddTS(wordlist1[i]) 185 | } 186 | } 187 | 188 | } 189 | 190 | func BenchmarkM_HasTS(b *testing.B) { 191 | b.ResetTimer() 192 | for r := 0; r < b.N; r++ { 193 | for i := range wordlist1 { 194 | bf.HasTS(wordlist1[i]) 195 | } 196 | } 197 | 198 | } 199 | 200 | func BenchmarkM_AddIfNotHasTSFALSE(b *testing.B) { 201 | bf = New(float64(n*10), float64(7)) 202 | for i := range wordlist1 { 203 | bf.Has(wordlist1[i]) 204 | } 205 | b.ResetTimer() 206 | for r := 0; r < b.N; r++ { 207 | for i := range wordlist1 { 208 | bf.AddIfNotHasTS(wordlist1[i]) 209 | } 210 | } 211 | } 212 | 213 | func BenchmarkM_AddIfNotHasTSClearTRUE(b *testing.B) { 214 | bf = New(float64(n*10), float64(7)) 215 | 216 | b.ResetTimer() 217 | for r := 0; r < b.N; r++ { 218 | for i := range wordlist1 { 219 | bf.AddIfNotHasTS(wordlist1[i]) 220 | } 221 | bf.Clear() 222 | } 223 | 224 | } 225 | -------------------------------------------------------------------------------- /bbloom.go: -------------------------------------------------------------------------------- 1 | // The MIT License (MIT) 2 | // Copyright (c) 2014 Andreas Briese, eduToolbox@Bri-C GmbH, Sarstedt 3 | 4 | // Permission is hereby granted, free of charge, to any person obtaining a copy of 5 | // this software and associated documentation files (the "Software"), to deal in 6 | // the Software without restriction, including without limitation the rights to 7 | // use, copy, modify, merge, publish, distribute, 
sublicense, and/or sell copies of 8 | // the Software, and to permit persons to whom the Software is furnished to do so, 9 | // subject to the following conditions: 10 | 11 | // The above copyright notice and this permission notice shall be included in all 12 | // copies or substantial portions of the Software. 13 | 14 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 16 | // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 17 | // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 18 | // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 19 | // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 20 | 21 | // 2019/08/25 code revision to reduce unsafe use 22 | // Parts are adopted from the fork at ipfs/bbloom after performance rev by 23 | // Steve Allen (https://github.com/Stebalien) 24 | // (see https://github.com/ipfs/bbloom/blob/master/bbloom.go) 25 | // -> func Has 26 | // -> func set 27 | // -> func add 28 | 29 | package bbloom 30 | 31 | import ( 32 | "bytes" 33 | "encoding/json" 34 | "log" 35 | "math" 36 | "sync" 37 | "unsafe" 38 | ) 39 | 40 | // helper 41 | // not needed anymore by Set 42 | // var mask = []uint8{1, 2, 4, 8, 16, 32, 64, 128} 43 | 44 | func getSize(ui64 uint64) (size uint64, exponent uint64) { 45 | if ui64 < uint64(512) { 46 | ui64 = uint64(512) 47 | } 48 | size = uint64(1) 49 | for size < ui64 { 50 | size <<= 1 51 | exponent++ 52 | } 53 | return size, exponent 54 | } 55 | 56 | func calcSizeByWrongPositives(numEntries, wrongs float64) (uint64, uint64) { 57 | size := -1 * numEntries * math.Log(wrongs) / math.Pow(float64(0.69314718056), 2) 58 | locs := math.Ceil(float64(0.69314718056) * size / numEntries) 59 | return uint64(size), uint64(locs) 60 | } 61 | 62 | // New 63 | // returns a new bloomfilter 64 | func New(params 
...float64) (bloomfilter Bloom) { 65 | var entries, locs uint64 66 | if len(params) == 2 { 67 | if params[1] < 1 { 68 | entries, locs = calcSizeByWrongPositives(params[0], params[1]) 69 | } else { 70 | entries, locs = uint64(params[0]), uint64(params[1]) 71 | } 72 | } else { 73 | log.Fatal("usage: New(float64(number_of_entries), float64(number_of_hashlocations)) i.e. New(float64(1000), float64(3)) or New(float64(number_of_entries), float64(number_of_hashlocations)) i.e. New(float64(1000), float64(0.03))") 74 | } 75 | size, exponent := getSize(uint64(entries)) 76 | bloomfilter = Bloom{ 77 | Mtx: &sync.Mutex{}, 78 | sizeExp: exponent, 79 | size: size - 1, 80 | setLocs: locs, 81 | shift: 64 - exponent, 82 | } 83 | bloomfilter.Size(size) 84 | return bloomfilter 85 | } 86 | 87 | // NewWithBoolset 88 | // takes a []byte slice and number of locs per entry 89 | // returns the bloomfilter with a bitset populated according to the input []byte 90 | func NewWithBoolset(bs *[]byte, locs uint64) (bloomfilter Bloom) { 91 | bloomfilter = New(float64(len(*bs)<<3), float64(locs)) 92 | for i, b := range *bs { 93 | *(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&bloomfilter.bitset[0])) + uintptr(i))) = b 94 | } 95 | return bloomfilter 96 | } 97 | 98 | // bloomJSONImExport 99 | // Im/Export structure used by JSONMarshal / JSONUnmarshal 100 | type bloomJSONImExport struct { 101 | FilterSet []byte 102 | SetLocs uint64 103 | } 104 | 105 | // JSONUnmarshal 106 | // takes JSON-Object (type bloomJSONImExport) as []bytes 107 | // returns Bloom object 108 | func JSONUnmarshal(dbData []byte) Bloom { 109 | bloomImEx := bloomJSONImExport{} 110 | json.Unmarshal(dbData, &bloomImEx) 111 | buf := bytes.NewBuffer(bloomImEx.FilterSet) 112 | bs := buf.Bytes() 113 | bf := NewWithBoolset(&bs, bloomImEx.SetLocs) 114 | return bf 115 | } 116 | 117 | // 118 | // Bloom filter 119 | type Bloom struct { 120 | Mtx *sync.Mutex 121 | ElemNum uint64 122 | bitset []uint64 123 | sizeExp uint64 124 | size uint64 125 | 
setLocs uint64 126 | shift uint64 127 | } 128 | 129 | // <--- http://www.cse.yorku.ca/~oz/hash.html 130 | // modified Berkeley DB Hash (32bit) 131 | // hash is casted to l, h = 16bit fragments 132 | // func (bl Bloom) absdbm(b *[]byte) (l, h uint64) { 133 | // hash := uint64(len(*b)) 134 | // for _, c := range *b { 135 | // hash = uint64(c) + (hash << 6) + (hash << bl.sizeExp) - hash 136 | // } 137 | // h = hash >> bl.shift 138 | // l = hash << bl.shift >> bl.shift 139 | // return l, h 140 | // } 141 | 142 | // Update: found sipHash of Jean-Philippe Aumasson & Daniel J. Bernstein to be even faster than absdbm() 143 | // https://131002.net/siphash/ 144 | // siphash was implemented for Go by Dmitry Chestnykh https://github.com/dchest/siphash 145 | 146 | // Add 147 | // set the bit(s) for entry; Adds an entry to the Bloom filter 148 | func (bl *Bloom) Add(entry []byte) { 149 | l, h := bl.sipHash(entry) 150 | for i := uint64(0); i < bl.setLocs; i++ { 151 | bl.set((h + i*l) & bl.size) 152 | bl.ElemNum++ 153 | } 154 | } 155 | 156 | // AddTS 157 | // Thread safe: Mutex.Lock the bloomfilter for the time of processing the entry 158 | func (bl *Bloom) AddTS(entry []byte) { 159 | bl.Mtx.Lock() 160 | defer bl.Mtx.Unlock() 161 | bl.Add(entry) 162 | } 163 | 164 | // Has 165 | // check if bit(s) for entry is/are set 166 | // returns true if the entry was added to the Bloom Filter 167 | func (bl Bloom) Has(entry []byte) bool { 168 | l, h := bl.sipHash(entry) 169 | res := true 170 | for i := uint64(0); i < bl.setLocs; i++ { 171 | res = res && bl.isSet((h+i*l)&bl.size) 172 | // https://github.com/ipfs/bbloom/commit/84e8303a9bfb37b2658b85982921d15bbb0fecff 173 | // // Branching here (early escape) is not worth it 174 | // // This is my conclusion from benchmarks 175 | // // (prevents loop unrolling) 176 | // switch bl.IsSet((h + i*l) & bl.size) { 177 | // case false: 178 | // return false 179 | // } 180 | } 181 | return res 182 | } 183 | 184 | // HasTS 185 | // Thread safe: 
Mutex.Lock the bloomfilter for the time of processing the entry 186 | func (bl *Bloom) HasTS(entry []byte) bool { 187 | bl.Mtx.Lock() 188 | defer bl.Mtx.Unlock() 189 | return bl.Has(entry) 190 | } 191 | 192 | // AddIfNotHas 193 | // Only Add entry if it's not present in the bloomfilter 194 | // returns true if entry was added 195 | // returns false if entry was allready registered in the bloomfilter 196 | func (bl Bloom) AddIfNotHas(entry []byte) (added bool) { 197 | if bl.Has(entry) { 198 | return added 199 | } 200 | bl.Add(entry) 201 | return true 202 | } 203 | 204 | // AddIfNotHasTS 205 | // Tread safe: Only Add entry if it's not present in the bloomfilter 206 | // returns true if entry was added 207 | // returns false if entry was allready registered in the bloomfilter 208 | func (bl *Bloom) AddIfNotHasTS(entry []byte) (added bool) { 209 | bl.Mtx.Lock() 210 | defer bl.Mtx.Unlock() 211 | return bl.AddIfNotHas(entry) 212 | } 213 | 214 | // Size 215 | // make Bloom filter with as bitset of size sz 216 | func (bl *Bloom) Size(sz uint64) { 217 | bl.bitset = make([]uint64, sz>>6) 218 | } 219 | 220 | // Clear 221 | // resets the Bloom filter 222 | func (bl *Bloom) Clear() { 223 | bs := bl.bitset 224 | for i := range bs { 225 | bs[i] = 0 226 | } 227 | } 228 | 229 | // Set 230 | // set the bit[idx] of bitsit 231 | func (bl *Bloom) set(idx uint64) { 232 | // ommit unsafe 233 | // *(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3))) |= mask[idx%8] 234 | bl.bitset[idx>>6] |= 1 << (idx % 64) 235 | } 236 | 237 | // IsSet 238 | // check if bit[idx] of bitset is set 239 | // returns true/false 240 | func (bl *Bloom) isSet(idx uint64) bool { 241 | // ommit unsafe 242 | // return (((*(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3)))) >> (idx % 8)) & 1) == 1 243 | return bl.bitset[idx>>6]&(1<<(idx%64)) != 0 244 | } 245 | 246 | // JSONMarshal 247 | // returns JSON-object (type bloomJSONImExport) as 
[]byte 248 | func (bl Bloom) JSONMarshal() []byte { 249 | bloomImEx := bloomJSONImExport{} 250 | bloomImEx.SetLocs = uint64(bl.setLocs) 251 | bloomImEx.FilterSet = make([]byte, len(bl.bitset)<<3) 252 | for i := range bloomImEx.FilterSet { 253 | bloomImEx.FilterSet[i] = *(*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[0])) + uintptr(i))) 254 | } 255 | data, err := json.Marshal(bloomImEx) 256 | if err != nil { 257 | log.Fatal("json.Marshal failed: ", err) 258 | } 259 | return data 260 | } 261 | 262 | // // alternative hashFn 263 | // func (bl Bloom) fnv64a(b *[]byte) (l, h uint64) { 264 | // h64 := fnv.New64a() 265 | // h64.Write(*b) 266 | // hash := h64.Sum64() 267 | // h = hash >> 32 268 | // l = hash << 32 >> 32 269 | // return l, h 270 | // } 271 | // 272 | // // <-- http://partow.net/programming/hashfunctions/index.html 273 | // // citation: An algorithm proposed by Donald E. Knuth in The Art Of Computer Programming Volume 3, 274 | // // under the topic of sorting and search chapter 6.4. 275 | // // modified to fit with boolset-length 276 | // func (bl Bloom) DEKHash(b *[]byte) (l, h uint64) { 277 | // hash := uint64(len(*b)) 278 | // for _, c := range *b { 279 | // hash = ((hash << 5) ^ (hash >> bl.shift)) ^ uint64(c) 280 | // } 281 | // h = hash >> bl.shift 282 | // l = hash << bl.sizeExp >> bl.sizeExp 283 | // return l, h 284 | // } 285 | --------------------------------------------------------------------------------