├── go.sum ├── go.mod ├── v2 ├── go.mod ├── conformance_test.go ├── iscompatible.go ├── MIT-LICENSE.txt ├── optimal_test.go ├── statistics.go ├── fileio.go ├── binarymarshaler.go ├── new.go ├── binaryunmarshaler.go ├── bloomfilter.go ├── fileio_test.go └── bloomfilter_test.go ├── codecov.yml ├── .deepsource.toml ├── .circleci └── config.yml ├── MIT-LICENSE.txt └── README.md /go.sum: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/holiman/bloomfilter 2 | 3 | go 1.15 4 | -------------------------------------------------------------------------------- /v2/go.mod: -------------------------------------------------------------------------------- 1 | module github.com/holiman/bloomfilter/v2 2 | 3 | go 1.15 4 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | 2 | codecov: 3 | require_ci_to_pass: no 4 | 5 | coverage: 6 | status: 7 | project: no 8 | patch: no 9 | 10 | comment: 11 | layout: "diff" 12 | -------------------------------------------------------------------------------- /.deepsource.toml: -------------------------------------------------------------------------------- 1 | version = 1 2 | 3 | test_patterns = ["*_test.go"] 4 | 5 | [[analyzers]] 6 | name = "go" 7 | enabled = true 8 | 9 | [analyzers.meta] 10 | import_paths = ["github.com/holiman/bloomfilter"] -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | # Golang CircleCI 2.0 configuration file 2 | # 3 | # Check https://circleci.com/docs/2.0/language-go/ for more details 4 | version: 2 5 | jobs: 6 | build: 7 | docker: 8 | # 
specify the version 9 | - image: cimg/go:1.19 10 | 11 | steps: 12 | - checkout 13 | 14 | # specify any bash command here prefixed with `run: ` 15 | - run: (cd v2 && go test -v ./... -coverprofile=coverage.txt -covermode=count ) 16 | - run: 17 | name: "Codecov upload" 18 | command: bash <(curl -s https://codecov.io/bash) 19 | - run: 20 | name: "Install tools" 21 | command: curl -sSfL https://raw.githubusercontent.com/golangci/golangci-lint/master/install.sh | sh -s -- -b $(go env GOPATH)/bin v1.54.2 22 | - run: 23 | name: "Lint" 24 | command: (cd v2 && golangci-lint run) 25 | 26 | -------------------------------------------------------------------------------- /v2/conformance_test.go: -------------------------------------------------------------------------------- 1 | // Package bloomfilter is face-meltingly fast, thread-safe, 2 | // marshalable, unionable, probability- and 3 | // optimal-size-calculating Bloom filter in go 4 | // 5 | // https://github.com/holiman/bloomfilter 6 | // 7 | // Original source: 8 | // https://github.com/steakknife/bloomfilter 9 | // 10 | // Copyright © 2014, 2015, 2018 Barry Allard 11 | // Copyright © 2020 Martin Holst Swende 12 | // 13 | // MIT license 14 | // 15 | 16 | package v2 17 | 18 | import ( 19 | "encoding" 20 | "encoding/json" 21 | "io" 22 | ) 23 | 24 | // compile-time conformance tests 25 | var ( 26 | _ encoding.BinaryMarshaler = (*Filter)(nil) 27 | _ encoding.BinaryUnmarshaler = (*Filter)(nil) 28 | _ io.ReaderFrom = (*Filter)(nil) 29 | _ io.WriterTo = (*Filter)(nil) 30 | _ json.Marshaler = (*Filter)(nil) 31 | _ json.Unmarshaler = (*Filter)(nil) 32 | ) 33 | -------------------------------------------------------------------------------- /v2/iscompatible.go: -------------------------------------------------------------------------------- 1 | // Package bloomfilter is face-meltingly fast, thread-safe, 2 | // marshalable, unionable, probability- and 3 | // optimal-size-calculating Bloom filter in go 4 | // 5 | // 
// noBranchCompareUint64s returns 0 if and only if b0 and b1 have the same
// length and hold the same words in the same order; any other input yields a
// non-zero value.
//
// IsCompatible calls this unconditionally, even when the two filters have a
// different number of keys, so the helper must tolerate slices of different
// lengths instead of indexing past the end of the shorter one.
func noBranchCompareUint64s(b0, b1 []uint64) uint64 {
	// A length mismatch alone makes the result non-zero.
	r := uint64(len(b0)) ^ uint64(len(b1))
	// Only the common prefix can be compared word by word.
	if len(b1) < len(b0) {
		b0 = b0[:len(b1)]
	}
	for i, b0i := range b0 {
		r |= b0i ^ b1[i]
	}
	return r
}
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 9 | -------------------------------------------------------------------------------- /v2/MIT-LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright © 2014, 2015 Barry Allard 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 5 | 6 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 7 | 8 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
9 | -------------------------------------------------------------------------------- /v2/optimal_test.go: -------------------------------------------------------------------------------- 1 | // Package bloomfilter is face-meltingly fast, thread-safe, 2 | // marshalable, unionable, probability- and 3 | // optimal-size-calculating Bloom filter in go 4 | // 5 | // https://github.com/holiman/bloomfilter 6 | // 7 | // Original source: 8 | // https://github.com/steakknife/bloomfilter 9 | // 10 | // Copyright © 2014, 2015, 2018 Barry Allard 11 | // Copyright © 2020 Martin Holst Swende 12 | // 13 | // MIT license 14 | // 15 | 16 | package v2 17 | 18 | import ( 19 | "testing" 20 | ) 21 | 22 | func TestOptimal(t *testing.T) { 23 | tests := []struct { 24 | n uint64 25 | p float64 26 | k, m uint64 27 | }{ 28 | { 29 | n: 1000, 30 | p: 0.01 / 100, 31 | k: 14, 32 | m: 19171, 33 | }, 34 | { 35 | n: 10000, 36 | p: 0.01 / 100, 37 | k: 14, 38 | m: 191702, 39 | }, 40 | { 41 | n: 10000, 42 | p: 0.01 / 100, 43 | k: 14, 44 | m: 191702, 45 | }, 46 | { 47 | n: 1000, 48 | p: 0.001 / 100, 49 | k: 17, 50 | m: 23963, 51 | }, 52 | } 53 | 54 | for _, test := range tests { 55 | m := OptimalM(test.n, test.p) 56 | k := OptimalK(m, test.n) 57 | 58 | if k != test.k || m != test.m { 59 | t.Errorf( 60 | "n=%d p=%f: expected (m=%d, k=%d), got (m=%d, k=%d)", 61 | test.n, 62 | test.p, 63 | test.m, 64 | test.k, 65 | m, 66 | k, 67 | ) 68 | } 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /v2/statistics.go: -------------------------------------------------------------------------------- 1 | // Package bloomfilter is face-meltingly fast, thread-safe, 2 | // marshalable, unionable, probability- and 3 | // optimal-size-calculating Bloom filter in go 4 | // 5 | // https://github.com/holiman/bloomfilter 6 | // 7 | // Original source: 8 | // https://github.com/steakknife/bloomfilter 9 | // 10 | // Copyright © 2014, 2015, 2018 Barry Allard 11 | // Copyright © 
// CountBitsUint64s returns the total number of set bits across all words of b.
// A nil or empty slice yields 0.
func CountBitsUint64s(b []uint64) int {
	total := 0
	for i := range b {
		total += bits.OnesCount64(b[i])
	}
	return total
}
err := ReadFrom(r) 29 | if err != nil { 30 | return -1, err 31 | } 32 | f.lock.Lock() 33 | defer f.lock.Unlock() 34 | f.m = f2.m 35 | f.n = f2.n 36 | f.bits = f2.bits 37 | f.keys = f2.keys 38 | return n, nil 39 | } 40 | 41 | // ReadFrom Reader r into a lossless-compressed Bloom filter f 42 | func ReadFrom(r io.Reader) (f *Filter, n int64, err error) { 43 | f = new(Filter) 44 | rawR, err := gzip.NewReader(r) 45 | if err != nil { 46 | return nil, -1, err 47 | } 48 | defer rawR.Close() 49 | n, err = f.UnmarshalFromReader(rawR) 50 | if err != nil { 51 | return nil, -1, err 52 | } 53 | return f, n, nil 54 | } 55 | 56 | // ReadFile from filename into a lossless-compressed Bloom Filter f 57 | // Suggested file extension: .bf.gz 58 | func ReadFile(filename string) (f *Filter, n int64, err error) { 59 | r, err := os.Open(filename) 60 | if err != nil { 61 | return nil, -1, err 62 | } 63 | defer r.Close() 64 | 65 | return ReadFrom(r) 66 | } 67 | 68 | // WriteTo a Writer w from lossless-compressed Bloom Filter f 69 | func (f *Filter) WriteTo(w io.Writer) (n int64, err error) { 70 | rawW := gzip.NewWriter(w) 71 | defer rawW.Close() 72 | 73 | intN, _, err := f.MarshallToWriter(rawW) 74 | n = int64(intN) 75 | return n, err 76 | } 77 | 78 | // WriteFile filename from a a lossless-compressed Bloom Filter f 79 | // Suggested file extension: .bf.gz 80 | func (f *Filter) WriteFile(filename string) (n int64, err error) { 81 | w, err := os.Create(filename) 82 | if err != nil { 83 | return -1, err 84 | } 85 | defer w.Close() 86 | 87 | return f.WriteTo(w) 88 | } 89 | 90 | type jsonType struct { 91 | Version string `json:"version"` 92 | Bits []uint64 `json:"bits"` 93 | Keys []uint64 `json:"keys"` 94 | M uint64 `json:"m"` 95 | N uint64 `json:"n"` 96 | } 97 | 98 | func (f *Filter) MarshalJSON() ([]byte, error) { 99 | return json.Marshal(&jsonType{ 100 | string(version), 101 | f.bits, 102 | f.keys, 103 | f.m, 104 | f.n, 105 | }) 106 | } 107 | 108 | func (f *Filter) UnmarshalJSON(data []byte) 
error { 109 | var j jsonType 110 | if err := json.Unmarshal(data, &j); err != nil { 111 | return err 112 | } 113 | if j.Version != string(version) { 114 | return errors.New("incompatible version") 115 | } 116 | f.bits = j.Bits 117 | f.keys = j.Keys 118 | f.n = j.N 119 | f.m = j.M 120 | return nil 121 | } 122 | -------------------------------------------------------------------------------- /v2/binarymarshaler.go: -------------------------------------------------------------------------------- 1 | // Package bloomfilter is face-meltingly fast, thread-safe, 2 | // marshalable, unionable, probability- and 3 | // optimal-size-calculating Bloom filter in go 4 | // 5 | // https://github.com/holiman/bloomfilter 6 | // 7 | // Original source: 8 | // https://github.com/steakknife/bloomfilter 9 | // 10 | // Copyright © 2014, 2015, 2018 Barry Allard 11 | // Copyright © 2020 Martin Holst Swende 12 | // 13 | // MIT license 14 | // 15 | 16 | package v2 17 | 18 | import ( 19 | "bytes" 20 | "crypto/sha512" 21 | "encoding/binary" 22 | "io" 23 | ) 24 | 25 | // headerMagic is used to disambiguate between this package and the original 26 | // steakknife implementation. 27 | // Since the key hashing algorithm has changed, the format is no longer 28 | // binary compatible 29 | var version = []byte("v02\n") 30 | var headerMagic = append([]byte{0, 0, 0, 0, 0, 0, 0, 0}, version...) 
// counter is an io.Writer that discards its input and only keeps a running
// total of how many bytes it was asked to write.
type counter struct {
	bytes int
}

// Write adds len(p) to the running total. It never fails and always reports
// the whole of p as written.
func (c *counter) Write(p []byte) (n int, err error) {
	n = len(p)
	c.bytes += n
	return n, nil
}
Otherwise, the binary.Write will allocate a 77 | // same-size slice of bytes, doubling the memory usage 78 | var chunkSize = len(f.bits) / 20 79 | if chunkSize < 512 { 80 | chunkSize = 512 // Min 4K bytes (512 uint64s) 81 | } 82 | buf := make([]byte, chunkSize*8) 83 | for start := 0; start < len(f.bits); { 84 | end := start + chunkSize 85 | if end > len(f.bits) { 86 | end = len(f.bits) 87 | } 88 | for i, x := range f.bits[start:end] { 89 | binary.LittleEndian.PutUint64(buf[8*i:], x) 90 | } 91 | if _, err := mw.Write(buf[0 : (end-start)*8]); err != nil { 92 | return c.bytes, hash, err 93 | } 94 | start = end 95 | } 96 | // Now we stop using the multiwriter, pick out the hash of what we've 97 | // written so far, and then write the hash to the output 98 | hashbytes := hasher.Sum(nil) 99 | copy(hash[:], hashbytes[:sha512.Size384]) 100 | err := binary.Write(out, binary.LittleEndian, hashbytes) 101 | return c.bytes + len(hashbytes), hash, err 102 | } 103 | 104 | // MarshalBinary converts a Filter into []bytes 105 | func (f *Filter) MarshalBinary() (data []byte, err error) { 106 | buf := new(bytes.Buffer) 107 | _, _, err = f.MarshallToWriter(buf) 108 | if err != nil { 109 | return nil, err 110 | } 111 | data = buf.Bytes() 112 | return data, nil 113 | } 114 | -------------------------------------------------------------------------------- /v2/new.go: -------------------------------------------------------------------------------- 1 | // Package bloomfilter is face-meltingly fast, thread-safe, 2 | // marshalable, unionable, probability- and 3 | // optimal-size-calculating Bloom filter in go 4 | // 5 | // https://github.com/holiman/bloomfilter 6 | // 7 | // Original source: 8 | // https://github.com/steakknife/bloomfilter 9 | // 10 | // Copyright © 2014, 2015, 2018 Barry Allard 11 | // Copyright © 2020 Martin Holst Swende 12 | // 13 | // MIT license 14 | // 15 | 16 | package v2 17 | 18 | import ( 19 | crand "crypto/rand" 20 | "encoding/binary" 21 | "fmt" 22 | "math" 23 | ) 
// newRandKeys returns k keys filled with bytes read from crypto/rand.Reader,
// decoded as little-endian uint64s.
//
// m is not used by this function.
//
// It panics if the random source cannot deliver the requested bytes; the
// panic message reports the number of bytes that were actually requested
// (8 per key) rather than the size of a single key.
func newRandKeys(m uint64, k uint64) []uint64 {
	keys := make([]uint64, k)
	if err := binary.Read(crand.Reader, binary.LittleEndian, keys); err != nil {
		panic(fmt.Sprintf("Cannot read %d bytes from CSRPNG crypto/rand.Read (err=%v)",
			binary.Size(keys), err))
	}
	return keys
}
return true 84 | } 85 | 86 | // NewWithKeys creates a new Filter from user-supplied origKeys 87 | func NewWithKeys(m uint64, origKeys []uint64) (f *Filter, err error) { 88 | var ( 89 | bits []uint64 90 | keys []uint64 91 | ) 92 | if bits, err = newBits(m); err != nil { 93 | return nil, err 94 | } 95 | if keys, err = newKeysCopy(origKeys); err != nil { 96 | return nil, err 97 | } 98 | return &Filter{ 99 | m: m, 100 | n: 0, 101 | bits: bits, 102 | keys: keys, 103 | }, nil 104 | } 105 | 106 | func newBits(m uint64) ([]uint64, error) { 107 | if m < MMin { 108 | return nil, fmt.Errorf("number of bits in the filter must be >= %d (was %d)", MMin, m) 109 | } 110 | return make([]uint64, (m+63)/64), nil 111 | } 112 | 113 | func newKeysCopy(origKeys []uint64) (keys []uint64, err error) { 114 | if len(origKeys) < KMin { 115 | return nil, fmt.Errorf("keys must have length %d or greater (was %d)", KMin, len(origKeys)) 116 | } 117 | if !uniqueKeys(origKeys) { 118 | return nil, fmt.Errorf("Bloom filter keys must be unique") 119 | } 120 | keys = append(keys, origKeys...) 
121 | return keys, err 122 | } 123 | -------------------------------------------------------------------------------- /v2/binaryunmarshaler.go: -------------------------------------------------------------------------------- 1 | // Package bloomfilter is face-meltingly fast, thread-safe, 2 | // marshalable, unionable, probability- and 3 | // optimal-size-calculating Bloom filter in go 4 | // 5 | // https://github.com/holiman/bloomfilter 6 | // 7 | // Original source: 8 | // https://github.com/steakknife/bloomfilter 9 | // 10 | // Copyright © 2014, 2015, 2018 Barry Allard 11 | // Copyright © 2020 Martin Holst Swende 12 | // 13 | // MIT license 14 | // 15 | 16 | package v2 17 | 18 | import ( 19 | "bytes" 20 | "crypto/sha512" 21 | "encoding/binary" 22 | "fmt" 23 | "hash" 24 | "io" 25 | ) 26 | 27 | func unmarshalBinaryHeader(r io.Reader) (k, n, m uint64, err error) { 28 | magic := make([]byte, len(headerMagic)) 29 | if _, err := io.ReadFull(r, magic); err != nil { 30 | return 0, 0, 0, err 31 | } 32 | if !bytes.Equal(magic, headerMagic) { 33 | return 0, 0, 0, fmt.Errorf("incompatible version (wrong magic), got %x", magic) 34 | } 35 | var knm = make([]uint64, 3) 36 | err = binary.Read(r, binary.LittleEndian, knm) 37 | if err != nil { 38 | return 0, 0, 0, err 39 | } 40 | k = knm[0] 41 | n = knm[1] 42 | m = knm[2] 43 | if k < KMin { 44 | return 0, 0, 0, fmt.Errorf("keys must have length %d or greater (was %d)", KMin, k) 45 | } 46 | if m < MMin { 47 | return 0, 0, 0, fmt.Errorf("number of bits in the filter must be >= %d (was %d)", MMin, m) 48 | } 49 | return k, n, m, err 50 | } 51 | 52 | func unmarshalBinaryBits(r io.Reader, m uint64) (bits []uint64, err error) { 53 | bits, err = newBits(m) 54 | if err != nil { 55 | return bits, err 56 | } 57 | bs := make([]byte, 8) 58 | for i := 0; i < len(bits) && err == nil; i++ { 59 | _, err = io.ReadFull(r, bs) 60 | bits[i] = binary.LittleEndian.Uint64(bs) 61 | } 62 | if err != nil { 63 | return nil, err 64 | } 65 | return bits, nil 
// hashingReader can be used to read from a reader, and simultaneously
// do a hash on the bytes that were read
type hashingReader struct {
	reader io.Reader // underlying source
	hasher hash.Hash // receives every byte handed back to the caller
	tot    int64     // total number of bytes returned by Read so far
}

// Read reads from the underlying reader into p, adds the bytes obtained to
// the running total and feeds them to the hasher.
//
// An io.Reader may return n > 0 together with a non-nil error (typically
// io.EOF on the final chunk). Those bytes are handed to the caller, so they
// are hashed before the error is reported; otherwise tot and the hash would
// disagree about what was read.
func (h *hashingReader) Read(p []byte) (n int, err error) {
	n, err = h.reader.Read(p)
	if n > 0 {
		h.tot += int64(n)
		// hash.Hash documents that Write never returns an error.
		_, _ = h.hasher.Write(p[:n])
	}
	return n, err
}
// Filter is an opaque Bloom filter type.
//
// keys and m describe the filter's shape: K() reports len(keys) and M()
// reports m, both without taking the lock. bits and n are the mutable state
// and are documented as guarded by lock.
type Filter struct {
	keys []uint64 // one key per hashing round; AddHash XORs each into the rotated hash
	m    uint64   // number of bits the "bits" field should recognize

	lock sync.RWMutex // lock guards accesses to the fields below
	bits []uint64     // bit array in 64-bit words; AddHash sets bit i in bits[i>>6] at position i&0x3f
	n    uint64       // number of inserted elements (incremented once per AddHash call)
}
60 | // Identical to Add (but slightly faster) 61 | func (f *Filter) AddHash(hash uint64) { 62 | f.lock.Lock() 63 | defer f.lock.Unlock() 64 | var ( 65 | i uint64 66 | ) 67 | for n := 0; n < len(f.keys); n++ { 68 | hash = ((hash << rotation) | (hash >> (64 - rotation))) ^ f.keys[n] 69 | i = hash % f.m 70 | f.bits[i>>6] |= 1 << uint(i&0x3f) 71 | } 72 | f.n++ 73 | } 74 | 75 | // ContainsHash tests if f contains the (already hashed) key 76 | // Identical to Contains but slightly faster 77 | func (f *Filter) ContainsHash(hash uint64) bool { 78 | f.lock.RLock() 79 | defer f.lock.RUnlock() 80 | var ( 81 | i uint64 82 | r = uint64(1) 83 | ) 84 | for n := 0; n < len(f.keys) && r != 0; n++ { 85 | hash = ((hash << rotation) | (hash >> (64 - rotation))) ^ f.keys[n] 86 | i = hash % f.m 87 | r &= (f.bits[i>>6] >> uint(i&0x3f)) & 1 88 | } 89 | return r != 0 90 | } 91 | 92 | // Contains tests if f contains v 93 | // false: f definitely does not contain value v 94 | // true: f maybe contains value v 95 | func (f *Filter) Contains(v hash.Hash64) bool { 96 | return f.ContainsHash(v.Sum64()) 97 | } 98 | 99 | // Copy f to a new Bloom filter 100 | func (f *Filter) Copy() (*Filter, error) { 101 | f.lock.RLock() 102 | defer f.lock.RUnlock() 103 | 104 | out, err := f.NewCompatible() 105 | if err != nil { 106 | return nil, err 107 | } 108 | copy(out.bits, f.bits) 109 | out.n = f.n 110 | return out, nil 111 | } 112 | 113 | // UnionInPlace merges Bloom filter f2 into f 114 | func (f *Filter) UnionInPlace(f2 *Filter) error { 115 | if f == f2 { 116 | return nil 117 | } 118 | if !f.IsCompatible(f2) { 119 | return errors.New("incompatible bloom filters") 120 | } 121 | f.lock.Lock() 122 | defer f.lock.Unlock() 123 | f2.lock.RLock() 124 | defer f2.lock.RUnlock() 125 | for i, bitword := range f2.bits { 126 | f.bits[i] |= bitword 127 | } 128 | // Also update the counters 129 | f.n += f2.n 130 | return nil 131 | } 132 | 133 | // Union merges f2 and f2 into a new Filter out 134 | func (f *Filter) 
Union(f2 *Filter) (out *Filter, err error) { 135 | if f == f2 { 136 | return f.Copy() 137 | } 138 | if !f.IsCompatible(f2) { 139 | return nil, errors.New("incompatible bloom filters") 140 | } 141 | f.lock.RLock() 142 | defer f.lock.RUnlock() 143 | 144 | out, err = f.NewCompatible() 145 | if err != nil { 146 | return nil, err 147 | } 148 | f2.lock.RLock() 149 | defer f2.lock.RUnlock() 150 | 151 | for i, bitword := range f2.bits { 152 | out.bits[i] = f.bits[i] | bitword 153 | } 154 | // Also update the counters 155 | out.n = f.n + f2.n 156 | return out, nil 157 | } 158 | 159 | // Clear clears the bloom filter. 160 | func (f *Filter) Clear() { 161 | f.lock.Lock() 162 | defer f.lock.Unlock() 163 | 164 | for i := range f.bits { 165 | f.bits[i] = 0 166 | } 167 | f.n = 0 // Also update the counters 168 | } 169 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![GoDoc](https://godoc.org/github.com/holiman/bloomfilter?status.png)](https://godoc.org/github.com/holiman/bloomfilter) 3 | [![CircleCI](https://circleci.com/gh/holiman/bloomfilter.svg?style=svg)](https://app.circleci.com/pipelines/github/holiman/bloomfilter) 4 | [![codecov](https://codecov.io/gh/holiman/bloomfilter/branch/master/graph/badge.svg?token=O48l6LbHkL)](https://codecov.io/gh/holiman/bloomfilter) 5 | [![DeepSource](https://deepsource.io/gh/holiman/bloomfilter.svg/?label=active+issues&show_trend=true)](https://deepsource.io/gh/holiman/bloomfilter/?ref=repository-badge) 6 | 7 | # History 8 | 9 | This bloom filter implementation is a fork from [steakknife/bloomfilter](https://github.com/steakknife/bloomfilter) by Barry Allard. 10 | The upstream project is now archived, so this fork exists to fix some bugs and also 11 | make a few improvements. Below is the original description. 
12 | 13 | The original implementation is Copyright © 2014-2016,2018 Barry Allard 14 | [MIT license](MIT-LICENSE.txt) 15 | 16 | All recent changes are copyright © 2019-2020 Martin Holst Swende. 17 | 18 | ## Installation 19 | 20 | ``` 21 | $ go get github.com/holiman/bloomfilter 22 | ``` 23 | 24 | ## Face-meltingly fast, thread-safe, marshalable, unionable, probability- and optimal-size-calculating Bloom filter in go 25 | 26 | ### WTF is a bloom filter 27 | 28 | **TL;DR:** Probabilistic, extra lookup table to track a set of elements kept elsewhere to reduce expensive, unnecessary set element retrieval and/or iterator operations **when an element is not present in the set.** It's a classic time-storage tradeoff algorithm. 29 | 30 | ### Properties 31 | 32 | #### [See wikipedia](https://en.wikipedia.org/wiki/Bloom_filter) for algorithm details 33 | 34 | |Impact|What|Description| 35 | |---|---|---| 36 | |Good|No false negatives|know for certain if a given element is definitely NOT in the set| 37 | |Bad|False positives|uncertain if a given element is in the set| 38 | |Bad|Theoretical potential for hash collisions|in very large systems and/or badly hash.Hash64-conforming implementations| 39 | |Bad|Add only|Cannot remove an element, it would destroy information about other elements| 40 | |Good|Constant storage|uses only a fixed amount of memory| 41 | 42 | ## Naming conventions 43 | 44 | (Similar to algorithm) 45 | 46 | |Variable/function|Description|Range| 47 | |---|---|---| 48 | |m/M()|number of bits in the bloom filter (memory representation is about m/8 bytes in size)|>=2| 49 | |n/N()|number of elements present|>=0| 50 | |k/K()|number of keys to use (keys are kept private to user code but are de/serialized to Marshal and file I/O)|>=1| 51 | |maxN|maximum capacity of intended structure|>0| 52 | |p|maximum allowed probability of collision (for computing m and k for optimal sizing)|>0..<1| 53 | 54 | - Memory representation should be exactly `24 + 8*(k + (m+63)/64) +
unsafe.Sizeof(RWMutex)` bytes. 55 | - Serialized (`BinaryMarshaler`) representation should be exactly `72 + 8*(k + (m+63)/64)` bytes. (Disk format is less due to compression.) 56 | 57 | ## Binary serialization format 58 | 59 | All values are in little-endian format 60 | 61 | |Offset|Offset (Hex)|Length (bytes)|Name|Type| 62 | |---|---|---|---|---| 63 | |0|00|12|magic + version number|`\0\0\0\0\0\0\0\0v02\n`| 64 | |12|0c|8|k|`uint64`| 65 | |20|14|8|n|`uint64`| 66 | |28|1c|8|m|`uint64`| 67 | |36|24|8\*k|(keys)|`[k]uint64`| 68 | |36+8\*k|...|8\*((m+63)/64)|(bloom filter)|`[(m+63)/64]uint64`| 69 | |36+8\*k+8\*((m+63)/64)|...|48|(SHA384 of all previous fields, hashed in order)|`[48]byte`| 70 | 71 | - `bloomfilter.Filter` conforms to `encoding.BinaryMarshaler` and `encoding.BinaryUnmarshaler` 72 | 73 | ## Usage 74 | 75 | ```go 76 | 77 | import "github.com/holiman/bloomfilter" 78 | 79 | const ( 80 | maxElements = 100000 81 | probCollide = 0.0000001 82 | ) 83 | 84 | bf, err := bloomfilter.NewOptimal(maxElements, probCollide) 85 | if err != nil { 86 | panic(err) 87 | } 88 | 89 | someValue := ... // must conform to hash.Hash64 90 | 91 | bf.Add(someValue) 92 | if bf.Contains(someValue) { // probably true, could be false 93 | // whatever 94 | } 95 | 96 | anotherValue := ... // must also conform to hash.Hash64 97 | 98 | if bf.Contains(anotherValue) { 99 | panic("This should never happen") 100 | } 101 | 102 | err := bf.WriteFile("1.bf.gz") // saves this BF to a file 103 | if err != nil { 104 | panic(err) 105 | } 106 | 107 | bf2, err := bloomfilter.ReadFile("1.bf.gz") // read the BF to another var 108 | if err != nil { 109 | panic(err) 110 | } 111 | ``` 112 | 113 | 114 | ## Design 115 | 116 | Where possible, branch-free operations are used to avoid deep pipeline / execution unit stalls on branch-misses. 
// devnull is a writer that throws away everything handed to it.
type devnull struct{}

// Write reports all of p as written without storing any of it; it never
// returns an error.
func (devnull) Write(p []byte) (n int, err error) {
	n = len(p)
	return n, nil
}
61 | var f2 *Filter 62 | if f2, _, err = ReadFrom(&b); err != nil { 63 | t.Fatal(err) 64 | } 65 | verify(t, f2) 66 | // test overwrite 67 | f3, _ := New(8*5, 3) 68 | if _, err = f3.ReadFrom(bytes.NewReader(cpy)); err != nil { 69 | t.Fatal(err) 70 | } 71 | verify(t, f3) 72 | }) 73 | t.Run("gob", func(t *testing.T) { 74 | var buffer bytes.Buffer 75 | err := gob.NewEncoder(&buffer).Encode(f) 76 | if err != nil { 77 | t.Fatal(err) 78 | } 79 | var f2 Filter 80 | err = gob.NewDecoder(&buffer).Decode(&f2) 81 | if err != nil { 82 | t.Fatal(err) 83 | } 84 | verify(t, &f2) 85 | }) 86 | 87 | t.Run("json", func(t *testing.T) { 88 | data, err := json.Marshal(f) 89 | if err != nil { 90 | t.Fatal(err) 91 | } 92 | var f2 Filter 93 | if err = json.Unmarshal(data, &f2); err != nil { 94 | t.Fatal(err) 95 | } 96 | verify(t, &f2) 97 | }) 98 | t.Run("file", func(t *testing.T) { 99 | fName := filepath.Join(os.TempDir(), "temp.deleteme.gz") 100 | if _, err := f.WriteFile(fName); err != nil { 101 | t.Fatal(err) 102 | } 103 | defer os.Remove(fName) 104 | if f2, _, err := ReadFile(fName); err != nil { 105 | t.Fatal(err) 106 | } else { 107 | verify(t, f2) 108 | } 109 | }) 110 | } 111 | 112 | func TestCorruption(t *testing.T) { 113 | // minimal filter 114 | f, _ := New(8*32, 5) 115 | // Add some content 116 | var tests = make([]hashableUint64, 20) 117 | for i := 0; i < 20; i++ { 118 | tests[i] = hashableUint64(rand.Uint64()) 119 | f.Add(tests[i]) 120 | } 121 | t.Run("binary", func(t *testing.T) { 122 | var b bytes.Buffer 123 | _, err := f.WriteTo(&b) 124 | if err != nil { 125 | t.Fatal(err) 126 | } 127 | buf := b.Bytes() 128 | buf[len(buf)/2] ^= 1 129 | if _, _, err := ReadFrom(&b); err == nil { 130 | t.Errorf("expected error") 131 | } 132 | }) 133 | 134 | t.Run("gob", func(t *testing.T) { 135 | var buffer bytes.Buffer 136 | err := gob.NewEncoder(&buffer).Encode(f) 137 | if err != nil { 138 | t.Fatal(err) 139 | } 140 | data := buffer.Bytes() 141 | // Flip a bit 142 | data[len(data)/2] ^= 1 143 
| var f2 Filter 144 | err = gob.NewDecoder(&buffer).Decode(&f2) 145 | if err == nil { 146 | t.Errorf("expected error") 147 | } 148 | }) 149 | 150 | } 151 | 152 | func bToMb(b uint64) uint64 { 153 | return b / 1024 / 1024 154 | } 155 | func PrintMemUsage() { 156 | var m runtime.MemStats 157 | runtime.ReadMemStats(&m) 158 | // For info on each, see: https://golang.org/pkg/runtime/#MemStats 159 | fmt.Printf("Alloc = %v MiB", bToMb(m.Alloc)) 160 | fmt.Printf("\tTotalAlloc = %v MiB", bToMb(m.TotalAlloc)) 161 | fmt.Printf("\tSys = %v MiB", bToMb(m.Sys)) 162 | fmt.Printf("\tNumGC = %v\n", m.NumGC) 163 | } 164 | 165 | func TestWrite(t *testing.T) { 166 | // 1Mb 167 | f, _ := New(4*8*1024*1024, 1) 168 | fmt.Printf("Allocated 1mb filter\n") 169 | PrintMemUsage() 170 | _, _ = f.WriteTo(devnull{}) 171 | fmt.Printf("Wrote filter to devnull\n") 172 | PrintMemUsage() 173 | } 174 | 175 | // fillRandom fills the filter with N random values, where N is roughly half 176 | // the size of the number of uint64's in the filter 177 | func fillRandom(f *Filter) { 178 | num := len(f.bits) * 4 179 | for i := 0; i < num; i++ { 180 | f.AddHash(uint64(rand.Int63())) 181 | } 182 | } 183 | 184 | // TestMarshaller tests that it writes outputs correctly. 
185 | func TestMarshaller(t *testing.T) { 186 | 187 | h1 := sha512.New384() 188 | h2 := sha512.New384() 189 | 190 | f, _ := New(1*8*1024*1024, 1) 191 | fillRandom(f) 192 | // Marshall using writer 193 | _, _, _ = f.MarshallToWriter(h1) 194 | // Marshall as a blob 195 | data, _ := f.MarshalBinary() 196 | _, _ = h2.Write(data) 197 | 198 | if have, want := h1.Sum(nil), h2.Sum(nil); !bytes.Equal(have, want) { 199 | t.Errorf("Marshalling error, have %x want %x", have, want) 200 | } 201 | } 202 | 203 | func BenchmarkWrite1Mb(b *testing.B) { 204 | 205 | // 1Mb 206 | f, _ := New(1*8*1024*1024, 1) 207 | f.Add(hashableUint64(0)) 208 | f.Add(hashableUint64(1)) 209 | f.Add(hashableUint64(1 << 3)) 210 | f.Add(hashableUint64(1 << 40)) 211 | f.Add(hashableUint64(1 << 23)) 212 | f.Add(hashableUint64(1 << 16)) 213 | f.Add(hashableUint64(1 << 28)) 214 | 215 | b.ReportAllocs() 216 | for i := 0; i < b.N; i++ { 217 | _, _ = f.WriteTo(devnull{}) 218 | } 219 | } 220 | -------------------------------------------------------------------------------- /v2/bloomfilter_test.go: -------------------------------------------------------------------------------- 1 | // Package bloomfilter is face-meltingly fast, thread-safe, 2 | // marshalable, unionable, probability- and 3 | // optimal-size-calculating Bloom filter in go 4 | // 5 | // https://github.com/holiman/bloomfilter 6 | // 7 | // Original source: 8 | // https://github.com/steakknife/bloomfilter 9 | // 10 | // Copyright © 2014, 2015, 2018 Barry Allard 11 | // Copyright © 2020 Martin Holst Swende 12 | // 13 | // MIT license 14 | // 15 | 16 | package v2 17 | 18 | import ( 19 | "fmt" 20 | "math/rand" 21 | "testing" 22 | ) 23 | 24 | // a read-only type that conforms to hash.Hash64, but only Sum64() works. 25 | // It is set by writing the underlying value. 
26 | type hashableUint64 uint64 27 | 28 | func (h hashableUint64) Write([]byte) (int, error) { 29 | panic("Unimplemented") 30 | } 31 | 32 | func (h hashableUint64) Sum([]byte) []byte { 33 | panic("Unimplemented") 34 | } 35 | 36 | func (h hashableUint64) Reset() { 37 | panic("Unimplemented") 38 | } 39 | 40 | func (h hashableUint64) BlockSize() int { 41 | panic("Unimplemented") 42 | } 43 | 44 | func (h hashableUint64) Size() int { 45 | panic("Unimplemented") 46 | } 47 | 48 | func (h hashableUint64) Sum64() uint64 { 49 | return uint64(h) 50 | } 51 | 52 | func hashableUint64Values() []hashableUint64 { 53 | return []hashableUint64{ 54 | 0, 55 | 7, 56 | 0x0c0ffee0, 57 | 0xdeadbeef, 58 | 0xffffffff, 59 | } 60 | } 61 | 62 | func hashableUint64NotValues() []hashableUint64 { 63 | return []hashableUint64{ 64 | 1, 65 | 5, 66 | 42, 67 | 0xa5a5a5a5, 68 | 0xfffffffe, 69 | } 70 | } 71 | 72 | func Test0(t *testing.T) { 73 | bf, _ := New(10000, 5) 74 | 75 | t.Log("Filled ratio before adds :", bf.PreciseFilledRatio()) 76 | for _, x := range hashableUint64Values() { 77 | bf.Add(x) 78 | } 79 | t.Log("Filled ratio after adds :", bf.PreciseFilledRatio()) 80 | 81 | // these may or may not be true 82 | for _, y := range hashableUint64Values() { 83 | if bf.Contains(y) { 84 | t.Log("value in set querties: may contain ", y) 85 | } else { 86 | t.Fatal("value in set queries: definitely does not contain ", y, 87 | ", but it should") 88 | } 89 | } 90 | 91 | // these must all be false 92 | for _, z := range hashableUint64NotValues() { 93 | if bf.Contains(z) { 94 | t.Log("value not in set queries: may or may not contain ", z) 95 | } else { 96 | t.Log("value not in set queries: definitely does not contain ", z, 97 | " which is correct") 98 | } 99 | } 100 | } 101 | 102 | func TestUnion(t *testing.T) { 103 | f1, _ := New(8*500, 4) 104 | tmp, _ := New(8*500, 4) 105 | if _, err := tmp.Union(f1); err == nil { 106 | t.Errorf("Incompatible, should error") 107 | } 108 | f2, err := f1.NewCompatible() 109 | 
if err != nil { 110 | t.Fatal(err) 111 | } 112 | rand.Seed(1337) 113 | // Add some content 114 | var tests = make([]hashableUint64, 200) 115 | for i := 0; i < len(tests); i++ { 116 | tests[i] = hashableUint64(rand.Uint64()) 117 | if i&1 == 0 { 118 | f1.Add(tests[i]) 119 | } else { 120 | f2.Add(tests[i]) 121 | } 122 | } 123 | unionF, err := f2.Union(f1) 124 | if err != nil { 125 | t.Fatal(err) 126 | } 127 | copyF, err := unionF.Copy() 128 | if err != nil { 129 | t.Fatal(err) 130 | } 131 | 132 | for i, v := range tests { 133 | if !unionF.Contains(v) { 134 | t.Errorf("missing item %d", i) 135 | } 136 | if !copyF.Contains(v) { 137 | t.Errorf("missing item %d", i) 138 | } 139 | if i&1 == 0 { 140 | if !f1.Contains(v) { 141 | t.Errorf("missing item %d", i) 142 | } 143 | if f2.Contains(v) { 144 | t.Errorf("f2 has item it shouldn't have") 145 | } 146 | } else { 147 | if !f2.Contains(v) { 148 | t.Errorf("missing item %d", i) 149 | } 150 | if f1.Contains(v) { 151 | t.Errorf("f1 has item it shouldn't have") 152 | } 153 | } 154 | } 155 | // And test merging f1 into f2 156 | if err := f2.UnionInPlace(f1); err != nil { 157 | t.Fatal(err) 158 | } 159 | 160 | for i, v := range tests { 161 | if !f2.Contains(v) { 162 | t.Errorf("missing item %d", i) 163 | } 164 | if i&1 == 0 { 165 | if !f1.Contains(v) { 166 | t.Errorf("missing item %d", i) 167 | } 168 | } else { 169 | if f1.Contains(v) { 170 | t.Errorf("f1 has item it shouldn't have") 171 | } 172 | } 173 | } 174 | } 175 | 176 | func TestFPRate(t *testing.T) { 177 | f, _ := New(8*32, 4) 178 | f.n = 101 // "insert" 101 items 179 | // yes we could add some more tests here... 
180 | have, want := f.FalsePosititveProbability(), 0.402507 181 | if int(1000*have) != int(1000*want) { 182 | t.Errorf("have %08f, want %f", have, want) 183 | } 184 | } 185 | 186 | func BenchmarkAddX10kX5(b *testing.B) { 187 | bf, _ := New(10000, 5) 188 | b.Run("add-10kx5", func(b *testing.B) { 189 | b.ReportAllocs() 190 | for i := 0; i < b.N; i++ { 191 | bf.Add(hashableUint64(rand.Uint32())) 192 | } 193 | }) 194 | b.Run("add-10kx5-hash", func(b *testing.B) { 195 | b.ReportAllocs() 196 | for i := 0; i < b.N; i++ { 197 | bf.AddHash(uint64(rand.Uint32())) 198 | } 199 | }) 200 | } 201 | 202 | func TestAddX10kX5(t *testing.T) { 203 | b1, _ := New(10000, 5) 204 | b2, _ := b1.NewCompatible() 205 | 206 | verify := func() { 207 | for i := 0; i < len(b1.bits); i++ { 208 | if b1.bits[i] != b2.bits[i] { 209 | t.Fatalf("error at bit %d!", i) 210 | } 211 | } 212 | } 213 | for i := 0; i < 1000000; i++ { 214 | v := hashableUint64(rand.Uint32()) 215 | b1.Add(v) 216 | b2.AddHash(v.Sum64()) 217 | verify() 218 | if !b2.Contains(v) { 219 | t.Fatal("contain error") 220 | } 221 | } 222 | } 223 | func BenchmarkContains1kX10kX5(b *testing.B) { 224 | bf, _ := New(10000, 5) 225 | for i := 0; i < 1000; i++ { 226 | bf.Add(hashableUint64(rand.Uint32())) 227 | } 228 | b.Run("contains", func(b *testing.B) { 229 | for i := 0; i < b.N; i++ { 230 | bf.Contains(hashableUint64(rand.Uint32())) 231 | } 232 | }) 233 | b.Run("containsHash", func(b *testing.B) { 234 | for i := 0; i < b.N; i++ { 235 | bf.ContainsHash(uint64(rand.Uint32())) 236 | } 237 | }) 238 | } 239 | 240 | func BenchmarkContains100kX10BX20(b *testing.B) { 241 | rand.Seed(1337) 242 | b.StopTimer() 243 | bf, _ := New(10*1000*1000*1000, 20) 244 | for i := 0; i < 100*1000; i++ { 245 | bf.Add(hashableUint64(rand.Uint32())) 246 | } 247 | b.Run("contains", func(b *testing.B) { 248 | for i := 0; i < b.N; i++ { 249 | bf.Contains(hashableUint64(rand.Uint32())) 250 | } 251 | }) 252 | b.Run("containshash", func(b *testing.B) { 253 | for i := 0; i < 
b.N; i++ { 254 | bf.ContainsHash(uint64(rand.Uint32())) 255 | } 256 | }) 257 | } 258 | 259 | func TestContains(t *testing.T) { 260 | rand.Seed(1337) 261 | bf, _ := New(10*1000*1000, 20) 262 | for i := 0; i < 100*10000; i++ { 263 | x := hashableUint64(rand.Uint32()) 264 | bf.Add(x) 265 | if !bf.Contains(x) { 266 | t.Fatalf("Did not contain newly added elem: %d", x.Sum64()) 267 | } 268 | } 269 | } 270 | 271 | //BenchmarkUnionInPlace/union-8-6 15270 77848 ns/op 272 | func BenchmarkUnionInPlace(b *testing.B) { 273 | var filters []*Filter 274 | b1, _ := New(813129, 6) 275 | for i := 0; i < 2000; i++ { 276 | b1.Add(hashableUint64(rand.Uint32())) 277 | } 278 | filters = append(filters, b1) 279 | for i := 0; i < 7; i++ { 280 | b, _ := b1.NewCompatible() 281 | filters = append(filters, b) 282 | } 283 | b.ResetTimer() 284 | b.Run("union-8", func(b *testing.B) { 285 | for i := 0; i < b.N; i++ { 286 | for _, bx := range filters { 287 | _ = b1.UnionInPlace(bx) 288 | } 289 | } 290 | }) 291 | } 292 | 293 | func BenchmarkContains94percentMisses(b *testing.B) { 294 | // This test should produce about 295 | // 5.4K hits and 94k misses 296 | rand.Seed(1337) 297 | b.StopTimer() 298 | bf, _ := New(10*1000*1000, 20) 299 | for i := 0; i < 100*1000; i++ { 300 | bf.Add(hashableUint64(rand.Uint32())) 301 | } 302 | b.Run("contains", func(b *testing.B) { 303 | for i := 0; i < b.N; i++ { 304 | bf.Contains(hashableUint64(rand.Uint32())) 305 | } 306 | }) 307 | b.Run("containsHash", func(b *testing.B) { 308 | for i := 0; i < b.N; i++ { 309 | bf.ContainsHash(uint64(rand.Uint32())) 310 | } 311 | }) 312 | } 313 | 314 | // This test is quite long-running, thus disabled 315 | func TestHitrate(t *testing.T) { 316 | t.Skip("Long-running test, use only for sanity-checking") 317 | /** 318 | After changes: 319 | 320 | Fill ratio: 9.303936 % 321 | Theoretical hitrate : 0.007493 % 322 | Hit rate (100K random tests): 0.009000 % (9 out of 100000) 323 | Hit rate (100K random tests): 0.009000 % (9 out of 100000) 
324 | Zero-filter Hit rate (100K random tests): 9.373000 % (9373 out of 100000) 325 | 1-filter Hit rate: 9.474021 % (888 out of 9373) 326 | 327 | Original changes: 328 | 329 | Fill ratio: 9.303647 % 330 | Theoretical hitrate : 0.007492 % 331 | Hit rate (100K random tests): 2.658000 % (2658 out of 100000) 332 | Zero-filter Hit rate (100K random tests): 9.456000 % (9456 out of 100000) 333 | 1-filter Hit rate: 53.489848 % (5058 out of 9456) 334 | 335 | */ 336 | // 512 MB bloom filter 337 | f, _ := New(512*1024*1024*8, 4) 338 | 339 | // Fill it with 100M items 340 | for i := 0; i < 100*1024*1024; i++ { 341 | val := rand.Uint64() 342 | f.AddHash(val) 343 | if !f.ContainsHash(val) { 344 | t.Fatalf("Missing value (just inserted) %d", val) 345 | } 346 | } 347 | // Test individual matches 348 | numTests := 100000 349 | hits := 0 350 | 351 | for i := 0; i < numTests; i++ { 352 | h := rand.Uint64() 353 | if f.ContainsHash(h) { 354 | hits++ 355 | } 356 | } 357 | fmt.Printf("Error rate: %f %%\n", 100*f.FalsePosititveProbability()) 358 | // With four keys, we should obtain fillrate^4 chance of false positive 359 | fp := f.PreciseFilledRatio() 360 | fmt.Printf("Fill ratio: %02f %%\n", 100*fp) 361 | fmt.Printf("Theoretical hitrate : %02f %%\n", 100*fp*fp*fp*fp) 362 | fmt.Printf("Hit rate (100K random tests): %02f %% (%d out of %d) \n", 100*float64(hits)/float64(numTests), hits, numTests) 363 | } 364 | --------------------------------------------------------------------------------