├── .github └── workflows │ ├── go.yml │ └── release.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── Makefile ├── README.md ├── bloom.go ├── bloom └── manage.go ├── bloom_test.go ├── go.mod ├── go.sum ├── io.go ├── io_test.go └── testdata ├── broken.bloom ├── test-input.txt ├── test.bloom └── test.bloom.gz /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Go build 3 | 4 | on: 5 | - push 6 | - pull_request 7 | 8 | jobs: 9 | build: 10 | name: "Go build" 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v2 14 | 15 | - name: Set up Go 16 | uses: actions/setup-go@v2 17 | with: 18 | go-version: 1.16 19 | 20 | - name: Build 21 | run: go build -v ./... 22 | 23 | - name: Test 24 | run: go test -v ./... 25 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: "Go tagged release" 3 | 4 | on: 5 | push: 6 | tags: 7 | - "v*" 8 | 9 | jobs: 10 | tagged-release: 11 | name: "Tagged Release" 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v2 16 | 17 | - name: Set up Go 18 | uses: actions/setup-go@v2 19 | with: 20 | go-version: 1.16 21 | 22 | - name: Build 23 | run: make release test 24 | 25 | - uses: "marvinpinto/action-automatic-releases@latest" 26 | with: 27 | repo_token: "${{ secrets.GITHUB_TOKEN }}" 28 | prerelease: false 29 | files: | 30 | bloom_linux_amd64.bin 31 | bloom_windows_amd64.exe -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.bloom.gz 2 | *.bloom 3 | build/* 4 | *.bin 5 | *.exe 6 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 
1 | # Changelog 2 | 3 | ## v0.2.4 (2021-06-29) 4 | 5 | - Add Go module support 6 | - Switch CI/CD to GitHub Actions 7 | - Update documentation 8 | 9 | ## v0.2.3 (2019-01-10) 10 | 11 | - Add test for fingerprint 12 | - Update documentation 13 | - Improve robustness regarding broken input files 14 | 15 | ## v0.2.2 (2018-10-24) 16 | 17 | - Remove dead code 18 | 19 | ## v0.2.0 (2017-08-14) 20 | 21 | - Add 'bloom join' command line tool 22 | - Make it possible to store arbitrary data in a Bloom filter (useful for 23 | associating meta-data with a filter) 24 | - Remove HTTP test 25 | - Use scientific notation for FP probability 26 | - Add a 64-bit value reserved for the version bit and additional flags 27 | - Use 64-bit integers everywhere 28 | - Switched to new, more robust hashing scheme 29 | 30 | ## v0.1.0 (2017-06-27) 31 | 32 | - Initial open source release 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017, DCSO Deutsche Cyber-Sicherheitsorganisation GmbH 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of the DCSO Deutsche Cyber-Sicherheitsorganisation GmbH 15 | nor the names of its contributors may be used to endorse or promote products 16 | derived from this software without specific prior written permission. 
17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | ## simple makefile to log workflow 2 | .PHONY: all test clean build install 3 | 4 | GOFLAGS ?= $(GOFLAGS:) 5 | 6 | all: install test 7 | 8 | build: 9 | @go build $(GOFLAGS) ./... 10 | 11 | install: 12 | @go get $(GOFLAGS) ./... 13 | 14 | test: install 15 | @go test -cover $(GOFLAGS) ./... 16 | 17 | bench: install 18 | @go test -run=NONE -bench=. $(GOFLAGS) ./... 19 | 20 | clean: 21 | @go clean $(GOFLAGS) -i ./... 22 | 23 | release: 24 | @go get $(GOFLAGS) ./... 
25 | @go build -v -o bloom_linux_amd64.bin bloom/* 26 | GOOS=windows GOARCH=amd64 go build -v -o bloom_windows_amd64.exe bloom/* 27 | 28 | ## EOF 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bloom 2 | 3 | ### A highly efficient bloom filter implementation for Go 4 | 5 | [![GoDoc](https://godoc.org/github.com/DCSO/bloom?status.svg)](http://godoc.org/github.com/DCSO/bloom) 6 | ![Build Status](https://github.com/DCSO/bloom/actions/workflows/go.yml/badge.svg) 7 | 8 | Bloom is a simple tool that provides a very efficient implementation of Bloom filters for the go language. 9 | It provides a command line tool that can be used to easily create Bloom filters with desired capacity 10 | and false positive probability. Values can be added to filters through standard input, which makes it 11 | easy to use the tool in a pipeline workflow. 12 | 13 | # Usage 14 | 15 | NAME: 16 | Bloom Filter - Utility to work with bloom filters 17 | 18 | USAGE: 19 | bloom [global options] command [command options] [arguments...] 20 | 21 | VERSION: 22 | 0.2.2 23 | 24 | COMMANDS: 25 | create, cr Create a new Bloom filter and store it in the given filename. 26 | insert, i Inserts new values into an existing Bloom filter. 27 | join, j, merge, m Joins two Bloom filters into one. 28 | check, c Checks values against an existing Bloom filter. 29 | set-data, sd Sets the data associated with the Bloom filter. 30 | get-data, gd Prints the data associated with the Bloom filter. 31 | show, s Shows various details about a given Bloom filter. 
32 | help, h Shows a list of commands or help for one command 33 | 34 | GLOBAL OPTIONS: 35 | --gzip, --gz compress bloom file with gzip 36 | --interactive, -i interactively add values to the filter 37 | --split, -s split the input string 38 | --each, -e print each match of a split string individually 39 | --delimiter value, -d value delimiter to use for splitting (default: ",") 40 | --fields value, -f value fields of split output to use in filter (a single number or a comma-separated list of numbers, zero-indexed) 41 | --print-fields value, --pf value fields of split output to print for a successful match (a single number or a comma-separated list of numbers, zero-indexed). 42 | --help, -h show help 43 | --version, -v print the version 44 | 45 | 46 | # Examples 47 | 48 | To create a new bloom filter with a desired capacity and false positive probability, you can use the `create` command: 49 | 50 | #will create a gzipped Bloom filter with 100.000 capacity and a 0.1 % false positive probability 51 | bloom --gzip create -p 0.001 -n 100000 test.bloom.gz 52 | 53 | To insert values, you can use the `insert` command and pipe some input to it (each line will be treated as one value): 54 | 55 | cat values | bloom --gzip insert test.bloom.gz 56 | 57 | You can also interactively add values to the filter by specifying the `--interactive` command line option: 58 | 59 | bloom --gzip --interactive insert test.bloom.gz 60 | 61 | To check if a given value or a list of values is in the filter, you can use the `check` command: 62 | 63 | cat values | bloom --gzip check test.bloom.gz 64 | 65 | This will return a list of all values in the filter. 66 | 67 | # Advanced Usage 68 | 69 | Sometimes it is useful to attach additional information to a string that we want to check against the Bloom filter, 70 | such as a timestamp or the original line content. 
To make passing along this additional information easier within 71 | a shell context, the Bloom tool provides an option for splitting the input string by a given delimiter and checking 72 | the filter against the resulting field values. Example: 73 | 74 | # will check the Bloom filter for the values foo, bar and baz 75 | echo "foo,bar,baz" | bloom -s check filter.bloom 76 | 77 | # uses a different delimiter (--ac5ba--) 78 | echo "foo--ac5ba--bar--ac5ba--baz" | bloom -d "--ac5ba--" -s check filter.bloom 79 | 80 | # will check the Bloom filter against the second field value only 81 | echo "foo,bar,baz" | bloom -f 1 -s check filter.bloom 82 | 83 | # will check the Bloom filter against the second and third field values only 84 | echo "foo,bar,baz" | bloom -f 1,2 -s check filter.bloom 85 | 86 | # will print one line for each field value that matched against the filter 87 | echo "foo,bar,baz" | bloom -e -s check filter.bloom 88 | 89 | # will print the last field value for each line whose fields matched against the filter 90 | echo "foo,bar,baz" | bloom -e -s --pf -1 check filter.bloom 91 | 92 | This functionality is especially handy when using CSV data, as it allows you to filter CSV rows by checking individual 93 | columns against the filter without having to use external tools to split and reassemble the lines. 94 | 95 | # Installation 96 | 97 | ## Installation on Debian-based systems 98 | 99 | Debian [command line tool](https://tracker.debian.org/pkg/golang-github-dcso-bloom): 100 | 101 | sudo apt install golang-github-dcso-bloom-cli 102 | 103 | ## Installation via `go get`: 104 | 105 | go get github.com/DCSO/bloom/... 106 | 107 | ## Installation from source 108 | 109 | These need to be run from within the `GOPATH` source directory for this project (e.g. `$GOPATH/src/github.com/DCSO/bloom`). 
To install the command line tool: 110 | 111 | make install 112 | 113 | To run the tests: 114 | 115 | make test 116 | 117 | To run the benchmarks: 118 | 119 | make bench 120 | 121 | # Cross-Compiling 122 | 123 | To compile a binary, simply specify the target architecture and go: 124 | 125 | #Windows, 64 bit 126 | env GOOS=windows GOARCH=amd64 go build -v -o bloom.exe github.com/DCSO/bloom 127 | #Windows, 32 bit 128 | env GOOS=windows GOARCH=i386 go build -v -o /tmp/bloom github.com/DCSO/bloom 129 | -------------------------------------------------------------------------------- /bloom.go: -------------------------------------------------------------------------------- 1 | // DCSO go bloom filter 2 | // Copyright (c) 2017, DCSO GmbH 3 | 4 | //Implements a simple and highly efficient variant of the Bloom filter that uses only two hash functions. 5 | 6 | package bloom 7 | 8 | import ( 9 | "encoding/binary" 10 | "errors" 11 | "fmt" 12 | "hash/fnv" 13 | "io" 14 | "io/ioutil" 15 | "math" 16 | ) 17 | 18 | // BloomFilter represents a Bloom filter, a data structure for quickly checking 19 | // for set membership, with a specific desired capacity and false positive 20 | // probability. 21 | type BloomFilter struct { 22 | //bit array 23 | v []uint64 24 | 25 | //desired maximum number of elements 26 | n uint64 27 | 28 | //desired false positive probability 29 | p float64 30 | 31 | //number of hash functions 32 | k uint64 33 | 34 | //number of bits 35 | m uint64 36 | 37 | //number of elements in the filter 38 | N uint64 39 | 40 | //number of 64-bit integers (generated automatically) 41 | M uint64 42 | 43 | //arbitrary data that we can attach to the filter 44 | Data []byte 45 | } 46 | 47 | // Read loads a filter from a reader object. 
48 | func (s *BloomFilter) Read(input io.Reader) error { 49 | bs8 := make([]byte, 8) 50 | 51 | if _, err := io.ReadFull(input, bs8); err != nil { 52 | return err 53 | } 54 | 55 | flags := binary.LittleEndian.Uint64(bs8) 56 | 57 | if flags&0xFF != 1 { 58 | return fmt.Errorf("Invalid version bit (should be 1)") 59 | } 60 | 61 | if _, err := io.ReadFull(input, bs8); err != nil { 62 | return err 63 | } 64 | 65 | s.n = binary.LittleEndian.Uint64(bs8) 66 | 67 | if _, err := io.ReadFull(input, bs8); err != nil { 68 | return err 69 | } 70 | 71 | s.p = math.Float64frombits(binary.LittleEndian.Uint64(bs8)) 72 | 73 | if _, err := io.ReadFull(input, bs8); err != nil { 74 | return err 75 | } 76 | 77 | s.k = binary.LittleEndian.Uint64(bs8) 78 | maxInt := uint64(int(^uint(0) >> 1)) 79 | if s.k >= maxInt { 80 | return fmt.Errorf("value of k (number of hash functions) is too high (%d), must be less than maximum int (%d)", s.k, maxInt) 81 | } 82 | 83 | if _, err := io.ReadFull(input, bs8); err != nil { 84 | return err 85 | } 86 | 87 | s.m = binary.LittleEndian.Uint64(bs8) 88 | 89 | if _, err := io.ReadFull(input, bs8); err != nil { 90 | return err 91 | } 92 | 93 | s.N = binary.LittleEndian.Uint64(bs8) 94 | 95 | s.M = uint64(math.Ceil(float64(s.m) / 64.0)) 96 | 97 | s.v = make([]uint64, s.M) 98 | 99 | for i := uint64(0); i < s.M; i++ { 100 | n, err := io.ReadFull(input, bs8) 101 | if err != nil { 102 | return err 103 | } 104 | if n != 8 { 105 | return fmt.Errorf("Cannot read from file: %d, position: %d, %d", n, i*8, len(bs8)) 106 | } 107 | s.v[i] = binary.LittleEndian.Uint64(bs8) 108 | } 109 | 110 | b, err := ioutil.ReadAll(input) 111 | 112 | if err != nil { 113 | return err 114 | } 115 | 116 | s.Data = b 117 | 118 | return nil 119 | 120 | } 121 | 122 | // NumHashFuncs returns the number of hash functions used in the Bloom filter. 
123 | func (s *BloomFilter) NumHashFuncs() uint64 { 124 | return s.k 125 | } 126 | 127 | // MaxNumElements returns the maximal supported number of elements in the Bloom 128 | // filter (capacity). 129 | func (s *BloomFilter) MaxNumElements() uint64 { 130 | return s.n 131 | } 132 | 133 | // NumBits returns the number of bits used in the Bloom filter. 134 | func (s *BloomFilter) NumBits() uint64 { 135 | return s.m 136 | } 137 | 138 | // FalsePositiveProb returns the chosen false positive probability for the 139 | // Bloom filter. 140 | func (s *BloomFilter) FalsePositiveProb() float64 { 141 | return s.p 142 | } 143 | 144 | // Write writes the binary representation of a Bloom filter to an io.Writer. 145 | func (s *BloomFilter) Write(output io.Writer) error { 146 | bs8 := make([]byte, 8) 147 | 148 | // we write the version bit 149 | binary.LittleEndian.PutUint64(bs8, 1) 150 | output.Write(bs8) 151 | 152 | binary.LittleEndian.PutUint64(bs8, s.n) 153 | output.Write(bs8) 154 | binary.LittleEndian.PutUint64(bs8, math.Float64bits(s.p)) 155 | output.Write(bs8) 156 | binary.LittleEndian.PutUint64(bs8, s.k) 157 | output.Write(bs8) 158 | binary.LittleEndian.PutUint64(bs8, s.m) 159 | output.Write(bs8) 160 | binary.LittleEndian.PutUint64(bs8, s.N) 161 | output.Write(bs8) 162 | 163 | for i := uint64(0); i < s.M; i++ { 164 | binary.LittleEndian.PutUint64(bs8, s.v[i]) 165 | n, err := output.Write(bs8) 166 | if n != 8 { 167 | return errors.New("Cannot write to file!") 168 | } 169 | if err != nil { 170 | return err 171 | } 172 | } 173 | if s.Data != nil { 174 | output.Write(s.Data) 175 | } 176 | return nil 177 | } 178 | 179 | // Reset clears the Bloom filter of all elements. 180 | func (s *BloomFilter) Reset() { 181 | for i := uint64(0); i < s.M; i++ { 182 | s.v[i] = 0 183 | } 184 | s.N = 0 185 | } 186 | 187 | // this is the largest prime number < 2^64. 
As we will probably never encounter 188 | // a Bloom filter with a number of bits > m (if yes sorry to you people from, 189 | // the future, I envy your available RAM though) we can use the pseudorandom 190 | // sequence generated by repeatedly multiplying the initial hash value h 191 | // with g and taking the modulo over m to generate a sequence of hash values 192 | // that has a uniform distribution. 193 | const m uint64 = 18446744073709551557 194 | 195 | // this is our multiplier. It has a very large primitive root 196 | // so that it will not repeat a given cycle for any practically meaningful 197 | // value of k. 198 | const g uint64 = 18446744073709550147 199 | 200 | // Fingerprint returns the fingerprint of a given value, as an array of index 201 | // values. 202 | func (s *BloomFilter) Fingerprint(value []byte, fingerprint []uint64) { 203 | hv := fnv.New64() 204 | hv.Write(value) 205 | hn := hv.Sum64() % m 206 | 207 | for i := uint64(0); i < s.k; i++ { 208 | hn = (hn * g) % m 209 | fingerprint[i] = uint64(hn % s.m) 210 | } 211 | } 212 | 213 | // Add adds a byte array element to the Bloom filter. 214 | func (s *BloomFilter) Add(value []byte) { 215 | var k, l uint64 216 | newValue := false 217 | fingerprint := make([]uint64, s.k) 218 | s.Fingerprint(value, fingerprint) 219 | for i := uint64(0); i < s.k; i++ { 220 | k = uint64(fingerprint[i] / 64) 221 | l = uint64(fingerprint[i] % 64) 222 | v := uint64(1 << l) 223 | if (s.v[k] & v) == 0 { 224 | newValue = true 225 | } 226 | s.v[k] |= v 227 | } 228 | if newValue { 229 | s.N++ 230 | } 231 | } 232 | 233 | // Join adds the items of another Bloom filter with identical dimensions to 234 | // the receiver. That is, all elements that are described in the 235 | // second filter will also described by the receiver, and the number of elements 236 | // of the receiver will grow by the number of elements in the added filter. 237 | // Note that it is implicitly assumed that both filters are disjoint! 
Otherwise 238 | // the number of elements in the joined filter must _only_ be considered an 239 | // upper bound and not an exact value! 240 | // Joining two differently dimensioned filters may yield unexpected results and 241 | // hence is not allowed. An error will be returned in this case, and the 242 | // receiver will be left unaltered. 243 | func (s *BloomFilter) Join(s2 *BloomFilter) error { 244 | var i uint64 245 | if s.n != s2.n { 246 | return fmt.Errorf("filters have different dimensions (n = %d vs. %d))", 247 | s.n, s2.n) 248 | } 249 | if s.p != s2.p { 250 | return fmt.Errorf("filters have different dimensions (p = %f vs. %f))", 251 | s.p, s2.p) 252 | } 253 | if s.k != s2.k { 254 | return fmt.Errorf("filters have different dimensions (k = %d vs. %d))", 255 | s.k, s2.k) 256 | } 257 | if s.m != s2.m { 258 | return fmt.Errorf("filters have different dimensions (m = %d vs. %d))", 259 | s.m, s2.m) 260 | } 261 | if s.M != s2.M { 262 | return fmt.Errorf("filters have different dimensions (M = %d vs. %d))", 263 | s.M, s2.M) 264 | } 265 | for i = 0; i < s.M; i++ { 266 | s.v[i] |= s2.v[i] 267 | } 268 | if s.N+s2.N < s.N { 269 | return fmt.Errorf("addition of member counts would overflow") 270 | } 271 | s.N += s2.N 272 | 273 | return nil 274 | } 275 | 276 | // Check returns true if the given value may be in the Bloom filter, false if it 277 | // is definitely not in it. 278 | func (s *BloomFilter) Check(value []byte) bool { 279 | fingerprint := make([]uint64, s.k) 280 | s.Fingerprint(value, fingerprint) 281 | return s.CheckFingerprint(fingerprint) 282 | } 283 | 284 | // CheckFingerprint returns true if the given fingerprint occurs in the Bloom 285 | // filter, false if it does not. 
286 | func (s *BloomFilter) CheckFingerprint(fingerprint []uint64) bool { 287 | var k, l uint64 288 | for i := uint64(0); i < s.k; i++ { 289 | k = uint64(fingerprint[i] / 64) 290 | l = uint64(fingerprint[i] % 64) 291 | if (s.v[k] & (1 << l)) == 0 { 292 | return false 293 | } 294 | } 295 | return true 296 | } 297 | 298 | // Initialize returns a new, empty Bloom filter with the given capacity (n) 299 | // and FP probability (p). 300 | func Initialize(n uint64, p float64) BloomFilter { 301 | m := math.Abs(math.Ceil(float64(n) * math.Log(p) / math.Pow(math.Log(2.0), 2.0))) 302 | var bf BloomFilter 303 | bf.n = n 304 | bf.p = p 305 | bf.m = uint64(m) 306 | bf.M = uint64(math.Ceil(m / 64.0)) 307 | bf.k = uint64(math.Ceil(math.Log(2) * m / float64(n))) 308 | bf.v = make([]uint64, bf.M) 309 | return bf 310 | } 311 | -------------------------------------------------------------------------------- /bloom/manage.go: -------------------------------------------------------------------------------- 1 | // DCSO go bloom filter 2 | // Copyright (c) 2017, DCSO GmbH 3 | 4 | package main 5 | 6 | import ( 7 | "bufio" 8 | "bytes" 9 | "fmt" 10 | "os" 11 | "path/filepath" 12 | "strconv" 13 | "strings" 14 | 15 | "github.com/DCSO/bloom" 16 | "gopkg.in/urfave/cli.v1" 17 | ) 18 | 19 | // BloomParams represents the parameters of the 'bloom' command line tool. 
20 | type BloomParams struct { 21 | gzip bool 22 | interactive bool 23 | split bool 24 | printEachMatch bool 25 | delimiter string 26 | fields []int 27 | printFields []int 28 | } 29 | 30 | func exitWithError(message string) { 31 | fmt.Fprintf(os.Stderr, "Error: %s \n", message) 32 | os.Exit(-1) 33 | } 34 | 35 | func readValuesIntoFilter(filter *bloom.BloomFilter, bloomParams BloomParams) { 36 | //we determine if the program is run interactively or within a pipe 37 | stat, _ := os.Stdin.Stat() 38 | var isTerminal = (stat.Mode() & os.ModeCharDevice) != 0 39 | //if we are not in an interactive session and this is a terminal, we quit 40 | if !bloomParams.interactive && isTerminal { 41 | return 42 | } 43 | if bloomParams.interactive { 44 | fmt.Println("Interactive mode: Enter a blank line [by pressing ENTER] to exit (values will not be stored otherwise).") 45 | } 46 | scanner := bufio.NewScanner(os.Stdin) 47 | for scanner.Scan() { 48 | line := scanner.Text() 49 | if line == "" && bloomParams.interactive { 50 | break 51 | } 52 | if bloomParams.split { 53 | values := strings.Split(line, bloomParams.delimiter) 54 | for i, value := range values { 55 | j := i - len(values) 56 | 57 | if len(bloomParams.fields) > 0 { 58 | if !contains(bloomParams.fields, i) && !contains(bloomParams.fields, j) { 59 | continue 60 | } 61 | } 62 | filter.Add([]byte(value)) 63 | } 64 | } else { 65 | filter.Add([]byte(line)) 66 | } 67 | } 68 | } 69 | 70 | func readInputIntoData(filter *bloom.BloomFilter, bloomParams BloomParams) { 71 | //we determine if the program is run interactively or within a pipe 72 | stat, _ := os.Stdin.Stat() 73 | var isTerminal = (stat.Mode() & os.ModeCharDevice) != 0 74 | //if we are not in an interactive session and this is a terminal, we quit 75 | if !bloomParams.interactive && isTerminal { 76 | return 77 | } 78 | if bloomParams.interactive { 79 | fmt.Println("Interactive mode: Enter a blank line [by pressing ENTER] to exit (values will not be stored otherwise).") 80 | } 
81 | scanner := bufio.NewScanner(os.Stdin) 82 | dataBuffer := bytes.NewBuffer([]byte("")) 83 | for scanner.Scan() { 84 | line := scanner.Bytes() 85 | if len(line) == 0 && bloomParams.interactive { 86 | break 87 | } 88 | dataBuffer.Write(line) 89 | dataBuffer.Write([]byte("\n")) 90 | } 91 | filter.Data = dataBuffer.Bytes() 92 | } 93 | 94 | func insertIntoFilter(path string, bloomParams BloomParams) { 95 | filter, err := bloom.LoadFilter(path, bloomParams.gzip) 96 | if err != nil { 97 | exitWithError(err.Error()) 98 | } 99 | readValuesIntoFilter(filter, bloomParams) 100 | err = bloom.WriteFilter(filter, path, bloomParams.gzip) 101 | if err != nil { 102 | exitWithError(err.Error()) 103 | } 104 | } 105 | 106 | func updateFilterData(path string, bloomParams BloomParams) { 107 | filter, err := bloom.LoadFilter(path, bloomParams.gzip) 108 | if err != nil { 109 | exitWithError(err.Error()) 110 | } 111 | readInputIntoData(filter, bloomParams) 112 | err = bloom.WriteFilter(filter, path, bloomParams.gzip) 113 | if err != nil { 114 | exitWithError(err.Error()) 115 | } 116 | } 117 | 118 | func getFilterData(path string, bloomParams BloomParams) { 119 | filter, err := bloom.LoadFilter(path, bloomParams.gzip) 120 | if err != nil { 121 | exitWithError(err.Error()) 122 | } 123 | fmt.Print(string(filter.Data)) 124 | } 125 | 126 | func contains(s []int, e int) bool { 127 | for _, a := range s { 128 | if a == e { 129 | return true 130 | } 131 | } 132 | return false 133 | } 134 | 135 | func checkAgainstFilter(path string, bloomParams BloomParams) { 136 | filter, err := bloom.LoadFilter(path, bloomParams.gzip) 137 | if err != nil { 138 | exitWithError(err.Error()) 139 | } 140 | scanner := bufio.NewScanner(os.Stdin) 141 | if bloomParams.interactive { 142 | fmt.Println("Interactive mode: Enter a blank line [by pressing ENTER] to exit.") 143 | } 144 | for scanner.Scan() { 145 | line := scanner.Text() 146 | if line == "" && bloomParams.interactive { 147 | break 148 | } 149 | var 
valuesToCheck []string 150 | if bloomParams.split { 151 | valuesToCheck = strings.Split(line, bloomParams.delimiter) 152 | } else { 153 | valuesToCheck = make([]string, 1) 154 | valuesToCheck[0] = line 155 | } 156 | printed := false 157 | prefix := "" 158 | if bloomParams.interactive { 159 | prefix = ">" 160 | } 161 | for i, value := range valuesToCheck { 162 | j := i - len(valuesToCheck) 163 | //we only check fields that are in the "fields" parameters (if defined) 164 | if len(bloomParams.fields) > 0 { 165 | if !contains(bloomParams.fields, i) && !contains(bloomParams.fields, j) { 166 | continue 167 | } 168 | } 169 | 170 | if filter.Check([]byte(value)) { 171 | if bloomParams.printEachMatch { 172 | fmt.Printf("%s%s\n", prefix, value) 173 | } else { 174 | if !printed { 175 | if len(bloomParams.printFields) > 0 { 176 | values := make([]string, 0, len(bloomParams.printFields)) 177 | for _, i := range bloomParams.printFields { 178 | j := i 179 | if j < 0 { 180 | j = j + len(valuesToCheck) 181 | } 182 | if j >= len(valuesToCheck) || j < 0 { 183 | continue 184 | } 185 | values = append(values, valuesToCheck[j]) 186 | } 187 | fmt.Printf("%s%s\n", prefix, strings.Join(values, bloomParams.delimiter)) 188 | } else { 189 | fmt.Printf("%s%s\n", prefix, line) 190 | } 191 | } 192 | printed = true 193 | } 194 | } 195 | } 196 | } 197 | } 198 | 199 | func printStats(path string, bloomParams BloomParams) { 200 | filter, err := bloom.LoadFilter(path, bloomParams.gzip) 201 | if err != nil { 202 | exitWithError(err.Error()) 203 | } 204 | fmt.Printf("File:\t\t\t%s\n", path) 205 | fmt.Printf("Capacity:\t\t%d\n", filter.MaxNumElements()) 206 | fmt.Printf("Elements present:\t%d\n", filter.N) 207 | fmt.Printf("FP probability:\t\t%.2e\n", filter.FalsePositiveProb()) 208 | fmt.Printf("Bits:\t\t\t%d\n", filter.NumBits()) 209 | fmt.Printf("Hash functions:\t\t%d\n", filter.NumHashFuncs()) 210 | } 211 | 212 | func createFilter(path string, n uint64, p float64, bloomParams BloomParams) { 213 | 
filter := bloom.Initialize(n, p) 214 | readValuesIntoFilter(&filter, bloomParams) 215 | err := bloom.WriteFilter(&filter, path, bloomParams.gzip) 216 | if err != nil { 217 | exitWithError(err.Error()) 218 | } 219 | } 220 | 221 | func joinFilters(path string, pathToAdd string, bloomParams BloomParams) { 222 | filter, err := bloom.LoadFilter(path, bloomParams.gzip) 223 | if err != nil { 224 | exitWithError(err.Error()) 225 | } 226 | filter2, err := bloom.LoadFilter(pathToAdd, bloomParams.gzip) 227 | if err != nil { 228 | exitWithError(err.Error()) 229 | } 230 | err = filter.Join(filter2) 231 | if err != nil { 232 | exitWithError(err.Error()) 233 | } 234 | err = bloom.WriteFilter(filter, path, bloomParams.gzip) 235 | if err != nil { 236 | exitWithError(err.Error()) 237 | } 238 | } 239 | 240 | func parseFieldIndexes(s string) ([]int, error) { 241 | fields := strings.Split(s, ",") 242 | fieldNumbers := make([]int, len(fields)) 243 | for i, field := range fields { 244 | num, err := strconv.Atoi(field) 245 | if err != nil { 246 | return nil, err 247 | } 248 | fieldNumbers[i] = num 249 | } 250 | return fieldNumbers, nil 251 | } 252 | 253 | func parseBloomParams(c *cli.Context) BloomParams { 254 | var bloomParams BloomParams 255 | var err error 256 | bloomParams.gzip = c.GlobalBool("gzip") 257 | bloomParams.interactive = c.GlobalBool("interactive") 258 | bloomParams.split = c.GlobalBool("split") 259 | bloomParams.delimiter = c.GlobalString("delimiter") 260 | bloomParams.printEachMatch = c.GlobalBool("each") 261 | if c.GlobalString("fields") != "" { 262 | bloomParams.fields, err = parseFieldIndexes(c.GlobalString("fields")) 263 | if err != nil { 264 | exitWithError(err.Error()) 265 | } 266 | } 267 | if c.GlobalString("print-fields") != "" { 268 | bloomParams.printFields, err = parseFieldIndexes(c.GlobalString("print-fields")) 269 | if err != nil { 270 | exitWithError(err.Error()) 271 | } 272 | //if printFields is set we also set printEachMatch 273 | if 
len(bloomParams.printFields) > 0 { 274 | bloomParams.printEachMatch = false 275 | } 276 | } 277 | return bloomParams 278 | } 279 | 280 | func main() { 281 | 282 | app := cli.NewApp() 283 | app.Name = "Bloom Filter" 284 | app.Usage = "Utility to work with bloom filters" 285 | app.Flags = []cli.Flag{ 286 | cli.BoolFlag{ 287 | Name: "gzip, gz", 288 | Usage: "compress bloom file with gzip", 289 | }, 290 | cli.BoolFlag{ 291 | Name: "interactive, i", 292 | Usage: "interactively add values to the filter", 293 | }, 294 | cli.BoolFlag{ 295 | Name: "split, s", 296 | Usage: "split the input string", 297 | }, 298 | cli.BoolFlag{ 299 | Name: "each, e", 300 | Usage: "print each match of a split string individually", 301 | }, 302 | cli.StringFlag{ 303 | Name: "delimiter, d", 304 | Value: ",", 305 | Usage: "delimiter to use for splitting", 306 | }, 307 | cli.StringFlag{ 308 | Name: "fields, f", 309 | Value: "", 310 | Usage: "fields of split output to use in filter (a single number or a comma-separated list of numbers, zero-indexed)", 311 | }, 312 | cli.StringFlag{ 313 | Name: "print-fields, pf", 314 | Value: "", 315 | Usage: "fields of split output to print for a successful match (a single number or a comma-separated list of numbers, zero-indexed).", 316 | }, 317 | } 318 | app.Commands = []cli.Command{ 319 | { 320 | Name: "create", 321 | Aliases: []string{"cr"}, 322 | Flags: []cli.Flag{ 323 | cli.Float64Flag{Name: "p", Value: 0.01, Usage: "The desired false positive probability."}, 324 | cli.Uint64Flag{Name: "n", Value: 10000, Usage: "The desired capacity."}, 325 | }, 326 | Usage: "Create a new Bloom filter and store it in the given filename.", 327 | Action: func(c *cli.Context) error { 328 | path := c.Args().First() 329 | bloomParams := parseBloomParams(c) 330 | if path == "" { 331 | exitWithError("No filename given.") 332 | } 333 | path, err := filepath.Abs(path) 334 | if err != nil { 335 | return err 336 | } 337 | n := c.Uint64("n") 338 | p := c.Float64("p") 339 | if n < 0 { 
340 | exitWithError("n cannot be negative.") 341 | } 342 | if p < 0 || p > 1 { 343 | exitWithError("p must be between 0 and 1.") 344 | } 345 | createFilter(path, n, p, bloomParams) 346 | return nil 347 | }, 348 | }, 349 | { 350 | Name: "insert", 351 | Aliases: []string{"i"}, 352 | Flags: []cli.Flag{}, 353 | Usage: "Inserts new values into an existing Bloom filter.", 354 | Action: func(c *cli.Context) error { 355 | path := c.Args().First() 356 | bloomParams := parseBloomParams(c) 357 | if path == "" { 358 | exitWithError("No filename given.") 359 | } 360 | path, err := filepath.Abs(path) 361 | if err != nil { 362 | return err 363 | } 364 | insertIntoFilter(path, bloomParams) 365 | return nil 366 | }, 367 | }, 368 | { 369 | Name: "join", 370 | Aliases: []string{"j", "merge", "m"}, 371 | Flags: []cli.Flag{}, 372 | Usage: "Joins two Bloom filters into one.", 373 | Action: func(c *cli.Context) error { 374 | if len(c.Args()) != 2 { 375 | exitWithError("Two filenames are required.") 376 | } 377 | bloomParams := parseBloomParams(c) 378 | path := c.Args().First() 379 | if path == "" { 380 | exitWithError("No first filename given.") 381 | } 382 | path, err := filepath.Abs(path) 383 | if err != nil { 384 | return err 385 | } 386 | pathToAdd := c.Args().Get(1) 387 | if pathToAdd == "" { 388 | exitWithError("No second filename given.") 389 | } 390 | pathToAdd, err = filepath.Abs(pathToAdd) 391 | if err != nil { 392 | return err 393 | } 394 | joinFilters(path, pathToAdd, bloomParams) 395 | return nil 396 | }, 397 | }, 398 | { 399 | Name: "check", 400 | Aliases: []string{"c"}, 401 | Flags: []cli.Flag{}, 402 | Usage: "Checks values against an existing Bloom filter.", 403 | Action: func(c *cli.Context) error { 404 | path := c.Args().First() 405 | bloomParams := parseBloomParams(c) 406 | if path == "" { 407 | exitWithError("No filename given.") 408 | } 409 | path, err := filepath.Abs(path) 410 | if err != nil { 411 | return err 412 | } 413 | checkAgainstFilter(path, bloomParams) 414 
| return nil 415 | }, 416 | }, 417 | { 418 | Name: "set-data", 419 | Aliases: []string{"sd"}, 420 | Flags: []cli.Flag{}, 421 | Usage: "Sets the data associated with the Bloom filter.", 422 | Action: func(c *cli.Context) error { 423 | path := c.Args().First() 424 | bloomParams := parseBloomParams(c) 425 | if path == "" { 426 | exitWithError("No filename given.") 427 | } 428 | path, err := filepath.Abs(path) 429 | if err != nil { 430 | return err 431 | } 432 | updateFilterData(path, bloomParams) 433 | return nil 434 | }, 435 | }, 436 | { 437 | Name: "get-data", 438 | Aliases: []string{"gd"}, 439 | Flags: []cli.Flag{}, 440 | Usage: "Prints the data associated with the Bloom filter.", 441 | Action: func(c *cli.Context) error { 442 | path := c.Args().First() 443 | bloomParams := parseBloomParams(c) 444 | if path == "" { 445 | exitWithError("No filename given.") 446 | } 447 | path, err := filepath.Abs(path) 448 | if err != nil { 449 | return err 450 | } 451 | getFilterData(path, bloomParams) 452 | return nil 453 | }, 454 | }, 455 | { 456 | Name: "show", 457 | Aliases: []string{"s"}, 458 | Flags: []cli.Flag{}, 459 | Usage: "Shows various details about a given Bloom filter.", 460 | Action: func(c *cli.Context) error { 461 | path := c.Args().First() 462 | bloomParams := parseBloomParams(c) 463 | if path == "" { 464 | exitWithError("No filename given.") 465 | } 466 | path, err := filepath.Abs(path) 467 | if err != nil { 468 | return err 469 | } 470 | printStats(path, bloomParams) 471 | return nil 472 | }, 473 | }, 474 | } 475 | app.Version = "0.2.4" 476 | 477 | app.Run(os.Args) 478 | } 479 | -------------------------------------------------------------------------------- /bloom_test.go: -------------------------------------------------------------------------------- 1 | // DCSO go bloom filter 2 | // Copyright (c) 2017, DCSO GmbH 3 | 4 | package bloom 5 | 6 | import ( 7 | "bytes" 8 | "io/ioutil" 9 | "log" 10 | "math" 11 | "math/rand" 12 | "os" 13 | "path/filepath" 14 | 
"strings" 15 | "testing" 16 | ) 17 | 18 | func TestFingerprinting(t *testing.T) { 19 | filter := Initialize(100000, 0.01) 20 | fp := make([]uint64, 7) 21 | expected := [7]uint64{20311, 36825, 412501, 835777, 658914, 853361, 307361} 22 | filter.Fingerprint([]byte("bar"), fp) 23 | for i, v := range fp { 24 | if v != expected[i] { 25 | t.Errorf("Wrong fingerprint: %d vs. %d", v, expected[i]) 26 | break 27 | } 28 | } 29 | } 30 | 31 | func TestInitialization(t *testing.T) { 32 | filter := Initialize(10000, 0.001) 33 | if filter.k != 10 { 34 | t.Error("k does not match expectation!") 35 | } 36 | if filter.m != 143775 { 37 | t.Error("m does not match expectation: ", filter.m) 38 | } 39 | if filter.M != uint64(math.Ceil(float64(filter.m)/64)) { 40 | t.Error("M does not match expectation: ", filter.M) 41 | } 42 | for i := uint64(0); i < filter.M; i++ { 43 | if filter.v[i] != 0 { 44 | t.Error("Filter value is not initialized to zero!") 45 | } 46 | } 47 | } 48 | 49 | func checkFilters(a BloomFilter, b BloomFilter, t *testing.T) bool { 50 | if b.n != a.n || 51 | b.p != a.p || 52 | b.k != a.k || 53 | b.m != a.m || 54 | b.M != a.M || 55 | !bytes.Equal(b.Data, a.Data) { 56 | return false 57 | } 58 | for i := uint64(0); i < a.M; i++ { 59 | if a.v[i] != b.v[i] { 60 | return false 61 | } 62 | } 63 | return true 64 | } 65 | 66 | func serializeToBuffer(filter BloomFilter) (*BloomFilter, error) { 67 | var buf bytes.Buffer 68 | filter.Write(&buf) 69 | var newFilter BloomFilter 70 | newFilter.Read(&buf) 71 | return &newFilter, nil 72 | } 73 | 74 | func serializeToDisk(filter BloomFilter) (*BloomFilter, error) { 75 | tempFile, err := ioutil.TempFile("", "filter") 76 | if err != nil { 77 | return nil, err 78 | } 79 | defer os.Remove(tempFile.Name()) 80 | filter.Write(tempFile) 81 | tempFile.Sync() 82 | tempFile.Seek(0, 0) 83 | var newFilter BloomFilter 84 | err = newFilter.Read(tempFile) 85 | if err != nil { 86 | return nil, err 87 | } 88 | return &newFilter, nil 89 | } 90 | 91 | func 
TestSerialization(t *testing.T) { 92 | capacity := uint64(100000) 93 | p := float64(0.01) 94 | samples := uint64(1000) 95 | filter, _ := GenerateExampleFilter(capacity, p, samples) 96 | 97 | newFilter, err := serializeToBuffer(filter) 98 | if err != nil { 99 | t.Error("Cannot serialize filter to buffer!") 100 | return 101 | } 102 | 103 | if !checkFilters(filter, *newFilter, t) { 104 | t.Error("Filters do not match!") 105 | } 106 | 107 | newFilter, err = serializeToDisk(filter) 108 | 109 | if err != nil { 110 | t.Error("Cannot serialize filter to file!") 111 | return 112 | } 113 | 114 | if !checkFilters(filter, *newFilter, t) { 115 | t.Error("Filters do not match!") 116 | } 117 | 118 | filter.Add(GenerateTestValue(100)) 119 | newFilter.Add(GenerateTestValue(100)) 120 | newFilter, err = serializeToDisk(filter) 121 | if err != nil { 122 | t.Error("Cannot serialize filter to disk!") 123 | return 124 | } 125 | 126 | if !checkFilters(filter, *newFilter, t) { 127 | t.Error("Filters do not match!") 128 | } 129 | 130 | filter.Add(GenerateTestValue(100)) 131 | newFilter.Add(GenerateTestValue(100)) 132 | newFilter, err = serializeToDisk(filter) 133 | if err != nil { 134 | t.Error("Cannot serialize filter to disk!") 135 | return 136 | } 137 | 138 | if !checkFilters(filter, *newFilter, t) { 139 | t.Error("Filters do not match!") 140 | } 141 | 142 | checkFilters(filter, *newFilter, t) 143 | } 144 | 145 | func TestSerializationToDisk(t *testing.T) { 146 | capacity := uint64(100000) 147 | p := float64(0.001) 148 | samples := uint64(1000) 149 | filter, _ := GenerateExampleFilter(capacity, p, samples) 150 | 151 | var buf bytes.Buffer 152 | 153 | filter.Write(&buf) 154 | 155 | var newFilter BloomFilter 156 | 157 | newFilter.Read(&buf) 158 | 159 | checkFilters(filter, newFilter, t) 160 | } 161 | 162 | func TestSerializationWriteFail(t *testing.T) { 163 | capacity := uint64(100000) 164 | p := float64(0.001) 165 | samples := uint64(1000) 166 | filter, _ := 
GenerateExampleFilter(capacity, p, samples) 167 | 168 | dir, err := ioutil.TempDir("", "bloomtest") 169 | if err != nil { 170 | log.Fatal(err) 171 | } 172 | defer os.RemoveAll(dir) 173 | 174 | tmpfn := filepath.Join(dir, "tmpfile") 175 | tmpfile, err := os.OpenFile(tmpfn, os.O_CREATE|os.O_RDONLY, 0000) 176 | if err != nil { 177 | t.Fatal(err) 178 | } 179 | defer tmpfile.Close() 180 | 181 | err = filter.Write(tmpfile) 182 | if err == nil { 183 | t.Error("writing to read-only file should fail") 184 | } 185 | } 186 | 187 | func TestSerializationReadFail(t *testing.T) { 188 | var newFilter BloomFilter 189 | 190 | dir, err := ioutil.TempDir("", "bloomtest") 191 | if err != nil { 192 | log.Fatal(err) 193 | } 194 | defer os.RemoveAll(dir) 195 | 196 | tmpfn := filepath.Join(dir, "tmpfile") 197 | tmpfile, err := os.OpenFile(tmpfn, os.O_CREATE, 0777) 198 | if err != nil { 199 | t.Fatal(err) 200 | } 201 | defer tmpfile.Close() 202 | 203 | err = newFilter.Read(tmpfile) 204 | if err == nil { 205 | t.Error("reading from empty file should fail") 206 | } 207 | } 208 | 209 | func GenerateTestValue(length uint64) []byte { 210 | value := make([]byte, length) 211 | for i := uint64(0); i < length; i++ { 212 | value[i] = byte(rand.Int() % 256) 213 | } 214 | return value 215 | } 216 | 217 | func GenerateExampleFilter(capacity uint64, p float64, samples uint64) (BloomFilter, [][]byte) { 218 | filter := Initialize(capacity, p) 219 | filter.Data = []byte("foobar") 220 | testValues := make([][]byte, 0, samples) 221 | for i := uint64(0); i < samples; i++ { 222 | testValue := GenerateTestValue(100) 223 | testValues = append(testValues, testValue) 224 | filter.Add(testValue) 225 | } 226 | return filter, testValues 227 | } 228 | 229 | func GenerateDisjointExampleFilter(capacity uint64, p float64, samples uint64, other BloomFilter) (BloomFilter, [][]byte) { 230 | filter := Initialize(capacity, p) 231 | testValues := make([][]byte, 0, samples) 232 | for i := uint64(0); i < samples; { 233 | 
testValue := GenerateTestValue(100) 234 | if !other.Check(testValue) { 235 | testValues = append(testValues, testValue) 236 | filter.Add(testValue) 237 | i++ 238 | } 239 | } 240 | return filter, testValues 241 | } 242 | 243 | //This tests the checking of values against a given filter 244 | func TestChecking(t *testing.T) { 245 | capacity := uint64(100000) 246 | p := float64(0.001) 247 | samples := uint64(100000) 248 | filter, testValues := GenerateExampleFilter(capacity, p, samples) 249 | fingerprint := make([]uint64, filter.k) 250 | for _, value := range testValues { 251 | filter.Fingerprint(value, fingerprint) 252 | if !filter.CheckFingerprint(fingerprint) { 253 | t.Error("Did not find test value in filter!") 254 | } 255 | } 256 | } 257 | 258 | //This tests the checking of values against a given filter after resetting it 259 | func TestReset(t *testing.T) { 260 | capacity := uint64(100000) 261 | p := float64(0.001) 262 | samples := uint64(100000) 263 | filter, testValues := GenerateExampleFilter(capacity, p, samples) 264 | filter.Reset() 265 | fingerprint := make([]uint64, filter.k) 266 | for _, value := range testValues { 267 | filter.Fingerprint(value, fingerprint) 268 | if filter.CheckFingerprint(fingerprint) { 269 | t.Error("Did not find test value in filter!") 270 | } 271 | } 272 | } 273 | 274 | //This tests the checking of values against a given filter 275 | //see https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives 276 | func TestFalsePositives(t *testing.T) { 277 | capacity := uint64(10000) 278 | p := float64(0.001) 279 | fillingFactor := 0.9 280 | N := uint64(float64(capacity) * fillingFactor) 281 | filter, _ := GenerateExampleFilter(capacity, p, N) 282 | pAcceptable := math.Pow(1-math.Exp(-float64(filter.k)*float64(N)/float64(filter.m)), float64(filter.k)) 283 | fingerprint := make([]uint64, filter.k) 284 | cnt := 0.0 285 | matches := 0.0 286 | for { 287 | cnt++ 288 | value := GenerateTestValue(100) 289 | filter.Fingerprint(value, 
fingerprint) 290 | if filter.CheckFingerprint(fingerprint) { 291 | matches++ 292 | } 293 | if cnt > float64(capacity)*10 { 294 | break 295 | } 296 | } 297 | //this might still fail sometimes... 298 | //we allow for a probability that is two times higher than the normally acceptable probability 299 | if matches/cnt > pAcceptable*2 { 300 | t.Error("False positive probability is too high at ", matches/cnt*100, "% vs ", pAcceptable*100, "%") 301 | } 302 | } 303 | 304 | func TestJoiningRegularMisdimensioned(t *testing.T) { 305 | a := Initialize(100000, 0.0001) 306 | b := Initialize(10000, 0.0001) 307 | err := a.Join(&b) 308 | if err == nil { 309 | t.Error("joining filters with different capacity should fail") 310 | } 311 | if !strings.Contains(err.Error(), "different dimensions") { 312 | t.Error("wrong error message returned") 313 | } 314 | a = Initialize(100000, 0.0001) 315 | b = Initialize(100000, 0.001) 316 | err = a.Join(&b) 317 | if err == nil { 318 | t.Error("joining filters with different FP prob should fail") 319 | } 320 | if !strings.Contains(err.Error(), "different dimensions") { 321 | t.Error("wrong error message returned") 322 | } 323 | a = Initialize(100000, 0.0001) 324 | b = Initialize(100000, 0.0001) 325 | b.k = 1 326 | err = a.Join(&b) 327 | if err == nil { 328 | t.Error("joining filters with different number of hash funcs should fail") 329 | } 330 | if !strings.Contains(err.Error(), "different dimensions") { 331 | t.Error("wrong error message returned") 332 | } 333 | a = Initialize(100000, 0.0001) 334 | b = Initialize(100000, 0.0001) 335 | b.m = 1 336 | err = a.Join(&b) 337 | if err == nil { 338 | t.Error("joining filters with different number of bits should fail") 339 | } 340 | if !strings.Contains(err.Error(), "different dimensions") { 341 | t.Error("wrong error message returned") 342 | } 343 | a = Initialize(100000, 0.0001) 344 | b = Initialize(100000, 0.0001) 345 | b.M = 1 346 | err = a.Join(&b) 347 | if err == nil { 348 | t.Error("joining filters 
with different int array size should fail") 349 | } 350 | if !strings.Contains(err.Error(), "different dimensions") { 351 | t.Error("wrong error message returned") 352 | } 353 | } 354 | 355 | func TestAccessors(t *testing.T) { 356 | a, _ := GenerateExampleFilter(100000, 0.0001, 10000) 357 | if a.MaxNumElements() != 100000 { 358 | t.Error("unexpected capacity in filter") 359 | } 360 | if a.NumBits() != 1917011 { 361 | t.Error("unexpected number of bits in filter") 362 | } 363 | if a.NumHashFuncs() != 14 { 364 | t.Error("unexpected number of hash funcs in filter") 365 | } 366 | if a.FalsePositiveProb() != 0.0001 { 367 | t.Error("unexpected FP prob in filter") 368 | } 369 | } 370 | 371 | func TestJoiningRegular(t *testing.T) { 372 | a, aval := GenerateExampleFilter(100000, 0.0001, 10000) 373 | b, bval := GenerateDisjointExampleFilter(100000, 0.0001, 20000, a) 374 | for _, v := range bval { 375 | if a.Check(v) { 376 | t.Errorf("value not missing in joined filter: %s", string(v)) 377 | } 378 | } 379 | if a.N != 10000 { 380 | t.Error("unexpected number of elements in filter") 381 | } 382 | if b.N != 20000 { 383 | t.Error("unexpected number of elements in filter") 384 | } 385 | err := a.Join(&b) 386 | if a.N != 30000 { 387 | t.Errorf("unexpected number of elements in filter") 388 | } 389 | if err != nil { 390 | t.Fatal(err) 391 | } 392 | for _, v := range aval { 393 | if !a.Check(v) { 394 | t.Errorf("value not found in joined filter: %s", string(v)) 395 | } 396 | } 397 | for _, v := range bval { 398 | if !a.Check(v) { 399 | t.Errorf("value not found in joined filter: %s", string(v)) 400 | } 401 | } 402 | } 403 | 404 | //This benchmarks the checking of values against a given filter 405 | func BenchmarkChecking(b *testing.B) { 406 | capacity := uint64(1e9) 407 | p := float64(0.001) 408 | samples := uint64(100000) 409 | filter, testValues := GenerateExampleFilter(capacity, p, samples) 410 | fingerprint := make([]uint64, filter.k) 411 | b.ResetTimer() 412 | for i := 0; i < 
b.N; i++ { 413 | value := testValues[rand.Int()%len(testValues)] 414 | filter.Fingerprint(value, fingerprint) 415 | if !filter.CheckFingerprint(fingerprint) { 416 | b.Error("Did not find test value in filter!") 417 | } 418 | } 419 | } 420 | 421 | //This benchmarks the checking without using a fixed fingerprint variable (instead a temporary variable is created each time) 422 | func BenchmarkSimpleChecking(b *testing.B) { 423 | capacity := uint64(1e9) 424 | p := float64(0.001) 425 | samples := uint64(100000) 426 | filter, testValues := GenerateExampleFilter(capacity, p, samples) 427 | b.ResetTimer() 428 | for i := 0; i < b.N; i++ { 429 | value := testValues[rand.Int()%len(testValues)] 430 | if !filter.Check(value) { 431 | b.Error("Did not find test value in filter!") 432 | } 433 | } 434 | } 435 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/DCSO/bloom 2 | 3 | go 1.15 4 | 5 | require gopkg.in/urfave/cli.v1 v1.20.0 6 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | gopkg.in/urfave/cli.v1 v1.20.0 h1:NdAVW6RYxDif9DhDHaAortIu956m2c0v+09AZBPTbE0= 2 | gopkg.in/urfave/cli.v1 v1.20.0/go.mod h1:vuBzUtMdQeixQj8LVd+/98pzhxNGQoyuPBlsXHOQNO0= 3 | -------------------------------------------------------------------------------- /io.go: -------------------------------------------------------------------------------- 1 | // DCSO go bloom filter 2 | // Copyright (c) 2017, DCSO GmbH 3 | 4 | package bloom 5 | 6 | import ( 7 | "bufio" 8 | "bytes" 9 | gz "compress/gzip" 10 | "io" 11 | "os" 12 | ) 13 | 14 | // LoadFromBytes reads a binary Bloom filter representation from a byte array 15 | // and returns a BloomFilter struct pointer based on it. 16 | // If 'gzip' is true, then compressed input will be expected. 
17 | func LoadFromBytes(input []byte, gzip bool) (*BloomFilter, error) { 18 | return LoadFromReader(bytes.NewReader(input), gzip) 19 | } 20 | 21 | // LoadFilter reads a binary Bloom filter representation from a file 22 | // and returns a BloomFilter struct pointer based on it. 23 | // If 'gzip' is true, then compressed input will be expected. 24 | func LoadFilter(path string, gzip bool) (*BloomFilter, error) { 25 | file, err := os.Open(path) 26 | if err != nil { 27 | return nil, err 28 | } 29 | defer file.Close() 30 | 31 | return LoadFromReader(file, gzip) 32 | } 33 | 34 | // LoadFromReader reads a binary Bloom filter representation from an io.Reader 35 | // and returns a BloomFilter struct pointer based on it. 36 | // If 'gzip' is true, then compressed input will be expected. 37 | func LoadFromReader(inReader io.Reader, gzip bool) (*BloomFilter, error) { 38 | var err error 39 | var reader io.Reader 40 | var gzipReader *gz.Reader 41 | var ioReader *bufio.Reader 42 | 43 | if gzip { 44 | gzipReader, err = gz.NewReader(inReader) 45 | if err != nil { 46 | return nil, err 47 | } 48 | defer gzipReader.Close() 49 | reader = gzipReader 50 | } else { 51 | ioReader = bufio.NewReader(inReader) 52 | reader = ioReader 53 | } 54 | 55 | var filter BloomFilter 56 | if err = filter.Read(reader); err != nil { 57 | return nil, err 58 | } 59 | 60 | return &filter, nil 61 | } 62 | 63 | // WriteFilter writes a binary Bloom filter representation for a given struct 64 | // to a file. If 'gzip' is true, then a compressed file will be written. 
65 | func WriteFilter(filter *BloomFilter, path string, gzip bool) error { 66 | 67 | file, err := os.Create(path) 68 | 69 | if err != nil { 70 | return err 71 | } 72 | 73 | defer file.Close() 74 | 75 | file.Seek(0, 0) 76 | 77 | var writer io.Writer 78 | var gzipWriter *gz.Writer 79 | var ioWriter *bufio.Writer 80 | 81 | if gzip { 82 | gzipWriter = gz.NewWriter(file) 83 | defer gzipWriter.Close() 84 | writer = gzipWriter 85 | } else { 86 | ioWriter = bufio.NewWriter(file) 87 | writer = ioWriter 88 | } 89 | 90 | err = filter.Write(writer) 91 | 92 | if err != nil { 93 | return err 94 | } 95 | 96 | if gzip { 97 | gzipWriter.Flush() 98 | } else { 99 | ioWriter.Flush() 100 | } 101 | 102 | file.Sync() 103 | 104 | return nil 105 | } 106 | -------------------------------------------------------------------------------- /io_test.go: -------------------------------------------------------------------------------- 1 | // DCSO go bloom filter 2 | // Copyright (c) 2017, DCSO GmbH 3 | 4 | package bloom 5 | 6 | import ( 7 | "io/ioutil" 8 | "os" 9 | "regexp" 10 | "testing" 11 | ) 12 | 13 | func checkResults(t *testing.T, bf *BloomFilter) { 14 | for _, v := range []string{"foo", "bar", "baz"} { 15 | if !bf.Check([]byte(v)) { 16 | t.Fatalf("value %s expected in filter but wasn't found", v) 17 | } 18 | } 19 | if bf.Check([]byte("")) { 20 | t.Fatal("empty value not expected in filter but was found") 21 | } 22 | if bf.Check([]byte("12345")) { 23 | t.Fatal("missing value not expected in filter but was found") 24 | } 25 | } 26 | 27 | func TestFromReaderFile(t *testing.T) { 28 | f, err := os.Open("testdata/test.bloom") 29 | if err != nil { 30 | t.Fatal(err) 31 | } 32 | defer f.Close() 33 | bf, err := LoadFromReader(f, false) 34 | if err != nil { 35 | t.Fatal(err) 36 | } 37 | checkResults(t, bf) 38 | } 39 | 40 | func TestFromReaderCorruptFile(t *testing.T) { 41 | f, err := os.Open("testdata/broken.bloom") 42 | if err != nil { 43 | t.Fatal(err) 44 | } 45 | defer f.Close() 46 | _, err = 
LoadFromReader(f, false) 47 | if err == nil { 48 | t.Fatal("error expected") 49 | } 50 | r, _ := regexp.Compile("is too high") 51 | if !r.MatchString(err.Error()) { 52 | t.Fatalf("wrong error message: %s", err.Error()) 53 | } 54 | } 55 | 56 | func testFromSerialized(t *testing.T, gzip bool) { 57 | bf := Initialize(100, 0.0001) 58 | for _, v := range []string{"foo", "bar", "baz"} { 59 | bf.Add([]byte(v)) 60 | } 61 | tmpfile, err := ioutil.TempFile("", "test") 62 | if err != nil { 63 | t.Fatal(err) 64 | } 65 | defer os.Remove(tmpfile.Name()) 66 | 67 | err = WriteFilter(&bf, tmpfile.Name(), gzip) 68 | if err != nil { 69 | t.Fatal(err) 70 | } 71 | 72 | loadedBf, err := LoadFilter(tmpfile.Name(), gzip) 73 | if err != nil { 74 | t.Fatal(err) 75 | } 76 | checkResults(t, loadedBf) 77 | } 78 | 79 | func TestFromSerialized(t *testing.T) { 80 | testFromSerialized(t, false) 81 | } 82 | 83 | func TestFromSerializedZip(t *testing.T) { 84 | testFromSerialized(t, true) 85 | } 86 | 87 | func TestFromReaderFileZip(t *testing.T) { 88 | f, err := os.Open("testdata/test.bloom.gz") 89 | if err != nil { 90 | t.Fatal(err) 91 | } 92 | defer f.Close() 93 | bf, err := LoadFromReader(f, true) 94 | if err != nil { 95 | t.Fatal(err) 96 | } 97 | checkResults(t, bf) 98 | } 99 | 100 | func TestFromBytes(t *testing.T) { 101 | testBytes, err := ioutil.ReadFile("testdata/test.bloom") 102 | if err != nil { 103 | t.Fatal(err) 104 | } 105 | bf, err := LoadFromBytes(testBytes, false) 106 | if err != nil { 107 | t.Fatal(err) 108 | } 109 | checkResults(t, bf) 110 | } 111 | 112 | func TestFromFile(t *testing.T) { 113 | bf, err := LoadFilter("testdata/test.bloom", false) 114 | if err != nil { 115 | t.Fatal(err) 116 | } 117 | checkResults(t, bf) 118 | } 119 | -------------------------------------------------------------------------------- /testdata/broken.bloom: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/DCSO/bloom/5a226da0429526750d1ed733db5c5b3afa39e808/testdata/broken.bloom -------------------------------------------------------------------------------- /testdata/test-input.txt: -------------------------------------------------------------------------------- 1 | foo 2 | bar 3 | baz 4 | -------------------------------------------------------------------------------- /testdata/test.bloom: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DCSO/bloom/5a226da0429526750d1ed733db5c5b3afa39e808/testdata/test.bloom -------------------------------------------------------------------------------- /testdata/test.bloom.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DCSO/bloom/5a226da0429526750d1ed733db5c5b3afa39e808/testdata/test.bloom.gz --------------------------------------------------------------------------------