├── .github
    └── workflows
    │   ├── go.yml
    │   └── release.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── Makefile
├── README.md
├── bloom.go
├── bloom
    └── manage.go
├── bloom_test.go
├── go.mod
├── go.sum
├── io.go
├── io_test.go
└── testdata
    ├── broken.bloom
    ├── test-input.txt
    ├── test.bloom
    └── test.bloom.gz


/.github/workflows/go.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Builds and tests the module on every push and pull request.
 3 | name: Go build
 4 | 
 5 | on:
 6 |   - push
 7 |   - pull_request
 8 | 
 9 | jobs:
10 |   build:
11 |     name: "Go build"
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - uses: actions/checkout@v2
15 | 
16 |       - name: Set up Go
17 |         uses: actions/setup-go@v2
18 |         with:
19 |           # Quoted on purpose: an unquoted version is a YAML float, so a
20 |           # future bump to 1.20 would silently become 1.2.
21 |           go-version: "1.16"
22 | 
23 |       - name: Build
24 |         run: go build -v ./...
25 | 
26 |       - name: Test
27 |         run: go test -v ./...
25 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Builds the release binaries and publishes them whenever a v* tag is pushed.
 3 | name: "Go tagged release"
 4 | 
 5 | on:
 6 |   push:
 7 |     tags:
 8 |       - "v*"
 9 | 
10 | jobs:
11 |   tagged-release:
12 |     name: "Tagged Release"
13 |     runs-on: ubuntu-latest
14 | 
15 |     steps:
16 |       - uses: actions/checkout@v2
17 | 
18 |       - name: Set up Go
19 |         uses: actions/setup-go@v2
20 |         with:
21 |           # Quoted on purpose: an unquoted version is a YAML float, so a
22 |           # future bump to 1.20 would silently become 1.2.
23 |           go-version: "1.16"
24 | 
25 |       - name: Build
26 |         run: make release test
27 | 
28 |       # NOTE(review): "@latest" is a moving tag; consider pinning this
29 |       # third-party action to a fixed release, since it receives a token
30 |       # that can write releases.
31 |       - uses: "marvinpinto/action-automatic-releases@latest"
32 |         with:
33 |           repo_token: "${{ secrets.GITHUB_TOKEN }}"
34 |           prerelease: false
35 |           files: |
36 |             bloom_linux_amd64.bin
37 |             bloom_windows_amd64.exe


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.bloom.gz
2 | *.bloom
3 | build/*
4 | *.bin
5 | *.exe
6 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | ## v0.2.4 (2021-06-29)
 4 | 
 5 | - Add Go module support
 6 | - Switch CI/CD to GitHub Actions
 7 | - Update documentation
 8 | 
 9 | ## v0.2.3 (2019-01-10)
10 | 
11 | - Add test for fingerprint
12 | - Update documentation
13 | - Improve robustness regarding broken input files
14 | 
15 | ## v0.2.2 (2018-10-24)
16 | 
17 | - Remove dead code
18 | 
19 | ## v0.2.0 (2017-08-14)
20 | 
21 | - Add 'bloom join' command line tool
22 | - Make it possible to store arbitrary data in a Bloom filter (useful for
23 |   associating meta-data with a filter)
24 | - Remove HTTP test
25 | - Use scientific notation for FP probability
26 | - Add a 64-bit value reserved for the version bit and additional flags
27 | - Use 64-bit integers everywhere
28 | - Switched to new, more robust hashing scheme
29 | 
30 | ## v0.1.0 (2017-06-27)
31 | 
32 | - Initial open source release
33 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2017, DCSO Deutsche Cyber-Sicherheitsorganisation GmbH
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | * Neither the name of the DCSO Deutsche Cyber-Sicherheitsorganisation GmbH
15 |   nor the names of its contributors may be used to endorse or promote products
16 |   derived from this software without specific prior written permission.
17 | 
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | ## simple makefile to log workflow
 2 | # Every target here is a command, not a file, so all of them are phony.
 3 | .PHONY: all test clean build install bench release
 4 | 
 5 | GOFLAGS ?= $(GOFLAGS:)
 6 | 
 7 | all: install test
 8 | 
 9 | build:
10 | 	@go build $(GOFLAGS) ./...
11 | 
12 | install:
13 | 	@go get $(GOFLAGS) ./...
14 | 
15 | test: install
16 | 	@go test -cover $(GOFLAGS) ./...
17 | 
18 | bench: install
19 | 	@go test -run=NONE -bench=. $(GOFLAGS) ./...
20 | 
21 | clean:
22 | 	@go clean $(GOFLAGS) -i ./...
23 | 
24 | # Both binaries pin GOOS/GOARCH so that each artifact name matches its
25 | # contents regardless of the host the release is built on.
26 | release:
27 | 	@go get $(GOFLAGS) ./...
28 | 	@GOOS=linux GOARCH=amd64 go build -v -o bloom_linux_amd64.bin bloom/*
29 | 	GOOS=windows GOARCH=amd64 go build -v -o bloom_windows_amd64.exe bloom/*
30 | 
31 | ## EOF
29 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Bloom
  2 | 
  3 | ### A highly efficient bloom filter implementation for Go
  4 | 
  5 | [![GoDoc](https://godoc.org/github.com/DCSO/bloom?status.svg)](http://godoc.org/github.com/DCSO/bloom)
  6 | ![Build Status](https://github.com/DCSO/bloom/actions/workflows/go.yml/badge.svg)
  7 | 
  8 | Bloom is a simple tool that provides a very efficient implementation of Bloom filters for the Go language.
  9 | It provides a command line tool that can be used to easily create Bloom filters with desired capacity
 10 | and false positive probability. Values can be added to filters through standard input, which makes it
 11 | easy to use the tool in a pipeline workflow.
 12 | 
 13 | # Usage
 14 | 
 15 |     NAME:
 16 |        Bloom Filter - Utility to work with bloom filters
 17 | 
 18 |     USAGE:
 19 |        bloom [global options] command [command options] [arguments...]
 20 | 
 21 |     VERSION:
 22 |        0.2.2
 23 | 
 24 |     COMMANDS:
 25 |          create, cr         Create a new Bloom filter and store it in the given filename.
 26 |          insert, i          Inserts new values into an existing Bloom filter.
 27 |          join, j, merge, m  Joins two Bloom filters into one.
 28 |          check, c           Checks values against an existing Bloom filter.
 29 |          set-data, sd       Sets the data associated with the Bloom filter.
 30 |          get-data, gd       Prints the data associated with the Bloom filter.
 31 |          show, s            Shows various details about a given Bloom filter.
 32 |          help, h            Shows a list of commands or help for one command
 33 | 
 34 |     GLOBAL OPTIONS:
 35 |        --gzip, --gz                      compress bloom file with gzip
 36 |        --interactive, -i                 interactively add values to the filter
 37 |        --split, -s                       split the input string
 38 |        --each, -e                        print each match of a split string individually
 39 |        --delimiter value, -d value       delimiter to use for splitting (default: ",")
 40 |        --fields value, -f value          fields of split output to use in filter (a single number or a comma-separated list of numbers, zero-indexed)
 41 |        --print-fields value, --pf value  fields of split output to print for a successful match (a single number or a comma-separated list of numbers, zero-indexed).
 42 |        --help, -h                        show help
 43 |        --version, -v                     print the version
 44 | 
 45 | 
 46 | # Examples
 47 | 
 48 | To create a new bloom filter with a desired capacity and false positive probability, you can use the `create` command:
 49 | 
 50 |     #will create a gzipped Bloom filter with 100.000 capacity and a 0.1 % false positive probability
 51 |     bloom --gzip create -p 0.001 -n 100000 test.bloom.gz
 52 | 
 53 | To insert values, you can use the `insert` command and pipe some input to it (each line will be treated as one value):
 54 | 
 55 |     cat values | bloom --gzip insert test.bloom.gz
 56 | 
 57 | You can also interactively add values to the filter by specifying the `--interactive` command line option:
 58 | 
 59 |     bloom --gzip --interactive insert test.bloom.gz
 60 | 
 61 | To check if a given value or a list of values is in the filter, you can use the `check` command:
 62 | 
 63 |     cat values | bloom --gzip check test.bloom.gz
 64 | 
 65 | This will print every input value that is (probably) contained in the filter.
 66 | 
 67 | # Advanced Usage
 68 | 
 69 | Sometimes it is useful to attach additional information to a string that we want to check against the Bloom filter,
 70 | such as a timestamp or the original line content. To make passing along this additional information easier within
 71 | a shell context, the Bloom tool provides an option for splitting the input string by a given delimiter and checking
 72 | the filter against the resulting field values. Example:
 73 | 
 74 |     # will check the Bloom filter for the values foo, bar and baz
 75 |     echo "foo,bar,baz" | bloom -s check filter.bloom
 76 | 
 77 |     # uses a different delimiter (--ac5ba--)
 78 |     echo "foo--ac5ba--bar--ac5ba--baz" | bloom -d "--ac5ba--" -s check filter.bloom
 79 | 
 80 |     # will check the Bloom filter against the second field value only
 81 |     echo "foo,bar,baz" | bloom -f 1 -s check filter.bloom
 82 | 
 83 |     # will check the Bloom filter against the second and third field values only
 84 |     echo "foo,bar,baz" | bloom -f 1,2 -s check filter.bloom
 85 | 
 86 |     # will print one line for each field value that matched against the filter
 87 |     echo "foo,bar,baz" | bloom -e -s check filter.bloom
 88 | 
 89 |     # will print the last field value for each line whose fields matched against the filter
 90 |     echo "foo,bar,baz" | bloom -e -s --pf -1 check filter.bloom
 91 | 
 92 | This functionality is especially handy when using CSV data, as it allows you to filter CSV rows by checking individual
 93 | columns against the filter without having to use external tools to split and reassemble the lines.
 94 | 
 95 | # Installation
 96 | 
 97 | ## Installation on Debian-based systems
 98 | 
 99 | Debian [command line tool](https://tracker.debian.org/pkg/golang-github-dcso-bloom):
100 | 
101 |     sudo apt install golang-github-dcso-bloom-cli
102 | 
103 | ## Installation via `go get`:
104 | 
105 |     go get github.com/DCSO/bloom/...
106 | 
107 | ## Installation from source
108 | 
109 | These need to be run from within the `GOPATH` source directory for this project (e.g. `$GOPATH/src/github.com/DCSO/bloom`). To install the command line tool:
110 | 
111 |     make install
112 | 
113 | To run the tests:
114 | 
115 |     make test
116 | 
117 | To run the benchmarks:
118 | 
119 |     make bench
120 | 
121 | # Cross-Compiling
122 | 
123 | To compile a binary, simply specify the target architecture and go:
124 | 
125 |     #Windows, 64 bit
126 |     env GOOS=windows GOARCH=amd64 go build -v -o bloom.exe github.com/DCSO/bloom/bloom
127 |     #Windows, 32 bit
128 |     env GOOS=windows GOARCH=386 go build -v -o /tmp/bloom github.com/DCSO/bloom/bloom
129 | 


--------------------------------------------------------------------------------
/bloom.go:
--------------------------------------------------------------------------------
  1 | // DCSO go bloom filter
  2 | // Copyright (c) 2017, DCSO GmbH
  3 | 
  4 | //Implements a simple and highly efficient variant of the Bloom filter that uses only two hash functions.
  5 | 
  6 | package bloom
  7 | 
  8 | import (
  9 | 	"encoding/binary"
 10 | 	"errors"
 11 | 	"fmt"
 12 | 	"hash/fnv"
 13 | 	"io"
 14 | 	"io/ioutil"
 15 | 	"math"
 16 | )
 17 | 
 18 | // BloomFilter represents a Bloom filter, a data structure for quickly checking
 19 | // for set membership, with a specific desired capacity and false positive
 20 | // probability. The fields n, p, k and m are readable through accessor methods.
 21 | type BloomFilter struct {
 22 | 	// bit array, packed into 64-bit words (bit i lives in v[i/64] at position i%64)
 23 | 	v []uint64
 24 | 
 25 | 	// desired maximum number of elements (capacity)
 26 | 	n uint64
 27 | 
 28 | 	// desired false positive probability
 29 | 	p float64
 30 | 
 31 | 	// number of hash functions, i.e. the number of indexes in a fingerprint
 32 | 	k uint64
 33 | 
 34 | 	// number of bits in the filter
 35 | 	m uint64
 36 | 
 37 | 	// element counter; Add increments it when a value sets at least one new bit
 38 | 	N uint64
 39 | 
 40 | 	// number of 64-bit words in v, derived from m as ceil(m / 64)
 41 | 	M uint64
 42 | 
 43 | 	// arbitrary data attached to the filter; serialized after the bit array
 44 | 	Data []byte
 45 | }
 46 | 
 47 | // Read loads a filter from a reader object.
 48 | func (s *BloomFilter) Read(input io.Reader) error {
 49 | 	bs8 := make([]byte, 8)
 50 | 
 51 | 	if _, err := io.ReadFull(input, bs8); err != nil {
 52 | 		return err
 53 | 	}
 54 | 
 55 | 	flags := binary.LittleEndian.Uint64(bs8)
 56 | 
 57 | 	if flags&0xFF != 1 {
 58 | 		return fmt.Errorf("Invalid version bit (should be 1)")
 59 | 	}
 60 | 
 61 | 	if _, err := io.ReadFull(input, bs8); err != nil {
 62 | 		return err
 63 | 	}
 64 | 
 65 | 	s.n = binary.LittleEndian.Uint64(bs8)
 66 | 
 67 | 	if _, err := io.ReadFull(input, bs8); err != nil {
 68 | 		return err
 69 | 	}
 70 | 
 71 | 	s.p = math.Float64frombits(binary.LittleEndian.Uint64(bs8))
 72 | 
 73 | 	if _, err := io.ReadFull(input, bs8); err != nil {
 74 | 		return err
 75 | 	}
 76 | 
 77 | 	s.k = binary.LittleEndian.Uint64(bs8)
 78 | 	maxInt := uint64(int(^uint(0) >> 1))
 79 | 	if s.k >= maxInt {
 80 | 		return fmt.Errorf("value of k (number of hash functions) is too high (%d), must be less than maximum int (%d)", s.k, maxInt)
 81 | 	}
 82 | 
 83 | 	if _, err := io.ReadFull(input, bs8); err != nil {
 84 | 		return err
 85 | 	}
 86 | 
 87 | 	s.m = binary.LittleEndian.Uint64(bs8)
 88 | 
 89 | 	if _, err := io.ReadFull(input, bs8); err != nil {
 90 | 		return err
 91 | 	}
 92 | 
 93 | 	s.N = binary.LittleEndian.Uint64(bs8)
 94 | 
 95 | 	s.M = uint64(math.Ceil(float64(s.m) / 64.0))
 96 | 
 97 | 	s.v = make([]uint64, s.M)
 98 | 
 99 | 	for i := uint64(0); i < s.M; i++ {
100 | 		n, err := io.ReadFull(input, bs8)
101 | 		if err != nil {
102 | 			return err
103 | 		}
104 | 		if n != 8 {
105 | 			return fmt.Errorf("Cannot read from file: %d, position: %d, %d", n, i*8, len(bs8))
106 | 		}
107 | 		s.v[i] = binary.LittleEndian.Uint64(bs8)
108 | 	}
109 | 
110 | 	b, err := ioutil.ReadAll(input)
111 | 
112 | 	if err != nil {
113 | 		return err
114 | 	}
115 | 
116 | 	s.Data = b
117 | 
118 | 	return nil
119 | 
120 | }
121 | 
122 | // NumHashFuncs returns k, the number of hash functions used in the Bloom filter.
123 | func (s *BloomFilter) NumHashFuncs() uint64 {
124 | 	return s.k
125 | }
126 | 
127 | // MaxNumElements returns n, the capacity (desired maximum number of elements)
128 | // the Bloom filter was dimensioned for.
129 | func (s *BloomFilter) MaxNumElements() uint64 {
130 | 	return s.n
131 | }
132 | 
133 | // NumBits returns m, the number of bits used in the Bloom filter.
134 | func (s *BloomFilter) NumBits() uint64 {
135 | 	return s.m
136 | }
137 | 
138 | // FalsePositiveProb returns p, the false positive probability the Bloom
139 | // filter was dimensioned for.
140 | func (s *BloomFilter) FalsePositiveProb() float64 {
141 | 	return s.p
142 | }
143 | 
144 | // Write writes the binary representation of a Bloom filter to an io.Writer.
145 | //
146 | // The output consists of little-endian 64-bit values: the version word (1),
147 | // n, p (as IEEE 754 bits), k, m and N, followed by the M words of the bit
148 | // array and, if present, the raw Data bytes. The first write error is
149 | // returned; a short write without an error is reported as
150 | // "Cannot write to file!".
151 | func (s *BloomFilter) Write(output io.Writer) error {
152 | 	bs8 := make([]byte, 8)
153 | 
154 | 	// writeWord emits one little-endian 64-bit value. The writer's own error
155 | 	// is checked first so that the real cause is not masked by the generic
156 | 	// short-write message.
157 | 	writeWord := func(x uint64) error {
158 | 		binary.LittleEndian.PutUint64(bs8, x)
159 | 		n, err := output.Write(bs8)
160 | 		if err != nil {
161 | 			return err
162 | 		}
163 | 		if n != 8 {
164 | 			return errors.New("Cannot write to file!")
165 | 		}
166 | 		return nil
167 | 	}
168 | 
169 | 	// Header: version word first, then the filter parameters.
170 | 	header := []uint64{1, s.n, math.Float64bits(s.p), s.k, s.m, s.N}
171 | 	for _, x := range header {
172 | 		if err := writeWord(x); err != nil {
173 | 			return err
174 | 		}
175 | 	}
176 | 
177 | 	for i := uint64(0); i < s.M; i++ {
178 | 		if err := writeWord(s.v[i]); err != nil {
179 | 			return err
180 | 		}
181 | 	}
182 | 
183 | 	if s.Data != nil {
184 | 		if _, err := output.Write(s.Data); err != nil {
185 | 			return err
186 | 		}
187 | 	}
188 | 	return nil
189 | }
178 | 
179 | // Reset clears the Bloom filter of all elements.
180 | func (s *BloomFilter) Reset() {
181 | 	for i := uint64(0); i < s.M; i++ {
182 | 		s.v[i] = 0
183 | 	}
184 | 	s.N = 0
185 | }
186 | 
187 | // m is the modulus of the hash sequence: 18446744073709551557 = 2^64 - 59,
188 | // the largest prime number below 2^64. Fingerprint repeatedly multiplies the
189 | // initial hash value by g and reduces modulo m to derive one index per hash
190 | // function. NOTE(review): the uint64 product wraps modulo 2^64 before the
191 | // reduction, so this is not an exact multiplication modulo m; do not "fix"
192 | // it without versioning the file format, because all indexes would change.
193 | const m uint64 = 18446744073709551557
194 | 
195 | // g is the multiplier applied at every step of the hash sequence. According
196 | // to the original authors it was chosen so that the sequence does not repeat
197 | // for any practically meaningful value of k (not verifiable from this file).
198 | const g uint64 = 18446744073709550147
199 | 
200 | // Fingerprint computes the fingerprint of value and stores it in the first k
201 | // entries of fingerprint, which must therefore hold at least k elements.
202 | // Each entry is a bit index in the range [0, m).
203 | //
204 | // The value is hashed once with 64-bit FNV-1; the k indexes are then derived
205 | // by repeatedly multiplying the running state with g and reducing it modulo
206 | // the package-level prime (the product itself wraps at 64 bits).
207 | func (s *BloomFilter) Fingerprint(value []byte, fingerprint []uint64) {
208 | 	hasher := fnv.New64()
209 | 	hasher.Write(value)
210 | 	state := hasher.Sum64() % m
211 | 
212 | 	for slot := uint64(0); slot != s.k; slot++ {
213 | 		state = (state * g) % m
214 | 		fingerprint[slot] = state % s.m
215 | 	}
216 | }
212 | 
213 | // Add adds a byte array element to the Bloom filter.
214 | func (s *BloomFilter) Add(value []byte) {
215 | 	var k, l uint64
216 | 	newValue := false
217 | 	fingerprint := make([]uint64, s.k)
218 | 	s.Fingerprint(value, fingerprint)
219 | 	for i := uint64(0); i < s.k; i++ {
220 | 		k = uint64(fingerprint[i] / 64)
221 | 		l = uint64(fingerprint[i] % 64)
222 | 		v := uint64(1 << l)
223 | 		if (s.v[k] & v) == 0 {
224 | 			newValue = true
225 | 		}
226 | 		s.v[k] |= v
227 | 	}
228 | 	if newValue {
229 | 		s.N++
230 | 	}
231 | }
232 | 
233 | // Join adds the items of another Bloom filter with identical dimensions to
234 | // the receiver. That is, all elements that are described in the
235 | // second filter will also described by the receiver, and the number of elements
236 | // of the receiver will grow by the number of elements in the added filter.
237 | // Note that it is implicitly assumed that both filters are disjoint! Otherwise
238 | // the number of elements in the joined filter must _only_ be considered an
239 | // upper bound and not an exact value!
240 | // Joining two differently dimensioned filters may yield unexpected results and
241 | // hence is not allowed. An error will be returned in this case, and the
242 | // receiver will be left unaltered.
243 | func (s *BloomFilter) Join(s2 *BloomFilter) error {
244 | 	var i uint64
245 | 	if s.n != s2.n {
246 | 		return fmt.Errorf("filters have different dimensions (n = %d vs. %d))",
247 | 			s.n, s2.n)
248 | 	}
249 | 	if s.p != s2.p {
250 | 		return fmt.Errorf("filters have different dimensions (p = %f vs. %f))",
251 | 			s.p, s2.p)
252 | 	}
253 | 	if s.k != s2.k {
254 | 		return fmt.Errorf("filters have different dimensions (k = %d vs. %d))",
255 | 			s.k, s2.k)
256 | 	}
257 | 	if s.m != s2.m {
258 | 		return fmt.Errorf("filters have different dimensions (m = %d vs. %d))",
259 | 			s.m, s2.m)
260 | 	}
261 | 	if s.M != s2.M {
262 | 		return fmt.Errorf("filters have different dimensions (M = %d vs. %d))",
263 | 			s.M, s2.M)
264 | 	}
265 | 	for i = 0; i < s.M; i++ {
266 | 		s.v[i] |= s2.v[i]
267 | 	}
268 | 	if s.N+s2.N < s.N {
269 | 		return fmt.Errorf("addition of member counts would overflow")
270 | 	}
271 | 	s.N += s2.N
272 | 
273 | 	return nil
274 | }
275 | 
276 | // Check reports whether value may be contained in the Bloom filter. A result
277 | // of false means the value is definitely absent; true means it is present or
278 | // a false positive. The fingerprint is computed here and then handed to
279 | // CheckFingerprint.
280 | func (s *BloomFilter) Check(value []byte) bool {
281 | 	indexes := make([]uint64, s.k)
282 | 	s.Fingerprint(value, indexes)
283 | 	maybePresent := s.CheckFingerprint(indexes)
284 | 	return maybePresent
285 | }
283 | 
284 | // CheckFingerprint returns true if the given fingerprint occurs in the Bloom
285 | // filter, false if it does not.
286 | func (s *BloomFilter) CheckFingerprint(fingerprint []uint64) bool {
287 | 	var k, l uint64
288 | 	for i := uint64(0); i < s.k; i++ {
289 | 		k = uint64(fingerprint[i] / 64)
290 | 		l = uint64(fingerprint[i] % 64)
291 | 		if (s.v[k] & (1 << l)) == 0 {
292 | 			return false
293 | 		}
294 | 	}
295 | 	return true
296 | }
297 | 
298 | // Initialize returns a new, empty Bloom filter with the given capacity (n)
299 | // and FP probability (p).
300 | func Initialize(n uint64, p float64) BloomFilter {
301 | 	m := math.Abs(math.Ceil(float64(n) * math.Log(p) / math.Pow(math.Log(2.0), 2.0)))
302 | 	var bf BloomFilter
303 | 	bf.n = n
304 | 	bf.p = p
305 | 	bf.m = uint64(m)
306 | 	bf.M = uint64(math.Ceil(m / 64.0))
307 | 	bf.k = uint64(math.Ceil(math.Log(2) * m / float64(n)))
308 | 	bf.v = make([]uint64, bf.M)
309 | 	return bf
310 | }
311 | 


--------------------------------------------------------------------------------
/bloom/manage.go:
--------------------------------------------------------------------------------
  1 | // DCSO go bloom filter
  2 | // Copyright (c) 2017, DCSO GmbH
  3 | 
  4 | package main
  5 | 
  6 | import (
  7 | 	"bufio"
  8 | 	"bytes"
  9 | 	"fmt"
 10 | 	"os"
 11 | 	"path/filepath"
 12 | 	"strconv"
 13 | 	"strings"
 14 | 
 15 | 	"github.com/DCSO/bloom"
 16 | 	"gopkg.in/urfave/cli.v1"
 17 | )
 18 | 
 19 | // BloomParams holds the global options of the 'bloom' command line tool.
 20 | type BloomParams struct {
 21 | 	gzip           bool   // passed on to bloom.LoadFilter / bloom.WriteFilter
 22 | 	interactive    bool   // prompt on stdin; a blank line ends the input
 23 | 	split          bool   // split every input line at delimiter
 24 | 	printEachMatch bool   // print each matching value on its own line
 25 | 	delimiter      string // separator for splitting and for joining printed fields
 26 | 	fields         []int  // split-field indexes to use; negative ones count from the end
 27 | 	printFields    []int  // split-field indexes to print on a match; negative ones count from the end
 28 | }
 29 | 
 30 | func exitWithError(message string) {
 31 | 	fmt.Fprintf(os.Stderr, "Error: %s \n", message)
 32 | 	os.Exit(-1)
 33 | }
 34 | 
 35 | func readValuesIntoFilter(filter *bloom.BloomFilter, bloomParams BloomParams) {
 36 | 	//we determine if the program is run interactively or within a pipe
 37 | 	stat, _ := os.Stdin.Stat()
 38 | 	var isTerminal = (stat.Mode() & os.ModeCharDevice) != 0
 39 | 	//if we are not in an interactive session and this is a terminal, we quit
 40 | 	if !bloomParams.interactive && isTerminal {
 41 | 		return
 42 | 	}
 43 | 	if bloomParams.interactive {
 44 | 		fmt.Println("Interactive mode: Enter a blank line [by pressing ENTER] to exit (values will not be stored otherwise).")
 45 | 	}
 46 | 	scanner := bufio.NewScanner(os.Stdin)
 47 | 	for scanner.Scan() {
 48 | 		line := scanner.Text()
 49 | 		if line == "" && bloomParams.interactive {
 50 | 			break
 51 | 		}
 52 | 		if bloomParams.split {
 53 | 			values := strings.Split(line, bloomParams.delimiter)
 54 | 			for i, value := range values {
 55 | 				j := i - len(values)
 56 | 
 57 | 				if len(bloomParams.fields) > 0 {
 58 | 					if !contains(bloomParams.fields, i) && !contains(bloomParams.fields, j) {
 59 | 						continue
 60 | 					}
 61 | 				}
 62 | 				filter.Add([]byte(value))
 63 | 			}
 64 | 		} else {
 65 | 			filter.Add([]byte(line))
 66 | 		}
 67 | 	}
 68 | }
 69 | 
 70 | func readInputIntoData(filter *bloom.BloomFilter, bloomParams BloomParams) {
 71 | 	//we determine if the program is run interactively or within a pipe
 72 | 	stat, _ := os.Stdin.Stat()
 73 | 	var isTerminal = (stat.Mode() & os.ModeCharDevice) != 0
 74 | 	//if we are not in an interactive session and this is a terminal, we quit
 75 | 	if !bloomParams.interactive && isTerminal {
 76 | 		return
 77 | 	}
 78 | 	if bloomParams.interactive {
 79 | 		fmt.Println("Interactive mode: Enter a blank line [by pressing ENTER] to exit (values will not be stored otherwise).")
 80 | 	}
 81 | 	scanner := bufio.NewScanner(os.Stdin)
 82 | 	dataBuffer := bytes.NewBuffer([]byte(""))
 83 | 	for scanner.Scan() {
 84 | 		line := scanner.Bytes()
 85 | 		if len(line) == 0 && bloomParams.interactive {
 86 | 			break
 87 | 		}
 88 | 		dataBuffer.Write(line)
 89 | 		dataBuffer.Write([]byte("\n"))
 90 | 	}
 91 | 	filter.Data = dataBuffer.Bytes()
 92 | }
 93 | 
 94 | // insertIntoFilter loads the filter stored at path, adds the values read
 95 | // from standard input and writes the filter back to the same path. Any load
 96 | // or write failure terminates the process via exitWithError.
 97 | func insertIntoFilter(path string, bloomParams BloomParams) {
 98 | 	filter, loadErr := bloom.LoadFilter(path, bloomParams.gzip)
 99 | 	if loadErr != nil {
100 | 		exitWithError(loadErr.Error())
101 | 	}
102 | 
103 | 	readValuesIntoFilter(filter, bloomParams)
104 | 
105 | 	if writeErr := bloom.WriteFilter(filter, path, bloomParams.gzip); writeErr != nil {
106 | 		exitWithError(writeErr.Error())
107 | 	}
108 | }
105 | 
106 | // updateFilterData loads the filter stored at path, replaces its attached
107 | // data with what is read from standard input and writes the filter back to
108 | // the same path. Any load or write failure terminates the process via
109 | // exitWithError.
110 | func updateFilterData(path string, bloomParams BloomParams) {
111 | 	filter, loadErr := bloom.LoadFilter(path, bloomParams.gzip)
112 | 	if loadErr != nil {
113 | 		exitWithError(loadErr.Error())
114 | 	}
115 | 
116 | 	readInputIntoData(filter, bloomParams)
117 | 
118 | 	if writeErr := bloom.WriteFilter(filter, path, bloomParams.gzip); writeErr != nil {
119 | 		exitWithError(writeErr.Error())
120 | 	}
121 | }
117 | 
118 | // getFilterData loads the filter stored at path and prints its attached data
119 | // to standard output exactly as stored. A load failure terminates the
120 | // process via exitWithError.
121 | func getFilterData(path string, bloomParams BloomParams) {
122 | 	filter, loadErr := bloom.LoadFilter(path, bloomParams.gzip)
123 | 	if loadErr != nil {
124 | 		exitWithError(loadErr.Error())
125 | 	}
126 | 	attached := string(filter.Data)
127 | 	fmt.Fprint(os.Stdout, attached)
128 | }
125 | 
126 | // contains reports whether e occurs in s. A nil or empty slice contains
127 | // nothing.
128 | func contains(s []int, e int) bool {
129 | 	for idx := 0; idx < len(s); idx++ {
130 | 		if s[idx] == e {
131 | 			return true
132 | 		}
133 | 	}
134 | 	return false
135 | }
134 | 
135 | func checkAgainstFilter(path string, bloomParams BloomParams) {
136 | 	filter, err := bloom.LoadFilter(path, bloomParams.gzip)
137 | 	if err != nil {
138 | 		exitWithError(err.Error())
139 | 	}
140 | 	scanner := bufio.NewScanner(os.Stdin)
141 | 	if bloomParams.interactive {
142 | 		fmt.Println("Interactive mode: Enter a blank line [by pressing ENTER] to exit.")
143 | 	}
144 | 	for scanner.Scan() {
145 | 		line := scanner.Text()
146 | 		if line == "" && bloomParams.interactive {
147 | 			break
148 | 		}
149 | 		var valuesToCheck []string
150 | 		if bloomParams.split {
151 | 			valuesToCheck = strings.Split(line, bloomParams.delimiter)
152 | 		} else {
153 | 			valuesToCheck = make([]string, 1)
154 | 			valuesToCheck[0] = line
155 | 		}
156 | 		printed := false
157 | 		prefix := ""
158 | 		if bloomParams.interactive {
159 | 			prefix = ">"
160 | 		}
161 | 		for i, value := range valuesToCheck {
162 | 			j := i - len(valuesToCheck)
163 | 			//we only check fields that are in the "fields" parameters (if defined)
164 | 			if len(bloomParams.fields) > 0 {
165 | 				if !contains(bloomParams.fields, i) && !contains(bloomParams.fields, j) {
166 | 					continue
167 | 				}
168 | 			}
169 | 
170 | 			if filter.Check([]byte(value)) {
171 | 				if bloomParams.printEachMatch {
172 | 					fmt.Printf("%s%s\n", prefix, value)
173 | 				} else {
174 | 					if !printed {
175 | 						if len(bloomParams.printFields) > 0 {
176 | 							values := make([]string, 0, len(bloomParams.printFields))
177 | 							for _, i := range bloomParams.printFields {
178 | 								j := i
179 | 								if j < 0 {
180 | 									j = j + len(valuesToCheck)
181 | 								}
182 | 								if j >= len(valuesToCheck) || j < 0 {
183 | 									continue
184 | 								}
185 | 								values = append(values, valuesToCheck[j])
186 | 							}
187 | 							fmt.Printf("%s%s\n", prefix, strings.Join(values, bloomParams.delimiter))
188 | 						} else {
189 | 							fmt.Printf("%s%s\n", prefix, line)
190 | 						}
191 | 					}
192 | 					printed = true
193 | 				}
194 | 			}
195 | 		}
196 | 	}
197 | }
198 | 
199 | // printStats loads the filter stored at path and prints its file name,
200 | // capacity, element count, false positive probability, bit count and number
201 | // of hash functions to standard output. A load failure terminates the
202 | // process via exitWithError.
203 | func printStats(path string, bloomParams BloomParams) {
204 | 	filter, loadErr := bloom.LoadFilter(path, bloomParams.gzip)
205 | 	if loadErr != nil {
206 | 		exitWithError(loadErr.Error())
207 | 	}
208 | 
209 | 	out := os.Stdout
210 | 	fmt.Fprintf(out, "File:\t\t\t%s\n", path)
211 | 	fmt.Fprintf(out, "Capacity:\t\t%d\n", filter.MaxNumElements())
212 | 	fmt.Fprintf(out, "Elements present:\t%d\n", filter.N)
213 | 	fmt.Fprintf(out, "FP probability:\t\t%.2e\n", filter.FalsePositiveProb())
214 | 	fmt.Fprintf(out, "Bits:\t\t\t%d\n", filter.NumBits())
215 | 	fmt.Fprintf(out, "Hash functions:\t\t%d\n", filter.NumHashFuncs())
216 | }
211 | 
212 | // createFilter builds a new filter with capacity n and false positive
213 | // probability p, fills it with the values read from standard input and
214 | // writes it to path. A write failure terminates the process via
215 | // exitWithError.
216 | func createFilter(path string, n uint64, p float64, bloomParams BloomParams) {
217 | 	fresh := bloom.Initialize(n, p)
218 | 	readValuesIntoFilter(&fresh, bloomParams)
219 | 	if writeErr := bloom.WriteFilter(&fresh, path, bloomParams.gzip); writeErr != nil {
220 | 		exitWithError(writeErr.Error())
221 | 	}
222 | }
220 | 
221 | // joinFilters loads the filters stored at path and pathToAdd, joins the
222 | // second into the first and writes the result back to path. Any load, join
223 | // or write failure terminates the process via exitWithError.
224 | func joinFilters(path string, pathToAdd string, bloomParams BloomParams) {
225 | 	target, err := bloom.LoadFilter(path, bloomParams.gzip)
226 | 	if err != nil {
227 | 		exitWithError(err.Error())
228 | 	}
229 | 
230 | 	addition, err := bloom.LoadFilter(pathToAdd, bloomParams.gzip)
231 | 	if err != nil {
232 | 		exitWithError(err.Error())
233 | 	}
234 | 
235 | 	if err = target.Join(addition); err != nil {
236 | 		exitWithError(err.Error())
237 | 	}
238 | 
239 | 	if err = bloom.WriteFilter(target, path, bloomParams.gzip); err != nil {
240 | 		exitWithError(err.Error())
241 | 	}
242 | }
239 | 
240 | // parseFieldIndexes converts a comma-separated list of integers such as
241 | // "0,2,-1" into a slice of ints, keeping the order of the input. Whitespace
242 | // around the individual numbers is ignored, so "0, 2" is accepted as well.
243 | //
244 | // It returns nil and the strconv error for the first entry that is not a
245 | // valid integer (this includes empty entries, e.g. in "1,,2").
246 | func parseFieldIndexes(s string) ([]int, error) {
247 | 	fields := strings.Split(s, ",")
248 | 	fieldNumbers := make([]int, len(fields))
249 | 	for i, field := range fields {
250 | 		num, err := strconv.Atoi(strings.TrimSpace(field))
251 | 		if err != nil {
252 | 			return nil, err
253 | 		}
254 | 		fieldNumbers[i] = num
255 | 	}
256 | 	return fieldNumbers, nil
257 | }
252 | 
253 | func parseBloomParams(c *cli.Context) BloomParams {
254 | 	var bloomParams BloomParams
255 | 	var err error
256 | 	bloomParams.gzip = c.GlobalBool("gzip")
257 | 	bloomParams.interactive = c.GlobalBool("interactive")
258 | 	bloomParams.split = c.GlobalBool("split")
259 | 	bloomParams.delimiter = c.GlobalString("delimiter")
260 | 	bloomParams.printEachMatch = c.GlobalBool("each")
261 | 	if c.GlobalString("fields") != "" {
262 | 		bloomParams.fields, err = parseFieldIndexes(c.GlobalString("fields"))
263 | 		if err != nil {
264 | 			exitWithError(err.Error())
265 | 		}
266 | 	}
267 | 	if c.GlobalString("print-fields") != "" {
268 | 		bloomParams.printFields, err = parseFieldIndexes(c.GlobalString("print-fields"))
269 | 		if err != nil {
270 | 			exitWithError(err.Error())
271 | 		}
272 | 		//if printFields is set we also set printEachMatch
273 | 		if len(bloomParams.printFields) > 0 {
274 | 			bloomParams.printEachMatch = false
275 | 		}
276 | 	}
277 | 	return bloomParams
278 | }
279 | 
280 | func main() {
281 | 
282 | 	app := cli.NewApp()
283 | 	app.Name = "Bloom Filter"
284 | 	app.Usage = "Utility to work with bloom filters"
285 | 	app.Flags = []cli.Flag{
286 | 		cli.BoolFlag{
287 | 			Name:  "gzip, gz",
288 | 			Usage: "compress bloom file with gzip",
289 | 		},
290 | 		cli.BoolFlag{
291 | 			Name:  "interactive, i",
292 | 			Usage: "interactively add values to the filter",
293 | 		},
294 | 		cli.BoolFlag{
295 | 			Name:  "split, s",
296 | 			Usage: "split the input string",
297 | 		},
298 | 		cli.BoolFlag{
299 | 			Name:  "each, e",
300 | 			Usage: "print each match of a split string individually",
301 | 		},
302 | 		cli.StringFlag{
303 | 			Name:  "delimiter, d",
304 | 			Value: ",",
305 | 			Usage: "delimiter to use for splitting",
306 | 		},
307 | 		cli.StringFlag{
308 | 			Name:  "fields, f",
309 | 			Value: "",
310 | 			Usage: "fields of split output to use in filter (a single number or a comma-separated list of numbers, zero-indexed)",
311 | 		},
312 | 		cli.StringFlag{
313 | 			Name:  "print-fields, pf",
314 | 			Value: "",
315 | 			Usage: "fields of split output to print for a successful match (a single number or a comma-separated list of numbers, zero-indexed).",
316 | 		},
317 | 	}
318 | 	app.Commands = []cli.Command{
319 | 		{
320 | 			Name:    "create",
321 | 			Aliases: []string{"cr"},
322 | 			Flags: []cli.Flag{
323 | 				cli.Float64Flag{Name: "p", Value: 0.01, Usage: "The desired false positive probability."},
324 | 				cli.Uint64Flag{Name: "n", Value: 10000, Usage: "The desired capacity."},
325 | 			},
326 | 			Usage: "Create a new Bloom filter and store it in the given filename.",
327 | 			Action: func(c *cli.Context) error {
328 | 				path := c.Args().First()
329 | 				bloomParams := parseBloomParams(c)
330 | 				if path == "" {
331 | 					exitWithError("No filename given.")
332 | 				}
333 | 				path, err := filepath.Abs(path)
334 | 				if err != nil {
335 | 					return err
336 | 				}
337 | 				n := c.Uint64("n")
338 | 				p := c.Float64("p")
339 | 				if n < 0 {
340 | 					exitWithError("n cannot be negative.")
341 | 				}
342 | 				if p < 0 || p > 1 {
343 | 					exitWithError("p must be between 0 and 1.")
344 | 				}
345 | 				createFilter(path, n, p, bloomParams)
346 | 				return nil
347 | 			},
348 | 		},
349 | 		{
350 | 			Name:    "insert",
351 | 			Aliases: []string{"i"},
352 | 			Flags:   []cli.Flag{},
353 | 			Usage:   "Inserts new values into an existing Bloom filter.",
354 | 			Action: func(c *cli.Context) error {
355 | 				path := c.Args().First()
356 | 				bloomParams := parseBloomParams(c)
357 | 				if path == "" {
358 | 					exitWithError("No filename given.")
359 | 				}
360 | 				path, err := filepath.Abs(path)
361 | 				if err != nil {
362 | 					return err
363 | 				}
364 | 				insertIntoFilter(path, bloomParams)
365 | 				return nil
366 | 			},
367 | 		},
368 | 		{
369 | 			Name:    "join",
370 | 			Aliases: []string{"j", "merge", "m"},
371 | 			Flags:   []cli.Flag{},
372 | 			Usage:   "Joins two Bloom filters into one.",
373 | 			Action: func(c *cli.Context) error {
374 | 				if len(c.Args()) != 2 {
375 | 					exitWithError("Two filenames are required.")
376 | 				}
377 | 				bloomParams := parseBloomParams(c)
378 | 				path := c.Args().First()
379 | 				if path == "" {
380 | 					exitWithError("No first filename given.")
381 | 				}
382 | 				path, err := filepath.Abs(path)
383 | 				if err != nil {
384 | 					return err
385 | 				}
386 | 				pathToAdd := c.Args().Get(1)
387 | 				if pathToAdd == "" {
388 | 					exitWithError("No second filename given.")
389 | 				}
390 | 				pathToAdd, err = filepath.Abs(pathToAdd)
391 | 				if err != nil {
392 | 					return err
393 | 				}
394 | 				joinFilters(path, pathToAdd, bloomParams)
395 | 				return nil
396 | 			},
397 | 		},
398 | 		{
399 | 			Name:    "check",
400 | 			Aliases: []string{"c"},
401 | 			Flags:   []cli.Flag{},
402 | 			Usage:   "Checks values against an existing Bloom filter.",
403 | 			Action: func(c *cli.Context) error {
404 | 				path := c.Args().First()
405 | 				bloomParams := parseBloomParams(c)
406 | 				if path == "" {
407 | 					exitWithError("No filename given.")
408 | 				}
409 | 				path, err := filepath.Abs(path)
410 | 				if err != nil {
411 | 					return err
412 | 				}
413 | 				checkAgainstFilter(path, bloomParams)
414 | 				return nil
415 | 			},
416 | 		},
417 | 		{
418 | 			Name:    "set-data",
419 | 			Aliases: []string{"sd"},
420 | 			Flags:   []cli.Flag{},
421 | 			Usage:   "Sets the data associated with the Bloom filter.",
422 | 			Action: func(c *cli.Context) error {
423 | 				path := c.Args().First()
424 | 				bloomParams := parseBloomParams(c)
425 | 				if path == "" {
426 | 					exitWithError("No filename given.")
427 | 				}
428 | 				path, err := filepath.Abs(path)
429 | 				if err != nil {
430 | 					return err
431 | 				}
432 | 				updateFilterData(path, bloomParams)
433 | 				return nil
434 | 			},
435 | 		},
436 | 		{
437 | 			Name:    "get-data",
438 | 			Aliases: []string{"gd"},
439 | 			Flags:   []cli.Flag{},
440 | 			Usage:   "Prints the data associated with the Bloom filter.",
441 | 			Action: func(c *cli.Context) error {
442 | 				path := c.Args().First()
443 | 				bloomParams := parseBloomParams(c)
444 | 				if path == "" {
445 | 					exitWithError("No filename given.")
446 | 				}
447 | 				path, err := filepath.Abs(path)
448 | 				if err != nil {
449 | 					return err
450 | 				}
451 | 				getFilterData(path, bloomParams)
452 | 				return nil
453 | 			},
454 | 		},
455 | 		{
456 | 			Name:    "show",
457 | 			Aliases: []string{"s"},
458 | 			Flags:   []cli.Flag{},
459 | 			Usage:   "Shows various details about a given Bloom filter.",
460 | 			Action: func(c *cli.Context) error {
461 | 				path := c.Args().First()
462 | 				bloomParams := parseBloomParams(c)
463 | 				if path == "" {
464 | 					exitWithError("No filename given.")
465 | 				}
466 | 				path, err := filepath.Abs(path)
467 | 				if err != nil {
468 | 					return err
469 | 				}
470 | 				printStats(path, bloomParams)
471 | 				return nil
472 | 			},
473 | 		},
474 | 	}
475 | 	app.Version = "0.2.4"
476 | 
477 | 	app.Run(os.Args)
478 | }
479 | 


--------------------------------------------------------------------------------
/bloom_test.go:
--------------------------------------------------------------------------------
  1 | // DCSO go bloom filter
  2 | // Copyright (c) 2017, DCSO GmbH
  3 | 
  4 | package bloom
  5 | 
  6 | import (
  7 | 	"bytes"
  8 | 	"io/ioutil"
  9 | 	"log"
 10 | 	"math"
 11 | 	"math/rand"
 12 | 	"os"
 13 | 	"path/filepath"
 14 | 	"strings"
 15 | 	"testing"
 16 | )
 17 | 
 18 | func TestFingerprinting(t *testing.T) {
 19 | 	filter := Initialize(100000, 0.01)
 20 | 	fp := make([]uint64, 7)
 21 | 	expected := [7]uint64{20311, 36825, 412501, 835777, 658914, 853361, 307361}
 22 | 	filter.Fingerprint([]byte("bar"), fp)
 23 | 	for i, v := range fp {
 24 | 		if v != expected[i] {
 25 | 			t.Errorf("Wrong fingerprint: %d vs. %d", v, expected[i])
 26 | 			break
 27 | 		}
 28 | 	}
 29 | }
 30 | 
 31 | func TestInitialization(t *testing.T) {
 32 | 	filter := Initialize(10000, 0.001)
 33 | 	if filter.k != 10 {
 34 | 		t.Error("k does not match expectation!")
 35 | 	}
 36 | 	if filter.m != 143775 {
 37 | 		t.Error("m does not match expectation: ", filter.m)
 38 | 	}
 39 | 	if filter.M != uint64(math.Ceil(float64(filter.m)/64)) {
 40 | 		t.Error("M does not match expectation: ", filter.M)
 41 | 	}
 42 | 	for i := uint64(0); i < filter.M; i++ {
 43 | 		if filter.v[i] != 0 {
 44 | 			t.Error("Filter value is not initialized to zero!")
 45 | 		}
 46 | 	}
 47 | }
 48 | 
 49 | func checkFilters(a BloomFilter, b BloomFilter, t *testing.T) bool {
 50 | 	if b.n != a.n ||
 51 | 		b.p != a.p ||
 52 | 		b.k != a.k ||
 53 | 		b.m != a.m ||
 54 | 		b.M != a.M ||
 55 | 		!bytes.Equal(b.Data, a.Data) {
 56 | 		return false
 57 | 	}
 58 | 	for i := uint64(0); i < a.M; i++ {
 59 | 		if a.v[i] != b.v[i] {
 60 | 			return false
 61 | 		}
 62 | 	}
 63 | 	return true
 64 | }
 65 | 
 66 | func serializeToBuffer(filter BloomFilter) (*BloomFilter, error) {
 67 | 	var buf bytes.Buffer
 68 | 	filter.Write(&buf)
 69 | 	var newFilter BloomFilter
 70 | 	newFilter.Read(&buf)
 71 | 	return &newFilter, nil
 72 | }
 73 | 
 74 | func serializeToDisk(filter BloomFilter) (*BloomFilter, error) {
 75 | 	tempFile, err := ioutil.TempFile("", "filter")
 76 | 	if err != nil {
 77 | 		return nil, err
 78 | 	}
 79 | 	defer os.Remove(tempFile.Name())
 80 | 	filter.Write(tempFile)
 81 | 	tempFile.Sync()
 82 | 	tempFile.Seek(0, 0)
 83 | 	var newFilter BloomFilter
 84 | 	err = newFilter.Read(tempFile)
 85 | 	if err != nil {
 86 | 		return nil, err
 87 | 	}
 88 | 	return &newFilter, nil
 89 | }
 90 | 
 91 | func TestSerialization(t *testing.T) {
 92 | 	capacity := uint64(100000)
 93 | 	p := float64(0.01)
 94 | 	samples := uint64(1000)
 95 | 	filter, _ := GenerateExampleFilter(capacity, p, samples)
 96 | 
 97 | 	newFilter, err := serializeToBuffer(filter)
 98 | 	if err != nil {
 99 | 		t.Error("Cannot serialize filter to buffer!")
100 | 		return
101 | 	}
102 | 
103 | 	if !checkFilters(filter, *newFilter, t) {
104 | 		t.Error("Filters do not match!")
105 | 	}
106 | 
107 | 	newFilter, err = serializeToDisk(filter)
108 | 
109 | 	if err != nil {
110 | 		t.Error("Cannot serialize filter to file!")
111 | 		return
112 | 	}
113 | 
114 | 	if !checkFilters(filter, *newFilter, t) {
115 | 		t.Error("Filters do not match!")
116 | 	}
117 | 
118 | 	filter.Add(GenerateTestValue(100))
119 | 	newFilter.Add(GenerateTestValue(100))
120 | 	newFilter, err = serializeToDisk(filter)
121 | 	if err != nil {
122 | 		t.Error("Cannot serialize filter to disk!")
123 | 		return
124 | 	}
125 | 
126 | 	if !checkFilters(filter, *newFilter, t) {
127 | 		t.Error("Filters do not match!")
128 | 	}
129 | 
130 | 	filter.Add(GenerateTestValue(100))
131 | 	newFilter.Add(GenerateTestValue(100))
132 | 	newFilter, err = serializeToDisk(filter)
133 | 	if err != nil {
134 | 		t.Error("Cannot serialize filter to disk!")
135 | 		return
136 | 	}
137 | 
138 | 	if !checkFilters(filter, *newFilter, t) {
139 | 		t.Error("Filters do not match!")
140 | 	}
141 | 
142 | 	checkFilters(filter, *newFilter, t)
143 | }
144 | 
145 | func TestSerializationToDisk(t *testing.T) {
146 | 	capacity := uint64(100000)
147 | 	p := float64(0.001)
148 | 	samples := uint64(1000)
149 | 	filter, _ := GenerateExampleFilter(capacity, p, samples)
150 | 
151 | 	var buf bytes.Buffer
152 | 
153 | 	filter.Write(&buf)
154 | 
155 | 	var newFilter BloomFilter
156 | 
157 | 	newFilter.Read(&buf)
158 | 
159 | 	checkFilters(filter, newFilter, t)
160 | }
161 | 
162 | func TestSerializationWriteFail(t *testing.T) {
163 | 	capacity := uint64(100000)
164 | 	p := float64(0.001)
165 | 	samples := uint64(1000)
166 | 	filter, _ := GenerateExampleFilter(capacity, p, samples)
167 | 
168 | 	dir, err := ioutil.TempDir("", "bloomtest")
169 | 	if err != nil {
170 | 		log.Fatal(err)
171 | 	}
172 | 	defer os.RemoveAll(dir)
173 | 
174 | 	tmpfn := filepath.Join(dir, "tmpfile")
175 | 	tmpfile, err := os.OpenFile(tmpfn, os.O_CREATE|os.O_RDONLY, 0000)
176 | 	if err != nil {
177 | 		t.Fatal(err)
178 | 	}
179 | 	defer tmpfile.Close()
180 | 
181 | 	err = filter.Write(tmpfile)
182 | 	if err == nil {
183 | 		t.Error("writing to read-only file should fail")
184 | 	}
185 | }
186 | 
187 | func TestSerializationReadFail(t *testing.T) {
188 | 	var newFilter BloomFilter
189 | 
190 | 	dir, err := ioutil.TempDir("", "bloomtest")
191 | 	if err != nil {
192 | 		log.Fatal(err)
193 | 	}
194 | 	defer os.RemoveAll(dir)
195 | 
196 | 	tmpfn := filepath.Join(dir, "tmpfile")
197 | 	tmpfile, err := os.OpenFile(tmpfn, os.O_CREATE, 0777)
198 | 	if err != nil {
199 | 		t.Fatal(err)
200 | 	}
201 | 	defer tmpfile.Close()
202 | 
203 | 	err = newFilter.Read(tmpfile)
204 | 	if err == nil {
205 | 		t.Error("reading from empty file should fail")
206 | 	}
207 | }
208 | 
// GenerateTestValue returns a byte slice of the requested length in which
// every byte is drawn from math/rand's top-level generator.
func GenerateTestValue(length uint64) []byte {
	buf := make([]byte, length)
	for pos := range buf {
		buf[pos] = byte(rand.Int() % 256)
	}
	return buf
}
216 | 
217 | func GenerateExampleFilter(capacity uint64, p float64, samples uint64) (BloomFilter, [][]byte) {
218 | 	filter := Initialize(capacity, p)
219 | 	filter.Data = []byte("foobar")
220 | 	testValues := make([][]byte, 0, samples)
221 | 	for i := uint64(0); i < samples; i++ {
222 | 		testValue := GenerateTestValue(100)
223 | 		testValues = append(testValues, testValue)
224 | 		filter.Add(testValue)
225 | 	}
226 | 	return filter, testValues
227 | }
228 | 
229 | func GenerateDisjointExampleFilter(capacity uint64, p float64, samples uint64, other BloomFilter) (BloomFilter, [][]byte) {
230 | 	filter := Initialize(capacity, p)
231 | 	testValues := make([][]byte, 0, samples)
232 | 	for i := uint64(0); i < samples; {
233 | 		testValue := GenerateTestValue(100)
234 | 		if !other.Check(testValue) {
235 | 			testValues = append(testValues, testValue)
236 | 			filter.Add(testValue)
237 | 			i++
238 | 		}
239 | 	}
240 | 	return filter, testValues
241 | }
242 | 
243 | //This tests the checking of values against a given filter
244 | func TestChecking(t *testing.T) {
245 | 	capacity := uint64(100000)
246 | 	p := float64(0.001)
247 | 	samples := uint64(100000)
248 | 	filter, testValues := GenerateExampleFilter(capacity, p, samples)
249 | 	fingerprint := make([]uint64, filter.k)
250 | 	for _, value := range testValues {
251 | 		filter.Fingerprint(value, fingerprint)
252 | 		if !filter.CheckFingerprint(fingerprint) {
253 | 			t.Error("Did not find test value in filter!")
254 | 		}
255 | 	}
256 | }
257 | 
258 | //This tests the checking of values against a given filter after resetting it
259 | func TestReset(t *testing.T) {
260 | 	capacity := uint64(100000)
261 | 	p := float64(0.001)
262 | 	samples := uint64(100000)
263 | 	filter, testValues := GenerateExampleFilter(capacity, p, samples)
264 | 	filter.Reset()
265 | 	fingerprint := make([]uint64, filter.k)
266 | 	for _, value := range testValues {
267 | 		filter.Fingerprint(value, fingerprint)
268 | 		if filter.CheckFingerprint(fingerprint) {
269 | 			t.Error("Did not find test value in filter!")
270 | 		}
271 | 	}
272 | }
273 | 
274 | //This tests the checking of values against a given filter
275 | //see https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
276 | func TestFalsePositives(t *testing.T) {
277 | 	capacity := uint64(10000)
278 | 	p := float64(0.001)
279 | 	fillingFactor := 0.9
280 | 	N := uint64(float64(capacity) * fillingFactor)
281 | 	filter, _ := GenerateExampleFilter(capacity, p, N)
282 | 	pAcceptable := math.Pow(1-math.Exp(-float64(filter.k)*float64(N)/float64(filter.m)), float64(filter.k))
283 | 	fingerprint := make([]uint64, filter.k)
284 | 	cnt := 0.0
285 | 	matches := 0.0
286 | 	for {
287 | 		cnt++
288 | 		value := GenerateTestValue(100)
289 | 		filter.Fingerprint(value, fingerprint)
290 | 		if filter.CheckFingerprint(fingerprint) {
291 | 			matches++
292 | 		}
293 | 		if cnt > float64(capacity)*10 {
294 | 			break
295 | 		}
296 | 	}
297 | 	//this might still fail sometimes...
298 | 	//we allow for a probability that is two times higher than the normally acceptable probability
299 | 	if matches/cnt > pAcceptable*2 {
300 | 		t.Error("False positive probability is too high at ", matches/cnt*100, "% vs ", pAcceptable*100, "%")
301 | 	}
302 | }
303 | 
304 | func TestJoiningRegularMisdimensioned(t *testing.T) {
305 | 	a := Initialize(100000, 0.0001)
306 | 	b := Initialize(10000, 0.0001)
307 | 	err := a.Join(&b)
308 | 	if err == nil {
309 | 		t.Error("joining filters with different capacity should fail")
310 | 	}
311 | 	if !strings.Contains(err.Error(), "different dimensions") {
312 | 		t.Error("wrong error message returned")
313 | 	}
314 | 	a = Initialize(100000, 0.0001)
315 | 	b = Initialize(100000, 0.001)
316 | 	err = a.Join(&b)
317 | 	if err == nil {
318 | 		t.Error("joining filters with different FP prob should fail")
319 | 	}
320 | 	if !strings.Contains(err.Error(), "different dimensions") {
321 | 		t.Error("wrong error message returned")
322 | 	}
323 | 	a = Initialize(100000, 0.0001)
324 | 	b = Initialize(100000, 0.0001)
325 | 	b.k = 1
326 | 	err = a.Join(&b)
327 | 	if err == nil {
328 | 		t.Error("joining filters with different number of hash funcs should fail")
329 | 	}
330 | 	if !strings.Contains(err.Error(), "different dimensions") {
331 | 		t.Error("wrong error message returned")
332 | 	}
333 | 	a = Initialize(100000, 0.0001)
334 | 	b = Initialize(100000, 0.0001)
335 | 	b.m = 1
336 | 	err = a.Join(&b)
337 | 	if err == nil {
338 | 		t.Error("joining filters with different number of bits should fail")
339 | 	}
340 | 	if !strings.Contains(err.Error(), "different dimensions") {
341 | 		t.Error("wrong error message returned")
342 | 	}
343 | 	a = Initialize(100000, 0.0001)
344 | 	b = Initialize(100000, 0.0001)
345 | 	b.M = 1
346 | 	err = a.Join(&b)
347 | 	if err == nil {
348 | 		t.Error("joining filters with different int array size should fail")
349 | 	}
350 | 	if !strings.Contains(err.Error(), "different dimensions") {
351 | 		t.Error("wrong error message returned")
352 | 	}
353 | }
354 | 
355 | func TestAccessors(t *testing.T) {
356 | 	a, _ := GenerateExampleFilter(100000, 0.0001, 10000)
357 | 	if a.MaxNumElements() != 100000 {
358 | 		t.Error("unexpected capacity in filter")
359 | 	}
360 | 	if a.NumBits() != 1917011 {
361 | 		t.Error("unexpected number of bits in filter")
362 | 	}
363 | 	if a.NumHashFuncs() != 14 {
364 | 		t.Error("unexpected number of hash funcs in filter")
365 | 	}
366 | 	if a.FalsePositiveProb() != 0.0001 {
367 | 		t.Error("unexpected FP prob in filter")
368 | 	}
369 | }
370 | 
371 | func TestJoiningRegular(t *testing.T) {
372 | 	a, aval := GenerateExampleFilter(100000, 0.0001, 10000)
373 | 	b, bval := GenerateDisjointExampleFilter(100000, 0.0001, 20000, a)
374 | 	for _, v := range bval {
375 | 		if a.Check(v) {
376 | 			t.Errorf("value not missing in joined filter: %s", string(v))
377 | 		}
378 | 	}
379 | 	if a.N != 10000 {
380 | 		t.Error("unexpected number of elements in filter")
381 | 	}
382 | 	if b.N != 20000 {
383 | 		t.Error("unexpected number of elements in filter")
384 | 	}
385 | 	err := a.Join(&b)
386 | 	if a.N != 30000 {
387 | 		t.Errorf("unexpected number of elements in filter")
388 | 	}
389 | 	if err != nil {
390 | 		t.Fatal(err)
391 | 	}
392 | 	for _, v := range aval {
393 | 		if !a.Check(v) {
394 | 			t.Errorf("value not found in joined filter: %s", string(v))
395 | 		}
396 | 	}
397 | 	for _, v := range bval {
398 | 		if !a.Check(v) {
399 | 			t.Errorf("value not found in joined filter: %s", string(v))
400 | 		}
401 | 	}
402 | }
403 | 
404 | //This benchmarks the checking of values against a given filter
405 | func BenchmarkChecking(b *testing.B) {
406 | 	capacity := uint64(1e9)
407 | 	p := float64(0.001)
408 | 	samples := uint64(100000)
409 | 	filter, testValues := GenerateExampleFilter(capacity, p, samples)
410 | 	fingerprint := make([]uint64, filter.k)
411 | 	b.ResetTimer()
412 | 	for i := 0; i < b.N; i++ {
413 | 		value := testValues[rand.Int()%len(testValues)]
414 | 		filter.Fingerprint(value, fingerprint)
415 | 		if !filter.CheckFingerprint(fingerprint) {
416 | 			b.Error("Did not find test value in filter!")
417 | 		}
418 | 	}
419 | }
420 | 
421 | //This benchmarks the checking without using a fixed fingerprint variable (instead a temporary variable is created each time)
422 | func BenchmarkSimpleChecking(b *testing.B) {
423 | 	capacity := uint64(1e9)
424 | 	p := float64(0.001)
425 | 	samples := uint64(100000)
426 | 	filter, testValues := GenerateExampleFilter(capacity, p, samples)
427 | 	b.ResetTimer()
428 | 	for i := 0; i < b.N; i++ {
429 | 		value := testValues[rand.Int()%len(testValues)]
430 | 		if !filter.Check(value) {
431 | 			b.Error("Did not find test value in filter!")
432 | 		}
433 | 	}
434 | }
435 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/DCSO/bloom
2 | 
3 | go 1.15
4 | 
5 | require gopkg.in/urfave/cli.v1 v1.20.0
6 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | gopkg.in/urfave/cli.v1 v1.20.0 h1:NdAVW6RYxDif9DhDHaAortIu956m2c0v+09AZBPTbE0=
2 | gopkg.in/urfave/cli.v1 v1.20.0/go.mod h1:vuBzUtMdQeixQj8LVd+/98pzhxNGQoyuPBlsXHOQNO0=
3 | 


--------------------------------------------------------------------------------
/io.go:
--------------------------------------------------------------------------------
  1 | // DCSO go bloom filter
  2 | // Copyright (c) 2017, DCSO GmbH
  3 | 
  4 | package bloom
  5 | 
  6 | import (
  7 | 	"bufio"
  8 | 	"bytes"
  9 | 	gz "compress/gzip"
 10 | 	"io"
 11 | 	"os"
 12 | )
 13 | 
 14 | // LoadFromBytes reads a binary Bloom filter representation from a byte array
 15 | // and returns a BloomFilter struct pointer based on it.
 16 | // If 'gzip' is true, then compressed input will be expected.
 17 | func LoadFromBytes(input []byte, gzip bool) (*BloomFilter, error) {
 18 | 	return LoadFromReader(bytes.NewReader(input), gzip)
 19 | }
 20 | 
 21 | // LoadFilter reads a binary Bloom filter representation from a file
 22 | // and returns a BloomFilter struct pointer based on it.
 23 | // If 'gzip' is true, then compressed input will be expected.
 24 | func LoadFilter(path string, gzip bool) (*BloomFilter, error) {
 25 | 	file, err := os.Open(path)
 26 | 	if err != nil {
 27 | 		return nil, err
 28 | 	}
 29 | 	defer file.Close()
 30 | 
 31 | 	return LoadFromReader(file, gzip)
 32 | }
 33 | 
 34 | // LoadFromReader reads a binary Bloom filter representation from an io.Reader
 35 | // and returns a BloomFilter struct pointer based on it.
 36 | // If 'gzip' is true, then compressed input will be expected.
 37 | func LoadFromReader(inReader io.Reader, gzip bool) (*BloomFilter, error) {
 38 | 	var err error
 39 | 	var reader io.Reader
 40 | 	var gzipReader *gz.Reader
 41 | 	var ioReader *bufio.Reader
 42 | 
 43 | 	if gzip {
 44 | 		gzipReader, err = gz.NewReader(inReader)
 45 | 		if err != nil {
 46 | 			return nil, err
 47 | 		}
 48 | 		defer gzipReader.Close()
 49 | 		reader = gzipReader
 50 | 	} else {
 51 | 		ioReader = bufio.NewReader(inReader)
 52 | 		reader = ioReader
 53 | 	}
 54 | 
 55 | 	var filter BloomFilter
 56 | 	if err = filter.Read(reader); err != nil {
 57 | 		return nil, err
 58 | 	}
 59 | 
 60 | 	return &filter, nil
 61 | }
 62 | 
 63 | // WriteFilter writes a binary Bloom filter representation for a given struct
 64 | // to a file. If 'gzip' is true, then a compressed file will be written.
 65 | func WriteFilter(filter *BloomFilter, path string, gzip bool) error {
 66 | 
 67 | 	file, err := os.Create(path)
 68 | 
 69 | 	if err != nil {
 70 | 		return err
 71 | 	}
 72 | 
 73 | 	defer file.Close()
 74 | 
 75 | 	file.Seek(0, 0)
 76 | 
 77 | 	var writer io.Writer
 78 | 	var gzipWriter *gz.Writer
 79 | 	var ioWriter *bufio.Writer
 80 | 
 81 | 	if gzip {
 82 | 		gzipWriter = gz.NewWriter(file)
 83 | 		defer gzipWriter.Close()
 84 | 		writer = gzipWriter
 85 | 	} else {
 86 | 		ioWriter = bufio.NewWriter(file)
 87 | 		writer = ioWriter
 88 | 	}
 89 | 
 90 | 	err = filter.Write(writer)
 91 | 
 92 | 	if err != nil {
 93 | 		return err
 94 | 	}
 95 | 
 96 | 	if gzip {
 97 | 		gzipWriter.Flush()
 98 | 	} else {
 99 | 		ioWriter.Flush()
100 | 	}
101 | 
102 | 	file.Sync()
103 | 
104 | 	return nil
105 | }
106 | 


--------------------------------------------------------------------------------
/io_test.go:
--------------------------------------------------------------------------------
  1 | // DCSO go bloom filter
  2 | // Copyright (c) 2017, DCSO GmbH
  3 | 
  4 | package bloom
  5 | 
  6 | import (
  7 | 	"io/ioutil"
  8 | 	"os"
  9 | 	"regexp"
 10 | 	"testing"
 11 | )
 12 | 
 13 | func checkResults(t *testing.T, bf *BloomFilter) {
 14 | 	for _, v := range []string{"foo", "bar", "baz"} {
 15 | 		if !bf.Check([]byte(v)) {
 16 | 			t.Fatalf("value %s expected in filter but wasn't found", v)
 17 | 		}
 18 | 	}
 19 | 	if bf.Check([]byte("")) {
 20 | 		t.Fatal("empty value not expected in filter but was found")
 21 | 	}
 22 | 	if bf.Check([]byte("12345")) {
 23 | 		t.Fatal("missing value not expected in filter but was found")
 24 | 	}
 25 | }
 26 | 
 27 | func TestFromReaderFile(t *testing.T) {
 28 | 	f, err := os.Open("testdata/test.bloom")
 29 | 	if err != nil {
 30 | 		t.Fatal(err)
 31 | 	}
 32 | 	defer f.Close()
 33 | 	bf, err := LoadFromReader(f, false)
 34 | 	if err != nil {
 35 | 		t.Fatal(err)
 36 | 	}
 37 | 	checkResults(t, bf)
 38 | }
 39 | 
 40 | func TestFromReaderCorruptFile(t *testing.T) {
 41 | 	f, err := os.Open("testdata/broken.bloom")
 42 | 	if err != nil {
 43 | 		t.Fatal(err)
 44 | 	}
 45 | 	defer f.Close()
 46 | 	_, err = LoadFromReader(f, false)
 47 | 	if err == nil {
 48 | 		t.Fatal("error expected")
 49 | 	}
 50 | 	r, _ := regexp.Compile("is too high")
 51 | 	if !r.MatchString(err.Error()) {
 52 | 		t.Fatalf("wrong error message: %s", err.Error())
 53 | 	}
 54 | }
 55 | 
 56 | func testFromSerialized(t *testing.T, gzip bool) {
 57 | 	bf := Initialize(100, 0.0001)
 58 | 	for _, v := range []string{"foo", "bar", "baz"} {
 59 | 		bf.Add([]byte(v))
 60 | 	}
 61 | 	tmpfile, err := ioutil.TempFile("", "test")
 62 | 	if err != nil {
 63 | 		t.Fatal(err)
 64 | 	}
 65 | 	defer os.Remove(tmpfile.Name())
 66 | 
 67 | 	err = WriteFilter(&bf, tmpfile.Name(), gzip)
 68 | 	if err != nil {
 69 | 		t.Fatal(err)
 70 | 	}
 71 | 
 72 | 	loadedBf, err := LoadFilter(tmpfile.Name(), gzip)
 73 | 	if err != nil {
 74 | 		t.Fatal(err)
 75 | 	}
 76 | 	checkResults(t, loadedBf)
 77 | }
 78 | 
 79 | func TestFromSerialized(t *testing.T) {
 80 | 	testFromSerialized(t, false)
 81 | }
 82 | 
 83 | func TestFromSerializedZip(t *testing.T) {
 84 | 	testFromSerialized(t, true)
 85 | }
 86 | 
 87 | func TestFromReaderFileZip(t *testing.T) {
 88 | 	f, err := os.Open("testdata/test.bloom.gz")
 89 | 	if err != nil {
 90 | 		t.Fatal(err)
 91 | 	}
 92 | 	defer f.Close()
 93 | 	bf, err := LoadFromReader(f, true)
 94 | 	if err != nil {
 95 | 		t.Fatal(err)
 96 | 	}
 97 | 	checkResults(t, bf)
 98 | }
 99 | 
100 | func TestFromBytes(t *testing.T) {
101 | 	testBytes, err := ioutil.ReadFile("testdata/test.bloom")
102 | 	if err != nil {
103 | 		t.Fatal(err)
104 | 	}
105 | 	bf, err := LoadFromBytes(testBytes, false)
106 | 	if err != nil {
107 | 		t.Fatal(err)
108 | 	}
109 | 	checkResults(t, bf)
110 | }
111 | 
112 | func TestFromFile(t *testing.T) {
113 | 	bf, err := LoadFilter("testdata/test.bloom", false)
114 | 	if err != nil {
115 | 		t.Fatal(err)
116 | 	}
117 | 	checkResults(t, bf)
118 | }
119 | 


--------------------------------------------------------------------------------
/testdata/broken.bloom:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DCSO/bloom/5a226da0429526750d1ed733db5c5b3afa39e808/testdata/broken.bloom


--------------------------------------------------------------------------------
/testdata/test-input.txt:
--------------------------------------------------------------------------------
1 | foo
2 | bar
3 | baz
4 | 


--------------------------------------------------------------------------------
/testdata/test.bloom:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DCSO/bloom/5a226da0429526750d1ed733db5c5b3afa39e808/testdata/test.bloom


--------------------------------------------------------------------------------
/testdata/test.bloom.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/DCSO/bloom/5a226da0429526750d1ed733db5c5b3afa39e808/testdata/test.bloom.gz


--------------------------------------------------------------------------------