├── .gitignore
├── .travis.yml
├── CHANGES.txt
├── LICENSE.txt
├── MANIFEST.in
├── README.md
├── pybloom_live
│   ├── __init__.py
│   ├── benchmarks.py
│   ├── pybloom.py
│   ├── test_pybloom.py
│   └── utils.py
├── requirements.txt
├── setup.py
└── tox.ini

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
env
*.pyc
*.sqlite3

### Django ###
*.log
*.pot
*.pyc
__pycache__/
local_settings.py
db.sqlite3
media

### macOS ###
*.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
.idea/*

# CMake
cmake-build-debug/

# Mongo Explorer plugin:
.idea/**/mongoSettings.xml

## File-based project format:
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721

# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr

# Sonarlint plugin
.idea/sonarlint

### Python ###
# Byte-compiled / optimized / DLL files
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
.pytest_cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo

# Django stuff:

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

### SublimeText ###
# cache files for sublime text
*.tmlanguage.cache
*.tmPreferences.cache
*.stTheme.cache

# workspace files are user-specific
*.sublime-workspace

# project files should be checked into the repository, unless a significant
# proportion of contributors will probably not be using SublimeText
# *.sublime-project

# sftp configuration file
sftp-config.json

# Package control specific files
Package Control.last-run
Package Control.ca-list
Package Control.ca-bundle
Package Control.system-ca-bundle
Package Control.cache/
Package Control.ca-certs/
Package Control.merged-ca-bundle
Package Control.user-ca-bundle
oscrypto-ca-bundle.crt
bh_unicode_properties.cache

# Sublime-github package stores a github token in this file
# https://packagecontrol.io/packages/sublime-github
GitHub.sublime-settings

### Vim ###
# swap
[._]*.s[a-v][a-z]
[._]*.sw[a-p]
[._]s[a-v][a-z]
[._]sw[a-p]
# session
Session.vim
# temporary
.netrwhist
*~
# auto-generated tag files
tags

# End of https://www.gitignore.io/api/vim,macos,django,python,pycharm,sublimetext
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
matrix:
  include:
    - python: 2.7
      dist: xenial
      sudo: false
    - python: 3.4
      dist: trusty
      sudo: false
    - python: 3.7
      dist: xenial
      sudo: true

install:
  - pip install -r requirements.txt
  - pip install --upgrade pytest==3.6.3
script:
  - py.test pybloom_live
--------------------------------------------------------------------------------
/CHANGES.txt:
--------------------------------------------------------------------------------
Changes in 3.1.0
================
Deprecated `bitarray.length()` and switched to `len()`.

Changes in 3.0.0
================
Backward-incompatible change: dropped support for Python 2.6.
Fixed a BytesIO issue that prevented Python 2.7 from serialising a filter.

Changes in 2.3.2
================
Added the hash function to the filter instance.

Changes in 2.3.1
================
Added union functionality to ScalableBloomFilter.

Changes in 2.2
==============
Replaced xrange with itertools.count, so range_fn is now an iterator.
This fixes an overflow error when a large integer is passed to range_fn.

Changes in 2.1
==============
The tightening ratio is 0.9, and it is consistently used.
Choosing r around 0.8 - 0.9 will result in better average
space usage for a wide range of growth, therefore the default
value of mode is set to LARGE_SET_GROWTH.

Changes in 2.0
==============
Made major corrections to the algorithms for both BloomFilter and
ScalableBloomFilter. Not numerically compatible with serialized
representations of filters from previous versions. Specifically,
BloomFilter was more accurate than requested and ScalableBloomFilter
was much less accurate than requested.

Changes in 1.1
==============
Added copy, intersection and union functions to BloomFilter
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
Copyright (c) <2011>

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include LICENSE.txt
include ez_setup.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![Build Status](https://travis-ci.org/joseph-fox/python-bloomfilter.svg?branch=master)](https://travis-ci.org/joseph-fox/python-bloomfilter)

# Python Bloom Filter


This Bloom filter has its tightening ratio updated to 0.9, and this ratio
is consistently used throughout the `pybloom` module. Choosing r around
0.8 - 0.9 results in better average space usage for a wide range of growth,
therefore the default value of mode is set to LARGE_SET_GROWTH. This module
includes a Bloom filter data structure along with an implementation of
Scalable Bloom Filters as discussed in:

```
P. Almeida, C. Baquero, N. Preguiça, D. Hutchison, Scalable Bloom Filters, (GLOBECOM 2007), IEEE, 2007.
```
Bloom filters are great if you know in advance how many bits you need to
set aside to store your entire set. Scalable Bloom Filters allow your bloom
filter bits to grow as a function of false positive probability and size.

A filter is "full" when it reaches capacity: `M * ((ln 2)^2 / abs(ln p))`,
where M is the number of bits and p is the false positive probability. When
capacity is reached, a new filter is created that is exponentially larger
than the last, with a tighter probability of false positives and a larger
number of hash functions.

```python
>>> import pybloom_live
>>> f = pybloom_live.BloomFilter(capacity=1000, error_rate=0.001)
>>> [f.add(x) for x in range(10)]
[False, False, False, False, False, False, False, False, False, False]
>>> all([(x in f) for x in range(10)])
True
>>> 10 in f
False
>>> 5 in f
True
>>> f = pybloom_live.BloomFilter(capacity=1000, error_rate=0.001)
>>> for i in range(0, f.capacity):
...     _ = f.add(i)
>>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
True

>>> sbf = pybloom_live.ScalableBloomFilter(mode=pybloom_live.ScalableBloomFilter.SMALL_SET_GROWTH)
>>> count = 10000
>>> for i in range(0, count):
...     _ = sbf.add(i)
>>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
True
# len(sbf) may not equal the entire input length. 0.01% error is well
# below the default 0.1% error threshold. As the capacity goes up, the
# error will approach 0.1%.
```
# Development
We follow this [git branching model](http://nvie.com/posts/a-successful-git-branching-model/),
so please have a look at it.


# Installation instructions
If you are installing from an internet-connected computer (or virtual
install), you can use the pip package manager to download and install this
package. Simply type `pip install pybloom-live` from a Windows command
prompt (`cmd.exe`) or a Unix shell such as `bash`, on macOS or any Linux
distribution (Debian, Slackware, Red Hat, Arch and so on).

If you are using Windows and are installing onto an air-gapped computer, or
want the most up-to-date version from this repository, you can do the
following:

1. Download the zip file by clicking on the green "Clone or Download"
   link followed by "Download Zip."

2. Extract all the contents of the zip folder.

3. Open a command prompt (`cmd.exe`) in the extracted folder.

   a. Find the extracted folder in Windows Explorer.

   b. From the parent folder level, Shift+RightClick on the folder.

   c. Select "Open command window here".

4. Type `pip install .`.

Similar steps work under Linux and macOS.

# Breaking changes with 4.x
Support for non-cryptographic hashes was added in 4.0.0. For 128-bit hashes,
md5 has been replaced with xxh3_128, one of the [fastest](https://github.com/Cyan4973/xxHash)
non-cryptographic hash functions. Details of the benchmark runs can be found
[here](https://github.com/joseph-fox/python-bloomfilter/pull/38). Files
generated with earlier versions of the module *will not work* with this
version. Consider re-generating them using the latest version, which is
optimized for speed.

# Installation verification
Type `pip show pybloom-live` from a command prompt; the reported version
should match the latest release (3.1.0, per this repository's `setup.py`).
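
As a further check, this minimal sketch round-trips a filter through
`tofile`/`fromfile` (an in-memory `io.BytesIO` buffer is used here for
brevity; any binary-mode file object works the same way):

```python
import io

import pybloom_live

f = pybloom_live.BloomFilter(capacity=1000, error_rate=0.001)
for word in ("foo", "bar", "baz"):
    f.add(word)

buf = io.BytesIO()      # or open("filter.bin", "wb")
f.tofile(buf)
buf.seek(0)

g = pybloom_live.BloomFilter.fromfile(buf)
print("bar" in g)       # True
```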
--------------------------------------------------------------------------------
/pybloom_live/__init__.py:
--------------------------------------------------------------------------------
"""pybloom

"""

from .pybloom import BloomFilter, ScalableBloomFilter
--------------------------------------------------------------------------------
/pybloom_live/benchmarks.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""Test performance of BloomFilter at a set capacity and error rate."""
import math
import time

from pybloom_live.pybloom import BloomFilter
from pybloom_live.utils import range_fn


def main(capacity=100000, request_error_rate=0.1):
    f = BloomFilter(capacity=capacity, error_rate=request_error_rate)
    assert (capacity == f.capacity)
    start = time.time()
    for i in range_fn(0, f.capacity):
        f.add(i, skip_check=True)
    end = time.time()
    print("{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format(
        end - start, f.capacity / (end - start)))
    oneBits = f.bitarray.count(True)
    zeroBits = f.bitarray.count(False)
    print("Number of 1 bits:", oneBits)
    print("Number of 0 bits:", zeroBits)
    print("Number of Filter Bits:", f.num_bits)
    print("Number of slices:", f.num_slices)
    print("Bits per slice:", f.bits_per_slice)
    print("------")
    print("Fraction of 1 bits at capacity: {:5.3f}".format(
        oneBits / float(f.num_bits)))
    # Look for false positives and measure the actual fp rate
    trials = f.capacity
    fp = 0
    start = time.time()
    for i in range_fn(f.capacity, f.capacity + trials):
        if i in f:
            fp += 1
    end = time.time()
    print(("{:5.3f} seconds to check false positives, "
           "{:10.2f} checks/second".format(end - start, trials / (end - start))))
    print("Requested FP rate: {:2.4f}".format(request_error_rate))
    print("Experimental false positive rate: {:2.4f}".format(fp / float(trials)))
    # Compute the theoretical fp max (Goel/Gupta)
    k = f.num_slices
    m = f.num_bits
    n = f.capacity
    fp_theory = math.pow((1 - math.exp(-k * (n + 0.5) / (m - 1))), k)
    print("Projected FP rate (Goel/Gupta): {:2.6f}".format(fp_theory))

if __name__ == '__main__':
    main()
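
For reference, the `fp_theory` line above implements the Goel/Gupta upper
bound on the false-positive rate of a Bloom filter with `k` hash functions,
`m` bits and `n` inserted elements:

```latex
p \le \left(1 - e^{-k(n + 0.5)/(m - 1)}\right)^{k}
```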
--------------------------------------------------------------------------------
/pybloom_live/pybloom.py:
--------------------------------------------------------------------------------
"""This module implements a Bloom filter probabilistic data structure and
a Scalable Bloom Filter that grows in size as you add more items to it,
without increasing the false positive error_rate.

Requires the bitarray library: http://pypi.python.org/pypi/bitarray/
"""
from __future__ import absolute_import

import copy
import hashlib
import math
from struct import calcsize, pack, unpack

import xxhash

from pybloom_live.utils import is_string_io, range_fn, running_python_3

try:
    import bitarray
except ImportError:
    raise ImportError('pybloom_live requires bitarray >= 0.3.4')


def make_hashfuncs(num_slices, num_bits):
    # Pick the smallest unsigned struct code wide enough to index num_bits.
    if num_bits >= (1 << 31):
        fmt_code, chunk_size = 'Q', 8
    elif num_bits >= (1 << 15):
        fmt_code, chunk_size = 'I', 4
    else:
        fmt_code, chunk_size = 'H', 2
    total_hash_bits = 8 * num_slices * chunk_size
    # Pick the cheapest digest long enough to supply all slice indexes.
    if total_hash_bits > 384:
        hashfn = hashlib.sha512
    elif total_hash_bits > 256:
        hashfn = hashlib.sha384
    elif total_hash_bits > 160:
        hashfn = hashlib.sha256
    elif total_hash_bits > 128:
        hashfn = hashlib.sha1
    else:
        hashfn = xxhash.xxh128

    fmt = fmt_code * (hashfn().digest_size // chunk_size)
    num_salts, extra = divmod(num_slices, len(fmt))
    if extra:
        num_salts += 1
    salts = tuple(hashfn(hashfn(pack('I', i)).digest()) for i in range_fn(0, num_salts))

    def _hash_maker(key):
        if running_python_3:
            if isinstance(key, str):
                key = key.encode('utf-8')
            else:
                key = str(key).encode('utf-8')
        else:
            if isinstance(key, unicode):
                key = key.encode('utf-8')
            else:
                key = str(key)
        i = 0
        for salt in salts:
            h = salt.copy()
            h.update(key)
            for uint in unpack(fmt, h.digest()):
                yield uint % num_bits
                i += 1
                if i >= num_slices:
                    return

    return _hash_maker, hashfn

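# A note on the scheme above (illustrative): a single call to the selected
# hash function produces a digest that unpack() splits into several
# fixed-width unsigned integers, so one strong hash stands in for many
# "independent" hash functions; when one digest is too short to cover
# num_slices indexes, additional deterministic salts extend it. For example,
# make_hashfuncs(5, 1024) returns a generator function that maps a key to
# five bit indexes, each in [0, 1024).
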
class BloomFilter(object):
    FILE_FMT = b'<dQQQQ'

    def __init__(self, capacity, error_rate=0.001):
        """Implements a space-efficient probabilistic data structure

        capacity
            this BloomFilter must be able to store at least *capacity*
            elements while maintaining no more than *error_rate* chance of
            false positives

        error_rate
            the error_rate of the filter returning false positives. This
            determines the filter's capacity. Inserting more than capacity
            elements greatly increases the chance of false positives.

        >>> b = BloomFilter(capacity=100000, error_rate=0.001)
        >>> b.add("test")
        False
        >>> "test" in b
        True

        """
        if not (0 < error_rate < 1):
            raise ValueError("Error_Rate must be between 0 and 1.")
        if not capacity > 0:
            raise ValueError("Capacity must be > 0")
        # given M = num_bits, k = num_slices, P = error_rate, n = capacity
        # k = log2(1/P)
        # solving for m = bits_per_slice
        # n ~= M * ((ln(2) ** 2) / abs(ln(P)))
        # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
        # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
        num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
        bits_per_slice = int(math.ceil(
            (capacity * abs(math.log(error_rate))) /
            (num_slices * (math.log(2) ** 2))))
        self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
        self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
        self.bitarray.setall(False)

    def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count):
        self.error_rate = error_rate
        self.num_slices = num_slices
        self.bits_per_slice = bits_per_slice
        self.capacity = capacity
        self.num_bits = num_slices * bits_per_slice
        self.count = count
        self.make_hashes, self.hashfn = make_hashfuncs(self.num_slices, self.bits_per_slice)

    def __contains__(self, key):
        """Tests a key's membership in this bloom filter.
        """
        bits_per_slice = self.bits_per_slice
        bitarray = self.bitarray
        hashes = self.make_hashes(key)
        offset = 0
        for k in hashes:
            if not bitarray[offset + k]:
                return False
            offset += bits_per_slice
        return True

    def __len__(self):
        """Return the number of keys stored by this bloom filter."""
        return self.count

    def add(self, key, skip_check=False):
        """Adds a key to this bloom filter.
        If the key already exists in this filter it will return True.
        Otherwise False.
        """
        bitarray = self.bitarray
        bits_per_slice = self.bits_per_slice
        hashes = self.make_hashes(key)
        found_all_bits = True
        if self.count > self.capacity:
            raise IndexError("BloomFilter is at capacity")
        offset = 0
        for k in hashes:
            if not skip_check and found_all_bits and not bitarray[offset + k]:
                found_all_bits = False
            self.bitarray[offset + k] = True
            offset += bits_per_slice

        if skip_check:
            self.count += 1
            return False
        elif not found_all_bits:
            self.count += 1
            return False
        else:
            return True

    def copy(self):
        """Return a copy of this bloom filter.
        """
        new_filter = BloomFilter(self.capacity, self.error_rate)
        new_filter.bitarray = self.bitarray.copy()
        return new_filter

    def union(self, other):
        """Calculates the union of the two underlying bitarrays and returns
        a new bloom filter object."""
        if self.capacity != other.capacity or \
                self.error_rate != other.error_rate:
            raise ValueError(
                "Unioning filters requires both filters to have the same capacity and error rate")
        new_bloom = self.copy()
        new_bloom.bitarray = new_bloom.bitarray | other.bitarray
        return new_bloom

    def __or__(self, other):
        return self.union(other)

    def intersection(self, other):
        """Calculates the intersection of the two underlying bitarrays and
        returns a new bloom filter object."""
        if self.capacity != other.capacity or \
                self.error_rate != other.error_rate:
            raise ValueError(
                "Intersecting filters requires both filters to have equal capacity and error rate")
        new_bloom = self.copy()
        new_bloom.bitarray = new_bloom.bitarray & other.bitarray
        return new_bloom

    def __and__(self, other):
        return self.intersection(other)

    def tofile(self, f):
        """Write the bloom filter to file object `f'. Underlying bits
        are written as machine values. This is much more space
        efficient than pickling the object."""
        f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices,
                     self.bits_per_slice, self.capacity, self.count))
        (f.write(self.bitarray.tobytes()) if is_string_io(f)
         else self.bitarray.tofile(f))

    @classmethod
    def fromfile(cls, f, n=-1):
        """Read a bloom filter from file-object `f' serialized with
        ``BloomFilter.tofile''. If `n' > 0 read only so many bytes."""
        headerlen = calcsize(cls.FILE_FMT)

        if 0 < n < headerlen:
            raise ValueError('n too small!')

        filter = cls(1)  # Bogus instantiation, we will `_setup'.
        filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
        filter.bitarray = bitarray.bitarray(endian='little')
        if n > 0:
            (filter.bitarray.frombytes(f.read(n - headerlen)) if is_string_io(f)
             else filter.bitarray.fromfile(f, n - headerlen))
        else:
            (filter.bitarray.frombytes(f.read()) if is_string_io(f)
             else filter.bitarray.fromfile(f))
        if filter.num_bits != len(filter.bitarray) and \
                (filter.num_bits + (8 - filter.num_bits % 8) != len(filter.bitarray)):
            raise ValueError('Bit length mismatch!')

        return filter

    def __getstate__(self):
        d = self.__dict__.copy()
        del d['make_hashes']
        return d

    def __setstate__(self, d):
        self.__dict__.update(d)
        self.make_hashes, self.hashfn = make_hashfuncs(self.num_slices, self.bits_per_slice)

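# Illustrative use of the set operations defined above (assumes two filters
# constructed with identical capacity and error_rate):
#
#     a = BloomFilter(capacity=1000, error_rate=0.001)
#     b = BloomFilter(capacity=1000, error_rate=0.001)
#     merged = a | b   # same bits as a.union(b)
#     common = a & b   # same bits as a.intersection(b)
#
# Caveat: copy() starts the result with count = 0 and only the bit patterns
# are combined, so len() of a union or intersection does not reflect the
# number of elements added to the operands.
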
class ScalableBloomFilter(object):
    SMALL_SET_GROWTH = 2  # slower, but takes up less memory
    LARGE_SET_GROWTH = 4  # faster, but takes up more memory faster
    FILE_FMT = '<idQd'

    def __init__(self, initial_capacity=100, error_rate=0.001,
                 mode=LARGE_SET_GROWTH):
        """Implements a space-efficient probabilistic data structure that
        grows as more items are added while maintaining a steady false
        positive rate

        initial_capacity
            the initial capacity of the filter
        error_rate
            the error_rate of the filter returning false positives. This
            determines the filter's capacity. Going over capacity greatly
            increases the chance of false positives.
        mode
            can be either ScalableBloomFilter.SMALL_SET_GROWTH or
            ScalableBloomFilter.LARGE_SET_GROWTH. SMALL_SET_GROWTH is slower
            but uses less memory. LARGE_SET_GROWTH is faster but consumes
            memory faster.

        >>> b = ScalableBloomFilter(initial_capacity=512, error_rate=0.001,
        ...                         mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        >>> b.add("test")
        False
        >>> "test" in b
        True
        """
        if not error_rate or error_rate < 0:
            raise ValueError("Error_Rate must be a decimal greater than 0.")
        # 0.9 is the tightening ratio applied to each successive filter.
        self._setup(mode, 0.9, initial_capacity, error_rate)
        self.filters = []

    def _setup(self, mode, ratio, initial_capacity, error_rate):
        self.scale = mode
        self.ratio = ratio
        self.initial_capacity = initial_capacity
        self.error_rate = error_rate

    def __contains__(self, key):
        """Tests a key's membership in this bloom filter.
        """
        for f in reversed(self.filters):
            if key in f:
                return True
        return False

    def add(self, key):
        """Adds a key to this bloom filter.
        If the key already exists in this filter it will return True.
        Otherwise False.
        """
        if key in self:
            return True
        if not self.filters:
            filter = BloomFilter(
                capacity=self.initial_capacity,
                error_rate=self.error_rate * self.ratio)
            self.filters.append(filter)
        else:
            filter = self.filters[-1]
            if filter.count >= filter.capacity:
                filter = BloomFilter(
                    capacity=filter.capacity * self.scale,
                    error_rate=filter.error_rate * self.ratio)
                self.filters.append(filter)
        filter.add(key, skip_check=True)
        return False

    def union(self, other):
        """Calculates the union of the underlying classic bloom filters and
        returns a new scalable bloom filter object."""

        if self.scale != other.scale or \
                self.initial_capacity != other.initial_capacity or \
                self.error_rate != other.error_rate:
            raise ValueError("Unioning two scalable bloom filters requires "
                             "both filters to have the same mode, initial "
                             "capacity and error rate")
        if len(self.filters) > len(other.filters):
            larger_sbf = copy.deepcopy(self)
            smaller_sbf = other
        else:
            larger_sbf = copy.deepcopy(other)
            smaller_sbf = self
        # Union the underlying classic bloom filters
        new_filters = []
        for i in range(len(smaller_sbf.filters)):
            new_filter = larger_sbf.filters[i] | smaller_sbf.filters[i]
            new_filters.append(new_filter)
        for i in range(len(smaller_sbf.filters), len(larger_sbf.filters)):
            new_filters.append(larger_sbf.filters[i])
        larger_sbf.filters = new_filters
        return larger_sbf

    def __or__(self, other):
        return self.union(other)

    @property
    def capacity(self):
        """Returns the total capacity for all filters in this SBF"""
        return sum(f.capacity for f in self.filters)

    @property
    def count(self):
        return len(self)

    def tofile(self, f):
        """Serialize this ScalableBloomFilter into the file-object `f'."""
        f.write(pack(self.FILE_FMT, self.scale, self.ratio,
                     self.initial_capacity, self.error_rate))

        # Write #-of-filters
        f.write(pack(b'<l', len(self.filters)))

        if len(self.filters) > 0:
            # Then each filter directly, with a header describing
            # their lengths.
            headerpos = f.tell()
            headerfmt = b'<' + b'Q' * (len(self.filters))
            f.write(b'.' * calcsize(headerfmt))
            filter_sizes = []
            for filter in self.filters:
                begin = f.tell()
                filter.tofile(f)
                filter_sizes.append(f.tell() - begin)

            f.seek(headerpos)
            f.write(pack(headerfmt, *filter_sizes))

    @classmethod
    def fromfile(cls, f):
        """Deserialize the ScalableBloomFilter in file object `f'."""
        filter = cls()
        filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT))))
        nfilters, = unpack(b'<l', f.read(calcsize(b'<l')))
        if nfilters > 0:
            header_fmt = b'<' + b'Q' * nfilters
            bytes = f.read(calcsize(header_fmt))
            filter_lengths = unpack(header_fmt, bytes)
            for fl in filter_lengths:
                filter.filters.append(BloomFilter.fromfile(f, fl))
        else:
            filter.filters = []

        return filter

    def __len__(self):
        """Returns the total number of elements stored in this SBF"""
        return sum(f.count for f in self.filters)


if __name__ == "__main__":
    import doctest

    doctest.testmod()
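
As a quick illustration of the growth behaviour implemented by
`ScalableBloomFilter.add` above (a sketch with hypothetical numbers, not
part of the repository), each time the newest internal filter fills up, a
larger filter with a tighter error rate is appended:

```python
from pybloom_live import ScalableBloomFilter

sbf = ScalableBloomFilter(initial_capacity=100, error_rate=0.001,
                          mode=ScalableBloomFilter.SMALL_SET_GROWTH)
for i in range(1000):
    sbf.add(i)

# With SMALL_SET_GROWTH (scale factor 2) the internal capacities double:
print([f.capacity for f in sbf.filters])  # [100, 200, 400, 800]
print(sbf.capacity)                       # 1500, enough for all 1000 items
```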
--------------------------------------------------------------------------------
/pybloom_live/test_pybloom.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import

from pybloom_live.pybloom import (BloomFilter, ScalableBloomFilter,
                                  make_hashfuncs)
from pybloom_live.utils import range_fn, running_python_3

try:
    import cStringIO
    import StringIO
except ImportError:
    pass

import io
import random
import tempfile
import unittest

import pytest


class TestMakeHashFuncs(unittest.TestCase):
    def test_make_hashfuncs_returns_hashfn(self):
        make_hashes, hashfn = make_hashfuncs(100, 20)
        self.assertEqual('openssl_sha512', hashfn.__name__)
        make_hashes, hashfn = make_hashfuncs(20, 3)
        self.assertEqual('openssl_sha384', hashfn.__name__)
        make_hashes, hashfn = make_hashfuncs(15, 2)
        self.assertEqual('openssl_sha256', hashfn.__name__)
        make_hashes, hashfn = make_hashfuncs(10, 2)
        self.assertEqual('openssl_sha1', hashfn.__name__)
        make_hashes, hashfn = make_hashfuncs(5, 1)
        self.assertEqual('xxh3_128', hashfn.__name__)


class TestUnionIntersection(unittest.TestCase):
    def test_union(self):
        bloom_one = BloomFilter(100, 0.001)
        bloom_two = BloomFilter(100, 0.001)
        chars = [chr(i) for i in range_fn(97, 123)]
        for char in chars[int(len(chars)/2):]:
            bloom_one.add(char)
        for char in chars[:int(len(chars)/2)]:
            bloom_two.add(char)
        new_bloom = bloom_one.union(bloom_two)
        for char in chars:
            self.assertTrue(char in new_bloom)

    def test_intersection(self):
        bloom_one = BloomFilter(100, 0.001)
        bloom_two = BloomFilter(100, 0.001)
        chars = [chr(i) for i in range_fn(97, 123)]
        for char in chars:
            bloom_one.add(char)
        for char in chars[:int(len(chars)/2)]:
            bloom_two.add(char)
        new_bloom = bloom_one.intersection(bloom_two)
        for char in chars[:int(len(chars)/2)]:
            self.assertTrue(char in new_bloom)
        for char in chars[int(len(chars)/2):]:
            self.assertTrue(char not in new_bloom)

    def test_intersection_capacity_fail(self):
        bloom_one = BloomFilter(1000, 0.001)
        bloom_two = BloomFilter(100, 0.001)

        def _run():
            bloom_one.intersection(bloom_two)
        self.assertRaises(ValueError, _run)

    def test_union_capacity_fail(self):
        bloom_one = BloomFilter(1000, 0.001)
        bloom_two = BloomFilter(100, 0.001)

        def _run():
            bloom_one.union(bloom_two)
        self.assertRaises(ValueError, _run)

    def test_intersection_k_fail(self):
        bloom_one = BloomFilter(100, 0.001)
        bloom_two = BloomFilter(100, 0.01)

        def _run():
            bloom_one.intersection(bloom_two)
        self.assertRaises(ValueError, _run)

    def test_union_k_fail(self):
        bloom_one = BloomFilter(100, 0.01)
        bloom_two = BloomFilter(100, 0.001)

        def _run():
            bloom_one.union(bloom_two)
        self.assertRaises(ValueError, _run)

    def test_union_scalable_bloom_filter(self):
        bloom_one = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        bloom_two = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        numbers = [i for i in range_fn(1, 10000)]
        middle = int(len(numbers) / 2)
        for number in numbers[middle:]:
            bloom_one.add(number)
        for number in numbers[:middle]:
            bloom_two.add(number)
        new_bloom = bloom_one.union(bloom_two)
        for number in numbers:
            self.assertTrue(number in new_bloom)


class TestSerialization:
    SIZE = 12345
    EXPECTED = set([random.randint(0, 10000100) for _ in range_fn(0, SIZE)])

    @pytest.mark.parametrize("cls,args", [
        (BloomFilter, (SIZE,)),
        (ScalableBloomFilter, ()),
    ])
    @pytest.mark.parametrize("stream_factory", [
        lambda: tempfile.TemporaryFile,
        lambda: io.BytesIO,
        pytest.param(
            lambda: cStringIO.StringIO,
            marks=pytest.mark.skipif(running_python_3, reason="Python 2 only")),
        pytest.param(
            lambda: StringIO.StringIO,
            marks=pytest.mark.skipif(running_python_3, reason="Python 2 only")),
    ])
    def test_serialization(self, cls, args, stream_factory):
        filter = cls(*args)
        for item in self.EXPECTED:
            filter.add(item)

        f = stream_factory()()
        filter.tofile(f)
        del filter

        f.seek(0)
        filter = cls.fromfile(f)
        for item in self.EXPECTED:
            assert item in filter


if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/pybloom_live/utils.py:
--------------------------------------------------------------------------------
import sys
import itertools

try:
    import StringIO
    import cStringIO
except ImportError:
    pass

from io import BytesIO

running_python_3 = sys.version_info[0] == 3


def range_fn(start=0, stop=None):
    if running_python_3:
        return range(start, stop)
    else:
        return iter(itertools.count(start).next, stop)


def is_string_io(instance):
    if isinstance(instance, BytesIO):
        return True
    if not running_python_3:
        return isinstance(instance, (StringIO.StringIO,
                                     cStringIO.InputType,
                                     cStringIO.OutputType))
    return False
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
bitarray>=0.3.4
xxhash>=3.0.0
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from setuptools import setup

VERSION = '3.1.0'
DESCRIPTION = "Bloom filter: A probabilistic data structure"
LONG_DESCRIPTION = """
This Bloom filter is forked from pybloom; its tightening ratio has been
changed to 0.9, and this ratio is used consistently. Choosing r around
0.8 - 0.9 results in better average space usage for a wide range of growth,
therefore the default value of mode is set to LARGE_SET_GROWTH.
This is a Python implementation of the Bloom filter probabilistic data
structure. The module also provides a Scalable Bloom Filter that allows a
bloom filter to grow without knowing the original set size.
"""

# setuptools expects a list of strings, so materialise the iterator.
CLASSIFIERS = list(filter(None, map(str.strip,
"""
Intended Audience :: Developers
License :: OSI Approved :: MIT License
Programming Language :: Python
Programming Language :: Python :: 3
Operating System :: OS Independent
Topic :: Utilities
Topic :: Database :: Database Engines/Servers
Topic :: Software Development :: Libraries :: Python Modules
""".splitlines())))

setup(
    name="pybloom_live",
    version=VERSION,
    description=DESCRIPTION,
    long_description=LONG_DESCRIPTION,
    classifiers=CLASSIFIERS,
    keywords=('data structures', 'bloom filter', 'bloom', 'filter', 'big data',
              'probabilistic', 'set'),
    author="Jay Baird",
    author_email="jay.baird@me.com",
    url="https://github.com/joseph-fox/python-bloomfilter",
    license="MIT License",
    platforms=['any'],
    test_suite="pybloom_live.test_pybloom",
    zip_safe=True,
    # Keep runtime requirements in sync with requirements.txt.
    install_requires=['bitarray>=0.3.4', 'xxhash>=3.0.0'],
    packages=['pybloom_live']
)
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
[tox]
envlist = py27,py34,py37
[testenv]
deps=pytest==3.6.3
commands=py.test pybloom_live/test_pybloom.py
--------------------------------------------------------------------------------