├── .gitignore
├── .travis.yml
├── CHANGES.txt
├── LICENSE.txt
├── MANIFEST.in
├── README.md
├── pybloom_live
│   ├── __init__.py
│   ├── benchmarks.py
│   ├── pybloom.py
│   ├── test_pybloom.py
│   └── utils.py
├── requirements.txt
├── setup.py
└── tox.ini

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
env
*.pyc
*.sqlite3

### Django ###
*.log
*.pot
*.pyc
__pycache__/
local_settings.py
db.sqlite3
media

### macOS ###
*.DS_Store
.AppleDouble
.LSOverride

# Icon must end with two \r
Icon

# Thumbnails
._*

# Files that might appear in the root of a volume
.DocumentRevisions-V100
.fseventsd
.Spotlight-V100
.TemporaryItems
.Trashes
.VolumeIcon.icns
.com.apple.timemachine.donotpresent

# Directories potentially created on remote AFP share
.AppleDB
.AppleDesktop
Network Trash Folder
Temporary Items
.apdisk

### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
.idea/*

# CMake
cmake-build-debug/

# Mongo Explorer plugin:
.idea/**/mongoSettings.xml

## File-based project format:
*.iws

## Plugin-specific files:

# IntelliJ
/out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721

# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr

# Sonarlint plugin
.idea/sonarlint

### Python ###
# Byte-compiled / optimized / DLL files
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
.pytest_cache
nosetests.xml
coverage.xml
*,cover
.hypothesis/

# Translations
*.mo

# Django stuff:

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

### SublimeText ###
# cache files for sublime text
*.tmlanguage.cache
*.tmPreferences.cache
*.stTheme.cache

# workspace files are user-specific
*.sublime-workspace

# project files should be checked into the repository, unless a significant
# proportion of contributors will probably not be using SublimeText
# *.sublime-project

# sftp configuration file
sftp-config.json

# Package control specific files
Package Control.last-run
Package Control.ca-list
Package Control.ca-bundle
Package Control.system-ca-bundle
Package Control.cache/
Package Control.ca-certs/
Package Control.merged-ca-bundle
Package Control.user-ca-bundle
oscrypto-ca-bundle.crt
bh_unicode_properties.cache

# Sublime-github package stores a github token in this file
# https://packagecontrol.io/packages/sublime-github
GitHub.sublime-settings

### Vim ###
# swap
[._]*.s[a-v][a-z]
[._]*.sw[a-p]
[._]s[a-v][a-z]
[._]sw[a-p]
# session
Session.vim
# temporary
.netrwhist
*~
# auto-generated tag files
tags

# End of https://www.gitignore.io/api/vim,macos,django,python,pycharm,sublimetext
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: python
matrix:
  include:
    - python: 2.7
      dist: xenial
      sudo: false
    - python: 3.4
      dist: trusty
      sudo: false
    - python: 3.7
      dist: xenial
      sudo: true

install:
  - pip install -r requirements.txt
  - pip install --upgrade pytest==3.6.3
script:
  - py.test pybloom_live
--------------------------------------------------------------------------------
/CHANGES.txt:
--------------------------------------------------------------------------------
Changes in 3.1.0
================
Deprecated `bitarray.length()` and switched to `len()`.

Changes in 3.0.0
================
Backward-incompatible change: dropped support for Python 2.6.
Fixed a BytesIO issue that prevented Python 2.7 from serialising a filter.

Changes in 2.3.2
================
Added the hash function to the filter instance.

Changes in 2.3.1
================
Added union functionality to ScalableBloomFilter.

Changes in 2.2
==============
Replaced xrange with itertools.count, so range_fn is now an iterator.
This fixes an overflow error when a large integer is passed to range_fn.

Changes in 2.1
==============
The tightening ratio is 0.9, and it is consistently used.
Choosing r around 0.8 - 0.9 will result in better average
space usage for a wide range of growth, therefore the default
value of mode is set to LARGE_SET_GROWTH.

Changes in 2.0
==============
Made major corrections to the algorithms for both BloomFilter and
ScalableBloomFilter. Not numerically compatible with serialized
representations of filters from previous versions. Specifically,
BloomFilter was more accurate than requested and ScalableBloomFilter
was much less accurate than requested.

Changes in 1.1
==============
Added copy, intersection and union functions to BloomFilter
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
Copyright (c) <2011>

Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation
files (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use,
copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the
Software is furnished to do so, subject to the following
conditions:

The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
include LICENSE.txt
include ez_setup.py
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![Build Status](https://travis-ci.org/joseph-fox/python-bloomfilter.svg?branch=master)](https://travis-ci.org/joseph-fox/python-bloomfilter)

# Python Bloom Filter


This Bloom filter has its tightening ratio updated to 0.9, and this ratio
is consistently used throughout the `pybloom` module. Choosing r around
0.8 - 0.9 results in better average space usage for a wide range of growth,
therefore the default value of mode is set to LARGE_SET_GROWTH. This module
includes a Bloom filter data structure along with an implementation of
Scalable Bloom Filters as discussed in:

```
P. Almeida, C. Baquero, N. Preguiça, D. Hutchison, Scalable Bloom Filters, (GLOBECOM 2007), IEEE, 2007.
```
Bloom filters are great if you know in advance how many bits you need to
set aside to store your entire set. Scalable Bloom Filters allow your bloom
filter bits to grow as a function of false positive probability and size.

A filter is "full" when it reaches capacity: `M * ((ln 2)^2 / abs(ln p))`,
where M is the number of bits and p is the false positive probability. When
capacity is reached, a new filter is created that is exponentially larger
than the last, with a tighter probability of false positives and a larger
number of hash functions.

```python
>>> import pybloom_live
>>> f = pybloom_live.BloomFilter(capacity=1000, error_rate=0.001)
>>> [f.add(x) for x in range(10)]
[False, False, False, False, False, False, False, False, False, False]
>>> all([(x in f) for x in range(10)])
True
>>> 10 in f
False
>>> 5 in f
True
>>> f = pybloom_live.BloomFilter(capacity=1000, error_rate=0.001)
>>> for i in range(0, f.capacity):
...     _ = f.add(i)
>>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18
True

>>> sbf = pybloom_live.ScalableBloomFilter(mode=pybloom_live.ScalableBloomFilter.SMALL_SET_GROWTH)
>>> count = 10000
>>> for i in range(0, count):
...     _ = sbf.add(i)
>>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18
True
# len(sbf) may not equal the entire input length. 0.01% error is well
# below the default 0.1% error threshold. As the capacity goes up, the
# error will approach 0.1%.
```
# Development
We follow this [git branching model](http://nvie.com/posts/a-successful-git-branching-model/),
so please have a look at it.


# Installation instructions
If you are installing from an internet-connected computer (or virtual
install), you can use the pip package manager to download and install this
package. Simply type `pip install pybloom-live` from a Windows command
prompt (`cmd.exe`) or a Unix shell such as `bash`, on macOS or any Linux
distribution (Debian, Slackware, Red Hat, Arch and so on).

If you are using Windows and are installing onto an air-gapped computer, or
want the most up-to-date version from this repository, you can do the
following:

1. Download the zip file by clicking on the green "Clone or Download"
   link followed by "Download Zip."

2. Extract all the contents of the zip folder.

3. Open a command prompt (`cmd.exe`) in the extracted folder.

   a. Find the extracted folder in Windows Explorer.

   b. From the parent folder level, Shift+RightClick on the folder.

   c. Select "Open command window here".

4. Type `pip install .`.

Similar steps work under Linux and macOS.

# Breaking changes with 4.x
Support for non-cryptographic hashes was added in 4.0.0. For 128-bit hashes,
md5 has been replaced with xxh3_128, one of the [fastest](https://github.com/Cyan4973/xxHash)
non-cryptographic hash functions. Details of the benchmark runs can be found
[here](https://github.com/joseph-fox/python-bloomfilter/pull/38). Files
generated with earlier versions of the module *will not work* with this
version. Consider re-generating them using the latest version, which is
optimized for speed.

# Installation verification
Type `pip show pybloom-live` from a command prompt; the reported version
should match the latest release (3.1.0, per this repository's `setup.py`).
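
As a further check, this minimal sketch round-trips a filter through
`tofile`/`fromfile` (an in-memory `io.BytesIO` buffer is used here for
brevity; any binary-mode file object works the same way):

```python
import io

import pybloom_live

f = pybloom_live.BloomFilter(capacity=1000, error_rate=0.001)
for word in ("foo", "bar", "baz"):
    f.add(word)

buf = io.BytesIO()      # or open("filter.bin", "wb")
f.tofile(buf)
buf.seek(0)

g = pybloom_live.BloomFilter.fromfile(buf)
print("bar" in g)       # True
```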
--------------------------------------------------------------------------------
/pybloom_live/__init__.py:
--------------------------------------------------------------------------------
"""pybloom

"""

from .pybloom import BloomFilter, ScalableBloomFilter
--------------------------------------------------------------------------------
/pybloom_live/benchmarks.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""Test performance of BloomFilter at a set capacity and error rate."""
import math
import time

from pybloom_live.pybloom import BloomFilter
from pybloom_live.utils import range_fn


def main(capacity=100000, request_error_rate=0.1):
    f = BloomFilter(capacity=capacity, error_rate=request_error_rate)
    assert (capacity == f.capacity)
    start = time.time()
    for i in range_fn(0, f.capacity):
        f.add(i, skip_check=True)
    end = time.time()
    print("{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format(
        end - start, f.capacity / (end - start)))
    oneBits = f.bitarray.count(True)
    zeroBits = f.bitarray.count(False)
    print("Number of 1 bits:", oneBits)
    print("Number of 0 bits:", zeroBits)
    print("Number of Filter Bits:", f.num_bits)
    print("Number of slices:", f.num_slices)
    print("Bits per slice:", f.bits_per_slice)
    print("------")
    print("Fraction of 1 bits at capacity: {:5.3f}".format(
        oneBits / float(f.num_bits)))
    # Look for false positives and measure the actual fp rate
    trials = f.capacity
    fp = 0
    start = time.time()
    for i in range_fn(f.capacity, f.capacity + trials):
        if i in f:
            fp += 1
    end = time.time()
    print(("{:5.3f} seconds to check false positives, "
           "{:10.2f} checks/second".format(end - start, trials / (end - start))))
    print("Requested FP rate: {:2.4f}".format(request_error_rate))
    print("Experimental false positive rate: {:2.4f}".format(fp / float(trials)))
    # Compute the theoretical fp max (Goel/Gupta)
    k = f.num_slices
    m = f.num_bits
    n = f.capacity
    fp_theory = math.pow((1 - math.exp(-k * (n + 0.5) / (m - 1))), k)
    print("Projected FP rate (Goel/Gupta): {:2.6f}".format(fp_theory))

if __name__ == '__main__':
    main()
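
For reference, the `fp_theory` line above implements the Goel/Gupta upper
bound on the false-positive rate of a Bloom filter with `k` hash functions,
`m` bits and `n` inserted elements:

```latex
p \le \left(1 - e^{-k(n + 0.5)/(m - 1)}\right)^{k}
```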
--------------------------------------------------------------------------------
/pybloom_live/pybloom.py:
--------------------------------------------------------------------------------
"""This module implements a Bloom filter probabilistic data structure and
a Scalable Bloom Filter that grows in size as you add more items to it,
without increasing the false positive error_rate.

Requires the bitarray library: http://pypi.python.org/pypi/bitarray/
"""
from __future__ import absolute_import

import copy
import hashlib
import math
from struct import calcsize, pack, unpack

import xxhash

from pybloom_live.utils import is_string_io, range_fn, running_python_3

try:
    import bitarray
except ImportError:
    raise ImportError('pybloom_live requires bitarray >= 0.3.4')


def make_hashfuncs(num_slices, num_bits):
    # Pick the smallest unsigned struct code wide enough to index num_bits.
    if num_bits >= (1 << 31):
        fmt_code, chunk_size = 'Q', 8
    elif num_bits >= (1 << 15):
        fmt_code, chunk_size = 'I', 4
    else:
        fmt_code, chunk_size = 'H', 2
    total_hash_bits = 8 * num_slices * chunk_size
    # Pick the cheapest digest long enough to supply all slice indexes.
    if total_hash_bits > 384:
        hashfn = hashlib.sha512
    elif total_hash_bits > 256:
        hashfn = hashlib.sha384
    elif total_hash_bits > 160:
        hashfn = hashlib.sha256
    elif total_hash_bits > 128:
        hashfn = hashlib.sha1
    else:
        hashfn = xxhash.xxh128

    fmt = fmt_code * (hashfn().digest_size // chunk_size)
    num_salts, extra = divmod(num_slices, len(fmt))
    if extra:
        num_salts += 1
    salts = tuple(hashfn(hashfn(pack('I', i)).digest()) for i in range_fn(0, num_salts))

    def _hash_maker(key):
        if running_python_3:
            if isinstance(key, str):
                key = key.encode('utf-8')
            else:
                key = str(key).encode('utf-8')
        else:
            if isinstance(key, unicode):
                key = key.encode('utf-8')
            else:
                key = str(key)
        i = 0
        for salt in salts:
            h = salt.copy()
            h.update(key)
            for uint in unpack(fmt, h.digest()):
                yield uint % num_bits
                i += 1
                if i >= num_slices:
                    return

    return _hash_maker, hashfn

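# A note on the scheme above (illustrative): a single call to the selected
# hash function produces a digest that unpack() splits into several
# fixed-width unsigned integers, so one strong hash stands in for many
# "independent" hash functions; when one digest is too short to cover
# num_slices indexes, additional deterministic salts extend it. For example,
# make_hashfuncs(5, 1024) returns a generator function that maps a key to
# five bit indexes, each in [0, 1024).
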
class BloomFilter(object):
    FILE_FMT = b'<dQQQQ'

    def __init__(self, capacity, error_rate=0.001):
        """Implements a space-efficient probabilistic data structure

        capacity
            this BloomFilter must be able to store at least *capacity*
            elements while maintaining no more than *error_rate* chance of
            false positives

        error_rate
            the error_rate of the filter returning false positives. This
            determines the filter's capacity. Inserting more than capacity
            elements greatly increases the chance of false positives.

        >>> b = BloomFilter(capacity=100000, error_rate=0.001)
        >>> b.add("test")
        False
        >>> "test" in b
        True

        """
        if not (0 < error_rate < 1):
            raise ValueError("Error_Rate must be between 0 and 1.")
        if not capacity > 0:
            raise ValueError("Capacity must be > 0")
        # given M = num_bits, k = num_slices, P = error_rate, n = capacity
        # k = log2(1/P)
        # solving for m = bits_per_slice
        # n ~= M * ((ln(2) ** 2) / abs(ln(P)))
        # n ~= (k * m) * ((ln(2) ** 2) / abs(ln(P)))
        # m ~= n * abs(ln(P)) / (k * (ln(2) ** 2))
        num_slices = int(math.ceil(math.log(1.0 / error_rate, 2)))
        bits_per_slice = int(math.ceil(
            (capacity * abs(math.log(error_rate))) /
            (num_slices * (math.log(2) ** 2))))
        self._setup(error_rate, num_slices, bits_per_slice, capacity, 0)
        self.bitarray = bitarray.bitarray(self.num_bits, endian='little')
        self.bitarray.setall(False)

    def _setup(self, error_rate, num_slices, bits_per_slice, capacity, count):
        self.error_rate = error_rate
        self.num_slices = num_slices
        self.bits_per_slice = bits_per_slice
        self.capacity = capacity
        self.num_bits = num_slices * bits_per_slice
        self.count = count
        self.make_hashes, self.hashfn = make_hashfuncs(self.num_slices, self.bits_per_slice)

    def __contains__(self, key):
        """Tests a key's membership in this bloom filter.
        """
        bits_per_slice = self.bits_per_slice
        bitarray = self.bitarray
        hashes = self.make_hashes(key)
        offset = 0
        for k in hashes:
            if not bitarray[offset + k]:
                return False
            offset += bits_per_slice
        return True

    def __len__(self):
        """Return the number of keys stored by this bloom filter."""
        return self.count

    def add(self, key, skip_check=False):
        """Adds a key to this bloom filter.
        If the key already exists in this filter it will return True.
        Otherwise False.
        """
        bitarray = self.bitarray
        bits_per_slice = self.bits_per_slice
        hashes = self.make_hashes(key)
        found_all_bits = True
        if self.count > self.capacity:
            raise IndexError("BloomFilter is at capacity")
        offset = 0
        for k in hashes:
            if not skip_check and found_all_bits and not bitarray[offset + k]:
                found_all_bits = False
            self.bitarray[offset + k] = True
            offset += bits_per_slice

        if skip_check:
            self.count += 1
            return False
        elif not found_all_bits:
            self.count += 1
            return False
        else:
            return True

    def copy(self):
        """Return a copy of this bloom filter.
        """
        new_filter = BloomFilter(self.capacity, self.error_rate)
        new_filter.bitarray = self.bitarray.copy()
        return new_filter

    def union(self, other):
        """Calculates the union of the two underlying bitarrays and returns
        a new bloom filter object."""
        if self.capacity != other.capacity or \
                self.error_rate != other.error_rate:
            raise ValueError(
                "Unioning filters requires both filters to have the same capacity and error rate")
        new_bloom = self.copy()
        new_bloom.bitarray = new_bloom.bitarray | other.bitarray
        return new_bloom

    def __or__(self, other):
        return self.union(other)

    def intersection(self, other):
        """Calculates the intersection of the two underlying bitarrays and
        returns a new bloom filter object."""
        if self.capacity != other.capacity or \
                self.error_rate != other.error_rate:
            raise ValueError(
                "Intersecting filters requires both filters to have equal capacity and error rate")
        new_bloom = self.copy()
        new_bloom.bitarray = new_bloom.bitarray & other.bitarray
        return new_bloom

    def __and__(self, other):
        return self.intersection(other)

    def tofile(self, f):
        """Write the bloom filter to file object `f'. Underlying bits
        are written as machine values. This is much more space
        efficient than pickling the object."""
        f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices,
                     self.bits_per_slice, self.capacity, self.count))
        (f.write(self.bitarray.tobytes()) if is_string_io(f)
         else self.bitarray.tofile(f))

    @classmethod
    def fromfile(cls, f, n=-1):
        """Read a bloom filter from file-object `f' serialized with
        ``BloomFilter.tofile''. If `n' > 0 read only so many bytes."""
        headerlen = calcsize(cls.FILE_FMT)

        if 0 < n < headerlen:
            raise ValueError('n too small!')

        filter = cls(1)  # Bogus instantiation, we will `_setup'.
        filter._setup(*unpack(cls.FILE_FMT, f.read(headerlen)))
        filter.bitarray = bitarray.bitarray(endian='little')
        if n > 0:
            (filter.bitarray.frombytes(f.read(n - headerlen)) if is_string_io(f)
             else filter.bitarray.fromfile(f, n - headerlen))
        else:
            (filter.bitarray.frombytes(f.read()) if is_string_io(f)
             else filter.bitarray.fromfile(f))
        if filter.num_bits != len(filter.bitarray) and \
                (filter.num_bits + (8 - filter.num_bits % 8) != len(filter.bitarray)):
            raise ValueError('Bit length mismatch!')

        return filter

    def __getstate__(self):
        d = self.__dict__.copy()
        del d['make_hashes']
        return d

    def __setstate__(self, d):
        self.__dict__.update(d)
        self.make_hashes, self.hashfn = make_hashfuncs(self.num_slices, self.bits_per_slice)

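# Illustrative use of the set operations defined above (assumes two filters
# constructed with identical capacity and error_rate):
#
#     a = BloomFilter(capacity=1000, error_rate=0.001)
#     b = BloomFilter(capacity=1000, error_rate=0.001)
#     merged = a | b   # same bits as a.union(b)
#     common = a & b   # same bits as a.intersection(b)
#
# Caveat: copy() starts the result with count = 0 and only the bit patterns
# are combined, so len() of a union or intersection does not reflect the
# number of elements added to the operands.
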
class ScalableBloomFilter(object):
    SMALL_SET_GROWTH = 2  # slower, but takes up less memory
    LARGE_SET_GROWTH = 4  # faster, but takes up more memory faster
    FILE_FMT = '<idQd'

    def __init__(self, initial_capacity=100, error_rate=0.001,
                 mode=LARGE_SET_GROWTH):
        """Implements a space-efficient probabilistic data structure that
        grows as more items are added while maintaining a steady false
        positive rate

        initial_capacity
            the initial capacity of the filter
        error_rate
            the error_rate of the filter returning false positives. This
            determines the filter's capacity. Going over capacity greatly
            increases the chance of false positives.
        mode
            can be either ScalableBloomFilter.SMALL_SET_GROWTH or
            ScalableBloomFilter.LARGE_SET_GROWTH. SMALL_SET_GROWTH is slower
            but uses less memory. LARGE_SET_GROWTH is faster but consumes
            memory faster.

        >>> b = ScalableBloomFilter(initial_capacity=512, error_rate=0.001,
        ...                         mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        >>> b.add("test")
        False
        >>> "test" in b
        True
        """
        if not error_rate or error_rate < 0:
            raise ValueError("Error_Rate must be a decimal greater than 0.")
        # 0.9 is the tightening ratio applied to each successive filter.
        self._setup(mode, 0.9, initial_capacity, error_rate)
        self.filters = []

    def _setup(self, mode, ratio, initial_capacity, error_rate):
        self.scale = mode
        self.ratio = ratio
        self.initial_capacity = initial_capacity
        self.error_rate = error_rate

    def __contains__(self, key):
        """Tests a key's membership in this bloom filter.
        """
        for f in reversed(self.filters):
            if key in f:
                return True
        return False

    def add(self, key):
        """Adds a key to this bloom filter.
        If the key already exists in this filter it will return True.
        Otherwise False.
        """
        if key in self:
            return True
        if not self.filters:
            filter = BloomFilter(
                capacity=self.initial_capacity,
                error_rate=self.error_rate * self.ratio)
            self.filters.append(filter)
        else:
            filter = self.filters[-1]
            if filter.count >= filter.capacity:
                filter = BloomFilter(
                    capacity=filter.capacity * self.scale,
                    error_rate=filter.error_rate * self.ratio)
                self.filters.append(filter)
        filter.add(key, skip_check=True)
        return False

    def union(self, other):
        """Calculates the union of the underlying classic bloom filters and
        returns a new scalable bloom filter object."""

        if self.scale != other.scale or \
                self.initial_capacity != other.initial_capacity or \
                self.error_rate != other.error_rate:
            raise ValueError("Unioning two scalable bloom filters requires "
                             "both filters to have the same mode, initial "
                             "capacity and error rate")
        if len(self.filters) > len(other.filters):
            larger_sbf = copy.deepcopy(self)
            smaller_sbf = other
        else:
            larger_sbf = copy.deepcopy(other)
            smaller_sbf = self
        # Union the underlying classic bloom filters
        new_filters = []
        for i in range(len(smaller_sbf.filters)):
            new_filter = larger_sbf.filters[i] | smaller_sbf.filters[i]
            new_filters.append(new_filter)
        for i in range(len(smaller_sbf.filters), len(larger_sbf.filters)):
            new_filters.append(larger_sbf.filters[i])
        larger_sbf.filters = new_filters
        return larger_sbf

    def __or__(self, other):
        return self.union(other)

    @property
    def capacity(self):
        """Returns the total capacity for all filters in this SBF"""
        return sum(f.capacity for f in self.filters)

    @property
    def count(self):
        return len(self)

    def tofile(self, f):
        """Serialize this ScalableBloomFilter into the file-object `f'."""
        f.write(pack(self.FILE_FMT, self.scale, self.ratio,
                     self.initial_capacity, self.error_rate))

        # Write #-of-filters
        f.write(pack(b'<l', len(self.filters)))

        if len(self.filters) > 0:
            # Then each filter directly, with a header describing
            # their lengths.
            headerpos = f.tell()
            headerfmt = b'<' + b'Q' * (len(self.filters))
            f.write(b'.' * calcsize(headerfmt))
            filter_sizes = []
            for filter in self.filters:
                begin = f.tell()
                filter.tofile(f)
                filter_sizes.append(f.tell() - begin)

            f.seek(headerpos)
            f.write(pack(headerfmt, *filter_sizes))

    @classmethod
    def fromfile(cls, f):
        """Deserialize the ScalableBloomFilter in file object `f'."""
        filter = cls()
        filter._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT))))
        nfilters, = unpack(b'<l', f.read(calcsize(b'<l')))
        if nfilters > 0:
            header_fmt = b'<' + b'Q' * nfilters
            bytes = f.read(calcsize(header_fmt))
            filter_lengths = unpack(header_fmt, bytes)
            for fl in filter_lengths:
                filter.filters.append(BloomFilter.fromfile(f, fl))
        else:
            filter.filters = []

        return filter

    def __len__(self):
        """Returns the total number of elements stored in this SBF"""
        return sum(f.count for f in self.filters)


if __name__ == "__main__":
    import doctest

    doctest.testmod()
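
As a quick illustration of the growth behaviour implemented by
`ScalableBloomFilter.add` above (a sketch with hypothetical numbers, not
part of the repository), each time the newest internal filter fills up, a
larger filter with a tighter error rate is appended:

```python
from pybloom_live import ScalableBloomFilter

sbf = ScalableBloomFilter(initial_capacity=100, error_rate=0.001,
                          mode=ScalableBloomFilter.SMALL_SET_GROWTH)
for i in range(1000):
    sbf.add(i)

# With SMALL_SET_GROWTH (scale factor 2) the internal capacities double:
print([f.capacity for f in sbf.filters])  # [100, 200, 400, 800]
print(sbf.capacity)                       # 1500, enough for all 1000 items
```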
--------------------------------------------------------------------------------
/pybloom_live/test_pybloom.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import

from pybloom_live.pybloom import (BloomFilter, ScalableBloomFilter,
                                  make_hashfuncs)
from pybloom_live.utils import range_fn, running_python_3

try:
    import cStringIO
    import StringIO
except ImportError:
    pass

import io
import random
import tempfile
import unittest

import pytest


class TestMakeHashFuncs(unittest.TestCase):
    def test_make_hashfuncs_returns_hashfn(self):
        make_hashes, hashfn = make_hashfuncs(100, 20)
        self.assertEqual('openssl_sha512', hashfn.__name__)
        make_hashes, hashfn = make_hashfuncs(20, 3)
        self.assertEqual('openssl_sha384', hashfn.__name__)
        make_hashes, hashfn = make_hashfuncs(15, 2)
        self.assertEqual('openssl_sha256', hashfn.__name__)
        make_hashes, hashfn = make_hashfuncs(10, 2)
        self.assertEqual('openssl_sha1', hashfn.__name__)
        make_hashes, hashfn = make_hashfuncs(5, 1)
        self.assertEqual('xxh3_128', hashfn.__name__)


class TestUnionIntersection(unittest.TestCase):
    def test_union(self):
        bloom_one = BloomFilter(100, 0.001)
        bloom_two = BloomFilter(100, 0.001)
        chars = [chr(i) for i in range_fn(97, 123)]
        for char in chars[int(len(chars)/2):]:
            bloom_one.add(char)
        for char in chars[:int(len(chars)/2)]:
            bloom_two.add(char)
        new_bloom = bloom_one.union(bloom_two)
        for char in chars:
            self.assertTrue(char in new_bloom)

    def test_intersection(self):
        bloom_one = BloomFilter(100, 0.001)
        bloom_two = BloomFilter(100, 0.001)
        chars = [chr(i) for i in range_fn(97, 123)]
        for char in chars:
            bloom_one.add(char)
        for char in chars[:int(len(chars)/2)]:
            bloom_two.add(char)
        new_bloom = bloom_one.intersection(bloom_two)
        for char in chars[:int(len(chars)/2)]:
            self.assertTrue(char in new_bloom)
        for char in chars[int(len(chars)/2):]:
            self.assertTrue(char not in new_bloom)

    def test_intersection_capacity_fail(self):
        bloom_one = BloomFilter(1000, 0.001)
        bloom_two = BloomFilter(100, 0.001)

        def _run():
            bloom_one.intersection(bloom_two)
        self.assertRaises(ValueError, _run)

    def test_union_capacity_fail(self):
        bloom_one = BloomFilter(1000, 0.001)
        bloom_two = BloomFilter(100, 0.001)

        def _run():
            bloom_one.union(bloom_two)
        self.assertRaises(ValueError, _run)

    def test_intersection_k_fail(self):
        bloom_one = BloomFilter(100, 0.001)
        bloom_two = BloomFilter(100, 0.01)

        def _run():
            bloom_one.intersection(bloom_two)
        self.assertRaises(ValueError, _run)

    def test_union_k_fail(self):
        bloom_one = BloomFilter(100, 0.01)
        bloom_two = BloomFilter(100, 0.001)

        def _run():
            bloom_one.union(bloom_two)
        self.assertRaises(ValueError, _run)

    def test_union_scalable_bloom_filter(self):
        bloom_one = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        bloom_two = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        numbers = [i for i in range_fn(1, 10000)]
        middle = int(len(numbers) / 2)
        for number in numbers[middle:]:
            bloom_one.add(number)
        for number in numbers[:middle]:
            bloom_two.add(number)
        new_bloom = bloom_one.union(bloom_two)
        for number in numbers:
            self.assertTrue(number in new_bloom)


class TestSerialization:
    SIZE = 12345
    EXPECTED = set([random.randint(0, 10000100) for _ in range_fn(0, SIZE)])

    @pytest.mark.parametrize("cls,args", [
        (BloomFilter, (SIZE,)),
        (ScalableBloomFilter, ()),
    ])
    @pytest.mark.parametrize("stream_factory", [
        lambda: tempfile.TemporaryFile,
        lambda: io.BytesIO,
        pytest.param(
            lambda: cStringIO.StringIO,
            marks=pytest.mark.skipif(running_python_3, reason="Python 2 only")),
        pytest.param(
            lambda: StringIO.StringIO,
            marks=pytest.mark.skipif(running_python_3, reason="Python 2 only")),
    ])
    def test_serialization(self, cls, args, stream_factory):
        filter = cls(*args)
        for item in self.EXPECTED:
            filter.add(item)

        f = stream_factory()()
        filter.tofile(f)
        del filter

        f.seek(0)
        filter = cls.fromfile(f)
        for item in self.EXPECTED:
            assert item in filter


if __name__ == '__main__':
    unittest.main()
--------------------------------------------------------------------------------
/pybloom_live/utils.py:
--------------------------------------------------------------------------------
import sys
import itertools

try:
    import StringIO
    import cStringIO
except ImportError:
    pass

from io import BytesIO

running_python_3 = sys.version_info[0] == 3


def range_fn(start=0, stop=None):
    if running_python_3:
        return range(start, stop)
    else:
        return iter(itertools.count(start).next, stop)


def is_string_io(instance):
    if isinstance(instance, BytesIO):
        return True
    if not running_python_3:
        return isinstance(instance, (StringIO.StringIO,
                                     cStringIO.InputType,
                                     cStringIO.OutputType))
    return False
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
bitarray>=0.3.4
xxhash>=3.0.0
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from setuptools import setup

VERSION = '3.1.0'
DESCRIPTION = "Bloom filter: A probabilistic data structure"
LONG_DESCRIPTION = """
This Bloom filter is forked from pybloom; its tightening ratio has been
changed to 0.9, and this ratio is used consistently. Choosing r around
0.8 - 0.9 results in better average space usage for a wide range of growth,
therefore the default value of mode is set to LARGE_SET_GROWTH.
This is a Python implementation of the Bloom filter probabilistic data
structure. The module also provides a Scalable Bloom Filter that allows a
bloom filter to grow without knowing the original set size.
"""

# setuptools expects a list of strings, so materialise the iterator.
CLASSIFIERS = list(filter(None, map(str.strip,
"""
Intended Audience :: Developers
License :: OSI Approved :: MIT License
Programming Language :: Python
Programming Language :: Python :: 3
Operating System :: OS Independent
Topic :: Utilities
Topic :: Database :: Database Engines/Servers
Topic :: Software Development :: Libraries :: Python Modules
""".splitlines())))

setup(
    name="pybloom_live",
    version=VERSION,
    description=DESCRIPTION,
    long_description=LONG_DESCRIPTION,
    classifiers=CLASSIFIERS,
    keywords=('data structures', 'bloom filter', 'bloom', 'filter', 'big data',
              'probabilistic', 'set'),
    author="Jay Baird",
    author_email="jay.baird@me.com",
    url="https://github.com/joseph-fox/python-bloomfilter",
    license="MIT License",
    platforms=['any'],
    test_suite="pybloom_live.test_pybloom",
    zip_safe=True,
    # Keep runtime requirements in sync with requirements.txt.
    install_requires=['bitarray>=0.3.4', 'xxhash>=3.0.0'],
    packages=['pybloom_live']
)
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
[tox]
envlist = py27,py34,py37
[testenv]
deps=pytest==3.6.3
commands=py.test pybloom_live/test_pybloom.py
--------------------------------------------------------------------------------