├── requirements.txt ├── MANIFEST.in ├── .gitignore ├── pybloom ├── __init__.py ├── utils.py ├── benchmarks.py ├── tests.py └── pybloom.py ├── tox.ini ├── .travis.yml ├── CHANGES.txt ├── LICENSE.txt ├── setup.py └── README.rst /requirements.txt: -------------------------------------------------------------------------------- 1 | bitarray>=0.3.4 -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include ez_setup.py 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg 2 | *.pyc 3 | *.egg-info 4 | build 5 | dist 6 | .venv 7 | .tox 8 | -------------------------------------------------------------------------------- /pybloom/__init__.py: -------------------------------------------------------------------------------- 1 | """pybloom 2 | 3 | """ 4 | 5 | from .pybloom import BloomFilter, ScalableBloomFilter 6 | 7 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = py26,py27,py34 3 | [testenv] 4 | deps=pytest 5 | commands=py.test pybloom/tests.py 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.6" 4 | - "2.7" 5 | - "3.4" 6 | 7 | install: 8 | - pip install -r requirements.txt 9 | - pip install pytest 10 | script: 11 | - py.test pybloom 12 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | Changes in 2.0 2 | ============== 3 | Made major corrections to the algorithms for 
"""Python 2/3 compatibility helpers shared by the pybloom modules."""
import sys

try:
    # Both of these modules exist only on Python 2.
    import StringIO
    import cStringIO
except ImportError:
    # Fallback taken when the Python 2 stream modules are unavailable.
    from io import BytesIO

# True only when the interpreter's major version is exactly 3.
running_python_3 = sys.version_info[0] == 3


def range_fn(*args):
    """Call ``range`` on Python 3 and ``xrange`` otherwise.

    All positional arguments are forwarded unchanged.
    """
    # The conditional expression only evaluates the chosen name, so
    # ``xrange`` is never looked up on Python 3.
    make_range = range if running_python_3 else xrange
    return make_range(*args)


def is_string_io(instance):
    """Return True when ``instance`` is a known in-memory stream object.

    On Python 3 that means ``io.BytesIO``; otherwise it means
    ``StringIO.StringIO`` or either of the ``cStringIO`` stream types.
    """
    if not running_python_3:
        legacy_stream_types = (StringIO.StringIO,
                               cStringIO.InputType,
                               cStringIO.OutputType)
        return isinstance(instance, legacy_stream_types)
    return isinstance(instance, BytesIO)
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import setup 3 | 4 | VERSION = '2.0.0' 5 | DESCRIPTION = "PyBloom: A Probabilistic data structure" 6 | LONG_DESCRIPTION = """ 7 | pybloom is a Python implementation of the bloom filter probabilistic data 8 | structure. The module also provides a Scalable Bloom Filter that allows a 9 | bloom filter to grow without knowing the original set size. 
def main(capacity=100000, request_error_rate=0.1):
    """Benchmark a BloomFilter filled to capacity and report its FP rate.

    The filter is filled with the integers ``0 .. capacity - 1``, then the
    same number of integers that were never inserted is probed to measure
    the false positive rate. Timings, bit statistics and the measured,
    requested and projected rates are printed to stdout.

    capacity
        number of elements to insert (and number of lookups to perform)
    request_error_rate
        false positive rate requested from the filter

    Returns None, so ``sys.exit(main())`` exits with status 0.
    """
    f = BloomFilter(capacity=capacity, error_rate=request_error_rate)
    assert (capacity == f.capacity)

    start = time.time()
    for i in range_fn(0, f.capacity):
        f.add(i, skip_check=True)
    # Floor the elapsed time so a clock that has not advanced cannot
    # trigger a ZeroDivisionError in the rate computation.
    elapsed = max(time.time() - start, 1e-9)
    print("{:5.3f} seconds to add to capacity, {:10.2f} entries/second".format(
        elapsed, f.capacity / elapsed))

    one_bits = f.bitarray.count(True)
    print("Number of Filter Bits:", f.num_bits)
    print("Number of slices:", f.num_slices)
    print("Bits per slice:", f.bits_per_slice)
    print("------")
    print("Fraction of 1 bits at capacity: {:5.3f}".format(
        one_bits / float(f.num_bits)))

    # Look for false positives and measure the actual fp rate. Exactly
    # ``trials`` never-inserted keys are probed, so the count of hits is
    # divided by the number of lookups actually made (the previous range
    # ran one key too far).
    trials = f.capacity
    start = time.time()
    fp = sum(1 for i in range_fn(f.capacity, f.capacity + trials) if i in f)
    elapsed = max(time.time() - start, 1e-9)
    print(("{:5.3f} seconds to check false positives, "
           "{:10.2f} checks/second".format(elapsed, trials / elapsed)))
    print("Requested FP rate: {:2.4f}".format(request_error_rate))
    print("Experimental false positive rate: {:2.4f}".format(fp / float(trials)))

    # Compute theoretical fp max (Goel/Gupta)
    k = f.num_slices
    m = f.num_bits
    n = f.capacity
    fp_theory = math.pow((1 - math.exp(-k * (n + 0.5) / (m - 1))), k)
    print("Projected FP rate (Goel/Gupta): {:2.6f}".format(fp_theory))
16 | 17 | A filter is "full" when at capacity: M * (((ln 2) ^ 2) / abs(ln p)), where M 18 | is the number of bits and p is the false positive probability. When capacity 19 | is reached a new filter is then created exponentially larger than the last 20 | with a tighter probability of false positives and a larger number of hash 21 | functions. 22 | 23 | .. code-block:: python 24 | 25 | >>> from pybloom import BloomFilter 26 | >>> f = BloomFilter(capacity=1000, error_rate=0.001) 27 | >>> [f.add(x) for x in range(10)] 28 | [False, False, False, False, False, False, False, False, False, False] 29 | >>> all([(x in f) for x in range(10)]) 30 | True 31 | >>> 10 in f 32 | False 33 | >>> 5 in f 34 | True 35 | >>> f = BloomFilter(capacity=1000, error_rate=0.001) 36 | >>> for i in xrange(0, f.capacity): 37 | ... _ = f.add(i) 38 | >>> (1.0 - (len(f) / float(f.capacity))) <= f.error_rate + 2e-18 39 | True 40 | 41 | >>> from pybloom import ScalableBloomFilter 42 | >>> sbf = ScalableBloomFilter(mode=ScalableBloomFilter.SMALL_SET_GROWTH) 43 | >>> count = 10000 44 | >>> for i in xrange(0, count): 45 | ... _ = sbf.add(i) 46 | ... 47 | >>> (1.0 - (len(sbf) / float(count))) <= sbf.error_rate + 2e-18 48 | True 49 | 50 | # len(sbf) may not equal the entire input length. 0.01% error is well 51 | # below the default 0.1% error threshold. As the capacity goes up, the 52 | # error will approach 0.1%. 
def additional_tests():
    """Assemble a TestSuite holding the package's doctests.

    The suite always contains the doctests of ``pybloom.pybloom``. If a
    ``README.txt`` file exists two directories above this module, its
    doctests are appended as well.
    """
    this_file = os.path.abspath(__file__)
    project_root = os.path.dirname(os.path.dirname(this_file))
    # NOTE(review): the repository tree lists README.rst, not README.txt, so
    # this path presumably never exists and the README examples are skipped.
    readme_path = os.path.join(project_root, 'README.txt')

    combined = TestSuite([doctest.DocTestSuite('pybloom.pybloom')])
    if not os.path.exists(readme_path):
        return combined
    combined.addTest(doctest.DocFileSuite(readme_path, module_relative=False))
    return combined
class Serialization(unittest.TestCase):
    """Round-trip both filter classes through ``tofile`` / ``fromfile``."""

    SIZE = 12345
    # Random sample of integers; duplicates collapse, so the set may hold
    # fewer than SIZE items.
    EXPECTED = set([random.randint(0, 10000100) for _ in range_fn(SIZE)])

    def test_serialization(self):
        """Every inserted item must still be reported after deserializing."""
        cases = (
            (BloomFilter, (self.SIZE,)),
            (ScalableBloomFilter, ()),
        )
        for klass, args in cases:
            bloom = klass(*args)
            for item in self.EXPECTED:
                bloom.add(item)

            # One real temporary file plus the in-memory stream types.
            streams_to_test = [tempfile.TemporaryFile(), StringIO()]
            if not running_python_3:
                streams_to_test.append(cStringIO.StringIO())
            for stream in streams_to_test:
                bloom.tofile(stream)

            # Drop the populated filter before reading the copies back.
            del bloom

            for stream in streams_to_test:
                stream.seek(0)
                restored = klass.fromfile(stream)
                for item in self.EXPECTED:
                    self.assertTrue(item in restored)
                del restored
                stream.close()
def make_hashfuncs(num_slices, num_bits):
    """Build a function that maps a key to ``num_slices`` hash values.

    num_slices
        how many hash values the returned function yields per key
    num_bits
        modulus applied to every hash value, so each value lies in
        ``range(num_bits)``

    Returns a generator function taking a single ``key``. Keys are turned
    into bytes first: text is encoded as UTF-8 and any other object goes
    through ``str()``.
    """
    # Pick the struct code (and its byte width) used to cut a digest into
    # unsigned integers, based on how large ``num_bits`` is.
    if num_bits >= (1 << 31):
        fmt_code, chunk_size = 'Q', 8
    elif num_bits >= (1 << 15):
        fmt_code, chunk_size = 'I', 4
    else:
        fmt_code, chunk_size = 'H', 2
    # Total number of digest bits needed to produce all slices for one key;
    # it selects which hashlib algorithm is used.
    total_hash_bits = 8 * num_slices * chunk_size
    if total_hash_bits > 384:
        hashfn = hashlib.sha512
    elif total_hash_bits > 256:
        hashfn = hashlib.sha384
    elif total_hash_bits > 160:
        hashfn = hashlib.sha256
    elif total_hash_bits > 128:
        hashfn = hashlib.sha1
    else:
        hashfn = hashlib.md5
    # Format that unpacks one whole digest into as many integers as fit.
    fmt = fmt_code * (hashfn().digest_size // chunk_size)
    # Number of digests needed per key: ceil(num_slices / ints per digest).
    num_salts, extra = divmod(num_slices, len(fmt))
    if extra:
        num_salts += 1
    # One pre-seeded hash object per digest; each is copied per key below so
    # the salt itself is never mutated.
    salts = tuple(hashfn(hashfn(pack('I', i)).digest()) for i in range_fn(num_salts))
    def _make_hashfuncs(key):
        # Normalise the key to a byte string before hashing.
        if running_python_3:
            if isinstance(key, str):
                key = key.encode('utf-8')
            else:
                key = str(key).encode('utf-8')
        else:
            if isinstance(key, unicode):
                key = key.encode('utf-8')
            else:
                key = str(key)
        i = 0
        for salt in salts:
            h = salt.copy()
            h.update(key)
            for uint in unpack(fmt, h.digest()):
                yield uint % num_bits
                i += 1
                # Stop as soon as the requested number of values is out,
                # even if the current digest has integers left over.
                if i >= num_slices:
                    return

    return _make_hashfuncs
bits_per_slice, capacity, count): 142 | self.error_rate = error_rate 143 | self.num_slices = num_slices 144 | self.bits_per_slice = bits_per_slice 145 | self.capacity = capacity 146 | self.num_bits = num_slices * bits_per_slice 147 | self.count = count 148 | self.make_hashes = make_hashfuncs(self.num_slices, self.bits_per_slice) 149 | 150 | def __contains__(self, key): 151 | """Tests a key's membership in this bloom filter. 152 | 153 | >>> b = BloomFilter(capacity=100) 154 | >>> b.add("hello") 155 | False 156 | >>> "hello" in b 157 | True 158 | 159 | """ 160 | bits_per_slice = self.bits_per_slice 161 | bitarray = self.bitarray 162 | hashes = self.make_hashes(key) 163 | offset = 0 164 | for k in hashes: 165 | if not bitarray[offset + k]: 166 | return False 167 | offset += bits_per_slice 168 | return True 169 | 170 | def __len__(self): 171 | """Return the number of keys stored by this bloom filter.""" 172 | return self.count 173 | 174 | def add(self, key, skip_check=False): 175 | """ Adds a key to this bloom filter. If the key already exists in this 176 | filter it will return True. Otherwise False. 177 | 178 | >>> b = BloomFilter(capacity=100) 179 | >>> b.add("hello") 180 | False 181 | >>> b.add("hello") 182 | True 183 | >>> b.count 184 | 1 185 | 186 | """ 187 | bitarray = self.bitarray 188 | bits_per_slice = self.bits_per_slice 189 | hashes = self.make_hashes(key) 190 | found_all_bits = True 191 | if self.count > self.capacity: 192 | raise IndexError("BloomFilter is at capacity") 193 | offset = 0 194 | for k in hashes: 195 | if not skip_check and found_all_bits and not bitarray[offset + k]: 196 | found_all_bits = False 197 | self.bitarray[offset + k] = True 198 | offset += bits_per_slice 199 | 200 | if skip_check: 201 | self.count += 1 202 | return False 203 | elif not found_all_bits: 204 | self.count += 1 205 | return False 206 | else: 207 | return True 208 | 209 | def copy(self): 210 | """Return a copy of this bloom filter. 
211 | """ 212 | new_filter = BloomFilter(self.capacity, self.error_rate) 213 | new_filter.bitarray = self.bitarray.copy() 214 | return new_filter 215 | 216 | def union(self, other): 217 | """ Calculates the union of the two underlying bitarrays and returns 218 | a new bloom filter object.""" 219 | if self.capacity != other.capacity or \ 220 | self.error_rate != other.error_rate: 221 | raise ValueError("Unioning filters requires both filters to have \ 222 | both the same capacity and error rate") 223 | new_bloom = self.copy() 224 | new_bloom.bitarray = new_bloom.bitarray | other.bitarray 225 | return new_bloom 226 | 227 | def __or__(self, other): 228 | return self.union(other) 229 | 230 | def intersection(self, other): 231 | """ Calculates the intersection of the two underlying bitarrays and returns 232 | a new bloom filter object.""" 233 | if self.capacity != other.capacity or \ 234 | self.error_rate != other.error_rate: 235 | raise ValueError("Intersecting filters requires both filters to \ 236 | have equal capacity and error rate") 237 | new_bloom = self.copy() 238 | new_bloom.bitarray = new_bloom.bitarray & other.bitarray 239 | return new_bloom 240 | 241 | def __and__(self, other): 242 | return self.intersection(other) 243 | 244 | def tofile(self, f): 245 | """Write the bloom filter to file object `f'. Underlying bits 246 | are written as machine values. This is much more space 247 | efficient than pickling the object.""" 248 | f.write(pack(self.FILE_FMT, self.error_rate, self.num_slices, 249 | self.bits_per_slice, self.capacity, self.count)) 250 | (f.write(self.bitarray.tobytes()) if is_string_io(f) 251 | else self.bitarray.tofile(f)) 252 | 253 | @classmethod 254 | def fromfile(cls, f, n=-1): 255 | """Read a bloom filter from file-object `f' serialized with 256 | ``BloomFilter.tofile''. 
class ScalableBloomFilter(object):
    """Bloom filter that grows by chaining progressively larger BloomFilters.

    Keys are always added to the newest internal filter. Once that filter
    holds as many keys as its capacity, a new one is appended whose capacity
    is the previous capacity times ``scale`` and whose error rate is the
    previous error rate times ``ratio``.
    """
    SMALL_SET_GROWTH = 2  # slower, but takes up less memory
    LARGE_SET_GROWTH = 4  # faster, but takes up more memory faster
    # Header layout written by ``tofile``: scale (int), ratio (double),
    # initial_capacity (unsigned 64-bit), error_rate (double), little-endian.
    # NOTE(review): this literal and the ``__init__`` signature below were
    # garbled in the reviewed text and are reconstructed from the values
    # packed in ``tofile``; confirm the ``initial_capacity=100`` default.
    FILE_FMT = '<idQd'

    def __init__(self, initial_capacity=100, error_rate=0.001,
                 mode=SMALL_SET_GROWTH):
        """Create an empty scalable bloom filter.

        initial_capacity
            capacity of the first internal filter
        error_rate
            overall false positive rate to aim for; must be greater than 0
        mode
            growth factor applied to the capacity of each new internal
            filter, either ScalableBloomFilter.SMALL_SET_GROWTH or
            ScalableBloomFilter.LARGE_SET_GROWTH

        Raises ValueError when ``error_rate`` is missing, zero or negative.

        >>> b = ScalableBloomFilter(initial_capacity=512, error_rate=0.001, \
                                    mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        >>> b.add("test")
        False
        >>> "test" in b
        True
        >>> unicode_string = u'¡'
        >>> b.add(unicode_string)
        False
        >>> unicode_string in b
        True
        """
        if not error_rate or error_rate < 0:
            # The old message claimed the rate must be "less than 0", which
            # is the opposite of what this check enforces.
            raise ValueError("Error_Rate must be a decimal greater than 0.")
        self._setup(mode, 0.9, initial_capacity, error_rate)
        self.filters = []

    def _setup(self, mode, ratio, initial_capacity, error_rate):
        """Store the growth parameters (also used when deserializing)."""
        self.scale = mode
        self.ratio = ratio
        self.initial_capacity = initial_capacity
        self.error_rate = error_rate

    def __contains__(self, key):
        """Tests a key's membership in this bloom filter.

        >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
                                    mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        >>> b.add("hello")
        False
        >>> "hello" in b
        True

        """
        # Newest filter first, matching the original lookup order.
        return any(key in bloom for bloom in reversed(self.filters))

    def add(self, key):
        """Adds a key to this bloom filter.
        If the key already exists in this filter it will return True.
        Otherwise False.

        >>> b = ScalableBloomFilter(initial_capacity=100, error_rate=0.001, \
                                    mode=ScalableBloomFilter.SMALL_SET_GROWTH)
        >>> b.add("hello")
        False
        >>> b.add("hello")
        True

        """
        if key in self:
            return True
        if not self.filters:
            # First filter gets a fraction (1 - ratio) of the overall rate.
            current = BloomFilter(
                capacity=self.initial_capacity,
                error_rate=self.error_rate * (1.0 - self.ratio))
            self.filters.append(current)
        else:
            current = self.filters[-1]
            if current.count >= current.capacity:
                # Newest filter is full: chain a larger, tighter one.
                current = BloomFilter(
                    capacity=current.capacity * self.scale,
                    error_rate=current.error_rate * self.ratio)
                self.filters.append(current)
        # Membership was already checked above, so skip the per-bit check.
        current.add(key, skip_check=True)
        return False

    @property
    def capacity(self):
        """Returns the total capacity for all filters in this SBF"""
        return sum(bloom.capacity for bloom in self.filters)

    @property
    def count(self):
        """Number of stored elements; same value as ``len(self)``."""
        return len(self)

    def tofile(self, f):
        """Serialize this ScalableBloomFilter into the file-object `f'.

        `f' must support ``write``, ``tell`` and ``seek``. On return the
        stream is positioned at the end of the written data.
        """
        f.write(pack(self.FILE_FMT, self.scale, self.ratio,
                     self.initial_capacity, self.error_rate))

        # Write #-of-filters
        f.write(pack(b'<l', len(self.filters)))

        if self.filters:
            # Then each filter directly, with a header describing
            # their lengths. The header is written as a placeholder first
            # and filled in once the sizes are known.
            header_pos = f.tell()
            header_fmt = b'<' + b'Q' * len(self.filters)
            f.write(b'.' * calcsize(header_fmt))
            filter_sizes = []
            for bloom in self.filters:
                begin = f.tell()
                bloom.tofile(f)
                filter_sizes.append(f.tell() - begin)

            end_pos = f.tell()
            f.seek(header_pos)
            f.write(pack(header_fmt, *filter_sizes))
            # Return to the end; otherwise a caller that keeps writing would
            # overwrite the filter data that follows the header.
            f.seek(end_pos)

    @classmethod
    def fromfile(cls, f):
        """Deserialize the ScalableBloomFilter in file object `f'."""
        sbf = cls()
        sbf._setup(*unpack(cls.FILE_FMT, f.read(calcsize(cls.FILE_FMT))))
        nfilters, = unpack(b'<l', f.read(calcsize(b'<l')))
        if nfilters > 0:
            header_fmt = b'<' + b'Q' * nfilters
            header = f.read(calcsize(header_fmt))
            # Each entry is the byte length of one serialized BloomFilter.
            for length in unpack(header_fmt, header):
                sbf.filters.append(BloomFilter.fromfile(f, length))
        else:
            sbf.filters = []

        return sbf

    def __len__(self):
        """Returns the total number of elements stored in this SBF"""
        return sum(bloom.count for bloom in self.filters)