├── .github └── workflows │ └── test.yml ├── .gitignore ├── AUTHORS.md ├── Makefile ├── README.rst ├── bin └── count_bits.py ├── setup.py ├── src └── bloom_filter2 │ ├── __init__.py │ └── bloom_filter.py └── tests └── test_bloom_filter.py /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | - push 5 | - pull_request 6 | 7 | jobs: 8 | test: 9 | strategy: 10 | fail-fast: false 11 | matrix: 12 | os: [ubuntu-latest] 13 | python: 14 | - "3.5" 15 | - "3.8" 16 | runs-on: ${{ matrix.os }} 17 | steps: 18 | - uses: actions/checkout@v2 19 | - uses: actions/setup-python@v1 20 | with: 21 | python-version: ${{ matrix.python }} 22 | - name: Test 23 | run: PYTHONPATH=src python tests/test_bloom_filter.py 24 | 25 | check: 26 | runs-on: ubuntu-latest 27 | steps: 28 | - uses: actions/checkout@v2 29 | - uses: actions/setup-python@v1 30 | with: 31 | python-version: "3.8" 32 | - name: Install dependencies 33 | run: pip install flake8 34 | - name: Test 35 | run: flake8 --ignore=W503 src tests bin 36 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### JetBrains template 3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 5 | 6 | # User-specific stuff: 7 | .idea/**/workspace.xml 8 | .idea/**/tasks.xml 9 | .idea/dictionaries 10 | 11 | # Sensitive or high-churn files: 12 | .idea/**/dataSources/ 13 | .idea/**/dataSources.ids 14 | .idea/**/dataSources.xml 15 | .idea/**/dataSources.local.xml 16 | .idea/**/sqlDataSources.xml 17 | .idea/**/dynamic.xml 18 | .idea/**/uiDesigner.xml 19 | 20 | # Gradle: 21 | .idea/**/gradle.xml 22 | .idea/**/libraries 23 | 24 | # Mongo Explorer plugin: 25 | .idea/**/mongoSettings.xml 26 | 27 | ## File-based project format: 28 | *.iws 29 | 30 | ## Plugin-specific files: 31 | 32 | # IntelliJ 33 | /out/ 34 | 35 | # mpeltonen/sbt-idea plugin 36 | .idea_modules/ 37 | 38 | # JIRA plugin 39 | atlassian-ide-plugin.xml 40 | 41 | # Crashlytics plugin (for Android Studio and IntelliJ) 42 | com_crashlytics_export_strings.xml 43 | crashlytics.properties 44 | crashlytics-build.properties 45 | fabric.properties 46 | ### VirtualEnv template 47 | # Virtualenv 48 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 49 | .Python 50 | [Ii]nclude 51 | [Ll]ib64 52 | [Ll]ocal 53 | [Ss]cripts 54 | pyvenv.cfg 55 | .venv 56 | pip-selfcheck.json 57 | ### Python template 58 | # Byte-compiled / optimized / DLL files 59 | __pycache__/ 60 | *.py[cod] 61 | *$py.class 62 | 63 | # C extensions 64 | *.so 65 | 66 | # Distribution / packaging 67 | .Python 68 | env/ 69 | build/ 70 | develop-eggs/ 71 | dist/ 72 | downloads/ 73 | eggs/ 74 | .eggs/ 75 | lib/ 76 | lib64/ 77 | parts/ 78 | sdist/ 79 | var/ 80 | wheels/ 81 | *.egg-info/ 82 | .installed.cfg 83 | *.egg 84 | 85 | # PyInstaller 86 | # Usually these files are written by a python script from a template 87 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 88 | *.manifest 89 | *.spec 90 | 91 | # Installer logs 92 | pip-log.txt 93 | pip-delete-this-directory.txt 94 | 95 | # Unit test / coverage reports 96 | htmlcov/ 97 | .tox/ 98 | .coverage 99 | .coverage.* 100 | .cache 101 | nosetests.xml 102 | coverage.xml 103 | *,cover 104 | .hypothesis/ 105 | 106 | # Translations 107 | *.mo 108 | *.pot 109 | 110 | # Django stuff: 111 | *.log 112 | local_settings.py 113 | 114 | # Flask stuff: 115 | instance/ 116 | .webassets-cache 117 | 118 | # Scrapy stuff: 119 | .scrapy 120 | 121 | # Sphinx documentation 122 | docs/_build/ 123 | 124 | # PyBuilder 125 | target/ 126 | 127 | # Jupyter Notebook 128 | .ipynb_checkpoints 129 | 130 | # pyenv 131 | .python-version 132 | 133 | # celery beat schedule file 134 | celerybeat-schedule 135 | 136 | # SageMath parsed files 137 | *.sage.py 138 | 139 | # dotenv 140 | .env 141 | 142 | # virtualenv 143 | .venv 144 | venv/ 145 | ENV/ 146 | 147 | # Spyder project settings 148 | .spyderproject 149 | 150 | # Rope project settings 151 | .ropeproject 152 | 153 | bloom-filter-rm-me 154 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | Original code: 2 | 3 | - http://code.activestate.com/recipes/577686-bloom-filter/ 4 | - Author: Sundar Srinivasan 5 | 6 | Forked into SVN Repo: 7 | 8 | - http://stromberg.dnsalias.org/svn/bloom-filter/trunk/ 9 | - Author: Daniel Richard Stromberg 10 | 11 | Forked to GitHub, renamed to `bloom_filter`: 12 | 13 | - https://github.com/hiway/python-bloom-filter 14 | - Author: Harshad Sharma 15 | 16 | Forked after it was found unmaintained for a way, renamed to `bloom_filter2`: 17 | 18 | - https://github.com/remram44/python-bloom-filter 19 | - Maintainer: Remi Rampin 20 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | go: performance-graph.pdf 3 | evince performance-graph.pdf 4 | 5 | performance-graph.pdf: performance-numbers.db gen-performance-graph 6 | ./gen-performance-graph 7 | 8 | performance-numbers.db: test-bloom-filter 9 | ./this-pylint \ 10 | --ignore-message ".*Unable to import 'dbm'" \ 11 | --ignore-message ".*Unable to import 'anydbm'" \ 12 | --to-pylint bloom_filter_mod.py test-bloom-filter 13 | rm -f seek.txt array.txt hybrid.txt mmap.txt 14 | #/usr/local/pypy-2.3.1/bin/pypy ./test-bloom-filter --performance-test 15 | /usr/local/pypy-2.3.1/bin/pypy ./test-bloom-filter 16 | /usr/local/cpython-3.4/bin/python ./test-bloom-filter 17 | /usr/local/cpython-2.5/bin/python ./test-bloom-filter 18 | #/usr/local/cpython-2.7/bin/python ./test-bloom-filter 19 | #/usr/local/cpython-3.0/bin/python ./test-bloom-filter 20 | /usr/local/jython-2.7b3/bin/jython ./test-bloom-filter 21 | 22 | clean: 23 | rm -f *.pyc *.class 24 | rm -rf __pycache__ 25 | rm -f bloom-filter-rm-me 26 | rm -f *.ps *.pdf 27 | rm -f seek.txt array.txt 28 | rm -rf dist build bloom_filter.egg-info 29 | rm -f performance-numbers 30 | 31 | veryclean: clean 32 | rm -f performance-numbers.db 33 | rm -f performance-numbers 34 | 35 | build: 36 | python setup.py sdist bdist_wheel 37 | 38 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | bloom-filter 2 | ============ 3 | 4 | A pure python bloom filter (low storage requirement, probabilistic 5 | set datastructure) is provided. It is known to work on CPython 3.x, Pypy, 6 | and Jython. 7 | 8 | Includes mmap, in-memory and disk-seek backends. 9 | 10 | This project builds on `drs-bloom-filter` and `bloom_filter_mod`. 11 | Credits and links can be found in AUTHORS.md. 12 | 13 | Usage 14 | ----- 15 | 16 | The user specifies the desired maximum number of elements and the 17 | desired maximum false positive probability, and the module 18 | calculates the rest. 19 | 20 | .. code-block:: python 21 | 22 | from bloom_filter2 import BloomFilter 23 | 24 | # instantiate BloomFilter with custom settings, 25 | # max_elements is how many elements you expect the filter to hold. 26 | # error_rate defines accuracy; You can use defaults with 27 | # `BloomFilter()` without any arguments. Following example 28 | # is same as defaults: 29 | bloom = BloomFilter(max_elements=10000, error_rate=0.1) 30 | 31 | # Test whether the bloom-filter has seen a key: 32 | assert "test-key" not in bloom 33 | 34 | # Mark the key as seen 35 | bloom.add("test-key") 36 | 37 | # Now check again 38 | assert "test-key" in bloom 39 | 40 | Bloom filter are pretty space efficient : only 200MB of memory usage for storing 100M elements with an error of 1%, compared to the 7GB required for set(range(10**8)) 41 | 42 | It still can be pretty useful to save/load to files with the mmap implementation, for example to avoid rebuilding the bloom filter. The `mmap `_ functionality also save some memory depending on system settings. 43 | 44 | .. code-block:: python 45 | 46 | bloom = BloomFilter(max_elements=10**8, error_rate=0.01, filename=('/tmp/bloom.bin', -1)) 47 | -------------------------------------------------------------------------------- /bin/count_bits.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/pypy-1.6/bin/pypy 2 | 3 | import sys 4 | 5 | 6 | def main(): 7 | total_bits = 0 8 | bits_set = 0 9 | 10 | while True: 11 | block = sys.stdin.read(2 ** 19) 12 | if not block: 13 | break 14 | total_bits += len(block) * 8 15 | # print('got block of length %d' % len(block)) 16 | for char in block: 17 | byte = ord(char) 18 | # print('got char %d' % byte) 19 | for exponent in range(8): 20 | bitmask = 2 ** exponent 21 | # print('checking mask %d' % bitmask) 22 | if byte & bitmask != 0: 23 | # print('adding 1 to count') 24 | bits_set += 1 25 | 26 | print( 27 | '%s set, %s present, %6.2f%%' % ( 28 | bits_set, 29 | total_bits, 30 | bits_set * 100.0 / total_bits, 31 | ) 32 | ) 33 | 34 | 35 | if __name__ == '__main__': 36 | main() 37 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import find_packages 3 | from setuptools import setup 4 | 5 | 6 | setup( 7 | name="bloom-filter2", 8 | version="2.0.0", 9 | packages=find_packages('src'), 10 | package_dir={'': 'src'}, 11 | author="Harshad Sharma", 12 | author_email="harshad@sharma.io", 13 | maintainer="Remi Rampin", 14 | maintainer_email="remi@rampin.org", 15 | description='Pure Python Bloom Filter module', 16 | long_description=open('README.rst').read(), 17 | license="MIT", 18 | keywords="probabilistic set datastructure", 19 | url='https://github.com/remram44/python-bloom-filter', 20 | platforms='Cross platform', 21 | classifiers=[ 22 | "Development Status :: 5 - Production/Stable", 23 | "Intended Audience :: Developers", 24 | "Programming Language :: Python :: 3", 25 | ], 26 | ) 27 | -------------------------------------------------------------------------------- /src/bloom_filter2/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from .bloom_filter import ( 5 | BloomFilter, 6 | get_filter_bitno_probes, 7 | get_bitno_seed_rnd, 8 | ) 9 | 10 | 11 | __version__ = '2.0.0' 12 | 13 | 14 | __all__ = [ 15 | 'BloomFilter', 16 | 'get_filter_bitno_probes', 17 | 'get_bitno_seed_rnd', 18 | ] 19 | -------------------------------------------------------------------------------- /src/bloom_filter2/bloom_filter.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # pylint: disable=superfluous-parens,redefined-variable-type 3 | # superfluous-parens: Sometimes extra parens are more clear 4 | 5 | """Bloom Filter: Probabilistic set membership testing for large sets""" 6 | 7 | # Shamelessly borrowed (under MIT license) from 8 | # https://code.activestate.com/recipes/577686-bloom-filter/ 9 | # About Bloom Filters: https://en.wikipedia.org/wiki/Bloom_filter 10 | 11 | # Tweaked by Daniel Richard Stromberg, mostly to: 12 | # 1) Give it a little nicer __init__ parameters. 13 | # 2) Improve the hash functions to get a much lower rate of false positives. 14 | # 3) Give it a selection of backends. 15 | # 4) Make it pass pylint. 16 | 17 | # In the literature: 18 | # k is the number of probes - we call this num_probes_k 19 | # m is the number of bits in the filter - we call this num_bits_m 20 | # n is the ideal number of elements to eventually be stored in the filter - we 21 | # call this ideal_num_elements_n 22 | # p is the desired error rate when full - we call this error_rate_p 23 | 24 | from __future__ import division 25 | 26 | import array 27 | import math 28 | import os 29 | import random 30 | 31 | try: 32 | import mmap as mmap_mod 33 | except ImportError: 34 | # Jython lacks mmap() 35 | HAVE_MMAP = False 36 | else: 37 | HAVE_MMAP = True 38 | 39 | 40 | class Mmap_backend(object): 41 | """ 42 | Backend storage for our "array of bits" using an mmap'd file. 43 | Please note that this has only been tested on Linux so far. 44 | """ 45 | 46 | effs = 2 ** 8 - 1 47 | 48 | def __init__(self, num_bits, filename): 49 | if not HAVE_MMAP: 50 | raise NotImplementedError("mmap is not available") 51 | self.num_bits = num_bits 52 | self.num_chars = (self.num_bits + 7) // 8 53 | flags = os.O_RDWR | os.O_CREAT 54 | if hasattr(os, 'O_BINARY'): 55 | flags |= getattr(os, 'O_BINARY') 56 | self.file_ = os.open(filename, flags) 57 | os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET) 58 | os.write(self.file_, b'\x00') 59 | self.mmap = mmap_mod.mmap(self.file_, self.num_chars) 60 | 61 | def is_set(self, bitno): 62 | """Return true iff bit number bitno is set""" 63 | byteno, bit_within_wordno = divmod(bitno, 8) 64 | mask = 1 << bit_within_wordno 65 | byte = self.mmap[byteno] 66 | return byte & mask 67 | 68 | def set(self, bitno): 69 | """set bit number bitno to true""" 70 | 71 | byteno, bit_within_byteno = divmod(bitno, 8) 72 | mask = 1 << bit_within_byteno 73 | byte = self.mmap[byteno] 74 | byte |= mask 75 | self.mmap[byteno] = byte 76 | 77 | def clear(self, bitno): 78 | """clear bit number bitno - set it to false""" 79 | 80 | byteno, bit_within_byteno = divmod(bitno, 8) 81 | mask = 1 << bit_within_byteno 82 | byte = self.mmap[byteno] 83 | byte &= Mmap_backend.effs - mask 84 | self.mmap[byteno] = byte 85 | 86 | def __iand__(self, other): 87 | assert self.num_bits == other.num_bits 88 | 89 | for byteno in range(self.num_chars): 90 | self.mmap[byteno] = ( 91 | self.mmap[byteno] 92 | & other.mmap[byteno] 93 | ) 94 | 95 | return self 96 | 97 | def __ior__(self, other): 98 | assert self.num_bits == other.num_bits 99 | 100 | for byteno in range(self.num_chars): 101 | self.mmap[byteno] = ( 102 | self.mmap[byteno] 103 | | other.mmap[byteno] 104 | ) 105 | 106 | return self 107 | 108 | def close(self): 109 | """Close the file""" 110 | os.close(self.file_) 111 | 112 | 113 | class File_seek_backend(object): 114 | """Backend storage for our "array of bits" using a file in which we seek""" 115 | 116 | effs = 2 ** 8 - 1 117 | 118 | def __init__(self, num_bits, filename): 119 | self.num_bits = num_bits 120 | self.num_chars = (self.num_bits + 7) // 8 121 | flags = os.O_RDWR | os.O_CREAT 122 | if hasattr(os, 'O_BINARY'): 123 | flags |= getattr(os, 'O_BINARY') 124 | self.file_ = os.open(filename, flags) 125 | os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET) 126 | os.write(self.file_, b'\x00') 127 | 128 | def is_set(self, bitno): 129 | """Return true iff bit number bitno is set""" 130 | byteno, bit_within_wordno = divmod(bitno, 8) 131 | mask = 1 << bit_within_wordno 132 | os.lseek(self.file_, byteno, os.SEEK_SET) 133 | byte = os.read(self.file_, 1)[0] 134 | return byte & mask 135 | 136 | def set(self, bitno): 137 | """set bit number bitno to true""" 138 | 139 | byteno, bit_within_byteno = divmod(bitno, 8) 140 | mask = 1 << bit_within_byteno 141 | os.lseek(self.file_, byteno, os.SEEK_SET) 142 | byte = os.read(self.file_, 1)[0] 143 | byte |= mask 144 | os.lseek(self.file_, byteno, os.SEEK_SET) 145 | os.write(self.file_, bytes([byte])) 146 | 147 | def clear(self, bitno): 148 | """clear bit number bitno - set it to false""" 149 | 150 | byteno, bit_within_byteno = divmod(bitno, 8) 151 | mask = 1 << bit_within_byteno 152 | os.lseek(self.file_, byteno, os.SEEK_SET) 153 | byte = os.read(self.file_, 1)[0] 154 | byte &= File_seek_backend.effs - mask 155 | os.lseek(self.file_, byteno, os.SEEK_SET) 156 | os.write(self.file_, bytes([byte])) 157 | 158 | # These are quite slow ways to do iand and ior, but they should work, 159 | # and a faster version is going to take more time 160 | def __iand__(self, other): 161 | assert self.num_bits == other.num_bits 162 | 163 | for bitno in range(self.num_bits): 164 | if self.is_set(bitno) and other.is_set(bitno): 165 | self.set(bitno) 166 | else: 167 | self.clear(bitno) 168 | 169 | return self 170 | 171 | def __ior__(self, other): 172 | assert self.num_bits == other.num_bits 173 | 174 | for bitno in range(self.num_bits): 175 | if self.is_set(bitno) or other.is_set(bitno): 176 | self.set(bitno) 177 | else: 178 | self.clear(bitno) 179 | 180 | return self 181 | 182 | def close(self): 183 | """Close the file""" 184 | os.close(self.file_) 185 | 186 | 187 | class Array_then_file_seek_backend(object): 188 | # pylint: disable=R0902 189 | # R0902: We kinda need a bunch of instance attributes 190 | """ 191 | Backend storage for our "array of bits" using a python array of integers up 192 | to some maximum number of bytes, then spilling over to a file. 193 | This is -not- a cache; we instead save the leftmost bits in RAM, and the 194 | rightmost bits (if necessary) in a file. On open, we read from the file to 195 | RAM. On close, we write from RAM to the file. 196 | """ 197 | 198 | effs = 2 ** 8 - 1 199 | 200 | def __init__(self, num_bits, filename, max_bytes_in_memory): 201 | self.num_bits = num_bits 202 | num_chars = (self.num_bits + 7) // 8 203 | self.filename = filename 204 | self.max_bytes_in_memory = max_bytes_in_memory 205 | self.bits_in_memory = min(num_bits, self.max_bytes_in_memory * 8) 206 | self.bits_in_file = max(self.num_bits - self.bits_in_memory, 0) 207 | self.bytes_in_memory = (self.bits_in_memory + 7) // 8 208 | self.bytes_in_file = (self.bits_in_file + 7) // 8 209 | 210 | self.array_ = array.array('B', [0]) * self.bytes_in_memory 211 | flags = os.O_RDWR | os.O_CREAT 212 | if hasattr(os, 'O_BINARY'): 213 | flags |= getattr(os, 'O_BINARY') 214 | self.file_ = os.open(filename, flags) 215 | os.lseek(self.file_, num_chars + 1, os.SEEK_SET) 216 | os.write(self.file_, b'\x00') 217 | 218 | os.lseek(self.file_, 0, os.SEEK_SET) 219 | offset = 0 220 | intended_block_len = 2 ** 17 221 | while True: 222 | if offset + intended_block_len < self.bytes_in_memory: 223 | block = os.read(self.file_, intended_block_len) 224 | elif offset < self.bytes_in_memory: 225 | block = os.read(self.file_, self.bytes_in_memory - offset) 226 | else: 227 | break 228 | for index_in_block, byte in enumerate(block): 229 | self.array_[offset + index_in_block] = byte 230 | offset += intended_block_len 231 | 232 | def is_set(self, bitno): 233 | """Return true iff bit number bitno is set""" 234 | byteno, bit_within_byteno = divmod(bitno, 8) 235 | mask = 1 << bit_within_byteno 236 | if byteno < self.bytes_in_memory: 237 | return self.array_[byteno] & mask 238 | else: 239 | os.lseek(self.file_, byteno, os.SEEK_SET) 240 | byte = os.read(self.file_, 1)[0] 241 | return byte & mask 242 | 243 | def set(self, bitno): 244 | """set bit number bitno to true""" 245 | byteno, bit_within_byteno = divmod(bitno, 8) 246 | mask = 1 << bit_within_byteno 247 | if byteno < self.bytes_in_memory: 248 | self.array_[byteno] |= mask 249 | else: 250 | os.lseek(self.file_, byteno, os.SEEK_SET) 251 | byte = os.read(self.file_, 1)[0] 252 | byte |= mask 253 | os.lseek(self.file_, byteno, os.SEEK_SET) 254 | os.write(self.file_, bytes([byte])) 255 | 256 | def clear(self, bitno): 257 | """clear bit number bitno - set it to false""" 258 | byteno, bit_within_byteno = divmod(bitno, 8) 259 | mask = Array_backend.effs - (1 << bit_within_byteno) 260 | if byteno < self.bytes_in_memory: 261 | self.array_[byteno] &= mask 262 | else: 263 | os.lseek(self.file_, byteno, os.SEEK_SET) 264 | byte = os.read(self.file_, 1)[0] 265 | byte &= File_seek_backend.effs - mask 266 | os.lseek(self.file_, byteno, os.SEEK_SET) 267 | os.write(self.file_, bytes([byte])) 268 | 269 | # These are quite slow ways to do iand and ior, but they should work, 270 | # and a faster version is going to take more time 271 | def __iand__(self, other): 272 | assert self.num_bits == other.num_bits 273 | 274 | for bitno in range(self.num_bits): 275 | if self.is_set(bitno) and other.is_set(bitno): 276 | self.set(bitno) 277 | else: 278 | self.clear(bitno) 279 | 280 | return self 281 | 282 | def __ior__(self, other): 283 | assert self.num_bits == other.num_bits 284 | 285 | for bitno in range(self.num_bits): 286 | if self.is_set(bitno) or other.is_set(bitno): 287 | self.set(bitno) 288 | else: 289 | self.clear(bitno) 290 | 291 | return self 292 | 293 | def close(self): 294 | """ 295 | Write the in-memory portion to disk, leave the already-on-disk portion 296 | unchanged 297 | """ 298 | 299 | os.lseek(self.file_, 0, os.SEEK_SET) 300 | os.write(self.file_, bytes(self.array_[0:self.bytes_in_memory])) 301 | 302 | os.close(self.file_) 303 | 304 | 305 | class Array_backend(object): 306 | """ 307 | Backend storage for our "array of bits" using a python array of integers 308 | """ 309 | 310 | # Note that this has now been split out into a bits_mod for the benefit of 311 | # other projects. 312 | effs = 2 ** 32 - 1 313 | 314 | def __init__(self, num_bits): 315 | self.num_bits = num_bits 316 | self.num_words = (self.num_bits + 31) // 32 317 | self.array_ = array.array('L', [0]) * self.num_words 318 | 319 | def is_set(self, bitno): 320 | """Return true iff bit number bitno is set""" 321 | wordno, bit_within_wordno = divmod(bitno, 32) 322 | mask = 1 << bit_within_wordno 323 | return self.array_[wordno] & mask 324 | 325 | def set(self, bitno): 326 | """set bit number bitno to true""" 327 | wordno, bit_within_wordno = divmod(bitno, 32) 328 | mask = 1 << bit_within_wordno 329 | self.array_[wordno] |= mask 330 | 331 | def clear(self, bitno): 332 | """clear bit number bitno - set it to false""" 333 | wordno, bit_within_wordno = divmod(bitno, 32) 334 | mask = Array_backend.effs - (1 << bit_within_wordno) 335 | self.array_[wordno] &= mask 336 | 337 | # It'd be nice to do __iand__ and __ior__ in a base class, but 338 | # that'd be Much slower 339 | 340 | def __iand__(self, other): 341 | assert self.num_bits == other.num_bits 342 | 343 | for wordno in range(self.num_words): 344 | self.array_[wordno] &= other.array_[wordno] 345 | 346 | return self 347 | 348 | def __ior__(self, other): 349 | assert self.num_bits == other.num_bits 350 | 351 | for wordno in range(self.num_words): 352 | self.array_[wordno] |= other.array_[wordno] 353 | 354 | return self 355 | 356 | def close(self): 357 | """Noop for compatibility with the file+seek backend""" 358 | pass 359 | 360 | 361 | def get_bitno_seed_rnd(bloom_filter, key): 362 | """ 363 | Apply num_probes_k hash functions to key. 364 | 365 | Generate the array index and bitmask corresponding to each result. 366 | """ 367 | 368 | # We're using key as a seed to a pseudorandom number generator 369 | hasher = random.Random(key).randrange 370 | for dummy in range(bloom_filter.num_probes_k): 371 | bitno = hasher(bloom_filter.num_bits_m) 372 | yield bitno % bloom_filter.num_bits_m 373 | 374 | 375 | MERSENNES1 = [2 ** x - 1 for x in [17, 31, 127]] 376 | MERSENNES2 = [2 ** x - 1 for x in [19, 67, 257]] 377 | 378 | 379 | def simple_hash(int_list, prime1, prime2, prime3): 380 | """Compute a hash value from a list of integers and 3 primes""" 381 | result = 0 382 | for integer in int_list: 383 | result += ((result + integer + prime1) * prime2) % prime3 384 | return result 385 | 386 | 387 | def hash1(int_list): 388 | """Basic hash function #1""" 389 | return simple_hash(int_list, MERSENNES1[0], MERSENNES1[1], MERSENNES1[2]) 390 | 391 | 392 | def hash2(int_list): 393 | """Basic hash function #2""" 394 | return simple_hash(int_list, MERSENNES2[0], MERSENNES2[1], MERSENNES2[2]) 395 | 396 | 397 | def get_filter_bitno_probes(bloom_filter, key): 398 | """ 399 | Apply num_probes_k hash functions to key. 400 | 401 | Generate the array index and bitmask corresponding to each result 402 | """ 403 | 404 | # This one assumes key is either bytes or str (or other list of integers) 405 | 406 | if hasattr(key, '__divmod__'): 407 | int_list = [] 408 | temp = key 409 | while temp: 410 | quotient, remainder = divmod(temp, 256) 411 | int_list.append(remainder) 412 | temp = quotient 413 | elif isinstance(key, (list, tuple, str, bytes)) and not key: 414 | int_list = [] 415 | elif hasattr(key[0], '__divmod__'): 416 | int_list = key 417 | elif isinstance(key[0], str): 418 | int_list = [ord(char) for char in key] 419 | else: 420 | raise TypeError('Sorry, I do not know how to hash this type') 421 | 422 | hash_value1 = hash1(int_list) 423 | hash_value2 = hash2(int_list) 424 | probe_value = hash_value1 425 | 426 | for probeno in range(1, bloom_filter.num_probes_k + 1): 427 | probe_value *= hash_value1 428 | probe_value += hash_value2 429 | probe_value %= MERSENNES1[2] 430 | yield probe_value % bloom_filter.num_bits_m 431 | 432 | 433 | def try_unlink(filename): 434 | """unlink a file. Don't complain if it's not there""" 435 | try: 436 | os.unlink(filename) 437 | except OSError: 438 | pass 439 | return 440 | 441 | 442 | class BloomFilter(object): 443 | """Probabilistic set membership testing for large sets""" 444 | def __init__(self, 445 | max_elements=10000, 446 | error_rate=0.1, 447 | probe_bitnoer=get_filter_bitno_probes, 448 | filename=None, 449 | start_fresh=False): 450 | # pylint: disable=R0913 451 | # R0913: We want a few arguments 452 | if max_elements <= 0: 453 | raise ValueError('ideal_num_elements_n must be > 0') 454 | if not (0 < error_rate < 1): 455 | raise ValueError('error_rate_p must be between 0 and 1 exclusive') 456 | 457 | self.error_rate_p = error_rate 458 | # With fewer elements, we should do very well. With more elements, our 459 | # error rate "guarantee" drops rapidly. 460 | self.ideal_num_elements_n = max_elements 461 | 462 | numerator = ( 463 | -1 464 | * self.ideal_num_elements_n 465 | * math.log(self.error_rate_p) 466 | ) 467 | denominator = math.log(2) ** 2 468 | real_num_bits_m = numerator / denominator 469 | self.num_bits_m = int(math.ceil(real_num_bits_m)) 470 | 471 | if filename is None: 472 | self.backend = Array_backend(self.num_bits_m) 473 | elif isinstance(filename, tuple) and isinstance(filename[1], int): 474 | if start_fresh: 475 | try_unlink(filename[0]) 476 | if filename[1] == -1: 477 | self.backend = Mmap_backend(self.num_bits_m, filename[0]) 478 | else: 479 | self.backend = Array_then_file_seek_backend( 480 | self.num_bits_m, 481 | filename[0], 482 | filename[1], 483 | ) 484 | else: 485 | if start_fresh: 486 | try_unlink(filename) 487 | self.backend = File_seek_backend(self.num_bits_m, filename) 488 | 489 | # AKA num_offsetters 490 | # Verified against 491 | # https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives 492 | real_num_probes_k = ( 493 | (self.num_bits_m / self.ideal_num_elements_n) 494 | * math.log(2) 495 | ) 496 | self.num_probes_k = int(math.ceil(real_num_probes_k)) 497 | self.probe_bitnoer = probe_bitnoer 498 | 499 | def __repr__(self): 500 | return ( 501 | 'BloomFilter(ideal_num_elements_n=%d, error_rate_p=%f, ' 502 | + 'num_bits_m=%d)' 503 | ) % ( 504 | self.ideal_num_elements_n, 505 | self.error_rate_p, 506 | self.num_bits_m, 507 | ) 508 | 509 | def add(self, key): 510 | """Add an element to the filter""" 511 | for bitno in self.probe_bitnoer(self, key): 512 | self.backend.set(bitno) 513 | 514 | def __iadd__(self, key): 515 | self.add(key) 516 | return self 517 | 518 | def _match_template(self, bloom_filter): 519 | """ 520 | Compare a sort of signature for two bloom filters. 521 | 522 | Used in preparation for binary operations 523 | """ 524 | return (self.num_bits_m == bloom_filter.num_bits_m 525 | and self.num_probes_k == bloom_filter.num_probes_k 526 | and self.probe_bitnoer == bloom_filter.probe_bitnoer) 527 | 528 | def union(self, bloom_filter): 529 | """Compute the set union of two bloom filters""" 530 | self.backend |= bloom_filter.backend 531 | 532 | def __ior__(self, bloom_filter): 533 | self.union(bloom_filter) 534 | return self 535 | 536 | def intersection(self, bloom_filter): 537 | """Compute the set intersection of two bloom filters""" 538 | self.backend &= bloom_filter.backend 539 | 540 | def __iand__(self, bloom_filter): 541 | self.intersection(bloom_filter) 542 | return self 543 | 544 | def __contains__(self, key): 545 | for bitno in self.probe_bitnoer(self, key): 546 | if not self.backend.is_set(bitno): 547 | return False 548 | return True 549 | 550 | def close(self): 551 | self.backend.close() 552 | self.backend = None 553 | 554 | def __enter__(self): 555 | return self 556 | 557 | def __exit__(self, exc_type, exc_val, exc_tb): 558 | self.close() 559 | self.backend = None 560 | 561 | def __del__(self): 562 | if self.backend is not None: 563 | self.backend.close() 564 | self.backend = None 565 | -------------------------------------------------------------------------------- /tests/test_bloom_filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=utf-8 3 | 4 | # pylint: disable=superfluous-parens 5 | # superfluous-parens: Parentheses are good for clarity and portability 6 | 7 | """Unit tests for bloom_filter_mod""" 8 | 9 | import dbm 10 | import math 11 | import os 12 | import random 13 | import sys 14 | import time 15 | import unittest 16 | 17 | import bloom_filter2 18 | 19 | 20 | CHARACTERS = 'abcdefghijklmnopqrstuvwxyz1234567890' 21 | 22 | 23 | class States(object): 24 | """Generate the USA's state names""" 25 | 26 | def __init__(self): 27 | pass 28 | 29 | states = """Alabama Alaska Arizona Arkansas California Colorado Connecticut 30 | Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas 31 | Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota 32 | Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey 33 | NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon 34 | Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah 35 | Vermont Virginia Washington WestVirginia Wisconsin Wyoming""".split() 36 | 37 | @staticmethod 38 | def generator(): 39 | """Generate the states""" 40 | for state in States.states: 41 | yield state 42 | 43 | @staticmethod 44 | def within(value): 45 | """Is the value in our list of states?""" 46 | return value in States.states 47 | 48 | @staticmethod 49 | def length(): 50 | """What is the length of our contained values?""" 51 | return len(States.states) 52 | 53 | 54 | def random_string(): 55 | """Generate a random, 10 character string - for testing purposes""" 56 | list_ = [] 57 | for _ in range(10): 58 | character = CHARACTERS[int(random.random() * len(CHARACTERS))] 59 | list_.append(character) 60 | return ''.join(list_) 61 | 62 | 63 | class Random_content(object): 64 | """Generated a bunch of random strings in sorted order""" 65 | 66 | random_content = [random_string() for dummy in range(1000)] 67 | 68 | def __init__(self): 69 | pass 70 | 71 | @staticmethod 72 | def generator(): 73 | """Generate all values""" 74 | for item in Random_content.random_content: 75 | yield item 76 | 77 | @staticmethod 78 | def within(value): 79 | """Test for membership""" 80 | return value in Random_content.random_content 81 | 82 | @staticmethod 83 | def length(): 84 | """How many members?""" 85 | return len(Random_content.random_content) 86 | 87 | 88 | class Evens(object): 89 | """Generate a bunch of even numbers""" 90 | 91 | def __init__(self, maximum): 92 | self.maximum = maximum 93 | 94 | def generator(self): 95 | """Generate all values""" 96 | for value in range(self.maximum): 97 | if value % 2 == 0: 98 | yield str(value) 99 | 100 | def within(self, value): 101 | """Test for membership""" 102 | try: 103 | int_value = int(value) 104 | except ValueError: 105 | return False 106 | 107 | if int_value >= 0 and int_value < self.maximum and int_value % 2 == 0: 108 | return True 109 | else: 110 | return False 111 | 112 | def length(self): 113 | """How many members?""" 114 | return int(math.ceil(self.maximum / 2.0)) 115 | 116 | 117 | def give_description(filename): 118 | """ 119 | Return a description of the filename type 120 | 121 | Could be array, file or hybrid. 122 | """ 123 | if filename is None: 124 | return 'array' 125 | elif isinstance(filename, tuple): 126 | if filename[1] == -1: 127 | return 'mmap' 128 | else: 129 | return 'hybrid' 130 | else: 131 | return 'seek' 132 | 133 | 134 | class TestBloomFilter(unittest.TestCase): 135 | def _test( 136 | self, 137 | description, values, trials, error_rate, 138 | probe_bitnoer=None, filename=None, 139 | ): 140 | # pylint: disable=R0913,R0914 141 | # R0913: We want a few arguments 142 | # R0914: We want some local variables too. This is just test code. 143 | """Some quick automatic tests for the bloom filter class""" 144 | if not probe_bitnoer: 145 | probe_bitnoer = bloom_filter2.get_filter_bitno_probes 146 | 147 | divisor = 100000 148 | 149 | bloom = bloom_filter2.BloomFilter( 150 | max_elements=values.length() * 2, 151 | error_rate=error_rate, 152 | probe_bitnoer=probe_bitnoer, 153 | filename=filename, 154 | start_fresh=True, 155 | ) 156 | 157 | message = '\ndescription: %s num_bits_m: %s num_probes_k: %s\n' 158 | filled_out_message = message % ( 159 | description, 160 | bloom.num_bits_m, 161 | bloom.num_probes_k, 162 | ) 163 | 164 | sys.stdout.write(filled_out_message) 165 | 166 | print('starting to add values to an empty bloom filter') 167 | for valueno, value in enumerate(values.generator()): 168 | reverse_valueno = values.length() - valueno 169 | if reverse_valueno % divisor == 0: 170 | print('adding valueno %d' % reverse_valueno) 171 | bloom.add(value) 172 | 173 | print('testing all known members') 174 | include_in_count = sum( 175 | include in bloom 176 | for include in values.generator() 177 | ) 178 | self.assertEqual(include_in_count, values.length()) 179 | 180 | print('testing random non-members') 181 | false_positives = 0 182 | for trialno in range(trials): 183 | if trialno % divisor == 0: 184 | print( 185 | 'trialno progress: %d / %d' % (trialno, trials), 186 | file=sys.stderr, 187 | ) 188 | while True: 189 | candidate = ''.join(random.sample(CHARACTERS, 5)) 190 | # If we accidentally found a member, try again 191 | if values.within(candidate): 192 | continue 193 | if candidate in bloom: 194 | # print('false positive: %s' % candidate) 195 | false_positives += 1 196 | break 197 | 198 | actual_error_rate = float(false_positives) / trials 199 | 200 | self.assertLess( 201 | actual_error_rate, error_rate, 202 | "Too many false positives: actual: %s, expected: %s" % ( 203 | actual_error_rate, 204 | error_rate, 205 | ), 206 | ) 207 | 208 | bloom.close() 209 | 210 | def test_and(self): 211 | """Test the & operator""" 212 | 213 | abc = bloom_filter2.BloomFilter(max_elements=100, error_rate=0.01) 214 | for character in ['a', 'b', 'c']: 215 | abc += character 216 | 217 | bcd = bloom_filter2.BloomFilter(max_elements=100, error_rate=0.01) 218 | for character in ['b', 'c', 'd']: 219 | bcd += character 220 | 221 | abc &= bcd 222 | 223 | self.assertNotIn('a', abc) 224 | self.assertIn('b', abc) 225 | self.assertIn('c', abc) 226 | self.assertNotIn('d', abc) 227 | 228 | abc.close() 229 | bcd.close() 230 | 231 | def test_or(self): 232 | """Test the | operator""" 233 | 234 | abc = bloom_filter2.BloomFilter(max_elements=100, error_rate=0.01) 235 | for character in ['a', 'b', 'c']: 236 | abc += character 237 | 238 | bcd = bloom_filter2.BloomFilter(max_elements=100, error_rate=0.01) 239 | for character in ['b', 'c', 'd']: 240 | bcd += character 241 | 242 | abc |= bcd 243 | 244 | self.assertIn('a', abc) 245 | self.assertIn('b', abc) 246 | self.assertIn('c', abc) 247 | self.assertIn('d', abc) 248 | self.assertNotIn('e', abc) 249 | 250 | abc.close() 251 | bcd.close() 252 | 253 | def test_states(self): 254 | self._test('states', States(), trials=100000, error_rate=0.01) 255 | 256 | def test_random(self): 257 | self._test('random', Random_content(), trials=10000, error_rate=0.1) 258 | self._test('random', Random_content(), trials=1000000, error_rate=1E-9) 259 | self._test('random', Random_content(), trials=10000, error_rate=0.1, 260 | probe_bitnoer=bloom_filter2.get_bitno_seed_rnd) 261 | 262 | filename = 'bloom-filter-rm-me' 263 | self._test( 264 | 'random', 265 | Random_content(), 266 | trials=10000, 267 | error_rate=0.1, 268 | filename=filename, 269 | ) 270 | 271 | @unittest.skipUnless(os.environ.get('TEST_PERF', ''), "disabled") 272 | def test_performance(self): 273 | """Unit tests for BloomFilter class""" 274 | 275 | sqrt_of_10 = math.sqrt(10) 276 | for exponent in range(19): # it's a lot, but probably not unreasonable 277 | elements = int(sqrt_of_10 ** exponent + 0.5) 278 | for filename in [ 279 | None, 280 | 'bloom-filter-rm-me', 281 | ('bloom-filter-rm-me', 768 * 2 ** 20), 282 | ('bloom-filter-rm-me', -1), 283 | ]: 284 | description = give_description(filename) 285 | key = '%s %s' % (description, elements) 286 | with dbm.open('performance-numbers', 'c') as database: 287 | if key in database.keys(): 288 | continue 289 | if elements >= 100000000 and description == 'seek': 290 | continue 291 | if elements >= 100000000 and description == 'mmap': 292 | continue 293 | if elements >= 1000000000 and description == 'array': 294 | continue 295 | time0 = time.time() 296 | self._test( 297 | 'evens %s elements: %d' % ( 298 | give_description(filename), 299 | elements, 300 | ), 301 | Evens(elements), 302 | trials=elements, 303 | error_rate=1e-2, 304 | filename=filename, 305 | ) 306 | time1 = time.time() 307 | delta_t = time1 - time0 308 | # file_ = open('%s.txt' % description, 'a') 309 | # file_.write('%d %f\n' % (elements, delta_t)) 310 | # file_.close() 311 | with dbm.open('performance-numbers', 'c') as database: 312 | database[key] = '%f' % delta_t 313 | 314 | def test_probe_count(self): 315 | # test prob count ok 316 | bloom = bloom_filter2.BloomFilter(1000000, error_rate=.99) 317 | self.assertEqual(bloom.num_probes_k, 1) 318 | 319 | bloom.close() 320 | 321 | 322 | if __name__ == '__main__': 323 | unittest.main() 324 | --------------------------------------------------------------------------------