├── .github
    └── workflows
    │   └── test.yml
├── .gitignore
├── AUTHORS.md
├── Makefile
├── README.rst
├── bin
    └── count_bits.py
├── setup.py
├── src
    └── bloom_filter2
    │   ├── __init__.py
    │   └── bloom_filter.py
└── tests
    └── test_bloom_filter.py


/.github/workflows/test.yml:
--------------------------------------------------------------------------------
 1 | name: Test
 2 | 
 3 | on:
 4 | - push
 5 | - pull_request
 6 | 
 7 | jobs:
 8 |   test:
 9 |     strategy:
10 |       fail-fast: false
11 |       matrix:
12 |         os: [ubuntu-latest]
13 |         python:
14 |         - "3.5"
15 |         - "3.8"
16 |     runs-on: ${{ matrix.os }}
17 |     steps:
18 |     - uses: actions/checkout@v2
19 |     - uses: actions/setup-python@v1
20 |       with:
21 |         python-version: ${{ matrix.python }}
22 |     - name: Test
23 |       run: PYTHONPATH=src python tests/test_bloom_filter.py
24 | 
25 |   check:
26 |     runs-on: ubuntu-latest
27 |     steps:
28 |     - uses: actions/checkout@v2
29 |     - uses: actions/setup-python@v1
30 |       with:
31 |         python-version: "3.8"
32 |     - name: Install dependencies
33 |       run: pip install flake8
34 |     - name: Test
35 |       run: flake8 --ignore=W503 src tests bin
36 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Created by .ignore support plugin (hsz.mobi)
  2 | ### JetBrains template
  3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
  4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
  5 | 
  6 | # User-specific stuff:
  7 | .idea/**/workspace.xml
  8 | .idea/**/tasks.xml
  9 | .idea/dictionaries
 10 | 
 11 | # Sensitive or high-churn files:
 12 | .idea/**/dataSources/
 13 | .idea/**/dataSources.ids
 14 | .idea/**/dataSources.xml
 15 | .idea/**/dataSources.local.xml
 16 | .idea/**/sqlDataSources.xml
 17 | .idea/**/dynamic.xml
 18 | .idea/**/uiDesigner.xml
 19 | 
 20 | # Gradle:
 21 | .idea/**/gradle.xml
 22 | .idea/**/libraries
 23 | 
 24 | # Mongo Explorer plugin:
 25 | .idea/**/mongoSettings.xml
 26 | 
 27 | ## File-based project format:
 28 | *.iws
 29 | 
 30 | ## Plugin-specific files:
 31 | 
 32 | # IntelliJ
 33 | /out/
 34 | 
 35 | # mpeltonen/sbt-idea plugin
 36 | .idea_modules/
 37 | 
 38 | # JIRA plugin
 39 | atlassian-ide-plugin.xml
 40 | 
 41 | # Crashlytics plugin (for Android Studio and IntelliJ)
 42 | com_crashlytics_export_strings.xml
 43 | crashlytics.properties
 44 | crashlytics-build.properties
 45 | fabric.properties
 46 | ### VirtualEnv template
 47 | # Virtualenv
 48 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
 49 | .Python
 50 | [Ii]nclude
 51 | [Ll]ib64
 52 | [Ll]ocal
 53 | [Ss]cripts
 54 | pyvenv.cfg
 55 | .venv
 56 | pip-selfcheck.json
 57 | ### Python template
 58 | # Byte-compiled / optimized / DLL files
 59 | __pycache__/
 60 | *.py[cod]
 61 | *$py.class
 62 | 
 63 | # C extensions
 64 | *.so
 65 | 
 66 | # Distribution / packaging
 67 | .Python
 68 | env/
 69 | build/
 70 | develop-eggs/
 71 | dist/
 72 | downloads/
 73 | eggs/
 74 | .eggs/
 75 | lib/
 76 | lib64/
 77 | parts/
 78 | sdist/
 79 | var/
 80 | wheels/
 81 | *.egg-info/
 82 | .installed.cfg
 83 | *.egg
 84 | 
 85 | # PyInstaller
 86 | #  Usually these files are written by a python script from a template
 87 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 88 | *.manifest
 89 | *.spec
 90 | 
 91 | # Installer logs
 92 | pip-log.txt
 93 | pip-delete-this-directory.txt
 94 | 
 95 | # Unit test / coverage reports
 96 | htmlcov/
 97 | .tox/
 98 | .coverage
 99 | .coverage.*
100 | .cache
101 | nosetests.xml
102 | coverage.xml
103 | *,cover
104 | .hypothesis/
105 | 
106 | # Translations
107 | *.mo
108 | *.pot
109 | 
110 | # Django stuff:
111 | *.log
112 | local_settings.py
113 | 
114 | # Flask stuff:
115 | instance/
116 | .webassets-cache
117 | 
118 | # Scrapy stuff:
119 | .scrapy
120 | 
121 | # Sphinx documentation
122 | docs/_build/
123 | 
124 | # PyBuilder
125 | target/
126 | 
127 | # Jupyter Notebook
128 | .ipynb_checkpoints
129 | 
130 | # pyenv
131 | .python-version
132 | 
133 | # celery beat schedule file
134 | celerybeat-schedule
135 | 
136 | # SageMath parsed files
137 | *.sage.py
138 | 
139 | # dotenv
140 | .env
141 | 
142 | # virtualenv
143 | .venv
144 | venv/
145 | ENV/
146 | 
147 | # Spyder project settings
148 | .spyderproject
149 | 
150 | # Rope project settings
151 | .ropeproject
152 | 
153 | bloom-filter-rm-me
154 | 


--------------------------------------------------------------------------------
/AUTHORS.md:
--------------------------------------------------------------------------------
 1 | Original code:
 2 | 
 3 |  - http://code.activestate.com/recipes/577686-bloom-filter/
 4 |  - Author: Sundar Srinivasan
 5 | 
 6 | Forked into SVN Repo:
 7 | 
 8 |  - http://stromberg.dnsalias.org/svn/bloom-filter/trunk/
 9 |  - Author: Daniel Richard Stromberg
10 | 
11 | Forked to GitHub, renamed to `bloom_filter`:
12 | 
13 |  - https://github.com/hiway/python-bloom-filter
14 |  - Author: Harshad Sharma
15 | 
 16 | Forked after it was found unmaintained for a while, renamed to `bloom_filter2`:
17 | 
18 |  - https://github.com/remram44/python-bloom-filter
19 |  - Maintainer: Remi Rampin
20 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | go: performance-graph.pdf
 3 | 	evince performance-graph.pdf
 4 | 
 5 | performance-graph.pdf: performance-numbers.db gen-performance-graph
 6 | 	./gen-performance-graph
 7 | 
 8 | performance-numbers.db: test-bloom-filter
 9 | 	./this-pylint \
10 | 		--ignore-message ".*Unable to import 'dbm'" \
11 | 		--ignore-message ".*Unable to import 'anydbm'" \
12 | 		--to-pylint bloom_filter_mod.py test-bloom-filter
13 | 	rm -f seek.txt array.txt hybrid.txt mmap.txt
14 | 	#/usr/local/pypy-2.3.1/bin/pypy ./test-bloom-filter --performance-test
15 | 	/usr/local/pypy-2.3.1/bin/pypy ./test-bloom-filter
16 | 	/usr/local/cpython-3.4/bin/python ./test-bloom-filter
17 | 	/usr/local/cpython-2.5/bin/python ./test-bloom-filter
18 | 	#/usr/local/cpython-2.7/bin/python ./test-bloom-filter
19 | 	#/usr/local/cpython-3.0/bin/python ./test-bloom-filter
20 | 	/usr/local/jython-2.7b3/bin/jython ./test-bloom-filter
21 | 
22 | clean:
23 | 	rm -f *.pyc *.class
24 | 	rm -rf __pycache__
25 | 	rm -f bloom-filter-rm-me
26 | 	rm -f *.ps *.pdf
27 | 	rm -f seek.txt array.txt
28 | 	rm -rf dist build bloom_filter.egg-info
29 | 	rm -f performance-numbers
30 | 
31 | veryclean: clean
32 | 	rm -f performance-numbers.db
33 | 	rm -f performance-numbers
34 | 
35 | build:
36 | 	python setup.py sdist bdist_wheel
37 | 
38 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | bloom-filter
 2 | ============
 3 | 
 4 | A pure python bloom filter (low storage requirement, probabilistic
 5 | set datastructure) is provided.  It is known to work on CPython 3.x, Pypy,
 6 | and Jython.
 7 | 
 8 | Includes mmap, in-memory and disk-seek backends.
 9 | 
10 | This project builds on `drs-bloom-filter` and `bloom_filter_mod`.
11 | Credits and links can be found in AUTHORS.md.
12 | 
13 | Usage
14 | -----
15 | 
16 | The user specifies the desired maximum number of elements and the
17 | desired maximum false positive probability, and the module
18 | calculates the rest.
19 | 
20 | .. code-block:: python
21 | 
22 |     from bloom_filter2 import BloomFilter
23 | 
24 |     # instantiate BloomFilter with custom settings,
25 |     # max_elements is how many elements you expect the filter to hold.
26 |     # error_rate defines accuracy; You can use defaults with
27 |     # `BloomFilter()` without any arguments. Following example
28 |     # is same as defaults:
29 |     bloom = BloomFilter(max_elements=10000, error_rate=0.1)
30 | 
31 |     # Test whether the bloom-filter has seen a key:
32 |     assert "test-key" not in bloom
33 | 
34 |     # Mark the key as seen
35 |     bloom.add("test-key")
36 | 
37 |     # Now check again
38 |     assert "test-key" in bloom
39 | 
 40 | Bloom filters are pretty space-efficient: only 200MB of memory is needed to store 100M elements with an error rate of 1%, compared to the 7GB required for ``set(range(10**8))``.
41 | 
 42 | It can still be pretty useful to save/load to files with the mmap implementation, for example to avoid rebuilding the bloom filter. The `mmap <https://en.wikipedia.org/wiki/Mmap>`_ functionality also saves some memory, depending on system settings.
43 | 
44 | .. code-block:: python
45 | 
46 |     bloom = BloomFilter(max_elements=10**8, error_rate=0.01, filename=('/tmp/bloom.bin', -1))
47 | 


--------------------------------------------------------------------------------
/bin/count_bits.py:
--------------------------------------------------------------------------------
 1 | #!/usr/local/pypy-1.6/bin/pypy
 2 | 
 3 | import sys
 4 | 
 5 | 
 6 | def main():
 7 |     total_bits = 0
 8 |     bits_set = 0
 9 | 
10 |     while True:
11 |         block = sys.stdin.read(2 ** 19)
12 |         if not block:
13 |             break
14 |         total_bits += len(block) * 8
15 |         # print('got block of length %d' % len(block))
16 |         for char in block:
17 |             byte = ord(char)
18 |             # print('got char %d' % byte)
19 |             for exponent in range(8):
20 |                 bitmask = 2 ** exponent
21 |                 # print('checking mask %d' % bitmask)
22 |                 if byte & bitmask != 0:
23 |                     # print('adding 1 to count')
24 |                     bits_set += 1
25 | 
26 |     print(
27 |         '%s set, %s present, %6.2f%%' % (
28 |             bits_set,
29 |             total_bits,
30 |             bits_set * 100.0 / total_bits,
31 |         )
32 |     )
33 | 
34 | 
35 | if __name__ == '__main__':
36 |     main()
37 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from setuptools import find_packages
 3 | from setuptools import setup
 4 | 
 5 | 
 6 | setup(
 7 |     name="bloom-filter2",
 8 |     version="2.0.0",
 9 |     packages=find_packages('src'),
10 |     package_dir={'': 'src'},
11 |     author="Harshad Sharma",
12 |     author_email="harshad@sharma.io",
13 |     maintainer="Remi Rampin",
14 |     maintainer_email="remi@rampin.org",
15 |     description='Pure Python Bloom Filter module',
16 |     long_description=open('README.rst').read(),
17 |     license="MIT",
18 |     keywords="probabilistic set datastructure",
19 |     url='https://github.com/remram44/python-bloom-filter',
20 |     platforms='Cross platform',
21 |     classifiers=[
22 |         "Development Status :: 5 - Production/Stable",
23 |         "Intended Audience :: Developers",
24 |         "Programming Language :: Python :: 3",
25 |     ],
26 | )
27 | 


--------------------------------------------------------------------------------
/src/bloom_filter2/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | 
 4 | from .bloom_filter import (
 5 |     BloomFilter,
 6 |     get_filter_bitno_probes,
 7 |     get_bitno_seed_rnd,
 8 | )
 9 | 
10 | 
11 | __version__ = '2.0.0'
12 | 
13 | 
14 | __all__ = [
15 |     'BloomFilter',
16 |     'get_filter_bitno_probes',
17 |     'get_bitno_seed_rnd',
18 | ]
19 | 


--------------------------------------------------------------------------------
/src/bloom_filter2/bloom_filter.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # pylint: disable=superfluous-parens,redefined-variable-type
  3 | # superfluous-parens: Sometimes extra parens are more clear
  4 | 
  5 | """Bloom Filter: Probabilistic set membership testing for large sets"""
  6 | 
  7 | # Shamelessly borrowed (under MIT license) from
  8 | # https://code.activestate.com/recipes/577686-bloom-filter/
  9 | # About Bloom Filters: https://en.wikipedia.org/wiki/Bloom_filter
 10 | 
 11 | # Tweaked by Daniel Richard Stromberg, mostly to:
 12 | # 1) Give it a little nicer __init__ parameters.
 13 | # 2) Improve the hash functions to get a much lower rate of false positives.
 14 | # 3) Give it a selection of backends.
 15 | # 4) Make it pass pylint.
 16 | 
 17 | # In the literature:
 18 | # k is the number of probes - we call this num_probes_k
 19 | # m is the number of bits in the filter - we call this num_bits_m
 20 | # n is the ideal number of elements to eventually be stored in the filter - we
 21 | # call this ideal_num_elements_n
 22 | # p is the desired error rate when full - we call this error_rate_p
 23 | 
 24 | from __future__ import division
 25 | 
 26 | import array
 27 | import math
 28 | import os
 29 | import random
 30 | 
 31 | try:
 32 |     import mmap as mmap_mod
 33 | except ImportError:
 34 |     # Jython lacks mmap()
 35 |     HAVE_MMAP = False
 36 | else:
 37 |     HAVE_MMAP = True
 38 | 
 39 | 
 40 | class Mmap_backend(object):
 41 |     """
 42 |     Backend storage for our "array of bits" using an mmap'd file.
 43 |     Please note that this has only been tested on Linux so far.
 44 |     """
 45 | 
 46 |     effs = 2 ** 8 - 1
 47 | 
 48 |     def __init__(self, num_bits, filename):
 49 |         if not HAVE_MMAP:
 50 |             raise NotImplementedError("mmap is not available")
 51 |         self.num_bits = num_bits
 52 |         self.num_chars = (self.num_bits + 7) // 8
 53 |         flags = os.O_RDWR | os.O_CREAT
 54 |         if hasattr(os, 'O_BINARY'):
 55 |             flags |= getattr(os, 'O_BINARY')
 56 |         self.file_ = os.open(filename, flags)
 57 |         os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET)
 58 |         os.write(self.file_, b'\x00')
 59 |         self.mmap = mmap_mod.mmap(self.file_, self.num_chars)
 60 | 
 61 |     def is_set(self, bitno):
 62 |         """Return true iff bit number bitno is set"""
 63 |         byteno, bit_within_wordno = divmod(bitno, 8)
 64 |         mask = 1 << bit_within_wordno
 65 |         byte = self.mmap[byteno]
 66 |         return byte & mask
 67 | 
 68 |     def set(self, bitno):
 69 |         """set bit number bitno to true"""
 70 | 
 71 |         byteno, bit_within_byteno = divmod(bitno, 8)
 72 |         mask = 1 << bit_within_byteno
 73 |         byte = self.mmap[byteno]
 74 |         byte |= mask
 75 |         self.mmap[byteno] = byte
 76 | 
 77 |     def clear(self, bitno):
 78 |         """clear bit number bitno - set it to false"""
 79 | 
 80 |         byteno, bit_within_byteno = divmod(bitno, 8)
 81 |         mask = 1 << bit_within_byteno
 82 |         byte = self.mmap[byteno]
 83 |         byte &= Mmap_backend.effs - mask
 84 |         self.mmap[byteno] = byte
 85 | 
 86 |     def __iand__(self, other):
 87 |         assert self.num_bits == other.num_bits
 88 | 
 89 |         for byteno in range(self.num_chars):
 90 |             self.mmap[byteno] = (
 91 |                 self.mmap[byteno]
 92 |                 & other.mmap[byteno]
 93 |             )
 94 | 
 95 |         return self
 96 | 
 97 |     def __ior__(self, other):
 98 |         assert self.num_bits == other.num_bits
 99 | 
100 |         for byteno in range(self.num_chars):
101 |             self.mmap[byteno] = (
102 |                 self.mmap[byteno]
103 |                 | other.mmap[byteno]
104 |             )
105 | 
106 |         return self
107 | 
108 |     def close(self):
109 |         """Close the file"""
110 |         os.close(self.file_)
111 | 
112 | 
class File_seek_backend(object):
    """
    Bit-array backend that keeps every bit in a file and seeks to the relevant
    byte on each access.
    """

    effs = 2 ** 8 - 1

    def __init__(self, num_bits, filename):
        self.num_bits = num_bits
        self.num_chars = (self.num_bits + 7) // 8
        # O_BINARY is only defined on some platforms; OR in nothing elsewhere
        open_flags = os.O_RDWR | os.O_CREAT | getattr(os, 'O_BINARY', 0)
        self.file_ = os.open(filename, open_flags)
        # Write one zero byte at offset num_chars + 1 so the file is at least
        # that long
        os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET)
        os.write(self.file_, b'\x00')

    def _fetch(self, byteno):
        """Seek to byteno and return the byte stored there as an integer"""
        os.lseek(self.file_, byteno, os.SEEK_SET)
        return os.read(self.file_, 1)[0]

    def _store(self, byteno, value):
        """Seek to byteno and overwrite the byte there with value"""
        os.lseek(self.file_, byteno, os.SEEK_SET)
        os.write(self.file_, bytes([value]))

    def is_set(self, bitno):
        """Return a nonzero value iff bit number bitno is set"""
        byteno, offset = divmod(bitno, 8)
        return self._fetch(byteno) & (1 << offset)

    def set(self, bitno):
        """Turn bit number bitno on"""
        byteno, offset = divmod(bitno, 8)
        self._store(byteno, self._fetch(byteno) | (1 << offset))

    def clear(self, bitno):
        """Turn bit number bitno off"""
        byteno, offset = divmod(bitno, 8)
        keep = File_seek_backend.effs - (1 << offset)
        self._store(byteno, self._fetch(byteno) & keep)

    # Bit-at-a-time combination is slow but simple; a faster version would
    # need more work.
    def __iand__(self, other):
        assert self.num_bits == other.num_bits

        for bitno in range(self.num_bits):
            both = self.is_set(bitno) and other.is_set(bitno)
            update = self.set if both else self.clear
            update(bitno)

        return self

    def __ior__(self, other):
        assert self.num_bits == other.num_bits

        for bitno in range(self.num_bits):
            either = self.is_set(bitno) or other.is_set(bitno)
            update = self.set if either else self.clear
            update(bitno)

        return self

    def close(self):
        """Release the underlying file descriptor"""
        os.close(self.file_)
185 | 
186 | 
class Array_then_file_seek_backend(object):
    # pylint: disable=R0902
    # R0902: We kinda need a bunch of instance attributes
    """
    Backend storage for our "array of bits" using a python array of integers up
    to some maximum number of bytes, then spilling over to a file.
    This is -not- a cache; we instead save the leftmost bits in RAM, and the
    rightmost bits (if necessary) in a file.  On open, we read from the file to
    RAM.  On close, we write from RAM to the file.

    Byte number byteno of the bit array always corresponds to offset byteno in
    the file, whether it is currently held in RAM or accessed on disk.
    """

    # A byte with all eight bits set; used to build clearing masks
    effs = 2 ** 8 - 1

    def __init__(self, num_bits, filename, max_bytes_in_memory):
        """
        Open filename (creating it if needed) and load the first
        max_bytes_in_memory bytes of the bit array into RAM.
        """
        self.num_bits = num_bits
        num_chars = (self.num_bits + 7) // 8
        self.filename = filename
        self.max_bytes_in_memory = max_bytes_in_memory
        self.bits_in_memory = min(num_bits, self.max_bytes_in_memory * 8)
        self.bits_in_file = max(self.num_bits - self.bits_in_memory, 0)
        self.bytes_in_memory = (self.bits_in_memory + 7) // 8
        self.bytes_in_file = (self.bits_in_file + 7) // 8

        self.array_ = array.array('B', [0]) * self.bytes_in_memory
        flags = os.O_RDWR | os.O_CREAT
        if hasattr(os, 'O_BINARY'):
            flags |= getattr(os, 'O_BINARY')
        self.file_ = os.open(filename, flags)
        # Writing a byte past the end makes the file long enough for all bits
        os.lseek(self.file_, num_chars + 1, os.SEEK_SET)
        os.write(self.file_, b'\x00')

        # Load the in-memory portion from the start of the file, block by block
        os.lseek(self.file_, 0, os.SEEK_SET)
        offset = 0
        intended_block_len = 2 ** 17
        while True:
            if offset + intended_block_len < self.bytes_in_memory:
                block = os.read(self.file_, intended_block_len)
            elif offset < self.bytes_in_memory:
                block = os.read(self.file_, self.bytes_in_memory - offset)
            else:
                break
            for index_in_block, byte in enumerate(block):
                self.array_[offset + index_in_block] = byte
            offset += intended_block_len

    def _read_file_byte(self, byteno):
        """Return the on-disk byte at offset byteno as an integer"""
        os.lseek(self.file_, byteno, os.SEEK_SET)
        return os.read(self.file_, 1)[0]

    def _write_file_byte(self, byteno, byte):
        """Overwrite the on-disk byte at offset byteno"""
        os.lseek(self.file_, byteno, os.SEEK_SET)
        os.write(self.file_, bytes([byte]))

    def is_set(self, bitno):
        """Return true iff bit number bitno is set"""
        byteno, bit_within_byteno = divmod(bitno, 8)
        mask = 1 << bit_within_byteno
        if byteno < self.bytes_in_memory:
            return self.array_[byteno] & mask
        return self._read_file_byte(byteno) & mask

    def set(self, bitno):
        """set bit number bitno to true"""
        byteno, bit_within_byteno = divmod(bitno, 8)
        mask = 1 << bit_within_byteno
        if byteno < self.bytes_in_memory:
            self.array_[byteno] |= mask
        else:
            self._write_file_byte(byteno, self._read_file_byte(byteno) | mask)

    def clear(self, bitno):
        """clear bit number bitno - set it to false"""
        byteno, bit_within_byteno = divmod(bitno, 8)
        # Every bit of the byte except the target.  The old code built this
        # from the 32-bit Array_backend.effs and then inverted it a second
        # time for on-disk bytes, which kept the target bit and wiped the
        # other seven.
        mask = Array_then_file_seek_backend.effs - (1 << bit_within_byteno)
        if byteno < self.bytes_in_memory:
            self.array_[byteno] &= mask
        else:
            self._write_file_byte(byteno, self._read_file_byte(byteno) & mask)

    # These are quite slow ways to do iand and ior, but they should work,
    # and a faster version is going to take more time
    def __iand__(self, other):
        """In-place AND, one bit at a time, with a backend of the same size"""
        assert self.num_bits == other.num_bits

        for bitno in range(self.num_bits):
            if self.is_set(bitno) and other.is_set(bitno):
                self.set(bitno)
            else:
                self.clear(bitno)

        return self

    def __ior__(self, other):
        """In-place OR, one bit at a time, with a backend of the same size"""
        assert self.num_bits == other.num_bits

        for bitno in range(self.num_bits):
            if self.is_set(bitno) or other.is_set(bitno):
                self.set(bitno)
            else:
                self.clear(bitno)

        return self

    def close(self):
        """
        Write the in-memory portion to disk, leave the already-on-disk  portion
        unchanged
        """

        os.lseek(self.file_, 0, os.SEEK_SET)
        os.write(self.file_, bytes(self.array_[0:self.bytes_in_memory]))

        os.close(self.file_)
303 | 
304 | 
class Array_backend(object):
    """
    Purely in-memory "array of bits": bit number bitno is kept in word
    bitno // 32 of a python array, at bit position bitno % 32.
    """

    # Note that this has now been split out into a bits_mod for the benefit of
    # other projects.
    effs = 2 ** 32 - 1

    def __init__(self, num_bits):
        self.num_bits = num_bits
        self.num_words = (self.num_bits + 31) // 32
        self.array_ = array.array('L', [0]) * self.num_words

    def is_set(self, bitno):
        """Return a nonzero value iff bit number bitno is set"""
        return self.array_[bitno >> 5] & (1 << (bitno & 31))

    def set(self, bitno):
        """Turn bit number bitno on"""
        self.array_[bitno >> 5] |= 1 << (bitno & 31)

    def clear(self, bitno):
        """Turn bit number bitno off"""
        keep = Array_backend.effs - (1 << (bitno & 31))
        self.array_[bitno >> 5] &= keep

    # Word-at-a-time combination is kept here rather than in a shared base
    # class because a generic bit-at-a-time version would be much slower.

    def __iand__(self, other):
        assert self.num_bits == other.num_bits

        mine, theirs = self.array_, other.array_
        for wordno in range(self.num_words):
            mine[wordno] &= theirs[wordno]

        return self

    def __ior__(self, other):
        assert self.num_bits == other.num_bits

        mine, theirs = self.array_, other.array_
        for wordno in range(self.num_words):
            mine[wordno] |= theirs[wordno]

        return self

    def close(self):
        """Nothing to release; present so all backends share one interface"""
        return None
359 | 
360 | 
def get_bitno_seed_rnd(bloom_filter, key):
    """
    Yield num_probes_k bit numbers for key.

    The key seeds a private pseudorandom number generator, and each probe is
    the next number drawn from it, reduced modulo num_bits_m.
    """

    generator = random.Random(key)
    for _unused in range(bloom_filter.num_probes_k):
        drawn = generator.randrange(bloom_filter.num_bits_m)
        yield drawn % bloom_filter.num_bits_m
373 | 
374 | 
375 | MERSENNES1 = [2 ** x - 1 for x in [17, 31, 127]]
376 | MERSENNES2 = [2 ** x - 1 for x in [19, 67, 257]]
377 | 
378 | 
def simple_hash(int_list, prime1, prime2, prime3):
    """
    Fold a list of integers into a single hash value.

    For each integer, the running total is increased by
    ((total + integer + prime1) * prime2) % prime3.
    """
    accumulator = 0
    for value in int_list:
        mixed = (accumulator + value + prime1) * prime2
        accumulator = accumulator + mixed % prime3
    return accumulator
385 | 
386 | 
def hash1(int_list):
    """First basic hash: simple_hash using the three MERSENNES1 values"""
    first, second, third = MERSENNES1
    return simple_hash(int_list, first, second, third)
390 | 
391 | 
def hash2(int_list):
    """Second basic hash: simple_hash using the three MERSENNES2 values"""
    first, second, third = MERSENNES2
    return simple_hash(int_list, first, second, third)
395 | 
396 | 
def get_filter_bitno_probes(bloom_filter, key):
    """
    Apply num_probes_k hash functions to key.

    Generate the bit number (in range(num_bits_m)) for each result.

    key may be a number, a str, bytes, or another sequence of integers.
    Raises TypeError for a non-empty sequence of anything else.
    """

    if hasattr(key, '__divmod__'):
        # Numeric key: break its magnitude into base-256 digits, least
        # significant digit first.
        int_list = []
        temp = key
        negative = temp < 0
        if negative:
            # divmod() floors towards negative infinity, so a negative value
            # never reaches zero (divmod(-1, 256) == (-1, 255)) and the loop
            # below would never terminate.  Work on the magnitude instead.
            temp = -temp
        while temp:
            temp, remainder = divmod(temp, 256)
            int_list.append(remainder)
        if negative:
            # 256 is never produced as a remainder above, so a negative key
            # gets a different digit list than its positive counterpart.
            int_list.append(256)
    elif isinstance(key, (list, tuple, str, bytes)) and not key:
        int_list = []
    elif hasattr(key[0], '__divmod__'):
        # Already a sequence of numbers (this includes bytes)
        int_list = key
    elif isinstance(key[0], str):
        int_list = [ord(char) for char in key]
    else:
        raise TypeError('Sorry, I do not know how to hash this type')

    hash_value1 = hash1(int_list)
    hash_value2 = hash2(int_list)
    probe_value = hash_value1

    # Each probe is derived from the previous one, so the two base hashes
    # produce num_probes_k distinct-looking bit numbers.
    for _ in range(bloom_filter.num_probes_k):
        probe_value *= hash_value1
        probe_value += hash_value2
        probe_value %= MERSENNES1[2]
        yield probe_value % bloom_filter.num_bits_m
431 | 
432 | 
def try_unlink(filename):
    """Best-effort removal of filename; any OSError is silently ignored"""
    try:
        os.remove(filename)
    except OSError:
        # Most commonly the file simply is not there
        return None
    return None
440 | 
441 | 
class BloomFilter(object):
    """Probabilistic set membership testing for large sets"""

    def __init__(self,
                 max_elements=10000,
                 error_rate=0.1,
                 probe_bitnoer=get_filter_bitno_probes,
                 filename=None,
                 start_fresh=False):
        """
        Size the filter and choose a storage backend.

        max_elements: ideal number of elements to store (n); must be > 0.
        error_rate: desired error rate when full (p); 0 < error_rate < 1.
        probe_bitnoer: callable(bloom_filter, key) yielding bit numbers.
        filename: None for the in-memory array backend; a (path, -1) tuple
            for the mmap backend; a (path, max_bytes_in_memory) tuple for the
            array-then-file backend; any other value is used as the path of
            the file-seek backend.
        start_fresh: when true, unlink an existing file before opening it.

        Raises ValueError when max_elements or error_rate is out of range.
        """
        # pylint: disable=R0913
        # R0913: We want a few arguments

        # Assign this first so close() and __del__ stay safe when validation
        # or backend construction below raises.
        self.backend = None

        if max_elements <= 0:
            raise ValueError('ideal_num_elements_n must be > 0')
        if not (0 < error_rate < 1):
            raise ValueError('error_rate_p must be between 0 and 1 exclusive')

        self.error_rate_p = error_rate
        # With fewer elements, we should do very well. With more elements, our
        # error rate "guarantee" drops rapidly.
        self.ideal_num_elements_n = max_elements

        # m = -n * ln(p) / ln(2) ** 2, rounded up
        numerator = (
            -1
            * self.ideal_num_elements_n
            * math.log(self.error_rate_p)
        )
        denominator = math.log(2) ** 2
        real_num_bits_m = numerator / denominator
        self.num_bits_m = int(math.ceil(real_num_bits_m))

        if filename is None:
            self.backend = Array_backend(self.num_bits_m)
        elif isinstance(filename, tuple) and isinstance(filename[1], int):
            if start_fresh:
                try_unlink(filename[0])
            if filename[1] == -1:
                self.backend = Mmap_backend(self.num_bits_m, filename[0])
            else:
                self.backend = Array_then_file_seek_backend(
                    self.num_bits_m,
                    filename[0],
                    filename[1],
                )
        else:
            if start_fresh:
                try_unlink(filename)
            self.backend = File_seek_backend(self.num_bits_m, filename)

        # AKA num_offsetters
        # Verified against
        # https://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
        # k = (m / n) * ln(2), rounded up
        real_num_probes_k = (
            (self.num_bits_m / self.ideal_num_elements_n)
            * math.log(2)
        )
        self.num_probes_k = int(math.ceil(real_num_probes_k))
        self.probe_bitnoer = probe_bitnoer

    def __repr__(self):
        return (
            'BloomFilter(ideal_num_elements_n=%d, error_rate_p=%f, '
            + 'num_bits_m=%d)'
        ) % (
            self.ideal_num_elements_n,
            self.error_rate_p,
            self.num_bits_m,
        )

    def add(self, key):
        """Add an element to the filter"""
        for bitno in self.probe_bitnoer(self, key):
            self.backend.set(bitno)

    def __iadd__(self, key):
        self.add(key)
        return self

    def _match_template(self, bloom_filter):
        """
        Compare a sort of signature for two bloom filters.

        Used in preparation for binary operations
        """
        return (self.num_bits_m == bloom_filter.num_bits_m
                and self.num_probes_k == bloom_filter.num_probes_k
                and self.probe_bitnoer == bloom_filter.probe_bitnoer)

    def union(self, bloom_filter):
        """Compute the set union of two bloom filters, in place"""
        self.backend |= bloom_filter.backend

    def __ior__(self, bloom_filter):
        self.union(bloom_filter)
        return self

    def intersection(self, bloom_filter):
        """Compute the set intersection of two bloom filters, in place"""
        self.backend &= bloom_filter.backend

    def __iand__(self, bloom_filter):
        self.intersection(bloom_filter)
        return self

    def __contains__(self, key):
        """Return False if key was definitely never added, else True"""
        for bitno in self.probe_bitnoer(self, key):
            if not self.backend.is_set(bitno):
                return False
        return True

    def close(self):
        """
        Close the backend.  Calling this more than once is harmless, so an
        explicit close() inside a ``with`` block no longer breaks __exit__.
        """
        # getattr guards against an instance whose __init__ never ran
        backend = getattr(self, 'backend', None)
        self.backend = None
        if backend is not None:
            backend.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()

    def __del__(self):
        # Safe even when __init__ raised before a backend was created
        self.close()
565 | 


--------------------------------------------------------------------------------
/tests/test_bloom_filter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # coding=utf-8
  3 | 
  4 | # pylint: disable=superfluous-parens
  5 | # superfluous-parens: Parentheses are good for clarity and portability
  6 | 
  7 | """Unit tests for bloom_filter_mod"""
  8 | 
  9 | import dbm
 10 | import math
 11 | import os
 12 | import random
 13 | import sys
 14 | import time
 15 | import unittest
 16 | 
 17 | import bloom_filter2
 18 | 
 19 | 
# Alphabet that random test strings are drawn from: lowercase ASCII letters
# followed by the ten digits.
CHARACTERS = 'abcdefghijklmnopqrstuvwxyz1234567890'
 21 | 
 22 | 
 23 | class States(object):
 24 |     """Generate the USA's state names"""
 25 | 
 26 |     def __init__(self):
 27 |         pass
 28 | 
 29 |     states = """Alabama Alaska Arizona Arkansas California Colorado Connecticut
 30 |         Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas
 31 |         Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota
 32 |         Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey
 33 |         NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon
 34 |         Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah
 35 |         Vermont Virginia Washington WestVirginia Wisconsin Wyoming""".split()
 36 | 
 37 |     @staticmethod
 38 |     def generator():
 39 |         """Generate the states"""
 40 |         for state in States.states:
 41 |             yield state
 42 | 
 43 |     @staticmethod
 44 |     def within(value):
 45 |         """Is the value in our list of states?"""
 46 |         return value in States.states
 47 | 
 48 |     @staticmethod
 49 |     def length():
 50 |         """What is the length of our contained values?"""
 51 |         return len(States.states)
 52 | 
 53 | 
 54 | def random_string():
 55 |     """Generate a random, 10 character string - for testing purposes"""
 56 |     list_ = []
 57 |     for _ in range(10):
 58 |         character = CHARACTERS[int(random.random() * len(CHARACTERS))]
 59 |         list_.append(character)
 60 |     return ''.join(list_)
 61 | 
 62 | 
 63 | class Random_content(object):
 64 |     """Generated a bunch of random strings in sorted order"""
 65 | 
 66 |     random_content = [random_string() for dummy in range(1000)]
 67 | 
 68 |     def __init__(self):
 69 |         pass
 70 | 
 71 |     @staticmethod
 72 |     def generator():
 73 |         """Generate all values"""
 74 |         for item in Random_content.random_content:
 75 |             yield item
 76 | 
 77 |     @staticmethod
 78 |     def within(value):
 79 |         """Test for membership"""
 80 |         return value in Random_content.random_content
 81 | 
 82 |     @staticmethod
 83 |     def length():
 84 |         """How many members?"""
 85 |         return len(Random_content.random_content)
 86 | 
 87 | 
 88 | class Evens(object):
 89 |     """Generate a bunch of even numbers"""
 90 | 
 91 |     def __init__(self, maximum):
 92 |         self.maximum = maximum
 93 | 
 94 |     def generator(self):
 95 |         """Generate all values"""
 96 |         for value in range(self.maximum):
 97 |             if value % 2 == 0:
 98 |                 yield str(value)
 99 | 
100 |     def within(self, value):
101 |         """Test for membership"""
102 |         try:
103 |             int_value = int(value)
104 |         except ValueError:
105 |             return False
106 | 
107 |         if int_value >= 0 and int_value < self.maximum and int_value % 2 == 0:
108 |             return True
109 |         else:
110 |             return False
111 | 
112 |     def length(self):
113 |         """How many members?"""
114 |         return int(math.ceil(self.maximum / 2.0))
115 | 
116 | 
def give_description(filename):
    """
    Return a short name for the kind of filter a filename argument selects

    None gives 'array'; a tuple whose second item is -1 gives 'mmap'; any
    other tuple gives 'hybrid'; anything else gives 'seek'.
    """
    if filename is None:
        return 'array'
    if not isinstance(filename, tuple):
        return 'seek'
    return 'mmap' if filename[1] == -1 else 'hybrid'
132 | 
133 | 
class TestBloomFilter(unittest.TestCase):
    """Membership, set-operator, error-rate and timing tests for BloomFilter"""

    # Scratch file used by the file-backed runs; removed again after the test.
    _SCRATCH_FILENAME = 'bloom-filter-rm-me'

    @staticmethod
    def _remove_file(filename):
        """Delete filename; a file that does not exist is not an error"""
        try:
            os.unlink(filename)
        except FileNotFoundError:
            pass

    def _letters_filter(self, letters):
        """
        Build an in-memory filter holding each character of letters

        The filter is registered for cleanup, so it is closed when the
        current test finishes whether the test passes or fails.
        """
        bloom = bloom_filter2.BloomFilter(max_elements=100, error_rate=0.01)
        self.addCleanup(bloom.close)
        for letter in letters:
            bloom += letter
        return bloom

    def _test(
        self,
        description, values, trials, error_rate,
        probe_bitnoer=None, filename=None,
    ):
        # pylint: disable=R0913,R0914
        # R0913: We want a few arguments
        # R0914: We want some local variables too.  This is just test code.
        """
        Fill a filter from values, then check membership and error rate

        description: label printed together with the filter's size.
        values: object providing generator(), within(value) and length().
        trials: number of random non-members to probe; must be positive.
        error_rate: requested false positive rate; the measured rate has to
            be strictly lower for the test to pass.
        probe_bitnoer: probe function given to BloomFilter; defaults to
            bloom_filter2.get_filter_bitno_probes.
        filename: passed through unchanged to BloomFilter.
        """
        if not probe_bitnoer:
            probe_bitnoer = bloom_filter2.get_filter_bitno_probes

        divisor = 100000
        num_values = values.length()

        # The filter is sized for twice as many elements as will be added.
        bloom = bloom_filter2.BloomFilter(
            max_elements=num_values * 2,
            error_rate=error_rate,
            probe_bitnoer=probe_bitnoer,
            filename=filename,
            start_fresh=True,
        )

        # Leaving the with block closes the filter even when an assertion
        # below fails, so a failing run does not leave its backend open.
        with bloom:
            message = '\ndescription: %s num_bits_m: %s num_probes_k: %s\n'
            filled_out_message = message % (
                description,
                bloom.num_bits_m,
                bloom.num_probes_k,
            )

            sys.stdout.write(filled_out_message)

            print('starting to add values to an empty bloom filter')
            for valueno, value in enumerate(values.generator()):
                reverse_valueno = num_values - valueno
                if reverse_valueno % divisor == 0:
                    print('adding valueno %d' % reverse_valueno)
                bloom.add(value)

            print('testing all known members')
            include_in_count = sum(
                include in bloom
                for include in values.generator()
            )
            self.assertEqual(include_in_count, num_values)

            print('testing random non-members')
            false_positives = 0
            for trialno in range(trials):
                if trialno % divisor == 0:
                    print(
                        'trialno progress: %d / %d' % (trialno, trials),
                        file=sys.stderr,
                    )
                while True:
                    candidate = ''.join(random.sample(CHARACTERS, 5))
                    # If we accidentally found a member, try again
                    if values.within(candidate):
                        continue
                    if candidate in bloom:
                        false_positives += 1
                    break

            actual_error_rate = float(false_positives) / trials

            self.assertLess(
                actual_error_rate, error_rate,
                "Too many false positives: actual: %s, expected: %s" % (
                    actual_error_rate,
                    error_rate,
                ),
            )

    def test_and(self):
        """Test the & operator"""

        abc = self._letters_filter('abc')
        bcd = self._letters_filter('bcd')

        abc &= bcd

        self.assertNotIn('a', abc)
        self.assertIn('b', abc)
        self.assertIn('c', abc)
        self.assertNotIn('d', abc)

    def test_or(self):
        """Test the | operator"""

        abc = self._letters_filter('abc')
        bcd = self._letters_filter('bcd')

        abc |= bcd

        self.assertIn('a', abc)
        self.assertIn('b', abc)
        self.assertIn('c', abc)
        self.assertIn('d', abc)
        self.assertNotIn('e', abc)

    def test_states(self):
        """Check an in-memory filter loaded with the state names"""
        self._test('states', States(), trials=100000, error_rate=0.01)

    def test_random(self):
        """Check random strings across error rates, probes and backends"""
        self._test('random', Random_content(), trials=10000, error_rate=0.1)
        self._test('random', Random_content(), trials=1000000, error_rate=1E-9)
        self._test('random', Random_content(), trials=10000, error_rate=0.1,
                   probe_bitnoer=bloom_filter2.get_bitno_seed_rnd)

        filename = self._SCRATCH_FILENAME
        # Do not leave the scratch file behind in the working directory.
        self.addCleanup(self._remove_file, filename)
        self._test(
            'random',
            Random_content(),
            trials=10000,
            error_rate=0.1,
            filename=filename,
        )

    @unittest.skipUnless(os.environ.get('TEST_PERF', ''), "disabled")
    def test_performance(self):
        """
        Time _test on growing inputs for every kind of backend

        Only runs when the TEST_PERF environment variable is set.  Each
        timing is stored in the 'performance-numbers' dbm database under
        '<description> <elements>'; combinations already stored there are
        not measured again.
        """

        scratch = self._SCRATCH_FILENAME
        self.addCleanup(self._remove_file, scratch)

        # Element counts from which a given backend is no longer measured.
        # 'hybrid' has no entry and is always measured.
        skip_from = {
            'seek': 100000000,
            'mmap': 100000000,
            'array': 1000000000,
        }

        sqrt_of_10 = math.sqrt(10)
        for exponent in range(19):  # it's a lot, but probably not unreasonable
            elements = int(sqrt_of_10 ** exponent + 0.5)
            for filename in [
                None,
                scratch,
                (scratch, 768 * 2 ** 20),
                (scratch, -1),
            ]:
                description = give_description(filename)
                key = '%s %s' % (description, elements)
                with dbm.open('performance-numbers', 'c') as database:
                    # Ask the database itself: keys() returns bytes objects,
                    # which a str key never compares equal to.
                    if key in database:
                        continue
                limit = skip_from.get(description)
                if limit is not None and elements >= limit:
                    continue
                time0 = time.time()
                self._test(
                    'evens %s elements: %d' % (description, elements),
                    Evens(elements),
                    trials=elements,
                    error_rate=1e-2,
                    filename=filename,
                )
                time1 = time.time()
                delta_t = time1 - time0
                with dbm.open('performance-numbers', 'c') as database:
                    database[key] = '%f' % delta_t

    def test_probe_count(self):
        """A very lax error rate must still give one probe, never zero"""
        bloom = bloom_filter2.BloomFilter(1000000, error_rate=.99)
        self.addCleanup(bloom.close)
        self.assertEqual(bloom.num_probes_k, 1)
321 | 
# Run the whole test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
324 | 


--------------------------------------------------------------------------------