├── src
    ├── bloom_filter
    │   ├── __init__.py
    │   └── bloom_filter.py
    └── python2x3.py
├── AUTHORS.md
├── bin
    ├── count_bits.py
    └── gen_performance_graph.py
├── Makefile
├── README.md
├── setup.py
├── .gitignore
└── tests
    └── test_bloom_filter.py


/src/bloom_filter/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | 
 4 | from .bloom_filter import (
 5 |     BloomFilter,
 6 |     get_filter_bitno_probes,
 7 |     get_bitno_seed_rnd,
 8 | )
 9 | 
10 | __all__ = [
11 |     'BloomFilter',
12 |     'get_filter_bitno_probes',
13 |     'get_bitno_seed_rnd',
14 | ]
15 | 


--------------------------------------------------------------------------------
/AUTHORS.md:
--------------------------------------------------------------------------------
 1 | Original code:
 2 | 
 3 |  - http://code.activestate.com/recipes/577686-bloom-filter/
 4 |  - Author: Sundar Srinivasan
 5 | 
 6 | Forked into SVN Repo:
 7 | 
 8 |  - http://stromberg.dnsalias.org/svn/bloom-filter/trunk/
 9 |  - Author: Daniel Richard Stromberg
10 | 
11 | Forked to GitHub, renamed to `bloom_filter`:
12 | 
13 |  - https://github.com/hiway/python-bloom-filter
14 |  - Author: Harshad Sharma
15 | 


--------------------------------------------------------------------------------
/bin/count_bits.py:
--------------------------------------------------------------------------------
 1 | #!/usr/local/pypy-1.6/bin/pypy
 2 | 
 3 | import sys
 4 | 
 5 | total_bits = 0
 6 | bits_set = 0
 7 | 
 8 | while True:
 9 |     block = sys.stdin.read(2 ** 19)
10 |     if not block:
11 |         break
12 |     total_bits += len(block) * 8
13 |     # print('got block of length %d' % len(block))
14 |     for char in block:
15 |         byte = ord(char)
16 |         # print('got char %d' % byte)
17 |         for exponent in range(8):
18 |             bitmask = 2 ** exponent
19 |             # print('checking mask %d' % bitmask)
20 |             if byte & bitmask != 0:
21 |                 # print('adding 1 to count')
22 |                 bits_set += 1
23 | 
24 | print('%s set, %s present, %6.2f%%' % (bits_set, total_bits, bits_set * 100.0 / total_bits))
25 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | go: performance-graph.pdf
 3 | 	evince performance-graph.pdf
 4 | 
 5 | performance-graph.pdf: performance-numbers.db gen-performance-graph
 6 | 	./gen-performance-graph
 7 | 
 8 | performance-numbers.db: test-bloom-filter
 9 | 	./this-pylint \
10 | 		--ignore-message ".*Unable to import 'dbm'" \
11 | 		--ignore-message ".*Unable to import 'anydbm'" \
12 | 		--to-pylint bloom_filter_mod.py test-bloom-filter
13 | 	rm -f seek.txt array.txt hybrid.txt mmap.txt
14 | 	#/usr/local/pypy-2.3.1/bin/pypy ./test-bloom-filter --performance-test
15 | 	/usr/local/pypy-2.3.1/bin/pypy ./test-bloom-filter
16 | 	/usr/local/cpython-3.4/bin/python ./test-bloom-filter
17 | 	/usr/local/cpython-2.5/bin/python ./test-bloom-filter
18 | 	#/usr/local/cpython-2.7/bin/python ./test-bloom-filter
19 | 	#/usr/local/cpython-3.0/bin/python ./test-bloom-filter
20 | 	/usr/local/jython-2.7b3/bin/jython ./test-bloom-filter
21 | 
22 | clean:
23 | 	rm -f *.pyc *.class
24 | 	rm -rf __pycache__
25 | 	rm -f bloom-filter-rm-me
26 | 	rm -f *.ps *.pdf
27 | 	rm -f seek.txt array.txt
28 | 	rm -rf dist build bloom_filter.egg-info
29 | 	rm -f performance-numbers
30 | 
31 | veryclean: clean
32 | 	rm -f performance-numbers.db
33 | 	rm -f performance-numbers
34 | 
35 | build:
36 | 	python setup.py sdist bdist_wheel
37 | 
38 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | > Note: This project has been unmaintained for a while;
 2 | please use the more up-to-date project at:
 3 | - https://github.com/remram44/python-bloom-filter
 4 | - https://pypi.org/project/bloom-filter2/
 5 | 
 6 | # bloom-filter
 7 | 
 8 | This project builds on `drs-bloom-filter` and `bloom_filter_mod`.
 9 | Credits and links can be found in AUTHORS.md.
10 | 
11 | ## Installation
12 | 
13 |     pip install bloom_filter
14 | 
15 | 
16 | ## Example:
17 | 
18 |     from bloom_filter import BloomFilter
19 | 
20 |     have_met = BloomFilter()
21 | 
22 |     def have_i_met(name):
23 |         met = name in have_met
24 |         print('Have I met {} before: {}'.format(name, met))
25 | 
26 |     def meet(name):
27 |         have_met.add(name)
28 |         print('Hello, {}'.format(name))
29 | 
30 |     for name in ['Harry', 'Larry', 'Moe']:
31 |         have_i_met(name)
32 |         meet(name)
33 |         have_i_met(name)
34 | 
35 | 
36 | ## Usage:
37 | 
38 |     from bloom_filter import BloomFilter
39 | 
40 |     # instantiate BloomFilter with custom settings,
41 |     # max_elements is how many elements you expect the filter to hold.
42 |     # error_rate defines accuracy; You can use defaults with
43 |     # `BloomFilter()` without any arguments. Following example
44 |     # is same as defaults:
45 |     bloom = BloomFilter(max_elements=10000, error_rate=0.1)
46 | 
47 |     # Test whether the bloom-filter has seen a key:
48 |     assert "test-key" not in bloom
49 | 
50 |     # Mark the key as seen
51 |     bloom.add("test-key")
52 | 
53 |     # Now check again
54 |     assert "test-key" in bloom
55 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # -*- encoding: utf-8 -*-
 3 | from __future__ import absolute_import
 4 | from __future__ import print_function
 5 | 
 6 | from glob import glob
 7 | from os.path import basename
 8 | from os.path import splitext
 9 | 
10 | from setuptools import find_packages
11 | from setuptools import setup
12 | 
13 | 
# Package metadata.  The usage example embedded in long_description must be
# runnable as shown: `x in bloom is False` is a chained comparison meaning
# `(x in bloom) and (bloom is False)`, which always fails, so the example
# uses plain `in` / `not in` membership tests instead.
setup(
    name="bloom_filter",
    version="1.3",
    packages=find_packages('src'),
    package_dir={'': 'src'},
    # Every top-level module under src/ is shipped as a py_module.
    py_modules=[splitext(basename(path))[0] for path in glob('src/*.py')],

    # metadata for upload to PyPI
    author="Harshad Sharma",
    author_email="harshad@sharma.io",
    description='Pure Python Bloom Filter module',
    long_description="""
A pure python bloom filter (low storage requirement, probabilistic
set datastructure) is provided.  It is known to work on CPython 2.x,
CPython 3.x, Pypy and Jython.

Includes mmap, in-memory and disk-seek backends.

The user specifies the desired maximum number of elements and the
desired maximum false positive probability, and the module
calculates the rest.

Usage:

::

    from bloom_filter import BloomFilter

    # instantiate BloomFilter with custom settings,
    # max_elements is how many elements you expect the filter to hold.
    # error_rate defines accuracy; You can use defaults with
    # `BloomFilter()` without any arguments. Following example
    # is same as defaults:
    bloom = BloomFilter(max_elements=10000, error_rate=0.1)

    # Test whether the bloom-filter has seen a key:
    assert "test-key" not in bloom

    # Mark the key as seen
    bloom.add("test-key")

    # Now check again
    assert "test-key" in bloom
    
""",
    license="MIT",
    keywords="probabilistic set datastructure",
    url='https://github.com/hiway/python-bloom-filter',
    platforms='Cross platform',
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
        "Programming Language :: Python :: 2",
        "Programming Language :: Python :: 3",
    ],
)
70 | 


--------------------------------------------------------------------------------
/src/python2x3.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | #!/usr/bin/env python
 3 | 
 4 | # pylint: disable=invalid-name
 5 | # invalid-name: We define a few global-scope constants in lower case.  Deal with it.
 6 | 
 7 | '''Provides code and data to facilitate writing python code that runs on 2.x and 3.x, including pypy'''
 8 | 
 9 | # I'm afraid pylint won't like this one...
10 | 
11 | import sys
12 | 
13 | 
def python_major():
    '''Report the interpreter's major version number (e.g. 2 or 3) as an integer'''
    # sys.version_info is used rather than the platform module, which the
    # original author found to fail on IronPython.
    return sys.version_info[0]
20 | 
# Pick the byte-string helpers that match the running interpreter.  Each
# branch defines the same set of names so importers need no version checks.
if python_major() == 2:
    # On 2.x the native str type already is the byte string type.
    empty_bytes = ''
    null_byte = '\0'
    bytes_type = str

    def intlist_to_binary(intlist):
        '''Build a byte string from a sequence of integer byte values'''
        return ''.join(map(chr, intlist))

    def string_to_binary(string):
        '''Return the text string unchanged; it is already a byte string on 2.x'''
        return string

    def binary_to_intlist(binary):
        '''Turn a byte string into a list of integer byte values'''
        return list(map(ord, binary))

    def binary_to_string(binary):
        '''Return the byte string unchanged; it is already a text string on 2.x'''
        return binary
elif python_major() == 3:
    # On 3.x text (str) and binary data (bytes) are distinct types.
    empty_bytes = bytes()
    null_byte = bytes((0,))
    bytes_type = bytes

    def intlist_to_binary(intlist):
        '''Build a bytes object from a sequence of integer byte values'''
        return bytes(intlist)

    def string_to_binary(string):
        '''Encode a text string as latin-1 bytes; anything that is not a str is returned as-is'''
        if not isinstance(string, str):
            return string
        return string.encode('latin-1')

    def binary_to_intlist(binary):
        '''Return the binary value unchanged; iterating bytes on 3.x already yields integers'''
        return binary

    def binary_to_string(binary):
        '''Decode a bytes object to a text string using latin-1'''
        return binary.decode('latin-1')
else:
    sys.stderr.write('%s: Python < 2 or > 3 not (yet) supported\n' % sys.argv[0])
    sys.exit(1)
67 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Created by .ignore support plugin (hsz.mobi)
  2 | ### JetBrains template
  3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm
  4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
  5 | 
  6 | # User-specific stuff:
  7 | .idea/**/workspace.xml
  8 | .idea/**/tasks.xml
  9 | .idea/dictionaries
 10 | 
 11 | # Sensitive or high-churn files:
 12 | .idea/**/dataSources/
 13 | .idea/**/dataSources.ids
 14 | .idea/**/dataSources.xml
 15 | .idea/**/dataSources.local.xml
 16 | .idea/**/sqlDataSources.xml
 17 | .idea/**/dynamic.xml
 18 | .idea/**/uiDesigner.xml
 19 | 
 20 | # Gradle:
 21 | .idea/**/gradle.xml
 22 | .idea/**/libraries
 23 | 
 24 | # Mongo Explorer plugin:
 25 | .idea/**/mongoSettings.xml
 26 | 
 27 | ## File-based project format:
 28 | *.iws
 29 | 
 30 | ## Plugin-specific files:
 31 | 
 32 | # IntelliJ
 33 | /out/
 34 | 
 35 | # mpeltonen/sbt-idea plugin
 36 | .idea_modules/
 37 | 
 38 | # JIRA plugin
 39 | atlassian-ide-plugin.xml
 40 | 
 41 | # Crashlytics plugin (for Android Studio and IntelliJ)
 42 | com_crashlytics_export_strings.xml
 43 | crashlytics.properties
 44 | crashlytics-build.properties
 45 | fabric.properties
 46 | ### VirtualEnv template
 47 | # Virtualenv
 48 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
 49 | .Python
 50 | [Ii]nclude
 51 | [Ll]ib64
 52 | [Ll]ocal
 53 | [Ss]cripts
 54 | pyvenv.cfg
 55 | .venv
 56 | pip-selfcheck.json
 57 | ### Python template
 58 | # Byte-compiled / optimized / DLL files
 59 | __pycache__/
 60 | *.py[cod]
 61 | *$py.class
 62 | 
 63 | # C extensions
 64 | *.so
 65 | 
 66 | # Distribution / packaging
 67 | .Python
 68 | env/
 69 | build/
 70 | develop-eggs/
 71 | dist/
 72 | downloads/
 73 | eggs/
 74 | .eggs/
 75 | lib/
 76 | lib64/
 77 | parts/
 78 | sdist/
 79 | var/
 80 | wheels/
 81 | *.egg-info/
 82 | .installed.cfg
 83 | *.egg
 84 | 
 85 | # PyInstaller
 86 | #  Usually these files are written by a python script from a template
 87 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 88 | *.manifest
 89 | *.spec
 90 | 
 91 | # Installer logs
 92 | pip-log.txt
 93 | pip-delete-this-directory.txt
 94 | 
 95 | # Unit test / coverage reports
 96 | htmlcov/
 97 | .tox/
 98 | .coverage
 99 | .coverage.*
100 | .cache
101 | nosetests.xml
102 | coverage.xml
103 | *,cover
104 | .hypothesis/
105 | 
106 | # Translations
107 | *.mo
108 | *.pot
109 | 
110 | # Django stuff:
111 | *.log
112 | local_settings.py
113 | 
114 | # Flask stuff:
115 | instance/
116 | .webassets-cache
117 | 
118 | # Scrapy stuff:
119 | .scrapy
120 | 
121 | # Sphinx documentation
122 | docs/_build/
123 | 
124 | # PyBuilder
125 | target/
126 | 
127 | # Jupyter Notebook
128 | .ipynb_checkpoints
129 | 
130 | # pyenv
131 | .python-version
132 | 
133 | # celery beat schedule file
134 | celerybeat-schedule
135 | 
136 | # SageMath parsed files
137 | *.sage.py
138 | 
139 | # dotenv
140 | .env
141 | 
142 | # virtualenv
143 | .venv
144 | venv/
145 | ENV/
146 | 
147 | # Spyder project settings
148 | .spyderproject
149 | 
150 | # Rope project settings
151 | .ropeproject
152 | 
153 | bloom-filter-rm-me
154 | 


--------------------------------------------------------------------------------
/bin/gen_performance_graph.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | 
  3 | import sys
  4 | import anydbm
  5 | from pychart import *
  6 | 
  7 | def get_timing(kind):
  8 |     database = anydbm.open('performance-numbers', 'r')
  9 |     list_ = []
 10 |     for key in database.keys():
 11 |         fields = key.split()
 12 |         #print fields[0], kind
 13 |         if fields[0] == kind:
 14 |             set_size = float(fields[1])
 15 |             duration = float(database[key])
 16 |             list_.append( ( set_size, duration ) )
 17 |     database.close()
 18 |     list_.sort()
 19 |     return list_
 20 | 
 21 | #def get_timing(filename):
 22 | #    file_ = open(filename, 'r')
 23 | #    list_ = []
 24 | #    for line in file_:
 25 | #        fields = line.split()
 26 | #        set_size = float(fields[0])
 27 | #        duration = float(fields[1])
 28 | #        list_.append( ( set_size, duration ) )
 29 | #    file_.close()
 30 | #    return list_
 31 | 
 32 | def get_hybrid_timing():
 33 |     return get_timing('hybrid')
 34 | 
 35 | def get_array_timing():
 36 |     return get_timing('array')
 37 | 
 38 | def get_seek_timing():
 39 |     return get_timing('seek')
 40 | 
 41 | def get_mmap_timing():
 42 |     return get_timing('mmap')
 43 | 
 44 | def desired_y_max(*list_):
 45 |     maximum = 0.0
 46 |     for element in list_:
 47 |         for set_size, duration in element:
 48 |             maximum = max(duration, maximum)
 49 |     return maximum
 50 | 
def main():
    """Plot the recorded timings of all four backends to performance-graph.pdf.

    Reads the samples via the get_*_timing() helpers, echoes them to stdout
    and draws one line plot per backend on a shared chart area.
    """
    # Configure pychart's global theme: colour PDF output written to
    # performance-graph.pdf with a 15pt default font.
    # NOTE(review): get_options() presumably parses pychart's own
    # command-line options - confirm against the pychart documentation.
    theme.get_options()
    theme.output_format = 'pdf'
    theme.use_color = 1
    theme.output_file = 'performance-graph.pdf'
    theme.default_font_size = 15
    theme.reinitialize()

    # Chart dimensions in a 5:4 aspect ratio (800 x 640).
    width = 800
    height = width * 4 // 5
    size = (width, height)

    # Load and echo each backend's (set size, seconds) samples.
    hybrid_timing_data = get_hybrid_timing()
    print 'hybrid', hybrid_timing_data
    array_timing_data = get_array_timing()
    print 'array', array_timing_data
    seek_timing_data = get_seek_timing()
    print 'seek', seek_timing_data
    mmap_timing_data = get_mmap_timing()
    print 'mmap', mmap_timing_data

    # The y axis must be tall enough for the slowest measurement of any backend.
    y_max = desired_y_max(array_timing_data, seek_timing_data, hybrid_timing_data, mmap_timing_data)

    # Only referenced by the commented-out can.show() call at the end.
    can = canvas.default_canvas()

    # Linear axes; the y range extends 100 past the largest duration.
    ar = area.T(
        size = size,
        legend=legend.T(),
        x_range = (1, None),
        y_range = (0.0001, y_max + 100),
        #x_coord = log_coord.T(),
        #y_coord = log_coord.T(),
        x_coord = linear_coord.T(),
        y_coord = linear_coord.T(),
        x_axis = axis.X(format="%g", label="Number of elements in set"),
        y_axis = axis.Y(format="%g", label="Seconds"),
        )

    # One labelled line per backend.
    lp = line_plot.T(data=array_timing_data, label="Array")
    ar.add_plot(lp)
                    
    lp = line_plot.T(data=seek_timing_data, label="Seek")
    ar.add_plot(lp)
                    
    lp = line_plot.T(data=hybrid_timing_data, label="Hybrid")
    ar.add_plot(lp)
                    
    lp = line_plot.T(data=mmap_timing_data, label="mmap")
    ar.add_plot(lp)
                    
    ar.draw()

    #can.show(ar.x_pos(4), ar.y_pos(970), "/a50{}seek")

# Runs unconditionally: this file is a script, not an importable module.
main()
106 | 
107 | 
108 | 


--------------------------------------------------------------------------------
/tests/test_bloom_filter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python
  2 | # coding=utf-8
  3 | 
  4 | # pylint: disable=superfluous-parens
  5 | # superfluous-parens: Parentheses are good for clarity and portability
  6 | 
  7 | """Unit tests for bloom_filter_mod"""
  8 | 
  9 | # mport os
 10 | import sys
 11 | import math
 12 | import time
 13 | 
 14 | try:
 15 |     import anydbm
 16 | except ImportError:
 17 |     import dbm as anydbm
 18 | 
 19 | import random
 20 | 
 21 | import bloom_filter
 22 | 
 23 | CHARACTERS = 'abcdefghijklmnopqrstuvwxyz1234567890'
 24 | 
 25 | 
 26 | def my_range(maximum):
 27 |     """A range function with consistent semantics on 2.x and 3.x"""
 28 |     value = 0
 29 |     while True:
 30 |         if value >= maximum:
 31 |             break
 32 |         yield value
 33 |         value += 1
 34 | 
 35 | 
 36 | def _test(description, values, trials, error_rate, probe_bitnoer=None, filename=None):
 37 |     # pylint: disable=R0913,R0914
 38 |     # R0913: We want a few arguments
 39 |     # R0914: We want some local variables too.  This is just test code.
 40 |     """Some quick automatic tests for the bloom filter class"""
 41 |     if not probe_bitnoer:
 42 |         probe_bitnoer = bloom_filter.get_filter_bitno_probes
 43 | 
 44 |     all_good = True
 45 | 
 46 |     divisor = 100000
 47 | 
 48 |     bloom = bloom_filter.BloomFilter(
 49 |         max_elements=values.length() * 2,
 50 |         error_rate=error_rate,
 51 |         probe_bitnoer=probe_bitnoer,
 52 |         filename=filename,
 53 |         start_fresh=True,
 54 |     )
 55 | 
 56 |     message = '\ndescription: %s num_bits_m: %s num_probes_k: %s\n'
 57 |     filled_out_message = message % (
 58 |         description,
 59 |         bloom.num_bits_m,
 60 |         bloom.num_probes_k,
 61 |     )
 62 | 
 63 |     sys.stdout.write(filled_out_message)
 64 | 
 65 |     print('starting to add values to an empty bloom filter')
 66 |     for valueno, value in enumerate(values.generator()):
 67 |         reverse_valueno = values.length() - valueno
 68 |         if reverse_valueno % divisor == 0:
 69 |             print('adding valueno %d' % reverse_valueno)
 70 |         bloom.add(value)
 71 | 
 72 |     print('testing all known members')
 73 |     include_in_count = sum(include in bloom for include in values.generator())
 74 |     if include_in_count == values.length():
 75 |         # Good
 76 |         pass
 77 |     else:
 78 |         sys.stderr.write('Include count bad: %s, %d\n' % (include_in_count, values.length()))
 79 |         all_good = False
 80 | 
 81 |     print('testing random non-members')
 82 |     false_positives = 0
 83 |     for trialno in my_range(trials):
 84 |         if trialno % divisor == 0:
 85 |             sys.stderr.write('trialno countdown: %d\n' % (trials - trialno))
 86 |         while True:
 87 |             candidate = ''.join(random.sample(CHARACTERS, 5))
 88 |             # If we accidentally found a member, try again
 89 |             if values.within(candidate):
 90 |                 continue
 91 |             if candidate in bloom:
 92 |                 # print 'We erroneously think %s is in the filter' % candidate
 93 |                 false_positives += 1
 94 |             break
 95 | 
 96 |     actual_error_rate = float(false_positives) / trials
 97 | 
 98 |     if actual_error_rate > error_rate:
 99 |         sys.stderr.write('%s: Too many false positives: actual: %s, expected: %s\n' % (
100 |             sys.argv[0],
101 |             actual_error_rate,
102 |             error_rate,
103 |         ))
104 |         all_good = False
105 | 
106 |     return all_good
107 | 
108 | 
class States(object):
    """Value source backed by the names of the USA's states (spaces removed)"""

    states = (
        'Alabama Alaska Arizona Arkansas California Colorado Connecticut '
        'Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas '
        'Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota '
        'Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey '
        'NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon '
        'Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah '
        'Vermont Virginia Washington WestVirginia Wisconsin Wyoming'
    ).split()

    def __init__(self):
        pass

    @staticmethod
    def generator():
        """Yield every state name, in list order"""
        for name in States.states:
            yield name

    @staticmethod
    def within(value):
        """Report whether value is one of the state names"""
        return value in States.states

    @staticmethod
    def length():
        """Return the number of state names"""
        return len(States.states)
138 | 
139 | 
def random_string():
    """Return 10 characters drawn at random (with repetition) from CHARACTERS - for testing"""
    return ''.join(
        CHARACTERS[int(random.random() * len(CHARACTERS))]
        for _ in range(10)
    )
148 | 
149 | 
class Random_content(object):
    """Value source backed by 1000 random strings, generated once when the class is defined"""

    def __init__(self):
        pass

    # Shared by all instances; kept in generation order (not sorted).
    random_content = [random_string() for dummy in range(1000)]

    @staticmethod
    def generator():
        """Yield every stored random string"""
        for entry in Random_content.random_content:
            yield entry

    @staticmethod
    def within(value):
        """Report whether value is one of the stored strings"""
        return value in Random_content.random_content

    @staticmethod
    def length():
        """Return the number of stored strings"""
        return len(Random_content.random_content)
173 | 
174 | 
class Evens(object):
    """Value source of the even numbers in [0, maximum), rendered as strings"""

    def __init__(self, maximum):
        self.maximum = maximum

    def generator(self):
        """Yield each even number below maximum as a string"""
        for number in my_range(self.maximum):
            if number % 2:
                continue
            yield str(number)

    def within(self, value):
        """Report whether value parses as an even integer in [0, maximum)"""
        try:
            number = int(value)
        except ValueError:
            # Not numeric at all, so it cannot be one of our members.
            return False
        return 0 <= number < self.maximum and number % 2 == 0

    def length(self):
        """Return how many even numbers lie in [0, maximum)"""
        half = self.maximum / 2.0
        return int(math.ceil(half))
202 | 
203 | 
def and_test():
    """Check that &= keeps only the keys common to both filters"""

    abc = bloom_filter.BloomFilter(max_elements=100, error_rate=0.01)
    for letter in 'abc':
        abc += letter

    bcd = bloom_filter.BloomFilter(max_elements=100, error_rate=0.01)
    for letter in 'bcd':
        bcd += letter

    # Same object as abc: the in-place operator mutates the left-hand filter.
    intersection = abc
    intersection &= bcd

    # (key, expected membership, message written to stderr on mismatch)
    checks = (
        ('a', False, 'a in abc_and_bcd, but should not be'),
        ('b', True, 'b not in abc_and_bcd, but should be'),
        ('c', True, 'c not in abc_and_bcd, but should be'),
        ('d', False, 'd in abc_and_bcd, but should not be'),
    )

    all_good = True
    for letter, expected, complaint in checks:
        if (letter in intersection) != expected:
            sys.stderr.write(complaint)
            all_good = False

    return all_good
234 | 
235 | 
def or_test():
    """Check that |= yields the keys present in either filter"""

    abc = bloom_filter.BloomFilter(max_elements=100, error_rate=0.01)
    for letter in 'abc':
        abc += letter

    bcd = bloom_filter.BloomFilter(max_elements=100, error_rate=0.01)
    for letter in 'bcd':
        bcd += letter

    # Same object as abc: the in-place operator mutates the left-hand filter.
    union = abc
    union |= bcd

    # (key, expected membership, message written to stderr on mismatch)
    checks = (
        ('a', True, 'a not in abc_and_bcd, but should be'),
        ('b', True, 'b not in abc_and_bcd, but should be'),
        ('c', True, 'c not in abc_and_bcd, but should be'),
        ('d', True, 'd not in abc_and_bcd, but should be'),
        ('e', False, 'e in abc_and_bcd, but should not be'),
    )

    all_good = True
    for letter, expected, complaint in checks:
        if (letter in union) != expected:
            sys.stderr.write(complaint)
            all_good = False

    return all_good
269 | 
270 | 
def give_description(filename):
    """Name the backend a filename argument selects: 'array', 'seek', 'mmap' or 'hybrid'

    None selects 'array'; a tuple selects 'mmap' when its second item is -1
    and 'hybrid' otherwise; anything else selects 'seek'.
    """
    if filename is None:
        return 'array'
    if not isinstance(filename, tuple):
        return 'seek'
    return 'mmap' if filename[1] == -1 else 'hybrid'
282 | 
283 | 
def test_bloom_filter():
    """Run the BloomFilter unit tests and, optionally, the performance runs.

    Passing --performance-test as the sole command-line argument also times
    the 'evens' workload for each backend and records the durations in the
    'performance-numbers' dbm database, skipping sizes already recorded.
    Writes a message to stderr and exits with status 1 if any check failed.
    """

    performance_test = sys.argv[1:] == ['--performance-test']

    all_good = True

    all_good &= _test('states', States(), trials=100000, error_rate=0.01)

    all_good &= _test('random', Random_content(), trials=10000, error_rate=0.1)
    all_good &= _test('random', Random_content(), trials=1000000, error_rate=1E-9)
    all_good &= _test('random', Random_content(), trials=10000, error_rate=0.1,
                      probe_bitnoer=bloom_filter.get_bitno_seed_rnd)

    filename = 'bloom-filter-rm-me'
    all_good &= _test('random', Random_content(), trials=10000, error_rate=0.1, filename=filename)

    all_good &= and_test()

    all_good &= or_test()

    if performance_test:
        sqrt_of_10 = math.sqrt(10)
        # for exponent in range(5): # this is a lot, but probably not unreasonable
        for exponent in range(19):  # this is a lot, but probably not unreasonable
            # Element counts grow by a factor of sqrt(10) per step.
            elements = int(sqrt_of_10 ** exponent + 0.5)
            for filename in [None, 'bloom-filter-rm-me', ('bloom-filter-rm-me', 768 * 2 ** 20),
                             ('bloom-filter-rm-me', -1)]:
                description = give_description(filename)
                key = '%s %s' % (description, elements)

                # Always close the handle once the lookup is done.  It used
                # to stay open on the size-limit skips below and while the
                # database was reopened to store the result.
                database = anydbm.open('performance-numbers', 'c')
                try:
                    already_measured = key in database.keys()
                finally:
                    database.close()
                if already_measured:
                    continue

                # Skip sizes too large for a given backend.
                if elements >= 100000000 and description == 'seek':
                    continue
                if elements >= 100000000 and description == 'mmap':
                    continue
                if elements >= 1000000000 and description == 'array':
                    continue

                time0 = time.time()
                all_good &= _test(
                    'evens %s elements: %d' % (give_description(filename), elements),
                    Evens(elements),
                    trials=elements,
                    error_rate=1e-2,
                    filename=filename,
                )
                time1 = time.time()
                delta_t = time1 - time0

                database = anydbm.open('performance-numbers', 'c')
                try:
                    database[key] = '%f' % delta_t
                finally:
                    database.close()

    # test prob count ok
    bloom = bloom_filter.BloomFilter(1000000, error_rate=.99)
    all_good &= bloom.num_probes_k == 1
    if not all_good:
        sys.stderr.write('%s: One or more tests failed\n' % sys.argv[0])
        sys.exit(1)
350 | 
351 | 
352 | if __name__ == '__main__':
353 |     test_bloom_filter()
354 | 


--------------------------------------------------------------------------------
/src/bloom_filter/bloom_filter.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # pylint: disable=superfluous-parens,redefined-variable-type
  3 | # superfluous-parens: Sometimes extra parens are more clear
  4 | 
  5 | """Bloom Filter: Probabilistic set membership testing for large sets"""
  6 | 
  7 | # Shamelessly borrowed (under MIT license) from http://code.activestate.com/recipes/577686-bloom-filter/
  8 | # About Bloom Filters: http://en.wikipedia.org/wiki/Bloom_filter
  9 | 
 10 | # Tweaked by Daniel Richard Stromberg, mostly to:
 11 | # 1) Give it a little nicer __init__ parameters.
 12 | # 2) Improve the hash functions to get a much lower rate of false positives.
 13 | # 3) Give it a selection of backends.
 14 | # 4) Make it pass pylint.
 15 | 
 16 | from __future__ import division
 17 | import os
 18 | #mport sys
 19 | import math
 20 | import array
 21 | import random
 22 | 
 23 | try:
 24 |     import mmap as mmap_mod
 25 | except ImportError:
 26 |     # Jython lacks mmap()
 27 |     HAVE_MMAP = False
 28 | else:
 29 |     HAVE_MMAP = True
 30 | 
 31 | #mport bufsock
 32 | #mport hashlib
 33 | #mport numbers
 34 | 
 35 | import python2x3
 36 | 
 37 | # In the literature:
 38 | # k is the number of probes - we call this num_probes_k
 39 | # m is the number of bits in the filter - we call this num_bits_m
 40 | # n is the ideal number of elements to eventually be stored in the filter - we call this ideal_num_elements_n
 41 | # p is the desired error rate when full - we call this error_rate_p
 42 | 
 43 | 
 44 | def my_range(num_values):
 45 |     """Generate numbers from 0..num_values-1"""
 46 | 
 47 |     value = 0
 48 |     while value < num_values:
 49 |         yield value
 50 |         value += 1
 51 | 
 52 | # In the abstract, this is what we want &= and |= to do, but especially for disk-based filters, this is extremely slow
 53 | #class Backend_set_operations:
 54 | #    """Provide &= and |= for backends"""
 55 | #    # pylint: disable=W0232
 56 | #    # W0232: We don't need an __init__ method; we're never instantiated directly
 57 | #    def __iand__(self, other):
 58 | #        assert self.num_bits == other.num_bits
 59 | #
 60 | #        for bitno in my_range(num_bits):
 61 | #            if self.is_set(bitno) and other.is_set(bitno):
 62 | #                self[bitno].set()
 63 | #            else:
 64 | #                self[bitno].clear()
 65 | #
 66 | #    def __ior__(self, other):
 67 | #        assert self.num_bits == other.num_bits
 68 | #
 69 | #        for bitno in xrange(num_bits):
 70 | #            if self[bitno] or other[bitno]:
 71 | #                self[bitno].set()
 72 | #            else:
 73 | #                self[bitno].clear()
 74 | 
 75 | 
 76 | if HAVE_MMAP:
 77 | 
 78 |     class Mmap_backend(object):
 79 |         """
 80 |         Backend storage for our "array of bits" using an mmap'd file.
 81 |         Please note that this has only been tested on Linux so far: 2    -11-01.
 82 |         """
 83 | 
 84 |         effs = 2 ^ 8 - 1
 85 | 
 86 |         def __init__(self, num_bits, filename):
 87 |             self.num_bits = num_bits
 88 |             self.num_chars = (self.num_bits + 7) // 8
 89 |             flags = os.O_RDWR | os.O_CREAT
 90 |             if hasattr(os, 'O_BINARY'):
 91 |                 flags |= getattr(os, 'O_BINARY')
 92 |             self.file_ = os.open(filename, flags)
 93 |             os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET)
 94 |             os.write(self.file_, python2x3.null_byte)
 95 |             self.mmap = mmap_mod.mmap(self.file_, self.num_chars)
 96 | 
 97 |         def is_set(self, bitno):
 98 |             """Return true iff bit number bitno is set"""
 99 |             byteno, bit_within_wordno = divmod(bitno, 8)
100 |             mask = 1 << bit_within_wordno
101 |             char = self.mmap[byteno]
102 |             if isinstance(char, str):
103 |                 byte = ord(char)
104 |             else:
105 |                 byte = int(char)
106 |             return byte & mask
107 | 
108 |         def set(self, bitno):
109 |             """set bit number bitno to true"""
110 | 
111 |             byteno, bit_within_byteno = divmod(bitno, 8)
112 |             mask = 1 << bit_within_byteno
113 |             char = self.mmap[byteno]
114 |             byte = ord(char)
115 |             byte |= mask
116 |             self.mmap[byteno] = chr(byte)
117 | 
118 |         def clear(self, bitno):
119 |             """clear bit number bitno - set it to false"""
120 | 
121 |             byteno, bit_within_byteno = divmod(bitno, 8)
122 |             mask = 1 << bit_within_byteno
123 |             char = self.mmap[byteno]
124 |             byte = ord(char)
125 |             byte &= Mmap_backend.effs - mask
126 |             self.mmap[byteno] = chr(byte)
127 | 
128 |         def __iand__(self, other):
129 |             assert self.num_bits == other.num_bits
130 | 
131 |             for byteno in my_range(self.num_chars):
132 |                 self.mmap[byteno] = chr(ord(self.mmap[byteno]) & ord(other.mmap[byteno]))
133 | 
134 |             return self
135 | 
136 |         def __ior__(self, other):
137 |             assert self.num_bits == other.num_bits
138 | 
139 |             for byteno in my_range(self.num_chars):
140 |                 self.mmap[byteno] = chr(ord(self.mmap[byteno]) | ord(other.mmap[byteno]))
141 | 
142 |             return self
143 | 
144 |         def close(self):
145 |             """Close the file"""
146 |             os.close(self.file_)
147 | 
148 | 
class File_seek_backend(object):
    """
    Backend storage for our "array of bits" using a file in which we seek.

    Bit number N lives in byte N // 8 of the file, at bit position N % 8.  Every access seeks to the byte
    concerned and reads (and possibly rewrites) just that one byte.
    """

    # A byte with all eight bits set; clear() subtracts the bit's mask from this to keep the other bits.
    effs = 2 ** 8 - 1

    def __init__(self, num_bits, filename):
        """Open filename (creating it if needed) and make sure it is long enough to hold num_bits bits"""
        self.num_bits = num_bits
        self.num_chars = (self.num_bits + 7) // 8
        flags = os.O_RDWR | os.O_CREAT
        if hasattr(os, 'O_BINARY'):
            flags |= getattr(os, 'O_BINARY')
        self.file_ = os.open(filename, flags)
        # Writing one byte past the end grows the file, so every byte we later read really exists
        os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET)
        os.write(self.file_, python2x3.null_byte)

    def _read_byte(self, byteno):
        """Return byte number byteno of the file as an int"""
        os.lseek(self.file_, byteno, os.SEEK_SET)
        # bytearray() yields ints whether os.read() returned a python 2 str or a python 3 bytes
        return bytearray(os.read(self.file_, 1))[0]

    def _write_byte(self, byteno, byte):
        """Overwrite byte number byteno of the file with the int byte"""
        os.lseek(self.file_, byteno, os.SEEK_SET)
        os.write(self.file_, bytes(bytearray([byte])))

    def is_set(self, bitno):
        """Return true iff bit number bitno is set"""
        byteno, bit_within_byteno = divmod(bitno, 8)
        mask = 1 << bit_within_byteno
        return self._read_byte(byteno) & mask

    def set(self, bitno):
        """set bit number bitno to true"""

        byteno, bit_within_byteno = divmod(bitno, 8)
        mask = 1 << bit_within_byteno
        self._write_byte(byteno, self._read_byte(byteno) | mask)

    def clear(self, bitno):
        """clear bit number bitno - set it to false"""

        byteno, bit_within_byteno = divmod(bitno, 8)
        mask = 1 << bit_within_byteno
        self._write_byte(byteno, self._read_byte(byteno) & (File_seek_backend.effs - mask))

    # These are quite slow ways to do iand and ior, but they should work,
    # and a faster version is going to take more time
    def __iand__(self, other):
        """Keep only the bits that are set in both self and other"""
        assert self.num_bits == other.num_bits

        for bitno in my_range(self.num_bits):
            # Only a bit that is set here but not in other changes; everything else is already right
            if self.is_set(bitno) and not other.is_set(bitno):
                self.clear(bitno)

        return self

    def __ior__(self, other):
        """Set every bit that is set in either self or other"""
        assert self.num_bits == other.num_bits

        for bitno in my_range(self.num_bits):
            # Only a bit that is set in other but not here changes; everything else is already right
            if other.is_set(bitno) and not self.is_set(bitno):
                self.set(bitno)

        return self

    def close(self):
        """Close the file"""
        os.close(self.file_)
245 | 
246 | 
class Array_then_file_seek_backend(object):
    # pylint: disable=R0902
    # R0902: We kinda need a bunch of instance attributes
    """
    Backend storage for our "array of bits" using a python array of integers up to some maximum number of bytes,
    then spilling over to a file.  This is -not- a cache; we instead save the leftmost bits in RAM, and the
    rightmost bits (if necessary) in a file.  On open, we read from the file to RAM.  On close, we write from RAM
    to the file.

    Bit number N lives in byte N // 8, at bit position N % 8.  Bytes below bytes_in_memory are served from the
    array; the rest are read and written at the same byte offset in the file.
    """

    # A byte with all eight bits set; clear() subtracts the bit's mask from this to keep the other bits.
    effs = 2 ** 8 - 1

    def __init__(self, num_bits, filename, max_bytes_in_memory):
        """
        Open filename (creating it if needed), grow it to hold num_bits bits, and load at most
        max_bytes_in_memory of its leading bytes into RAM.
        """
        self.num_bits = num_bits
        num_chars = (self.num_bits + 7) // 8
        self.filename = filename
        self.max_bytes_in_memory = max_bytes_in_memory
        self.bits_in_memory = min(num_bits, self.max_bytes_in_memory * 8)
        self.bits_in_file = max(self.num_bits - self.bits_in_memory, 0)
        self.bytes_in_memory = (self.bits_in_memory + 7) // 8
        self.bytes_in_file = (self.bits_in_file + 7) // 8

        self.array_ = array.array('B', [0]) * self.bytes_in_memory
        flags = os.O_RDWR | os.O_CREAT
        if hasattr(os, 'O_BINARY'):
            flags |= getattr(os, 'O_BINARY')
        self.file_ = os.open(filename, flags)
        # Writing one byte past the end grows the file, so every byte we later read really exists
        os.lseek(self.file_, num_chars + 1, os.SEEK_SET)
        os.write(self.file_, python2x3.null_byte)

        # Load the in-memory portion from the front of the file, a block at a time
        os.lseek(self.file_, 0, os.SEEK_SET)
        offset = 0
        intended_block_len = 2 ** 17
        while offset < self.bytes_in_memory:
            wanted = min(intended_block_len, self.bytes_in_memory - offset)
            # bytearray() yields ints whether os.read() returned a python 2 str or a python 3 bytes
            block = bytearray(os.read(self.file_, wanted))
            if not block:
                break
            for index_in_block, byte in enumerate(block):
                self.array_[offset + index_in_block] = byte
            # Advance by what was actually read so that a short read cannot misalign later blocks
            offset += len(block)

    def _read_file_byte(self, byteno):
        """Return byte number byteno of the file as an int"""
        os.lseek(self.file_, byteno, os.SEEK_SET)
        return bytearray(os.read(self.file_, 1))[0]

    def _write_file_byte(self, byteno, byte):
        """Overwrite byte number byteno of the file with the int byte"""
        os.lseek(self.file_, byteno, os.SEEK_SET)
        os.write(self.file_, bytes(bytearray([byte])))

    def is_set(self, bitno):
        """Return true iff bit number bitno is set"""
        byteno, bit_within_byteno = divmod(bitno, 8)
        mask = 1 << bit_within_byteno
        if byteno < self.bytes_in_memory:
            return self.array_[byteno] & mask
        return self._read_file_byte(byteno) & mask

    def set(self, bitno):
        """set bit number bitno to true"""
        byteno, bit_within_byteno = divmod(bitno, 8)
        mask = 1 << bit_within_byteno
        if byteno < self.bytes_in_memory:
            self.array_[byteno] |= mask
        else:
            self._write_file_byte(byteno, self._read_file_byte(byteno) | mask)

    def clear(self, bitno):
        """clear bit number bitno - set it to false"""
        byteno, bit_within_byteno = divmod(bitno, 8)
        # Every bit of the byte except the one being cleared
        keep = Array_then_file_seek_backend.effs - (1 << bit_within_byteno)
        if byteno < self.bytes_in_memory:
            self.array_[byteno] &= keep
        else:
            self._write_file_byte(byteno, self._read_file_byte(byteno) & keep)

    # These are quite slow ways to do iand and ior, but they should work,
    # and a faster version is going to take more time
    def __iand__(self, other):
        """Keep only the bits that are set in both self and other"""
        assert self.num_bits == other.num_bits

        for bitno in my_range(self.num_bits):
            # Only a bit that is set here but not in other changes; everything else is already right
            if self.is_set(bitno) and not other.is_set(bitno):
                self.clear(bitno)

        return self

    def __ior__(self, other):
        """Set every bit that is set in either self or other"""
        assert self.num_bits == other.num_bits

        for bitno in my_range(self.num_bits):
            # Only a bit that is set in other but not here changes; everything else is already right
            if other.is_set(bitno) and not self.is_set(bitno):
                self.set(bitno)

        return self

    def close(self):
        """Write the in-memory portion to disk, leave the already-on-disk portion unchanged"""

        try:
            os.lseek(self.file_, 0, os.SEEK_SET)
            block_len = 2 ** 17
            offset = 0
            while offset < self.bytes_in_memory:
                chunk = bytes(bytearray(self.array_[offset:offset + block_len]))
                # os.write() reports how much it wrote; resume from there after a partial write
                offset += os.write(self.file_, chunk)
        finally:
            os.close(self.file_)
382 | 
383 | 
class Array_backend(object):
    """In-memory storage for our "array of bits": a python array of integers, using 32 bits of each element"""

    # Note that this has now been split out into a bits_mod for the benefit of other projects.
    effs = 2 ** 32 - 1

    def __init__(self, num_bits):
        """Allocate enough zeroed words to hold num_bits bits"""
        self.num_bits = num_bits
        self.num_words = (self.num_bits + 31) // 32
        zero_word = array.array('L', [0])
        self.array_ = zero_word * self.num_words

    def is_set(self, bitno):
        """Return a true value iff bit number bitno is set"""
        word_index = bitno // 32
        bit_index = bitno % 32
        return self.array_[word_index] & (1 << bit_index)

    def set(self, bitno):
        """Turn bit number bitno on"""
        word_index = bitno // 32
        bit_index = bitno % 32
        self.array_[word_index] = self.array_[word_index] | (1 << bit_index)

    def clear(self, bitno):
        """Turn bit number bitno off"""
        word_index = bitno // 32
        bit_index = bitno % 32
        # effs has every one of the low 32 bits set, so flipping one bit of it equals subtracting that bit
        keep = Array_backend.effs ^ (1 << bit_index)
        self.array_[word_index] = self.array_[word_index] & keep

    # It'd be nice to do __iand__ and __ior__ in a base class, but that'd be Much slower

    def __iand__(self, other):
        """Word-by-word in-place AND with another Array_backend of the same size"""
        assert self.num_bits == other.num_bits

        theirs = other.array_
        for index in my_range(self.num_words):
            self.array_[index] = self.array_[index] & theirs[index]

        return self

    def __ior__(self, other):
        """Word-by-word in-place OR with another Array_backend of the same size"""
        assert self.num_bits == other.num_bits

        theirs = other.array_
        for index in my_range(self.num_words):
            self.array_[index] = self.array_[index] | theirs[index]

        return self

    def close(self):
        """Nothing to release; present so that every backend offers close()"""
        return None
434 | 
435 | 
def get_bitno_seed_rnd(bloom_filter, key):
    """Yield num_probes_k bit numbers for key, drawn from a pseudorandom generator seeded with key"""

    # The key itself is the seed, so the same key always produces the same sequence of bit numbers
    generator = random.Random(key)
    for _probe in range(bloom_filter.num_probes_k):
        yield generator.randrange(bloom_filter.num_bits_m) % bloom_filter.num_bits_m
444 | 
445 | 
# Constants of the form 2 ** x - 1 that hash1() and hash2() pass to simple_hash() as its three "primes";
# get_filter_bitno_probes() additionally reduces its probe values modulo MERSENNES1[2].
# NOTE(review): despite the names, 2 ** 67 - 1 and 2 ** 257 - 1 are, as far as I know, not prime - confirm
# before relying on primality.  Changing any of these values changes the bit numbers generated for a key.
MERSENNES1 = [2 ** x - 1 for x in [17, 31, 127]]
MERSENNES2 = [2 ** x - 1 for x in [19, 67, 257]]
448 | 
449 | 
def simple_hash(int_list, prime1, prime2, prime3):
    """Fold a list of integers into a single hash value, mixing each one in with the 3 given primes"""
    accumulator = 0
    for value in int_list:
        mixed = (accumulator + value + prime1) * prime2
        # Only the increment is reduced modulo prime3; the running total itself keeps growing
        accumulator = accumulator + mixed % prime3
    return accumulator
456 | 
457 | 
def hash1(int_list):
    """First base hash: simple_hash() driven by the three MERSENNES1 values"""
    prime1, prime2, prime3 = MERSENNES1
    return simple_hash(int_list, prime1, prime2, prime3)
461 | 
462 | 
def hash2(int_list):
    """Second base hash: simple_hash() driven by the three MERSENNES2 values"""
    prime1, prime2, prime3 = MERSENNES2
    return simple_hash(int_list, prime1, prime2, prime3)
466 | 
467 | 
def get_filter_bitno_probes(bloom_filter, key):
    """
    Apply num_probes_k hash functions to key.  Generate the bit number corresponding to each result.

    key may be an integer, or a sequence of integers (such as bytes), or a str; an empty sequence is accepted.
    Yields bloom_filter.num_probes_k values, each reduced modulo bloom_filter.num_bits_m.
    Raises TypeError, on first iteration, for a key whose elements are neither integers nor str.
    """

    # This one assumes key is either bytes or str (or other list of integers)

    # I'd love to check for long too, but that doesn't exist in 3.2, and 2.5 doesn't have the numbers.Integral base type
    if hasattr(key, '__divmod__'):
        # Split the number into base-256 digits, least significant first
        int_list = []
        temp = key
        # divmod() floors, so a negative number would settle at -1 and never reach 0; work on the magnitude
        negative = temp < 0
        if negative:
            temp = -temp
        while temp:
            temp, remainder = divmod(temp, 256)
            int_list.append(remainder)
        if negative:
            # 256 can never be a base-256 digit, so -n does not simply reuse the digits of n
            int_list.append(256)
    elif len(key) == 0:
        # There is no first element to inspect; an empty key is an empty list of integers
        int_list = []
    elif hasattr(key[0], '__divmod__'):
        int_list = key
    elif isinstance(key[0], str):
        int_list = [ord(char) for char in key]
    else:
        raise TypeError('Sorry, I do not know how to hash this type')

    hash_value1 = hash1(int_list)
    hash_value2 = hash2(int_list)
    probe_value = hash_value1

    # Each probe is derived from the previous one, so the two base hashes yield num_probes_k bit numbers
    for _probe in range(bloom_filter.num_probes_k):
        probe_value *= hash_value1
        probe_value += hash_value2
        probe_value %= MERSENNES1[2]
        yield probe_value % bloom_filter.num_bits_m
497 | 
498 | 
def try_unlink(filename):
    """
    unlink a file.  Don't complain if it's not there.

    Any other failure (a permissions problem, say) is re-raised: silently keeping a file that the caller
    asked to have removed would leave its old contents in use.
    """
    import errno

    try:
        os.unlink(filename)
    except OSError as error:
        if error.errno != errno.ENOENT:
            raise
506 | 
507 | 
class BloomFilter(object):
    """
    Probabilistic set membership testing for large sets.

    Supports ``key in bloom``, ``bloom.add(key)`` / ``bloom += key``, and in-place union (``|=``) and
    intersection (``&=``) with a compatible filter.  Call close() when done, or use the filter as a context
    manager, so that a file-based backend gets closed.
    """

    def __init__(self,
                 max_elements=10000,
                 error_rate=0.1,
                 probe_bitnoer=get_filter_bitno_probes,
                 filename=None,
                 start_fresh=False):
        """
        max_elements: the ideal number of elements to store (n in the literature); must be > 0.
        error_rate: the desired error rate when full (p); must be strictly between 0 and 1.
        probe_bitnoer: callable taking (bloom_filter, key) and yielding the bit numbers to probe for key.
        filename: None for an in-memory array; a (path, -1) tuple for an mmap'd file; a (path, max_bytes)
            tuple for an array of at most max_bytes that spills over to a file; anything else is used as the
            path of a file in which we seek.
        start_fresh: if true, remove any existing file of that name first.

        Raises ValueError for an out-of-range max_elements or error_rate, and NotImplementedError if the
        mmap backend is requested where mmap is unavailable.
        """
        # pylint: disable=R0913
        # R0913: We want a few arguments
        if max_elements <= 0:
            raise ValueError('ideal_num_elements_n must be > 0')
        if not (0 < error_rate < 1):
            raise ValueError('error_rate_p must be between 0 and 1 exclusive')

        self.error_rate_p = error_rate
        # With fewer elements, we should do very well.  With more elements, our error rate "guarantee"
        # drops rapidly.
        self.ideal_num_elements_n = max_elements

        # m = -n * ln(p) / ln(2) ** 2, rounded up to a whole number of bits
        numerator = -1 * self.ideal_num_elements_n * math.log(self.error_rate_p)
        denominator = math.log(2) ** 2
        real_num_bits_m = numerator / denominator
        self.num_bits_m = int(math.ceil(real_num_bits_m))

        if filename is None:
            self.backend = Array_backend(self.num_bits_m)
        elif isinstance(filename, tuple) and isinstance(filename[1], int):
            if start_fresh:
                try_unlink(filename[0])
            if filename[1] == -1:
                if not HAVE_MMAP:
                    # Mmap_backend is only defined when the mmap module could be imported
                    raise NotImplementedError('the mmap backend is not available on this platform')
                self.backend = Mmap_backend(self.num_bits_m, filename[0])
            else:
                self.backend = Array_then_file_seek_backend(self.num_bits_m, filename[0], filename[1])
        else:
            if start_fresh:
                try_unlink(filename)
            self.backend = File_seek_backend(self.num_bits_m, filename)

        # AKA num_offsetters
        # Verified against http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives
        real_num_probes_k = (self.num_bits_m / self.ideal_num_elements_n) * math.log(2)
        self.num_probes_k = int(math.ceil(real_num_probes_k))
        self.probe_bitnoer = probe_bitnoer

    def __repr__(self):
        return 'BloomFilter(ideal_num_elements_n=%d, error_rate_p=%f, num_bits_m=%d)' % (
            self.ideal_num_elements_n,
            self.error_rate_p,
            self.num_bits_m,
        )

    def add(self, key):
        """Add an element to the filter"""
        for bitno in self.probe_bitnoer(self, key):
            self.backend.set(bitno)

    def __iadd__(self, key):
        self.add(key)
        return self

    def _match_template(self, bloom_filter):
        """Compare a sort of signature for two bloom filters.  Used in preparation for binary operations"""
        return (self.num_bits_m == bloom_filter.num_bits_m
                and self.num_probes_k == bloom_filter.num_probes_k
                and self.probe_bitnoer == bloom_filter.probe_bitnoer)

    def _require_match(self, bloom_filter):
        """Raise ValueError unless bloom_filter maps keys to bits exactly as this filter does"""
        if not self._match_template(bloom_filter):
            # Combining bit arrays only makes sense when a key sets the same bits in both filters
            raise ValueError('bloom filters must share num_bits_m, num_probes_k and probe_bitnoer to be combined')

    def union(self, bloom_filter):
        """Compute the set union of two bloom filters, in place.  Raises ValueError if they are incompatible"""
        self._require_match(bloom_filter)
        self.backend |= bloom_filter.backend

    def __ior__(self, bloom_filter):
        self.union(bloom_filter)
        return self

    def intersection(self, bloom_filter):
        """Compute the set intersection of two bloom filters, in place.  Raises ValueError if they are incompatible"""
        self._require_match(bloom_filter)
        self.backend &= bloom_filter.backend

    def __iand__(self, bloom_filter):
        self.intersection(bloom_filter)
        return self

    def __contains__(self, key):
        # A key can only have been added if every one of its probe bits is set
        return all(self.backend.is_set(bitno) for bitno in self.probe_bitnoer(self, key))

    def close(self):
        """Close the backend, releasing (and, where the backend does so, writing out) its storage"""
        self.backend.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the with block
        return False
596 | 


--------------------------------------------------------------------------------