├── src ├── bloom_filter │ ├── __init__.py │ └── bloom_filter.py └── python2x3.py ├── AUTHORS.md ├── bin ├── count_bits.py └── gen_performance_graph.py ├── Makefile ├── README.md ├── setup.py ├── .gitignore └── tests └── test_bloom_filter.py /src/bloom_filter/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | 4 | from .bloom_filter import ( 5 | BloomFilter, 6 | get_filter_bitno_probes, 7 | get_bitno_seed_rnd, 8 | ) 9 | 10 | __all__ = [ 11 | 'BloomFilter', 12 | 'get_filter_bitno_probes', 13 | 'get_bitno_seed_rnd', 14 | ] 15 | -------------------------------------------------------------------------------- /AUTHORS.md: -------------------------------------------------------------------------------- 1 | Original code: 2 | 3 | - http://code.activestate.com/recipes/577686-bloom-filter/ 4 | - Author: Sundar Srinivasan 5 | 6 | Forked into SVN Repo: 7 | 8 | - http://stromberg.dnsalias.org/svn/bloom-filter/trunk/ 9 | - Author: Daniel Richard Stromberg 10 | 11 | Forked to GitHub, renamed to `bloom_filter`: 12 | 13 | - https://github.com/hiway/python-bloom-filter 14 | - Author: Harshad Sharma 15 | -------------------------------------------------------------------------------- /bin/count_bits.py: -------------------------------------------------------------------------------- 1 | #!/usr/local/pypy-1.6/bin/pypy 2 | 3 | import sys 4 | 5 | total_bits = 0 6 | bits_set = 0 7 | 8 | while True: 9 | block = sys.stdin.read(2 ** 19) 10 | if not block: 11 | break 12 | total_bits += len(block) * 8 13 | # print('got block of length %d' % len(block)) 14 | for char in block: 15 | byte = ord(char) 16 | # print('got char %d' % byte) 17 | for exponent in range(8): 18 | bitmask = 2 ** exponent 19 | # print('checking mask %d' % bitmask) 20 | if byte & bitmask != 0: 21 | # print('adding 1 to count') 22 | bits_set += 1 23 | 24 | print('%s set, %s present, %6.2f%%' % (bits_set, total_bits, bits_set * 100.0 / total_bits)) 25 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | go: performance-graph.pdf 3 | evince performance-graph.pdf 4 | 5 | performance-graph.pdf: performance-numbers.db gen-performance-graph 6 | ./gen-performance-graph 7 | 8 | performance-numbers.db: test-bloom-filter 9 | ./this-pylint \ 10 | --ignore-message ".*Unable to import 'dbm'" \ 11 | --ignore-message ".*Unable to import 'anydbm'" \ 12 | --to-pylint bloom_filter_mod.py test-bloom-filter 13 | rm -f seek.txt array.txt hybrid.txt mmap.txt 14 | #/usr/local/pypy-2.3.1/bin/pypy ./test-bloom-filter --performance-test 15 | /usr/local/pypy-2.3.1/bin/pypy ./test-bloom-filter 16 | /usr/local/cpython-3.4/bin/python ./test-bloom-filter 17 | /usr/local/cpython-2.5/bin/python ./test-bloom-filter 18 | #/usr/local/cpython-2.7/bin/python ./test-bloom-filter 19 | #/usr/local/cpython-3.0/bin/python ./test-bloom-filter 20 | /usr/local/jython-2.7b3/bin/jython ./test-bloom-filter 21 | 22 | clean: 23 | rm -f *.pyc *.class 24 | rm -rf __pycache__ 25 | rm -f bloom-filter-rm-me 26 | rm -f *.ps *.pdf 27 | rm -f seek.txt array.txt 28 | rm -rf dist build bloom_filter.egg-info 29 | rm -f performance-numbers 30 | 31 | veryclean: clean 32 | rm -f performance-numbers.db 33 | rm -f performance-numbers 34 | 35 | build: 36 | python setup.py sdist bdist_wheel 37 | 38 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | > Note: This project has gone unmaintained for a while, 2 | please use the more up-to-date project at: 3 | - https://github.com/remram44/python-bloom-filter 4 | - https://pypi.org/project/bloom-filter2/ 5 | 6 | # bloom-filter 7 | 8 | This project builds on `drs-bloom-filter` and `bloom_filter_mod`. 9 | Credits and links can be found in AUTHORS.md. 10 | 11 | ## Installation 12 | 13 | pip install bloom_filter 14 | 15 | 16 | ## Example: 17 | 18 | from bloom_filter import BloomFilter 19 | 20 | have_met = BloomFilter() 21 | 22 | def have_i_met(name): 23 | met = name in have_met 24 | print('Have I met {} before: {}'.format(name, met)) 25 | 26 | def meet(name): 27 | have_met.add(name) 28 | print('Hello, {}'.format(name)) 29 | 30 | for name in ['Harry', 'Larry', 'Moe']: 31 | have_i_met(name) 32 | meet(name) 33 | have_i_met(name) 34 | 35 | 36 | ## Usage: 37 | 38 | from bloom_filter import BloomFilter 39 | 40 | # instantiate BloomFilter with custom settings, 41 | # max_elements is how many elements you expect the filter to hold. 42 | # error_rate defines accuracy; You can use defaults with 43 | # `BloomFilter()` without any arguments. Following example 44 | # is same as defaults: 45 | bloom = BloomFilter(max_elements=10000, error_rate=0.1) 46 | 47 | # Test whether the bloom-filter has seen a key: 48 | assert "test-key" in bloom is False 49 | 50 | # Mark the key as seen 51 | bloom.add("test-key") 52 | 53 | # Now check again 54 | assert "test-key" in bloom is True 55 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- encoding: utf-8 -*- 3 | from __future__ import absolute_import 4 | from __future__ import print_function 5 | 6 | from glob import glob 7 | from os.path import basename 8 | from os.path import splitext 9 | 10 | from setuptools import find_packages 11 | from setuptools import setup 12 | 13 | 14 | setup( 15 | name="bloom_filter", 16 | version="1.3", 17 | packages=find_packages('src'), 18 | package_dir={'': 'src'}, 19 | py_modules=[splitext(basename(path))[0] for path in glob('src/*.py')], 20 | 21 | # metadata for upload to PyPI 22 | author="Harshad Sharma", 23 | author_email="harshad@sharma.io", 24 | description='Pure Python Bloom Filter module', 25 | long_description=""" 26 | A pure python bloom filter (low storage requirement, probabilistic 27 | set datastructure) is provided. It is known to work on CPython 2.x, 28 | CPython 3.x, Pypy and Jython. 29 | 30 | Includes mmap, in-memory and disk-seek backends. 31 | 32 | The user specifies the desired maximum number of elements and the 33 | desired maximum false positive probability, and the module 34 | calculates the rest. 35 | 36 | Usage: 37 | 38 | :: 39 | 40 | from bloom_filter import BloomFilter 41 | 42 | # instantiate BloomFilter with custom settings, 43 | # max_elements is how many elements you expect the filter to hold. 44 | # error_rate defines accuracy; You can use defaults with 45 | # `BloomFilter()` without any arguments. Following example 46 | # is same as defaults: 47 | bloom = BloomFilter(max_elements=10000, error_rate=0.1) 48 | 49 | # Test whether the bloom-filter has seen a key: 50 | assert "test-key" in bloom is False 51 | 52 | # Mark the key as seen 53 | bloom.add("test-key") 54 | 55 | # Now check again 56 | assert "test-key" in bloom is True 57 | 58 | """, 59 | license="MIT", 60 | keywords="probabilistic set datastructure", 61 | url='https://github.com/hiway/python-bloom-filter', 62 | platforms='Cross platform', 63 | classifiers=[ 64 | "Development Status :: 5 - Production/Stable", 65 | "Intended Audience :: Developers", 66 | "Programming Language :: Python :: 2", 67 | "Programming Language :: Python :: 3", 68 | ], 69 | ) 70 | -------------------------------------------------------------------------------- /src/python2x3.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | #!/usr/bin/env python 3 | 4 | # pylint: disable=invalid-name 5 | # invalid-name: We define a few global-scope constants in lower case. Deal with it. 6 | 7 | '''Provides code and data to facilitate writing python code that runs on 2.x and 3.x, including pypy''' 8 | 9 | # I'm afraid pylint won't like this one... 10 | 11 | import sys 12 | 13 | 14 | def python_major(): 15 | '''Return an integer corresponding to the major version # of the python interpreter we're running on''' 16 | # This originally used the platform module, but platform fails on IronPython; sys.version seems to work 17 | # on everything I've tried 18 | result = sys.version_info[0] 19 | return result 20 | 21 | if python_major() == 2: 22 | empty_bytes = '' 23 | null_byte = '\0' 24 | bytes_type = str 25 | 26 | def intlist_to_binary(intlist): 27 | '''Convert a list of integers to a binary string type''' 28 | return ''.join(chr(byte) for byte in intlist) 29 | 30 | def string_to_binary(string): 31 | '''Convert a text string to a binary string type''' 32 | return string 33 | 34 | def binary_to_intlist(binary): 35 | '''Convert a binary string to a list of integers''' 36 | return [ord(character) for character in binary] 37 | 38 | def binary_to_string(binary): 39 | '''Convert a binary string to a text string''' 40 | return binary 41 | elif python_major() == 3: 42 | empty_bytes = ''.encode('utf-8') 43 | null_byte = bytes([0]) 44 | bytes_type = bytes 45 | 46 | def intlist_to_binary(intlist): 47 | '''Convert a list of integers to a binary string type''' 48 | return bytes(intlist) 49 | 50 | def string_to_binary(string): 51 | '''Convert a text string (or binary string type) to a binary string type''' 52 | if isinstance(string, str): 53 | return string.encode('latin-1') 54 | else: 55 | return string 56 | 57 | def binary_to_intlist(binary): 58 | '''Convert a binary string to a list of integers''' 59 | return binary 60 | 61 | def binary_to_string(binary): 62 | '''Convert a binary string to a text string''' 63 | return binary.decode('latin-1') 64 | else: 65 | sys.stderr.write('%s: Python < 2 or > 3 not (yet) supported\n' % sys.argv[0]) 66 | sys.exit(1) 67 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### JetBrains template 3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 4 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 5 | 6 | # User-specific stuff: 7 | .idea/**/workspace.xml 8 | .idea/**/tasks.xml 9 | .idea/dictionaries 10 | 11 | # Sensitive or high-churn files: 12 | .idea/**/dataSources/ 13 | .idea/**/dataSources.ids 14 | .idea/**/dataSources.xml 15 | .idea/**/dataSources.local.xml 16 | .idea/**/sqlDataSources.xml 17 | .idea/**/dynamic.xml 18 | .idea/**/uiDesigner.xml 19 | 20 | # Gradle: 21 | .idea/**/gradle.xml 22 | .idea/**/libraries 23 | 24 | # Mongo Explorer plugin: 25 | .idea/**/mongoSettings.xml 26 | 27 | ## File-based project format: 28 | *.iws 29 | 30 | ## Plugin-specific files: 31 | 32 | # IntelliJ 33 | /out/ 34 | 35 | # mpeltonen/sbt-idea plugin 36 | .idea_modules/ 37 | 38 | # JIRA plugin 39 | atlassian-ide-plugin.xml 40 | 41 | # Crashlytics plugin (for Android Studio and IntelliJ) 42 | com_crashlytics_export_strings.xml 43 | crashlytics.properties 44 | crashlytics-build.properties 45 | fabric.properties 46 | ### VirtualEnv template 47 | # Virtualenv 48 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 49 | .Python 50 | [Ii]nclude 51 | [Ll]ib64 52 | [Ll]ocal 53 | [Ss]cripts 54 | pyvenv.cfg 55 | .venv 56 | pip-selfcheck.json 57 | ### Python template 58 | # Byte-compiled / optimized / DLL files 59 | __pycache__/ 60 | *.py[cod] 61 | *$py.class 62 | 63 | # C extensions 64 | *.so 65 | 66 | # Distribution / packaging 67 | .Python 68 | env/ 69 | build/ 70 | develop-eggs/ 71 | dist/ 72 | downloads/ 73 | eggs/ 74 | .eggs/ 75 | lib/ 76 | lib64/ 77 | parts/ 78 | sdist/ 79 | var/ 80 | wheels/ 81 | *.egg-info/ 82 | .installed.cfg 83 | *.egg 84 | 85 | # PyInstaller 86 | # Usually these files are written by a python script from a template 87 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 88 | *.manifest 89 | *.spec 90 | 91 | # Installer logs 92 | pip-log.txt 93 | pip-delete-this-directory.txt 94 | 95 | # Unit test / coverage reports 96 | htmlcov/ 97 | .tox/ 98 | .coverage 99 | .coverage.* 100 | .cache 101 | nosetests.xml 102 | coverage.xml 103 | *,cover 104 | .hypothesis/ 105 | 106 | # Translations 107 | *.mo 108 | *.pot 109 | 110 | # Django stuff: 111 | *.log 112 | local_settings.py 113 | 114 | # Flask stuff: 115 | instance/ 116 | .webassets-cache 117 | 118 | # Scrapy stuff: 119 | .scrapy 120 | 121 | # Sphinx documentation 122 | docs/_build/ 123 | 124 | # PyBuilder 125 | target/ 126 | 127 | # Jupyter Notebook 128 | .ipynb_checkpoints 129 | 130 | # pyenv 131 | .python-version 132 | 133 | # celery beat schedule file 134 | celerybeat-schedule 135 | 136 | # SageMath parsed files 137 | *.sage.py 138 | 139 | # dotenv 140 | .env 141 | 142 | # virtualenv 143 | .venv 144 | venv/ 145 | ENV/ 146 | 147 | # Spyder project settings 148 | .spyderproject 149 | 150 | # Rope project settings 151 | .ropeproject 152 | 153 | bloom-filter-rm-me 154 | -------------------------------------------------------------------------------- /bin/gen_performance_graph.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import sys 4 | import anydbm 5 | from pychart import * 6 | 7 | def get_timing(kind): 8 | database = anydbm.open('performance-numbers', 'r') 9 | list_ = [] 10 | for key in database.keys(): 11 | fields = key.split() 12 | #print fields[0], kind 13 | if fields[0] == kind: 14 | set_size = float(fields[1]) 15 | duration = float(database[key]) 16 | list_.append( ( set_size, duration ) ) 17 | database.close() 18 | list_.sort() 19 | return list_ 20 | 21 | #def get_timing(filename): 22 | # file_ = open(filename, 'r') 23 | # list_ = [] 24 | # for line in file_: 25 | # fields = line.split() 26 | # set_size = float(fields[0]) 27 | # duration = float(fields[1]) 28 | # list_.append( ( set_size, duration ) ) 29 | # file_.close() 30 | # return list_ 31 | 32 | def get_hybrid_timing(): 33 | return get_timing('hybrid') 34 | 35 | def get_array_timing(): 36 | return get_timing('array') 37 | 38 | def get_seek_timing(): 39 | return get_timing('seek') 40 | 41 | def get_mmap_timing(): 42 | return get_timing('mmap') 43 | 44 | def desired_y_max(*list_): 45 | maximum = 0.0 46 | for element in list_: 47 | for set_size, duration in element: 48 | maximum = max(duration, maximum) 49 | return maximum 50 | 51 | def main(): 52 | theme.get_options() 53 | theme.output_format = 'pdf' 54 | theme.use_color = 1 55 | theme.output_file = 'performance-graph.pdf' 56 | theme.default_font_size = 15 57 | theme.reinitialize() 58 | 59 | width = 800 60 | height = width * 4 // 5 61 | size = (width, height) 62 | 63 | hybrid_timing_data = get_hybrid_timing() 64 | print 'hybrid', hybrid_timing_data 65 | array_timing_data = get_array_timing() 66 | print 'array', array_timing_data 67 | seek_timing_data = get_seek_timing() 68 | print 'seek', seek_timing_data 69 | mmap_timing_data = get_mmap_timing() 70 | print 'mmap', mmap_timing_data 71 | 72 | y_max = desired_y_max(array_timing_data, seek_timing_data, hybrid_timing_data, mmap_timing_data) 73 | 74 | can = canvas.default_canvas() 75 | 76 | ar = area.T( 77 | size = size, 78 | legend=legend.T(), 79 | x_range = (1, None), 80 | y_range = (0.0001, y_max + 100), 81 | #x_coord = log_coord.T(), 82 | #y_coord = log_coord.T(), 83 | x_coord = linear_coord.T(), 84 | y_coord = linear_coord.T(), 85 | x_axis = axis.X(format="%g", label="Number of elements in set"), 86 | y_axis = axis.Y(format="%g", label="Seconds"), 87 | ) 88 | 89 | lp = line_plot.T(data=array_timing_data, label="Array") 90 | ar.add_plot(lp) 91 | 92 | lp = line_plot.T(data=seek_timing_data, label="Seek") 93 | ar.add_plot(lp) 94 | 95 | lp = line_plot.T(data=hybrid_timing_data, label="Hybrid") 96 | ar.add_plot(lp) 97 | 98 | lp = line_plot.T(data=mmap_timing_data, label="mmap") 99 | ar.add_plot(lp) 100 | 101 | ar.draw() 102 | 103 | #can.show(ar.x_pos(4), ar.y_pos(970), "/a50{}seek") 104 | 105 | main() 106 | 107 | 108 | -------------------------------------------------------------------------------- /tests/test_bloom_filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # coding=utf-8 3 | 4 | # pylint: disable=superfluous-parens 5 | # superfluous-parens: Parentheses are good for clarity and portability 6 | 7 | """Unit tests for bloom_filter_mod""" 8 | 9 | # mport os 10 | import sys 11 | import math 12 | import time 13 | 14 | try: 15 | import anydbm 16 | except ImportError: 17 | import dbm as anydbm 18 | 19 | import random 20 | 21 | import bloom_filter 22 | 23 | CHARACTERS = 'abcdefghijklmnopqrstuvwxyz1234567890' 24 | 25 | 26 | def my_range(maximum): 27 | """A range function with consistent semantics on 2.x and 3.x""" 28 | value = 0 29 | while True: 30 | if value >= maximum: 31 | break 32 | yield value 33 | value += 1 34 | 35 | 36 | def _test(description, values, trials, error_rate, probe_bitnoer=None, filename=None): 37 | # pylint: disable=R0913,R0914 38 | # R0913: We want a few arguments 39 | # R0914: We want some local variables too. This is just test code. 40 | """Some quick automatic tests for the bloom filter class""" 41 | if not probe_bitnoer: 42 | probe_bitnoer = bloom_filter.get_filter_bitno_probes 43 | 44 | all_good = True 45 | 46 | divisor = 100000 47 | 48 | bloom = bloom_filter.BloomFilter( 49 | max_elements=values.length() * 2, 50 | error_rate=error_rate, 51 | probe_bitnoer=probe_bitnoer, 52 | filename=filename, 53 | start_fresh=True, 54 | ) 55 | 56 | message = '\ndescription: %s num_bits_m: %s num_probes_k: %s\n' 57 | filled_out_message = message % ( 58 | description, 59 | bloom.num_bits_m, 60 | bloom.num_probes_k, 61 | ) 62 | 63 | sys.stdout.write(filled_out_message) 64 | 65 | print('starting to add values to an empty bloom filter') 66 | for valueno, value in enumerate(values.generator()): 67 | reverse_valueno = values.length() - valueno 68 | if reverse_valueno % divisor == 0: 69 | print('adding valueno %d' % reverse_valueno) 70 | bloom.add(value) 71 | 72 | print('testing all known members') 73 | include_in_count = sum(include in bloom for include in values.generator()) 74 | if include_in_count == values.length(): 75 | # Good 76 | pass 77 | else: 78 | sys.stderr.write('Include count bad: %s, %d\n' % (include_in_count, values.length())) 79 | all_good = False 80 | 81 | print('testing random non-members') 82 | false_positives = 0 83 | for trialno in my_range(trials): 84 | if trialno % divisor == 0: 85 | sys.stderr.write('trialno countdown: %d\n' % (trials - trialno)) 86 | while True: 87 | candidate = ''.join(random.sample(CHARACTERS, 5)) 88 | # If we accidentally found a member, try again 89 | if values.within(candidate): 90 | continue 91 | if candidate in bloom: 92 | # print 'We erroneously think %s is in the filter' % candidate 93 | false_positives += 1 94 | break 95 | 96 | actual_error_rate = float(false_positives) / trials 97 | 98 | if actual_error_rate > error_rate: 99 | sys.stderr.write('%s: Too many false positives: actual: %s, expected: %s\n' % ( 100 | sys.argv[0], 101 | actual_error_rate, 102 | error_rate, 103 | )) 104 | all_good = False 105 | 106 | return all_good 107 | 108 | 109 | class States(object): 110 | """Generate the USA's state names""" 111 | 112 | def __init__(self): 113 | pass 114 | 115 | states = """Alabama Alaska Arizona Arkansas California Colorado Connecticut 116 | Delaware Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas 117 | Kentucky Louisiana Maine Maryland Massachusetts Michigan Minnesota 118 | Mississippi Missouri Montana Nebraska Nevada NewHampshire NewJersey 119 | NewMexico NewYork NorthCarolina NorthDakota Ohio Oklahoma Oregon 120 | Pennsylvania RhodeIsland SouthCarolina SouthDakota Tennessee Texas Utah 121 | Vermont Virginia Washington WestVirginia Wisconsin Wyoming""".split() 122 | 123 | @staticmethod 124 | def generator(): 125 | """Generate the states""" 126 | for state in States.states: 127 | yield state 128 | 129 | @staticmethod 130 | def within(value): 131 | """Is the value in our list of states?""" 132 | return value in States.states 133 | 134 | @staticmethod 135 | def length(): 136 | """What is the length of our contained values?""" 137 | return len(States.states) 138 | 139 | 140 | def random_string(): 141 | """Generate a random, 10 character string - for testing purposes""" 142 | list_ = [] 143 | for chrno in range(10): 144 | dummy = chrno 145 | character = CHARACTERS[int(random.random() * len(CHARACTERS))] 146 | list_.append(character) 147 | return ''.join(list_) 148 | 149 | 150 | class Random_content(object): 151 | """Generated a bunch of random strings in sorted order""" 152 | 153 | random_content = [random_string() for dummy in range(1000)] 154 | 155 | def __init__(self): 156 | pass 157 | 158 | @staticmethod 159 | def generator(): 160 | """Generate all values""" 161 | for item in Random_content.random_content: 162 | yield item 163 | 164 | @staticmethod 165 | def within(value): 166 | """Test for membership""" 167 | return value in Random_content.random_content 168 | 169 | @staticmethod 170 | def length(): 171 | """How many members?""" 172 | return len(Random_content.random_content) 173 | 174 | 175 | class Evens(object): 176 | """Generate a bunch of even numbers""" 177 | 178 | def __init__(self, maximum): 179 | self.maximum = maximum 180 | 181 | def generator(self): 182 | """Generate all values""" 183 | for value in my_range(self.maximum): 184 | if value % 2 == 0: 185 | yield str(value) 186 | 187 | def within(self, value): 188 | """Test for membership""" 189 | try: 190 | int_value = int(value) 191 | except ValueError: 192 | return False 193 | 194 | if int_value >= 0 and int_value < self.maximum and int_value % 2 == 0: 195 | return True 196 | else: 197 | return False 198 | 199 | def length(self): 200 | """How many members?""" 201 | return int(math.ceil(self.maximum / 2.0)) 202 | 203 | 204 | def and_test(): 205 | """Test the & operator""" 206 | 207 | all_good = True 208 | 209 | abc = bloom_filter.BloomFilter(max_elements=100, error_rate=0.01) 210 | for character in ['a', 'b', 'c']: 211 | abc += character 212 | 213 | bcd = bloom_filter.BloomFilter(max_elements=100, error_rate=0.01) 214 | for character in ['b', 'c', 'd']: 215 | bcd += character 216 | 217 | abc_and_bcd = abc 218 | abc_and_bcd &= bcd 219 | 220 | if 'a' in abc_and_bcd: 221 | sys.stderr.write('a in abc_and_bcd, but should not be') 222 | all_good = False 223 | if not 'b' in abc_and_bcd: 224 | sys.stderr.write('b not in abc_and_bcd, but should be') 225 | all_good = False 226 | if not 'c' in abc_and_bcd: 227 | sys.stderr.write('c not in abc_and_bcd, but should be') 228 | all_good = False 229 | if 'd' in abc_and_bcd: 230 | sys.stderr.write('d in abc_and_bcd, but should not be') 231 | all_good = False 232 | 233 | return all_good 234 | 235 | 236 | def or_test(): 237 | """Test the | operator""" 238 | 239 | all_good = True 240 | 241 | abc = bloom_filter.BloomFilter(max_elements=100, error_rate=0.01) 242 | for character in ['a', 'b', 'c']: 243 | abc += character 244 | 245 | bcd = bloom_filter.BloomFilter(max_elements=100, error_rate=0.01) 246 | for character in ['b', 'c', 'd']: 247 | bcd += character 248 | 249 | abc_and_bcd = abc 250 | abc_and_bcd |= bcd 251 | 252 | if not 'a' in abc_and_bcd: 253 | sys.stderr.write('a not in abc_and_bcd, but should be') 254 | all_good = False 255 | if not 'b' in abc_and_bcd: 256 | sys.stderr.write('b not in abc_and_bcd, but should be') 257 | all_good = False 258 | if not 'c' in abc_and_bcd: 259 | sys.stderr.write('c not in abc_and_bcd, but should be') 260 | all_good = False 261 | if not 'd' in abc_and_bcd: 262 | sys.stderr.write('d not in abc_and_bcd, but should be') 263 | all_good = False 264 | if 'e' in abc_and_bcd: 265 | sys.stderr.write('e in abc_and_bcd, but should not be') 266 | all_good = False 267 | 268 | return all_good 269 | 270 | 271 | def give_description(filename): 272 | """Return a description of the filename type - could be array, file or hybrid""" 273 | if filename is None: 274 | return 'array' 275 | elif isinstance(filename, tuple): 276 | if filename[1] == -1: 277 | return 'mmap' 278 | else: 279 | return 'hybrid' 280 | else: 281 | return 'seek' 282 | 283 | 284 | def test_bloom_filter(): 285 | """Unit tests for BloomFilter class""" 286 | 287 | if sys.argv[1:] == ['--performance-test']: 288 | performance_test = True 289 | else: 290 | performance_test = False 291 | 292 | all_good = True 293 | 294 | all_good &= _test('states', States(), trials=100000, error_rate=0.01) 295 | 296 | all_good &= _test('random', Random_content(), trials=10000, error_rate=0.1) 297 | all_good &= _test('random', Random_content(), trials=1000000, error_rate=1E-9) 298 | all_good &= _test('random', Random_content(), trials=10000, error_rate=0.1, 299 | probe_bitnoer=bloom_filter.get_bitno_seed_rnd) 300 | 301 | filename = 'bloom-filter-rm-me' 302 | all_good &= _test('random', Random_content(), trials=10000, error_rate=0.1, filename=filename) 303 | 304 | all_good &= and_test() 305 | 306 | all_good &= or_test() 307 | 308 | if performance_test: 309 | sqrt_of_10 = math.sqrt(10) 310 | # for exponent in range(5): # this is a lot, but probably not unreasonable 311 | for exponent in range(19): # this is a lot, but probably not unreasonable 312 | elements = int(sqrt_of_10 ** exponent + 0.5) 313 | for filename in [None, 'bloom-filter-rm-me', ('bloom-filter-rm-me', 768 * 2 ** 20), 314 | ('bloom-filter-rm-me', -1)]: 315 | description = give_description(filename) 316 | key = '%s %s' % (description, elements) 317 | database = anydbm.open('performance-numbers', 'c') 318 | if key in database.keys(): 319 | database.close() 320 | continue 321 | if elements >= 100000000 and description == 'seek': 322 | continue 323 | if elements >= 100000000 and description == 'mmap': 324 | continue 325 | if elements >= 1000000000 and description == 'array': 326 | continue 327 | time0 = time.time() 328 | all_good &= _test( 329 | 'evens %s elements: %d' % (give_description(filename), elements), 330 | Evens(elements), 331 | trials=elements, 332 | error_rate=1e-2, 333 | filename=filename, 334 | ) 335 | time1 = time.time() 336 | delta_t = time1 - time0 337 | # file_ = open('%s.txt' % description, 'a') 338 | # file_.write('%d %f\n' % (elements, delta_t)) 339 | # file_.close() 340 | database = anydbm.open('performance-numbers', 'c') 341 | database[key] = '%f' % delta_t 342 | database.close() 343 | 344 | # test prob count ok 345 | bloom = bloom_filter.BloomFilter(1000000, error_rate=.99) 346 | all_good &= bloom.num_probes_k == 1 347 | if not all_good: 348 | sys.stderr.write('%s: One or more tests failed\n' % sys.argv[0]) 349 | sys.exit(1) 350 | 351 | 352 | if __name__ == '__main__': 353 | test_bloom_filter() 354 | -------------------------------------------------------------------------------- /src/bloom_filter/bloom_filter.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # pylint: disable=superfluous-parens,redefined-variable-type 3 | # superfluous-parens: Sometimes extra parens are more clear 4 | 5 | """Bloom Filter: Probabilistic set membership testing for large sets""" 6 | 7 | # Shamelessly borrowed (under MIT license) from http://code.activestate.com/recipes/577686-bloom-filter/ 8 | # About Bloom Filters: http://en.wikipedia.org/wiki/Bloom_filter 9 | 10 | # Tweaked by Daniel Richard Stromberg, mostly to: 11 | # 1) Give it a little nicer __init__ parameters. 12 | # 2) Improve the hash functions to get a much lower rate of false positives. 13 | # 3) Give it a selection of backends. 14 | # 4) Make it pass pylint. 15 | 16 | from __future__ import division 17 | import os 18 | #mport sys 19 | import math 20 | import array 21 | import random 22 | 23 | try: 24 | import mmap as mmap_mod 25 | except ImportError: 26 | # Jython lacks mmap() 27 | HAVE_MMAP = False 28 | else: 29 | HAVE_MMAP = True 30 | 31 | #mport bufsock 32 | #mport hashlib 33 | #mport numbers 34 | 35 | import python2x3 36 | 37 | # In the literature: 38 | # k is the number of probes - we call this num_probes_k 39 | # m is the number of bits in the filter - we call this num_bits_m 40 | # n is the ideal number of elements to eventually be stored in the filter - we call this ideal_num_elements_n 41 | # p is the desired error rate when full - we call this error_rate_p 42 | 43 | 44 | def my_range(num_values): 45 | """Generate numbers from 0..num_values-1""" 46 | 47 | value = 0 48 | while value < num_values: 49 | yield value 50 | value += 1 51 | 52 | # In the abstract, this is what we want &= and |= to do, but especially for disk-based filters, this is extremely slow 53 | #class Backend_set_operations: 54 | # """Provide &= and |= for backends""" 55 | # # pylint: disable=W0232 56 | # # W0232: We don't need an __init__ method; we're never instantiated directly 57 | # def __iand__(self, other): 58 | # assert self.num_bits == other.num_bits 59 | # 60 | # for bitno in my_range(num_bits): 61 | # if self.is_set(bitno) and other.is_set(bitno): 62 | # self[bitno].set() 63 | # else: 64 | # self[bitno].clear() 65 | # 66 | # def __ior__(self, other): 67 | # assert self.num_bits == other.num_bits 68 | # 69 | # for bitno in xrange(num_bits): 70 | # if self[bitno] or other[bitno]: 71 | # self[bitno].set() 72 | # else: 73 | # self[bitno].clear() 74 | 75 | 76 | if HAVE_MMAP: 77 | 78 | class Mmap_backend(object): 79 | """ 80 | Backend storage for our "array of bits" using an mmap'd file. 81 | Please note that this has only been tested on Linux so far: 2 -11-01. 82 | """ 83 | 84 | effs = 2 ^ 8 - 1 85 | 86 | def __init__(self, num_bits, filename): 87 | self.num_bits = num_bits 88 | self.num_chars = (self.num_bits + 7) // 8 89 | flags = os.O_RDWR | os.O_CREAT 90 | if hasattr(os, 'O_BINARY'): 91 | flags |= getattr(os, 'O_BINARY') 92 | self.file_ = os.open(filename, flags) 93 | os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET) 94 | os.write(self.file_, python2x3.null_byte) 95 | self.mmap = mmap_mod.mmap(self.file_, self.num_chars) 96 | 97 | def is_set(self, bitno): 98 | """Return true iff bit number bitno is set""" 99 | byteno, bit_within_wordno = divmod(bitno, 8) 100 | mask = 1 << bit_within_wordno 101 | char = self.mmap[byteno] 102 | if isinstance(char, str): 103 | byte = ord(char) 104 | else: 105 | byte = int(char) 106 | return byte & mask 107 | 108 | def set(self, bitno): 109 | """set bit number bitno to true""" 110 | 111 | byteno, bit_within_byteno = divmod(bitno, 8) 112 | mask = 1 << bit_within_byteno 113 | char = self.mmap[byteno] 114 | byte = ord(char) 115 | byte |= mask 116 | self.mmap[byteno] = chr(byte) 117 | 118 | def clear(self, bitno): 119 | """clear bit number bitno - set it to false""" 120 | 121 | byteno, bit_within_byteno = divmod(bitno, 8) 122 | mask = 1 << bit_within_byteno 123 | char = self.mmap[byteno] 124 | byte = ord(char) 125 | byte &= Mmap_backend.effs - mask 126 | self.mmap[byteno] = chr(byte) 127 | 128 | def __iand__(self, other): 129 | assert self.num_bits == other.num_bits 130 | 131 | for byteno in my_range(self.num_chars): 132 | self.mmap[byteno] = chr(ord(self.mmap[byteno]) & ord(other.mmap[byteno])) 133 | 134 | return self 135 | 136 | def __ior__(self, other): 137 | assert self.num_bits == other.num_bits 138 | 139 | for byteno in my_range(self.num_chars): 140 | self.mmap[byteno] = chr(ord(self.mmap[byteno]) | ord(other.mmap[byteno])) 141 | 142 | return self 143 | 144 | def close(self): 145 | """Close the file""" 146 | os.close(self.file_) 147 | 148 | 149 | class File_seek_backend(object): 150 | """Backend storage for our "array of bits" using a file in which we seek""" 151 | 152 | effs = 2 ^ 8 - 1 153 | 154 | def __init__(self, num_bits, filename): 155 | self.num_bits = num_bits 156 | self.num_chars = (self.num_bits + 7) // 8 157 | flags = os.O_RDWR | os.O_CREAT 158 | if hasattr(os, 'O_BINARY'): 159 | flags |= getattr(os, 'O_BINARY') 160 | self.file_ = os.open(filename, flags) 161 | os.lseek(self.file_, self.num_chars + 1, os.SEEK_SET) 162 | os.write(self.file_, python2x3.null_byte) 163 | 164 | def is_set(self, bitno): 165 | """Return true iff bit number bitno is set""" 166 | byteno, bit_within_wordno = divmod(bitno, 8) 167 | mask = 1 << bit_within_wordno 168 | os.lseek(self.file_, byteno, os.SEEK_SET) 169 | char = os.read(self.file_, 1) 170 | if isinstance(char, str): 171 | byte = ord(char) 172 | else: 173 | byte = char[0] 174 | return byte & mask 175 | 176 | def set(self, bitno): 177 | """set bit number bitno to true""" 178 | 179 | byteno, bit_within_byteno = divmod(bitno, 8) 180 | mask = 1 << bit_within_byteno 181 | os.lseek(self.file_, byteno, os.SEEK_SET) 182 | char = os.read(self.file_, 1) 183 | if isinstance(char, str): 184 | byte = ord(char) 185 | was_char = True 186 | else: 187 | byte = char[0] 188 | was_char = False 189 | byte |= mask 190 | os.lseek(self.file_, byteno, os.SEEK_SET) 191 | if was_char: 192 | os.write(self.file_, chr(byte)) 193 | else: 194 | char = python2x3.intlist_to_binary([byte]) 195 | os.write(self.file_, char) 196 | 197 | def clear(self, bitno): 198 | """clear bit number bitno - set it to false""" 199 | 200 | byteno, bit_within_byteno = divmod(bitno, 8) 201 | mask = 1 << bit_within_byteno 202 | os.lseek(self.file_, byteno, os.SEEK_SET) 203 | char = os.read(self.file_, 1) 204 | if isinstance(char, str): 205 | byte = ord(char) 206 | was_char = True 207 | else: 208 | byte = int(char) 209 | was_char = False 210 | byte &= File_seek_backend.effs - mask 211 | os.lseek(self.file_, byteno, os.SEEK_SET) 212 | if was_char: 213 | os.write(chr(byte)) 214 | else: 215 | char = python2x3.intlist_to_binary([byte]) 216 | os.write(char) 217 | 218 | # These are quite slow ways to do iand and ior, but they should work, 219 | # and a faster version is going to take more time 220 | def __iand__(self, other): 221 | assert self.num_bits == other.num_bits 222 | 223 | for bitno in my_range(self.num_bits): 224 | if self.is_set(bitno) and other.is_set(bitno): 225 | self.set(bitno) 226 | else: 227 | self.clear(bitno) 228 | 229 | return self 230 | 231 | def __ior__(self, other): 232 | assert self.num_bits == other.num_bits 233 | 234 | for bitno in my_range(self.num_bits): 235 | if self.is_set(bitno) or other.is_set(bitno): 236 | self.set(bitno) 237 | else: 238 | self.clear(bitno) 239 | 240 | return self 241 | 242 | def close(self): 243 | """Close the file""" 244 | os.close(self.file_) 245 | 246 | 247 | class Array_then_file_seek_backend(object): 248 | # pylint: disable=R0902 249 | # R0902: We kinda need a bunch of instance attributes 250 | """ 251 | Backend storage for our "array of bits" using a python array of integers up to some maximum number of bytes, 252 | then spilling over to a file. This is -not- a cache; we instead save the leftmost bits in RAM, and the 253 | rightmost bits (if necessary) in a file. On open, we read from the file to RAM. On close, we write from RAM 254 | to the file. 255 | """ 256 | 257 | effs = 2 ** 8 - 1 258 | 259 | def __init__(self, num_bits, filename, max_bytes_in_memory): 260 | self.num_bits = num_bits 261 | num_chars = (self.num_bits + 7) // 8 262 | self.filename = filename 263 | self.max_bytes_in_memory = max_bytes_in_memory 264 | self.bits_in_memory = min(num_bits, self.max_bytes_in_memory * 8) 265 | self.bits_in_file = max(self.num_bits - self.bits_in_memory, 0) 266 | self.bytes_in_memory = (self.bits_in_memory + 7) // 8 267 | self.bytes_in_file = (self.bits_in_file + 7) // 8 268 | 269 | self.array_ = array.array('B', [0]) * self.bytes_in_memory 270 | flags = os.O_RDWR | os.O_CREAT 271 | if hasattr(os, 'O_BINARY'): 272 | flags |= getattr(os, 'O_BINARY') 273 | self.file_ = os.open(filename, flags) 274 | os.lseek(self.file_, num_chars + 1, os.SEEK_SET) 275 | os.write(self.file_, python2x3.null_byte) 276 | 277 | os.lseek(self.file_, 0, os.SEEK_SET) 278 | offset = 0 279 | intended_block_len = 2 ** 17 280 | while True: 281 | if offset + intended_block_len < self.bytes_in_memory: 282 | block = os.read(self.file_, intended_block_len) 283 | elif offset < self.bytes_in_memory: 284 | block = os.read(self.file_, self.bytes_in_memory - offset) 285 | else: 286 | break 287 | for index_in_block, character in enumerate(block): 288 | self.array_[offset + index_in_block] = ord(character) 289 | offset += intended_block_len 290 | 291 | def is_set(self, bitno): 292 | """Return true iff bit number bitno is set""" 293 | byteno, bit_within_byteno = divmod(bitno, 8) 294 | mask = 1 << bit_within_byteno 295 | if byteno < self.bytes_in_memory: 296 | return self.array_[byteno] & mask 297 | else: 298 | os.lseek(self.file_, byteno, os.SEEK_SET) 299 | char = os.read(self.file_, 1) 300 | if isinstance(char, str): 301 | byte = ord(char) 302 | else: 303 | byte = int(char) 304 | return byte & mask 305 | 306 | def set(self, bitno): 307 | """set bit number bitno to true""" 308 | byteno, bit_within_byteno = divmod(bitno, 8) 309 | mask = 1 << bit_within_byteno 310 | if byteno < self.bytes_in_memory: 311 | self.array_[byteno] |= mask 312 | else: 313 | os.lseek(self.file_, byteno, os.SEEK_SET) 314 | char = os.read(self.file_, 1) 315 | if isinstance(char, str): 316 | byte = ord(char) 317 | was_char = True 318 | else: 319 | byte = char 320 | was_char = False 321 | byte |= mask 322 | os.lseek(self.file_, byteno, os.SEEK_SET) 323 | if was_char: 324 | os.write(self.file_, chr(byte)) 325 | else: 326 | os.write(self.file_, byte) 327 | 328 | def clear(self, bitno): 329 | """clear bit number bitno - set it to false""" 330 | byteno, bit_within_byteno = divmod(bitno, 8) 331 | mask = Array_backend.effs - (1 << bit_within_byteno) 332 | if byteno < self.bytes_in_memory: 333 | self.array_[byteno] &= mask 334 | else: 335 | os.lseek(self.file_, byteno, os.SEEK_SET) 336 | char = os.read(self.file_, 1) 337 | if isinstance(char, str): 338 | byte = ord(char) 339 | was_char = True 340 | else: 341 | byte = int(char) 342 | was_char = False 343 | byte &= File_seek_backend.effs - mask 344 | os.lseek(self.file_, byteno, os.SEEK_SET) 345 | if was_char: 346 | os.write(chr(byte)) 347 | else: 348 | os.write(byte) 349 | 350 | # These are quite slow ways to do iand and ior, but they should work, 351 | # and a faster version is going to take more time 352 | def __iand__(self, other): 353 | assert self.num_bits == other.num_bits 354 | 355 | for bitno in my_range(self.num_bits): 356 | if self.is_set(bitno) and other.is_set(bitno): 357 | self.set(bitno) 358 | else: 359 | self.clear(bitno) 360 | 361 | return self 362 | 363 | def __ior__(self, other): 364 | assert self.num_bits == other.num_bits 365 | 366 | for bitno in my_range(self.num_bits): 367 | if self.is_set(bitno) or other.is_set(bitno): 368 | self.set(bitno) 369 | else: 370 | self.clear(bitno) 371 | 372 | return self 373 | 374 | def close(self): 375 | """Write the in-memory portion to disk, leave the already-on-disk portion unchanged""" 376 | 377 | os.lseek(self.file_, 0, os.SEEK_SET) 378 | for index in my_range(self.bytes_in_memory): 379 | self.file_.write(self.array_[index]) 380 | 381 | os.close(self.file_) 382 | 383 | 384 | class Array_backend(object): 385 | """Backend storage for our "array of bits" using a python array of integers""" 386 | 387 | # Note that this has now been split out into a bits_mod for the benefit of other projects. 388 | effs = 2 ** 32 - 1 389 | 390 | def __init__(self, num_bits): 391 | self.num_bits = num_bits 392 | self.num_words = (self.num_bits + 31) // 32 393 | self.array_ = array.array('L', [0]) * self.num_words 394 | 395 | def is_set(self, bitno): 396 | """Return true iff bit number bitno is set""" 397 | wordno, bit_within_wordno = divmod(bitno, 32) 398 | mask = 1 << bit_within_wordno 399 | return self.array_[wordno] & mask 400 | 401 | def set(self, bitno): 402 | """set bit number bitno to true""" 403 | wordno, bit_within_wordno = divmod(bitno, 32) 404 | mask = 1 << bit_within_wordno 405 | self.array_[wordno] |= mask 406 | 407 | def clear(self, bitno): 408 | """clear bit number bitno - set it to false""" 409 | wordno, bit_within_wordno = divmod(bitno, 32) 410 | mask = Array_backend.effs - (1 << bit_within_wordno) 411 | self.array_[wordno] &= mask 412 | 413 | # It'd be nice to do __iand__ and __ior__ in a base class, but that'd be Much slower 414 | 415 | def __iand__(self, other): 416 | assert self.num_bits == other.num_bits 417 | 418 | for wordno in my_range(self.num_words): 419 | self.array_[wordno] &= other.array_[wordno] 420 | 421 | return self 422 | 423 | def __ior__(self, other): 424 | assert self.num_bits == other.num_bits 425 | 426 | for wordno in my_range(self.num_words): 427 | self.array_[wordno] |= other.array_[wordno] 428 | 429 | return self 430 | 431 | def close(self): 432 | """Noop for compatibility with the file+seek backend""" 433 | pass 434 | 435 | 436 | def get_bitno_seed_rnd(bloom_filter, key): 437 | """Apply num_probes_k hash functions to key. Generate the array index and bitmask corresponding to each result""" 438 | 439 | # We're using key as a seed to a pseudorandom number generator 440 | hasher = random.Random(key).randrange 441 | for dummy in range(bloom_filter.num_probes_k): 442 | bitno = hasher(bloom_filter.num_bits_m) 443 | yield bitno % bloom_filter.num_bits_m 444 | 445 | 446 | MERSENNES1 = [2 ** x - 1 for x in [17, 31, 127]] 447 | MERSENNES2 = [2 ** x - 1 for x in [19, 67, 257]] 448 | 449 | 450 | def simple_hash(int_list, prime1, prime2, prime3): 451 | """Compute a hash value from a list of integers and 3 primes""" 452 | result = 0 453 | for integer in int_list: 454 | result += ((result + integer + prime1) * prime2) % prime3 455 | return result 456 | 457 | 458 | def hash1(int_list): 459 | """Basic hash function #1""" 460 | return simple_hash(int_list, MERSENNES1[0], MERSENNES1[1], MERSENNES1[2]) 461 | 462 | 463 | def hash2(int_list): 464 | """Basic hash function #2""" 465 | return simple_hash(int_list, MERSENNES2[0], MERSENNES2[1], MERSENNES2[2]) 466 | 467 | 468 | def get_filter_bitno_probes(bloom_filter, key): 469 | """Apply num_probes_k hash functions to key. Generate the array index and bitmask corresponding to each result""" 470 | 471 | # This one assumes key is either bytes or str (or other list of integers) 472 | 473 | # I'd love to check for long too, but that doesn't exist in 3.2, and 2.5 doesn't have the numbers.Integral base type 474 | if hasattr(key, '__divmod__'): 475 | int_list = [] 476 | temp = key 477 | while temp: 478 | quotient, remainder = divmod(temp, 256) 479 | int_list.append(remainder) 480 | temp = quotient 481 | elif hasattr(key[0], '__divmod__'): 482 | int_list = key 483 | elif isinstance(key[0], str): 484 | int_list = [ord(char) for char in key] 485 | else: 486 | raise TypeError('Sorry, I do not know how to hash this type') 487 | 488 | hash_value1 = hash1(int_list) 489 | hash_value2 = hash2(int_list) 490 | probe_value = hash_value1 491 | 492 | for probeno in range(1, bloom_filter.num_probes_k + 1): 493 | probe_value *= hash_value1 494 | probe_value += hash_value2 495 | probe_value %= MERSENNES1[2] 496 | yield probe_value % bloom_filter.num_bits_m 497 | 498 | 499 | def try_unlink(filename): 500 | """unlink a file. Don't complain if it's not there""" 501 | try: 502 | os.unlink(filename) 503 | except OSError: 504 | pass 505 | return 506 | 507 | 508 | class BloomFilter(object): 509 | """Probabilistic set membership testing for large sets""" 510 | def __init__(self, 511 | max_elements=10000, 512 | error_rate=0.1, 513 | probe_bitnoer=get_filter_bitno_probes, 514 | filename=None, 515 | start_fresh=False): 516 | # pylint: disable=R0913 517 | # R0913: We want a few arguments 518 | if max_elements <= 0: 519 | raise ValueError('ideal_num_elements_n must be > 0') 520 | if not (0 < error_rate < 1): 521 | raise ValueError('error_rate_p must be between 0 and 1 exclusive') 522 | 523 | self.error_rate_p = error_rate 524 | # With fewer elements, we should do very well. With more elements, our error rate "guarantee" 525 | # drops rapidly. 526 | self.ideal_num_elements_n = max_elements 527 | 528 | numerator = -1 * self.ideal_num_elements_n * math.log(self.error_rate_p) 529 | denominator = math.log(2) ** 2 530 | real_num_bits_m = numerator / denominator 531 | self.num_bits_m = int(math.ceil(real_num_bits_m)) 532 | 533 | if filename is None: 534 | self.backend = Array_backend(self.num_bits_m) 535 | elif isinstance(filename, tuple) and isinstance(filename[1], int): 536 | if start_fresh: 537 | try_unlink(filename[0]) 538 | if filename[1] == -1: 539 | self.backend = Mmap_backend(self.num_bits_m, filename[0]) 540 | else: 541 | self.backend = Array_then_file_seek_backend(self.num_bits_m, filename[0], filename[1]) 542 | else: 543 | if start_fresh: 544 | try_unlink(filename) 545 | self.backend = File_seek_backend(self.num_bits_m, filename) 546 | 547 | # AKA num_offsetters 548 | # Verified against http://en.wikipedia.org/wiki/Bloom_filter#Probability_of_false_positives 549 | real_num_probes_k = (self.num_bits_m / self.ideal_num_elements_n) * math.log(2) 550 | self.num_probes_k = int(math.ceil(real_num_probes_k)) 551 | self.probe_bitnoer = probe_bitnoer 552 | 553 | def __repr__(self): 554 | return 'BloomFilter(ideal_num_elements_n=%d, error_rate_p=%f, num_bits_m=%d)' % ( 555 | self.ideal_num_elements_n, 556 | self.error_rate_p, 557 | self.num_bits_m, 558 | ) 559 | 560 | def add(self, key): 561 | """Add an element to the filter""" 562 | for bitno in self.probe_bitnoer(self, key): 563 | self.backend.set(bitno) 564 | 565 | def __iadd__(self, key): 566 | self.add(key) 567 | return self 568 | 569 | def _match_template(self, bloom_filter): 570 | """Compare a sort of signature for two bloom filters. Used in preparation for binary operations""" 571 | return (self.num_bits_m == bloom_filter.num_bits_m 572 | and self.num_probes_k == bloom_filter.num_probes_k 573 | and self.probe_bitnoer == bloom_filter.probe_bitnoer) 574 | 575 | def union(self, bloom_filter): 576 | """Compute the set union of two bloom filters""" 577 | self.backend |= bloom_filter.backend 578 | 579 | def __ior__(self, bloom_filter): 580 | self.union(bloom_filter) 581 | return self 582 | 583 | def intersection(self, bloom_filter): 584 | """Compute the set intersection of two bloom filters""" 585 | self.backend &= bloom_filter.backend 586 | 587 | def __iand__(self, bloom_filter): 588 | self.intersection(bloom_filter) 589 | return self 590 | 591 | def __contains__(self, key): 592 | for bitno in self.probe_bitnoer(self, key): 593 | if not self.backend.is_set(bitno): 594 | return False 595 | return True 596 | --------------------------------------------------------------------------------