├── .gitignore ├── .travis.yml ├── AUTHORS ├── CHANGELOG ├── LICENSE ├── MANIFEST.in ├── Makefile ├── README.markdown ├── docs ├── Makefile ├── conf.py ├── index.txt ├── make.bat ├── pushtogh └── ref.txt ├── fake_pyrex └── Pyrex │ ├── .svn │ ├── all-wcprops │ ├── entries │ └── text-base │ │ └── __init__.py.svn-base │ ├── Distutils │ ├── .svn │ │ ├── all-wcprops │ │ ├── entries │ │ └── text-base │ │ │ ├── __init__.py.svn-base │ │ │ └── build_ext.py.svn-base │ ├── __init__.py │ └── build_ext.py │ └── __init__.py ├── setup.py ├── src ├── Makefile ├── MurmurHash3.c ├── MurmurHash3.h ├── bloomfilter.c ├── bloomfilter.h ├── cbloomfilter.pxd ├── md5.c ├── md5.h ├── mmapbitarray.c ├── mmapbitarray.h ├── primetester.c ├── primetester.h ├── pybloomfilter.c ├── pybloomfilter.pyx └── superfast.h └── tests ├── __init__.py ├── accuracytest.py ├── comparisons ├── accuracytest.py ├── speedtest.py ├── testwords └── words └── simpletest.py /.gitignore: -------------------------------------------------------------------------------- 1 | /local 2 | pybloomfilter.so 3 | /build/ 4 | /dist/ 5 | /docs/_build 6 | MANIFEST 7 | /bin 8 | /include 9 | /lib 10 | *.egg-info 11 | *~ 12 | *pyc 13 | /tags 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - 2.6 4 | - 2.7 5 | - pypy 6 | script: python setup.py test 7 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | Michael Axiak 2 | Rob Stacey 3 | dlecocq - For superfast addition 4 | pbutler - Fix memory leak 5 | Dan Crosta - Convert MurmurHash3 to C from C++ 6 | -------------------------------------------------------------------------------- /CHANGELOG: -------------------------------------------------------------------------------- 1 | 0.3.6 2012-10-21) 2 | 
- Minor cosmetic changes to reduce noise in clang. Might have fixed 3 | cryptolib linking error. [BUG] 4 | 5 | 0.3.6 2012-09-04) 6 | - Fixed memory leak in SHA512 computation. Thanks pbutler! [BUG] 7 | 8 | 0.3.2 2011-12-07) 9 | - Fixed segfault when using non-strings as values. [BUG] 10 | 11 | 0.3.1 2011-12-07) 12 | - Fixed casting of size_t for very large bloom filters. [BUG] 13 | - Added option to use standard memory (rather than mmap) to back the bloom filter. 14 | (Specify None or '' as the argument to file name.) [FEATURE] 15 | - Changed hash to use a "superfast" hash rather than md5 since md5 is 16 | unnecessarily slow. [FEATURE] 17 | 18 | 0.2.0 2011-11-13) 19 | - Fixed hashing to use md5 hash and change the bit computation to pass 20 | accuracy test. [BUG] 21 | 22 | 0.1.28 2011-03-12) 23 | - Added check to ensure that the required permissions are available when 24 | opening a file. [BUG] 25 | 26 | 0.1.26 2011-03-07) 27 | - Added __dealloc__ to fix memory/resource leaks. [BUG] 28 | - Added .close() method to forcibly close a bloom filter object. [FEATURE] 29 | 30 | 0.1.20 2010-12-21) 31 | - Changed prime number finder to use a standard double and add 1 to find 32 | likely numbers in log(n) time. 33 | 34 | 0.1.18 2010-10-25) 35 | - Fixed issue #5, wherein a call to open() returning an invalid fd would 36 | cause a seg fault. [BUG] 37 | 38 | 0.1.12 2010-4-5) 39 | - Added __len__ which gives a good estimate of how many elements were 40 | inserted [FEATURE] 41 | - Removed __ixor__ since it didn't make much sense. 42 | - Added .intersect() and .union() as a synonym for __iand__ and __ior__ [FEATURE] 43 | - Added reserved space to the BloomFilter structure to support future updates. [FEATURE] 44 | 45 | 0.1.10 2010-4-2) 46 | - Added MANIFEST so that building distributions works [FEATURE] 47 | - Restructured docs so that html is a symlink to the _build/html directory [FEATURE] 48 | - Added Cython detection in setup.py. 
Building now works without Cython installed [FEATURE] 49 | 50 | 0.1.8 2010-3-31) 51 | - Fixed bug where opening invalid bloom files caused seg faults [BUG] 52 | - Fixed alignment bug where sometimes hash seeds would not [BUG] 53 | be compared correctly. 54 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010-2011 Michael Axiak 2 | 3 | Permission is hereby granted, free of charge, to any person 4 | obtaining a copy of this software and associated documentation 5 | files (the "Software"), to deal in the Software without 6 | restriction, including without limitation the rights to use, 7 | copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the 9 | Software is furnished to do so, subject to the following 10 | conditions: 11 | 12 | The above copyright notice and this permission notice shall be 13 | included in all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 17 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 19 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 20 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 
23 | 24 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include CHANGELOG 2 | include LICENSE 3 | include AUTHORS 4 | include README.markdown 5 | recursive-include src * 6 | recursive-include tests * 7 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | python setup.py build_ext --inplace 3 | 4 | install: 5 | @# Support Debian package building with fall-back default 6 | python setup.py install --root $${DESTDIR:-/} 7 | 8 | 9 | clean: 10 | rm -rf build/ 11 | rm -rf dist/ 12 | rm -fv *so 13 | -------------------------------------------------------------------------------- /README.markdown: -------------------------------------------------------------------------------- 1 | # pybloomfiltermmap [![Build Status](https://secure.travis-ci.org/axiak/pybloomfiltermmap.png?branch=master)](http://travis-ci.org/axiak/pybloomfiltermmap) 2 | 3 | The goal of `pybloomfiltermmap` is simple: to provide a fast, simple, scalable, 4 | correct library for Bloom Filters in Python. 5 | 6 | ## Docs 7 | 8 | See . 9 | 10 | ## Overview 11 | 12 | After you install, the interface to use is a cross between a file 13 | interface and a set interface. As an example: 14 | 15 | >>> fruit = pybloomfilter.BloomFilter(100000, 0.1, '/tmp/words.bloom') 16 | >>> fruit.update(('apple', 'pear', 'orange', 'apple')) 17 | >>> len(fruit) 18 | 3 19 | >>> 'mike' in fruit 20 | False 21 | >>> 'apple' in fruit 22 | True 23 | 24 | ## Install 25 | 26 | You may or may not want to use Cython. If you have it installed, the 27 | setup file will build the C file from the pyx file. Otherwise, it will 28 | skip that step automatically and build from the packaged C file.
29 | 30 | To install: 31 | 32 | $ sudo python setup.py install 33 | 34 | and you should be set. 35 | 36 | ## License 37 | 38 | See the LICENSE file. It's under the MIT License. 39 | 40 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # Internal variables. 11 | PAPEROPT_a4 = -D latex_paper_size=a4 12 | PAPEROPT_letter = -D latex_paper_size=letter 13 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 14 | 15 | .PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest 16 | 17 | help: 18 | @echo "Please use \`make ' where is one of" 19 | @echo " html to make standalone HTML files" 20 | @echo " dirhtml to make HTML files named index.html in directories" 21 | @echo " pickle to make pickle files" 22 | @echo " json to make JSON files" 23 | @echo " htmlhelp to make HTML files and a HTML help project" 24 | @echo " qthelp to make HTML files and a qthelp project" 25 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 26 | @echo " changes to make an overview of all changed/added/deprecated items" 27 | @echo " linkcheck to check all external links for integrity" 28 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 29 | 30 | clean: 31 | -rm -rf $(BUILDDIR)/* 32 | 33 | html: 34 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 35 | @echo 36 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 37 | 38 | dirhtml: 39 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 40 | @echo 41 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 
42 | 43 | pickle: 44 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 45 | @echo 46 | @echo "Build finished; now you can process the pickle files." 47 | 48 | json: 49 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 50 | @echo 51 | @echo "Build finished; now you can process the JSON files." 52 | 53 | htmlhelp: 54 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 55 | @echo 56 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 57 | ".hhp project file in $(BUILDDIR)/htmlhelp." 58 | 59 | qthelp: 60 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 61 | @echo 62 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 63 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 64 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PythonBloomFilter.qhcp" 65 | @echo "To view the help file:" 66 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PythonBloomFilter.qhc" 67 | 68 | latex: 69 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 70 | @echo 71 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 72 | @echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \ 73 | "run these through (pdf)latex." 74 | 75 | changes: 76 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 77 | @echo 78 | @echo "The overview file is in $(BUILDDIR)/changes." 79 | 80 | linkcheck: 81 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 82 | @echo 83 | @echo "Link check complete; look for any errors in the above output " \ 84 | "or in $(BUILDDIR)/linkcheck/output.txt." 85 | 86 | doctest: 87 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 88 | @echo "Testing of doctests in the sources finished, look at the " \ 89 | "results in $(BUILDDIR)/doctest/output.txt." 
90 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Python BloomFilter documentation build configuration file, created by 4 | # sphinx-quickstart on Wed Mar 31 16:25:58 2010. 5 | # 6 | # This file is execfile()d with the current directory set to its containing dir. 7 | # 8 | # Note that not all possible configuration values are present in this 9 | # autogenerated file. 10 | # 11 | # All configuration values have a default; values that are commented out 12 | # serve to show the default. 13 | 14 | import sys, os 15 | 16 | # If extensions (or modules to document with autodoc) are in another directory, 17 | # add these directories to sys.path here. If the directory is relative to the 18 | # documentation root, use os.path.abspath to make it absolute, like shown here. 19 | #sys.path.append(os.path.abspath('.')) 20 | 21 | # -- General configuration ----------------------------------------------------- 22 | 23 | # Add any Sphinx extension module names here, as strings. They can be extensions 24 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. 25 | extensions = ['sphinx.ext.doctest', 'sphinx.ext.coverage', 'sphinxtogithub'] 26 | 27 | # Add any paths that contain templates here, relative to this directory. 28 | templates_path = ['_templates'] 29 | 30 | # The suffix of source filenames. 31 | source_suffix = '.txt' 32 | 33 | # The encoding of source files. 34 | #source_encoding = 'utf-8' 35 | 36 | # The master toctree document. 37 | master_doc = 'index' 38 | 39 | # General information about the project. 40 | project = u'Python BloomFilter' 41 | copyright = u'2010-2012, Michael Axiak' 42 | 43 | # The version info for the project you're documenting, acts as replacement for 44 | # |version| and |release|, also used in various other places throughout the 45 | # built documents. 
46 | # 47 | # The short X.Y version. 48 | version = '0.3.2' 49 | # The full version, including alpha/beta/rc tags. 50 | release = '0.3.2' 51 | 52 | # The language for content autogenerated by Sphinx. Refer to documentation 53 | # for a list of supported languages. 54 | #language = None 55 | 56 | # There are two options for replacing |today|: either, you set today to some 57 | # non-false value, then it is used: 58 | #today = '' 59 | # Else, today_fmt is used as the format for a strftime call. 60 | #today_fmt = '%B %d, %Y' 61 | 62 | # List of documents that shouldn't be included in the build. 63 | #unused_docs = [] 64 | 65 | # List of directories, relative to source directory, that shouldn't be searched 66 | # for source files. 67 | exclude_trees = ['_build','html'] 68 | 69 | # The reST default role (used for this markup: `text`) to use for all documents. 70 | #default_role = None 71 | 72 | # If true, '()' will be appended to :func: etc. cross-reference text. 73 | #add_function_parentheses = True 74 | 75 | # If true, the current module name will be prepended to all description 76 | # unit titles (such as .. function::). 77 | #add_module_names = True 78 | 79 | # If true, sectionauthor and moduleauthor directives will be shown in the 80 | # output. They are ignored by default. 81 | #show_authors = False 82 | 83 | # The name of the Pygments (syntax highlighting) style to use. 84 | pygments_style = 'sphinx' 85 | 86 | # A list of ignored prefixes for module index sorting. 87 | #modindex_common_prefix = [] 88 | 89 | 90 | # -- Options for HTML output --------------------------------------------------- 91 | 92 | # The theme to use for HTML and HTML Help pages. Major themes that come with 93 | # Sphinx are currently 'default' and 'sphinxdoc'. 94 | html_theme = 'default' 95 | 96 | # Theme options are theme-specific and customize the look and feel of a theme 97 | # further. For a list of options available for each theme, see the 98 | # documentation. 
99 | #html_theme_options = {} 100 | 101 | # Add any paths that contain custom themes here, relative to this directory. 102 | #html_theme_path = [] 103 | 104 | # The name for this set of Sphinx documents. If None, it defaults to 105 | # " v documentation". 106 | #html_title = None 107 | 108 | # A shorter title for the navigation bar. Default is the same as html_title. 109 | #html_short_title = None 110 | 111 | # The name of an image file (relative to this directory) to place at the top 112 | # of the sidebar. 113 | #html_logo = None 114 | 115 | # The name of an image file (within the static path) to use as favicon of the 116 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 117 | # pixels large. 118 | #html_favicon = None 119 | 120 | # Add any paths that contain custom static files (such as style sheets) here, 121 | # relative to this directory. They are copied after the builtin static files, 122 | # so a file named "default.css" will overwrite the builtin "default.css". 123 | html_static_path = ['_static'] 124 | 125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 126 | # using the given strftime format. 127 | #html_last_updated_fmt = '%b %d, %Y' 128 | 129 | # If true, SmartyPants will be used to convert quotes and dashes to 130 | # typographically correct entities. 131 | #html_use_smartypants = True 132 | 133 | # Custom sidebar templates, maps document names to template names. 134 | #html_sidebars = {} 135 | 136 | # Additional templates that should be rendered to pages, maps page names to 137 | # template names. 138 | #html_additional_pages = {} 139 | 140 | # If false, no module index is generated. 141 | #html_use_modindex = True 142 | 143 | # If false, no index is generated. 144 | #html_use_index = True 145 | 146 | # If true, the index is split into individual pages for each letter. 147 | #html_split_index = False 148 | 149 | # If true, links to the reST sources are added to the pages. 
150 | #html_show_sourcelink = True 151 | 152 | # If true, an OpenSearch description file will be output, and all pages will 153 | # contain a tag referring to it. The value of this option must be the 154 | # base URL from which the finished HTML is served. 155 | #html_use_opensearch = '' 156 | 157 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml"). 158 | #html_file_suffix = '' 159 | 160 | # Output file base name for HTML help builder. 161 | htmlhelp_basename = 'PythonBloomFilterdoc' 162 | 163 | 164 | # -- Options for LaTeX output -------------------------------------------------- 165 | 166 | # The paper size ('letter' or 'a4'). 167 | #latex_paper_size = 'letter' 168 | 169 | # The font size ('10pt', '11pt' or '12pt'). 170 | #latex_font_size = '10pt' 171 | 172 | # Grouping the document tree into LaTeX files. List of tuples 173 | # (source start file, target name, title, author, documentclass [howto/manual]). 174 | latex_documents = [ 175 | ('index', 'PythonBloomFilter.tex', u'Python BloomFilter Documentation', 176 | u'Michael Axiak', 'manual'), 177 | ] 178 | 179 | # The name of an image file (relative to this directory) to place at the top of 180 | # the title page. 181 | #latex_logo = None 182 | 183 | # For "manual" documents, if this is true, then toplevel headings are parts, 184 | # not chapters. 185 | #latex_use_parts = False 186 | 187 | # Additional stuff for the LaTeX preamble. 188 | #latex_preamble = '' 189 | 190 | # Documents to append as an appendix to all manuals. 191 | #latex_appendices = [] 192 | 193 | # If false, no module index is generated. 194 | #latex_use_modindex = True 195 | -------------------------------------------------------------------------------- /docs/index.txt: -------------------------------------------------------------------------------- 1 | .. Python BloomFilter documentation master file, created by 2 | sphinx-quickstart on Wed Mar 31 16:25:58 2010. 
3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to Python BloomFilter's documentation! 7 | ============================================== 8 | 9 | If you are here, you probably don't need to be reminded 10 | about the nature of a Bloom filter. If you need to learn 11 | more, just visit the `wikipedia page `_ 12 | to learn more. This module implements a Bloom filter in python 13 | that's fast and uses mmap files for better scalability. 14 | Did I mention that it's fast? 15 | 16 | Here's a quick example:: 17 | 18 | from pybloomfilter import BloomFilter 19 | 20 | bf = BloomFilter(10000000, 0.01, 'filter.bloom') 21 | 22 | with open("/usr/share/dict/words") as f: 23 | for word in f: 24 | bf.add(word.rstrip()) 25 | 26 | print 'apple' in bf 27 | #outputs True 28 | 29 | That wasn't so hard, was it? Now, there are a lot of other things 30 | we can do. For instance, let's say we want to create a similar 31 | filter with just a few pieces of fruit:: 32 | 33 | fruitbf = bf.copy_template("fruit.bloom") 34 | fruitbf.update(("apple", "banana", "orange", "pear")) 35 | print fruitbf.to_base64() 36 | "eJzt2k13ojAUBuA9f8WFyofF5TWChlTHaPzqrlqFCtj6gQi/frqZM2N7aq3Gis59d2ye85KTRbhk" 37 | "0lyu1NRmsQrgRda0I+wZCfXIaxuWv+jqDxA8vdaf21HIOSn1u6LRE0VL9Z/qghfbBmxZoHsqM3k8" 38 | "N5XyPAxH2p22TJJoqwU9Q0y0dNDYrOHBIa3BwuznapG+KZZq69JUG0zu1tqI5weJKdpGq7PNJ6tB" 39 | "GKmzcGWWy8o0FeNNYNZAQpSdJwajt7eRhJ2YM2NOkTnSsBOCGGKIIYbY2TA663GgWWyWfUwn3oIc" 40 | "fyLYxeQwiF07RqBg9NgHrG5ba3jba5yl4zS2LtEMMcQQQwwxmRiBhPGOJOywIPafYhUwqnTvZOfY" 41 | "Zu40HH/YxDexZojJwsx6ObDcT7D8vVOtJBxiAhD/AjMmjeF2Wnqd+5RrHdo4azPEzoANabiUhh0b" 42 | "xBBDDDHEENsf8twlrizswEjDhnTbzWazbGKpQ5k07E9Ox2iFvXBZ2D9B7DawyqLFu5lshhhiiGUK" 43 | "a4nUloa9yxkwR7XhgPPXYdhRIa77uDtnyvqaIXalGK02ufv3J36GmsnG4lquPnN9gJo1VNxqgYbt" 44 | "ji/EC8s1PWG5fuVizW4Jox6/3o9XxBBDDLFbwcg9v/AwjrPHtTRsX34O01mxLw37bhCTjJk0+PLK" 45 | 
"08HYd4MYYojdKmYnBfjsktEpySY2tGGZzWaIIfYDGB271Yaieaat/AaOkNKb" 46 | 47 | Reference 48 | ------------ 49 | 50 | All of the reference information is available below: 51 | 52 | .. toctree:: 53 | :maxdepth: 2 54 | 55 | ref 56 | 57 | 58 | 59 | Why pybloomfilter 60 | --------------------- 61 | 62 | As I already mentioned, there are a couple reasons to use this 63 | module: 64 | 65 | * It natively uses `mmaped files `_. 66 | * It natively does the set things you want a Bloom filter to do. 67 | * It is Fast (see Benchmarks). 68 | 69 | Benchmarks 70 | --------------------- 71 | 72 | Simple load and add speed 73 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 74 | 75 | I have a simple benchmark in `test/speedtest.py `_ which compares 76 | this module to the good 77 | `pybloom module `_:: 78 | 79 | 80 | (pybloom module) 81 | pybloom load took 0.76436 s/run 82 | pybloom tests took 0.16205 s/run 83 | Errors: 0.25% positive 0.00% negative 84 | 85 | (this module) 86 | pybloomfilter load took 0.05423 s/run 87 | pybloomfilter tests took 0.00659 s/run 88 | Errors: 0.26% positive 0.00% negative 89 | 90 | 91 | In this test we just looked at adding words from a dictionary file, 92 | then testing to see if each word of another file was in the dictionary. 93 | 94 | Serialization 95 | ^^^^^^^^^^^^^^^^^ 96 | 97 | Since this package natively uses mmap files, **no serialization is needed**. 98 | Therefore, if you have to do a lot of moving between disks etc, this 99 | module is an obvious win. 100 | 101 | Install 102 | --------------------- 103 | 104 | You do not need Cython to install from sources, since I keep a cached version 105 | of the c output in the source distribution. Thus, to install you should only 106 | need to run:: 107 | 108 | $ sudo pip install pybloomfiltermmap 109 | 110 | You can also download the latest tar file from the `github tags `_. Once you download it, you should only have to run:: 111 | 112 | $ sudo python setup.py install 113 | 114 | to build and install the module. 
115 | 116 | Develop 117 | ----------------------- 118 | 119 | To develop you will need Cython. The setup.py script should automatically 120 | build from Cython source if the Cython module is available. 121 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | set SPHINXBUILD=sphinx-build 6 | set BUILDDIR=_build 7 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 8 | if NOT "%PAPER%" == "" ( 9 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 10 | ) 11 | 12 | if "%1" == "" goto help 13 | 14 | if "%1" == "help" ( 15 | :help 16 | echo.Please use `make ^` where ^ is one of 17 | echo. html to make standalone HTML files 18 | echo. dirhtml to make HTML files named index.html in directories 19 | echo. pickle to make pickle files 20 | echo. json to make JSON files 21 | echo. htmlhelp to make HTML files and a HTML help project 22 | echo. qthelp to make HTML files and a qthelp project 23 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 24 | echo. changes to make an overview over all changed/added/deprecated items 25 | echo. linkcheck to check all external links for integrity 26 | echo. doctest to run all doctests embedded in the documentation if enabled 27 | goto end 28 | ) 29 | 30 | if "%1" == "clean" ( 31 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 32 | del /q /s %BUILDDIR%\* 33 | goto end 34 | ) 35 | 36 | if "%1" == "html" ( 37 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 38 | echo. 39 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 40 | goto end 41 | ) 42 | 43 | if "%1" == "dirhtml" ( 44 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 45 | echo. 46 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 
47 | goto end 48 | ) 49 | 50 | if "%1" == "pickle" ( 51 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 52 | echo. 53 | echo.Build finished; now you can process the pickle files. 54 | goto end 55 | ) 56 | 57 | if "%1" == "json" ( 58 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 59 | echo. 60 | echo.Build finished; now you can process the JSON files. 61 | goto end 62 | ) 63 | 64 | if "%1" == "htmlhelp" ( 65 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 66 | echo. 67 | echo.Build finished; now you can run HTML Help Workshop with the ^ 68 | .hhp project file in %BUILDDIR%/htmlhelp. 69 | goto end 70 | ) 71 | 72 | if "%1" == "qthelp" ( 73 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 74 | echo. 75 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 76 | .qhcp project file in %BUILDDIR%/qthelp, like this: 77 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\PythonBloomFilter.qhcp 78 | echo.To view the help file: 79 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\PythonBloomFilter.ghc 80 | goto end 81 | ) 82 | 83 | if "%1" == "latex" ( 84 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 85 | echo. 86 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 87 | goto end 88 | ) 89 | 90 | if "%1" == "changes" ( 91 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 92 | echo. 93 | echo.The overview file is in %BUILDDIR%/changes. 94 | goto end 95 | ) 96 | 97 | if "%1" == "linkcheck" ( 98 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 99 | echo. 100 | echo.Link check complete; look for any errors in the above output ^ 101 | or in %BUILDDIR%/linkcheck/output.txt. 102 | goto end 103 | ) 104 | 105 | if "%1" == "doctest" ( 106 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 107 | echo. 108 | echo.Testing of doctests in the sources finished, look at the ^ 109 | results in %BUILDDIR%/doctest/output.txt. 
110 | goto end 111 | ) 112 | 113 | :end 114 | -------------------------------------------------------------------------------- /docs/pushtogh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd $(dirname "$0") 3 | make html 4 | 5 | TEMPFILE=$(tempfile) 6 | rm -f $TEMPFILE 7 | mkdir $TEMPFILE 8 | cp -r ./_build/html/* "$TEMPFILE" 9 | cd ../ 10 | git stash 11 | git checkout gh-pages 12 | rm -rf * 13 | cp -r "$TEMPFILE/"* ./ 14 | git add . 15 | git commit -m "Updated documents build" 16 | git checkout master 17 | git stash pop 18 | -------------------------------------------------------------------------------- /docs/ref.txt: -------------------------------------------------------------------------------- 1 | BloomFilter Class Reference 2 | ============================ 3 | 4 | .. toctree:: 5 | :maxdepth: 2 6 | 7 | .. module:: pybloomfilter 8 | :platform: Unix, Windows 9 | :synopsis: A fast BloomFilter for Python 10 | .. moduleauthor:: Michael Axiak 11 | 12 | 13 | .. class:: BloomFilter(capacity : int, error_rate : float, [filename=None : string], [perm=0755]) 14 | 15 | Create a new BloomFilter object with a given capacity and error_rate. 16 | **Note that we do not check capacity.** This is important, because 17 | I want to be able to support logical OR and AND (see below). 18 | The capacity and error_rate then together serve as a contract---you add 19 | less than capacity items, and the Bloom Filter will have an error rate 20 | less than error_rate. 21 | 22 | Class Methods 23 | ------------- 24 | 25 | .. classmethod:: BloomFilter.open(filename) 26 | 27 | Return a BloomFilter object using an already-existing Bloomfilter file. 28 | 29 | .. classmethod:: BloomFilter.from_base64(filename, string, [perm=0755]) 30 | 31 | Unpack the supplied base64 string (as returned by BloomFilter.to_base64) 32 | into the supplied filename and return a BloomFilter object using that 33 | file. 
34 | 35 | Example:: 36 | 37 | >>> bf = BloomFilter.from_base64("/tmp/mike.bf", 38 | "eJwFwcuWgiAAANC9v+JCx7By0QKt0GHEbKSknflAQ9QmTyRfP/fW5E9XTRSX" 39 | "qcLlqGNXphAqcfVH\nRoNv0n4JlTpIvAP0e1+RyXX6I637ggA+VPZnTYR1A4" 40 | "Um5s9geYaZZLiT208JIiG3iwhf3Fwlzb3Y\n5NRL4uNQS6/d9OvTDJbnZMnR" 41 | "zcrplOX5kmsVIkQziM+vw4hCDQ3OkN9m3WVfPWzGfaTeRftMCLws\nPnzEzs" 42 | "gjAW60xZTBbj/bOAgYbK50PqjdzvgHZ6FHZw==\n") 43 | >>> "MIKE" in bf 44 | True 45 | 46 | Instance Attributes 47 | --------------------- 48 | 49 | .. attribute:: BloomFilter.capacity 50 | 51 | The number of elements for this filter. 52 | 53 | .. attribute:: BloomFilter.error_rate 54 | 55 | The acceptable probability of false positives. 56 | 57 | .. attribute:: BloomFilter.hash_seeds 58 | 59 | The integer seeds used for the random hashing. 60 | 61 | .. attribute:: BloomFilter.name 62 | 63 | The file name (compatible with file objects) 64 | 65 | .. attribute:: BloomFilter.num_bits 66 | 67 | The number of bits used in the filter as buckets 68 | 69 | .. attribute:: BloomFilter.num_hashes 70 | 71 | The number of hash functions used when computing 72 | 73 | 74 | Instance Methods 75 | ------------------- 76 | 77 | .. method:: BloomFilter.add(item) -> Boolean 78 | 79 | Add the item to the bloom filter. 80 | 81 | :param item: Hashable object 82 | :rtype: Boolean (True if item already in the filter) 83 | 84 | .. method:: BloomFilter.clear_all() 85 | 86 | Remove all elements from the bloom filter at once. 87 | 88 | .. method:: BloomFilter.copy(filename) -> BloomFilter 89 | 90 | Copies the current BloomFilter object to another object with 91 | new filename. 92 | 93 | :param filename: string filename 94 | :rtype: new BloomFilter object 95 | 96 | .. method:: BloomFilter.copy_template(filename, [perm=0755]) -> BloomFilter 97 | 98 | Creates a new BloomFilter object with the same *parameters*--same 99 | hash seeds, same size.. everything. 
Once this is performed, the 100 | two filters are *comparable*, so you can perform logical operations on them. 101 | Example:: 102 | 103 | >>> apple = BloomFilter(100, 0.1, '/tmp/apple') 104 | >>> apple.add('apple') 105 | False 106 | >>> pear = apple.copy_template('/tmp/pear') 107 | >>> pear.add('pear') 108 | False 109 | >>> pear |= apple 110 | 111 | .. method:: BloomFilter.sync() 112 | 113 | Forces a sync() call on the underlying mmap file object. Use this if 114 | you are about to copy the file and you want to be Sure (TM) you got 115 | everything correctly. 116 | 117 | .. method:: BloomFilter.to_base64() -> string 118 | 119 | Creates a compressed, base64 encoded version of the Bloom filter. 120 | Since the bloom filter is already stored efficiently in binary on the file system 121 | this may not be too useful. I find it useful for debugging so I can 122 | copy filters from one terminal to another in their entirety. 123 | 124 | :rtype: Base64 encoded string representing filter 125 | 126 | .. method:: BloomFilter.update(iterable) 127 | 128 | Calls add() on all items in the iterable. 129 | 130 | .. method:: BloomFilter.union(filter) -> BloomFilter 131 | 132 | Perform a set OR with another *comparable* filter. 133 | You can (only) construct comparable filters with **copy_template** above. 134 | See the example in copy_template. In that example, pear will have 135 | both "apple" and "pear". 136 | 137 | The result will occur **in place**. That is, calling:: 138 | 139 | bf.union(bf2) 140 | 141 | is a way to add all the elements of bf2 to bf. 142 | 143 | *N.B.: Calling this function will render future calls to len() 144 | invalid.* 145 | 146 | .. method:: BloomFilter.intersection(filter) -> BloomFilter 147 | 148 | The same as union() above except it uses a set AND instead of a 149 | set OR. 150 | 151 | *N.B.: Calling this function will render future calls to len() 152 | invalid.* 153 | 154 | Magic Methods 155 | -------------- 156 | 157 | ..
method:: BloomFilter.__len__() -> Integer 158 | 159 | Returns the number of distinct elements that have been 160 | added to the BloomFilter object, subject to the error 161 | given in error_rate. 162 | 163 | Example:: 164 | 165 | >>> bf = BloomFilter(100, 0.1, '/tmp/fruit.bloom') 166 | >>> bf.add("Apple") 167 | >>> bf.add('Apple') 168 | >>> bf.add('orange') 169 | >>> len(bf) 170 | 2 171 | >>> bf2 = bf.copy_template('/tmp/new.bloom') 172 | >>> bf2 |= bf 173 | >>> len(bf2) 174 | Traceback (most recent call last): 175 | ... 176 | pybloomfilter.IndeterminateCountError: Length of BloomFilter object is unavailable after intersection or union called. 177 | 178 | .. method:: BloomFilter.__contains__(item) -> Boolean 179 | 180 | Check to see if item is contained in the filter, with 181 | an acceptable false positive rate of error_rate (see above). 182 | 183 | .. method:: BloomFilter.__ior__(filter) -> BloomFilter 184 | 185 | See union(filter) 186 | 187 | .. method:: BloomFilter.__iand__(filter) -> BloomFilter 188 | 189 | See intersection(filter) 190 | 191 | Exceptions 192 | -------------- 193 | 194 | .. class:: IndeterminateCountError(message) 195 | 196 | The exception that is raised if len() is called on a BloomFilter 197 | object after |=, &=, intersection(), or union() is used. 
198 | 199 | -------------------------------------------------------------------------------- /fake_pyrex/Pyrex/.svn/all-wcprops: -------------------------------------------------------------------------------- 1 | K 25 2 | svn:wc:ra_dav:version-url 3 | V 47 4 | /svn/!svn/ver/46219/lxml/trunk/fake_pyrex/Pyrex 5 | END 6 | __init__.py 7 | K 25 8 | svn:wc:ra_dav:version-url 9 | V 59 10 | /svn/!svn/ver/46219/lxml/trunk/fake_pyrex/Pyrex/__init__.py 11 | END 12 | -------------------------------------------------------------------------------- /fake_pyrex/Pyrex/.svn/entries: -------------------------------------------------------------------------------- 1 | 10 2 | 3 | dir 4 | 80414 5 | http://codespeak.net/svn/lxml/trunk/fake_pyrex/Pyrex 6 | http://codespeak.net/svn 7 | 8 | 9 | 10 | 2007-08-31T17:01:53.941550Z 11 | 46219 12 | scoder 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | fd0d7bf2-dfb6-0310-8d31-b7ecfe96aada 28 | 29 | Distutils 30 | dir 31 | 32 | __init__.py 33 | file 34 | 35 | 36 | 37 | 38 | 2012-01-04T20:10:47.429233Z 39 | 8213a7cbff30b82637a6a31ea1b8e4c1 40 | 2007-08-31T17:01:53.941550Z 41 | 46219 42 | scoder 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 48 65 | 66 | -------------------------------------------------------------------------------- /fake_pyrex/Pyrex/.svn/text-base/__init__.py.svn-base: -------------------------------------------------------------------------------- 1 | # work around broken setuptools monkey patching 2 | -------------------------------------------------------------------------------- /fake_pyrex/Pyrex/Distutils/.svn/all-wcprops: -------------------------------------------------------------------------------- 1 | K 25 2 | svn:wc:ra_dav:version-url 3 | V 57 4 | /svn/!svn/ver/46219/lxml/trunk/fake_pyrex/Pyrex/Distutils 5 | END 6 | __init__.py 7 | K 25 8 | svn:wc:ra_dav:version-url 9 | V 69 10 | 
/svn/!svn/ver/46219/lxml/trunk/fake_pyrex/Pyrex/Distutils/__init__.py 11 | END 12 | build_ext.py 13 | K 25 14 | svn:wc:ra_dav:version-url 15 | V 70 16 | /svn/!svn/ver/46219/lxml/trunk/fake_pyrex/Pyrex/Distutils/build_ext.py 17 | END 18 | -------------------------------------------------------------------------------- /fake_pyrex/Pyrex/Distutils/.svn/entries: -------------------------------------------------------------------------------- 1 | 10 2 | 3 | dir 4 | 80414 5 | http://codespeak.net/svn/lxml/trunk/fake_pyrex/Pyrex/Distutils 6 | http://codespeak.net/svn 7 | 8 | 9 | 10 | 2007-08-31T17:01:53.941550Z 11 | 46219 12 | scoder 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | fd0d7bf2-dfb6-0310-8d31-b7ecfe96aada 28 | 29 | __init__.py 30 | file 31 | 32 | 33 | 34 | 35 | 2012-01-04T20:10:47.421233Z 36 | 8213a7cbff30b82637a6a31ea1b8e4c1 37 | 2007-08-31T17:01:53.941550Z 38 | 46219 39 | scoder 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 48 62 | 63 | build_ext.py 64 | file 65 | 66 | 67 | 68 | 69 | 2012-01-04T20:10:47.421233Z 70 | 708366fb576674605bad33ea1d8600f4 71 | 2007-08-31T17:01:53.941550Z 72 | 46219 73 | scoder 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 31 96 | 97 | -------------------------------------------------------------------------------- /fake_pyrex/Pyrex/Distutils/.svn/text-base/__init__.py.svn-base: -------------------------------------------------------------------------------- 1 | # work around broken setuptools monkey patching 2 | -------------------------------------------------------------------------------- /fake_pyrex/Pyrex/Distutils/.svn/text-base/build_ext.py.svn-base: -------------------------------------------------------------------------------- 1 | build_ext = "yes, it's there!" 
2 | -------------------------------------------------------------------------------- /fake_pyrex/Pyrex/Distutils/__init__.py: -------------------------------------------------------------------------------- 1 | # work around broken setuptools monkey patching 2 | -------------------------------------------------------------------------------- /fake_pyrex/Pyrex/Distutils/build_ext.py: -------------------------------------------------------------------------------- 1 | build_ext = "yes, it's there!" 2 | -------------------------------------------------------------------------------- /fake_pyrex/Pyrex/__init__.py: -------------------------------------------------------------------------------- 1 | # work around broken setuptools monkey patching 2 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | here = os.path.dirname(__file__) 5 | 6 | ext_files = ["src/mmapbitarray.c", 7 | "src/bloomfilter.c", 8 | "src/md5.c", 9 | "src/primetester.c", 10 | "src/MurmurHash3.c", 11 | ] 12 | 13 | kwargs = {} 14 | 15 | try: 16 | if '--no-cython' in sys.argv: 17 | raise ImportError() 18 | import Cython # noqa 19 | sys.path.insert(0, os.path.join(here, 'fake_pyrex')) 20 | except ImportError: 21 | pass 22 | 23 | 24 | from setuptools import setup, Extension 25 | 26 | try: 27 | if '--no-cython' in sys.argv: 28 | sys.argv.remove('--no-cython') 29 | raise ImportError() 30 | from Cython.Distutils import build_ext 31 | print "info: Building from Cython" 32 | ext_files.append("src/pybloomfilter.pyx") 33 | kwargs['cmdclass'] = {'build_ext': build_ext} 34 | #try: 35 | # os.unlink(os.path.join(here, 'src', 'pybloomfilter.c')) 36 | # os.unlink(os.path.join(here, 'pybloomfilter.so')) 37 | #except: 38 | # pass 39 | except ImportError: 40 | if '--cython' in sys.argv: 41 | raise 42 | ext_files.append("src/pybloomfilter.c") 43 | print "info: 
Building from C" 44 | 45 | if '--cython' in sys.argv: 46 | sys.argv.remove('--cython') 47 | 48 | ext_modules = [Extension("pybloomfilter", 49 | ext_files, 50 | libraries=['crypto'])] 51 | 52 | requirements = [] 53 | 54 | if sys.version_info[0] < 3 and sys.version_info[1] < 7: 55 | requirements.append('importlib') 56 | 57 | setup(name='pybloomfiltermmap', 58 | version="0.3.14", 59 | author="Michael Axiak, Rob Stacey", 60 | author_email="mike@axiak.net", 61 | url="http://github.com/axiak/pybloomfiltermmap/", 62 | description="A Bloom filter (bloomfilter) for Python built on mmap", 63 | license="MIT License", 64 | test_suite='tests.test_all', 65 | install_requires=requirements, 66 | ext_modules=ext_modules, 67 | classifiers=[ 68 | 'Intended Audience :: Developers', 69 | 'License :: OSI Approved :: MIT License', 70 | 'Programming Language :: C', 71 | 'Programming Language :: Cython', 72 | 'Programming Language :: Python', 73 | 'Topic :: Software Development :: Libraries :: Python Modules', 74 | ], 75 | **kwargs) 76 | -------------------------------------------------------------------------------- /src/Makefile: -------------------------------------------------------------------------------- 1 | CC ?= gcc 2 | 3 | bloomfilter: mmapbitarray.* bloomfilter.* 4 | $(CC) $(CFLAGS) $(CPPFLAGS) -O3 mmapbitarray.c md5.c MurmurHash3.cpp bloomfilter.c -o bf -lm $(LDFLAGS) 5 | 6 | mbarray: mmapbitarray.* 7 | $(CC) $(CFLAGS) $(CPPFLAGS) -O3 -DMBAQUERY mmapbitarray.c -o mbaquery -lm $(LDFLAGS) 8 | $(CC) $(CFLAGS) $(CPPFLAGS) -O3 -DMBACREATE mmapbitarray.c -o mbacreate -lm $(LDFLAGS) 9 | -------------------------------------------------------------------------------- /src/MurmurHash3.c: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public 3 | // domain. 
The author hereby disclaims copyright to this source code. 4 | 5 | // Note - The x86 and x64 versions do _not_ produce the same results, as the 6 | // algorithms are optimized for their respective platforms. You can still 7 | // compile and run any of them on any platform, but your performance with the 8 | // non-native version will be less than optimal. 9 | // 10 | // modification in mmh3: 11 | // __attribute__((always_inline)) is replaced to inline by Hajime Senuma 12 | 13 | #include "MurmurHash3.h" 14 | 15 | //----------------------------------------------------------------------------- 16 | // Platform-specific functions and macros 17 | 18 | // Microsoft Visual Studio 19 | 20 | #if defined(_MSC_VER) 21 | 22 | #define FORCE_INLINE __forceinline 23 | 24 | #include 25 | 26 | #define ROTL32(x,y) _rotl(x,y) 27 | #define ROTL64(x,y) _rotl64(x,y) 28 | 29 | #define BIG_CONSTANT(x) (x) 30 | 31 | // Other compilers 32 | 33 | #else // defined(_MSC_VER) 34 | 35 | #define FORCE_INLINE /*__attribute__((always_inline))*/ inline 36 | 37 | inline uint32_t rotl32 ( uint32_t x, int8_t r ) 38 | { 39 | return (x << r) | (x >> (32 - r)); 40 | } 41 | 42 | inline uint64_t rotl64 ( uint64_t x, int8_t r ) 43 | { 44 | return (x << r) | (x >> (64 - r)); 45 | } 46 | 47 | #define ROTL32(x,y) rotl32(x,y) 48 | #define ROTL64(x,y) rotl64(x,y) 49 | 50 | #define BIG_CONSTANT(x) (x##LLU) 51 | 52 | #endif // !defined(_MSC_VER) 53 | 54 | //----------------------------------------------------------------------------- 55 | // Block read - if your platform needs to do endian-swapping or can only 56 | // handle aligned reads, do the conversion here 57 | 58 | FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) 59 | { 60 | return p[i]; 61 | } 62 | 63 | FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) 64 | { 65 | return p[i]; 66 | } 67 | 68 | //----------------------------------------------------------------------------- 69 | // Finalization mix - force all bits of a hash block to 
avalanche 70 | 71 | FORCE_INLINE uint32_t fmix32 ( uint32_t h ) 72 | { 73 | h ^= h >> 16; 74 | h *= 0x85ebca6b; 75 | h ^= h >> 13; 76 | h *= 0xc2b2ae35; 77 | h ^= h >> 16; 78 | 79 | return h; 80 | } 81 | 82 | //---------- 83 | 84 | FORCE_INLINE uint64_t fmix64 ( uint64_t k ) 85 | { 86 | k ^= k >> 33; 87 | k *= BIG_CONSTANT(0xff51afd7ed558ccd); 88 | k ^= k >> 33; 89 | k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); 90 | k ^= k >> 33; 91 | 92 | return k; 93 | } 94 | 95 | 96 | //----------------------------------------------------------------------------- 97 | 98 | void MurmurHash3_x86_32 ( const void * key, int len, 99 | uint32_t seed, void * out ) 100 | { 101 | const uint8_t * data = (const uint8_t*)key; 102 | const int nblocks = len / 4; 103 | 104 | uint32_t h1 = seed; 105 | 106 | uint32_t c1 = 0xcc9e2d51; 107 | uint32_t c2 = 0x1b873593; 108 | 109 | //---------- 110 | // body 111 | 112 | const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); 113 | 114 | int i; 115 | for(i = -nblocks; i; i++) 116 | { 117 | uint32_t k1 = getblock32(blocks,i); 118 | 119 | k1 *= c1; 120 | k1 = ROTL32(k1,15); 121 | k1 *= c2; 122 | 123 | h1 ^= k1; 124 | h1 = ROTL32(h1,13); 125 | h1 = h1*5+0xe6546b64; 126 | } 127 | 128 | //---------- 129 | // tail 130 | 131 | const uint8_t * tail = (const uint8_t*)(data + nblocks*4); 132 | 133 | uint32_t k1 = 0; 134 | 135 | switch(len & 3) 136 | { 137 | case 3: k1 ^= tail[2] << 16; 138 | case 2: k1 ^= tail[1] << 8; 139 | case 1: k1 ^= tail[0]; 140 | k1 *= c1; k1 = ROTL32(k1,16); k1 *= c2; h1 ^= k1; 141 | }; 142 | 143 | //---------- 144 | // finalization 145 | 146 | h1 ^= len; 147 | 148 | h1 = fmix32(h1); 149 | 150 | *(uint32_t*)out = h1; 151 | } 152 | 153 | //----------------------------------------------------------------------------- 154 | 155 | void MurmurHash3_x86_128 ( const void * key, const int len, 156 | uint32_t seed, void * out ) 157 | { 158 | const uint8_t * data = (const uint8_t*)key; 159 | const int nblocks = len / 16; 160 | 161 | uint32_t h1 
= seed; 162 | uint32_t h2 = seed; 163 | uint32_t h3 = seed; 164 | uint32_t h4 = seed; 165 | 166 | uint32_t c1 = 0x239b961b; 167 | uint32_t c2 = 0xab0e9789; 168 | uint32_t c3 = 0x38b34ae5; 169 | uint32_t c4 = 0xa1e38b93; 170 | 171 | //---------- 172 | // body 173 | 174 | const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); 175 | 176 | int i; 177 | for(i = -nblocks; i; i++) 178 | { 179 | uint32_t k1 = getblock32(blocks,i*4+0); 180 | uint32_t k2 = getblock32(blocks,i*4+1); 181 | uint32_t k3 = getblock32(blocks,i*4+2); 182 | uint32_t k4 = getblock32(blocks,i*4+3); 183 | 184 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 185 | 186 | h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; 187 | 188 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 189 | 190 | h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; 191 | 192 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 193 | 194 | h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; 195 | 196 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 197 | 198 | h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; 199 | } 200 | 201 | //---------- 202 | // tail 203 | 204 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 205 | 206 | uint32_t k1 = 0; 207 | uint32_t k2 = 0; 208 | uint32_t k3 = 0; 209 | uint32_t k4 = 0; 210 | 211 | switch(len & 15) 212 | { 213 | case 15: k4 ^= tail[14] << 16; 214 | case 14: k4 ^= tail[13] << 8; 215 | case 13: k4 ^= tail[12] << 0; 216 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 217 | 218 | case 12: k3 ^= tail[11] << 24; 219 | case 11: k3 ^= tail[10] << 16; 220 | case 10: k3 ^= tail[ 9] << 8; 221 | case 9: k3 ^= tail[ 8] << 0; 222 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 223 | 224 | case 8: k2 ^= tail[ 7] << 24; 225 | case 7: k2 ^= tail[ 6] << 16; 226 | case 6: k2 ^= tail[ 5] << 8; 227 | case 5: k2 ^= tail[ 4] << 0; 228 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 229 | 230 | case 4: k1 ^= tail[ 3] << 24; 231 | case 3: k1 ^= tail[ 2] << 16; 232 | 
case 2: k1 ^= tail[ 1] << 8; 233 | case 1: k1 ^= tail[ 0] << 0; 234 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 235 | }; 236 | 237 | //---------- 238 | // finalization 239 | 240 | h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; 241 | 242 | h1 += h2; h1 += h3; h1 += h4; 243 | h2 += h1; h3 += h1; h4 += h1; 244 | 245 | h1 = fmix32(h1); 246 | h2 = fmix32(h2); 247 | h3 = fmix32(h3); 248 | h4 = fmix32(h4); 249 | 250 | h1 += h2; h1 += h3; h1 += h4; 251 | h2 += h1; h3 += h1; h4 += h1; 252 | 253 | ((uint32_t*)out)[0] = h1; 254 | ((uint32_t*)out)[1] = h2; 255 | ((uint32_t*)out)[2] = h3; 256 | ((uint32_t*)out)[3] = h4; 257 | } 258 | 259 | //----------------------------------------------------------------------------- 260 | 261 | void MurmurHash3_x64_128 ( const void * key, const int len, const uint32_t seed, void * out ) 262 | { 263 | const uint8_t * data = (const uint8_t*)key; 264 | const int nblocks = len / 16; 265 | 266 | uint64_t h1 = seed; 267 | uint64_t h2 = seed; 268 | 269 | uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); 270 | uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); 271 | 272 | //---------- 273 | // body 274 | 275 | const uint64_t * blocks = (const uint64_t *)(data); 276 | 277 | int i; 278 | for(i = 0; i < nblocks; i++) 279 | { 280 | uint64_t k1 = getblock64(blocks,i*2+0); 281 | uint64_t k2 = getblock64(blocks,i*2+1); 282 | 283 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 284 | 285 | h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; 286 | 287 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 288 | 289 | h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; 290 | } 291 | 292 | //---------- 293 | // tail 294 | 295 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 296 | 297 | uint64_t k1 = 0; 298 | uint64_t k2 = 0; 299 | 300 | switch(len & 15) 301 | { 302 | case 15: k2 ^= (uint64_t)(tail[14]) << 48; 303 | case 14: k2 ^= (uint64_t)(tail[13]) << 40; 304 | case 13: k2 ^= (uint64_t)(tail[12]) << 32; 305 | case 12: k2 ^= (uint64_t)(tail[11]) 
<< 24; 306 | case 11: k2 ^= (uint64_t)(tail[10]) << 16; 307 | case 10: k2 ^= (uint64_t)(tail[ 9]) << 8; 308 | case 9: k2 ^= (uint64_t)(tail[ 8]) << 0; 309 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 310 | 311 | case 8: k1 ^= (uint64_t)(tail[ 7]) << 56; 312 | case 7: k1 ^= (uint64_t)(tail[ 6]) << 48; 313 | case 6: k1 ^= (uint64_t)(tail[ 5]) << 40; 314 | case 5: k1 ^= (uint64_t)(tail[ 4]) << 32; 315 | case 4: k1 ^= (uint64_t)(tail[ 3]) << 24; 316 | case 3: k1 ^= (uint64_t)(tail[ 2]) << 16; 317 | case 2: k1 ^= (uint64_t)(tail[ 1]) << 8; 318 | case 1: k1 ^= (uint64_t)(tail[ 0]) << 0; 319 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 320 | }; 321 | 322 | //---------- 323 | // finalization 324 | 325 | h1 ^= len; h2 ^= len; 326 | 327 | h1 += h2; 328 | h2 += h1; 329 | 330 | h1 = fmix64(h1); 331 | h2 = fmix64(h2); 332 | 333 | h1 += h2; 334 | h2 += h1; 335 | 336 | ((uint64_t*)out)[0] = h1; 337 | ((uint64_t*)out)[1] = h2; 338 | } 339 | 340 | //----------------------------------------------------------------------------- 341 | -------------------------------------------------------------------------------- /src/MurmurHash3.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public 3 | // domain. The author hereby disclaims copyright to this source code. 
4 | 5 | #ifndef _MURMURHASH3_H_ 6 | #define _MURMURHASH3_H_ 7 | 8 | //----------------------------------------------------------------------------- 9 | // Platform-specific functions and macros 10 | 11 | // Microsoft Visual Studio 12 | 13 | #if defined(_MSC_VER) 14 | 15 | typedef unsigned char uint8_t; 16 | typedef unsigned long uint32_t; 17 | typedef unsigned __int64 uint64_t; 18 | 19 | // Other compilers 20 | 21 | #else // defined(_MSC_VER) 22 | 23 | #include 24 | 25 | #endif // !defined(_MSC_VER) 26 | 27 | //----------------------------------------------------------------------------- 28 | 29 | void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); 30 | 31 | void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); 32 | 33 | void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); 34 | 35 | 36 | //----------------------------------------------------------------------------- 37 | 38 | #endif // _MURMURHASH3_H_ 39 | -------------------------------------------------------------------------------- /src/bloomfilter.c: -------------------------------------------------------------------------------- 1 | #ifndef __BLOOMFILTER_C 2 | #define __BLOOMFILTER_C 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include "md5.h" 8 | 9 | #include "bloomfilter.h" 10 | 11 | BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate, 12 | BTYPE num_bits, int *hash_seeds, int num_hashes) 13 | { 14 | BloomFilter * bf = (BloomFilter *)malloc(sizeof(BloomFilter)); 15 | MBArray * array; 16 | 17 | if (!bf) { 18 | return NULL; 19 | } 20 | 21 | bf->max_num_elem = max_num_elem; 22 | bf->error_rate = error_rate; 23 | bf->num_hashes = num_hashes; 24 | bf->count_correct = 1; 25 | bf->bf_version = BF_CURRENT_VERSION; 26 | bf->elem_count = 0; 27 | bf->array = NULL; 28 | memset(bf->reserved, 0, sizeof(uint32_t) * 32); 29 | memset(bf->hash_seeds, 0, sizeof(uint32_t) * 256); 30 | 
memcpy(bf->hash_seeds, hash_seeds, sizeof(uint32_t) * num_hashes); 31 | array = mbarray_Create_Malloc(num_bits); 32 | if (!array) { 33 | bloomfilter_Destroy(bf); 34 | return NULL; 35 | } 36 | 37 | bf->array = array; 38 | 39 | return bf; 40 | } 41 | 42 | BloomFilter *bloomfilter_Create_Mmap(size_t max_num_elem, double error_rate, 43 | const char * file, BTYPE num_bits, int oflags, int perms, 44 | int *hash_seeds, int num_hashes) 45 | { 46 | BloomFilter * bf = (BloomFilter *)malloc(sizeof(BloomFilter)); 47 | MBArray * array; 48 | 49 | if (!bf) { 50 | return NULL; 51 | } 52 | 53 | bf->max_num_elem = max_num_elem; 54 | bf->error_rate = error_rate; 55 | bf->num_hashes = num_hashes; 56 | bf->count_correct = 1; 57 | bf->bf_version = BF_CURRENT_VERSION; 58 | bf->elem_count = 0; 59 | bf->array = NULL; 60 | memset(bf->reserved, 0, sizeof(uint32_t) * 32); 61 | memset(bf->hash_seeds, 0, sizeof(uint32_t) * 256); 62 | memcpy(bf->hash_seeds, hash_seeds, sizeof(uint32_t) * num_hashes); 63 | array = mbarray_Create_Mmap(num_bits, file, (char *)bf, sizeof(BloomFilter), oflags, perms); 64 | if (!array) { 65 | bloomfilter_Destroy(bf); 66 | return NULL; 67 | } 68 | 69 | /* After we create the new array object, this array may already 70 | have all of the bloom filter data from the file in the 71 | header info. 72 | By calling mbarray_Header, we copy that header data 73 | back into this BloomFilter object. 
74 | */ 75 | if (mbarray_Header((char *)bf, array, sizeof(BloomFilter)) == NULL) { 76 | bloomfilter_Destroy(bf); 77 | mbarray_Destroy(array); 78 | return NULL; 79 | } 80 | 81 | /* Since we just initialized from a file, we have to 82 | fix our pointers */ 83 | bf->array = array; 84 | 85 | return bf; 86 | } 87 | 88 | 89 | void bloomfilter_Destroy(BloomFilter * bf) 90 | { 91 | if (bf) { 92 | if (bf->array) { 93 | mbarray_Destroy(bf->array); 94 | bf->array = NULL; 95 | } 96 | free(bf); 97 | } 98 | } 99 | 100 | 101 | void bloomfilter_Print(BloomFilter * bf) 102 | { 103 | printf("\n", 104 | (unsigned long)bf->max_num_elem, bf->error_rate, bf->num_hashes); 105 | } 106 | 107 | int bloomfilter_Update(BloomFilter * bf, char * data, int size) 108 | { 109 | MBArray * array = bf->array; 110 | int retval = mbarray_Update(bf->array, data, size); 111 | if (retval) { 112 | return retval; 113 | } 114 | if (mbarray_Header((char *)bf, array, sizeof(BloomFilter)) == NULL) { 115 | return 1; 116 | } 117 | bf->array = array; 118 | return 0; 119 | } 120 | 121 | 122 | BloomFilter * bloomfilter_Copy_Template(BloomFilter * src, char * filename, int perms) 123 | { 124 | BloomFilter * bf = (BloomFilter *)malloc(sizeof(BloomFilter)); 125 | MBArray * array; 126 | 127 | if (bf == NULL) { 128 | return NULL; 129 | } 130 | 131 | array = mbarray_Copy_Template(src->array, filename, perms); 132 | if (array == NULL) { 133 | free(bf); 134 | return NULL; 135 | } 136 | 137 | if (mbarray_Header((char *)bf, array, sizeof(BloomFilter)) == NULL) { 138 | bloomfilter_Destroy(bf); 139 | mbarray_Destroy(array); 140 | return NULL; 141 | } 142 | 143 | bf->array = array; 144 | return bf; 145 | } 146 | 147 | 148 | BTYPE _hash_long(uint32_t hash_seed, Key * key) { 149 | Key newKey = { 150 | .shash = (char *)&key->nhash, 151 | .nhash = sizeof(key->nhash) 152 | }; 153 | 154 | return _hash_char(hash_seed, &newKey); 155 | } 156 | 157 | /* 158 | CODE TO USE SHA512.. 
159 | #include 160 | 161 | uint32_t _hash_char(uint32_t hash_seed, Key * key) { 162 | EVP_MD_CTX ctx; 163 | unsigned char result_buffer[64]; 164 | 165 | EVP_MD_CTX_init(&ctx); 166 | 167 | EVP_DigestInit_ex(&ctx, EVP_sha512(), NULL); 168 | EVP_DigestUpdate(&ctx, (const unsigned char *)&hash_seed, sizeof(hash_seed)); 169 | EVP_DigestUpdate(&ctx, (const unsigned char *)key->shash, key->nhash); 170 | EVP_DigestFinal_ex(&ctx, (unsigned char *)&result_buffer, NULL); 171 | EVP_MD_CTX_cleanup(&ctx); 172 | return *(uint32_t *)result_buffer; 173 | } 174 | */ 175 | 176 | /* Code for MurmurHash3 */ 177 | #include "MurmurHash3.h" 178 | BTYPE _hash_char(uint32_t hash_seed, Key * key) { 179 | BTYPE hashed_pieces[2]; 180 | MurmurHash3_x64_128((const void *)key->shash, (int)key->nhash, 181 | hash_seed, &hashed_pieces); 182 | return hashed_pieces[0] ^ hashed_pieces[1]; 183 | } 184 | 185 | 186 | #if 0 187 | int main(int argc, char **argv) 188 | { 189 | int hash_seeds[5] = { 4234 , 2123, 4434, 444, 12123}; 190 | BloomFilter *bf = bloomfilter_Create(100000, 0.4, 191 | "/tmp/bf2", 10000000, O_RDWR, 0, 192 | hash_seeds, 5); 193 | 194 | Key key; 195 | char line[255]; 196 | key.shash = line; 197 | 198 | if (!bf) 199 | goto error; 200 | 201 | bloomfilter_Print(bf); 202 | 203 | while (fgets(line, 255, stdin)) { 204 | line[strlen(line) - 1] = '\0'; 205 | key.nhash = strlen(line); 206 | 207 | /*if (bloomfilter_Add(bf, &key)) { 208 | goto error; 209 | }*/ 210 | if (bloomfilter_Test(bf, &key)) { 211 | printf("Found '%s'!\n", line); 212 | } 213 | } 214 | bloomfilter_Destroy(bf); 215 | return 0; 216 | 217 | error: 218 | fprintf(stderr, "ERROR: %s [%d]\n", strerror(errno), errno); 219 | return 255; 220 | } 221 | #endif 222 | #endif 223 | -------------------------------------------------------------------------------- /src/bloomfilter.h: -------------------------------------------------------------------------------- 1 | #ifndef __BLOOMFILTER_H 2 | #define __BLOOMFILTER_H 1 3 | 4 | #include 5 | 6 | 
#include "mmapbitarray.h" 7 | #define BF_CURRENT_VERSION 1 8 | 9 | struct _BloomFilter { 10 | uint64_t max_num_elem; 11 | double error_rate; 12 | uint32_t num_hashes; 13 | uint32_t hash_seeds[256]; 14 | /* All of the bit data is already in here. */ 15 | MBArray * array; 16 | unsigned char bf_version; 17 | unsigned char count_correct; 18 | uint64_t elem_count; 19 | uint32_t reserved[32]; 20 | }; 21 | 22 | typedef struct { 23 | uint64_t nhash; 24 | char * shash; 25 | } Key; 26 | 27 | typedef struct _BloomFilter BloomFilter; 28 | 29 | /* Create a bloom filter without a memory-mapped file backing it */ 30 | BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate, 31 | BTYPE num_bits, int *hash_seeds, int num_hashes); 32 | 33 | /* Create a bloom filter with a memory-mapped file backing it */ 34 | BloomFilter *bloomfilter_Create_Mmap(size_t max_num_elem, double error_rate, 35 | const char * file, BTYPE num_bits, int oflags, int perms, 36 | int *hash_seeds, int num_hashes); 37 | 38 | void bloomfilter_Destroy(BloomFilter * bf); 39 | 40 | int bloomfilter_Update(BloomFilter * bf, char * data, int size); 41 | 42 | BloomFilter * bloomfilter_Copy_Template(BloomFilter * src, char * filename, int perms); 43 | 44 | /* A lot of this is inlined.. 
*/ 45 | BTYPE _hash_char(uint32_t hash_seed, Key * key); 46 | 47 | BTYPE _hash_long(uint32_t hash_seed, Key * key); 48 | 49 | 50 | static inline int bloomfilter_Add(BloomFilter * bf, Key * key) 51 | { 52 | BTYPE (*hashfunc)(uint32_t, Key *) = _hash_char; 53 | register BTYPE mod = bf->array->bits; 54 | register int i; 55 | register int result = 1; 56 | register BTYPE hash_res; 57 | 58 | if (key->shash == NULL) 59 | hashfunc = _hash_long; 60 | 61 | for (i = bf->num_hashes - 1; i >= 0; --i) { 62 | hash_res = (*hashfunc)(bf->hash_seeds[i], key) % mod; 63 | if (result && !mbarray_Test(bf->array, hash_res)) { 64 | result = 0; 65 | } 66 | if (mbarray_Set(bf->array, hash_res)) { 67 | return 2; 68 | } 69 | } 70 | if (!result && bf->count_correct) { 71 | bf->elem_count ++; 72 | } 73 | return result; 74 | } 75 | __attribute__((always_inline)) 76 | 77 | 78 | static inline int bloomfilter_Test(BloomFilter * bf, Key * key) 79 | { 80 | register BTYPE mod = bf->array->bits; 81 | register BTYPE (*hashfunc)(uint32_t, Key *) = _hash_char; 82 | register int i; 83 | 84 | if (key->shash == NULL) 85 | hashfunc = _hash_long; 86 | 87 | for (i = bf->num_hashes - 1; i >= 0; --i) { 88 | if (!mbarray_Test(bf->array, (*hashfunc)(bf->hash_seeds[i], key) % mod)) { 89 | return 0; 90 | } 91 | } 92 | return 1; 93 | } 94 | __attribute__((always_inline)) 95 | 96 | 97 | 98 | 99 | #endif 100 | -------------------------------------------------------------------------------- /src/cbloomfilter.pxd: -------------------------------------------------------------------------------- 1 | 2 | cdef extern from "primetester.h": 3 | long next_prime(long prime) 4 | 5 | cdef extern from "mmapbitarray.h": 6 | ctypedef struct MBArray: 7 | long bits 8 | long size 9 | char * filename 10 | int fd 11 | 12 | MBArray * mbarray_ClearAll(MBArray * array) 13 | MBArray * mbarray_Sync(MBArray * array) 14 | MBArray * mbarray_And(MBArray * dest, MBArray * src) 15 | MBArray * mbarray_Or(MBArray * dest, MBArray * src) 16 | MBArray * 
mbarray_Xor(MBArray * dest, MBArray * src) 17 | MBArray * mbarray_And_Ternary(MBArray * dest, MBArray * a, MBArray * b) 18 | MBArray * mbarray_Or_Ternary(MBArray * dest, MBArray * a, MBArray * b) 19 | MBArray * mbarray_Xor_Ternary(MBArray * dest, MBArray * a, MBArray * b) 20 | int mbarray_Update(MBArray * array, char * data, int size) 21 | int mbarray_FileSize(MBArray * array) 22 | char * mbarray_CharData(MBArray * array) 23 | 24 | 25 | cdef extern from "bloomfilter.h": 26 | ctypedef struct BloomFilter: 27 | long max_num_elem 28 | double error_rate 29 | int num_hashes 30 | long * hash_seeds 31 | MBArray * array 32 | unsigned char bf_version 33 | unsigned char count_correct 34 | unsigned long long elem_count 35 | 36 | ctypedef struct Key: 37 | long nhash 38 | char * shash 39 | 40 | BloomFilter * bloomfilter_Create_Mmap(long max_num_elem, 41 | double error_rate, 42 | char * fname, long num_bits, 43 | int oflags, int perms, 44 | int * hash_seeds, int num_hashes) 45 | BloomFilter * bloomfilter_Create_Malloc(long max_num_elem, 46 | double error_rate, 47 | long num_bits, 48 | int * hash_seeds, int num_hashes) 49 | void bloomfilter_Destroy(BloomFilter * bf) 50 | int bloomfilter_Add(BloomFilter * bf, Key * key) 51 | int bloomfilter_Test(BloomFilter * bf, Key * key) 52 | int bloomfilter_Update(BloomFilter * bf, char * data, int size) 53 | BloomFilter * bloomfilter_Copy_Template(BloomFilter * src, char * filename, int perms) 54 | -------------------------------------------------------------------------------- /src/md5.c: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 1999, 2000, 2002 Aladdin Enterprises. All rights reserved. 3 | 4 | This software is provided 'as-is', without any express or implied 5 | warranty. In no event will the authors be held liable for any damages 6 | arising from the use of this software. 
7 | 8 | Permission is granted to anyone to use this software for any purpose, 9 | including commercial applications, and to alter it and redistribute it 10 | freely, subject to the following restrictions: 11 | 12 | 1. The origin of this software must not be misrepresented; you must not 13 | claim that you wrote the original software. If you use this software 14 | in a product, an acknowledgment in the product documentation would be 15 | appreciated but is not required. 16 | 2. Altered source versions must be plainly marked as such, and must not be 17 | misrepresented as being the original software. 18 | 3. This notice may not be removed or altered from any source distribution. 19 | 20 | L. Peter Deutsch 21 | ghost@aladdin.com 22 | 23 | */ 24 | /* $Id: md5.c,v 1.6 2002/04/13 19:20:28 lpd Exp $ */ 25 | /* 26 | Independent implementation of MD5 (RFC 1321). 27 | 28 | This code implements the MD5 Algorithm defined in RFC 1321, whose 29 | text is available at 30 | http://www.ietf.org/rfc/rfc1321.txt 31 | The code is derived from the text of the RFC, including the test suite 32 | (section A.5) but excluding the rest of Appendix A. It does not include 33 | any code or documentation that is identified in the RFC as being 34 | copyrighted. 35 | 36 | The original and principal author of md5.c is L. Peter Deutsch 37 | . Other authors are noted in the change history 38 | that follows (in reverse chronological order): 39 | 40 | 2002-04-13 lpd Clarified derivation from RFC 1321; now handles byte order 41 | either statically or dynamically; added missing #include 42 | in library. 43 | 2002-03-11 lpd Corrected argument list for main(), and added int return 44 | type, in test program and T value program. 45 | 2002-02-21 lpd Added missing #include in test program. 46 | 2000-07-03 lpd Patched to eliminate warnings about "constant is 47 | unsigned in ANSI C, signed in traditional"; made test program 48 | self-checking. 
49 | 1999-11-04 lpd Edited comments slightly for automatic TOC extraction. 50 | 1999-10-18 lpd Fixed typo in header comment (ansi2knr rather than md5). 51 | 1999-05-03 lpd Original version. 52 | */ 53 | 54 | #include "md5.h" 55 | #include 56 | #include 57 | 58 | #undef BYTE_ORDER /* 1 = big-endian, -1 = little-endian, 0 = unknown */ 59 | #ifdef ARCH_IS_BIG_ENDIAN 60 | # define BYTE_ORDER (ARCH_IS_BIG_ENDIAN ? 1 : -1) 61 | #else 62 | # define BYTE_ORDER 0 63 | #endif 64 | 65 | #define T_MASK ((md5_word_t)~0) 66 | #define T1 /* 0xd76aa478 */ (T_MASK ^ 0x28955b87) 67 | #define T2 /* 0xe8c7b756 */ (T_MASK ^ 0x173848a9) 68 | #define T3 0x242070db 69 | #define T4 /* 0xc1bdceee */ (T_MASK ^ 0x3e423111) 70 | #define T5 /* 0xf57c0faf */ (T_MASK ^ 0x0a83f050) 71 | #define T6 0x4787c62a 72 | #define T7 /* 0xa8304613 */ (T_MASK ^ 0x57cfb9ec) 73 | #define T8 /* 0xfd469501 */ (T_MASK ^ 0x02b96afe) 74 | #define T9 0x698098d8 75 | #define T10 /* 0x8b44f7af */ (T_MASK ^ 0x74bb0850) 76 | #define T11 /* 0xffff5bb1 */ (T_MASK ^ 0x0000a44e) 77 | #define T12 /* 0x895cd7be */ (T_MASK ^ 0x76a32841) 78 | #define T13 0x6b901122 79 | #define T14 /* 0xfd987193 */ (T_MASK ^ 0x02678e6c) 80 | #define T15 /* 0xa679438e */ (T_MASK ^ 0x5986bc71) 81 | #define T16 0x49b40821 82 | #define T17 /* 0xf61e2562 */ (T_MASK ^ 0x09e1da9d) 83 | #define T18 /* 0xc040b340 */ (T_MASK ^ 0x3fbf4cbf) 84 | #define T19 0x265e5a51 85 | #define T20 /* 0xe9b6c7aa */ (T_MASK ^ 0x16493855) 86 | #define T21 /* 0xd62f105d */ (T_MASK ^ 0x29d0efa2) 87 | #define T22 0x02441453 88 | #define T23 /* 0xd8a1e681 */ (T_MASK ^ 0x275e197e) 89 | #define T24 /* 0xe7d3fbc8 */ (T_MASK ^ 0x182c0437) 90 | #define T25 0x21e1cde6 91 | #define T26 /* 0xc33707d6 */ (T_MASK ^ 0x3cc8f829) 92 | #define T27 /* 0xf4d50d87 */ (T_MASK ^ 0x0b2af278) 93 | #define T28 0x455a14ed 94 | #define T29 /* 0xa9e3e905 */ (T_MASK ^ 0x561c16fa) 95 | #define T30 /* 0xfcefa3f8 */ (T_MASK ^ 0x03105c07) 96 | #define T31 0x676f02d9 97 | #define T32 /* 0x8d2a4c8a */ 
(T_MASK ^ 0x72d5b375) 98 | #define T33 /* 0xfffa3942 */ (T_MASK ^ 0x0005c6bd) 99 | #define T34 /* 0x8771f681 */ (T_MASK ^ 0x788e097e) 100 | #define T35 0x6d9d6122 101 | #define T36 /* 0xfde5380c */ (T_MASK ^ 0x021ac7f3) 102 | #define T37 /* 0xa4beea44 */ (T_MASK ^ 0x5b4115bb) 103 | #define T38 0x4bdecfa9 104 | #define T39 /* 0xf6bb4b60 */ (T_MASK ^ 0x0944b49f) 105 | #define T40 /* 0xbebfbc70 */ (T_MASK ^ 0x4140438f) 106 | #define T41 0x289b7ec6 107 | #define T42 /* 0xeaa127fa */ (T_MASK ^ 0x155ed805) 108 | #define T43 /* 0xd4ef3085 */ (T_MASK ^ 0x2b10cf7a) 109 | #define T44 0x04881d05 110 | #define T45 /* 0xd9d4d039 */ (T_MASK ^ 0x262b2fc6) 111 | #define T46 /* 0xe6db99e5 */ (T_MASK ^ 0x1924661a) 112 | #define T47 0x1fa27cf8 113 | #define T48 /* 0xc4ac5665 */ (T_MASK ^ 0x3b53a99a) 114 | #define T49 /* 0xf4292244 */ (T_MASK ^ 0x0bd6ddbb) 115 | #define T50 0x432aff97 116 | #define T51 /* 0xab9423a7 */ (T_MASK ^ 0x546bdc58) 117 | #define T52 /* 0xfc93a039 */ (T_MASK ^ 0x036c5fc6) 118 | #define T53 0x655b59c3 119 | #define T54 /* 0x8f0ccc92 */ (T_MASK ^ 0x70f3336d) 120 | #define T55 /* 0xffeff47d */ (T_MASK ^ 0x00100b82) 121 | #define T56 /* 0x85845dd1 */ (T_MASK ^ 0x7a7ba22e) 122 | #define T57 0x6fa87e4f 123 | #define T58 /* 0xfe2ce6e0 */ (T_MASK ^ 0x01d3191f) 124 | #define T59 /* 0xa3014314 */ (T_MASK ^ 0x5cfebceb) 125 | #define T60 0x4e0811a1 126 | #define T61 /* 0xf7537e82 */ (T_MASK ^ 0x08ac817d) 127 | #define T62 /* 0xbd3af235 */ (T_MASK ^ 0x42c50dca) 128 | #define T63 0x2ad7d2bb 129 | #define T64 /* 0xeb86d391 */ (T_MASK ^ 0x14792c6e) 130 | 131 | 132 | static void 133 | md5_process(md5_state_t *pms, const md5_byte_t *data /*[64]*/) 134 | { 135 | md5_word_t 136 | a = pms->abcd[0], b = pms->abcd[1], 137 | c = pms->abcd[2], d = pms->abcd[3]; 138 | md5_word_t t; 139 | #if BYTE_ORDER > 0 140 | /* Define storage only for big-endian CPUs. */ 141 | md5_word_t X[16]; 142 | #else 143 | /* Define storage for little-endian or both types of CPUs. 
*/ 144 | md5_word_t xbuf[16]; 145 | const md5_word_t *X; 146 | #endif 147 | 148 | { 149 | #if BYTE_ORDER == 0 150 | /* 151 | * Determine dynamically whether this is a big-endian or 152 | * little-endian machine, since we can use a more efficient 153 | * algorithm on the latter. 154 | */ 155 | static const int w = 1; 156 | 157 | if (*((const md5_byte_t *)&w)) /* dynamic little-endian */ 158 | #endif 159 | #if BYTE_ORDER <= 0 /* little-endian */ 160 | { 161 | /* 162 | * On little-endian machines, we can process properly aligned 163 | * data without copying it. 164 | */ 165 | if (!((data - (const md5_byte_t *)0) & 3)) { 166 | /* data are properly aligned */ 167 | X = (const md5_word_t *)data; 168 | } else { 169 | /* not aligned */ 170 | memcpy(xbuf, data, 64); 171 | X = xbuf; 172 | } 173 | } 174 | #endif 175 | #if BYTE_ORDER == 0 176 | else /* dynamic big-endian */ 177 | #endif 178 | #if BYTE_ORDER >= 0 /* big-endian */ 179 | { 180 | /* 181 | * On big-endian machines, we must arrange the bytes in the 182 | * right order. 183 | */ 184 | const md5_byte_t *xp = data; 185 | int i; 186 | 187 | # if BYTE_ORDER == 0 188 | X = xbuf; /* (dynamic only) */ 189 | # else 190 | # define xbuf X /* (static only) */ 191 | # endif 192 | for (i = 0; i < 16; ++i, xp += 4) 193 | xbuf[i] = xp[0] + (xp[1] << 8) + (xp[2] << 16) + (xp[3] << 24); 194 | } 195 | #endif 196 | } 197 | 198 | #define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) 199 | 200 | /* Round 1. */ 201 | /* Let [abcd k s i] denote the operation 202 | a = b + ((a + F(b,c,d) + X[k] + T[i]) <<< s). */ 203 | #define F(x, y, z) (((x) & (y)) | (~(x) & (z))) 204 | #define SET(a, b, c, d, k, s, Ti)\ 205 | t = a + F(b,c,d) + X[k] + Ti;\ 206 | a = ROTATE_LEFT(t, s) + b 207 | /* Do the following 16 operations. 
*/ 208 | SET(a, b, c, d, 0, 7, T1); 209 | SET(d, a, b, c, 1, 12, T2); 210 | SET(c, d, a, b, 2, 17, T3); 211 | SET(b, c, d, a, 3, 22, T4); 212 | SET(a, b, c, d, 4, 7, T5); 213 | SET(d, a, b, c, 5, 12, T6); 214 | SET(c, d, a, b, 6, 17, T7); 215 | SET(b, c, d, a, 7, 22, T8); 216 | SET(a, b, c, d, 8, 7, T9); 217 | SET(d, a, b, c, 9, 12, T10); 218 | SET(c, d, a, b, 10, 17, T11); 219 | SET(b, c, d, a, 11, 22, T12); 220 | SET(a, b, c, d, 12, 7, T13); 221 | SET(d, a, b, c, 13, 12, T14); 222 | SET(c, d, a, b, 14, 17, T15); 223 | SET(b, c, d, a, 15, 22, T16); 224 | #undef SET 225 | 226 | /* Round 2. */ 227 | /* Let [abcd k s i] denote the operation 228 | a = b + ((a + G(b,c,d) + X[k] + T[i]) <<< s). */ 229 | #define G(x, y, z) (((x) & (z)) | ((y) & ~(z))) 230 | #define SET(a, b, c, d, k, s, Ti)\ 231 | t = a + G(b,c,d) + X[k] + Ti;\ 232 | a = ROTATE_LEFT(t, s) + b 233 | /* Do the following 16 operations. */ 234 | SET(a, b, c, d, 1, 5, T17); 235 | SET(d, a, b, c, 6, 9, T18); 236 | SET(c, d, a, b, 11, 14, T19); 237 | SET(b, c, d, a, 0, 20, T20); 238 | SET(a, b, c, d, 5, 5, T21); 239 | SET(d, a, b, c, 10, 9, T22); 240 | SET(c, d, a, b, 15, 14, T23); 241 | SET(b, c, d, a, 4, 20, T24); 242 | SET(a, b, c, d, 9, 5, T25); 243 | SET(d, a, b, c, 14, 9, T26); 244 | SET(c, d, a, b, 3, 14, T27); 245 | SET(b, c, d, a, 8, 20, T28); 246 | SET(a, b, c, d, 13, 5, T29); 247 | SET(d, a, b, c, 2, 9, T30); 248 | SET(c, d, a, b, 7, 14, T31); 249 | SET(b, c, d, a, 12, 20, T32); 250 | #undef SET 251 | 252 | /* Round 3. */ 253 | /* Let [abcd k s t] denote the operation 254 | a = b + ((a + H(b,c,d) + X[k] + T[i]) <<< s). */ 255 | #define H(x, y, z) ((x) ^ (y) ^ (z)) 256 | #define SET(a, b, c, d, k, s, Ti)\ 257 | t = a + H(b,c,d) + X[k] + Ti;\ 258 | a = ROTATE_LEFT(t, s) + b 259 | /* Do the following 16 operations. 
*/ 260 | SET(a, b, c, d, 5, 4, T33); 261 | SET(d, a, b, c, 8, 11, T34); 262 | SET(c, d, a, b, 11, 16, T35); 263 | SET(b, c, d, a, 14, 23, T36); 264 | SET(a, b, c, d, 1, 4, T37); 265 | SET(d, a, b, c, 4, 11, T38); 266 | SET(c, d, a, b, 7, 16, T39); 267 | SET(b, c, d, a, 10, 23, T40); 268 | SET(a, b, c, d, 13, 4, T41); 269 | SET(d, a, b, c, 0, 11, T42); 270 | SET(c, d, a, b, 3, 16, T43); 271 | SET(b, c, d, a, 6, 23, T44); 272 | SET(a, b, c, d, 9, 4, T45); 273 | SET(d, a, b, c, 12, 11, T46); 274 | SET(c, d, a, b, 15, 16, T47); 275 | SET(b, c, d, a, 2, 23, T48); 276 | #undef SET 277 | 278 | /* Round 4. */ 279 | /* Let [abcd k s t] denote the operation 280 | a = b + ((a + I(b,c,d) + X[k] + T[i]) <<< s). */ 281 | #define I(x, y, z) ((y) ^ ((x) | ~(z))) 282 | #define SET(a, b, c, d, k, s, Ti)\ 283 | t = a + I(b,c,d) + X[k] + Ti;\ 284 | a = ROTATE_LEFT(t, s) + b 285 | /* Do the following 16 operations. */ 286 | SET(a, b, c, d, 0, 6, T49); 287 | SET(d, a, b, c, 7, 10, T50); 288 | SET(c, d, a, b, 14, 15, T51); 289 | SET(b, c, d, a, 5, 21, T52); 290 | SET(a, b, c, d, 12, 6, T53); 291 | SET(d, a, b, c, 3, 10, T54); 292 | SET(c, d, a, b, 10, 15, T55); 293 | SET(b, c, d, a, 1, 21, T56); 294 | SET(a, b, c, d, 8, 6, T57); 295 | SET(d, a, b, c, 15, 10, T58); 296 | SET(c, d, a, b, 6, 15, T59); 297 | SET(b, c, d, a, 13, 21, T60); 298 | SET(a, b, c, d, 4, 6, T61); 299 | SET(d, a, b, c, 11, 10, T62); 300 | SET(c, d, a, b, 2, 15, T63); 301 | SET(b, c, d, a, 9, 21, T64); 302 | #undef SET 303 | 304 | /* Then perform the following additions. (That is increment each 305 | of the four registers by the value it had before this block 306 | was started.) 
*/ 307 | pms->abcd[0] += a; 308 | pms->abcd[1] += b; 309 | pms->abcd[2] += c; 310 | pms->abcd[3] += d; 311 | } 312 | 313 | void 314 | md5_init(md5_state_t *pms) 315 | { 316 | pms->count[0] = pms->count[1] = 0; 317 | pms->abcd[0] = 0x67452301; 318 | pms->abcd[1] = /*0xefcdab89*/ T_MASK ^ 0x10325476; 319 | pms->abcd[2] = /*0x98badcfe*/ T_MASK ^ 0x67452301; 320 | pms->abcd[3] = 0x10325476; 321 | } 322 | 323 | void 324 | md5_append(md5_state_t *pms, const md5_byte_t *data, unsigned int nbytes) 325 | { 326 | const md5_byte_t *p = data; 327 | unsigned int left = nbytes; 328 | unsigned int offset = (pms->count[0] >> 3) & 63; 329 | md5_word_t nbits = (md5_word_t)(nbytes << 3); 330 | 331 | if (nbytes <= 0) 332 | return; 333 | 334 | /* this special case is handled recursively */ 335 | if (nbytes > INT_MAX - offset) { 336 | unsigned int overlap; 337 | 338 | /* handle the append in two steps to prevent overflow */ 339 | overlap = 64 - offset; 340 | 341 | md5_append(pms, data, overlap); 342 | md5_append(pms, data + overlap, nbytes - overlap); 343 | return; 344 | } 345 | 346 | /* Update the message length. */ 347 | pms->count[1] += nbytes >> 29; 348 | pms->count[0] += nbits; 349 | if (pms->count[0] < nbits) 350 | pms->count[1]++; 351 | 352 | /* Process an initial partial block. */ 353 | if (offset) { 354 | unsigned int copy = (offset + nbytes > 64 ? 64 - offset : nbytes); 355 | 356 | memcpy(pms->buf + offset, p, copy); 357 | if (offset + copy < 64) 358 | return; 359 | p += copy; 360 | left -= copy; 361 | md5_process(pms, pms->buf); 362 | } 363 | 364 | /* Process full blocks. */ 365 | for (; left >= 64; p += 64, left -= 64) 366 | md5_process(pms, p); 367 | 368 | /* Process a final partial block. 
*/ 369 | if (left) 370 | memcpy(pms->buf, p, left); 371 | } 372 | 373 | void 374 | md5_finish(md5_state_t *pms, md5_byte_t digest[16]) 375 | { 376 | static const md5_byte_t pad[64] = { 377 | 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 378 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 379 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 380 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 381 | }; 382 | md5_byte_t data[8]; 383 | int i; 384 | 385 | /* Save the length before padding. */ 386 | for (i = 0; i < 8; ++i) 387 | data[i] = (md5_byte_t)(pms->count[i >> 2] >> ((i & 3) << 3)); 388 | /* Pad to 56 bytes mod 64. */ 389 | md5_append(pms, pad, ((55 - (pms->count[0] >> 3)) & 63) + 1); 390 | /* Append the length. */ 391 | md5_append(pms, data, 8); 392 | for (i = 0; i < 16; ++i) 393 | digest[i] = (md5_byte_t)(pms->abcd[i >> 2] >> ((i & 3) << 3)); 394 | } 395 | -------------------------------------------------------------------------------- /src/md5.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (C) 1999, 2002 Aladdin Enterprises. All rights reserved. 3 | 4 | This software is provided 'as-is', without any express or implied 5 | warranty. In no event will the authors be held liable for any damages 6 | arising from the use of this software. 7 | 8 | Permission is granted to anyone to use this software for any purpose, 9 | including commercial applications, and to alter it and redistribute it 10 | freely, subject to the following restrictions: 11 | 12 | 1. The origin of this software must not be misrepresented; you must not 13 | claim that you wrote the original software. If you use this software 14 | in a product, an acknowledgment in the product documentation would be 15 | appreciated but is not required. 16 | 2. Altered source versions must be plainly marked as such, and must not be 17 | misrepresented as being the original software. 18 | 3. 
This notice may not be removed or altered from any source distribution. 19 | 20 | L. Peter Deutsch 21 | ghost@aladdin.com 22 | 23 | */ 24 | /* $Id$ */ 25 | /* 26 | Independent implementation of MD5 (RFC 1321). 27 | 28 | This code implements the MD5 Algorithm defined in RFC 1321, whose 29 | text is available at 30 | http://www.ietf.org/rfc/rfc1321.txt 31 | The code is derived from the text of the RFC, including the test suite 32 | (section A.5) but excluding the rest of Appendix A. It does not include 33 | any code or documentation that is identified in the RFC as being 34 | copyrighted. 35 | 36 | The original and principal author of md5.h is L. Peter Deutsch 37 | <ghost@aladdin.com>. Other authors are noted in the change history 38 | that follows (in reverse chronological order): 39 | 40 | 2002-04-13 lpd Removed support for non-ANSI compilers; removed 41 | references to Ghostscript; clarified derivation from RFC 1321; 42 | now handles byte order either statically or dynamically. 43 | 1999-11-04 lpd Edited comments slightly for automatic TOC extraction. 44 | 1999-10-18 lpd Fixed typo in header comment (ansi2knr rather than md5); 45 | added conditionalization for C++ compilation from Martin 46 | Purschke <purschke@bnl.gov>. 47 | 1999-05-03 lpd Original version. 48 | */ 49 | 50 | #ifndef md5_INCLUDED 51 | # define md5_INCLUDED 52 | 53 | /* 54 | * This package supports both compile-time and run-time determination of CPU 55 | * byte order. If ARCH_IS_BIG_ENDIAN is defined as 0, the code will be 56 | * compiled to run only on little-endian CPUs; if ARCH_IS_BIG_ENDIAN is 57 | * defined as non-zero, the code will be compiled to run only on big-endian 58 | * CPUs; if ARCH_IS_BIG_ENDIAN is not defined, the code will be compiled to 59 | * run on either big- or little-endian CPUs, but will run slightly less 60 | * efficiently on either one than if ARCH_IS_BIG_ENDIAN is defined.
61 | */ 62 | 63 | typedef unsigned char md5_byte_t; /* 8-bit byte */ 64 | typedef unsigned int md5_word_t; /* 32-bit word */ 65 | 66 | /* Define the state of the MD5 Algorithm. */ 67 | typedef struct md5_state_s { 68 | md5_word_t count[2]; /* message length in bits, lsw first */ 69 | md5_word_t abcd[4]; /* digest buffer */ 70 | md5_byte_t buf[64]; /* accumulate block */ 71 | } md5_state_t; 72 | 73 | #ifdef __cplusplus 74 | extern "C" 75 | { 76 | #endif 77 | 78 | /* Initialize the algorithm. */ 79 | void md5_init(md5_state_t *pms); 80 | 81 | /* Append a string to the message. */ 82 | void md5_append(md5_state_t *pms, const md5_byte_t *data, unsigned int nbytes); 83 | 84 | /* Finish the message and return the digest. */ 85 | void md5_finish(md5_state_t *pms, md5_byte_t digest[16]); 86 | 87 | #ifdef __cplusplus 88 | } /* end extern "C" */ 89 | #endif 90 | 91 | #endif /* md5_INCLUDED */ 92 | -------------------------------------------------------------------------------- /src/mmapbitarray.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #include "mmapbitarray.h" 13 | 14 | /* Private helpers */ 15 | static inline uint64_t _filesize(int fd); 16 | static inline int _valid_magic(int fd); 17 | int _initialize_file(int fd, size_t end, BTYPE num_bits, const char * header, int32_t header_len); 18 | uint64_t _get_num_bits(int fd); 19 | static inline size_t _mmap_size(MBArray * array); 20 | /* __attribute__((always_inline));*/ 21 | 22 | static inline int _assert_comparable(MBArray * array1, MBArray * array2); 23 | /* __attribute__((always_inline));;*/ 24 | 25 | MBArray * mbarray_Create_Malloc(BTYPE num_bits) 26 | { 27 | // Try to allocate space for a MBArray struct 28 | errno = 0; 29 | MBArray * array = (MBArray *)malloc(sizeof(MBArray)); 30 | 31 | // And ensure that it was constructed properly 32 | if 
(!array || errno) { 33 | return NULL; 34 | } 35 | 36 | // Since we're not using a real mmap file for this instance, 37 | // we can get away with setting a bunch of the internal vars 38 | // to be reasonable default values 39 | array->filename = NULL; 40 | array->vector = NULL; 41 | array->fd = 0; 42 | array->preamblesize = 0; 43 | array->preamblebytes = 0; 44 | 45 | // This is how many DTYPEs there are, and how many bytes there 46 | // are in this particular structure. As well as the number of 47 | // bits 48 | array->size = (size_t)ceil((double)num_bits / sizeof(DTYPE) / 8.0); 49 | array->bytes = (size_t)ceil((double)num_bits / 8.0); 50 | array->bits = num_bits; 51 | 52 | // Now try to allocate enough space for our array 53 | errno = 0; 54 | array->vector = (DTYPE *)calloc(array->bytes, 1); 55 | if (errno || !array->vector) { 56 | mbarray_Destroy(array); 57 | return NULL; 58 | } 59 | 60 | return array; 61 | } 62 | 63 | MBArray * mbarray_Create_Mmap(BTYPE num_bits, const char * file, const char * header, int32_t header_len, int oflag, int perms) 64 | { 65 | errno = 0; 66 | MBArray * array = (MBArray *)malloc(sizeof(MBArray)); 67 | uint64_t filesize; 68 | int32_t fheaderlen; 69 | 70 | if (!array || errno) { 71 | return NULL; 72 | } 73 | 74 | array->filename = NULL; 75 | array->vector = NULL; 76 | errno = 0; 77 | array->fd = open(file, oflag, perms); 78 | 79 | if (array->fd < 0) { 80 | errno = EINVAL; 81 | mbarray_Destroy(array); 82 | return NULL; 83 | } 84 | 85 | fheaderlen = mbarray_HeaderLen(array); 86 | errno = 0; 87 | if (fheaderlen >= 0 && !(oflag & O_CREAT) && fheaderlen != header_len) { 88 | errno = EINVAL; 89 | mbarray_Destroy(array); 90 | return NULL; 91 | } 92 | else if (fheaderlen >= 0) { 93 | header_len = fheaderlen; 94 | } 95 | 96 | array->preamblebytes = MBAMAGICSIZE + sizeof(BTYPE) + sizeof(header_len) + header_len; 97 | 98 | /* This size is using 256-byte alignment so that we can use pretty much any base 2 data type */ 99 | array->preamblesize = 
((int)ceil((double)array->preamblebytes / 256.0) * 256) / sizeof(DTYPE); 100 | array->preamblebytes = array->preamblesize * (sizeof(DTYPE)); 101 | 102 | if (errno) { 103 | mbarray_Destroy(array); 104 | return NULL; 105 | } 106 | 107 | filesize = _filesize(array->fd); 108 | if (filesize > 50 && !num_bits) { 109 | num_bits = _get_num_bits(array->fd); 110 | } 111 | array->size = (size_t)ceil((double)num_bits / sizeof(DTYPE) / 8.0); 112 | array->bytes = (size_t)ceil((double)num_bits / 8.0); 113 | 114 | if (filesize == 0xffffffffffffffff) { 115 | mbarray_Destroy(array); 116 | return NULL; 117 | } 118 | else if (filesize && !_valid_magic(array->fd)) { 119 | errno = EINVAL; 120 | mbarray_Destroy(array); 121 | return NULL; 122 | } 123 | else if (filesize && filesize < (array->bytes + array->preamblebytes - 1)) { 124 | errno = EINVAL; 125 | mbarray_Destroy(array); 126 | return NULL; 127 | } 128 | else if (!filesize) { 129 | if (!(oflag & O_CREAT) || (!num_bits) || _initialize_file(array->fd, array->bytes + array->preamblebytes - 1, num_bits, header, header_len)) { 130 | if (!errno) { 131 | errno = ENOENT; 132 | } 133 | mbarray_Destroy(array); 134 | return NULL; 135 | } 136 | } 137 | else { 138 | if (!num_bits) { 139 | num_bits = _get_num_bits(array->fd); 140 | array->size = (size_t)ceil((double)num_bits / sizeof(DTYPE) / 8.0); 141 | array->bytes = (size_t)ceil((double)num_bits / 8.0); 142 | } 143 | else if (_get_num_bits(array->fd) != num_bits) { 144 | mbarray_Destroy(array); 145 | errno = EINVAL; 146 | return NULL; 147 | } 148 | } 149 | 150 | errno = 0; 151 | array->vector = (DTYPE *)mmap(NULL, 152 | _mmap_size(array), 153 | PROT_READ | PROT_WRITE, 154 | MAP_SHARED, 155 | array->fd, 156 | 0); 157 | if (errno || !array->vector) { 158 | mbarray_Destroy(array); 159 | return NULL; 160 | } 161 | array->filename = (char *)malloc(strlen(file) + 1); 162 | if (!array->filename) { 163 | mbarray_Destroy(array); 164 | return NULL; 165 | } 166 | strcpy((char *)array->filename, file); 
167 | array->bits = num_bits; 168 | return array; 169 | } 170 | 171 | void mbarray_Destroy(MBArray * array) 172 | { 173 | if (array != NULL) { 174 | if (array->vector != NULL) { 175 | if (array->filename == NULL) { 176 | // This is the case where we initialized the vector 177 | // with malloc, and not mmap. As such, be free! 178 | free((void*)array->vector); 179 | array->vector = NULL; 180 | } else { 181 | if (munmap(array->vector, _mmap_size(array))) { 182 | fprintf(stderr, "Unable to close mmap!\n"); 183 | } 184 | if (array->fd >= 0) { 185 | fsync(array->fd); 186 | close(array->fd); 187 | array->fd = -1; 188 | } 189 | array->vector = NULL; 190 | } 191 | } 192 | if (array->filename) { 193 | free((void *)array->filename); 194 | array->filename = NULL; 195 | } 196 | free(array); 197 | } 198 | } 199 | 200 | int32_t mbarray_HeaderLen(MBArray * array) 201 | { 202 | int32_t header_len; 203 | errno = 0; 204 | if (pread(array->fd, &header_len, sizeof(header_len), MBAMAGICSIZE + sizeof(BTYPE)) != sizeof(header_len)) { 205 | return -1; 206 | } 207 | return header_len; 208 | } 209 | 210 | char * mbarray_Header(char * dest, MBArray * array, int maxlen) 211 | { 212 | int32_t header_len = mbarray_HeaderLen(array); 213 | int readnum = (maxlen < header_len) ? 
(maxlen) : header_len; 214 | 215 | errno = 0; 216 | 217 | if (pread(array->fd, 218 | dest, 219 | readnum, 220 | MBAMAGICSIZE + sizeof(BTYPE) + sizeof(int32_t)) != readnum) { 221 | return NULL; 222 | } 223 | return dest; 224 | } 225 | 226 | 227 | int mbarray_Sync(MBArray * array) 228 | { 229 | if (!array || !array->vector) { 230 | errno = EINVAL; 231 | return 1; 232 | } 233 | if (msync(array->vector, _mmap_size(array), MS_ASYNC)) { 234 | return 1; 235 | } 236 | return 0; 237 | } 238 | 239 | 240 | int mbarray_ClearAll(MBArray * array) 241 | { 242 | if (!array || !array->vector) { 243 | errno = EINVAL; 244 | return 1; 245 | } 246 | memset((void *)(array->vector + array->preamblesize), 0, sizeof(DTYPE) * array->size); 247 | return 0; 248 | } 249 | 250 | 251 | MBArray * mbarray_And(MBArray * dest, MBArray * array2) 252 | { 253 | register int i; 254 | if (_assert_comparable(dest, array2)) 255 | return NULL; 256 | 257 | for (i = 0; i < dest->size + dest->preamblesize; i++) { 258 | dest->vector[i] &= array2->vector[i]; 259 | } 260 | return dest; 261 | } 262 | 263 | 264 | MBArray * mbarray_Or(MBArray * dest, MBArray * array2) 265 | { 266 | register int i; 267 | if (_assert_comparable(dest, array2)) 268 | return NULL; 269 | for (i = 0; i < dest->size + dest->preamblesize; i++) { 270 | dest->vector[i] |= array2->vector[i]; 271 | } 272 | return dest; 273 | } 274 | 275 | 276 | MBArray * mbarray_Xor(MBArray * dest, MBArray * array2) 277 | { 278 | register int i; 279 | if (_assert_comparable(dest, array2)) 280 | return NULL; 281 | 282 | for (i = 0; i < dest->size + dest->preamblesize; i++) { 283 | dest->vector[i] ^= array2->vector[i]; 284 | } 285 | return dest; 286 | } 287 | 288 | 289 | MBArray * mbarray_And_Ternary(MBArray * dest, MBArray * a, MBArray * b) 290 | { 291 | register int i; 292 | if (_assert_comparable(a, b) || _assert_comparable(dest, b)) 293 | return NULL; 294 | 295 | for (i = 0; i < a->size + a->preamblesize; i++) { 296 | dest->vector[i] = a->vector[i] & 
b->vector[i]; 297 | } 298 | return dest; 299 | } 300 | 301 | MBArray * mbarray_Or_Ternary(MBArray * dest, MBArray * a, MBArray * b) 302 | { 303 | register int i; 304 | if (_assert_comparable(a, b) || _assert_comparable(dest, b)) 305 | return NULL; 306 | 307 | for (i = 0; i < a->size + a->preamblesize; i++) { 308 | dest->vector[i] = a->vector[i] | b->vector[i]; 309 | } 310 | return dest; 311 | } 312 | 313 | MBArray * mbarray_Xor_Ternary(MBArray * dest, MBArray * a, MBArray * b) 314 | { 315 | register int i; 316 | if (_assert_comparable(a, b) || _assert_comparable(dest, b)) 317 | return NULL; 318 | 319 | for (i = 0; i < a->size + a->preamblesize; i++) { 320 | dest->vector[i] = a->vector[i] ^ b->vector[i]; 321 | } 322 | return dest; 323 | } 324 | 325 | 326 | MBArray * mbarray_Copy_Template(MBArray * src, char * filename, int perms) 327 | { 328 | int header_len = mbarray_HeaderLen(src); 329 | char * header; 330 | 331 | if (header_len < 0) { 332 | return NULL; 333 | } 334 | 335 | if (!strcmp(filename, src->filename)) { 336 | errno = EINVAL; 337 | return NULL; 338 | } 339 | 340 | header = (char *)malloc(header_len + 1); 341 | if (header == NULL) { 342 | errno = ENOMEM; 343 | return NULL; 344 | } 345 | 346 | if (mbarray_Header(header, src, header_len) == NULL) { 347 | free(header); 348 | return NULL; 349 | } 350 | 351 | return mbarray_Create_Mmap( 352 | src->bits, 353 | filename, 354 | header, 355 | header_len, 356 | O_CREAT | O_RDWR, 357 | perms); 358 | } 359 | 360 | 361 | /*MBArray * mbarray_Copy(MBarray * src, const char * filename);*/ 362 | uint64_t mbarray_FileSize(MBArray * array) 363 | { 364 | return _filesize(array->fd); 365 | } 366 | 367 | char * mbarray_CharData(MBArray * array) 368 | { 369 | return (char *)array->vector; 370 | } 371 | 372 | 373 | int mbarray_Update(MBArray * array, char * data, int size) 374 | { 375 | memcpy(array->vector, data, size); 376 | array->bits = _get_num_bits(array->fd); 377 | array->size = (size_t)ceil((double)array->bits / 
sizeof(DTYPE) / 8.0); 378 | array->bytes = (size_t)ceil((double)array->bits / 8.0); 379 | return 0; 380 | } 381 | 382 | static inline int _assert_comparable(MBArray * array1, MBArray * array2) 383 | { 384 | errno = EINVAL; 385 | if (array1->preamblebytes != array2->preamblebytes) { 386 | return 1; 387 | } 388 | 389 | if (memcmp((char *)array1->vector, (char *)array2->vector, array1->preamblebytes)) { 390 | return 1; 391 | } 392 | 393 | return 0; 394 | } 395 | __attribute__((always_inline)) 396 | 397 | 398 | static inline size_t _mmap_size(MBArray * array) 399 | { 400 | return array->bytes + array->preamblebytes; 401 | } 402 | __attribute__((always_inline)) 403 | 404 | 405 | static inline int _valid_magic(int fd) 406 | { 407 | size_t nbytes; 408 | char buffer[MBAMAGICSIZE + 1]; 409 | 410 | nbytes = pread(fd, buffer, MBAMAGICSIZE, 0); 411 | if (errno || nbytes != MBAMAGICSIZE || strncmp(MBAMAGIC, buffer, MBAMAGICSIZE)) { 412 | return 0; 413 | } 414 | else { 415 | return 1; 416 | } 417 | } 418 | 419 | static inline uint64_t _filesize(int fd) 420 | { 421 | struct stat buffer; 422 | int status; 423 | status = fstat(fd, &buffer); 424 | if (status || errno) { 425 | return (uint64_t)0xffffffffffffffff; 426 | } 427 | 428 | return (uint64_t)buffer.st_size; 429 | } 430 | 431 | uint64_t _get_num_bits(int fd) { 432 | uint64_t num_bits; 433 | errno = 0; 434 | if (pread(fd, &num_bits, sizeof(uint64_t), MBAMAGICSIZE) != sizeof(uint64_t)) { 435 | return 0; 436 | } 437 | return num_bits; 438 | } 439 | 440 | int _initialize_file(int fd, size_t end, BTYPE num_bits, const char * header, int32_t header_len) 441 | { 442 | unsigned char zero = 0; 443 | errno = 0; 444 | lseek(fd, 0, SEEK_SET); 445 | if (write(fd, MBAMAGIC, MBAMAGICSIZE) != MBAMAGICSIZE) { 446 | return 1; 447 | } 448 | if (write(fd, &num_bits, sizeof(BTYPE)) != sizeof(BTYPE)) { 449 | return 1; 450 | } 451 | if (write(fd, &header_len, sizeof(header_len)) != sizeof(header_len)) { 452 | return 1; 453 | } 454 | if (header_len) 
{ 455 | if (write(fd, header, header_len) != header_len) { 456 | return 1; 457 | } 458 | } 459 | 460 | lseek(fd, end, SEEK_SET); 461 | if (write(fd, &zero, 1) != 1) { 462 | return 1; 463 | } 464 | 465 | if (errno) { 466 | return 1; 467 | } 468 | return 0; 469 | } 470 | 471 | 472 | 473 | #ifdef MBACREATE 474 | int main(int argc, char ** argv) 475 | { 476 | MBArray * array; 477 | if (argc < 3) { 478 | fprintf(stderr, "Usage: %s FILENAME SIZE\nCreate new mmap'd array file.\n", argv[0]); 479 | return 1; 480 | } 481 | 482 | array = mbarray_Create_Mmap( 483 | atol(argv[2]), 484 | argv[1], 485 | "", 486 | 0, 487 | O_RDWR | O_CREAT, 488 | 0777); 489 | if (!array) 490 | goto error; 491 | mbarray_ClearAll(array); 492 | mbarray_Destroy(array); 493 | return 0; 494 | error: 495 | fprintf(stderr, "Error: %s [%d]\n", strerror(errno), errno); 496 | return 255; 497 | } 498 | #endif 499 | 500 | #ifdef MBAQUERY 501 | int main(int argc, char ** argv) 502 | { 503 | BTYPE bit; 504 | int value; 505 | MBArray * array; 506 | int i; 507 | if (argc < 3) { 508 | fprintf(stderr, "Usage: %s FILE BIT [VALUE]\nValue is either 0 or 1 and will define a set/clear operation.\n", argv[0]); 509 | return 255; 510 | } 511 | 512 | /* Open file */ 513 | array = mbarray_Create_Mmap( 514 | 0, 515 | argv[1], 516 | "", 517 | 0, 518 | O_RDWR, 519 | 0); 520 | if (!array) 521 | goto error; 522 | 523 | bit = atol(argv[2]); 524 | 525 | if (argc > 3) { 526 | value = atol(argv[3]); 527 | if (value) { 528 | if (mbarray_Set(array, bit)) 529 | goto error; 530 | } 531 | else { 532 | if (mbarray_Clear(array, bit)) 533 | goto error; 534 | } 535 | } 536 | 537 | for (i = 0; i < array->bits; i++) { 538 | mbarray_Set(array, i); 539 | mbarray_Test(array, i); 540 | } 541 | getc(stdin); 542 | bit = 1 - mbarray_Test(array, bit); 543 | mbarray_Destroy(array); 544 | return bit; 545 | error: 546 | fprintf(stderr, "Error: %s [%d]\n", strerror(errno), errno); 547 | return 255; 548 | } 549 | #endif 550 | 
-------------------------------------------------------------------------------- /src/mmapbitarray.h: -------------------------------------------------------------------------------- 1 | #ifndef __MMAPBITARRAY_H 2 | #define __MMAPBITARRAY_H 1 3 | #include 4 | #include 5 | #include 6 | 7 | /* Types */ 8 | typedef uint32_t DTYPE; 9 | typedef uint64_t BTYPE; 10 | 11 | struct MmapBitArray { 12 | BTYPE bits; 13 | size_t size; 14 | size_t preamblesize; 15 | size_t bytes; 16 | size_t preamblebytes; 17 | const char * filename; 18 | DTYPE * vector; 19 | int32_t fd; 20 | }; 21 | 22 | typedef struct MmapBitArray MBArray; 23 | 24 | 25 | /* Constants */ 26 | enum { 27 | ONES = (DTYPE)-1, 28 | MBAMAGICSIZE = 9 29 | }; 30 | #define MBAMAGIC "MBITARRAY" 31 | 32 | 33 | 34 | /* Functions */ 35 | MBArray * mbarray_Create_Malloc(BTYPE num_bits); 36 | 37 | MBArray * mbarray_Create_Mmap(BTYPE num_bits, const char * file, const char * header, int header_len, int oflag, int perms); 38 | 39 | void mbarray_Destroy(MBArray * array); 40 | 41 | int mbarray_ClearAll(MBArray * array); 42 | 43 | int mbarray_Sync(MBArray * array); 44 | 45 | int32_t mbarray_HeaderLen(MBArray * array); 46 | 47 | char * mbarray_Header(char * dest, MBArray * array, int maxlen); 48 | 49 | MBArray * mbarray_And(MBArray * dest, MBArray * array2); 50 | 51 | MBArray * mbarray_Or(MBArray * dest, MBArray * array2); 52 | 53 | MBArray * mbarray_Xor(MBArray * dest, MBArray * array2); 54 | 55 | MBArray * mbarray_And_Ternary(MBArray * dest, MBArray * a, MBArray * b); 56 | 57 | MBArray * mbarray_Or_Ternary(MBArray * dest, MBArray * a, MBArray * b); 58 | 59 | MBArray * mbarray_Xor_Ternary(MBArray * dest, MBArray * a, MBArray * b); 60 | 61 | MBArray * mbarray_Copy_Template(MBArray * src, char * filename, int perms); 62 | 63 | int mbarray_Update(MBArray * array, char * data, int size); 64 | /*MBArray * mbarray_Copy(MBarray * src, const char * filename);*/ 65 | 66 | uint64_t mbarray_FileSize(MBArray * array); 67 | 68 | char * 
mbarray_CharData(MBArray * array); 69 | 70 | static inline size_t _vector_offset(MBArray * array, BTYPE bit) 71 | { 72 | return (size_t)(array->preamblesize + bit / (sizeof(DTYPE) << 3)); 73 | } 74 | __attribute__((always_inline)) 75 | 76 | 77 | static inline size_t _vector_byte(BTYPE bit) { 78 | return 1 << (bit % (sizeof(DTYPE) << 3)); 79 | } 80 | __attribute__((always_inline)) 81 | 82 | 83 | static inline int mbarray_Set(MBArray * array, BTYPE bit) 84 | { 85 | if (bit > array->bits) { 86 | errno = EINVAL; 87 | return 1; 88 | } 89 | array->vector[_vector_offset(array, bit)] |= _vector_byte(bit); 90 | return 0; 91 | } 92 | __attribute__((always_inline)) 93 | 94 | 95 | static inline int mbarray_Clear(MBArray * array, BTYPE bit) 96 | { 97 | if (bit > array->bits) { 98 | errno = EINVAL; 99 | return 1; 100 | } 101 | array->vector[_vector_offset(array, bit)] &= (ONES - _vector_byte(bit)); 102 | return 0; 103 | } 104 | __attribute__((always_inline)) 105 | 106 | 107 | static inline int mbarray_Test(MBArray * array, BTYPE bit) 108 | { 109 | if (bit > array->bits) { 110 | errno = EINVAL; 111 | return -1; 112 | } 113 | return ((array->vector[_vector_offset(array, bit)] & _vector_byte(bit)) != 0); 114 | } 115 | __attribute__((always_inline)) 116 | 117 | 118 | #endif 119 | -------------------------------------------------------------------------------- /src/primetester.c: -------------------------------------------------------------------------------- 1 | #ifndef __PRIMETESTER_C 2 | #define __PRIMETESTER_C 1 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | 9 | #include "primetester.h" 10 | 11 | PTYPE next_prime(PTYPE prime) 12 | { 13 | register PTYPE initial_prime = 89; 14 | while (initial_prime < prime) { 15 | initial_prime <<= 1; 16 | ++initial_prime; 17 | } 18 | return initial_prime; 19 | } 20 | 21 | #endif 22 | -------------------------------------------------------------------------------- /src/primetester.h: 
cdef construct_mode(mode):
    # Translate an fopen-like mode string into os.open() flags:
    #   'w' anywhere  -> read/write access
    #   'b' anywhere  -> binary flag, on platforms that define os.O_BINARY
    #   trailing '+'  -> create the file if it is missing
    flags = os.O_RDONLY
    if mode.endswith('+'):
        flags |= os.O_CREAT
    if 'w' in mode:
        flags |= os.O_RDWR
    if 'b' in mode:
        # Contributes nothing where the platform has no O_BINARY.
        flags |= getattr(os, 'O_BINARY', 0)
    return flags
def __cinit__(self, capacity, error_rate, filename=None, perm=0755):
    # Build or open the underlying C bloom filter.
    #
    #   capacity    expected number of elements, or the ReadFile sentinel to
    #               open an existing filter file instead of creating one
    #   error_rate  target false-positive rate, strictly between 0 and 1
    #               (only checked when a new filter is created)
    #   filename    backing file; a false value gives an in-memory filter;
    #               the NoConstruct sentinel skips construction entirely
    #   perm        permission bits handed to the C layer for the file
    #
    # Raises OSError when the file is missing, inaccessible, or cannot be
    # created, ValueError when an existing file is not a valid filter, and
    # MemoryError when an in-memory filter cannot be allocated.
    cdef char * seeds
    cdef long long num_bits
    self._closed = 0
    self._in_memory = 0
    self.ReadFile = self.__class__.ReadFile
    mode = "rw+"
    if filename is NoConstruct:
        return

    if capacity is self.ReadFile:
        mode = "rw"
        capacity = 0
        if not os.path.exists(filename):
            raise OSError("File %s not found" % filename)

        # os.access() takes R_OK/W_OK/X_OK, not open() flags.  O_RDWR only
        # happened to equal W_OK, so read permission was never checked.
        if not os.access(filename, os.R_OK | os.W_OK):
            raise OSError("Insufficient permissions for file %s" % filename)

    mode = construct_mode(mode)

    if not mode & os.O_CREAT:
        if os.path.exists(filename):
            self._bf = cbloomfilter.bloomfilter_Create_Mmap(capacity,
                                                   error_rate,
                                                   filename,
                                                   0,
                                                   mode,
                                                   perm,
                                                   NULL, 0)
            if self._bf is NULL:
                raise ValueError("Invalid %s file: %s" %
                                 (self.__class__.__name__, filename))
        else:
            raise OSError(eno.ENOENT, '%s: %s' % (os.strerror(eno.ENOENT),
                                                  filename))
    else:
        # A new filter always starts from scratch: drop any existing file
        # of the same name first.
        if filename and os.path.exists(filename):
            os.unlink(filename)

        # For why we round down for determining the number of hashes:
        # http://corte.si/%2Fposts/code/bloom-filter-rules-of-thumb/index.html
        # "The number of hashes determines the number of bits that need to
        # be read to test for membership, the number of bits that need to be
        # written to add an element, and the amount of computation needed to
        # calculate hashes themselves. We may sometimes choose to use a less
        # than optimal number of hashes for performance reasons (especially
        # when we choose to round down when the calculated optimal number of
        # hashes is fractional)."

        assert(error_rate > 0.0 and error_rate < 1.0), "error_rate allowable range (0.0,1.0) %f" % (error_rate,)
        num_hashes = max(int(math.floor(math.log(1.0 / error_rate, 2.0))), 1)
        bits_per_hash = int(math.ceil(
                capacity * abs(math.log(error_rate)) /
                (num_hashes * (math.log(2) ** 2))))

        # minimum bitvector of 128 bits
        num_bits = max(num_hashes * bits_per_hash, 128)

        # One random 32-bit seed per hash function, packed as unsigned ints.
        hash_seeds = array.array('I')
        hash_seeds.extend([random.getrandbits(32) for i in range(num_hashes)])
        # Keep the byte string referenced by a Python name for as long as
        # the char* borrowed from it is in use.
        test = hash_seeds.tostring()
        seeds = test

        # If a filename is provided, we should make a mmap-file
        # backed bloom filter. Otherwise, it will be malloc
        if filename:
            self._bf = cbloomfilter.bloomfilter_Create_Mmap(capacity,
                                                   error_rate,
                                                   filename,
                                                   num_bits,
                                                   mode,
                                                   perm,
                                                   seeds,
                                                   num_hashes)
        else:
            self._in_memory = 1
            self._bf = cbloomfilter.bloomfilter_Create_Malloc(capacity,
                                                   error_rate,
                                                   num_bits,
                                                   seeds,
                                                   num_hashes)
        if self._bf is NULL:
            if filename:
                raise OSError(errno, '%s: %s' % (os.strerror(errno),
                                                 filename))
            else:
                cpython.PyErr_NoMemory()
def copy_template(self, filename, perm=0755):
    # Create a new filter at `filename` from this filter's template via the
    # C-level bloomfilter_Copy_Template, replacing any existing file of
    # that name.  Returns the new BloomFilter.
    #
    # Raises ValueError if this filter is closed and OSError if the C layer
    # could not create the copy.
    self._assert_open()
    cdef BloomFilter copy = BloomFilter(0, 0, NoConstruct)
    if os.path.exists(filename):
        os.unlink(filename)
    copy._bf = cbloomfilter.bloomfilter_Copy_Template(self._bf, filename, perm)
    # Mirror the NULL check done in __cinit__: handing back a wrapper around
    # a NULL pointer would crash on its first use.
    if copy._bf is NULL:
        raise OSError(errno, '%s: %s' % (os.strerror(errno), filename))
    return copy
def __ior__(self, BloomFilter other):
    # In-place union (`self |= other`): OR other's bit array into ours.
    # The element count can no longer be tracked afterwards, so
    # count_correct is cleared.
    #
    # Raises ValueError if either filter is closed or the two filters do
    # not share the same bit count and hash seeds.
    self._assert_open()
    # A closed filter has a NULL _bf; check before anything touches it,
    # exactly as union(), __iand__() and intersection() do.
    other._assert_open()
    self._assert_comparable(other)
    cbloomfilter.mbarray_Or(self._bf.array, other._bf.array)
    self._bf.count_correct = 0
    return self
def to_base64(self):
    # Serialise the backing file: compress, base64-encode, then compress
    # and base64-encode again.  from_base64() reverses these steps.
    #
    # Raises ValueError if the filter is closed; in-memory filters fail in
    # the `name` property since there is no file to read.
    self._assert_open()
    # The file holds raw bits, so read it in binary mode, and let `with`
    # close the handle even if the read fails.
    with open(self.name, 'rb') as bfile:
        raw = bfile.read()
    return zlib.compress(zlib.compress(raw, 9).encode('base64')).encode('base64')
def with_test_file(method):
    """Decorate *method* so it receives a scratch ``filename`` keyword.

    Each call creates a fresh named temporary file with a ``.bloom``
    suffix, passes its path as ``filename`` (overriding any value the
    caller supplied), and closes the temporary file once *method* returns
    or raises.
    """
    @functools.wraps(method)
    def _wrapped(*args, **kwargs):
        with tempfile.NamedTemporaryFile(suffix='.bloom') as scratch:
            kwargs['filename'] = scratch.name
            return method(*args, **kwargs)
    return _wrapped
+ os.path.basename(fname).split('.py')[0]) 27 | if hasattr(module, 'suite'): 28 | suite.addTest(module.suite()) 29 | return suite 30 | -------------------------------------------------------------------------------- /tests/accuracytest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import string 4 | import random 5 | import unittest 6 | import tempfile 7 | 8 | import pybloomfilter 9 | 10 | 11 | class TestAccuracyMixin(object): 12 | FILTER_SIZE = 1000 13 | 14 | def _gen_random_items(self, n, exclude=None): 15 | # Yield n unique random items; if an existing set is provided, 16 | # items already in that set will not be yielded. 17 | if exclude is not None: 18 | yielded = exclude 19 | else: 20 | yielded = set() 21 | 22 | yield_count = 0 23 | while yield_count < n: 24 | random_item = self._random_item() 25 | if random_item not in yielded: 26 | yield random_item 27 | yielded.add(random_item) 28 | yield_count += 1 29 | 30 | def test_false_pos_degredation(self): 31 | # we'll check 10% to 0.01% 32 | for error_rate in (0.1, 0.01, 0.001, 0.0001): 33 | bf = self._bf(error_rate) 34 | items_in_filter = set( 35 | self._gen_random_items(bf.capacity)) 36 | 37 | items_in_filter_list = list(items_in_filter) 38 | n = 0 39 | chunk_count = 10 40 | chunk_size = int(math.ceil(float(bf.capacity) / chunk_count)) 41 | print 'error_rate = %.4f' % error_rate 42 | print ' %6s %9s %s' % ('n', 'false_pos', 43 | 'estimated error_rate') 44 | for i in range(chunk_count): 45 | chunk = items_in_filter_list[i*chunk_size:(i+1)*chunk_size] 46 | n += len(chunk) 47 | bf.update(chunk) 48 | pos_test_count = int(5 * 1.0 / error_rate) 49 | false_pos = len(filter(bf.__contains__, 50 | self._gen_random_items( 51 | pos_test_count, items_in_filter))) 52 | est_error_rate = float(false_pos) / pos_test_count 53 | print ' %6d %9d %.8f %s' % ( 54 | n, false_pos, est_error_rate, 55 | '******' if est_error_rate > error_rate else '') 56 | 57 | def 
def test_accuracy(self):
    # Fill a filter to capacity at several target error rates, then check
    # that it has no false negatives and that the measured false-positive
    # rate stays within twice the target.
    print('\n%14s\t%14s\t%17s' % ('pos_test_count', 'false_pos_rate',
                                  'error_rate_target'))
    # we'll check 10% to 0.01%
    for error_rate in (0.1, 0.01, 0.001, 0.0001):
        bf = self._bf(error_rate)
        items_in_filter = set(
            self._gen_random_items(bf.capacity))
        bf.update(items_in_filter)

        # sanity check
        self.assertEqual(bf.capacity, len(items_in_filter))

        # Record the inserted count now, so the false-negative rate below
        # is relative to what was actually added, whatever happens to the
        # set later.
        inserted_count = len(items_in_filter)
        false_neg = sum(1 for item in items_in_filter if item not in bf)

        # Probe with items known not to have been inserted; every hit is
        # a false positive.
        pos_test_count = int(10 * (1.0 / error_rate))
        false_pos = sum(
            1 for candidate in self._gen_random_items(pos_test_count,
                                                      items_in_filter)
            if candidate in bf)

        false_pos_rate = float(false_pos) / pos_test_count
        false_neg_rate = float(false_neg) / inserted_count
        error_rate_target = error_rate * 2  # cut it some slack

        print('%14d\t%14f\t%17f' % (pos_test_count, false_pos_rate,
                                    error_rate_target))
        self.assertTrue(
            false_pos_rate <= error_rate_target,
            "false_pos: %r / %r = %r > %r" % (
                false_pos, pos_test_count,
                false_pos / float(pos_test_count), error_rate_target))
        self.assertEqual(false_neg_rate, 0.0,
                         "false negative rate is nonzero: %0.6f" %
                         (false_neg_rate,))
        del bf
class IntegerAccuracyMmapTestCase(unittest.TestCase, TestAccuracyMixin):
    # Accuracy checks for integer keys against a file-backed filter.

    def setUp(self):
        self.temp_file = tempfile.NamedTemporaryFile(suffix='.bloom',
                                                     delete=False)
        # Only the path is needed (delete=False keeps the file on disk);
        # close our own handle so each test does not hold a descriptor open.
        self.temp_file.close()

    def tearDown(self):
        os.unlink(self.temp_file.name)

    def _random_item(self):
        return random.randint(-2**31, 2**31)

    def _bf(self, error_rate):
        return pybloomfilter.BloomFilter(self.FILTER_SIZE, error_rate,
                                         self.temp_file.name)
def run_test(bf, source_file, test_file, correct_overlap, num_test_words, error_rate):
    """Populate *bf* from *source_file* and report its false-positive rate.

    Every line of *source_file* (trailing whitespace stripped) is added to
    *bf*; every line of *test_file* is then looked up.  Hits beyond
    *correct_overlap* (the number of test words genuinely present) are
    false positives, and the rate is taken over the test words that are
    genuinely absent (*num_test_words* - *correct_overlap*).  The result is
    printed next to the requested *error_rate*.
    """
    for word in source_file:
        bf.add(word.rstrip())

    positive_matches = sum(1 for word in test_file
                           if word.rstrip() in bf)

    false_positives = positive_matches - correct_overlap
    # Rate = false positives / negatives actually tested.  Dividing by the
    # overlap is the wrong base and fails outright when there is none.
    negatives = num_test_words - correct_overlap
    if negatives > 0:
        actual_error_rate = float(false_positives) / negatives
    else:
        actual_error_rate = 0.0

    print("Specified: %f; Measured: %f; num_hashes: %d, num_bits: %d" % (
        error_rate,
        actual_error_rate,
        getattr(bf, 'num_hashes', None) or getattr(bf, 'num_slices'),
        bf.num_bits,
    ))
def create_word_list(filename):
    """Return the set of distinct non-blank lines of *filename*.

    Each line is stripped of surrounding whitespace and lower-cased before
    it is added; lines that are empty after stripping are skipped.
    """
    # `with` closes the file even if reading fails part-way through.
    with open(filename, 'r') as f:
        normalized = (line.strip().lower() for line in f)
        return set(word for word in normalized if word)
def _random_set_of_stuff(self, c):
    """
    Return a random set containing up to "c" count of each type of Python
    object.
    """
    return set(
        # Due to a small chance of collision, there's no guarantee on the
        # count of elements in this set, but we'll make sure that's okay.
        [self._random_str() for _ in range(c)] +
        [randint(-1000, 1000) for _ in range(c)] +
        [(randint(-200, 200), self._random_str()) for _ in range(c)] +
        [float(randint(10, 100)) / randint(10, 100)
         for _ in range(c)] +
        [long(randint(50000, 1000000)) for _ in range(c)] +
        [object() for _ in range(c)] +
        # Call the helper: unicode(self._random_str) would stringify the
        # bound method itself and add the same single value every time.
        [unicode(self._random_str()) for _ in range(c)])
def test_repr(self):
    # repr(), unicode() and str() must all render the same summary:
    # class name, capacity, error rate (3 decimals) and hash count.
    expected = '<BloomFilter capacity: %d, error: %0.3f, num_hashes: %d>' % (
        self.bf.capacity, self.bf.error_rate, self.bf.num_hashes)
    self.assertEqual(expected, repr(self.bf))
    self.assertEqual(unicode(expected), unicode(self.bf))
    self.assertEqual(expected, str(self.bf))
def test_number_nofile(self):
    # An integer added to an in-memory filter must be reported as present.
    bf = pybloomfilter.BloomFilter(100, 0.01)
    bf.add(1234)
    # assertTrue instead of the deprecated assertEquals(..., True) alias.
    self.assertTrue(1234 in bf)
def suite():
    """Return a TestSuite holding every test method of SimpleTestCase."""
    # TestLoader collects the same `test*` methods as the deprecated
    # unittest.makeSuite() helper.
    loader = unittest.TestLoader()
    suite = unittest.TestSuite()
    suite.addTest(loader.loadTestsFromTestCase(SimpleTestCase))
    return suite