├── .gitignore
├── .travis.yml
├── AUTHORS
├── CHANGELOG
├── LICENSE
├── MANIFEST.in
├── Makefile
├── README.markdown
├── docs
    ├── Makefile
    ├── conf.py
    ├── index.txt
    ├── make.bat
    ├── pushtogh
    └── ref.txt
├── fake_pyrex
    └── Pyrex
    │   ├── .svn
    │       ├── all-wcprops
    │       ├── entries
    │       └── text-base
    │       │   └── __init__.py.svn-base
    │   ├── Distutils
    │       ├── .svn
    │       │   ├── all-wcprops
    │       │   ├── entries
    │       │   └── text-base
    │       │   │   ├── __init__.py.svn-base
    │       │   │   └── build_ext.py.svn-base
    │       ├── __init__.py
    │       └── build_ext.py
    │   └── __init__.py
├── setup.py
├── src
    ├── Makefile
    ├── MurmurHash3.c
    ├── MurmurHash3.h
    ├── bloomfilter.c
    ├── bloomfilter.h
    ├── cbloomfilter.pxd
    ├── md5.c
    ├── md5.h
    ├── mmapbitarray.c
    ├── mmapbitarray.h
    ├── primetester.c
    ├── primetester.h
    ├── pybloomfilter.c
    ├── pybloomfilter.pyx
    └── superfast.h
└── tests
    ├── __init__.py
    ├── accuracytest.py
    ├── comparisons
        ├── accuracytest.py
        ├── speedtest.py
        ├── testwords
        └── words
    └── simpletest.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | /local
 2 | pybloomfilter.so
 3 | /build/
 4 | /dist/
 5 | /docs/_build
 6 | MANIFEST
 7 | /bin
 8 | /include
 9 | /lib
10 | *.egg-info
11 | *~
12 | *pyc
13 | /tags
14 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 |   - 2.6
4 |   - 2.7
5 |   - pypy
6 | script:  python setup.py test
7 | 


--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | Michael Axiak
2 | Rob Stacey
3 | dlecocq - For superfast addition
4 | pbutler - Fix memory leak
5 | Dan Crosta - Convert MurmurHash3 to C from C++
6 | 


--------------------------------------------------------------------------------
/CHANGELOG:
--------------------------------------------------------------------------------
 1 | 0.3.6 2012-10-21)
 2 |  - Minor cosmetic changes to reduce noise in clang. Might have fixed
 3 |    cryptolib linking error.                                              [BUG]
 4 | 
 5 | 0.3.6 2012-09-04)
 6 |  - Fixed memory leak in SHA512 computation. Thanks pbutler!              [BUG]
 7 | 
 8 | 0.3.2 2011-12-07)
 9 |  - Fixed segfault when using non-strings as values.                      [BUG]
10 | 
11 | 0.3.1 2011-12-07)
12 |  - Fixed casting of size_t for very large bloom filters.                 [BUG]
13 |  - Added option to use standard memory (rather than mmap) to back the bloom filter.
14 |    (Specify None or '' as the argument to file name.)                    [FEATURE]
15 |  - Changed hash to use a "superfast" hash rather than md5 since md5 is
16 |    unnecessarily slow.                                                   [FEATURE]
17 | 
18 | 0.2.0 2011-11-13)
19 |  - Fixed hashing to use md5 hash and change the bit computation to pass
20 |    accuracy test.                                                        [BUG]
21 | 
22 | 0.1.28 2011-03-12)
23 |  - Added check to ensure that the required permissions are available when
24 |    opening a file.                                                       [BUG]
25 | 
26 | 0.1.26 2011-03-07)
27 |  - Added __dealloc__ to fix memory/resource leaks.                       [BUG]
28 |  - Added .close() method to forcibly close a bloom filter object.        [FEATURE]
29 | 
30 | 0.1.20 2010-12-21)
31 |  - Changed prime number finder to use a standard double and add 1 to find
32 |    likely numbers in log(n) time.
33 | 
34 | 0.1.18 2010-10-25)
35 |  - Fixed issue #5, wherein a call to open() returning an invalid fd would
36 |    cause a seg fault.                                                     [BUG]
37 | 
38 | 0.1.12 2010-4-5)
39 |  - Added __len__ which gives a good estimate of how many elements were
40 |    inserted [FEATURE]
41 |  - Removed __ixor__ since it didn't make much sense.
42 |  - Added .intersect() and .union() as a synonym for __iand__ and __ior__ [FEATURE]
43 |  - Added reserved space to the BloomFilter structure to support future updates. [FEATURE]
44 | 
45 | 0.1.10 2010-4-2)
46 |  - Added MANIFEST so that building distributions works [FEATURE]
47 |  - Restructured docs so that html is a symlink to the _build/html directory [FEATURE]
48 |  - Added Cython detection in setup.py. Building now works without Cython installed [FEATURE]
49 | 
50 | 0.1.8 2010-3-31)
51 |  - Fixed bug where opening invalid bloom files caused seg faults [BUG]
52 |  - Fixed alignment bug where sometimes hash seeds would not      [BUG]
53 |    be compared correctly.
54 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |  Copyright (c) 2010-2011 Michael Axiak <mike@axiak.net>
 2 | 
 3 |  Permission is hereby granted, free of charge, to any person
 4 |  obtaining a copy of this software and associated documentation
 5 |  files (the "Software"), to deal in the Software without
 6 |  restriction, including without limitation the rights to use,
 7 |  copy, modify, merge, publish, distribute, sublicense, and/or sell
 8 |  copies of the Software, and to permit persons to whom the
 9 |  Software is furnished to do so, subject to the following
10 |  conditions:
11 | 
12 |  The above copyright notice and this permission notice shall be
13 |  included in all copies or substantial portions of the Software.
14 | 
15 |  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16 |  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
17 |  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 |  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
19 |  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
20 |  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 |  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 |  OTHER DEALINGS IN THE SOFTWARE.
23 | 
24 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include CHANGELOG
2 | include LICENSE
3 | include AUTHORS
4 | include README.markdown
5 | recursive-include src *
6 | recursive-include tests *
7 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | # Convenience targets for building, installing and cleaning the extension.
 2 | # None of these targets produce a file of the same name, so mark them phony.
 3 | .PHONY: all install clean
 4 | 
 5 | # Build the extension module in place for local development.
 6 | all:
 7 | 	python setup.py build_ext --inplace
 8 | 
 9 | install:
10 | 	@# Support Debian package building with fall-back default
11 | 	python setup.py install --root $${DESTDIR:-/}
12 | 
13 | 
14 | # Remove build output, including shared objects built in place.
15 | clean:
16 | 	rm -rf build/
17 | 	rm -rf dist/
18 | 	rm -fv *.so
19 | 


--------------------------------------------------------------------------------
/README.markdown:
--------------------------------------------------------------------------------
 1 | # pybloomfiltermmap [![Build Status](https://secure.travis-ci.org/axiak/pybloomfiltermmap.png?branch=master)](http://travis-ci.org/axiak/pybloomfiltermmap)
 2 | 
 3 | The goal of `pybloomfiltermmap` is simple: to provide a fast, simple, scalable,
 4 | correct library for Bloom Filters in Python.
 5 | 
 6 | ## Docs
 7 | 
 8 | See <http://axiak.github.com/pybloomfiltermmap/>.
 9 | 
10 | ## Overview
11 | 
12 | After you install, the interface to use is a cross between a file
13 | interface and a set interface. As an example:
14 | 
15 |     >>> fruit = pybloomfilter.BloomFilter(100000, 0.1, '/tmp/words.bloom')
16 |     >>> fruit.update(('apple', 'pear', 'orange', 'apple'))
17 |     >>> len(fruit)
18 |     3
19 |     >>> 'mike' in fruit
20 |     False
21 |     >>> 'apple' in fruit
22 |     True
23 | 
24 | ## Install
25 | 
26 | You may or may not want to use Cython. If you have it installed, the
27 | setup file will build the C file from the pyx file. Otherwise, it will
28 | skip that step automatically and build from the packaged C file.
29 | 
30 | To install:
31 | 
32 |     $ sudo python setup.py install
33 | 
34 | and you should be set.
35 | 
36 | ## License
37 | 
38 | See the LICENSE file. It's under the MIT License.
39 | 
40 | 


--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
 1 | # Makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line.
 5 | SPHINXOPTS    =
 6 | SPHINXBUILD   = sphinx-build
 7 | PAPER         =
 8 | BUILDDIR      = _build
 9 | 
10 | # Internal variables.
11 | PAPEROPT_a4     = -D latex_paper_size=a4
12 | PAPEROPT_letter = -D latex_paper_size=letter
13 | ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
14 | 
15 | .PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest
16 | 
17 | help:
18 | 	@echo "Please use \`make <target>' where <target> is one of"
19 | 	@echo "  html      to make standalone HTML files"
20 | 	@echo "  dirhtml   to make HTML files named index.html in directories"
21 | 	@echo "  pickle    to make pickle files"
22 | 	@echo "  json      to make JSON files"
23 | 	@echo "  htmlhelp  to make HTML files and a HTML help project"
24 | 	@echo "  qthelp    to make HTML files and a qthelp project"
25 | 	@echo "  latex     to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
26 | 	@echo "  changes   to make an overview of all changed/added/deprecated items"
27 | 	@echo "  linkcheck to check all external links for integrity"
28 | 	@echo "  doctest   to run all doctests embedded in the documentation (if enabled)"
29 | 
30 | clean:
31 | 	-rm -rf $(BUILDDIR)/*
32 | 
33 | html:
34 | 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
35 | 	@echo
36 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
37 | 
38 | dirhtml:
39 | 	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
40 | 	@echo
41 | 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
42 | 
43 | pickle:
44 | 	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
45 | 	@echo
46 | 	@echo "Build finished; now you can process the pickle files."
47 | 
48 | json:
49 | 	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
50 | 	@echo
51 | 	@echo "Build finished; now you can process the JSON files."
52 | 
53 | htmlhelp:
54 | 	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
55 | 	@echo
56 | 	@echo "Build finished; now you can run HTML Help Workshop with the" \
57 | 	      ".hhp project file in $(BUILDDIR)/htmlhelp."
58 | 
59 | qthelp:
60 | 	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
61 | 	@echo
62 | 	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
63 | 	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
64 | 	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PythonBloomFilter.qhcp"
65 | 	@echo "To view the help file:"
66 | 	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PythonBloomFilter.qhc"
67 | 
68 | latex:
69 | 	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
70 | 	@echo
71 | 	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
72 | 	@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
73 | 	      "run these through (pdf)latex."
74 | 
75 | changes:
76 | 	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
77 | 	@echo
78 | 	@echo "The overview file is in $(BUILDDIR)/changes."
79 | 
80 | linkcheck:
81 | 	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
82 | 	@echo
83 | 	@echo "Link check complete; look for any errors in the above output " \
84 | 	      "or in $(BUILDDIR)/linkcheck/output.txt."
85 | 
86 | doctest:
87 | 	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
88 | 	@echo "Testing of doctests in the sources finished, look at the " \
89 | 	      "results in $(BUILDDIR)/doctest/output.txt."
90 | 


--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | #
  3 | # Python BloomFilter documentation build configuration file, created by
  4 | # sphinx-quickstart on Wed Mar 31 16:25:58 2010.
  5 | #
  6 | # This file is execfile()d with the current directory set to its containing dir.
  7 | #
  8 | # Note that not all possible configuration values are present in this
  9 | # autogenerated file.
 10 | #
 11 | # All configuration values have a default; values that are commented out
 12 | # serve to show the default.
 13 | 
 14 | import sys, os
 15 | 
 16 | # If extensions (or modules to document with autodoc) are in another directory,
 17 | # add these directories to sys.path here. If the directory is relative to the
 18 | # documentation root, use os.path.abspath to make it absolute, like shown here.
 19 | #sys.path.append(os.path.abspath('.'))
 20 | 
 21 | # -- General configuration -----------------------------------------------------
 22 | 
 23 | # Add any Sphinx extension module names here, as strings. They can be extensions
 24 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
 25 | extensions = ['sphinx.ext.doctest', 'sphinx.ext.coverage', 'sphinxtogithub']
 26 | 
 27 | # Add any paths that contain templates here, relative to this directory.
 28 | templates_path = ['_templates']
 29 | 
 30 | # The suffix of source filenames.
 31 | source_suffix = '.txt'
 32 | 
 33 | # The encoding of source files.
 34 | #source_encoding = 'utf-8'
 35 | 
 36 | # The master toctree document.
 37 | master_doc = 'index'
 38 | 
 39 | # General information about the project.
 40 | project = u'Python BloomFilter'
 41 | copyright = u'2010-2012, Michael Axiak'
 42 | 
 43 | # The version info for the project you're documenting, acts as replacement for
 44 | # |version| and |release|, also used in various other places throughout the
 45 | # built documents.
 46 | #
 47 | # The short X.Y version.
 48 | version = '0.3.2'
 49 | # The full version, including alpha/beta/rc tags.
 50 | release = '0.3.2'
 51 | 
 52 | # The language for content autogenerated by Sphinx. Refer to documentation
 53 | # for a list of supported languages.
 54 | #language = None
 55 | 
 56 | # There are two options for replacing |today|: either, you set today to some
 57 | # non-false value, then it is used:
 58 | #today = ''
 59 | # Else, today_fmt is used as the format for a strftime call.
 60 | #today_fmt = '%B %d, %Y'
 61 | 
 62 | # List of documents that shouldn't be included in the build.
 63 | #unused_docs = []
 64 | 
 65 | # List of directories, relative to source directory, that shouldn't be searched
 66 | # for source files.
 67 | exclude_trees = ['_build','html']
 68 | 
 69 | # The reST default role (used for this markup: `text`) to use for all documents.
 70 | #default_role = None
 71 | 
 72 | # If true, '()' will be appended to :func: etc. cross-reference text.
 73 | #add_function_parentheses = True
 74 | 
 75 | # If true, the current module name will be prepended to all description
 76 | # unit titles (such as .. function::).
 77 | #add_module_names = True
 78 | 
 79 | # If true, sectionauthor and moduleauthor directives will be shown in the
 80 | # output. They are ignored by default.
 81 | #show_authors = False
 82 | 
 83 | # The name of the Pygments (syntax highlighting) style to use.
 84 | pygments_style = 'sphinx'
 85 | 
 86 | # A list of ignored prefixes for module index sorting.
 87 | #modindex_common_prefix = []
 88 | 
 89 | 
 90 | # -- Options for HTML output ---------------------------------------------------
 91 | 
 92 | # The theme to use for HTML and HTML Help pages.  Major themes that come with
 93 | # Sphinx are currently 'default' and 'sphinxdoc'.
 94 | html_theme = 'default'
 95 | 
 96 | # Theme options are theme-specific and customize the look and feel of a theme
 97 | # further.  For a list of options available for each theme, see the
 98 | # documentation.
 99 | #html_theme_options = {}
100 | 
101 | # Add any paths that contain custom themes here, relative to this directory.
102 | #html_theme_path = []
103 | 
104 | # The name for this set of Sphinx documents.  If None, it defaults to
105 | # "<project> v<release> documentation".
106 | #html_title = None
107 | 
108 | # A shorter title for the navigation bar.  Default is the same as html_title.
109 | #html_short_title = None
110 | 
111 | # The name of an image file (relative to this directory) to place at the top
112 | # of the sidebar.
113 | #html_logo = None
114 | 
115 | # The name of an image file (within the static path) to use as favicon of the
116 | # docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
117 | # pixels large.
118 | #html_favicon = None
119 | 
120 | # Add any paths that contain custom static files (such as style sheets) here,
121 | # relative to this directory. They are copied after the builtin static files,
122 | # so a file named "default.css" will overwrite the builtin "default.css".
123 | html_static_path = ['_static']
124 | 
125 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
126 | # using the given strftime format.
127 | #html_last_updated_fmt = '%b %d, %Y'
128 | 
129 | # If true, SmartyPants will be used to convert quotes and dashes to
130 | # typographically correct entities.
131 | #html_use_smartypants = True
132 | 
133 | # Custom sidebar templates, maps document names to template names.
134 | #html_sidebars = {}
135 | 
136 | # Additional templates that should be rendered to pages, maps page names to
137 | # template names.
138 | #html_additional_pages = {}
139 | 
140 | # If false, no module index is generated.
141 | #html_use_modindex = True
142 | 
143 | # If false, no index is generated.
144 | #html_use_index = True
145 | 
146 | # If true, the index is split into individual pages for each letter.
147 | #html_split_index = False
148 | 
149 | # If true, links to the reST sources are added to the pages.
150 | #html_show_sourcelink = True
151 | 
152 | # If true, an OpenSearch description file will be output, and all pages will
153 | # contain a <link> tag referring to it.  The value of this option must be the
154 | # base URL from which the finished HTML is served.
155 | #html_use_opensearch = ''
156 | 
157 | # If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
158 | #html_file_suffix = ''
159 | 
160 | # Output file base name for HTML help builder.
161 | htmlhelp_basename = 'PythonBloomFilterdoc'
162 | 
163 | 
164 | # -- Options for LaTeX output --------------------------------------------------
165 | 
166 | # The paper size ('letter' or 'a4').
167 | #latex_paper_size = 'letter'
168 | 
169 | # The font size ('10pt', '11pt' or '12pt').
170 | #latex_font_size = '10pt'
171 | 
172 | # Grouping the document tree into LaTeX files. List of tuples
173 | # (source start file, target name, title, author, documentclass [howto/manual]).
174 | latex_documents = [
175 |   ('index', 'PythonBloomFilter.tex', u'Python BloomFilter Documentation',
176 |    u'Michael Axiak', 'manual'),
177 | ]
178 | 
179 | # The name of an image file (relative to this directory) to place at the top of
180 | # the title page.
181 | #latex_logo = None
182 | 
183 | # For "manual" documents, if this is true, then toplevel headings are parts,
184 | # not chapters.
185 | #latex_use_parts = False
186 | 
187 | # Additional stuff for the LaTeX preamble.
188 | #latex_preamble = ''
189 | 
190 | # Documents to append as an appendix to all manuals.
191 | #latex_appendices = []
192 | 
193 | # If false, no module index is generated.
194 | #latex_use_modindex = True
195 | 


--------------------------------------------------------------------------------
/docs/index.txt:
--------------------------------------------------------------------------------
  1 | .. Python BloomFilter documentation master file, created by
  2 |    sphinx-quickstart on Wed Mar 31 16:25:58 2010.
  3 |    You can adapt this file completely to your liking, but it should at least
  4 |    contain the root `toctree` directive.
  5 | 
  6 | Welcome to Python BloomFilter's documentation!
  7 | ==============================================
  8 | 
  9 | If you are here, you probably don't need to be reminded
 10 | about the nature of a Bloom filter. If you need a refresher,
 11 | just visit the `Wikipedia page <http://en.wikipedia.org/wiki/Bloom_filter>`_.
 12 | This module implements a Bloom filter in Python
 13 | that's fast and uses mmap files for better scalability.
 14 | Did I mention that it's fast?
 15 | 
 16 | Here's a quick example::
 17 |     
 18 |     from pybloomfilter import BloomFilter
 19 | 
 20 |     bf = BloomFilter(10000000, 0.01, 'filter.bloom')
 21 | 
 22 |     with open("/usr/share/dict/words") as f:
 23 |         for word in f:
 24 |             bf.add(word.rstrip())
 25 | 
 26 |     print 'apple' in bf
 27 |     #outputs True
 28 | 
 29 | That wasn't so hard, was it? Now, there are a lot of other things
 30 | we can do. For instance, let's say we want to create a similar
 31 | filter with just a few pieces of fruit::
 32 | 
 33 |     fruitbf = bf.copy_template("fruit.bloom")
 34 |     fruitbf.update(("apple", "banana", "orange", "pear"))
 35 |     print fruitbf.to_base64()
 36 |     "eJzt2k13ojAUBuA9f8WFyofF5TWChlTHaPzqrlqFCtj6gQi/frqZM2N7aq3Gis59d2ye85KTRbhk"
 37 |     "0lyu1NRmsQrgRda0I+wZCfXIaxuWv+jqDxA8vdaf21HIOSn1u6LRE0VL9Z/qghfbBmxZoHsqM3k8"
 38 |     "N5XyPAxH2p22TJJoqwU9Q0y0dNDYrOHBIa3BwuznapG+KZZq69JUG0zu1tqI5weJKdpGq7PNJ6tB"
 39 |     "GKmzcGWWy8o0FeNNYNZAQpSdJwajt7eRhJ2YM2NOkTnSsBOCGGKIIYbY2TA663GgWWyWfUwn3oIc"
 40 |     "fyLYxeQwiF07RqBg9NgHrG5ba3jba5yl4zS2LtEMMcQQQwwxmRiBhPGOJOywIPafYhUwqnTvZOfY"
 41 |     "Zu40HH/YxDexZojJwsx6ObDcT7D8vVOtJBxiAhD/AjMmjeF2Wnqd+5RrHdo4azPEzoANabiUhh0b"
 42 |     "xBBDDDHEENsf8twlrizswEjDhnTbzWazbGKpQ5k07E9Ox2iFvXBZ2D9B7DawyqLFu5lshhhiiGUK"
 43 |     "a4nUloa9yxkwR7XhgPPXYdhRIa77uDtnyvqaIXalGK02ufv3J36GmsnG4lquPnN9gJo1VNxqgYbt"
 44 |     "ji/EC8s1PWG5fuVizW4Jox6/3o9XxBBDDLFbwcg9v/AwjrPHtTRsX34O01mxLw37bhCTjJk0+PLK"
 45 |     "08HYd4MYYojdKmYnBfjsktEpySY2tGGZzWaIIfYDGB271Yaieaat/AaOkNKb"
 46 | 
 47 | Reference
 48 | ------------
 49 | 
 50 | All of the reference information is available below:
 51 | 
 52 | .. toctree::
 53 |    :maxdepth: 2
 54 | 
 55 |    ref
 56 | 
 57 | 
 58 | 
 59 | Why pybloomfilter
 60 | ---------------------
 61 | 
 62 | As I already mentioned, there are a couple reasons to use this
 63 | module:
 64 | 
 65 |  * It natively uses `mmapped files <http://en.wikipedia.org/wiki/Mmap>`_.
 66 |  * It natively does the set things you want a Bloom filter to do.
 67 |  * It is Fast (see Benchmarks).
 68 | 
 69 | Benchmarks
 70 | ---------------------
 71 | 
 72 | Simple load and add speed
 73 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 74 | 
 75 | I have a simple benchmark in `tests/comparisons/speedtest.py <http://github.com/axiak/pybloomfiltermmap/blob/master/tests/comparisons/speedtest.py>`_ which compares
 76 | this module to the good
 77 | `pybloom module <http://github.com/jaybaird/python-bloomfilter/>`_::
 78 | 
 79 | 
 80 |     (pybloom module)
 81 |     pybloom load took 0.76436 s/run
 82 |     pybloom tests took 0.16205 s/run
 83 |     Errors: 0.25% positive 0.00% negative
 84 | 
 85 |     (this module)
 86 |     pybloomfilter load took 0.05423 s/run
 87 |     pybloomfilter tests took 0.00659 s/run
 88 |     Errors: 0.26% positive 0.00% negative
 89 | 
 90 | 
 91 | In this test we just looked at adding words from a dictionary file,
 92 | then testing to see if each word of another file was in the dictionary.
 93 | 
 94 | Serialization
 95 | ^^^^^^^^^^^^^^^^^
 96 | 
 97 | Since this package natively uses mmap files, **no serialization is needed**.
 98 | Therefore, if you have to do a lot of moving between disks etc, this
 99 | module is an obvious win.
100 | 
101 | Install
102 | ---------------------
103 | 
104 | You do not need Cython to install from sources, since I keep a cached version
105 | of the c output in the source distribution. Thus, to install you should only
106 | need to run::
107 | 
108 |     $ sudo pip install pybloomfiltermmap
109 | 
110 | You can also download the latest tar file from the `github tags <https://github.com/axiak/pybloomfiltermmap/tags>`_. Once you download it, you should only have to run::
111 | 
112 |     $ sudo python setup.py install
113 | 
114 | to build and install the module.
115 | 
116 | Develop
117 | -----------------------
118 | 
119 | To develop you will need Cython. The setup.py script should automatically
120 | build from Cython source if the Cython module is available.
121 | 


--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
  1 | @ECHO OFF
  2 | 
  3 | REM Command file for Sphinx documentation
  4 | 
  5 | set SPHINXBUILD=sphinx-build
  6 | set BUILDDIR=_build
  7 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
  8 | if NOT "%PAPER%" == "" (
  9 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
 10 | )
 11 | 
 12 | if "%1" == "" goto help
 13 | 
 14 | if "%1" == "help" (
 15 | 	:help
 16 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
 17 | 	echo.  html      to make standalone HTML files
 18 | 	echo.  dirhtml   to make HTML files named index.html in directories
 19 | 	echo.  pickle    to make pickle files
 20 | 	echo.  json      to make JSON files
 21 | 	echo.  htmlhelp  to make HTML files and a HTML help project
 22 | 	echo.  qthelp    to make HTML files and a qthelp project
 23 | 	echo.  latex     to make LaTeX files, you can set PAPER=a4 or PAPER=letter
 24 | 	echo.  changes   to make an overview over all changed/added/deprecated items
 25 | 	echo.  linkcheck to check all external links for integrity
 26 | 	echo.  doctest   to run all doctests embedded in the documentation if enabled
 27 | 	goto end
 28 | )
 29 | 
 30 | if "%1" == "clean" (
 31 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
 32 | 	del /q /s %BUILDDIR%\*
 33 | 	goto end
 34 | )
 35 | 
 36 | if "%1" == "html" (
 37 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
 38 | 	echo.
 39 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
 40 | 	goto end
 41 | )
 42 | 
 43 | if "%1" == "dirhtml" (
 44 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
 45 | 	echo.
 46 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
 47 | 	goto end
 48 | )
 49 | 
 50 | if "%1" == "pickle" (
 51 | 	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
 52 | 	echo.
 53 | 	echo.Build finished; now you can process the pickle files.
 54 | 	goto end
 55 | )
 56 | 
 57 | if "%1" == "json" (
 58 | 	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
 59 | 	echo.
 60 | 	echo.Build finished; now you can process the JSON files.
 61 | 	goto end
 62 | )
 63 | 
 64 | if "%1" == "htmlhelp" (
 65 | 	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
 66 | 	echo.
 67 | 	echo.Build finished; now you can run HTML Help Workshop with the ^
 68 | .hhp project file in %BUILDDIR%/htmlhelp.
 69 | 	goto end
 70 | )
 71 | 
 72 | if "%1" == "qthelp" (
 73 | 	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
 74 | 	echo.
 75 | 	echo.Build finished; now you can run "qcollectiongenerator" with the ^
 76 | .qhcp project file in %BUILDDIR%/qthelp, like this:
 77 | 	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\PythonBloomFilter.qhcp
 78 | 	echo.To view the help file:
 79 | 	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\PythonBloomFilter.qhc
 80 | 	goto end
 81 | )
 82 | 
 83 | if "%1" == "latex" (
 84 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
 85 | 	echo.
 86 | 	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
 87 | 	goto end
 88 | )
 89 | 
 90 | if "%1" == "changes" (
 91 | 	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
 92 | 	echo.
 93 | 	echo.The overview file is in %BUILDDIR%/changes.
 94 | 	goto end
 95 | )
 96 | 
 97 | if "%1" == "linkcheck" (
 98 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
 99 | 	echo.
100 | 	echo.Link check complete; look for any errors in the above output ^
101 | or in %BUILDDIR%/linkcheck/output.txt.
102 | 	goto end
103 | )
104 | 
105 | if "%1" == "doctest" (
106 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
107 | 	echo.
108 | 	echo.Testing of doctests in the sources finished, look at the ^
109 | results in %BUILDDIR%/doctest/output.txt.
110 | 	goto end
111 | )
112 | 
113 | :end
114 | 


--------------------------------------------------------------------------------
/docs/pushtogh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | cd $(dirname "$0")
 3 | make html
 4 | 
 5 | TEMPFILE=$(tempfile)
 6 | rm -f $TEMPFILE
 7 | mkdir $TEMPFILE
 8 | cp -r ./_build/html/* "$TEMPFILE"
 9 | cd ../
10 | git stash
11 | git checkout gh-pages
12 | rm -rf *
13 | cp -r "$TEMPFILE/"* ./
14 | git add .
15 | git commit -m "Updated documents build"
16 | git checkout master
17 | git stash pop
18 | 


--------------------------------------------------------------------------------
/docs/ref.txt:
--------------------------------------------------------------------------------
  1 | BloomFilter Class Reference
  2 | ============================
  3 | 
  4 | .. toctree::
  5 |    :maxdepth: 2
  6 | 
  7 | .. module:: pybloomfilter
  8 |    :platform: Unix, Windows
  9 |    :synopsis: A fast BloomFilter for Python
 10 | .. moduleauthor:: Michael Axiak <mike@axiak.net>
 11 | 
 12 | 
 13 | .. class:: BloomFilter(capacity : int, error_rate : float, [filename=None : string], [perm=0755])
 14 | 
 15 |    Create a new BloomFilter object with a given capacity and error_rate.
 16 |    **Note that we do not check capacity.** This is important, because
 17 |    I want to be able to support logical OR and AND (see below).
 18 |    The capacity and error_rate then together serve as a contract---you add
 19 |    fewer than capacity items, and the Bloom Filter will have an error rate
 20 |    less than error_rate.
 21 | 
 22 | Class Methods
 23 | -------------
 24 | 
 25 | .. classmethod:: BloomFilter.open(filename)
 26 | 
 27 |    Return a BloomFilter object using an already-existing Bloomfilter file.
 28 | 
 29 | .. classmethod:: BloomFilter.from_base64(filename, string, [perm=0755])
 30 | 
 31 |    Unpack the supplied base64 string (as returned by BloomFilter.to_base64)
 32 |    into the supplied filename and return a BloomFilter object using that
 33 |    file.
 34 | 
 35 |    Example::
 36 | 
 37 |     >>> bf = BloomFilter.from_base64("/tmp/mike.bf", 
 38 |          "eJwFwcuWgiAAANC9v+JCx7By0QKt0GHEbKSknflAQ9QmTyRfP/fW5E9XTRSX"
 39 |          "qcLlqGNXphAqcfVH\nRoNv0n4JlTpIvAP0e1+RyXX6I637ggA+VPZnTYR1A4"
 40 |          "Um5s9geYaZZLiT208JIiG3iwhf3Fwlzb3Y\n5NRL4uNQS6/d9OvTDJbnZMnR"
 41 |          "zcrplOX5kmsVIkQziM+vw4hCDQ3OkN9m3WVfPWzGfaTeRftMCLws\nPnzEzs"
 42 |          "gjAW60xZTBbj/bOAgYbK50PqjdzvgHZ6FHZw==\n")
 43 |     >>> "MIKE" in bf
 44 |     True
 45 | 
 46 | Instance Attributes
 47 | ---------------------
 48 | 
 49 | .. attribute:: BloomFilter.capacity
 50 | 
 51 |     The number of elements for this filter.
 52 | 
 53 | .. attribute:: BloomFilter.error_rate
 54 | 
 55 |     The acceptable probability of false positives.
 56 | 
 57 | .. attribute:: BloomFilter.hash_seeds
 58 | 
 59 |     The integer seeds used for the random hashing.
 60 | 
 61 | .. attribute:: BloomFilter.name
 62 | 
 63 |     The file name (compatible with file objects)
 64 | 
 65 | .. attribute:: BloomFilter.num_bits
 66 | 
 67 |     The number of bits used in the filter as buckets
 68 | 
 69 | .. attribute:: BloomFilter.num_hashes
 70 | 
 71 |     The number of hash functions used when computing an item's bit positions.
 72 | 
 73 | 
 74 | Instance Methods
 75 | -------------------
 76 | 
 77 | .. method:: BloomFilter.add(item) -> Boolean
 78 | 
 79 |    Add the item to the bloom filter.
 80 | 
 81 |    :param item: Hashable object
 82 |    :rtype: Boolean (True if item already in the filter)
 83 | 
 84 | .. method:: BloomFilter.clear_all()
 85 | 
 86 |    Remove all elements from the bloom filter at once.
 87 | 
 88 | .. method:: BloomFilter.copy(filename) -> BloomFilter
 89 | 
 90 |    Copies the current BloomFilter object to another object with
 91 |    new filename.
 92 | 
 93 |    :param filename: string filename
 94 |    :rtype: new BloomFilter object
 95 | 
 96 | .. method:: BloomFilter.copy_template(filename, [perm=0755]) -> BloomFilter
 97 | 
 98 |    Creates a new BloomFilter object with the same *parameters*--same
 99 |    hash seeds, same size.. everything. Once this is performed, the
100 |    two filters are *comparable*, so you can apply logical operators to them.
101 |    Example::
102 | 
103 |     >>> apple = BloomFilter(100, 0.1, '/tmp/apple')
104 |     >>> apple.add('apple')
105 |     False   
106 |     >>> pear = apple.copy_template('/tmp/pear')
107 |     >>> pear.add('pear')
108 |     False
109 |     >>> pear |= apple
110 | 
111 | .. method:: BloomFilter.sync()
112 | 
113 |    Forces a sync() call on the underlying mmap file object. Use this if
114 |    you are about to copy the file and you want to be Sure (TM) you got
115 |    everything correctly.
116 | 
117 | .. method:: BloomFilter.to_base64() -> string
118 | 
119 |    Creates a compressed, base64 encoded version of the Bloom filter.
120 |    Since the bloom filter is already stored efficiently in binary on the file system,
121 |    this may not be too useful. I find it useful for debugging so I can
122 |    copy filters from one terminal to another in their entirety.
123 | 
124 |    :rtype: Base64 encoded string representing filter
125 | 
126 | .. method:: BloomFilter.update(iterable)
127 | 
128 |    Calls add() on all items in the iterable.
129 | 
130 | .. method:: BloomFilter.union(filter) -> BloomFilter
131 | 
132 |    Perform a set OR with another *comparable* filter.
133 |    You can (only) construct comparable filters with **copy_template** above.
134 |    See the example in copy_template. In that example, pear will have
135 |    both "apple" and "pear".
136 |    
137 |    The result will occur **in place**. That is, calling::
138 | 
139 |       bf.union(bf2)
140 | 
141 |    is a way to add all the elements of bf2 to bf.
142 | 
143 |    *N.B.: Calling this function will render future calls to len()
144 |    invalid.*
145 | 
146 | .. method:: BloomFilter.intersection(filter) -> BloomFilter
147 | 
148 |    The same as union() above except it uses a set AND instead of a
149 |    set OR.
150 |    
151 |    *N.B.: Calling this function will render future calls to len()
152 |    invalid.*
153 | 
154 | Magic Methods
155 | --------------
156 | 
157 | .. method:: BloomFilter.__len__() -> Integer
158 | 
159 |    Returns the number of distinct elements that have been
160 |    added to the BloomFilter object, subject to the error
161 |    given in error_rate.
162 | 
163 |    Example::
164 | 
165 |       >>> bf = BloomFilter(100, 0.1, '/tmp/fruit.bloom')
166 |       >>> bf.add("Apple")
167 |       >>> bf.add('Apple')
168 |       >>> bf.add('orange')
169 |       >>> len(bf)
170 |       2
171 |       >>> bf2 = bf.copy_template('/tmp/new.bloom')
172 |       >>> bf2 |= bf
173 |       >>> len(bf2)
174 |       Traceback (most recent call last):
175 |         ...
176 |       pybloomfilter.IndeterminateCountError: Length of BloomFilter object is unavailable after intersection or union called.
177 | 
178 | .. method:: BloomFilter.__contains__(item) -> Boolean
179 | 
180 |    Check to see if item is contained in the filter, with
181 |    an acceptable false positive rate of error_rate (see above).
182 | 
183 | .. method:: BloomFilter.__ior__(filter) -> BloomFilter
184 | 
185 |    See union(filter)
186 | 
187 | .. method:: BloomFilter.__iand__(filter) -> BloomFilter
188 | 
189 |    See intersection(filter)
190 | 
191 | Exceptions
192 | --------------
193 | 
194 | .. class:: IndeterminateCountError(message)
195 | 
196 |    The exception that is raised if len() is called on a BloomFilter
197 |    object after |=, &=, intersection(), or union() is used.
198 | 
199 | 


--------------------------------------------------------------------------------
/fake_pyrex/Pyrex/.svn/all-wcprops:
--------------------------------------------------------------------------------
 1 | K 25
 2 | svn:wc:ra_dav:version-url
 3 | V 47
 4 | /svn/!svn/ver/46219/lxml/trunk/fake_pyrex/Pyrex
 5 | END
 6 | __init__.py
 7 | K 25
 8 | svn:wc:ra_dav:version-url
 9 | V 59
10 | /svn/!svn/ver/46219/lxml/trunk/fake_pyrex/Pyrex/__init__.py
11 | END
12 | 


--------------------------------------------------------------------------------
/fake_pyrex/Pyrex/.svn/entries:
--------------------------------------------------------------------------------
 1 | 10
 2 | 
 3 | dir
 4 | 80414
 5 | http://codespeak.net/svn/lxml/trunk/fake_pyrex/Pyrex
 6 | http://codespeak.net/svn
 7 | 
 8 | 
 9 | 
10 | 2007-08-31T17:01:53.941550Z
11 | 46219
12 | scoder
13 | 
14 | 
15 | 
16 | 
17 | 
18 | 
19 | 
20 | 
21 | 
22 | 
23 | 
24 | 
25 | 
26 | 
27 | fd0d7bf2-dfb6-0310-8d31-b7ecfe96aada
28 | 
29 | Distutils
30 | dir
31 | 
32 | __init__.py
33 | file
34 | 
35 | 
36 | 
37 | 
38 | 2012-01-04T20:10:47.429233Z
39 | 8213a7cbff30b82637a6a31ea1b8e4c1
40 | 2007-08-31T17:01:53.941550Z
41 | 46219
42 | scoder
43 | 
44 | 
45 | 
46 | 
47 | 
48 | 
49 | 
50 | 
51 | 
52 | 
53 | 
54 | 
55 | 
56 | 
57 | 
58 | 
59 | 
60 | 
61 | 
62 | 
63 | 
64 | 48
65 | 
66 | 


--------------------------------------------------------------------------------
/fake_pyrex/Pyrex/.svn/text-base/__init__.py.svn-base:
--------------------------------------------------------------------------------
1 | # work around broken setuptools monkey patching
2 | 


--------------------------------------------------------------------------------
/fake_pyrex/Pyrex/Distutils/.svn/all-wcprops:
--------------------------------------------------------------------------------
 1 | K 25
 2 | svn:wc:ra_dav:version-url
 3 | V 57
 4 | /svn/!svn/ver/46219/lxml/trunk/fake_pyrex/Pyrex/Distutils
 5 | END
 6 | __init__.py
 7 | K 25
 8 | svn:wc:ra_dav:version-url
 9 | V 69
10 | /svn/!svn/ver/46219/lxml/trunk/fake_pyrex/Pyrex/Distutils/__init__.py
11 | END
12 | build_ext.py
13 | K 25
14 | svn:wc:ra_dav:version-url
15 | V 70
16 | /svn/!svn/ver/46219/lxml/trunk/fake_pyrex/Pyrex/Distutils/build_ext.py
17 | END
18 | 


--------------------------------------------------------------------------------
/fake_pyrex/Pyrex/Distutils/.svn/entries:
--------------------------------------------------------------------------------
 1 | 10
 2 | 
 3 | dir
 4 | 80414
 5 | http://codespeak.net/svn/lxml/trunk/fake_pyrex/Pyrex/Distutils
 6 | http://codespeak.net/svn
 7 | 
 8 | 
 9 | 
10 | 2007-08-31T17:01:53.941550Z
11 | 46219
12 | scoder
13 | 
14 | 
15 | 
16 | 
17 | 
18 | 
19 | 
20 | 
21 | 
22 | 
23 | 
24 | 
25 | 
26 | 
27 | fd0d7bf2-dfb6-0310-8d31-b7ecfe96aada
28 | 
29 | __init__.py
30 | file
31 | 
32 | 
33 | 
34 | 
35 | 2012-01-04T20:10:47.421233Z
36 | 8213a7cbff30b82637a6a31ea1b8e4c1
37 | 2007-08-31T17:01:53.941550Z
38 | 46219
39 | scoder
40 | 
41 | 
42 | 
43 | 
44 | 
45 | 
46 | 
47 | 
48 | 
49 | 
50 | 
51 | 
52 | 
53 | 
54 | 
55 | 
56 | 
57 | 
58 | 
59 | 
60 | 
61 | 48
62 | 
63 | build_ext.py
64 | file
65 | 
66 | 
67 | 
68 | 
69 | 2012-01-04T20:10:47.421233Z
70 | 708366fb576674605bad33ea1d8600f4
71 | 2007-08-31T17:01:53.941550Z
72 | 46219
73 | scoder
74 | 
75 | 
76 | 
77 | 
78 | 
79 | 
80 | 
81 | 
82 | 
83 | 
84 | 
85 | 
86 | 
87 | 
88 | 
89 | 
90 | 
91 | 
92 | 
93 | 
94 | 
95 | 31
96 | 
97 | 


--------------------------------------------------------------------------------
/fake_pyrex/Pyrex/Distutils/.svn/text-base/__init__.py.svn-base:
--------------------------------------------------------------------------------
1 | # work around broken setuptools monkey patching
2 | 


--------------------------------------------------------------------------------
/fake_pyrex/Pyrex/Distutils/.svn/text-base/build_ext.py.svn-base:
--------------------------------------------------------------------------------
1 | build_ext = "yes, it's there!"
2 | 


--------------------------------------------------------------------------------
/fake_pyrex/Pyrex/Distutils/__init__.py:
--------------------------------------------------------------------------------
1 | # work around broken setuptools monkey patching
2 | 


--------------------------------------------------------------------------------
/fake_pyrex/Pyrex/Distutils/build_ext.py:
--------------------------------------------------------------------------------
1 | build_ext = "yes, it's there!"  # dummy value; presumably it only has to be importable (setup.py puts fake_pyrex/ on sys.path) -- confirm
2 | 


--------------------------------------------------------------------------------
/fake_pyrex/Pyrex/__init__.py:
--------------------------------------------------------------------------------
1 | # work around broken setuptools monkey patching
2 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | 
 4 | here = os.path.dirname(__file__)
 5 | 
 6 | ext_files = ["src/mmapbitarray.c",
 7 |              "src/bloomfilter.c",
 8 |              "src/md5.c",
 9 |              "src/primetester.c",
10 |              "src/MurmurHash3.c",
11 |              ]
12 | 
13 | kwargs = {}
14 | 
15 | try:
16 |     if '--no-cython' in sys.argv:
17 |         raise ImportError()
18 |     import Cython  # noqa
19 |     sys.path.insert(0, os.path.join(here, 'fake_pyrex'))
20 | except ImportError:
21 |     pass
22 | 
23 | 
24 | from setuptools import setup, Extension
25 | 
26 | try:
27 |     if '--no-cython' in sys.argv:
28 |         sys.argv.remove('--no-cython')
29 |         raise ImportError()
30 |     from Cython.Distutils import build_ext
31 |     print "info: Building from Cython"
32 |     ext_files.append("src/pybloomfilter.pyx")
33 |     kwargs['cmdclass'] = {'build_ext': build_ext}
34 |     #try:
35 |     #    os.unlink(os.path.join(here, 'src', 'pybloomfilter.c'))
36 |     #    os.unlink(os.path.join(here, 'pybloomfilter.so'))
37 |     #except:
38 |     #    pass
39 | except ImportError:
40 |     if '--cython' in sys.argv:
41 |         raise
42 |     ext_files.append("src/pybloomfilter.c")
43 |     print "info: Building from C"
44 | 
45 | if '--cython' in sys.argv:
46 |     sys.argv.remove('--cython')
47 | 
48 | ext_modules = [Extension("pybloomfilter",
49 |                          ext_files,
50 |                          libraries=['crypto'])]
51 | 
52 | requirements = []
53 | 
54 | if sys.version_info[0] < 3 and sys.version_info[1] < 7:
55 |     requirements.append('importlib')
56 | 
57 | setup(name='pybloomfiltermmap',
58 |       version="0.3.14",
59 |       author="Michael Axiak, Rob Stacey",
60 |       author_email="mike@axiak.net",
61 |       url="http://github.com/axiak/pybloomfiltermmap/",
62 |       description="A Bloom filter (bloomfilter) for Python built on mmap",
63 |       license="MIT License",
64 |       test_suite='tests.test_all',
65 |       install_requires=requirements,
66 |       ext_modules=ext_modules,
67 |       classifiers=[
68 |           'Intended Audience :: Developers',
69 |           'License :: OSI Approved :: MIT License',
70 |           'Programming Language :: C',
71 |           'Programming Language :: Cython',
72 |           'Programming Language :: Python',
73 |           'Topic :: Software Development :: Libraries :: Python Modules',
74 |       ],
75 |       **kwargs)
76 | 


--------------------------------------------------------------------------------
/src/Makefile:
--------------------------------------------------------------------------------
1 | CC ?= gcc
2 | # bloomfilter: compile the C sources in this directory into the `bf` binary (the hash source is MurmurHash3.c).
3 | bloomfilter: mmapbitarray.* bloomfilter.*
4 | 	$(CC) $(CFLAGS) $(CPPFLAGS) -O3 mmapbitarray.c md5.c MurmurHash3.c bloomfilter.c -o bf -lm $(LDFLAGS)
5 | # mbarray: compile mmapbitarray.c twice, with -DMBAQUERY (-> mbaquery) and with -DMBACREATE (-> mbacreate).
6 | mbarray: mmapbitarray.*
7 | 	$(CC) $(CFLAGS) $(CPPFLAGS) -O3 -DMBAQUERY mmapbitarray.c -o mbaquery -lm $(LDFLAGS)
8 | 	$(CC) $(CFLAGS) $(CPPFLAGS) -O3 -DMBACREATE mmapbitarray.c -o mbacreate -lm $(LDFLAGS)
9 | 


--------------------------------------------------------------------------------
/src/MurmurHash3.c:
--------------------------------------------------------------------------------
  1 | //-----------------------------------------------------------------------------
  2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public
  3 | // domain. The author hereby disclaims copyright to this source code.
  4 | 
  5 | // Note - The x86 and x64 versions do _not_ produce the same results, as the
  6 | // algorithms are optimized for their respective platforms. You can still
  7 | // compile and run any of them on any platform, but your performance with the
  8 | // non-native version will be less than optimal.
  9 | //
 10 | // modification in mmh3:
 11 | // __attribute__((always_inline)) is replaced to inline by Hajime Senuma
 12 | 
 13 | #include "MurmurHash3.h"
 14 | 
 15 | //-----------------------------------------------------------------------------
 16 | // Platform-specific functions and macros
 17 | 
 18 | // Microsoft Visual Studio
 19 | 
 20 | #if defined(_MSC_VER)
 21 | 
 22 | #define FORCE_INLINE	__forceinline
 23 | 
 24 | #include <stdlib.h>
 25 | 
 26 | #define ROTL32(x,y)	_rotl(x,y)
 27 | #define ROTL64(x,y)	_rotl64(x,y)
 28 | 
 29 | #define BIG_CONSTANT(x) (x)
 30 | 
 31 | // Other compilers
 32 | 
 33 | #else	// defined(_MSC_VER)
 34 | 
 35 | #define	FORCE_INLINE /*__attribute__((always_inline))*/ static inline /* static: a plain C99 inline definition provides no out-of-line copy to link against */
 36 | 
 37 | static inline uint32_t rotl32 ( uint32_t x, int8_t r ) /* static: plain C99 inline emits no out-of-line definition, so a non-inlined call would not link */
 38 | { /* Rotate x left by r bits and return the result; r must be in 1..31 (r == 0 would shift right by 32). */
 39 |   return (x << r) | (x >> (32 - r));
 40 | }
 41 | 
 42 | static inline uint64_t rotl64 ( uint64_t x, int8_t r ) /* static: plain C99 inline emits no out-of-line definition, so a non-inlined call would not link */
 43 | { /* Rotate x left by r bits and return the result; r must be in 1..63 (r == 0 would shift right by 64). */
 44 |   return (x << r) | (x >> (64 - r));
 45 | }
 46 | 
 47 | #define	ROTL32(x,y)	rotl32(x,y)
 48 | #define ROTL64(x,y)	rotl64(x,y)
 49 | 
 50 | #define BIG_CONSTANT(x) (x##LLU)
 51 | 
 52 | #endif // !defined(_MSC_VER)
 53 | 
 54 | //-----------------------------------------------------------------------------
 55 | // Block read - if your platform needs to do endian-swapping or can only
 56 | // handle aligned reads, do the conversion here
 57 | 
 58 | FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i )
 59 | { /* Return the 32-bit word at index i of p exactly as stored (no byte swapping is done here). */
 60 |   return *(p + i);
 61 | }
 62 | 
 63 | FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i )
 64 | { /* Return the 64-bit word at index i of p exactly as stored (no byte swapping is done here). */
 65 |   return *(p + i);
 66 | }
 67 | 
 68 | //-----------------------------------------------------------------------------
 69 | // Finalization mix - force all bits of a hash block to avalanche
 70 | 
 71 | FORCE_INLINE uint32_t fmix32 ( uint32_t h )
 72 | {
 73 |   /* Finalizer: alternate xor-with-right-shift and multiply steps on h
 74 |      and return the mixed 32-bit value. */
 75 |   uint32_t mixed = h;
 76 |   mixed = (mixed ^ (mixed >> 16)) * 0x85ebca6b;
 77 |   mixed = (mixed ^ (mixed >> 13)) * 0xc2b2ae35;
 78 |   mixed = mixed ^ (mixed >> 16);
 79 |   return mixed;
 80 | }
 81 | 
 82 | //----------
 83 | 
 84 | FORCE_INLINE uint64_t fmix64 ( uint64_t k )
 85 | {
 86 |   /* Finalizer: alternate xor-with-right-shift and multiply steps on k
 87 |      and return the mixed 64-bit value. */
 88 |   uint64_t mixed = k;
 89 |   mixed = (mixed ^ (mixed >> 33)) * BIG_CONSTANT(0xff51afd7ed558ccd);
 90 |   mixed = (mixed ^ (mixed >> 33)) * BIG_CONSTANT(0xc4ceb9fe1a85ec53);
 91 |   mixed = mixed ^ (mixed >> 33);
 92 |   return mixed;
 93 | }
 94 | 
 95 | 
 96 | //-----------------------------------------------------------------------------
 97 | 
 98 | void MurmurHash3_x86_32 ( const void * key, int len,
 99 |                           uint32_t seed, void * out )
100 | { /* Hashes len bytes at key, starting from seed; stores one uint32_t through out. */
101 |   const uint8_t * data = (const uint8_t*)key;
102 |   const int nblocks = len / 4; /* number of whole 4-byte blocks */
103 | 
104 |   uint32_t h1 = seed;
105 | 
106 |   uint32_t c1 = 0xcc9e2d51;
107 |   uint32_t c2 = 0x1b873593;
108 | 
109 |   //----------
110 |   // body: mix every whole 4-byte block into h1
111 | 
112 |   const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); /* points just past the last whole block */
113 | 
114 |   int i;
115 |   for(i = -nblocks; i; i++) /* negative index counts up to 0, so blocks are read first to last */
116 |   {
117 |     uint32_t k1 = getblock32(blocks,i);
118 | 
119 |     k1 *= c1;
120 |     k1 = ROTL32(k1,15);
121 |     k1 *= c2;
122 |     
123 |     h1 ^= k1;
124 |     h1 = ROTL32(h1,13); 
125 |     h1 = h1*5+0xe6546b64;
126 |   }
127 | 
128 |   //----------
129 |   // tail: fold in the trailing (len & 3) bytes; the cases fall through on purpose
130 | 
131 |   const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
132 | 
133 |   uint32_t k1 = 0;
134 | 
135 |   switch(len & 3)
136 |   {
137 |   case 3: k1 ^= tail[2] << 16;
138 |   case 2: k1 ^= tail[1] << 8;
139 |   case 1: k1 ^= tail[0];
140 |           k1 *= c1; k1 = ROTL32(k1,16); k1 *= c2; h1 ^= k1; /* NOTE(review): rotates by 16 here but by 15 in the block loop -- confirm this is intended */
141 |   };
142 | 
143 |   //----------
144 |   // finalization: mix in the length, then run fmix32 over the state
145 | 
146 |   h1 ^= len;
147 | 
148 |   h1 = fmix32(h1);
149 | 
150 |   *(uint32_t*)out = h1;
151 | } 
152 | 
153 | //-----------------------------------------------------------------------------
154 | 
155 | void MurmurHash3_x86_128 ( const void * key, const int len,
156 |                            uint32_t seed, void * out )
157 | { /* Hashes len bytes at key, starting from seed; stores four uint32_t (16 bytes) through out. */
158 |   const uint8_t * data = (const uint8_t*)key;
159 |   const int nblocks = len / 16; /* number of whole 16-byte blocks */
160 | 
161 |   uint32_t h1 = seed;
162 |   uint32_t h2 = seed;
163 |   uint32_t h3 = seed;
164 |   uint32_t h4 = seed;
165 | 
166 |   uint32_t c1 = 0x239b961b; 
167 |   uint32_t c2 = 0xab0e9789;
168 |   uint32_t c3 = 0x38b34ae5; 
169 |   uint32_t c4 = 0xa1e38b93;
170 | 
171 |   //----------
172 |   // body: each 16-byte block is read as four 32-bit words, one per state word h1..h4
173 | 
174 |   const uint32_t * blocks = (const uint32_t *)(data + nblocks*16); /* points just past the last whole block */
175 | 
176 |   int i;
177 |   for(i = -nblocks; i; i++) /* negative index counts up to 0, so blocks are read first to last */
178 |   {
179 |     uint32_t k1 = getblock32(blocks,i*4+0);
180 |     uint32_t k2 = getblock32(blocks,i*4+1);
181 |     uint32_t k3 = getblock32(blocks,i*4+2);
182 |     uint32_t k4 = getblock32(blocks,i*4+3);
183 | 
184 |     k1 *= c1; k1  = ROTL32(k1,15); k1 *= c2; h1 ^= k1;
185 | 
186 |     h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b;
187 | 
188 |     k2 *= c2; k2  = ROTL32(k2,16); k2 *= c3; h2 ^= k2;
189 | 
190 |     h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747;
191 | 
192 |     k3 *= c3; k3  = ROTL32(k3,17); k3 *= c4; h3 ^= k3;
193 | 
194 |     h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35;
195 | 
196 |     k4 *= c4; k4  = ROTL32(k4,18); k4 *= c1; h4 ^= k4;
197 | 
198 |     h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17;
199 |   }
200 | 
201 |   //----------
202 |   // tail: fold in the trailing (len & 15) bytes; the cases fall through on purpose
203 | 
204 |   const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
205 | 
206 |   uint32_t k1 = 0;
207 |   uint32_t k2 = 0;
208 |   uint32_t k3 = 0;
209 |   uint32_t k4 = 0;
210 | 
211 |   switch(len & 15)
212 |   {
213 |   case 15: k4 ^= tail[14] << 16;
214 |   case 14: k4 ^= tail[13] << 8;
215 |   case 13: k4 ^= tail[12] << 0;
216 |            k4 *= c4; k4  = ROTL32(k4,18); k4 *= c1; h4 ^= k4; /* bytes 12..14 are complete: mix them into h4 */
217 | 
218 |   case 12: k3 ^= tail[11] << 24;
219 |   case 11: k3 ^= tail[10] << 16;
220 |   case 10: k3 ^= tail[ 9] << 8;
221 |   case  9: k3 ^= tail[ 8] << 0;
222 |            k3 *= c3; k3  = ROTL32(k3,17); k3 *= c4; h3 ^= k3; /* bytes 8..11 -> h3 */
223 | 
224 |   case  8: k2 ^= tail[ 7] << 24;
225 |   case  7: k2 ^= tail[ 6] << 16;
226 |   case  6: k2 ^= tail[ 5] << 8;
227 |   case  5: k2 ^= tail[ 4] << 0;
228 |            k2 *= c2; k2  = ROTL32(k2,16); k2 *= c3; h2 ^= k2; /* bytes 4..7 -> h2 */
229 | 
230 |   case  4: k1 ^= tail[ 3] << 24;
231 |   case  3: k1 ^= tail[ 2] << 16;
232 |   case  2: k1 ^= tail[ 1] << 8;
233 |   case  1: k1 ^= tail[ 0] << 0;
234 |            k1 *= c1; k1  = ROTL32(k1,15); k1 *= c2; h1 ^= k1; /* bytes 0..3 -> h1 */
235 |   };
236 | 
237 |   //----------
238 |   // finalization: mix in the length, cross-add the state words, run fmix32 on each, cross-add again
239 | 
240 |   h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len;
241 | 
242 |   h1 += h2; h1 += h3; h1 += h4;
243 |   h2 += h1; h3 += h1; h4 += h1;
244 | 
245 |   h1 = fmix32(h1);
246 |   h2 = fmix32(h2);
247 |   h3 = fmix32(h3);
248 |   h4 = fmix32(h4);
249 | 
250 |   h1 += h2; h1 += h3; h1 += h4;
251 |   h2 += h1; h3 += h1; h4 += h1;
252 | 
253 |   ((uint32_t*)out)[0] = h1;
254 |   ((uint32_t*)out)[1] = h2;
255 |   ((uint32_t*)out)[2] = h3;
256 |   ((uint32_t*)out)[3] = h4;
257 | }
258 | 
259 | //-----------------------------------------------------------------------------
260 | 
261 | void MurmurHash3_x64_128 ( const void * key, const int len, const uint32_t seed, void * out )
262 | { /* Hashes len bytes at key, starting from seed; stores two uint64_t (16 bytes) through out. */
263 |   const uint8_t * data = (const uint8_t*)key;
264 |   const int nblocks = len / 16; /* number of whole 16-byte blocks */
265 | 
266 |   uint64_t h1 = seed;
267 |   uint64_t h2 = seed;
268 | 
269 |   uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5);
270 |   uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f);
271 | 
272 |   //----------
273 |   // body: each 16-byte block is read as two 64-bit words, one per state word h1/h2
274 | 
275 |   const uint64_t * blocks = (const uint64_t *)(data); /* indexed forward from the start of the data */
276 | 
277 |   int i;
278 |   for(i = 0; i < nblocks; i++)
279 |   {
280 |     uint64_t k1 = getblock64(blocks,i*2+0);
281 |     uint64_t k2 = getblock64(blocks,i*2+1);
282 | 
283 |     k1 *= c1; k1  = ROTL64(k1,31); k1 *= c2; h1 ^= k1;
284 | 
285 |     h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729;
286 | 
287 |     k2 *= c2; k2  = ROTL64(k2,33); k2 *= c1; h2 ^= k2;
288 | 
289 |     h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5;
290 |   }
291 | 
292 |   //----------
293 |   // tail: fold in the trailing (len & 15) bytes; the cases fall through on purpose
294 | 
295 |   const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
296 | 
297 |   uint64_t k1 = 0;
298 |   uint64_t k2 = 0;
299 | 
300 |   switch(len & 15)
301 |   {
302 |   case 15: k2 ^= (uint64_t)(tail[14]) << 48;
303 |   case 14: k2 ^= (uint64_t)(tail[13]) << 40;
304 |   case 13: k2 ^= (uint64_t)(tail[12]) << 32;
305 |   case 12: k2 ^= (uint64_t)(tail[11]) << 24;
306 |   case 11: k2 ^= (uint64_t)(tail[10]) << 16;
307 |   case 10: k2 ^= (uint64_t)(tail[ 9]) << 8;
308 |   case  9: k2 ^= (uint64_t)(tail[ 8]) << 0;
309 |            k2 *= c2; k2  = ROTL64(k2,33); k2 *= c1; h2 ^= k2; /* bytes 8..14 -> h2 */
310 | 
311 |   case  8: k1 ^= (uint64_t)(tail[ 7]) << 56;
312 |   case  7: k1 ^= (uint64_t)(tail[ 6]) << 48;
313 |   case  6: k1 ^= (uint64_t)(tail[ 5]) << 40;
314 |   case  5: k1 ^= (uint64_t)(tail[ 4]) << 32;
315 |   case  4: k1 ^= (uint64_t)(tail[ 3]) << 24;
316 |   case  3: k1 ^= (uint64_t)(tail[ 2]) << 16;
317 |   case  2: k1 ^= (uint64_t)(tail[ 1]) << 8;
318 |   case  1: k1 ^= (uint64_t)(tail[ 0]) << 0;
319 |            k1 *= c1; k1  = ROTL64(k1,31); k1 *= c2; h1 ^= k1; /* bytes 0..7 -> h1 */
320 |   };
321 | 
322 |   //----------
323 |   // finalization: mix in the length, cross-add, run fmix64 on each half, cross-add again
324 | 
325 |   h1 ^= len; h2 ^= len;
326 | 
327 |   h1 += h2;
328 |   h2 += h1;
329 | 
330 |   h1 = fmix64(h1);
331 |   h2 = fmix64(h2);
332 | 
333 |   h1 += h2;
334 |   h2 += h1;
335 | 
336 |   ((uint64_t*)out)[0] = h1;
337 |   ((uint64_t*)out)[1] = h2;
338 | }
339 | 
340 | //-----------------------------------------------------------------------------
341 | 


--------------------------------------------------------------------------------
/src/MurmurHash3.h:
--------------------------------------------------------------------------------
 1 | //-----------------------------------------------------------------------------
 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public
 3 | // domain. The author hereby disclaims copyright to this source code.
 4 | 
 5 | #ifndef _MURMURHASH3_H_
 6 | #define _MURMURHASH3_H_
 7 | 
 8 | //-----------------------------------------------------------------------------
 9 | // Platform-specific functions and macros
10 | 
11 | // Microsoft Visual Studio
12 | 
13 | #if defined(_MSC_VER)
14 | 
15 | typedef unsigned char uint8_t;
16 | typedef unsigned long uint32_t;
17 | typedef unsigned __int64 uint64_t;
18 | 
19 | // Other compilers
20 | 
21 | #else	// defined(_MSC_VER)
22 | 
23 | #include <stdint.h>
24 | 
25 | #endif // !defined(_MSC_VER)
26 | 
27 | //-----------------------------------------------------------------------------
28 | /* Each function hashes len bytes at key, starting from seed, and stores the result through out. */
29 | void MurmurHash3_x86_32  ( const void * key, int len, uint32_t seed, void * out );
30 | /* out must have room for 4 bytes (x86_32) or 16 bytes (x86_128 and x64_128). */
31 | void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out );
32 | 
33 | void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out );
34 | 
35 | 
36 | //-----------------------------------------------------------------------------
37 | 
38 | #endif // _MURMURHASH3_H_
39 | 


--------------------------------------------------------------------------------
/src/bloomfilter.c:
--------------------------------------------------------------------------------
  1 | #ifndef __BLOOMFILTER_C
  2 | #define __BLOOMFILTER_C
  3 | #include <string.h>
  4 | #include <errno.h>
  5 | #include <stdio.h>
  6 | #include <fcntl.h>
  7 | #include "md5.h"
  8 | 
  9 | #include "bloomfilter.h"
 10 | 
 11 | BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate,
 12 |                                 BTYPE num_bits, int *hash_seeds, int num_hashes)
 13 | {   /* Build a filter whose bit array comes from mbarray_Create_Malloc. Returns NULL on failure. */
 14 |     BloomFilter * bf;
 15 |     MBArray * array;
 16 |     /* hash_seeds[] has a fixed number of slots: refuse seed counts the memcpy below could not hold. */
 17 |     if (num_hashes < 0 || num_hashes > (int)(sizeof(bf->hash_seeds) / sizeof(bf->hash_seeds[0]))) {
 18 |         errno = EINVAL;
 19 |         return NULL;
 20 |     }
 21 |     bf = (BloomFilter *)malloc(sizeof(BloomFilter));
 22 |     if (!bf) return NULL;
 23 |     bf->max_num_elem = max_num_elem;
 24 |     bf->error_rate = error_rate;
 25 |     bf->num_hashes = num_hashes;
 26 |     bf->count_correct = 1;
 27 |     bf->bf_version = BF_CURRENT_VERSION;
 28 |     bf->elem_count = 0;
 29 |     bf->array = NULL;  /* lets bloomfilter_Destroy skip the array if creating it fails */
 30 |     memset(bf->reserved, 0, sizeof(bf->reserved));
 31 |     memset(bf->hash_seeds, 0, sizeof(bf->hash_seeds));
 32 |     memcpy(bf->hash_seeds, hash_seeds, sizeof(uint32_t) * num_hashes);  /* first num_hashes seeds; the rest stay 0 */
 33 |     array = mbarray_Create_Malloc(num_bits);
 34 |     if (!array) {
 35 |         bloomfilter_Destroy(bf);
 36 |         return NULL;
 37 |     }
 38 |     bf->array = array;
 39 |     return bf;
 40 | }
 41 | 
 42 | BloomFilter *bloomfilter_Create_Mmap(size_t max_num_elem, double error_rate,
 43 |                                 const char * file, BTYPE num_bits, int oflags, int perms,
 44 |                                 int *hash_seeds, int num_hashes)
 45 | {   /* Build a filter whose bit array comes from mbarray_Create_Mmap on `file`. Returns NULL on failure. */
 46 |     BloomFilter * bf;
 47 |     MBArray * array;
 48 |     /* hash_seeds[] has a fixed number of slots: refuse seed counts the memcpy below could not hold. */
 49 |     if (num_hashes < 0 || num_hashes > (int)(sizeof(bf->hash_seeds) / sizeof(bf->hash_seeds[0]))) {
 50 |         errno = EINVAL;
 51 |         return NULL;
 52 |     }
 53 |     bf = (BloomFilter *)malloc(sizeof(BloomFilter));
 54 |     if (!bf) return NULL;
 55 |     bf->max_num_elem = max_num_elem;
 56 |     bf->error_rate = error_rate;
 57 |     bf->num_hashes = num_hashes;
 58 |     bf->count_correct = 1;
 59 |     bf->bf_version = BF_CURRENT_VERSION;
 60 |     bf->elem_count = 0;
 61 |     bf->array = NULL;
 62 |     memset(bf->reserved, 0, sizeof(bf->reserved));
 63 |     memset(bf->hash_seeds, 0, sizeof(bf->hash_seeds));
 64 |     memcpy(bf->hash_seeds, hash_seeds, sizeof(uint32_t) * num_hashes);
 65 |     /* The struct itself is passed along as the header data for the array. */
 66 |     array = mbarray_Create_Mmap(num_bits, file, (char *)bf, sizeof(BloomFilter), oflags, perms);
 67 |     if (!array) {
 68 |         bloomfilter_Destroy(bf);
 69 |         return NULL;
 70 |     }
 71 |     /* After we create the new array object, this array may already
 72 |        have all of the bloom filter data from the file in the
 73 |        header info. By calling mbarray_Header, we copy that header
 74 |        data back into this BloomFilter object. */
 75 |     if (mbarray_Header((char *)bf, array, sizeof(BloomFilter)) == NULL) {
 76 |         /* The header copy may have replaced bf->array with a pointer stored in the file,
 77 |            so point it at the live array and let Destroy release everything exactly once. */
 78 |         bf->array = array;
 79 |         bloomfilter_Destroy(bf);
 80 |         return NULL;
 81 |     }
 82 |     /* Since we just initialized from a file, we have to
 83 |        fix our pointers */
 84 |     bf->array = array;
 85 |     return bf;
 86 | }
 87 | 
 88 | 
 89 | void bloomfilter_Destroy(BloomFilter * bf)
 90 | {   /* Release the attached bit array (when there is one) and then bf itself; a NULL bf is a no-op. */
 91 |     if (bf == NULL)
 92 |         return;
 93 |     if (bf->array != NULL) {
 94 |         mbarray_Destroy(bf->array);
 95 |         bf->array = NULL;
 96 |     }
 97 |     free(bf);
 98 | }
 99 | 
100 | 
101 | void bloomfilter_Print(BloomFilter * bf)
102 | {   /* Write a one-line summary (max_num_elem, error_rate, num_hashes) to stdout. */
103 |     const unsigned long capacity = (unsigned long)bf->max_num_elem;
104 |     printf("<BloomFilter num: %lu, error: %0.3f, num_hashes: %d>\n", capacity, bf->error_rate, bf->num_hashes);
105 | }
106 | 
107 | int bloomfilter_Update(BloomFilter * bf, char * data, int size)
108 | {   /* Pass data to mbarray_Update, then reload bf from the array header. Returns 0 on success. */
109 |     MBArray * const live = bf->array;   /* kept aside: the header copy below writes over *bf */
110 |     const int rc = mbarray_Update(live, data, size);
111 |     if (rc != 0)
112 |         return rc;                      /* hand mbarray_Update's status straight back */
113 | 
114 |     if (!mbarray_Header((char *)bf, live, sizeof(BloomFilter)))
115 |         return 1;
116 | 
117 |     bf->array = live;                   /* put the in-memory pointer back */
118 |     return 0;
119 | }
120 | 
121 | 
122 | BloomFilter * bloomfilter_Copy_Template(BloomFilter * src, char * filename, int perms)
123 | {   /* New filter on `filename`, built by mbarray_Copy_Template from src's array. Returns NULL on failure. */
124 |     BloomFilter * bf = (BloomFilter *)malloc(sizeof(BloomFilter));
125 |     MBArray * array;
126 | 
127 |     if (bf == NULL) {
128 |         return NULL;
129 |     }
130 | 
131 |     array = mbarray_Copy_Template(src->array, filename, perms);
132 |     if (array == NULL) {
133 |         free(bf);
134 |         return NULL;
135 |     }
136 |     /* bf is filled in from the new array's header instead of field by field. */
137 |     if (mbarray_Header((char *)bf, array, sizeof(BloomFilter)) == NULL) {
138 |         bf->array = array;  /* bf->array was never given a valid value; set it so Destroy frees the array exactly once */
139 |         bloomfilter_Destroy(bf);
140 |         return NULL;
141 |     }
142 | 
143 |     bf->array = array;
144 |     return bf;
145 | }
146 | 
147 | 
148 | BTYPE _hash_long(uint32_t hash_seed, Key * key) {
149 |     /* Hash an integer key: the bytes of key->nhash become the string handed to _hash_char. */
150 |     Key bytes_key;
151 | 
152 |     bytes_key.nhash = sizeof(key->nhash);
153 |     bytes_key.shash = (char *)&key->nhash;
154 |     return _hash_char(hash_seed, &bytes_key);
155 | }
156 | 
157 | /*
158 | CODE TO USE SHA512..
159 | #include <openssl/evp.h>
160 | 
161 | uint32_t _hash_char(uint32_t hash_seed, Key * key) {
162 |     EVP_MD_CTX ctx;
163 |     unsigned char result_buffer[64];
164 | 
165 |     EVP_MD_CTX_init(&ctx);
166 | 
167 |     EVP_DigestInit_ex(&ctx, EVP_sha512(), NULL);
168 |     EVP_DigestUpdate(&ctx, (const unsigned char *)&hash_seed, sizeof(hash_seed));
169 |     EVP_DigestUpdate(&ctx, (const unsigned char *)key->shash, key->nhash);
170 |     EVP_DigestFinal_ex(&ctx, (unsigned char *)&result_buffer, NULL);
171 |     EVP_MD_CTX_cleanup(&ctx);
172 |     return *(uint32_t *)result_buffer;
173 | }
174 | */
175 | 
176 | /* Code for MurmurHash3 */
177 | #include "MurmurHash3.h"
178 | BTYPE _hash_char(uint32_t hash_seed, Key * key) {
179 |     /* XOR the two halves of the 128-bit MurmurHash3 of key->shash. NOTE(review): assumes BTYPE is 64 bits wide -- confirm. */
180 |     BTYPE halves[2];
181 |     MurmurHash3_x64_128(key->shash, (int)key->nhash, hash_seed, halves);
182 |     return halves[0] ^ halves[1];
183 | }
184 | 
185 | 
186 | #if 0 /* compiled out: manual driver that tests each stdin line against /tmp/bf2 */
187 | int main(int argc, char **argv)
188 | {
189 |     int hash_seeds[5] = { 4234 , 2123, 4434, 444, 12123};
190 |     BloomFilter *bf = bloomfilter_Create(100000, 0.4, /* NOTE(review): bloomfilter_Create is not defined in this file */
191 |                                          "/tmp/bf2", 10000000, O_RDWR, 0,
192 |                                         hash_seeds, 5);
193 | 
194 |     Key key;
195 |     char line[255];
196 |     key.shash = line;
197 | 
198 |     if (!bf)
199 |         goto error;
200 | 
201 |     bloomfilter_Print(bf);
202 | 
203 |     while (fgets(line, 255, stdin)) {
204 |         line[strlen(line) - 1] = '\0'; /* drops the last character read (normally the newline) */
205 |         key.nhash = strlen(line);
206 | 
207 |         /*if (bloomfilter_Add(bf, &key)) {
208 |             goto error;
209 |             }*/
210 |         if (bloomfilter_Test(bf, &key)) {
211 |             printf("Found '%s'!\n", line);
212 |         }
213 |     }
214 |     bloomfilter_Destroy(bf);
215 |     return 0;
216 | 
217 |  error:
218 |     fprintf(stderr, "ERROR: %s [%d]\n", strerror(errno), errno);
219 |     return 255;
220 | }
221 | #endif
222 | #endif
223 | 


--------------------------------------------------------------------------------
/src/bloomfilter.h:
--------------------------------------------------------------------------------
  1 | #ifndef __BLOOMFILTER_H
  2 | #define __BLOOMFILTER_H 1
  3 | 
  4 | #include <stdlib.h>
  5 | 
  6 | #include "mmapbitarray.h"
  7 | #define BF_CURRENT_VERSION 1
  8 | 
  9 | struct _BloomFilter { /* passed whole to mbarray_Create_Mmap as header data, so the field layout should not be reordered */
 10 |     uint64_t max_num_elem; /* capacity requested at creation */
 11 |     double error_rate; /* error rate requested at creation */
 12 |     uint32_t num_hashes; /* number of hash_seeds entries in use */
 13 |     uint32_t hash_seeds[256]; /* one seed per hash function; unused slots are zeroed on creation */
 14 |     /* All of the bit data is already in here. */
 15 |     MBArray * array; /* in-memory pointer; re-assigned after every header load */
 16 |     unsigned char bf_version; /* set to BF_CURRENT_VERSION on creation */
 17 |     unsigned char count_correct; /* set to 1 on creation */
 18 |     uint64_t elem_count; /* set to 0 on creation */
 19 |     uint32_t reserved[32]; /* zero-filled on creation */
 20 | };
 21 | 
 22 | typedef struct {                    /* the item handed to bloomfilter_Add / bloomfilter_Test */
 23 |     uint64_t nhash;                 /* byte length of shash when shash is set; presumably the value hashed by _hash_long otherwise -- confirm */
 24 |     char * shash;                   /* bytes to hash; NULL makes Add/Test use _hash_long instead of _hash_char */
 25 | } Key;
 26 | 
 27 | typedef struct _BloomFilter BloomFilter;
 28 | 
 29 | /* Create a bloom filter without a memory-mapped file backing it */
 30 | BloomFilter *bloomfilter_Create_Malloc(size_t max_num_elem, double error_rate,
 31 |                                 BTYPE num_bits, int *hash_seeds, int num_hashes);
 32 | 
 33 | /* Create a bloom filter with a memory-mapped file backing it */
 34 | BloomFilter *bloomfilter_Create_Mmap(size_t max_num_elem, double error_rate,
 35 |                                 const char * file, BTYPE num_bits, int oflags, int perms,
 36 |                                 int *hash_seeds, int num_hashes);
 37 | 
 38 | void bloomfilter_Destroy(BloomFilter * bf);
 39 | 
 40 | int bloomfilter_Update(BloomFilter * bf, char * data, int size);
 41 | 
 42 | BloomFilter * bloomfilter_Copy_Template(BloomFilter * src, char * filename, int perms);
 43 | 
 44 | /* A lot of this is inlined.. */
 45 | BTYPE _hash_char(uint32_t hash_seed, Key * key);
 46 | 
 47 | BTYPE _hash_long(uint32_t hash_seed, Key * key);
 48 | 
 49 | 
 50 | /* Add key.  Returns 1 if all of its bits were already set, 0 if any bit was newly set
 51 |  * (elem_count is then bumped when count_correct is nonzero), 2 if mbarray_Set fails. */
 52 | static inline __attribute__((always_inline))   /* the attribute must precede the definition it applies to */
 53 | int bloomfilter_Add(BloomFilter * bf, Key * key)
 54 | {
 55 |     /* A NULL shash selects _hash_long; otherwise the byte string is hashed by _hash_char. */
 56 |     BTYPE (*hashfunc)(uint32_t, Key *) = (key->shash == NULL) ? _hash_long : _hash_char;
 57 |     BTYPE mod = bf->array->bits;
 58 |     BTYPE hash_res;
 59 |     int i, result = 1;
 60 | 
 61 |     for (i = bf->num_hashes - 1; i >= 0; --i) {
 62 |         hash_res = (*hashfunc)(bf->hash_seeds[i], key) % mod;
 63 |         if (result && !mbarray_Test(bf->array, hash_res)) {
 64 |             result = 0;     /* at least one bit was not set before this call */
 65 |         }
 66 |         if (mbarray_Set(bf->array, hash_res)) {
 67 |             return 2;       /* bits set by earlier iterations stay set */
 68 |         }
 69 |     }
 70 |     if (!result && bf->count_correct) {
 71 |         bf->elem_count ++;
 72 |     }
 73 |     return result;
 74 | }
 75 | 
 76 | /* Test key.  Returns 1 if every probed bit is set (key possibly present), 0 otherwise.
 77 |  * always_inline is written before the definition: an attribute placed after the closing
 78 |  * brace would bind to whatever declaration follows, even past the end of this header. */
 79 | static inline __attribute__((always_inline))
 80 | int bloomfilter_Test(BloomFilter * bf, Key * key)
 81 | {
 82 |     /* A NULL shash selects _hash_long; otherwise the byte string is hashed by _hash_char. */
 83 |     BTYPE (*hashfunc)(uint32_t, Key *) = (key->shash == NULL) ? _hash_long : _hash_char;
 84 |     BTYPE mod = bf->array->bits;
 85 |     int i;
 86 | 
 87 |     for (i = bf->num_hashes - 1; i >= 0; --i) {
 88 |         if (!mbarray_Test(bf->array, (*hashfunc)(bf->hash_seeds[i], key) % mod)) {
 89 |             return 0;
 90 |         }
 91 |     }
 92 |     return 1;
 93 | }
 94 | 
 95 | 
 96 | 
 97 | 
 98 | 
 99 | #endif
100 | 


--------------------------------------------------------------------------------
/src/cbloomfilter.pxd:
--------------------------------------------------------------------------------
 1 | 
 2 | cdef extern from "primetester.h":
 3 |      long next_prime(long prime)
 4 | 
 5 | cdef extern from "mmapbitarray.h":
 6 |      ctypedef struct MBArray:
 7 |          long bits
 8 |          long size
 9 |          char * filename
10 |          int fd
11 |      # mbarray_ClearAll and mbarray_Sync return an int status in mmapbitarray.c, not a pointer.
12 |      int mbarray_ClearAll(MBArray * array)
13 |      int mbarray_Sync(MBArray * array)
14 |      MBArray * mbarray_And(MBArray * dest, MBArray * src)
15 |      MBArray * mbarray_Or(MBArray * dest, MBArray * src)
16 |      MBArray * mbarray_Xor(MBArray * dest, MBArray * src)
17 |      MBArray * mbarray_And_Ternary(MBArray * dest, MBArray * a, MBArray * b)
18 |      MBArray * mbarray_Or_Ternary(MBArray * dest, MBArray * a, MBArray * b)
19 |      MBArray * mbarray_Xor_Ternary(MBArray * dest, MBArray * a, MBArray * b)
20 |      int mbarray_Update(MBArray * array, char * data, int size)
21 |      int mbarray_FileSize(MBArray * array)
22 |      char * mbarray_CharData(MBArray * array)
23 | 
24 | # Declarations mirrored from bloomfilter.h; bloomfilter_Add/Test are static inline functions there.
25 | cdef extern from "bloomfilter.h":
26 |      ctypedef struct BloomFilter:
27 |          long max_num_elem
28 |          double error_rate
29 |          int num_hashes
30 |          long * hash_seeds
31 |          MBArray * array
32 |          unsigned char bf_version
33 |          unsigned char count_correct
34 |          unsigned long long elem_count
35 | 
36 |      ctypedef struct Key:
37 |          long nhash
38 |          char * shash
39 | 
40 |      BloomFilter * bloomfilter_Create_Mmap(long max_num_elem,
41 |                                       double error_rate,
42 |                                       char * fname, long num_bits,
43 |                                       int oflags, int perms,
44 |                                       int * hash_seeds, int num_hashes)
45 |      BloomFilter * bloomfilter_Create_Malloc(long max_num_elem,
46 |                                       double error_rate,
47 |                                       long num_bits,
48 |                                       int * hash_seeds, int num_hashes)
49 |      void bloomfilter_Destroy(BloomFilter * bf)
50 |      int bloomfilter_Add(BloomFilter * bf, Key * key)
51 |      int bloomfilter_Test(BloomFilter * bf, Key * key)
52 |      int bloomfilter_Update(BloomFilter * bf, char * data, int size)
53 |      BloomFilter * bloomfilter_Copy_Template(BloomFilter * src, char * filename, int perms)
54 | 


--------------------------------------------------------------------------------
/src/md5.c:
--------------------------------------------------------------------------------
  1 | /*
  2 |   Copyright (C) 1999, 2000, 2002 Aladdin Enterprises.  All rights reserved.
  3 | 
  4 |   This software is provided 'as-is', without any express or implied
  5 |   warranty.  In no event will the authors be held liable for any damages
  6 |   arising from the use of this software.
  7 | 
  8 |   Permission is granted to anyone to use this software for any purpose,
  9 |   including commercial applications, and to alter it and redistribute it
 10 |   freely, subject to the following restrictions:
 11 | 
 12 |   1. The origin of this software must not be misrepresented; you must not
 13 |      claim that you wrote the original software. If you use this software
 14 |      in a product, an acknowledgment in the product documentation would be
 15 |      appreciated but is not required.
 16 |   2. Altered source versions must be plainly marked as such, and must not be
 17 |      misrepresented as being the original software.
 18 |   3. This notice may not be removed or altered from any source distribution.
 19 | 
 20 |   L. Peter Deutsch
 21 |   ghost@aladdin.com
 22 | 
 23 |  */
 24 | /* $Id: md5.c,v 1.6 2002/04/13 19:20:28 lpd Exp $ */
 25 | /*
 26 |   Independent implementation of MD5 (RFC 1321).
 27 | 
 28 |   This code implements the MD5 Algorithm defined in RFC 1321, whose
 29 |   text is available at
 30 |         http://www.ietf.org/rfc/rfc1321.txt
 31 |   The code is derived from the text of the RFC, including the test suite
 32 |   (section A.5) but excluding the rest of Appendix A.  It does not include
 33 |   any code or documentation that is identified in the RFC as being
 34 |   copyrighted.
 35 | 
 36 |   The original and principal author of md5.c is L. Peter Deutsch
 37 |   <ghost@aladdin.com>.  Other authors are noted in the change history
 38 |   that follows (in reverse chronological order):
 39 | 
 40 |   2002-04-13 lpd Clarified derivation from RFC 1321; now handles byte order
 41 |         either statically or dynamically; added missing #include <string.h>
 42 |         in library.
 43 |   2002-03-11 lpd Corrected argument list for main(), and added int return
 44 |         type, in test program and T value program.
 45 |   2002-02-21 lpd Added missing #include <stdio.h> in test program.
 46 |   2000-07-03 lpd Patched to eliminate warnings about "constant is
 47 |         unsigned in ANSI C, signed in traditional"; made test program
 48 |         self-checking.
 49 |   1999-11-04 lpd Edited comments slightly for automatic TOC extraction.
 50 |   1999-10-18 lpd Fixed typo in header comment (ansi2knr rather than md5).
 51 |   1999-05-03 lpd Original version.
 52 |  */
 53 | 
 54 | #include "md5.h"
 55 | #include <string.h>
 56 | #include <limits.h>
 57 | 
 58 | #undef BYTE_ORDER       /* 1 = big-endian, -1 = little-endian, 0 = unknown */
 59 | #ifdef ARCH_IS_BIG_ENDIAN
 60 | #  define BYTE_ORDER (ARCH_IS_BIG_ENDIAN ? 1 : -1)
 61 | #else
 62 | #  define BYTE_ORDER 0
 63 | #endif
 64 | 
 65 | #define T_MASK ((md5_word_t)~0)
 66 | #define T1 /* 0xd76aa478 */ (T_MASK ^ 0x28955b87)
 67 | #define T2 /* 0xe8c7b756 */ (T_MASK ^ 0x173848a9)
 68 | #define T3    0x242070db
 69 | #define T4 /* 0xc1bdceee */ (T_MASK ^ 0x3e423111)
 70 | #define T5 /* 0xf57c0faf */ (T_MASK ^ 0x0a83f050)
 71 | #define T6    0x4787c62a
 72 | #define T7 /* 0xa8304613 */ (T_MASK ^ 0x57cfb9ec)
 73 | #define T8 /* 0xfd469501 */ (T_MASK ^ 0x02b96afe)
 74 | #define T9    0x698098d8
 75 | #define T10 /* 0x8b44f7af */ (T_MASK ^ 0x74bb0850)
 76 | #define T11 /* 0xffff5bb1 */ (T_MASK ^ 0x0000a44e)
 77 | #define T12 /* 0x895cd7be */ (T_MASK ^ 0x76a32841)
 78 | #define T13    0x6b901122
 79 | #define T14 /* 0xfd987193 */ (T_MASK ^ 0x02678e6c)
 80 | #define T15 /* 0xa679438e */ (T_MASK ^ 0x5986bc71)
 81 | #define T16    0x49b40821
 82 | #define T17 /* 0xf61e2562 */ (T_MASK ^ 0x09e1da9d)
 83 | #define T18 /* 0xc040b340 */ (T_MASK ^ 0x3fbf4cbf)
 84 | #define T19    0x265e5a51
 85 | #define T20 /* 0xe9b6c7aa */ (T_MASK ^ 0x16493855)
 86 | #define T21 /* 0xd62f105d */ (T_MASK ^ 0x29d0efa2)
 87 | #define T22    0x02441453
 88 | #define T23 /* 0xd8a1e681 */ (T_MASK ^ 0x275e197e)
 89 | #define T24 /* 0xe7d3fbc8 */ (T_MASK ^ 0x182c0437)
 90 | #define T25    0x21e1cde6
 91 | #define T26 /* 0xc33707d6 */ (T_MASK ^ 0x3cc8f829)
 92 | #define T27 /* 0xf4d50d87 */ (T_MASK ^ 0x0b2af278)
 93 | #define T28    0x455a14ed
 94 | #define T29 /* 0xa9e3e905 */ (T_MASK ^ 0x561c16fa)
 95 | #define T30 /* 0xfcefa3f8 */ (T_MASK ^ 0x03105c07)
 96 | #define T31    0x676f02d9
 97 | #define T32 /* 0x8d2a4c8a */ (T_MASK ^ 0x72d5b375)
 98 | #define T33 /* 0xfffa3942 */ (T_MASK ^ 0x0005c6bd)
 99 | #define T34 /* 0x8771f681 */ (T_MASK ^ 0x788e097e)
100 | #define T35    0x6d9d6122
101 | #define T36 /* 0xfde5380c */ (T_MASK ^ 0x021ac7f3)
102 | #define T37 /* 0xa4beea44 */ (T_MASK ^ 0x5b4115bb)
103 | #define T38    0x4bdecfa9
104 | #define T39 /* 0xf6bb4b60 */ (T_MASK ^ 0x0944b49f)
105 | #define T40 /* 0xbebfbc70 */ (T_MASK ^ 0x4140438f)
106 | #define T41    0x289b7ec6
107 | #define T42 /* 0xeaa127fa */ (T_MASK ^ 0x155ed805)
108 | #define T43 /* 0xd4ef3085 */ (T_MASK ^ 0x2b10cf7a)
109 | #define T44    0x04881d05
110 | #define T45 /* 0xd9d4d039 */ (T_MASK ^ 0x262b2fc6)
111 | #define T46 /* 0xe6db99e5 */ (T_MASK ^ 0x1924661a)
112 | #define T47    0x1fa27cf8
113 | #define T48 /* 0xc4ac5665 */ (T_MASK ^ 0x3b53a99a)
114 | #define T49 /* 0xf4292244 */ (T_MASK ^ 0x0bd6ddbb)
115 | #define T50    0x432aff97
116 | #define T51 /* 0xab9423a7 */ (T_MASK ^ 0x546bdc58)
117 | #define T52 /* 0xfc93a039 */ (T_MASK ^ 0x036c5fc6)
118 | #define T53    0x655b59c3
119 | #define T54 /* 0x8f0ccc92 */ (T_MASK ^ 0x70f3336d)
120 | #define T55 /* 0xffeff47d */ (T_MASK ^ 0x00100b82)
121 | #define T56 /* 0x85845dd1 */ (T_MASK ^ 0x7a7ba22e)
122 | #define T57    0x6fa87e4f
123 | #define T58 /* 0xfe2ce6e0 */ (T_MASK ^ 0x01d3191f)
124 | #define T59 /* 0xa3014314 */ (T_MASK ^ 0x5cfebceb)
125 | #define T60    0x4e0811a1
126 | #define T61 /* 0xf7537e82 */ (T_MASK ^ 0x08ac817d)
127 | #define T62 /* 0xbd3af235 */ (T_MASK ^ 0x42c50dca)
128 | #define T63    0x2ad7d2bb
129 | #define T64 /* 0xeb86d391 */ (T_MASK ^ 0x14792c6e)
130 | 
131 | /* Mix one 64-byte block into the four state words pms->abcd. (Upstream Aladdin code; only comments added.) */
132 | static void
133 | md5_process(md5_state_t *pms, const md5_byte_t *data /*[64]*/)
134 | {
135 |     md5_word_t
136 |         a = pms->abcd[0], b = pms->abcd[1],
137 |         c = pms->abcd[2], d = pms->abcd[3];
138 |     md5_word_t t;
139 | #if BYTE_ORDER > 0
140 |     /* Define storage only for big-endian CPUs. */
141 |     md5_word_t X[16];
142 | #else
143 |     /* Define storage for little-endian or both types of CPUs. */
144 |     md5_word_t xbuf[16];
145 |     const md5_word_t *X;
146 | #endif
147 |     /* Make X[0..15] the block's sixteen 32-bit words, least significant byte first (in place when possible, else via a copy). */
148 |     {
149 | #if BYTE_ORDER == 0
150 |         /*
151 |          * Determine dynamically whether this is a big-endian or
152 |          * little-endian machine, since we can use a more efficient
153 |          * algorithm on the latter.
154 |          */
155 |         static const int w = 1;
156 | 
157 |         if (*((const md5_byte_t *)&w)) /* dynamic little-endian */
158 | #endif
159 | #if BYTE_ORDER <= 0             /* little-endian */
160 |         {
161 |             /*
162 |              * On little-endian machines, we can process properly aligned
163 |              * data without copying it.
164 |              */
165 |             if (!((data - (const md5_byte_t *)0) & 3)) {
166 |                 /* data are properly aligned */
167 |                 X = (const md5_word_t *)data;
168 |             } else {
169 |                 /* not aligned */
170 |                 memcpy(xbuf, data, 64);
171 |                 X = xbuf;
172 |             }
173 |         }
174 | #endif
175 | #if BYTE_ORDER == 0
176 |         else                    /* dynamic big-endian */
177 | #endif
178 | #if BYTE_ORDER >= 0             /* big-endian */
179 |         {
180 |             /*
181 |              * On big-endian machines, we must arrange the bytes in the
182 |              * right order.
183 |              */
184 |             const md5_byte_t *xp = data;
185 |             int i;
186 | 
187 | #  if BYTE_ORDER == 0
188 |             X = xbuf;           /* (dynamic only) */
189 | #  else
190 | #    define xbuf X              /* (static only) */
191 | #  endif
192 |             for (i = 0; i < 16; ++i, xp += 4)
193 |                 xbuf[i] = xp[0] + (xp[1] << 8) + (xp[2] << 16) + (xp[3] << 24);
194 |         }
195 | #endif
196 |     }
197 |     /* ROTATE_LEFT(x, n): x rotated left by n bits within a 32-bit word. */
198 | #define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
199 | 
200 |     /* Round 1. */
201 |     /* Let [abcd k s i] denote the operation
202 |        a = b + ((a + F(b,c,d) + X[k] + T[i]) <<< s). */
203 | #define F(x, y, z) (((x) & (y)) | (~(x) & (z)))
204 | #define SET(a, b, c, d, k, s, Ti)\
205 |   t = a + F(b,c,d) + X[k] + Ti;\
206 |   a = ROTATE_LEFT(t, s) + b
207 |     /* Do the following 16 operations. */
208 |     SET(a, b, c, d,  0,  7,  T1);
209 |     SET(d, a, b, c,  1, 12,  T2);
210 |     SET(c, d, a, b,  2, 17,  T3);
211 |     SET(b, c, d, a,  3, 22,  T4);
212 |     SET(a, b, c, d,  4,  7,  T5);
213 |     SET(d, a, b, c,  5, 12,  T6);
214 |     SET(c, d, a, b,  6, 17,  T7);
215 |     SET(b, c, d, a,  7, 22,  T8);
216 |     SET(a, b, c, d,  8,  7,  T9);
217 |     SET(d, a, b, c,  9, 12, T10);
218 |     SET(c, d, a, b, 10, 17, T11);
219 |     SET(b, c, d, a, 11, 22, T12);
220 |     SET(a, b, c, d, 12,  7, T13);
221 |     SET(d, a, b, c, 13, 12, T14);
222 |     SET(c, d, a, b, 14, 17, T15);
223 |     SET(b, c, d, a, 15, 22, T16);
224 | #undef SET
225 | 
226 |      /* Round 2. */
227 |      /* Let [abcd k s i] denote the operation
228 |           a = b + ((a + G(b,c,d) + X[k] + T[i]) <<< s). */
229 | #define G(x, y, z) (((x) & (z)) | ((y) & ~(z)))
230 | #define SET(a, b, c, d, k, s, Ti)\
231 |   t = a + G(b,c,d) + X[k] + Ti;\
232 |   a = ROTATE_LEFT(t, s) + b
233 |      /* Do the following 16 operations. */
234 |     SET(a, b, c, d,  1,  5, T17);
235 |     SET(d, a, b, c,  6,  9, T18);
236 |     SET(c, d, a, b, 11, 14, T19);
237 |     SET(b, c, d, a,  0, 20, T20);
238 |     SET(a, b, c, d,  5,  5, T21);
239 |     SET(d, a, b, c, 10,  9, T22);
240 |     SET(c, d, a, b, 15, 14, T23);
241 |     SET(b, c, d, a,  4, 20, T24);
242 |     SET(a, b, c, d,  9,  5, T25);
243 |     SET(d, a, b, c, 14,  9, T26);
244 |     SET(c, d, a, b,  3, 14, T27);
245 |     SET(b, c, d, a,  8, 20, T28);
246 |     SET(a, b, c, d, 13,  5, T29);
247 |     SET(d, a, b, c,  2,  9, T30);
248 |     SET(c, d, a, b,  7, 14, T31);
249 |     SET(b, c, d, a, 12, 20, T32);
250 | #undef SET
251 | 
252 |      /* Round 3. */
253 |      /* Let [abcd k s t] denote the operation
254 |           a = b + ((a + H(b,c,d) + X[k] + T[i]) <<< s). */
255 | #define H(x, y, z) ((x) ^ (y) ^ (z))
256 | #define SET(a, b, c, d, k, s, Ti)\
257 |   t = a + H(b,c,d) + X[k] + Ti;\
258 |   a = ROTATE_LEFT(t, s) + b
259 |      /* Do the following 16 operations. */
260 |     SET(a, b, c, d,  5,  4, T33);
261 |     SET(d, a, b, c,  8, 11, T34);
262 |     SET(c, d, a, b, 11, 16, T35);
263 |     SET(b, c, d, a, 14, 23, T36);
264 |     SET(a, b, c, d,  1,  4, T37);
265 |     SET(d, a, b, c,  4, 11, T38);
266 |     SET(c, d, a, b,  7, 16, T39);
267 |     SET(b, c, d, a, 10, 23, T40);
268 |     SET(a, b, c, d, 13,  4, T41);
269 |     SET(d, a, b, c,  0, 11, T42);
270 |     SET(c, d, a, b,  3, 16, T43);
271 |     SET(b, c, d, a,  6, 23, T44);
272 |     SET(a, b, c, d,  9,  4, T45);
273 |     SET(d, a, b, c, 12, 11, T46);
274 |     SET(c, d, a, b, 15, 16, T47);
275 |     SET(b, c, d, a,  2, 23, T48);
276 | #undef SET
277 | 
278 |      /* Round 4. */
279 |      /* Let [abcd k s t] denote the operation
280 |           a = b + ((a + I(b,c,d) + X[k] + T[i]) <<< s). */
281 | #define I(x, y, z) ((y) ^ ((x) | ~(z)))
282 | #define SET(a, b, c, d, k, s, Ti)\
283 |   t = a + I(b,c,d) + X[k] + Ti;\
284 |   a = ROTATE_LEFT(t, s) + b
285 |      /* Do the following 16 operations. */
286 |     SET(a, b, c, d,  0,  6, T49);
287 |     SET(d, a, b, c,  7, 10, T50);
288 |     SET(c, d, a, b, 14, 15, T51);
289 |     SET(b, c, d, a,  5, 21, T52);
290 |     SET(a, b, c, d, 12,  6, T53);
291 |     SET(d, a, b, c,  3, 10, T54);
292 |     SET(c, d, a, b, 10, 15, T55);
293 |     SET(b, c, d, a,  1, 21, T56);
294 |     SET(a, b, c, d,  8,  6, T57);
295 |     SET(d, a, b, c, 15, 10, T58);
296 |     SET(c, d, a, b,  6, 15, T59);
297 |     SET(b, c, d, a, 13, 21, T60);
298 |     SET(a, b, c, d,  4,  6, T61);
299 |     SET(d, a, b, c, 11, 10, T62);
300 |     SET(c, d, a, b,  2, 15, T63);
301 |     SET(b, c, d, a,  9, 21, T64);
302 | #undef SET
303 | 
304 |      /* Then perform the following additions. (That is increment each
305 |         of the four registers by the value it had before this block
306 |         was started.) */
307 |     pms->abcd[0] += a;
308 |     pms->abcd[1] += b;
309 |     pms->abcd[2] += c;
310 |     pms->abcd[3] += d;
311 | }
312 | 
313 | /* Reset pms: zero the bit count and load the four initial abcd words. (Restyled from the upstream source.) */
314 | void md5_init(md5_state_t *pms)
315 | {
316 |     memset(pms->count, 0, sizeof(pms->count));
317 |     pms->abcd[0] = 0x67452301;
318 |     pms->abcd[1] = T_MASK ^ 0x10325476;     /* 0xefcdab89 */
319 |     pms->abcd[2] = T_MASK ^ 0x67452301;     /* 0x98badcfe */
320 |     pms->abcd[3] = 0x10325476;
321 | }
322 | 
323 | /* Feed nbytes of data into the running hash; a trailing partial block is kept in pms->buf.
324 |  * (Restyled from the upstream Aladdin source; behavior unchanged.) */
325 | void md5_append(md5_state_t *pms, const md5_byte_t *data, unsigned int nbytes)
326 | {
327 |     const md5_byte_t *cursor = data;
328 |     unsigned int remaining = nbytes;
329 |     unsigned int buffered = (pms->count[0] >> 3) & 63;  /* bytes already held in pms->buf */
330 |     md5_word_t nbits = (md5_word_t)(nbytes << 3);
331 | 
332 |     if (nbytes == 0)
333 |         return;
334 | 
335 |     if (nbytes > INT_MAX - buffered) {
336 |         /* Very large input: finish the current block first, then recurse on the rest. */
337 |         unsigned int head = 64 - buffered;
338 |         md5_append(pms, data, head);
339 |         md5_append(pms, data + head, nbytes - head);
340 |         return;
341 |     }
342 | 
343 |     /* Add nbytes * 8 to the bit count, which is kept as two 32-bit words (low word first). */
344 |     pms->count[1] += nbytes >> 29;
345 |     pms->count[0] += nbits;
346 |     if (pms->count[0] < nbits)      /* low word wrapped around: carry into the high word */
347 |         pms->count[1]++;
348 | 
349 |     if (buffered) {
350 |         /* Top up the partly filled buffer and process it once it holds 64 bytes. */
351 |         unsigned int take = 64 - buffered;
352 |         if (buffered + nbytes <= 64)
353 |             take = nbytes;
354 |         memcpy(pms->buf + buffered, cursor, take);
355 |         if (buffered + take < 64)
356 |             return;
357 |         cursor += take;
358 |         remaining -= take;
359 |         md5_process(pms, pms->buf);
360 |     }
361 | 
362 |     /* Consume whole 64-byte blocks straight from the input. */
363 |     while (remaining >= 64) {
364 |         md5_process(pms, cursor);
365 |         cursor += 64;
366 |         remaining -= 64;
367 |     }
368 | 
369 |     if (remaining)                  /* keep the tail for the next call */
370 |         memcpy(pms->buf, cursor, remaining);
371 | }
372 | 
373 | /* Pad the message, append its bit length, and write the 16-byte digest.
374 |  * (Restyled from the upstream Aladdin source; behavior unchanged.) */
375 | void md5_finish(md5_state_t *pms, md5_byte_t digest[16])
376 | {
377 |     static const md5_byte_t pad[64] = { 0x80 };     /* the other 63 bytes are zero */
378 |     md5_byte_t length_le[8];
379 |     unsigned int pad_len;
380 |     int i;
381 | 
382 |     /* Capture the bit count, least significant byte first, before padding changes it. */
383 |     for (i = 0; i < 8; ++i)
384 |         length_le[i] = (md5_byte_t)(pms->count[i / 4] >> ((i % 4) * 8));
385 | 
386 |     /* Pad with 1..64 bytes so that the message length becomes 56 mod 64. */
387 |     pad_len = ((55 - (pms->count[0] >> 3)) & 63) + 1;
388 |     md5_append(pms, pad, pad_len);
389 |     md5_append(pms, length_le, 8);
390 | 
391 |     /* Emit abcd[0..3], each word least significant byte first. */
392 |     for (i = 0; i < 16; ++i)
393 |         digest[i] = (md5_byte_t)(pms->abcd[i / 4] >> ((i % 4) * 8));
394 | }
395 | 


--------------------------------------------------------------------------------
/src/md5.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |   Copyright (C) 1999, 2002 Aladdin Enterprises.  All rights reserved.
 3 | 
 4 |   This software is provided 'as-is', without any express or implied
 5 |   warranty.  In no event will the authors be held liable for any damages
 6 |   arising from the use of this software.
 7 | 
 8 |   Permission is granted to anyone to use this software for any purpose,
 9 |   including commercial applications, and to alter it and redistribute it
10 |   freely, subject to the following restrictions:
11 | 
12 |   1. The origin of this software must not be misrepresented; you must not
13 |      claim that you wrote the original software. If you use this software
14 |      in a product, an acknowledgment in the product documentation would be
15 |      appreciated but is not required.
16 |   2. Altered source versions must be plainly marked as such, and must not be
17 |      misrepresented as being the original software.
18 |   3. This notice may not be removed or altered from any source distribution.
19 | 
20 |   L. Peter Deutsch
21 |   ghost@aladdin.com
22 | 
23 |  */
24 | /* $Id$ */
25 | /*
26 |   Independent implementation of MD5 (RFC 1321).
27 | 
28 |   This code implements the MD5 Algorithm defined in RFC 1321, whose
29 |   text is available at
30 | 	http://www.ietf.org/rfc/rfc1321.txt
31 |   The code is derived from the text of the RFC, including the test suite
32 |   (section A.5) but excluding the rest of Appendix A.  It does not include
33 |   any code or documentation that is identified in the RFC as being
34 |   copyrighted.
35 | 
36 |   The original and principal author of md5.h is L. Peter Deutsch
37 |   <ghost@aladdin.com>.  Other authors are noted in the change history
38 |   that follows (in reverse chronological order):
39 | 
40 |   2002-04-13 lpd Removed support for non-ANSI compilers; removed
41 | 	references to Ghostscript; clarified derivation from RFC 1321;
42 | 	now handles byte order either statically or dynamically.
43 |   1999-11-04 lpd Edited comments slightly for automatic TOC extraction.
44 |   1999-10-18 lpd Fixed typo in header comment (ansi2knr rather than md5);
45 | 	added conditionalization for C++ compilation from Martin
46 | 	Purschke <purschke@bnl.gov>.
47 |   1999-05-03 lpd Original version.
48 |  */
49 | 
50 | #ifndef md5_INCLUDED
51 | #  define md5_INCLUDED
52 | 
53 | /*
54 |  * This package supports both compile-time and run-time determination of CPU
55 |  * byte order.  If ARCH_IS_BIG_ENDIAN is defined as 0, the code will be
56 |  * compiled to run only on little-endian CPUs; if ARCH_IS_BIG_ENDIAN is
57 |  * defined as non-zero, the code will be compiled to run only on big-endian
58 |  * CPUs; if ARCH_IS_BIG_ENDIAN is not defined, the code will be compiled to
59 |  * run on either big- or little-endian CPUs, but will run slightly less
60 |  * efficiently on either one than if ARCH_IS_BIG_ENDIAN is defined.
61 |  */
62 | 
63 | typedef unsigned char md5_byte_t; /* 8-bit byte */
64 | typedef unsigned int md5_word_t; /* 32-bit word */
65 | 
66 | /* Define the state of the MD5 Algorithm. */
67 | typedef struct md5_state_s {
68 |     md5_word_t count[2];	/* message length in bits, lsw first: count[0] is the low word, count[1] the high word */
69 |     md5_word_t abcd[4];		/* digest buffer: the four state words, serialized by md5_finish */
70 |     md5_byte_t buf[64];		/* accumulate block: holds a partial 64-byte block between md5_append calls */
71 | } md5_state_t;
72 | 
73 | #ifdef __cplusplus
74 | extern "C" 
75 | {
76 | #endif
77 | 
78 | /* Initialize the algorithm. */
79 | void md5_init(md5_state_t *pms);
80 | 
81 | /* Append a string to the message. */
82 | void md5_append(md5_state_t *pms, const md5_byte_t *data, unsigned int nbytes);
83 | 
84 | /* Finish the message and return the digest. */
85 | void md5_finish(md5_state_t *pms, md5_byte_t digest[16]);
86 | 
87 | #ifdef __cplusplus
88 | }  /* end extern "C" */
89 | #endif
90 | 
91 | #endif /* md5_INCLUDED */
92 | 


--------------------------------------------------------------------------------
/src/mmapbitarray.c:
--------------------------------------------------------------------------------
  1 | #include <stdlib.h>
  2 | #include <string.h>
  3 | #include <stdio.h>
  4 | #include <sys/mman.h>
  5 | #include <errno.h>
  6 | #include <sys/stat.h>
  7 | #include <fcntl.h>
  8 | #include <unistd.h>
  9 | #include <math.h>
 10 | #include <time.h>
 11 | 
 12 | #include "mmapbitarray.h"
 13 | 
 14 | /* Private helpers */
 15 | static inline uint64_t _filesize(int fd);
 16 | static inline int _valid_magic(int fd);
 17 | int _initialize_file(int fd, size_t end, BTYPE num_bits, const char * header, int32_t header_len);
 18 | uint64_t _get_num_bits(int fd);
 19 | static inline size_t _mmap_size(MBArray * array);
 20 | /*    __attribute__((always_inline));*/
 21 | 
 22 | static inline int _assert_comparable(MBArray * array1, MBArray * array2);
 23 | /*    __attribute__((always_inline));;*/
 24 | 
 25 | /*
 26 |  * Create a bit array of num_bits bits backed by heap memory instead of a
 27 |  * memory-mapped file.  Returns the array, or NULL if an allocation fails.
 28 |  */
 29 | MBArray * mbarray_Create_Malloc(BTYPE num_bits)
 30 | {
 31 |     MBArray * array;
 32 | 
 33 |     /* Failure is detected from the returned pointers only: errno may be
 34 |      * nonzero after a successful allocation, so it is not consulted. */
 35 |     errno = 0;
 36 |     array = (MBArray *)malloc(sizeof(MBArray));
 37 |     if (!array) {
 38 |         return NULL;
 39 |     }
 40 | 
 41 |     /* No file is involved, so the file-related fields get neutral values. */
 42 |     array->filename      = NULL;
 43 |     array->vector        = NULL;
 44 |     array->fd            = 0;
 45 |     array->preamblesize  = 0;
 46 |     array->preamblebytes = 0;
 47 | 
 48 |     /* size counts DTYPE words and bytes counts bytes; both round up. */
 49 |     array->size  = (size_t)ceil((double)num_bits / sizeof(DTYPE) / 8.0);
 50 |     array->bytes = (size_t)ceil((double)num_bits / 8.0);
 51 |     array->bits  = num_bits;
 52 | 
 53 |     /* Allocate whole DTYPE words: mbarray_ClearAll clears
 54 |      * sizeof(DTYPE) * size bytes, which can be more than `bytes`. */
 55 |     array->vector = (DTYPE *)calloc(array->size, sizeof(DTYPE));
 56 |     if (!array->vector) {
 57 |         mbarray_Destroy(array);
 58 |         return NULL;
 59 |     }
 60 |     return array;
 61 | }
 62 | 
 63 | /* Abandon a partially built mmap-backed array: unmap, close, free.
 64 |  * errno becomes err, or keeps its value on entry when err is 0.
 65 |  * Always returns NULL so callers can write `return _create_mmap_fail(...)`. */
 66 | static MBArray * _create_mmap_fail(MBArray * array, int err)
 67 | {
 68 |     int saved_errno = err ? err : errno;
 69 | 
 70 |     if (array->vector != NULL) {
 71 |         munmap(array->vector, _mmap_size(array));
 72 |     }
 73 |     if (array->fd >= 0) {
 74 |         close(array->fd);
 75 |     }
 76 |     free(array);
 77 |     errno = saved_errno;    /* cleanup calls must not mask the real error */
 78 |     return NULL;
 79 | }
 80 | 
 81 | /*
 82 |  * Open `file` (creating it when oflag has O_CREAT) and memory-map it as a
 83 |  * bit array.  num_bits == 0 means "use the bit count stored in the file".
 84 |  * header/header_len are handed to _initialize_file when the file is empty.
 85 |  * Returns the array, or NULL with errno set; every failure path releases
 86 |  * the descriptor and any mapping.
 87 |  */
 88 | MBArray * mbarray_Create_Mmap(BTYPE num_bits, const char * file, const char * header, int32_t header_len, int oflag, int perms)
 89 | {
 90 |     MBArray * array;
 91 |     uint64_t filesize;
 92 |     int32_t fheaderlen;
 93 | 
 94 |     errno = 0;
 95 |     array = (MBArray *)malloc(sizeof(MBArray));
 96 |     if (!array) {
 97 |         return NULL;
 98 |     }
 99 | 
100 |     array->filename = NULL;
101 |     array->vector = NULL;
102 |     array->fd = open(file, oflag, perms);
103 |     if (array->fd < 0) {
104 |         return _create_mmap_fail(array, EINVAL);
105 |     }
106 | 
107 |     /* A header length already in the file must match the caller's unless
108 |      * O_CREAT was given; when one is present, the stored length wins. */
109 |     fheaderlen = mbarray_HeaderLen(array);
110 |     errno = 0;
111 |     if (fheaderlen >= 0 && !(oflag & O_CREAT) && fheaderlen != header_len) {
112 |         return _create_mmap_fail(array, EINVAL);
113 |     }
114 |     if (fheaderlen >= 0) {
115 |         header_len = fheaderlen;
116 |     }
117 | 
118 |     /* This size is using 256-byte alignment so that we can use pretty much any base 2 data type */
119 |     array->preamblebytes = MBAMAGICSIZE + sizeof(BTYPE) + sizeof(header_len) + header_len;
120 |     array->preamblesize = ((int)ceil((double)array->preamblebytes / 256.0) * 256) / sizeof(DTYPE);
121 |     array->preamblebytes = array->preamblesize * (sizeof(DTYPE));
122 | 
123 |     filesize = _filesize(array->fd);
124 |     if (filesize == 0xffffffffffffffff) {
125 |         return _create_mmap_fail(array, 0);     /* presumably _filesize's error value; checked before any read */
126 |     }
127 |     if (filesize > 50 && !num_bits) {
128 |         num_bits = _get_num_bits(array->fd);
129 |     }
130 |     array->size = (size_t)ceil((double)num_bits / sizeof(DTYPE) / 8.0);
131 |     array->bytes = (size_t)ceil((double)num_bits / 8.0);
132 | 
133 |     if (filesize && !_valid_magic(array->fd)) {
134 |         return _create_mmap_fail(array, EINVAL);
135 |     }
136 |     else if (filesize && filesize < (array->bytes + array->preamblebytes - 1)) {
137 |         return _create_mmap_fail(array, EINVAL);
138 |     }
139 |     else if (!filesize) {
140 |         /* Empty file: usable only with O_CREAT and a known bit count. */
141 |         if (!(oflag & O_CREAT) || (!num_bits) || _initialize_file(array->fd, array->bytes + array->preamblebytes - 1, num_bits, header, header_len)) {
142 |             return _create_mmap_fail(array, errno ? 0 : ENOENT);
143 |         }
144 |     }
145 |     else if (!num_bits) {
146 |         num_bits = _get_num_bits(array->fd);
147 |         array->size = (size_t)ceil((double)num_bits / sizeof(DTYPE) / 8.0);
148 |         array->bytes = (size_t)ceil((double)num_bits / 8.0);
149 |     }
150 |     else if (_get_num_bits(array->fd) != num_bits) {
151 |         return _create_mmap_fail(array, EINVAL);
152 |     }
153 | 
154 |     errno = 0;
155 |     array->vector = (DTYPE *)mmap(NULL, _mmap_size(array), PROT_READ | PROT_WRITE,
156 |                                   MAP_SHARED, array->fd, 0);
157 |     if (array->vector == (DTYPE *)MAP_FAILED) {     /* mmap fails with MAP_FAILED, not NULL */
158 |         array->vector = NULL;
159 |         return _create_mmap_fail(array, 0);
160 |     }
161 | 
162 |     array->filename = (char *)malloc(strlen(file) + 1);
163 |     if (!array->filename) {
164 |         return _create_mmap_fail(array, 0);         /* the vector is a mapping: unmap it, never free() it */
165 |     }
166 |     strcpy((char *)array->filename, file);
167 |     array->bits = num_bits;
168 |     return array;
169 | }
170 | 
171 | /* Release an array: free or unmap its bit data (closing the descriptor of a
172 |  * mapped array), then free the filename and the struct.  NULL is ignored. */
173 | void mbarray_Destroy(MBArray * array)
174 | {
175 |     if (array == NULL) {
176 |         return;
177 |     }
178 |     if (array->vector != NULL && array->filename == NULL) {
179 |         /* No filename: the vector was allocated on the heap, not mapped. */
180 |         free((void*)array->vector);
181 |     }
182 |     else if (array->vector != NULL) {
183 |         if (munmap(array->vector, _mmap_size(array)) != 0) {
184 |             fprintf(stderr, "Unable to close mmap!\n");
185 |         }
186 |         if (array->fd >= 0) {
187 |             fsync(array->fd);
188 |             close(array->fd);
189 |             array->fd = -1;
190 |         }
191 |     }
192 |     array->vector = NULL;
193 |     if (array->filename != NULL) {
194 |         free((void *)array->filename);
195 |         array->filename = NULL;
196 |     }
197 |     free(array);
198 | }
199 | 
200 | int32_t mbarray_HeaderLen(MBArray * array)
201 | {
202 |     int32_t header_len;
203 |     errno = 0;
204 |     if (pread(array->fd, &header_len, sizeof(header_len), MBAMAGICSIZE + sizeof(BTYPE)) != sizeof(header_len)) {
205 |         return -1;
206 |     }
207 |     return header_len;
208 | }
209 | 
210 | char * mbarray_Header(char * dest, MBArray * array, int maxlen)
211 | {
212 |     int32_t header_len = mbarray_HeaderLen(array);
213 |     int readnum = (maxlen < header_len) ? (maxlen) : header_len;
214 | 
215 |     errno = 0;
216 | 
217 |     if (pread(array->fd,
218 |               dest,
219 |               readnum,
220 |               MBAMAGICSIZE + sizeof(BTYPE) + sizeof(int32_t)) != readnum) {
221 |         return NULL;
222 |     }
223 |     return dest;
224 | }
225 | 
226 | 
227 | int mbarray_Sync(MBArray * array)
228 | {
229 |     if (!array || !array->vector) {
230 |         errno = EINVAL;
231 |         return 1;
232 |     }
233 |     if (msync(array->vector, _mmap_size(array), MS_ASYNC)) {
234 |         return 1;
235 |     }
236 |     return 0;
237 | }
238 | 
239 | 
240 | int mbarray_ClearAll(MBArray * array)
241 | {
242 |     if (!array || !array->vector) {
243 |         errno = EINVAL;
244 |         return 1;
245 |     }
246 |     memset((void *)(array->vector + array->preamblesize), 0, sizeof(DTYPE) * array->size);
247 |     return 0;
248 | }
249 | 
250 | 
251 | MBArray * mbarray_And(MBArray * dest, MBArray * array2)
252 | {
253 |     register int i;
254 |     if (_assert_comparable(dest, array2))
255 |         return NULL;
256 | 
257 |     for (i = 0; i < dest->size + dest->preamblesize; i++) {
258 |         dest->vector[i] &= array2->vector[i];
259 |     }
260 |     return dest;
261 | }
262 | 
263 | 
264 | MBArray * mbarray_Or(MBArray * dest, MBArray * array2)
265 | {
266 |     register int i;
267 |     if (_assert_comparable(dest, array2))
268 |         return NULL;
269 |     for (i = 0; i < dest->size + dest->preamblesize; i++) {
270 |         dest->vector[i] |= array2->vector[i];
271 |     }
272 |     return dest;
273 | }
274 | 
275 | 
276 | MBArray * mbarray_Xor(MBArray * dest, MBArray * array2)
277 | {
278 |     register int i;
279 |     if (_assert_comparable(dest, array2))
280 |         return NULL;
281 | 
282 |     for (i = 0; i < dest->size + dest->preamblesize; i++) {
283 |         dest->vector[i] ^= array2->vector[i];
284 |     }
285 |     return dest;
286 | }
287 | 
288 | 
289 | MBArray * mbarray_And_Ternary(MBArray * dest, MBArray * a, MBArray * b)
290 | {
291 |     register int i;
292 |     if (_assert_comparable(a, b) || _assert_comparable(dest, b))
293 |         return NULL;
294 | 
295 |     for (i = 0; i < a->size + a->preamblesize; i++) {
296 |         dest->vector[i] = a->vector[i] & b->vector[i];
297 |     }
298 |     return dest;
299 | }
300 | 
301 | MBArray * mbarray_Or_Ternary(MBArray * dest, MBArray * a, MBArray * b)
302 | {
303 |     register int i;
304 |     if (_assert_comparable(a, b) || _assert_comparable(dest, b))
305 |         return NULL;
306 | 
307 |     for (i = 0; i < a->size + a->preamblesize; i++) {
308 |         dest->vector[i] = a->vector[i] | b->vector[i];
309 |     }
310 |     return dest;
311 | }
312 | 
313 | MBArray * mbarray_Xor_Ternary(MBArray * dest, MBArray * a, MBArray * b)
314 | {
315 |     register int i;
316 |     if (_assert_comparable(a, b) || _assert_comparable(dest, b))
317 |         return NULL;
318 | 
319 |     for (i = 0; i < a->size + a->preamblesize; i++) {
320 |         dest->vector[i] = a->vector[i] ^ b->vector[i];
321 |     }
322 |     return dest;
323 | }
324 | 
325 | 
326 | MBArray * mbarray_Copy_Template(MBArray * src, char * filename, int perms)
327 | {
328 |     int header_len = mbarray_HeaderLen(src);
329 |     char * header;
330 | 
331 |     if (header_len < 0) {
332 |         return NULL;
333 |     }
334 | 
335 |     if (!strcmp(filename, src->filename)) {
336 |         errno = EINVAL;
337 |         return NULL;
338 |     }
339 | 
340 |     header = (char *)malloc(header_len + 1);
341 |     if (header == NULL) {
342 |         errno = ENOMEM;
343 |         return NULL;
344 |     }
345 | 
346 |     if (mbarray_Header(header, src, header_len) == NULL) {
347 |         free(header);
348 |         return NULL;
349 |     }
350 | 
351 |     return mbarray_Create_Mmap(
352 |                           src->bits,
353 |                           filename,
354 |                           header,
355 |                           header_len,
356 |                           O_CREAT | O_RDWR,
357 |                           perms);
358 | }
359 | 
360 | 
361 | /*MBArray * mbarray_Copy(MBarray * src, const char * filename);*/
362 | uint64_t mbarray_FileSize(MBArray * array)
363 | {
364 |     return _filesize(array->fd);
365 | }
366 | 
367 | char * mbarray_CharData(MBArray * array)
368 | {
369 |     return (char *)array->vector;
370 | }
371 | 
372 | 
373 | int mbarray_Update(MBArray * array, char * data, int size)
374 | {
375 |     memcpy(array->vector, data, size);
376 |     array->bits = _get_num_bits(array->fd);
377 |     array->size = (size_t)ceil((double)array->bits / sizeof(DTYPE) / 8.0);
378 |     array->bytes = (size_t)ceil((double)array->bits / 8.0);
379 |     return 0;
380 | }
381 | 
382 | static inline int _assert_comparable(MBArray * array1, MBArray * array2)
383 | {
384 |     errno = EINVAL;
385 |     if (array1->preamblebytes != array2->preamblebytes) {
386 |         return 1;
387 |     }
388 | 
389 |     if (memcmp((char *)array1->vector, (char *)array2->vector, array1->preamblebytes)) {
390 |         return 1;
391 |     }
392 | 
393 |     return 0;
394 | }
395 | __attribute__((always_inline))
396 | 
397 | 
398 | static inline size_t _mmap_size(MBArray * array)
399 | {
400 |     return array->bytes + array->preamblebytes;
401 | }
402 | __attribute__((always_inline))
403 | 
404 | 
405 | static inline int _valid_magic(int fd)
406 | {
407 |     size_t nbytes;
408 |     char buffer[MBAMAGICSIZE + 1];
409 | 
410 |     nbytes = pread(fd, buffer, MBAMAGICSIZE, 0);
411 |     if (errno || nbytes != MBAMAGICSIZE || strncmp(MBAMAGIC, buffer, MBAMAGICSIZE)) {
412 |         return 0;
413 |     }
414 |     else {
415 |         return 1;
416 |     }
417 | }
418 | 
/*
 * Return the size in bytes of the file behind `fd`, or UINT64_MAX
 * (0xffffffffffffffff) when fstat() fails.
 *
 * Only fstat()'s own return value decides success. The previous version
 * also tested errno without clearing it, so a stale error from an earlier,
 * unrelated call turned a successful fstat() into a reported failure.
 */
static inline uint64_t _filesize(int fd)
{
    struct stat info;

    if (fstat(fd, &info) != 0) {
        return UINT64_MAX;
    }

    return (uint64_t)info.st_size;
}
430 | 
431 | uint64_t _get_num_bits(int fd) {
432 |     uint64_t num_bits;
433 |     errno = 0;
434 |     if (pread(fd, &num_bits, sizeof(uint64_t), MBAMAGICSIZE) != sizeof(uint64_t)) {
435 |         return 0;
436 |     }
437 |     return num_bits;
438 | }
439 | 
440 | int _initialize_file(int fd, size_t end, BTYPE num_bits, const char * header, int32_t header_len)
441 | {
442 |     unsigned char zero = 0;
443 |     errno = 0;
444 |     lseek(fd, 0, SEEK_SET);
445 |     if (write(fd, MBAMAGIC, MBAMAGICSIZE) != MBAMAGICSIZE) {
446 |         return 1;
447 |     }
448 |     if (write(fd, &num_bits, sizeof(BTYPE)) != sizeof(BTYPE)) {
449 |         return 1;
450 |     }
451 |     if (write(fd, &header_len, sizeof(header_len)) != sizeof(header_len)) {
452 |         return 1;
453 |     }
454 |     if (header_len) {
455 |         if (write(fd, header, header_len) != header_len) {
456 |             return 1;
457 |         }
458 |     }
459 | 
460 |     lseek(fd, end, SEEK_SET);
461 |     if (write(fd, &zero, 1) != 1) {
462 |         return 1;
463 |     }
464 | 
465 |     if (errno) {
466 |         return 1;
467 |     }
468 |     return 0;
469 | }
470 | 
471 | 
472 | 
473 | #ifdef MBACREATE
474 | int main(int argc, char ** argv)
475 | {
476 |     MBArray * array;
477 |     if (argc < 3) {
478 |         fprintf(stderr, "Usage: %s FILENAME SIZE\nCreate new mmap'd array file.\n", argv[0]);
479 |         return 1;
480 |     }
481 | 
482 |     array = mbarray_Create_Mmap(
483 |                            atol(argv[2]),
484 |                            argv[1],
485 |                            "",
486 |                            0,
487 |                            O_RDWR | O_CREAT,
488 |                            0777);
489 |     if (!array)
490 |         goto error;
491 |     mbarray_ClearAll(array);
492 |     mbarray_Destroy(array);
493 |     return 0;
494 |     error:
495 |     fprintf(stderr, "Error: %s [%d]\n", strerror(errno), errno);
496 |     return 255;
497 | }
498 | #endif
499 | 
500 | #ifdef MBAQUERY
501 | int main(int argc, char ** argv)
502 | {
503 |     BTYPE bit;
504 |     int value;
505 |     MBArray * array;
506 |     int i;
507 |     if (argc < 3) {
508 |         fprintf(stderr, "Usage: %s FILE BIT [VALUE]\nValue is either 0 or 1 and will define a set/clear operation.\n", argv[0]);
509 |         return 255;
510 |     }
511 | 
512 |     /* Open file */
513 |     array = mbarray_Create_Mmap(
514 |                            0,
515 |                            argv[1],
516 |                            "",
517 |                            0,
518 |                            O_RDWR,
519 |                            0);
520 |     if (!array)
521 |         goto error;
522 | 
523 |     bit = atol(argv[2]);
524 | 
525 |     if (argc > 3) {
526 |         value = atol(argv[3]);
527 |         if (value) {
528 |             if (mbarray_Set(array, bit))
529 |                 goto error;
530 |         }
531 |         else {
532 |             if (mbarray_Clear(array, bit))
533 |                 goto error;
534 |         }
535 |     }
536 | 
537 |     for (i = 0; i < array->bits; i++) {
538 |         mbarray_Set(array, i);
539 |         mbarray_Test(array, i);
540 |     }
541 |     getc(stdin);
542 |     bit = 1 - mbarray_Test(array, bit);
543 |     mbarray_Destroy(array);
544 |     return bit;
545 |     error:
546 |     fprintf(stderr, "Error: %s [%d]\n", strerror(errno), errno);
547 |     return 255;
548 | }
549 | #endif
550 | 


--------------------------------------------------------------------------------
/src/mmapbitarray.h:
--------------------------------------------------------------------------------
  1 | #ifndef __MMAPBITARRAY_H
  2 | #define __MMAPBITARRAY_H 1
  3 | #include <stdlib.h>
  4 | #include <stdint.h>
  5 | #include <errno.h>
  6 | 
  7 | /* Types */
  8 | typedef uint32_t DTYPE;
  9 | typedef uint64_t BTYPE;
 10 | 
/*
 * A fixed-size bit array whose words live either in a memory-mapped file or
 * in a malloc'd buffer. `vector` holds `preamblesize` leading words followed
 * by `size` payload words.
 */
struct MmapBitArray {
    BTYPE bits;             /* number of bits in the array */
    size_t size;            /* payload length in DTYPE words: ceil(bits / (8 * sizeof(DTYPE))) */
    size_t preamblesize;    /* index within `vector` of the first payload word */
    size_t bytes;           /* payload length in bytes: ceil(bits / 8) */
    size_t preamblebytes;   /* byte length of the preamble; added to `bytes` for the mmap length */
    const char * filename;  /* heap copy of the backing file path; NULL is treated as malloc-backed */
    DTYPE * vector;         /* preamble words followed by the payload words */
    int32_t fd;             /* descriptor of the backing file */
};
 21 | 
 22 | typedef struct MmapBitArray MBArray;
 23 | 
 24 | 
 25 | /* Constants */
enum {
    /* A DTYPE word with every bit set. */
    ONES = (DTYPE)-1,
    /* Number of characters in MBAMAGIC, not counting the terminating NUL. */
    MBAMAGICSIZE = 9
};
 30 | #define MBAMAGIC "MBITARRAY"
 31 | 
 32 | 
 33 | 
 34 | /* Functions */
 35 | MBArray * mbarray_Create_Malloc(BTYPE num_bits);
 36 | 
 37 | MBArray * mbarray_Create_Mmap(BTYPE num_bits, const char * file, const char * header, int header_len, int oflag, int perms);
 38 | 
 39 | void mbarray_Destroy(MBArray * array);
 40 | 
 41 | int mbarray_ClearAll(MBArray * array);
 42 | 
 43 | int mbarray_Sync(MBArray * array);
 44 | 
 45 | int32_t mbarray_HeaderLen(MBArray * array);
 46 | 
 47 | char * mbarray_Header(char * dest, MBArray * array, int maxlen);
 48 | 
 49 | MBArray * mbarray_And(MBArray * dest, MBArray * array2);
 50 | 
 51 | MBArray * mbarray_Or(MBArray * dest, MBArray * array2);
 52 | 
 53 | MBArray * mbarray_Xor(MBArray * dest, MBArray * array2);
 54 | 
 55 | MBArray * mbarray_And_Ternary(MBArray * dest, MBArray * a, MBArray * b);
 56 | 
 57 | MBArray * mbarray_Or_Ternary(MBArray * dest, MBArray * a, MBArray * b);
 58 | 
 59 | MBArray * mbarray_Xor_Ternary(MBArray * dest, MBArray * a, MBArray * b);
 60 | 
 61 | MBArray * mbarray_Copy_Template(MBArray * src, char * filename, int perms);
 62 | 
 63 | int mbarray_Update(MBArray * array, char * data, int size);
 64 | /*MBArray * mbarray_Copy(MBarray * src, const char * filename);*/
 65 | 
 66 | uint64_t mbarray_FileSize(MBArray * array);
 67 | 
 68 | char * mbarray_CharData(MBArray * array);
 69 | 
 70 | static inline size_t _vector_offset(MBArray * array, BTYPE bit)
 71 | {
 72 |     return (size_t)(array->preamblesize + bit / (sizeof(DTYPE) << 3));
 73 | }
 74 | __attribute__((always_inline))
 75 | 
 76 | 
 77 | static inline size_t _vector_byte(BTYPE bit) {
 78 |     return 1 << (bit % (sizeof(DTYPE) << 3));
 79 | }
 80 | __attribute__((always_inline))
 81 | 
 82 | 
 83 | static inline int mbarray_Set(MBArray * array, BTYPE bit)
 84 | {
 85 |     if (bit > array->bits) {
 86 |         errno = EINVAL;
 87 |         return 1;
 88 |     }
 89 |     array->vector[_vector_offset(array, bit)] |= _vector_byte(bit);
 90 |     return 0;
 91 | }
 92 | __attribute__((always_inline))
 93 | 
 94 | 
 95 | static inline int mbarray_Clear(MBArray * array, BTYPE bit)
 96 | {
 97 |     if (bit > array->bits) {
 98 |         errno = EINVAL;
 99 |         return 1;
100 |     }
101 |     array->vector[_vector_offset(array, bit)] &= (ONES - _vector_byte(bit));
102 |     return 0;
103 | }
104 | __attribute__((always_inline))
105 | 
106 | 
107 | static inline int mbarray_Test(MBArray * array, BTYPE bit)
108 | {
109 |     if (bit > array->bits) {
110 |         errno = EINVAL;
111 |         return -1;
112 |     }
113 |     return ((array->vector[_vector_offset(array, bit)] & _vector_byte(bit)) != 0);
114 | }
115 | __attribute__((always_inline))
116 | 
117 | 
118 | #endif
119 | 


--------------------------------------------------------------------------------
/src/primetester.c:
--------------------------------------------------------------------------------
 1 | #ifndef __PRIMETESTER_C
 2 | #define __PRIMETESTER_C 1
 3 | 
 4 | #include <stdlib.h>
 5 | #include <time.h>
 6 | #include <stdio.h>
 7 | 
 8 | 
 9 | #include "primetester.h"
10 | 
11 | PTYPE next_prime(PTYPE prime)
12 | {
13 |     register PTYPE initial_prime = 89;
14 |     while (initial_prime < prime) {
15 |         initial_prime <<= 1;
16 |         ++initial_prime;
17 |     }
18 |     return initial_prime;
19 | }
20 | 
21 | #endif
22 | 


--------------------------------------------------------------------------------
/src/primetester.h:
--------------------------------------------------------------------------------
1 | #ifndef __PRIMETESTER_H
2 | #define __PRIMETESTER_H 1
3 | 
4 | typedef unsigned long PTYPE;
5 | PTYPE next_prime(PTYPE prime);
6 | 
7 | #endif
8 | 


--------------------------------------------------------------------------------
/src/pybloomfilter.pyx:
--------------------------------------------------------------------------------
  1 | VERSION = (0, 3, 14)
  2 | AUTHOR = "Michael Axiak"
  3 | 
  4 | __VERSION__ = VERSION
  5 | 
  6 | 
  7 | cimport cbloomfilter
  8 | cimport cpython
  9 | 
 10 | import random
 11 | import os
 12 | import math
 13 | import errno as eno
 14 | import array
 15 | import zlib
 16 | import shutil
 17 | 
 18 | 
 19 | cdef extern int errno
 20 | 
 21 | cdef construct_mode(mode):
 22 |     result = os.O_RDONLY
 23 |     if 'w' in mode:
 24 |         result |= os.O_RDWR
 25 |     if 'b' in mode and hasattr(os, 'O_BINARY'):
 26 |         result |= os.O_BINARY
 27 |     if mode.endswith('+'):
 28 |         result |= os.O_CREAT
 29 |     return result
 30 | 
 31 | cdef NoConstruct = object()
 32 | 
class IndeterminateCountError(ValueError):
    """Raised by ``len()`` on a filter whose element count is marked incorrect."""
    pass
 35 | 
 36 | cdef class BloomFilter:
 37 |     """
 38 |     The BloomFilter class implements a bloom filter that uses mmap'd files.
 39 |     For more information on what a bloom filter is, please read the Wikipedia article about it.
 40 |     """
 41 |     cdef cbloomfilter.BloomFilter * _bf
 42 |     cdef int _closed
 43 |     cdef int _in_memory
 44 |     cdef public ReadFile
 45 | 
 46 |     def __cinit__(self, capacity, error_rate, filename=None, perm=0755):
 47 |         cdef char * seeds
 48 |         cdef long long num_bits
 49 |         self._closed = 0
 50 |         self._in_memory = 0
 51 |         self.ReadFile = self.__class__.ReadFile
 52 |         mode = "rw+"
 53 |         if filename is NoConstruct:
 54 |             return
 55 | 
 56 |         if capacity is self.ReadFile:
 57 |             mode = "rw"
 58 |             capacity = 0
 59 |             if not os.path.exists(filename):
 60 |                 raise OSError("File %s not found" % filename)
 61 | 
 62 |             if not os.access(filename, os.O_RDWR):
 63 |                 raise OSError("Insufficient permissions for file %s" % filename)
 64 | 
 65 |         mode = construct_mode(mode)
 66 | 
 67 | 
 68 |         if not mode & os.O_CREAT:
 69 |             if os.path.exists(filename):
 70 |                 self._bf = cbloomfilter.bloomfilter_Create_Mmap(capacity,
 71 |                                                            error_rate,
 72 |                                                            filename,
 73 |                                                            0,
 74 |                                                            mode,
 75 |                                                            perm,
 76 |                                                            NULL, 0)
 77 |                 if self._bf is NULL:
 78 |                     raise ValueError("Invalid %s file: %s" %
 79 |                                      (self.__class__.__name__, filename))
 80 |             else:
 81 |                 raise OSError(eno.ENOENT, '%s: %s' % (os.strerror(eno.ENOENT),
 82 |                                                       filename))
 83 |         else:
 84 |             # Make sure that if the filename is defined, that the
 85 |             # file exists
 86 |             if filename and os.path.exists(filename):
 87 |                 os.unlink(filename)
 88 | 
 89 |             # For why we round down for determining the number of hashes:
 90 |             # http://corte.si/%2Fposts/code/bloom-filter-rules-of-thumb/index.html
 91 |             # "The number of hashes determines the number of bits that need to
 92 |             # be read to test for membership, the number of bits that need to be
 93 |             # written to add an element, and the amount of computation needed to
 94 |             # calculate hashes themselves. We may sometimes choose to use a less
 95 |             # than optimal number of hashes for performance reasons (especially
 96 |             # when we choose to round down when the calculated optimal number of
 97 |             # hashes is fractional)."
 98 | 
 99 |             assert(error_rate > 0.0 and error_rate < 1.0), "error_rate allowable range (0.0,1.0) %f" % (error_rate,)
100 |             num_hashes = max(int(math.floor(math.log(1.0 / error_rate, 2.0))),1)
101 |             bits_per_hash = int(math.ceil(
102 |                     capacity * abs(math.log(error_rate)) /
103 |                     (num_hashes * (math.log(2) ** 2))))
104 | 
105 |             # mininum bitvector of 128 bits
106 |             num_bits = max(num_hashes * bits_per_hash,128)
107 | 
108 |             #print "k = %d  m = %d  n = %d   p ~= %.8f" % (
109 |             #    num_hashes, num_bits, capacity,
110 |             #    (1.0 - math.exp(- float(num_hashes) * float(capacity) / num_bits))
111 |             #    ** num_hashes)
112 | 
113 |             hash_seeds = array.array('I')
114 |             hash_seeds.extend([random.getrandbits(32) for i in range(num_hashes)])
115 |             test = hash_seeds.tostring()
116 |             seeds = test
117 | 
118 |             # If a filename is provided, we should make a mmap-file
119 |             # backed bloom filter. Otherwise, it will be malloc
120 |             if filename:
121 |                 self._bf = cbloomfilter.bloomfilter_Create_Mmap(capacity,
122 |                                                        error_rate,
123 |                                                        filename,
124 |                                                        num_bits,
125 |                                                        mode,
126 |                                                        perm,
127 |                                                        <int *>seeds,
128 |                                                        num_hashes)
129 |             else:
130 |                 self._in_memory = 1
131 |                 self._bf = cbloomfilter.bloomfilter_Create_Malloc(capacity,
132 |                                                        error_rate,
133 |                                                        num_bits,
134 |                                                        <int *>seeds,
135 |                                                        num_hashes)
136 |             if self._bf is NULL:
137 |                 if filename:
138 |                     raise OSError(errno, '%s: %s' % (os.strerror(errno),
139 |                                                      filename))
140 |                 else:
141 |                     cpython.PyErr_NoMemory()
142 | 
143 |     def __dealloc__(self):
144 |         cbloomfilter.bloomfilter_Destroy(self._bf)
145 |         self._bf = NULL
146 | 
147 |     property hash_seeds:
148 |         def __get__(self):
149 |             self._assert_open()
150 |             result = array.array('I')
151 |             result.fromstring((<char *>self._bf.hash_seeds)[:4 * self.num_hashes])
152 |             return result
153 | 
154 |     property capacity:
155 |         def __get__(self):
156 |             self._assert_open()
157 |             return self._bf.max_num_elem
158 | 
159 |     property error_rate:
160 |         def __get__(self):
161 |             self._assert_open()
162 |             return self._bf.error_rate
163 | 
164 |     property num_hashes:
165 |         def __get__(self):
166 |             self._assert_open()
167 |             return self._bf.num_hashes
168 | 
169 |     property num_bits:
170 |         def __get__(self):
171 |             self._assert_open()
172 |             return self._bf.array.bits
173 | 
174 |     property name:
175 |         def __get__(self):
176 |             self._assert_open()
177 |             if self._in_memory:
178 |                 raise NotImplementedError('Cannot access .name on an '
179 |                                           'in-memory %s' %
180 |                                           self.__class__.__name__)
181 | 
182 |             return self._bf.array.filename
183 | 
184 |     def fileno(self):
185 |         self._assert_open()
186 |         return self._bf.array.fd
187 | 
188 |     def __repr__(self):
189 |         self._assert_open()
190 |         my_name = self.__class__.__name__
191 |         return '<%s capacity: %d, error: %0.3f, num_hashes: %d>' % (
192 |             my_name, self._bf.max_num_elem, self._bf.error_rate,
193 |             self._bf.num_hashes)
194 | 
195 |     def __str__(self):
196 |         return self.__repr__()
197 | 
198 |     def sync(self):
199 |         self._assert_open()
200 |         cbloomfilter.mbarray_Sync(self._bf.array)
201 | 
202 |     def clear_all(self):
203 |         self._assert_open()
204 |         cbloomfilter.mbarray_ClearAll(self._bf.array)
205 | 
206 |     def __contains__(self, item):
207 |         self._assert_open()
208 |         cdef cbloomfilter.Key key
209 |         if isinstance(item, str):
210 |             key.shash = item
211 |             key.nhash = len(item)
212 |         else:
213 |             key.shash = NULL
214 |             key.nhash = hash(item)
215 |         return cbloomfilter.bloomfilter_Test(self._bf, &key) == 1
216 | 
217 |     def copy_template(self, filename, perm=0755):
218 |         self._assert_open()
219 |         cdef BloomFilter copy = BloomFilter(0, 0, NoConstruct)
220 |         if os.path.exists(filename):
221 |             os.unlink(filename)
222 |         copy._bf = cbloomfilter.bloomfilter_Copy_Template(self._bf, filename, perm)
223 |         return copy
224 | 
225 |     def copy(self, filename):
226 |         self._assert_open()
227 |         if self._in_memory:
228 |             raise NotImplementedError('Cannot call .copy on an in-memory %s' %
229 |                                       self.__class__.__name__)
230 |         shutil.copy(self._bf.array.filename, filename)
231 |         return self.__class__(self.ReadFile, 0.1, filename, perm=0)
232 | 
233 |     def add(self, item):
234 |         self._assert_open()
235 |         cdef cbloomfilter.Key key
236 |         if isinstance(item, str):
237 |             key.shash = item
238 |             key.nhash = len(item)
239 |         else:
240 |             key.shash = NULL
241 |             key.nhash = hash(item)
242 | 
243 |         result = cbloomfilter.bloomfilter_Add(self._bf, &key)
244 |         if result == 2:
245 |             raise RuntimeError("Some problem occured while trying to add key.")
246 |         return bool(result)
247 | 
248 |     def update(self, iterable):
249 |         self._assert_open()
250 |         for item in iterable:
251 |             self.add(item)
252 | 
253 |     def __len__(self):
254 |         self._assert_open()
255 |         if not self._bf.count_correct:
256 |             raise IndeterminateCountError("Length of %s object is unavailable "
257 |                                           "after intersection or union called." %
258 |                                           self.__class__.__name__)
259 |         return self._bf.elem_count
260 | 
261 |     def close(self):
262 |         if self._closed == 0:
263 |             self._closed = 1
264 |             cbloomfilter.bloomfilter_Destroy(self._bf)
265 |             self._bf = NULL
266 | 
267 |     def __ior__(self, BloomFilter other):
268 |         self._assert_open()
269 |         self._assert_comparable(other)
270 |         cbloomfilter.mbarray_Or(self._bf.array, other._bf.array)
271 |         self._bf.count_correct = 0
272 |         return self
273 | 
274 |     def union(self, BloomFilter other):
275 |         self._assert_open()
276 |         other._assert_open()
277 |         self._assert_comparable(other)
278 |         cbloomfilter.mbarray_Or(self._bf.array, other._bf.array)
279 |         self._bf.count_correct = 0
280 |         return self
281 | 
282 |     def __iand__(self, BloomFilter other):
283 |         self._assert_open()
284 |         other._assert_open()
285 |         self._assert_comparable(other)
286 |         cbloomfilter.mbarray_And(self._bf.array, other._bf.array)
287 |         self._bf.count_correct = 0
288 |         return self
289 | 
290 |     def intersection(self, BloomFilter other):
291 |         self._assert_open()
292 |         other._assert_open()
293 |         self._assert_comparable(other)
294 |         cbloomfilter.mbarray_And(self._bf.array, other._bf.array)
295 |         self._bf.count_correct = 0
296 |         return self
297 | 
298 |     def _assert_open(self):
299 |         if self._closed != 0:
300 |             raise ValueError("I/O operation on closed file")
301 | 
302 |     def _assert_comparable(self, BloomFilter other):
303 |         error = ValueError("The two %s objects are not the same type (hint, "
304 |                            "use copy_template)" % self.__class__.__name__)
305 |         if self._bf.array.bits != other._bf.array.bits:
306 |             raise error
307 |         if self.hash_seeds != other.hash_seeds:
308 |             raise error
309 |         return
310 | 
311 |     def to_base64(self):
312 |         self._assert_open()
313 |         bfile = open(self.name, 'r')
314 |         result = zlib.compress(zlib.compress(bfile.read(), 9).encode('base64')).encode('base64')
315 |         bfile.close()
316 |         return result
317 | 
318 |     @classmethod
319 |     def from_base64(cls, filename, string, perm=0755):
320 |         bfile_fp = os.open(filename, construct_mode('w+'), perm)
321 |         os.write(bfile_fp, zlib.decompress(zlib.decompress(
322 |             string.decode('base64')).decode('base64')))
323 |         os.close(bfile_fp)
324 |         return cls.open(filename)
325 | 
326 |     @classmethod
327 |     def open(cls, filename):
328 |         return cls(cls.ReadFile, 0.1, filename, 0)
329 | 


--------------------------------------------------------------------------------
/src/superfast.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * The superfast function is licensed under the LGPL:
 3 |  *      http://www.gnu.org/licenses/lgpl-2.1.txt)
 4 |  * as described on the page:
 5 |  *      http://www.azillionmonkeys.com/qed/hash.html)
 6 |  *      Retrieved Dec 03, 2011
 7 |  */
 8 | 
 9 | #include "stdint.h" /* Replace with <stdint.h> if appropriate */
10 | #undef get16bits
11 | #if (defined(__GNUC__) && defined(__i386__)) || defined(__WATCOMC__) \
12 |   || defined(_MSC_VER) || defined (__BORLANDC__) || defined (__TURBOC__)
13 | #define get16bits(d) (*((const uint16_t *) (d)))
14 | #endif
15 | 
16 | #if !defined (get16bits)
17 | #define get16bits(d) ((((uint32_t)(((const uint8_t *)(d))[1])) << 8)\
18 |                        +(uint32_t)(((const uint8_t *)(d))[0]) )
19 | #endif
20 | 
/*
 * Hash `len` bytes of `data`, starting from the caller-supplied `hash`
 * value. Returns 0 when `len` is not positive or `data` is NULL.
 *
 * The input is consumed four bytes at a time as two 16-bit reads, the
 * remaining 1-3 bytes are folded in separately, and a fixed sequence of
 * shifts and adds finishes the value.
 */
uint32_t SuperFastHash (const char * data, int len, uint32_t hash) {
    uint32_t mixed;
    int tail;
    int quads;

    if (len <= 0 || data == NULL) return 0;

    tail = len & 3;
    quads = len >> 2;

    /* Main loop */
    while (quads > 0) {
        hash  += get16bits (data);
        mixed  = (get16bits (data+2) << 11) ^ hash;
        hash   = (hash << 16) ^ mixed;
        data  += 2*sizeof (uint16_t);
        hash  += hash >> 11;
        quads--;
    }

    /* Handle end cases */
    if (tail == 3) {
        hash += get16bits (data);
        hash ^= hash << 16;
        hash ^= data[sizeof (uint16_t)] << 18;
        hash += hash >> 11;
    } else if (tail == 2) {
        hash += get16bits (data);
        hash ^= hash << 11;
        hash += hash >> 17;
    } else if (tail == 1) {
        hash += *data;
        hash ^= hash << 10;
        hash += hash >> 1;
    }

    /* Force "avalanching" of final 127 bits */
    hash ^= hash << 3;
    hash += hash >> 5;
    hash ^= hash << 4;
    hash += hash >> 17;
    hash ^= hash << 25;
    hash += hash >> 6;

    return hash;
}


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import glob
 3 | import unittest
 4 | import importlib
 5 | import functools
 6 | import tempfile
 7 | 
 8 | here = os.path.dirname(__file__)
 9 | 
def with_test_file(method):
    """Decorate *method* so it receives a scratch ``.bloom`` file path.

    A named temporary file is created for each call and its path is passed
    as the ``filename`` keyword argument. The file is closed (and thereby
    removed) once *method* returns or raises.
    """
    def _run_with_scratch_file(*args, **kwargs):
        with tempfile.NamedTemporaryFile(suffix='.bloom') as scratch:
            kwargs['filename'] = scratch.name
            return method(*args, **kwargs)
    return functools.wraps(method)(_run_with_scratch_file)
20 | 
def test_all():
    """Build one suite from every test module that sits next to this file.

    Each ``*.py`` file in this directory other than ``__init__.py`` is
    imported as ``tests.<module>``; a module that exposes a ``suite``
    attribute contributes ``module.suite()``.  Files are visited in sorted
    order so the resulting suite is the same from run to run.
    """
    combined = unittest.TestSuite()
    for path in sorted(glob.glob(os.path.join(here, '*.py'))):
        # splitext removes only the trailing extension, whereas
        # split('.py')[0] would cut at the first '.py' anywhere in the name.
        module_name = os.path.splitext(os.path.basename(path))[0]
        # Compare the module name, not the full path, so a directory whose
        # name happens to contain '__init__' does not hide every test.
        if module_name == '__init__':
            continue
        module = importlib.import_module('tests.' + module_name)
        if hasattr(module, 'suite'):
            combined.addTest(module.suite())
    return combined
30 | 


--------------------------------------------------------------------------------
/tests/accuracytest.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import math
  3 | import string
  4 | import random
  5 | import unittest
  6 | import tempfile
  7 | 
  8 | import pybloomfilter
  9 | 
 10 | 
 11 | class TestAccuracyMixin(object):
 12 |     FILTER_SIZE = 1000
 13 | 
 14 |     def _gen_random_items(self, n, exclude=None):
 15 |         # Yield n unique random items; if an existing set is provided,
 16 |         # items already in that set will not be yielded.
 17 |         if exclude is not None:
 18 |             yielded = exclude
 19 |         else:
 20 |             yielded = set()
 21 | 
 22 |         yield_count = 0
 23 |         while yield_count < n:
 24 |             random_item = self._random_item()
 25 |             if random_item not in yielded:
 26 |                 yield random_item
 27 |                 yielded.add(random_item)
 28 |                 yield_count += 1
 29 | 
 30 |     def test_false_pos_degredation(self):
 31 |         # we'll check 10% to 0.01%
 32 |         for error_rate in (0.1, 0.01, 0.001, 0.0001):
 33 |             bf = self._bf(error_rate)
 34 |             items_in_filter = set(
 35 |                 self._gen_random_items(bf.capacity))
 36 | 
 37 |             items_in_filter_list = list(items_in_filter)
 38 |             n = 0
 39 |             chunk_count = 10
 40 |             chunk_size = int(math.ceil(float(bf.capacity) / chunk_count))
 41 |             print 'error_rate = %.4f' % error_rate
 42 |             print '    %6s %9s %s' % ('n', 'false_pos',
 43 |                                       'estimated error_rate')
 44 |             for i in range(chunk_count):
 45 |                 chunk = items_in_filter_list[i*chunk_size:(i+1)*chunk_size]
 46 |                 n += len(chunk)
 47 |                 bf.update(chunk)
 48 |                 pos_test_count = int(5 * 1.0 / error_rate)
 49 |                 false_pos = len(filter(bf.__contains__,
 50 |                                        self._gen_random_items(
 51 |                                            pos_test_count, items_in_filter)))
 52 |                 est_error_rate = float(false_pos) / pos_test_count
 53 |                 print '    %6d %9d %.8f %s' % (
 54 |                     n, false_pos, est_error_rate,
 55 |                     '******' if est_error_rate > error_rate else '')
 56 | 
 57 |     def test_accuracy(self):
 58 |         print '\n%14s\t%14s\t%17s' % ('pos_test_count', 'false_pos_rate',
 59 |                                       'error_rate_target')
 60 |         # we'll check 10% to 0.01%
 61 |         for error_rate in (0.1, 0.01, 0.001, 0.0001):
 62 |             bf = self._bf(error_rate)
 63 |             items_in_filter = set(
 64 |                 self._gen_random_items(bf.capacity))
 65 |             bf.update(items_in_filter)
 66 | 
 67 |             # sanity check
 68 |             self.assertEqual(bf.capacity, len(items_in_filter))
 69 | 
 70 |             false_neg = len(items_in_filter) - \
 71 |                 len(filter(bf.__contains__, items_in_filter))
 72 | 
 73 |             pos_test_count = int(10 * (1.0 / error_rate))
 74 |             false_pos = len(filter(bf.__contains__, self._gen_random_items(
 75 |                 pos_test_count, items_in_filter)))
 76 |             false_pos = 0
 77 |             for test in self._gen_random_items(pos_test_count,
 78 |                                                items_in_filter):
 79 |                 if test in bf:
 80 |                     false_pos += 1
 81 | 
 82 |             false_pos_rate = float(false_pos) / pos_test_count
 83 |             false_neg_rate = float(false_neg) / len(items_in_filter)
 84 |             error_rate_target = error_rate * 2  # cut it some slack
 85 | 
 86 |             print '%14d\t%14f\t%17f' % (pos_test_count, false_pos_rate,
 87 |                                         error_rate_target)
 88 |             self.assertTrue(
 89 |                 false_pos_rate <= error_rate_target,
 90 |                 "false_pos: %r / %r = %r > %r" % (
 91 |                     false_pos, pos_test_count,
 92 |                     false_pos / float(pos_test_count), error_rate_target))
 93 |             self.assertEqual(false_neg_rate, 0.0,
 94 |                              "false negative rate is nonzero: %0.6f" %
 95 |                              (false_neg_rate,))
 96 |             del bf
 97 | 
 98 | 
 99 | class StringAccuracyMallocTestCase(unittest.TestCase, TestAccuracyMixin):
100 |     CHARS = string.lowercase + string.uppercase
101 |     STR_LEN = 10
102 | 
103 |     def _random_item(self):
104 |         return ''.join(random.choice(self.CHARS)
105 |                        for _ in xrange(self.STR_LEN))
106 | 
107 |     def _bf(self, error_rate):
108 |         return pybloomfilter.BloomFilter(self.FILTER_SIZE, error_rate)
109 | 
110 | 
class StringAccuracyMmapTestCase(unittest.TestCase, TestAccuracyMixin):
    """Accuracy checks on random letter strings with a file-backed filter."""
    CHARS = string.lowercase + string.uppercase
    STR_LEN = 10

    def setUp(self):
        # Only the path is needed.  delete=False keeps the file on disk
        # after close(), so the handle is released immediately instead of
        # staying open for as long as the test instance is alive.
        self.temp_file = tempfile.NamedTemporaryFile(suffix='.bloom',
                                                     delete=False)
        self.temp_file.close()

    def tearDown(self):
        os.unlink(self.temp_file.name)

    def _random_item(self):
        # STR_LEN characters, each drawn independently from CHARS.
        return ''.join(random.choice(self.CHARS)
                       for _ in xrange(self.STR_LEN))

    def _bf(self, error_rate):
        return pybloomfilter.BloomFilter(self.FILTER_SIZE, error_rate,
                                         self.temp_file.name)
129 | 
130 | 
class IntegerAccuracyMallocTestCase(unittest.TestCase, TestAccuracyMixin):
    """Accuracy checks on random integers with a memory-backed filter."""

    def _bf(self, error_rate):
        # No filename is given, so the filter is not tied to a file.
        return pybloomfilter.BloomFilter(self.FILTER_SIZE, error_rate)

    def _random_item(self):
        # Uniform integer in [-2**31, 2**31], both ends included.
        bound = 1 << 31
        return random.randint(-bound, bound)
138 | 
139 | 
class IntegerAccuracyMmapTestCase(unittest.TestCase, TestAccuracyMixin):
    """Accuracy checks on random integers with a file-backed filter."""

    def setUp(self):
        # Only the path is needed.  delete=False keeps the file on disk
        # after close(), so the handle is released immediately instead of
        # staying open for as long as the test instance is alive.
        self.temp_file = tempfile.NamedTemporaryFile(suffix='.bloom',
                                                     delete=False)
        self.temp_file.close()

    def tearDown(self):
        os.unlink(self.temp_file.name)

    def _random_item(self):
        # Uniform integer in [-2**31, 2**31], both ends included.
        return random.randint(-2**31, 2**31)

    def _bf(self, error_rate):
        return pybloomfilter.BloomFilter(self.FILTER_SIZE, error_rate,
                                         self.temp_file.name)
155 | 
156 | 
def suite():
    """Return a suite holding every accuracy test case in this module."""
    # loadTestsFromTestCase is what the deprecated unittest.makeSuite
    # helper delegates to with its default arguments.
    loader = unittest.defaultTestLoader
    combined = unittest.TestSuite()
    for case in (StringAccuracyMmapTestCase,
                 StringAccuracyMallocTestCase,
                 IntegerAccuracyMmapTestCase,
                 IntegerAccuracyMallocTestCase):
        combined.addTest(loader.loadTestsFromTestCase(case))
    return combined
164 | 


--------------------------------------------------------------------------------
/tests/comparisons/accuracytest.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | import sys
 4 | import os
 5 | import tempfile
 6 | import pybloomfilter
 7 | 
# Word lists located next to this script: the lines of WORDS_FILE are added
# to the filter, the lines of TEST_WORDS are looked up in it.
WORDS_FILE = os.path.join(os.path.dirname(__file__), 'words')
TEST_WORDS = os.path.join(os.path.dirname(__file__), 'testwords')
10 | 
def main():
    """Measure filter accuracy at several target error rates.

    Passing ``-pybloom`` (any letter case) as the first command-line
    argument rebinds the module-level ``pybloomfilter`` name to the
    ``pybloom`` package, so that implementation is measured instead.
    """
    global pybloomfilter

    use_pybloom = len(sys.argv) > 1 and sys.argv[1].lower() == '-pybloom'
    if use_pybloom:
        import pybloom
        pybloomfilter = pybloom

    # Both word lists are read as sets of raw lines to size the filter and
    # to learn how many test lines really occur in the base list.
    with open(WORDS_FILE) as base_file:
        with open(TEST_WORDS) as test_file:
            base_words = set(base_file)
            test_words = set(test_file)
    number_words = len(base_words)
    num_test_words = len(test_words)
    correct_overlap = len(base_words.intersection(test_words))

    for error_rate in (0.01, 0.001, 0.0001):
        test_errors(error_rate, number_words, correct_overlap, num_test_words)
28 | 
29 | 
def test_errors(error_rate, filter_size, correct_overlap, num_test_words):
    """Build one filter for *error_rate* and report its measured accuracy.

    A filter sized for *filter_size* items is created, filled from
    WORDS_FILE and probed with TEST_WORDS by run_test().  The scratch file
    backing the filter is closed (and thereby removed) before returning,
    even when the measurement raises.
    """
    bloom_file = tempfile.NamedTemporaryFile()
    bf = None
    try:
        try:
            bf = pybloomfilter.BloomFilter(filter_size, error_rate,
                                           bloom_file.name)
        except TypeError:
            # Fallback for a BloomFilter whose constructor rejects the
            # filename argument -- presumably the pybloom implementation
            # that main() can select; verify against that package.
            bf = pybloomfilter.BloomFilter(filter_size, error_rate)

        with open(WORDS_FILE) as source_file:
            with open(TEST_WORDS) as test_file:
                run_test(bf, source_file, test_file, correct_overlap,
                         num_test_words, error_rate)
    finally:
        # Drop our reference to the filter before the scratch file goes
        # away, then close the file explicitly instead of waiting for
        # garbage collection.
        bf = None
        bloom_file.close()
42 | 
43 | 
def run_test(bf, source_file, test_file, correct_overlap, num_test_words, error_rate):
    """Fill *bf* from *source_file*, probe it, and print the measured error.

    bf              -- filter offering ``add``, ``in``, ``num_bits`` and
                       either ``num_hashes`` or ``num_slices``.
    source_file     -- iterable of lines; each is right-stripped and added.
    test_file       -- iterable of lines; each is right-stripped and looked up.
    correct_overlap -- number of test words that really are in the source.
    num_test_words  -- total number of test words.
    error_rate      -- target rate, printed next to the measured one.

    The measured rate is the number of false positives divided by the
    number of lookups that should have missed
    (``num_test_words - correct_overlap``).  It is reported as 0.0 when
    there are no such lookups.
    """
    for word in source_file:
        bf.add(word.rstrip())

    positive_matches = sum(1 for word in test_file
                           if word.rstrip() in bf)

    # Hits beyond the genuine overlap are false positives; the rate is
    # taken over the words that are not members, which is what a filter's
    # target error rate refers to.
    false_positives = positive_matches - correct_overlap
    non_members = num_test_words - correct_overlap
    if non_members > 0:
        actual_error_rate = float(false_positives) / non_members
    else:
        actual_error_rate = 0.0

    print("Specified: %f; Measured: %f; num_hashes: %d, num_bits: %d" % (
        error_rate,
        actual_error_rate,
        getattr(bf, 'num_hashes', None) or getattr(bf, 'num_slices'),
        bf.num_bits,
        ))
60 | 
61 | 
62 | 
# Run the accuracy comparison when this file is executed as a script.
if __name__ == '__main__':
    main()
65 | 


--------------------------------------------------------------------------------
/tests/comparisons/speedtest.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | import os
 4 | import tempfile
 5 | import time
 6 | import timeit
 7 | 
 8 | import pybloomfilter
 9 | 
# Temp-file objects created by create_cbloomfilter() are appended here, which
# keeps them referenced for the lifetime of the module.
tempfiles = []

# Target error rate passed to every filter constructor in get_and_add_words().
ERROR_RATE = 0.1
13 | 
def get_and_add_words(Creator, wordlist):
    """Create a filter sized for *wordlist* and add every word to it.

    *Creator* is called as ``Creator(len(wordlist), ERROR_RATE)``; the
    populated filter is returned.
    """
    populated = Creator(len(wordlist), ERROR_RATE)
    for entry in wordlist:
        populated.add(entry)
    return populated
20 | 
def check_words(bf, wordlist):
    """Look up every word of *wordlist* in *bf*, discarding the answers.

    Exists so the lookups can be timed; nothing is returned.
    """
    for candidate in wordlist:
        # Only the membership test itself matters; its result is unused.
        _hit = candidate in bf
24 | 
def test_errors(bf, correct_wordlist, test_wordlist):
    """Print how often *bf* disagrees with *correct_wordlist*.

    Every word of *test_wordlist* is looked up in both.  A word the filter
    reports but the reference lacks counts as a positive error; a word the
    reference holds but the filter misses counts as a negative error.
    Both counts are printed as percentages of ``len(correct_wordlist)``.
    """
    false_pos = 0
    false_neg = 0
    for word in test_wordlist:
        reported = word in bf
        expected = word in correct_wordlist
        if reported and not expected:
            false_pos += 1
        elif expected and not reported:
            false_neg += 1
    reference_size = float(len(correct_wordlist))
    print('%0.2f%% positive %0.2f%% negative' % (
        false_pos / reference_size * 100,
        false_neg / reference_size * 100))
37 | 
def create_word_list(filename):
    """Return the set of distinct words found in *filename*.

    Each line is stripped of surrounding whitespace and lower-cased; lines
    that are empty after stripping are ignored.  The file is closed even
    if reading it raises.
    """
    with open(filename, 'r') as handle:
        normalized = (line.strip().lower() for line in handle)
        # Consume the generator while the file is still open.
        return set(word for word in normalized if word)
47 | 
def create_cbloomfilter(*args):
    """Create a ``pybloomfilter.BloomFilter`` on a fresh temporary path.

    The positional *args* are forwarded unchanged, with the temporary
    file's path appended as the last argument.
    """
    scratch = tempfile.NamedTemporaryFile()
    # The temp-file object is parked in the module-level list so it stays
    # referenced after this function returns.
    tempfiles.append(scratch)
    # Remove the file itself; only its (now unused) path is passed on.
    os.unlink(scratch.name)
    return pybloomfilter.BloomFilter(*(args + (scratch.name,)))
55 | 
# Filter factories to benchmark; get_and_add_words() calls each one as
# creator(number_of_words, ERROR_RATE).
creators = [create_cbloomfilter]
try:
    import pybloom
except ImportError:
    # pybloom is optional: without it only create_cbloomfilter is measured.
    pass
else:
    creators.append(pybloom.BloomFilter)
63 | 
def run_test(num_runs=10):
    """Time filter construction and lookups for every entry in ``creators``.

    num_runs -- how many repetitions timeit averages over; a falsy value
                skips the timing and only runs the error check.

    The ``words`` and ``testwords`` lists are read from the directory that
    contains this script, so the benchmark can be started from any working
    directory.  After the timings for a creator are printed, the function
    waits for a line on stdin before printing the error percentages.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    dict_wordlist = create_word_list(os.path.join(base_dir, 'words'))
    test_wordlist = create_word_list(os.path.join(base_dir, 'testwords'))

    for creator in creators:
        if num_runs:
            # Average cost of building a filter and adding every word.
            t = timeit.Timer(lambda: get_and_add_words(creator, dict_wordlist))
            print("%s took %0.5f s/run" % (
                creator,
                t.timeit(num_runs) / float(num_runs)))
        bf = get_and_add_words(creator, dict_wordlist)

        if num_runs:
            # Average cost of looking up every test word.
            t = timeit.Timer(lambda: check_words(bf, test_wordlist))
            print("%s took %0.5f s/run" % (
                creator,
                t.timeit(num_runs) / float(num_runs)))

        # Pause until Enter is pressed -- presumably so the process can be
        # inspected while the populated filter is alive; confirm intent.
        raw_input()

        test_errors(bf, dict_wordlist, test_wordlist)
87 | 
# Run the speed comparison when this file is executed as a script.
if __name__ == "__main__":
    run_test()
90 | 


--------------------------------------------------------------------------------
/tests/simpletest.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import string
  3 | import unittest
  4 | import tempfile
  5 | from random import randint, choice
  6 | 
  7 | import pybloomfilter
  8 | 
  9 | from tests import with_test_file
 10 | 
 11 | 
 12 | class SimpleTestCase(unittest.TestCase):
 13 |     FILTER_SIZE = 200
 14 |     FILTER_ERROR_RATE = 0.001
 15 | 
 16 |     def setUp(self):
 17 |         # Convenience file-backed bloomfilter
 18 |         self.tempfile = tempfile.NamedTemporaryFile(suffix='.bloom',
 19 |                                                     delete=False)
 20 |         self.bf = pybloomfilter.BloomFilter(self.FILTER_SIZE,
 21 |                                             self.FILTER_ERROR_RATE,
 22 |                                             self.tempfile.name)
 23 | 
 24 |         # Convenience memory-backed bloomfilter
 25 |         self.bf_mem = pybloomfilter.BloomFilter(self.FILTER_SIZE,
 26 |                                                 self.FILTER_ERROR_RATE)
 27 | 
 28 |     def tearDown(self):
 29 |         os.unlink(self.tempfile.name)
 30 | 
 31 |     def assertPropertiesPreserved(self, old_bf, new_bf):
 32 |         # Assert that a "new" BloomFilter has the same properties as an "old"
 33 |         # one.
 34 |         failures = []
 35 |         for prop in ['capacity', 'error_rate', 'num_hashes', 'num_bits',
 36 |                      'hash_seeds']:
 37 |             old, new = getattr(old_bf, prop), getattr(new_bf, prop)
 38 |             if new != old:
 39 |                 failures.append((prop, old, new))
 40 |         self.assertEqual([], failures)
 41 | 
 42 |     def _random_str(self, length=16):
 43 |         chars = string.lowercase + string.uppercase
 44 |         return ''.join(choice(chars) for _ in xrange(length))
 45 | 
 46 |     def _random_set_of_stuff(self, c):
 47 |         """
 48 |         Return a random set containing up to "c" count of each type of Python
 49 |         object.
 50 |         """
 51 |         return set(
 52 |             # Due to a small chance of collision, there's no guarantee on the
 53 |             # count of elements in this set, but we'll make sure that's okay.
 54 |             [self._random_str() for _ in range(c)] +
 55 |             [randint(-1000, 1000) for _ in range(c)] +
 56 |             [(randint(-200, 200), self._random_str()) for _ in range(c)] +
 57 |             [float(randint(10, 100)) / randint(10, 100)
 58 |              for _ in range(c)] +
 59 |             [long(randint(50000, 1000000)) for _ in range(c)] +
 60 |             [object() for _ in range(c)] +
 61 |             [unicode(self._random_str) for _ in range(c)])
 62 | 
 63 |     def _populate_filter(self, bf, use_update=False):
 64 |         """
 65 |         Populate given BloomFilter with a handfull of hashable things.
 66 |         """
 67 |         self._in_filter = self._random_set_of_stuff(10)
 68 |         self._not_in_filter = self._random_set_of_stuff(15)
 69 |         # Just in case we randomly chose a key which was also in
 70 |         # self._in_filter...
 71 |         self._not_in_filter = self._not_in_filter - self._in_filter
 72 | 
 73 |         if use_update:
 74 |             bf.update(self._in_filter)
 75 |         else:
 76 |             for item in self._in_filter:
 77 |                 bf.add(item)
 78 | 
 79 |     def _check_filter_contents(self, bf):
 80 |         for item in self._in_filter:
 81 |             # We should *never* say "not in" for something which was added
 82 |             self.assertTrue(item in bf, '%r was NOT in %r' % (item, bf))
 83 | 
 84 |         # We might say something is in the filter which isn't; we're only
 85 |         # trying to test correctness, here, so we are very lenient.  If the
 86 |         # false positive rate is within 2 orders of magnitude, we're okay.
 87 |         false_pos = len(filter(bf.__contains__, self._not_in_filter))
 88 |         error_rate = float(false_pos) / len(self._not_in_filter)
 89 |         self.assertTrue(error_rate < 100 * self.FILTER_ERROR_RATE,
 90 |                         '%r / %r = %r > %r' % (false_pos,
 91 |                                                len(self._not_in_filter),
 92 |                                                error_rate,
 93 |                                                100 * self.FILTER_ERROR_RATE))
 94 |         for item in self._not_in_filter:
 95 |             # We should *never* have a false negative
 96 |             self.assertFalse(item in bf, '%r WAS in %r' % (item, bf))
 97 | 
 98 |     def test_repr(self):
 99 |         self.assertEqual(
100 |             '<BloomFilter capacity: %d, error: %0.3f, num_hashes: %d>' % (
101 |                 self.bf.capacity, self.bf.error_rate, self.bf.num_hashes),
102 |             repr(self.bf))
103 |         self.assertEqual(
104 |             u'<BloomFilter capacity: %d, error: %0.3f, num_hashes: %d>' % (
105 |                 self.bf.capacity, self.bf.error_rate, self.bf.num_hashes),
106 |             unicode(self.bf))
107 |         self.assertEqual(
108 |             '<BloomFilter capacity: %d, error: %0.3f, num_hashes: %d>' % (
109 |                 self.bf.capacity, self.bf.error_rate, self.bf.num_hashes),
110 |             str(self.bf))
111 | 
112 |     def test_add_and_check_file_backed(self):
113 |         self._populate_filter(self.bf)
114 |         self._check_filter_contents(self.bf)
115 | 
116 |     def test_update_and_check_file_backed(self):
117 |         self._populate_filter(self.bf, use_update=True)
118 |         self._check_filter_contents(self.bf)
119 | 
120 |     def test_add_and_check_memory_backed(self):
121 |         self._populate_filter(self.bf_mem)
122 |         self._check_filter_contents(self.bf_mem)
123 | 
124 |     def test_open(self):
125 |         self._populate_filter(self.bf)
126 |         self.bf.sync()
127 | 
128 |         bf = pybloomfilter.BloomFilter.open(self.bf.name)
129 |         self._check_filter_contents(bf)
130 | 
131 |     @with_test_file
132 |     def test_copy(self, filename):
133 |         self._populate_filter(self.bf)
134 |         self.bf.sync()
135 | 
136 |         bf = self.bf.copy(filename)
137 |         self._check_filter_contents(bf)
138 |         self.assertPropertiesPreserved(self.bf, bf)
139 | 
140 |     def assertBfPermissions(self, bf, perms):
141 |         oct_mode = oct(os.stat(bf.name).st_mode)
142 |         self.assert_(oct_mode.endswith(perms),
143 |                      'unexpected perms %s' % oct_mode)
144 | 
145 |     @with_test_file
146 |     def test_to_from_base64(self, filename):
147 |         self._populate_filter(self.bf)
148 |         self.bf.sync()
149 | 
150 |         # sanity-check
151 |         self.assertBfPermissions(self.bf, '0755')
152 | 
153 |         b64 = self.bf.to_base64()
154 | 
155 |         old_umask = os.umask(0)
156 |         try:
157 |             os.unlink(filename)
158 |             bf = pybloomfilter.BloomFilter.from_base64(filename, b64,
159 |                                                        perm=0775)
160 |             self.assertBfPermissions(bf, '0775')
161 |             self._check_filter_contents(bf)
162 |             self.assertPropertiesPreserved(self.bf, bf)
163 |         finally:
164 |             os.umask(old_umask)
165 | 
166 |     def test_missing_file_is_os_error(self):
167 |         self.assertRaises(OSError, pybloomfilter.BloomFilter, 1000, 0.1,
168 |                           'missing_directory/some_file.bloom')
169 | 
170 |     @with_test_file
171 |     def test_others(self, filename):
172 |         bf = pybloomfilter.BloomFilter(100, 0.01, filename)
173 |         for elem in (1.2, 2343L, (1, 2), object(), u'\u2131\u3184'):
174 |             bf.add(elem)
175 |             self.assertEquals(elem in bf, True)
176 | 
177 |     def test_number_nofile(self):
178 |         bf = pybloomfilter.BloomFilter(100, 0.01)
179 |         bf.add(1234)
180 |         self.assertEquals(1234 in bf, True)
181 | 
182 |     def test_string_nofile(self):
183 |         bf = pybloomfilter.BloomFilter(100, 0.01)
184 |         bf.add("test")
185 |         self.assertEquals("test" in bf, True)
186 | 
187 |     def test_others_nofile(self):
188 |         bf = pybloomfilter.BloomFilter(100, 0.01)
189 |         for elem in (1.2, 2343L, (1, 2), object(), u'\u2131\u3184'):
190 |             bf.add(elem)
191 |             self.assertEquals(elem in bf, True)
192 | 
193 |     #@unittest.skip("unfortunately large files cannot be tested on Travis")
194 |     @with_test_file
195 |     def _test_large_file(self, filename):
196 |         bf = pybloomfilter.BloomFilter(400000000, 0.01, filename)
197 |         bf.add(1234)
198 |         self.assertEquals(1234 in bf, True)
199 | 
200 |     def test_name_does_not_segfault(self):
201 |         bf = pybloomfilter.BloomFilter(100, 0.01)
202 |         self.assertRaises(NotImplementedError, lambda: bf.name)
203 | 
204 |     def test_copy_does_not_segfault(self):
205 |         bf = pybloomfilter.BloomFilter(100, 0.01)
206 |         with tempfile.NamedTemporaryFile(suffix='.bloom') as f2:
207 |             self.assertRaises(NotImplementedError, bf.copy, f2.name)
208 | 
209 |     def test_to_base64_does_not_segfault(self):
210 |         bf = pybloomfilter.BloomFilter(100, 0.01)
211 |         self.assertRaises(NotImplementedError, bf.to_base64)
212 | 
213 |     def test_ReadFile_is_public(self):
214 |         self.assertEquals(
215 |             isinstance(pybloomfilter.BloomFilter.ReadFile, object), True)
216 |         bf = pybloomfilter.BloomFilter(100, 0.01)
217 |         bf2 = pybloomfilter.BloomFilter(100, 0.01)
218 |         self.assertEquals(bf.ReadFile, bf2.ReadFile)
219 |         self.assertEquals(pybloomfilter.BloomFilter.ReadFile,
220 |                           bf.ReadFile)
221 | 
222 | 
def suite():
    """Return a suite containing every test of SimpleTestCase."""
    # loadTestsFromTestCase is what the deprecated unittest.makeSuite
    # helper delegates to with its default arguments.
    combined = unittest.TestSuite()
    combined.addTest(
        unittest.defaultTestLoader.loadTestsFromTestCase(SimpleTestCase))
    return combined
227 | 


--------------------------------------------------------------------------------