├── requirements.txt
├── .gitignore
├── MANIFEST.in
├── .travis.yml
├── src
│   ├── mmap_writer.h
│   ├── MurmurHash3.h
│   ├── hydra.py
│   ├── mmap_writer.c
│   ├── MurmurHash3.c
│   └── _hydra.pyx
├── CHANGES.rst
├── tests
│   ├── timeit.py
│   ├── test_murmur.py
│   ├── helpers.py
│   ├── test_mmapbitarray.py
│   └── test_bloom.py
├── license.txt
├── README.md
└── setup.py
/requirements.txt: -------------------------------------------------------------------------------- 1 | Cython 2 | flake8 3 | nose 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | a.out 2 | *.a 3 | *.o 4 | *.pyc 5 | *.so 6 | *.swp 7 | *egg-info 8 | bin 9 | build 10 | dist 11 | include 12 | javabloom 13 | lib 14 | man 15 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.md 2 | include *.rst 3 | include *.txt 4 | include MANIFEST.in 5 | include setup.py 6 | include setup.cfg 7 | 8 | recursive-include src *.c *.h *.py 9 | recursive-include tests *.c *.h *.py 10 | recursive-include docs *.html *.css *.gif *.jpg *.txt 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # new container based environment 2 | sudo: false 3 | 4 | cache: 5 | pip: true 6 | 7 | language: python 8 | python: 9 | - "2.7" 10 | - "3.4" 11 | - "3.5" 12 | install: 13 | - "pip install -U pip wheel" 14 | - "pip install -r requirements.txt" 15 | script: 16 | - "cythonize src/_hydra.pyx" 17 | - "python setup.py build_ext --inplace" 18 | - "flake8 src tests" 19 | - "python setup.py test" 20 | -------------------------------------------------------------------------------- /src/mmap_writer.h: -------------------------------------------------------------------------------- 1 | 2 | int open_mmap_file_rw(char* filename, size_t bytesize); 3 | int open_mmap_file_ro(char* filepath); 4 | char* map_file_rw(int fd, size_t filesize, int want_lock); 5 | char* map_file_ro(int fd, size_t filesize, int want_lock); 6 | void turn_bits_on(char *map, off_t index, char bitmask); 7 | int flush_to_disk(int fd); 8 | int close_file(int fd); 9 | int unmap_file(char* map, size_t filesize); 10 | void bulkload_file(char* buffer, char* filename); 11 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Changelog 3 | ========= 4 | 5 | 2.6 (unreleased) 6 | ---------------- 7 | 8 | 9 | 2.5 (2016-08-02) 10 | ---------------- 11 | 12 | - Close descriptor file in Reading/UpdatingBloomFilter. 13 | 14 | - Define some BloomFilter and MMapBitField methods as cpdef. 15 | 16 | 2.4 (2016-08-02) 17 | ---------------- 18 | 19 | - #12, #15: Ship C code to avoid Cython install time dependency. 20 | 21 | 2.3 (2015-06-04) 22 | ---------------- 23 | 24 | - Look at Git commit history for changes. 
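Filters clean up deterministically: BloomFilter implements __enter__/__exit__ (see src/_hydra.pyx below), so the mmap and its file descriptor can be released with a with-block rather than waiting for the deallocator. A minimal sketch (the path is hypothetical and assumes a filter persisted earlier by WritingBloomFilter):

    from hydra import ReadingBloomFilter

    # Leaving the with-block closes the bit field: the mmap is unmapped
    # and the underlying file descriptor is closed.
    with ReadingBloomFilter('/tmp/demo.bloom') as bloom:
        print('apple' in bloom)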
25 | -------------------------------------------------------------------------------- /tests/timeit.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a speed profiler for insertion and lookup 3 | """ 4 | import cProfile 5 | from hydra import WritingBloomFilter 6 | from helpers import KeyGenerator 7 | 8 | keygen = KeyGenerator() 9 | input_keys = [keygen.random_string() for i in range(100000)] 10 | other_keys = [keygen.random_string() for i in range(200000)] 11 | 12 | ELEMENTS = 10000000 13 | MAX_FAILURE_RATE = 0.1 14 | bf = WritingBloomFilter(ELEMENTS, MAX_FAILURE_RATE) 15 | 16 | 17 | def test_one(): 18 | for key in input_keys: 19 | bf[key] = 0 20 | 21 | for key in other_keys: 22 | key in bf 23 | 24 | cProfile.run('test_one()') 25 | -------------------------------------------------------------------------------- /tests/test_murmur.py: -------------------------------------------------------------------------------- 1 | import _hydra 2 | from helpers import KeyGenerator 3 | 4 | 5 | def test__hydra(): 6 | # This test will probably fail on big-endian machines 7 | h1 = _hydra.hash('foo') 8 | h2 = _hydra.hash('foo', h1 & 0xFFFFFFFF) 9 | assert (-39287385592190013122878999397579195001, 10 | -73964642705803263641983394469427790275) == (h1, h2) 11 | 12 | 13 | def test_collisions(): 14 | keygen = KeyGenerator() 15 | hashes = {} 16 | for i, key in enumerate(keygen.randomKeys()): 17 | hcode = _hydra.hash(key) 18 | if hcode not in hashes: 19 | hashes[hcode] = key 20 | else: 21 | raise RuntimeError("Hash collision!: {} {}".format( 22 | key, hashes[hcode])) 23 | 24 | 25 | def test_null_key(): 26 | h0 = _hydra.hash('foo') 27 | h1 = _hydra.hash('foo\0bar') 28 | h2 = _hydra.hash('foo\0baz') 29 | assert h0 != h1, 'Hash collision for appended null' 30 | assert h0 != h2, 'Hash collision for appended null' 31 | assert h1 != h2, 'Hash collision for bytes after null' 32 | -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2010 Victor Ng 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
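tests/test_murmur.py above also documents the hash-chaining convention the filter itself uses: a second murmur pass is seeded with the low 32 bits of the first digest. A minimal illustration of the same calls (the concrete digest values are platform-dependent, as the first test notes):

    import _hydra

    h1 = _hydra.hash('foo')                    # seed defaults to 0
    h2 = _hydra.hash('foo', h1 & 0xFFFFFFFF)   # reseed with the low 32 bits
    assert h1 != h2                            # distinct seeds, distinct digests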
22 | -------------------------------------------------------------------------------- /tests/helpers.py: -------------------------------------------------------------------------------- 1 | import string 2 | import random 3 | from os.path import join, dirname 4 | 5 | 6 | class KeyGenerator(object): 7 | def __init__(self, seed=314519): 8 | fname = join(dirname(__file__), 'words') 9 | self._fname = fname 10 | self._lines = open(fname, 'r').readlines() 11 | self._linecount = len(self._lines) 12 | self.ELEMENTS = 10000 13 | 14 | self._r1 = random.Random() 15 | self._r1.seed(seed) 16 | 17 | def __len__(self): 18 | return self._linecount 19 | 20 | def __getitem__(self, i): 21 | start = i.start or 0 22 | stop = i.stop or self._linecount 23 | step = i.step or 1 24 | return self._lines[start:stop:step] 25 | 26 | def random_string(self, length=16): 27 | return "".join([self._r1.choice(string.ascii_letters + string.digits) 28 | for x in range(length)]) 29 | 30 | def randomKeys(self, num_elem=None): 31 | ''' 32 | Return a bunch of random keys 33 | ''' 34 | if not num_elem: 35 | num_elem = self.ELEMENTS 36 | return self._r1.sample(self._lines, num_elem) 37 | -------------------------------------------------------------------------------- /src/MurmurHash3.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public 3 | // domain. The author hereby disclaims copyright to this source code. 4 | 5 | #ifndef _MURMURHASH3_H_ 6 | #define _MURMURHASH3_H_ 7 | 8 | //----------------------------------------------------------------------------- 9 | // Platform-specific functions and macros 10 | 11 | // Microsoft Visual Studio 12 | 13 | #if defined(_MSC_VER) && (_MSC_VER < 1600) 14 | 15 | typedef unsigned char uint8_t; 16 | typedef unsigned int uint32_t; 17 | typedef unsigned __int64 uint64_t; 18 | 19 | // Other compilers 20 | 21 | #else // defined(_MSC_VER) 22 | 23 | #include <stdint.h> 24 | 25 | #endif // !defined(_MSC_VER) 26 | 27 | //----------------------------------------------------------------------------- 28 | 29 | void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); 30 | 31 | void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); 32 | 33 | void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); 34 | 35 | //----------------------------------------------------------------------------- 36 | 37 | #endif // _MURMURHASH3_H_ 38 | -------------------------------------------------------------------------------- /tests/test_mmapbitarray.py: -------------------------------------------------------------------------------- 1 | from _hydra import MMapBitField 2 | import tempfile 3 | 4 | 5 | def test_ro_segfault(): 6 | tf = tempfile.NamedTemporaryFile(delete=True) 7 | rw_field = MMapBitField(tf.name, 80, 0) 8 | rw_field[0] = 1 9 | ro_field = MMapBitField(tf.name, 80, 1) 10 | try: 11 | ro_field[0] = 1 12 | except ValueError: 13 | pass 14 | 15 | 16 | def test_setitem(): 17 | tf = tempfile.NamedTemporaryFile(delete=True) 18 | bf = MMapBitField(tf.name, 80, 0) 19 | 20 | # verify set once 21 | bf[0] = 1 22 | assert bf[0] 23 | for idx in range(1, len(bf)): 24 | assert not bf[idx] 25 | bf[0] = 1 26 | assert bf[0] 27 | for idx in range(1, len(bf)): 28 | assert not bf[idx] 29 | 30 | # verify unset twice 31 | bf[0] = 0 32 | for idx in range(len(bf)): 33 | assert not bf[idx] 
34 | bf[0] = 0 35 | for idx in range(len(bf)): 36 | assert not bf[idx] 37 | 38 | # verify set at end twice 39 | bf[len(bf) - 1] = 1 40 | assert bf[len(bf) - 1] 41 | bf[len(bf) - 1] = 1 42 | assert bf[len(bf) - 1] 43 | 44 | # verify unset at end twice 45 | bf[len(bf) - 1] = 0 46 | assert not bf[len(bf) - 1] 47 | bf[len(bf) - 1] = 0 48 | assert not bf[len(bf) - 1] 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/crankycoder/hydra.svg?branch=master)](https://travis-ci.org/crankycoder/hydra) 2 | 3 | Hydra: The Python Bloom Filter. 4 | 5 | Compile with Cython 0.24 or higher. 6 | 7 | --- 8 | 9 | Hydra is a high-performance bloom filter. It's basically a port of 10 | the Cassandra bloom filter with some fun Cython hackery. 11 | 12 | 1) It's persistent, using memory-mapped I/O. On Linux, the mmap uses 13 | the MAP_POPULATE flag so the entire file is loaded into kernel space 14 | virtual memory. In other words - fast. 15 | 16 | 2) The hash function uses the MurmurHash3 algorithm, so it should be 17 | fast and have excellent key distribution and avalanche properties. 18 | 19 | 3) The filter exports a set-like interface. Use .add(), .contains(), 20 | or the "in" operator. 21 | 22 | 4) Tests. OMG what is wrong with people with no tests? 23 | 24 | The filter supports periodic forced synchronization to disk using 25 | fdatasync(), or you can just let the deallocator flush everything to 26 | disk when your filter goes out of scope, or your process terminates. 27 | 28 | Hydras are snakes with multiple heads. They're also bad dudes with 29 | snake logos on their chests who regularly try to beat on Nick Fury. 30 | Now it's a bloom filter. 31 | 32 | Mostly, I couldn't bear to make this yet another PySomeLibraryName 33 | library. 
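A quick usage sketch (sizing numbers and paths are illustrative; the API comes from src/hydra.py further down):

    from hydra import WritingBloomFilter, ReadingBloomFilter

    # Budget for ~100k keys with at most a 1% false positive rate.
    bloom = WritingBloomFilter(100000, 0.01, filename='/tmp/demo.bloom')
    bloom.add('apple')
    assert 'apple' in bloom        # keys that were added always hit
    bloom.contains('kiwi')         # False, barring a rare false positive

    # Reopen later; the spec is read back from /tmp/demo.bloom.desc.
    readonly = ReadingBloomFilter('/tmp/demo.bloom')
    assert 'apple' in readonly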
34 | 35 | 36 | Build, install a development build, and run the tests: 37 | 38 | $ pip install -r requirements.txt 39 | $ cythonize src/_hydra.pyx 40 | $ python setup.py develop 41 | $ python setup.py test 42 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from setuptools.extension import Extension 3 | from os.path import join 4 | 5 | import os 6 | 7 | __version__ = '2.6.dev0' 8 | 9 | ext_modules = [Extension("_hydra", 10 | extra_compile_args=['-std=gnu99', 11 | '-O2', 12 | '-D_LARGEFILE64_SOURCE'], 13 | sources=["src/_hydra.c", 14 | 'src/mmap_writer.c', 15 | 'src/MurmurHash3.c'], 16 | 17 | # path to .h file(s) 18 | include_dirs=[join(os.getcwd(), 'src')], 19 | 20 | # path to .a or .so file(s) 21 | library_dirs=[join(os.getcwd(), 'src')])] 22 | 23 | setup(name='Hydra', 24 | author='Victor Ng', 25 | author_email='crankycoder@gmail.com', 26 | description='A high performance persistent bloom filter', 27 | url="http://github.com/crankycoder/Hydra", 28 | version=__version__, 29 | license='MIT License', 30 | zip_safe=False, 31 | package_dir={'': 'src'}, 32 | py_modules=['hydra'], 33 | ext_modules=ext_modules, 34 | test_suite='nose.collector', 35 | classifiers=[ 36 | 'License :: OSI Approved :: MIT License', 37 | 'Programming Language :: Python', 38 | 'Programming Language :: Python :: 2', 39 | 'Programming Language :: Python :: 2.7', 40 | 'Programming Language :: Python :: 3', 41 | 'Programming Language :: Python :: 3.4', 42 | 'Programming Language :: Python :: 3.5', 43 | 'Programming Language :: Python :: Implementation :: CPython', 44 | ], 45 | ) 46 | -------------------------------------------------------------------------------- /src/hydra.py: -------------------------------------------------------------------------------- 1 | import _hydra 2 | 3 | 4 | def ReadingBloomFilter(filename, want_lock=False): 5 | """ 6 | Load an existing bloom filter in read-only mode using filename 7 | as the backing datastore. Its (num_elements, max_fp_prob) 8 | specification is read from the companion .desc file. 9 | """ 10 | with open('{}.desc'.format(filename), 'r') as descriptor: 11 | num_elements = int(descriptor.readline()) 12 | max_fp_prob = float(descriptor.readline()) 13 | ignore_case = int(descriptor.readline()) 14 | 15 | return _hydra.BloomFilter.getFilter( 16 | num_elements, max_fp_prob, 17 | filename=filename, ignore_case=ignore_case, 18 | read_only=True, want_lock=want_lock) 19 | 20 | 21 | def UpdatingBloomFilter(filename, want_lock=False, fdatasync_on_close=True): 22 | """ 23 | Load an existing bloom filter in read-write mode using filename 24 | as the backing datastore. 25 | """ 26 | with open('{}.desc'.format(filename), 'r') as descriptor: 27 | num_elements = int(descriptor.readline()) 28 | max_fp_prob = float(descriptor.readline()) 29 | ignore_case = int(descriptor.readline()) 30 | 31 | return _hydra.BloomFilter.getFilter( 32 | num_elements, max_fp_prob, 33 | filename=filename, ignore_case=ignore_case, 34 | read_only=False, want_lock=want_lock, 35 | fdatasync_on_close=fdatasync_on_close) 36 | 37 | 38 | def WritingBloomFilter(num_elements, max_fp_prob, filename=None, 39 | ignore_case=False, want_lock=False, 40 | fdatasync_on_close=True): 41 | """ 42 | Create a read/write bloom filter sized for at most num_elements keys 43 | with a false positive probability capped at max_fp_prob, using filename 44 | as the backing datastore. 
45 | """ 46 | new_filter = _hydra.BloomFilter.getFilter( 47 | num_elements, max_fp_prob, 48 | filename=filename, ignore_case=ignore_case, 49 | read_only=False, want_lock=want_lock, 50 | fdatasync_on_close=fdatasync_on_close) 51 | if filename: 52 | with open('{}.desc'.format(filename), 'w') as descriptor: 53 | descriptor.write("{}\n".format(num_elements)) 54 | descriptor.write("{:0.8f}\n".format(max_fp_prob)) 55 | descriptor.write("{:d}\n".format(ignore_case)) 56 | return new_filter 57 | 58 | # Expose the murmur hash 59 | murmur_hash = _hydra.hash 60 | -------------------------------------------------------------------------------- /src/mmap_writer.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | #include "mmap_writer.h" 13 | 14 | #define FILEPATH "/tmp/mmapped.bin" 15 | #define NUMINTS (255) 16 | #define FILESIZE (NUMINTS * sizeof(char)) 17 | 18 | /* 19 | * Create the file and reallocate NULL bytes. 20 | * 21 | * Return the file descriptor to the file 22 | */ 23 | int open_mmap_file_rw(char* filename, size_t bytesize) 24 | { 25 | int fd; 26 | int result; 27 | 28 | /* Open a file for writing. 29 | * * - Creating the file if it doesn't exist. 30 | * * 31 | * * Note: "O_WRONLY" mode is not sufficient when mmaping. 32 | * */ 33 | 34 | fd = open(filename, O_RDWR | O_CREAT, (mode_t)0644); 35 | if (fd == -1) { 36 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 37 | "Error opening file for writing"); 38 | return -1; 39 | } 40 | 41 | #ifdef __linux__ 42 | /* Stretch the file size to the size of the (mmapped) array of 43 | * ints 44 | * */ 45 | result = posix_fallocate(fd, 0, bytesize); 46 | if (result) { 47 | errno = result; 48 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 49 | "Error calling lseek() to 'stretch' the file"); 50 | close(fd); 51 | return -1; 52 | } 53 | #else 54 | /* Stretch the file size to the size of the (mmapped) array of 55 | * ints 56 | * */ 57 | result = lseek(fd, bytesize-1, SEEK_SET); 58 | if (result == -1) { 59 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 60 | "Error calling lseek() to 'stretch' the file"); 61 | close(fd); 62 | return -1; 63 | } 64 | 65 | /* Something needs to be written at the end of the file to 66 | * * have the file actually have the new size. 67 | * * Just writing an empty string at the current file position 68 | * will do. 69 | * * 70 | * * Note: 71 | * * - The current position in the file is at the end of the 72 | * stretched 73 | * * file due to the call to lseek(). 74 | * * - An empty string is actually a single '\0' character, so a 75 | * zero-byte 76 | * * will be written at the last byte of the file. 
77 | * */ 78 | result = write(fd, "", 1); 79 | if (result != 1) { 80 | close(fd); 81 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 82 | "Error writing last byte of the file"); 83 | return -1; 84 | } 85 | #endif 86 | 87 | return fd; 88 | } 89 | 90 | int open_mmap_file_ro(char* filepath) 91 | { 92 | int fd; 93 | fd = open(filepath, O_RDONLY); 94 | if (fd == -1) { 95 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 96 | "Error opening file for reading"); 97 | return -1; 98 | } 99 | return fd; 100 | } 101 | 102 | /* 103 | * mmap a file descriptor in read-only mode and return a char array 104 | */ 105 | char* map_file_ro(int fd, size_t filesize, int want_lock) 106 | { 107 | char* map; 108 | int flags = MAP_SHARED; 109 | #ifdef __linux__ 110 | if (want_lock) { 111 | flags |= MAP_LOCKED; 112 | } 113 | #endif 114 | map = mmap(0, filesize, PROT_READ, flags, fd, 0); 115 | if (map == MAP_FAILED) { 116 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 117 | "Error mmapping the file"); 118 | close(fd); 119 | return 0; 120 | } 121 | return map; 122 | } 123 | 124 | /* 125 | * mmap the file descriptor in r/w/ mode. Return the char array 126 | */ 127 | char* map_file_rw(int fd, size_t filesize, int want_lock) 128 | { 129 | char* map; 130 | int flags = MAP_SHARED; 131 | 132 | #ifdef __linux__ 133 | flags |= MAP_POPULATE; 134 | if (want_lock) { 135 | flags |= MAP_LOCKED; 136 | } 137 | #endif 138 | 139 | map = (char *) mmap(0, filesize, PROT_READ | PROT_WRITE, flags, fd, 0); 140 | 141 | if (map == MAP_FAILED) { 142 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 143 | "Error mmapping the file"); 144 | close(fd); 145 | return 0; 146 | } 147 | 148 | return map; 149 | } 150 | 151 | /* 152 | * Don't forget to free the mmapped memory 153 | */ 154 | int unmap_file(char* map, size_t filesize) { 155 | if (munmap(map, filesize) == -1) { 156 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 157 | "Error un-mmapping the file"); 158 | return -1; 159 | } 160 | return 0; 161 | } 162 | 163 | void turn_bits_on(char *map, off_t index, char bitmask) 164 | { 165 | map[index] = map[index] | bitmask; 166 | } 167 | 168 | int flush_to_disk(int fd) 169 | { 170 | int result; 171 | result = fdatasync(fd); 172 | if (result == -1) { 173 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 174 | "Error flushing the file"); 175 | close(fd); 176 | return -1; 177 | } 178 | return 0; 179 | } 180 | 181 | int close_file(int fd) 182 | { 183 | int result; 184 | flush_to_disk(fd); 185 | result = close(fd); 186 | if (result == -1) { 187 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 188 | "Error closing the file"); 189 | return -1; 190 | } 191 | return 0; 192 | } 193 | 194 | int main(int argc, char *argv[]) 195 | { 196 | int fd; 197 | char *map; /* mmapped array of chars */ 198 | fd = open_mmap_file_rw(FILEPATH, FILESIZE); 199 | 200 | map = map_file_rw(fd, FILESIZE, 0); 201 | 202 | /* Now write int's to the file as if it were memory (an array of 203 | * ints). 
204 | * */ 205 | for (off_t i = 0; i 22 | 23 | #define ROTL32(x,y) _rotl(x,y) 24 | #define ROTL64(x,y) _rotl64(x,y) 25 | 26 | #define BIG_CONSTANT(x) (x) 27 | 28 | // Other compilers 29 | 30 | #else // defined(_MSC_VER) 31 | 32 | #define FORCE_INLINE inline __attribute__((always_inline)) 33 | 34 | inline uint32_t rotl32 ( uint32_t x, int8_t r ) 35 | { 36 | return (x << r) | (x >> (32 - r)); 37 | } 38 | 39 | inline uint64_t rotl64 ( uint64_t x, int8_t r ) 40 | { 41 | return (x << r) | (x >> (64 - r)); 42 | } 43 | 44 | #define ROTL32(x,y) rotl32(x,y) 45 | #define ROTL64(x,y) rotl64(x,y) 46 | 47 | #define BIG_CONSTANT(x) (x##LLU) 48 | 49 | #endif // !defined(_MSC_VER) 50 | 51 | //----------------------------------------------------------------------------- 52 | // Block read - if your platform needs to do endian-swapping or can only 53 | // handle aligned reads, do the conversion here 54 | 55 | FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) 56 | { 57 | return p[i]; 58 | } 59 | 60 | FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) 61 | { 62 | return p[i]; 63 | } 64 | 65 | //----------------------------------------------------------------------------- 66 | // Finalization mix - force all bits of a hash block to avalanche 67 | 68 | FORCE_INLINE uint32_t fmix32 ( uint32_t h ) 69 | { 70 | h ^= h >> 16; 71 | h *= 0x85ebca6b; 72 | h ^= h >> 13; 73 | h *= 0xc2b2ae35; 74 | h ^= h >> 16; 75 | 76 | return h; 77 | } 78 | 79 | //---------- 80 | 81 | FORCE_INLINE uint64_t fmix64 ( uint64_t k ) 82 | { 83 | k ^= k >> 33; 84 | k *= BIG_CONSTANT(0xff51afd7ed558ccd); 85 | k ^= k >> 33; 86 | k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); 87 | k ^= k >> 33; 88 | 89 | return k; 90 | } 91 | 92 | //----------------------------------------------------------------------------- 93 | 94 | void MurmurHash3_x86_32 ( const void * key, int len, 95 | uint32_t seed, void * out ) 96 | { 97 | const uint8_t * data = (const uint8_t*)key; 98 | const int nblocks = len / 4; 99 | 100 | uint32_t h1 = seed; 101 | 102 | const uint32_t c1 = 0xcc9e2d51; 103 | const uint32_t c2 = 0x1b873593; 104 | 105 | //---------- 106 | // body 107 | 108 | const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); 109 | 110 | for(int i = -nblocks; i; i++) 111 | { 112 | uint32_t k1 = getblock32(blocks,i); 113 | 114 | k1 *= c1; 115 | k1 = ROTL32(k1,15); 116 | k1 *= c2; 117 | 118 | h1 ^= k1; 119 | h1 = ROTL32(h1,13); 120 | h1 = h1*5+0xe6546b64; 121 | } 122 | 123 | //---------- 124 | // tail 125 | 126 | const uint8_t * tail = (const uint8_t*)(data + nblocks*4); 127 | 128 | uint32_t k1 = 0; 129 | 130 | switch(len & 3) 131 | { 132 | case 3: k1 ^= tail[2] << 16; 133 | case 2: k1 ^= tail[1] << 8; 134 | case 1: k1 ^= tail[0]; 135 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 136 | }; 137 | 138 | //---------- 139 | // finalization 140 | 141 | h1 ^= len; 142 | 143 | h1 = fmix32(h1); 144 | 145 | *(uint32_t*)out = h1; 146 | } 147 | 148 | //----------------------------------------------------------------------------- 149 | 150 | void MurmurHash3_x86_128 ( const void * key, const int len, 151 | uint32_t seed, void * out ) 152 | { 153 | const uint8_t * data = (const uint8_t*)key; 154 | const int nblocks = len / 16; 155 | 156 | uint32_t h1 = seed; 157 | uint32_t h2 = seed; 158 | uint32_t h3 = seed; 159 | uint32_t h4 = seed; 160 | 161 | const uint32_t c1 = 0x239b961b; 162 | const uint32_t c2 = 0xab0e9789; 163 | const uint32_t c3 = 0x38b34ae5; 164 | const uint32_t c4 = 0xa1e38b93; 165 | 166 | //---------- 167 | // body 168 | 169 | const 
uint32_t * blocks = (const uint32_t *)(data + nblocks*16); 170 | 171 | for(int i = -nblocks; i; i++) 172 | { 173 | uint32_t k1 = getblock32(blocks,i*4+0); 174 | uint32_t k2 = getblock32(blocks,i*4+1); 175 | uint32_t k3 = getblock32(blocks,i*4+2); 176 | uint32_t k4 = getblock32(blocks,i*4+3); 177 | 178 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 179 | 180 | h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; 181 | 182 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 183 | 184 | h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; 185 | 186 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 187 | 188 | h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; 189 | 190 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 191 | 192 | h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; 193 | } 194 | 195 | //---------- 196 | // tail 197 | 198 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 199 | 200 | uint32_t k1 = 0; 201 | uint32_t k2 = 0; 202 | uint32_t k3 = 0; 203 | uint32_t k4 = 0; 204 | 205 | switch(len & 15) 206 | { 207 | case 15: k4 ^= tail[14] << 16; 208 | case 14: k4 ^= tail[13] << 8; 209 | case 13: k4 ^= tail[12] << 0; 210 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 211 | 212 | case 12: k3 ^= tail[11] << 24; 213 | case 11: k3 ^= tail[10] << 16; 214 | case 10: k3 ^= tail[ 9] << 8; 215 | case 9: k3 ^= tail[ 8] << 0; 216 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 217 | 218 | case 8: k2 ^= tail[ 7] << 24; 219 | case 7: k2 ^= tail[ 6] << 16; 220 | case 6: k2 ^= tail[ 5] << 8; 221 | case 5: k2 ^= tail[ 4] << 0; 222 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 223 | 224 | case 4: k1 ^= tail[ 3] << 24; 225 | case 3: k1 ^= tail[ 2] << 16; 226 | case 2: k1 ^= tail[ 1] << 8; 227 | case 1: k1 ^= tail[ 0] << 0; 228 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 229 | }; 230 | 231 | //---------- 232 | // finalization 233 | 234 | h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; 235 | 236 | h1 += h2; h1 += h3; h1 += h4; 237 | h2 += h1; h3 += h1; h4 += h1; 238 | 239 | h1 = fmix32(h1); 240 | h2 = fmix32(h2); 241 | h3 = fmix32(h3); 242 | h4 = fmix32(h4); 243 | 244 | h1 += h2; h1 += h3; h1 += h4; 245 | h2 += h1; h3 += h1; h4 += h1; 246 | 247 | ((uint32_t*)out)[0] = h1; 248 | ((uint32_t*)out)[1] = h2; 249 | ((uint32_t*)out)[2] = h3; 250 | ((uint32_t*)out)[3] = h4; 251 | } 252 | 253 | //----------------------------------------------------------------------------- 254 | 255 | void MurmurHash3_x64_128 ( const void * key, const int len, 256 | const uint32_t seed, void * out ) 257 | { 258 | const uint8_t * data = (const uint8_t*)key; 259 | const int nblocks = len / 16; 260 | 261 | uint64_t h1 = seed; 262 | uint64_t h2 = seed; 263 | 264 | const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); 265 | const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); 266 | 267 | //---------- 268 | // body 269 | 270 | const uint64_t * blocks = (const uint64_t *)(data); 271 | 272 | for(int i = 0; i < nblocks; i++) 273 | { 274 | uint64_t k1 = getblock64(blocks,i*2+0); 275 | uint64_t k2 = getblock64(blocks,i*2+1); 276 | 277 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 278 | 279 | h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; 280 | 281 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 282 | 283 | h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; 284 | } 285 | 286 | //---------- 287 | // tail 288 | 289 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 290 | 291 | uint64_t k1 = 0; 292 | uint64_t k2 = 0; 293 | 294 | switch(len & 15) 295 | { 296 | case 15: k2 ^= 
((uint64_t)tail[14]) << 48; 297 | case 14: k2 ^= ((uint64_t)tail[13]) << 40; 298 | case 13: k2 ^= ((uint64_t)tail[12]) << 32; 299 | case 12: k2 ^= ((uint64_t)tail[11]) << 24; 300 | case 11: k2 ^= ((uint64_t)tail[10]) << 16; 301 | case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; 302 | case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; 303 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 304 | 305 | case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; 306 | case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; 307 | case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; 308 | case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; 309 | case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; 310 | case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; 311 | case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; 312 | case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; 313 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 314 | }; 315 | 316 | //---------- 317 | // finalization 318 | 319 | h1 ^= len; h2 ^= len; 320 | 321 | h1 += h2; 322 | h2 += h1; 323 | 324 | h1 = fmix64(h1); 325 | h2 = fmix64(h2); 326 | 327 | h1 += h2; 328 | h2 += h1; 329 | 330 | ((uint64_t*)out)[0] = h1; 331 | ((uint64_t*)out)[1] = h2; 332 | } 333 | 334 | //----------------------------------------------------------------------------- 335 | -------------------------------------------------------------------------------- /src/_hydra.pyx: -------------------------------------------------------------------------------- 1 | cimport cython 2 | import operator 3 | import os 4 | import sys 5 | import tempfile 6 | 7 | cdef extern from "ctype.h" nogil: 8 | cdef int tolower (int c) 9 | 10 | cdef extern from "stdlib.h" nogil: 11 | long long int llabs(long long int j) 12 | 13 | 14 | cdef extern from "stdio.h" nogil: 15 | ctypedef struct FILE 16 | FILE *fopen(char *path, char *mode) 17 | int fclose(FILE *strea) 18 | cdef char* fgets (char *buffer, int fd, FILE *stream) 19 | 20 | cdef extern from "mmap_writer.h" nogil: 21 | cdef char* map_file_ro(int fd, size_t filesize, int want_lock) except NULL 22 | cdef char* map_file_rw(int fd, size_t filesize, int want_lock) except NULL 23 | cdef int open_mmap_file_ro(char* filepath) except -1 24 | cdef int open_mmap_file_rw(char* filename, size_t bytesize) except -1 25 | cdef void bulkload_file(char* buffer, char* filename) 26 | cdef int close_file(int fd) except -1 27 | cdef int flush_to_disk(int fd) except -1 28 | cdef void turn_bits_on(char *map, size_t index, char bitmask) 29 | cdef int unmap_file(char* map, size_t filesize) except -1 30 | 31 | cdef extern from "MurmurHash3.h" nogil: 32 | void MurmurHash3_x64_128 (void * key, int len, unsigned int seed, void * out) 33 | 34 | def hash(key, int seed=0): 35 | """ This function hashes a string using the Murmur3 hash algorithm""" 36 | cdef long result[2] 37 | if isinstance(key, unicode): 38 | key = key.encode('utf8') 39 | MurmurHash3_x64_128(key, len(key), seed, result) 40 | return long(result[0]) << 64 | (long(result[1]) & 0xFFFFFFFFFFFFFFFF) 41 | 42 | cdef class MMapBitField: 43 | cdef char* _filename 44 | cdef int _fd 45 | cdef long _bitsize 46 | cdef long _bytesize 47 | cdef char* _buffer 48 | cdef int _read_only 49 | cdef int _fdatasync_on_close 50 | 51 | def __cinit__(self, filename, long bitsize, int read_only, int want_lock=False, int fdatasync_on_close=True): 52 | if isinstance(filename, unicode): 53 | filename = filename.encode('utf8') 54 | self._filename = filename 55 | self._bitsize = bitsize 56 | self._bytesize = (bitsize / 8) + 2 57 | self._read_only = read_only 58 | self._fdatasync_on_close = fdatasync_on_close 59 | 60 | # Now setup the file and 
mmap 61 | if read_only: 62 | self.open_ro_buffer(want_lock) 63 | else: 64 | self.open_rw_buffer(want_lock) 65 | 66 | cdef void open_rw_buffer(self, want_lock=False): 67 | self._fd = open_mmap_file_rw(self._filename, self._bytesize) 68 | self._buffer = map_file_rw(self._fd, self._bytesize, want_lock) 69 | 70 | cdef void open_ro_buffer(self, want_lock=False): 71 | self._fd = open_mmap_file_ro(self._filename) 72 | self._buffer = map_file_ro(self._fd, self._bytesize, want_lock) 73 | 74 | def __dealloc__(self): 75 | self.close() 76 | 77 | cpdef close(self): 78 | if self._fd >= 0 and self._buffer: 79 | if not self._read_only and self._fdatasync_on_close: 80 | flush_to_disk(self._fd) 81 | unmap_file(self._buffer, self._bytesize) 82 | close_file(self._fd) 83 | self._fd = -1 84 | self._buffer = NULL 85 | 86 | cpdef fdatasync(self): 87 | """ Flush everything to disk """ 88 | if self._fd < 0 or not self._buffer: 89 | raise ValueError('I/O operation on closed file') 90 | 91 | if self._read_only: 92 | raise ValueError('bit field is read only') 93 | 94 | flush_to_disk(self._fd) 95 | 96 | def __setitem__(self, size_t key, int value): 97 | cdef size_t byte_offset = key / 8 98 | cdef char bitmask 99 | cdef char bitval 100 | 101 | if self._fd < 0 or not self._buffer: 102 | raise ValueError('I/O operation on closed file') 103 | 104 | if self._read_only: 105 | raise ValueError('bit field is read only') 106 | 107 | bitmask = 2 ** (key % 8) 108 | if value: 109 | bitval = self._buffer[byte_offset] | bitmask 110 | else: 111 | bitval = self._buffer[byte_offset] & ~bitmask 112 | if bitval != self._buffer[byte_offset]: 113 | self._buffer[byte_offset] = bitval 114 | 115 | def __getitem__(self, size_t key): 116 | cdef size_t byte_offset = key / 8 117 | 118 | if self._fd < 0 or not self._buffer: 119 | raise ValueError('I/O operation on closed file') 120 | 121 | cdef char old_bitmask = self._buffer[byte_offset] 122 | return (old_bitmask & (2 ** (key % 8))) 123 | 124 | def __iter__(self): 125 | if self._fd < 0 or not self._buffer: 126 | raise ValueError('I/O operation on closed file') 127 | 128 | return MMapIter(self) 129 | 130 | def __len__(self): 131 | return self.size() 132 | 133 | cpdef size(self): 134 | if self._fd < 0 or not self._buffer: 135 | raise ValueError('I/O operation on closed file') 136 | 137 | return self._bitsize 138 | 139 | 140 | cdef class MMapIter: 141 | cdef size_t _idx 142 | cdef MMapBitField _bitfield 143 | def __cinit__(self, bitfield): 144 | self._bitfield = bitfield 145 | self._idx = 0 146 | 147 | def __next__(self): 148 | cdef int result 149 | if self._idx < len(self._bitfield): 150 | result = self._bitfield[self._idx] 151 | self._idx +=1 152 | return result 153 | raise StopIteration 154 | 155 | 156 | class UnsupportedOperationException(Exception): pass 157 | 158 | class BloomSpecification: 159 | """ 160 | A wrapper class that holds two key parameters for a Bloom Filter: the 161 | number of hash functions used, and the number of buckets per element used. 162 | """ 163 | 164 | def __init__(self, k, bucketsPerElement): 165 | self.K = k 166 | self.bucketsPerElement = bucketsPerElement 167 | 168 | def __eq__(self, other): 169 | c1 = getattr(other, 'K', None) == self.K 170 | c2 = getattr(other, 'bucketsPerElement', None) == self.bucketsPerElement 171 | return c1 and c2 172 | 173 | cdef class BloomCalculations: 174 | """ 175 | This calculation class is ported straight from Cassandra. 
176 | """ 177 | minBuckets = 2 178 | minK = 1 179 | 180 | PROBS = [ 181 | [1.0], # dummy row representing 0 buckets per element 182 | [1.0, 1.0], # dummy row representing 1 buckets per element 183 | [1.0, 0.393, 0.400], 184 | [1.0, 0.283, 0.237, 0.253], 185 | [1.0, 0.221, 0.155, 0.147, 0.160], 186 | [1.0, 0.181, 0.109, 0.092, 0.092, 0.101], # 5 187 | [1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638], 188 | [1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364], 189 | [1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229], 190 | [1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145], 191 | [1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846], # 10 192 | [1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509], 193 | [1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314], 194 | [1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217, 0.00199, 0.00194], 195 | [1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146, 0.00129, 0.00121, 0.0012], 196 | [1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852, 0.000775, 0.000744], # 15 197 | [1.0, 0.0606, 0.0138, 0.005, 0.00239, 0.00139, 0.000935, 0.000702, 0.000574, 0.000505, 0.00047, 0.000459], 198 | [1.0, 0.0571, 0.0123, 0.00423, 0.00193, 0.00107, 0.000692, 0.000499, 0.000394, 0.000335, 0.000302, 0.000287, 0.000284], 199 | [1.0, 0.054, 0.0111, 0.00362, 0.00158, 0.000839, 0.000519, 0.00036, 0.000275, 0.000226, 0.000198, 0.000183, 0.000176], 200 | [1.0, 0.0513, 0.00998, 0.00312, 0.0013, 0.000663, 0.000394, 0.000264, 0.000194, 0.000155, 0.000132, 0.000118, 0.000111, 0.000109], 201 | [1.0, 0.0488, 0.00906, 0.0027, 0.00108, 0.00053, 0.000303, 0.000196, 0.00014, 0.000108, 8.89e-05, 7.77e-05, 7.12e-05, 6.79e-05, 6.71e-05] # 20 202 | ] 203 | 204 | optKPerBuckets = [max(1, min(enumerate(probs), key=operator.itemgetter(1))[0]) for probs in PROBS] 205 | 206 | @classmethod 207 | def computeBloomSpec1(cls, bucketsPerElement): 208 | """ 209 | Given the number of buckets that can be used per element, return a 210 | specification that minimizes the false positive rate. 211 | 212 | @param bucketsPerElement The number of buckets per element for the filter. 213 | @return A spec that minimizes the false positive rate. 214 | """ 215 | assert bucketsPerElement >= 1 216 | assert bucketsPerElement <= len(BloomCalculations.PROBS) - 1 217 | return BloomSpecification(cls.optKPerBuckets[bucketsPerElement], bucketsPerElement) 218 | 219 | 220 | @classmethod 221 | def computeBloomSpec2(cls, maxBucketsPerElement, maxFalsePosProb): 222 | """ 223 | Given a maximum tolerable false positive probability, compute a Bloom 224 | specification which will give less than the specified false positive rate, 225 | but minimize the number of buckets per element and the number of hash 226 | functions used. Because bandwidth (and therefore total bitvector size) 227 | is considered more expensive than computing power, preference is given 228 | to minimizing buckets per element rather than number of hash functions. 229 | 230 | @param maxBucketsPerElement The maximum number of buckets available for the filter. 231 | @param maxFalsePosProb The maximum tolerable false positive rate. 
232 | @return A Bloom Specification which would result in a false positive rate 233 | less than specified by the function call 234 | @throws UnsupportedOperationException if a filter satisfying the parameters cannot be met 235 | """ 236 | assert maxBucketsPerElement >= 1 237 | assert maxBucketsPerElement <= len(BloomCalculations.PROBS) - 1 238 | maxK = len(BloomCalculations.PROBS[maxBucketsPerElement]) - 1 239 | 240 | # Handle the trivial cases 241 | if maxFalsePosProb >= BloomCalculations.PROBS[cls.minBuckets][cls.minK]: 242 | return BloomSpecification(2, cls.optKPerBuckets[2]) 243 | 244 | if maxFalsePosProb < BloomCalculations.PROBS[maxBucketsPerElement][maxK]: 245 | msg = "Unable to satisfy %s with %s buckets per element" 246 | raise UnsupportedOperationException(msg % (maxFalsePosProb, maxBucketsPerElement)) 247 | 248 | # First find the minimal required number of buckets: 249 | bucketsPerElement = 2 250 | K = cls.optKPerBuckets[2] 251 | while(BloomCalculations.PROBS[bucketsPerElement][K] > maxFalsePosProb): 252 | bucketsPerElement += 1 253 | K = cls.optKPerBuckets[bucketsPerElement] 254 | # Now that the number of buckets is sufficient, see if we can relax K 255 | # without losing too much precision. 256 | while BloomCalculations.PROBS[bucketsPerElement][K - 1] <= maxFalsePosProb: 257 | K -= 1 258 | 259 | return BloomSpecification(K, bucketsPerElement) 260 | 261 | cdef class BloomFilter: 262 | EXCESS = 20 263 | cdef unsigned int _hashCount 264 | cdef MMapBitField _bitmap 265 | cdef int _ignore_case 266 | cdef object _tempfile 267 | 268 | def __cinit__(self, unsigned int hashes, MMapBitField bitmap, int ignore_case): 269 | cdef int i 270 | 271 | self._hashCount = hashes 272 | self._bitmap = bitmap 273 | self._ignore_case = ignore_case 274 | 275 | def __enter__(self): 276 | return self 277 | 278 | def __exit__(self, *excinfo): 279 | self.close() 280 | return None 281 | 282 | cpdef close(self): 283 | self._bitmap.close() 284 | 285 | cpdef fdatasync(self): 286 | """ Flush everything to disk """ 287 | self._bitmap.fdatasync() 288 | 289 | def filename(self): 290 | """ 291 | Filename of the MMAP file 292 | """ 293 | return self._bitmap._filename 294 | 295 | @classmethod 296 | def _maxBucketsPerElement(cls, numElements): 297 | numElements = max(1, numElements) 298 | v = (sys.maxsize - cls.EXCESS) / float(numElements) 299 | if v < 1.0: 300 | msg = "Cannot compute probabilities for %s elements." 301 | raise UnsupportedOperationException(msg % numElements) 302 | return min(len(BloomCalculations.PROBS) - 1, int(v)) 303 | 304 | @classmethod 305 | def _bucketsFor(cls, numElements, bucketsPer, filename, read_only, want_lock=False, fdatasync_on_close=True): 306 | numBits = numElements * bucketsPer + cls.EXCESS 307 | bf_size = min(sys.maxsize, numBits) 308 | return MMapBitField(filename, bf_size, read_only, 309 | want_lock=want_lock, 310 | fdatasync_on_close=fdatasync_on_close) 311 | 312 | @classmethod 313 | def getFilter(cls, numElements, maxFalsePosProbability, **kwargs): 314 | """ 315 | Create a bloom filter. 316 | 317 | numElements and maxFalsePosProbability are taken to form a 318 | specification for the Bloom Filter. The filter is designed 319 | to hold a maximum of numElements entries and will have an 320 | upper bound false positive error rate of 321 | maxFalsePosProbability. 322 | 323 | Optional **kwargs: 324 | 325 | filename: The filepath of the mmap I/O file. If set to None, a file 326 | will be created in temporary storage. 
Default: None 327 | 328 | ignore_case: All strings will be forced into lower case for 329 | both add and search functions. Default: False 330 | 331 | read_only: The file will be opened in read-only mode and the 332 | memory map will be set up read-only as well. Default: False 333 | 334 | """ 335 | filename = kwargs.get('filename', None) 336 | ignore_case = kwargs.get('ignore_case', 0) 337 | read_only = kwargs.get('read_only', 0) 338 | want_lock = kwargs.get('want_lock', False) 339 | fdatasync_on_close = kwargs.get('fdatasync_on_close', True) 340 | 341 | for k in ['filename', 'ignore_case', 'read_only', 'want_lock', 'fdatasync_on_close']: 342 | if k in kwargs: 343 | del kwargs[k] 344 | if kwargs: 345 | raise RuntimeError("Unexpected kwargs: %s" % str(kwargs)) 346 | 347 | fileobj = None if filename else tempfile.NamedTemporaryFile(delete=True) 348 | if fileobj is not None: 349 | fileobj.file.close() 350 | filename = fileobj.name 351 | 352 | assert 0 < maxFalsePosProbability <= 1.0, "Invalid probability" 353 | bucketsPerElement = cls._maxBucketsPerElement(numElements) 354 | spec = BloomCalculations.computeBloomSpec2(bucketsPerElement, maxFalsePosProbability) 355 | bitmap = cls._bucketsFor(numElements, spec.bucketsPerElement, filename, read_only, want_lock=want_lock, fdatasync_on_close=fdatasync_on_close) 356 | bf = BloomFilter(spec.K, bitmap, ignore_case) 357 | if fileobj is not None: 358 | bf._tempfile = fileobj 359 | return bf 360 | 361 | def __setitem__(self, key, int ignored): 362 | self.add(key) 363 | 364 | def __getitem__(self, key): 365 | return int(self.contains(key)) 366 | 367 | def __contains__(self, ustring): 368 | return self.contains(ustring) 369 | 370 | @cython.boundscheck(False) 371 | cpdef add(self, ustring): 372 | """ Add a key into the filter. Just like a set. """ 373 | cdef unsigned long long i 374 | cdef unsigned long long _bucket_indexes[1000] 375 | 376 | if isinstance(ustring, unicode): 377 | key = ustring.encode('utf8') 378 | else: 379 | key = ustring 380 | 381 | if self._ignore_case: 382 | c_lcase(key) 383 | 384 | self._get_hash_buckets(key, _bucket_indexes, self._hashCount, self.buckets()) 385 | for i in range(self._hashCount): 386 | self._bitmap[_bucket_indexes[i]] = 1 387 | 388 | @cython.boundscheck(False) 389 | cpdef contains(self, ustring): 390 | """ Check if a key is in the bloom filter. May return a false positive. """ 391 | cdef unsigned long long _bucket_indexes[1000] 392 | cdef unsigned long long i 393 | 394 | if isinstance(ustring, unicode): 395 | key = ustring.encode('utf8') 396 | else: 397 | key = ustring 398 | 399 | if self._ignore_case: 400 | c_lcase(key) 401 | self._get_hash_buckets(key, _bucket_indexes, self._hashCount, self.buckets()) 402 | for i in range(self._hashCount): 403 | if not self._bitmap[_bucket_indexes[i]]: 404 | return False 405 | return True 406 | 407 | cpdef buckets(self): 408 | """ Return the number of total buckets (bits) in the bloom filter """ 409 | return self._bitmap.size() 410 | 411 | def getHashBuckets(self, ustring, unsigned int hashCount, unsigned long long max): 412 | """ This method is just available for test purposes. Not actually useful for normal users. 
""" 413 | cdef unsigned long long _bucket_indexes[1000] 414 | 415 | if isinstance(ustring, unicode): 416 | key = ustring.encode('utf8') 417 | else: 418 | key = ustring 419 | 420 | self._get_hash_buckets(key, _bucket_indexes, hashCount, max) 421 | result = [] 422 | for i in range(hashCount): 423 | result.append(_bucket_indexes[i]) 424 | return result 425 | 426 | @cython.boundscheck(False) 427 | cdef void _get_hash_buckets(self, bytes key, unsigned long long * _bucket_indexes, unsigned int hashCount, unsigned long max): 428 | """ 429 | Murmur is faster than an SHA-based approach and provides as-good collision 430 | resistance. The combinatorial generation approach described in 431 | https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf 432 | does prove to work in actual tests, and is obviously faster 433 | than performing further iterations of murmur. 434 | """ 435 | cdef unsigned long result[2] 436 | cdef unsigned long hash1, hash2 437 | cdef unsigned long i 438 | 439 | MurmurHash3_x64_128(key, len(key), 0, result) 440 | hash1 = result[0] 441 | MurmurHash3_x64_128(key, len(key), result[1] & 0xFFFFFFFF, result) 442 | hash2 = result[0] 443 | 444 | for i in range(hashCount): 445 | _bucket_indexes[i] = llabs((hash1 + i * hash2) % max) 446 | 447 | cdef void _strip_newline(self, char *buffer, unsigned int size): 448 | """ 449 | Strip newline by overwriting with a null 450 | """ 451 | cdef unsigned int i 452 | for i in range(size): 453 | if buffer[i] == '\n': 454 | buffer[i] = '\x00' 455 | return 456 | 457 | def bulkload_text(self, char* filename): 458 | cdef FILE* file_in = fopen( filename, "r") 459 | cdef char line[128] 460 | if file_in: 461 | while fgets(line, 128, file_in): 462 | self._strip_newline(line, len(line)) 463 | self.add(line) 464 | # Yeah, i should check for errors. sosumi. 465 | fclose(file_in) 466 | 467 | cdef void c_lcase(char* buffer): 468 | """ 469 | Force string to lower case 470 | """ 471 | cdef unsigned int i 472 | for i in range(len(buffer)): 473 | buffer[i] = tolower(buffer[i]) 474 | --------------------------------------------------------------------------------