├── requirements.txt
├── .gitignore
├── MANIFEST.in
├── .travis.yml
├── src
│   ├── mmap_writer.h
│   ├── MurmurHash3.h
│   ├── hydra.py
│   ├── mmap_writer.c
│   ├── MurmurHash3.c
│   └── _hydra.pyx
├── CHANGES.rst
├── tests
│   ├── timeit.py
│   ├── test_murmur.py
│   ├── helpers.py
│   ├── test_mmapbitarray.py
│   └── test_bloom.py
├── license.txt
├── README.md
└── setup.py
/requirements.txt: -------------------------------------------------------------------------------- 1 | Cython 2 | flake8 3 | nose 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | a.out 2 | *.a 3 | *.o 4 | *.pyc 5 | *.so 6 | *.swp 7 | *egg-info 8 | bin 9 | build 10 | dist 11 | include 12 | javabloom 13 | lib 14 | man 15 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.md 2 | include *.rst 3 | include *.txt 4 | include MANIFEST.in 5 | include setup.py 6 | include setup.cfg 7 | 8 | recursive-include src *.c *.h *.py 9 | recursive-include tests *.c *.h *.py 10 | recursive-include docs *.html *.css *.gif *.jpg *.txt 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # new container based environment 2 | sudo: false 3 | 4 | cache: 5 | pip: true 6 | 7 | language: python 8 | python: 9 | - "2.7" 10 | - "3.4" 11 | - "3.5" 12 | install: 13 | - "pip install -U pip wheel" 14 | - "pip install -r requirements.txt" 15 | script: 16 | - "cythonize src/_hydra.pyx" 17 | - "python setup.py build_ext --inplace" 18 | - "flake8 src tests" 19 | - "python setup.py test" 20 | -------------------------------------------------------------------------------- /src/mmap_writer.h: -------------------------------------------------------------------------------- 1 | 2 | int open_mmap_file_rw(char* filename, size_t bytesize); 3 | int open_mmap_file_ro(char* filepath); 4 | char* map_file_rw(int fd, size_t filesize, int want_lock); 5 | char* map_file_ro(int fd, size_t filesize, int want_lock); 6 | void turn_bits_on(char *map, off_t index, char bitmask); 7 | int flush_to_disk(int fd); 8 | int close_file(int fd); 9 | int unmap_file(char* map, size_t filesize); 10 | void bulkload_file(char* buffer, char* filename); 11 | -------------------------------------------------------------------------------- /CHANGES.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | Changelog 3 | ========= 4 | 5 | 2.6 (unreleased) 6 | ---------------- 7 | 8 | 9 | 2.5 (2016-08-02) 10 | ---------------- 11 | 12 | - Close descriptor file in Reading/UpdatingBloomFilter. 13 | 14 | - Define some BloomFilter and MMapBitField methods as cpdef. 15 | 16 | 2.4 (2016-08-02) 17 | ---------------- 18 | 19 | - #12, #15: Ship C code to avoid Cython install time dependency. 20 | 21 | 2.3 (2015-06-04) 22 | ---------------- 23 | 24 | - Look at Git commit history for changes. 
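Filters clean up deterministically: BloomFilter implements __enter__/__exit__ (see src/_hydra.pyx below), so the mmap and its file descriptor can be released with a with-block rather than waiting for the deallocator. A minimal sketch (the path is hypothetical and assumes a filter persisted earlier by WritingBloomFilter):

    from hydra import ReadingBloomFilter

    # Leaving the with-block closes the bit field: the mmap is unmapped
    # and the underlying file descriptor is closed.
    with ReadingBloomFilter('/tmp/demo.bloom') as bloom:
        print('apple' in bloom)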
25 | -------------------------------------------------------------------------------- /tests/timeit.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a speed profiler for insertion and lookup 3 | """ 4 | import cProfile 5 | from hydra import WritingBloomFilter 6 | from helpers import KeyGenerator 7 | 8 | keygen = KeyGenerator() 9 | input_keys = [keygen.random_string() for i in range(100000)] 10 | other_keys = [keygen.random_string() for i in range(200000)] 11 | 12 | ELEMENTS = 10000000 13 | MAX_FAILURE_RATE = 0.1 14 | bf = WritingBloomFilter(ELEMENTS, MAX_FAILURE_RATE) 15 | 16 | 17 | def test_one(): 18 | for key in input_keys: 19 | bf[key] = 0 20 | 21 | for key in other_keys: 22 | key in bf 23 | 24 | cProfile.run('test_one()') 25 | -------------------------------------------------------------------------------- /tests/test_murmur.py: -------------------------------------------------------------------------------- 1 | import _hydra 2 | from helpers import KeyGenerator 3 | 4 | 5 | def test__hydra(): 6 | # This test will probably fail on big-endian machines 7 | h1 = _hydra.hash('foo') 8 | h2 = _hydra.hash('foo', h1 & 0xFFFFFFFF) 9 | assert (-39287385592190013122878999397579195001, 10 | -73964642705803263641983394469427790275) == (h1, h2) 11 | 12 | 13 | def test_collisions(): 14 | keygen = KeyGenerator() 15 | hashes = {} 16 | for i, key in enumerate(keygen.randomKeys()): 17 | hcode = _hydra.hash(key) 18 | if hcode not in hashes: 19 | hashes[hcode] = key 20 | else: 21 | raise RuntimeError("Hash collision!: {} {}".format( 22 | key, hashes[hcode])) 23 | 24 | 25 | def test_null_key(): 26 | h0 = _hydra.hash('foo') 27 | h1 = _hydra.hash('foo\0bar') 28 | h2 = _hydra.hash('foo\0baz') 29 | assert h0 != h1, 'Hash collision for appended null' 30 | assert h0 != h2, 'Hash collision for appended null' 31 | assert h1 != h2, 'Hash collision for bytes after null' 32 | -------------------------------------------------------------------------------- /license.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2010 Victor Ng 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
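tests/test_murmur.py above also documents the hash-chaining convention the filter itself uses: a second murmur pass is seeded with the low 32 bits of the first digest. A minimal illustration of the same calls (the concrete digest values are platform-dependent, as the first test notes):

    import _hydra

    h1 = _hydra.hash('foo')                    # seed defaults to 0
    h2 = _hydra.hash('foo', h1 & 0xFFFFFFFF)   # reseed with the low 32 bits
    assert h1 != h2                            # distinct seeds, distinct digests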
22 | -------------------------------------------------------------------------------- /tests/helpers.py: -------------------------------------------------------------------------------- 1 | import string 2 | import random 3 | from os.path import join, dirname 4 | 5 | 6 | class KeyGenerator(object): 7 | def __init__(self, seed=314519): 8 | fname = join(dirname(__file__), 'words') 9 | self._fname = fname 10 | self._lines = open(fname, 'r').readlines() 11 | self._linecount = len(self._lines) 12 | self.ELEMENTS = 10000 13 | 14 | self._r1 = random.Random() 15 | self._r1.seed(seed) 16 | 17 | def __len__(self): 18 | return self._linecount 19 | 20 | def __getitem__(self, i): 21 | start = i.start or 0 22 | stop = i.stop or self._linecount 23 | step = i.step or 1 24 | return self._lines[start:stop:step] 25 | 26 | def random_string(self, length=16): 27 | return "".join([self._r1.choice(string.ascii_letters + string.digits) 28 | for x in range(length)]) 29 | 30 | def randomKeys(self, num_elem=None): 31 | ''' 32 | Return a bunch of random keys 33 | ''' 34 | if not num_elem: 35 | num_elem = self.ELEMENTS 36 | return self._r1.sample(self._lines, num_elem) 37 | -------------------------------------------------------------------------------- /src/MurmurHash3.h: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------------- 2 | // MurmurHash3 was written by Austin Appleby, and is placed in the public 3 | // domain. The author hereby disclaims copyright to this source code. 4 | 5 | #ifndef _MURMURHASH3_H_ 6 | #define _MURMURHASH3_H_ 7 | 8 | //----------------------------------------------------------------------------- 9 | // Platform-specific functions and macros 10 | 11 | // Microsoft Visual Studio 12 | 13 | #if defined(_MSC_VER) && (_MSC_VER < 1600) 14 | 15 | typedef unsigned char uint8_t; 16 | typedef unsigned int uint32_t; 17 | typedef unsigned __int64 uint64_t; 18 | 19 | // Other compilers 20 | 21 | #else // defined(_MSC_VER) 22 | 23 | #include <stdint.h> 24 | 25 | #endif // !defined(_MSC_VER) 26 | 27 | //----------------------------------------------------------------------------- 28 | 29 | void MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed, void * out ); 30 | 31 | void MurmurHash3_x86_128 ( const void * key, int len, uint32_t seed, void * out ); 32 | 33 | void MurmurHash3_x64_128 ( const void * key, int len, uint32_t seed, void * out ); 34 | 35 | //----------------------------------------------------------------------------- 36 | 37 | #endif // _MURMURHASH3_H_ 38 | -------------------------------------------------------------------------------- /tests/test_mmapbitarray.py: -------------------------------------------------------------------------------- 1 | from _hydra import MMapBitField 2 | import tempfile 3 | 4 | 5 | def test_ro_segfault(): 6 | tf = tempfile.NamedTemporaryFile(delete=True) 7 | rw_field = MMapBitField(tf.name, 80, 0) 8 | rw_field[0] = 1 9 | ro_field = MMapBitField(tf.name, 80, 1) 10 | try: 11 | ro_field[0] = 1 12 | except ValueError: 13 | pass 14 | 15 | 16 | def test_setitem(): 17 | tf = tempfile.NamedTemporaryFile(delete=True) 18 | bf = MMapBitField(tf.name, 80, 0) 19 | 20 | # verify set once 21 | bf[0] = 1 22 | assert bf[0] 23 | for idx in range(1, len(bf)): 24 | assert not bf[idx] 25 | bf[0] = 1 26 | assert bf[0] 27 | for idx in range(1, len(bf)): 28 | assert not bf[idx] 29 | 30 | # verify unset twice 31 | bf[0] = 0 32 | for idx in range(len(bf)): 33 | assert not bf[idx] 
34 | bf[0] = 0 35 | for idx in range(len(bf)): 36 | assert not bf[idx] 37 | 38 | # verify set at end twice 39 | bf[len(bf) - 1] = 1 40 | assert bf[len(bf) - 1] 41 | bf[len(bf) - 1] = 1 42 | assert bf[len(bf) - 1] 43 | 44 | # verify unset at end twice 45 | bf[len(bf) - 1] = 0 46 | assert not bf[len(bf) - 1] 47 | bf[len(bf) - 1] = 0 48 | assert not bf[len(bf) - 1] 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/crankycoder/hydra.svg?branch=master)](https://travis-ci.org/crankycoder/hydra) 2 | 3 | Hydra: The Python Bloom Filter. 4 | 5 | Compile with Cython 0.24 or higher. 6 | 7 | --- 8 | 9 | Hydra is a high-performance bloom filter. It's basically a port of 10 | the Cassandra bloom filter with some fun Cython hackery. 11 | 12 | 1) It's persistent, using memory-mapped I/O. On Linux, the mmap uses 13 | the MAP_POPULATE flag so the entire file is loaded into kernel space 14 | virtual memory. In other words - fast. 15 | 16 | 2) The hash function uses the MurmurHash3 algorithm, so it should be 17 | fast and have excellent key distribution and avalanche properties. 18 | 19 | 3) The filter exports a set-like interface. Use .add(), .contains(), 20 | or the "in" operator. 21 | 22 | 4) Tests. OMG what is wrong with people with no tests? 23 | 24 | The filter supports periodic forced synchronization to disk using 25 | fdatasync(), or you can just let the deallocator flush everything to 26 | disk when your filter goes out of scope, or your process terminates. 27 | 28 | Hydras are snakes with multiple heads. They're also bad dudes with 29 | snake logos on their chests who regularly try to beat on Nick Fury. 30 | Now it's a bloom filter. 31 | 32 | Mostly, I couldn't bear to make this yet another PySomeLibraryName 33 | library. 
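A quick usage sketch (sizing numbers and paths are illustrative; the API comes from src/hydra.py further down):

    from hydra import WritingBloomFilter, ReadingBloomFilter

    # Budget for ~100k keys with at most a 1% false positive rate.
    bloom = WritingBloomFilter(100000, 0.01, filename='/tmp/demo.bloom')
    bloom.add('apple')
    assert 'apple' in bloom        # keys that were added always hit
    bloom.contains('kiwi')         # False, barring a rare false positive

    # Reopen later; the spec is read back from /tmp/demo.bloom.desc.
    readonly = ReadingBloomFilter('/tmp/demo.bloom')
    assert 'apple' in readonly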
34 | 35 | 36 | Build, install a development build, and run the tests: 37 | 38 | $ pip install -r requirements.txt 39 | $ cythonize src/_hydra.pyx 40 | $ python setup.py develop 41 | $ python setup.py test 42 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | from setuptools.extension import Extension 3 | from os.path import join 4 | 5 | import os 6 | 7 | __version__ = '2.6.dev0' 8 | 9 | ext_modules = [Extension("_hydra", 10 | extra_compile_args=['-std=gnu99', 11 | '-O2', 12 | '-D_LARGEFILE64_SOURCE'], 13 | sources=["src/_hydra.c", 14 | 'src/mmap_writer.c', 15 | 'src/MurmurHash3.c'], 16 | 17 | # path to .h file(s) 18 | include_dirs=[join(os.getcwd(), 'src')], 19 | 20 | # path to .a or .so file(s) 21 | library_dirs=[join(os.getcwd(), 'src')])] 22 | 23 | setup(name='Hydra', 24 | author='Victor Ng', 25 | author_email='crankycoder@gmail.com', 26 | description='A high performance persistent bloom filter', 27 | url="http://github.com/crankycoder/Hydra", 28 | version=__version__, 29 | license='MIT License', 30 | zip_safe=False, 31 | package_dir={'': 'src'}, 32 | py_modules=['hydra'], 33 | ext_modules=ext_modules, 34 | test_suite='nose.collector', 35 | classifiers=[ 36 | 'License :: OSI Approved :: MIT License', 37 | 'Programming Language :: Python', 38 | 'Programming Language :: Python :: 2', 39 | 'Programming Language :: Python :: 2.7', 40 | 'Programming Language :: Python :: 3', 41 | 'Programming Language :: Python :: 3.4', 42 | 'Programming Language :: Python :: 3.5', 43 | 'Programming Language :: Python :: Implementation :: CPython', 44 | ], 45 | ) 46 | -------------------------------------------------------------------------------- /src/hydra.py: -------------------------------------------------------------------------------- 1 | import _hydra 2 | 3 | 4 | def ReadingBloomFilter(filename, want_lock=False): 5 | """ 6 | Load an existing bloom filter in read-only mode using filename 7 | as the backing datastore. Its (num_elements, max_fp_prob) 8 | specification is read from the companion .desc file. 9 | """ 10 | with open('{}.desc'.format(filename), 'r') as descriptor: 11 | num_elements = int(descriptor.readline()) 12 | max_fp_prob = float(descriptor.readline()) 13 | ignore_case = int(descriptor.readline()) 14 | 15 | return _hydra.BloomFilter.getFilter( 16 | num_elements, max_fp_prob, 17 | filename=filename, ignore_case=ignore_case, 18 | read_only=True, want_lock=want_lock) 19 | 20 | 21 | def UpdatingBloomFilter(filename, want_lock=False, fdatasync_on_close=True): 22 | """ 23 | Load an existing bloom filter in read-write mode using filename 24 | as the backing datastore. 25 | """ 26 | with open('{}.desc'.format(filename), 'r') as descriptor: 27 | num_elements = int(descriptor.readline()) 28 | max_fp_prob = float(descriptor.readline()) 29 | ignore_case = int(descriptor.readline()) 30 | 31 | return _hydra.BloomFilter.getFilter( 32 | num_elements, max_fp_prob, 33 | filename=filename, ignore_case=ignore_case, 34 | read_only=False, want_lock=want_lock, 35 | fdatasync_on_close=fdatasync_on_close) 36 | 37 | 38 | def WritingBloomFilter(num_elements, max_fp_prob, filename=None, 39 | ignore_case=False, want_lock=False, 40 | fdatasync_on_close=True): 41 | """ 42 | Create a read/write bloom filter sized for at most num_elements keys 43 | with a false positive probability capped at max_fp_prob, using filename 44 | as the backing datastore. 
45 | """ 46 | new_filter = _hydra.BloomFilter.getFilter( 47 | num_elements, max_fp_prob, 48 | filename=filename, ignore_case=ignore_case, 49 | read_only=False, want_lock=want_lock, 50 | fdatasync_on_close=fdatasync_on_close) 51 | if filename: 52 | with open('{}.desc'.format(filename), 'w') as descriptor: 53 | descriptor.write("{}\n".format(num_elements)) 54 | descriptor.write("{:0.8f}\n".format(max_fp_prob)) 55 | descriptor.write("{:d}\n".format(ignore_case)) 56 | return new_filter 57 | 58 | # Expose the murmur hash 59 | murmur_hash = _hydra.hash 60 | -------------------------------------------------------------------------------- /src/mmap_writer.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include 11 | 12 | #include "mmap_writer.h" 13 | 14 | #define FILEPATH "/tmp/mmapped.bin" 15 | #define NUMINTS (255) 16 | #define FILESIZE (NUMINTS * sizeof(char)) 17 | 18 | /* 19 | * Create the file and reallocate NULL bytes. 20 | * 21 | * Return the file descriptor to the file 22 | */ 23 | int open_mmap_file_rw(char* filename, size_t bytesize) 24 | { 25 | int fd; 26 | int result; 27 | 28 | /* Open a file for writing. 29 | * * - Creating the file if it doesn't exist. 30 | * * 31 | * * Note: "O_WRONLY" mode is not sufficient when mmaping. 32 | * */ 33 | 34 | fd = open(filename, O_RDWR | O_CREAT, (mode_t)0644); 35 | if (fd == -1) { 36 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 37 | "Error opening file for writing"); 38 | return -1; 39 | } 40 | 41 | #ifdef __linux__ 42 | /* Stretch the file size to the size of the (mmapped) array of 43 | * ints 44 | * */ 45 | result = posix_fallocate(fd, 0, bytesize); 46 | if (result) { 47 | errno = result; 48 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 49 | "Error calling lseek() to 'stretch' the file"); 50 | close(fd); 51 | return -1; 52 | } 53 | #else 54 | /* Stretch the file size to the size of the (mmapped) array of 55 | * ints 56 | * */ 57 | result = lseek(fd, bytesize-1, SEEK_SET); 58 | if (result == -1) { 59 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 60 | "Error calling lseek() to 'stretch' the file"); 61 | close(fd); 62 | return -1; 63 | } 64 | 65 | /* Something needs to be written at the end of the file to 66 | * * have the file actually have the new size. 67 | * * Just writing an empty string at the current file position 68 | * will do. 69 | * * 70 | * * Note: 71 | * * - The current position in the file is at the end of the 72 | * stretched 73 | * * file due to the call to lseek(). 74 | * * - An empty string is actually a single '\0' character, so a 75 | * zero-byte 76 | * * will be written at the last byte of the file. 
77 | * */ 78 | result = write(fd, "", 1); 79 | if (result != 1) { 80 | close(fd); 81 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 82 | "Error writing last byte of the file"); 83 | return -1; 84 | } 85 | #endif 86 | 87 | return fd; 88 | } 89 | 90 | int open_mmap_file_ro(char* filepath) 91 | { 92 | int fd; 93 | fd = open(filepath, O_RDONLY); 94 | if (fd == -1) { 95 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 96 | "Error opening file for reading"); 97 | return -1; 98 | } 99 | return fd; 100 | } 101 | 102 | /* 103 | * mmap a file descriptor in read-only mode and return a char array 104 | */ 105 | char* map_file_ro(int fd, size_t filesize, int want_lock) 106 | { 107 | char* map; 108 | int flags = MAP_SHARED; 109 | #ifdef __linux__ 110 | if (want_lock) { 111 | flags |= MAP_LOCKED; 112 | } 113 | #endif 114 | map = mmap(0, filesize, PROT_READ, flags, fd, 0); 115 | if (map == MAP_FAILED) { 116 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 117 | "Error mmapping the file"); 118 | close(fd); 119 | return 0; 120 | } 121 | return map; 122 | } 123 | 124 | /* 125 | * mmap the file descriptor in r/w/ mode. Return the char array 126 | */ 127 | char* map_file_rw(int fd, size_t filesize, int want_lock) 128 | { 129 | char* map; 130 | int flags = MAP_SHARED; 131 | 132 | #ifdef __linux__ 133 | flags |= MAP_POPULATE; 134 | if (want_lock) { 135 | flags |= MAP_LOCKED; 136 | } 137 | #endif 138 | 139 | map = (char *) mmap(0, filesize, PROT_READ | PROT_WRITE, flags, fd, 0); 140 | 141 | if (map == MAP_FAILED) { 142 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 143 | "Error mmapping the file"); 144 | close(fd); 145 | return 0; 146 | } 147 | 148 | return map; 149 | } 150 | 151 | /* 152 | * Don't forget to free the mmapped memory 153 | */ 154 | int unmap_file(char* map, size_t filesize) { 155 | if (munmap(map, filesize) == -1) { 156 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 157 | "Error un-mmapping the file"); 158 | return -1; 159 | } 160 | return 0; 161 | } 162 | 163 | void turn_bits_on(char *map, off_t index, char bitmask) 164 | { 165 | map[index] = map[index] | bitmask; 166 | } 167 | 168 | int flush_to_disk(int fd) 169 | { 170 | int result; 171 | result = fdatasync(fd); 172 | if (result == -1) { 173 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 174 | "Error flushing the file"); 175 | close(fd); 176 | return -1; 177 | } 178 | return 0; 179 | } 180 | 181 | int close_file(int fd) 182 | { 183 | int result; 184 | flush_to_disk(fd); 185 | result = close(fd); 186 | if (result == -1) { 187 | PyErr_SetFromErrnoWithFilename(PyExc_OSError, 188 | "Error closing the file"); 189 | return -1; 190 | } 191 | return 0; 192 | } 193 | 194 | int main(int argc, char *argv[]) 195 | { 196 | int fd; 197 | char *map; /* mmapped array of chars */ 198 | fd = open_mmap_file_rw(FILEPATH, FILESIZE); 199 | 200 | map = map_file_rw(fd, FILESIZE, 0); 201 | 202 | /* Now write int's to the file as if it were memory (an array of 203 | * ints). 
204 | * */ 205 | for (off_t i = 0; i 22 | 23 | #define ROTL32(x,y) _rotl(x,y) 24 | #define ROTL64(x,y) _rotl64(x,y) 25 | 26 | #define BIG_CONSTANT(x) (x) 27 | 28 | // Other compilers 29 | 30 | #else // defined(_MSC_VER) 31 | 32 | #define FORCE_INLINE inline __attribute__((always_inline)) 33 | 34 | inline uint32_t rotl32 ( uint32_t x, int8_t r ) 35 | { 36 | return (x << r) | (x >> (32 - r)); 37 | } 38 | 39 | inline uint64_t rotl64 ( uint64_t x, int8_t r ) 40 | { 41 | return (x << r) | (x >> (64 - r)); 42 | } 43 | 44 | #define ROTL32(x,y) rotl32(x,y) 45 | #define ROTL64(x,y) rotl64(x,y) 46 | 47 | #define BIG_CONSTANT(x) (x##LLU) 48 | 49 | #endif // !defined(_MSC_VER) 50 | 51 | //----------------------------------------------------------------------------- 52 | // Block read - if your platform needs to do endian-swapping or can only 53 | // handle aligned reads, do the conversion here 54 | 55 | FORCE_INLINE uint32_t getblock32 ( const uint32_t * p, int i ) 56 | { 57 | return p[i]; 58 | } 59 | 60 | FORCE_INLINE uint64_t getblock64 ( const uint64_t * p, int i ) 61 | { 62 | return p[i]; 63 | } 64 | 65 | //----------------------------------------------------------------------------- 66 | // Finalization mix - force all bits of a hash block to avalanche 67 | 68 | FORCE_INLINE uint32_t fmix32 ( uint32_t h ) 69 | { 70 | h ^= h >> 16; 71 | h *= 0x85ebca6b; 72 | h ^= h >> 13; 73 | h *= 0xc2b2ae35; 74 | h ^= h >> 16; 75 | 76 | return h; 77 | } 78 | 79 | //---------- 80 | 81 | FORCE_INLINE uint64_t fmix64 ( uint64_t k ) 82 | { 83 | k ^= k >> 33; 84 | k *= BIG_CONSTANT(0xff51afd7ed558ccd); 85 | k ^= k >> 33; 86 | k *= BIG_CONSTANT(0xc4ceb9fe1a85ec53); 87 | k ^= k >> 33; 88 | 89 | return k; 90 | } 91 | 92 | //----------------------------------------------------------------------------- 93 | 94 | void MurmurHash3_x86_32 ( const void * key, int len, 95 | uint32_t seed, void * out ) 96 | { 97 | const uint8_t * data = (const uint8_t*)key; 98 | const int nblocks = len / 4; 99 | 100 | uint32_t h1 = seed; 101 | 102 | const uint32_t c1 = 0xcc9e2d51; 103 | const uint32_t c2 = 0x1b873593; 104 | 105 | //---------- 106 | // body 107 | 108 | const uint32_t * blocks = (const uint32_t *)(data + nblocks*4); 109 | 110 | for(int i = -nblocks; i; i++) 111 | { 112 | uint32_t k1 = getblock32(blocks,i); 113 | 114 | k1 *= c1; 115 | k1 = ROTL32(k1,15); 116 | k1 *= c2; 117 | 118 | h1 ^= k1; 119 | h1 = ROTL32(h1,13); 120 | h1 = h1*5+0xe6546b64; 121 | } 122 | 123 | //---------- 124 | // tail 125 | 126 | const uint8_t * tail = (const uint8_t*)(data + nblocks*4); 127 | 128 | uint32_t k1 = 0; 129 | 130 | switch(len & 3) 131 | { 132 | case 3: k1 ^= tail[2] << 16; 133 | case 2: k1 ^= tail[1] << 8; 134 | case 1: k1 ^= tail[0]; 135 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 136 | }; 137 | 138 | //---------- 139 | // finalization 140 | 141 | h1 ^= len; 142 | 143 | h1 = fmix32(h1); 144 | 145 | *(uint32_t*)out = h1; 146 | } 147 | 148 | //----------------------------------------------------------------------------- 149 | 150 | void MurmurHash3_x86_128 ( const void * key, const int len, 151 | uint32_t seed, void * out ) 152 | { 153 | const uint8_t * data = (const uint8_t*)key; 154 | const int nblocks = len / 16; 155 | 156 | uint32_t h1 = seed; 157 | uint32_t h2 = seed; 158 | uint32_t h3 = seed; 159 | uint32_t h4 = seed; 160 | 161 | const uint32_t c1 = 0x239b961b; 162 | const uint32_t c2 = 0xab0e9789; 163 | const uint32_t c3 = 0x38b34ae5; 164 | const uint32_t c4 = 0xa1e38b93; 165 | 166 | //---------- 167 | // body 168 | 169 | const 
uint32_t * blocks = (const uint32_t *)(data + nblocks*16); 170 | 171 | for(int i = -nblocks; i; i++) 172 | { 173 | uint32_t k1 = getblock32(blocks,i*4+0); 174 | uint32_t k2 = getblock32(blocks,i*4+1); 175 | uint32_t k3 = getblock32(blocks,i*4+2); 176 | uint32_t k4 = getblock32(blocks,i*4+3); 177 | 178 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 179 | 180 | h1 = ROTL32(h1,19); h1 += h2; h1 = h1*5+0x561ccd1b; 181 | 182 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 183 | 184 | h2 = ROTL32(h2,17); h2 += h3; h2 = h2*5+0x0bcaa747; 185 | 186 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 187 | 188 | h3 = ROTL32(h3,15); h3 += h4; h3 = h3*5+0x96cd1c35; 189 | 190 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 191 | 192 | h4 = ROTL32(h4,13); h4 += h1; h4 = h4*5+0x32ac3b17; 193 | } 194 | 195 | //---------- 196 | // tail 197 | 198 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 199 | 200 | uint32_t k1 = 0; 201 | uint32_t k2 = 0; 202 | uint32_t k3 = 0; 203 | uint32_t k4 = 0; 204 | 205 | switch(len & 15) 206 | { 207 | case 15: k4 ^= tail[14] << 16; 208 | case 14: k4 ^= tail[13] << 8; 209 | case 13: k4 ^= tail[12] << 0; 210 | k4 *= c4; k4 = ROTL32(k4,18); k4 *= c1; h4 ^= k4; 211 | 212 | case 12: k3 ^= tail[11] << 24; 213 | case 11: k3 ^= tail[10] << 16; 214 | case 10: k3 ^= tail[ 9] << 8; 215 | case 9: k3 ^= tail[ 8] << 0; 216 | k3 *= c3; k3 = ROTL32(k3,17); k3 *= c4; h3 ^= k3; 217 | 218 | case 8: k2 ^= tail[ 7] << 24; 219 | case 7: k2 ^= tail[ 6] << 16; 220 | case 6: k2 ^= tail[ 5] << 8; 221 | case 5: k2 ^= tail[ 4] << 0; 222 | k2 *= c2; k2 = ROTL32(k2,16); k2 *= c3; h2 ^= k2; 223 | 224 | case 4: k1 ^= tail[ 3] << 24; 225 | case 3: k1 ^= tail[ 2] << 16; 226 | case 2: k1 ^= tail[ 1] << 8; 227 | case 1: k1 ^= tail[ 0] << 0; 228 | k1 *= c1; k1 = ROTL32(k1,15); k1 *= c2; h1 ^= k1; 229 | }; 230 | 231 | //---------- 232 | // finalization 233 | 234 | h1 ^= len; h2 ^= len; h3 ^= len; h4 ^= len; 235 | 236 | h1 += h2; h1 += h3; h1 += h4; 237 | h2 += h1; h3 += h1; h4 += h1; 238 | 239 | h1 = fmix32(h1); 240 | h2 = fmix32(h2); 241 | h3 = fmix32(h3); 242 | h4 = fmix32(h4); 243 | 244 | h1 += h2; h1 += h3; h1 += h4; 245 | h2 += h1; h3 += h1; h4 += h1; 246 | 247 | ((uint32_t*)out)[0] = h1; 248 | ((uint32_t*)out)[1] = h2; 249 | ((uint32_t*)out)[2] = h3; 250 | ((uint32_t*)out)[3] = h4; 251 | } 252 | 253 | //----------------------------------------------------------------------------- 254 | 255 | void MurmurHash3_x64_128 ( const void * key, const int len, 256 | const uint32_t seed, void * out ) 257 | { 258 | const uint8_t * data = (const uint8_t*)key; 259 | const int nblocks = len / 16; 260 | 261 | uint64_t h1 = seed; 262 | uint64_t h2 = seed; 263 | 264 | const uint64_t c1 = BIG_CONSTANT(0x87c37b91114253d5); 265 | const uint64_t c2 = BIG_CONSTANT(0x4cf5ad432745937f); 266 | 267 | //---------- 268 | // body 269 | 270 | const uint64_t * blocks = (const uint64_t *)(data); 271 | 272 | for(int i = 0; i < nblocks; i++) 273 | { 274 | uint64_t k1 = getblock64(blocks,i*2+0); 275 | uint64_t k2 = getblock64(blocks,i*2+1); 276 | 277 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 278 | 279 | h1 = ROTL64(h1,27); h1 += h2; h1 = h1*5+0x52dce729; 280 | 281 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 282 | 283 | h2 = ROTL64(h2,31); h2 += h1; h2 = h2*5+0x38495ab5; 284 | } 285 | 286 | //---------- 287 | // tail 288 | 289 | const uint8_t * tail = (const uint8_t*)(data + nblocks*16); 290 | 291 | uint64_t k1 = 0; 292 | uint64_t k2 = 0; 293 | 294 | switch(len & 15) 295 | { 296 | case 15: k2 ^= 
((uint64_t)tail[14]) << 48; 297 | case 14: k2 ^= ((uint64_t)tail[13]) << 40; 298 | case 13: k2 ^= ((uint64_t)tail[12]) << 32; 299 | case 12: k2 ^= ((uint64_t)tail[11]) << 24; 300 | case 11: k2 ^= ((uint64_t)tail[10]) << 16; 301 | case 10: k2 ^= ((uint64_t)tail[ 9]) << 8; 302 | case 9: k2 ^= ((uint64_t)tail[ 8]) << 0; 303 | k2 *= c2; k2 = ROTL64(k2,33); k2 *= c1; h2 ^= k2; 304 | 305 | case 8: k1 ^= ((uint64_t)tail[ 7]) << 56; 306 | case 7: k1 ^= ((uint64_t)tail[ 6]) << 48; 307 | case 6: k1 ^= ((uint64_t)tail[ 5]) << 40; 308 | case 5: k1 ^= ((uint64_t)tail[ 4]) << 32; 309 | case 4: k1 ^= ((uint64_t)tail[ 3]) << 24; 310 | case 3: k1 ^= ((uint64_t)tail[ 2]) << 16; 311 | case 2: k1 ^= ((uint64_t)tail[ 1]) << 8; 312 | case 1: k1 ^= ((uint64_t)tail[ 0]) << 0; 313 | k1 *= c1; k1 = ROTL64(k1,31); k1 *= c2; h1 ^= k1; 314 | }; 315 | 316 | //---------- 317 | // finalization 318 | 319 | h1 ^= len; h2 ^= len; 320 | 321 | h1 += h2; 322 | h2 += h1; 323 | 324 | h1 = fmix64(h1); 325 | h2 = fmix64(h2); 326 | 327 | h1 += h2; 328 | h2 += h1; 329 | 330 | ((uint64_t*)out)[0] = h1; 331 | ((uint64_t*)out)[1] = h2; 332 | } 333 | 334 | //----------------------------------------------------------------------------- 335 | -------------------------------------------------------------------------------- /src/_hydra.pyx: -------------------------------------------------------------------------------- 1 | cimport cython 2 | import operator 3 | import os 4 | import sys 5 | import tempfile 6 | 7 | cdef extern from "ctype.h" nogil: 8 | cdef int tolower (int c) 9 | 10 | cdef extern from "stdlib.h" nogil: 11 | long long int llabs(long long int j) 12 | 13 | 14 | cdef extern from "stdio.h" nogil: 15 | ctypedef struct FILE 16 | FILE *fopen(char *path, char *mode) 17 | int fclose(FILE *strea) 18 | cdef char* fgets (char *buffer, int fd, FILE *stream) 19 | 20 | cdef extern from "mmap_writer.h" nogil: 21 | cdef char* map_file_ro(int fd, size_t filesize, int want_lock) except NULL 22 | cdef char* map_file_rw(int fd, size_t filesize, int want_lock) except NULL 23 | cdef int open_mmap_file_ro(char* filepath) except -1 24 | cdef int open_mmap_file_rw(char* filename, size_t bytesize) except -1 25 | cdef void bulkload_file(char* buffer, char* filename) 26 | cdef int close_file(int fd) except -1 27 | cdef int flush_to_disk(int fd) except -1 28 | cdef void turn_bits_on(char *map, size_t index, char bitmask) 29 | cdef int unmap_file(char* map, size_t filesize) except -1 30 | 31 | cdef extern from "MurmurHash3.h" nogil: 32 | void MurmurHash3_x64_128 (void * key, int len, unsigned int seed, void * out) 33 | 34 | def hash(key, int seed=0): 35 | """ This function hashes a string using the Murmur3 hash algorithm""" 36 | cdef long result[2] 37 | if isinstance(key, unicode): 38 | key = key.encode('utf8') 39 | MurmurHash3_x64_128(key, len(key), seed, result) 40 | return long(result[0]) << 64 | (long(result[1]) & 0xFFFFFFFFFFFFFFFF) 41 | 42 | cdef class MMapBitField: 43 | cdef char* _filename 44 | cdef int _fd 45 | cdef long _bitsize 46 | cdef long _bytesize 47 | cdef char* _buffer 48 | cdef int _read_only 49 | cdef int _fdatasync_on_close 50 | 51 | def __cinit__(self, filename, long bitsize, int read_only, int want_lock=False, int fdatasync_on_close=True): 52 | if isinstance(filename, unicode): 53 | filename = filename.encode('utf8') 54 | self._filename = filename 55 | self._bitsize = bitsize 56 | self._bytesize = (bitsize / 8) + 2 57 | self._read_only = read_only 58 | self._fdatasync_on_close = fdatasync_on_close 59 | 60 | # Now setup the file and 
mmap 61 | if read_only: 62 | self.open_ro_buffer(want_lock) 63 | else: 64 | self.open_rw_buffer(want_lock) 65 | 66 | cdef void open_rw_buffer(self, want_lock=False): 67 | self._fd = open_mmap_file_rw(self._filename, self._bytesize) 68 | self._buffer = map_file_rw(self._fd, self._bytesize, want_lock) 69 | 70 | cdef void open_ro_buffer(self, want_lock=False): 71 | self._fd = open_mmap_file_ro(self._filename) 72 | self._buffer = map_file_ro(self._fd, self._bytesize, want_lock) 73 | 74 | def __dealloc__(self): 75 | self.close() 76 | 77 | cpdef close(self): 78 | if self._fd >= 0 and self._buffer: 79 | if not self._read_only and self._fdatasync_on_close: 80 | flush_to_disk(self._fd) 81 | unmap_file(self._buffer, self._bytesize) 82 | close_file(self._fd) 83 | self._fd = -1 84 | self._buffer = NULL 85 | 86 | cpdef fdatasync(self): 87 | """ Flush everything to disk """ 88 | if self._fd < 0 or not self._buffer: 89 | raise ValueError('I/O operation on closed file') 90 | 91 | if self._read_only: 92 | raise ValueError('bit field is read only') 93 | 94 | flush_to_disk(self._fd) 95 | 96 | def __setitem__(self, size_t key, int value): 97 | cdef size_t byte_offset = key / 8 98 | cdef char bitmask 99 | cdef char bitval 100 | 101 | if self._fd < 0 or not self._buffer: 102 | raise ValueError('I/O operation on closed file') 103 | 104 | if self._read_only: 105 | raise ValueError('bit field is read only') 106 | 107 | bitmask = 2 ** (key % 8) 108 | if value: 109 | bitval = self._buffer[byte_offset] | bitmask 110 | else: 111 | bitval = self._buffer[byte_offset] & ~bitmask 112 | if bitval != self._buffer[byte_offset]: 113 | self._buffer[byte_offset] = bitval 114 | 115 | def __getitem__(self, size_t key): 116 | cdef size_t byte_offset = key / 8 117 | 118 | if self._fd < 0 or not self._buffer: 119 | raise ValueError('I/O operation on closed file') 120 | 121 | cdef char old_bitmask = self._buffer[byte_offset] 122 | return (old_bitmask & (2 ** (key % 8))) 123 | 124 | def __iter__(self): 125 | if self._fd < 0 or not self._buffer: 126 | raise ValueError('I/O operation on closed file') 127 | 128 | return MMapIter(self) 129 | 130 | def __len__(self): 131 | return self.size() 132 | 133 | cpdef size(self): 134 | if self._fd < 0 or not self._buffer: 135 | raise ValueError('I/O operation on closed file') 136 | 137 | return self._bitsize 138 | 139 | 140 | cdef class MMapIter: 141 | cdef size_t _idx 142 | cdef MMapBitField _bitfield 143 | def __cinit__(self, bitfield): 144 | self._bitfield = bitfield 145 | self._idx = 0 146 | 147 | def __next__(self): 148 | cdef int result 149 | if self._idx < len(self._bitfield): 150 | result = self._bitfield[self._idx] 151 | self._idx +=1 152 | return result 153 | raise StopIteration 154 | 155 | 156 | class UnsupportedOperationException(Exception): pass 157 | 158 | class BloomSpecification: 159 | """ 160 | A wrapper class that holds two key parameters for a Bloom Filter: the 161 | number of hash functions used, and the number of buckets per element used. 162 | """ 163 | 164 | def __init__(self, k, bucketsPerElement): 165 | self.K = k 166 | self.bucketsPerElement = bucketsPerElement 167 | 168 | def __eq__(self, other): 169 | c1 = getattr(other, 'K', None) == self.K 170 | c2 = getattr(other, 'bucketsPerElement', None) == self.bucketsPerElement 171 | return c1 and c2 172 | 173 | cdef class BloomCalculations: 174 | """ 175 | This calculation class is ported straight from Cassandra. 
176 | """ 177 | minBuckets = 2 178 | minK = 1 179 | 180 | PROBS = [ 181 | [1.0], # dummy row representing 0 buckets per element 182 | [1.0, 1.0], # dummy row representing 1 buckets per element 183 | [1.0, 0.393, 0.400], 184 | [1.0, 0.283, 0.237, 0.253], 185 | [1.0, 0.221, 0.155, 0.147, 0.160], 186 | [1.0, 0.181, 0.109, 0.092, 0.092, 0.101], # 5 187 | [1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638], 188 | [1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364], 189 | [1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229], 190 | [1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145], 191 | [1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846], # 10 192 | [1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509], 193 | [1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314], 194 | [1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217, 0.00199, 0.00194], 195 | [1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146, 0.00129, 0.00121, 0.0012], 196 | [1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852, 0.000775, 0.000744], # 15 197 | [1.0, 0.0606, 0.0138, 0.005, 0.00239, 0.00139, 0.000935, 0.000702, 0.000574, 0.000505, 0.00047, 0.000459], 198 | [1.0, 0.0571, 0.0123, 0.00423, 0.00193, 0.00107, 0.000692, 0.000499, 0.000394, 0.000335, 0.000302, 0.000287, 0.000284], 199 | [1.0, 0.054, 0.0111, 0.00362, 0.00158, 0.000839, 0.000519, 0.00036, 0.000275, 0.000226, 0.000198, 0.000183, 0.000176], 200 | [1.0, 0.0513, 0.00998, 0.00312, 0.0013, 0.000663, 0.000394, 0.000264, 0.000194, 0.000155, 0.000132, 0.000118, 0.000111, 0.000109], 201 | [1.0, 0.0488, 0.00906, 0.0027, 0.00108, 0.00053, 0.000303, 0.000196, 0.00014, 0.000108, 8.89e-05, 7.77e-05, 7.12e-05, 6.79e-05, 6.71e-05] # 20 202 | ] 203 | 204 | optKPerBuckets = [max(1, min(enumerate(probs), key=operator.itemgetter(1))[0]) for probs in PROBS] 205 | 206 | @classmethod 207 | def computeBloomSpec1(cls, bucketsPerElement): 208 | """ 209 | Given the number of buckets that can be used per element, return a 210 | specification that minimizes the false positive rate. 211 | 212 | @param bucketsPerElement The number of buckets per element for the filter. 213 | @return A spec that minimizes the false positive rate. 214 | """ 215 | assert bucketsPerElement >= 1 216 | assert bucketsPerElement <= len(BloomCalculations.PROBS) - 1 217 | return BloomSpecification(cls.optKPerBuckets[bucketsPerElement], bucketsPerElement) 218 | 219 | 220 | @classmethod 221 | def computeBloomSpec2(cls, maxBucketsPerElement, maxFalsePosProb): 222 | """ 223 | Given a maximum tolerable false positive probability, compute a Bloom 224 | specification which will give less than the specified false positive rate, 225 | but minimize the number of buckets per element and the number of hash 226 | functions used. Because bandwidth (and therefore total bitvector size) 227 | is considered more expensive than computing power, preference is given 228 | to minimizing buckets per element rather than number of hash functions. 229 | 230 | @param maxBucketsPerElement The maximum number of buckets available for the filter. 231 | @param maxFalsePosProb The maximum tolerable false positive rate. 
232 | @return A Bloom Specification which would result in a false positive rate 233 | less than specified by the function call 234 | @throws UnsupportedOperationException if a filter satisfying the parameters cannot be met 235 | """ 236 | assert maxBucketsPerElement >= 1 237 | assert maxBucketsPerElement <= len(BloomCalculations.PROBS) - 1 238 | maxK = len(BloomCalculations.PROBS[maxBucketsPerElement]) - 1 239 | 240 | # Handle the trivial cases 241 | if maxFalsePosProb >= BloomCalculations.PROBS[cls.minBuckets][cls.minK]: 242 | return BloomSpecification(2, cls.optKPerBuckets[2]) 243 | 244 | if maxFalsePosProb < BloomCalculations.PROBS[maxBucketsPerElement][maxK]: 245 | msg = "Unable to satisfy %s with %s buckets per element" 246 | raise UnsupportedOperationException(msg % (maxFalsePosProb, maxBucketsPerElement)) 247 | 248 | # First find the minimal required number of buckets: 249 | bucketsPerElement = 2 250 | K = cls.optKPerBuckets[2] 251 | while(BloomCalculations.PROBS[bucketsPerElement][K] > maxFalsePosProb): 252 | bucketsPerElement += 1 253 | K = cls.optKPerBuckets[bucketsPerElement] 254 | # Now that the number of buckets is sufficient, see if we can relax K 255 | # without losing too much precision. 256 | while BloomCalculations.PROBS[bucketsPerElement][K - 1] <= maxFalsePosProb: 257 | K -= 1 258 | 259 | return BloomSpecification(K, bucketsPerElement) 260 | 261 | cdef class BloomFilter: 262 | EXCESS = 20 263 | cdef unsigned int _hashCount 264 | cdef MMapBitField _bitmap 265 | cdef int _ignore_case 266 | cdef object _tempfile 267 | 268 | def __cinit__(self, unsigned int hashes, MMapBitField bitmap, int ignore_case): 269 | cdef int i 270 | 271 | self._hashCount = hashes 272 | self._bitmap = bitmap 273 | self._ignore_case = ignore_case 274 | 275 | def __enter__(self): 276 | return self 277 | 278 | def __exit__(self, *excinfo): 279 | self.close() 280 | return None 281 | 282 | cpdef close(self): 283 | self._bitmap.close() 284 | 285 | cpdef fdatasync(self): 286 | """ Flush everything to disk """ 287 | self._bitmap.fdatasync() 288 | 289 | def filename(self): 290 | """ 291 | Filename of the MMAP file 292 | """ 293 | return self._bitmap._filename 294 | 295 | @classmethod 296 | def _maxBucketsPerElement(cls, numElements): 297 | numElements = max(1, numElements) 298 | v = (sys.maxsize - cls.EXCESS) / float(numElements) 299 | if v < 1.0: 300 | msg = "Cannot compute probabilities for %s elements." 301 | raise UnsupportedOperationException(msg % numElements) 302 | return min(len(BloomCalculations.PROBS) - 1, int(v)) 303 | 304 | @classmethod 305 | def _bucketsFor(cls, numElements, bucketsPer, filename, read_only, want_lock=False, fdatasync_on_close=True): 306 | numBits = numElements * bucketsPer + cls.EXCESS 307 | bf_size = min(sys.maxsize, numBits) 308 | return MMapBitField(filename, bf_size, read_only, 309 | want_lock=want_lock, 310 | fdatasync_on_close=fdatasync_on_close) 311 | 312 | @classmethod 313 | def getFilter(cls, numElements, maxFalsePosProbability, **kwargs): 314 | """ 315 | Create a bloom filter. 316 | 317 | numElements and maxFalsePosProbability are taken to form a 318 | specification for the Bloom Filter. The filter is designed 319 | to hold a maximum of numElements entries and will have an 320 | upper bound false positive error rate of 321 | maxFalsePosProbability. 322 | 323 | Optional **kwargs: 324 | 325 | filename: The filepath of the mmap I/O file. If set to None, a file 326 | will be created in temporary storage. 
Default: None 327 | 328 | ignore_case: All strings will be forced into lower case for 329 | both add and search functions. Default: False 330 | 331 | read_only: The file will be opened in read-only mode and the 332 | memory map will be set up read-only as well. Default: False 333 | 334 | """ 335 | filename = kwargs.get('filename', None) 336 | ignore_case = kwargs.get('ignore_case', 0) 337 | read_only = kwargs.get('read_only', 0) 338 | want_lock = kwargs.get('want_lock', False) 339 | fdatasync_on_close = kwargs.get('fdatasync_on_close', True) 340 | 341 | for k in ['filename', 'ignore_case', 'read_only', 'want_lock', 'fdatasync_on_close']: 342 | if k in kwargs: 343 | del kwargs[k] 344 | if kwargs: 345 | raise RuntimeError("Unexpected kwargs: %s" % str(kwargs)) 346 | 347 | fileobj = None if filename else tempfile.NamedTemporaryFile(delete=True) 348 | if fileobj is not None: 349 | fileobj.file.close() 350 | filename = fileobj.name 351 | 352 | assert 0 < maxFalsePosProbability <= 1.0, "Invalid probability" 353 | bucketsPerElement = cls._maxBucketsPerElement(numElements) 354 | spec = BloomCalculations.computeBloomSpec2(bucketsPerElement, maxFalsePosProbability) 355 | bitmap = cls._bucketsFor(numElements, spec.bucketsPerElement, filename, read_only, want_lock=want_lock, fdatasync_on_close=fdatasync_on_close) 356 | bf = BloomFilter(spec.K, bitmap, ignore_case) 357 | if fileobj is not None: 358 | bf._tempfile = fileobj 359 | return bf 360 | 361 | def __setitem__(self, key, int ignored): 362 | self.add(key) 363 | 364 | def __getitem__(self, key): 365 | return int(self.contains(key)) 366 | 367 | def __contains__(self, ustring): 368 | return self.contains(ustring) 369 | 370 | @cython.boundscheck(False) 371 | cpdef add(self, ustring): 372 | """ Add a key into the filter. Just like a set. """ 373 | cdef unsigned long long i 374 | cdef unsigned long long _bucket_indexes[1000] 375 | 376 | if isinstance(ustring, unicode): 377 | key = ustring.encode('utf8') 378 | else: 379 | key = ustring 380 | 381 | if self._ignore_case: 382 | c_lcase(key) 383 | 384 | self._get_hash_buckets(key, _bucket_indexes, self._hashCount, self.buckets()) 385 | for i in range(self._hashCount): 386 | self._bitmap[_bucket_indexes[i]] = 1 387 | 388 | @cython.boundscheck(False) 389 | cpdef contains(self, ustring): 390 | """ Check if a key is in the bloom filter. May return a false positive. """ 391 | cdef unsigned long long _bucket_indexes[1000] 392 | cdef unsigned long long i 393 | 394 | if isinstance(ustring, unicode): 395 | key = ustring.encode('utf8') 396 | else: 397 | key = ustring 398 | 399 | if self._ignore_case: 400 | c_lcase(key) 401 | self._get_hash_buckets(key, _bucket_indexes, self._hashCount, self.buckets()) 402 | for i in range(self._hashCount): 403 | if not self._bitmap[_bucket_indexes[i]]: 404 | return False 405 | return True 406 | 407 | cpdef buckets(self): 408 | """ Return the number of total buckets (bits) in the bloom filter """ 409 | return self._bitmap.size() 410 | 411 | def getHashBuckets(self, ustring, unsigned int hashCount, unsigned long long max): 412 | """ This method is just available for test purposes. Not actually useful for normal users. 
""" 413 | cdef unsigned long long _bucket_indexes[1000] 414 | 415 | if isinstance(ustring, unicode): 416 | key = ustring.encode('utf8') 417 | else: 418 | key = ustring 419 | 420 | self._get_hash_buckets(key, _bucket_indexes, hashCount, max) 421 | result = [] 422 | for i in range(hashCount): 423 | result.append(_bucket_indexes[i]) 424 | return result 425 | 426 | @cython.boundscheck(False) 427 | cdef void _get_hash_buckets(self, bytes key, unsigned long long * _bucket_indexes, unsigned int hashCount, unsigned long max): 428 | """ 429 | Murmur is faster than an SHA-based approach and provides as-good collision 430 | resistance. The combinatorial generation approach described in 431 | https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf 432 | does prove to work in actual tests, and is obviously faster 433 | than performing further iterations of murmur. 434 | """ 435 | cdef unsigned long result[2] 436 | cdef unsigned long hash1, hash2 437 | cdef unsigned long i 438 | 439 | MurmurHash3_x64_128(key, len(key), 0, result) 440 | hash1 = result[0] 441 | MurmurHash3_x64_128(key, len(key), result[1] & 0xFFFFFFFF, result) 442 | hash2 = result[0] 443 | 444 | for i in range(hashCount): 445 | _bucket_indexes[i] = llabs((hash1 + i * hash2) % max) 446 | 447 | cdef void _strip_newline(self, char *buffer, unsigned int size): 448 | """ 449 | Strip newline by overwriting with a null 450 | """ 451 | cdef unsigned int i 452 | for i in range(size): 453 | if buffer[i] == '\n': 454 | buffer[i] = '\x00' 455 | return 456 | 457 | def bulkload_text(self, char* filename): 458 | cdef FILE* file_in = fopen( filename, "r") 459 | cdef char line[128] 460 | if file_in: 461 | while fgets(line, 128, file_in): 462 | self._strip_newline(line, len(line)) 463 | self.add(line) 464 | # Yeah, i should check for errors. sosumi. 465 | fclose(file_in) 466 | 467 | cdef void c_lcase(char* buffer): 468 | """ 469 | Force string to lower case 470 | """ 471 | cdef unsigned int i 472 | for i in range(len(buffer)): 473 | buffer[i] = tolower(buffer[i]) 474 | --------------------------------------------------------------------------------