├── .gitignore ├── README.rst ├── __init__.py ├── benchmark.py ├── bloomfilter.py └── tests.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ================= 2 | bloomfilter-redis 3 | ================= 4 | 5 | Standard & time series bloom filters, backed by Redis bit vectors. 6 | 7 | This implementation is Python-only. If you're looking for (way) more speed, check out a C-based extension that uses hiredis at https://github.com/seomoz/pyreBloom 8 | 9 | Overview 10 | ======== 11 | 12 | This is the little bloom filter we're using to filter unique views using redis. 13 | 14 | It doesn't do anything special, but I didn't find any small and dependency-free bloom 15 | filter written in Python that use Redis as their backend. 16 | 17 | Time Series 18 | ======== 19 | If you're tracking users over time, and you want to answer the question "have we seen 20 | this guy in the past 2 minutes", this is exactly right for you. For high-throughput 21 | applications this is very space-effective. The total memory footprint is known before- 22 | hand, and is based on the amount of history you want to save and the resolution. 23 | 24 | You might track users in the past 2 minutes with a 10-second resolution using 12 bloom 25 | filters. User hits are logged into the most recent bloom filter, and checking if you have 26 | seen a user in the past 2 minutes will just go back through those 12 filters. 27 | 28 | The finest resolutions possible are around 1ms. If you're pushing it to this limit you'll 29 | have to take care of a bunch of things: Storing to and retrieving from Redis takes some 30 | time. Timestamps aren't all that exact, especially when running on a virtual machine. If 31 | you're using multiple machines, their clocks have to be perfectly in sync. 32 | 33 | Quick Benchmarks 34 | ================ 35 | 36 | Quick benchmark for ballpark figures on a MacbookPro (2x 2.66GHz) with Python 2.7, 37 | hiredis and Redis 2.9 (unstable). Each benchmark was run with k=4 hashes per key. Keys 38 | are random strings of 10 chars length: 39 | 40 | Big filter with fewer values: 41 | filling bloom filter of 1024.00kB size with 10k values 42 | adding 10000 values took 2.09s (4790 values/sec, 208.73 us/value) 43 | correct: 100000 / false: 0 -> 0% false positives 44 | 45 | Small filter with a lot of values: 46 | filling bloom filter of 500.00kB size with 100k values 47 | adding 100000 values took 22.30s (4485 values/sec, 222.96 us/value) 48 | correct: 100000 / false: 3 -> 0.003% false positives 49 | 50 | 4 parallel Python processes: 51 | filling bloom filter of 1024.00kB size with 2M values 52 | adding 2000000 values took 214.69s (9316 values/sec, 429.38 us/value) -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | from bloomfilter import BloomFilter, TimeSeriesBloomFilter -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | import random 2 | import redis 3 | import sys 4 | 5 | from datetime import datetime 6 | 7 | from bloomfilter import BloomFilter, TimeSeriesBloomFilter 8 | 9 | connection = redis.Redis() 10 | 11 | filter_size = 8 * 500 * 1024 12 | test_amount = 1000 * 10 13 | 14 | TimeSeriesBloomFilter 15 | f = TimeSeriesBloomFilter(connection=connection, bitvector_key='test_bloomfilter', n=filter_size, k=4) 16 | # f = BloomFilter(connection=connection, bitvector_key='test_bloomfilter', n=filter_size, k=4) 17 | 18 | print "filling bloom filter of %.2fkB size with %.1fk values" % \ 19 | (filter_size/1024.0/8, test_amount/1000.0) 20 | 21 | # create a reference dict so that we can check for false positives 22 | ref = {} 23 | for x in xrange(test_amount): 24 | ref['%.8f' % random.random()] = True 25 | 26 | # add values to filter 27 | start = datetime.now() 28 | for val in ref.iterkeys(): 29 | f.add(val) 30 | 31 | # calculate results 32 | duration = datetime.now()-start 33 | duration = duration.seconds+duration.microseconds/1000000.0 34 | per_second = test_amount/duration 35 | print "adding %i values took %.2fs (%i values/sec, %.2f us/value)" % \ 36 | (test_amount, duration, per_second, 1000000.0/per_second) 37 | 38 | # try random values and see how many false positives we'll get 39 | false_positives = 0 40 | correct_responses = 0 41 | start = datetime.now() 42 | while correct_responses < test_amount: 43 | val = '%.8f' % random.random() 44 | if (val in f) and (val not in ref): 45 | false_positives += 1 46 | else: 47 | correct_responses += 1 48 | duration = datetime.now()-start 49 | print "duration:",duration 50 | print "correct: %s / false: %s -> %.4f%% false positives" % \ 51 | (correct_responses, false_positives, 100*false_positives/float(correct_responses)) 52 | 53 | # remove the key in redis 54 | connection.delete('test_bloomfilter') -------------------------------------------------------------------------------- /bloomfilter.py: -------------------------------------------------------------------------------- 1 | import math 2 | import time 3 | 4 | from datetime import datetime, timedelta 5 | 6 | # For hash functions see http://www.partow.net/programming/hashfunctions/index.html 7 | # Author Arash Partow, CPL http://www.opensource.org/licenses/cpl1.0.php 8 | def FNVHash(key): 9 | fnv_prime = 0x811C9DC5 10 | hash = 0 11 | for i in range(len(key)): 12 | hash *= fnv_prime 13 | hash ^= ord(key[i]) 14 | return hash 15 | 16 | def APHash(key): 17 | hash = 0xAAAAAAAA 18 | for i in range(len(key)): 19 | if ((i & 1) == 0): 20 | hash ^= ((hash << 7) ^ ord(key[i]) * (hash >> 3)) 21 | else: 22 | hash ^= (~((hash << 11) + ord(key[i]) ^ (hash >> 5))) 23 | return hash 24 | 25 | class TimeSeriesBloomFilter(object): 26 | # todo: make it more clear how all this works 27 | # todo: create a helper function that calculates the total amount of memory stored 28 | 29 | def __init__(self, connection, bitvector_key, n, k, **kwargs): 30 | self.time_resolution = kwargs.get('time_resolution', timedelta(minutes=1)) 31 | self.time_limit = kwargs.get('time_limit', timedelta(minutes=10)) 32 | self.time_limit_seconds = self.time_limit.days*86400 + self.time_limit.seconds 33 | self.connection = connection 34 | self.bitvector_key = bitvector_key 35 | self.n = n 36 | self.k = k 37 | 38 | def most_current_filters(self, within, now): 39 | resolution_microseconds = (self.time_resolution.days*86400 + self.time_resolution.seconds)*1e6 + \ 40 | self.time_resolution.microseconds 41 | 42 | within_microseconds = (within.days*86400 + within.seconds)*1e6 + within.microseconds 43 | 44 | # how many bloom filters will we need to iterate for this? 45 | num_filters = int(math.ceil(within_microseconds / resolution_microseconds)) 46 | 47 | # figure out what the passed timestamp really is 48 | current_microtimestamp = time.mktime(now.timetuple())*1e6 + now.microsecond 49 | 50 | # get a datetime object of the 'current' filter 51 | block = resolution_microseconds * math.floor(current_microtimestamp / resolution_microseconds) 52 | block_now = datetime.fromtimestamp(block/1e6) 53 | 54 | for x in xrange(num_filters): 55 | filter_date = block_now - x * self.time_resolution 56 | filter_bitvector_key = '%s|%s' % (self.bitvector_key, filter_date.isoformat()) 57 | yield BloomFilter(self.connection, filter_bitvector_key, self.n, self.k) 58 | 59 | def add(self, key, **kwargs): 60 | within = kwargs.get('within', self.time_resolution) 61 | now = kwargs.get('now', datetime.now()) 62 | 63 | # add to the current bloom filter 64 | for bloom_filter in self.most_current_filters(within=within, now=now): 65 | # we'll expire the bloom filter we're setting to after 'limit' + 1 seconds 66 | bloom_filter.add(key, timeout=self.time_limit_seconds+1) 67 | 68 | def delete(self, key, **kwargs): 69 | within = kwargs.get('within', self.time_limit) 70 | now = kwargs.get('now', datetime.now()) 71 | 72 | # delete from the time series bloomfilters 73 | for bloom_filter in self.most_current_filters(within=within, now=now): 74 | # in case of creating new filter when deleting, so check first 75 | if key in bloom_filter: 76 | bloom_filter.delete(key) 77 | 78 | def __contains__(self, key, **kwargs): 79 | # checks if this time series bloom filter has 80 | # contained an element within the last x minutes 81 | within = kwargs.get('within', self.time_limit) 82 | now = kwargs.get('now', datetime.now()) 83 | 84 | for i,bloom_filter in enumerate(self.most_current_filters(within=within, now=now)): 85 | if key in bloom_filter: 86 | return True 87 | else: 88 | return False 89 | 90 | # lookup support for the 'within' parameter that we can't express in the magic __contains__ 91 | contains = __contains__ 92 | 93 | class BloomFilter(object): 94 | def __init__(self, connection, bitvector_key, n, k): 95 | # create a bloom filter based on a redis connection, a bitvector_key (name) for it 96 | # and the settings n & k, which dictate how effective it will be 97 | # - n is the amount of bits it will use, I have had success with 85001024 (500kiB) 98 | # for 100k values. If you have fewer, you can get away with using fewer bits. 99 | # in general, the more bits, the fewer false positives 100 | # - k is the number of hash derivations it uses, too many will fill up the filter 101 | # too quickly, not enough will lead to many false positives 102 | 103 | self.connection = connection 104 | self.bitvector_key = bitvector_key 105 | self.n = n 106 | self.k = k 107 | 108 | def __contains__(self, key): 109 | pipeline = self.connection.pipeline() 110 | for hashed_offset in self.calculate_offsets(key): 111 | pipeline.getbit(self.bitvector_key, hashed_offset) 112 | results = pipeline.execute() 113 | return all(results) 114 | 115 | def add(self, key, set_value=1, transaction=False, timeout=None): 116 | # set bits for every hash to 1 117 | # sometimes we can use pipelines here instead of MULTI, 118 | # which makes it a bit faster 119 | pipeline = self.connection.pipeline(transaction=transaction) 120 | for hashed_offset in self.calculate_offsets(key): 121 | pipeline.setbit(self.bitvector_key, hashed_offset, set_value) 122 | 123 | if timeout is not None: 124 | pipeline.expire(self.bitvector_key, timeout) 125 | 126 | pipeline.execute() 127 | 128 | def delete(self, key): 129 | # delete is just an add with value 0 130 | # make sure the pipeline gets wrapped in MULTI/EXEC, so 131 | # that a deleted element is either fully deleted or not 132 | # at all, in case someone is checking __contains__ while 133 | # an element is being deleted 134 | self.add(key, set_value=0, transaction=True) 135 | 136 | def calculate_offsets(self, key): 137 | # we're using only two hash functions with different settings, as described 138 | # by Kirsch & Mitzenmacher: https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf 139 | hash_1 = FNVHash(key) 140 | hash_2 = APHash(key) 141 | 142 | for i in range(self.k): 143 | yield (hash_1 + i * hash_2) % self.n 144 | -------------------------------------------------------------------------------- /tests.py: -------------------------------------------------------------------------------- 1 | import random 2 | import redis 3 | import sys 4 | import unittest 5 | 6 | from datetime import datetime,timedelta 7 | 8 | from bloomfilter import BloomFilter, TimeSeriesBloomFilter 9 | 10 | class SimpleTestCase(unittest.TestCase): 11 | def setUp(self): 12 | self.connection = redis.Redis() 13 | 14 | self.single = BloomFilter(connection=self.connection, 15 | bitvector_key='test_bloomfilter', 16 | n=1024, 17 | k=4) 18 | 19 | self.timeseries = TimeSeriesBloomFilter(connection=self.connection, 20 | bitvector_key='test_timed_bloomfilter', 21 | n=1024*8, 22 | k=4, 23 | time_resolution=timedelta(microseconds=1000), 24 | time_limit=timedelta(microseconds=10000)) 25 | 26 | def tearDown(self): 27 | # remove the key in redis 28 | self.connection.delete('test_bloomfilter') 29 | 30 | class SimpleTest(SimpleTestCase): 31 | def test_add(self): 32 | f = self.single 33 | 34 | f.add('three') 35 | f.add('four') 36 | f.add('five') 37 | f.add('six') 38 | f.add('seven') 39 | f.add('eight') 40 | f.add('nine') 41 | f.add("ten") 42 | 43 | # test membership operations 44 | assert 'ten' in f 45 | assert 'five' in f 46 | assert 'two' not in f 47 | assert 'eleven' not in f 48 | 49 | def test_delete(self): 50 | f = self.single 51 | 52 | f.add('ten') 53 | assert 'ten' in f 54 | 55 | f.delete('ten') 56 | assert 'ten' not in f 57 | 58 | 59 | def test_timeseries_add(self): 60 | f = self.timeseries 61 | 62 | assert 'test_value' not in f 63 | f.add('test_value') 64 | assert 'test_value' in f 65 | 66 | def test_timeseries_delay(self): 67 | f = self.timeseries 68 | 69 | f.add('test_value') 70 | start = datetime.now() 71 | # allow for 3ms delay in storing/timer resolution 72 | delay = timedelta(microseconds=3000) 73 | 74 | # make sure that the filter doesn't say that test_value is in the filter for too long 75 | while 'test_value' in f: 76 | assert datetime.now() < (start+timedelta(microseconds=10000)+delay) 77 | assert 'test_value' not in f 78 | 79 | if __name__ == '__main__': 80 | unittest.main() 81 | --------------------------------------------------------------------------------