├── .gitignore
├── README.rst
├── __init__.py
├── benchmark.py
├── bloomfilter.py
└── tests.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | =================
 2 | bloomfilter-redis
 3 | =================
 4 | 
 5 | Standard & time series bloom filters, backed by Redis bit vectors.
 6 | 
 7 | This implementation is Python-only. If you're looking for (way) more speed, check out a C-based extension that uses hiredis at https://github.com/seomoz/pyreBloom
 8 | 
 9 | Overview
10 | ========
11 | 
12 | This is the little bloom filter we're using to filter unique views using redis.
13 | 
14 | It doesn't do anything special, but I didn't find any small and dependency-free bloom
15 | filter written in Python that use Redis as their backend.
16 | 
17 | Time Series
18 | ========
19 | If you're tracking users over time, and you want to answer the question "have we seen
20 | this guy in the past 2 minutes", this is exactly right for you. For high-throughput
21 | applications this is very space-effective. The total memory footprint is known before-
22 | hand, and is based on the amount of history you want to save and the resolution.
23 | 
24 | You might track users in the past 2 minutes with a 10-second resolution using 12 bloom
25 | filters. User hits are logged into the most recent bloom filter, and checking if you have
26 | seen a user in the past 2 minutes will just go back through those 12 filters.
27 | 
28 | The finest resolutions possible are around 1ms. If you're pushing it to this limit you'll
29 | have to take care of a bunch of things: Storing to and retrieving from Redis takes some
30 | time. Timestamps aren't all that exact, especially when running on a virtual machine. If
31 | you're using multiple machines, their clocks have to be perfectly in sync.
32 | 
33 | Quick Benchmarks
34 | ================
35 | 
36 | Quick benchmark for ballpark figures on a MacbookPro (2x 2.66GHz) with Python 2.7,
37 | hiredis and Redis 2.9 (unstable). Each benchmark was run with k=4 hashes per key. Keys
38 | are random strings of 10 chars length:
39 | 
40 | Big filter with fewer values:
41 | filling bloom filter of 1024.00kB size with 10k values
42 | adding 10000 values took 2.09s (4790 values/sec, 208.73 us/value)
43 | correct: 100000 / false: 0 -> 0% false positives
44 | 
45 | Small filter with a lot of values:
46 | filling bloom filter of 500.00kB size with 100k values
47 | adding 100000 values took 22.30s (4485 values/sec, 222.96 us/value)
48 | correct: 100000 / false: 3 -> 0.003% false positives
49 | 
50 | 4 parallel Python processes:
51 | filling bloom filter of 1024.00kB size with 2M values
52 | adding 2000000 values took 214.69s (9316 values/sec, 429.38 us/value)


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | from bloomfilter import BloomFilter, TimeSeriesBloomFilter


--------------------------------------------------------------------------------
/benchmark.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import redis
 3 | import sys
 4 | 
 5 | from datetime import datetime
 6 | 
 7 | from bloomfilter import BloomFilter, TimeSeriesBloomFilter
 8 | 
 9 | connection = redis.Redis()
10 | 
11 | filter_size = 8 * 500 * 1024
12 | test_amount = 1000 * 10
13 | 
14 | TimeSeriesBloomFilter
15 | f = TimeSeriesBloomFilter(connection=connection, bitvector_key='test_bloomfilter', n=filter_size, k=4)
16 | # f = BloomFilter(connection=connection, bitvector_key='test_bloomfilter', n=filter_size, k=4)
17 | 
18 | print "filling bloom filter of %.2fkB size with %.1fk values" % \
19 |     (filter_size/1024.0/8, test_amount/1000.0)
20 | 
21 | # create a reference dict so that we can check for false positives
22 | ref = {}
23 | for x in xrange(test_amount):
24 |     ref['%.8f' % random.random()] = True
25 | 
26 | # add values to filter
27 | start = datetime.now()
28 | for val in ref.iterkeys():
29 |     f.add(val)
30 | 
31 | # calculate results
32 | duration = datetime.now()-start
33 | duration = duration.seconds+duration.microseconds/1000000.0
34 | per_second = test_amount/duration
35 | print "adding %i values took %.2fs (%i values/sec, %.2f us/value)" % \
36 |     (test_amount, duration, per_second, 1000000.0/per_second)
37 | 
38 | # try random values and see how many false positives we'll get
39 | false_positives = 0
40 | correct_responses = 0
41 | start = datetime.now()
42 | while correct_responses < test_amount:
43 |     val = '%.8f' % random.random()
44 |     if (val in f) and (val not in ref):
45 |         false_positives += 1
46 |     else:
47 |         correct_responses += 1
48 | duration = datetime.now()-start
49 | print "duration:",duration
50 | print "correct: %s / false: %s -> %.4f%% false positives" % \
51 |     (correct_responses, false_positives, 100*false_positives/float(correct_responses))
52 | 
53 | # remove the key in redis
54 | connection.delete('test_bloomfilter')


--------------------------------------------------------------------------------
/bloomfilter.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import time
  3 | 
  4 | from datetime import datetime, timedelta
  5 | 
  6 | # For hash functions see http://www.partow.net/programming/hashfunctions/index.html
  7 | # Author Arash Partow, CPL http://www.opensource.org/licenses/cpl1.0.php
  8 | def FNVHash(key):
  9 |     fnv_prime = 0x811C9DC5
 10 |     hash = 0
 11 |     for i in range(len(key)):
 12 |       hash *= fnv_prime
 13 |       hash ^= ord(key[i])
 14 |     return hash
 15 | 
 16 | def APHash(key):
 17 |     hash = 0xAAAAAAAA
 18 |     for i in range(len(key)):
 19 |       if ((i & 1) == 0):
 20 |         hash ^= ((hash <<  7) ^ ord(key[i]) * (hash >> 3))
 21 |       else:
 22 |         hash ^= (~((hash << 11) + ord(key[i]) ^ (hash >> 5)))
 23 |     return hash
 24 | 
 25 | class TimeSeriesBloomFilter(object):
 26 |     # todo: make it more clear how all this works
 27 |     # todo: create a helper function that calculates the total amount of memory stored
 28 | 
 29 |     def __init__(self, connection, bitvector_key, n, k, **kwargs):
 30 |         self.time_resolution = kwargs.get('time_resolution', timedelta(minutes=1))
 31 |         self.time_limit = kwargs.get('time_limit', timedelta(minutes=10))
 32 |         self.time_limit_seconds = self.time_limit.days*86400 + self.time_limit.seconds
 33 |         self.connection = connection
 34 |         self.bitvector_key = bitvector_key
 35 |         self.n = n
 36 |         self.k = k
 37 | 
 38 |     def most_current_filters(self, within, now):
 39 |         resolution_microseconds = (self.time_resolution.days*86400 + self.time_resolution.seconds)*1e6 + \
 40 |                                     self.time_resolution.microseconds
 41 | 
 42 |         within_microseconds = (within.days*86400 + within.seconds)*1e6 + within.microseconds
 43 | 
 44 |         # how many bloom filters will we need to iterate for this?
 45 |         num_filters = int(math.ceil(within_microseconds / resolution_microseconds))
 46 | 
 47 |         # figure out what the passed timestamp really is
 48 |         current_microtimestamp = time.mktime(now.timetuple())*1e6 + now.microsecond
 49 | 
 50 |         # get a datetime object of the 'current' filter
 51 |         block = resolution_microseconds * math.floor(current_microtimestamp / resolution_microseconds)
 52 |         block_now = datetime.fromtimestamp(block/1e6)
 53 | 
 54 |         for x in xrange(num_filters):
 55 |             filter_date = block_now - x * self.time_resolution
 56 |             filter_bitvector_key = '%s|%s' % (self.bitvector_key, filter_date.isoformat())
 57 |             yield BloomFilter(self.connection, filter_bitvector_key, self.n, self.k)
 58 | 
 59 |     def add(self, key, **kwargs):
 60 |         within = kwargs.get('within', self.time_resolution)
 61 |         now = kwargs.get('now', datetime.now())
 62 | 
 63 |         # add to the current bloom filter
 64 |         for bloom_filter in self.most_current_filters(within=within, now=now):
 65 |             # we'll expire the bloom filter we're setting to after 'limit' + 1 seconds
 66 |             bloom_filter.add(key, timeout=self.time_limit_seconds+1)
 67 | 
 68 |     def delete(self, key, **kwargs):
 69 |         within = kwargs.get('within', self.time_limit)
 70 |         now = kwargs.get('now', datetime.now())
 71 | 
 72 |         # delete from the time series bloomfilters
 73 |         for bloom_filter in self.most_current_filters(within=within, now=now):
 74 |             # in case of creating new filter when deleting, so check first
 75 |             if key in bloom_filter:
 76 |                 bloom_filter.delete(key)
 77 | 
 78 |     def __contains__(self, key, **kwargs):
 79 |         # checks if this time series bloom filter has
 80 |         # contained an element within the last x minutes
 81 |         within = kwargs.get('within', self.time_limit)
 82 |         now = kwargs.get('now', datetime.now())
 83 | 
 84 |         for i,bloom_filter in enumerate(self.most_current_filters(within=within, now=now)):
 85 |             if key in bloom_filter:
 86 |                 return True
 87 |         else:
 88 |             return False
 89 | 
 90 |     # lookup support for the 'within' parameter that we can't express in the magic __contains__
 91 |     contains = __contains__
 92 | 
 93 | class BloomFilter(object):
 94 |     def __init__(self, connection, bitvector_key, n, k):
 95 |         # create a bloom filter based on a redis connection, a bitvector_key (name) for it
 96 |         # and the settings n & k, which dictate how effective it will be
 97 |         # - n is the amount of bits it will use, I have had success with 85001024 (500kiB)
 98 |         #   for 100k values. If you have fewer, you can get away with using fewer bits.
 99 |         #   in general, the more bits, the fewer false positives
100 |         # - k is the number of hash derivations it uses, too many will fill up the filter
101 |         #   too quickly, not enough will lead to many false positives
102 | 
103 |         self.connection = connection
104 |         self.bitvector_key = bitvector_key
105 |         self.n = n
106 |         self.k = k
107 | 
108 |     def __contains__(self, key):
109 |         pipeline = self.connection.pipeline()
110 |         for hashed_offset in self.calculate_offsets(key):
111 |             pipeline.getbit(self.bitvector_key, hashed_offset)
112 |         results = pipeline.execute()
113 |         return all(results)
114 | 
115 |     def add(self, key, set_value=1, transaction=False, timeout=None):
116 |         # set bits for every hash to 1
117 |         # sometimes we can use pipelines here instead of MULTI,
118 |         # which makes it a bit faster
119 |         pipeline = self.connection.pipeline(transaction=transaction)
120 |         for hashed_offset in self.calculate_offsets(key):
121 |             pipeline.setbit(self.bitvector_key, hashed_offset, set_value)
122 | 
123 |         if timeout is not None:
124 |             pipeline.expire(self.bitvector_key, timeout)
125 | 
126 |         pipeline.execute()
127 | 
128 |     def delete(self, key):
129 |         # delete is just an add with value 0
130 |         # make sure the pipeline gets wrapped in MULTI/EXEC, so
131 |         # that a deleted element is either fully deleted or not
132 |         # at all, in case someone is checking __contains__ while
133 |         # an element is being deleted
134 |         self.add(key, set_value=0, transaction=True)
135 | 
136 |     def calculate_offsets(self, key):
137 |         # we're using only two hash functions with different settings, as described
138 |         # by Kirsch & Mitzenmacher: https://www.eecs.harvard.edu/~michaelm/postscripts/tr-02-05.pdf
139 |         hash_1 = FNVHash(key)
140 |         hash_2 = APHash(key)
141 | 
142 |         for i in range(self.k):
143 |             yield (hash_1 + i * hash_2) % self.n
144 | 


--------------------------------------------------------------------------------
/tests.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import redis
 3 | import sys
 4 | import unittest
 5 | 
 6 | from datetime import datetime,timedelta
 7 | 
 8 | from bloomfilter import BloomFilter, TimeSeriesBloomFilter
 9 | 
10 | class SimpleTestCase(unittest.TestCase):
11 |     def setUp(self):
12 |         self.connection = redis.Redis()
13 | 
14 |         self.single = BloomFilter(connection=self.connection,
15 |                         bitvector_key='test_bloomfilter',
16 |                         n=1024,
17 |                         k=4)
18 | 
19 |         self.timeseries = TimeSeriesBloomFilter(connection=self.connection,
20 |                             bitvector_key='test_timed_bloomfilter',
21 |                             n=1024*8,
22 |                             k=4,
23 |                             time_resolution=timedelta(microseconds=1000),
24 |                             time_limit=timedelta(microseconds=10000))
25 | 
26 |     def tearDown(self):
27 |         # remove the key in redis
28 |         self.connection.delete('test_bloomfilter')
29 | 
30 | class SimpleTest(SimpleTestCase):
31 |     def test_add(self):
32 |         f = self.single
33 | 
34 |         f.add('three')
35 |         f.add('four')
36 |         f.add('five')
37 |         f.add('six')
38 |         f.add('seven')
39 |         f.add('eight')
40 |         f.add('nine')
41 |         f.add("ten")
42 | 
43 |         # test membership operations
44 |         assert 'ten' in f
45 |         assert 'five' in f
46 |         assert 'two' not in f
47 |         assert 'eleven' not in f
48 | 
49 |     def test_delete(self):
50 |         f = self.single
51 | 
52 |         f.add('ten')
53 |         assert 'ten' in f
54 | 
55 |         f.delete('ten')
56 |         assert 'ten' not in f
57 | 
58 | 
59 |     def test_timeseries_add(self):
60 |         f = self.timeseries
61 | 
62 |         assert 'test_value' not in f
63 |         f.add('test_value')
64 |         assert 'test_value' in f
65 | 
66 |     def test_timeseries_delay(self):
67 |         f = self.timeseries
68 | 
69 |         f.add('test_value')
70 |         start = datetime.now()
71 |         # allow for 3ms delay in storing/timer resolution
72 |         delay = timedelta(microseconds=3000)
73 | 
74 |         # make sure that the filter doesn't say that test_value is in the filter for too long
75 |         while 'test_value' in f:
76 |             assert datetime.now() < (start+timedelta(microseconds=10000)+delay)
77 |         assert 'test_value' not in f
78 | 
79 | if __name__ == '__main__':
80 |     unittest.main()
81 | 


--------------------------------------------------------------------------------