├── .gitignore
├── .python-version
├── LICENSE
├── MANIFEST.in
├── README.rst
├── probably
│   ├── __init__.py
│   ├── bloomfilter.py
│   ├── cdbf.py
│   ├── countmin.py
│   ├── hashfunctions.py
│   ├── hll.py
│   ├── maintenance.pyx
│   ├── setup.py
│   └── temporal_daily.py
├── pyproject.toml
├── setup.cfg
├── setup.py
└── tests
    └── cdbf_test.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc
*.so
build
dist
*.egg-info/
.cache
*.c
.envrc
.venv*/
.eggs/

--------------------------------------------------------------------------------
/.python-version:
--------------------------------------------------------------------------------
3.9.7

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2016 Parsely, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
recursive-include probably *.py
recursive-include probably *.pyx
recursive-include probably *.c
include *.rst
include *.txt
include setup.cfg
include pyproject.toml
include LICENSE

--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
Probably: Probabilistic Data Structures for Realtime Analytics
--------------------------------------------------------------

Package containing some useful probabilistic data structures:

* ``BloomFilter``
* ``CountMinSketch``
* ``CountdownBloomFilter``
* ``HyperLogLog`` (HLL)
* ``DailyTemporalBloomFilter``

Build the Cython extension in place with::

    python setup.py build_ext --inplace
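
A quick usage sketch; the constructor parameters below are illustrative,
not recommendations::

    from probably import BloomFilter, CountMinSketch, HyperLogLog

    bf = BloomFilter(capacity=100000, error_rate=0.01)
    bf.add("user-1")
    assert "user-1" in bf

    hll = HyperLogLog(error_rate=0.01)
    for i in range(10000):
        hll.add(str(i))
    print(len(hll))  # approximate distinct count

    cms = CountMinSketch(delta=0.001, epsilon=0.01, k=10)
    cms.update("key", 1)
    print(cms.get("key"))  # estimated frequency of "key"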
--------------------------------------------------------------------------------
/probably/__init__.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import

from .bloomfilter import *
from .cdbf import *
from .countmin import *
from .hll import *
from .temporal_daily import *

--------------------------------------------------------------------------------
/probably/bloomfilter.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function

import bitarray
import numpy as np

from .hashfunctions import generate_hashfunctions


class BloomFilter(object):
    """Basic Bloom filter."""

    def __init__(self, capacity, error_rate):
        self.error_rate = error_rate
        self.capacity = capacity
        # Standard partitioned Bloom filter sizing: k slices of m/k bits each.
        self.nbr_slices = int(np.ceil(np.log2(1.0 / error_rate)))
        self.bits_per_slice = int(np.ceil((capacity * abs(np.log(error_rate))) / (self.nbr_slices * (np.log(2) ** 2))))
        self.nbr_bits = self.nbr_slices * self.bits_per_slice
        self.initialize_bitarray()
        self.count = 0
        self.hashes = generate_hashfunctions(self.bits_per_slice, self.nbr_slices)
        self.hashed_values = []

    def initialize_bitarray(self):
        self.bitarray = bitarray.bitarray(self.nbr_bits)
        self.bitarray.setall(False)

    def __contains__(self, key):
        self.hashed_values = self.hashes(key)
        offset = 0
        for value in self.hashed_values:
            if not self.bitarray[offset + value]:
                return False
            offset += self.bits_per_slice
        return True

    def add(self, key):
        """Add `key`; return True if it was (probably) already present."""
        if key in self:
            return True
        offset = 0
        if not self.hashed_values:
            self.hashed_values = self.hashes(key)
        for value in self.hashed_values:
            self.bitarray[offset + value] = True
            offset += self.bits_per_slice
        self.count += 1
        return False


if __name__ == "__main__":
    bf = BloomFilter(10000, 0.01)

    random_items = [str(r) for r in np.random.randn(20000)]
    for item in random_items[:10000]:
        bf.add(item)

    false_positive = 0
    for item in random_items[10000:20000]:
        if item in bf:
            false_positive += 1

    print("Error rate (false positive): %s" % str(float(false_positive) / 10000))

--------------------------------------------------------------------------------
/probably/cdbf.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function

import numpy as np

from .hashfunctions import generate_hashfunctions
from .maintenance import maintenance


class CountdownBloomFilter(object):
    """Implementation of a modified Countdown Bloom Filter. Uses a batched
    maintenance process instead of a continuous one.

    Sanjuas-Cuxart, Josep, et al. "A lightweight algorithm for traffic
    filtering over sliding windows." Communications (ICC), 2012 IEEE
    International Conference on. IEEE, 2012.

    http://www-mobile.ecs.soton.ac.uk/home/conference/ICC2012/symposia/papers/a_lightweight_algorithm_for_traffic_filtering_over_sliding__.pdf
    """

    def __init__(self, capacity, error_rate=0.001, expiration=60, disable_hard_capacity=False):
        self.error_rate = error_rate
        self.capacity = capacity
        self.expiration = expiration
        self.nbr_slices = int(np.ceil(np.log2(1.0 / error_rate)))
        self.bits_per_slice = int(np.ceil((capacity * abs(np.log(error_rate))) / (self.nbr_slices * (np.log(2) ** 2))))
        self.nbr_bits = self.nbr_slices * self.bits_per_slice
        self.count = 0
        self.cellarray = np.zeros(self.nbr_bits, dtype=np.uint8)
        self.counter_init = 255
        self.refresh_head = 0
        self.make_hashes = generate_hashfunctions(self.bits_per_slice, self.nbr_slices)
        # This is the set ratio (fraction of nonzero cells). We keep it
        # constant at 0.5 since the BF will operate most of the time at its
        # optimal set ratio (50%), and the overall effect of this parameter
        # on the refresh rate is very minimal anyway.
        self.z = 0.5
        self.estimate_z = 0
        self.disable_hard_capacity = disable_hard_capacity

    def _compute_z(self):
        """Compute the set ratio (exact)."""
        return self.cellarray.nonzero()[0].shape[0] / self.nbr_bits

    def _estimate_count(self):
        """Update the count using the estimate of the set ratio."""
        if self.estimate_z == 0:
            self.estimate_z = (1.0 / self.nbr_bits)
        self.estimate_z = min(self.estimate_z, 0.999999)
        self.count = int(-(self.nbr_bits / self.nbr_slices) * np.log(1 - self.estimate_z))

    def expiration_maintenance(self):
        """Decrement the cell value if it is not zero.

        This maintenance process needs to be executed once every
        self.compute_refresh_time() seconds.
        """
        if self.cellarray[self.refresh_head] != 0:
            self.cellarray[self.refresh_head] -= 1
        self.refresh_head = (self.refresh_head + 1) % self.nbr_bits

    def batched_expiration_maintenance_dev(self, elapsed_time):
        """Batched version of expiration_maintenance() (pure Python)."""
        num_iterations = self.num_batched_maintenance(elapsed_time)
        for i in range(num_iterations):
            self.expiration_maintenance()

    def batched_expiration_maintenance(self, elapsed_time):
        """Batched version of expiration_maintenance() (Cython).

        Returns the time interval that was actually processed.
        """
        num_iterations = self.num_batched_maintenance(elapsed_time)
        self.refresh_head, nonzero = maintenance(self.cellarray, self.nbr_bits, num_iterations, self.refresh_head)
        if num_iterations != 0:
            self.estimate_z = float(nonzero) / float(num_iterations)
            self._estimate_count()
        processed_interval = num_iterations * self.compute_refresh_time()
        return processed_interval

    def compute_refresh_time(self):
        """Compute the refresh period for the given expiration delay.

        Inverts the relation expiration ~= nbr_bits * s * (counter_init - 1
        + 1 / (z * (nbr_slices + 1))), where s is the per-cell refresh period.
        """
        if self.z == 0:
            self.z = 1E-10
        s = float(self.expiration) * (1.0 / (self.nbr_bits)) * (1.0 / (self.counter_init - 1 + (1.0 / (self.z * (self.nbr_slices + 1)))))
        return s

    def num_batched_maintenance(self, elapsed_time):
        return int(np.floor(elapsed_time / self.compute_refresh_time()))

    def __nonzero__(self):
        return True

    def __bool__(self):
        return True

    def __contains__(self, key):
        if not isinstance(key, list):
            hashes = self.make_hashes(key)
        else:
            hashes = key
        offset = 0
        for k in hashes:
            if self.cellarray[offset + k] == 0:
                return False
            offset += self.bits_per_slice
        return True

    def __len__(self):
        """Return the number of keys stored by this bloom filter."""
        return self.count

    def add(self, key, skip_check=False):
        """Add `key`; return True if it was already present."""
        hashes = self.make_hashes(key)
        if not skip_check and hashes in self:
            # The key is already present: refresh its counters to reset the TTL.
            offset = 0
            for k in hashes:
                self.cellarray[offset + k] = self.counter_init
                offset += self.bits_per_slice
            return True
        if (self.count > self.capacity or self.estimate_z > 0.5) and not self.disable_hard_capacity:
            raise IndexError("BloomFilter is at capacity")
        offset = 0
        for k in hashes:
            self.cellarray[offset + k] = self.counter_init
            offset += self.bits_per_slice
        self.count += 1
        return False
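

if __name__ == "__main__":
    # Illustrative usage sketch (added for documentation purposes, not part
    # of the original module). Keys expire roughly `expiration` seconds
    # after their last add(); here we simulate the elapsed time by feeding
    # it to the batched maintenance process.
    cbf = CountdownBloomFilter(10000, error_rate=0.01, expiration=5)
    cbf.add('some_key')
    assert 'some_key' in cbf

    simulated = 0.0
    while simulated < 6.0:  # a bit more than the 5 s expiration
        cbf.batched_expiration_maintenance(0.1)
        simulated += 0.1
    assert 'some_key' not in cbf
    print("'some_key' expired as expected")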
--------------------------------------------------------------------------------
/probably/countmin.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function

import heapq

import numpy as np

from .hashfunctions import generate_hashfunctions


class CountMinSketch(object):
    """Basic Count-Min Sketch with heap-based tracking of the top-k keys."""

    def __init__(self, delta, epsilon, k):
        self.nbr_bits = int(np.ceil(np.exp(1) / epsilon))
        self.nbr_slices = int(np.ceil(np.log(1 / delta)))
        self.k = k
        self.count = np.zeros((self.nbr_slices, self.nbr_bits), dtype=np.int32)
        self.heap = []
        self.top_k = {}
        self.make_hashes = generate_hashfunctions(self.nbr_bits, self.nbr_slices)

    def update(self, key, increment):
        """Increment `key` and return the key evicted from the top-k, if any."""
        for row, column in enumerate(self.make_hashes(key)):
            self.count[int(row), int(column)] += increment
        return self.update_heap(key)

    def update_heap(self, key):
        estimate = self.get(key)
        popped = key
        if key in self.top_k:
            # The key is already tracked: refresh its estimate in place.
            old_pair = self.top_k.get(key)
            old_pair[0] = estimate
            heapq.heapify(self.heap)
            popped = None
        else:
            if len(self.top_k) < self.k:
                heapq.heappush(self.heap, [estimate, key])
                self.top_k[key] = [estimate, key]
                popped = None
            else:
                # The heap is full: push the new pair and pop the smallest one.
                new_pair = [estimate, key]
                old_pair = heapq.heappushpop(self.heap, new_pair)
                popped = old_pair[1]
                if old_pair[1] in self.top_k:
                    del self.top_k[old_pair[1]]
                self.top_k[key] = new_pair
        return popped

    def get(self, key):
        """Return the point estimate of `key`'s count (minimum over all rows)."""
        value = float('inf')
        for row, column in enumerate(self.make_hashes(key)):
            value = min(self.count[row, column], value)
        return value


if __name__ == "__main__":
    import random
    import time

    stream = []
    for i in range(100):
        stream = stream + [str(i)] * i

    cms = CountMinSketch(10**-3, 0.01, 10)
    random.shuffle(stream)

    t1 = time.time()
    for s in stream:
        p = cms.update(s, 1)
    print(time.time() - t1)
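
    # Illustrative addition (not part of the original module): inspect the
    # tracked heavy hitters. Each entry of `top_k` maps a key to its
    # [estimate, key] pair.
    for estimate, key in sorted(cms.top_k.values(), reverse=True):
        print("%s: %d" % (key, estimate))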
--------------------------------------------------------------------------------
/probably/hashfunctions.py:
--------------------------------------------------------------------------------
import struct

import mmh3
from six import text_type
from six.moves import range


def hash64(key, seed=0):
    """
    Wrapper around mmh3.hash64 to get us a single 64-bit value.

    This also does the extra work of ensuring that we always treat the
    returned value as a big-endian unsigned long, like smhasher used to
    do.
    """
    hash_val = mmh3.hash64(key, seed)[0]
    return struct.unpack('>Q', struct.pack('q', hash_val))[0]


def generate_hashfunctions(nbr_bits, nbr_slices):
    """Generate a set of hash functions.

    The core method is a 64-bit murmur3 hash, which has a good distribution.
    Multiple hashes can be generated by using the previous hash value as a seed.
    """
    def _make_hashfuncs(key):
        if isinstance(key, text_type):
            key = key.encode('utf-8')
        else:
            key = str(key)
        rval = []
        current_hash = 0
        for i in range(nbr_slices):
            seed = current_hash
            current_hash = hash64(key, seed)
            rval.append(current_hash % nbr_bits)
        return rval
    return _make_hashfuncs
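

if __name__ == "__main__":
    # Illustrative sketch (added for documentation purposes): build a family
    # of 4 hash functions over 1000 buckets and hash a single key.
    hashes = generate_hashfunctions(1000, 4)
    print(hashes("example-key"))  # four bucket indices in range(1000)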
""" 68 | self.M = reduce(lambda x, y: np.maximum(x, y), 69 | [self.M, other_hll.M]).astype(np.int16) 70 | return self 71 | 72 | def estimate(self): 73 | """ Returns the estimate of the cardinality """ 74 | E = self.alpha * float(self.m ** 2) / np.power(2.0, - self.M).sum() 75 | if E <= 2.5 * self.m: # Small range correction 76 | V = self.m - np.count_nonzero(self.M) 77 | return int(self.m * np.log(self.m / float(V))) if V > 0 else int(E) 78 | # intermidiate range correction -> No correction 79 | elif E <= float(long(1) << self.precision) / 30.0: 80 | return int(E) 81 | else: 82 | return int(-(long(1) << self.precision) * 83 | np.log(1.0 - E / (long(1) << self.precision))) 84 | 85 | 86 | if __name__ == "__main__": 87 | hll = HyperLogLog(0.01) 88 | for i in range(100000): 89 | hll.add(str(i)) 90 | print(len(hll)) 91 | -------------------------------------------------------------------------------- /probably/maintenance.pyx: -------------------------------------------------------------------------------- 1 | ''' 2 | Cython module for fast maintenance process 3 | ''' 4 | 5 | import cython 6 | from libc.stdlib cimport malloc, free 7 | from libc.math cimport sqrt, pow 8 | from six.moves import range 9 | cimport numpy as np 10 | 11 | @cython.wraparound(False) 12 | @cython.boundscheck(False) 13 | @cython.cdivision(True) 14 | cdef tuple maintenance_cyt(np.ndarray[np.uint8_t, ndim=1, mode="c"] cells, 15 | long int cells_size, 16 | long int num_iterations, 17 | long int head): 18 | ''' 19 | Maintenance process for the Countdown Bloom Filter 20 | ''' 21 | cdef long int refresh_head = head 22 | cdef long int itr 23 | cdef long int nonzero = 0 24 | 25 | for itr in range(num_iterations): 26 | if cells[refresh_head] != 0: 27 | cells[refresh_head] -= 1 28 | if cells[refresh_head] != 0: 29 | nonzero += 1 30 | refresh_head = (refresh_head + 1) % cells_size 31 | return refresh_head, nonzero 32 | 33 | 34 | def maintenance(np.ndarray[np.uint8_t, ndim=1, mode="c"] cells, 35 | long int cells_size, long int num_iterations, head): 36 | return maintenance_cyt(cells, cells_size, num_iterations, head) 37 | 38 | 39 | -------------------------------------------------------------------------------- /probably/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from distutils.extension import Extension 3 | from Cython.Distutils import build_ext 4 | import numpy 5 | 6 | setup( 7 | cmdclass = {'build_ext': build_ext}, 8 | ext_modules = [Extension("maintenance", ["maintenance.pyx"], include_dirs=[numpy.get_include()]),] 9 | ) 10 | -------------------------------------------------------------------------------- /probably/temporal_daily.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import datetime as dt 4 | import glob 5 | import math 6 | import os 7 | import time 8 | import zlib 9 | 10 | import bitarray 11 | import numpy as np 12 | from six.moves import cPickle as pickle 13 | from six.moves import range 14 | 15 | from .bloomfilter import BloomFilter 16 | from .hashfunctions import generate_hashfunctions 17 | 18 | 19 | class DailyTemporalBloomFilter(object): 20 | """Long Range Temporal BloomFilter using a daily resolution. 21 | 22 | For really high value of expiration (like 60 days) with low requirement on precision. 
--------------------------------------------------------------------------------
/probably/temporal_daily.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import, division, print_function

import datetime as dt
import glob
import os
import time
import zlib

import bitarray
import numpy as np
from six.moves import cPickle as pickle
from six.moves import range

from .hashfunctions import generate_hashfunctions


class DailyTemporalBloomFilter(object):
    """Long range temporal BloomFilter using a daily resolution.

    Meant for really high values of expiration (like 60 days) with a low
    requirement on precision. The actual error of this BF will be the
    native error of the BF plus the error related to the coarse resolution
    of the expiration, since we no longer expire information precisely.
    Also, as opposed to a classic Bloom filter, this one will have false
    positives (reporting membership for a non-member) AND false negatives
    (reporting non-membership for a member).

    The upper bound of the temporal error can theoretically be quite high.
    However, if the items of the set are uniformly distributed over time,
    the average error will be something like 1.0 / expiration.
    """

    def __init__(self, capacity, error_rate, expiration, name, snapshot_path):
        self.error_rate = error_rate
        self.capacity = capacity
        self.nbr_slices = int(np.ceil(np.log2(1.0 / error_rate)))
        self.bits_per_slice = int(np.ceil((capacity * abs(np.log(error_rate))) / (self.nbr_slices * (np.log(2) ** 2))))
        self.nbr_bits = self.nbr_slices * self.bits_per_slice
        self.initialize_bitarray()
        self.count = 0
        self.hashes = generate_hashfunctions(self.bits_per_slice, self.nbr_slices)
        self.hashed_values = []
        self.name = name
        self.snapshot_path = snapshot_path
        self.expiration = expiration
        self.initialize_period()
        self.snapshot_to_load = None
        self.ready = False
        self.warm_period = None
        self.next_snapshot_load = time.time()

    def initialize_bitarray(self):
        """Initialize both bitarrays.

        This BF contains two bit arrays instead of a single one like a plain
        BF. `bitarray` is the main bit array where all the historical items
        are stored, and it is the one used for the membership queries. The
        second one, `current_day_bitarray`, is the one used for creating the
        daily snapshots.
        """
        self.bitarray = bitarray.bitarray(self.nbr_bits)
        self.current_day_bitarray = bitarray.bitarray(self.nbr_bits)
        self.bitarray.setall(False)
        self.current_day_bitarray.setall(False)

    def __contains__(self, key):
        """Check membership."""
        self.hashed_values = self.hashes(key)
        offset = 0
        for value in self.hashed_values:
            if not self.bitarray[offset + value]:
                return False
            offset += self.bits_per_slice
        return True

    def add(self, key):
        if key in self:
            return True
        offset = 0
        if not self.hashed_values:
            self.hashed_values = self.hashes(key)
        for value in self.hashed_values:
            self.bitarray[offset + value] = True
            self.current_day_bitarray[offset + value] = True
            offset += self.bits_per_slice
        self.count += 1
        return False

    def initialize_period(self, period=None):
        """Initialize the period of the BF.

        :period: datetime.datetime for setting the period explicitly.
        """
        if not period:
            self.current_period = dt.datetime.now()
        else:
            self.current_period = period
        self.current_period = dt.datetime(self.current_period.year, self.current_period.month, self.current_period.day)
        self.date = self.current_period.strftime("%Y-%m-%d")

    def maintenance(self):
        """Expire the old elements of the set.

        Initialize a new bitarray and load the previous snapshots. Run this
        at the beginning of each day.
        """
        self.initialize_period()
        self.initialize_bitarray()
        self.restore_from_disk()
    def compute_refresh_period(self):
        self.warm_period = (60 * 60 * 24) // (self.expiration - 2)

    def _should_warm(self):
        return time.time() >= self.next_snapshot_load

    def warm(self, jittering_ratio=0.2):
        """Progressively load the previous snapshots during the day.

        Loading all the snapshots at once can take a substantial amount of
        time. This method, if called periodically during the day, will
        progressively load those snapshots one by one. Because many workers
        are going to use this method at the same time, we add a jitter to
        the period between loads to avoid hammering the disk at the same
        time.
        """
        if self.snapshot_to_load is None:
            last_period = self.current_period - dt.timedelta(days=self.expiration - 1)
            self.compute_refresh_period()
            self.snapshot_to_load = []
            base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.name, self.expiration)
            available_snapshots = glob.glob(base_filename)
            for filename in available_snapshots:
                snapshot_period = dt.datetime.strptime(filename.split('_')[-1].strip('.dat'), "%Y-%m-%d")
                if snapshot_period >= last_period:
                    self.snapshot_to_load.append(filename)
                    self.ready = False

        if self.snapshot_to_load and self._should_warm():
            filename = self.snapshot_to_load.pop()
            self._union_bf_from_file(filename)
            jittering = self.warm_period * (np.random.random() - 0.5) * jittering_ratio
            self.next_snapshot_load = time.time() + self.warm_period + jittering
            if not self.snapshot_to_load:
                self.ready = True

    def _union_bf_from_file(self, filename, current=False):
        with open(filename, 'rb') as f:
            snapshot = pickle.loads(zlib.decompress(f.read()))
        if current:
            self.current_day_bitarray = self.current_day_bitarray | snapshot
        else:
            self.bitarray = self.bitarray | snapshot

    def restore_from_disk(self, clean_old_snapshot=False):
        """Restore the state of the BF using the previous snapshots.

        :clean_old_snapshot: Delete the old snapshots on disk (period < current - expiration)
        """
        base_filename = "%s/%s_%s_*.dat" % (self.snapshot_path, self.name, self.expiration)
        available_snapshots = glob.glob(base_filename)
        last_period = self.current_period - dt.timedelta(days=self.expiration - 1)
        for filename in available_snapshots:
            snapshot_period = dt.datetime.strptime(filename.split('_')[-1].strip('.dat'), "%Y-%m-%d")
            if snapshot_period < last_period and not clean_old_snapshot:
                continue
            else:
                self._union_bf_from_file(filename)
                if snapshot_period == self.current_period:
                    self._union_bf_from_file(filename, current=True)

            if snapshot_period < last_period and clean_old_snapshot:
                os.remove(filename)
        self.ready = True
    def save_snaphot(self):
        """Save the current state of the current day bitarray on disk.

        Saves the internal representation (bitarray) into a binary file
        using this filename format: name_expiration_2013-01-01.dat
        """
        filename = "%s/%s_%s_%s.dat" % (self.snapshot_path, self.name, self.expiration, self.date)
        with open(filename, 'wb') as f:
            f.write(zlib.compress(pickle.dumps(self.current_day_bitarray, protocol=pickle.HIGHEST_PROTOCOL)))

    def union_current_day(self, bf):
        """Union only the current day bitarray of another BF."""
        self.bitarray = self.bitarray | bf.current_day_bitarray


if __name__ == "__main__":
    bf = DailyTemporalBloomFilter(10000, 0.01, 30, 'test', './')

    random_items = [str(r) for r in np.random.randn(20000)]
    for item in random_items[:10000]:
        bf.add(item)

    false_positive = 0
    for item in random_items[10000:20000]:
        if item in bf:
            false_positive += 1

    print("Error rate (false positive): %s" % str(float(false_positive) / 10000))
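
    # Illustrative addition (not part of the original module): persist
    # today's bitarray, then run the daily maintenance cycle, which resets
    # the filter and reloads the snapshots that are still within the
    # expiration window.
    bf.save_snaphot()
    bf.maintenance()
    print("Membership survives maintenance: %s" % (random_items[0] in bf))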
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
[build-system]
# Minimum requirements for the build system to execute.
requires = [
    "setuptools>=38.6.0",
    "wheel",
    "Cython>=0.29.21",
    "oldest-supported-numpy",
]
build-backend = "setuptools.build_meta"

--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[metadata]
name = probably
description = Probably: Simple Probabilistic Data Structures
long_description = file: README.rst
long_description_content_type = text/x-rst
url = https://github.com/Parsely/probably
author = Parse.ly
author_email = hello@parsely.com
license = MIT License
license_file = LICENSE
platforms = any
classifiers =
    Intended Audience :: Developers
    License :: OSI Approved :: MIT License
    Programming Language :: Python
    Programming Language :: Python :: 3
    Programming Language :: Python :: 3 :: Only
    Programming Language :: Python :: 3.6
    Programming Language :: Python :: 3.7
    Programming Language :: Python :: 3.8
    Programming Language :: Python :: 3.9
    Operating System :: OS Independent
    Topic :: Utilities
    Topic :: Database :: Database Engines/Servers
    Topic :: Software Development :: Libraries :: Python Modules
project_urls =
    Bug Tracker = https://github.com/Parsely/probably/issues
    Documentation = https://github.com/Parsely/probably
    Source Code = https://github.com/Parsely/probably

[options]
packages = find:
install_requires =
    bitarray
    mmh3>=2.4
    numpy>=1.16.5
    six
python_requires = >=3.6.0
setup_requires =
    setuptools>=38.6.0
    wheel
    Cython>=0.29.21
    oldest-supported-numpy
zip_safe = False

[options.extras_require]
test =
    pytest>=5.0.1

[build_ext]
inplace = True

[options.packages.find]
include = probably, probably.*

--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
from os.path import join

import numpy as np
from Cython.Build import cythonize
from setuptools import Extension, setup

VERSION = "1.1.3"

extensions = [
    Extension(
        "probably.maintenance",
        [join("probably", "maintenance.pyx")],
        include_dirs=[np.get_include()],
    ),
]

setup(
    name="probably",
    version=VERSION,
    setup_requires=["oldest-supported-numpy", "cython"],
    ext_modules=cythonize(extensions),
)

--------------------------------------------------------------------------------
/tests/cdbf_test.py:
--------------------------------------------------------------------------------
from __future__ import absolute_import, print_function

import time
import unittest

import numpy as np
from six.moves import range

from probably import CountdownBloomFilter


class CountdownBloomFilterTests(unittest.TestCase):
    '''
    Tests for CountdownBloomFilter
    '''

    def setUp(self):
        self.batch_refresh_period = 0.1
        self.expiration = 5.0
        self.bf = CountdownBloomFilter(1000, 0.02, self.expiration)

    def test_empty(self):
        assert len(self.bf) == 0
        assert self.bf.cellarray.nonzero()[0].shape == (0,)

    def test_cellarray(self):
        assert self.bf.cellarray.shape == (8148,)

    def test_add(self):
        assert not self.bf.add('random_uuid')
        assert self.bf.add('random_uuid')
        np.testing.assert_array_equal(self.bf.cellarray.nonzero()[0],
                                      np.array([1039, 1376, 3202, 5228, 6295,
                                                7530]))

    def test_touch(self):
        assert not self.bf.add('random_uuid')
        assert self.bf.add('random_uuid')
        # Check membership just before expiration
        nbr_step = int(self.expiration / self.batch_refresh_period)
        for i in range(nbr_step - 1):
            self.bf.batched_expiration_maintenance(self.batch_refresh_period)
            assert 'random_uuid' in self.bf

        # Check membership right after expiration
        self.bf.batched_expiration_maintenance(2 * self.batch_refresh_period)

        # Touch. This should reset the TTL.
        assert not self.bf.add('random_uuid')

        assert 'random_uuid' in self.bf

    def test_compute_refresh_time(self):
        assert self.bf.compute_refresh_time() == 2.4132205876674775e-06

    def test_single_batch_expiration(self):
        assert not self.bf.add('random_uuid')
        assert self.bf.add('random_uuid')
        nzi = self.bf.cellarray.nonzero()[0]
        np.testing.assert_array_equal(self.bf.cellarray[nzi],
                                      np.array([255, 255, 255, 255, 255, 255],
                                               dtype=np.uint8))
        self.bf.batched_expiration_maintenance(self.batch_refresh_period)
        np.testing.assert_array_equal(self.bf.cellarray[nzi],
                                      np.array([250, 250, 250, 250, 250, 250],
                                               dtype=np.uint8))
        self.bf.batched_expiration_maintenance(self.expiration - 2 *
                                               self.batch_refresh_period)
        np.testing.assert_array_equal(self.bf.cellarray[nzi],
                                      np.array([5, 5, 6, 6, 6, 6],
                                               dtype=np.uint8))
        self.bf.batched_expiration_maintenance(self.batch_refresh_period)
        np.testing.assert_array_equal(self.bf.cellarray[nzi],
                                      np.array([0, 0, 1, 1, 1, 1],
                                               dtype=np.uint8))

    def test_expiration_realtime(self):
        assert not self.bf.add('random_uuid')
        uuid_exists = self.bf.add('random_uuid')
        assert uuid_exists
        elapsed = 0
        start = time.time()
        while uuid_exists:
            t1 = time.time()
            if elapsed:
                self.bf.batched_expiration_maintenance(elapsed)
            uuid_exists = 'random_uuid' in self.bf
            t2 = time.time()
            elapsed = t2 - t1
        experimental_expiration = time.time() - start
        print(experimental_expiration)
        # See if we finished in roughly the right amount of time
        assert (experimental_expiration - self.expiration) < 0.40

    def test_expiration(self):
        assert not self.bf.add('random_uuid')
        assert self.bf.add('random_uuid')
        # Check membership just before expiration
        nbr_step = int(self.expiration / self.batch_refresh_period)
        for i in range(nbr_step - 1):
            self.bf.batched_expiration_maintenance(self.batch_refresh_period)
            assert 'random_uuid' in self.bf
        # Check membership right after expiration
        self.bf.batched_expiration_maintenance(self.batch_refresh_period)
        assert 'random_uuid' not in self.bf

    def test_count_estimate(self):
        for i in range(500):
            self.bf.add(str(i))
        assert self.bf.count == 500
        self.bf.batched_expiration_maintenance(2.5)
        for i in range(500, 1000):
            self.bf.add(str(i))
        assert self.bf.count == 1000
        for i in range(26):
            self.bf.batched_expiration_maintenance(0.1)
        assert self.bf.count == 492
        self.assertAlmostEqual(self.bf.estimate_z, 0.304, places=3)
        self.assertAlmostEqual((float(self.bf.cellarray.nonzero()[0].shape[0]) /
                                self.bf.nbr_bits),
                               0.304,
                               places=3)


if __name__ == '__main__':
    unittest.main()