├── BloomFilter.py ├── README.md ├── experiment.py ├── hash_function.py ├── images ├── 20861426.png ├── 90123647.png ├── exp1.png └── exp2.png └── jhash.py /BloomFilter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import random 3 | from hash_function import HashFunction 4 | 5 | class BloomFilter(): 6 | 7 | def __init__(self,M,K): 8 | self.m = M 9 | self.k = K 10 | self.bitVector = [False] * M 11 | self.hashFunctions = [] 12 | for i in range(0,self.k): 13 | obj = HashFunction(M) 14 | self.hashFunctions.append(obj) 15 | 16 | def insert(self,obj): 17 | for f in self.hashFunctions: 18 | hash = f.convert(obj) 19 | index = int(hash) 20 | #print "Hash is " + str(hash) + " Index is " + str(index) 21 | self.bitVector[index] = True 22 | 23 | def check(self,obj): 24 | ret = True 25 | for f in self.hashFunctions: 26 | index = int(f.convert(obj)) 27 | ret = ret & self.bitVector[index] 28 | return ret 29 | 30 | 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## 1. Implementation of BloomFilter 2 | 3 | The implementation of the project entirely was done in python. I used jenkins hash implementation as base hash function. 4 | In addition, I wrote another wrapper class for HashFunction which uses jenkins hash function as base and it takes 2 additional random parameters to differentiate each instance of the HashFunction class. 5 | I designed and implemented another BloomFilter class which takes M and K values in its constructor and use that wrapped HashFunction class to generate different K number of hash functions in the class. 6 | M is used to set the size of the Bit Vector. In its constructor, all bits in the bit vector initialized to False at first. 7 | 8 | BloomFilter class has 2 methods insert and check. Insert method takes a value as a parameter and hash the value through all K number of different hash functions. Set true to all indexes after that. 9 | However, check method basically hash the value again through hash functions and check if all set to True. If any of them set to False, it will return false otherwise True. 10 | 11 | I implemented another class Experiment which basically takes B , N , M ,K values. 12 | 13 | B is for number of iterations 14 | N is for number of items to be added 15 | M is for Bit vector size 16 | K is for number of hash functions. 17 | 18 | In this sense, by setting these parameters in Experiment class, the class itself will do the rest as following 19 | - The experiment class sets constructor fields to its private properties. 20 | - It has make_experiment method to perform experiment with given parameters. 21 | - In make_experiment method, it will loop through B times , sum each result for the experiment with different hash functions and in the end, it will divide the sum by B again. 22 | 23 | Jenkin's hash implementation is taken from http://stackoverflow.com/questions/3279615/python-implementation-of-jenkins-hash 24 | 25 | ## 2. Experiments. 26 | ##### 2.1 Number of Vector size VS False Positive Probability with constant number of HashFunctions 27 | ![alt tag](https://raw.githubusercontent.com/m00dy/bloomfilter/master/images/90123647.png) 28 | ![alt tag](https://raw.githubusercontent.com/m00dy/bloomfilter/master/images/exp1.png) 29 | 30 | ##### 2.2 Number of Hash Functions VS False Positive Probability with constant number of Vector Size 31 | ![alt tag](https://raw.githubusercontent.com/m00dy/bloomfilter/master/images/20861426.png) 32 | ![alt tag](https://raw.githubusercontent.com/m00dy/bloomfilter/master/images/exp2.png) 33 | -------------------------------------------------------------------------------- /experiment.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import time 4 | import random 5 | from BloomFilter import BloomFilter 6 | 7 | 8 | class Experiment(): 9 | def __init__(self,B,N,M,K): 10 | self.b = B 11 | self.m = M 12 | self.k = K 13 | self.n = N 14 | self.timeStart = 0 15 | self.timeFinish = 0 16 | self.falsePositiveCounter = 0 17 | self.f = 1 18 | 19 | def percentage(self,part, whole): 20 | return 100 * float(part)/float(whole) 21 | 22 | def make_experiment(self): 23 | self.timeStart = time.time() 24 | 25 | for i in range(0,self.b): 26 | #Build the bloomFilter 27 | bf = BloomFilter(self.m,self.k) 28 | 29 | #Insert N times odd items to the bloomFilter 30 | for i in range(0,self.n): 31 | rnumber = random.randint(0,self.n) 32 | rnumber = rnumber*2 + 1 33 | bf.insert(str(rnumber)) 34 | 35 | #Check N times even items from the bloomFilter 36 | for i in range(0,self.n): 37 | rnumber = random.randint(0,self.n) 38 | rnumber = rnumber*2 39 | result = bf.check(str(rnumber)) 40 | 41 | if(result is True): 42 | self.falsePositiveCounter = self.falsePositiveCounter + 1 43 | 44 | self.falsePositiveCounter = self.falsePositiveCounter / self.b 45 | self.timeFinish = time.time() 46 | 47 | #return [(self.timeFinish-self.timeStart),self.percentage(self.falsePositiveCounter,self.n)] 48 | return self.percentage(self.falsePositiveCounter,self.n) 49 | 50 | ### Number of Hash Functions Test ### 51 | 52 | 53 | l=[] 54 | for i in range(1,21): 55 | exp1 = Experiment(150,2000,20000,i) 56 | l.append(exp1.make_experiment()) 57 | 58 | 59 | 60 | 61 | """ 62 | PLOT API 63 | """ 64 | 65 | import plotly.plotly as py 66 | from plotly.graph_objs import * 67 | 68 | trace1 = Scatter( 69 | x=l, 70 | y=range(1,21), 71 | name="Graph2" 72 | ) 73 | 74 | data = Data([trace1]) 75 | 76 | layout = Layout( 77 | title='Constant number of Vector Size', 78 | xaxis=XAxis( 79 | title='False Positive Probability', 80 | showgrid=False, 81 | zeroline=False 82 | ), 83 | yaxis=YAxis( 84 | title='Number of Hash Functions', 85 | showline=False 86 | ) 87 | ) 88 | fig = Figure(data=data, layout=layout) 89 | plot_url = py.plot(fig, filename='line-style') 90 | 91 | 92 | """ 93 | http://corte.si/%2Fposts/code/bloom-filter-rules-of-thumb/index.html 94 | """ 95 | -------------------------------------------------------------------------------- /hash_function.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import random 3 | from jhash import jhash_short 4 | 5 | class HashFunction(): 6 | 7 | def __init__(self,M): 8 | self.m = M 9 | self.a = random.randint(0,self.m) 10 | self.b = random.randint(0,self.m) 11 | 12 | def base_hash_function(self,key): 13 | return jhash_short(key) 14 | 15 | def convert(self,y): 16 | return (self.a * self.base_hash_function(y) + self.b) % self.m 17 | 18 | 19 | -------------------------------------------------------------------------------- /images/20861426.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/m00dy/BloomFilter/b0729aaf999f992b1ba135efbc8459865f4c107f/images/20861426.png -------------------------------------------------------------------------------- /images/90123647.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/m00dy/BloomFilter/b0729aaf999f992b1ba135efbc8459865f4c107f/images/90123647.png -------------------------------------------------------------------------------- /images/exp1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/m00dy/BloomFilter/b0729aaf999f992b1ba135efbc8459865f4c107f/images/exp1.png -------------------------------------------------------------------------------- /images/exp2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/m00dy/BloomFilter/b0729aaf999f992b1ba135efbc8459865f4c107f/images/exp2.png -------------------------------------------------------------------------------- /jhash.py: -------------------------------------------------------------------------------- 1 | # Need to constrain U32 to only 32 bits using the & 0xffffffff 2 | # since Python has no native notion of integers limited to 32 bit 3 | # http://docs.python.org/library/stdtypes.html#numeric-types-int-float-long-complex 4 | 5 | '''Original copyright notice: 6 | By Bob Jenkins, 1996. bob_jenkins@burtleburtle.net. You may use this 7 | code any way you wish, private, educational, or commercial. Its free. 8 | ''' 9 | 10 | def rot(x,k): 11 | return (((x)<<(k)) | ((x)>>(32-(k)))) 12 | 13 | def mix(a, b, c): 14 | a &= 0xffffffff; b &= 0xffffffff; c &= 0xffffffff 15 | a -= c; a &= 0xffffffff; a ^= rot(c,4); a &= 0xffffffff; c += b; c &= 0xffffffff 16 | b -= a; b &= 0xffffffff; b ^= rot(a,6); b &= 0xffffffff; a += c; a &= 0xffffffff 17 | c -= b; c &= 0xffffffff; c ^= rot(b,8); c &= 0xffffffff; b += a; b &= 0xffffffff 18 | a -= c; a &= 0xffffffff; a ^= rot(c,16); a &= 0xffffffff; c += b; c &= 0xffffffff 19 | b -= a; b &= 0xffffffff; b ^= rot(a,19); b &= 0xffffffff; a += c; a &= 0xffffffff 20 | c -= b; c &= 0xffffffff; c ^= rot(b,4); c &= 0xffffffff; b += a; b &= 0xffffffff 21 | return a, b, c 22 | 23 | def final(a, b, c): 24 | a &= 0xffffffff; b &= 0xffffffff; c &= 0xffffffff 25 | c ^= b; c &= 0xffffffff; c -= rot(b,14); c &= 0xffffffff 26 | a ^= c; a &= 0xffffffff; a -= rot(c,11); a &= 0xffffffff 27 | b ^= a; b &= 0xffffffff; b -= rot(a,25); b &= 0xffffffff 28 | c ^= b; c &= 0xffffffff; c -= rot(b,16); c &= 0xffffffff 29 | a ^= c; a &= 0xffffffff; a -= rot(c,4); a &= 0xffffffff 30 | b ^= a; b &= 0xffffffff; b -= rot(a,14); b &= 0xffffffff 31 | c ^= b; c &= 0xffffffff; c -= rot(b,24); c &= 0xffffffff 32 | return a, b, c 33 | 34 | def hashlittle2(data, initval = 0, initval2 = 0): 35 | length = lenpos = len(data) 36 | 37 | a = b = c = (0xdeadbeef + (length) + initval) 38 | 39 | c += initval2; c &= 0xffffffff 40 | 41 | p = 0 # string offset 42 | while lenpos > 12: 43 | a += (ord(data[p+0]) + (ord(data[p+1])<<8) + (ord(data[p+2])<<16) + (ord(data[p+3])<<24)); a &= 0xffffffff 44 | b += (ord(data[p+4]) + (ord(data[p+5])<<8) + (ord(data[p+6])<<16) + (ord(data[p+7])<<24)); b &= 0xffffffff 45 | c += (ord(data[p+8]) + (ord(data[p+9])<<8) + (ord(data[p+10])<<16) + (ord(data[p+11])<<24)); c &= 0xffffffff 46 | a, b, c = mix(a, b, c) 47 | p += 12 48 | lenpos -= 12 49 | 50 | if lenpos == 12: c += (ord(data[p+8]) + (ord(data[p+9])<<8) + (ord(data[p+10])<<16) + (ord(data[p+11])<<24)); b += (ord(data[p+4]) + (ord(data[p+5])<<8) + (ord(data[p+6])<<16) + (ord(data[p+7])<<24)); a += (ord(data[p+0]) + (ord(data[p+1])<<8) + (ord(data[p+2])<<16) + (ord(data[p+3])<<24)); 51 | if lenpos == 11: c += (ord(data[p+8]) + (ord(data[p+9])<<8) + (ord(data[p+10])<<16)); b += (ord(data[p+4]) + (ord(data[p+5])<<8) + (ord(data[p+6])<<16) + (ord(data[p+7])<<24)); a += (ord(data[p+0]) + (ord(data[p+1])<<8) + (ord(data[p+2])<<16) + (ord(data[p+3])<<24)); 52 | if lenpos == 10: c += (ord(data[p+8]) + (ord(data[p+9])<<8)); b += (ord(data[p+4]) + (ord(data[p+5])<<8) + (ord(data[p+6])<<16) + (ord(data[p+7])<<24)); a += (ord(data[p+0]) + (ord(data[p+1])<<8) + (ord(data[p+2])<<16) + (ord(data[p+3])<<24)); 53 | if lenpos == 9: c += (ord(data[p+8])); b += (ord(data[p+4]) + (ord(data[p+5])<<8) + (ord(data[p+6])<<16) + (ord(data[p+7])<<24)); a += (ord(data[p+0]) + (ord(data[p+1])<<8) + (ord(data[p+2])<<16) + (ord(data[p+3])<<24)); 54 | if lenpos == 8: b += (ord(data[p+4]) + (ord(data[p+5])<<8) + (ord(data[p+6])<<16) + (ord(data[p+7])<<24)); a += (ord(data[p+0]) + (ord(data[p+1])<<8) + (ord(data[p+2])<<16) + (ord(data[p+3])<<24)); 55 | if lenpos == 7: b += (ord(data[p+4]) + (ord(data[p+5])<<8) + (ord(data[p+6])<<16)); a += (ord(data[p+0]) + (ord(data[p+1])<<8) + (ord(data[p+2])<<16) + (ord(data[p+3])<<24)); 56 | if lenpos == 6: b += ((ord(data[p+5])<<8) + ord(data[p+4])); a += (ord(data[p+0]) + (ord(data[p+1])<<8) + (ord(data[p+2])<<16) + (ord(data[p+3])<<24)) 57 | if lenpos == 5: b += (ord(data[p+4])); a += (ord(data[p+0]) + (ord(data[p+1])<<8) + (ord(data[p+2])<<16) + (ord(data[p+3])<<24)); 58 | if lenpos == 4: a += (ord(data[p+0]) + (ord(data[p+1])<<8) + (ord(data[p+2])<<16) + (ord(data[p+3])<<24)) 59 | if lenpos == 3: a += (ord(data[p+0]) + (ord(data[p+1])<<8) + (ord(data[p+2])<<16)) 60 | if lenpos == 2: a += (ord(data[p+0]) + (ord(data[p+1])<<8)) 61 | if lenpos == 1: a += ord(data[p+0]) 62 | a &= 0xffffffff; b &= 0xffffffff; c &= 0xffffffff 63 | if lenpos == 0: return c, b 64 | 65 | a, b, c = final(a, b, c) 66 | 67 | return c, b 68 | 69 | def jhash_short(data, initval=0): 70 | c, b = hashlittle2(data, initval, 0) 71 | return c 72 | 73 | """ 74 | if __name__ == "__main__": 75 | import sys 76 | hashstr = 'Four score and seven years ago' 77 | hash, hash2 = hashlittle2(hashstr, 0xdeadbeef, 0xdeadbeef) 78 | print '"%s": %x %x' % (hashstr, hash, hash2) 79 | 80 | hash = hashlittle(hashstr, 0) 81 | print '"%s": %x' % (hashstr, hash) 82 | """ 83 | --------------------------------------------------------------------------------