├── .gitignore ├── .travis.yml ├── composer.json ├── phpunit.xml.dist ├── LICENSE ├── README.md ├── Tests └── BloomFilterTest.php └── BloomFilter.php /.gitignore: -------------------------------------------------------------------------------- 1 | .buildpath 2 | .project 3 | .settings 4 | composer.lock 5 | phpunit.xml 6 | vendor 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: php 2 | php: 3 | - 5.6 4 | - 7.0 5 | - 7.1 6 | - hhvm 7 | 8 | before_script: 9 | - composer install --dev --prefer-source 10 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name" : "makinacorpus/php-bloom", 3 | "description" : "Bloom filter implementation", 4 | "type" : "library", 5 | "license" : "GPL-2", 6 | "authors" : [ 7 | { 8 | "name" : "Pierre Rineau", 9 | "email" : "pierre.rineau@makina-corpus.com" 10 | }, 11 | { 12 | "name" : "Makina Corpus", 13 | "homepage" : "http://makina-corpus.com" 14 | } 15 | ], 16 | "minimum-stability" : "dev", 17 | "prefer-stable" : true, 18 | "require" : { 19 | "php" : ">= 5.6" 20 | }, 21 | "require-dev" : { 22 | "phpunit/phpunit" : "^5.1" 23 | }, 24 | "autoload": { 25 | "psr-4": { 26 | "MakinaCorpus\\Bloom\\" : "." 
27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /phpunit.xml.dist: -------------------------------------------------------------------------------- 1 | 2 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | ./Tests/ 16 | 17 | 18 | 19 | 20 | 21 | ./ 22 | 23 | ./vendor 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Makina Corpus 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # PHP Bloom filter 2 | 3 | [![Build Status](https://travis-ci.org/makinacorpus/php-bloom.svg?branch=master)](https://travis-ci.org/makinacorpus/php-bloom) 4 | 5 | This is a simple PHP Bloom filter implementation based on Sherif Ramadan's 6 | implementation. 7 | 8 | The original code and a really great explanation can be found here: 9 | http://phpden.info/Bloom-filters-in-PHP 10 | 11 | It is slightly modified to correct some coding standard issues, to achieve 12 | a more flexible runtime configuration, and to fix a few performance issues. 13 | 14 | ## Usage 15 | 16 | You must first choose a targeted maximum number of elements that your filter 17 | will contain, and a false positive probability. The smaller the maximum size, the 18 | smaller the filter; the lower the probability, the larger and slower the filter gets. 19 | 20 | ```php 21 | // You may cache this value, and fetch it back, it's the whole goal of this 22 | // API. Beware that the stored string might contain ``\0`` characters, ensure 23 | // your storage API deals with those strings in a safe way. 24 | $value = null; 25 | 26 | // Configure your Bloom filter, if you store the value, you should store the 27 | // configuration along since selected hash algorithms and string size would 28 | // change otherwise. 29 | $probability = 0.0001; 30 | $maxSize = 10000; 31 | 32 | $filter = new \MakinaCorpus\Bloom\BloomFilter($maxSize, $probability); 33 | 34 | // You may add as many elements as you wish, elements can be any type, really, 35 | // if not scalar they will be serialized prior to being hashed. 
36 | $filter->set('some_string'); 37 | $filter->set(123456); 38 | $filter->set(['some' => 'array']); 39 | $filter->set(new \stdClass()); 40 | 41 | // And the whole goal of it: 42 | if ($filter->check('some_value')) { 43 | do_something(); 44 | } 45 | 46 | ``` 47 | 48 | ## Notes 49 | 50 | Please carefully read the original author's blog post, since it explains 51 | everything you need to know about Bloom filters: http://phpden.info/Bloom-filters-in-PHP 52 | 53 | Please also use it wisely, the hashing algorithms are quite fast, but if you 54 | do use it too much, it will impact negatively on your CPU usage. 55 | 56 | There are numerous other competitive implementations, you may use whichever 57 | seems the best for you, take a look around before choosing. 58 | -------------------------------------------------------------------------------- /Tests/BloomFilterTest.php: -------------------------------------------------------------------------------- 1 | markTestSkipped(); 13 | } 14 | } 15 |

    /**
     * Build a random alphanumeric string.
     *
     * @param int $length
     *   Number of characters to generate.
     *
     * @return string
     */
    private function generateRandomString($length = 32)
    {
        $characters = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';
        $lastIndex = strlen($characters) - 1;
        $randomString = '';

        for ($i = 0; $i < $length; $i++) {
            $randomString .= $characters[mt_rand(0, $lastIndex)];
        }

        return $randomString;
    }

    /**
     * Check every sample against the filter and count the outcomes.
     *
     * Fails the test as soon as a sample that was set is reported absent,
     * since a Bloom filter must never produce a false negative.
     *
     * @param BloomFilter $filter
     *   Filter under test.
     * @param array $strings
     *   Keys are the sample strings, values tell whether the sample was set.
     *
     * @return int[]
     *   In that order: number of samples that were set, number of samples
     *   that were not set, number of false positives among the latter.
     */
    private function countResults(BloomFilter $filter, array $strings)
    {
        $countIn = 0;
        $countOut = 0;
        $countMiss = 0;

        foreach ($strings as $string => $isIn) {
            $result = $filter->check($string);

            if ($isIn) {
                ++$countIn;
                if (!$result) {
                    $this->fail("False negative is not possible");
                }
                continue;
            }

            ++$countOut;
            if ($result) {
                ++$countMiss;
            }
        }

        return [$countIn, $countOut, $countMiss];
    }

    /**
     * Fill a filter with about half of a random sample, then verify that it
     * never reports a false negative and that the observed false positive
     * rate stays under the configured probability, both before and after a
     * serialize/unserialize round trip.
     *
     * The filter size and the probability are read from the BLOOM_SIZE and
     * BLOOM_PROBABILITY environment variables.
     */
    public function testBasicFunctionnality()
    {
        // The size is a number of elements: go through float first so that
        // an exponent notation such as "1e5" is read in full.
        $maxSize = (int)(float)getenv("BLOOM_SIZE");
        $probability = (float)getenv("BLOOM_PROBABILITY");
        $filter = new BloomFilter($maxSize, $probability);

        $strings = [];
        for ($i = 0; $i < $maxSize; ++$i) {
            $strings[$this->generateRandomString()] = mt_rand(0, 1);
        }

        // Set everything flagged as "in" in the filter
        foreach ($strings as $string => $isIn) {
            if ($isIn) {
                $filter->set($string);
            }
        }

        list($countIn, $countOut, $countMiss) = $this->countResults($filter, $strings);

        echo "\nmax size: ", $maxSize, "\nin: ", $countIn, "\nmiss: ", $countMiss, "\n";
        // Only the samples that were never set can yield a false positive,
        // they are the population the rate has to be measured on.
        $this->assertLessThan($probability, $countOut ? $countMiss / $countOut : 0.0);

        $filter = unserialize(serialize($filter));

        // Same test after unserialization
        list($countIn, $countOut, $countMiss) = $this->countResults($filter, $strings);

        echo "\nafter unserialize, max size: ", $maxSize, "\nin: ", $countIn, "\nmiss: ", $countMiss, "\n";
        $this->assertLessThan($probability, $countOut ? $countMiss / $countOut : 0.0);
    }
}
-------------------------------------------------------------------------------- /BloomFilter.php: -------------------------------------------------------------------------------- 1 | maxSize = $maxSize; 39 | $this->probability = $probability; 40 | 41 | $this->init(); 42 | 43 | $this->filter = str_repeat("\0", ceil($this->space / 8)); 44 | } 45 | 46 | private function init() 47 | { 48 | $this->space = $this->calculateSpace($this->maxSize, $this->probability); 49 | $this->hashes = $this->calculateHashFunctions($this->maxSize, $this->space); 50 | $this->hashAlgos = $this->getHashAlgos(); 51 | 52 | if ($this->hashes > $this->numHashFunctionsAvailable($this->hashAlgos)) { 53 | throw new \LogicException("Can't initialize filter with available hash functions"); 54 | } 55 | 56 | if (!function_exists('gmp_init')) { 57 | if (!function_exists('bcmod')) { 58 | throw new \LogicException("Can't initialize filter if you don't have any of the 'gmp' or 'bcmath' extension (gmp is faster)"); 59 | } 60 | 
$this->useBcMath = true; 61 | } 62 | } 63 |

    /**
     * List the hash algorithms the filter may draw hash values from.
     *
     * @return string[]
     *   Algorithm names, as given by hash_algos().
     */
    private function getHashAlgos()
    {
        return hash_algos();
    }

    /**
     * Compute the number of bits the filter needs.
     *
     * Applies m = -(n * ln(p)) / (ln(2) ^ 2), n being the maximum number of
     * elements and p the false positive probability.
     *
     * @param int|float|string $maxSize
     *   Maximum number of elements, must be greater than 0.
     * @param float|string $probability
     *   False positive probability, must be strictly between 0 and 1.
     *
     * @return int
     *
     * @throws \InvalidArgumentException
     *   If one of the parameters is out of range; the formula would
     *   otherwise give a null or negative size.
     */
    private function calculateSpace($maxSize, $probability)
    {
        // Conditions are written positively so that NAN is rejected too.
        if (!is_numeric($maxSize) || !($maxSize > 0)) {
            throw new \InvalidArgumentException("Maximum size must be a number greater than 0");
        }
        if (!is_numeric($probability) || !($probability > 0 && $probability < 1)) {
            throw new \InvalidArgumentException("Probability must be a number strictly between 0 and 1");
        }

        return (int)ceil(($maxSize * (log($probability)) / (log(2) ** 2)) * -1);
    }

    /**
     * Compute the number of hash values to derive for each element.
     *
     * Applies k = (m / n) * ln(2), m being the number of bits of the filter
     * and n the maximum number of elements.
     *
     * @param int|float|string $maxSize
     * @param int $space
     *
     * @return int
     */
    private function calculateHashFunctions($maxSize, $space)
    {
        return (int)ceil($space / $maxSize * log(2));
    }

    /**
     * Count how many hash values the given algorithms can provide.
     *
     * @param string[] $hashAlgos
     *
     * @return int
     */
    private function numHashFunctionsAvailable($hashAlgos)
    {
        $num = 0;

        foreach ($hashAlgos as $algo) {
            // hash() reads digests as 64 bits integers: every complete
            // 8 bytes chunk gives one value, the remainder is ignored.
            $num += strlen(hash($algo, 'bloom', true)) >> 3;
        }

        return $num;
    }

    /**
     * Compute the bit positions of an element.
     *
     * @param string $element
     *
     * @return int[]
     *   At most $this->hashes positions, each one lower than $this->space.
     */
    private function hash($element)
    {
        $hashes = [];

        foreach ($this->hashAlgos as $algo) {
            foreach (unpack('P*', hash($algo, $element, true)) as $hash) {
                // unpack() yields a signed integer, "%u" prints its unsigned
                // decimal value so the big number modulo is done on it.
                $unsigned = sprintf("%u", $hash);

                // The remainder is lower than the space, hence fits in an
                // integer: give plain integers to callers, not GMP objects
                // or numeric strings.
                if ($this->useBcMath) {
                    $hashes[] = (int)bcmod($unsigned, (string)$this->space);
                } else {
                    $hashes[] = gmp_intval(gmp_mod(gmp_init($unsigned), $this->space));
                }

                if (count($hashes) >= $this->hashes) {
                    break 2;
                }
            }
        }

        return $hashes;
    }

    /**
     * Set element in the filter
     *
     * @param mixed $element
     *   Non scalar values are serialized prior to being hashed.
     */
    public function set($element)
    {
        if (!is_scalar($element)) {
            $element = serialize($element);
        }

        foreach ($this->hash($element) as $position) {
            $position = (int)$position;
            // 8 bits per byte: upper bits select the byte, lower 3 the bit.
            $offset = $position >> 3;
            $this->filter[$offset] = chr(ord($this->filter[$offset]) | (1 << ($position & 7)));
        }

        $this->empty = false;
    }

    /**
     * Is element in the hash
     *
     * @param mixed $element
     *   Non scalar values are serialized prior to being hashed.
     *
     * @return boolean
     *   Beware that a strict false means strict false, while a strict true
     *   only means "probably": false positives happen, with the probability
     *   you built the filter with.
     */
    public function check($element)
    {
        if (!is_scalar($element)) {
            $element = serialize($element);
        }

        foreach ($this->hash($element) as $position) {
            $position = (int)$position;

            // A single missing bit proves the element was never set.
            if (!(ord($this->filter[$position >> 3]) & (1 << ($position & 7)))) {
                return false;
            }
        }

        return true;
    }

    /**
     * Is this instance empty
     *
     * @return boolean
     *   Value of the flag that set() switches to false.
     */
    public function isEmpty()
    {
        return $this->empty;
    }

    /**
     * {@inheritdoc}
     *
     * Output is "maxSize,probability,base64(filter)".
     */
    public function serialize()
    {
        // Casting a float to string depends on the locale before PHP 8 and
        // is truncated to the "precision" setting: a decimal comma would
        // break the comma separated format. var_export() has neither issue.
        $export = static function ($number) {
            return is_float($number) ? var_export($number, true) : (string)$number;
        };

        return implode(',', [
            $export($this->maxSize),
            $export($this->probability),
            base64_encode($this->filter),
        ]);
    }

    /**
     * {@inheritdoc}
     *
     * @throws \UnexpectedValueException
     *   If the payload is malformed or does not match its configuration.
     */
    public function unserialize($serialized)
    {
        $parts = explode(',', $serialized, 3);
        if (3 !== count($parts)) {
            throw new \UnexpectedValueException("Serialized bloom filter is malformed");
        }

        list($maxSize, $probability, $encoded) = $parts;
        $filter = base64_decode($encoded);

        if (!is_numeric($maxSize) || !is_numeric($probability) || false === $filter) {
            throw new \UnexpectedValueException("Serialized bloom filter is malformed");
        }

        // Adding 0 turns a numeric string into the matching int or float.
        $this->maxSize = $maxSize + 0;
        $this->probability = (float)$probability;
        $this->filter = $filter;

        $this->init();

        // The constructor allocates one byte per 8 bits of space: any other
        // length means bit positions would fall outside of the string.
        if (strlen($this->filter) !== (int)ceil($this->space / 8)) {
            throw new \UnexpectedValueException("Serialized bloom filter does not match its configuration");
        }

        // The filter is empty as long as no bit was ever set.
        $this->empty = strspn($this->filter, "\0") === strlen($this->filter);
    }
}
--------------------------------------------------------------------------------