├── LICENSE ├── README.md ├── bloomfilter.php ├── bloomfilter16.php ├── bloomfilter32.php ├── test.ini └── test.php /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, and 10 | distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by the copyright 13 | owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all other entities 16 | that control, are controlled by, or are under common control with that entity. 17 | For the purposes of this definition, "control" means (i) the power, direct or 18 | indirect, to cause the direction or management of such entity, whether by 19 | contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the 20 | outstanding shares, or (iii) beneficial ownership of such entity. 21 | 22 | "You" (or "Your") shall mean an individual or Legal Entity exercising 23 | permissions granted by this License. 24 | 25 | "Source" form shall mean the preferred form for making modifications, including 26 | but not limited to software source code, documentation source, and configuration 27 | files. 28 | 29 | "Object" form shall mean any form resulting from mechanical transformation or 30 | translation of a Source form, including but not limited to compiled object code, 31 | generated documentation, and conversions to other media types. 32 | 33 | "Work" shall mean the work of authorship, whether in Source or Object form, made 34 | available under the License, as indicated by a copyright notice that is included 35 | in or attached to the work (an example is provided in the Appendix below). 
36 | 37 | "Derivative Works" shall mean any work, whether in Source or Object form, that 38 | is based on (or derived from) the Work and for which the editorial revisions, 39 | annotations, elaborations, or other modifications represent, as a whole, an 40 | original work of authorship. For the purposes of this License, Derivative Works 41 | shall not include works that remain separable from, or merely link (or bind by 42 | name) to the interfaces of, the Work and Derivative Works thereof. 43 | 44 | "Contribution" shall mean any work of authorship, including the original version 45 | of the Work and any modifications or additions to that Work or Derivative Works 46 | thereof, that is intentionally submitted to Licensor for inclusion in the Work 47 | by the copyright owner or by an individual or Legal Entity authorized to submit 48 | on behalf of the copyright owner. For the purposes of this definition, 49 | "submitted" means any form of electronic, verbal, or written communication sent 50 | to the Licensor or its representatives, including but not limited to 51 | communication on electronic mailing lists, source code control systems, and 52 | issue tracking systems that are managed by, or on behalf of, the Licensor for 53 | the purpose of discussing and improving the Work, but excluding communication 54 | that is conspicuously marked or otherwise designated in writing by the copyright 55 | owner as "Not a Contribution." 56 | 57 | "Contributor" shall mean Licensor and any individual or Legal Entity on behalf 58 | of whom a Contribution has been received by Licensor and subsequently 59 | incorporated within the Work. 60 | 61 | 2. Grant of Copyright License. 
62 | 63 | Subject to the terms and conditions of this License, each Contributor hereby 64 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 65 | irrevocable copyright license to reproduce, prepare Derivative Works of, 66 | publicly display, publicly perform, sublicense, and distribute the Work and such 67 | Derivative Works in Source or Object form. 68 | 69 | 3. Grant of Patent License. 70 | 71 | Subject to the terms and conditions of this License, each Contributor hereby 72 | grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, 73 | irrevocable (except as stated in this section) patent license to make, have 74 | made, use, offer to sell, sell, import, and otherwise transfer the Work, where 75 | such license applies only to those patent claims licensable by such Contributor 76 | that are necessarily infringed by their Contribution(s) alone or by combination 77 | of their Contribution(s) with the Work to which such Contribution(s) was 78 | submitted. If You institute patent litigation against any entity (including a 79 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 80 | Contribution incorporated within the Work constitutes direct or contributory 81 | patent infringement, then any patent licenses granted to You under this License 82 | for that Work shall terminate as of the date such litigation is filed. 83 | 84 | 4. Redistribution. 
85 | 86 | You may reproduce and distribute copies of the Work or Derivative Works thereof 87 | in any medium, with or without modifications, and in Source or Object form, 88 | provided that You meet the following conditions: 89 | 90 | You must give any other recipients of the Work or Derivative Works a copy of 91 | this License; and 92 | You must cause any modified files to carry prominent notices stating that You 93 | changed the files; and 94 | You must retain, in the Source form of any Derivative Works that You distribute, 95 | all copyright, patent, trademark, and attribution notices from the Source form 96 | of the Work, excluding those notices that do not pertain to any part of the 97 | Derivative Works; and 98 | If the Work includes a "NOTICE" text file as part of its distribution, then any 99 | Derivative Works that You distribute must include a readable copy of the 100 | attribution notices contained within such NOTICE file, excluding those notices 101 | that do not pertain to any part of the Derivative Works, in at least one of the 102 | following places: within a NOTICE text file distributed as part of the 103 | Derivative Works; within the Source form or documentation, if provided along 104 | with the Derivative Works; or, within a display generated by the Derivative 105 | Works, if and wherever such third-party notices normally appear. The contents of 106 | the NOTICE file are for informational purposes only and do not modify the 107 | License. You may add Your own attribution notices within Derivative Works that 108 | You distribute, alongside or as an addendum to the NOTICE text from the Work, 109 | provided that such additional attribution notices cannot be construed as 110 | modifying the License. 
111 | You may add Your own copyright statement to Your modifications and may provide 112 | additional or different license terms and conditions for use, reproduction, or 113 | distribution of Your modifications, or for any such Derivative Works as a whole, 114 | provided Your use, reproduction, and distribution of the Work otherwise complies 115 | with the conditions stated in this License. 116 | 117 | 5. Submission of Contributions. 118 | 119 | Unless You explicitly state otherwise, any Contribution intentionally submitted 120 | for inclusion in the Work by You to the Licensor shall be under the terms and 121 | conditions of this License, without any additional terms or conditions. 122 | Notwithstanding the above, nothing herein shall supersede or modify the terms of 123 | any separate license agreement you may have executed with Licensor regarding 124 | such Contributions. 125 | 126 | 6. Trademarks. 127 | 128 | This License does not grant permission to use the trade names, trademarks, 129 | service marks, or product names of the Licensor, except as required for 130 | reasonable and customary use in describing the origin of the Work and 131 | reproducing the content of the NOTICE file. 132 | 133 | 7. Disclaimer of Warranty. 134 | 135 | Unless required by applicable law or agreed to in writing, Licensor provides the 136 | Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, 137 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, 138 | including, without limitation, any warranties or conditions of TITLE, 139 | NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are 140 | solely responsible for determining the appropriateness of using or 141 | redistributing the Work and assume any risks associated with Your exercise of 142 | permissions under this License. 143 | 144 | 8. Limitation of Liability. 
145 | 146 | In no event and under no legal theory, whether in tort (including negligence), 147 | contract, or otherwise, unless required by applicable law (such as deliberate 148 | and grossly negligent acts) or agreed to in writing, shall any Contributor be 149 | liable to You for damages, including any direct, indirect, special, incidental, 150 | or consequential damages of any character arising as a result of this License or 151 | out of the use or inability to use the Work (including but not limited to 152 | damages for loss of goodwill, work stoppage, computer failure or malfunction, or 153 | any and all other commercial damages or losses), even if such Contributor has 154 | been advised of the possibility of such damages. 155 | 156 | 9. Accepting Warranty or Additional Liability. 157 | 158 | While redistributing the Work or Derivative Works thereof, You may choose to 159 | offer, and charge a fee for, acceptance of support, warranty, indemnity, or 160 | other liability obligations and/or rights consistent with this License. However, 161 | in accepting such obligations, You may act only on Your own behalf and on Your 162 | sole responsibility, not on behalf of any other Contributor, and only if You 163 | agree to indemnify, defend, and hold each Contributor harmless for any liability 164 | incurred by, or claims asserted against, such Contributor by reason of your 165 | accepting any such warranty or additional liability. 166 | 167 | END OF TERMS AND CONDITIONS 168 | 169 | APPENDIX: How to apply the Apache License to your work 170 | 171 | To apply the Apache License to your work, attach the following boilerplate 172 | notice, with the fields enclosed by brackets "[]" replaced with your own 173 | identifying information. (Don't include the brackets!) The text should be 174 | enclosed in the appropriate comment syntax for the file format. 
We also 175 | recommend that a file or class name and description of purpose be included on 176 | the same "printed page" as the copyright notice for easier identification within 177 | third-party archives. 178 | 179 | Copyright [yyyy] [name of copyright owner] 180 | 181 | Licensed under the Apache License, Version 2.0 (the "License"); 182 | you may not use this file except in compliance with the License. 183 | You may obtain a copy of the License at 184 | 185 | http://www.apache.org/licenses/LICENSE-2.0 186 | 187 | Unless required by applicable law or agreed to in writing, software 188 | distributed under the License is distributed on an "AS IS" BASIS, 189 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 190 | See the License for the specific language governing permissions and 191 | limitations under the License. 192 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [blog](http://www.xuetech.com/search/label/Bloom%20Filter) 2 | ====== 3 | 4 | php-bloom-filter 5 | ================ 6 | * This is a fast (possibly the fastest?) single-threaded Bloom filter implementation in pure PHP. 7 | * Unlike many other implementations, it has no dependencies on external modules. 8 | * It uses a binary string to store the bit vector and manipulates it through byte indexes into the string. 9 | * [Apache 2.0 License](https://raw.github.com/dsx724/php-bloom-filter/master/LICENSE). 10 | 11 | There are 3 implementations with slightly different performance characteristics: 12 | * a 1GB vector implementation (8-40 bit addressing) with any hash. 13 | * a 512MB vector implementation (32 bit addressing) with MD5. 14 | * an 8KB (64Kbit) vector implementation (16 bit addressing) with MD5. 
15 | 16 | math 17 | ==== 18 | * m vector bits 19 | * k hash functions 20 | * n elements 21 | * p probability of false positive 22 | 23 | * (1-(1-1/m)^(m*ln(2)))^(m*ln(2)/n)=p 24 | * k = m*ln(2)/n; 25 | 26 | performance 27 | =========== 28 | On a 4.4GHz G3258 system, the single-threaded insert / lookup throughput, in elements per second, with six bits set per key (k = 6) is: 29 | 30 | On PHP 5.5: 31 | * a 1GB vector implementation (8-40 bit addressing) with any hash: 209K / 329K 32 | * a 512MB vector implementation (32 bit addressing) with MD5: 255K / 407K 33 | * an 8KB (64Kbit) vector implementation (16 bit addressing) with MD5: 347K / 602K 34 | 35 | On HHVM: 36 | * a 1GB vector implementation (8-40 bit addressing) with any hash: 386K / 608K 37 | * a 512MB vector implementation (32 bit addressing) with MD5: 510K / 874K 38 | * an 8KB (64Kbit) vector implementation (16 bit addressing) with MD5: 429K / 675K 39 | 40 | It is faster than the following implementations by a significant margin: 41 | 42 | * https://github.com/mrspartak/php.bloom.filter 43 | * https://code.google.com/p/php-bloom-filter/ 44 | * https://packagist.org/packages/pleonasm/bloom-filter 45 | 46 | MD5 hashing is one of the more expensive operations and PHP does not have a native implementation of xxHash or MurmurHash3. 47 | 48 | cautionary tales 49 | ================ 50 | * PHP Limitations 51 | * Strings are limited to byte addressing of signed 32 bit integers. The maximum string is only 2GB - 1B (2^31-1 Bytes). 52 | * The bit vector only supports powers of 2 bits in this implementation. Thus the largest vector size is 1GB. 53 | * A workaround using multiple strings could allow for implementations greater than 1GB. 54 | * PHP 5.4+ 55 | * PHP lacks calloc or malloc, so str_repeat is used to allocate the bit array. 56 | * PHP cannot directly use the output of str_repeat, so the primitive assignment temporarily requires double the memory of the vector size due to the copy. 
57 | -------------------------------------------------------------------------------- /bloomfilter.php: -------------------------------------------------------------------------------- 1 | m != $bf2->m) throw new Exception('Unable to merge due to vector difference.'); 39 | if ($bf1->k != $bf2->k) throw new Exception('Unable to merge due to hash count difference.'); 40 | if ($bf1->hash != $bf2->hash) throw new Exception('Unable to merge due to hash difference.'); 41 | $length = strlen($bfout->bit_array); 42 | if ($union){ 43 | $bfout->bit_array = $bf1->bit_array | $bf2->bit_array; 44 | $bfout->n = $bf1->n + $bf2->n; 45 | } else { 46 | $bfout->bit_array = $bf1->bit_array & $bf2->bit_array; 47 | $bfout->n = abs($bf1->n - $bf2->n); 48 | } 49 | } 50 | public static function createFromProbability($n, $p){ 51 | if ($p <= 0 || $p >= 1) throw new Exception('Invalid false positive rate requested.'); 52 | if ($n <= 0) throw new Exception('Invalid capacity requested.'); 53 | $k = floor(log(1/$p,2)); 54 | $m = pow(2,ceil(log(-$n*log($p)/pow(log(2),2),2))); //approximate estimator method 55 | return new self($m,$k); 56 | } 57 | public static function getUnion($bf1,$bf2){ 58 | $bf = new self($bf1->m,$bf1->k,$bf1->hash); 59 | self::merge($bf1,$bf2,$bf,true); 60 | return $bf; 61 | } 62 | public static function getIntersection($bf1,$bf2){ 63 | $bf = new self($bf1->m,$bf1->k,$bf1->hash); 64 | self::merge($bf1,$bf2,$bf,false); 65 | return $bf; 66 | } 67 | private $n = 0; // # of entries 68 | private $m; // # of bits in array 69 | private $k; // # of hash functions 70 | private $hash; 71 | private $mask; 72 | private $chunk_size; // # of bytes to push off hash to generate an address 73 | private $bit_array; // data structure 74 | public function __construct($m, $k, $h='md5'){ 75 | if ($m < 8) throw new Exception('The bit array length must be at least 8 bits.'); 76 | if (($m & ($m - 1)) !== 0) throw new Exception('The bit array length must be power of 2.'); 77 | if ($m > 8589934592) 
throw new Exception('The maximum data structure size is 1GB.'); 78 | $this->m = $m; //number of bits 79 | $this->k = $k; 80 | $this->hash = $h; 81 | $address_bits = (int)log($m,2); 82 | $this->mask = (1 << $address_bits) - 8; 83 | $this->chunk_size = (int)ceil($address_bits / 8); 84 | $this->hash_times = ((int)ceil($this->chunk_size * $this->k / strlen(hash($this->hash,null,true)))) - 1; 85 | $this->bit_array = (binary)(str_repeat("\0",$this->getArraySize(true))); 86 | } 87 | public function calculateProbability($n = 0){ 88 | return pow(1-pow(1-1/$this->m,$this->k*($n ?: $this->n)),$this->k); 89 | // return pow(1-exp($this->k*($n ?: $this->n)/$this->m),$this->k); //approximate estimator 90 | } 91 | public function calculateCapacity($p){ 92 | return floor($this->m*log(2)/log($p,1-pow(1-1/$this->m,$this->m*log(2)))); 93 | } 94 | public function getElementCount(){ 95 | return $this->n; 96 | } 97 | public function getArraySize($bytes = false){ 98 | return $this->m >> ($bytes ? 3 : 0); 99 | } 100 | public function getHashCount(){ 101 | return $this->k; 102 | } 103 | public function getInfo($p = null){ 104 | $units = array('','K','M','G','T','P','E','Z','Y'); 105 | $M = $this->getArraySize(true); 106 | $magnitude = intval(floor(log($M,1024))); 107 | $unit = $units[$magnitude]; 108 | $M /= pow(1024,$magnitude); 109 | return 'Allocated '.$this->getArraySize().' bits ('.$M.' '.$unit.'Bytes)'.PHP_EOL. 110 | 'Using '.$this->getHashCount(). ' ('.($this->chunk_size << 3).'b) hashes'.PHP_EOL. 111 | 'Contains '.$this->getElementCount().' elements'.PHP_EOL. 112 | (isset($p) ? 'Capacity of '.number_format($this->calculateCapacity($p)).' 
(p='.$p.')'.PHP_EOL : ''); 113 | } 114 | public function add($key){ 115 | $hash = hash($this->hash,$key,true); 116 | for ($i = 0; $i < $this->hash_times; $i++) $hash .= hash($this->hash,$hash,true); 117 | for ($index = 0; $index < $this->k; $index++){ 118 | $hash_sub = hexdec(unpack('H*',substr($hash,$index*$this->chunk_size,$this->chunk_size))[1]); 119 | $word = ($hash_sub & $this->mask) >> 3; 120 | $this->bit_array[$word] = $this->bit_array[$word] | chr(1 << ($hash_sub & 7)); 121 | } 122 | $this->n++; 123 | } 124 | public function contains($key){ 125 | $hash = hash($this->hash,$key,true); 126 | for ($i = 0; $i < $this->hash_times; $i++) $hash .= hash($this->hash,$hash,true); 127 | for ($index = 0; $index < $this->k; $index++){ 128 | $hash_sub = hexdec(unpack('H*',substr($hash,$index*$this->chunk_size,$this->chunk_size))[1]); 129 | if ((ord($this->bit_array[($hash_sub & $this->mask) >> 3]) & (1 << ($hash_sub & 7))) === 0) return false; 130 | } 131 | return true; 132 | } 133 | public function unionWith($bf){ 134 | self::merge($this,$bf,$this,true); 135 | } 136 | public function intersectWith($bf){ 137 | self::merge($this,$bf,$this,false); 138 | } 139 | } 140 | ?> 141 | -------------------------------------------------------------------------------- /bloomfilter16.php: -------------------------------------------------------------------------------- 1 | m != $bf2->m) throw new Exception('Unable to merge due to vector difference.'); 36 | if ($bf1->k != $bf2->k) throw new Exception('Unable to merge due to hash count difference.'); 37 | $length = strlen($bfout->bit_array); 38 | if ($union){ 39 | $bfout->bit_array = $bf1->bit_array | $bf2->bit_array; 40 | $bfout->n = $bf1->n + $bf2->n; 41 | } else { 42 | $bfout->bit_array = $bf1->bit_array & $bf2->bit_array; 43 | $bfout->n = abs($bf1->n - $bf2->n); 44 | } 45 | } 46 | public static function createFromProbability($n, $p){ 47 | if ($p <= 0 || $p >= 1) throw new Exception('Invalid false positive rate requested.'); 48 | if 
($n <= 0) throw new Exception('Invalid capacity requested.'); 49 | $k = floor(log(1/$p,2)); 50 | $m = pow(2,ceil(log(-$n*log($p)/pow(log(2),2),2))); //approximate estimator method 51 | return new self($m,$k); 52 | } 53 | public static function getUnion($bf1,$bf2){ 54 | $bf = new self($bf1->m,$bf1->k,$bf1->hash); 55 | self::merge($bf1,$bf2,$bf,true); 56 | return $bf; 57 | } 58 | public static function getIntersection($bf1,$bf2){ 59 | $bf = new self($bf1->m,$bf1->k,$bf1->hash); 60 | self::merge($bf1,$bf2,$bf,false); 61 | return $bf; 62 | } 63 | private $n = 0; // # of entries 64 | private $m; // # of bits in array 65 | private $k; // # of hash functions 66 | private $mask; 67 | private $hash_size; 68 | private $bit_array; // data structure 69 | public function __construct($m, $k){ 70 | if ($m < 8) throw new Exception('The bit array length must be at least 8 bits.'); 71 | if (($m & ($m - 1)) !== 0) throw new Exception('The bit array length must be power of 2.'); 72 | if ($m > 65536) throw new Exception('The maximum data structure size is 8KB.'); 73 | if ($k > 8) throw new Exception('The maximum bits to set is 8.'); 74 | $this->m = $m; 75 | $this->k = $k; 76 | $this->k2 = $k * 2; 77 | $address_bits = (int)log($m,2); 78 | $this->mask = (1 << $address_bits) - 8; 79 | $this->bit_array = (binary)(str_repeat("\0",$this->getArraySize(true))); 80 | } 81 | public function calculateProbability($n = 0){ 82 | return pow(1-pow(1-1/$this->m,$this->k*($n ?: $this->n)),$this->k); 83 | // return pow(1-exp($this->k*($n ?: $this->n)/$this->m),$this->k); //approximate estimator 84 | } 85 | public function calculateCapacity($p){ 86 | return floor($this->m*log(2)/log($p,1-pow(1-1/$this->m,$this->m*log(2)))); 87 | } 88 | public function getElementCount(){ 89 | return $this->n; 90 | } 91 | public function getArraySize($bytes = false){ 92 | return $this->m >> ($bytes ? 
3 : 0); 93 | } 94 | public function getHashCount(){ 95 | return $this->k; 96 | } 97 | public function getInfo($p = null){ 98 | $units = array('','K','M','G','T','P','E','Z','Y'); 99 | $M = $this->getArraySize(true); 100 | $magnitude = intval(floor(log($M,1024))); 101 | $unit = $units[$magnitude]; 102 | $M /= pow(1024,$magnitude); 103 | return 'Allocated '.$this->getArraySize().' bits ('.$M.' '.$unit.'Bytes)'.PHP_EOL. 104 | 'Using '.$this->getHashCount(). ' (16b) hashes'.PHP_EOL. 105 | 'Contains '.$this->getElementCount().' elements'.PHP_EOL. 106 | (isset($p) ? 'Capacity of '.number_format($this->calculateCapacity($p)).' (p='.$p.')'.PHP_EOL : ''); 107 | } 108 | public function add($key){ 109 | $hash = md5($key,true); 110 | for ($index = 0; $index < $this->k2; $index++){ 111 | $hash_sub = (ord($hash[$index++]) << 8) | ord($hash[$index]); 112 | $word = ($hash_sub & $this->mask) >> 3; 113 | $this->bit_array[$word] = $this->bit_array[$word] | chr(1 << ($hash_sub & 7)); 114 | } 115 | $this->n++; 116 | } 117 | public function contains($key){ 118 | $hash = md5($key,true); 119 | for ($index = 0; $index < $this->k2; $index++){ 120 | $hash_sub = (ord($hash[$index++]) << 8) | ord($hash[$index]); 121 | if ((ord($this->bit_array[($hash_sub & $this->mask) >> 3]) & (1 << ($hash_sub & 7))) === 0) return false; 122 | } 123 | return true; 124 | } 125 | public function unionWith($bf){ 126 | self::merge($this,$bf,$this,true); 127 | } 128 | public function intersectWith($bf){ 129 | self::merge($this,$bf,$this,false); 130 | } 131 | } 132 | ?> 133 | -------------------------------------------------------------------------------- /bloomfilter32.php: -------------------------------------------------------------------------------- 1 | m != $bf2->m) throw new Exception('Unable to merge due to vector difference.'); 36 | if ($bf1->k != $bf2->k) throw new Exception('Unable to merge due to hash count difference.'); 37 | $length = strlen($bfout->bit_array); 38 | if ($union){ 39 | $bfout->bit_array 
= $bf1->bit_array | $bf2->bit_array; 40 | $bfout->n = $bf1->n + $bf2->n; 41 | } else { 42 | $bfout->bit_array = $bf1->bit_array & $bf2->bit_array; 43 | $bfout->n = abs($bf1->n - $bf2->n); 44 | } 45 | } 46 | public static function createFromProbability($n, $p){ 47 | if ($p <= 0 || $p >= 1) throw new Exception('Invalid false positive rate requested.'); 48 | if ($n <= 0) throw new Exception('Invalid capacity requested.'); 49 | $k = floor(log(1/$p,2)); 50 | $m = pow(2,ceil(log(-$n*log($p)/pow(log(2),2),2))); //approximate estimator method 51 | return new self($m,$k); 52 | } 53 | public static function getUnion($bf1,$bf2){ 54 | $bf = new self($bf1->m,$bf1->k,$bf1->hash); 55 | self::merge($bf1,$bf2,$bf,true); 56 | return $bf; 57 | } 58 | public static function getIntersection($bf1,$bf2){ 59 | $bf = new self($bf1->m,$bf1->k,$bf1->hash); 60 | self::merge($bf1,$bf2,$bf,false); 61 | return $bf; 62 | } 63 | private $n = 0; // # of entries 64 | private $m; // # of bits in array 65 | private $k; // # of hash functions 66 | private $mask; 67 | private $chunk_size; // # of bytes to push off hash to generate an address 68 | private $hash_size; 69 | private $bit_array; // data structure 70 | public function __construct($m, $k){ 71 | if ($m < 8) throw new Exception('The bit array length must be at least 8 bits.'); 72 | if (($m & ($m - 1)) !== 0) throw new Exception('The bit array length must be power of 2.'); 73 | if ($m > 4294967296) throw new Exception('The maximum data structure size is 512MB.'); 74 | $this->m = $m; 75 | $this->k = $k; 76 | $address_bits = (int)log($m,2); 77 | $this->mask = (1 << $address_bits) - 8; 78 | $this->chunk_size = ((int)ceil($address_bits / 32)) << 2; 79 | $this->hash_times = ((int)ceil($this->chunk_size * $this->k / 16)) - 1; 80 | $this->bit_array = (binary)(str_repeat("\0",$this->getArraySize(true))); 81 | } 82 | public function calculateProbability($n = 0){ 83 | return pow(1-pow(1-1/$this->m,$this->k*($n ?: $this->n)),$this->k); 84 | // return 
pow(1-exp($this->k*($n ?: $this->n)/$this->m),$this->k); //approximate estimator 85 | } 86 | public function calculateCapacity($p){ 87 | return floor($this->m*log(2)/log($p,1-pow(1-1/$this->m,$this->m*log(2)))); 88 | } 89 | public function getElementCount(){ 90 | return $this->n; 91 | } 92 | public function getArraySize($bytes = false){ 93 | return $this->m >> ($bytes ? 3 : 0); 94 | } 95 | public function getHashCount(){ 96 | return $this->k; 97 | } 98 | public function getInfo($p = null){ 99 | $units = array('','K','M','G','T','P','E','Z','Y'); 100 | $M = $this->getArraySize(true); 101 | $magnitude = intval(floor(log($M,1024))); 102 | $unit = $units[$magnitude]; 103 | $M /= pow(1024,$magnitude); 104 | return 'Allocated '.$this->getArraySize().' bits ('.$M.' '.$unit.'Bytes)'.PHP_EOL. 105 | 'Using '.$this->getHashCount(). ' ('.($this->chunk_size << 3).'b) hashes'.PHP_EOL. 106 | 'Contains '.$this->getElementCount().' elements'.PHP_EOL. 107 | (isset($p) ? 'Capacity of '.number_format($this->calculateCapacity($p)).' 
(p='.$p.')'.PHP_EOL : ''); 108 | } 109 | public function add($key){ 110 | $hash = md5($key,true); 111 | for ($i = 0; $i < $this->hash_times; $i++) $hash .= md5($hash,true); 112 | for ($index = 0; $index < $this->k; $index++){ 113 | $hash_sub = unpack('L',substr($hash,$index*$this->chunk_size,$this->chunk_size))[1]; 114 | $word = ($hash_sub & $this->mask) >> 3; 115 | $this->bit_array[$word] = $this->bit_array[$word] | chr(1 << ($hash_sub & 7)); 116 | } 117 | $this->n++; 118 | } 119 | public function contains($key){ 120 | $hash = md5($key,true); 121 | for ($i = 0; $i < $this->hash_times; $i++) $hash .= md5($hash,true); 122 | for ($index = 0; $index < $this->k; $index++){ 123 | $hash_sub = unpack('L',substr($hash,$index*$this->chunk_size,$this->chunk_size))[1]; 124 | if ((ord($this->bit_array[($hash_sub & $this->mask) >> 3]) & (1 << ($hash_sub & 7))) === 0) return false; 125 | } 126 | return true; 127 | } 128 | public function unionWith($bf){ 129 | self::merge($this,$bf,$this,true); 130 | } 131 | public function intersectWith($bf){ 132 | self::merge($this,$bf,$this,false); 133 | } 134 | } 135 | ?> 136 | -------------------------------------------------------------------------------- /test.ini: -------------------------------------------------------------------------------- 1 | [php] 2 | display_errors = 'On' 3 | memory_limit = '3G' 4 | max_execution_time = 600 5 | 6 | [test] 7 | class = 'BloomFilter' 8 | include[] = 'bloomfilter.php' -------------------------------------------------------------------------------- /test.php: -------------------------------------------------------------------------------- 1 | $value) ini_set($key,$value); 8 | foreach ($config['test']['include'] as $include) require_once $include; 9 | ?> 10 |
19 | 0.000001; $p /= 10){ 29 | $result = []; 30 | $result[] = $n; 31 | $result[] = $p; 32 | 33 | $filter = $config['test']['class']::createFromProbability($n, $p); 34 | $result[] = $filter->getArraySize(true); 35 | $result[] = $filter->calculateCapacity($p); 36 | 37 | $false_neg = 0; 38 | $false_pos = 0; 39 | 40 | $range = $n * 3; 41 | for ($k = 0; $k < $range; $k+= 3) $filter->add('T'.$k); 42 | $samples = $n * 9; 43 | for ($k = 0; $k < $samples; $k++) { 44 | if ($k % 3 == 0 && $k < $range) $false_neg += !$filter->contains('T'.$k); 45 | else $false_pos += $filter->contains('T'.$k); 46 | } 47 | 48 | $result[] = $false_neg; 49 | $result[] = $false_pos; 50 | $result[] = $false_pos / $samples; 51 | $result[] = ($false_pos / $samples < $p && $false_neg == 0) ? 'PASS' : 'FAIL'; 52 | $results[] = $result; 53 | } 54 | } 55 | 56 | break; 57 | 58 | case '10': 59 | $results[] = ['N Elements','EProb','M Bytes','Capacity','FNeg','FPos','AProb']; 60 | for ($i = 1; $i < 6; $i++){ 61 | $n = (int)pow(10,$i); 62 | for ($p = 0.1; $p > 0.000001; $p /= 10){ 63 | $result = []; 64 | $result[] = $n; 65 | $result[] = $p; 66 | 67 | $filter = $config['test']['class']::createFromProbability($n, $p); 68 | 69 | $result[] = $filter->getArraySize(true); 70 | $result[] = $filter->calculateCapacity($p); 71 | 72 | $false_neg = 0; 73 | $false_pos = 0; 74 | 75 | $range = $n * 3; 76 | for ($k = 0; $k < $range; $k+= 3) $filter->add('T'.$k); 77 | $samples = $n * 9; 78 | for ($k = 0; $k < $samples; $k++) { 79 | if ($k % 3 == 0 && $k < $range) $false_neg += !$filter->contains('T'.$k); 80 | else $false_pos += $filter->contains('T'.$k); 81 | } 82 | 83 | $result[] = $false_neg; 84 | $result[] = $false_pos; 85 | $result[] = $false_pos / $samples; 86 | $result[] = ($false_pos / $samples < $p && $false_neg == 0) ? 
'PASS' : 'FAIL'; 87 | $results[] = $result; 88 | } 89 | } 90 | break; 91 | 92 | case 'U': 93 | $results[] = ['FNeg','FPos','EProb','AProb']; 94 | $result = []; 95 | $capacity = 100000; 96 | $max = 175000; 97 | $p = 0.01; 98 | $filter1 = $config['test']['class']::createFromProbability($capacity, $p); 99 | $filter2 = $config['test']['class']::createFromProbability($capacity, $p); 100 | 101 | 102 | $samples = $capacity * 5; 103 | for ($i = 0; $i < $max; $i+=2) $filter1->add('K'.$i); 104 | for ($i = 0; $i < $max; $i+=3) $filter2->add('K'.$i); 105 | 106 | echo ''.$filter1->getInfo($p).''; 107 | echo '
'.$filter2->getInfo($p).''; 108 | 109 | $filter3 = $config['test']['class']::getUnion($filter1,$filter2); 110 | 111 | $false_neg = 0; 112 | $false_pos = 0; 113 | 114 | for ($i = 0; $i < $samples; $i++){ 115 | if (($i % 2 == 0 || $i % 3 == 0) && $i < $max) $false_neg += !$filter3->contains('K'.$i); 116 | else $false_pos += $filter3->contains('K'.$i); 117 | } 118 | 119 | echo '
'.$filter3->getInfo($p).''; 120 | $result[] = $false_neg; 121 | $result[] = $false_pos; 122 | $result[] = $p; 123 | $result[] = $false_pos / $samples; 124 | $result[] = ($false_pos / $samples < $p && $false_neg == 0) ? 'PASS' : 'FAIL'; 125 | 126 | $results[] = $result; 127 | try { 128 | echo 'Testing Merge '; 129 | $filterx = $config['test']['class']::createFromProbability($capacity, 0.1); 130 | for ($i = 0; $i < $capacity / 100; $i++) $filterx->add($i); 131 | $filterx = $config['test']['class']::getUnion($filterx,$filter3); 132 | echo 'FAIL'.PHP_EOL; 133 | } catch (Exception $e){ 134 | echo 'PASS'.PHP_EOL; 135 | } 136 | 137 | $result = []; 138 | 139 | $filter1->unionWith($filter2); 140 | 141 | $false_neg = 0; 142 | $false_pos = 0; 143 | for ($i = 0; $i < $samples; $i++){ 144 | if (($i % 2 == 0 || $i % 3 == 0) && $i < $max) $false_neg += !$filter1->contains('K'.$i); 145 | else $false_pos += $filter1->contains('K'.$i); 146 | } 147 | echo '
'.$filter1->getInfo($p).''; 148 | $result[] = $false_neg; 149 | $result[] = $false_pos; 150 | $result[] = $p; 151 | $result[] = $false_pos / $samples; 152 | $result[] = ($false_pos / $samples < $p && $false_neg == 0) ? 'PASS' : 'FAIL'; 153 | 154 | $results[] = $result; 155 | break; 156 | 157 | case 'I': 158 | $results[] = ['FNeg','FPos','EProb','AProb']; 159 | $result = []; 160 | $capacity = 100000; 161 | $max = 300000; 162 | $p = 0.01; 163 | $filter1 = $config['test']['class']::createFromProbability($capacity, $p); 164 | $filter2 = $config['test']['class']::createFromProbability($capacity, $p); 165 | 166 | 167 | $samples = $capacity * 5; 168 | for ($i = 0; $i < $max; $i+=2) $filter1->add('K'.$i); 169 | for ($i = 0; $i < $max; $i+=3) $filter2->add('K'.$i); 170 | 171 | echo '
'.$filter1->getInfo($p).''; 172 | echo '
'.$filter2->getInfo($p).''; 173 | 174 | $filter3 = $config['test']['class']::getIntersection($filter1,$filter2); 175 | 176 | $false_neg = 0; 177 | $false_pos = 0; 178 | 179 | for ($i = 0; $i < $samples; $i++){ 180 | if ($i % 2 == 0 && $i % 3 == 0 && $i < $max) $false_neg += !$filter3->contains('K'.$i); 181 | else $false_pos += $filter3->contains('K'.$i); 182 | } 183 | 184 | echo '
'.$filter3->getInfo($p).''; 185 | $result[] = $false_neg; 186 | $result[] = $false_pos; 187 | $result[] = $p; 188 | $result[] = $false_pos / $samples; 189 | $result[] = ($false_pos / $samples < $p && $false_neg == 0) ? 'PASS' : 'FAIL'; 190 | 191 | $results[] = $result; 192 | try { 193 | echo 'Testing Merge '; 194 | $filterx = $config['test']['class']::createFromProbability($capacity, 0.1); 195 | for ($i = 0; $i < $capacity / 100; $i++) $filterx->add($i); 196 | $filterx = $config['test']['class']::getIntersection($filterx,$filter3); 197 | echo 'FAIL'.PHP_EOL; 198 | } catch (Exception $e){ 199 | echo 'PASS'.PHP_EOL; 200 | } 201 | 202 | $result = []; 203 | 204 | $filter1->intersectWith($filter2); 205 | 206 | $false_neg = 0; 207 | $false_pos = 0; 208 | for ($i = 0; $i < $samples; $i++){ 209 | if ($i % 2 == 0 && $i % 3 == 0 && $i < $max) $false_neg += !$filter1->contains('K'.$i); 210 | else $false_pos += $filter1->contains('K'.$i); 211 | } 212 | echo '
'.$filter1->getInfo($p).''; 213 | $result[] = $false_neg; 214 | $result[] = $false_pos; 215 | $result[] = $p; 216 | $result[] = $false_pos / $samples; 217 | $result[] = ($false_pos / $samples < $p && $false_neg == 0) ? 'PASS' : 'FAIL'; 218 | 219 | $results[] = $result; 220 | break; 221 | 222 | case 'MEM': 223 | $capacity = 800000000; 224 | $probability = 0.01; 225 | $filter = $config['test']['class']::createFromProbability($capacity, $probability); 226 | echo '
'.$filter->getInfo($probability).''; 227 | break; 228 | 229 | case 'TEST': 230 | $capacity = 1000000; 231 | $probability = 0.01; 232 | $s1 = microtime(true); 233 | $filter = $config['test']['class']::createFromProbability($capacity, $probability); 234 | $e1 = microtime(true); 235 | 236 | $sample = 1000000; 237 | $offset = 500000; 238 | 239 | $s2 = microtime(true); 240 | for ($i = 0; $i < $sample; $i++) $filter->add($i); 241 | $e2 = microtime(true); 242 | $t = 0; 243 | $s3 = microtime(true); 244 | for ($i = $offset; $i < $sample + $offset; $i++) $t += $filter->contains($i); 245 | $e3 = microtime(true); 246 | 247 | echo '
'; 248 | echo $filter->getInfo($probability).PHP_EOL; 249 | echo 'Create Time: '.($e1 - $s1).PHP_EOL; 250 | echo 'Add Time: '.($e2 - $s2).' ('.floor($sample/($e2-$s2)).' i/s)'.PHP_EOL; 251 | echo 'Check Time: '.($e3 - $s3).' ('.floor($sample/($e3-$s3)).' i/s)'.PHP_EOL; 252 | echo $t; 253 | echo ''; 254 | break; 255 | } 256 | echo '
'.implode(' | ',$row).' |