├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── TODO.md ├── _config.yml ├── pom.xml ├── src ├── main │ └── java │ │ └── com │ │ └── duprasville │ │ └── guava │ │ └── probably │ │ ├── AbstractCuckooStrategy.java │ │ ├── BloomFilter.java │ │ ├── CuckooFilter.java │ │ ├── CuckooStrategies.java │ │ ├── CuckooStrategy.java │ │ ├── CuckooStrategyMurmurBealDupras32.java │ │ ├── CuckooTable.java │ │ ├── ProbabilisticFilter.java │ │ └── package-info.java └── test │ └── java │ └── com │ └── duprasville │ └── guava │ └── probably │ ├── AbstractProbabilisticFilterTest.java │ ├── BloomProbabilisticFilterTest.java │ ├── CuckooFilterProbabilisticFilterTest.java │ ├── CuckooFilterTest.java │ └── CuckooStrategiesTest.java ├── updaterelease.sh └── util ├── deploy_snapshot.sh ├── settings.xml ├── update_snapshot_docs.sh └── util.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Maven 2 | target/ 3 | *.ser 4 | *.ec 5 | 6 | # IntelliJ Idea 7 | .idea/ 8 | out/ 9 | *.ipr 10 | *.iws 11 | *.iml 12 | 13 | # Eclipse 14 | .classpath 15 | .project 16 | .settings/ 17 | .metadata/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | jdk: 3 | - oraclejdk8 4 | - openjdk8 5 | install: mvn install -U -DskipTests=true 6 | script: 7 | - mvn verify -U -Dmaven.javadoc.skip=true 8 | # avoid unnecessary cache updates 9 | - rm -rf $HOME/.m2/repository/com/duprasville/guava/guava-probably 10 | after_success: 11 | - util/update_snapshot_docs.sh 12 | - util/deploy_snapshot.sh 13 | cache: 14 | directories: 15 | - $HOME/.m2 16 | env: 17 | global: 18 | - secure: 
MJY016432HZC8lTgwtyWLzmjaKZNVxHWRZ1M3en4X4u0qsYd9mwpZKjbBry5M2GY1laeMwa2XX/hAicxaSqkijul6NGTkV1tSmAkRHccqyu80+5uQuK0SYRMzXPxZdHGwwYG1X9Myene7HDs2FyGg8UezQphfFQFk0y+0EFuFR4C2hrGUJl8OBjzx/sF9/MhyyUfdZPDHVxpZ/cmw+TspmBZ08IVjl5HJcYQGki/Y2VZ4d7rKfI24EeL5mjivFzWR3j6hWLKpRqbnCtMzcXDs/z8qBzWPIu8D+PZQpZwN6OQM/JDYpDD4Vn/9+FGStbjCGOtSTigvA7639NY7nKiGEWBJTEhREP+7YwovYcj9oPXodTYpNg/Ai1tgCHw2zGBiD2eandCCYhyN2gHcfdq2lJ1PA4nUPNdj1zsTW7IUNbAmJ+gVd4uDvgIML7NPqbgl9hUaEx/DPzVc272sOYJOdtRgcg5sOIA7Ehnky6CWnpKCggv2Lww3dO5bSeMccAoB96RQKNFqXTAynGoBaLZnVZzgCobZ66YAYWSosM0oBzWVwDEkw32MiRoizplF3NfnSARaGzENyCV4Gay611dbXYF9HVrxp3oYFnptEbvvOKmUGgp0HpQiBemzJ0tuabs4Zu47MkCnalrU9K8BkoreaVJk8WP0/KPj53t/fWVUT4= 19 | - secure: uLNJ12pDfeggkdcvV1G2KVgh04H7SkxRBPBsgmgMknu/6MnXZSlJS8uiT7bSId0E3F/ERXCjB/z00vI3+kYpvAYdvsNqGONx6rMJvFs+vn3BtG9q5VSyhULKrnZnkWthRYKQv5EZdQT8WqKVO77m266sX1eBGUtazazqwm8kEZ7yjagfiBNcfoUIdVEP1jCxd/0+Vdq/KRgtejunXmm6rcE9ppMh+j5KG6jy5FxwlfQzDXys+E+NvIsgFmMy7ZQKnivZuEhsQmng4nB8AaZpraOOrooNBJfVdn+VGiL8Pnw1INgOqv89LN5u6Hcc5ztL5Sf2rooSP21KQCLC1ZbtmqjXxng06AA4PIuTKdv75G/huK8M7q+TCWSpe/4aIjnRxOfqetQ3jntj6e1joolHVUWj/9cSOUwGIJg748SA6atN5iDi0GGfORutblD28/2BZzMOEMRKfszT0JGxQ5bsjbDzIbyvT+XrSpZ+exx/DAtMUzvn9OE+xDQW046pFy87UHPeW985vM9M1MVh1frT8RPq8pCIKVp+bLk8gJTvV+etfVkD6yT2YYwxtp/02s3G3IrQVF4WtuwXv0B0HUF94xwzVm+JtHEQ+v2Yq68z7IjApBlabocrUcWMNH1EiDgcscwQJ2xgdNEPyef3ruZYQ2TJaWeo/gyt0CmgfGyDAzc= 20 | - secure: 
Gi1cBOP9PiFiKDHHnId8hXGHjo4heNLN9XSsvKzSFG0fJzn889XWl3b6W4rGT4uqV3TvETe51rI/RdmD2Ru2W4oyoAk5ReE7rIyUFXmEHa1L/rfyNF8L0eAcKHS3ZpGIMk+k15kGJUxRKyloSTiqrXvIV96c7r9tfmZet5482uT+FiMNblrVwlWcguYzpUOpU2nHj57heueCIAWBVjS3/Fk1M8HwLFm07a7N8v9bvqN8Opc1a36FtIKxuHfCBWA0R+VwJLYEeW0RAUPnFQtVnq3ZKxOYqAiH+jdg4gDIX6i12/1VETQTjTkQt9uFVOtIooxWVMHt7Glho3/kBopUlnOOOsfmj40X/YP4u6tafcai0imXNtEVnjhUBX4f6+62QSl8GoJRvYwwKaJ1XL2Bz8W8kFvBj+rPRMF6495J4aAvn47ykxBM/gQvrMusjKyvku34jIpCeZ/XKUpUHQvErTQlp7dE1F94GaW4DDSBAFGBVrHPe0hGfRwRVglRZMmpDYj6yTl9xcsn+8iR2A8cN9JVumye10rKcAYIpy768uWarT4ZXv2eO1otxm2SUPlb6+ryxXKCyr0d6JfvG/o0Yq+Yg3mVBq4jgAvx4MzgP67tOKufBXcQP6+HKtNZrcHD0pWPassGb23io3/On68YObJpddgRBYes+lFLHOpzOmA= 21 | branches: 22 | except: 23 | - gh-pages 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Guava-Probably: Probabilistic Filters 2 | ===================================== 3 | The Guava-Probably project provides two probabilistic filters for Guava. 4 | 5 | [![Build Status](https://travis-ci.org/bdupras/guava-probably.svg?branch=master)](https://travis-ci.org/bdupras/guava-probably) 6 | [![GitHub license](https://img.shields.io/github/license/bdupras/guava-probably.svg)](./LICENSE) 7 | [![Maven Central](https://img.shields.io/maven-central/v/com.duprasville.guava/guava-probably.svg)](https://maven-badges.herokuapp.com/maven-central/com.duprasville.guava/guava-probably) 8 | 9 | # What is it? 10 | A probabilistic filter is a space-efficient data structure for representing a set in order to support membership queries. [ref][BroderMitzenmacher] 11 | 12 | # How does it work? 
13 | Check out this sweet, interactive demo: [Probabilistic Filters By Example](https://bdupras.github.io/filter-tutorial/) 14 | 15 | # What's it good for? 16 | Probabilistic filters are great for reducing unnecessary disk, database or network queries. Applications where the universe of possible members in a set is much larger than actual members may benefit from probabilistic filters, especially when most membership queries are expected to return false. 17 | 18 | # No really, what's it good for? 19 | - Google Chrome uses p-filters to make a preliminary decision whether a particular web site is malicious or safe. [ref][Yakunin] 20 | - Exim mail transfer agent uses p-filters in its rate-limiting logic. [ref][Finch] 21 | - Use a p-filter to reject malicious authentication attempts, protecting your cache and database from botnet queries. 22 | 23 | # Cool, how do I get it? 24 | Requires JDK 8 or higher and Google Guava 19.0 or higher (as of 1.0). 25 | - `1.0`: [API Docs][guava-probably-release-api-docs], 05 July 2019. 26 | 27 | To add a dependency on Guava-Probably using Maven, use the following: 28 | ```xml 29 | <dependency> 30 | <groupId>com.duprasville.guava</groupId> 31 | <artifactId>guava-probably</artifactId> 32 | <version>1.0</version> 33 | </dependency> 34 | ``` 35 | 36 | To add a dependency using Gradle: 37 | ``` 38 | dependencies { 39 | compile 'com.duprasville.guava:guava-probably:1.0' 40 | } 41 | ``` 42 | 43 | # How do I learn more?
44 | - [Probabilistic Filters By Example](https://bdupras.github.io/filter-tutorial/) 45 | - [Cuckoo Filter: Practically Better Than Bloom](https://www.cs.cmu.edu/~dga/papers/cuckoo-conext2014.pdf) 46 | - [Bloom Filters by Example](http://billmill.org/bloomfilter-tutorial/) 47 | - [Google Guava BloomFilter](https://github.com/google/guava/wiki/HashingExplained#bloomfilter) 48 | - [Network Applications of Bloom Filters: A Survey](http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.im/1109191032) 49 | - [Nice Bloom filter application](http://blog.alexyakunin.com/2010/03/nice-bloom-filter-application.html) 50 | - [What use are Bloom filters, anyway?](http://fanf.livejournal.com/82764.html) 51 | 52 | # Links 53 | - [GitHub project](https://github.com/bdupras/guava-probably) 54 | - [Issue tracker: report a defect or feature request](https://github.com/bdupras/guava-probably/issues/new) 55 | 56 | [BroderMitzenmacher]: http://projecteuclid.org/DPubS?service=UI&version=1.0&verb=Display&handle=euclid.im/1109191032 "Network Applications of Bloom Filters: A Survey; Andrei Broder and Michael Mitzenmacher" 57 | [Yakunin]: http://blog.alexyakunin.com/2010/03/nice-bloom-filter-application.html "Nice Bloom filter application" 58 | [Finch]: http://fanf.livejournal.com/82764.html "What use are Bloom filters, anyway?" 
59 | [guava-probably-release-api-docs]: http://bdupras.github.io/guava-probably/releases/1.0/api/docs/ 60 | [guava-probably-snapshot-api-docs]: http://bdupras.github.io/guava-probably/releases/snapshot/api/docs/ 61 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | # Guava-Probably: TODO List 2 | 3 | ## CI 4 | * commit/push to release SNAPSHOT, major, minor, patch :: maven central && javadocs 5 | * simplify travis scripts 6 | 7 | ## Features 8 | * MultiSet interface operations (count, set counts) 9 | * CuckooFilter impl increase max capacity (separate even/odd tables? array of tables?) 10 | * Primitive interface API (to avoid object alloc) 11 | * Direct hash fn invocation (to avoid object alloc) 12 | * extract filter dimensions calculation 13 | * NOTE: knowing if an insertion modified a bloom filter is useful 14 | ** e.g. loop detection in routing algos 15 | ** question: what should the semantic be for returning inserted/not-inserted vs changed/not-changed? 16 | * make deletability optional? 
when off, colliding insertions do not mutate the filter 17 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | # Main site settings 2 | title: Guava-Probably 3 | subtitle: Probabilistic data structures for Guava 4 | description: Probabilistic data structures for Guava 5 | baseurl: /guava-probably 6 | url: http://bdupras.github.io 7 | permalink: /news/:year/:month/:day/:title/ 8 | 9 | exclude: 10 | - "Gemfile*" 11 | - "README.md" 12 | - "*.sh" 13 | 14 | # GitHub-flavored Markdown support 15 | markdown: kramdown 16 | kramdown: 17 | input: GFM 18 | 19 | # Set by default by GitHub pages (can't be changed) 20 | safe: true 21 | lsi: false 22 | # source: 23 | 24 | # Set by default by GitHub pages (can be changed) 25 | # highlighter: pygments 26 | # github: (https://help.github.com/articles/repository-metadata-on-github-pages/) 27 | 28 | # Collections 29 | collections: 30 | releases: 31 | output: true 32 | permalink: /:collection/:path/ 33 | 34 | # Release data 35 | # Do not change! updaterelease.sh automatically updates these fields 36 | latest_release: 1.0 37 | latest_snapshot: 1.0-SNAPSHOT 38 | 39 | # Miscellaneous data 40 | email: brian@duprasville.com 41 | twitter_username: briandupras 42 | github_username: bdupras/guava-probably 43 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | com.duprasville.guava 4 | guava-probably 5 | jar 6 | 1.1-SNAPSHOT 7 | Guava-Probably: Probabilistic Data Structures extension for Guava 8 | 9 | Guava-Probably is an extension library to Google Guava that adds probabilistic data structures 10 | and related interfaces. 
11 | 12 | https://github.com/bdupras/guava-probably 13 | 14 | 15 | The Apache License, Version 2.0 16 | http://www.apache.org/licenses/LICENSE-2.0.txt 17 | 18 | 19 | 20 | 21 | Brian Dupras 22 | brian@duprasville.com 23 | bdupras 24 | https://github.com/bdupras 25 | 26 | 27 | 28 | scm:git:git@github.com:bdupras/guava-probably.git 29 | scm:git:git@github.com:bdupras/guava-probably.git 30 | git@github.com:bdupras/guava-probably.git 31 | HEAD 32 | 33 | 34 | UTF-8 35 | 36 | 37 | 38 | 39 | ossrh 40 | https://oss.sonatype.org/service/local/staging/deploy/maven2/ 41 | 42 | 43 | ossrh 44 | https://oss.sonatype.org/content/repositories/snapshots 45 | 46 | 47 | 48 | 49 | 50 | org.apache.maven.plugins 51 | maven-compiler-plugin 52 | 3.8.1 53 | 54 | 1.8 55 | 1.8 56 | 57 | 58 | 59 | org.sonatype.plugins 60 | nexus-staging-maven-plugin 61 | 1.6.3 62 | true 63 | 64 | ossrh 65 | https://oss.sonatype.org/ 66 | true 67 | 68 | 69 | 70 | maven-release-plugin 71 | 2.5.3 72 | 73 | v@{project.version} 74 | true 75 | release 76 | 77 | 78 | 79 | org.apache.maven.scm 80 | maven-scm-provider-gitexe 81 | 1.9.5 82 | 83 | 84 | 85 | 86 | org.apache.maven.plugins 87 | maven-source-plugin 88 | 2.2.1 89 | 90 | 91 | attach-sources 92 | 93 | jar-no-fork 94 | 95 | 96 | 97 | 98 | 99 | org.apache.maven.plugins 100 | maven-javadoc-plugin 101 | 2.9.1 102 | 103 | 104 | attach-javadocs 105 | 106 | jar 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | com.google.guava 117 | guava 118 | 19.0 119 | 120 | 121 | com.google.guava 122 | guava-testlib 123 | 19.0 124 | 125 | 126 | com.google.guava 127 | guava-tests 128 | 19.0 129 | 130 | 131 | junit 132 | junit 133 | 4.13.1 134 | test 135 | 136 | 137 | com.google.code.findbugs 138 | jsr305 139 | 2.0.1 140 | 141 | 142 | com.google.truth 143 | truth 144 | 0.28 145 | 146 | 147 | 148 | 149 | release 150 | 151 | 152 | 153 | maven-source-plugin 154 | 3.0.1 155 | 156 | 157 | attach-sources 158 | 159 | jar 160 | 161 | 162 | 163 | 164 | 165 | org.sonatype.plugins 
166 | nexus-staging-maven-plugin 167 | 1.6.3 168 | true 169 | 170 | ossrh 171 | https://oss.sonatype.org/ 172 | true 173 | 174 | 175 | 176 | org.apache.maven.plugins 177 | maven-gpg-plugin 178 | 1.6 179 | 180 | 181 | sign-artifacts 182 | verify 183 | 184 | sign 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | jdk8 194 | 195 | [1.8,) 196 | 197 | 198 | 199 | 200 | 201 | org.apache.maven.plugins 202 | maven-javadoc-plugin 203 | 204 | -Xdoclint:none 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | org.apache.maven.plugins 213 | maven-javadoc-plugin 214 | 215 | -Xdoclint:none 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | -------------------------------------------------------------------------------- /src/main/java/com/duprasville/guava/probably/AbstractCuckooStrategy.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Brian Dupras 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5 | * in compliance with the License. You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software distributed under the License 10 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 | * or implied. See the License for the specific language governing permissions and limitations under 12 | * the License. 
13 | */ 14 | 15 | package com.duprasville.guava.probably; 16 | /** Base class for cuckoo strategies. It implements the bulk table operations ({@code addAll}, {@code equivalent}, {@code containsAll}, {@code removeAll}) and relocation-based insertion on top of four hooks that subclasses supply: {@code index}, {@code altIndex}, {@code pickEntryToKick} and {@code maxRelocationAttempts}. Equality and hash code depend only on {@link #ordinal()}. */ 17 | abstract class AbstractCuckooStrategy implements CuckooStrategy { 18 | /** @param ordinal value reported by {@link #ordinal()} */ AbstractCuckooStrategy(int ordinal) { 19 | this.ordinal = ordinal; 20 | } 21 | /** Returns the bucket index for {@code hash}. NOTE(review): {@code m} is presumably the number of buckets, as it is for {@link #altIndex} - confirm. */ 22 | public abstract long index(int hash, long m); 23 | /** Returns the alternate bucket for {@code fingerprint} given one of its buckets, {@code index}. Within this class {@code m} is always a table's {@code numBuckets}. */ 24 | public abstract long altIndex(long index, int fingerprint, long m); 25 | /** Chooses the entry position within a bucket whose occupant {@code putEntry} will displace. */ 26 | protected abstract int pickEntryToKick(int numEntriesPerBucket); 27 | /** Relocation limit: {@code putEntry} gives up once its kick counter equals this value. */ 28 | protected abstract long maxRelocationAttempts(); 29 | /** Identifier of this strategy; the only input to {@code equals} and {@code hashCode}. */ 30 | private final int ordinal; 31 | /** Returns the ordinal passed to the constructor. */ 32 | public int ordinal() { 33 | return ordinal; 34 | } 35 | /** Inserts every non-empty fingerprint of {@code that} into {@code thiz}, trying the same bucket index first and the alternate bucket second. Returns {@code false} as soon as one fingerprint cannot be placed (fingerprints placed before that point remain in {@code thiz}), {@code true} otherwise. NOTE(review): unlike the other bulk operations this method does not call {@code isCompatible}; presumably callers verify compatibility first - confirm. */ 36 | public boolean addAll(CuckooTable thiz, CuckooTable that) { 37 | for (long index = 0; index < that.numBuckets; index++) { 38 | for (int entry = 0; entry < that.numEntriesPerBucket; entry++) { 39 | int fingerprint = that.readEntry(index, entry); 40 | if (CuckooTable.EMPTY_ENTRY != fingerprint && !( 41 | putEntry(fingerprint, thiz, index) || 42 | putEntry(fingerprint, thiz, 43 | altIndex(index, fingerprint, thiz.numBuckets)))) { 44 | return false; 45 | } 46 | } 47 | } 48 | return true; 49 | } 50 | /** Stores {@code fingerprint} in bucket {@code index}: first by swapping it against an {@code EMPTY_ENTRY} in that bucket, otherwise by kicking out a resident entry (kick depth starts at 0). NOTE(review): assumes {@code swapAnyEntry(in, out, index)} replaces one entry equal to {@code out} with {@code in} and returns whether it did - confirm against CuckooTable. */ 51 | protected boolean putEntry(int fingerprint, CuckooTable table, long index) { 52 | return table.swapAnyEntry(fingerprint, CuckooTable.EMPTY_ENTRY, index) 53 | || putEntry(fingerprint, table, index, 0); 54 | } 55 | 56 | /** Writes {@code fingerprint} into the slot chosen by {@code pickEntryToKick} and recursively re-homes the displaced value in its alternate bucket with {@code kick + 1}. Returns {@code false} immediately when {@code kick} equals {@code maxRelocationAttempts()}; when the recursive relocation fails, the displaced value is swapped back into its slot before returning {@code false}. */ 57 | protected boolean putEntry(int fingerprint, final CuckooTable table, long index, int kick) { 58 | if (maxRelocationAttempts() == kick) { 59 | return false; 60 | } 61 | 62 | int entry = pickEntryToKick(table.numEntriesPerBucket); 63 | int kicked = table.swapEntry(fingerprint, index, entry); 64 | 65 | if ((CuckooTable.EMPTY_ENTRY == kicked) 66 | || putEntry(kicked, table, altIndex(index, kicked, table.numBuckets), kick + 1)) { 67 | return true; 68 | } else { 69 | int kickedBack = table.swapEntry(kicked, index, entry); 70 | assert kickedBack == fingerprint : "Uh oh - couldn't unroll failed attempts to putEntry()"; 71 | return false; 72 | } 73 | } 74 | /** Returns {@code true} when the tables are compatible and, for every non-empty fingerprint of {@code that}, both tables hold the same number of copies across the fingerprint's bucket and alternate bucket. NOTE(review): only fingerprints present in {@code that} are examined; presumably callers also compare sizes - confirm. */ 75 | public boolean
equivalent(CuckooTable thiz, CuckooTable that) { 76 | if (!thiz.isCompatible(that)) { 77 | return false; 78 | } 79 | 80 | for (long index = 0; index < that.numBuckets; index++) { 81 | for (int entry = 0; entry < that.numEntriesPerBucket; entry++) { 82 | int fingerprint = that.readEntry(index, entry); 83 | if (CuckooTable.EMPTY_ENTRY == fingerprint) { 84 | continue; 85 | } 86 | 87 | int thizCount = thiz.countEntry(fingerprint, index) + 88 | thiz.countEntry(fingerprint, altIndex(index, fingerprint, thiz.numBuckets)); 89 | int thatCount = that.countEntry(fingerprint, index) + 90 | that.countEntry(fingerprint, altIndex(index, fingerprint, that.numBuckets)); 91 | if (thizCount != thatCount) { 92 | return false; 93 | } 94 | } 95 | } 96 | return true; 97 | } 98 | /** Returns {@code true} when the tables are compatible and, for every non-empty fingerprint of {@code that}, {@code thiz} holds at least as many copies as {@code that} across the fingerprint's bucket and alternate bucket. */ 99 | public boolean containsAll(CuckooTable thiz, CuckooTable that) { 100 | if (!thiz.isCompatible(that)) { 101 | return false; 102 | } 103 | 104 | for (long index = 0; index < that.numBuckets; index++) { 105 | for (int entry = 0; entry < that.numEntriesPerBucket; entry++) { 106 | int fingerprint = that.readEntry(index, entry); 107 | if (CuckooTable.EMPTY_ENTRY == fingerprint) { 108 | continue; 109 | } 110 | 111 | int thizCount = thiz.countEntry(fingerprint, index) + 112 | thiz.countEntry(fingerprint, altIndex(index, fingerprint, thiz.numBuckets)); 113 | int thatCount = that.countEntry(fingerprint, index) + 114 | that.countEntry(fingerprint, altIndex(index, fingerprint, that.numBuckets)); 115 | if (thizCount < thatCount) { 116 | return false; 117 | } 118 | } 119 | } 120 | return true; 121 | } 122 | /** Removes from {@code thiz} one copy of every fingerprint stored in {@code that}, looking in the fingerprint's bucket first and its alternate bucket second. Returns {@code false} when the tables are incompatible, or as soon as a fingerprint cannot be found in {@code thiz} (removals made before that point are not undone); returns {@code true} otherwise. */ 123 | public boolean removeAll(CuckooTable thiz, CuckooTable that) { 124 | if (!thiz.isCompatible(that)) { 125 | return false; 126 | } 127 | 128 | for (long index = 0; index < that.numBuckets; index++) { 129 | for (int entry = 0; entry < that.numEntriesPerBucket; entry++) { 130 | int fingerprint = that.readEntry(index, entry); 131 | if (CuckooTable.EMPTY_ENTRY == fingerprint) { 132 | continue; 133 | } 134 | 135 | long
altIndex = altIndex(index, fingerprint, thiz.numBuckets); 136 | /* Remove exactly one copy per occupied slot of 'that'. The enclosing loops visit 137 | * every slot once, so this removes as many copies as 'that' actually holds. Removing 138 | * (count across both buckets) copies on every visit would attempt k*k removals for k 139 | * duplicates, over-removing from 'thiz' or failing although all entries were present. */ 140 | if (!(thiz.swapAnyEntry(CuckooTable.EMPTY_ENTRY, fingerprint, index) 141 | || thiz.swapAnyEntry(CuckooTable.EMPTY_ENTRY, fingerprint, altIndex))) { 142 | return false; 143 | } 144 | } 145 | } 146 | return true; 147 | } 148 | /** Strategies are equal when their ordinals match; for arguments that are not a {@code CuckooStrategy} this falls back to {@code Object.equals}. */ 149 | @Override 150 | public boolean equals(Object obj) { 151 | if (obj instanceof CuckooStrategy) { 152 | return ((CuckooStrategy) obj).ordinal() == this.ordinal(); 153 | } else { 154 | return super.equals(obj); 155 | } 156 | } 157 | /** Hash code is the ordinal, consistent with {@code equals}. */ 158 | @Override 159 | public int hashCode() { 160 | return this.ordinal(); 161 | } 162 | /** Renders as {@code SimpleClassName{ordinal=N}}. */ 163 | @Override 164 | public String toString() { 165 | return this.getClass().getSimpleName() + '{' + 166 | "ordinal=" + this.ordinal() + 167 | '}'; 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /src/main/java/com/duprasville/guava/probably/BloomFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Brian Dupras 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5 | * in compliance with the License. You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software distributed under the License 10 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 | * or implied. See the License for the specific language governing permissions and limitations under 12 | * the License.
13 | */ 14 | 15 | package com.duprasville.guava.probably; 16 | 17 | import com.google.common.hash.Funnel; 18 | import com.google.common.math.LongMath; 19 | 20 | import java.io.Serializable; 21 | import java.util.Collection; 22 | 23 | import javax.annotation.CheckReturnValue; 24 | import javax.annotation.Nullable; 25 | 26 | import static com.google.common.base.Preconditions.checkArgument; 27 | import static com.google.common.base.Preconditions.checkNotNull; 28 | 29 | /** 30 | * A Bloom filter for instances of {@code E} that implements the {@link ProbabilisticFilter} 31 | * interface. 32 | * 33 | *

This implementation is backed by Google Guava's 35 | * {@code BloomFilter}. 36 | * 37 | * From Guava:

A Bloom filter offers an approximate containment test with one-sided 38 | * error: if it claims that an element is contained in it, this might be in error, but if it claims 39 | * that an element is not contained in it, then this is definitely true. 40 | * 41 | *

If you are unfamiliar with Bloom filters, this nice tutorial 42 | * may help you understand how they work. 43 | * 44 | *

The false positive probability ({@code FPP}) of a bloom filter is defined as the probability 45 | * that {@link #contains(Object)} will erroneously return {@code true} for an object that has not 46 | * actually been put in the {@link BloomFilter}.

47 | * 48 | * @param the type of instances that the {@link BloomFilter} accepts. 49 | * @author Brian Dupras 50 | * @author Guava Authors (underlying BloomFilter implementation) 51 | * @see com.google.common.hash.BloomFilter 52 | * @see ProbabilisticFilter 53 | */ 54 | public final class BloomFilter implements ProbabilisticFilter, Serializable { 55 | private com.google.common.hash.BloomFilter delegate; 56 | private final Funnel funnel; 57 | private final long capacity; 58 | private final double fpp; 59 | private long size; 60 | 61 | private BloomFilter(com.google.common.hash.BloomFilter delegate, Funnel funnel, long capacity, double fpp, long size) { 62 | super(); 63 | checkNotNull(delegate); 64 | checkNotNull(funnel); 65 | checkArgument(capacity >= 0, "capacity must be positive"); 66 | checkArgument(fpp >= 0.0 && fpp < 1.0, "fpp must be positive 0.0 <= fpp < 1.0"); 67 | checkArgument(size >= 0, "size must be positive"); 68 | this.delegate = delegate; 69 | this.funnel = funnel; 70 | this.capacity = capacity; 71 | this.fpp = fpp; 72 | this.size = size; 73 | } 74 | 75 | /** 76 | * Creates a {@link BloomFilter} with the expected number of insertions and expected false 77 | * positive probability. 78 | * 79 | *

Note that overflowing a {@link BloomFilter} with significantly more elements than specified, 80 | * will result in its saturation, and a sharp deterioration of its false positive probability. 81 | * 82 | *

The constructed {@link BloomFilter} will be serializable if the provided {@link Funnel} is. 83 | * 84 | *

It is recommended that the funnel be implemented as a Java enum. This has the benefit of 85 | * ensuring proper serialization and deserialization, which is important since {@link 86 | * #equals(Object)} also relies on object identity of funnels. 87 | * 88 | * @param funnel the funnel of T's that the constructed {@link BloomFilter} will use 89 | * @param capacity the number of expected insertions to the constructed {@link BloomFilter}; must 90 | * be positive 91 | * @param fpp the desired false positive probability (must be positive and less than 1.0) 92 | * @return a {@link BloomFilter} 93 | * @see com.google.common.hash.BloomFilter#create(com.google.common.hash.Funnel, int, 95 | * double) 96 | */ 97 | @CheckReturnValue 98 | public static BloomFilter create(Funnel funnel, long capacity, double fpp) { 99 | return new BloomFilter( 100 | com.google.common.hash.BloomFilter.create(funnel, capacity, fpp), 101 | funnel, capacity, fpp, 0L); 102 | } 103 | 104 | /** 105 | * Creates a {@link BloomFilter BloomFilter} with the expected number of insertions and a 106 | * default expected false positive probability of 3%. 107 | * 108 | *

Note that overflowing a {@link BloomFilter} with significantly more objects than specified, 109 | * will result in its saturation, and a sharp deterioration of its false positive probability. 110 | * 111 | *

The constructed {@link BloomFilter} will be serializable if the provided {@code Funnel} 112 | * is. 113 | * 114 | *

It is recommended that the funnel be implemented as a Java enum. This has the benefit of 115 | * ensuring proper serialization and deserialization, which is important since {@link #equals} 116 | * also relies on object identity of funnels. 117 | * 118 | * @param funnel the funnel of T's that the constructed {@link BloomFilter} will use 119 | * @param capacity the number of expected insertions to the constructed {@link BloomFilter}; must 120 | * be positive 121 | * @return a {@link BloomFilter} 122 | * @see com.google.common.hash.BloomFilter#create(com.google.common.hash.Funnel, int) 124 | */ 125 | @CheckReturnValue 126 | public static BloomFilter create(Funnel funnel, long capacity) { 127 | return new BloomFilter( 128 | com.google.common.hash.BloomFilter.create(funnel, capacity, 0.03D), 129 | funnel, capacity, 0.03D, 0L); 130 | } 131 | 132 | /** 133 | * Adds the specified element to this filter. A return value of {@code true} ensures that {@link 134 | * #contains(Object)} given {@code e} will also return {@code true}. 135 | * 136 | * @param e element to be added to this filter 137 | * @return always {@code true} as {@code com.google.common.hash.BloomFilter} cannot fail to add an 138 | * object 139 | * @throws NullPointerException if the specified element is null 140 | * @see #contains(Object) 141 | * @see #addAll(Collection) 142 | * @see #addAll(ProbabilisticFilter) 143 | * @see com.google.common.hash.BloomFilter#put(T) 144 | */ 145 | public boolean add(E e) { 146 | checkNotNull(e); 147 | delegate.put(e); 148 | size = LongMath.checkedAdd(size, 1L); 149 | return true; 150 | } 151 | 152 | /** 153 | * Combines {@code this} filter with another compatible filter. The mutations happen to {@code 154 | * this} instance. Callers must ensure {@code this} filter is appropriately sized to avoid 155 | * saturating it or running out of space. 
156 | * 157 | * @param f filter to be combined into {@code this} filter - {@code f} is not mutated 158 | * @return {@code true} if the operation was successful, {@code false} otherwise 159 | * @throws NullPointerException if the specified filter is null 160 | * @throws IllegalArgumentException if {@link #isCompatible(ProbabilisticFilter)} {@code == 161 | * false} 162 | * @see #add(Object) 163 | * @see #addAll(Collection) 164 | * @see #contains(Object) 165 | */ 166 | public boolean addAll(ProbabilisticFilter f) { 167 | checkNotNull(f); 168 | checkArgument(this != f, "Cannot combine a " + this.getClass().getSimpleName() + 169 | " with itself."); 170 | checkArgument(f instanceof BloomFilter, "Cannot combine a " + 171 | this.getClass().getSimpleName() + " with a " + f.getClass().getSimpleName()); 172 | checkArgument(this.isCompatible(f), "Cannot combine incompatible filters. " + 173 | this.getClass().getSimpleName() + " instances must have equivalent funnels; the same " + 174 | "strategy; and the same number of buckets, entries per bucket, and bits per entry."); 175 | 176 | delegate.putAll(((BloomFilter) f).delegate); 177 | size = LongMath.checkedAdd(size, f.sizeLong()); 178 | return true; 179 | } 180 | 181 | /** 182 | * Adds all of the elements in the specified collection to this filter. The behavior of this 183 | * operation is undefined if the specified collection is modified while the operation is in 184 | * progress. 
185 | * 186 | * @param c collection containing elements to be added to this filter 187 | * @return {@code true} if all elements of the collection were successfully added, {@code false} 188 | * otherwise 189 | * @throws NullPointerException if the specified collection contains a null element, or if the 190 | * specified collection is null 191 | * @see #add(Object) 192 | * @see #addAll(ProbabilisticFilter) 193 | * @see #contains(Object) 194 | */ 195 | public boolean addAll(Collection c) { 196 | checkNotNull(c); 197 | for (E e : c) { 198 | checkNotNull(c); 199 | add(e); 200 | } 201 | return true; 202 | } 203 | 204 | /** 205 | * Returns {@code true} if this filter might contain the specified element, {@code false} 206 | * if this is definitely not the case. 207 | * 208 | * @param e element whose containment in this filter is to be tested 209 | * @return {@code true} if this filter might contain the specified element, {@code false} 210 | * if this is definitely not the case. 211 | * @throws ClassCastException if the type of the specified element is incompatible with this 212 | * filter (optional) 213 | * @throws NullPointerException if the specified element is {@code null} and this filter does not 214 | * permit {@code null} elements 215 | * @see #containsAll(Collection) 216 | * @see #containsAll(ProbabilisticFilter) 217 | * @see #add(Object) 218 | * @see #remove(Object) 219 | * @see com.google.common.hash.BloomFilter#mightContain(T) 220 | */ 221 | public boolean contains(E e) { 222 | return delegate.mightContain(e); 223 | } 224 | 225 | /** 226 | * Returns the current false positive probability ({@code FPP}) of this filter. 227 | * 228 | * @return the probability that {@link #contains(Object)} will erroneously return {@code true} 229 | * given an element that has not actually been added to the filter. 
230 | * @see #fpp() 231 | * @see com.google.common.hash.BloomFilter#put(T) 232 | */ 233 | public double currentFpp() { 234 | return delegate.expectedFpp(); 235 | } 236 | 237 | /** 238 | * Returns {@code true} if the specified filter is compatible with {@code this} filter. {@code f} 239 | * is considered compatible if {@code this} filter can use it in combinatoric operations (e.g. 240 | * {@link #addAll(ProbabilisticFilter)}, {@link #containsAll(ProbabilisticFilter)}). 241 | * 242 | * For two bloom filters to be compatible, they must: 243 | * 244 | *

  • not be the same instance
  • have the same number of hash functions
  • have 245 | * the same bit size
  • have the same strategy
  • have equal funnels
246 | * 247 | * @param f filter to check for compatibility with {@code this} filter 248 | * @return {@code true} if the specified filter is compatible with {@code this} filter 249 | * @throws NullPointerException if the specified filter is {@code null} 250 | * @see #addAll(ProbabilisticFilter) 251 | * @see #containsAll(ProbabilisticFilter) 252 | * @see #removeAll(ProbabilisticFilter) 253 | * @see com.google.common.hash.BloomFilter#isCompatible(com.google.common.hash.BloomFilter) 254 | */ 255 | public boolean isCompatible(ProbabilisticFilter f) { 256 | checkNotNull(f); 257 | return (f instanceof BloomFilter) && 258 | this.delegate.isCompatible(((BloomFilter) f).delegate); 259 | } 260 | 261 | /** 262 | * Returns {@code true} if this filter might contain all of the elements of the specified 263 | * collection (optional operation). More formally, returns {@code true} if {@link 264 | * #contains(Object)} {@code == true} for all of the elements of the specified collection. 265 | * 266 | * @param c collection containing elements to be checked for containment in this filter 267 | * @return {@code true} if this filter might contain all elements of the specified 268 | * collection 269 | * @throws NullPointerException if the specified collection contains one or more {@code null} 270 | * elements, or if the specified collection is {@code null} 271 | * @see #contains(Object) 272 | * @see #containsAll(ProbabilisticFilter) 273 | */ 274 | public boolean containsAll(Collection c) { 275 | checkNotNull(c); 276 | for (E e : c) { 277 | checkNotNull(e); 278 | if (!contains(e)) return false; 279 | } 280 | return true; 281 | } 282 | 283 | /** 284 | * Not supported. 285 | * 286 | * @throws UnsupportedOperationException 287 | */ 288 | public boolean containsAll(ProbabilisticFilter f) { 289 | throw new UnsupportedOperationException(); 290 | } 291 | 292 | /** 293 | * Returns {@code true} if this filter contains no elements. 
294 | * 295 | * @return {@code true} if this filter contains no elements 296 | * @see #sizeLong() 297 | */ 298 | public boolean isEmpty() { 299 | return 0 == this.sizeLong(); 300 | } 301 | 302 | /** 303 | * Returns the number of elements contained in this filter (its cardinality). If this filter 304 | * contains more than {@code Long.MAX_VALUE} elements, returns {@code Long.MAX_VALUE}. 305 | * 306 | * @return the number of elements contained in this filter (its cardinality) 307 | * @see #capacity() 308 | * @see #isEmpty() 309 | */ 310 | public long sizeLong() { 311 | return size >= 0 ? size : Long.MAX_VALUE /* overflow */; 312 | } 313 | 314 | /** 315 | * Returns the number of elements contained in this filter (its cardinality). If this filter 316 | * contains more than {@code Integer.MAX_VALUE} elements, returns {@code Integer.MAX_VALUE}. 317 | * 318 | * @return the number of elements contained in this filter (its cardinality) 319 | * @see #capacity() 320 | * @see #isEmpty() 321 | * @see #sizeLong() 322 | */ 323 | public long size() { 324 | return size > Integer.MAX_VALUE ? Integer.MAX_VALUE : size; 325 | } 326 | 327 | /** 328 | * Returns the number of elements this filter can represent at its requested {@code FPP}. This is 329 | * not be a hard limit of the filter implementation. It is permissible for a filter to contain 330 | * more elements than its requested capacity, though its {@code FPP} will suffer. 331 | * 332 | * @return the number of elements this filter can represent at its requested {@code FPP}. 333 | * @see #fpp() 334 | * @see #currentFpp() 335 | * @see #sizeLong() 336 | */ 337 | public long capacity() { 338 | return capacity; 339 | } 340 | 341 | /** 342 | * Returns the intended {@code FPP} limit of this filter. This is not a hard limit of the filter 343 | * implementation. It is permissible for a filter's {@code FPP} to degrade (e.g. via saturation) 344 | * beyond its intended limit. 
345 | * 346 | * @return the intended {@code FPP} limit of this filter. 347 | * @see #currentFpp() 348 | */ 349 | public double fpp() { 350 | return fpp; 351 | } 352 | 353 | /** 354 | * Creates a new {@link BloomFilter} that's a copy of this instance. The returned instance {@code 355 | * equals(f) == true} but shares no mutable state. 356 | */ 357 | public static BloomFilter copyOf(BloomFilter f) { 358 | return new BloomFilter(f.delegate.copy(), f.funnel, f.capacity(), f.fpp(), f.sizeLong()); 359 | } 360 | 361 | /** 362 | * Removes all of the elements from this filter. The filter will be empty after this call 363 | * returns. 364 | * 365 | * @see #sizeLong() 366 | * @see #isEmpty() 367 | */ 368 | public void clear() { 369 | this.delegate = com.google.common.hash.BloomFilter.create(funnel, (int) capacity, fpp); 370 | this.size = 0L; 371 | } 372 | 373 | /** 374 | * Not supported. Standard bloom filters do not support element removal. 375 | * 376 | * @throws UnsupportedOperationException 377 | */ 378 | public boolean remove(E e) { 379 | throw new UnsupportedOperationException(); 380 | } 381 | 382 | /** 383 | * Not supported. Standard bloom filters do not support element removal. 384 | * 385 | * @throws UnsupportedOperationException 386 | */ 387 | public boolean removeAll(Collection c) { 388 | throw new UnsupportedOperationException(); 389 | } 390 | 391 | /** 392 | * Not supported. Standard bloom filters do not support element removal. 
393 | * 394 | * @throws UnsupportedOperationException 395 | */ 396 | public boolean removeAll(ProbabilisticFilter f) { 397 | throw new UnsupportedOperationException(); 398 | } 399 | 400 | @Override 401 | public boolean equals(@Nullable Object object) { 402 | if (object instanceof com.google.common.hash.BloomFilter) { 403 | //noinspection ConstantConditions 404 | return delegate.equals(((BloomFilter) object).delegate); 405 | } else { 406 | return delegate.equals(object); 407 | } 408 | } 409 | 410 | @Override 411 | public int hashCode() { 412 | return delegate.hashCode(); 413 | } 414 | } -------------------------------------------------------------------------------- /src/main/java/com/duprasville/guava/probably/CuckooFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Brian Dupras 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5 | * in compliance with the License. You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software distributed under the License 10 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 | * or implied. See the License for the specific language governing permissions and limitations under 12 | * the License. 
13 | */ 14 | 15 | package com.duprasville.guava.probably; 16 | 17 | import com.google.common.annotations.Beta; 18 | import com.google.common.annotations.VisibleForTesting; 19 | import com.google.common.base.Objects; 20 | import com.google.common.hash.Funnel; 21 | import com.google.common.primitives.SignedBytes; 22 | 23 | import java.io.DataInputStream; 24 | import java.io.DataOutputStream; 25 | import java.io.IOException; 26 | import java.io.InputStream; 27 | import java.io.OutputStream; 28 | import java.io.Serializable; 29 | import java.util.Collection; 30 | import java.util.Random; 31 | 32 | import javax.annotation.CheckReturnValue; 33 | import javax.annotation.Nullable; 34 | 35 | import static com.google.common.base.Preconditions.checkArgument; 36 | import static com.google.common.base.Preconditions.checkNotNull; 37 | import static com.google.common.math.DoubleMath.log2; 38 | import static com.google.common.math.LongMath.divide; 39 | import static java.lang.Math.ceil; 40 | import static java.lang.Math.pow; 41 | import static java.math.RoundingMode.CEILING; 42 | import static java.math.RoundingMode.HALF_DOWN; 43 | 44 | /** 45 | * A Cuckoo filter for instances of {@code E} that implements the {@link ProbabilisticFilter} 46 | * interface. 47 | * 48 | *
"Cuckoo filters can replace Bloom filters for approximate set membership tests. 49 | * Cuckoo filters support adding and removing items dynamically while achieving even higher 50 | * performance than Bloom filters. For applications that store many items and target moderately low 51 | * false positive rates, cuckoo filters have lower space overhead than space-optimized Bloom 52 | * filters. Cuckoo filters outperform previous data structures that extend Bloom filters to support 53 | * deletions substantially in both time and space." - Fan, et. al.
54 | * 55 | *

Cuckoo filters offer constant time performance for the basic operations {@link #add(Object)}, 56 | * {@link #remove(Object)}, {@link #contains(Object)} and {@link #sizeLong()}.

57 | * 58 | *

This class does not permit {@code null} elements.

59 | * 60 | *

Cuckoo filters implement the {@link Serializable} interface. They also support a more compact 61 | * serial representation via the {@link #writeTo(OutputStream)} and {@link #readFrom(InputStream, 62 | * Funnel)} methods. Both serialized forms will continue to be supported by future versions of this 63 | * library. However, serial forms generated by newer versions of the code may not be readable by 64 | * older versions of the code (e.g., a serialized cuckoo filter generated today may not be 65 | * readable by a binary that was compiled 6 months ago).

66 | * 67 | *

ref: Cuckoo Filter: 68 | * Practically Better Than Bloom Bin Fan, David G. Andersen, Michael Kaminsky†, Michael D. 69 | * Mitzenmacher‡ Carnegie Mellon University, †Intel Labs, ‡Harvard University

70 | * 71 | * @param the type of elements that this filter accepts 72 | * @author Brian Dupras 73 | * @author Alex Beal 74 | * @see ProbabilisticFilter 75 | */ 76 | @Beta 77 | public final class CuckooFilter implements ProbabilisticFilter, Serializable { 78 | static final int MAX_ENTRIES_PER_BUCKET = 8; 79 | static final int MIN_ENTRIES_PER_BUCKET = 2; 80 | 81 | /** 82 | * Minimum false positive probability supported, 8.67E-19. 83 | * 84 | * CuckooFilter § 5.1 Eq. (6), "f ≥ log2(2b/e) = [log2(1/e) + log2(2b)]" 85 | * (b) entries per bucket: 8 at e <= 0.00001 86 | * (f) bits per entry: 64-bits max 87 | * (e) false positive probability 88 | * 89 | * 64 = log2(16/e) = [log2(1/e) + log2(16)] 90 | * 64 = log2(1/e) + 4 91 | * 60 = log2(1/e) 92 | * 2^60 = 1/e 93 | * e = 1/2^60 94 | * e = 8.673617379884035E-19 95 | */ 96 | static double MIN_FPP = 1.0D / pow(2, 60); 97 | 98 | /** 99 | * Maximum false positive probability supported, 0.99. 100 | */ 101 | static double MAX_FPP = 0.99D; 102 | 103 | private final CuckooTable table; 104 | private final Funnel funnel; 105 | private final CuckooStrategy cuckooStrategy; 106 | private final double fpp; 107 | 108 | /** 109 | * Creates a CuckooFilter. 110 | */ 111 | private CuckooFilter( 112 | CuckooTable table, Funnel funnel, CuckooStrategy cuckooStrategy, double fpp) { 113 | this.fpp = fpp; 114 | this.table = checkNotNull(table); 115 | this.funnel = checkNotNull(funnel); 116 | this.cuckooStrategy = checkNotNull(cuckooStrategy); 117 | } 118 | 119 | /** 120 | * Returns a new {@link CuckooFilter} that's a copy of this instance. The new instance is equal to 121 | * this instance but shares no mutable state. 122 | */ 123 | @CheckReturnValue 124 | public CuckooFilter copy() { 125 | return new CuckooFilter(table.copy(), funnel, cuckooStrategy, fpp); 126 | } 127 | 128 | /** 129 | * Returns {@code true} if this filter might contain the specified element, {@code false} 130 | * if this is definitely not the case. 
131 | * 132 | * @param e element whose containment in this filter is to be tested 133 | * @return {@code true} if this filter might contain the specified element, {@code false} 134 | * if this is definitely not the case. 135 | * @throws NullPointerException if the specified element is {@code null} and this filter does not 136 | * permit {@code null} elements 137 | * @see #containsAll(Collection) 138 | * @see #containsAll(ProbabilisticFilter) 139 | * @see #add(Object) 140 | * @see #remove(Object) 141 | */ 142 | @CheckReturnValue 143 | public boolean contains(E e) { 144 | checkNotNull(e); 145 | return cuckooStrategy.contains(e, funnel, table); 146 | } 147 | 148 | /** 149 | * Returns {@code true} if this filter might contain all of the elements of the specified 150 | * collection. More formally, returns {@code true} if {@link #contains(Object)} {@code == true} 151 | * for all of the elements of the specified collection. 152 | * 153 | * @param c collection containing elements to be checked for containment in this filter 154 | * @return {@code true} if this filter might contain all elements of the specified 155 | * collection 156 | * @throws NullPointerException if the specified collection contains one or more {@code null} 157 | * elements, or if the specified collection is {@code null} 158 | * @see #contains(Object) 159 | * @see #containsAll(ProbabilisticFilter) 160 | */ 161 | public boolean containsAll(Collection c) { 162 | checkNotNull(c); 163 | for (E e : c) { 164 | checkNotNull(e); 165 | if (!contains(e)) return false; 166 | } 167 | return true; 168 | } 169 | 170 | /** 171 | * Returns {@code true} if this filter might contain all elements contained in the 172 | * specified filter. {@link #isCompatible(ProbabilisticFilter)} must return {@code true} for the 173 | * given filter. 
174 | * 175 | * @param f cuckoo filter containing elements to be checked for probable containment in this 176 | * filter 177 | * @return {@code true} if this filter might contain all elements contained in the 178 | * specified filter, {@code false} if this is definitely not the case. 179 | * @throws NullPointerException if the specified filter is {@code null} 180 | * @throws IllegalArgumentException if {@link #isCompatible(ProbabilisticFilter)} {@code == false} 181 | * given {@code f} 182 | * @see #contains(Object) 183 | * @see #containsAll(Collection) 184 | */ 185 | public boolean containsAll(ProbabilisticFilter f) { 186 | checkNotNull(f); 187 | if (this == f) { 188 | return true; 189 | } 190 | checkCompatibility(f, "compare"); 191 | return this.cuckooStrategy.containsAll(this.table, ((CuckooFilter) f).table); 192 | } 193 | 194 | /** 195 | * Adds the specified element to this filter. Returns {@code true} if {@code e} was successfully 196 | * added to the filter, {@code false} if this is definitely not the case, as would be the 197 | * case when the filter becomes saturated. Saturation may occur even if {@link #sizeLong()} {@code 198 | * < } {@link #capacity()}, e.g. if {@code e} has already been added {@code 2*b} times to the 199 | * cuckoo filter, it will have saturated the number of entries per bucket ({@code b}) allocated 200 | * within the filter and a subsequent invocation will return {@code false}. A return value of 201 | * {@code true} ensures that {@link #contains(Object)} given {@code e} will also return {@code 202 | * true}. 203 | * 204 | * @param e element to be added to this filter 205 | * @return {@code true} if {@code e} was successfully added to the filter, {@code false} if this 206 | * is definitely not the case 207 | * @throws NullPointerException if the specified element is {@code null} 208 | * @todo consider exposing {@code b} as maxEntriesPerElement()? 
209 | * @see #contains(Object) 210 | * @see #addAll(Collection) 211 | * @see #addAll(ProbabilisticFilter) 212 | */ 213 | @CheckReturnValue 214 | public boolean add(E e) { 215 | checkNotNull(e); 216 | return cuckooStrategy.add(e, funnel, table); 217 | } 218 | 219 | /** 220 | * Combines {@code this} filter with another compatible filter. The mutations happen to {@code 221 | * this} instance. Callers must ensure {@code this} filter is appropriately sized to avoid 222 | * saturating it or running out of space. 223 | * 224 | * @param f filter to be combined into {@code this} filter - {@code f} is not mutated 225 | * @return {@code true} if the operation was successful, {@code false} otherwise 226 | * @throws NullPointerException if the specified filter is {@code null} 227 | * @throws IllegalArgumentException if {@link #isCompatible(ProbabilisticFilter)} {@code == 228 | * false} 229 | * @see #add(Object) 230 | * @see #addAll(Collection) 231 | * @see #contains(Object) 232 | */ 233 | @CheckReturnValue 234 | public boolean addAll(ProbabilisticFilter f) { 235 | checkNotNull(f); 236 | checkArgument(this != f, "Cannot combine a " + this.getClass().getSimpleName() + 237 | " with itself."); 238 | checkCompatibility(f, "combine"); 239 | return this.cuckooStrategy.addAll(this.table, ((CuckooFilter) f).table); 240 | } 241 | 242 | /** 243 | * Adds all of the elements in the specified collection to this filter. The behavior of this 244 | * operation is undefined if the specified collection is modified while the operation is in 245 | * progress. Some elements of {@code c} may have been added to the filter even when {@code false} 246 | * is returned. In this case, the caller may {@link #remove(Object)} the additions by comparing 247 | * the filter {@link #sizeLong()} before and after the invocation, knowing that additions from 248 | * {@code c} occurred in {@code c}'s iteration order. 
249 | * 250 | * @param c collection containing elements to be added to this filter 251 | * @return {@code true} if all elements of the collection were successfully added, {@code false} 252 | * otherwise 253 | * @throws NullPointerException if the specified collection contains a {@code null} element, or if 254 | * the specified collection is {@code null} 255 | * @see #add(Object) 256 | * @see #addAll(ProbabilisticFilter) 257 | * @see #contains(Object) 258 | */ 259 | public boolean addAll(Collection c) { 260 | checkNotNull(c); 261 | for (E e : c) { 262 | checkNotNull(e); 263 | if (!add(e)) { 264 | return false; 265 | } 266 | } 267 | return true; 268 | } 269 | 270 | /** 271 | * Removes all of the elements from this filter. The filter will be empty after this call 272 | * returns. 273 | * 274 | * @see #sizeLong() 275 | * @see #isEmpty() 276 | */ 277 | public void clear() { 278 | table.clear(); 279 | } 280 | 281 | 282 | /** 283 | * Removes the specified element from this filter. The element must be contained in the filter 284 | * prior to invocation. If {@code false} is returned, this is definitely an indication that 285 | * the specified element wasn't contained in the filter prior to invocation. This condition is an 286 | * error, and this filter can no longer be relied upon to return correct {@code false} responses 287 | * from {@link #contains(Object)}, unless {@link #isEmpty()} is also {@code true}. 
288 | * 289 | * @param e element to be removed from this filter 290 | * @return {@code true} if this filter probably contained the specified element, {@code false} 291 | * otherwise 292 | * @throws NullPointerException if the specified element is {@code null} and this filter does not 293 | * permit {@code null} elements 294 | * @see #contains(Object) 295 | * @see #removeAll(Collection) 296 | * @see #removeAll(ProbabilisticFilter) 297 | */ 298 | @CheckReturnValue 299 | public boolean remove(E e) { 300 | checkNotNull(e); 301 | return cuckooStrategy.remove(e, funnel, table); 302 | } 303 | 304 | 305 | /** 306 | * Removes from this filter all of its elements that are contained in the specified collection. 307 | * All element contained in the specified collection must be contained in the filter prior to 308 | * invocation. 309 | * 310 | * If {@code false} is returned, this is definitely an indication that the specified 311 | * collection contained elements that were not contained in this filter prior to invocation, and 312 | * this filter can no longer be relied upon to return correct {@code false} responses from {@link 313 | * #contains(Object)}, unless {@link #isEmpty()} is also {@code true}. 314 | * 315 | * Some elements of {@code c} may have been removed from the filter even when {@code false} is 316 | * returned. In this case, the caller may {@link #add(Object)} the additions by comparing the 317 | * filter {@link #sizeLong()} before and after the invocation, knowing that removals from {@code 318 | * c} occurred in {@code c}'s iteration order. 
319 | * 320 | * @param c collection containing elements to be removed from this filter 321 | * @return {@code true} if all of the elements of the specified collection were successfully 322 | * removed from the filter, {@code false} if any of the elements was not successfully removed 323 | * @throws NullPointerException if the specified collection contains one or more {@code null} 324 | * elements, or if the specified collection is {@code null} 325 | * @see #contains(Object) 326 | * @see #remove(Object) 327 | * @see #removeAll(ProbabilisticFilter) 328 | */ 329 | @CheckReturnValue 330 | public boolean removeAll(Collection c) { 331 | checkNotNull(c); 332 | for (E e : c) { 333 | checkNotNull(e); 334 | if (!remove(e)) { 335 | return false; 336 | } 337 | } 338 | return true; 339 | } 340 | 341 | /** 342 | * Subtracts the specified filter from {@code this} filter. The mutations happen to {@code this} 343 | * instance. Callers must ensure that the specified filter represents elements that are currently 344 | * contained in {@code this} filter. 345 | * 346 | * If {@code false} is returned, this is definitely an indication that the specified filter 347 | * contained elements that were not contained in this filter prior to invocation and this filter 348 | * can no longer be relied upon to return correct {@code false} responses from {@link 349 | * #contains(Object)}, unless {@link #isEmpty()} is also {@code true}. 
350 | * 351 | * @param f filter containing elements to remove from {@code this} filter - {@code f} is not 352 | * mutated 353 | * @return {@code true} if the operation was successful, {@code false} otherwise 354 | * @throws NullPointerException if the specified filter is null 355 | * @throws IllegalArgumentException if {@link #isCompatible(ProbabilisticFilter)} {@code == false} 356 | * given {@code f} 357 | * @see #contains(Object) 358 | * @see #remove(Object) 359 | * @see #removeAll(Collection) 360 | */ 361 | @CheckReturnValue 362 | public boolean removeAll(ProbabilisticFilter f) { 363 | checkNotNull(f); 364 | if (this == f) { 365 | clear(); 366 | return true; 367 | } 368 | checkCompatibility(f, "remove"); 369 | return this.cuckooStrategy.removeAll(this.table, ((CuckooFilter) f).table); 370 | } 371 | 372 | /** 373 | * Returns the number of elements contained in this filter (its cardinality). If this filter 374 | * contains more than {@code Long.MAX_VALUE} elements, returns {@code Long.MAX_VALUE}. 375 | * 376 | * @return the number of elements contained in this filter (its cardinality) 377 | * @see #capacity() 378 | * @see #isEmpty() 379 | * @see #size() 380 | */ 381 | public long sizeLong() { 382 | return table.size(); 383 | } 384 | 385 | /** 386 | * Returns the number of elements contained in this filter (its cardinality). If this filter 387 | * contains more than {@code Integer.MAX_VALUE} elements, returns {@code Integer.MAX_VALUE}. 388 | * 389 | * @return the number of elements contained in this filter (its cardinality) 390 | * @see #capacity() 391 | * @see #isEmpty() 392 | * @see #sizeLong() 393 | */ 394 | public long size() { 395 | final long ret = sizeLong(); 396 | return ret > Integer.MAX_VALUE ? Integer.MAX_VALUE : ret; 397 | } 398 | 399 | /** 400 | * Returns the number of elements this filter can represent at its requested {@code FPP}. 
It's 401 | * sometimes possible to add more elements to a cuckoo filter than its capacity since the load 402 | * factor used to calculate its optimal storage size is less than 100%. 403 | * 404 | * @return the number of elements this filter can represent at its requested {@code FPP}. 405 | * @see #fpp() 406 | * @see #currentFpp() 407 | * @see #sizeLong() 408 | * @see #optimalLoadFactor(int) 409 | */ 410 | public long capacity() { 411 | return (long) Math.floor(table.capacity() * optimalLoadFactor(table.numEntriesPerBucket())); 412 | } 413 | 414 | /** 415 | * Returns the approximate {@code FPP} limit of this filter. This is not a hard limit, however a 416 | * cuckoo filter will not exceed its {@code FPP} by a significant amount as the filter becomes 417 | * saturated. 418 | * 419 | * @return the intended {@code FPP} limit of this filter. 420 | * @see #currentFpp() 421 | */ 422 | public double fpp() { 423 | return table.fppAtGivenLoad(optimalLoadFactor(table.numEntriesPerBucket())); 424 | } 425 | 426 | /** 427 | * Returns the current false positive probability ({@code FPP}) of this filter. 428 | * 429 | * @return the probability that {@link #contains(Object)} will erroneously return {@code true} 430 | * given an element that has not actually been added to the filter. Unlike a bloom filter, a 431 | * cuckoo filter cannot become saturated to the point of significantly degrading its {@code FPP}. 432 | * @see CuckooFilter#fpp() 433 | */ 434 | public double currentFpp() { 435 | return table.currentFpp(); 436 | } 437 | 438 | /** 439 | * Returns {@code true} if this filter contains no elements. 440 | * 441 | * @return {@code true} if this filter contains no elements 442 | * @see #sizeLong() 443 | */ 444 | public boolean isEmpty() { 445 | return 0 == sizeLong(); 446 | } 447 | 448 | /** 449 | * Returns {@code true} if {@code f} is compatible with {@code this} filter. {@code f} is 450 | * considered compatible if {@code this} filter can use it in combinatoric operations (e.g. 
{@link 451 | * #addAll(ProbabilisticFilter)}). 452 | * 453 | * @param f The filter to check for compatibility. 454 | * @return {@code true} if {@code f} is compatible with {@code this} filter. 455 | */ 456 | public boolean isCompatible(ProbabilisticFilter f) { 457 | checkNotNull(f); 458 | 459 | return (this != f) 460 | && (f instanceof CuckooFilter) 461 | && (this.table.isCompatible(((CuckooFilter) f).table)) 462 | && (this.cuckooStrategy.equals(((CuckooFilter) f).cuckooStrategy)) 463 | && (this.funnel.equals(((CuckooFilter) f).funnel)); 464 | } 465 | 466 | @Override 467 | public boolean equals(@Nullable Object object) { 468 | if (object == this) { 469 | return true; 470 | } 471 | if (object instanceof CuckooFilter) { 472 | CuckooFilter that = (CuckooFilter) object; 473 | return this.funnel.equals(that.funnel) 474 | && this.cuckooStrategy.equals(that.cuckooStrategy) 475 | && this.table.equals(that.table) 476 | && this.cuckooStrategy.equivalent(this.table, that.table) 477 | ; 478 | } 479 | return false; 480 | } 481 | 482 | @Override 483 | public int hashCode() { 484 | return Objects.hashCode(funnel, cuckooStrategy, table); 485 | } 486 | 487 | /** 488 | * Creates a filter with the expected number of insertions and expected false positive 489 | * probability.

Note that overflowing a {@link CuckooFilter} with significantly more 490 | * objects than specified, will result in its saturation causing {@link #add(Object)} to reject 491 | * new additions.

The constructed {@link CuckooFilter} will be serializable if the 492 | * provided {@code Funnel} is.

It is recommended that the funnel be implemented as a 493 | * Java enum. This has the benefit of ensuring proper serialization and deserialization, which is 494 | * important since {@link #equals} also relies on object identity of funnels. 495 | * 496 | * @param funnel the funnel of T's that the constructed {@link CuckooFilter} will use 497 | * @param capacity the number of expected insertions to the constructed {@link CuckooFilter}; must 498 | * be positive 499 | * @param fpp the desired false positive probability (must be positive and less than 1.0). 500 | * @return a {@link CuckooFilter} 501 | */ 502 | @CheckReturnValue 503 | public static CuckooFilter create( 504 | Funnel funnel, long capacity, double fpp) { 505 | return create(funnel, capacity, fpp, 506 | CuckooStrategies.MURMUR128_BEALDUPRAS_32.strategy()); 507 | } 508 | 509 | @VisibleForTesting 510 | static CuckooFilter create(Funnel funnel, long capacity, double fpp, 511 | CuckooStrategy cuckooStrategy) { 512 | checkNotNull(funnel); 513 | checkArgument(capacity > 0, "Expected insertions (%s) must be > 0", capacity); 514 | checkArgument(fpp > 0.0D, "False positive probability (%s) must be > 0.0", fpp); 515 | checkArgument(fpp < 1.0D, "False positive probability (%s) must be < 1.0", fpp); 516 | checkNotNull(cuckooStrategy); 517 | 518 | int numEntriesPerBucket = optimalEntriesPerBucket(fpp); 519 | long numBuckets = optimalNumberOfBuckets(capacity, numEntriesPerBucket); 520 | int numBitsPerEntry = optimalBitsPerEntry(fpp, numEntriesPerBucket); 521 | 522 | try { 523 | return new CuckooFilter(new CuckooTable(numBuckets, 524 | numEntriesPerBucket, numBitsPerEntry), funnel, cuckooStrategy, fpp); 525 | } catch (IllegalArgumentException e) { 526 | throw new IllegalArgumentException("Could not create CuckooFilter of " + numBuckets + 527 | " buckets, " + numEntriesPerBucket + " entries per bucket, " + numBitsPerEntry + 528 | " bits per entry", e); 529 | } 530 | } 531 | 532 | /** 533 | * Creates a filter with the 
expected number of insertions and a default expected false positive 534 | * probability of 3.2%.

Note that overflowing a {@code CuckooFilter} with significantly 535 | * more objects than specified, will result in its saturation causing {@link #add(Object)} to 536 | * reject new additions.

The constructed {@link CuckooFilter} will be serializable if the 537 | * provided {@code Funnel} is.

It is recommended that the funnel be implemented as a 538 | * Java enum. This has the benefit of ensuring proper serialization and deserialization, which is 539 | * important since {@link #equals} also relies on object identity of funnels. 540 | * 541 | * @param funnel the funnel of T's that the constructed {@link CuckooFilter} will use 542 | * @param capacity the number of expected insertions to the constructed {@link CuckooFilter}; must 543 | * be positive 544 | * @return a {@link CuckooFilter} 545 | */ 546 | @CheckReturnValue 547 | public static CuckooFilter create(Funnel funnel, long capacity) { 548 | return create(funnel, capacity, 0.032D); 549 | } 550 | 551 | /* 552 | * Space optimization cheat sheet, per CuckooFilter § 5.1 : 553 | * 554 | * Given: 555 | * n: expected insertions 556 | * e: expected false positive probability (e.g. 0.03D for 3% fpp) 557 | * 558 | * Choose: 559 | * b: bucket size in entries (2, 4, 8) 560 | * a: load factor (proportional to b) 561 | * 562 | * Calculate: 563 | * f: fingerprint size in bits 564 | * m: table size in buckets 565 | * 566 | * 567 | * 1) Choose b = 8 | 4 | 2 568 | * when e : 0.00001 < e ≤ 0.002 569 | * ref: CuckooFilter § 5.1 ¶ 5, "Optimal bucket size" 570 | * 571 | * 2) Choose a = 50% | 84% | 95.5% | 98% 572 | * when b = 1 | 2 | 4 | 8 573 | * ref: CuckooFilter § 5.1 ¶ 2, "(1) Larger buckets improve table occupancy" 574 | * 575 | * 2) Optimal f = ceil( log2(2b/e) ) 576 | * ref: CuckooFilter § 5.1 Eq. (6), "f ≥ log2(2b/e) = [log2(1/e) + log2(2b)]" 577 | * 578 | * 3) Required m = evenCeil( ceiling( ceiling( n/a ) / b ) ) 579 | * Minimum entries (B) = n/a rounded up 580 | * Minimum buckets (m) = B/b rounded up to an even number 581 | */ 582 | 583 | /** 584 | * Returns the optimal number of entries per bucket, or bucket size, ({@code b}) given the 585 | * expected false positive probability ({@code e}). 
586 | * 587 | * CuckooFilter § 5.1 ¶ 5, "Optimal bucket size" 588 | * 589 | * @param e the desired false positive probability (must be positive and less than 1.0) 590 | * @return optimal number of entries per bucket 591 | */ 592 | @VisibleForTesting 593 | static int optimalEntriesPerBucket(double e) { 594 | checkArgument(e > 0.0D, "e must be > 0.0"); 595 | if (e <= 0.00001) { 596 | return MAX_ENTRIES_PER_BUCKET; 597 | } else if (e <= 0.002) { 598 | return MAX_ENTRIES_PER_BUCKET / 2; 599 | } else { 600 | return MIN_ENTRIES_PER_BUCKET; 601 | } 602 | } 603 | 604 | /** 605 | * Returns the optimal load factor ({@code a}) given the number of entries per bucket ({@code 606 | * b}). 607 | * 608 | * CuckooFilter § 5.1 ¶ 2, "(1) Larger buckets improve table occupancy" 609 | * 610 | * @param b number of entries per bucket 611 | * @return load factor, positive and less than 1.0 612 | */ 613 | @VisibleForTesting 614 | static double optimalLoadFactor(int b) { 615 | checkArgument(b == 2 || b == 4 || b == 8, "b must be 2, 4, or 8"); 616 | if (b == 2) { 617 | return 0.84D; 618 | } else if (b == 4) { 619 | return 0.955D; 620 | } else { 621 | return 0.98D; 622 | } 623 | } 624 | 625 | /** 626 | * Returns the optimal number of bits per entry ({@code f}) given the false positive probability 627 | * ({@code e}) and the number of entries per bucket ({@code b}). 628 | * 629 | * CuckooFilter § 5.1 Eq. 
(6), "f ≥ log2(2b/e) = [log2(1/e) + log2(2b)]" 630 | * 631 | * @param e the desired false positive probability (must be positive and less than 1.0) 632 | * @param b number of entries per bucket 633 | * @return number of bits per entry 634 | */ 635 | @VisibleForTesting 636 | static int optimalBitsPerEntry(double e, int b) { 637 | checkArgument(e >= MIN_FPP, "Cannot create CuckooFilter with FPP[" + e + 638 | "] < CuckooFilter.MIN_FPP[" + CuckooFilter.MIN_FPP + "]"); 639 | return log2(2 * b / e, HALF_DOWN); 640 | } 641 | 642 | /** 643 | * Returns the minimal required number of buckets given the expected insertions {@code n}, and the 644 | * number of entries per bucket ({@code b}). 645 | * 646 | * @param n the number of expected insertions 647 | * @param b number of entries per bucket 648 | * @return number of buckets 649 | */ 650 | @VisibleForTesting 651 | static long optimalNumberOfBuckets(long n, int b) { 652 | checkArgument(n > 0, "n must be > 0"); 653 | return evenCeil(divide((long) ceil(n / optimalLoadFactor(b)), b, CEILING)); 654 | } 655 | 656 | static long evenCeil(long n) { 657 | return (n + 1) / 2 * 2; 658 | } 659 | 660 | private Object writeReplace() { 661 | return new SerialForm(this); 662 | } 663 | 664 | /** 665 | * Returns the size in bits of the underlying cuckoo table data structure. 
666 | */ 667 | @VisibleForTesting 668 | long bitSize() { 669 | return table.bitSize(); 670 | } 671 | 672 | private static class SerialForm implements Serializable { 673 | final long[] data; 674 | final long size; 675 | final long checksum; 676 | final long numBuckets; 677 | final int numEntriesPerBucket; 678 | final int numBitsPerEntry; 679 | final Funnel funnel; 680 | final CuckooStrategy cuckooStrategy; 681 | final double fpp; 682 | 683 | SerialForm(CuckooFilter filter) { 684 | this.data = filter.table.data(); 685 | this.numBuckets = filter.table.numBuckets(); 686 | this.numEntriesPerBucket = filter.table.numEntriesPerBucket(); 687 | this.numBitsPerEntry = filter.table.numBitsPerEntry(); 688 | this.size = filter.table.size(); 689 | this.checksum = filter.table.checksum(); 690 | this.funnel = filter.funnel; 691 | this.cuckooStrategy = filter.cuckooStrategy; 692 | this.fpp = filter.fpp; 693 | } 694 | 695 | Object readResolve() { 696 | return new CuckooFilter( 697 | new CuckooTable(data, size, checksum, numBuckets, numEntriesPerBucket, numBitsPerEntry), 698 | funnel, cuckooStrategy, fpp); 699 | } 700 | 701 | private static final long serialVersionUID = 1; 702 | } 703 | 704 | /** 705 | * Writes this cuckoo filter to an output stream, with a custom format (not Java serialization). 706 | * This has been measured to save at least 400 bytes compared to regular serialization.

707 | * 708 | * Use {@link #readFrom(InputStream, Funnel)} to reconstruct the written CuckooFilter. 709 | */ 710 | public void writeTo(OutputStream out) throws IOException { 711 | /* 712 | * Serial form: 713 | * 1 signed byte for the strategy 714 | * 1 IEEE 754 floating-point double, the expected FPP 715 | * 1 big endian long, the number of entries 716 | * 1 big endian long, the checksum of entries 717 | * 1 big endian long for the number of buckets 718 | * 1 big endian int for the number of entries per bucket 719 | * 1 big endian int for the fingerprint size in bits 720 | * 1 big endian int, the number of longs in the filter table's data 721 | * N big endian longs of the filter table's data 722 | */ 723 | DataOutputStream dout = new DataOutputStream(out); 724 | dout.writeByte(SignedBytes.checkedCast(cuckooStrategy.ordinal())); 725 | dout.writeDouble(fpp); 726 | dout.writeLong(table.size()); 727 | dout.writeLong(table.checksum()); 728 | dout.writeLong(table.numBuckets()); 729 | dout.writeInt(table.numEntriesPerBucket()); 730 | dout.writeInt(table.numBitsPerEntry()); 731 | dout.writeInt(table.data().length); 732 | 733 | for (long value : table.data()) { 734 | dout.writeLong(value); 735 | } 736 | } 737 | 738 | /** 739 | * Reads a byte stream, which was written by {@link #writeTo(OutputStream)}, into a {@link 740 | * CuckooFilter}.

The {@code Funnel} to be used is not encoded in the stream, so it must be 741 | * provided here. Warning: the funnel provided must behave identically to the one 742 | * used to populate the original Cuckoo filter! 743 | * 744 | * @throws IOException if the InputStream throws an {@code IOException}, or if its data does not 745 | * appear to be a CuckooFilter serialized using the {@link 746 | * #writeTo(OutputStream)} method. 747 | */ 748 | @CheckReturnValue 749 | public static CuckooFilter readFrom(InputStream in, Funnel funnel) throws IOException { 750 | checkNotNull(in, "InputStream"); 751 | checkNotNull(funnel, "Funnel"); 752 | int strategyOrdinal = -1; 753 | double fpp = -1.0D; 754 | long size = -1L; 755 | long checksum = -1L; 756 | long numBuckets = -1L; 757 | int numEntriesPerBucket = -1; 758 | int numBitsPerEntry = -1; 759 | int dataLength = -1; 760 | try { 761 | DataInputStream din = new DataInputStream(in); 762 | // currently this assumes there is no negative ordinal; will have to be updated if we 763 | // add non-stateless strategies (for which we've reserved negative ordinals; see 764 | // Strategy.ordinal()). 765 | strategyOrdinal = din.readByte(); 766 | fpp = din.readDouble(); 767 | size = din.readLong(); 768 | checksum = din.readLong(); 769 | numBuckets = din.readLong(); 770 | numEntriesPerBucket = din.readInt(); 771 | numBitsPerEntry = din.readInt(); 772 | dataLength = din.readInt(); 773 | 774 | CuckooStrategy cuckooStrategy = CuckooStrategies.values()[strategyOrdinal].strategy(); 775 | long[] data = new long[dataLength]; 776 | for (int i = 0; i < data.length; i++) { 777 | data[i] = din.readLong(); 778 | } 779 | return new CuckooFilter( 780 | new CuckooTable(data, size, checksum, numBuckets, numEntriesPerBucket, numBitsPerEntry), 781 | funnel, cuckooStrategy, fpp); 782 | } catch (RuntimeException e) { 783 | IOException ioException = new IOException( 784 | "Unable to deserialize CuckooFilter from InputStream." 
785 | + " strategyOrdinal: " + strategyOrdinal 786 | + " fpp: " + fpp 787 | + " size: " + size 788 | + " checksum: " + checksum 789 | + " numBuckets: " + numBuckets 790 | + " numEntriesPerBucket: " + numEntriesPerBucket 791 | + " numBitsPerEntry: " + numBitsPerEntry 792 | + " dataLength: " + dataLength); 793 | ioException.initCause(e); 794 | throw ioException; 795 | } 796 | } 797 | 798 | /** 799 | * Returns the number of longs required by a CuckooTable for storage given the dimensions chosen 800 | * by the CuckooFilter to support {@code capacity) @ {@code fpp}. 801 | * 802 | * CuckooTable current impl uses a single long[] for data storage, so the calculated value must be 803 | * <= Integer.MAX_VALUE at this time. 804 | */ 805 | @VisibleForTesting 806 | static int calculateDataLength(long capacity, double fpp) { 807 | return CuckooTable.calculateDataLength( 808 | optimalNumberOfBuckets(capacity, optimalEntriesPerBucket(fpp)), 809 | optimalEntriesPerBucket(fpp), 810 | optimalBitsPerEntry(fpp, optimalEntriesPerBucket(fpp))); 811 | } 812 | 813 | @Override 814 | public String toString() { 815 | return "CuckooFilter{" + 816 | "table=" + table + 817 | ", funnel=" + funnel + 818 | ", strategy=" + cuckooStrategy + 819 | ", capacity=" + capacity() + 820 | ", fpp=" + fpp + 821 | ", currentFpp=" + currentFpp() + 822 | ", size=" + sizeLong() + 823 | '}'; 824 | } 825 | 826 | private void checkCompatibility(ProbabilisticFilter f, String verb) { 827 | checkArgument(f instanceof CuckooFilter, "Cannot" + verb + " a " + 828 | this.getClass().getSimpleName() + " with a " + f.getClass().getSimpleName()); 829 | checkArgument(this.isCompatible(f), "Cannot " + verb + " incompatible filters. 
" + 830 | this.getClass().getSimpleName() + " instances must have equivalent funnels; the same " + 831 | "strategy; and the same number of buckets, entries per bucket, and bits per entry."); 832 | } 833 | 834 | } 835 | -------------------------------------------------------------------------------- /src/main/java/com/duprasville/guava/probably/CuckooStrategies.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Brian Dupras 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5 | * in compliance with the License. You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software distributed under the License 10 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 | * or implied. See the License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | 15 | package com.duprasville.guava.probably; 16 | 17 | import com.google.common.hash.Hashing; 18 | 19 | /** 20 | * Collections of strategies of generating the f-bit fingerprint, index i1 and index i2 required for 21 | * an element to be mapped to a CuckooTable of m buckets with hash function h. These strategies are 22 | * part of the serialized form of the Cuckoo filters that use them, thus they must be preserved as 23 | * is (no updates allowed, only introduction of new versions).

* <p>Important: the order of the constants cannot change, and they cannot be deleted - we depend
 * on their ordinal for CuckooFilter serialization.
 *
 * @author Brian Dupras
 */
public enum CuckooStrategies {
  /**
   * Adaptation of "Cuckoo Filter: Practically Better Than Bloom", Bin Fan, et al, that is
   * comparable to a Bloom Filter's memory efficiency, supports entry deletion, and can accept up to
   * 12.8 billion entries at 3% FPP.
   *
   * <p>This strategy uses 32 bits of {@link Hashing#murmur3_128} to find an entry's primary index.
   * The next non-zero f-bit segment of the hash is used as the entry's fingerprint. An entry's
   * alternate index is defined as {@code [index + hash(fingerprint) * parsign(index)] modulo
   * bucket_count}, where {@code hash(fingerprint)} is always odd, and {@code parsign(index)} is
   * defined as {@code +1} when {@code index} is even and {@code -1} when {@code index} is odd. The
   * filter's bucket count is rounded up to an even number. By specifying an even number of buckets
   * and an odd fingerprint hash, the parity of the alternate index is guaranteed to be opposite the
   * parity of the primary index. The use of the index's parity to apply a sign to {@code
   * hash(fingerprint)} causes the operation to be reversible, i.e. {@code index(e) ==
   * altIndex(altIndex(e))}.
   *
   * <p>A notable difference of this strategy from "Cuckoo Filter" is the method of selecting an
   * entry's alternate index. In the paper, the alternate index is defined as {@code index xor
   * hash(fingerprint)}. The use of {@code xor} requires that the index space be defined as
   * [0..2^f]. The side-effect of this is that the Cuckoo Filter's bucket count must be a power of
   * 2, meaning the memory utilization of the filter must be "rounded up" to the next power of two.
   * This side-effect of the paper's algorithm is avoided by the algorithm as described above.
   */
  MURMUR128_BEALDUPRAS_32() {
    /** Builds a new strategy instance tagged with this constant's ordinal. */
    @Override
    public CuckooStrategy strategy() {
      return new CuckooStrategyMurmurBealDupras32(this.ordinal());
    }
  };

  /** Returns the {@link CuckooStrategy} implementation associated with this constant. */
  public abstract CuckooStrategy strategy();
}

--------------------------------------------------------------------------------
/src/main/java/com/duprasville/guava/probably/CuckooStrategy.java:
--------------------------------------------------------------------------------
/*
 * Copyright (C) 2015 Brian Dupras
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
*/

package com.duprasville.guava.probably;

import com.google.common.hash.Funnel;

import java.io.Serializable;

/**
 * Contract between a cuckoo filter and the algorithm that places, finds and removes entries in
 * a {@code CuckooTable}. Implementations are {@link Serializable}.
 *
 * <p>NOTE(review): the type parameter {@code T} used below is not declared in this listing;
 * generic declarations appear to have been lost when the listing was produced - confirm against
 * the original source.
 */
interface CuckooStrategy extends Serializable {
  /** Identifier of this strategy; presumably the ordinal of its {@code CuckooStrategies} constant. */
  int ordinal();

  /** Attempts to add {@code object} to {@code table}; returns whether it was stored. */
  boolean add(T object, Funnel funnel, CuckooTable table);

  /** Attempts to remove one occurrence of {@code object} from {@code table}; returns success. */
  boolean remove(T object, Funnel funnel, CuckooTable table);

  /** Returns whether {@code table} holds an entry matching {@code object}. */
  boolean contains(T object, Funnel funnel, CuckooTable table);

  /** Table-level bulk add of {@code that} into {@code thiz} - exact semantics not shown here. */
  boolean addAll(CuckooTable thiz, CuckooTable that);

  /** Returns whether the two tables are considered equivalent by this strategy. */
  boolean equivalent(CuckooTable thiz, CuckooTable that);

  /** Table-level containment check of {@code that} in {@code thiz} - semantics not shown here. */
  boolean containsAll(CuckooTable thiz, CuckooTable that);

  /** Table-level bulk removal of {@code that} from {@code thiz}; returns success. */
  boolean removeAll(CuckooTable thiz, CuckooTable that);
}

--------------------------------------------------------------------------------
/src/main/java/com/duprasville/guava/probably/CuckooStrategyMurmurBealDupras32.java:
--------------------------------------------------------------------------------
/*
 * Copyright (C) 2015 Brian Dupras
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 * in compliance with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software distributed under the License
 * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied. See the License for the specific language governing permissions and limitations under
 * the License.
13 | */ 14 | 15 | package com.duprasville.guava.probably; 16 | 17 | import com.google.common.hash.Funnel; 18 | import com.google.common.hash.HashCode; 19 | import com.google.common.hash.HashFunction; 20 | import com.google.common.hash.Hashing; 21 | 22 | import java.util.Random; 23 | 24 | import static com.google.common.base.Preconditions.checkArgument; 25 | import static com.google.common.math.LongMath.mod; 26 | 27 | /** 28 | * Cuckoo Filter strategy employing Murmur3 32-bit hashes and parity-based altIndex calculation. 29 | * 30 | * @author Brian Dupras 31 | * @author Alex Beal 32 | */ 33 | class CuckooStrategyMurmurBealDupras32 extends AbstractCuckooStrategy implements CuckooStrategy { 34 | private static final int MAX_RELOCATION_ATTEMPTS = 500; 35 | private static final HashFunction hashFunction = Hashing.murmur3_128(); 36 | 37 | CuckooStrategyMurmurBealDupras32(int ordinal) { 38 | super(ordinal); 39 | } 40 | 41 | public boolean add(T object, Funnel funnel, CuckooTable table) { 42 | final long hash64 = hash(object, funnel).asLong(); 43 | final int hash1 = hash1(hash64); 44 | final int hash2 = hash2(hash64); 45 | final int fingerprint = fingerprint(hash2, table.numBitsPerEntry); 46 | 47 | final long index = index(hash1, table.numBuckets); 48 | return putEntry(fingerprint, table, index) || 49 | putEntry(fingerprint, table, altIndex(index, fingerprint, table.numBuckets)); 50 | } 51 | 52 | protected long maxRelocationAttempts() { 53 | return MAX_RELOCATION_ATTEMPTS; 54 | } 55 | 56 | private final Random kicker = new Random(1L); 57 | 58 | protected int pickEntryToKick(int numEntriesPerBucket) { 59 | return kicker.nextInt(numEntriesPerBucket); 60 | } 61 | 62 | public boolean remove(T object, Funnel funnel, CuckooTable table) { 63 | final long hash64 = hash(object, funnel).asLong(); 64 | final int hash1 = hash1(hash64); 65 | final int hash2 = hash2(hash64); 66 | final int fingerprint = fingerprint(hash2, table.numBitsPerEntry); 67 | final long index1 = index(hash1, 
table.numBuckets); 68 | final long index2 = altIndex(index1, fingerprint, table.numBuckets); 69 | return table.swapAnyEntry(CuckooTable.EMPTY_ENTRY, fingerprint, index1) 70 | || table.swapAnyEntry(CuckooTable.EMPTY_ENTRY, fingerprint, index2); 71 | } 72 | 73 | public boolean contains(T object, Funnel funnel, CuckooTable table) { 74 | final long hash64 = hash(object, funnel).asLong(); 75 | final int hash1 = hash1(hash64); 76 | final int hash2 = hash2(hash64); 77 | final int fingerprint = fingerprint(hash2, table.numBitsPerEntry); 78 | final long index1 = index(hash1, table.numBuckets); 79 | final long index2 = altIndex(index1, fingerprint, table.numBuckets); 80 | return table.hasEntry(fingerprint, index1) || table.hasEntry(fingerprint, index2); 81 | } 82 | 83 | HashCode hash(final T object, final Funnel funnel) { 84 | return hashFunction.hashObject(object, funnel); 85 | } 86 | 87 | int hash1(long hash64) { 88 | return (int) hash64; 89 | } 90 | 91 | int hash2(long hash64) { 92 | return (int) (hash64 >>> 32); 93 | } 94 | 95 | /** 96 | * Returns an f-bit portion of the given hash. Iterating by f-bit segments from the least 97 | * significant side of the hash to the most significant, looks for a non-zero segment. If a 98 | * non-zero segment isn't found, 1 is returned to distinguish the fingerprint from a 99 | * non-entry. 
100 | * 101 | * @param hash 32-bit hash value 102 | * @param f number of bits to consider from the hash 103 | * @return first non-zero f-bit value from hash as an int, or 1 if no non-zero value is found 104 | */ 105 | public static int fingerprint(int hash, int f) { 106 | checkArgument(f > 0, "f must be greater than zero"); 107 | checkArgument(f <= Integer.SIZE, "f must be less than " + Integer.SIZE); 108 | int mask = (0x80000000 >> (f - 1)) >>> (Integer.SIZE - f); 109 | 110 | for (int bit = 0; (bit + f) <= Integer.SIZE; bit += f) { 111 | int ret = (hash >> bit) & mask; 112 | if (0 != ret) { 113 | return ret; 114 | } 115 | } 116 | return 0x1; 117 | } 118 | 119 | /** 120 | * Calculates a primary index for an entry in the cuckoo table given the entry's 32-bit 121 | * hash and the table's size in buckets, m. 122 | * 123 | * tl;dr simply a wrap-around modulo bound by 0..m-1 124 | * 125 | * @param hash 32-bit hash value 126 | * @param m size of cuckoo table in buckets 127 | * @return index, bound by 0..m-1 inclusive 128 | */ 129 | @Override 130 | public long index(int hash, long m) { 131 | return mod(hash, m); 132 | } 133 | 134 | /** 135 | * Calculates an alternate index for an entry in the cuckoo table. 136 | * 137 | * tl;dr 138 | * Calculates an offset as an odd hash of the fingerprint and adds to, or subtracts from, 139 | * the starting index, wrapping around the table (mod) as necessary. 140 | * 141 | * Detail: 142 | * Hash the fingerprint 143 | * make it odd (*) 144 | * flip the sign if starting index is odd 145 | * sum with starting index (**) 146 | * and modulo to 0..m-1 147 | * 148 | * (*) Constraining the CuckooTable to an even size in buckets, and applying odd offsets 149 | * guarantees opposite parities for index & altIndex. The parity of the starting index 150 | * determines whether the offset is subtracted from or added to the starting index. 151 | * This strategy guarantees altIndex() is reversible, i.e. 
152 | * 153 | * index == altIndex(altIndex(index, fingerprint, m), fingerprint, m) 154 | * 155 | * (**) Summing the starting index and offset can possibly lead to numeric overflow. See 156 | * {@link #protectedSum(long, long, long)} protectedSum} for details on how this is 157 | * avoided. 158 | * 159 | * @param index starting index 160 | * @param fingerprint fingerprint 161 | * @param m size of table in buckets; must be even for this strategy 162 | * @return an alternate index for fingerprint bounded by 0..m-1 163 | */ 164 | @Override 165 | public long altIndex(long index, int fingerprint, long m) { 166 | checkArgument(0L <= index, "index must be a positive!"); 167 | checkArgument((0L <= m) && (0L == (m & 0x1L)), "m must be a positive even number!"); 168 | return mod(protectedSum(index, parsign(index) * odd(hash(fingerprint)), m), m); 169 | } 170 | 171 | /** 172 | * Maps parity of i to a sign. 173 | * 174 | * @return 1 if i is even parity, -1 if i is odd parity 175 | */ 176 | static long parsign(long i) { 177 | return ((i & 0x01L) * -2L) + 1L; 178 | } 179 | 180 | static int hash(int i) { 181 | return hashFunction.hashInt(i).asInt(); 182 | } 183 | 184 | static long odd(long i) { 185 | return i | 0x01L; 186 | } 187 | 188 | /** 189 | * Returns the sum of index and offset, reduced by a mod-consistent amount if necessary to 190 | * protect from numeric overflow. This method is intended to support a subsequent mod operation 191 | * on the return value. 192 | * 193 | * @param index Assumed to be >= 0L. 194 | * @param offset Any value. 195 | * @param mod Value used to reduce the result, 196 | * @return sum of index and offset, reduced by a mod-consistent amount if necessary to protect 197 | * from numeric overflow. 198 | */ 199 | static long protectedSum(long index, long offset, long mod) { 200 | return canSum(index, offset) ? 
index + offset : protectedSum(index - mod, offset, mod); 201 | } 202 | 203 | static boolean canSum(long a, long b) { 204 | return (a ^ b) < 0 | (a ^ (a + b)) >= 0; 205 | } 206 | 207 | } 208 | -------------------------------------------------------------------------------- /src/main/java/com/duprasville/guava/probably/CuckooTable.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Brian Dupras 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5 | * in compliance with the License. You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software distributed under the License 10 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 | * or implied. See the License for the specific language governing permissions and limitations under 12 | * the License. 
13 | */ 14 | 15 | package com.duprasville.guava.probably; 16 | 17 | import com.google.common.annotations.VisibleForTesting; 18 | import com.google.common.base.Objects; 19 | import com.google.common.math.LongMath; 20 | import com.google.common.primitives.Ints; 21 | 22 | import java.math.RoundingMode; 23 | import java.util.Arrays; 24 | import java.util.Random; 25 | 26 | import static com.google.common.base.Preconditions.checkArgument; 27 | import static java.lang.Math.pow; 28 | 29 | class CuckooTable { 30 | static final int EMPTY_ENTRY = 0x00; 31 | 32 | public long[] data() { 33 | return data; 34 | } 35 | 36 | public long numBuckets() { 37 | return numBuckets; 38 | } 39 | 40 | public int numEntriesPerBucket() { 41 | return numEntriesPerBucket; 42 | } 43 | 44 | public int numBitsPerEntry() { 45 | return numBitsPerEntry; 46 | } 47 | 48 | final long[] data; 49 | final long numBuckets; 50 | final int numEntriesPerBucket; 51 | final int numBitsPerEntry; 52 | private long size; 53 | private long checksum; 54 | 55 | public CuckooTable(long numBuckets, int numEntriesPerBucket, int numBitsPerEntry) { 56 | this(new long[calculateDataLength(numBuckets, numEntriesPerBucket, numBitsPerEntry)] 57 | , numBuckets 58 | , numEntriesPerBucket 59 | , numBitsPerEntry 60 | , 0L 61 | ); 62 | } 63 | 64 | CuckooTable(final long[] data, long numBuckets, int numEntriesPerBucket, 65 | int numBitsPerEntry, long checksum) { 66 | this(data, 0L, checksum, numBuckets, numEntriesPerBucket, numBitsPerEntry); 67 | } 68 | 69 | public CuckooTable(final long[] data, long size, long checksum, long numBuckets, 70 | int numEntriesPerBucket, int numBitsPerEntry) { 71 | this.data = data; 72 | this.size = size; 73 | this.numBuckets = numBuckets; 74 | this.numEntriesPerBucket = numEntriesPerBucket; 75 | this.numBitsPerEntry = numBitsPerEntry; 76 | this.checksum = checksum; 77 | } 78 | 79 | public CuckooTable copy() { 80 | return new CuckooTable( 81 | data.clone(), size, checksum, numBuckets, numEntriesPerBucket, 
numBitsPerEntry); 82 | } 83 | 84 | public static int calculateDataLength(long numBuckets, int numEntriesPerBucket, int numBitsPerEntry) { 85 | checkArgument(numBuckets > 0, "numBuckets (%s) must be > 0", numBuckets); 86 | checkArgument(numEntriesPerBucket > 0, "numEntriesPerBucket (%s) must be > 0", 87 | numEntriesPerBucket); 88 | checkArgument(numBitsPerEntry > 0, "numBitsPerEntry (%s) must be > 0", numBitsPerEntry); 89 | 90 | return Ints.checkedCast(LongMath.divide( 91 | LongMath.checkedMultiply(numBuckets, 92 | LongMath.checkedMultiply(numEntriesPerBucket, numBitsPerEntry)), 93 | Long.SIZE, RoundingMode.CEILING)); 94 | } 95 | 96 | public int findEntry(int value, long bucket) { 97 | for (int i = 0; i < numEntriesPerBucket; i++) { 98 | if (value == readEntry(bucket, i)) { 99 | return i; 100 | } 101 | } 102 | return -1; 103 | } 104 | 105 | public int countEntry(int value, long bucket) { 106 | int ret = 0; 107 | for (int i = 0; i < numEntriesPerBucket; i++) { 108 | if (value == readEntry(bucket, i)) { 109 | ret++; 110 | } 111 | } 112 | return ret; 113 | } 114 | 115 | public boolean hasEntry(int value, long bucket) { 116 | return findEntry(value, bucket) >= 0; 117 | } 118 | 119 | public int readEntry(long bucket, int entry) { 120 | return readBits( 121 | data, bitOffset(bucket, entry, numEntriesPerBucket, numBitsPerEntry), numBitsPerEntry); 122 | } 123 | 124 | public boolean swapAnyEntry(int valueIn, int valueOut, long bucket) { 125 | final int entry = findEntry(valueOut, bucket); 126 | if (entry >= 0) { 127 | final int kicked = swapEntry(valueIn, bucket, entry); 128 | assert valueOut == kicked : "expected valueOut [" + valueOut + "] != actual kicked [" + 129 | kicked + "]"; 130 | return true; 131 | } 132 | return false; 133 | } 134 | 135 | int swapEntry(int value, long bucket, int entry) { 136 | final int kicked = writeBits(value, data, 137 | bitOffset(bucket, entry, numEntriesPerBucket, numBitsPerEntry), numBitsPerEntry); 138 | checksum += value - kicked; 139 | 140 
| if ((EMPTY_ENTRY == value) && (EMPTY_ENTRY != kicked)) { 141 | size--; 142 | } else if ((EMPTY_ENTRY != value) && (EMPTY_ENTRY == kicked)) { 143 | size++; 144 | } 145 | assert size >= 0 : "Hmm - that's strange. CuckooTable size [" + size + "] shouldn't be < 0l"; 146 | 147 | return kicked; 148 | } 149 | 150 | static long bitOffset(long bucket, int entry, int numEntriesPerBucket, int numBitsPerEntry) { 151 | return (bucket * numEntriesPerBucket + entry) * numBitsPerEntry; 152 | } 153 | 154 | static int dataIndex(long bit) { 155 | return (int) (bit >>> 6); 156 | } 157 | 158 | @VisibleForTesting 159 | static int readBits(final long[] data, long bit, int len) { 160 | final int startLower = (int) (bit % Long.SIZE); 161 | final int lenLower = Math.min(Long.SIZE - startLower, len); 162 | final int lenUpper = Math.max(len - lenLower, 0); 163 | 164 | final int indexUpper = dataIndex(bit + len); 165 | 166 | final long lower = (data[dataIndex(bit)] & mask(startLower, lenLower)) >>> startLower; 167 | final long upper = indexUpper < data.length ? 
168 | (data[indexUpper] & mask(0, lenUpper)) << lenLower : 0x00L; 169 | 170 | return (int) (lower | upper); 171 | } 172 | 173 | @VisibleForTesting 174 | static int writeBits(int bits, final long[] data, long bit, int len) { 175 | final int ret = readBits(data, bit, len); 176 | 177 | final long bitsl = ((long) bits) & 0x00000000FFFFFFFFL; // upcast without carrying the sign 178 | 179 | final int startLower = (int) (bit % Long.SIZE); 180 | final int lenLower = Math.min(Long.SIZE - startLower, len); 181 | final int lenUpper = Math.max(len - lenLower, 0); 182 | 183 | final long maskLowerKeep = ~(mask(0, lenLower) << startLower); 184 | final long maskUpperKeep = mask(lenUpper, Long.SIZE - lenUpper); 185 | 186 | final long bitsLower = (bitsl << startLower) & ~maskLowerKeep; 187 | final long bitsUpper = (bitsl >>> (len - lenUpper)) & ~maskUpperKeep; 188 | 189 | final int indexLower = dataIndex(bit); 190 | final int indexUpper = dataIndex(bit + len - 1); 191 | 192 | final long dataLower = (data[indexLower] & maskLowerKeep) | bitsLower; 193 | data[indexLower] = dataLower; 194 | 195 | if (indexLower != indexUpper) { 196 | final long dataUpper = (data[indexUpper] & maskUpperKeep) | bitsUpper; 197 | data[indexUpper] = dataUpper; 198 | } 199 | 200 | return ret; 201 | } 202 | 203 | static long mask(int start, int len) { 204 | return (len <= 0) ? 
0L : (0x8000000000000000L >> (len - 1)) >>> (Long.SIZE - (start + len)); 205 | } 206 | 207 | @Override 208 | public boolean equals(Object o) { 209 | if (o instanceof CuckooTable) { 210 | CuckooTable that = (CuckooTable) o; 211 | return this.numBuckets == that.numBuckets 212 | && this.numEntriesPerBucket == that.numEntriesPerBucket 213 | && this.numBitsPerEntry == that.numBitsPerEntry 214 | && this.size == that.size 215 | && this.checksum == that.checksum 216 | ; 217 | } 218 | return false; 219 | } 220 | 221 | @Override 222 | public int hashCode() { 223 | return Objects.hashCode(numBuckets, numEntriesPerBucket, numBitsPerEntry, size, 224 | checksum); 225 | } 226 | 227 | public boolean isCompatible(CuckooTable that) { 228 | return this.numBuckets == that.numBuckets 229 | && this.numEntriesPerBucket == that.numEntriesPerBucket 230 | && this.numBitsPerEntry == that.numBitsPerEntry; 231 | } 232 | 233 | public long size() { 234 | return size < 0 ? /* indicates overflow */ Long.MAX_VALUE : size; 235 | } 236 | 237 | public long checksum() { 238 | return checksum; 239 | } 240 | 241 | public long bitSize() { 242 | return (long) data.length * Long.SIZE; 243 | } 244 | 245 | public long capacity() { 246 | return numBuckets * numEntriesPerBucket; 247 | } 248 | 249 | public double load() { 250 | return (double) size() / (double) capacity(); 251 | } 252 | 253 | public double currentFpp() { 254 | return fppAtGivenLoad(load()); 255 | } 256 | 257 | public double fppAtGivenLoad(double load) { 258 | return 1.0D - pow( 259 | ( pow(2, numBitsPerEntry) - 2 ) 260 | / 261 | ( pow(2, numBitsPerEntry) - 1 ) 262 | , 263 | 2 * numEntriesPerBucket * load 264 | ); 265 | } 266 | 267 | public double averageBitsPerEntry() { 268 | return (double) bitSize() / (double) size; 269 | } 270 | 271 | @Override 272 | public String toString() { 273 | return getClass().getSimpleName() + "{" + 274 | "size=" + size + 275 | ", checksum=" + checksum + 276 | ", byteSize=" + bitSize() / Byte.SIZE + 277 | ", load=" + 
load() + 278 | ", capacity=" + capacity() + 279 | ", averageBitsPerEntry=" + averageBitsPerEntry() + 280 | ", numBuckets=" + numBuckets + 281 | ", numEntriesPerBucket=" + numEntriesPerBucket + 282 | ", numBitsPerEntry=" + numBitsPerEntry + 283 | '}'; 284 | } 285 | 286 | public void clear() { 287 | Arrays.fill(data, 0L); 288 | size = 0L; 289 | } 290 | } 291 | -------------------------------------------------------------------------------- /src/main/java/com/duprasville/guava/probably/ProbabilisticFilter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Brian Dupras 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5 | * in compliance with the License. You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software distributed under the License 10 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 | * or implied. See the License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | 15 | package com.duprasville.guava.probably; 16 | 17 | import java.util.Collection; 18 | 19 | import javax.annotation.CheckReturnValue; 20 | 21 | /** 22 | * A probabilistic filter offers an approximate containment test with one-sided error: if it claims 23 | * that an element is contained in it, this might be in error, but if it claims that an 24 | * element is not contained in it, then this is definitely true.

The false 25 | * positive probability ({@code FPP}) of a probabilistic filter is defined as the probability that 26 | * {@link #contains(Object)} will erroneously return {@code true} for an element that is not 27 | * actually contained in the filter.

28 | * 29 | * @param the type of elements that this filter accepts 30 | * @author Brian Dupras 31 | * @see CuckooFilter 32 | * @see BloomFilter 33 | */ 34 | public interface ProbabilisticFilter { 35 | /** 36 | * Adds the specified element to this filter (optional operation). A return value of {@code true} 37 | * ensures that {@link #contains(Object)} given {@code e} will also return {@code true}. 38 | * 39 | * @param e element to be added to this filter 40 | * @return {@code true} if {@code e} was successfully added to the filter, {@code false} if this 41 | * is definitely not the case 42 | * @throws UnsupportedOperationException if the {@link #add(Object)} operation is not supported by 43 | * this filter 44 | * @throws ClassCastException if the class of the specified element prevents it from 45 | * being added to this filter 46 | * @throws NullPointerException if the specified element is {@code null} and this filter 47 | * does not permit {@code null} elements 48 | * @throws IllegalArgumentException if some property of the specified element prevents it 49 | * from being added to this filter 50 | * @see #contains(Object) 51 | * @see #addAll(Collection) 52 | * @see #addAll(ProbabilisticFilter) 53 | */ 54 | @CheckReturnValue 55 | boolean add(E e); 56 | 57 | /** 58 | * Combines {@code this} filter with another compatible filter (optional operation). The mutations 59 | * happen to {@code this} instance. Callers must ensure {@code this} filter is appropriately sized 60 | * to avoid saturating it or running out of space. 
61 | * 62 | * @param f filter to be combined into {@code this} filter - {@code f} is not mutated 63 | * @return {@code true} if the operation was successful, {@code false} otherwise 64 | * @throws UnsupportedOperationException if the {@link #addAll(ProbabilisticFilter)} operation is 65 | * not supported by this filter 66 | * @throws NullPointerException if the specified filter is {@code null} 67 | * @throws IllegalArgumentException if {@link #isCompatible(ProbabilisticFilter)} {@code == 68 | * false} 69 | * @throws IllegalStateException if this filter cannot be combined with the specified 70 | * filter at this time due to insertion restrictions 71 | * @see #add(Object) 72 | * @see #addAll(Collection) 73 | * @see #contains(Object) 74 | */ 75 | @CheckReturnValue 76 | boolean addAll(ProbabilisticFilter f); 77 | 78 | /** 79 | * Adds all of the elements in the specified collection to this filter (optional operation). The 80 | * behavior of this operation is undefined if the specified collection is modified while the 81 | * operation is in progress. 
82 | * 83 | * @param c collection containing elements to be added to this filter 84 | * @return {@code true} if all elements of the collection were successfully added, {@code false} 85 | * otherwise 86 | * @throws UnsupportedOperationException if the {@link #addAll(Collection)} operation is not 87 | * supported by this filter 88 | * @throws ClassCastException if the class of an element of the specified collection 89 | * prevents it from being added to this filter 90 | * @throws NullPointerException if the specified collection contains a {@code null} 91 | * element and this filter does not permit {@code null} 92 | * elements, or if the specified collection is {@code null} 93 | * @throws IllegalArgumentException if some property of an element of the specified 94 | * collection prevents it from being added to this filter 95 | * @throws IllegalStateException if not all the elements can be added at this time due to 96 | * insertion restrictions 97 | * @see #add(Object) 98 | * @see #addAll(ProbabilisticFilter) 99 | * @see #contains(Object) 100 | */ 101 | @CheckReturnValue 102 | boolean addAll(Collection c); 103 | 104 | /** 105 | * Removes all of the elements from this filter (optional operation). The filter will be empty 106 | * after this call returns. 107 | * 108 | * @throws UnsupportedOperationException if the {@link #clear()} method is not supported by this 109 | * filter 110 | * @see #sizeLong() 111 | * @see #isEmpty() 112 | */ 113 | void clear(); 114 | 115 | /** 116 | * Removes the specified element from this filter (optional operation). The element must be 117 | * contained in the filter prior to invocation. Removing an element that isn't contained in the 118 | * filter may put the filter in an inconsistent state causing it to return false negative 119 | * responses from {@link #contains(Object)}. 120 | * 121 | * If {@code false} is returned, this is definitely an indication that the specified 122 | * element wasn't contained in the filter prior to invocation. 
If the implementation treats this 123 | * condition as an error, then this filter can no longer be relied upon to return correct {@code 124 | * false} responses from {@link #contains(Object)}, unless {@link #isEmpty()} is also {@code 125 | * true}. 126 | * 127 | * @param e element to be removed from this filter 128 | * @return {@code true} if this filter probably contained the specified element, {@code false} 129 | * otherwise 130 | * @throws ClassCastException if the type of the specified element is incompatible with 131 | * this filter (optional) 132 | * @throws NullPointerException if the specified element is {@code null} and this filter 133 | * does not permit {@code null} elements 134 | * @throws UnsupportedOperationException if the {@link #remove(Object)} operation is not supported 135 | * by this filter 136 | * @see #contains(Object) 137 | * @see #removeAll(Collection) 138 | * @see #removeAll(ProbabilisticFilter) 139 | */ 140 | @CheckReturnValue 141 | boolean remove(E e); 142 | 143 | /** 144 | * Removes from this filter all of its elements that are contained in the specified collection 145 | * (optional operation). All element contained in the specified collection must be contained in 146 | * the filter prior to invocation. Removing elements that aren't contained in the filter may put 147 | * the filter in an inconsistent state causing it to return false negative responses from {@link 148 | * #contains(Object)}. 149 | * 150 | * If {@code false} is returned, this is definitely an indication that the specified 151 | * collection contained elements that were not contained in this filter prior to invocation. If 152 | * the implementation treats this condition as an error, then this filter can no longer be relied 153 | * upon to return correct {@code false} responses from {@link #contains(Object)}, unless {@link 154 | * #isEmpty()} is also {@code true}. 
155 | * 156 | * @param c collection containing elements to be removed from this filter 157 | * @return {@code true} if all of the elements of the specified collection were successfully 158 | * removed from the filter, {@code false} if any of the elements was not successfully removed 159 | * @throws ClassCastException if the types of one or more elements in the specified 160 | * collection are incompatible with this filter (optional) 161 | * @throws NullPointerException if the specified collection contains one or more null 162 | * elements and this filter does not permit {@code null} 163 | * elements (optional), or if the specified collection is 164 | * {@code null} 165 | * @throws UnsupportedOperationException if the {@link #removeAll(Collection)} operation is not 166 | * supported by this filter 167 | * @see #contains(Object) 168 | * @see #remove(Object) 169 | * @see #removeAll(ProbabilisticFilter) 170 | */ 171 | @CheckReturnValue 172 | boolean removeAll(Collection c); 173 | 174 | /** 175 | * Subtracts the specified filter from {@code this} filter. The mutations happen to {@code this} 176 | * instance. Callers must ensure that the specified filter represents elements that are currently 177 | * contained in {@code this} filter. 178 | * 179 | * If {@code false} is returned, this is definitely an indication that the specified filter 180 | * contained elements that were not contained in this filter prior to invocation. If the 181 | * implementation treats this condition as an error, then this filter can no longer be relied upon 182 | * to return correct {@code false} responses from {@link #contains(Object)}, unless {@link 183 | * #isEmpty()} is also {@code true}. 184 | * 185 | * @param f filter containing elements to remove from {@code this} filter. 
{@code f} is not 186 | * mutated 187 | * @return {@code true} if the operation was successful, {@code false} otherwise 188 | * @throws UnsupportedOperationException if the {@link #removeAll(ProbabilisticFilter)} operation 189 | * is not supported by this filter 190 | * @throws NullPointerException if the specified filter is {@code null} 191 | * @throws IllegalArgumentException if {@link #isCompatible(ProbabilisticFilter)} {@code == 192 | * false} given {@code f} 193 | * @see #contains(Object) 194 | * @see #remove(Object) 195 | * @see #removeAll(Collection) 196 | */ 197 | @CheckReturnValue 198 | boolean removeAll(ProbabilisticFilter f); 199 | 200 | /** 201 | * Returns {@code true} if this filter might contain the specified element, {@code false} 202 | * if this is definitely not the case. 203 | * 204 | * @param e element whose containment in this filter is to be tested 205 | * @return {@code true} if this filter might contain the specified element, {@code false} 206 | * if this is definitely not the case. 207 | * @throws ClassCastException if the type of the specified element is incompatible with this 208 | * filter (optional) 209 | * @throws NullPointerException if the specified element is {@code null} and this filter does not 210 | * permit {@code null} elements 211 | * @see #containsAll(Collection) 212 | * @see #containsAll(ProbabilisticFilter) 213 | * @see #add(Object) 214 | * @see #remove(Object) 215 | */ 216 | boolean contains(E e); 217 | 218 | /** 219 | * Returns {@code true} if this filter might contain all of the elements of the specified 220 | * collection (optional operation). More formally, returns {@code true} if {@link 221 | * #contains(Object)} {@code == true} for all of the elements of the specified collection. 
222 | * 223 | * @param c collection containing elements to be checked for containment in this filter 224 | * @return {@code true} if this filter might contain all elements of the specified 225 | * collection 226 | * @throws ClassCastException if the types of one or more elements in the specified collection 227 | * are incompatible with this filter (optional) 228 | * @throws NullPointerException if the specified collection contains one or more {@code null} 229 | * elements and this filter does not permit {@code null} elements 230 | * (optional), or if the specified collection is {@code null} 231 | * @see #contains(Object) 232 | * @see #containsAll(ProbabilisticFilter) 233 | */ 234 | boolean containsAll(Collection c); 235 | 236 | /** 237 | * Returns {@code true} if this filter might contain all elements contained in the 238 | * specified filter (optional operation). 239 | * 240 | * @param f filter containing elements to be checked for probable containment in this filter 241 | * @return {@code true} if this filter might contain all elements contained in the 242 | * specified filter, {@code false} if this is definitely not the case. 243 | * @throws UnsupportedOperationException if the {@link #containsAll(ProbabilisticFilter)} 244 | * operation is not supported by this filter 245 | * @throws NullPointerException if the specified filter is {@code null} 246 | * @throws IllegalArgumentException if {@link #isCompatible(ProbabilisticFilter)} {@code == 247 | * false} given {@code f} 248 | * @see #contains(Object) 249 | * @see #containsAll(Collection) 250 | */ 251 | boolean containsAll(ProbabilisticFilter f); 252 | 253 | /** 254 | * Returns {@code true} if this filter contains no elements. 255 | * 256 | * @return {@code true} if this filter contains no elements 257 | * @see #sizeLong() 258 | */ 259 | boolean isEmpty(); 260 | 261 | /** 262 | * Returns the number of elements contained in this filter (its cardinality). 
If this filter 263 | * contains more than {@code Long.MAX_VALUE} elements, returns {@code Long.MAX_VALUE}. 264 | * 265 | * @return the number of elements contained in this filter (its cardinality) 266 | * @see #capacity() 267 | * @see #isEmpty() 268 | * @see #size() 269 | */ 270 | long sizeLong(); 271 | 272 | /** 273 | * Returns the number of elements contained in this filter (its cardinality). If this filter 274 | * contains more than {@code Integer.MAX_VALUE} elements, returns {@code Integer.MAX_VALUE}. Use 275 | * {@link #sizeLong()} to obtain filter sizes lager than {@code Integer.MAX_VALUE}; 276 | * 277 | *

This method is provided for consistency with the Collections API.

278 | * 279 | * @return the number of elements contained in this filter (its cardinality) 280 | * @see #capacity() 281 | * @see #isEmpty() 282 | * @see #sizeLong() 283 | */ 284 | long size(); 285 | 286 | /** 287 | * Returns {@code true} if the specified filter is compatible with {@code this} filter. {@code f} 288 | * is considered compatible if {@code this} filter can use it in combinatoric operations (e.g. 289 | * {@link #addAll(ProbabilisticFilter)}, {@link #containsAll(ProbabilisticFilter)}, {@link 290 | * #removeAll(ProbabilisticFilter)}). 291 | * 292 | * @param f filter to check for compatibility with {@code this} filter 293 | * @return {@code true} if the specified filter is compatible with {@code this} filter 294 | * @throws NullPointerException if the specified filter is {@code null} 295 | * @see #addAll(ProbabilisticFilter) 296 | * @see #containsAll(ProbabilisticFilter) 297 | * @see #removeAll(ProbabilisticFilter) 298 | */ 299 | boolean isCompatible(ProbabilisticFilter f); 300 | 301 | /** 302 | * Returns the number of elements this filter can represent at its requested {@code FPP}. This may 303 | * not be a hard limit of the filter implementation. It is permissible for a filter to contain 304 | * more elements than its requested capacity, though its {@code FPP} may suffer. 305 | * 306 | * @return the number of elements this filter can represent at its requested {@code FPP}. 307 | * @see #fpp() 308 | * @see #currentFpp() 309 | * @see #sizeLong() 310 | */ 311 | long capacity(); 312 | 313 | /** 314 | * Returns the current false positive probability ({@code FPP}) of this filter. 315 | * 316 | * @return the probability that {@link #contains(Object)} will erroneously return {@code true} 317 | * given an element that has not actually been added to the filter. 318 | * @see #fpp() 319 | */ 320 | double currentFpp(); 321 | 322 | /** 323 | * Returns the intended {@code FPP} limit of this filter. This may not be a hard limit of the 324 | * filter implementation. 
It is permissible for a filter's {@code FPP} to degrade (e.g. via 325 | * saturation) beyond its intended limit. 326 | * 327 | * @return the intended {@code FPP} limit of this filter. 328 | * @see #currentFpp() 329 | */ 330 | double fpp(); 331 | } 332 | -------------------------------------------------------------------------------- /src/main/java/com/duprasville/guava/probably/package-info.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Brian Dupras 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5 | * in compliance with the License. You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software distributed under the License 10 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 | * or implied. See the License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | 15 | /** 16 | * Probabilistic data structures for Guava. 17 | * 18 | *

This package is a part of the open-source Guava-Probably 19 | * library. 20 | * 21 | *

Contents

22 | * 23 | *

Probabilistic Filters

24 | * 25 | *
    26 | * 27 | *
  • {@link com.duprasville.guava.probably.ProbabilisticFilter} - interface defining basic methods 28 | * of probabilistic filters: {@link com.duprasville.guava.probably.ProbabilisticFilter#add(Object)}, 29 | * {@link com.duprasville.guava.probably.ProbabilisticFilter#contains(Object)}, and {@link 30 | * com.duprasville.guava.probably.ProbabilisticFilter#currentFpp()}.
  • 31 | * 32 | *
  • {@link com.duprasville.guava.probably.CuckooFilter} - Cuckoo filter implementation that 33 | * supports deletion.
  • 34 | * 35 | *
  • {@link com.duprasville.guava.probably.BloomFilter} - Bloom filter implementation backed by 36 | * Guava's BloomFilter.
  • 37 | * 38 | *
39 | */ 40 | package com.duprasville.guava.probably; 41 | -------------------------------------------------------------------------------- /src/test/java/com/duprasville/guava/probably/AbstractProbabilisticFilterTest.java: -------------------------------------------------------------------------------- 1 | package com.duprasville.guava.probably; 2 | 3 | import org.junit.Before; 4 | import org.junit.Ignore; 5 | import org.junit.Test; 6 | 7 | import java.util.Arrays; 8 | import java.util.Collection; 9 | import java.util.Random; 10 | 11 | import static com.google.common.base.Preconditions.checkArgument; 12 | import static com.google.common.truth.Truth.assertWithMessage; 13 | import static junit.framework.Assert.assertEquals; 14 | import static junit.framework.Assert.assertFalse; 15 | import static junit.framework.Assert.assertTrue; 16 | import static junit.framework.Assert.fail; 17 |
/**
 * Common tests of ProbabilisticFilter interface implementations.
 *
 * <p>Subclasses supply the implementation under test through {@link #filter(int, double)} and
 * provide the implementation-specific {@link #capacity()} and {@link #fpp()} tests.
 *
 * <p>Preconditions are checked with JUnit assertions rather than the Java {@code assert}
 * statement: the statements being checked have side effects (they populate the filters), and an
 * {@code assert} statement is not evaluated at all when JVM assertions are disabled.
 *
 * @author Brian Dupras
 */
public abstract class AbstractProbabilisticFilterTest {
  /** Creates a filter of the implementation under test with the given capacity and fpp. */
  abstract ProbabilisticFilter filter(int capacity, double fpp);

  // Fixed seed so that the randomly generated items are the same on every run.
  Random random = new Random(1L);
  static final int FILTER_CAPACITY = 1000000;
  static final double FILTER_FPP = 0.002D;
  ProbabilisticFilter filter;
  ProbabilisticFilter filter2;

  static final int TINY_FILTER_CAPACITY = 1;
  static final double TINY_FILTER_FPP = 0.002D;
  ProbabilisticFilter tinyFilter;

  /** Creates a filter with the default test capacity and fpp. */
  private ProbabilisticFilter filter() {
    return filter(FILTER_CAPACITY, FILTER_FPP);
  }

  /** Creates a filter with the tiny test capacity and fpp. */
  private ProbabilisticFilter tinyFilter() {
    return filter(TINY_FILTER_CAPACITY, TINY_FILTER_FPP);
  }

  /** Gives every test fresh filter instances. */
  @Before
  public void setUp() {
    filter = filter();
    filter2 = filter();
    tinyFilter = tinyFilter();
  }

  @Test
  public void addEShouldReturnTrueWhenFilterIsNotFull() {
    assertTrue(filter.add("foo"));
    assertTrue(filter.contains("foo"));
  }

  @Test(expected = NullPointerException.class)
  public void addNullShouldThrowNullPointerException() {
    filter.add(null);
    fail();
  }

  @Test
  public void addAllCollectionOfEShouldReturnTrue() {
    assertTrue(filter.addAll(Arrays.asList("foo", "bar")));
    assertTrue(filter.containsAll(Arrays.asList("bar", "foo")));
  }

  @Test(expected = NullPointerException.class)
  public void addAllNullCollectionShouldThrowNullPointerException() {
    // The cast selects the Collection overload of addAll.
    filter.addAll((Collection) null);
    fail();
  }

  @Test(expected = NullPointerException.class)
  public void addAllCollectionOfEContainingNullShouldThrowNullPointerException() {
    filter.addAll(Arrays.asList("foo", "bar", null));
    fail();
  }

  @Test
  public void addAllProbabilisticFilterOfEShouldReturnTrue() {
    assertTrue(filter.addAll(Arrays.asList("foo", "bar", "baz", "boz", "foz")));
    assertTrue(filter2.addAll(Arrays.asList("foo2", "bar2", "baz2", "boz2", "foz2")));
    assertTrue(filter.addAll(filter2));
  }

  @Test(expected = IllegalArgumentException.class)
  public void addAllProbabilisticFilterOfEThatIsNotCompatibleShouldThrowIllegalArgumentException() {
    filter.addAll(tinyFilter);
    fail();
  }

  @Test(expected = NullPointerException.class)
  public void addAllNullProbabilisticFilterShouldThrowNullPointerException() {
    // The cast selects the ProbabilisticFilter overload of addAll.
    filter.addAll((ProbabilisticFilter) null);
    fail();
  }

  @Test
  public void clearShouldRemovePreviouslyContainedElements() {
    assertEquals(0, filter.size());
    assertTrue(filter.add("foo"));
    assertEquals(1, filter.size());
    assertTrue(filter.contains("foo"));
    filter.clear();
    assertEquals(0, filter.size());
    assertFalse(filter.contains("foo"));
  }

  @Test
  public void containsEThatIsContainedShouldReturnTrue() {
    assertTrue(filter.add("foo"));
    assertTrue(filter.contains("foo"));
  }

  @Test
  public void containsEThatIsNotContainedShouldReturnFalse() {
    assertFalse(filter.contains("bar"));
  }

  @Test(expected = NullPointerException.class)
  public void containsNullShouldThrowNullPointerException() {
    filter.contains(null);
    fail();
  }

  @Test
  public void containsAllCollectionOfEThatIsFullyContainedShouldReturnTrue() {
    assertTrue(filter.addAll(Arrays.asList("foo", "bar", "baz")));
    assertTrue(filter.containsAll(Arrays.asList("foo", "bar")));
    assertTrue(filter.containsAll(Arrays.asList("foo", "bar", "baz")));
  }

  @Test
  public void containsAllCollectionOfEThatIsNotFullyContainedShouldReturnFalse() {
    assertTrue(filter.addAll(Arrays.asList("foo", "bar", "baz")));
    assertFalse(filter.containsAll(Arrays.asList("foo", "bar", "boom")));
    assertFalse(filter.containsAll(Arrays.asList("foo", "bar", "baz", "boom")));
  }

  @Test(expected = NullPointerException.class)
  public void containsAllCollectionOfEContainingNullShouldThrowNullPointerException() {
    assertTrue(filter.addAll(Arrays.asList("foo", "bar", "baz")));
    filter.containsAll(Arrays.asList("foo", null));
    fail();
  }

  @Test
  @Ignore
  public void sizeLongReturnsLongMaxValueWhenFilterSizeExceedsLogMaxValue() {
    fail("Test Not Implemented. Current filter impls can't be allocated at a sufficient size.");
  }

  @Test @Ignore("Test not yet implemented.")
  public void sizeReturnsIntegerMaxValueWhenFilterSizeExceedsIntegerMaxValue() {
    fail("Test not yet implemented.");
  }

  @Test
  public void isCompatible() {
    assertTrue(filter().isCompatible(filter()));
    assertFalse(filter().isCompatible(tinyFilter()));
  }

  @Test
  abstract public void capacity();

  @Test
  abstract public void fpp();

  /**
   * Runs {@link #basicTests} for every combination of fpp in 1e-7 .. 1e-2 and capacity in
   * 100 .. 100000, each stepping by a factor of ten.
   */
  @Test
  public void basicGenerativeTests() throws Exception {
    for (double fpp = 0.0000001; fpp < 0.1; fpp *= 10) {
      for (int capacity = 100; capacity <= 100000; capacity *= 10) {
        basicTests(filter(capacity, fpp), capacity, fpp);
      }
    }
  }

  /**
   * Fills {@code filter} and checks that its reported {@code currentFpp} behaves sensibly on the
   * way and ends up within [0.65 * fpp, 1.3 * fpp].
   *
   * @param filter   filter to exercise; the empty-filter assertions require it to be empty
   * @param capacity number of items to insert; must be positive
   * @param fpp      false positive probability the filter was created with; must be positive
   * @throws IllegalArgumentException if {@code capacity} or {@code fpp} is not positive
   */
  private void basicTests(
      final ProbabilisticFilter filter, int capacity, double fpp) {
    checkArgument(capacity > 0, "capacity (%s) must be > 0", capacity);
    checkArgument(fpp > 0, "fpp (%s) must be > 0.0", fpp);

    assertEquals("currentFpp should be 0 when filter is empty", 0.0D, filter.currentFpp());

    assertFalse("contains should return false when filter is empty", filter.contains("Nope"));

    assertTrue("add should return true when inserting the first item", filter.add("Yep!"));

    int falseInsertions = 0;

    for (int i = 0; i < capacity - 1; i++) { //minus 1 since we've already inserted one above
      double currentFppBefore = filter.currentFpp();

      if (filter.add(Integer.toString(i))) {
        assertTrue("currentFpp should not decrease after put returns true",
            filter.currentFpp() >= currentFppBefore);

        assertTrue("contains should return true when queried with an inserted item",
            filter.contains(Integer.toString(i)));
      } else {
        falseInsertions++;
        assertEquals("currentFpp should not change after put returns false",
            currentFppBefore, filter.currentFpp());
      }
    }

    // Top the filter up with random items to compensate for the rejected insertions above.
    // NOTE(review): add() is evaluated before the counter is decremented, so one random item is
    // added even when falseInsertions is 0, and the loop stops at the first rejected add.
    // Confirm whether that is intended before tightening the bounds below.
    //noinspection StatementWithEmptyBody
    while (filter.add(Integer.toString(random.nextInt())) && (--falseInsertions > 0)) ;

    assertWithMessage(
        "currentFpp should be, approximately, at most the requested fpp after inserting the " +
            "requested number of items")
        .that(filter.currentFpp())
        .isAtMost(fpp * 1.3);

    assertWithMessage(
        "currentFpp should be, approximately, at least the half the requested fpp after " +
            "inserting the requested number of items: " + capacity + ", " + fpp)
        .that(filter.currentFpp())
        .isAtLeast(fpp * 0.65);
  }
}
-------------------------------------------------------------------------------- /src/test/java/com/duprasville/guava/probably/BloomProbabilisticFilterTest.java: -------------------------------------------------------------------------------- 1 | package com.duprasville.guava.probably; 2 | 3 | import com.google.common.hash.Funnels; 4 | 5 | import org.junit.Test; 6 | 7 | import java.util.Arrays; 8 | 9 | import static com.google.common.base.Charsets.UTF_8; 10 | import static junit.framework.Assert.assertEquals; 11 | import static junit.framework.Assert.assertTrue; 12 | 13 | /** 14 | * BloomFilter tests of ProbabilisticFilter interface implementations.
15 | * 16 | * @author Brian Dupras 17 | */
public class BloomProbabilisticFilterTest extends AbstractProbabilisticFilterTest {
  /** Builds the BloomFilter under test, funnelling strings as UTF-8. */
  ProbabilisticFilter filter(int capacity, double fpp) {
    return BloomFilter.create(Funnels.stringFunnel(UTF_8), capacity, fpp);
  }

  /** Adding to a tiny filter that already received twice its capacity must still succeed. */
  @Test
  public void addEShouldReturnTrueWhenFilterIsFull() {
    int index = 0;
    while (index < tinyFilter.capacity() * 2) {
      tinyFilter.add("foo" + index);
      index++;
    }
    boolean accepted = tinyFilter.add("bust");
    assertTrue("Bloom filters cannot reject additions", accepted);
  }

  /** Bulk-adding nine items to the tiny filter must report success. */
  @Test
  public void addAllCollectionOfEContainingTooManyItemsShouldReturnTrue() {
    boolean accepted = tinyFilter.addAll(
        Arrays.asList("foo", "bar", "baz", "boz", "foz", "biz", "fiz", "fuz", "buz"));
    assertTrue(accepted);
  }

  @Test(expected = UnsupportedOperationException.class)
  public void removeEShouldThrowUnsupportedOperationException() {
    filter.remove("nope");
  }

  @Test(expected = UnsupportedOperationException.class)
  public void removeAllCollectionOfEShouldThrowUnsupportedOperationException() {
    filter.removeAll(Arrays.asList("nope", "neither"));
  }

  @Test(expected = UnsupportedOperationException.class)
  public void removeAllProbabilisticFilterOfEShouldThrowUnsupportedOperationException() {
    filter.removeAll(filter);
  }

  @Test(expected = UnsupportedOperationException.class)
  public void containsAllProbabilisticFilterShouldThrowUnsupportedOperationException() {
    filter.containsAll(filter);
  }

  /** currentFpp starts at zero and never decreases across successful insertions. */
  @Test
  public void currentFppShouldGrowOrStayTheSameWithEverySuccessfulInsertion() {
    assertEquals(0.0D, filter.currentFpp());
    double previousFpp = filter.currentFpp();
    for (int item = 0; item < filter.capacity(); item++) {
      if (!filter.add(String.valueOf(item))) {
        continue;
      }
      double fppNow = filter.currentFpp();
      assertTrue(fppNow >= previousFpp);
      previousFpp = fppNow;
    }
  }

  /** The reported capacity equals the capacity requested at creation. */
  @Override
  public void capacity() {
    assertEquals(1000000, filter.capacity());
    ProbabilisticFilter oddSized = filter(1000003, 0.9D);
    assertEquals(1000003, oddSized.capacity());

    assertEquals(1, tinyFilter.capacity());
    ProbabilisticFilter threeItems = filter(3, TINY_FILTER_FPP);
    assertEquals(3, threeItems.capacity());
  }

  /** The reported fpp is compared to the requested one, using the requested fpp as tolerance. */
  @Override
  public void fpp() {
    double defaultTolerance = FILTER_FPP;
    assertEquals(FILTER_FPP, filter.fpp(), defaultTolerance);
    double tinyTolerance = TINY_FILTER_FPP;
    assertEquals(TINY_FILTER_FPP, tinyFilter.fpp(), tinyTolerance);
  }
}
-------------------------------------------------------------------------------- /src/test/java/com/duprasville/guava/probably/CuckooFilterProbabilisticFilterTest.java: -------------------------------------------------------------------------------- 1 | package com.duprasville.guava.probably; 2 | 3 | import com.google.common.hash.Funnels; 4 | 5 | import org.junit.Test; 6 | 7 | import java.util.Arrays; 8 | 9 | import static com.google.common.base.Charsets.UTF_8; 10 | import static junit.framework.Assert.assertEquals; 11 | import static junit.framework.Assert.assertFalse; 12 | import static junit.framework.Assert.assertTrue; 13 | import static junit.framework.Assert.fail; 14 | 15 | /** 16 | * CuckooFilter tests of ProbabilisticFilter interface implementations.
17 | * 18 | * @author Brian Dupras 19 | */ 20 | public class CuckooFilterProbabilisticFilterTest extends AbstractProbabilisticFilterTest { 21 | ProbabilisticFilter filter(int capacity, double fpp) { 22 | return CuckooFilter.create(Funnels.stringFunnel(UTF_8), capacity, fpp); 23 | } 24 | 25 | @Test 26 | public void addEShouldReturnFalseWhenFilterIsFull() { 27 | for (int i = 0; i < tinyFilter.capacity() * 2; i++) { 28 | tinyFilter.add("foo" + i); 29 | } 30 | assertFalse(tinyFilter.add("bust")); 31 | } 32 | 33 | @Test 34 | public void addAllCollectionOfEContainingTooManyItemsShouldReturnFalse() { 35 | assertFalse(tinyFilter.addAll(Arrays.asList("foo", "bar", "baz", "boz", "foz", "biz", "fiz", 36 | "fuz", "buz"))); 37 | } 38 | 39 | @Test 40 | public void containsAllProbabilisticFilterOfEThatIsFullyContainedShouldReturnTrue() { 41 | assert filter.addAll(Arrays.asList("foo", "bar", "baz")); 42 | assert filter2.addAll(Arrays.asList("foo", "bar")); 43 | assertTrue(filter.containsAll(filter2)); 44 | assert filter2.add("baz"); 45 | assertTrue(filter.containsAll(filter2)); 46 | } 47 | 48 | @Test 49 | public void containsAllProbabilisticFilterOfEThatIsNotFullyContainedShouldReturnFalse() { 50 | assert filter.addAll(Arrays.asList("foo", "bar", "baz")); 51 | assert filter2.addAll(Arrays.asList("foo", "bar", "boom")); 52 | assertFalse(filter.containsAll(filter2)); 53 | assert filter2.add("baz"); 54 | assertFalse(filter.containsAll(filter2)); 55 | } 56 | 57 | @Test 58 | public void removeEThatIsContainedShouldReturnTrue() { 59 | assert !filter.contains("foo"); 60 | assert filter.add("foo"); 61 | assert filter.contains("foo"); 62 | assertTrue(filter.remove("foo")); 63 | assertFalse(filter.contains("foo")); 64 | } 65 | 66 | @Test 67 | public void removeEThatIsNotContainedShouldReturnFalse() { 68 | assert !filter.contains("foo"); 69 | assertFalse(filter.remove("foo")); 70 | } 71 | 72 | @Test(expected = NullPointerException.class) 73 | public void 
removeNullShouldThrowNullPointerException() { 74 | filter.remove(null); 75 | fail(); 76 | } 77 | 78 | @Test 79 | public void removeAllCollectionOfEThatIsFullyContainedShouldReturnTrue() { 80 | assert filter.addAll(Arrays.asList("foo", "bar")); 81 | assert filter.containsAll(Arrays.asList("foo", "bar")); 82 | assertTrue(filter.removeAll(Arrays.asList("foo", "bar"))); 83 | } 84 | 85 | @Test 86 | public void removeAllCollectionOfEThatIsNotFullyContainedShouldReturnFalse() { 87 | assert filter.addAll(Arrays.asList("foo", "bar")); 88 | assert filter.containsAll(Arrays.asList("foo", "bar")); 89 | assert 2 == filter.size(); 90 | assertFalse(filter.removeAll(Arrays.asList("foo", "boom"))); 91 | assertEquals(1, filter.size()); 92 | } 93 | 94 | @Test(expected = NullPointerException.class) 95 | public void removeAllCollectionOfEContainingNullShouldThrowNullPointerException() { 96 | assert filter.addAll(Arrays.asList("foo", "bar")); 97 | assert filter.containsAll(Arrays.asList("foo", "bar")); 98 | assert 2 == filter.size(); 99 | filter.removeAll(Arrays.asList("foo", null)); 100 | fail(); 101 | } 102 | 103 | @Test 104 | public void removeAllProbabilisticFilterOfEThatIsFullyContainedShouldReturnTrue() { 105 | assert filter.addAll(Arrays.asList("foo", "bar", "baz", "boz")); 106 | assert filter.containsAll(Arrays.asList("foo", "bar", "baz", "boz")); 107 | assert 4 == filter.size(); 108 | assert filter2.addAll(Arrays.asList("baz", "boz")); 109 | assert filter2.containsAll(Arrays.asList("baz", "boz")); 110 | assert 2 == filter2.size(); 111 | assertTrue(filter.removeAll(filter2)); 112 | assertEquals(2, filter.size()); 113 | } 114 | 115 | @Test 116 | public void removeAllProbabilisticFilterOfEThatIsNotFullyContainedShouldReturnFalse() { 117 | assert filter.addAll(Arrays.asList("foo", "bar", "baz", "boz")); 118 | assert filter.containsAll(Arrays.asList("foo", "bar")); 119 | assert 4 == filter.size(); 120 | assertFalse(filter.removeAll(Arrays.asList("foo", "bar", "boom"))); 121 | 
assertEquals(2, filter.size()); 122 | } 123 | 124 | @Test(expected = IllegalArgumentException.class) 125 | public void removeAllProbabilisticFilterOfEThatIsNotCompatibleShouldThrowIllegalArgumentException() { 126 | filter.removeAll(tinyFilter); 127 | fail(); 128 | } 129 | 130 | @Test 131 | public void isEmpty() { 132 | assertTrue(filter.isEmpty()); 133 | assert filter.add("foo"); 134 | assertFalse(filter.isEmpty()); 135 | assert filter.remove("foo"); 136 | assertTrue(filter.isEmpty()); 137 | assert filter.addAll(Arrays.asList("foo", "bar", "baz")); 138 | assertFalse(filter.isEmpty()); 139 | assert filter.removeAll(Arrays.asList("foo", "bar", "baz")); 140 | assertTrue(filter.isEmpty()); 141 | assert filter2.addAll(Arrays.asList("foo", "bar", "baz")); 142 | assert filter.addAll(filter2); 143 | assertFalse(filter.isEmpty()); 144 | assert filter.removeAll(filter2); 145 | assertTrue(filter.isEmpty()); 146 | } 147 | 148 | @Test 149 | public void sizeLong() { 150 | assertEquals(0L, filter.sizeLong()); 151 | assert filter.add("foo"); 152 | assertEquals(1L, filter.sizeLong()); 153 | assert filter.remove("foo"); 154 | assertEquals(0L, filter.sizeLong()); 155 | assert filter.addAll(Arrays.asList("foo", "bar", "baz")); 156 | assertEquals(3L, filter.sizeLong()); 157 | assert filter.removeAll(Arrays.asList("foo", "bar", "baz")); 158 | assertEquals(0L, filter.sizeLong()); 159 | assert filter2.addAll(Arrays.asList("foo", "bar", "baz")); 160 | assert filter.addAll(filter2); 161 | assertEquals(3L, filter.sizeLong()); 162 | assert filter.removeAll(filter2); 163 | assertEquals(0L, filter.sizeLong()); 164 | } 165 | 166 | @Test 167 | public void size() { 168 | assertEquals(0, filter.size()); 169 | assert filter.add("foo"); 170 | assertEquals(1, filter.size()); 171 | assert filter.remove("foo"); 172 | assertEquals(0, filter.size()); 173 | assert filter.addAll(Arrays.asList("foo", "bar", "baz")); 174 | assertEquals(3, filter.size()); 175 | assert filter.removeAll(Arrays.asList("foo", 
"bar", "baz")); 176 | assertEquals(0, filter.size()); 177 | assert filter2.addAll(Arrays.asList("foo", "bar", "baz")); 178 | assert filter.addAll(filter2); 179 | assertEquals(3, filter.size()); 180 | assert filter.removeAll(filter2); 181 | assertEquals(0, filter.size()); 182 | } 183 | 184 | @Test 185 | public void currentFppShouldGrowWithEverySuccessfulInsertion() { 186 | assertEquals(0.0D, filter.currentFpp()); 187 | double lastFpp = filter.currentFpp(); 188 | for (int i = 0; i < filter.capacity(); i++) { 189 | if (filter.add(String.valueOf(i))) { 190 | double currentFpp = filter.currentFpp(); 191 | assertTrue(lastFpp < currentFpp); 192 | lastFpp = currentFpp; 193 | } 194 | } 195 | } 196 | 197 | @Test 198 | public void capacity() { 199 | assertEquals(1000007, filter.capacity()); 200 | assertEquals(1000003, filter(1000003, 0.9D).capacity()); 201 | 202 | assertEquals(7, tinyFilter.capacity()); 203 | assertEquals(3, filter(3, 0.9D).capacity()); 204 | } 205 | 206 | @Test 207 | public void fpp() { 208 | assertEquals(FILTER_FPP, filter.fpp(), FILTER_FPP * 0.1); 209 | assertEquals(TINY_FILTER_FPP, tinyFilter.fpp(), TINY_FILTER_FPP * 0.1); 210 | } 211 | 212 | } -------------------------------------------------------------------------------- /src/test/java/com/duprasville/guava/probably/CuckooFilterTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Brian Dupras 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5 | * in compliance with the License. You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software distributed under the License 10 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 | * or implied. 
See the License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | 15 | package com.duprasville.guava.probably; 16 | 17 | import com.google.common.collect.ImmutableSet; 18 | import com.google.common.hash.Funnel; 19 | import com.google.common.hash.Funnels; 20 | import com.google.common.hash.PrimitiveSink; 21 | import com.google.common.math.LongMath; 22 | import com.google.common.primitives.Ints; 23 | import com.google.common.testing.EqualsTester; 24 | import com.google.common.testing.NullPointerTester; 25 | import com.google.common.testing.SerializableTester; 26 | 27 | import org.junit.Test; 28 | 29 | import java.io.ByteArrayInputStream; 30 | import java.io.ByteArrayOutputStream; 31 | import java.math.RoundingMode; 32 | import java.util.Random; 33 | 34 | import javax.annotation.Nullable; 35 | 36 | import static com.google.common.base.Charsets.UTF_8; 37 | import static com.google.common.truth.Truth.assertThat; 38 | import static junit.framework.Assert.assertEquals; 39 | import static junit.framework.Assert.assertFalse; 40 | import static junit.framework.Assert.assertNotSame; 41 | import static junit.framework.Assert.assertTrue; 42 | import static junit.framework.Assert.fail; 43 |
/**
 * CuckooFilter tests of Object method overrides, static methods, and serialization. Modified from
 * Guava tests for BloomFilter.
 *
 * <p>Filters and funnels are declared with explicit type arguments; the funnel implementations
 * in this class declare typed {@code funnel} methods and therefore must implement a
 * parameterized {@code Funnel}.
 *
 * @author Brian Dupras
 * @author The Guava Authors (original BloomFilter tests)
 */
public class CuckooFilterTest {
  /**
   * Inserts one million even numbers (as unencoded chars) and pins the exact set of odd-number
   * false positives produced by the MURMUR128_BEALDUPRAS_32 strategy.
   */
  @Test
  public void createAndCheckBealDupras32CuckooFilterWithKnownFalsePositives() {
    int numInsertions = 1000000;
    CuckooFilter<CharSequence> cf = CuckooFilter.create(
        Funnels.unencodedCharsFunnel(), numInsertions, 0.03,
        CuckooStrategies.MURMUR128_BEALDUPRAS_32.strategy());

    // Insert "numInsertions" even numbers into the CF.
    for (int i = 0; i < numInsertions * 2; i += 2) {
      cf.add(Integer.toString(i));
    }

    // Assert that the CF "might" have all of the even numbers.
    for (int i = 0; i < numInsertions * 2; i += 2) {
      assertTrue(cf.contains(Integer.toString(i)));
    }

    // Now we check for known false positives using a set of known false positives.
    // (These are all of the false positives under 900.)
    ImmutableSet<Integer> falsePositives = ImmutableSet.of(217, 329, 581, 707, 757, 805, 863);
    for (int i = 1; i < 900; i += 2) {
      if (!falsePositives.contains(i)) {
        assertFalse("CF should not contain " + i, cf.contains(Integer.toString(i)));
      }
    }

    // Check that there are exactly 25926 false positives for this CF.
    int expectedNumFpp = 25926;
    int actualNumFpp = 0;
    for (int i = 1; i < numInsertions * 2; i += 2) {
      if (cf.contains(Integer.toString(i))) {
        actualNumFpp++;
      }
    }
    assertEquals(expectedNumFpp, actualNumFpp);
    // The observed false positive rate must match the filter's own estimate within tolerance.
    assertEquals((double) expectedNumFpp / numInsertions, cf.currentFpp(), 0.00035);
  }

  /**
   * Same as the test above, but funnelling the numbers as UTF-8 strings, which yields a
   * different pinned set of false positives.
   */
  @Test
  public void createAndCheckBealDupras32CuckooFilterWithKnownUtf8FalsePositives() {
    int numInsertions = 1000000;
    CuckooFilter<CharSequence> cf = CuckooFilter.create(
        Funnels.stringFunnel(UTF_8), numInsertions, 0.03,
        CuckooStrategies.MURMUR128_BEALDUPRAS_32.strategy());

    // Insert "numInsertions" even numbers into the CF.
    for (int i = 0; i < numInsertions * 2; i += 2) {
      cf.add(Integer.toString(i));
    }

    // Assert that the CF "might" have all of the even numbers.
    for (int i = 0; i < numInsertions * 2; i += 2) {
      assertTrue(cf.contains(Integer.toString(i)));
    }

    // Now we check for known false positives using a set of known false positives.
    // (These are all of the false positives under 900.)
    ImmutableSet<Integer> falsePositives =
        ImmutableSet.of(5, 315, 389, 443, 445, 615, 621, 703, 789, 861, 899);
    for (int i = 1; i < 900; i += 2) {
      if (!falsePositives.contains(i)) {
        assertFalse("CF should not contain " + i, cf.contains(Integer.toString(i)));
      }
    }

    // Check that there are exactly 26610 false positives for this CF.
    int expectedNumFpp = 26610;
    int actualNumFpp = 0;
    for (int i = 1; i < numInsertions * 2; i += 2) {
      if (cf.contains(Integer.toString(i))) {
        actualNumFpp++;
      }
    }
    assertEquals(expectedNumFpp, actualNumFpp);
    // The observed false positive rate must match the filter's own estimate within tolerance.
    assertEquals((double) expectedNumFpp / numInsertions, cf.currentFpp(), 0.0004);
  }

  /**
   * Sanity checking with many combinations of false positive rates and expected insertions
   */
  @Test
  public void basic() {
    for (double fpr = 0.0000001; fpr < 0.1; fpr *= 10) {
      for (int capacity = 1; capacity <= 10000; capacity *= 10) {
        final CuckooFilter<Object> cf = CuckooFilter.create(BAD_FUNNEL,
            capacity, fpr);

        assertFalse(cf.contains(new Object()));
        for (int insertions = 0; insertions < capacity; insertions++) {
          Object o = new Object();
          // Only accepted insertions are required to be found afterwards.
          if (cf.add(o)) {
            assertTrue("mightContain should return true when queried with an object previously " +
                "added to the filter", cf.contains(o));
          }
        }
      }
    }
  }

  /** Negative capacities and the fpp values 0.0 and 1.0 must be rejected at creation. */
  @SuppressWarnings("CheckReturnValue")
  @Test
  public void preconditions() {
    try {
      CuckooFilter.create(Funnels.unencodedCharsFunnel(), -1);
      fail();
    } catch (IllegalArgumentException expected) {
    }
    try {
      CuckooFilter.create(Funnels.unencodedCharsFunnel(), -1, 0.03);
      fail();
    } catch (IllegalArgumentException expected) {
    }
    try {
      CuckooFilter.create(Funnels.unencodedCharsFunnel(), 1, 0.0);
      fail();
    } catch (IllegalArgumentException expected) {
    }
    try {
      CuckooFilter.create(Funnels.unencodedCharsFunnel(), 1, 1.0);
      fail();
    } catch (IllegalArgumentException expected) {
    }
  }

  /** An extremely small fpp must be rejected at creation. */
  @Test
  public void failureWhenMoreThan64BitFingerprintsAreNeeded() {
    try {
      int n = 1000;
      double p = 0.00000000000000000000000000000000000000000000000000000000000000000000000000000001;
      CuckooFilter.create(Funnels.unencodedCharsFunnel(), n, p);
      fail();
    } catch (IllegalArgumentException expected) {
    }
  }

  @Test
  public void nullPointers() {
    NullPointerTester tester = new NullPointerTester();
    tester.testAllPublicInstanceMethods(CuckooFilter.create(Funnels.unencodedCharsFunnel(), 100));
    tester.testAllPublicStaticMethods(CuckooFilter.class);
  }

  /**
   * Tests that we always get a non-negative optimal size.
   */
  @SuppressWarnings("CheckReturnValue")
  @Test
  public void optimalSize() {
    for (int n = 1; n < 1000; n++) {
      for (double fpp = CuckooFilter.MIN_FPP; fpp <= CuckooFilter.MAX_FPP; fpp += 0.001) {
        assertTrue(CuckooFilter.optimalEntriesPerBucket(fpp) >= 2);
        assertTrue(
            CuckooFilter.optimalNumberOfBuckets(n, CuckooFilter.optimalEntriesPerBucket(fpp)) >= 2);
        assertTrue(
            CuckooFilter.optimalBitsPerEntry(fpp, CuckooFilter.optimalEntriesPerBucket(fpp)) >= 2);
      }
    }

    // some random values
    Random random = new Random(0);
    for (int repeats = 0; repeats < 10000; repeats++) {
      final int n = random.nextInt(1 << 16);
      final double fpp = random.nextDouble();

      assertTrue(CuckooFilter.optimalEntriesPerBucket(fpp) >= 2);
      assertTrue(
          CuckooFilter.optimalNumberOfBuckets(n, CuckooFilter.optimalEntriesPerBucket(fpp)) >= 2);
      assertTrue(
          CuckooFilter.optimalBitsPerEntry(fpp, CuckooFilter.optimalEntriesPerBucket(fpp)) >= 2);
    }

    // Pinned values at the extremes of the supported fpp range.
    assertEquals(8, CuckooFilter.optimalEntriesPerBucket(CuckooFilter.MIN_FPP));
    assertEquals(2, CuckooFilter.optimalEntriesPerBucket(CuckooFilter.MAX_FPP));
    assertEquals(273913732, CuckooFilter.optimalNumberOfBuckets(Integer.MAX_VALUE,
        CuckooFilter.optimalEntriesPerBucket(CuckooFilter.MIN_FPP)));
    assertEquals(Long.SIZE, CuckooFilter.optimalBitsPerEntry(CuckooFilter.MIN_FPP,
        CuckooFilter.optimalEntriesPerBucket(CuckooFilter.MIN_FPP)));

    try {
      CuckooFilter.create(BAD_FUNNEL, Integer.MAX_VALUE, Double.MIN_VALUE);
      fail("we can't represent a CF with such an FPP lower than " + CuckooFilter.MIN_FPP + "!");
    } catch (IllegalArgumentException expected) {
      assertThat(expected).hasMessage("Cannot create CuckooFilter with FPP[4.9E-324] < " +
          "CuckooFilter.MIN_FPP[8.673617379884035E-19]");
    }
  }

  /** Data length calculation must not throw for capacities beyond Integer.MAX_VALUE. */
  @Test
  public void largeNumberOfInsertions() {
    // We don't actually allocate a CuckooFilter here to keep Java from OOM'ing
    CuckooFilter.calculateDataLength(3L * Integer.MAX_VALUE, 0.0001D);
    CuckooFilter.calculateDataLength(6L * Integer.MAX_VALUE, 0.03D);
    CuckooFilter.calculateDataLength(26L * Integer.MAX_VALUE, CuckooFilter.MAX_FPP);
  }

  /** A copy must be a distinct instance that is equal to the original. */
  @Test
  public void copy() {
    CuckooFilter<CharSequence> original = CuckooFilter.create(Funnels.unencodedCharsFunnel(), 100);
    CuckooFilter<CharSequence> copy = original.copy();
    assertNotSame(original, copy);
    assertEquals(original, copy);
  }

  /** bitSize must equal the calculated data length expressed in bits. */
  @Test
  public void bitSize() {
    double fpp = 0.03;
    for (int i = 1; i < 10000; i++) {
      long numBits = CuckooFilter.calculateDataLength(i, fpp) * Long.SIZE;
      int arraySize = Ints.checkedCast(LongMath.divide(numBits, Long.SIZE, RoundingMode.CEILING));
      assertEquals(
          arraySize * Long.SIZE,
          CuckooFilter.create(Funnels.unencodedCharsFunnel(), i, fpp).bitSize());
    }
  }

  /** Empty filters differing in funnel, capacity or fpp must all be unequal to each other. */
  @Test
  public void equals_empty() {
    new EqualsTester()
        .addEqualityGroup(CuckooFilter.create(Funnels.byteArrayFunnel(), 100, 0.01))
        .addEqualityGroup(CuckooFilter.create(Funnels.byteArrayFunnel(), 100, 0.02))
        .addEqualityGroup(CuckooFilter.create(Funnels.byteArrayFunnel(), 200, 0.01))
        .addEqualityGroup(CuckooFilter.create(Funnels.byteArrayFunnel(), 200, 0.02))
        .addEqualityGroup(CuckooFilter.create(Funnels.unencodedCharsFunnel(), 100, 0.01))
        .addEqualityGroup(CuckooFilter.create(Funnels.unencodedCharsFunnel(), 100, 0.02))
        .addEqualityGroup(CuckooFilter.create(Funnels.unencodedCharsFunnel(), 200, 0.01))
        .addEqualityGroup(CuckooFilter.create(Funnels.unencodedCharsFunnel(), 200, 0.02))
        .testEquals();
  }

  /** Equality must follow the contents, including after an add that is later removed. */
  @Test
  public void equals() {
    CuckooFilter<CharSequence> cf1 = CuckooFilter.create(Funnels.unencodedCharsFunnel(), 100);
    cf1.add("1");
    cf1.add("2");

    CuckooFilter<CharSequence> cf2 = CuckooFilter.create(Funnels.unencodedCharsFunnel(), 100);
    cf2.add("1");
    cf2.add("2");

    new EqualsTester()
        .addEqualityGroup(cf1, cf2)
        .testEquals();

    cf2.add("3");

    new EqualsTester()
        .addEqualityGroup(cf1)
        .addEqualityGroup(cf2)
        .testEquals();

    cf2.remove("3");

    new EqualsTester()
        .addEqualityGroup(cf1, cf2)
        .testEquals();

  }

  /** Filters holding the same items must be equal regardless of insertion order. */
  @Test
  public void equals2() {
    // numInsertions param undersized purposely to force underlying storage saturation
    CuckooFilter<CharSequence> cf1 = CuckooFilter.create(Funnels.unencodedCharsFunnel(), 2);
    cf1.add("1");
    cf1.add("2");
    cf1.add("3");
    cf1.add("4");

    CuckooFilter<CharSequence> cf2 = CuckooFilter.create(Funnels.unencodedCharsFunnel(), 2);
    cf2.add("4");
    cf2.add("3");
    cf2.add("2");
    cf2.add("1");

    assertTrue("equals should be true when tables are equivalent but ordered differently",
        cf1.equals(cf2));

    new EqualsTester()
        .addEqualityGroup(cf1, cf2)
        .testEquals();
  }

  @Test
  public void equalsWithCustomFunnel() {
    CuckooFilter<Long> cf1 = CuckooFilter.create(new CustomFunnel(), 100);
    CuckooFilter<Long> cf2 = CuckooFilter.create(new CustomFunnel(), 100);
    assertEquals(cf1, cf2);
  }

  @Test
  public void serializationWithCustomFunnel() {
    SerializableTester.reserializeAndAssert(CuckooFilter.create(new CustomFunnel(), 100));
  }

  /**
   * Funnel of {@code Long} values in which every instance is equal to every other instance, so
   * that filters created from separate instances can compare equal.
   */
  private static final class CustomFunnel implements Funnel<Long> {
    public void funnel(Long value, PrimitiveSink into) {
      into.putLong(value);
    }

    @Override
    public boolean equals(@Nullable Object object) {
      return (object instanceof CustomFunnel);
    }

    @Override
    public int hashCode() {
      return 42;
    }
  }

  /**
   * add must succeed exactly when the value was not reported as contained beforehand, and remove
   * must succeed exactly when add did.
   */
  @Test
  public void addReturnValue() {
    for (int i = 0; i < 10; i++) {
      CuckooFilter<CharSequence> cf = CuckooFilter.create(Funnels.unencodedCharsFunnel(), 100);
      for (int j = 0; j < 10; j++) {
        String value = new Object().toString();
        boolean mightContain = cf.contains(value);
        boolean put = cf.add(value);
        assertTrue(mightContain != put);
        boolean delete = cf.remove(value);
        assertTrue(put == delete);
      }
    }
  }

  /** addAll(filter) must merge the argument's items into the receiver only. */
  @Test
  public void addAll() {
    int element1 = 1;
    int element2 = 2;

    CuckooFilter<Integer> cf1 = CuckooFilter.create(Funnels.integerFunnel(), 100);
    cf1.add(element1);
    assertTrue(cf1.contains(element1));
    assertFalse(cf1.contains(element2));

    CuckooFilter<Integer> cf2 = CuckooFilter.create(Funnels.integerFunnel(), 100);
    cf2.add(element2);
    assertFalse(cf2.contains(element1));
    assertTrue(cf2.contains(element2));

    assertTrue(cf1.isCompatible(cf2));
    cf1.addAll(cf2);
    assertTrue(cf1.contains(element1));
    assertTrue(cf1.contains(element2));
    assertFalse(cf2.contains(element1));
    assertTrue(cf2.contains(element2));
  }

  /** addAll(filter) must return false when the receiver cannot take the argument's item. */
  @Test
  public void addAllFails() {
    int element = 1;

    CuckooFilter<Integer> cf1 = CuckooFilter.create(Funnels.integerFunnel(), 100);
    // purposely fill buckets that contain entries for element
    while (cf1.add(element)) {
      assertTrue(cf1.contains(element));
    }

    CuckooFilter<Integer> cf2 = CuckooFilter.create(Funnels.integerFunnel(), 100);
    cf2.add(element);
    assertTrue(cf2.contains(element));

    assertTrue(cf1.isCompatible(cf2));

    assertFalse("putAll should return false when buckets at index & altIndex are already full",
        cf1.addAll(cf2));
  }

  /** Filters of different capacity are incompatible in both directions and addAll must throw. */
  @Test
  public void addAllDifferentSizes() {
    CuckooFilter<Integer> cf1 = CuckooFilter.create(Funnels.integerFunnel(), 1);
    CuckooFilter<Integer> cf2 = CuckooFilter.create(Funnels.integerFunnel(), 10);

    try {
      assertFalse(cf1.isCompatible(cf2));
      cf1.addAll(cf2);
      fail();
    } catch (IllegalArgumentException expected) {
    }

    try {
      assertFalse(cf2.isCompatible(cf1));
      cf2.addAll(cf1);
      fail();
    } catch (IllegalArgumentException expected) {
    }
  }

  /** A filter must report itself as incompatible, and addAll with itself must throw. */
  @Test
  public void addAllWithSelf() {
    CuckooFilter<Integer> cf1 = CuckooFilter.create(Funnels.integerFunnel(), 1);
    try {
      assertFalse(cf1.isCompatible(cf1));
      cf1.addAll(cf1);
      fail();
    } catch (IllegalArgumentException expected) {
    }
  }

  /** Java serialization must round-trip contents, currentFpp and equality. */
  @Test
  public void javaSerialization() {
    CuckooFilter<byte[]> cf = CuckooFilter.create(Funnels.byteArrayFunnel(), 100);
    for (int i = 0; i < 10; i++) {
      cf.add(Ints.toByteArray(i));
    }

    CuckooFilter<byte[]> copy = SerializableTester.reserialize(cf);
    for (int i = 0; i < 10; i++) {
      assertTrue(copy.contains(Ints.toByteArray(i)));
    }
    assertEquals(cf.currentFpp(), copy.currentFpp());

    SerializableTester.reserializeAndAssert(cf);
  }

  /** writeTo followed by readFrom with the same funnel must yield an equal filter. */
  @Test
  public void customSerialization() throws Exception {
    Funnel<byte[]> funnel = Funnels.byteArrayFunnel();
    CuckooFilter<byte[]> cf = CuckooFilter.create(funnel, 100);
    for (int i = 0; i < 100; i++) {
      cf.add(Ints.toByteArray(i));
    }

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    cf.writeTo(out);

    assertEquals(cf, CuckooFilter.readFrom(new ByteArrayInputStream(out.toByteArray()), funnel));
  }

  /** Funnel that feeds only the object's hashCode into the sink. */
  static final Funnel<Object> BAD_FUNNEL = new Funnel<Object>() {
    public void funnel(Object object, PrimitiveSink bytePrimitiveSink) {
      bytePrimitiveSink.putInt(object.hashCode());
    }
  };

  /** A filter built from a funnel of a supertype must accept instances of a subtype. */
  @Test
  public void ensureGeneric() {
    class SuperClass {
    }
    class SubClass extends SuperClass {
    }

    CuckooFilter<SuperClass> filter = CuckooFilter.create(
        new Funnel<SuperClass>() {
          public void funnel(SuperClass from, PrimitiveSink into) {
            into.putInt(from.hashCode());
          }
        }, 1000, 0.03D);

    assertTrue(filter.add(new SuperClass()));
    assertTrue(filter.add(new SubClass()));
  }
}
-------------------------------------------------------------------------------- /src/test/java/com/duprasville/guava/probably/CuckooStrategiesTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2015 Brian Dupras 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except 5 | * in compliance with the License.
You may obtain a copy of the License at 6 | * 7 | * http://www.apache.org/licenses/LICENSE-2.0 8 | * 9 | * Unless required by applicable law or agreed to in writing, software distributed under the License 10 | * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express 11 | * or implied. See the License for the specific language governing permissions and limitations under 12 | * the License. 13 | */ 14 | 15 | 16 | package com.duprasville.guava.probably; 17 | 18 | import org.junit.Test; 19 | 20 | import java.util.Random; 21 | 22 | import static com.duprasville.guava.probably.CuckooStrategies.MURMUR128_BEALDUPRAS_32; 23 | import static com.duprasville.guava.probably.CuckooStrategies.values; 24 | import static com.duprasville.guava.probably.CuckooTable.readBits; 25 | import static com.duprasville.guava.probably.CuckooTable.writeBits; 26 | import static com.google.common.truth.Truth.assertThat; 27 | import static junit.framework.Assert.assertEquals; 28 | 29 | /** 30 | * CuckooFilterStrategies tests. These are tests of internal, though somewhat complex, 31 | * implementation details of cuckoo filters. 
32 | * 33 | * @author Brian Dupras 34 | */ 35 | public class CuckooStrategiesTest { 36 | @Test 37 | public void fingerprintBoundaries() throws Exception { 38 | assertThat(CuckooStrategyMurmurBealDupras32.fingerprint(0x80000000, 1)).isEqualTo(0x01); 39 | assertThat(CuckooStrategyMurmurBealDupras32.fingerprint(0xC0000000, 2)).isEqualTo(0x03); 40 | assertThat(CuckooStrategyMurmurBealDupras32.fingerprint(0xE0000000, 3)).isEqualTo(0x04); 41 | assertThat(CuckooStrategyMurmurBealDupras32.fingerprint(0xE0000000, 8)).isEqualTo(0xE0); 42 | assertThat(CuckooStrategyMurmurBealDupras32.fingerprint(0xE0000000, 16)).isEqualTo(0xE000); 43 | assertThat(CuckooStrategyMurmurBealDupras32.fingerprint(0x80000000, Integer.SIZE)).isEqualTo(0x80000000); 44 | for (int f = 1; f < Integer.SIZE; f++) { 45 | assertThat(CuckooStrategyMurmurBealDupras32.fingerprint(0x00, f)).isNotEqualTo(0x00); 46 | } 47 | } 48 | 49 | @Test 50 | public void indexIsModuloM() throws Exception { 51 | final int min = Integer.MIN_VALUE; 52 | final int max = Integer.MAX_VALUE; 53 | final int incr = 100000; 54 | final long m = 0x1DEAL; 55 | 56 | for (int hash = min; hash != next(hash, incr, max); hash = next(hash, incr, max)) { 57 | final long index = new CuckooStrategyMurmurBealDupras32(-1).index(hash, m); 58 | assertThat(index).isLessThan(m); 59 | assertThat(index).isGreaterThan(-1L); 60 | } 61 | } 62 | 63 | @Test 64 | public void altIndexIsReversible() throws Exception { 65 | final long max = Long.MAX_VALUE - 1L; // must be even! 66 | final long incr = 1000000L; 67 | final Random random = new Random(1L); 68 | final byte[] fingerprint = new byte[1]; 69 | 70 | for (long index = 0; index != next(index, incr, max); index = next(index, incr, max)) { 71 | random.nextBytes(fingerprint); 72 | int f = (random.nextInt(126) + 1) * (random.nextBoolean() ? 
1 : -1); 73 | final long altIndex = new CuckooStrategyMurmurBealDupras32(-1).altIndex(index, f, max); 74 | final long altAltIndex = new CuckooStrategyMurmurBealDupras32(-1).altIndex(altIndex, f, max); 75 | assertEquals("index should equal altIndex(altIndex(index)):" + f, index, altAltIndex); 76 | } 77 | } 78 | 79 | /** 80 | * This test will fail whenever someone updates/reorders the CuckooStrategies constants. Only 81 | * appending a new constant is allowed. 82 | */ 83 | @Test 84 | public void cuckooFilterStrategies() { 85 | assertThat(values()).hasLength(1); 86 | assertEquals(MURMUR128_BEALDUPRAS_32, values()[0]); 87 | } 88 | 89 | @Test 90 | public void writeBits_() throws Exception { 91 | long[] data; 92 | 93 | data = new long[]{0xfafafafafafafafal, 0xfafafafafafafafal}; 94 | assertEquals(0xfafa, writeBits(0xABCD, data, 0, 16)); 95 | assertEquals(0xfafafafafafaABCDl, data[0]); 96 | assertEquals(0xfafafafafafafafal, data[1]); 97 | 98 | data = new long[]{0xfafafafafafafafal, 0xfafafafafafafafal}; 99 | assertEquals(0xfafa, writeBits(0xABCD, data, 32, 16)); 100 | assertEquals(0xfafaABCDfafafafal, data[0]); 101 | assertEquals(0xfafafafafafafafal, data[1]); 102 | 103 | data = new long[]{0xfafafafafafafafal, 0xfafafafafafafafal}; 104 | assertEquals(0xfafa, writeBits(0xABCD, data, 48, 16)); 105 | assertEquals(0xABCDfafafafafafal, data[0]); 106 | assertEquals(0xfafafafafafafafal, data[1]); 107 | 108 | data = new long[]{0xfafafafafafafafal, 0xfafafafafafafafal}; 109 | assertEquals(0x7D7D, writeBits(0xABCD, data, 49, 16)); 110 | assertEquals(0x579Afafafafafafal, data[0]); 111 | assertEquals(0xfafafafafafafafBl, data[1]); 112 | 113 | data = new long[]{0xfafafafafafafafal, 0xfafafafafafafafal}; 114 | assertEquals(0xfafa, writeBits(0xABCD, data, 56, 16)); 115 | assertEquals(0xCDfafafafafafafal, data[0]); 116 | assertEquals(0xfafafafafafafaABl, data[1]); 117 | 118 | data = new long[]{0xfafafafafafafafal, 0xfafafafafafafafal}; 119 | assertEquals(0xfafa, writeBits(0xABCD, 
data, 64, 16)); 120 | assertEquals(0xfafafafafafafafal, data[0]); 121 | assertEquals(0xfafafafafafaABCDl, data[1]); 122 | 123 | data = new long[]{0xfafafafafafafafal, 0xfafafafafafafafal}; 124 | assertEquals(0xfafa, writeBits(0xABCD, data, 112, 16)); 125 | assertEquals(0xfafafafafafafafal, data[0]); 126 | assertEquals(0xABCDfafafafafafal, data[1]); 127 | } 128 | 129 | @Test 130 | public void readBits_() throws Exception { 131 | assertEquals(0xABCD, readBits(new long[]{0x000000000000ABCDL, 0x00000000000000FFL}, 0, 16)); 132 | assertEquals(0xABCD, readBits(new long[]{0x0000ABCD00000000L, 0x00000000000000FFL}, 32, 16)); 133 | assertEquals(0xABCD, readBits(new long[]{0xABCD000000000000L, 0x00000000000000FFL}, 48, 16)); 134 | assertEquals(0xABCD, readBits(new long[]{0xABCD000000000000L << 1, 0x00000000000FFL}, 49, 16)); 135 | assertEquals(0xABCD, readBits(new long[]{0xCD00000000000000L, 0x0000000000000FABL}, 56, 16)); 136 | assertEquals(0xABCD, readBits(new long[]{0xFF00000000000000L, 0x000000000000ABCDL}, 64, 16)); 137 | 138 | assertEquals(0x01CD, readBits(new long[]{0x000000000000ABCDL, 0x00000000000000FFL}, 0, 9)); 139 | assertEquals(0x01CD, readBits(new long[]{0x0000ABCD00000000L, 0x00000000000000FFL}, 32, 9)); 140 | assertEquals(0x01CD, readBits(new long[]{0xABCD000000000000L, 0x00000000000000FFL}, 48, 9)); 141 | assertEquals(0x01CD, readBits(new long[]{0xABCD000000000000L << 1, 0x00000000000FFL}, 49, 9)); 142 | assertEquals(0x01CD, readBits(new long[]{0xCD00000000000000L, 0x0000000000000FABL}, 56, 9)); 143 | assertEquals(0x01CD, readBits(new long[]{0xFF00000000000000L, 0x000000000000ABCDL}, 64, 9)); 144 | } 145 | 146 | 147 | // Test utilities 148 | 149 | private int next(int start, int incr, int max) { 150 | int ret = start + max / incr; 151 | return ((ret < start) || (ret > max)) ? max : ret; 152 | } 153 | 154 | private long next(long start, long incr, long max) { 155 | long ret = start + max / incr; 156 | return ((ret < start) || (ret > max)) ? 
max : ret; 157 | } 158 | } 159 | -------------------------------------------------------------------------------- /updaterelease.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #*************************************************************************** 4 | # 5 | # Main script for updating release API docs. Can be used to 6 | # either update a specific release version or the current snapshot release. 7 | # 8 | # Usage examples: 9 | # 10 | # ./updaterelease.sh snapshot 11 | # ./updaterelease.sh 1.0 12 | # ./updaterelease.sh 1.0-rc1 13 | # 14 | # All of these update the Javadoc located at _releases/<release>/api/docs, 15 | # creating those 16 | # directories if this is a new release version. If <release> is 'snapshot', 17 | # Javadoc is derived from the 'master' branch. Otherwise, it is 18 | # derived from the git tag 'v<release>'. In both cases, the actual version 19 | # number is determined by checking out the git branch or tag and getting 20 | # the version number from the pom.xml file via Maven (for non-snapshot 21 | # releases, though, it should always be the same as the <release> 22 | # argument). 23 | # 24 | #*************************************************************************** 25 | 26 | set -e -u 27 | projectname=guava-probably 28 | 29 | # Ensure working dir is the root of the git repo and load util functions. 30 | cd $(dirname $0) 31 | source util/util.sh 32 | 33 | ensure_no_uncommitted_changes 34 | 35 | # Ensure valid args from user and get the basic variables we need. 36 | if [[ ! $# -eq 1 ]]; then 37 | echo "Usage: $0 <release>" >&2 38 | exit 1 39 | fi 40 | release=$1 41 | releaseref=$(git_ref $release) 42 | initialref=$(current_git_ref) 43 | 44 | # Create temp directories and files. 45 | tempdir=$(mktemp -d -t ${projectname}-$release-temp.XXX) 46 | logfile=$(mktemp -t ${projectname}-$release-temp-log.XXX) 47 | 48 | # Ensure temp files are cleaned up and we're back on the original branch on exit. 
49 | function cleanup { 50 | exitcode=$? 51 | if [[ "$exitcode" == "0" ]]; then 52 | rm $logfile 53 | else 54 | # Put a newline in case we're in the middle of a "Do something... Done." line 55 | echo "" 56 | echo "Update failed: see log at '$logfile' for more details." >&2 57 | # If we failed while not on the original branch/ref, switch back to it. 58 | currentref=$(current_git_ref) 59 | if [[ "$currentref" != "$initialref" ]]; then 60 | git checkout -q $initialref 61 | fi 62 | fi 63 | rm -fr $tempdir 64 | exit $exitcode 65 | } 66 | trap cleanup INT TERM EXIT 67 | 68 | # Switch to the git ref for the release to do things with the actual repo. 69 | git_checkout_ref $releaseref 70 | 71 | # Get the current project version from Maven. 72 | projectversion=$(project_version) 73 | 74 | echo "Updating Javadoc for ${projectname} ${projectversion}" 75 | 76 | # Copy source files to a temp dir. 77 | cp -r src $tempdir/src 78 | 79 | # Compile and generate Javadoc, putting class files in $tempdir/classes and docs in $tempdir/docs. 80 | 81 | echo -n "Compiling and generating Javadoc..." 82 | mvn \ 83 | clean \ 84 | compile \ 85 | javadoc:javadoc \ 86 | dependency:build-classpath \ 87 | -Dmdep.outputFile=$tempdir/classpath \ 88 | >> $logfile 2>&1 89 | echo " Done." 90 | 91 | mv target/classes $tempdir/classes 92 | mv target/site/apidocs $tempdir/docs 93 | 94 | # Cleanup target dir. 95 | rm -fr target 96 | 97 | # Switch back to gh-pages. 98 | git_checkout_ref $initialref 99 | 100 | 101 | # Move generated output to the appropriate final directories. 102 | docsdir=_releases/$release/api/docs 103 | mkdir -p $docsdir && rm -fr $docsdir 104 | 105 | echo -n "Moving generated Javadoc to $docsdir..." 106 | mv $tempdir/docs $docsdir 107 | echo " Done." 108 | 109 | # Commit 110 | echo -n "Committing changes..." 111 | git add . 112 | git commit -q -m "Generate Javadoc for ${projectname} ${projectversion}" 113 | echo " Done." 
114 | 115 | # Update version info in _config.yml ($extended, set in util/util.sh, turns on extended regexes so '+' is a quantifier) 116 | if [[ $release == "snapshot" ]]; then 117 | fieldtoupdate="latest_snapshot" 118 | version="${projectversion}" 119 | else 120 | fieldtoupdate="latest_release" 121 | # The release being updated currently may not be the latest release. 122 | version=$(latest_release) 123 | fi 124 | sed -i'.bak' $extended -e "s/^$fieldtoupdate:[ ]+.+/$fieldtoupdate: $version/g" _config.yml 125 | if [ -e _config.yml.bak ]; then 126 | rm _config.yml.bak 127 | fi 128 | if ! git diff --quiet ; then 129 | echo -n "Updating $fieldtoupdate in _config.yml to $version..." 130 | git add _config.yml > /dev/null 131 | git commit -q -m "Update $fieldtoupdate version to $version" 132 | echo " Done." 133 | fi 134 | 135 | echo "Update succeeded." 136 | -------------------------------------------------------------------------------- /util/deploy_snapshot.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # see https://coderwall.com/p/9b_lfq 4 | 5 | set -e -u 6 | 7 | if [ "$TRAVIS_REPO_SLUG" == "bdupras/guava-probably" ] && \ 8 | [ "$TRAVIS_JDK_VERSION" == "oraclejdk8" ] && \ 9 | [ "$TRAVIS_PULL_REQUEST" == "false" ] && \ 10 | [ "$TRAVIS_BRANCH" == "master" ]; then 11 | echo "Publishing Maven snapshot..." 12 | 13 | mvn clean source:jar javadoc:jar deploy --settings="util/settings.xml" -DskipTests=true 14 | 15 | echo "Maven snapshot published." 
16 | fi 17 | -------------------------------------------------------------------------------- /util/settings.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | ossrh 7 | ${env.CI_DEPLOY_USERNAME} 8 | ${env.CI_DEPLOY_PASSWORD} 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /util/update_snapshot_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # see http://benlimmer.com/2013/12/26/automatically-publish-javadoc-to-gh-pages-with-travis-ci/ for details 4 | 5 | set -e -u -x 6 | 7 | if [ "$TRAVIS_REPO_SLUG" == "bdupras/guava-probably" ] && \ 8 | [ "$TRAVIS_JDK_VERSION" == "oraclejdk8" ] && \ 9 | [ "$TRAVIS_PULL_REQUEST" == "false" ] && \ 10 | [ "$TRAVIS_BRANCH" == "master" ]; then 11 | echo "Publishing Javadoc ..." 12 | 13 | cd $HOME 14 | rm -Rf gh-pages 15 | git clone -q -b gh-pages https://${GH_TOKEN}@github.com/bdupras/guava-probably gh-pages > /dev/null 16 | cd gh-pages 17 | 18 | git config --global user.email "travis@travis-ci.org" 19 | git config --global user.name "travis-ci" 20 | 21 | ./updaterelease.sh snapshot 22 | 23 | git push -fq origin gh-pages > /dev/null 24 | 25 | echo "Javadoc published to gh-pages." 26 | fi 27 | -------------------------------------------------------------------------------- /util/util.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e -u 4 | 5 | # Exits if there are uncommitted changes in the git repo. 6 | function ensure_no_uncommitted_changes { 7 | if ! git diff --quiet ; then 8 | echo "Uncommitted changes found. Aborting." >&2 9 | exit 1 10 | fi 11 | } 12 | 13 | # Returns the current git branch, if any; the HEAD commit's SHA1 otherwise. 
14 | function current_git_ref { 15 | branch=$(git rev-parse --abbrev-ref HEAD) 16 | if [ $branch == "HEAD" ]; then 17 | echo $(git rev-parse HEAD) 18 | else 19 | echo $branch 20 | fi 21 | } 22 | 23 | # Returns the version of the project at the current revision, pulled from Maven. 24 | function project_version { 25 | mvn -B org.apache.maven.plugins:maven-help-plugin:2.1.1:evaluate \ 26 | -Dexpression=project.version \ 27 | | grep -Ev '(^\[|Download\w+:)' 28 | } 29 | 30 | # Checks that a given tag exists in the repo. 31 | function check_tag_exists { 32 | tag=$1 33 | if ! git show-ref -q --verify refs/tags/$tag; then 34 | echo "Tag $tag does not exist" >&2 35 | exit 1 36 | fi 37 | } 38 | 39 | # Takes an input arg representing a version (as parsed in parse_version) and 40 | # returns the git ref (tag or branch) that should be checked out to get that version. 41 | function git_ref { 42 | release=$1 43 | if [[ $release == "snapshot" ]]; then 44 | echo "master" 45 | else 46 | tag="v$release" 47 | check_tag_exists $tag 48 | echo $tag 49 | fi 50 | } 51 | 52 | # Checks out the branch/ref with the given identifier. 53 | # If the ref is the master branch, pulls to update it. 54 | function git_checkout_ref { 55 | ref=$1 56 | echo -n "Checking out '$ref'..." 57 | git checkout -q $ref 58 | echo " Done." 59 | 60 | # If we're on master, pull to get the latest 61 | if [ $ref == "master" ]; then 62 | echo -n "Pulling to get latest changes..." 63 | git pull -q --ff-only 64 | echo " Done." 65 | fi 66 | } 67 | 68 | platform=$(uname) 69 | if [[ $platform == "Linux" ]]; then 70 | # GNU utils 71 | extended="-r" 72 | versionsort="--version-sort" 73 | else 74 | # BSD utils 75 | extended="-E" 76 | versionsort="-g" 77 | fi 78 | 79 | # Sorts all numeric releases from the _releases/ directory by version, from 80 | # greatest version to least. 
This works as you'd expect, for example: 81 | # 82 | # 18.0.2 > 18.0.1 > 18.0 > 18.0-rc2 > 18.0-rc1 > 17.1 > 17.0.1 > 17.0 83 | # 84 | # This function expects to be run with the working directory at the root of 85 | # the git tree. 86 | function sort_releases { 87 | # This is all sorts of hacky and I'm sure there's a better way, but it 88 | # seems to work as long as we're just dealing with versions like 1.2, 89 | # 1.2.3, 1.2-rc1 and 1.2.3-rc1. The grep pattern is quoted so the shell neither globs it nor strips the backslash. 90 | ls _releases | \ 91 | grep -E '^[0-9]+\.[0-9]+' | \ 92 | sort -u | \ 93 | sed $extended -e 's/^([0-9]+\.[0-9]+)$/\1.01/g' -e 's/-rc/!/g' | \ 94 | sort -r $versionsort | \ 95 | sed $extended -e 's/!/-rc/g' -e 's/\.01//g' 96 | } 97 | 98 | # Gets the major version part of a version number. 99 | function major_version { 100 | majorversion=$(echo "$1" | cut -d . -f 1) 101 | if [[ ! $majorversion =~ ^[0-9]+$ ]]; then 102 | echo "Invalid version number: $1" >&2 103 | exit 1 104 | fi 105 | echo $majorversion 106 | } 107 | 108 | # Prints the highest non-rc release from the sorted list of releases 109 | # produced by sort_releases. If a release argument is provided, print 110 | # the highest non-rc release that has a major version that is lower than 111 | # the given release. For example, given "16.0.1", return "15.0". 112 | function latest_release { 113 | if [[ $# -eq 1 ]]; then 114 | ceiling=$(major_version "$1") 115 | else 116 | ceiling="" 117 | fi 118 | 119 | releases=$(sort_releases) 120 | for release in $releases; do 121 | if [[ ! -z "$ceiling" ]]; then 122 | releasemajor=$(major_version "$release") 123 | if (( ceiling <= releasemajor )); then 124 | continue 125 | fi 126 | fi 127 | if [[ ! $release =~ ^.+-rc[0-9]+$ ]]; then 128 | echo $release 129 | break 130 | fi 131 | done 132 | } 133 | --------------------------------------------------------------------------------