├── .github ├── mergify.yml └── workflows │ └── simple-build-test.yml ├── .gitignore ├── Cargo.toml ├── LICENSE ├── Makefile ├── README.md ├── RELEASES.md ├── TODO.md ├── benches ├── fuse16_bench.rs ├── fuse8_bench.rs └── xor_bench.rs ├── check.sh ├── perf.sh ├── rustfmt.toml ├── src ├── bin │ └── perf.rs ├── fuse16.rs ├── fuse16_test.rs ├── fuse8.rs ├── fuse8_test.rs ├── hasher.rs ├── lib.rs ├── xor8 │ ├── builder.rs │ ├── filter.rs │ ├── mod.rs │ └── xor8_test.rs └── xor8_old.rs └── tests ├── tl1-serialized.data └── xorfilter.rs /.github/mergify.yml: -------------------------------------------------------------------------------- 1 | queue_rules: 2 | - name: feature_queue 3 | conditions: 4 | # - '#check-pending=0' 5 | - '#check-success>=2' 6 | # - check-success=check-subject 7 | - check-success=xorfilter (stable, --release, test) 8 | - check-success~=xorfilter 9 | 10 | pull_request_rules: 11 | 12 | - name: put into queue if approved 13 | conditions: 14 | - "#approved-reviews-by>=0" 15 | - "#changes-requested-reviews-by=0" 16 | # - check-success=check-subject 17 | - check-success=xorfilter (stable, --release, test) 18 | actions: 19 | queue: 20 | name: feature_queue 21 | 22 | - name: Delete head branch after merge 23 | conditions: 24 | - merged 25 | actions: 26 | delete_head_branch: 27 | -------------------------------------------------------------------------------- /.github/workflows/simple-build-test.yml: -------------------------------------------------------------------------------- 1 | name: unittest 2 | 3 | on: 4 | push: 5 | pull_request: 6 | schedule: [cron: "40 1 * * *"] 7 | 8 | 9 | jobs: 10 | build: 11 | name: xorfilter 12 | runs-on: ubuntu-latest 13 | 14 | 15 | strategy: 16 | matrix: 17 | toolchain: 18 | - "stable" 19 | - "nightly" 20 | profile: 21 | - "--release" 22 | # Disable debug mode test 23 | # It's about 10 times slower with debug mode: ~ 10 minutes 24 | # - "" 25 | cmd: 26 | - "build" 27 | - "test" 28 | 29 | 30 | steps: 31 | - name: Install 
toolchain with clippy available 32 | uses: actions-rs/toolchain@v1 33 | with: 34 | profile: minimal 35 | toolchain: "${{ matrix.toolchain }}" 36 | override: true 37 | components: clippy 38 | 39 | 40 | - uses: actions/checkout@v2 41 | 42 | 43 | - name: "${{ matrix.toolchain }} ${{ matrix.cmd }} ${{ matrix.profile }}" 44 | uses: actions-rs/cargo@v1 45 | with: 46 | command: "${{ matrix.cmd }}" 47 | args: --verbose ${{ matrix.profile }} 48 | 49 | build-benchmark: 50 | runs-on: ubuntu-latest 51 | 52 | 53 | steps: 54 | - name: Install toolchain with clippy available 55 | uses: actions-rs/toolchain@v1 56 | with: 57 | profile: minimal 58 | toolchain: "nightly" 59 | override: true 60 | 61 | 62 | - uses: actions/checkout@v2 63 | 64 | 65 | - uses: actions-rs/cargo@v1 66 | with: 67 | command: bench 68 | args: --verbose nothing-to-run 69 | 70 | 71 | lint: 72 | runs-on: ubuntu-latest 73 | 74 | steps: 75 | - uses: actions/checkout@v2 76 | - uses: actions-rs/toolchain@v1.0.6 77 | with: 78 | profile: minimal 79 | toolchain: "nightly" 80 | override: true 81 | components: rustfmt, clippy 82 | 83 | 84 | - name: Format 85 | uses: actions-rs/cargo@v1 86 | with: 87 | command: fmt 88 | args: --all -- --check 89 | 90 | 91 | - name: Clippy 92 | uses: actions-rs/clippy-check@v1 93 | with: 94 | token: ${{ secrets.GITHUB_TOKEN }} 95 | args: --all-targets -- -D warnings -A clippy::uninlined_format_args 96 | 97 | 98 | - name: Build-doc 99 | uses: actions-rs/cargo@v1 100 | with: 101 | command: doc 102 | args: --all --no-deps 103 | env: 104 | RUSTDOCFLAGS: "-D warnings" 105 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | target 3 | Cargo.lock 4 | .vimsession 5 | core 6 | test.out 7 | flamegraph.svg 8 | perf.out 9 | check.out 10 | perf.data 11 | perf.data.old 12 | -------------------------------------------------------------------------------- /Cargo.toml: 
-------------------------------------------------------------------------------- 1 | [package] 2 | name = "xorfilter-rs" 3 | version = "0.6.0" 4 | description = "Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters" 5 | repository = "https://github.com/bnclabs/xorfilter" 6 | documentation = "https://docs.rs/xorfilter-rs" 7 | keywords = ["xorfilter", "bloom", "bitmap", "data-structures"] 8 | categories = ["algorithms", "database", "data-structures"] 9 | homepage = "https://github.com/bnclabs/xorfilter" 10 | authors = ["prataprc "] 11 | license = "Apache-2.0" 12 | edition = "2018" 13 | readme = "README.md" 14 | 15 | [profile.release] 16 | debug = true 17 | 18 | [profile.bench] 19 | debug = true 20 | 21 | [lib] 22 | name = "xorfilter" 23 | 24 | [[bin]] 25 | name = "perf" 26 | required-features = ["perf", "cbordata"] 27 | 28 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 29 | [dependencies] 30 | cbordata = { version = "0.6.0", optional = true } 31 | structopt = { version = "0.3.20", default-features = false, optional = true } 32 | rand = { version = "0.7.3", features = ["small_rng"], optional = true } 33 | 34 | [dev-dependencies] 35 | criterion = "0.3" 36 | rand = { version = "0.7.3", features = ["small_rng"] } 37 | 38 | [[bench]] 39 | name = "xor_bench" 40 | harness = false 41 | 42 | [[bench]] 43 | name = "fuse8_bench" 44 | harness = false 45 | 46 | [[bench]] 47 | name = "fuse16_bench" 48 | harness = false 49 | 50 | [features] 51 | perf = ["structopt", "rand", "cbordata"] 52 | 53 | [badges] 54 | maintenance = { status = "actively-developed" } 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | # ... build ... 3 | cargo +nightly build 4 | cargo +nightly build --features cbordata 5 | cargo +stable build 6 | cargo +stable build --features cbordata 7 | # 8 | # ... test ... 9 | cargo +nightly test --no-run 10 | cargo +nightly test --no-run --features cbordata 11 | cargo +stable test --no-run 12 | cargo +stable test --no-run --features cbordata 13 | # 14 | # ... bench ... 15 | cargo +nightly bench --no-run 16 | cargo +nightly bench --no-run --features cbordata 17 | # 18 | # ... doc ... 19 | cargo +nightly doc 20 | cargo +nightly doc --features cbordata 21 | cargo +stable doc 22 | cargo +stable doc --features cbordata 23 | # 24 | # ... meta commands ... 25 | cargo +nightly clippy --all-targets --all-features 26 | 27 | test: 28 | # ... test ... 
29 | cargo +nightly test 30 | cargo +nightly test --features cbordata 31 | cargo +stable test --no-run 32 | cargo +stable test --no-run --features cbordata 33 | 34 | lint: 35 | cargo fmt 36 | cargo clippy --all-targets -- -D warnings 37 | 38 | doc: 39 | RUSTDOCFLAGS="-D warnings" cargo doc --all --no-deps 40 | 41 | bench: 42 | # ... test ... 43 | cargo +nightly bench 44 | cargo +nightly bench --features cbordata 45 | cargo +stable test --no-run 46 | 47 | flamegraph: 48 | echo "not an executable" 49 | 50 | prepare: build test bench 51 | check.sh check.out 52 | perf.sh perf.out 53 | 54 | clean: 55 | cargo clean 56 | rm -f check.out perf.out flamegraph.svg perf.data perf.data.old 57 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Rustdoc](https://img.shields.io/badge/rustdoc-hosted-blue.svg)](https://docs.rs/xorfilter-rs) 2 | [![simple-build-test](https://github.com/bnclabs/xorfilter/actions/workflows/simple-build-test.yml/badge.svg)](https://github.com/bnclabs/xorfilter/actions/workflows/simple-build-test.yml) 3 | 4 | Rust library implementing xor filters 5 | ------------------------------------- 6 | 7 | Implementation of [Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters](https://arxiv.org/abs/1912.08258) 8 | in [rust-lang](https://www.rust-lang.org/), Journal of Experimental Algorithmics (to appear). 9 | 10 | This package is a port from its [golang implementation](https://github.com/FastFilter/xorfilter). 11 | 12 | ### How to use _xorfilter_ in my rust project ? 
13 | 14 | Add the following under project's `Cargo.toml`: 15 | 16 | ```toml 17 | [dependencies] 18 | xorfilter-rs = "0.2.0" 19 | ``` 20 | 21 | or 22 | 23 | ```toml 24 | [dependencies] 25 | xorfilter-rs = { git = "https://github.com/bnclabs/xorfilter" } 26 | ``` 27 | 28 | ```rust 29 | use xorfilter::Xor8; 30 | 31 | let mut keys: Vec<u64> = vec![]; 32 | for _ in 0..num_keys { 33 | keys.push(rng.gen()); 34 | } 35 | 36 | let mut filter = Xor8::new(); // new filter. 37 | filter.populate_keys(&keys); // populate keys. 38 | filter.build(); // build bitmap. 39 | 40 | for key in 0..lookup { 41 | // there can be false positives, but no false negatives. 42 | filter.contains_key(key); 43 | } 44 | ``` 45 | 46 | Open issues 47 | ----------- 48 | 49 | * [ ] Serialize / Deserialize Xor8 type. 50 | * [ ] Incrementally adding keys to a pre-built Xor8 instance. 51 | * [ ] Gather benchmark results for other implementations - Go, C, C++, Erlang, Java, Python. 52 | 53 | Benchmarks 54 | ---------- 55 | 56 | Following are the results for a set of 10-million `u64` keys: 57 | 58 | | | build 10M keys | membership | FPP | Bits/Entry | 59 | |-------------|-----------------|-------------|---------|-------------| 60 | | Xor8-C | 1.206 secs | NA | 0.389 % | 9.84 bits | 61 | | Xor8-rust | 1.809 secs | 61.716 ns | 0.392 % | 9.84 bits | 62 | | Fuse8-C | 0.508 secs | NA | 0.390 % | 9.02 bits | 63 | | Fuse8-rust | 0.577 secs | 42.657 ns | 0.392 % | 9.02 bits | 64 | | Fuse16-C | 0.515 secs | NA | 0.001 % | 18.04 bits | 65 | | Fuse16-rust | 0.621 secs | 54.657 ns | 0.001 % | 18.03 bits | 66 | 67 | * **Build time** is measured in `Seconds`, for 10 million entries. 68 | * **Membership** is measured in `Nanosec`, for single lookup in a set of 10 million entries.
69 | * **FPP** = False Positive Probability measured in percentage 70 | 71 | Useful links 72 | ------------ 73 | 74 | * [Xor Filters: Faster and Smaller Than Bloom and Cuckoo Filters](https://arxiv.org/abs/1912.08258) 75 | * [Blog post by Daniel Lemire](https://lemire.me/blog/2019/12/19/xor-filters-faster-and-smaller-than-bloom-filters/) 76 | 77 | 78 | Contribution 79 | ------------ 80 | 81 | * Simple workflow. Fork - Modify - Pull request. 82 | * Before creating a PR, 83 | * Run `make build` to confirm all versions of build is passing with 84 | 0 warnings and 0 errors. 85 | * Run `check.sh` with 0 warnings, 0 errors and all test-cases passing. 86 | * Run `perf.sh` with 0 warnings, 0 errors and all test-cases passing. 87 | * [Install][spellcheck] and run `cargo spellcheck` to remove common spelling mistakes. 88 | * [Developer certificate of origin][dco] is preferred. 89 | 90 | [dco]: https://developercertificate.org/ 91 | [spellcheck]: https://github.com/drahnr/cargo-spellcheck 92 | -------------------------------------------------------------------------------- /RELEASES.md: -------------------------------------------------------------------------------- 1 | 0.6.0 2 | ===== 3 | 4 | * Added len() method for Fuse8 5 | * Added len() method for Fuse16 6 | 7 | 0.5.1 8 | ===== 9 | 10 | * Fuse8: handle duplicates without sorting. 11 | * Fuse8: improve test case with duplicate keys. 12 | * CI: improvement to check.sh script. 13 | * rustdoc. 14 | 15 | 0.5.0 16 | ===== 17 | 18 | **Breaking Change** 19 | 20 | File version moves from `TL1` to `TL2`. 21 | * Now includes `hash_builder` field as part of Xor8 serialization. 22 | * Test cases for TL1 (backward compatibility) and TL2. 23 | * METADATA includes length of the serialized `hash_builder`. 24 | * Shape of the serialized file has changed. 25 | * `Xor8::write_file`, `Xor8::read_file`, `Xor8::to_bytes`, `Xor8::from_bytes` 26 | methods expect that type parameter implements `Default`, `Clone`, 27 | `From>`, `Into>` traits. 
28 | 29 | * Bugfix: Check for duplicate key-digest. It is possible that, when using 30 | `insert()`, `populate()`, keys could generate duplicate digest. This will 31 | lead to failure while building the filter. To mitigate this issue we are 32 | maintaining the digests in sort order. 33 | * `Fuse8` and `Fuse16` implementation. 34 | * hasher: NoHash, to use the types and its methods using `u64` digests. 35 | * Add `size_of()` method to filter types. 36 | * Support key-set of size 0, 1, and 2. 37 | * Improve test cases. 38 | * rustdoc 39 | * cargo: fix category slug 40 | 41 | 0.4.0 42 | ===== 43 | 44 | * package maintenance. 45 | 46 | 0.3.0 47 | ===== 48 | 49 | * Xor8 to bytes and vice-versa conversion. 50 | * implement Default trait. 51 | * implement mkit's IntoCbor and FromCbor traits for Cbor serialization. 52 | * improve test cases. 53 | * use criterion for benchmark. 54 | * clippy fixes. 55 | * ci scripts. 56 | 57 | 0.2.0 58 | ===== 59 | 60 | * `write_file()` and `read_file()` methods on Xor8 type will take 61 | `&ffi::OsStr` instead of `&str`. This is more consistent with rust-idiom. 62 | * cleanup test-cases. 63 | * cleanup Makefile. 64 | 65 | 0.1.0 66 | ===== 67 | 68 | * First release 69 | 70 | Refer to [release-checklist][release-checklist]. 71 | 72 | [release-checklist]: https://prataprc.github.io/rust-crates-release-checklist.html 73 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | TODO List 2 | ========= 3 | 4 | * Test cases take long time to run. Maybe split it or put it under ignore. 5 | * Add `test-all` target in the Makefile to test ignored cases. 6 | * Can we have parallel build algorithm for Xor8, Fuse8 and Fuse16 filters ?
7 | -------------------------------------------------------------------------------- /benches/fuse16_bench.rs: -------------------------------------------------------------------------------- 1 | use std::collections::hash_map::RandomState; 2 | 3 | use criterion::criterion_group; 4 | use criterion::criterion_main; 5 | use criterion::Criterion; 6 | use rand::prelude::random; 7 | use rand::rngs::StdRng; 8 | use rand::Rng; 9 | use rand::SeedableRng; 10 | use xorfilter::Fuse16; 11 | 12 | const SIZE: usize = 1_000_000; 13 | 14 | fn generate_unique_keys(rng: &mut StdRng, size: usize) -> Vec { 15 | let mut keys: Vec = Vec::with_capacity(size); 16 | keys.resize(size, Default::default()); 17 | 18 | for key in keys.iter_mut() { 19 | *key = rng.gen(); 20 | } 21 | keys.sort_unstable(); 22 | keys.dedup(); 23 | 24 | for _i in 0..(size - keys.len()) { 25 | let key = rng.gen::(); 26 | if !keys.contains(&key) { 27 | keys.push(key) 28 | } 29 | } 30 | 31 | keys 32 | } 33 | 34 | fn bench_fuse16_populate_keys(c: &mut Criterion) { 35 | let seed: u64 = random(); 36 | println!("bench_fuse16_populate_keys seed:{}", seed); 37 | let mut rng = StdRng::seed_from_u64(seed); 38 | 39 | let keys = generate_unique_keys(&mut rng, SIZE); 40 | 41 | c.bench_function("fuse16_populate_keys", |b| { 42 | b.iter(|| { 43 | let mut filter = Fuse16::::new(keys.len() as u32); 44 | filter.populate_keys(&keys); 45 | filter.build().expect("failed build"); 46 | }) 47 | }); 48 | } 49 | 50 | fn bench_fuse16_build_keys(c: &mut Criterion) { 51 | let seed: u64 = random(); 52 | println!("bench_fuse16_build_keys seed:{}", seed); 53 | let mut rng = StdRng::seed_from_u64(seed); 54 | 55 | let keys = generate_unique_keys(&mut rng, SIZE); 56 | 57 | c.bench_function("fuse16_build_keys", |b| { 58 | b.iter(|| { 59 | let mut filter = Fuse16::::new(keys.len() as u32); 60 | filter.build_keys(&keys).expect("failed build"); 61 | }) 62 | }); 63 | } 64 | 65 | fn bench_fuse16_populate(c: &mut Criterion) { 66 | let seed: u64 = random(); 
67 | println!("bench_fuse16_populate seed:{}", seed); 68 | let mut rng = StdRng::seed_from_u64(seed); 69 | 70 | let keys = generate_unique_keys(&mut rng, SIZE); 71 | 72 | c.bench_function("fuse16_populate", |b| { 73 | b.iter(|| { 74 | let mut filter = Fuse16::::new(keys.len() as u32); 75 | filter.populate(&keys); 76 | filter.build().expect("failed build"); 77 | }) 78 | }); 79 | } 80 | 81 | fn bench_fuse16_insert(c: &mut Criterion) { 82 | let seed: u64 = random(); 83 | println!("bench_fuse16_insert seed:{}", seed); 84 | let mut rng = StdRng::seed_from_u64(seed); 85 | 86 | let keys = generate_unique_keys(&mut rng, SIZE); 87 | 88 | c.bench_function("fuse16_insert", |b| { 89 | b.iter(|| { 90 | let mut filter = Fuse16::::new(keys.len() as u32); 91 | keys.iter().for_each(|key| filter.insert(key)); 92 | filter.build().expect("failed build"); 93 | }) 94 | }); 95 | } 96 | 97 | fn bench_fuse16_contains(c: &mut Criterion) { 98 | let seed: u64 = random(); 99 | println!("bench_fuse16_contains seed:{}", seed); 100 | let mut rng = StdRng::seed_from_u64(seed); 101 | 102 | let keys = generate_unique_keys(&mut rng, SIZE); 103 | 104 | let filter = { 105 | let mut filter = Fuse16::::new(keys.len() as u32); 106 | filter.populate(&keys); 107 | filter.build().expect("failed build"); 108 | filter 109 | }; 110 | 111 | let mut n = 0; 112 | c.bench_function("fuse16_contains", |b| { 113 | b.iter(|| { 114 | filter.contains(&keys[n % keys.len()]); 115 | n += 1; 116 | }) 117 | }); 118 | } 119 | 120 | fn bench_fuse16_contains_key(c: &mut Criterion) { 121 | let seed: u64 = random(); 122 | println!("bench_fuse16_contains_key seed:{}", seed); 123 | let mut rng = StdRng::seed_from_u64(seed); 124 | 125 | let keys = generate_unique_keys(&mut rng, SIZE); 126 | 127 | let filter = { 128 | let mut filter = Fuse16::::new(keys.len() as u32); 129 | filter.populate(&keys); 130 | filter.build().expect("failed build"); 131 | filter 132 | }; 133 | 134 | let mut n = 0; 135 | c.bench_function("fuse16_contains_key", 
|b| { 136 | b.iter(|| { 137 | filter.contains_key(keys[n % keys.len()]); 138 | n += 1; 139 | }) 140 | }); 141 | } 142 | 143 | criterion_group!( 144 | benches, 145 | bench_fuse16_populate_keys, 146 | bench_fuse16_build_keys, 147 | bench_fuse16_populate, 148 | bench_fuse16_insert, 149 | bench_fuse16_contains, 150 | bench_fuse16_contains_key, 151 | ); 152 | 153 | criterion_main!(benches); 154 | -------------------------------------------------------------------------------- /benches/fuse8_bench.rs: -------------------------------------------------------------------------------- 1 | use std::collections::hash_map::RandomState; 2 | 3 | use criterion::criterion_group; 4 | use criterion::criterion_main; 5 | use criterion::Criterion; 6 | use rand::prelude::random; 7 | use rand::rngs::StdRng; 8 | use rand::Rng; 9 | use rand::SeedableRng; 10 | use xorfilter::Fuse8; 11 | 12 | const SIZE: usize = 1_000_000; 13 | 14 | fn generate_unique_keys(rng: &mut StdRng, size: usize) -> Vec { 15 | let mut keys: Vec = Vec::with_capacity(size); 16 | keys.resize(size, u64::default()); 17 | 18 | for key in keys.iter_mut() { 19 | *key = rng.gen(); 20 | } 21 | keys.sort_unstable(); 22 | keys.dedup(); 23 | 24 | for _i in 0..(size - keys.len()) { 25 | let key = rng.gen::(); 26 | if !keys.contains(&key) { 27 | keys.push(key) 28 | } 29 | } 30 | 31 | keys 32 | } 33 | 34 | fn bench_fuse8_populate_keys(c: &mut Criterion) { 35 | let seed: u64 = random(); 36 | println!("bench_fuse8_populate_keys seed:{}", seed); 37 | let mut rng = StdRng::seed_from_u64(seed); 38 | 39 | let keys = generate_unique_keys(&mut rng, SIZE); 40 | 41 | c.bench_function("fuse8_populate_keys", |b| { 42 | b.iter(|| { 43 | let mut filter = Fuse8::::new(keys.len() as u32); 44 | filter.populate_keys(&keys); 45 | filter.build().expect("failed build"); 46 | }) 47 | }); 48 | } 49 | 50 | fn bench_fuse8_build_keys(c: &mut Criterion) { 51 | let seed: u64 = random(); 52 | println!("bench_fuse8_build_keys seed:{}", seed); 53 | let mut rng = 
StdRng::seed_from_u64(seed); 54 | 55 | let keys = generate_unique_keys(&mut rng, SIZE); 56 | 57 | c.bench_function("fuse8_build_keys", |b| { 58 | b.iter(|| { 59 | let mut filter = Fuse8::::new(keys.len() as u32); 60 | filter.build_keys(&keys).expect("failed build"); 61 | }) 62 | }); 63 | } 64 | 65 | fn bench_fuse8_populate(c: &mut Criterion) { 66 | let seed: u64 = random(); 67 | println!("bench_fuse8_populate seed:{}", seed); 68 | let mut rng = StdRng::seed_from_u64(seed); 69 | 70 | let keys = generate_unique_keys(&mut rng, SIZE); 71 | 72 | c.bench_function("fuse8_populate", |b| { 73 | b.iter(|| { 74 | let mut filter = Fuse8::::new(keys.len() as u32); 75 | filter.populate(&keys); 76 | filter.build().expect("failed build"); 77 | }) 78 | }); 79 | } 80 | 81 | fn bench_fuse8_insert(c: &mut Criterion) { 82 | let seed: u64 = random(); 83 | println!("bench_fuse8_insert seed:{}", seed); 84 | let mut rng = StdRng::seed_from_u64(seed); 85 | 86 | let keys = generate_unique_keys(&mut rng, SIZE); 87 | 88 | c.bench_function("fuse8_insert", |b| { 89 | b.iter(|| { 90 | let mut filter = Fuse8::::new(keys.len() as u32); 91 | keys.iter().for_each(|key| filter.insert(key)); 92 | filter.build().expect("failed build"); 93 | }) 94 | }); 95 | } 96 | 97 | fn bench_fuse8_contains(c: &mut Criterion) { 98 | let seed: u64 = random(); 99 | println!("bench_fuse8_contains seed:{}", seed); 100 | let mut rng = StdRng::seed_from_u64(seed); 101 | 102 | let keys = generate_unique_keys(&mut rng, SIZE); 103 | 104 | let filter = { 105 | let mut filter = Fuse8::::new(keys.len() as u32); 106 | filter.populate(&keys); 107 | filter.build().expect("failed build"); 108 | filter 109 | }; 110 | 111 | let mut n = 0; 112 | c.bench_function("fuse8_contains", |b| { 113 | b.iter(|| { 114 | filter.contains(&keys[n % keys.len()]); 115 | n += 1; 116 | }) 117 | }); 118 | } 119 | 120 | fn bench_fuse8_contains_key(c: &mut Criterion) { 121 | let seed: u64 = random(); 122 | println!("bench_fuse8_contains_key seed:{}", seed); 
123 | let mut rng = StdRng::seed_from_u64(seed); 124 | 125 | let keys = generate_unique_keys(&mut rng, SIZE); 126 | 127 | let filter = { 128 | let mut filter = Fuse8::::new(keys.len() as u32); 129 | filter.populate(&keys); 130 | filter.build().expect("failed build"); 131 | filter 132 | }; 133 | 134 | let mut n = 0; 135 | c.bench_function("fuse8_contains_key", |b| { 136 | b.iter(|| { 137 | filter.contains_key(keys[n % keys.len()]); 138 | n += 1; 139 | }) 140 | }); 141 | } 142 | 143 | criterion_group!( 144 | benches, 145 | bench_fuse8_populate_keys, 146 | bench_fuse8_build_keys, 147 | bench_fuse8_populate, 148 | bench_fuse8_insert, 149 | bench_fuse8_contains, 150 | bench_fuse8_contains_key, 151 | ); 152 | 153 | criterion_main!(benches); 154 | -------------------------------------------------------------------------------- /benches/xor_bench.rs: -------------------------------------------------------------------------------- 1 | use std::collections::hash_map::RandomState; 2 | 3 | use criterion::criterion_group; 4 | use criterion::criterion_main; 5 | use criterion::Criterion; 6 | use rand::prelude::random; 7 | use rand::rngs::StdRng; 8 | use rand::Rng; 9 | use rand::SeedableRng; 10 | use xorfilter::xor8::Xor8Builder; 11 | 12 | const SIZE: usize = 1_000_000; 13 | 14 | fn generate_unique_keys(rng: &mut StdRng, size: usize) -> Vec { 15 | let mut keys: Vec = Vec::with_capacity(size); 16 | keys.resize(size, u64::default()); 17 | 18 | for key in keys.iter_mut() { 19 | *key = rng.gen(); 20 | } 21 | keys.sort_unstable(); 22 | keys.dedup(); 23 | 24 | for _i in 0..(size - keys.len()) { 25 | let key = rng.gen::(); 26 | if !keys.contains(&key) { 27 | keys.push(key) 28 | } 29 | } 30 | 31 | keys 32 | } 33 | 34 | fn bench_xor8_populate_digests(c: &mut Criterion) { 35 | let seed: u64 = random(); 36 | let mut rng = StdRng::seed_from_u64(seed); 37 | 38 | let digests = generate_unique_keys(&mut rng, SIZE); 39 | 40 | c.bench_function("xor8_populate_digests", |b| { 41 | b.iter(|| { 42 | 
let mut builder = Xor8Builder::::new(); 43 | builder.populate_digests(&digests); 44 | let _filter = criterion::black_box(builder.build().expect("failed build")); 45 | }) 46 | }); 47 | } 48 | 49 | fn bench_xor8_build_from_digests(c: &mut Criterion) { 50 | let seed: u64 = random(); 51 | let mut rng = StdRng::seed_from_u64(seed); 52 | 53 | let keys = generate_unique_keys(&mut rng, SIZE); 54 | 55 | c.bench_function("xor8_build_from_digests", |b| { 56 | b.iter(|| { 57 | let mut builder = Xor8Builder::::new(); 58 | let _filter = criterion::black_box( 59 | builder.build_from_digests(&keys).expect("failed build"), 60 | ); 61 | }) 62 | }); 63 | } 64 | 65 | fn bench_xor8_populate(c: &mut Criterion) { 66 | let seed: u64 = random(); 67 | let mut rng = StdRng::seed_from_u64(seed); 68 | 69 | let keys = generate_unique_keys(&mut rng, SIZE); 70 | 71 | c.bench_function("xor8_populate", |b| { 72 | b.iter(|| { 73 | let mut builder = Xor8Builder::::new(); 74 | builder.populate(&keys); 75 | let _filter = criterion::black_box(builder.build().expect("failed build")); 76 | }) 77 | }); 78 | } 79 | 80 | fn bench_xor8_insert(c: &mut Criterion) { 81 | let seed: u64 = random(); 82 | let mut rng = StdRng::seed_from_u64(seed); 83 | 84 | let keys = generate_unique_keys(&mut rng, SIZE); 85 | 86 | c.bench_function("xor8_insert", |b| { 87 | b.iter(|| { 88 | let mut builder = Xor8Builder::::new(); 89 | keys.iter().for_each(|key| builder.insert(key)); 90 | let _f = criterion::black_box(builder.build().expect("failed build")); 91 | }) 92 | }); 93 | } 94 | 95 | fn bench_xor8_contains(c: &mut Criterion) { 96 | let seed: u64 = random(); 97 | let mut rng = StdRng::seed_from_u64(seed); 98 | 99 | let keys = generate_unique_keys(&mut rng, SIZE); 100 | 101 | let filter = { 102 | let mut builder = Xor8Builder::::new(); 103 | builder.populate(&keys); 104 | builder.build().expect("failed build") 105 | }; 106 | 107 | let mut n = 0; 108 | c.bench_function("xor8_contains", |b| { 109 | b.iter(|| { 110 | 
filter.contains(&keys[n % keys.len()]); 111 | n += 1; 112 | }) 113 | }); 114 | } 115 | 116 | fn bench_xor8_contains_digest(c: &mut Criterion) { 117 | let seed: u64 = random(); 118 | let mut rng = StdRng::seed_from_u64(seed); 119 | 120 | let keys = generate_unique_keys(&mut rng, SIZE); 121 | 122 | let filter = { 123 | let mut builder = Xor8Builder::::new(); 124 | builder.populate(&keys); 125 | builder.build().expect("failed build") 126 | }; 127 | 128 | let mut n = 0; 129 | c.bench_function("xor8_contains_digest", |b| { 130 | b.iter(|| { 131 | filter.contains_digest(keys[n % keys.len()]); 132 | n += 1; 133 | }) 134 | }); 135 | } 136 | 137 | criterion_group!( 138 | benches, 139 | bench_xor8_populate_digests, 140 | bench_xor8_build_from_digests, 141 | bench_xor8_populate, 142 | bench_xor8_insert, 143 | bench_xor8_contains, 144 | bench_xor8_contains_digest, 145 | ); 146 | 147 | criterion_main!(benches); 148 | -------------------------------------------------------------------------------- /check.sh: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env bash 2 | 3 | export RUST_BACKTRACE=full 4 | export RUSTFLAGS=-g 5 | 6 | exec > $1 7 | exec 2>&1 8 | 9 | set -o xtrace 10 | 11 | exec_prg() { 12 | 13 | for i in {0..5}; 14 | do 15 | date; cargo +nightly test --release -- --nocapture || exit $?; 16 | date; cargo +nightly test -- --nocapture || exit $?; 17 | date; cargo +nightly test --release --features cbordata -- --nocapture || exit $?; 18 | date; cargo +nightly test --features cbordata -- --nocapture || exit $?; 19 | date; cargo +stable test --release -- --nocapture || exit $?; 20 | date; cargo +stable test -- --nocapture || exit $?; 21 | date; cargo +stable test --release --features cbordata -- --nocapture || exit $?; 22 | date; cargo +stable test --features cbordata -- --nocapture || exit $?; 23 | done 24 | } 25 | 26 | exec_prg 27 | -------------------------------------------------------------------------------- /perf.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | exec > $1 4 | exec 2>&1 5 | 6 | set -o xtrace 7 | 8 | PERF=$HOME/.cargo/target/release/perf 9 | 10 | date; time cargo +nightly bench -- --nocapture || exit $? 11 | 12 | date; time cargo +nightly run --release --bin perf --features=perf -- --loads 10000000 --gets 10000000 xor8 || exit $? 13 | date; time cargo +nightly run --release --bin perf --features=perf -- --loads 10000000 --gets 10000000 fuse8 || exit $? 14 | date; time cargo +nightly run --release --bin perf --features=perf -- --loads 10000000 --gets 10000000 fuse16 || exit $? 
15 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | reorder_imports = true 2 | imports_granularity = "Item" 3 | group_imports = "StdExternalCrate" 4 | where_single_line = true 5 | trailing_comma = "Vertical" 6 | overflow_delimited_expr = true 7 | wrap_comments = true 8 | comment_width = 90 9 | max_width = 90 10 | merge_derives = false 11 | chain_width = 90 12 | -------------------------------------------------------------------------------- /src/bin/perf.rs: -------------------------------------------------------------------------------- 1 | use std::sync::Arc; 2 | use std::thread; 3 | use std::time; 4 | 5 | use rand::random; 6 | use rand::rngs::StdRng; 7 | use rand::Rng; 8 | use rand::SeedableRng; 9 | use structopt::StructOpt; 10 | use xorfilter::BuildHasherDefault; 11 | use xorfilter::Fuse16; 12 | use xorfilter::Fuse8; 13 | use xorfilter::Xor8; 14 | 15 | /// Command line options. 
16 | #[derive(Clone, StructOpt)] 17 | pub struct Opt { 18 | #[structopt(long = "seed", default_value = "0")] 19 | seed: u64, 20 | 21 | #[structopt(long = "loads", default_value = "10000000")] 22 | loads: usize, 23 | 24 | #[structopt(long = "gets", default_value = "10000000")] 25 | gets: usize, 26 | 27 | #[structopt(long = "readers", default_value = "1")] 28 | readers: usize, 29 | 30 | command: String, 31 | } 32 | 33 | fn main() { 34 | let mut opts = Opt::from_args(); 35 | if opts.seed == 0 { 36 | opts.seed = random(); 37 | } 38 | 39 | match opts.command.as_str() { 40 | "xor8" => run_xor8(opts), 41 | "fuse8" => run_fuse8(opts), 42 | "fuse16" => run_fuse16(opts), 43 | _ => unreachable!(), 44 | } 45 | } 46 | 47 | fn run_xor8(opts: Opt) { 48 | let keys: Vec = (0..(opts.loads as u64)).collect(); 49 | 50 | let mut filter = Xor8::::new(); 51 | filter.populate(&keys); 52 | 53 | let start = time::Instant::now(); 54 | filter.build().unwrap(); 55 | println!("Took {:?} to build {} keys", start.elapsed(), keys.len()); 56 | 57 | let mut handles = vec![]; 58 | let keys = Arc::new(keys); 59 | for j in 0..opts.readers { 60 | let (opts, filter, keys) = (opts.clone(), filter.clone(), Arc::clone(&keys)); 61 | let handle = thread::spawn(move || { 62 | let mut rng = StdRng::seed_from_u64(opts.seed); 63 | let (mut hits, start) = (0, time::Instant::now()); 64 | for _i in 0..opts.gets { 65 | let off: usize = rng.gen::() % keys.len(); 66 | if filter.contains(&keys[off]) { 67 | hits += 1; 68 | } 69 | } 70 | println!( 71 | "Reader-{} took {:?} to check {} keys, hits:{} ", 72 | j, 73 | start.elapsed(), 74 | keys.len(), 75 | hits 76 | ); 77 | }); 78 | handles.push(handle); 79 | } 80 | 81 | for handle in handles.into_iter() { 82 | handle.join().unwrap() 83 | } 84 | } 85 | 86 | fn run_fuse8(opts: Opt) { 87 | let keys: Vec = (0..(opts.loads as u64)).collect(); 88 | 89 | let mut filter = Fuse8::::new(keys.len() as u32); 90 | filter.populate(&keys); 91 | 92 | let start = time::Instant::now(); 93 | 
filter.build().unwrap(); 94 | println!("Took {:?} to build {} keys", start.elapsed(), keys.len()); 95 | 96 | let mut handles = vec![]; 97 | let keys = Arc::new(keys); 98 | for j in 0..opts.readers { 99 | let (opts, filter, keys) = (opts.clone(), filter.clone(), Arc::clone(&keys)); 100 | let handle = thread::spawn(move || { 101 | let mut rng = StdRng::seed_from_u64(opts.seed); 102 | let (mut hits, start) = (0, time::Instant::now()); 103 | for _i in 0..opts.gets { 104 | let off: usize = rng.gen::() % keys.len(); 105 | if filter.contains(&keys[off]) { 106 | hits += 1; 107 | } 108 | } 109 | println!( 110 | "Reader-{} took {:?} to check {} keys, hits:{} ", 111 | j, 112 | start.elapsed(), 113 | keys.len(), 114 | hits 115 | ); 116 | }); 117 | handles.push(handle); 118 | } 119 | 120 | for handle in handles.into_iter() { 121 | handle.join().unwrap() 122 | } 123 | } 124 | 125 | fn run_fuse16(opts: Opt) { 126 | let keys: Vec = (0..(opts.loads as u64)).collect(); 127 | 128 | let mut filter = Fuse16::::new(keys.len() as u32); 129 | filter.populate(&keys); 130 | 131 | let start = time::Instant::now(); 132 | filter.build().unwrap(); 133 | println!("Took {:?} to build {} keys", start.elapsed(), keys.len()); 134 | 135 | let mut handles = vec![]; 136 | let keys = Arc::new(keys); 137 | for j in 0..opts.readers { 138 | let (opts, filter, keys) = (opts.clone(), filter.clone(), Arc::clone(&keys)); 139 | let handle = thread::spawn(move || { 140 | let mut rng = StdRng::seed_from_u64(opts.seed); 141 | let (mut hits, start) = (0, time::Instant::now()); 142 | for _i in 0..opts.gets { 143 | let off: usize = rng.gen::() % keys.len(); 144 | if filter.contains(&keys[off]) { 145 | hits += 1; 146 | } 147 | } 148 | println!( 149 | "Reader-{} took {:?} to check {} keys, hits:{} ", 150 | j, 151 | start.elapsed(), 152 | keys.len(), 153 | hits 154 | ); 155 | }); 156 | handles.push(handle); 157 | } 158 | 159 | for handle in handles.into_iter() { 160 | handle.join().unwrap() 161 | } 162 | } 163 | 
-------------------------------------------------------------------------------- /src/fuse16.rs: -------------------------------------------------------------------------------- 1 | #[allow(unused_imports)] 2 | use std::collections::hash_map::DefaultHasher; 3 | #[allow(unused_imports)] 4 | use std::collections::hash_map::RandomState; 5 | use std::collections::BTreeMap; 6 | use std::hash::BuildHasher; 7 | use std::hash::Hash; 8 | use std::hash::Hasher; 9 | use std::sync::Arc; 10 | 11 | #[cfg(feature = "cbordata")] 12 | use cbordata::Cbor; 13 | #[cfg(feature = "cbordata")] 14 | use cbordata::Cborize; 15 | #[cfg(feature = "cbordata")] 16 | use cbordata::FromCbor; 17 | #[cfg(feature = "cbordata")] 18 | use cbordata::IntoCbor; 19 | #[cfg(feature = "cbordata")] 20 | use cbordata::{self as cbor}; 21 | 22 | use crate::fuse8::BinaryHashes; 23 | use crate::BuildHasherDefault; 24 | use crate::Error; 25 | use crate::Result; 26 | 27 | // probabillity of success should always be > 0.5 so 100 iterations is highly unlikely. 28 | const XOR_MAX_ITERATIONS: usize = 100; 29 | 30 | #[inline] 31 | pub fn binary_fuse16_fingerprint(hash: u64) -> u64 { 32 | hash ^ (hash >> 32) 33 | } 34 | 35 | /// Type Fuse16 is probabilistic data-structure to test membership of an element in a set. 36 | /// 37 | /// Fuse16 is parametrized over type `H` which is expected to implement [BuildHasher] 38 | /// trait, like types [RandomState] and [BuildHasherDefault]. When not supplied, 39 | /// [BuildHasherDefault] is used as the default hash-builder. 
40 | /// 41 | /// If `RandomState` is used as BuildHasher, `std` has got this to say 42 | /// > _A particular instance RandomState will create the same instances 43 | /// > of Hasher, but the hashers created by two different RandomState 44 | /// > instances are unlikely to produce the same result for the same values._ 45 | /// 46 | /// If [DefaultHasher] is used as BuildHasher, `std` has got this to say, 47 | /// > _The internal algorithm is not specified, and so its hashes 48 | /// > should not be relied upon over releases._ 49 | /// 50 | /// The default type for parameter `H` might change when a reliable and commonly used 51 | /// BuildHasher type available. 52 | pub struct Fuse16 53 | where H: BuildHasher 54 | { 55 | keys: Option>, 56 | pub hash_builder: H, 57 | pub seed: u64, 58 | pub num_keys: Option, 59 | pub segment_length: u32, 60 | pub segment_length_mask: u32, 61 | pub segment_count: u32, 62 | pub segment_count_length: u32, 63 | pub finger_prints: Arc>, 64 | } 65 | 66 | impl Clone for Fuse16 67 | where H: Clone + BuildHasher 68 | { 69 | fn clone(&self) -> Self { 70 | Fuse16 { 71 | keys: Some(BTreeMap::new()), 72 | hash_builder: self.hash_builder.clone(), 73 | seed: self.seed, 74 | num_keys: self.num_keys, 75 | segment_length: self.segment_length, 76 | segment_length_mask: self.segment_length_mask, 77 | segment_count: self.segment_count, 78 | segment_count_length: self.segment_count_length, 79 | finger_prints: Arc::clone(&self.finger_prints), 80 | } 81 | } 82 | } 83 | 84 | impl Fuse16 85 | where H: BuildHasher 86 | { 87 | #[inline] 88 | fn binary_fuse16_hash_batch(&self, hash: u64) -> BinaryHashes { 89 | use crate::fuse8::binary_fuse_mulhi; 90 | 91 | let mut ans = BinaryHashes::default(); 92 | 93 | ans.h0 = binary_fuse_mulhi(hash, self.segment_count_length.into()) as u32; 94 | ans.h1 = ans.h0 + self.segment_length; 95 | ans.h2 = ans.h1 + self.segment_length; 96 | ans.h1 ^= ((hash >> 18) as u32) & self.segment_length_mask; 97 | ans.h2 ^= (hash as u32) & 
self.segment_length_mask; 98 | ans 99 | } 100 | 101 | #[inline] 102 | fn binary_fuse16_hash(&self, index: u32, hash: u64) -> u32 { 103 | use crate::fuse8::binary_fuse_mulhi; 104 | 105 | let mut h = binary_fuse_mulhi(hash, self.segment_count_length.into()); 106 | h += (index * self.segment_length) as u64; 107 | // keep the lower 36 bits 108 | let hh = hash & ((1_u64 << 36) - 1); 109 | // index 0: right shift by 36; index 1: right shift by 18; index 2: no shift 110 | h ^= (hh >> (36 - 18 * index)) & (self.segment_length_mask as u64); 111 | 112 | h as u32 113 | } 114 | } 115 | 116 | impl Fuse16 117 | where H: BuildHasher 118 | { 119 | /// New Fuse16 instance that can index size number of keys. Internal data-structures 120 | /// are pre-allocated for `size`. `size` should be at least 2. 121 | pub fn new(size: u32) -> Fuse16 122 | where H: Default { 123 | Self::with_hasher(size, H::default()) 124 | } 125 | 126 | /// New Fuse16 instance initialized with supplied hasher. 127 | pub fn with_hasher(size: u32, hash_builder: H) -> Fuse16 { 128 | use std::cmp; 129 | 130 | use crate::fuse8::binary_fuse_calculate_segment_length; 131 | use crate::fuse8::binary_fuse_calculate_size_factor; 132 | 133 | let arity = 3_u32; 134 | 135 | let segment_length = match size { 136 | 0 => 4, 137 | size => cmp::min(binary_fuse_calculate_segment_length(arity, size), 262144), 138 | }; 139 | 140 | let segment_length_mask = segment_length - 1; 141 | let mut array_length = { 142 | let size_factor = binary_fuse_calculate_size_factor(arity, size); 143 | let cap = match size { 144 | 0 | 1 => 0, 145 | size => ((size as f64) * size_factor).round() as u32, 146 | }; 147 | let n = ((cap + segment_length - 1) / segment_length).wrapping_sub(arity - 1); 148 | (n.wrapping_add(arity) - 1) * segment_length 149 | }; 150 | 151 | let mut segment_count = (array_length + segment_length - 1) / segment_length; 152 | segment_count = if segment_count <= (arity - 1) { 153 | 1 154 | } else { 155 | segment_count - (arity - 1) 
156 | }; 157 | 158 | array_length = (segment_count + arity - 1) * segment_length; 159 | let segment_count_length = segment_count * segment_length; 160 | 161 | Fuse16 { 162 | keys: Some(BTreeMap::new()), 163 | hash_builder, 164 | seed: u64::default(), 165 | num_keys: None, 166 | segment_length, 167 | segment_length_mask, 168 | segment_count, 169 | segment_count_length, 170 | finger_prints: Arc::new(vec![0; array_length as usize]), 171 | } 172 | } 173 | } 174 | 175 | impl Fuse16 176 | where H: BuildHasher 177 | { 178 | /// Return the size of index. 179 | #[inline] 180 | pub fn size_of(&self) -> usize { 181 | std::mem::size_of::() + (self.finger_prints.len() * 2) 182 | } 183 | 184 | /// Insert 64-bit digest of a single key. Digest for the key shall be generated 185 | /// using the default-hasher or via hasher supplied via [Fuse16::with_hasher] method. 186 | pub fn insert(&mut self, key: &K) { 187 | let digest = { 188 | let mut hasher = self.hash_builder.build_hasher(); 189 | key.hash(&mut hasher); 190 | hasher.finish() 191 | }; 192 | if let Some(x) = self.num_keys.as_mut() { 193 | *x += 1 194 | } 195 | self.keys.as_mut().unwrap().insert(digest, ()); 196 | } 197 | 198 | /// Populate with 64-bit digests for a collection of keys of type `K`. Digest for 199 | /// key shall be generated using the default-hasher or via hasher supplied 200 | /// via [Fuse16::with_hasher] method. 201 | pub fn populate(&mut self, keys: &[K]) { 202 | if let Some(x) = self.num_keys.as_mut() { 203 | *x += keys.len() 204 | } 205 | 206 | keys.iter().for_each(|key| { 207 | let mut hasher = self.hash_builder.build_hasher(); 208 | key.hash(&mut hasher); 209 | self.keys.as_mut().unwrap().insert(hasher.finish(), ()); 210 | }) 211 | } 212 | 213 | /// Populate with pre-compute collection of 64-bit digests. 
214 | pub fn populate_keys(&mut self, digests: &[u64]) { 215 | if let Some(x) = self.num_keys.as_mut() { 216 | *x += digests.len() 217 | } 218 | 219 | for digest in digests.iter() { 220 | self.keys.as_mut().unwrap().insert(*digest, ()); 221 | } 222 | } 223 | // construct the filter, returns true on success, false on failure. 224 | // most likely, a failure is due to too high a memory usage 225 | // size is the number of keys 226 | // The caller is responsable for calling binary_fuse16_allocate(size,filter) 227 | // before. The caller is responsible to ensure that there are no duplicated 228 | // keys. The inner loop will run up to XOR_MAX_ITERATIONS times (default on 229 | // 100), it should never fail, except if there are duplicated keys. If it fails, 230 | // a return value of false is provided. 231 | /// Build bitmap for keys that where previously inserted using [Fuse16::insert], 232 | /// [Fuse16::populate] and [Fuse16::populate_keys] method. 233 | pub fn build(&mut self) -> Result<()> { 234 | match self.keys.take() { 235 | Some(keys) => { 236 | let digests = keys.keys().copied().collect::>(); 237 | self.build_keys(&digests) 238 | } 239 | None => Ok(()), 240 | } 241 | } 242 | 243 | /// Build a bitmap for pre-computed 64-bit digests for keys. If keys where 244 | /// previously inserted using [Fuse16::insert] or [Fuse16::populate] or 245 | /// [Fuse16::populate_keys] methods, they shall be ignored. 246 | /// 247 | /// It is upto the caller to ensure that digests are unique, that there no 248 | /// duplicates. 
249 | pub fn build_keys(&mut self, digests: &[u64]) -> Result<()> { 250 | use crate::fuse8::binary_fuse_mod3; 251 | use crate::fuse8::binary_fuse_murmur64; 252 | use crate::fuse8::binary_fuse_rng_splitmix64; 253 | 254 | let mut rng_counter = 0x726b2b9d438b9d4d_u64; 255 | let capacity = self.finger_prints.len(); 256 | let size = digests.len(); 257 | 258 | self.num_keys = Some(digests.len()); 259 | self.seed = binary_fuse_rng_splitmix64(&mut rng_counter); 260 | let mut reverse_order: Vec = vec![0; size + 1]; 261 | let mut reverse_h: Vec = vec![0; size]; 262 | let mut alone: Vec = vec![0; capacity]; 263 | let mut t2count: Vec = vec![0; capacity]; 264 | let mut t2hash: Vec = vec![0; capacity]; 265 | 266 | let mut block_bits: u32 = 1; 267 | while (1_u32 << block_bits) < self.segment_count { 268 | block_bits += 1; 269 | } 270 | let block = 1_u32 << block_bits; 271 | 272 | let mut start_pos: Vec = vec![0; 1 << block_bits]; 273 | 274 | let mut h012 = [0_u32; 5]; 275 | 276 | reverse_order[size] = 1; // sentinel 277 | let mut iter = 0..=XOR_MAX_ITERATIONS; 278 | loop { 279 | if iter.next().is_none() { 280 | err_at!(Fatal, msg: "Too many iterations. Are all your keys unique?")?; 281 | } 282 | 283 | for i in 0_u32..block { 284 | // important : i * size would overflow as a 32-bit number in some 285 | // cases. 
286 | start_pos[i as usize] = 287 | (((i as u64) * (size as u64)) >> block_bits) as u32; 288 | } 289 | 290 | let mask_block = (block - 1) as u64; 291 | for (_, digest) in digests.iter().enumerate().take(size) { 292 | let hash: u64 = binary_fuse_murmur64(digest.wrapping_add(self.seed)); 293 | let mut segment_index: u64 = hash >> (64 - block_bits); 294 | while reverse_order[start_pos[segment_index as usize] as usize] != 0 { 295 | segment_index += 1; 296 | segment_index &= mask_block; 297 | } 298 | reverse_order[start_pos[segment_index as usize] as usize] = hash; 299 | start_pos[segment_index as usize] += 1; 300 | } 301 | 302 | let mut error: isize = 0; 303 | for (_, rev_order) in reverse_order.iter().enumerate().take(size) { 304 | let hash: u64 = *rev_order; 305 | 306 | let h0: usize = self.binary_fuse16_hash(0, hash) as usize; 307 | t2count[h0] = t2count[h0].wrapping_add(4); 308 | t2hash[h0] ^= hash; 309 | 310 | let h1: usize = self.binary_fuse16_hash(1, hash) as usize; 311 | t2count[h1] = t2count[h1].wrapping_add(4); 312 | t2count[h1] ^= 1; 313 | t2hash[h1] ^= hash; 314 | 315 | let h2: usize = self.binary_fuse16_hash(2, hash) as usize; 316 | t2count[h2] = t2count[h2].wrapping_add(4); 317 | t2hash[h2] ^= hash; 318 | t2count[h2] ^= 2; 319 | 320 | error = if t2count[h0] < 4 { 1 } else { error }; 321 | error = if t2count[h1] < 4 { 1 } else { error }; 322 | error = if t2count[h2] < 4 { 1 } else { error }; 323 | } 324 | 325 | if error > 0 { 326 | reverse_order.fill(0); 327 | reverse_order[size] = 1; // sentinel 328 | t2count.fill(0); 329 | t2hash.fill(0); 330 | self.seed = binary_fuse_rng_splitmix64(&mut rng_counter); 331 | continue; 332 | } 333 | 334 | let mut q_size = 0_usize; // End of key addition 335 | 336 | // Add sets with one key to the queue. 
337 | for (i, x) in t2count.iter().enumerate().take(capacity) { 338 | alone[q_size] = i as u32; 339 | q_size += if (x >> 2) == 1 { 1 } else { 0 }; 340 | } 341 | 342 | let mut stack_size = 0_usize; 343 | 344 | while q_size > 0 { 345 | q_size -= 1; 346 | let index = alone[q_size] as usize; 347 | if (t2count[index] >> 2) == 1 { 348 | let hash: u64 = t2hash[index]; 349 | 350 | //h012[0] = binary_fuse16_hash(0, hash, self); 351 | h012[1] = self.binary_fuse16_hash(1, hash); 352 | h012[2] = self.binary_fuse16_hash(2, hash); 353 | h012[3] = self.binary_fuse16_hash(0, hash); // == h012[0]; 354 | h012[4] = h012[1]; 355 | 356 | let found: u8 = t2count[index] & 3; 357 | reverse_h[stack_size] = found; 358 | reverse_order[stack_size] = hash; 359 | stack_size += 1; 360 | 361 | let other_index1: u32 = h012[(found + 1) as usize]; 362 | alone[q_size] = other_index1; 363 | q_size += if (t2count[other_index1 as usize] >> 2) == 2 { 364 | 1 365 | } else { 366 | 0 367 | }; 368 | 369 | t2count[other_index1 as usize] -= 4; 370 | t2count[other_index1 as usize] ^= binary_fuse_mod3(found + 1); 371 | t2hash[other_index1 as usize] ^= hash; 372 | 373 | let other_index2: u32 = h012[(found + 2) as usize]; 374 | alone[q_size] = other_index2; 375 | q_size += if (t2count[other_index2 as usize] >> 2) == 2 { 376 | 1 377 | } else { 378 | 0 379 | }; 380 | t2count[other_index2 as usize] -= 4; 381 | t2count[other_index2 as usize] ^= binary_fuse_mod3(found + 2); 382 | t2hash[other_index2 as usize] ^= hash; 383 | } 384 | } 385 | 386 | if stack_size == size { 387 | break; // success 388 | } 389 | 390 | reverse_order.fill(0); 391 | reverse_order[size] = 1; // sentinel 392 | t2count.fill(0); 393 | t2hash.fill(0); 394 | 395 | self.seed = binary_fuse_rng_splitmix64(&mut rng_counter); 396 | } 397 | 398 | for i in (0_usize..size).rev() { 399 | // the hash of the key we insert next 400 | let hash: u64 = reverse_order[i]; 401 | let xor2: u16 = binary_fuse16_fingerprint(hash) as u16; 402 | let found: usize = 
reverse_h[i] as usize; 403 | h012[0] = self.binary_fuse16_hash(0, hash); 404 | h012[1] = self.binary_fuse16_hash(1, hash); 405 | h012[2] = self.binary_fuse16_hash(2, hash); 406 | h012[3] = h012[0]; 407 | h012[4] = h012[1]; 408 | 409 | Arc::get_mut(&mut self.finger_prints).unwrap()[h012[found] as usize] = xor2 410 | ^ self.finger_prints[h012[found + 1] as usize] 411 | ^ self.finger_prints[h012[found + 2] as usize]; 412 | } 413 | 414 | Ok(()) 415 | } 416 | } 417 | 418 | impl Fuse16 419 | where H: BuildHasher 420 | { 421 | #[allow(clippy::len_without_is_empty)] 422 | /// Return the number of keys added/built into the bitmap index. 423 | pub fn len(&self) -> Option { 424 | self.num_keys 425 | } 426 | 427 | /// Contains tell you whether the key is likely part of the set, with false 428 | /// positive rate. 429 | pub fn contains(&self, key: &K) -> bool { 430 | let digest = { 431 | let mut hasher = self.hash_builder.build_hasher(); 432 | key.hash(&mut hasher); 433 | hasher.finish() 434 | }; 435 | self.contains_key(digest) 436 | } 437 | 438 | /// Contains tell you whether the key, as pre-computed digest form, is likely 439 | /// part of the set, with false positive rate. 440 | pub fn contains_key(&self, digest: u64) -> bool { 441 | use crate::fuse8::binary_fuse_mix_split; 442 | 443 | let hash = binary_fuse_mix_split(digest, self.seed); 444 | let mut f = binary_fuse16_fingerprint(hash) as u16; 445 | let BinaryHashes { h0, h1, h2 } = self.binary_fuse16_hash_batch(hash); 446 | f ^= self.finger_prints[h0 as usize] 447 | ^ self.finger_prints[h1 as usize] 448 | ^ self.finger_prints[h2 as usize]; 449 | f == 0 450 | } 451 | 452 | #[allow(dead_code)] 453 | fn get_hasher(&self) -> H::Hasher { 454 | self.hash_builder.build_hasher() 455 | } 456 | } 457 | 458 | //------ Implement cbordata related functionalities 459 | 460 | // Intermediate type to serialize and de-serialized Fuse16 into bytes. 
461 | #[cfg(feature = "cbordata")] 462 | #[derive(Cborize)] 463 | struct CborFuse16 { 464 | hash_builder: Vec, 465 | seed: u64, 466 | num_keys: Option, 467 | segment_length: u32, 468 | segment_length_mask: u32, 469 | segment_count: u32, 470 | segment_count_length: u32, 471 | finger_prints: Vec, 472 | } 473 | 474 | #[cfg(feature = "cbordata")] 475 | impl CborFuse16 { 476 | const ID: &'static str = "fuse8/0.0.1"; 477 | } 478 | 479 | #[cfg(feature = "cbordata")] 480 | impl IntoCbor for Fuse16 481 | where H: BuildHasher + Into> 482 | { 483 | fn into_cbor(self) -> cbor::Result { 484 | let val = CborFuse16 { 485 | hash_builder: self.hash_builder.into(), 486 | seed: self.seed, 487 | num_keys: self.num_keys, 488 | segment_length: self.segment_length, 489 | segment_length_mask: self.segment_length_mask, 490 | segment_count: self.segment_count, 491 | segment_count_length: self.segment_count_length, 492 | finger_prints: self.finger_prints.to_vec(), 493 | }; 494 | val.into_cbor() 495 | } 496 | } 497 | 498 | #[cfg(feature = "cbordata")] 499 | impl FromCbor for Fuse16 500 | where H: BuildHasher + From> 501 | { 502 | fn from_cbor(val: Cbor) -> cbor::Result { 503 | let val = CborFuse16::from_cbor(val)?; 504 | 505 | let filter = Fuse16 { 506 | keys: None, 507 | hash_builder: val.hash_builder.into(), 508 | seed: val.seed, 509 | num_keys: val.num_keys, 510 | segment_length: val.segment_length, 511 | segment_length_mask: val.segment_length_mask, 512 | segment_count: val.segment_count, 513 | segment_count_length: val.segment_count_length, 514 | finger_prints: Arc::new(val.finger_prints), 515 | }; 516 | 517 | Ok(filter) 518 | } 519 | } 520 | 521 | #[cfg(test)] 522 | #[path = "fuse16_test.rs"] 523 | mod fuse16_test; 524 | -------------------------------------------------------------------------------- /src/fuse16_test.rs: -------------------------------------------------------------------------------- 1 | use rand::prelude::random; 2 | use rand::rngs::StdRng; 3 | use rand::Rng; 4 | use 
rand::SeedableRng; 5 | 6 | use super::*; 7 | 8 | fn generate_unique_keys(rng: &mut StdRng, size: usize) -> Vec { 9 | let mut keys: Vec = Vec::with_capacity(size); 10 | keys.resize(size, u64::default()); 11 | 12 | for key in keys.iter_mut() { 13 | *key = rng.gen(); 14 | } 15 | keys.sort_unstable(); 16 | keys.dedup(); 17 | 18 | for _i in 0..(size - keys.len()) { 19 | let key = rng.gen::(); 20 | if !keys.contains(&key) { 21 | keys.push(key) 22 | } 23 | } 24 | 25 | keys 26 | } 27 | 28 | fn test_fuse16_build(name: &str, seed: u64, size: u32) 29 | where H: Default + BuildHasher { 30 | let (x, y) = { 31 | let size = size as usize; 32 | (size / 3, size / 3) 33 | }; 34 | 35 | println!("test_fuse16_build<{}> size:{}", name, size); 36 | let mut rng = StdRng::seed_from_u64(seed); 37 | 38 | let mut filter = Fuse16::::new(size); 39 | let keys = generate_unique_keys(&mut rng, size as usize); 40 | let (keys1, keys2, keys3) = (&keys[0..x], &keys[x..x + y], &keys[x + y..]); 41 | 42 | // populate api 43 | filter.populate(keys1); 44 | // populate_keys api 45 | let digests: Vec = keys2 46 | .iter() 47 | .map(|k| { 48 | let mut hasher = filter.get_hasher(); 49 | k.hash(&mut hasher); 50 | hasher.finish() 51 | }) 52 | .collect(); 53 | filter.populate_keys(&digests); 54 | // insert api 55 | keys3.iter().for_each(|key| filter.insert(key)); 56 | 57 | filter.build().expect("failed to build fuse16 filter"); 58 | 59 | // contains api 60 | for key in keys.iter() { 61 | assert!(filter.contains(key), "key {} not present", key); 62 | } 63 | // contains_key api 64 | for key in keys.iter() { 65 | let digest = { 66 | let mut hasher = filter.get_hasher(); 67 | key.hash(&mut hasher); 68 | hasher.finish() 69 | }; 70 | assert!(filter.contains_key(digest), "key {} not present", key); 71 | } 72 | 73 | // print some statistics 74 | let (falsesize, mut matches) = (10_000_000, 0_f64); 75 | let bpv = ((filter.finger_prints.len() * 2) as f64) * 8.0 / (keys.len() as f64); 76 | println!("test_fuse16_build<{}> bits 
per entry {} bits", name, bpv); 77 | if size > 100000 { 78 | assert!(bpv < 20.0, "bpv({}) >= 20.0", bpv); 79 | } 80 | 81 | for _ in 0..falsesize { 82 | if filter.contains(&rng.gen::()) { 83 | matches += 1_f64; 84 | } 85 | } 86 | 87 | let fpp = matches * 100.0 / (falsesize as f64); 88 | println!("test_fuse16_build<{}> false positive rate {}%", name, fpp); 89 | assert!(fpp < 0.40, "fpp({}) >= 0.40", fpp); 90 | } 91 | 92 | fn test_fuse16_build_keys(name: &str, seed: u64, size: u32) 93 | where H: Default + BuildHasher { 94 | println!("test_fuse16_build_keys<{}> size:{}", name, size); 95 | let mut rng = StdRng::seed_from_u64(seed); 96 | 97 | let mut filter = Fuse16::::new(size); 98 | 99 | // build_keys api 100 | let keys = generate_unique_keys(&mut rng, size as usize); 101 | let digests: Vec = keys 102 | .iter() 103 | .map(|k| { 104 | let mut hasher = filter.get_hasher(); 105 | k.hash(&mut hasher); 106 | hasher.finish() 107 | }) 108 | .collect(); 109 | 110 | filter.build_keys(&digests).expect("failed to build fuse16 filter"); 111 | 112 | // contains api 113 | for key in keys.iter() { 114 | assert!(filter.contains(key), "key {} not present", key); 115 | } 116 | // contains_key api 117 | for digest in digests.into_iter() { 118 | assert!(filter.contains_key(digest), "digest {} not present", digest); 119 | } 120 | 121 | // print some statistics 122 | let (falsesize, mut matches) = (10_000_000, 0_f64); 123 | let bpv = ((filter.finger_prints.len() * 2) as f64) * 8.0 / (keys.len() as f64); 124 | println!( 125 | "test_fuse16_build_keys<{}> bits per entry {} bits", 126 | name, bpv 127 | ); 128 | if size > 100000 { 129 | assert!(bpv < 20.0, "bpv({}) >= 20.0", bpv); 130 | } 131 | 132 | for _ in 0..falsesize { 133 | if filter.contains(&rng.gen::()) { 134 | matches += 1_f64; 135 | } 136 | } 137 | 138 | let fpp = matches * 100.0 / (falsesize as f64); 139 | println!( 140 | "test_fuse16_build_keys<{}> false positive rate {}%", 141 | name, fpp 142 | ); 143 | assert!(fpp < 0.40, "fpp({}) 
>= 0.40", fpp); 144 | } 145 | 146 | #[test] 147 | fn test_fuse16() { 148 | let mut seed: u64 = random(); 149 | println!("test_fuse16 seed:{}", seed); 150 | 151 | for size in [0, 1, 2, 10, 1000, 10_000, 100_000, 1_000_000, 10_000_000].iter() { 152 | seed = seed.wrapping_add(*size as u64); 153 | test_fuse16_build::("RandomState", seed, *size); 154 | test_fuse16_build::("BuildHasherDefault", seed, *size); 155 | test_fuse16_build_keys::("RandomState", seed, *size); 156 | test_fuse16_build_keys::("BuildHasherDefault", seed, *size); 157 | } 158 | } 159 | 160 | #[test] 161 | #[ignore] 162 | fn test_fuse16_billion() { 163 | let seed: u64 = random(); 164 | println!("test_fuse16_billion seed:{}", seed); 165 | 166 | let size = 1_000_000_000; 167 | test_fuse16_build::("RandomState", seed, size); 168 | test_fuse16_build::("BuildHasherDefault", seed, size); 169 | test_fuse16_build_keys::("RandomState", seed, size); 170 | test_fuse16_build_keys::("BuildHasherDefault", seed, size); 171 | } 172 | 173 | #[cfg(feature = "cbordata")] 174 | #[test] 175 | fn test_fuse16_cbor() { 176 | let seed: u64 = random(); 177 | println!("test_fuse16_cbor seed:{}", seed); 178 | let mut rng = StdRng::seed_from_u64(seed); 179 | 180 | let keys: Vec = (0..100_000).map(|_| rng.gen::()).collect(); 181 | 182 | let filter = { 183 | let mut filter = Fuse16::::new(keys.len() as u32); 184 | filter.populate(&keys); 185 | filter.build().expect("fail building fuse16 filter"); 186 | filter 187 | }; 188 | 189 | for key in keys.iter() { 190 | assert!(filter.contains(key), "key {} not present", key); 191 | } 192 | 193 | let filter = { 194 | let val = filter.into_cbor().unwrap(); 195 | Fuse16::::from_cbor(val).unwrap() 196 | }; 197 | 198 | for key in keys.iter() { 199 | assert!(filter.contains(key), "key {} not present", key); 200 | } 201 | } 202 | -------------------------------------------------------------------------------- /src/fuse8.rs: 
-------------------------------------------------------------------------------- 1 | #[allow(unused_imports)] 2 | use std::collections::hash_map::DefaultHasher; 3 | #[allow(unused_imports)] 4 | use std::collections::hash_map::RandomState; 5 | use std::hash::BuildHasher; 6 | use std::hash::Hash; 7 | use std::hash::Hasher; 8 | use std::sync::Arc; 9 | 10 | #[cfg(feature = "cbordata")] 11 | use cbordata::Cbor; 12 | #[cfg(feature = "cbordata")] 13 | use cbordata::Cborize; 14 | #[cfg(feature = "cbordata")] 15 | use cbordata::FromCbor; 16 | #[cfg(feature = "cbordata")] 17 | use cbordata::IntoCbor; 18 | #[cfg(feature = "cbordata")] 19 | use cbordata::{self as cbor}; 20 | 21 | use crate::BuildHasherDefault; 22 | use crate::Error; 23 | use crate::Result; 24 | 25 | // probabillity of success should always be > 0.5 so 100 iterations is highly unlikely. 26 | const XOR_MAX_ITERATIONS: usize = 100; 27 | 28 | #[inline] 29 | pub(crate) fn binary_fuse_murmur64(mut h: u64) -> u64 { 30 | h ^= h >> 33; 31 | h = h.wrapping_mul(0xff51afd7ed558ccd_u64); 32 | h ^= h >> 33; 33 | h = h.wrapping_mul(0xc4ceb9fe1a85ec53_u64); 34 | h ^= h >> 33; 35 | h 36 | } 37 | 38 | #[inline] 39 | pub(crate) fn binary_fuse_mix_split(key: u64, seed: u64) -> u64 { 40 | binary_fuse_murmur64(key.wrapping_add(seed)) 41 | } 42 | 43 | #[allow(dead_code)] 44 | #[inline] 45 | fn binary_fuse_rotl64(n: u64, c: u32) -> u64 { 46 | n.rotate_left(c) 47 | } 48 | 49 | #[allow(dead_code)] 50 | #[inline] 51 | fn binary_fuse_reduce(hash: u32, n: u32) -> u32 { 52 | // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ 53 | (((hash as u64) * (n as u64)) >> 32) as u32 54 | } 55 | 56 | #[inline] 57 | fn binary_fuse8_fingerprint(hash: u64) -> u64 { 58 | hash ^ (hash >> 32) 59 | } 60 | 61 | // returns random number, modifies the seed 62 | pub(crate) fn binary_fuse_rng_splitmix64(seed: &mut u64) -> u64 { 63 | *seed = seed.wrapping_add(0x9E3779B97F4A7C15_u64); 64 | let mut z = *seed; 65 | z = (z ^ (z >> 
30)).wrapping_mul(0xBF58476D1CE4E5B9_u64); 66 | z = (z ^ (z >> 27)).wrapping_mul(0x94D049BB133111EB_u64); 67 | z ^ (z >> 31) 68 | } 69 | 70 | #[inline] 71 | pub(crate) fn binary_fuse_mulhi(a: u64, b: u64) -> u64 { 72 | (((a as u128) * (b as u128)) >> 64) as u64 73 | } 74 | 75 | #[inline] 76 | pub(crate) fn binary_fuse_calculate_segment_length(arity: u32, size: u32) -> u32 { 77 | let ln_size = (size as f64).ln(); 78 | 79 | // These parameters are very sensitive. Replacing 'floor' by 'round' can 80 | // substantially affect the construction time. 81 | match arity { 82 | 3 => 1_u32 << ((ln_size / 3.33_f64.ln() + 2.25).floor() as u32), 83 | 4 => 1_u32 << ((ln_size / 2.91_f64.ln() - 0.50).floor() as u32), 84 | _ => 65536, 85 | } 86 | } 87 | 88 | #[inline] 89 | fn binary_fuse8_max(a: f64, b: f64) -> f64 { 90 | if a < b { 91 | b 92 | } else { 93 | a 94 | } 95 | } 96 | 97 | #[inline] 98 | pub(crate) fn binary_fuse_calculate_size_factor(arity: u32, size: u32) -> f64 { 99 | let ln_size = (size as f64).ln(); 100 | match arity { 101 | 3 => binary_fuse8_max(1.125, 0.875 + 0.250 * 1000000.0_f64.ln() / ln_size), 102 | 4 => binary_fuse8_max(1.075, 0.770 + 0.305 * 0600000.0_f64.ln() / ln_size), 103 | _ => 2.0, 104 | } 105 | } 106 | 107 | #[inline] 108 | pub(crate) fn binary_fuse_mod3(x: u8) -> u8 { 109 | if x > 2 { 110 | x - 3 111 | } else { 112 | x 113 | } 114 | } 115 | 116 | /// Type Fuse8 is probabilistic data-structure to test membership of an element in a set. 117 | /// 118 | /// Fuse8 is parametrized over type `H` which is expected to implement [BuildHasher] 119 | /// trait, like types [RandomState] and [BuildHasherDefault]. When not supplied, 120 | /// [BuildHasherDefault] is used as the default hash-builder. 
121 | /// 122 | /// If `RandomState` is used as BuildHasher, `std` has got this to say 123 | /// > _A particular instance RandomState will create the same instances 124 | /// > of Hasher, but the hashers created by two different RandomState 125 | /// > instances are unlikely to produce the same result for the same values._ 126 | /// 127 | /// If [DefaultHasher] is used as BuildHasher, `std` has got this to say, 128 | /// > _The internal algorithm is not specified, and so its hashes 129 | /// > should not be relied upon over releases._ 130 | /// 131 | /// The default type for parameter `H` might change when a reliable and commonly used 132 | /// BuildHasher type available. 133 | /// 134 | /// IMPORTANT: Fuse8 filter can only tolerate few duplicates in a given data-set. 135 | /// So make sure to supply a hasher that is capable of generating unique digests, 136 | /// _(with allowed tolerance of duplicates)_ and while supplying the digests directly 137 | /// via `populate_keys()` and `build_keys()` make sure they don't have more than few 138 | /// duplicates. 
139 | pub struct Fuse8 140 | where H: BuildHasher 141 | { 142 | keys: Option>, 143 | pub hash_builder: H, 144 | pub seed: u64, 145 | pub num_keys: Option, 146 | pub segment_length: u32, 147 | pub segment_length_mask: u32, 148 | pub segment_count: u32, 149 | pub segment_count_length: u32, 150 | pub finger_prints: Arc>, 151 | } 152 | 153 | #[derive(Default)] 154 | pub(crate) struct BinaryHashes { 155 | pub(crate) h0: u32, 156 | pub(crate) h1: u32, 157 | pub(crate) h2: u32, 158 | } 159 | 160 | impl Clone for Fuse8 161 | where H: Clone + BuildHasher 162 | { 163 | fn clone(&self) -> Self { 164 | Fuse8 { 165 | keys: Some(Vec::default()), 166 | hash_builder: self.hash_builder.clone(), 167 | seed: self.seed, 168 | num_keys: self.num_keys, 169 | segment_length: self.segment_length, 170 | segment_length_mask: self.segment_length_mask, 171 | segment_count: self.segment_count, 172 | segment_count_length: self.segment_count_length, 173 | finger_prints: Arc::clone(&self.finger_prints), 174 | } 175 | } 176 | } 177 | 178 | impl Fuse8 179 | where H: BuildHasher 180 | { 181 | #[inline] 182 | fn binary_fuse8_hash_batch(&self, hash: u64) -> BinaryHashes { 183 | let mut ans = BinaryHashes::default(); 184 | 185 | ans.h0 = binary_fuse_mulhi(hash, self.segment_count_length.into()) as u32; 186 | ans.h1 = ans.h0 + self.segment_length; 187 | ans.h2 = ans.h1 + self.segment_length; 188 | ans.h1 ^= ((hash >> 18) as u32) & self.segment_length_mask; 189 | ans.h2 ^= (hash as u32) & self.segment_length_mask; 190 | ans 191 | } 192 | 193 | #[inline] 194 | fn binary_fuse8_hash(&self, index: u32, hash: u64) -> u32 { 195 | let mut h = binary_fuse_mulhi(hash, self.segment_count_length.into()); 196 | h += (index * self.segment_length) as u64; 197 | // keep the lower 36 bits 198 | let hh = hash & ((1_u64 << 36) - 1); 199 | // index 0: right shift by 36; index 1: right shift by 18; index 2: no shift 200 | h ^= (hh >> (36 - 18 * index)) & (self.segment_length_mask as u64); 201 | 202 | h as u32 203 | } 204 | 
} 205 | 206 | impl Fuse8 207 | where H: BuildHasher 208 | { 209 | /// New Fuse8 instance that can index size number of keys. Internal data-structures 210 | /// are pre-allocated for `size`. `size` should be at least 2. 211 | pub fn new(size: u32) -> Fuse8 212 | where H: Default { 213 | Self::with_hasher(size, H::default()) 214 | } 215 | 216 | /// New Fuse8 instance initialized with supplied hasher. 217 | pub fn with_hasher(size: u32, hash_builder: H) -> Fuse8 { 218 | use std::cmp; 219 | 220 | let arity = 3_u32; 221 | 222 | let segment_length = match size { 223 | 0 => 4, 224 | size => cmp::min(binary_fuse_calculate_segment_length(arity, size), 262144), 225 | }; 226 | 227 | let segment_length_mask = segment_length - 1; 228 | let mut array_length = { 229 | let size_factor = binary_fuse_calculate_size_factor(arity, size); 230 | let cap = match size { 231 | 0 | 1 => 0, 232 | size => ((size as f64) * size_factor).round() as u32, 233 | }; 234 | let n = ((cap + segment_length - 1) / segment_length).wrapping_sub(arity - 1); 235 | (n.wrapping_add(arity) - 1) * segment_length 236 | }; 237 | 238 | let mut segment_count = (array_length + segment_length - 1) / segment_length; 239 | segment_count = if segment_count <= (arity - 1) { 240 | 1 241 | } else { 242 | segment_count - (arity - 1) 243 | }; 244 | 245 | array_length = (segment_count + arity - 1) * segment_length; 246 | let segment_count_length = segment_count * segment_length; 247 | 248 | Fuse8 { 249 | keys: Some(Vec::default()), 250 | hash_builder, 251 | seed: u64::default(), 252 | num_keys: None, 253 | segment_length, 254 | segment_length_mask, 255 | segment_count, 256 | segment_count_length, 257 | finger_prints: Arc::new(vec![0; array_length as usize]), 258 | } 259 | } 260 | } 261 | 262 | impl Fuse8 263 | where H: BuildHasher 264 | { 265 | /// Return the size of index. 
266 | #[inline] 267 | pub fn size_of(&self) -> usize { 268 | std::mem::size_of::() + self.finger_prints.len() 269 | } 270 | 271 | /// Insert 64-bit digest of a single key. Digest for the key shall be generated 272 | /// using the default-hasher or via hasher supplied via [Fuse8::with_hasher] method. 273 | pub fn insert(&mut self, key: &K) { 274 | let digest = { 275 | let mut hasher = self.hash_builder.build_hasher(); 276 | key.hash(&mut hasher); 277 | hasher.finish() 278 | }; 279 | if let Some(x) = self.num_keys.as_mut() { 280 | *x += 1 281 | } 282 | self.keys.as_mut().unwrap().push(digest); 283 | } 284 | 285 | /// Populate with 64-bit digests for a collection of keys of type `K`. Digest for 286 | /// key shall be generated using the default-hasher or via hasher supplied 287 | /// via [Fuse8::with_hasher] method. 288 | pub fn populate(&mut self, keys: &[K]) { 289 | if let Some(x) = self.num_keys.as_mut() { 290 | *x += keys.len() 291 | } 292 | keys.iter().for_each(|key| { 293 | let mut hasher = self.hash_builder.build_hasher(); 294 | key.hash(&mut hasher); 295 | self.keys.as_mut().unwrap().push(hasher.finish()); 296 | }) 297 | } 298 | 299 | /// Populate with pre-compute collection of 64-bit digests. 300 | pub fn populate_keys(&mut self, digests: &[u64]) { 301 | if let Some(x) = self.num_keys.as_mut() { 302 | *x += digests.len() 303 | } 304 | self.keys.as_mut().unwrap().extend_from_slice(digests); 305 | } 306 | 307 | // construct the filter, returns true on success, false on failure. 308 | // most likely, a failure is due to too high a memory usage 309 | // size is the number of keys 310 | // The caller is responsable for calling binary_fuse8_allocate(size,filter) 311 | // before. The caller is responsible to ensure that there are no duplicated 312 | // keys. The inner loop will run up to XOR_MAX_ITERATIONS times (default on 313 | // 100), it should never fail, except if there are duplicated keys. If it fails, 314 | // a return value of false is provided. 
315 | /// Build bitmap for keys that where previously inserted using [Fuse8::insert], 316 | /// [Fuse8::populate] and [Fuse8::populate_keys] method. 317 | pub fn build(&mut self) -> Result<()> { 318 | match self.keys.take() { 319 | Some(keys) => self.build_keys(&keys), 320 | None => Ok(()), 321 | } 322 | } 323 | 324 | /// Build a bitmap for pre-computed 64-bit digests for keys. If keys where 325 | /// previously inserted using [Fuse8::insert] or [Fuse8::populate] or 326 | /// [Fuse8::populate_keys] methods, they shall be ignored. 327 | /// 328 | /// It is upto the caller to ensure that digests are unique, that there no 329 | /// duplicates. 330 | pub fn build_keys(&mut self, digests: &[u64]) -> Result<()> { 331 | let mut rng_counter = 0x726b2b9d438b9d4d_u64; 332 | let capacity = self.finger_prints.len(); 333 | let size = digests.len(); 334 | 335 | self.num_keys = Some(digests.len()); 336 | self.seed = binary_fuse_rng_splitmix64(&mut rng_counter); 337 | 338 | let mut reverse_order: Vec = vec![0; size + 1]; 339 | let mut reverse_h: Vec = vec![0; size]; 340 | let mut alone: Vec = vec![0; capacity]; 341 | let mut t2count: Vec = vec![0; capacity]; 342 | let mut t2hash: Vec = vec![0; capacity]; 343 | 344 | let mut block_bits: u32 = 1; 345 | while (1_u32 << block_bits) < self.segment_count { 346 | block_bits += 1; 347 | } 348 | 349 | let block = 1_u32 << block_bits; 350 | 351 | let mut start_pos: Vec = vec![0; 1 << block_bits]; 352 | 353 | let mut h012 = [0_u32; 5]; 354 | 355 | reverse_order[size] = 1; // sentinel 356 | let mut iter = 0..=XOR_MAX_ITERATIONS; 357 | loop { 358 | if iter.next().is_none() { 359 | err_at!(Fatal, msg: "Too many iterations. Are all your keys unique?")?; 360 | } 361 | 362 | for i in 0_u32..block { 363 | // important : i * size would overflow as a 32-bit number in some 364 | // cases. 
365 | start_pos[i as usize] = 366 | (((i as u64) * (size as u64)) >> block_bits) as u32; 367 | } 368 | 369 | let mask_block = (block - 1) as u64; 370 | for (_, digest) in digests.iter().enumerate().take(size) { 371 | let hash: u64 = binary_fuse_murmur64(digest.wrapping_add(self.seed)); 372 | let mut segment_index: u64 = hash >> (64 - block_bits); 373 | while reverse_order[start_pos[segment_index as usize] as usize] != 0 { 374 | segment_index += 1; 375 | segment_index &= mask_block; 376 | } 377 | reverse_order[start_pos[segment_index as usize] as usize] = hash; 378 | start_pos[segment_index as usize] += 1; 379 | } 380 | 381 | let mut error: isize = 0; 382 | let mut duplicates = 0; 383 | for (_, rev_order) in reverse_order.iter().enumerate().take(size) { 384 | let hash: u64 = *rev_order; 385 | 386 | let h0: usize = self.binary_fuse8_hash(0, hash) as usize; 387 | t2count[h0] = t2count[h0].wrapping_add(4); 388 | t2hash[h0] ^= hash; 389 | 390 | let h1: usize = self.binary_fuse8_hash(1, hash) as usize; 391 | t2count[h1] = t2count[h1].wrapping_add(4); 392 | t2count[h1] ^= 1; 393 | t2hash[h1] ^= hash; 394 | 395 | let h2: usize = self.binary_fuse8_hash(2, hash) as usize; 396 | t2count[h2] = t2count[h2].wrapping_add(4); 397 | t2hash[h2] ^= hash; 398 | t2count[h2] ^= 2; 399 | 400 | // If we have duplicated hash values, then it is likely that 401 | // the next comparison is true 402 | if (t2hash[h0] & t2hash[h1] & t2hash[h2]) == 0 { 403 | // next we do the actual test 404 | if ((t2hash[h0] == 0) && (t2count[h0] == 8)) 405 | || ((t2hash[h1] == 0) && (t2count[h1] == 8)) 406 | || ((t2hash[h2] == 0) && (t2count[h2] == 8)) 407 | { 408 | duplicates += 1; 409 | t2count[h0] = t2count[h0].wrapping_sub(4); 410 | t2hash[h0] ^= hash; 411 | t2count[h1] = t2count[h1].wrapping_sub(4); 412 | t2count[h1] ^= 1; 413 | t2hash[h1] ^= hash; 414 | t2count[h2] = t2count[h2].wrapping_sub(4); 415 | t2hash[h2] ^= hash; 416 | t2count[h2] ^= 2; 417 | } 418 | } 419 | 420 | error = if t2count[h0] < 4 { 1 } 
else { error }; 421 | error = if t2count[h1] < 4 { 1 } else { error }; 422 | error = if t2count[h2] < 4 { 1 } else { error }; 423 | } 424 | 425 | if error > 0 { 426 | reverse_order.fill(0); 427 | reverse_order[size] = 1; // sentinel 428 | t2count.fill(0); 429 | t2hash.fill(0); 430 | self.seed = binary_fuse_rng_splitmix64(&mut rng_counter); 431 | continue; 432 | } 433 | 434 | let mut q_size = 0_usize; // End of key addition 435 | 436 | // Add sets with one key to the queue. 437 | for (i, x) in t2count.iter().enumerate().take(capacity) { 438 | alone[q_size] = i as u32; 439 | q_size += if (x >> 2) == 1 { 1 } else { 0 }; 440 | } 441 | 442 | let mut stack_size = 0_usize; 443 | 444 | while q_size > 0 { 445 | q_size -= 1; 446 | let index = alone[q_size] as usize; 447 | if (t2count[index] >> 2) == 1 { 448 | let hash: u64 = t2hash[index]; 449 | 450 | //h012[0] = self.binary_fuse8_hash(0, hash); 451 | h012[1] = self.binary_fuse8_hash(1, hash); 452 | h012[2] = self.binary_fuse8_hash(2, hash); 453 | h012[3] = self.binary_fuse8_hash(0, hash); // == h012[0]; 454 | h012[4] = h012[1]; 455 | 456 | let found: u8 = t2count[index] & 3; 457 | reverse_h[stack_size] = found; 458 | reverse_order[stack_size] = hash; 459 | stack_size += 1; 460 | 461 | let other_index1: u32 = h012[(found + 1) as usize]; 462 | alone[q_size] = other_index1; 463 | q_size += if (t2count[other_index1 as usize] >> 2) == 2 { 464 | 1 465 | } else { 466 | 0 467 | }; 468 | 469 | t2count[other_index1 as usize] -= 4; 470 | t2count[other_index1 as usize] ^= binary_fuse_mod3(found + 1); 471 | t2hash[other_index1 as usize] ^= hash; 472 | 473 | let other_index2: u32 = h012[(found + 2) as usize]; 474 | alone[q_size] = other_index2; 475 | q_size += if (t2count[other_index2 as usize] >> 2) == 2 { 476 | 1 477 | } else { 478 | 0 479 | }; 480 | t2count[other_index2 as usize] -= 4; 481 | t2count[other_index2 as usize] ^= binary_fuse_mod3(found + 2); 482 | t2hash[other_index2 as usize] ^= hash; 483 | } 484 | } 485 | 486 | if 
(stack_size + duplicates) == size { 487 | break; // success 488 | } 489 | 490 | reverse_order.fill(0); 491 | reverse_order[size] = 1; // sentinel 492 | t2count.fill(0); 493 | t2hash.fill(0); 494 | 495 | self.seed = binary_fuse_rng_splitmix64(&mut rng_counter); 496 | } 497 | 498 | if size == 0 { 499 | return Ok(()); 500 | } 501 | 502 | for i in (0_usize..size).rev() { 503 | // the hash of the key we insert next 504 | let hash: u64 = reverse_order[i]; 505 | let xor2: u8 = binary_fuse8_fingerprint(hash) as u8; 506 | let found: usize = reverse_h[i] as usize; 507 | h012[0] = self.binary_fuse8_hash(0, hash); 508 | h012[1] = self.binary_fuse8_hash(1, hash); 509 | h012[2] = self.binary_fuse8_hash(2, hash); 510 | h012[3] = h012[0]; 511 | h012[4] = h012[1]; 512 | Arc::get_mut(&mut self.finger_prints).unwrap()[h012[found] as usize] = xor2 513 | ^ self.finger_prints[h012[found + 1] as usize] 514 | ^ self.finger_prints[h012[found + 2] as usize]; 515 | } 516 | 517 | Ok(()) 518 | } 519 | } 520 | 521 | impl Fuse8 522 | where H: BuildHasher 523 | { 524 | #[allow(clippy::len_without_is_empty)] 525 | /// Return the number of keys added/built into the bitmap index. 526 | pub fn len(&self) -> Option { 527 | self.num_keys 528 | } 529 | 530 | /// Contains tell you whether the key is likely part of the set, with false 531 | /// positive rate. 532 | pub fn contains(&self, key: &K) -> bool { 533 | let digest = { 534 | let mut hasher = self.hash_builder.build_hasher(); 535 | key.hash(&mut hasher); 536 | hasher.finish() 537 | }; 538 | self.contains_key(digest) 539 | } 540 | 541 | /// Contains tell you whether the key, as pre-computed digest form, is likely 542 | /// part of the set, with false positive rate. 
543 | pub fn contains_key(&self, digest: u64) -> bool { 544 | let hash = binary_fuse_mix_split(digest, self.seed); 545 | let mut f = binary_fuse8_fingerprint(hash) as u8; 546 | let BinaryHashes { h0, h1, h2 } = self.binary_fuse8_hash_batch(hash); 547 | f ^= self.finger_prints[h0 as usize] 548 | ^ self.finger_prints[h1 as usize] 549 | ^ self.finger_prints[h2 as usize]; 550 | f == 0 551 | } 552 | 553 | #[allow(dead_code)] 554 | fn get_hasher(&self) -> H::Hasher { 555 | self.hash_builder.build_hasher() 556 | } 557 | } 558 | 559 | //------ Implement cbordata related functionalities 560 | 561 | // Intermediate type to serialize and de-serialized Fuse8 into bytes. 562 | #[cfg(feature = "cbordata")] 563 | #[derive(Cborize)] 564 | struct CborFuse8 { 565 | hash_builder: Vec, 566 | seed: u64, 567 | num_keys: Option, 568 | segment_length: u32, 569 | segment_length_mask: u32, 570 | segment_count: u32, 571 | segment_count_length: u32, 572 | finger_prints: Vec, 573 | } 574 | 575 | #[cfg(feature = "cbordata")] 576 | impl CborFuse8 { 577 | const ID: &'static str = "fuse8/0.0.1"; 578 | } 579 | 580 | #[cfg(feature = "cbordata")] 581 | impl IntoCbor for Fuse8 582 | where H: BuildHasher + Into> 583 | { 584 | fn into_cbor(self) -> cbor::Result { 585 | let finger_prints = self.finger_prints.to_vec(); 586 | let val = CborFuse8 { 587 | hash_builder: self.hash_builder.into(), 588 | seed: self.seed, 589 | num_keys: self.num_keys, 590 | segment_length: self.segment_length, 591 | segment_length_mask: self.segment_length_mask, 592 | segment_count: self.segment_count, 593 | segment_count_length: self.segment_count_length, 594 | finger_prints, 595 | }; 596 | val.into_cbor() 597 | } 598 | } 599 | 600 | #[cfg(feature = "cbordata")] 601 | impl FromCbor for Fuse8 602 | where H: BuildHasher + From> 603 | { 604 | fn from_cbor(val: Cbor) -> cbor::Result { 605 | let val = CborFuse8::from_cbor(val)?; 606 | 607 | let filter = Fuse8 { 608 | keys: None, 609 | hash_builder: val.hash_builder.into(), 610 | 
seed: val.seed, 611 | num_keys: val.num_keys, 612 | segment_length: val.segment_length, 613 | segment_length_mask: val.segment_length_mask, 614 | segment_count: val.segment_count, 615 | segment_count_length: val.segment_count_length, 616 | finger_prints: Arc::new(val.finger_prints), 617 | }; 618 | 619 | Ok(filter) 620 | } 621 | } 622 | 623 | #[cfg(test)] 624 | #[path = "fuse8_test.rs"] 625 | mod fuse8_test; 626 | -------------------------------------------------------------------------------- /src/fuse8_test.rs: -------------------------------------------------------------------------------- 1 | use rand::distributions::Distribution; 2 | use rand::distributions::Standard; 3 | use rand::prelude::random; 4 | use rand::rngs::StdRng; 5 | use rand::Rng; 6 | use rand::SeedableRng; 7 | 8 | use super::*; 9 | 10 | fn generate_unique_keys(prefix: &str, rng: &mut StdRng, size: usize) -> Vec 11 | where 12 | K: Clone + Default + Ord, 13 | Standard: Distribution, 14 | { 15 | let mut keys: Vec = Vec::with_capacity(size); 16 | keys.resize(size, K::default()); 17 | 18 | for key in keys.iter_mut() { 19 | *key = rng.gen(); 20 | } 21 | keys.sort_unstable(); 22 | 23 | let mut ks = keys.clone(); 24 | ks.dedup(); 25 | println!("{} number of duplicates {}", prefix, size - ks.len()); 26 | 27 | keys 28 | } 29 | 30 | fn test_fuse8_build(name: &str, seed: u64, size: u32) 31 | where 32 | H: Default + BuildHasher, 33 | K: Clone + Default + Ord + Hash + std::fmt::Display, 34 | Standard: Distribution, 35 | { 36 | use std::cmp; 37 | 38 | let mut rng = StdRng::seed_from_u64(seed); 39 | 40 | let keys = generate_unique_keys(name, &mut rng, size as usize); 41 | 42 | let size = keys.len() as u32; 43 | let (x, y) = { 44 | let size = size as usize; 45 | (size / 3, size / 3) 46 | }; 47 | let (keys1, keys2, keys3) = (&keys[0..x], &keys[x..x + y], &keys[x + y..]); 48 | 49 | println!("test_fuse8_build<{}> size:{}", name, size); 50 | 51 | let mut filter = Fuse8::::new(size); 52 | 53 | // populate api 54 | 
filter.populate(keys1); 55 | // populate_keys api 56 | let digests: Vec = keys2 57 | .iter() 58 | .map(|k| { 59 | let mut hasher = filter.get_hasher(); 60 | k.hash(&mut hasher); 61 | hasher.finish() 62 | }) 63 | .collect(); 64 | filter.populate_keys(&digests); 65 | // insert api 66 | keys3.iter().for_each(|key| filter.insert(key)); 67 | 68 | filter.build().expect("failed to build fuse16 filter"); 69 | 70 | // contains api 71 | for key in keys.iter() { 72 | assert!(filter.contains(key), "key {} not present", key); 73 | } 74 | // contains_key api 75 | for key in keys.iter() { 76 | let digest = { 77 | let mut hasher = filter.get_hasher(); 78 | key.hash(&mut hasher); 79 | hasher.finish() 80 | }; 81 | assert!(filter.contains_key(digest), "key {} not present", key); 82 | } 83 | 84 | // print some statistics 85 | let (falsesize, mut matches) = (cmp::min(size * 10, 10_000_000), 0_f64); 86 | let bpv = (filter.finger_prints.len() as f64) * 8.0 / (keys.len() as f64); 87 | println!("test_fuse8_build<{}> bits per entry {} bits", name, bpv); 88 | 89 | for _ in 0..falsesize { 90 | let k = rng.gen::(); 91 | let ok = filter.contains(&k); 92 | match keys.binary_search(&k) { 93 | Ok(_) if !ok => panic!("false negative {}", k), 94 | Ok(_) => (), 95 | Err(_) if ok => matches += 1_f64, 96 | Err(_) => (), 97 | } 98 | } 99 | 100 | let fpp = matches * 100.0 / (falsesize as f64); 101 | println!("test_fuse8_build<{}> false positive rate {}%", name, fpp); 102 | 103 | if size > 100_000 { 104 | assert!(bpv < 12.0, "bpv({}) >= 12.0", bpv); 105 | assert!(fpp < 0.4, "fpp({}) >= 0.4", fpp); 106 | } 107 | } 108 | 109 | fn test_fuse8_build_keys(name: &str, seed: u64, size: u32) 110 | where 111 | H: Default + BuildHasher, 112 | K: Clone + Default + Ord + Hash + std::fmt::Display, 113 | Standard: Distribution, 114 | { 115 | use std::cmp; 116 | 117 | let mut rng = StdRng::seed_from_u64(seed); 118 | 119 | let keys = generate_unique_keys(name, &mut rng, size as usize); 120 | let size = keys.len() as u32; 
121 | 122 | println!("test_fuse8_build_keys<{}> size:{}", name, size); 123 | 124 | let mut filter = Fuse8::::new(size); 125 | 126 | // build_keys api 127 | let digests: Vec = keys 128 | .iter() 129 | .map(|k| { 130 | let mut hasher = filter.get_hasher(); 131 | k.hash(&mut hasher); 132 | hasher.finish() 133 | }) 134 | .collect(); 135 | 136 | filter.build_keys(&digests).expect("failed to build fuse16 filter"); 137 | 138 | // contains api 139 | for key in keys.iter() { 140 | assert!(filter.contains(key), "key {} not present", key); 141 | } 142 | // contains_key api 143 | for digest in digests.into_iter() { 144 | assert!(filter.contains_key(digest), "digest {} not present", digest); 145 | } 146 | 147 | // print some statistics 148 | let (falsesize, mut matches) = (cmp::min(size * 10, 10_000_000), 0_f64); 149 | let bpv = (filter.finger_prints.len() as f64) * 8.0 / (keys.len() as f64); 150 | println!( 151 | "test_fuse8_build_keys<{}> bits per entry {} bits", 152 | name, bpv 153 | ); 154 | 155 | for _ in 0..falsesize { 156 | let k = rng.gen::(); 157 | let ok = filter.contains(&k); 158 | match keys.binary_search(&k) { 159 | Ok(_) if !ok => panic!("false negative {}", k), 160 | Ok(_) => (), 161 | Err(_) if ok => matches += 1_f64, 162 | Err(_) => (), 163 | } 164 | } 165 | 166 | let fpp = matches * 100.0 / (falsesize as f64); 167 | println!( 168 | "test_fuse8_build_keys<{}> false positive rate {}%", 169 | name, fpp 170 | ); 171 | 172 | if size > 100_000 { 173 | assert!(bpv < 12.0, "bpv({}) >= 12.0", bpv); 174 | assert!(fpp < 0.4, "fpp({}) >= 0.4", fpp); 175 | } 176 | } 177 | 178 | #[test] 179 | fn test_fuse8_u8() { 180 | let mut seed: u64 = [6509898893809465102_u64, random()][random::() % 2]; 181 | println!("test_fuse8_u8 seed:{}", seed); 182 | 183 | for size in [0, 1, 2, 10, 100].iter() { 184 | seed = seed.wrapping_add(*size as u64); 185 | test_fuse8_build::("RandomState,u8", seed, *size); 186 | test_fuse8_build::("BuildHasherDefault,u8", seed, *size); 187 | 
test_fuse8_build_keys::("RandomState,u8", seed, *size); 188 | test_fuse8_build_keys::( 189 | "BuildHasherDefault,u8", 190 | seed, 191 | *size, 192 | ); 193 | } 194 | } 195 | 196 | #[test] 197 | fn test_fuse8_u16() { 198 | let mut seed: u64 = random(); 199 | println!("test_fuse8_u16 seed:{}", seed); 200 | 201 | for size in [0, 1, 2, 10, 100, 500].iter() { 202 | seed = seed.wrapping_add(*size as u64); 203 | test_fuse8_build::("RandomState,16", seed, *size); 204 | test_fuse8_build::("BuildHasherDefault,16", seed, *size); 205 | test_fuse8_build_keys::("RandomState,16", seed, *size); 206 | test_fuse8_build_keys::( 207 | "BuildHasherDefault,16", 208 | seed, 209 | *size, 210 | ); 211 | } 212 | } 213 | 214 | #[test] 215 | fn test_fuse8_u64() { 216 | let mut seed: u64 = random(); 217 | println!("test_fuse8_u64 seed:{}", seed); 218 | 219 | for size in [0, 1, 2, 10, 1000, 10_000, 100_000, 1_000_000, 10_000_000].iter() { 220 | seed = seed.wrapping_add(*size as u64); 221 | test_fuse8_build::("RandomState,64", seed, *size); 222 | test_fuse8_build::("BuildHasherDefault,64", seed, *size); 223 | test_fuse8_build_keys::("RandomState,64", seed, *size); 224 | test_fuse8_build_keys::( 225 | "BuildHasherDefault,64", 226 | seed, 227 | *size, 228 | ); 229 | } 230 | } 231 | 232 | #[test] 233 | fn test_fuse8_duplicates() { 234 | println!("test_fuse8_duplicates"); 235 | 236 | let keys = vec![102, 123, 1242352, 12314, 124235, 1231234, 12414, 1242352]; 237 | 238 | let mut filter = Fuse8::::new(keys.len() as u32); 239 | 240 | filter.build_keys(&keys).expect("build with duplicate keys failed"); 241 | 242 | // contains api 243 | for key in keys.iter() { 244 | assert!(filter.contains_key(*key), "key {} not present", key); 245 | } 246 | } 247 | 248 | #[test] 249 | #[ignore] 250 | fn test_fuse8_billion() { 251 | let seed: u64 = random(); 252 | println!("test_fuse8_billion seed:{}", seed); 253 | 254 | let size = 1_000_000_000; 255 | test_fuse8_build::("RandomState,u64", seed, size); 256 | 
test_fuse8_build::("BuildHasherDefault,u64", seed, size); 257 | test_fuse8_build_keys::("RandomState,u64", seed, size); 258 | test_fuse8_build_keys::( 259 | "BuildHasherDefault,u64", 260 | seed, 261 | size, 262 | ); 263 | } 264 | 265 | #[cfg(feature = "cbordata")] 266 | #[test] 267 | fn test_fuse8_cbor() { 268 | let seed: u64 = random(); 269 | println!("test_fuse8_cbor seed:{}", seed); 270 | let mut rng = StdRng::seed_from_u64(seed); 271 | 272 | let keys: Vec = (0..100_000).map(|_| rng.gen::()).collect(); 273 | 274 | let filter = { 275 | let mut filter = Fuse8::::new(keys.len() as u32); 276 | filter.populate(&keys); 277 | filter.build().expect("fail building fuse8 filter"); 278 | filter 279 | }; 280 | 281 | for key in keys.iter() { 282 | assert!(filter.contains(key), "key {} not present", key); 283 | } 284 | 285 | let filter = { 286 | let val = filter.into_cbor().unwrap(); 287 | Fuse8::::from_cbor(val).unwrap() 288 | }; 289 | 290 | for key in keys.iter() { 291 | assert!(filter.contains(key), "key {} not present", key); 292 | } 293 | } 294 | -------------------------------------------------------------------------------- /src/hasher.rs: -------------------------------------------------------------------------------- 1 | use std::collections::hash_map::DefaultHasher; 2 | use std::hash::BuildHasher; 3 | use std::hash::Hasher; 4 | use std::hash::{self}; 5 | 6 | /// Wrapper type for [std::hash::BuildHasherDefault], that uses 7 | /// [DefaultHasher] as the hasher. 
8 | #[derive(Clone, Default)] 9 | pub struct BuildHasherDefault { 10 | hasher: hash::BuildHasherDefault, 11 | } 12 | 13 | impl From for Vec { 14 | fn from(_: BuildHasherDefault) -> Vec { 15 | vec![] 16 | } 17 | } 18 | 19 | impl From> for BuildHasherDefault { 20 | fn from(_: Vec) -> BuildHasherDefault { 21 | BuildHasherDefault { 22 | hasher: hash::BuildHasherDefault::::default(), 23 | } 24 | } 25 | } 26 | 27 | impl BuildHasher for BuildHasherDefault { 28 | type Hasher = DefaultHasher; 29 | 30 | fn build_hasher(&self) -> Self::Hasher { 31 | self.hasher.build_hasher() 32 | } 33 | } 34 | 35 | /// NoHash type skips hashing altogether. 36 | /// 37 | /// When a filter is constructed using NoHash as the type parameter then it is upto 38 | /// application to generate the 64-bit hash digest outside this library. 39 | #[derive(Clone)] 40 | pub struct NoHash; 41 | 42 | impl From for Vec { 43 | fn from(_: NoHash) -> Vec { 44 | vec![] 45 | } 46 | } 47 | 48 | impl From> for NoHash { 49 | fn from(_: Vec) -> NoHash { 50 | NoHash 51 | } 52 | } 53 | 54 | impl BuildHasher for NoHash { 55 | type Hasher = NoHash; 56 | 57 | fn build_hasher(&self) -> Self { 58 | NoHash 59 | } 60 | } 61 | 62 | impl Default for NoHash { 63 | fn default() -> Self { 64 | NoHash 65 | } 66 | } 67 | 68 | impl Hasher for NoHash { 69 | fn write(&mut self, _bytes: &[u8]) { 70 | panic!("Can't generate hash digest using NoHash") 71 | } 72 | 73 | fn finish(&self) -> u64 { 74 | panic!("Can't generate hash digest using NoHash") 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | #![allow(clippy::bool_to_int_with_if)] 2 | 3 | //! Library implements xor-filter. 4 | //! 5 | //! Refer to original implementation under `github.com/FastFilter` to learn the 6 | //! differences between [Xor8], [Fuse8] and [Fuse16] filters. Otherwise, all the types 7 | //! provides similar methods. 
// NOTE(review): continuation of lib.rs inner-doc (`//!`) header, kept as line
// comments so this block parses in isolation within the dump.
//
// Starting from version `0.6.0` [Xor8] type is split into [xor8::Xor8] and
// [xor8::Xor8Builder] under module [xor8]. And [Xor8] type is now deprecated.
//
// Provides hasher types:
//
// All filter-types are parametrised over user supplied hasher-type.
//
// * Use [NoHash] when hash feature is not needed on [Xor8], [Fuse8] and
//   [Fuse16] types. Note that type methods that accept parametrized key
//   cannot be used.
// * [BuildHasherDefault] is the default hasher when `H` is not supplied. Note
//   that [DefaultHasher] uses an unspecified internal algorithm and so its
//   hashes should not be relied upon over releases.
//
// **Handling duplicates**
//
// * [Fuse16] and [Xor8] implementation uses BTreeMap to make sure all the
//   digests generated from keys are unique, this avoids duplicates but
//   decreases the build performance significantly.
// * [Fuse8] implementation computes duplicates on the fly leading to
//   significantly better build performance. On the other hand, Fuse8 cannot
//   handle more than few duplicates.
//
// **Cloning**
//
// Cloning [Xor8], [Fuse8], [Fuse16] is fast, but valid only after the filter
// is constructed. This can linearly scale for read-concurrency with lookup
// operation.
//
// This is ported from its original implementation:
//
// **Features**
//
// * Enable ``cbordata`` feature for serialize and deserialize [Xor8] [Fuse8]
//   [Fuse16] types using CBOR spec.
//
// * [Xor8] from <https://github.com/FastFilter/xorfilter>, written in golang.
// * [Fuse8] and [Fuse16] from <https://github.com/FastFilter/xor_singleheader>
//   written in C.
//   (NOTE(review): the angle-bracket URLs above were stripped by the dump's
//   mangling and are reconstructed — confirm against upstream lib.rs.)

#[allow(unused_imports)]
use std::collections::hash_map::DefaultHasher;
use std::error;
use std::fmt;
use std::result;

/// Short form to compose Error values.
///
/// Here are few possible ways:
///
/// ```ignore
/// use crate::Error;
/// err_at!(ParseError, msg: "bad argument");
/// ```
///
/// ```ignore
/// use crate::Error;
/// err_at!(ParseError, std::io::read(buf));
/// ```
///
/// ```ignore
/// use crate::Error;
/// err_at!(ParseError, std::fs::read(file_path), "read failed");
/// ```
macro_rules! err_at {
    // `msg:` arm — build an Err from a format string alone.
    ($v:ident, msg: $($arg:expr),+) => {{
        let prefix = format!("{}:{}", file!(), line!());
        Err(Error::$v(prefix, format!($($arg),+)))
    }};
    // Wrap a Result, converting its Err into this crate's Error.
    ($v:ident, $e:expr) => {{
        match $e {
            Ok(val) => Ok(val),
            Err(err) => {
                let prefix = format!("{}:{}", file!(), line!());
                Err(Error::$v(prefix, format!("{}", err)))
            }
        }
    }};
    // Wrap a Result and append extra formatted context to the message.
    ($v:ident, $e:expr, $($arg:expr),+) => {{
        match $e {
            Ok(val) => Ok(val),
            Err(err) => {
                let prefix = format!("{}:{}", file!(), line!());
                let msg = format!($($arg),+);
                Err(Error::$v(prefix, format!("{} {}", err, msg)))
            }
        }
    }};
}

/// Error variants that are returned by this package's API.
///
/// Each variant carries a prefix, typically identifying the
/// error location.
pub enum Error {
    Fatal(String, String),
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> result::Result<(), fmt::Error> {
        use Error::*;

        match self {
            Fatal(p, msg) => write!(f, "{} Fatal: {}", p, msg),
        }
    }
}

// Debug re-uses Display so logs and `unwrap` panics show the same text.
impl fmt::Debug for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> result::Result<(), fmt::Error> {
        write!(f, "{}", self)
    }
}

impl error::Error for Error {}

// Type alias for Result return type, used by this package.
// (doc text for the `pub type Result` alias that immediately follows)
124 | pub type Result = result::Result; 125 | 126 | mod fuse16; 127 | mod fuse8; 128 | mod hasher; 129 | mod xor8_old; 130 | 131 | pub mod xor8; 132 | pub use fuse16::Fuse16; 133 | pub use fuse8::Fuse8; 134 | pub use hasher::BuildHasherDefault; 135 | pub use hasher::NoHash; 136 | #[deprecated(since = "0.6.0", note = "Use xor8::Xor8 and xor8::Xor8Builder types")] 137 | pub use xor8_old::Xor8; 138 | -------------------------------------------------------------------------------- /src/xor8/builder.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashSet; 2 | use std::hash::BuildHasher; 3 | use std::hash::Hash; 4 | use std::hash::Hasher; 5 | use std::sync::Arc; 6 | 7 | use crate::xor8::filter::fingerprint; 8 | use crate::xor8::filter::splitmix64; 9 | use crate::xor8::filter::XorSet; 10 | use crate::xor8::Xor8; 11 | use crate::BuildHasherDefault; 12 | 13 | #[derive(Clone, Copy, Default)] 14 | struct KeyIndex { 15 | hash: u64, 16 | index: u32, 17 | } 18 | 19 | /// Builds an Xor8 filter. 20 | /// 21 | /// Example: 22 | /// ``` 23 | /// # use xorfilter::xor8::Xor8Builder; 24 | /// 25 | /// let mut b: Xor8Builder = Xor8Builder::new(); 26 | /// 27 | /// b.populate(&["foo", "bar"]); 28 | /// let filter = b.build().unwrap(); 29 | /// 30 | /// assert!(filter.contains("foo")); 31 | /// ``` 32 | #[derive(Clone, Debug)] 33 | pub struct Xor8Builder 34 | where H: BuildHasher + Clone 35 | { 36 | digests: HashSet, 37 | pub num_digests: usize, 38 | pub hash_builder: H, 39 | } 40 | 41 | impl Default for Xor8Builder 42 | where H: BuildHasher + Clone + Default 43 | { 44 | fn default() -> Self { 45 | Self { 46 | digests: Default::default(), 47 | num_digests: 0, 48 | hash_builder: H::default(), 49 | } 50 | } 51 | } 52 | 53 | impl Xor8Builder 54 | where H: BuildHasher + Clone 55 | { 56 | /// New Xor8 builder initialized with [BuildHasherDefault]. 
57 | pub fn new() -> Self 58 | where H: Default { 59 | Self::default() 60 | } 61 | 62 | /// New Xor8 builder initialized with supplied `hasher`. 63 | pub fn with_hasher(hash_builder: H) -> Self { 64 | Self { 65 | digests: HashSet::new(), 66 | num_digests: 0, 67 | hash_builder, 68 | } 69 | } 70 | 71 | pub fn get_hasher(&self) -> H::Hasher { 72 | self.hash_builder.build_hasher() 73 | } 74 | 75 | /// Calculate hash of a key. 76 | #[inline] 77 | pub fn hash(&self, key: &K) -> u64 { 78 | let mut hasher = self.get_hasher(); 79 | key.hash(&mut hasher); 80 | hasher.finish() 81 | } 82 | 83 | /// Insert 64-bit digest of a single key. 84 | /// 85 | /// Digest for the key shall be generated using the default-hasher or via hasher 86 | /// supplied via [Xor8Builder::with_hasher] method. 87 | pub fn insert(&mut self, key: &K) { 88 | let digest = self.hash(key); 89 | 90 | self.digests.insert(digest); 91 | self.num_digests += 1; 92 | } 93 | 94 | /// Populate with 64-bit digests for a collection of keys of type `K`. 95 | /// 96 | /// Digest for key shall be generated using the default-hasher or via hasher supplied 97 | /// via [Xor8Builder::with_hasher] method. 98 | pub fn populate<'i, K: Hash + 'i, I: IntoIterator>(&mut self, keys: I) { 99 | let mut n = 0; 100 | 101 | for key in keys.into_iter() { 102 | n += 1; 103 | 104 | let digest = self.hash(key); 105 | self.digests.insert(digest); 106 | } 107 | 108 | self.num_digests += n; 109 | } 110 | 111 | /// Populate with pre-compute collection of 64-bit digests. 112 | pub fn populate_digests<'i, I: IntoIterator>(&mut self, digests: I) { 113 | let mut n = 0; 114 | 115 | for digest in digests.into_iter() { 116 | n += 1; 117 | self.digests.insert(*digest); 118 | } 119 | 120 | self.num_digests += n; 121 | } 122 | 123 | /// Build bitmap for keys that where previously inserted using [Xor8Builder::insert], 124 | /// [Xor8Builder::populate] and [Xor8Builder::populate_digests] method. 
125 | pub fn build(&mut self) -> Result, crate::Error> { 126 | let digests = self.digests.iter().copied().collect::>(); 127 | self.build_from_digests(&digests) 128 | } 129 | 130 | /// Build a bitmap for pre-computed 64-bit digests for keys. 131 | /// 132 | /// If keys where previously inserted using [Xor8Builder::insert] or 133 | /// [Xor8Builder::populate] or [Xor8Builder::populate_digests] methods, they shall be 134 | /// ignored. 135 | /// 136 | /// It is upto the caller to ensure that digests are unique, that there no duplicates. 137 | pub fn build_from_digests( 138 | &mut self, 139 | digests: &[u64], 140 | ) -> Result, crate::Error> { 141 | let mut ff = Xor8::::new(self.hash_builder.clone()); 142 | 143 | ff.num_keys = Some(digests.len()); 144 | let (size, mut rngcounter) = (digests.len(), 1_u64); 145 | let capacity = { 146 | let capacity = 32 + ((1.23 * (size as f64)).ceil() as u32); 147 | capacity / 3 * 3 // round it down to a multiple of 3 148 | }; 149 | ff.seed = splitmix64(&mut rngcounter); 150 | ff.block_length = capacity / 3; 151 | ff.finger_prints = Arc::new(vec![u8::default(); capacity as usize]); 152 | 153 | let block_length = ff.block_length as usize; 154 | let mut q0: Vec = Vec::with_capacity(block_length); 155 | let mut q1: Vec = Vec::with_capacity(block_length); 156 | let mut q2: Vec = Vec::with_capacity(block_length); 157 | let mut stack: Vec = Vec::with_capacity(size); 158 | let mut sets0: Vec = vec![XorSet::default(); block_length]; 159 | let mut sets1: Vec = vec![XorSet::default(); block_length]; 160 | let mut sets2: Vec = vec![XorSet::default(); block_length]; 161 | 162 | loop { 163 | for key in digests.iter() { 164 | let hs = ff.get_h0h1h2(*key); 165 | sets0[hs.h0 as usize].xor_mask ^= hs.h; 166 | sets0[hs.h0 as usize].count += 1; 167 | sets1[hs.h1 as usize].xor_mask ^= hs.h; 168 | sets1[hs.h1 as usize].count += 1; 169 | sets2[hs.h2 as usize].xor_mask ^= hs.h; 170 | sets2[hs.h2 as usize].count += 1; 171 | } 172 | 173 | q0.clear(); 174 | 
q1.clear(); 175 | q2.clear(); 176 | 177 | let iter = sets0.iter().enumerate().take(ff.block_length as usize); 178 | for (i, item) in iter { 179 | if item.count == 1 { 180 | q0.push(KeyIndex { 181 | index: i as u32, 182 | hash: item.xor_mask, 183 | }); 184 | } 185 | } 186 | let iter = sets1.iter().enumerate().take(ff.block_length as usize); 187 | for (i, item) in iter { 188 | if item.count == 1 { 189 | q1.push(KeyIndex { 190 | index: i as u32, 191 | hash: item.xor_mask, 192 | }); 193 | } 194 | } 195 | let iter = sets2.iter().enumerate().take(ff.block_length as usize); 196 | for (i, item) in iter { 197 | if item.count == 1 { 198 | q2.push(KeyIndex { 199 | index: i as u32, 200 | hash: item.xor_mask, 201 | }); 202 | } 203 | } 204 | 205 | stack.clear(); 206 | 207 | while !q0.is_empty() || !q1.is_empty() || !q2.is_empty() { 208 | while let Some(keyindexvar) = q0.pop() { 209 | if sets0[keyindexvar.index as usize].count == 0 { 210 | // not actually possible after the initial scan. 211 | continue; 212 | } 213 | let hash = keyindexvar.hash; 214 | let h1 = ff.get_h1(hash); 215 | let h2 = ff.get_h2(hash); 216 | stack.push(keyindexvar); 217 | 218 | let mut s = unsafe { sets1.get_unchecked_mut(h1 as usize) }; 219 | s.xor_mask ^= hash; 220 | s.count -= 1; 221 | if s.count == 1 { 222 | q1.push(KeyIndex { 223 | index: h1, 224 | hash: s.xor_mask, 225 | }) 226 | } 227 | 228 | let mut s = unsafe { sets2.get_unchecked_mut(h2 as usize) }; 229 | s.xor_mask ^= hash; 230 | s.count -= 1; 231 | if s.count == 1 { 232 | q2.push(KeyIndex { 233 | index: h2, 234 | hash: s.xor_mask, 235 | }) 236 | } 237 | } 238 | while let Some(mut keyindexvar) = q1.pop() { 239 | if sets1[keyindexvar.index as usize].count == 0 { 240 | continue; 241 | } 242 | let hash = keyindexvar.hash; 243 | let h0 = ff.get_h0(hash); 244 | let h2 = ff.get_h2(hash); 245 | keyindexvar.index += ff.block_length; 246 | stack.push(keyindexvar); 247 | 248 | let mut s = unsafe { sets0.get_unchecked_mut(h0 as usize) }; 249 | s.xor_mask ^= 
hash; 250 | s.count -= 1; 251 | if s.count == 1 { 252 | q0.push(KeyIndex { 253 | index: h0, 254 | hash: s.xor_mask, 255 | }) 256 | } 257 | 258 | let mut s = unsafe { sets2.get_unchecked_mut(h2 as usize) }; 259 | s.xor_mask ^= hash; 260 | s.count -= 1; 261 | if s.count == 1 { 262 | q2.push(KeyIndex { 263 | index: h2, 264 | hash: s.xor_mask, 265 | }) 266 | } 267 | } 268 | while let Some(mut keyindexvar) = q2.pop() { 269 | if sets2[keyindexvar.index as usize].count == 0 { 270 | continue; 271 | } 272 | let hash = keyindexvar.hash; 273 | let h0 = ff.get_h0(hash); 274 | let h1 = ff.get_h1(hash); 275 | keyindexvar.index += 2 * ff.block_length; 276 | stack.push(keyindexvar); 277 | 278 | let mut s = unsafe { sets0.get_unchecked_mut(h0 as usize) }; 279 | s.xor_mask ^= hash; 280 | s.count -= 1; 281 | if s.count == 1 { 282 | q0.push(KeyIndex { 283 | index: h0, 284 | hash: s.xor_mask, 285 | }) 286 | } 287 | let mut s = unsafe { sets1.get_unchecked_mut(h1 as usize) }; 288 | s.xor_mask ^= hash; 289 | s.count -= 1; 290 | if s.count == 1 { 291 | q1.push(KeyIndex { 292 | index: h1, 293 | hash: s.xor_mask, 294 | }) 295 | } 296 | } 297 | } 298 | 299 | if stack.len() == size { 300 | break; 301 | } 302 | 303 | for item in sets0.iter_mut() { 304 | *item = XorSet::default(); 305 | } 306 | for item in sets1.iter_mut() { 307 | *item = XorSet::default(); 308 | } 309 | for item in sets2.iter_mut() { 310 | *item = XorSet::default(); 311 | } 312 | ff.seed = splitmix64(&mut rngcounter) 313 | } 314 | 315 | while let Some(ki) = stack.pop() { 316 | let mut val = fingerprint(ki.hash) as u8; 317 | if ki.index < ff.block_length { 318 | let h1 = (ff.get_h1(ki.hash) + ff.block_length) as usize; 319 | let h2 = (ff.get_h2(ki.hash) + 2 * ff.block_length) as usize; 320 | val ^= ff.finger_prints[h1] ^ ff.finger_prints[h2]; 321 | } else if ki.index < 2 * ff.block_length { 322 | let h0 = ff.get_h0(ki.hash) as usize; 323 | let h2 = (ff.get_h2(ki.hash) + 2 * ff.block_length) as usize; 324 | val ^= 
ff.finger_prints[h0] ^ ff.finger_prints[h2]; 325 | } else { 326 | let h0 = ff.get_h0(ki.hash) as usize; 327 | let h1 = (ff.get_h1(ki.hash) + ff.block_length) as usize; 328 | val ^= ff.finger_prints[h0] ^ ff.finger_prints[h1] 329 | } 330 | Arc::get_mut(&mut ff.finger_prints).unwrap()[ki.index as usize] = val; 331 | } 332 | 333 | Ok(ff) 334 | } 335 | } 336 | -------------------------------------------------------------------------------- /src/xor8/filter.rs: -------------------------------------------------------------------------------- 1 | //! Library implements xor-filter. 2 | //! 3 | //! This is a port of its 4 | //! [original implementation](https://github.com/FastFilter/xorfilter) 5 | //! written in golang. 6 | 7 | #[allow(unused_imports)] 8 | use std::collections::hash_map::DefaultHasher; 9 | #[allow(unused_imports)] 10 | use std::collections::hash_map::RandomState; 11 | use std::convert::TryInto; 12 | use std::ffi; 13 | use std::fs; 14 | use std::hash::BuildHasher; 15 | use std::hash::Hash; 16 | use std::hash::Hasher; 17 | use std::io::ErrorKind; 18 | use std::io::Read; 19 | use std::io::Write; 20 | use std::io::{self}; 21 | use std::sync::Arc; 22 | 23 | #[cfg(feature = "cbordata")] 24 | use cbordata::Cbor; 25 | #[cfg(feature = "cbordata")] 26 | use cbordata::Cborize; 27 | #[cfg(feature = "cbordata")] 28 | use cbordata::FromCbor; 29 | #[cfg(feature = "cbordata")] 30 | use cbordata::IntoCbor; 31 | #[cfg(feature = "cbordata")] 32 | use cbordata::{self as cbor}; 33 | 34 | use crate::BuildHasherDefault; 35 | 36 | pub(in crate::xor8) fn murmur64(mut h: u64) -> u64 { 37 | h ^= h >> 33; 38 | h = h.wrapping_mul(0xff51_afd7_ed55_8ccd); 39 | h ^= h >> 33; 40 | h = h.wrapping_mul(0xc4ce_b9fe_1a85_ec53); 41 | h ^= h >> 33; 42 | h 43 | } 44 | 45 | // returns random number, modifies the seed 46 | pub(in crate::xor8) fn splitmix64(seed: &mut u64) -> u64 { 47 | *seed = (*seed).wrapping_add(0x9E37_79B9_7F4A_7C15); 48 | let mut z = *seed; 49 | z = (z ^ (z >> 
30)).wrapping_mul(0xBF58_476D_1CE4_E5B9); 50 | z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB); 51 | z ^ (z >> 31) 52 | } 53 | 54 | pub(in crate::xor8) fn mixsplit(key: u64, seed: u64) -> u64 { 55 | murmur64(key.wrapping_add(seed)) 56 | } 57 | 58 | pub(in crate::xor8) fn reduce(hash: u32, n: u32) -> u32 { 59 | // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ 60 | (((hash as u64) * (n as u64)) >> 32) as u32 61 | } 62 | 63 | pub(in crate::xor8) fn fingerprint(hash: u64) -> u64 { 64 | hash ^ (hash >> 32) 65 | } 66 | 67 | #[derive(Clone, Default)] 68 | pub(in crate::xor8) struct XorSet { 69 | pub(in crate::xor8) xor_mask: u64, 70 | pub(in crate::xor8) count: u32, 71 | } 72 | 73 | #[derive(Default)] 74 | pub(in crate::xor8) struct Hashes { 75 | pub(in crate::xor8) h: u64, 76 | pub(in crate::xor8) h0: u32, 77 | pub(in crate::xor8) h1: u32, 78 | pub(in crate::xor8) h2: u32, 79 | } 80 | 81 | /// Type Xor8 is probabilistic data-structure to test membership of an element in a set. 82 | /// 83 | /// This implementation has a false positive rate of about 0.3% and a memory usage of 84 | /// less than 9 bits per entry for sizeable sets. 85 | /// 86 | /// Xor8 is parametrized over type `H` which is expected to implement [BuildHasher] 87 | /// trait, like types [RandomState] and [BuildHasherDefault]. When not supplied, 88 | /// [BuildHasherDefault] is used as the default hash-builder. 
89 | /// 90 | /// If `RandomState` is used as BuildHasher, `std` has got this to say 91 | /// > _A particular instance RandomState will create the same instances 92 | /// > of Hasher, but the hashers created by two different RandomState_ 93 | /// > instances are unlikely to produce the same result for the same values._ 94 | /// 95 | /// If [DefaultHasher] is used as BuildHasher, `std` has got this to say, 96 | /// > _The internal algorithm is not specified, and so its hashes 97 | /// > should not be relied upon over releases._ 98 | /// 99 | /// The default type for parameter `H` might change when a reliable and commonly used 100 | /// BuildHasher type is available. 101 | #[derive(Clone, Debug, Default)] 102 | pub struct Xor8 103 | where H: BuildHasher 104 | { 105 | pub hash_builder: H, 106 | pub seed: u64, 107 | // TODO: Keep `Option` for the compatibility with Cbor format. 108 | // It is always `Some` since we have moved out Xor8Builder to another struct. 109 | pub num_keys: Option, 110 | pub block_length: u32, 111 | pub finger_prints: Arc>, 112 | } 113 | 114 | impl PartialEq for Xor8 115 | where H: BuildHasher 116 | { 117 | fn eq(&self, other: &Self) -> bool { 118 | let num_keys = match (self.num_keys, other.num_keys) { 119 | (Some(a), Some(b)) => a == b, 120 | (_, _) => true, 121 | }; 122 | 123 | self.seed == other.seed 124 | && num_keys 125 | && self.block_length == other.block_length 126 | && self.finger_prints == other.finger_prints 127 | } 128 | } 129 | 130 | impl Xor8 131 | where H: BuildHasher 132 | { 133 | pub(crate) fn new(hash_builder: H) -> Self { 134 | Self { 135 | hash_builder, 136 | seed: 0, 137 | num_keys: None, 138 | block_length: 0, 139 | finger_prints: Arc::new(vec![]), 140 | } 141 | } 142 | } 143 | 144 | impl Xor8 145 | where H: BuildHasher 146 | { 147 | #[allow(clippy::len_without_is_empty)] 148 | /// Return the number of keys added/built into the bitmap index. 
149 | pub fn len(&self) -> Option { 150 | self.num_keys 151 | } 152 | 153 | /// Contains tell you whether the key is likely part of the set, with false 154 | /// positive rate. 155 | pub fn contains(&self, key: &K) -> bool { 156 | let hashed_key = { 157 | let mut hasher = self.hash_builder.build_hasher(); 158 | key.hash(&mut hasher); 159 | hasher.finish() 160 | }; 161 | self.contains_digest(hashed_key) 162 | } 163 | 164 | /// Contains tell you whether the key, as pre-computed digest form, is likely 165 | /// part of the set, with false positive rate. 166 | pub fn contains_digest(&self, digest: u64) -> bool { 167 | let hash = mixsplit(digest, self.seed); 168 | let f = fingerprint(hash) as u8; 169 | let r0 = hash as u32; 170 | let r1 = hash.rotate_left(21) as u32; 171 | let r2 = hash.rotate_left(42) as u32; 172 | let h0 = reduce(r0, self.block_length) as usize; 173 | let h1 = (reduce(r1, self.block_length) + self.block_length) as usize; 174 | let h2 = (reduce(r2, self.block_length) + 2 * self.block_length) as usize; 175 | f == (self.finger_prints[h0] ^ self.finger_prints[h1] ^ self.finger_prints[h2]) 176 | } 177 | 178 | pub fn get_hasher(&self) -> H::Hasher { 179 | self.hash_builder.build_hasher() 180 | } 181 | 182 | /// Calculate hash of a key. 
183 | #[inline] 184 | pub fn hash(&self, key: &K) -> u64 { 185 | let mut hasher = self.get_hasher(); 186 | key.hash(&mut hasher); 187 | hasher.finish() 188 | } 189 | } 190 | 191 | impl Xor8 192 | where H: BuildHasher 193 | { 194 | pub(in crate::xor8) fn get_h0h1h2(&self, k: u64) -> Hashes { 195 | let h = mixsplit(k, self.seed); 196 | Hashes { 197 | h, 198 | h0: reduce(h as u32, self.block_length), 199 | h1: reduce(h.rotate_left(21) as u32, self.block_length), 200 | h2: reduce(h.rotate_left(42) as u32, self.block_length), 201 | } 202 | } 203 | 204 | pub(in crate::xor8) fn get_h0(&self, hash: u64) -> u32 { 205 | let r0 = hash as u32; 206 | reduce(r0, self.block_length) 207 | } 208 | 209 | pub(in crate::xor8) fn get_h1(&self, hash: u64) -> u32 { 210 | let r1 = hash.rotate_left(21) as u32; 211 | reduce(r1, self.block_length) 212 | } 213 | 214 | pub(in crate::xor8) fn get_h2(&self, hash: u64) -> u32 { 215 | let r2 = hash.rotate_left(42) as u32; 216 | reduce(r2, self.block_length) 217 | } 218 | } 219 | 220 | /// Implements serialization and de-serialization logic for Xor8. This is still work 221 | /// in progress, refer to issue: 222 | /// in github. 223 | /// 224 | /// TODO: 225 | impl Xor8 226 | where H: Into> + From> + BuildHasher 227 | { 228 | /// File signature write on first 4 bytes of file. 229 | /// ^ stands for xor 230 | /// TL stands for filter 231 | /// 1 stands for version 1 232 | /// 2 stands for version 2 233 | /// 3 stands for version 3 234 | const SIGNATURE_V1: [u8; 4] = [b'^', b'T', b'L', 1]; 235 | const SIGNATURE_V2: [u8; 4] = [b'^', b'T', b'L', 2]; 236 | 237 | /// METADATA_LENGTH is size that required to write size of all the 238 | /// metadata of the serialized filter. 
239 | // signature length + seed-length + block-length + 240 | // fingerprint-length + hasher-builder length + fingerprint + hash-builder 241 | const METADATA_LENGTH: usize = 4 + 8 + 4 + 4 + 4; 242 | 243 | /// Write to file in binary format 244 | /// TODO Add chechsum of finger_prints into file headers 245 | pub fn write_file(&self, path: &ffi::OsStr) -> io::Result 246 | where H: Clone { 247 | let mut f = fs::File::create(path)?; 248 | let buf = self.to_bytes(); 249 | f.write_all(&buf)?; 250 | Ok(buf.len()) 251 | } 252 | 253 | /// Read from file in binary format 254 | pub fn read_file(path: &ffi::OsStr) -> io::Result 255 | where H: Default { 256 | let mut f = fs::File::open(path)?; 257 | let mut data = Vec::new(); 258 | f.read_to_end(&mut data)?; 259 | Self::from_bytes(data) 260 | } 261 | 262 | pub fn to_bytes(&self) -> Vec 263 | where H: Clone { 264 | let capacity = Self::METADATA_LENGTH + self.finger_prints.len(); 265 | let mut buf: Vec = Vec::with_capacity(capacity); 266 | buf.extend_from_slice(&Xor8::::SIGNATURE_V2); 267 | buf.extend_from_slice(&self.seed.to_be_bytes()); 268 | buf.extend_from_slice(&self.block_length.to_be_bytes()); 269 | buf.extend_from_slice(&(self.finger_prints.len() as u32).to_be_bytes()); 270 | 271 | let hb_binary: Vec = self.hash_builder.clone().into(); 272 | buf.extend_from_slice(&(hb_binary.len() as u32).to_be_bytes()); 273 | 274 | buf.extend_from_slice(&self.finger_prints); 275 | buf.extend_from_slice(&hb_binary); 276 | buf 277 | } 278 | 279 | pub fn from_bytes(buf: Vec) -> io::Result 280 | where H: Default { 281 | use std::io::Error; 282 | 283 | let mut n = 0; 284 | 285 | // validate the buf first. 
286 | if Self::METADATA_LENGTH > buf.len() { 287 | return Err(Error::new(ErrorKind::InvalidData, "invalid byte slice")); 288 | } 289 | 290 | // check the signature 291 | if buf[n..4] == Xor8::::SIGNATURE_V1 { 292 | return Self::from_bytes_v1(buf); 293 | } else if buf[n..4] != Xor8::::SIGNATURE_V2 { 294 | return Err(Error::new( 295 | ErrorKind::InvalidData, 296 | "File signature incorrect", 297 | )); 298 | } 299 | 300 | n += 4; 301 | // fetch the seed 302 | let seed = u64::from_be_bytes(buf[n..n + 8].try_into().unwrap()); 303 | n += 8; 304 | // fetch block_length 305 | let block_length = u32::from_be_bytes(buf[n..n + 4].try_into().unwrap()); 306 | n += 4; 307 | // fetch fingerprint length 308 | let fp_len = u32::from_be_bytes(buf[n..n + 4].try_into().unwrap()) as usize; 309 | n += 4; 310 | // fetch hash-serizalized length 311 | let hb_len = u32::from_be_bytes(buf[n..n + 4].try_into().unwrap()) as usize; 312 | n += 4; 313 | 314 | if buf[n..].len() < (fp_len + hb_len) { 315 | return Err(Error::new(ErrorKind::InvalidData, "invalid byte slice")); 316 | } 317 | 318 | // fetch the finger print 319 | let finger_prints = Arc::new(buf[n..n + fp_len].to_vec()); 320 | n += fp_len; 321 | // fetch the hash_builder 322 | let hash_builder: H = buf[n..n + hb_len].to_vec().into(); 323 | 324 | Ok(Xor8 { 325 | hash_builder, 326 | seed, 327 | num_keys: None, 328 | block_length, 329 | finger_prints, 330 | }) 331 | } 332 | 333 | fn from_bytes_v1(buf: Vec) -> io::Result 334 | where H: Default { 335 | use std::io::Error; 336 | 337 | let fp_len = u32::from_be_bytes(buf[16..20].try_into().unwrap()) as usize; 338 | if buf[20..].len() < fp_len { 339 | return Err(Error::new(ErrorKind::InvalidData, "invalid byte slice")); 340 | } 341 | Ok(Xor8 { 342 | hash_builder: H::default(), 343 | seed: u64::from_be_bytes(buf[4..12].try_into().unwrap()), 344 | num_keys: None, 345 | block_length: u32::from_be_bytes(buf[12..16].try_into().unwrap()), 346 | finger_prints: Arc::new(buf[20..].to_vec()), 347 | }) 
348 | } 349 | } 350 | 351 | //------ Implement cbordata related functionalities 352 | 353 | // Intermediate type to serialize and de-serialized Xor8 into bytes. 354 | #[cfg(feature = "cbordata")] 355 | #[derive(Cborize)] 356 | struct CborXor8 { 357 | hash_builder: Vec, 358 | seed: u64, 359 | num_keys: Option, 360 | block_length: u32, 361 | finger_prints: Vec, 362 | } 363 | 364 | #[cfg(feature = "cbordata")] 365 | impl CborXor8 { 366 | const ID: &'static str = "xor8/0.0.1"; 367 | } 368 | 369 | #[cfg(feature = "cbordata")] 370 | impl IntoCbor for Xor8 371 | where H: BuildHasher + Into> 372 | { 373 | fn into_cbor(self) -> cbor::Result { 374 | let val = CborXor8 { 375 | hash_builder: self.hash_builder.into(), 376 | seed: self.seed, 377 | num_keys: self.num_keys, 378 | block_length: self.block_length, 379 | finger_prints: self.finger_prints.to_vec(), 380 | }; 381 | val.into_cbor() 382 | } 383 | } 384 | 385 | #[cfg(feature = "cbordata")] 386 | impl FromCbor for Xor8 387 | where H: BuildHasher + From> 388 | { 389 | fn from_cbor(val: Cbor) -> cbor::Result { 390 | let val = CborXor8::from_cbor(val)?; 391 | 392 | let filter = Xor8 { 393 | hash_builder: val.hash_builder.into(), 394 | seed: val.seed, 395 | num_keys: val.num_keys, 396 | block_length: val.block_length, 397 | finger_prints: Arc::new(val.finger_prints), 398 | }; 399 | 400 | Ok(filter) 401 | } 402 | } 403 | -------------------------------------------------------------------------------- /src/xor8/mod.rs: -------------------------------------------------------------------------------- 1 | mod builder; 2 | mod filter; 3 | 4 | pub use builder::Xor8Builder; 5 | pub use filter::Xor8; 6 | 7 | #[cfg(test)] 8 | #[path = "xor8_test.rs"] 9 | mod xor8_test; 10 | -------------------------------------------------------------------------------- /src/xor8/xor8_test.rs: -------------------------------------------------------------------------------- 1 | use std::collections::hash_map::RandomState; 2 | use std::hash::BuildHasher; 3 
| 4 | #[cfg(feature = "cbordata")] 5 | use cbordata::FromCbor; 6 | #[cfg(feature = "cbordata")] 7 | use cbordata::IntoCbor; 8 | use rand::prelude::random; 9 | use rand::rngs::StdRng; 10 | use rand::Rng; 11 | use rand::SeedableRng; 12 | 13 | use crate::xor8::Xor8Builder; 14 | use crate::BuildHasherDefault; 15 | 16 | fn generate_unique_keys(rng: &mut StdRng, size: usize) -> Vec { 17 | let mut keys: Vec = Vec::with_capacity(size); 18 | keys.resize(size, u64::default()); 19 | 20 | for key in keys.iter_mut() { 21 | *key = rng.gen(); 22 | } 23 | keys.sort_unstable(); 24 | keys.dedup(); 25 | 26 | for _i in 0..(size - keys.len()) { 27 | let key = rng.gen::(); 28 | if !keys.contains(&key) { 29 | keys.push(key) 30 | } 31 | } 32 | 33 | keys 34 | } 35 | 36 | fn test_xor8_build(name: &str, seed: u64, size: u32) 37 | where H: BuildHasher + Clone + Default { 38 | let (x, y) = { 39 | let size = size as usize; 40 | (size / 3, size / 3) 41 | }; 42 | 43 | println!("test_xor8_build<{}> size:{}", name, size); 44 | let mut rng = StdRng::seed_from_u64(seed); 45 | 46 | let mut builder = Xor8Builder::::new(); 47 | let keys = generate_unique_keys(&mut rng, size as usize); 48 | let (keys1, keys2, keys3) = (&keys[0..x], &keys[x..x + y], &keys[x + y..]); 49 | 50 | // populate api 51 | builder.populate(keys1); 52 | // populate_keys api 53 | let digests: Vec = keys2.iter().map(|k| builder.hash(k)).collect(); 54 | builder.populate_digests(digests.iter()); 55 | // insert api 56 | keys3.iter().for_each(|key| builder.insert(key)); 57 | 58 | let filter = builder.build().expect("failed build"); 59 | 60 | // contains api 61 | for key in keys.iter() { 62 | assert!(filter.contains(key), "key {} not present", key); 63 | } 64 | // contains_key api 65 | for key in keys.iter() { 66 | let digest = filter.hash(key); 67 | assert!(filter.contains_digest(digest), "key {} not present", key); 68 | } 69 | 70 | // print some statistics 71 | let (falsesize, mut matches) = (10_000_000, 0_f64); 72 | let bpv = 
(filter.finger_prints.len() as f64) * 8.0 / (keys.len() as f64); 73 | println!("test_xor8_build<{}> bits per entry {} bits", name, bpv); 74 | if size > 1000 { 75 | assert!(bpv < 12.0, "bpv({}) >= 12.0", bpv); 76 | } 77 | 78 | for _ in 0..falsesize { 79 | if filter.contains(&rng.gen::()) { 80 | matches += 1_f64; 81 | } 82 | } 83 | 84 | let fpp = matches * 100.0 / (falsesize as f64); 85 | println!("test_xor8_build<{}> false positive rate {}%", name, fpp); 86 | assert!(fpp < 0.40, "fpp({}) >= 0.40", fpp); 87 | } 88 | 89 | fn test_xor8_build_keys(name: &str, seed: u64, size: u32) 90 | where H: Default + BuildHasher + Clone { 91 | println!("test_xor8_build_keys<{}> size:{}", name, size); 92 | let mut rng = StdRng::seed_from_u64(seed); 93 | 94 | let mut builder = Xor8Builder::::new(); 95 | 96 | // build_keys api 97 | let keys = generate_unique_keys(&mut rng, size as usize); 98 | let digests: Vec = keys.iter().map(|k| builder.hash(k)).collect(); 99 | let filter = builder.build_from_digests(&digests).expect("failed build_keys"); 100 | 101 | // contains api 102 | for key in keys.iter() { 103 | assert!(filter.contains(key), "key {} not present", key); 104 | } 105 | 106 | // contains_key api 107 | for digest in digests.into_iter() { 108 | assert!( 109 | filter.contains_digest(digest), 110 | "digest {} not present", 111 | digest 112 | ); 113 | } 114 | 115 | // print some statistics 116 | let (falsesize, mut matches) = (10_000_000, 0_f64); 117 | let bpv = (filter.finger_prints.len() as f64) * 8.0 / (keys.len() as f64); 118 | println!("test_xor8_build_keys<{}> bits per entry {} bits", name, bpv); 119 | if size > 1000 { 120 | assert!(bpv < 12.0, "bpv({}) >= 12.0", bpv); 121 | } 122 | 123 | for _ in 0..falsesize { 124 | if filter.contains(&rng.gen::()) { 125 | matches += 1_f64; 126 | } 127 | } 128 | 129 | let fpp = matches * 100.0 / (falsesize as f64); 130 | println!( 131 | "test_xor8_build_keys<{}> false positive rate {}%", 132 | name, fpp 133 | ); 134 | assert!(fpp < 0.40, 
"fpp({}) >= 0.40", fpp); 135 | } 136 | 137 | #[test] 138 | fn test_xor8_build_keys_simple() { 139 | let seed: u64 = random(); 140 | println!("test_xor8 seed:{}", seed); 141 | 142 | let size = 100_000; 143 | let name = "BuildHasherDefault"; 144 | 145 | println!("test_xor8_build_keys<{}> size:{}", name, size); 146 | let mut rng = StdRng::seed_from_u64(seed); 147 | 148 | let mut builder = Xor8Builder::::new(); 149 | 150 | // build_keys api 151 | let keys = generate_unique_keys(&mut rng, size as usize); 152 | let digests: Vec = keys.iter().map(|k| builder.hash(k)).collect(); 153 | 154 | let filter = builder.build_from_digests(&digests).expect("failed build_from_digests"); 155 | 156 | // contains api 157 | for key in keys.iter() { 158 | assert!(filter.contains(key), "key {} not present", key); 159 | } 160 | 161 | // contains_key api 162 | for digest in digests.into_iter() { 163 | assert!( 164 | filter.contains_digest(digest), 165 | "digest {} not present", 166 | digest 167 | ); 168 | } 169 | 170 | // print some statistics 171 | let (false_size, mut matches) = (10_000_000, 0_f64); 172 | let bpv = (filter.finger_prints.len() as f64) * 8.0 / (keys.len() as f64); 173 | println!("test_xor8_build_keys<{}> bits per entry {} bits", name, bpv); 174 | assert!(bpv < 12.0, "bpv({}) >= 12.0", bpv); 175 | 176 | for _ in 0..false_size { 177 | if filter.contains(&rng.gen::()) { 178 | matches += 1_f64; 179 | } 180 | } 181 | 182 | let fpp = matches * 100.0 / (false_size as f64); 183 | println!( 184 | "test_xor8_build_keys<{}> false positive rate {}%", 185 | name, fpp 186 | ); 187 | assert!(fpp < 0.50, "fpp({}) >= 0.50%", fpp); 188 | } 189 | 190 | #[test] 191 | fn test_xor8() { 192 | let mut seed: u64 = random(); 193 | println!("test_xor8 seed:{}", seed); 194 | 195 | for size in [0, 1, 2, 10, 1000, 10_000, 100_000, 1_000_000, 10_000_000].iter() { 196 | seed = seed.wrapping_add(*size as u64); 197 | test_xor8_build::("RandomState", seed, *size); 198 | test_xor8_build::("BuildHasherDefault", 
seed, *size); 199 | test_xor8_build_keys::("RandomState", seed, *size); 200 | test_xor8_build_keys::("BuildHasherDefault", seed, *size); 201 | } 202 | } 203 | 204 | #[test] 205 | #[ignore] 206 | fn test_xor8_billion() { 207 | let seed: u64 = random(); 208 | println!("test_xor8_billion seed:{}", seed); 209 | 210 | let size = 1_000_000_000; 211 | test_xor8_build::("RandomState", seed, size); 212 | test_xor8_build::("BuildHasherDefault", seed, size); 213 | test_xor8_build_keys::("RandomState", seed, size); 214 | test_xor8_build_keys::("BuildHasherDefault", seed, size); 215 | } 216 | 217 | #[cfg(feature = "cbordata")] 218 | #[test] 219 | fn test_xor8_cbor() { 220 | use crate::Xor8; 221 | 222 | let seed: u64 = random(); 223 | println!("test_xor8_cbor seed:{}", seed); 224 | let mut rng = StdRng::seed_from_u64(seed); 225 | 226 | let keys: Vec = (0..100_000).map(|_| rng.gen::()).collect(); 227 | 228 | let filter = { 229 | let mut builder = Xor8Builder::::new(); 230 | builder.populate(&keys); 231 | builder.build().expect("fail building xor8 filter") 232 | }; 233 | 234 | for key in keys.iter() { 235 | assert!(filter.contains(key), "key {} not present", key); 236 | } 237 | 238 | let filter = { 239 | let val = filter.into_cbor().unwrap(); 240 | Xor8::::from_cbor(val).unwrap() 241 | }; 242 | 243 | for key in keys.iter() { 244 | assert!(filter.contains(key), "key {} not present", key); 245 | } 246 | } 247 | -------------------------------------------------------------------------------- /src/xor8_old.rs: -------------------------------------------------------------------------------- 1 | //! Library implements xor-filter. 2 | //! 3 | //! This is a port of its 4 | //! [original implementation](https://github.com/FastFilter/xorfilter) 5 | //! written in golang. 
6 | 7 | #[allow(unused_imports)] 8 | use std::collections::hash_map::DefaultHasher; 9 | #[allow(unused_imports)] 10 | use std::collections::hash_map::RandomState; 11 | use std::collections::BTreeMap; 12 | use std::convert::TryInto; 13 | use std::ffi; 14 | use std::fs; 15 | use std::hash::BuildHasher; 16 | use std::hash::Hash; 17 | use std::hash::Hasher; 18 | use std::io::ErrorKind; 19 | use std::io::Read; 20 | use std::io::Write; 21 | use std::io::{self}; 22 | 23 | use crate::BuildHasherDefault; 24 | use crate::Result; 25 | 26 | fn murmur64(mut h: u64) -> u64 { 27 | h ^= h >> 33; 28 | h = h.wrapping_mul(0xff51_afd7_ed55_8ccd); 29 | h ^= h >> 33; 30 | h = h.wrapping_mul(0xc4ce_b9fe_1a85_ec53); 31 | h ^= h >> 33; 32 | h 33 | } 34 | 35 | // returns random number, modifies the seed 36 | fn splitmix64(seed: &mut u64) -> u64 { 37 | *seed = (*seed).wrapping_add(0x9E37_79B9_7F4A_7C15); 38 | let mut z = *seed; 39 | z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9); 40 | z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB); 41 | z ^ (z >> 31) 42 | } 43 | 44 | fn mixsplit(key: u64, seed: u64) -> u64 { 45 | murmur64(key.wrapping_add(seed)) 46 | } 47 | 48 | fn reduce(hash: u32, n: u32) -> u32 { 49 | // http://lemire.me/blog/2016/06/27/a-fast-alternative-to-the-modulo-reduction/ 50 | (((hash as u64) * (n as u64)) >> 32) as u32 51 | } 52 | 53 | fn fingerprint(hash: u64) -> u64 { 54 | hash ^ (hash >> 32) 55 | } 56 | 57 | #[derive(Clone, Default)] 58 | struct XorSet { 59 | xor_mask: u64, 60 | count: u32, 61 | } 62 | 63 | #[derive(Default)] 64 | struct Hashes { 65 | h: u64, 66 | h0: u32, 67 | h1: u32, 68 | h2: u32, 69 | } 70 | 71 | #[derive(Clone, Copy, Default)] 72 | struct KeyIndex { 73 | hash: u64, 74 | index: u32, 75 | } 76 | 77 | /// Type Xor8 is probabilistic data-structure to test membership of an element in a set. 78 | /// 79 | /// This implementation has a false positive rate of about 0.3% and a memory usage of 80 | /// less than 9 bits per entry for sizeable sets. 
81 | /// 82 | /// Xor8 is parametrized over type `H` which is expected to implement [BuildHasher] 83 | /// trait, like types [RandomState] and [BuildHasherDefault]. When not supplied, 84 | /// [BuildHasherDefault] is used as the default hash-builder. 85 | /// 86 | /// If `RandomState` is used as BuildHasher, `std` has got this to say 87 | /// > _A particular instance RandomState will create the same instances 88 | /// > of Hasher, but the hashers created by two different RandomState_ 89 | /// > instances are unlikely to produce the same result for the same values._ 90 | /// 91 | /// If [DefaultHasher] is used as BuildHasher, `std` has got this to say, 92 | /// > _The internal algorithm is not specified, and so its hashes 93 | /// > should not be relied upon over releases._ 94 | /// 95 | /// The default type for parameter `H` might change when a reliable and commonly used 96 | /// BuildHasher type is available. 97 | pub struct Xor8 98 | where H: BuildHasher 99 | { 100 | keys: Option>, 101 | pub hash_builder: H, 102 | pub seed: u64, 103 | pub block_length: u32, 104 | pub finger_prints: Vec, 105 | } 106 | 107 | impl PartialEq for Xor8 108 | where H: BuildHasher 109 | { 110 | fn eq(&self, other: &Self) -> bool { 111 | self.seed == other.seed 112 | && self.block_length == other.block_length 113 | && self.finger_prints == other.finger_prints 114 | } 115 | } 116 | 117 | impl Default for Xor8 118 | where H: BuildHasher + Default 119 | { 120 | fn default() -> Self { 121 | Xor8 { 122 | keys: Some(BTreeMap::new()), 123 | hash_builder: H::default(), 124 | seed: u64::default(), 125 | block_length: u32::default(), 126 | finger_prints: Vec::default(), 127 | } 128 | } 129 | } 130 | 131 | impl Xor8 132 | where H: BuildHasher 133 | { 134 | /// New Xor8 instance initialized with [DefaultHasher]. 135 | pub fn new() -> Self 136 | where H: Default { 137 | Self::default() 138 | } 139 | 140 | /// New Xor8 instance initialized with supplied `hasher`. 
141 | pub fn with_hasher(hash_builder: H) -> Self { 142 | Xor8 { 143 | keys: Some(BTreeMap::new()), 144 | hash_builder, 145 | seed: u64::default(), 146 | block_length: u32::default(), 147 | finger_prints: Vec::default(), 148 | } 149 | } 150 | } 151 | 152 | impl Xor8 153 | where H: BuildHasher 154 | { 155 | /// Insert 64-bit digest of a single key. Digest for the key shall be generated 156 | /// using the default-hasher or via hasher supplied via [Xor8::with_hasher] method. 157 | pub fn insert(&mut self, key: &K) { 158 | let hashed_key = { 159 | let mut hasher = self.hash_builder.build_hasher(); 160 | key.hash(&mut hasher); 161 | hasher.finish() 162 | }; 163 | self.keys.as_mut().unwrap().insert(hashed_key, ()); 164 | } 165 | 166 | /// Populate with 64-bit digests for a collection of keys of type `K`. Digest for 167 | /// key shall be generated using the default-hasher or via hasher supplied 168 | /// via [Xor8::with_hasher] method. 169 | pub fn populate(&mut self, keys: &[K]) { 170 | keys.iter().for_each(|key| { 171 | let mut hasher = self.hash_builder.build_hasher(); 172 | key.hash(&mut hasher); 173 | self.keys.as_mut().unwrap().insert(hasher.finish(), ()); 174 | }) 175 | } 176 | 177 | /// Populate with pre-compute collection of 64-bit digests. 178 | pub fn populate_keys(&mut self, digests: &[u64]) { 179 | for digest in digests.iter() { 180 | self.keys.as_mut().unwrap().insert(*digest, ()); 181 | } 182 | } 183 | 184 | /// Build bitmap for keys that where previously inserted using [Xor8::insert], 185 | /// [Xor8::populate] and [Xor8::populate_keys] method. 186 | pub fn build(&mut self) -> Result<()> { 187 | match self.keys.take() { 188 | Some(keys) => { 189 | let digests = keys.keys().copied().collect::>(); 190 | self.build_keys(&digests) 191 | } 192 | None => Ok(()), 193 | } 194 | } 195 | 196 | /// Build a bitmap for pre-computed 64-bit digests for keys. 
If keys where 197 | /// previously inserted using [Xor8::insert] or [Xor8::populate] or 198 | /// [Xor8::populate_keys] methods, they shall be ignored. 199 | /// 200 | /// It is upto the caller to ensure that digests are unique, that there no 201 | /// duplicates. 202 | pub fn build_keys(&mut self, digests: &[u64]) -> Result<()> { 203 | let (size, mut rngcounter) = (digests.len(), 1_u64); 204 | let capacity = { 205 | let capacity = 32 + ((1.23 * (size as f64)).ceil() as u32); 206 | capacity / 3 * 3 // round it down to a multiple of 3 207 | }; 208 | self.seed = splitmix64(&mut rngcounter); 209 | self.block_length = capacity / 3; 210 | self.finger_prints = vec![u8::default(); capacity as usize]; 211 | 212 | let block_length = self.block_length as usize; 213 | let mut q0: Vec = Vec::with_capacity(block_length); 214 | let mut q1: Vec = Vec::with_capacity(block_length); 215 | let mut q2: Vec = Vec::with_capacity(block_length); 216 | let mut stack: Vec = Vec::with_capacity(size); 217 | let mut sets0: Vec = vec![XorSet::default(); block_length]; 218 | let mut sets1: Vec = vec![XorSet::default(); block_length]; 219 | let mut sets2: Vec = vec![XorSet::default(); block_length]; 220 | 221 | loop { 222 | for key in digests.iter() { 223 | let hs = self.geth0h1h2(*key); 224 | sets0[hs.h0 as usize].xor_mask ^= hs.h; 225 | sets0[hs.h0 as usize].count += 1; 226 | sets1[hs.h1 as usize].xor_mask ^= hs.h; 227 | sets1[hs.h1 as usize].count += 1; 228 | sets2[hs.h2 as usize].xor_mask ^= hs.h; 229 | sets2[hs.h2 as usize].count += 1; 230 | } 231 | 232 | q0.clear(); 233 | q1.clear(); 234 | q2.clear(); 235 | 236 | let iter = sets0.iter().enumerate().take(self.block_length as usize); 237 | for (i, item) in iter { 238 | if item.count == 1 { 239 | q0.push(KeyIndex { 240 | index: i as u32, 241 | hash: item.xor_mask, 242 | }); 243 | } 244 | } 245 | let iter = sets1.iter().enumerate().take(self.block_length as usize); 246 | for (i, item) in iter { 247 | if item.count == 1 { 248 | q1.push(KeyIndex 
{ 249 | index: i as u32, 250 | hash: item.xor_mask, 251 | }); 252 | } 253 | } 254 | let iter = sets2.iter().enumerate().take(self.block_length as usize); 255 | for (i, item) in iter { 256 | if item.count == 1 { 257 | q2.push(KeyIndex { 258 | index: i as u32, 259 | hash: item.xor_mask, 260 | }); 261 | } 262 | } 263 | 264 | stack.clear(); 265 | 266 | while !q0.is_empty() || !q1.is_empty() || !q2.is_empty() { 267 | while let Some(keyindexvar) = q0.pop() { 268 | if sets0[keyindexvar.index as usize].count == 0 { 269 | // not actually possible after the initial scan. 270 | continue; 271 | } 272 | let hash = keyindexvar.hash; 273 | let h1 = self.geth1(hash); 274 | let h2 = self.geth2(hash); 275 | stack.push(keyindexvar); 276 | 277 | let mut s = unsafe { sets1.get_unchecked_mut(h1 as usize) }; 278 | s.xor_mask ^= hash; 279 | s.count -= 1; 280 | if s.count == 1 { 281 | q1.push(KeyIndex { 282 | index: h1, 283 | hash: s.xor_mask, 284 | }) 285 | } 286 | 287 | let mut s = unsafe { sets2.get_unchecked_mut(h2 as usize) }; 288 | s.xor_mask ^= hash; 289 | s.count -= 1; 290 | if s.count == 1 { 291 | q2.push(KeyIndex { 292 | index: h2, 293 | hash: s.xor_mask, 294 | }) 295 | } 296 | } 297 | while let Some(mut keyindexvar) = q1.pop() { 298 | if sets1[keyindexvar.index as usize].count == 0 { 299 | continue; 300 | } 301 | let hash = keyindexvar.hash; 302 | let h0 = self.geth0(hash); 303 | let h2 = self.geth2(hash); 304 | keyindexvar.index += self.block_length; 305 | stack.push(keyindexvar); 306 | 307 | let mut s = unsafe { sets0.get_unchecked_mut(h0 as usize) }; 308 | s.xor_mask ^= hash; 309 | s.count -= 1; 310 | if s.count == 1 { 311 | q0.push(KeyIndex { 312 | index: h0, 313 | hash: s.xor_mask, 314 | }) 315 | } 316 | 317 | let mut s = unsafe { sets2.get_unchecked_mut(h2 as usize) }; 318 | s.xor_mask ^= hash; 319 | s.count -= 1; 320 | if s.count == 1 { 321 | q2.push(KeyIndex { 322 | index: h2, 323 | hash: s.xor_mask, 324 | }) 325 | } 326 | } 327 | while let Some(mut keyindexvar) = 
q2.pop() { 328 | if sets2[keyindexvar.index as usize].count == 0 { 329 | continue; 330 | } 331 | let hash = keyindexvar.hash; 332 | let h0 = self.geth0(hash); 333 | let h1 = self.geth1(hash); 334 | keyindexvar.index += 2 * self.block_length; 335 | stack.push(keyindexvar); 336 | 337 | let mut s = unsafe { sets0.get_unchecked_mut(h0 as usize) }; 338 | s.xor_mask ^= hash; 339 | s.count -= 1; 340 | if s.count == 1 { 341 | q0.push(KeyIndex { 342 | index: h0, 343 | hash: s.xor_mask, 344 | }) 345 | } 346 | let mut s = unsafe { sets1.get_unchecked_mut(h1 as usize) }; 347 | s.xor_mask ^= hash; 348 | s.count -= 1; 349 | if s.count == 1 { 350 | q1.push(KeyIndex { 351 | index: h1, 352 | hash: s.xor_mask, 353 | }) 354 | } 355 | } 356 | } 357 | 358 | if stack.len() == size { 359 | break; 360 | } 361 | 362 | for item in sets0.iter_mut() { 363 | *item = XorSet::default(); 364 | } 365 | for item in sets1.iter_mut() { 366 | *item = XorSet::default(); 367 | } 368 | for item in sets2.iter_mut() { 369 | *item = XorSet::default(); 370 | } 371 | self.seed = splitmix64(&mut rngcounter) 372 | } 373 | 374 | while let Some(ki) = stack.pop() { 375 | let mut val = fingerprint(ki.hash) as u8; 376 | if ki.index < self.block_length { 377 | let h1 = (self.geth1(ki.hash) + self.block_length) as usize; 378 | let h2 = (self.geth2(ki.hash) + 2 * self.block_length) as usize; 379 | val ^= self.finger_prints[h1] ^ self.finger_prints[h2]; 380 | } else if ki.index < 2 * self.block_length { 381 | let h0 = self.geth0(ki.hash) as usize; 382 | let h2 = (self.geth2(ki.hash) + 2 * self.block_length) as usize; 383 | val ^= self.finger_prints[h0] ^ self.finger_prints[h2]; 384 | } else { 385 | let h0 = self.geth0(ki.hash) as usize; 386 | let h1 = (self.geth1(ki.hash) + self.block_length) as usize; 387 | val ^= self.finger_prints[h0] ^ self.finger_prints[h1] 388 | } 389 | self.finger_prints[ki.index as usize] = val; 390 | } 391 | 392 | Ok(()) 393 | } 394 | } 395 | 396 | impl Xor8 397 | where H: BuildHasher 398 | { 
399 | /// Contains tell you whether the key is likely part of the set, with false 400 | /// positive rate. 401 | pub fn contains(&self, key: &K) -> bool { 402 | let hashed_key = { 403 | let mut hasher = self.hash_builder.build_hasher(); 404 | key.hash(&mut hasher); 405 | hasher.finish() 406 | }; 407 | self.contains_key(hashed_key) 408 | } 409 | 410 | /// Contains tell you whether the key, as pre-computed digest form, is likely 411 | /// part of the set, with false positive rate. 412 | pub fn contains_key(&self, digest: u64) -> bool { 413 | let hash = mixsplit(digest, self.seed); 414 | let f = fingerprint(hash) as u8; 415 | let r0 = hash as u32; 416 | let r1 = hash.rotate_left(21) as u32; 417 | let r2 = hash.rotate_left(42) as u32; 418 | let h0 = reduce(r0, self.block_length) as usize; 419 | let h1 = (reduce(r1, self.block_length) + self.block_length) as usize; 420 | let h2 = (reduce(r2, self.block_length) + 2 * self.block_length) as usize; 421 | f == (self.finger_prints[h0] ^ self.finger_prints[h1] ^ self.finger_prints[h2]) 422 | } 423 | 424 | #[allow(dead_code)] 425 | fn get_hasher(&self) -> H::Hasher { 426 | self.hash_builder.build_hasher() 427 | } 428 | } 429 | 430 | impl Xor8 431 | where H: BuildHasher 432 | { 433 | fn geth0h1h2(&self, k: u64) -> Hashes { 434 | let h = mixsplit(k, self.seed); 435 | Hashes { 436 | h, 437 | h0: reduce(h as u32, self.block_length), 438 | h1: reduce(h.rotate_left(21) as u32, self.block_length), 439 | h2: reduce(h.rotate_left(42) as u32, self.block_length), 440 | } 441 | } 442 | 443 | fn geth0(&self, hash: u64) -> u32 { 444 | let r0 = hash as u32; 445 | reduce(r0, self.block_length) 446 | } 447 | 448 | fn geth1(&self, hash: u64) -> u32 { 449 | let r1 = hash.rotate_left(21) as u32; 450 | reduce(r1, self.block_length) 451 | } 452 | 453 | fn geth2(&self, hash: u64) -> u32 { 454 | let r2 = hash.rotate_left(42) as u32; 455 | reduce(r2, self.block_length) 456 | } 457 | } 458 | 459 | /// Implements serialization and de-serialization logic 
for Xor8. This is still work 460 | /// in progress, refer to issue: 461 | /// in github. 462 | /// 463 | /// TODO: 464 | impl Xor8 465 | where H: Into> + From> + BuildHasher 466 | { 467 | /// File signature write on first 4 bytes of file. 468 | /// ^ stands for xor 469 | /// TL stands for filter 470 | /// 1 stands for version 1 471 | /// 2 stands for version 2 472 | const SIGNATURE_V1: [u8; 4] = [b'^', b'T', b'L', 1]; 473 | const SIGNATURE_V2: [u8; 4] = [b'^', b'T', b'L', 2]; 474 | 475 | /// METADATA_LENGTH is size that required to write size of all the 476 | /// metadata of the serialized filter. 477 | // signature length + seed length + block-length + 478 | // fingerprint length + hasher-builder length + fingerprint + hash-builder 479 | const METADATA_LENGTH: usize = 4 + 8 + 4 + 4 + 4; 480 | 481 | /// Write to file in binary format 482 | /// TODO Add chechsum of finger_prints into file headers 483 | pub fn write_file(&self, path: &ffi::OsStr) -> io::Result 484 | where H: Clone { 485 | let mut f = fs::File::create(path)?; 486 | let buf = self.to_bytes(); 487 | f.write_all(&buf)?; 488 | Ok(buf.len()) 489 | } 490 | 491 | /// Read from file in binary format 492 | pub fn read_file(path: &ffi::OsStr) -> io::Result 493 | where H: Default { 494 | let mut f = fs::File::open(path)?; 495 | let mut data = Vec::new(); 496 | f.read_to_end(&mut data)?; 497 | Self::from_bytes(data) 498 | } 499 | 500 | pub fn to_bytes(&self) -> Vec 501 | where H: Clone { 502 | let capacity = Self::METADATA_LENGTH + self.finger_prints.len(); 503 | let mut buf: Vec = Vec::with_capacity(capacity); 504 | buf.extend_from_slice(&Xor8::::SIGNATURE_V2); 505 | buf.extend_from_slice(&self.seed.to_be_bytes()); 506 | buf.extend_from_slice(&self.block_length.to_be_bytes()); 507 | buf.extend_from_slice(&(self.finger_prints.len() as u32).to_be_bytes()); 508 | 509 | let hb_binary: Vec = self.hash_builder.clone().into(); 510 | buf.extend_from_slice(&(hb_binary.len() as u32).to_be_bytes()); 511 | 512 | 
buf.extend_from_slice(&self.finger_prints); 513 | buf.extend_from_slice(&hb_binary); 514 | buf 515 | } 516 | 517 | pub fn from_bytes(buf: Vec) -> io::Result 518 | where H: Default { 519 | use std::io::Error; 520 | 521 | let mut n = 0; 522 | 523 | // validate the buf first. 524 | if Self::METADATA_LENGTH > buf.len() { 525 | return Err(Error::new(ErrorKind::InvalidData, "invalid byte slice")); 526 | } 527 | 528 | // check the signature 529 | if buf[n..4] == Xor8::::SIGNATURE_V1 { 530 | return Self::from_bytes_v1(buf); 531 | } else if buf[n..4] != Xor8::::SIGNATURE_V2 { 532 | return Err(Error::new( 533 | ErrorKind::InvalidData, 534 | "File signature incorrect", 535 | )); 536 | } 537 | 538 | n += 4; 539 | // fetch the seed 540 | let seed = u64::from_be_bytes(buf[n..n + 8].try_into().unwrap()); 541 | n += 8; 542 | // fetch block_length 543 | let block_length = u32::from_be_bytes(buf[n..n + 4].try_into().unwrap()); 544 | n += 4; 545 | // fetch fingerprint length 546 | let fp_len = u32::from_be_bytes(buf[n..n + 4].try_into().unwrap()) as usize; 547 | n += 4; 548 | // fetch hash-serizalized length 549 | let hb_len = u32::from_be_bytes(buf[n..n + 4].try_into().unwrap()) as usize; 550 | n += 4; 551 | 552 | if buf[n..].len() < (fp_len + hb_len) { 553 | return Err(Error::new(ErrorKind::InvalidData, "invalid byte slice")); 554 | } 555 | 556 | // fetch the finger print 557 | let finger_prints = buf[n..n + fp_len].to_vec(); 558 | n += fp_len; 559 | // fetch the hash_builder 560 | let hash_builder: H = buf[n..n + hb_len].to_vec().into(); 561 | 562 | Ok(Xor8 { 563 | keys: None, 564 | hash_builder, 565 | seed, 566 | block_length, 567 | finger_prints, 568 | }) 569 | } 570 | 571 | fn from_bytes_v1(buf: Vec) -> io::Result 572 | where H: Default { 573 | use std::io::Error; 574 | 575 | // validate the buf first. 
576 | if Self::METADATA_LENGTH > buf.len() { 577 | return Err(Error::new(ErrorKind::InvalidData, "invalid byte slice")); 578 | } 579 | if buf[..4] != Xor8::::SIGNATURE_V1 { 580 | return Err(Error::new( 581 | ErrorKind::InvalidData, 582 | "File signature incorrect", 583 | )); 584 | } 585 | let fp_len = u32::from_be_bytes(buf[16..20].try_into().unwrap()) as usize; 586 | if buf[20..].len() < fp_len { 587 | return Err(Error::new(ErrorKind::InvalidData, "invalid byte slice")); 588 | } 589 | Ok(Xor8 { 590 | keys: None, 591 | hash_builder: H::default(), 592 | seed: u64::from_be_bytes(buf[4..12].try_into().unwrap()), 593 | block_length: u32::from_be_bytes(buf[12..16].try_into().unwrap()), 594 | finger_prints: buf[20..].to_vec(), 595 | }) 596 | } 597 | } 598 | -------------------------------------------------------------------------------- /tests/tl1-serialized.data: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/prataprc/xorfilter/7b98e26057bc6f0911ecfbbfb8de20415bf63ddb/tests/tl1-serialized.data -------------------------------------------------------------------------------- /tests/xorfilter.rs: -------------------------------------------------------------------------------- 1 | use std::ffi; 2 | 3 | use rand::prelude::random; 4 | use rand::rngs::StdRng; 5 | use rand::Rng; 6 | use rand::SeedableRng; 7 | use xorfilter::xor8::Xor8; 8 | use xorfilter::xor8::Xor8Builder; 9 | use xorfilter::BuildHasherDefault; 10 | 11 | #[test] 12 | fn test_same_filter_encode_decode() { 13 | let seed: u64 = random(); 14 | println!("test_same_filter_encode_decode seed:{}", seed); 15 | 16 | let file_path = { 17 | let mut fpath = std::env::temp_dir(); 18 | fpath.push("xorfilter-test-same-filter-encode-decode"); 19 | fpath.into_os_string() 20 | }; 21 | let filter = generate_filter(seed); 22 | 23 | filter.write_file(&file_path).expect("fail write_file"); 24 | let filter_read = Xor8::read_file(&file_path).expect("fail read_file"); 
25 | assert!( 26 | filter_read == filter, 27 | "Filter unequals after encode and decode" 28 | ); 29 | 30 | let filter_second = generate_filter(seed + 1000); 31 | assert!( 32 | filter_read != filter_second, 33 | "Random generated filters should not be the same" 34 | ); 35 | } 36 | 37 | #[test] 38 | fn test_same_filter_bytes_encoding_tl1() { 39 | use std::path; 40 | 41 | let keys: Vec = (1..10000).map(|i| (i * 2) + 1).collect(); 42 | let missing: Vec = (1..20).map(|i| (i * 2)).collect(); 43 | 44 | let file_path = { 45 | let mut loc = path::PathBuf::new(); 46 | loc.push(path::Path::new(file!()).parent().unwrap().to_str().unwrap()); 47 | loc.push("tl1-serialized.data"); 48 | loc.into_os_string() 49 | }; 50 | 51 | // save_file(file_path.clone(), &keys); 52 | 53 | let filter = Xor8::::read_file(&file_path) 54 | .expect("Read from bytes failed"); 55 | 56 | for key in keys.iter() { 57 | assert!(filter.contains(key)) 58 | } 59 | 60 | for key in missing.iter() { 61 | assert!(!filter.contains(key)) 62 | } 63 | } 64 | 65 | #[test] 66 | fn test_same_filter_bytes_encoding_tl2() { 67 | let seed: u64 = random(); 68 | println!("test_same_filter_bytes_encoding_tl1 seed:{}", seed); 69 | 70 | let filter = generate_filter(seed); 71 | 72 | let buf = filter.to_bytes(); 73 | let filter_read = Xor8::from_bytes(buf).expect("Read from bytes failed"); 74 | assert!( 75 | filter_read == filter, 76 | "Filter unequals after encode and decode" 77 | ); 78 | 79 | let filter_second = generate_filter(seed + 1000); 80 | assert!( 81 | filter_read != filter_second, 82 | "Random generated filters should not be the same" 83 | ); 84 | } 85 | 86 | #[test] 87 | fn test_string_keys() { 88 | // Rust tips: https://ashleygwilliams.github.io/gotober-2018/#103 89 | let rust_tips = vec![ 90 | "don't rewrite your software in rust", 91 | "show up with code", 92 | "don't sell", 93 | "sell sell sell", 94 | "the hard part of programming is not programming", 95 | "the hard part of programming is programming", 96 | "be 
prepared for change", 97 | "be prepared for things to stay the same", 98 | "have a problem to solve", 99 | "learning curves are a blessing in disguise", 100 | ]; 101 | let hash_builder = BuildHasherDefault::default(); 102 | let mut builder = Xor8Builder::with_hasher(hash_builder); 103 | builder.populate(&rust_tips); 104 | let filter = builder.build().expect("build failed"); 105 | 106 | // Test all keys(rust_tips) 107 | for tip in rust_tips { 108 | assert!(filter.contains(tip)); 109 | } 110 | // Remove last one character 111 | assert!(!filter.contains("show up with cod")); 112 | // String not in keys(rust_tips) 113 | assert!(!filter.contains("No magic, just code")); 114 | } 115 | 116 | /// Generate a filter with random keys 117 | fn generate_filter(seed: u64) -> Xor8 { 118 | let mut rng = StdRng::seed_from_u64(seed); 119 | 120 | let testsize = 10000; 121 | let mut keys: Vec = Vec::with_capacity(testsize); 122 | keys.resize(testsize, u64::default()); 123 | for key in keys.iter_mut() { 124 | *key = rng.gen(); 125 | } 126 | 127 | let mut builder = Xor8Builder::::new(); 128 | builder.populate(&keys); 129 | builder.build().expect("build failed") 130 | } 131 | 132 | // hack to generate tl1 serialized Xor8 instance. 133 | #[allow(dead_code)] 134 | fn save_file(file_path: ffi::OsString, keys: &[u32]) { 135 | let mut builder = Xor8Builder::::new(); 136 | builder.populate(keys); 137 | let filter = builder.build().expect("build failed"); 138 | filter.write_file(&file_path).expect("error saving tl1 to file"); 139 | } 140 | --------------------------------------------------------------------------------