├── .gitattributes ├── .github └── workflows │ └── rust.yml ├── .gitignore ├── CHANGELOG.rst ├── Cargo.toml ├── LICENSE-APACHE ├── LICENSE-MIT ├── Readme.md ├── SECURITY.md ├── fuzz ├── .gitignore ├── Cargo.toml ├── README.md └── fuzz_targets │ ├── damerau_levenshtein.rs │ ├── indel.rs │ ├── jaro.rs │ ├── jaro_winkler.rs │ ├── lcs_seq.rs │ └── levenshtein.rs ├── rapidfuzz-benches ├── Cargo.toml ├── benches │ ├── bench_damerau_levenshtein.rs │ ├── bench_generic_levenshtein.rs │ ├── bench_indel.rs │ ├── bench_jaro.rs │ ├── bench_jaro_winkler.rs │ ├── bench_lcs_seq.rs │ ├── bench_levenshtein.rs │ └── bench_osa.rs └── results │ ├── damerau_levenshtein.svg │ ├── generic_levenshtein.svg │ ├── indel.svg │ ├── jaro.svg │ ├── jaro_winkler.svg │ ├── levenshtein.svg │ ├── longest_common_subsequence.svg │ └── osa.svg └── src ├── common.rs ├── details.rs ├── details ├── common.rs ├── distance.rs ├── growing_hashmap.rs ├── intrinsics.rs ├── matrix.rs └── pattern_match_vector.rs ├── distance.rs ├── distance ├── damerau_levenshtein.rs ├── example.rs ├── example │ └── ocr.rs ├── hamming.rs ├── indel.rs ├── jaro.rs ├── jaro_winkler.rs ├── lcs_seq.rs ├── levenshtein.rs ├── osa.rs ├── postfix.rs └── prefix.rs ├── fuzz.rs └── lib.rs /.gitattributes: -------------------------------------------------------------------------------- 1 | rapidfuzz-benches/results* linguist-vendored -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | env: 8 | CARGO_TERM_COLOR: always 9 | 10 | jobs: 11 | build: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Build 18 | run: cargo build --verbose 19 | - name: Run tests 20 | run: cargo test --verbose 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | debug/ 4 | target/ 5 | 6 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 7 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 8 | Cargo.lock 9 | 10 | # These are backup files generated by rustfmt 11 | **/*.rs.bk 12 | 13 | lib 14 | .vscode/ 15 | 16 | /.idea/ -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | Changelog 2 | --------- 3 | 4 | [0.5.0] - 2023-12-01 5 | ^^^^^^^^^^^^^^^^^^^^ 6 | Changed 7 | ~~~~~~~ 8 | * improve Args builder for hamming 9 | 10 | 11 | [0.4.0] - 2023-12-01 12 | ^^^^^^^^^^^^^^^^^^^^ 13 | Changed 14 | ~~~~~~~ 15 | * rewrite of function signatures to reduce boilerplate 16 | 17 | * return type is now automatically deduced, so no more unwrapping is needed 18 | when ``score_cutoff`` is not used 19 | * optional arguments are now passed in ``Args`` structs using the builder pattern to reduce the amount 20 | of extra arguments 21 | * extra ``*_with_args`` overload for a variant with arguments, while the default version accepts 22 | only two sequences 23 | 24 | The signatures are expected to largely stay this way for the foreseeable future.
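
A minimal sketch of the resulting call style, mirroring the examples in ``Readme.md`` (Levenshtein used for illustration):

```rust
use rapidfuzz::distance::levenshtein;

// Default version: only the two sequences, return type deduced directly
// (no unwrapping needed when no score_cutoff is involved).
assert_eq!(3, levenshtein::distance("kitten".chars(), "sitting".chars()));

// *_with_args overload: optional arguments go through the Args builder;
// with a score_cutoff the result becomes an Option.
assert_eq!(
    None,
    levenshtein::distance_with_args(
        "kitten".chars(),
        "sitting".chars(),
        &levenshtein::Args::default().score_cutoff(2),
    )
);
```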
25 | 26 | [0.3.2] - 2023-11-29 27 | ^^^^^^^^^^^^^^^^^^^^ 28 | Fixed 29 | ~~~~~ 30 | * fixed crash inside hashmap grow function leading to a crash in the 31 | Damerau-Levenshtein implementation 32 | * fixed incorrect flagging of similar characters in Jaro similarity 33 | * fixed wraparound in Longest Common Subsequence 34 | 35 | [0.3.1] - 2023-11-29 36 | ^^^^^^^^^^^^^^^^^^^^ 37 | Fixed 38 | ~~~~~ 39 | * fixed crash inside hashmap lookup function leading to a crash in the 40 | Damerau-Levenshtein implementation 41 | 42 | [0.3.0] - 2023-11-27 43 | ^^^^^^^^^^^^^^^^^^^^ 44 | Previous versions only existed for testing purposed years ago. This is a complete 45 | rewrite porting a subset of the features provided in the C++ implementation of 46 | rapidfuzz. The remaining features will be added in later releases. 47 | 48 | Added 49 | ~~~~~ 50 | * added implementations of the following string metrics: 51 | 52 | * Levenshtein distance 53 | * Damerau-Levenshtein distance 54 | * Hamming distance 55 | * Longest common subsequence 56 | * Indel distance 57 | * Optimal string alignment distance 58 | * Postfix similarity 59 | * Prefix similarity 60 | * Jaro similarity 61 | * Jaro-Winkler similarity 62 | 63 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | description="rapid fuzzy string matching library" 3 | name = "rapidfuzz" 4 | version = "0.5.0" 5 | authors = ["maxbachmann "] 6 | edition = "2021" 7 | readme = "Readme.md" 8 | license = "MIT" 9 | repository = "https://github.com/rapidfuzz/rapidfuzz-rs" 10 | documentation = "https://docs.rs/rapidfuzz/" 11 | keywords = ["string", "similarity", "Hamming", "Levenshtein", "Jaro"] 12 | exclude = [".*"] 13 | categories = ["text-processing"] 14 | 15 | [lib] 16 | name = "rapidfuzz" 17 | -------------------------------------------------------------------------------- /LICENSE-APACHE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 
29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any person obtaining 2 | a copy of this software and associated documentation files (the 3 | "Software"), to deal in the Software without restriction, including 4 | without limitation the rights to use, copy, modify, merge, publish, 5 | distribute, sublicense, and/or sell copies of the Software, and to 6 | permit persons to whom the Software is furnished to do so, subject to 7 | the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be 10 | included in all copies or substantial portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 13 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 14 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 15 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 16 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 17 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 18 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
19 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 |

2 | RapidFuzz 3 | 4 | Rapid fuzzy string matching in Rust using the Levenshtein Distance 5 | 6 | [badges: Continuous Integration, Gitter chat, Documentation, license] 24 | 25 | 26 | Description • 27 | Installation • 28 | Usage • 29 | License 30 |

31 | 32 | --- 33 | ## Description 34 | 35 | RapidFuzz is a general-purpose string matching library with implementations 36 | for Rust, C++ and Python. 37 | 38 | ### Key Features 39 | 40 | - **Diverse String Metrics**: Offers a variety of string metrics 41 | to suit different use cases. These range from the Levenshtein 42 | distance for edit-based comparisons to the Jaro-Winkler similarity for 43 | more nuanced similarity assessments. 44 | - **Optimized for Speed**: The library is designed with performance in mind. 45 | Each implementation is carefully tuned to ensure optimal performance, 46 | making it suitable for the analysis of large datasets. 47 | - **Easy to use**: The API is designed to be simple to use, while still giving 48 | the implementation room for optimization. 49 | 50 | ## Installation 51 | 52 | The installation is as simple as: 53 | ```console 54 | $ cargo add rapidfuzz 55 | ``` 56 | 57 | ## Usage 58 | 59 | The following examples show the usage with the Levenshtein distance. Other metrics 60 | can be found in the [fuzz](https://docs.rs/rapidfuzz/latest/rapidfuzz/fuzz/index.html) and [distance](https://docs.rs/rapidfuzz/latest/rapidfuzz/distance/index.html) modules. 61 | 62 | ```rust 63 | use rapidfuzz::distance::levenshtein; 64 | 65 | // Perform a simple comparison using the Levenshtein distance 66 | assert_eq!( 67 | 3, 68 | levenshtein::distance("kitten".chars(), "sitting".chars()) 69 | ); 70 | 71 | // If you are sure the input strings are ASCII only, it's usually faster to operate on bytes 72 | assert_eq!( 73 | 3, 74 | levenshtein::distance("kitten".bytes(), "sitting".bytes()) 75 | ); 76 | 77 | // You can provide a score_cutoff value to filter out strings with a distance worse than 78 | // the score_cutoff 79 | assert_eq!( 80 | None, 81 | levenshtein::distance_with_args( 82 | "kitten".chars(), 83 | "sitting".chars(), 84 | &levenshtein::Args::default().score_cutoff(2) 85 | ) 86 | ); 87 | 88 | // You can provide a score_hint to tell the implementation about the expected score. 89 | // This can be used to select a more performant implementation internally, but might cause 90 | // a slowdown in cases where the distance is actually worse than the score_hint 91 | assert_eq!( 92 | 3, 93 | levenshtein::distance_with_args( 94 | "kitten".chars(), 95 | "sitting".chars(), 96 | &levenshtein::Args::default().score_hint(2) 97 | ) 98 | ); 99 | 100 | // When comparing a single string to multiple strings you can use the 101 | // provided `BatchComparators`. These can cache part of the calculation, 102 | // which can provide significant speedups 103 | let scorer = levenshtein::BatchComparator::new("kitten".chars()); 104 | assert_eq!(3, scorer.distance("sitting".chars())); 105 | assert_eq!(0, scorer.distance("kitten".chars())); 106 | ``` 107 | 108 | 109 | ## License 110 | Licensed under either of [Apache License, Version 111 | 2.0](https://github.com/rapidfuzz/rapidfuzz-rs/blob/main/LICENSE-APACHE) or [MIT License](https://github.com/rapidfuzz/rapidfuzz-rs/blob/main/LICENSE-MIT) at your option. 112 | 113 | Unless you explicitly state otherwise, any contribution intentionally submitted 114 | for inclusion in RapidFuzz by you, as defined in the Apache-2.0 license, shall be 115 | dual licensed as above, without any additional terms or conditions.
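
Beyond the examples above, the Levenshtein implementation also supports custom operation weights via ``levenshtein::WeightTable``, as exercised in `rapidfuzz-benches/benches/bench_generic_levenshtein.rs`. A minimal sketch (the weights are arbitrary illustration values):

```rust
use rapidfuzz::distance::levenshtein;

// Custom costs per edit operation through the Args builder; with all
// weights set to 1 this is equivalent to the regular Levenshtein distance.
let args = levenshtein::Args::default().weights(&levenshtein::WeightTable {
    insertion_cost: 1,
    deletion_cost: 1,
    substitution_cost: 2,
});

// "kitten" -> "sitting": 2 substitutions (cost 2 each) + 1 insertion (cost 1) = 5
assert_eq!(
    5,
    levenshtein::distance_with_args("kitten".chars(), "sitting".chars(), &args)
);
```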
116 | 117 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | ## Reporting Security Issues 2 | 3 | If you believe you have found a security vulnerability in the project, please report it to us through coordinated disclosure. 4 | 5 | **Please do not report security vulnerabilities through public GitHub issues, discussions, or pull requests.** 6 | 7 | Instead, please send an email to oss@maxbachmann.de. 8 | 9 | Please include as much of the information listed below as you can to help us better understand and resolve the issue: 10 | 11 | * The type of issue (e.g., buffer overflow, SQL injection, or cross-site scripting) 12 | * Full paths of source file(s) related to the manifestation of the issue 13 | * The location of the affected source code (tag/branch/commit or direct URL) 14 | * Any special configuration required to reproduce the issue 15 | * Step-by-step instructions to reproduce the issue 16 | * Proof-of-concept or exploit code (if possible) 17 | * Impact of the issue, including how an attacker might exploit the issue 18 | 19 | This information will help us triage your report more quickly. 20 | -------------------------------------------------------------------------------- /fuzz/.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | corpus 3 | artifacts 4 | coverage -------------------------------------------------------------------------------- /fuzz/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rapidfuzz_fuzz" 3 | version = "0.0.0" 4 | publish = false 5 | edition = "2021" 6 | 7 | [package.metadata] 8 | cargo-fuzz = true 9 | 10 | [dependencies] 11 | libfuzzer-sys = "0.4.6" 12 | arbitrary = { version = "1.3.0", features = ["derive"] } 13 | 14 | rapidfuzz = { path = "../" } 15 | cargo-fuzz = "0.11.2" 16 | 17 | [profile.release] 18 | debug = 1 19 | 20 | [[bin]] 21 | name = "damerau-levenshtein" 22 | path = "fuzz_targets/damerau_levenshtein.rs" 23 | test = false 24 | doc = false 25 | 26 | [[bin]] 27 | name = "levenshtein" 28 | path = "fuzz_targets/levenshtein.rs" 29 | test = false 30 | doc = false 31 | 32 | [[bin]] 33 | name = "indel" 34 | path = "fuzz_targets/indel.rs" 35 | test = false 36 | doc = false 37 | 38 | [[bin]] 39 | name = "lcs_seq" 40 | path = "fuzz_targets/lcs_seq.rs" 41 | test = false 42 | doc = false 43 | 44 | [[bin]] 45 | name = "jaro" 46 | path = "fuzz_targets/jaro.rs" 47 | test = false 48 | doc = false 49 | 50 | [[bin]] 51 | name = "jaro-winkler" 52 | path = "fuzz_targets/jaro_winkler.rs" 53 | test = false 54 | doc = false -------------------------------------------------------------------------------- /fuzz/README.md: -------------------------------------------------------------------------------- 1 | # rapidfuzz-fuzz 2 | 3 | This directory contains fuzzers which can be used to automatically identify faults present in RapidFuzz. All the fuzzers in 4 | this directory are [grammar-aware](https://www.fuzzingbook.org/html/Grammars.html) (based on 5 | [Arbitrary](https://docs.rs/arbitrary/latest/arbitrary/)) and coverage-guided. 6 | 7 | You can run any fuzzer you wish with the following command (replacing `your-fuzzer` with a fuzzer available in 8 | fuzz_targets, e.g. 
`damerau-levenshtein`): 9 | 10 | ```bash 11 | cargo fuzz run -s none your-fuzzer 12 | ``` 13 | 14 | Note that you may wish to use a different sanitizer option (`-s`) according to what kind of issue you're looking for. 15 | Refer to the [cargo-fuzz book](https://rust-fuzz.github.io/book/cargo-fuzz.html) for details on how to select a 16 | sanitizer and other flags. 17 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/damerau_levenshtein.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use arbitrary::Arbitrary; 4 | use libfuzzer_sys::fuzz_target; 5 | use rapidfuzz::distance::damerau_levenshtein; 6 | 7 | #[derive(Arbitrary, Debug)] 8 | pub struct Texts { 9 | pub s1: String, 10 | pub s2: String, 11 | } 12 | 13 | fn fuzz(texts: Texts) { 14 | damerau_levenshtein::distance(texts.s1.chars(), texts.s2.chars()); 15 | 16 | damerau_levenshtein::BatchComparator::new(texts.s1.chars()).distance(texts.s2.chars()); 17 | } 18 | 19 | fuzz_target!(|texts: Texts| { 20 | fuzz(texts); 21 | }); 22 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/indel.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use arbitrary::Arbitrary; 4 | use libfuzzer_sys::fuzz_target; 5 | use rapidfuzz::distance::indel; 6 | 7 | #[derive(Arbitrary, Debug)] 8 | pub struct Texts { 9 | pub s1: String, 10 | pub s2: String, 11 | } 12 | 13 | fn fuzz(texts: Texts) { 14 | indel::distance(texts.s1.chars(), texts.s2.chars()); 15 | 16 | indel::BatchComparator::new(texts.s1.chars()).distance(texts.s2.chars()); 17 | } 18 | 19 | fuzz_target!(|texts: Texts| { 20 | fuzz(texts); 21 | }); 22 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/jaro.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use arbitrary::Arbitrary; 4 | use libfuzzer_sys::fuzz_target; 5 | use rapidfuzz::distance::jaro; 6 | 7 | #[derive(Arbitrary, Debug)] 8 | pub struct Texts { 9 | pub s1: String, 10 | pub s2: String, 11 | } 12 | 13 | fn fuzz(texts: Texts) { 14 | jaro::distance(texts.s1.chars(), texts.s2.chars()); 15 | 16 | jaro::BatchComparator::new(texts.s1.chars()).distance(texts.s2.chars()); 17 | } 18 | 19 | fuzz_target!(|texts: Texts| { 20 | fuzz(texts); 21 | }); 22 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/jaro_winkler.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use arbitrary::Arbitrary; 4 | use libfuzzer_sys::fuzz_target; 5 | use rapidfuzz::distance::jaro_winkler; 6 | 7 | #[derive(Arbitrary, Debug)] 8 | pub struct Texts { 9 | pub s1: String, 10 | pub s2: String, 11 | } 12 | 13 | fn fuzz(texts: Texts) { 14 | jaro_winkler::distance(texts.s1.chars(), texts.s2.chars()); 15 | 16 | jaro_winkler::BatchComparator::new(texts.s1.chars()).distance(texts.s2.chars()); 17 | } 18 | 19 | fuzz_target!(|texts: Texts| { 20 | fuzz(texts); 21 | }); 22 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/lcs_seq.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use arbitrary::Arbitrary; 4 | use libfuzzer_sys::fuzz_target; 5 | use rapidfuzz::distance::lcs_seq; 6 | 7 | #[derive(Arbitrary, Debug)] 8 | pub struct Texts { 9 | pub s1: String, 10 | 
pub s2: String, 11 | } 12 | 13 | fn fuzz(texts: Texts) { 14 | lcs_seq::distance(texts.s1.chars(), texts.s2.chars()); 15 | 16 | lcs_seq::BatchComparator::new(texts.s1.chars()).distance(texts.s2.chars()); 17 | } 18 | 19 | fuzz_target!(|texts: Texts| { 20 | fuzz(texts); 21 | }); 22 | -------------------------------------------------------------------------------- /fuzz/fuzz_targets/levenshtein.rs: -------------------------------------------------------------------------------- 1 | #![no_main] 2 | 3 | use arbitrary::Arbitrary; 4 | use libfuzzer_sys::fuzz_target; 5 | use rapidfuzz::distance::levenshtein; 6 | 7 | #[derive(Arbitrary, Debug)] 8 | pub struct Texts { 9 | pub s1: String, 10 | pub s2: String, 11 | } 12 | 13 | fn fuzz(texts: Texts) { 14 | levenshtein::distance(texts.s1.chars(), texts.s2.chars()); 15 | 16 | levenshtein::BatchComparator::new(texts.s1.chars()).distance(texts.s2.chars()); 17 | } 18 | 19 | fuzz_target!(|texts: Texts| { 20 | fuzz(texts); 21 | }); 22 | -------------------------------------------------------------------------------- /rapidfuzz-benches/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rapidfuzz-benches" 3 | version = "0.1.0" 4 | description = "In-tree benchmarks for the RapidFuzz project" 5 | authors = ["maxbachmann "] 6 | edition = "2021" 7 | publish = false 8 | 9 | [dependencies] 10 | rapidfuzz = { path = "../" } 11 | 12 | [dev-dependencies] 13 | criterion = { version = "0.5.1", features = ["html_reports"] } 14 | rand = "0.8.5" 15 | strsim = "0.10.0" 16 | 17 | [[bench]] 18 | name = "bench_generic_levenshtein" 19 | harness = false 20 | 21 | [[bench]] 22 | name = "bench_jaro_winkler" 23 | harness = false 24 | 25 | [[bench]] 26 | name = "bench_jaro" 27 | harness = false 28 | 29 | [[bench]] 30 | name = "bench_levenshtein" 31 | harness = false 32 | 33 | [[bench]] 34 | name = "bench_osa" 35 | harness = false 36 | 37 | [[bench]] 38 | name = "bench_damerau_levenshtein" 39 | harness = false 40 | 41 | [[bench]] 42 | name = "bench_lcs_seq" 43 | harness = false 44 | 45 | [[bench]] 46 | name = "bench_indel" 47 | harness = false 48 | 49 | -------------------------------------------------------------------------------- /rapidfuzz-benches/benches/bench_damerau_levenshtein.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; 2 | use rand::{distributions::Alphanumeric, Rng}; 3 | 4 | use rapidfuzz::distance; 5 | 6 | use std::str::Bytes; 7 | 8 | fn generate(len: usize) -> String { 9 | rand::thread_rng() 10 | .sample_iter(&Alphanumeric) 11 | .take(len) 12 | .map(char::from) 13 | .collect() 14 | } 15 | 16 | struct StringWrapper<'a>(&'a str); 17 | 18 | impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> { 19 | type Item = u8; 20 | type IntoIter = Bytes<'b>; 21 | 22 | fn into_iter(self) -> Self::IntoIter { 23 | self.0.bytes() 24 | } 25 | } 26 | 27 | fn benchmark(c: &mut Criterion) { 28 | let mut group = c.benchmark_group("Damerau Levenshtein"); 29 | 30 | for i in (2..128).step_by(2) { 31 | let s1 = generate(i); 32 | let s2 = generate(i); 33 | 34 | group.bench_with_input(BenchmarkId::new("rapidfuzz", i), &(&s1, &s2), |b, val| { 35 | b.iter(|| { 36 | black_box(distance::damerau_levenshtein::distance( 37 | val.0.bytes(), 38 | val.1.bytes(), 39 | )); 40 | }) 41 | }); 42 | let (x, y): (Vec<_>, Vec<_>) = (s1.bytes().collect(), s2.bytes().collect()); 43 | 
group.bench_with_input(BenchmarkId::new("strsim", i), &(&x, &y), |b, val| { 44 | b.iter(|| { 45 | black_box(strsim::generic_damerau_levenshtein( 46 | val.0.as_slice(), 47 | val.1.as_slice(), 48 | )); 49 | }) 50 | }); 51 | } 52 | 53 | group.finish(); 54 | } 55 | 56 | criterion_group!(benches, benchmark); 57 | criterion_main!(benches); 58 | -------------------------------------------------------------------------------- /rapidfuzz-benches/benches/bench_generic_levenshtein.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; 2 | use rand::{distributions::Alphanumeric, Rng}; 3 | 4 | use rapidfuzz::distance; 5 | 6 | use std::str::Bytes; 7 | 8 | fn generate(len: usize) -> String { 9 | rand::thread_rng() 10 | .sample_iter(&Alphanumeric) 11 | .take(len) 12 | .map(char::from) 13 | .collect() 14 | } 15 | 16 | struct StringWrapper<'a>(&'a str); 17 | 18 | impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> { 19 | type Item = u8; 20 | type IntoIter = Bytes<'b>; 21 | 22 | fn into_iter(self) -> Self::IntoIter { 23 | self.0.bytes() 24 | } 25 | } 26 | 27 | fn benchmark(c: &mut Criterion) { 28 | let mut group = c.benchmark_group("Generic Levenshtein"); 29 | 30 | let lens = (2..128).step_by(2); 31 | for i in lens { 32 | let s1 = generate(i); 33 | let s2 = generate(i); 34 | let args = 35 | distance::levenshtein::Args::default().weights(&distance::levenshtein::WeightTable { 36 | insertion_cost: 1, 37 | deletion_cost: 2, 38 | substitution_cost: 3, 39 | }); 40 | 41 | group.bench_with_input(BenchmarkId::new("rapidfuzz", i), &(&s1, &s2), |b, val| { 42 | b.iter(|| { 43 | black_box(distance::levenshtein::distance_with_args( 44 | val.0.bytes(), 45 | val.1.bytes(), 46 | &args, 47 | )); 48 | }) 49 | }); 50 | } 51 | 52 | group.finish(); 53 | } 54 | 55 | criterion_group!(benches, benchmark); 56 | criterion_main!(benches); 57 | -------------------------------------------------------------------------------- /rapidfuzz-benches/benches/bench_indel.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; 2 | use rand::{distributions::Alphanumeric, Rng}; 3 | 4 | use rapidfuzz::distance; 5 | 6 | use std::str::Bytes; 7 | 8 | fn generate(len: usize) -> String { 9 | rand::thread_rng() 10 | .sample_iter(&Alphanumeric) 11 | .take(len) 12 | .map(char::from) 13 | .collect() 14 | } 15 | 16 | struct StringWrapper<'a>(&'a str); 17 | 18 | impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> { 19 | type Item = u8; 20 | type IntoIter = Bytes<'b>; 21 | 22 | fn into_iter(self) -> Self::IntoIter { 23 | self.0.bytes() 24 | } 25 | } 26 | 27 | fn benchmark(c: &mut Criterion) { 28 | let mut group = c.benchmark_group("Indel"); 29 | 30 | for i in (2..128).step_by(2) { 31 | let s1 = generate(i); 32 | let s2 = generate(i); 33 | 34 | group.bench_with_input(BenchmarkId::new("rapidfuzz", i), &(&s1, &s2), |b, val| { 35 | b.iter(|| { 36 | black_box(distance::indel::distance(val.0.bytes(), val.1.bytes())); 37 | }) 38 | }); 39 | 40 | let cached = distance::indel::BatchComparator::new(s1.bytes()); 41 | group.bench_with_input( 42 | BenchmarkId::new("rapidfuzz (BatchComparator)", i), 43 | &(&cached, &s2), 44 | |b, val| { 45 | b.iter(|| { 46 | black_box(cached.distance(val.1.bytes())); 47 | }) 48 | }, 49 | ); 50 | } 51 | 52 | group.finish(); 53 | } 54 | 55 | criterion_group!(benches, benchmark); 56 | criterion_main!(benches); 57 
| -------------------------------------------------------------------------------- /rapidfuzz-benches/benches/bench_jaro.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; 2 | use rand::{distributions::Alphanumeric, Rng}; 3 | 4 | use rapidfuzz::distance; 5 | 6 | use std::str::Bytes; 7 | 8 | fn generate(len: usize) -> String { 9 | rand::thread_rng() 10 | .sample_iter(&Alphanumeric) 11 | .take(len) 12 | .map(char::from) 13 | .collect() 14 | } 15 | 16 | struct StringWrapper<'a>(&'a str); 17 | 18 | impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> { 19 | type Item = u8; 20 | type IntoIter = Bytes<'b>; 21 | 22 | fn into_iter(self) -> Self::IntoIter { 23 | self.0.bytes() 24 | } 25 | } 26 | 27 | fn benchmark(c: &mut Criterion) { 28 | let mut group = c.benchmark_group("Jaro"); 29 | 30 | for i in (2..128).step_by(2) { 31 | let s1 = generate(i); 32 | let s2 = generate(i); 33 | 34 | group.bench_with_input(BenchmarkId::new("rapidfuzz", i), &(&s1, &s2), |b, val| { 35 | b.iter(|| { 36 | black_box(distance::jaro::similarity(val.0.bytes(), val.1.bytes())); 37 | }) 38 | }); 39 | 40 | let cached = distance::jaro::BatchComparator::new(s1.bytes()); 41 | group.bench_with_input( 42 | BenchmarkId::new("rapidfuzz (BatchComparator)", i), 43 | &(&cached, &s2), 44 | |b, val| { 45 | b.iter(|| { 46 | black_box(cached.similarity(val.1.bytes())); 47 | }) 48 | }, 49 | ); 50 | 51 | group.bench_with_input(BenchmarkId::new("strsim", i), &(&s1, &s2), |b, val| { 52 | b.iter(|| { 53 | black_box(strsim::generic_jaro( 54 | &StringWrapper(val.0), 55 | &StringWrapper(val.1), 56 | )); 57 | }) 58 | }); 59 | } 60 | 61 | group.finish(); 62 | } 63 | 64 | criterion_group!(benches, benchmark); 65 | criterion_main!(benches); 66 | -------------------------------------------------------------------------------- /rapidfuzz-benches/benches/bench_jaro_winkler.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; 2 | use rand::{distributions::Alphanumeric, Rng}; 3 | 4 | use rapidfuzz::distance; 5 | 6 | use std::str::Bytes; 7 | 8 | fn generate(len: usize) -> String { 9 | rand::thread_rng() 10 | .sample_iter(&Alphanumeric) 11 | .take(len) 12 | .map(char::from) 13 | .collect() 14 | } 15 | 16 | struct StringWrapper<'a>(&'a str); 17 | 18 | impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> { 19 | type Item = u8; 20 | type IntoIter = Bytes<'b>; 21 | 22 | fn into_iter(self) -> Self::IntoIter { 23 | self.0.bytes() 24 | } 25 | } 26 | 27 | fn benchmark(c: &mut Criterion) { 28 | let mut group = c.benchmark_group("JaroWinkler"); 29 | 30 | for i in (2..128).step_by(2) { 31 | let s1 = generate(i); 32 | let s2 = generate(i); 33 | 34 | group.bench_with_input(BenchmarkId::new("rapidfuzz", i), &(&s1, &s2), |b, val| { 35 | b.iter(|| { 36 | black_box(distance::jaro_winkler::similarity( 37 | val.0.bytes(), 38 | val.1.bytes(), 39 | )); 40 | }) 41 | }); 42 | 43 | let cached = distance::jaro_winkler::BatchComparator::new(s1.bytes()); 44 | group.bench_with_input( 45 | BenchmarkId::new("rapidfuzz (BatchComparator)", i), 46 | &(&cached, &s2), 47 | |b, val| { 48 | b.iter(|| { 49 | black_box(cached.similarity(val.1.bytes())); 50 | }) 51 | }, 52 | ); 53 | 54 | group.bench_with_input(BenchmarkId::new("strsim", i), &(&s1, &s2), |b, val| { 55 | b.iter(|| { 56 | black_box(strsim::generic_jaro_winkler( 57 | &StringWrapper(val.0), 58 | 
&StringWrapper(val.1), 59 | )); 60 | }) 61 | }); 62 | } 63 | 64 | group.finish(); 65 | } 66 | 67 | criterion_group!(benches, benchmark); 68 | criterion_main!(benches); 69 | -------------------------------------------------------------------------------- /rapidfuzz-benches/benches/bench_lcs_seq.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; 2 | use rand::{distributions::Alphanumeric, Rng}; 3 | 4 | use rapidfuzz::distance; 5 | 6 | use std::str::Bytes; 7 | 8 | fn generate(len: usize) -> String { 9 | rand::thread_rng() 10 | .sample_iter(&Alphanumeric) 11 | .take(len) 12 | .map(char::from) 13 | .collect() 14 | } 15 | 16 | struct StringWrapper<'a>(&'a str); 17 | 18 | impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> { 19 | type Item = u8; 20 | type IntoIter = Bytes<'b>; 21 | 22 | fn into_iter(self) -> Self::IntoIter { 23 | self.0.bytes() 24 | } 25 | } 26 | 27 | fn benchmark(c: &mut Criterion) { 28 | let mut group = c.benchmark_group("Longest Common Subsequence"); 29 | 30 | for i in (2..128).step_by(2) { 31 | let s1 = generate(i); 32 | let s2 = generate(i); 33 | 34 | group.bench_with_input(BenchmarkId::new("rapidfuzz", i), &(&s1, &s2), |b, val| { 35 | b.iter(|| { 36 | black_box(distance::lcs_seq::similarity(val.0.bytes(), val.1.bytes())); 37 | }) 38 | }); 39 | 40 | let cached = distance::lcs_seq::BatchComparator::new(s1.bytes()); 41 | group.bench_with_input( 42 | BenchmarkId::new("rapidfuzz (BatchComparator)", i), 43 | &(&cached, &s2), 44 | |b, val| { 45 | b.iter(|| { 46 | black_box(cached.similarity(val.1.bytes())); 47 | }) 48 | }, 49 | ); 50 | } 51 | 52 | group.finish(); 53 | } 54 | 55 | criterion_group!(benches, benchmark); 56 | criterion_main!(benches); 57 | -------------------------------------------------------------------------------- /rapidfuzz-benches/benches/bench_levenshtein.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; 2 | use rand::{distributions::Alphanumeric, Rng}; 3 | 4 | use rapidfuzz::distance; 5 | 6 | use std::str::Bytes; 7 | 8 | fn generate(len: usize) -> String { 9 | rand::thread_rng() 10 | .sample_iter(&Alphanumeric) 11 | .take(len) 12 | .map(char::from) 13 | .collect() 14 | } 15 | 16 | struct StringWrapper<'a>(&'a str); 17 | 18 | impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> { 19 | type Item = u8; 20 | type IntoIter = Bytes<'b>; 21 | 22 | fn into_iter(self) -> Self::IntoIter { 23 | self.0.bytes() 24 | } 25 | } 26 | 27 | fn benchmark(c: &mut Criterion) { 28 | let mut group = c.benchmark_group("Levenshtein"); 29 | 30 | for i in (2..128).step_by(2) { 31 | let s1 = generate(i); 32 | let s2 = generate(i); 33 | 34 | group.bench_with_input(BenchmarkId::new("rapidfuzz", i), &(&s1, &s2), |b, val| { 35 | b.iter(|| { 36 | black_box(distance::levenshtein::distance( 37 | val.0.bytes(), 38 | val.1.bytes(), 39 | )); 40 | }) 41 | }); 42 | group.bench_with_input(BenchmarkId::new("strsim", i), &(&s1, &s2), |b, val| { 43 | b.iter(|| { 44 | black_box(strsim::generic_levenshtein( 45 | &StringWrapper(val.0), 46 | &StringWrapper(val.1), 47 | )); 48 | }) 49 | }); 50 | 51 | let cached = distance::levenshtein::BatchComparator::new(s1.bytes()); 52 | group.bench_with_input( 53 | BenchmarkId::new("rapidfuzz (BatchComparator)", i), 54 | &(&cached, &s2), 55 | |b, val| { 56 | b.iter(|| { 57 | black_box(cached.distance(val.1.bytes())); 58 | }) 59 | 
}, 60 | ); 61 | } 62 | 63 | group.finish(); 64 | } 65 | 66 | criterion_group!(benches, benchmark); 67 | criterion_main!(benches); 68 | -------------------------------------------------------------------------------- /rapidfuzz-benches/benches/bench_osa.rs: -------------------------------------------------------------------------------- 1 | use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion}; 2 | use rand::{distributions::Alphanumeric, Rng}; 3 | 4 | use rapidfuzz::distance; 5 | 6 | use std::str::Bytes; 7 | 8 | fn generate(len: usize) -> String { 9 | rand::thread_rng() 10 | .sample_iter(&Alphanumeric) 11 | .take(len) 12 | .map(char::from) 13 | .collect() 14 | } 15 | 16 | struct StringWrapper<'a>(&'a str); 17 | 18 | impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> { 19 | type Item = u8; 20 | type IntoIter = Bytes<'b>; 21 | 22 | fn into_iter(self) -> Self::IntoIter { 23 | self.0.bytes() 24 | } 25 | } 26 | 27 | fn benchmark(c: &mut Criterion) { 28 | let mut group = c.benchmark_group("OSA"); 29 | 30 | for i in (2..128).step_by(2) { 31 | let s1 = generate(i); 32 | let s2 = generate(i); 33 | 34 | group.bench_with_input(BenchmarkId::new("rapidfuzz", i), &(&s1, &s2), |b, val| { 35 | b.iter(|| { 36 | black_box(distance::osa::distance(val.0.chars(), val.1.chars())); 37 | }) 38 | }); 39 | 40 | let cached = distance::osa::BatchComparator::new(s1.chars()); 41 | group.bench_with_input( 42 | BenchmarkId::new("rapidfuzz (BatchComparator)", i), 43 | &(&cached, &s2), 44 | |b, val| { 45 | b.iter(|| { 46 | black_box(cached.distance(val.1.chars())); 47 | }) 48 | }, 49 | ); 50 | 51 | group.bench_with_input(BenchmarkId::new("strsim", i), &(&s1, &s2), |b, val| { 52 | b.iter(|| { 53 | black_box(strsim::osa_distance(val.0, val.1)); 54 | }) 55 | }); 56 | } 57 | 58 | group.finish(); 59 | } 60 | 61 | criterion_group!(benches, benchmark); 62 | criterion_main!(benches); 63 | -------------------------------------------------------------------------------- /rapidfuzz-benches/results/damerau_levenshtein.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Damerau Levenshtein: Comparison 5 | 6 | 7 | Average time (µs) 8 | 9 | 10 | Input 11 | 12 | 13 | 14 | 20.0 15 | 16 | 17 | 18 | 40.0 19 | 20 | 21 | 22 | 60.0 23 | 24 | 25 | 26 | 80.0 27 | 28 | 29 | 30 | 100.0 31 | 32 | 33 | 34 | 120.0 35 | 36 | 37 | 38 | 140.0 39 | 40 | 41 | 42 | 160.0 43 | 44 | 45 | 46 | 180.0 47 | 48 | 49 | 50 | 200.0 51 | 52 | 53 | 54 | 55 | 20.0 56 | 57 | 58 | 59 | 40.0 60 | 61 | 62 | 63 | 60.0 64 | 65 | 66 | 67 | 80.0 68 | 69 | 70 | 71 | 100.0 72 | 73 | 74 | 75 | 120.0 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | rapidfuzz 208 | 209 | 210 | strsim 211 | 212 | 213 | 214 | 215 | -------------------------------------------------------------------------------- 
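
The SVG files under `rapidfuzz-benches/results/` (this one and the ones that follow) are comparison plots produced from the Criterion benchmarks above. Assuming the standard Criterion workflow (the exact SVG export step is not part of this tree), the underlying measurements can be regenerated with:

```console
$ cd rapidfuzz-benches
$ cargo bench
```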
/rapidfuzz-benches/results/generic_levenshtein.svg: -------------------------------------------------------------------------------- [benchmark plot "Generic Levenshtein: Comparison", average time (µs) vs. input length; series: rapidfuzz] -------------------------------------------------------------------------------- /rapidfuzz-benches/results/indel.svg: -------------------------------------------------------------------------------- [benchmark plot "Indel: Comparison", average time (ns) vs. input length; series: rapidfuzz, rapidfuzz (BatchComparator)] -------------------------------------------------------------------------------- /rapidfuzz-benches/results/longest_common_subsequence.svg: -------------------------------------------------------------------------------- [benchmark plot "Longest Common Subsequence: Comparison", average time (ns) vs. input length; series: rapidfuzz, rapidfuzz (BatchComparator)] -------------------------------------------------------------------------------- /src/common.rs: -------------------------------------------------------------------------------- 1 | use std::fmt::Debug; 2 | 3 | #[derive(Default, Copy, Clone)] 4 | pub struct NoScoreCutoff; 5 | #[derive(Default, Copy, Clone)] 6 | pub struct WithScoreCutoff<T>(pub T); 7 | 8 | pub trait DistanceCutoff<T> 9 | where 10 | T: Copy, 11 | { 12 | type Output: Copy + Into<Option<T>> + PartialEq + Debug; 13 | 14 | fn cutoff(&self) -> Option<T>; 15 | fn score(&self, raw: T) -> Self::Output; 16 | } 17 | 18 | impl<T> DistanceCutoff<T> for NoScoreCutoff 19 | where 20 | T: Copy + PartialEq + Debug, 21 | { 22 | type Output = T; 23 | 24 | fn cutoff(&self) -> Option<T> { 25 | None 26 | } 27 | 28 | fn score(&self, raw: T) -> Self::Output { 29 | raw 30 | } 31 | } 32 | 33 | impl<T> DistanceCutoff<T> for WithScoreCutoff<T> 34 | where 35 | T: Copy + PartialOrd + Debug, 36 | { 37 | type Output = Option<T>; 38 | 39 | fn cutoff(&self) -> Option<T> { 40 | Some(self.0) 41 | } 42 | 43 | fn score(&self, raw: T) -> Self::Output { 44 | (raw <= self.0).then_some(raw) 45 | } 46 | } 47 | 48 | pub trait SimilarityCutoff<T> 49 | where 50 | T: Copy, 51 | { 52 | type Output: Copy + Into<Option<T>> + PartialEq + Debug; 53 | 54 | fn cutoff(&self) -> Option<T>; 55 | fn score(&self, raw: T) -> Self::Output; 56 | } 57 | 58 | impl<T> SimilarityCutoff<T> for NoScoreCutoff 59 | where 60 | T: Copy + PartialEq + Debug, 61 | { 62 | type Output = T; 63 | 64 | fn cutoff(&self) -> Option<T> { 65 | None 66 | } 67 | 68 | fn score(&self, raw: T) -> Self::Output { 69 | raw 70 | } 71 | } 72 | 73 | impl<T> SimilarityCutoff<T> for WithScoreCutoff<T> 74 | where 75 | T: Copy + PartialOrd + Debug, 76 | { 77 | type Output = Option<T>; 78 | 79 | fn cutoff(&self) -> Option<T> { 80 | Some(self.0) 81 | } 82 | 83 | fn score(&self, raw: T) -> Self::Output { 84 | (raw >= self.0).then_some(raw) 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/details.rs: -------------------------------------------------------------------------------- 1 | pub mod common; 2 | pub mod distance; 3 | pub mod growing_hashmap; 4 | pub mod intrinsics; 5 | pub mod matrix; 6 | pub mod pattern_match_vector; 7 | -------------------------------------------------------------------------------- /src/details/common.rs: -------------------------------------------------------------------------------- 1 | use crate::{Hash, HashableChar}; 2 | use std::iter::{Skip, Take}; 3 | 4 | pub fn norm_sim_to_norm_dist(score_cutoff: f64) -> f64 { 5 | let imprecision = 0.00001; 6 | (1.0 - score_cutoff + imprecision).min(1.0) 7 | } 8 | 9 | macro_rules! 
impl_hashable_char { 10 | ($base_type:ty, $kind:tt $(, $t:ty)*) => { 11 | impl HashableChar for $base_type { 12 | #[inline] 13 | fn hash_char(&self) -> Hash 14 | { 15 | Hash::$kind(*self $(as $t)*) 16 | } 17 | } 18 | 19 | impl HashableChar for &$base_type { 20 | #[inline] 21 | fn hash_char(&self) -> Hash 22 | { 23 | Hash::$kind(**self $(as $t)*) 24 | } 25 | } 26 | } 27 | } 28 | 29 | impl_hashable_char!(char, UNSIGNED, u32, u64); 30 | impl_hashable_char!(i8, SIGNED, i64); 31 | impl_hashable_char!(i16, SIGNED, i64); 32 | impl_hashable_char!(i32, SIGNED, i64); 33 | impl_hashable_char!(i64, SIGNED, i64); 34 | impl_hashable_char!(u8, UNSIGNED, u64); 35 | impl_hashable_char!(u16, UNSIGNED, u64); 36 | impl_hashable_char!(u32, UNSIGNED, u64); 37 | impl_hashable_char!(u64, UNSIGNED, u64); 38 | 39 | pub fn find_common_prefix(s1: Iter1, s2: Iter2) -> usize 40 | where 41 | Iter1: Iterator + Clone, 42 | Iter2: Iterator + Clone, 43 | Iter1::Item: PartialEq, 44 | Iter2::Item: PartialEq, 45 | { 46 | s1.zip(s2) 47 | .take_while(|(a_char, b_char)| a_char == b_char) 48 | .count() 49 | } 50 | 51 | pub fn find_common_suffix(s1: Iter1, s2: Iter2) -> usize 52 | where 53 | Iter1: DoubleEndedIterator + Clone, 54 | Iter2: DoubleEndedIterator + Clone, 55 | Iter1::Item: PartialEq, 56 | Iter2::Item: PartialEq, 57 | { 58 | s1.rev() 59 | .zip(s2.rev()) 60 | .take_while(|(a_char, b_char)| a_char == b_char) 61 | .count() 62 | } 63 | 64 | pub struct RemovedAffix 65 | where 66 | Iter1: DoubleEndedIterator + Clone, 67 | Iter2: DoubleEndedIterator + Clone, 68 | Iter1::Item: PartialEq, 69 | Iter2::Item: PartialEq, 70 | { 71 | pub s1: Skip>, 72 | pub len1: usize, 73 | pub s2: Skip>, 74 | pub len2: usize, 75 | pub prefix_len: usize, 76 | pub suffix_len: usize, 77 | } 78 | 79 | pub fn remove_common_affix( 80 | s1: Iter1, 81 | mut len1: usize, 82 | s2: Iter2, 83 | mut len2: usize, 84 | ) -> RemovedAffix 85 | where 86 | Iter1: DoubleEndedIterator + Clone, 87 | Iter2: DoubleEndedIterator + Clone, 88 | Iter1::Item: PartialEq + HashableChar, 89 | Iter2::Item: PartialEq + HashableChar, 90 | { 91 | let suffix_len = find_common_suffix(s1.clone(), s2.clone()); 92 | let s1_iter_no_suffix = s1.take(len1 - suffix_len); 93 | let s2_iter_no_suffix = s2.take(len2 - suffix_len); 94 | let prefix_len = find_common_prefix(s1_iter_no_suffix.clone(), s2_iter_no_suffix.clone()); 95 | let s1_iter = s1_iter_no_suffix.skip(prefix_len); 96 | let s2_iter = s2_iter_no_suffix.skip(prefix_len); 97 | len1 -= prefix_len + suffix_len; 98 | len2 -= prefix_len + suffix_len; 99 | 100 | RemovedAffix { 101 | s1: s1_iter, 102 | len1, 103 | s2: s2_iter, 104 | len2, 105 | prefix_len, 106 | suffix_len, 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/details/distance.rs: -------------------------------------------------------------------------------- 1 | use crate::details::common::norm_sim_to_norm_dist; 2 | use crate::HashableChar; 3 | 4 | pub trait MetricUsize2 { 5 | fn maximum(&self, len1: usize, len2: usize) -> usize; 6 | 7 | fn _distance( 8 | &self, 9 | s1: Iter1, 10 | len1: usize, 11 | s2: Iter2, 12 | len2: usize, 13 | score_cutoff: Option, 14 | score_hint: Option, 15 | ) -> Option 16 | where 17 | Iter1: DoubleEndedIterator + Clone, 18 | Iter2: DoubleEndedIterator + Clone, 19 | Iter1::Item: PartialEq + HashableChar + Copy, 20 | Iter2::Item: PartialEq + HashableChar + Copy, 21 | { 22 | let maximum = self.maximum(len1, len2); 23 | 24 | let cutoff_similarity = score_cutoff.map(|x| if maximum >= x { maximum - x 
} else { 0 }); 25 | let hint_similarity = score_hint.map(|x| if maximum >= x { maximum - x } else { 0 }); 26 | 27 | let sim = self._similarity(s1, len1, s2, len2, cutoff_similarity, hint_similarity)?; 28 | let dist = maximum - sim; 29 | 30 | if let Some(cutoff) = score_cutoff { 31 | if dist > cutoff { 32 | return None; 33 | } 34 | } 35 | Some(dist) 36 | } 37 | 38 | fn _similarity( 39 | &self, 40 | s1: Iter1, 41 | len1: usize, 42 | s2: Iter2, 43 | len2: usize, 44 | score_cutoff: Option, 45 | mut score_hint: Option, 46 | ) -> Option 47 | where 48 | Iter1: DoubleEndedIterator + Clone, 49 | Iter2: DoubleEndedIterator + Clone, 50 | Iter1::Item: PartialEq + HashableChar + Copy, 51 | Iter2::Item: PartialEq + HashableChar + Copy, 52 | { 53 | let maximum = self.maximum(len1, len2); 54 | if let Some(cutoff) = score_cutoff { 55 | if maximum < cutoff { 56 | return None; 57 | } 58 | 59 | if let Some(hint) = score_hint { 60 | score_hint = Some(hint.min(cutoff)); 61 | } 62 | } 63 | 64 | let cutoff_distance = score_cutoff.map(|x| maximum - x); 65 | let hint_distance = score_hint.map(|x| maximum - x); 66 | let dist = self._distance(s1, len1, s2, len2, cutoff_distance, hint_distance)?; 67 | let sim = maximum - dist; 68 | if let Some(cutoff) = score_cutoff { 69 | if sim < cutoff { 70 | return None; 71 | } 72 | } 73 | Some(sim) 74 | } 75 | 76 | fn _normalized_distance( 77 | &self, 78 | s1: Iter1, 79 | len1: usize, 80 | s2: Iter2, 81 | len2: usize, 82 | mut score_cutoff: Option, 83 | score_hint: Option, 84 | ) -> Option 85 | where 86 | Iter1: DoubleEndedIterator + Clone, 87 | Iter2: DoubleEndedIterator + Clone, 88 | Iter1::Item: PartialEq + HashableChar + Copy, 89 | Iter2::Item: PartialEq + HashableChar + Copy, 90 | { 91 | let maximum = self.maximum(len1, len2); 92 | 93 | let cutoff_distance; 94 | if let Some(mut cutoff) = score_cutoff { 95 | cutoff = cutoff.clamp(0.0, 1.0); 96 | score_cutoff = Some(cutoff); 97 | cutoff_distance = Some((maximum as f64 * cutoff).ceil() as usize); 98 | } else { 99 | cutoff_distance = None; 100 | } 101 | 102 | let hint_distance; 103 | if let Some(mut cutoff) = score_hint { 104 | cutoff = cutoff.clamp(0.0, 1.0); 105 | hint_distance = Some((maximum as f64 * cutoff).ceil() as usize); 106 | } else { 107 | hint_distance = None; 108 | } 109 | 110 | let dist = self._distance(s1, len1, s2, len2, cutoff_distance, hint_distance)?; 111 | let norm_dist = if maximum == 0 { 112 | 0.0 113 | } else { 114 | dist as f64 / maximum as f64 115 | }; 116 | if let Some(cutoff) = score_cutoff { 117 | if norm_dist > cutoff { 118 | return None; 119 | } 120 | } 121 | Some(norm_dist) 122 | } 123 | 124 | fn _normalized_similarity( 125 | &self, 126 | s1: Iter1, 127 | len1: usize, 128 | s2: Iter2, 129 | len2: usize, 130 | score_cutoff: Option, 131 | score_hint: Option, 132 | ) -> Option 133 | where 134 | Iter1: DoubleEndedIterator + Clone, 135 | Iter2: DoubleEndedIterator + Clone, 136 | Iter1::Item: PartialEq + HashableChar + Copy, 137 | Iter2::Item: PartialEq + HashableChar + Copy, 138 | { 139 | let cutoff_score = score_cutoff.map(norm_sim_to_norm_dist); 140 | let hint_score = score_hint.map(norm_sim_to_norm_dist); 141 | 142 | let norm_dist = self._normalized_distance(s1, len1, s2, len2, cutoff_score, hint_score)?; 143 | let norm_sim = 1.0 - norm_dist; 144 | 145 | if let Some(cutoff) = score_cutoff { 146 | if norm_sim < cutoff { 147 | return None; 148 | } 149 | } 150 | Some(norm_sim) 151 | } 152 | } 153 | 154 | pub trait MetricUsize { 155 | fn maximum(&self, len1: usize, len2: usize) -> usize; 156 | 157 | fn 
_distance( 158 | &self, 159 | s1: Iter1, 160 | len1: usize, 161 | s2: Iter2, 162 | len2: usize, 163 | score_cutoff: Option, 164 | score_hint: Option, 165 | ) -> usize 166 | where 167 | Iter1: DoubleEndedIterator + Clone, 168 | Iter2: DoubleEndedIterator + Clone, 169 | Iter1::Item: PartialEq + HashableChar + Copy, 170 | Iter2::Item: PartialEq + HashableChar + Copy, 171 | { 172 | let maximum = self.maximum(len1, len2); 173 | 174 | let cutoff_similarity = score_cutoff.map(|x| if maximum >= x { maximum - x } else { 0 }); 175 | let hint_similarity = score_hint.map(|x| if maximum >= x { maximum - x } else { 0 }); 176 | 177 | let sim = self._similarity(s1, len1, s2, len2, cutoff_similarity, hint_similarity); 178 | maximum - sim 179 | } 180 | 181 | fn _similarity( 182 | &self, 183 | s1: Iter1, 184 | len1: usize, 185 | s2: Iter2, 186 | len2: usize, 187 | score_cutoff: Option, 188 | mut score_hint: Option, 189 | ) -> usize 190 | where 191 | Iter1: DoubleEndedIterator + Clone, 192 | Iter2: DoubleEndedIterator + Clone, 193 | Iter1::Item: PartialEq + HashableChar + Copy, 194 | Iter2::Item: PartialEq + HashableChar + Copy, 195 | { 196 | let maximum = self.maximum(len1, len2); 197 | if let Some(cutoff) = score_cutoff { 198 | if cutoff > maximum { 199 | return maximum; 200 | } 201 | 202 | if let Some(hint) = score_hint { 203 | score_hint = Some(hint.min(cutoff)); 204 | } 205 | } 206 | 207 | let cutoff_distance = score_cutoff.map(|x| maximum - x); 208 | let hint_distance = score_hint.map(|x| maximum - x); 209 | let dist = self._distance(s1, len1, s2, len2, cutoff_distance, hint_distance); 210 | maximum - dist 211 | } 212 | 213 | fn _normalized_distance( 214 | &self, 215 | s1: Iter1, 216 | len1: usize, 217 | s2: Iter2, 218 | len2: usize, 219 | score_cutoff: Option, 220 | score_hint: Option, 221 | ) -> f64 222 | where 223 | Iter1: DoubleEndedIterator + Clone, 224 | Iter2: DoubleEndedIterator + Clone, 225 | Iter1::Item: PartialEq + HashableChar + Copy, 226 | Iter2::Item: PartialEq + HashableChar + Copy, 227 | { 228 | let maximum = self.maximum(len1, len2); 229 | 230 | let cutoff_distance; 231 | if let Some(mut cutoff) = score_cutoff { 232 | cutoff = cutoff.clamp(0.0, 1.0); 233 | cutoff_distance = Some((maximum as f64 * cutoff).ceil() as usize); 234 | } else { 235 | cutoff_distance = None; 236 | } 237 | 238 | let hint_distance; 239 | if let Some(mut cutoff) = score_hint { 240 | cutoff = cutoff.clamp(0.0, 1.0); 241 | hint_distance = Some((maximum as f64 * cutoff).ceil() as usize); 242 | } else { 243 | hint_distance = None; 244 | } 245 | 246 | let dist = self._distance(s1, len1, s2, len2, cutoff_distance, hint_distance); 247 | if maximum == 0 { 248 | 0.0 249 | } else { 250 | dist as f64 / maximum as f64 251 | } 252 | } 253 | 254 | fn _normalized_similarity( 255 | &self, 256 | s1: Iter1, 257 | len1: usize, 258 | s2: Iter2, 259 | len2: usize, 260 | score_cutoff: Option, 261 | score_hint: Option, 262 | ) -> f64 263 | where 264 | Iter1: DoubleEndedIterator + Clone, 265 | Iter2: DoubleEndedIterator + Clone, 266 | Iter1::Item: PartialEq + HashableChar + Copy, 267 | Iter2::Item: PartialEq + HashableChar + Copy, 268 | { 269 | let cutoff_score = score_cutoff.map(norm_sim_to_norm_dist); 270 | let hint_score = score_hint.map(norm_sim_to_norm_dist); 271 | 272 | let norm_dist = self._normalized_distance(s1, len1, s2, len2, cutoff_score, hint_score); 273 | 1.0 - norm_dist 274 | } 275 | } 276 | 277 | pub trait Metricf64 { 278 | fn maximum(&self, len1: usize, len2: usize) -> f64; 279 | 280 | fn _distance( 281 | &self, 282 | s1: 
Iter1, 283 | len1: usize, 284 | s2: Iter2, 285 | len2: usize, 286 | score_cutoff: Option, 287 | score_hint: Option, 288 | ) -> f64 289 | where 290 | Iter1: DoubleEndedIterator + Clone, 291 | Iter2: DoubleEndedIterator + Clone, 292 | Iter1::Item: PartialEq + HashableChar + Copy, 293 | Iter2::Item: PartialEq + HashableChar + Copy, 294 | { 295 | let maximum = self.maximum(len1, len2); 296 | 297 | let cutoff_similarity = score_cutoff.map(|x| if maximum >= x { maximum - x } else { 0.0 }); 298 | let hint_similarity = score_hint.map(|x| if maximum >= x { maximum - x } else { 0.0 }); 299 | 300 | let sim = self._similarity(s1, len1, s2, len2, cutoff_similarity, hint_similarity); 301 | maximum - sim 302 | } 303 | 304 | fn _similarity( 305 | &self, 306 | s1: Iter1, 307 | len1: usize, 308 | s2: Iter2, 309 | len2: usize, 310 | score_cutoff: Option, 311 | mut score_hint: Option, 312 | ) -> f64 313 | where 314 | Iter1: DoubleEndedIterator + Clone, 315 | Iter2: DoubleEndedIterator + Clone, 316 | Iter1::Item: PartialEq + HashableChar + Copy, 317 | Iter2::Item: PartialEq + HashableChar + Copy, 318 | { 319 | let maximum = self.maximum(len1, len2); 320 | if let Some(cutoff) = score_cutoff { 321 | if cutoff > maximum { 322 | return maximum; 323 | } 324 | 325 | if let Some(hint) = score_hint { 326 | score_hint = Some(hint.min(cutoff)); 327 | } 328 | } 329 | 330 | let cutoff_distance = score_cutoff.map(|x| maximum - x); 331 | let hint_distance = score_hint.map(|x| maximum - x); 332 | let dist = self._distance(s1, len1, s2, len2, cutoff_distance, hint_distance); 333 | maximum - dist 334 | } 335 | 336 | fn _normalized_distance( 337 | &self, 338 | s1: Iter1, 339 | len1: usize, 340 | s2: Iter2, 341 | len2: usize, 342 | score_cutoff: Option, 343 | score_hint: Option, 344 | ) -> f64 345 | where 346 | Iter1: DoubleEndedIterator + Clone, 347 | Iter2: DoubleEndedIterator + Clone, 348 | Iter1::Item: PartialEq + HashableChar + Copy, 349 | Iter2::Item: PartialEq + HashableChar + Copy, 350 | { 351 | let maximum = self.maximum(len1, len2); 352 | 353 | let cutoff_distance = score_cutoff.map(|x| maximum * x); 354 | let hint_distance = score_hint.map(|x| maximum * x); 355 | 356 | let dist = self._distance(s1, len1, s2, len2, cutoff_distance, hint_distance); 357 | if maximum > 0.0 { 358 | dist / maximum 359 | } else { 360 | 0.0 361 | } 362 | } 363 | 364 | fn _normalized_similarity( 365 | &self, 366 | s1: Iter1, 367 | len1: usize, 368 | s2: Iter2, 369 | len2: usize, 370 | score_cutoff: Option, 371 | score_hint: Option, 372 | ) -> f64 373 | where 374 | Iter1: DoubleEndedIterator + Clone, 375 | Iter2: DoubleEndedIterator + Clone, 376 | Iter1::Item: PartialEq + HashableChar + Copy, 377 | Iter2::Item: PartialEq + HashableChar + Copy, 378 | { 379 | let cutoff_score = score_cutoff.map(norm_sim_to_norm_dist); 380 | let hint_score = score_hint.map(norm_sim_to_norm_dist); 381 | 382 | let norm_dist = self._normalized_distance(s1, len1, s2, len2, cutoff_score, hint_score); 383 | 1.0 - norm_dist 384 | } 385 | } 386 | -------------------------------------------------------------------------------- /src/details/growing_hashmap.rs: -------------------------------------------------------------------------------- 1 | use crate::{Hash, HashableChar}; 2 | 3 | #[derive(Default, Clone)] 4 | struct GrowingHashmapMapElem { 5 | key: u64, 6 | value: ValueType, 7 | } 8 | 9 | /// specialized hashmap to store user provided types 10 | /// this implementation relies on a couple of base assumptions in order to simplify the implementation 11 | /// - the hashmap 
does not have an upper limit of included items 12 | /// - the default value for the `ValueType` can be used as a dummy value to indicate an empty cell 13 | /// - elements can't be removed 14 | /// - only allocates memory on first write access. 15 | /// This improves performance for hashmaps that are never written to 16 | pub struct GrowingHashmap { 17 | used: i32, 18 | fill: i32, 19 | mask: i32, 20 | map: Option>>, 21 | } 22 | 23 | impl Default for GrowingHashmap 24 | where 25 | ValueType: Default + Clone + Eq, 26 | { 27 | #[inline] 28 | fn default() -> Self { 29 | Self { 30 | used: 0, 31 | fill: 0, 32 | mask: -1, 33 | map: None, 34 | } 35 | } 36 | } 37 | 38 | impl GrowingHashmap 39 | where 40 | ValueType: Default + Clone + Eq + Copy, 41 | { 42 | #[allow(dead_code)] 43 | pub const fn size(&self) -> i32 { 44 | self.used 45 | } 46 | 47 | #[allow(dead_code)] 48 | pub const fn capacity(&self) -> i32 { 49 | self.mask + 1 50 | } 51 | 52 | #[allow(dead_code)] 53 | pub const fn empty(&self) -> bool { 54 | self.used == 0 55 | } 56 | 57 | pub fn get(&self, key: u64) -> ValueType { 58 | self.map 59 | .as_ref() 60 | .map_or_else(|| Default::default(), |map| map[self.lookup(key)].value) 61 | } 62 | 63 | pub fn get_mut(&mut self, key: u64) -> &mut ValueType { 64 | if self.map.is_none() { 65 | self.allocate(); 66 | } 67 | 68 | let mut i = self.lookup(key); 69 | if self 70 | .map 71 | .as_ref() 72 | .expect("map should have been created above")[i] 73 | .value 74 | == Default::default() 75 | { 76 | self.fill += 1; 77 | // resize when 2/3 full 78 | if self.fill * 3 >= (self.mask + 1) * 2 { 79 | self.grow((self.used + 1) * 2); 80 | i = self.lookup(key); 81 | } 82 | 83 | self.used += 1; 84 | } 85 | 86 | let elem = &mut self 87 | .map 88 | .as_mut() 89 | .expect("map should have been created above")[i]; 90 | elem.key = key; 91 | &mut elem.value 92 | } 93 | 94 | fn allocate(&mut self) { 95 | self.mask = 8 - 1; 96 | self.map = Some(vec![GrowingHashmapMapElem::default(); 8]); 97 | } 98 | 99 | /// lookup key inside the hashmap using a similar collision resolution 100 | /// strategy to `CPython` and `Ruby` 101 | fn lookup(&self, key: u64) -> usize { 102 | let hash = key; 103 | let mut i = hash as usize & self.mask as usize; 104 | 105 | let map = self 106 | .map 107 | .as_ref() 108 | .expect("callers have to ensure map is allocated"); 109 | 110 | if map[i].value == Default::default() || map[i].key == key { 111 | return i; 112 | } 113 | 114 | let mut perturb = key; 115 | loop { 116 | i = (i * 5 + perturb as usize + 1) & self.mask as usize; 117 | 118 | if map[i].value == Default::default() || map[i].key == key { 119 | return i; 120 | } 121 | 122 | perturb >>= 5; 123 | } 124 | } 125 | 126 | fn grow(&mut self, min_used: i32) { 127 | let mut new_size = self.mask + 1; 128 | while new_size <= min_used { 129 | new_size <<= 1; 130 | } 131 | 132 | self.fill = self.used; 133 | self.mask = new_size - 1; 134 | 135 | let old_map = std::mem::replace( 136 | self.map 137 | .as_mut() 138 | .expect("callers have to ensure map is allocated"), 139 | vec![GrowingHashmapMapElem::::default(); new_size as usize], 140 | ); 141 | 142 | for elem in old_map { 143 | if elem.value != Default::default() { 144 | let j = self.lookup(elem.key); 145 | let new_elem = &mut self.map.as_mut().expect("map created above")[j]; 146 | new_elem.key = elem.key; 147 | new_elem.value = elem.value; 148 | self.used -= 1; 149 | if self.used == 0 { 150 | break; 151 | } 152 | } 153 | } 154 | 155 | self.used = self.fill; 156 | } 157 | } 158 | 159 | pub struct 
HybridGrowingHashmap { 160 | // todo in theory we have a fixed keytype here and so we wouldn't need both 161 | // an unsigned and signed map. In Practice this probably doesn't matter all that much 162 | pub map_unsigned: GrowingHashmap, 163 | pub map_signed: GrowingHashmap, 164 | pub extended_ascii: [ValueType; 256], 165 | } 166 | 167 | impl HybridGrowingHashmap 168 | where 169 | ValueType: Default + Clone + Copy + Eq, 170 | { 171 | // right now this can't be used since rust fails to elide the memcpy 172 | // on return 173 | /*pub fn new() -> Self { 174 | HybridGrowingHashmap { 175 | map_unsigned: GrowingHashmap::default(), 176 | map_signed: GrowingHashmap::default(), 177 | extended_ascii: [Default::default(); 256], 178 | } 179 | }*/ 180 | 181 | pub fn get(&self, key: CharT) -> ValueType 182 | where 183 | CharT: HashableChar, 184 | { 185 | match key.hash_char() { 186 | Hash::SIGNED(value) => { 187 | if value < 0 { 188 | self.map_signed.get(u64::from_ne_bytes(value.to_ne_bytes())) 189 | } else if value <= 255 { 190 | let val_u8 = u8::try_from(value).expect("we check the bounds above"); 191 | self.extended_ascii[usize::from(val_u8)] 192 | } else { 193 | self.map_unsigned 194 | .get(u64::from_ne_bytes(value.to_ne_bytes())) 195 | } 196 | } 197 | Hash::UNSIGNED(value) => { 198 | if value <= 255 { 199 | let val_u8 = u8::try_from(value).expect("we check the bounds above"); 200 | self.extended_ascii[usize::from(val_u8)] 201 | } else { 202 | self.map_unsigned.get(value) 203 | } 204 | } 205 | } 206 | } 207 | 208 | pub fn get_mut(&mut self, key: CharT) -> &mut ValueType 209 | where 210 | CharT: HashableChar, 211 | { 212 | match key.hash_char() { 213 | Hash::SIGNED(value) => { 214 | if value < 0 { 215 | self.map_signed 216 | .get_mut(u64::from_ne_bytes(value.to_ne_bytes())) 217 | } else if value <= 255 { 218 | let val_u8 = u8::try_from(value).expect("we check the bounds above"); 219 | &mut self.extended_ascii[usize::from(val_u8)] 220 | } else { 221 | self.map_unsigned 222 | .get_mut(u64::from_ne_bytes(value.to_ne_bytes())) 223 | } 224 | } 225 | Hash::UNSIGNED(value) => { 226 | if value <= 255 { 227 | let val_u8 = u8::try_from(value).expect("we check the bounds above"); 228 | &mut self.extended_ascii[usize::from(val_u8)] 229 | } else { 230 | self.map_unsigned.get_mut(value) 231 | } 232 | } 233 | } 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /src/details/intrinsics.rs: -------------------------------------------------------------------------------- 1 | pub const fn ceil_div_usize(a: usize, divisor: usize) -> usize { 2 | a / divisor + (a % divisor != 0) as usize 3 | } 4 | 5 | /// shift right without undefined behavior for shifts > bit width 6 | pub const fn shr64(a: u64, shift: usize) -> u64 { 7 | if shift < 64 { 8 | a >> shift 9 | } else { 10 | 0 11 | } 12 | } 13 | 14 | /// shift left without undefined behavior for shifts > bit width 15 | #[allow(dead_code)] 16 | pub const fn shl64(a: u64, shift: usize) -> u64 { 17 | if shift < 64 { 18 | a << shift 19 | } else { 20 | 0 21 | } 22 | } 23 | 24 | // this is still a nightly only api. 
Can be removed if it becomes stable 25 | pub const fn carrying_add(lhs: u64, rhs: u64, carry: bool) -> (u64, bool) { 26 | let (a, b) = lhs.overflowing_add(rhs); 27 | let (c, d) = a.overflowing_add(carry as u64); 28 | (c, b | d) 29 | } 30 | 31 | pub const fn bit_mask_lsb_u64(n: usize) -> u64 { 32 | let mut mask = !0_u64; 33 | if n < 64 { 34 | mask = mask.wrapping_add(1_u64 << n); 35 | } 36 | mask 37 | } 38 | pub const fn blsi_u64(v: u64) -> u64 { 39 | v & v.wrapping_neg() 40 | } 41 | 42 | #[allow(dead_code)] 43 | pub const fn blsr_u64(v: u64) -> u64 { 44 | v & v.wrapping_sub(1) 45 | } 46 | -------------------------------------------------------------------------------- /src/details/matrix.rs: -------------------------------------------------------------------------------- 1 | use std::ops::{BitAnd, Shl}; 2 | 3 | #[derive(Clone)] 4 | pub struct BitMatrix { 5 | rows: usize, 6 | cols: usize, 7 | matrix: Vec, 8 | } 9 | 10 | impl BitMatrix 11 | where 12 | T: Clone, 13 | { 14 | pub fn new(rows: usize, cols: usize, val: T) -> Self { 15 | Self { 16 | rows, 17 | cols, 18 | matrix: vec![val; rows * cols], 19 | } 20 | } 21 | 22 | #[allow(dead_code)] 23 | pub const fn rows(&self) -> usize { 24 | self.rows 25 | } 26 | 27 | #[allow(dead_code)] 28 | pub const fn cols(&self) -> usize { 29 | self.cols 30 | } 31 | 32 | pub fn get(&self, row: usize, col: usize) -> &T { 33 | debug_assert!(row < self.rows); 34 | debug_assert!(col < self.cols); 35 | &self.matrix[row * self.cols + col] 36 | } 37 | 38 | pub fn get_mut(&mut self, row: usize, col: usize) -> &mut T { 39 | debug_assert!(row < self.rows); 40 | debug_assert!(col < self.cols); 41 | &mut self.matrix[row * self.cols + col] 42 | } 43 | } 44 | 45 | pub struct ShiftedBitMatrix { 46 | matrix: BitMatrix, 47 | offsets: Vec, 48 | } 49 | 50 | impl ShiftedBitMatrix 51 | where 52 | T: Copy + From + Shl + BitAnd + PartialEq, 53 | { 54 | pub fn new(rows: usize, cols: usize, val: T) -> Self { 55 | Self { 56 | matrix: BitMatrix::::new(rows, cols, val), 57 | offsets: vec![0; rows], 58 | } 59 | } 60 | 61 | #[allow(dead_code)] 62 | pub fn test_bit(&self, row: usize, mut col: usize, default: bool) -> bool { 63 | let offset = self.offsets[row]; 64 | 65 | if offset < 0 { 66 | col += (-offset) as usize; 67 | } else if col >= offset as usize { 68 | col -= offset as usize; 69 | } 70 | // bit on the left of the band 71 | else { 72 | return default; 73 | } 74 | 75 | let word_size = std::mem::size_of::() * 8; 76 | let col_word = col / word_size; 77 | let col_mask = T::from(1) << (col % word_size); 78 | 79 | (*self.matrix.get(row, col_word) & col_mask) != T::from(0) 80 | } 81 | 82 | #[allow(dead_code)] 83 | pub fn get(&self, row: usize, col: usize) -> &T { 84 | self.matrix.get(row, col) 85 | } 86 | 87 | pub fn get_mut(&mut self, row: usize, col: usize) -> &mut T { 88 | self.matrix.get_mut(row, col) 89 | } 90 | 91 | pub fn set_offset(&mut self, row: usize, offset: isize) { 92 | self.offsets[row] = offset; 93 | } 94 | } 95 | 96 | impl Default for ShiftedBitMatrix 97 | where 98 | T: Copy + From + Shl + BitAnd + PartialEq, 99 | { 100 | fn default() -> Self { 101 | Self::new(0, 0, T::from(0)) 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/details/pattern_match_vector.rs: -------------------------------------------------------------------------------- 1 | use crate::details::intrinsics::ceil_div_usize; 2 | use crate::details::matrix::BitMatrix; 3 | use crate::{Hash, HashableChar}; 4 | 5 | #[derive(Clone, Copy, Default)] 6 | 
struct BitvectorHashmapMapElem { 7 | key: u64, 8 | value: u64, 9 | } 10 | 11 | /// specialized hashmap to store bitvectors 12 | /// this implementation relies on a couple of base assumptions in order to simplify the implementation 13 | /// - the hashmap includes at most 64 different items 14 | /// - since bitvectors are only in use when at least one bit is set, 0 can be used to indicate an unused element 15 | /// - elements are never explicitly removed. When changing a sliding window over a string, shifting the corresponding 16 | /// bits would eventually be 0 -> removed the element 17 | /// - works with u64 keys. The caller has to ensure these have no collisions when using e.g. a mixture of u64 and i64 elements 18 | /// this can be done e.g. by using two hashmaps one for values < 0 and one for values >= 0 19 | #[derive(Clone)] 20 | pub struct BitvectorHashmap { 21 | map: [BitvectorHashmapMapElem; 128], 22 | } 23 | 24 | impl Default for BitvectorHashmap { 25 | #[inline] 26 | fn default() -> Self { 27 | Self { 28 | map: [BitvectorHashmapMapElem::default(); 128], 29 | } 30 | } 31 | } 32 | 33 | impl BitvectorHashmap { 34 | pub const fn get(&self, key: u64) -> u64 { 35 | self.map[self.lookup(key)].value 36 | } 37 | 38 | pub fn get_mut(&mut self, key: u64) -> &mut u64 { 39 | let i = self.lookup(key); 40 | let elem = &mut self.map[i]; 41 | elem.key = key; 42 | &mut elem.value 43 | } 44 | 45 | /// lookup key inside the hashmap using a similar collision resolution 46 | /// strategy to `CPython` and `Ruby` 47 | const fn lookup(&self, key: u64) -> usize { 48 | let mut i = (key % 128) as usize; 49 | 50 | if self.map[i].value == 0 || self.map[i].key == key { 51 | return i; 52 | } 53 | 54 | let mut perturb = key; 55 | loop { 56 | i = (i * 5 + perturb as usize + 1) % 128; 57 | 58 | if self.map[i].value == 0 || self.map[i].key == key { 59 | return i; 60 | } 61 | 62 | perturb >>= 5; 63 | } 64 | } 65 | } 66 | 67 | pub struct PatternMatchVector { 68 | pub extended_ascii: [u64; 256], 69 | pub map_unsigned: Option, 70 | pub map_signed: Option, 71 | } 72 | 73 | pub trait BitVectorInterface { 74 | fn get(&self, block: usize, key: CharT) -> u64 75 | where 76 | CharT: HashableChar; 77 | 78 | fn size(&self) -> usize; 79 | } 80 | 81 | impl Default for PatternMatchVector { 82 | fn default() -> Self { 83 | Self { 84 | map_unsigned: None, 85 | map_signed: None, 86 | extended_ascii: [0; 256], 87 | } 88 | } 89 | } 90 | 91 | impl PatternMatchVector { 92 | pub fn insert(&mut self, s1: Iter1) 93 | where 94 | Iter1: Iterator, 95 | CharT: HashableChar, 96 | { 97 | let mut mask: u64 = 1; 98 | for ch in s1 { 99 | self.insert_mask(ch, mask); 100 | mask <<= 1; 101 | } 102 | } 103 | 104 | fn insert_mask(&mut self, key: CharT, mask: u64) 105 | where 106 | CharT: HashableChar, 107 | { 108 | match key.hash_char() { 109 | Hash::SIGNED(value) => { 110 | if value < 0 { 111 | if self.map_signed.is_none() { 112 | self.map_signed = Some(BitvectorHashmap::default()); 113 | } 114 | let item = self 115 | .map_signed 116 | .as_mut() 117 | .expect("map should have been created above") 118 | .get_mut(u64::from_ne_bytes(value.to_ne_bytes())); 119 | *item |= mask; 120 | } else if value <= 255 { 121 | let val_u8 = u8::try_from(value).expect("we check the bounds above"); 122 | let item = &mut self.extended_ascii[usize::from(val_u8)]; 123 | *item |= mask; 124 | } else { 125 | if self.map_unsigned.is_none() { 126 | self.map_unsigned = Some(BitvectorHashmap::default()); 127 | } 128 | let item = self 129 | .map_unsigned 130 | .as_mut() 131 | 
.expect("map should have been created above") 132 | .get_mut(u64::from_ne_bytes(value.to_ne_bytes())); 133 | *item |= mask; 134 | } 135 | } 136 | Hash::UNSIGNED(value) => { 137 | if value <= 255 { 138 | let val_u8 = u8::try_from(value).expect("we check the bounds above"); 139 | let item = &mut self.extended_ascii[usize::from(val_u8)]; 140 | *item |= mask; 141 | } else { 142 | if self.map_unsigned.is_none() { 143 | self.map_unsigned = Some(BitvectorHashmap::default()); 144 | } 145 | let item = self 146 | .map_unsigned 147 | .as_mut() 148 | .expect("map should have been created above") 149 | .get_mut(value); 150 | *item |= mask; 151 | } 152 | } 153 | } 154 | } 155 | } 156 | 157 | impl BitVectorInterface for PatternMatchVector { 158 | fn get(&self, block: usize, key: CharT) -> u64 159 | where 160 | CharT: HashableChar, 161 | { 162 | debug_assert!(block == 0); 163 | match key.hash_char() { 164 | Hash::SIGNED(value) => { 165 | if value < 0 { 166 | self.map_signed 167 | .as_ref() 168 | .map_or(0, |map| map.get(u64::from_ne_bytes(value.to_ne_bytes()))) 169 | } else if value <= 255 { 170 | let val_u8 = u8::try_from(value).expect("we check the bounds above"); 171 | self.extended_ascii[usize::from(val_u8)] 172 | } else { 173 | self.map_unsigned 174 | .as_ref() 175 | .map_or(0, |map| map.get(u64::from_ne_bytes(value.to_ne_bytes()))) 176 | } 177 | } 178 | Hash::UNSIGNED(value) => { 179 | if value <= 255 { 180 | let val_u8 = u8::try_from(value).expect("we check the bounds above"); 181 | self.extended_ascii[usize::from(val_u8)] 182 | } else { 183 | self.map_unsigned.as_ref().map_or(0, |map| map.get(value)) 184 | } 185 | } 186 | } 187 | } 188 | 189 | fn size(&self) -> usize { 190 | 1 191 | } 192 | } 193 | 194 | #[derive(Clone)] 195 | pub struct BlockPatternMatchVector { 196 | pub block_count: usize, 197 | pub map_unsigned: Option>, 198 | pub map_signed: Option>, 199 | pub extended_ascii: BitMatrix, 200 | } 201 | 202 | impl BlockPatternMatchVector { 203 | pub fn new(str_len: usize) -> Self { 204 | let block_count = ceil_div_usize(str_len, 64); 205 | Self { 206 | block_count, 207 | map_unsigned: None, 208 | map_signed: None, 209 | extended_ascii: BitMatrix::::new(256, block_count, 0), 210 | } 211 | } 212 | 213 | pub fn insert(&mut self, s1: Iter1) 214 | where 215 | Iter1: Iterator, 216 | CharT: HashableChar, 217 | { 218 | let mut mask: u64 = 1; 219 | for (i, ch) in s1.enumerate() { 220 | let block = i / 64; 221 | self.insert_mask(block, ch, mask); 222 | mask = mask.rotate_left(1); 223 | } 224 | } 225 | 226 | fn insert_mask(&mut self, block: usize, key: CharT, mask: u64) 227 | where 228 | CharT: HashableChar, 229 | { 230 | debug_assert!(block < self.size()); 231 | 232 | match key.hash_char() { 233 | Hash::SIGNED(value) => { 234 | if value < 0 { 235 | if self.map_signed.is_none() { 236 | self.map_signed = Some(vec![BitvectorHashmap::default(); self.block_count]); 237 | } 238 | let item = self 239 | .map_signed 240 | .as_mut() 241 | .expect("map should have been created above")[block] 242 | .get_mut(u64::from_ne_bytes(value.to_ne_bytes())); 243 | *item |= mask; 244 | } else if value <= 255 { 245 | let val_u8 = u8::try_from(value).expect("we check the bounds above"); 246 | let item = self.extended_ascii.get_mut(val_u8.into(), block); 247 | *item |= mask; 248 | } else { 249 | if self.map_unsigned.is_none() { 250 | self.map_unsigned = 251 | Some(vec![BitvectorHashmap::default(); self.block_count]); 252 | } 253 | let item = self 254 | .map_unsigned 255 | .as_mut() 256 | .expect("map should have been created 
above")[block] 257 | .get_mut(u64::from_ne_bytes(value.to_ne_bytes())); 258 | *item |= mask; 259 | } 260 | } 261 | Hash::UNSIGNED(value) => { 262 | if value <= 255 { 263 | let val_u8 = u8::try_from(value).expect("we check the bounds above"); 264 | let item = self.extended_ascii.get_mut(val_u8.into(), block); 265 | *item |= mask; 266 | } else { 267 | if self.map_unsigned.is_none() { 268 | self.map_unsigned = 269 | Some(vec![BitvectorHashmap::default(); self.block_count]); 270 | } 271 | let item = self 272 | .map_unsigned 273 | .as_mut() 274 | .expect("map should have been created above")[block] 275 | .get_mut(value); 276 | *item |= mask; 277 | } 278 | } 279 | } 280 | } 281 | } 282 | 283 | impl BitVectorInterface for BlockPatternMatchVector { 284 | fn get(&self, block: usize, key: CharT) -> u64 285 | where 286 | CharT: HashableChar, 287 | { 288 | debug_assert!(block < self.size()); 289 | 290 | match key.hash_char() { 291 | Hash::SIGNED(value) => { 292 | if value < 0 { 293 | self.map_signed.as_ref().map_or(0, |map| { 294 | map[block].get(u64::from_ne_bytes(value.to_ne_bytes())) 295 | }) 296 | } else if value <= 255 { 297 | let val_u8 = u8::try_from(value).expect("we check the bounds above"); 298 | *self.extended_ascii.get(val_u8.into(), block) 299 | } else { 300 | self.map_unsigned.as_ref().map_or(0, |map| { 301 | map[block].get(u64::from_ne_bytes(value.to_ne_bytes())) 302 | }) 303 | } 304 | } 305 | Hash::UNSIGNED(value) => { 306 | if value <= 255 { 307 | let val_u8 = u8::try_from(value).expect("we check the bounds above"); 308 | *self.extended_ascii.get(val_u8.into(), block) 309 | } else { 310 | self.map_unsigned 311 | .as_ref() 312 | .map_or(0, |map| map[block].get(value)) 313 | } 314 | } 315 | } 316 | } 317 | 318 | fn size(&self) -> usize { 319 | self.block_count 320 | } 321 | } 322 | -------------------------------------------------------------------------------- /src/distance.rs: -------------------------------------------------------------------------------- 1 | pub mod damerau_levenshtein; 2 | pub mod hamming; 3 | pub mod indel; 4 | pub mod jaro; 5 | pub mod jaro_winkler; 6 | pub mod lcs_seq; 7 | pub mod levenshtein; 8 | pub mod osa; 9 | pub mod postfix; 10 | pub mod prefix; 11 | 12 | #[cfg(test)] 13 | pub(crate) mod example; 14 | -------------------------------------------------------------------------------- /src/distance/example.rs: -------------------------------------------------------------------------------- 1 | pub mod ocr; 2 | -------------------------------------------------------------------------------- /src/distance/hamming.rs: -------------------------------------------------------------------------------- 1 | //! Hamming distance 2 | //! 3 | //! The Hamming distance measures the similarity of two sequences of equal length. 4 | //! Specifically, it counts the minimum number of substitutions required to 5 | //! transform one string into the other. 6 | //! 7 | //! While regularly the Hamming distance only works with texts of equal length, 8 | //! this implementation provides an addition argument `pad` to decide whether texts 9 | //! of unequal length should be padded or return an error. 10 | //! 
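//!
//! A short illustrative sketch of the `pad` behaviour, mirroring the unit
//! tests in this module:
//!
//! ```
//! use rapidfuzz::distance::hamming;
//!
//! // Texts of unequal length are an error by default ...
//! assert_eq!(
//!     Err(hamming::Error::DifferentLengthArgs),
//!     hamming::distance("ham".chars(), "hamming".chars())
//! );
//!
//! // ... but with padding enabled the missing characters simply count as
//! // substitutions.
//! assert_eq!(
//!     4,
//!     hamming::distance_with_args(
//!         "ham".chars(),
//!         "hamming".chars(),
//!         &hamming::Args::default().pad(true)
//!     )
//! );
//! ```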
11 | 12 | use crate::common::{DistanceCutoff, NoScoreCutoff, SimilarityCutoff, WithScoreCutoff}; 13 | use crate::details::distance::MetricUsize; 14 | use crate::HashableChar; 15 | 16 | use std::error; 17 | use std::fmt::{self, Debug, Display, Formatter}; 18 | 19 | #[derive(Default, Copy, Clone)] 20 | pub struct Padding(bool); 21 | #[derive(Default, Copy, Clone)] 22 | pub struct NoPadding; 23 | 24 | #[must_use] 25 | #[derive(Copy, Clone, Debug)] 26 | pub struct Args { 27 | score_cutoff: CutoffType, 28 | score_hint: Option, 29 | pad: PaddingType, 30 | } 31 | 32 | impl Default for Args { 33 | fn default() -> Args { 34 | Args { 35 | score_cutoff: NoScoreCutoff, 36 | score_hint: None, 37 | pad: NoPadding, 38 | } 39 | } 40 | } 41 | 42 | pub trait PaddingTrait 43 | where 44 | T: Copy, 45 | { 46 | type Output: Copy + PartialEq + Debug; 47 | 48 | fn pad(&self) -> bool; 49 | fn error(&self) -> Self::Output; 50 | fn score(&self, raw: T) -> Self::Output; 51 | } 52 | 53 | impl PaddingTrait for NoPadding 54 | where 55 | T: Copy + PartialEq + Debug, 56 | { 57 | type Output = Result; 58 | 59 | fn pad(&self) -> bool { 60 | false 61 | } 62 | 63 | fn error(&self) -> Self::Output { 64 | Err(Error::DifferentLengthArgs) 65 | } 66 | 67 | fn score(&self, raw: T) -> Self::Output { 68 | Ok(raw) 69 | } 70 | } 71 | 72 | impl PaddingTrait for Padding 73 | where 74 | T: Copy + PartialOrd + Debug + Default, 75 | { 76 | type Output = T; 77 | 78 | fn pad(&self) -> bool { 79 | self.0 80 | } 81 | 82 | // will not occur 83 | fn error(&self) -> Self::Output { 84 | T::default() 85 | } 86 | 87 | fn score(&self, raw: T) -> Self::Output { 88 | raw 89 | } 90 | } 91 | 92 | impl Args 93 | where 94 | ResultType: Copy, 95 | { 96 | pub fn score_hint(mut self, score_hint: ResultType) -> Self { 97 | self.score_hint = Some(score_hint); 98 | self 99 | } 100 | 101 | pub fn score_cutoff( 102 | self, 103 | score_cutoff: ResultType, 104 | ) -> Args, PaddingType> { 105 | Args { 106 | score_hint: self.score_hint, 107 | score_cutoff: WithScoreCutoff(score_cutoff), 108 | pad: self.pad, 109 | } 110 | } 111 | 112 | pub fn pad(self, pad: bool) -> Args { 113 | Args { 114 | score_hint: self.score_hint, 115 | score_cutoff: self.score_cutoff, 116 | pad: Padding(pad), 117 | } 118 | } 119 | } 120 | 121 | #[derive(Debug, PartialEq, Eq, Copy, Clone)] 122 | pub enum Error { 123 | DifferentLengthArgs, 124 | } 125 | 126 | impl Display for Error { 127 | fn fmt(&self, fmt: &mut Formatter) -> Result<(), fmt::Error> { 128 | let text = match self { 129 | Self::DifferentLengthArgs => "Differing length arguments provided", 130 | }; 131 | 132 | write!(fmt, "{text}") 133 | } 134 | } 135 | 136 | impl error::Error for Error {} 137 | 138 | fn distance_impl(mut s1: Iter1, mut s2: Iter2) -> usize 139 | where 140 | Iter1: Iterator, 141 | Iter2: Iterator, 142 | Iter1::Item: PartialEq + HashableChar, 143 | Iter2::Item: PartialEq + HashableChar, 144 | { 145 | let mut dist = 0; 146 | loop { 147 | match (s1.next(), s2.next()) { 148 | (Some(ch1), Some(ch2)) => { 149 | if ch1 != ch2 { 150 | dist += 1; 151 | } 152 | } 153 | (None, None) => { 154 | return dist; 155 | } 156 | _ => { 157 | dist += 1; 158 | } 159 | } 160 | } 161 | } 162 | 163 | struct IndividualComparator; 164 | 165 | impl MetricUsize for IndividualComparator { 166 | fn maximum(&self, len1: usize, len2: usize) -> usize { 167 | len1.max(len2) 168 | } 169 | 170 | fn _distance( 171 | &self, 172 | s1: Iter1, 173 | _len1: usize, 174 | s2: Iter2, 175 | _len2: usize, 176 | _score_cutoff: Option, 177 | _score_hint: Option, 178 | 
) -> usize 179 | where 180 | Iter1: Iterator, 181 | Iter2: Iterator, 182 | Iter1::Item: PartialEq + HashableChar, 183 | Iter2::Item: PartialEq + HashableChar, 184 | { 185 | distance_impl(s1, s2) 186 | } 187 | } 188 | 189 | /// Hamming distance 190 | /// 191 | /// Calculates the Hamming distance. 192 | /// 193 | /// # Examples 194 | /// 195 | /// ``` 196 | /// use rapidfuzz::distance::hamming; 197 | /// 198 | /// assert_eq!(Ok(1), hamming::distance("hamming".chars(), "humming".chars())); 199 | /// ``` 200 | pub fn distance(s1: Iter1, s2: Iter2) -> Result 201 | where 202 | Iter1: IntoIterator, 203 | Iter1::IntoIter: DoubleEndedIterator + Clone, 204 | Iter2: IntoIterator, 205 | Iter2::IntoIter: DoubleEndedIterator + Clone, 206 | Iter1::Item: PartialEq + HashableChar + Copy, 207 | Iter2::Item: PartialEq + HashableChar + Copy, 208 | { 209 | distance_with_args(s1, s2, &Args::default()) 210 | } 211 | 212 | pub fn distance_with_args( 213 | s1: Iter1, 214 | s2: Iter2, 215 | args: &Args, 216 | ) -> PaddingType::Output 217 | where 218 | Iter1: IntoIterator, 219 | Iter1::IntoIter: DoubleEndedIterator + Clone, 220 | Iter2: IntoIterator, 221 | Iter2::IntoIter: DoubleEndedIterator + Clone, 222 | Iter1::Item: PartialEq + HashableChar + Copy, 223 | Iter2::Item: PartialEq + HashableChar + Copy, 224 | CutoffType: DistanceCutoff, 225 | PaddingType: PaddingTrait, 226 | { 227 | let s1_iter = s1.into_iter(); 228 | let s2_iter = s2.into_iter(); 229 | let len1 = s1_iter.clone().count(); 230 | let len2 = s2_iter.clone().count(); 231 | 232 | if !args.pad.pad() && len1 != len2 { 233 | return args.pad.error(); 234 | } 235 | 236 | args.pad 237 | .score(args.score_cutoff.score(IndividualComparator {}._distance( 238 | s1_iter, 239 | len1, 240 | s2_iter, 241 | len2, 242 | args.score_cutoff.cutoff(), 243 | args.score_hint, 244 | ))) 245 | } 246 | 247 | /// Hamming similarity in the range [0, max] 248 | /// 249 | /// This is calculated as `max(len1, len2) - `[`distance`]. 250 | /// 251 | pub fn similarity(s1: Iter1, s2: Iter2) -> Result 252 | where 253 | Iter1: IntoIterator, 254 | Iter1::IntoIter: DoubleEndedIterator + Clone, 255 | Iter2: IntoIterator, 256 | Iter2::IntoIter: DoubleEndedIterator + Clone, 257 | Iter1::Item: PartialEq + HashableChar + Copy, 258 | Iter2::Item: PartialEq + HashableChar + Copy, 259 | { 260 | similarity_with_args(s1, s2, &Args::default()) 261 | } 262 | 263 | pub fn similarity_with_args( 264 | s1: Iter1, 265 | s2: Iter2, 266 | args: &Args, 267 | ) -> PaddingType::Output 268 | where 269 | Iter1: IntoIterator, 270 | Iter1::IntoIter: DoubleEndedIterator + Clone, 271 | Iter2: IntoIterator, 272 | Iter2::IntoIter: DoubleEndedIterator + Clone, 273 | Iter1::Item: PartialEq + HashableChar + Copy, 274 | Iter2::Item: PartialEq + HashableChar + Copy, 275 | CutoffType: SimilarityCutoff, 276 | PaddingType: PaddingTrait, 277 | { 278 | let s1_iter = s1.into_iter(); 279 | let s2_iter = s2.into_iter(); 280 | let len1 = s1_iter.clone().count(); 281 | let len2 = s2_iter.clone().count(); 282 | 283 | if !args.pad.pad() && len1 != len2 { 284 | return args.pad.error(); 285 | } 286 | 287 | args.pad 288 | .score(args.score_cutoff.score(IndividualComparator {}._similarity( 289 | s1_iter, 290 | len1, 291 | s2_iter, 292 | len2, 293 | args.score_cutoff.cutoff(), 294 | args.score_hint, 295 | ))) 296 | } 297 | 298 | /// Normalized Hamming distance in the range [1.0, 0.0] 299 | /// 300 | /// This is calculated as [`distance`]` / max(len1, len2)`. 
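///
/// # Examples
///
/// A small illustrative sketch, derived from the formula above and the
/// example in [`distance`]: "hamming" and "humming" differ in one of seven
/// positions.
///
/// ```
/// use rapidfuzz::distance::hamming;
///
/// let norm = hamming::normalized_distance("hamming".chars(), "humming".chars()).unwrap();
/// assert!((norm - 1.0 / 7.0).abs() < 1e-9);
/// ```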
301 | /// 302 | pub fn normalized_distance(s1: Iter1, s2: Iter2) -> Result 303 | where 304 | Iter1: IntoIterator, 305 | Iter1::IntoIter: DoubleEndedIterator + Clone, 306 | Iter2: IntoIterator, 307 | Iter2::IntoIter: DoubleEndedIterator + Clone, 308 | Iter1::Item: PartialEq + HashableChar + Copy, 309 | Iter2::Item: PartialEq + HashableChar + Copy, 310 | { 311 | normalized_distance_with_args(s1, s2, &Args::default()) 312 | } 313 | 314 | pub fn normalized_distance_with_args( 315 | s1: Iter1, 316 | s2: Iter2, 317 | args: &Args, 318 | ) -> PaddingType::Output 319 | where 320 | Iter1: IntoIterator, 321 | Iter1::IntoIter: DoubleEndedIterator + Clone, 322 | Iter2: IntoIterator, 323 | Iter2::IntoIter: DoubleEndedIterator + Clone, 324 | Iter1::Item: PartialEq + HashableChar + Copy, 325 | Iter2::Item: PartialEq + HashableChar + Copy, 326 | CutoffType: DistanceCutoff, 327 | PaddingType: PaddingTrait, 328 | { 329 | let s1_iter = s1.into_iter(); 330 | let s2_iter = s2.into_iter(); 331 | let len1 = s1_iter.clone().count(); 332 | let len2 = s2_iter.clone().count(); 333 | 334 | if !args.pad.pad() && len1 != len2 { 335 | return args.pad.error(); 336 | } 337 | 338 | args.pad.score( 339 | args.score_cutoff 340 | .score(IndividualComparator {}._normalized_distance( 341 | s1_iter, 342 | len1, 343 | s2_iter, 344 | len2, 345 | args.score_cutoff.cutoff(), 346 | args.score_hint, 347 | )), 348 | ) 349 | } 350 | 351 | /// Normalized Hamming similarity in the range [0.0, 1.0] 352 | /// 353 | /// This is calculated as `1.0 - `[`normalized_distance`]. 354 | /// 355 | pub fn normalized_similarity(s1: Iter1, s2: Iter2) -> Result 356 | where 357 | Iter1: IntoIterator, 358 | Iter1::IntoIter: DoubleEndedIterator + Clone, 359 | Iter2: IntoIterator, 360 | Iter2::IntoIter: DoubleEndedIterator + Clone, 361 | Iter1::Item: PartialEq + HashableChar + Copy, 362 | Iter2::Item: PartialEq + HashableChar + Copy, 363 | { 364 | normalized_similarity_with_args(s1, s2, &Args::default()) 365 | } 366 | 367 | pub fn normalized_similarity_with_args( 368 | s1: Iter1, 369 | s2: Iter2, 370 | args: &Args, 371 | ) -> PaddingType::Output 372 | where 373 | Iter1: IntoIterator, 374 | Iter1::IntoIter: DoubleEndedIterator + Clone, 375 | Iter2: IntoIterator, 376 | Iter2::IntoIter: DoubleEndedIterator + Clone, 377 | Iter1::Item: PartialEq + HashableChar + Copy, 378 | Iter2::Item: PartialEq + HashableChar + Copy, 379 | CutoffType: SimilarityCutoff, 380 | PaddingType: PaddingTrait, 381 | { 382 | let s1_iter = s1.into_iter(); 383 | let s2_iter = s2.into_iter(); 384 | let len1 = s1_iter.clone().count(); 385 | let len2 = s2_iter.clone().count(); 386 | 387 | if !args.pad.pad() && len1 != len2 { 388 | return args.pad.error(); 389 | } 390 | 391 | args.pad.score( 392 | args.score_cutoff 393 | .score(IndividualComparator {}._normalized_similarity( 394 | s1_iter, 395 | len1, 396 | s2_iter, 397 | len2, 398 | args.score_cutoff.cutoff(), 399 | args.score_hint, 400 | )), 401 | ) 402 | } 403 | 404 | /// `One x Many` comparisons using the Hamming distance 405 | /// 406 | /// # Examples 407 | /// 408 | /// ``` 409 | /// use rapidfuzz::distance::hamming; 410 | /// 411 | /// let scorer = hamming::BatchComparator::new("hamming".chars()); 412 | /// assert_eq!(Ok(1), scorer.distance("humming".chars())); 413 | /// ``` 414 | #[derive(Clone)] 415 | pub struct BatchComparator { 416 | s1: Vec, 417 | } 418 | 419 | impl BatchComparator 420 | where 421 | Elem1: HashableChar + Clone, 422 | { 423 | pub fn new(s1: Iter1) -> Self 424 | where 425 | Iter1: IntoIterator, 426 | { 427 | Self { 
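// Collect the first sequence once so it can be reused across many comparisons.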
428 | s1: s1.into_iter().collect(), 429 | } 430 | } 431 | 432 | /// Distance calculated similar to [`distance`] 433 | pub fn distance(&self, s2: Iter2) -> Result 434 | where 435 | Iter2: IntoIterator, 436 | Iter2::IntoIter: DoubleEndedIterator + Clone, 437 | Elem1: PartialEq + HashableChar + Copy, 438 | Iter2::Item: PartialEq + HashableChar + Copy, 439 | { 440 | distance(self.s1.iter().copied(), s2) 441 | } 442 | 443 | pub fn distance_with_args( 444 | &self, 445 | s2: Iter2, 446 | args: &Args, 447 | ) -> PaddingType::Output 448 | where 449 | Iter2: IntoIterator, 450 | Iter2::IntoIter: DoubleEndedIterator + Clone, 451 | Elem1: PartialEq + HashableChar + Copy, 452 | Iter2::Item: PartialEq + HashableChar + Copy, 453 | CutoffType: DistanceCutoff, 454 | PaddingType: PaddingTrait, 455 | { 456 | distance_with_args(self.s1.iter().copied(), s2, args) 457 | } 458 | 459 | /// Similarity calculated similar to [`similarity`] 460 | pub fn similarity(&self, s2: Iter2) -> Result 461 | where 462 | Iter2: IntoIterator, 463 | Iter2::IntoIter: DoubleEndedIterator + Clone, 464 | Elem1: PartialEq + HashableChar + Copy, 465 | Iter2::Item: PartialEq + HashableChar + Copy, 466 | { 467 | similarity(self.s1.iter().copied(), s2) 468 | } 469 | 470 | pub fn similarity_with_args( 471 | &self, 472 | s2: Iter2, 473 | args: &Args, 474 | ) -> PaddingType::Output 475 | where 476 | Iter2: IntoIterator, 477 | Iter2::IntoIter: DoubleEndedIterator + Clone, 478 | Elem1: PartialEq + HashableChar + Copy, 479 | Iter2::Item: PartialEq + HashableChar + Copy, 480 | CutoffType: SimilarityCutoff, 481 | PaddingType: PaddingTrait, 482 | { 483 | similarity_with_args(self.s1.iter().copied(), s2, args) 484 | } 485 | 486 | /// Normalized distance calculated similar to [`normalized_distance`] 487 | pub fn normalized_distance(&self, s2: Iter2) -> Result 488 | where 489 | Iter2: IntoIterator, 490 | Iter2::IntoIter: DoubleEndedIterator + Clone, 491 | Elem1: PartialEq + HashableChar + Copy, 492 | Iter2::Item: PartialEq + HashableChar + Copy, 493 | { 494 | normalized_distance(self.s1.iter().copied(), s2) 495 | } 496 | 497 | pub fn normalized_distance_with_args( 498 | &self, 499 | s2: Iter2, 500 | args: &Args, 501 | ) -> PaddingType::Output 502 | where 503 | Iter2: IntoIterator, 504 | Iter2::IntoIter: DoubleEndedIterator + Clone, 505 | Elem1: PartialEq + HashableChar + Copy, 506 | Iter2::Item: PartialEq + HashableChar + Copy, 507 | CutoffType: DistanceCutoff, 508 | PaddingType: PaddingTrait, 509 | { 510 | normalized_distance_with_args(self.s1.iter().copied(), s2, args) 511 | } 512 | 513 | /// Normalized similarity calculated similar to [`normalized_similarity`] 514 | pub fn normalized_similarity(&self, s2: Iter2) -> Result 515 | where 516 | Iter2: IntoIterator, 517 | Iter2::IntoIter: DoubleEndedIterator + Clone, 518 | Elem1: PartialEq + HashableChar + Copy, 519 | Iter2::Item: PartialEq + HashableChar + Copy, 520 | { 521 | normalized_similarity(self.s1.iter().copied(), s2) 522 | } 523 | 524 | pub fn normalized_similarity_with_args( 525 | &self, 526 | s2: Iter2, 527 | args: &Args, 528 | ) -> PaddingType::Output 529 | where 530 | Iter2: IntoIterator, 531 | Iter2::IntoIter: DoubleEndedIterator + Clone, 532 | Elem1: PartialEq + HashableChar + Copy, 533 | Iter2::Item: PartialEq + HashableChar + Copy, 534 | CutoffType: SimilarityCutoff, 535 | PaddingType: PaddingTrait, 536 | { 537 | normalized_similarity_with_args(self.s1.iter().copied(), s2, args) 538 | } 539 | } 540 | 541 | #[cfg(test)] 542 | mod tests { 543 | use super::*; 544 | 545 | fn 
assert_dist(dist: usize, str1: &str, str2: &str) { 546 | assert_eq!(Ok(dist), distance(str1.chars(), str2.chars())); 547 | } 548 | 549 | #[test] 550 | fn empty() { 551 | assert_dist(0, "", "") 552 | } 553 | 554 | #[test] 555 | fn same() { 556 | assert_dist(0, "hamming", "hamming") 557 | } 558 | 559 | #[test] 560 | fn numbers() { 561 | assert_eq!(Ok(1), distance([1, 2, 4], [1, 2, 3])); 562 | } 563 | 564 | #[test] 565 | fn diff() { 566 | assert_dist(3, "hamming", "hammers"); 567 | 568 | assert_eq!( 569 | 3, 570 | distance_with_args( 571 | "hammers".chars(), 572 | "hamming".chars(), 573 | &Args::default().pad(true) 574 | ) 575 | ); 576 | assert_eq!( 577 | Some(3), 578 | distance_with_args( 579 | "hammers".chars(), 580 | "hamming".chars(), 581 | &Args::default().pad(true).score_cutoff(3) 582 | ) 583 | ); 584 | assert_eq!( 585 | None, 586 | distance_with_args( 587 | "hammers".chars(), 588 | "hamming".chars(), 589 | &Args::default().pad(true).score_cutoff(2) 590 | ) 591 | ); 592 | assert_eq!( 593 | Ok(Some(3)), 594 | distance_with_args( 595 | "hammers".chars(), 596 | "hamming".chars(), 597 | &Args::default().score_cutoff(3) 598 | ) 599 | ); 600 | assert_eq!( 601 | Ok(None), 602 | distance_with_args( 603 | "hammers".chars(), 604 | "hamming".chars(), 605 | &Args::default().score_cutoff(2) 606 | ) 607 | ); 608 | } 609 | 610 | #[test] 611 | fn diff_multibyte() { 612 | assert_dist(2, "hamming", "h香mmüng"); 613 | } 614 | 615 | #[test] 616 | fn unequal_length() { 617 | assert_eq!( 618 | Err(Error::DifferentLengthArgs), 619 | distance("ham".chars(), "hamming".chars()) 620 | ); 621 | 622 | assert_eq!( 623 | 4, 624 | distance_with_args("ham".chars(), "hamming".chars(), &Args::default().pad(true)) 625 | ); 626 | 627 | assert_eq!( 628 | None, 629 | distance_with_args( 630 | "ham".chars(), 631 | "hamming".chars(), 632 | &Args::default().pad(true).score_cutoff(3) 633 | ) 634 | ); 635 | } 636 | 637 | #[test] 638 | fn names() { 639 | assert_dist(14, "Friedrich Nietzs", "Jean-Paul Sartre") 640 | } 641 | } 642 | -------------------------------------------------------------------------------- /src/distance/postfix.rs: -------------------------------------------------------------------------------- 1 | //! Postfix similarity 2 | //! 3 | //! The Postfix similarity measures the length of the common postfix between two 4 | //! sequences. 5 | //! 
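//!
//! A small illustrative sketch; the distance follows from
//! `distance = max(len1, len2) - similarity`:
//!
//! ```
//! use rapidfuzz::distance::postfix;
//!
//! // "postfix" and "prefix" share the common suffix "fix".
//! assert_eq!(3, postfix::similarity("postfix".chars(), "prefix".chars()));
//! // max(7, 6) - 3 = 4
//! assert_eq!(4, postfix::distance("postfix".chars(), "prefix".chars()));
//! ```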
6 | 7 | use crate::common::{DistanceCutoff, NoScoreCutoff, SimilarityCutoff, WithScoreCutoff}; 8 | use crate::details::common::find_common_suffix; 9 | use crate::details::distance::MetricUsize; 10 | use crate::HashableChar; 11 | 12 | #[must_use] 13 | #[derive(Copy, Clone, Debug)] 14 | pub struct Args { 15 | score_cutoff: CutoffType, 16 | score_hint: Option, 17 | } 18 | 19 | impl Default for Args { 20 | fn default() -> Args { 21 | Args { 22 | score_cutoff: NoScoreCutoff, 23 | score_hint: None, 24 | } 25 | } 26 | } 27 | 28 | impl Args { 29 | pub fn score_hint(mut self, score_hint: ResultType) -> Self { 30 | self.score_hint = Some(score_hint); 31 | self 32 | } 33 | 34 | pub fn score_cutoff( 35 | self, 36 | score_cutoff: ResultType, 37 | ) -> Args> { 38 | Args { 39 | score_hint: self.score_hint, 40 | score_cutoff: WithScoreCutoff(score_cutoff), 41 | } 42 | } 43 | } 44 | 45 | struct IndividualComparator; 46 | 47 | impl MetricUsize for IndividualComparator { 48 | fn maximum(&self, len1: usize, len2: usize) -> usize { 49 | len1.max(len2) 50 | } 51 | 52 | fn _similarity( 53 | &self, 54 | s1: Iter1, 55 | _len1: usize, 56 | s2: Iter2, 57 | _len2: usize, 58 | _score_cutoff: Option, 59 | _score_hint: Option, 60 | ) -> usize 61 | where 62 | Iter1: DoubleEndedIterator + Clone, 63 | Iter2: DoubleEndedIterator + Clone, 64 | Iter1::Item: PartialEq + HashableChar, 65 | Iter2::Item: PartialEq + HashableChar, 66 | { 67 | find_common_suffix(s1, s2) 68 | } 69 | } 70 | 71 | /// Postfix distance in the range [max, 0]. 72 | /// 73 | /// This is calculated as `max(len1, len2) - `[`similarity`]. 74 | /// 75 | pub fn distance(s1: Iter1, s2: Iter2) -> usize 76 | where 77 | Iter1: IntoIterator, 78 | Iter1::IntoIter: DoubleEndedIterator + Clone, 79 | Iter2: IntoIterator, 80 | Iter2::IntoIter: DoubleEndedIterator + Clone, 81 | Iter1::Item: PartialEq + HashableChar + Copy, 82 | Iter2::Item: PartialEq + HashableChar + Copy, 83 | { 84 | distance_with_args(s1, s2, &Args::default()) 85 | } 86 | 87 | pub fn distance_with_args( 88 | s1: Iter1, 89 | s2: Iter2, 90 | args: &Args, 91 | ) -> CutoffType::Output 92 | where 93 | Iter1: IntoIterator, 94 | Iter1::IntoIter: DoubleEndedIterator + Clone, 95 | Iter2: IntoIterator, 96 | Iter2::IntoIter: DoubleEndedIterator + Clone, 97 | Iter1::Item: PartialEq + HashableChar + Copy, 98 | Iter2::Item: PartialEq + HashableChar + Copy, 99 | CutoffType: DistanceCutoff, 100 | { 101 | let s1_iter = s1.into_iter(); 102 | let s2_iter = s2.into_iter(); 103 | args.score_cutoff.score(IndividualComparator {}._distance( 104 | s1_iter.clone(), 105 | s1_iter.count(), 106 | s2_iter.clone(), 107 | s2_iter.count(), 108 | args.score_cutoff.cutoff(), 109 | args.score_hint, 110 | )) 111 | } 112 | 113 | /// Postfix similarity 114 | /// 115 | /// Calculates the Postfix similarity. 
116 | /// 117 | /// # Examples 118 | /// 119 | /// ``` 120 | /// use rapidfuzz::distance::postfix; 121 | /// 122 | /// assert_eq!(3, postfix::similarity("postfix".chars(), "prefix".chars())); 123 | /// ``` 124 | pub fn similarity(s1: Iter1, s2: Iter2) -> usize 125 | where 126 | Iter1: IntoIterator, 127 | Iter1::IntoIter: DoubleEndedIterator + Clone, 128 | Iter2: IntoIterator, 129 | Iter2::IntoIter: DoubleEndedIterator + Clone, 130 | Iter1::Item: PartialEq + HashableChar + Copy, 131 | Iter2::Item: PartialEq + HashableChar + Copy, 132 | { 133 | similarity_with_args(s1, s2, &Args::default()) 134 | } 135 | 136 | pub fn similarity_with_args( 137 | s1: Iter1, 138 | s2: Iter2, 139 | args: &Args, 140 | ) -> CutoffType::Output 141 | where 142 | Iter1: IntoIterator, 143 | Iter1::IntoIter: DoubleEndedIterator + Clone, 144 | Iter2: IntoIterator, 145 | Iter2::IntoIter: DoubleEndedIterator + Clone, 146 | Iter1::Item: PartialEq + HashableChar + Copy, 147 | Iter2::Item: PartialEq + HashableChar + Copy, 148 | CutoffType: SimilarityCutoff, 149 | { 150 | let s1_iter = s1.into_iter(); 151 | let s2_iter = s2.into_iter(); 152 | args.score_cutoff.score(IndividualComparator {}._similarity( 153 | s1_iter.clone(), 154 | s1_iter.count(), 155 | s2_iter.clone(), 156 | s2_iter.count(), 157 | args.score_cutoff.cutoff(), 158 | args.score_hint, 159 | )) 160 | } 161 | 162 | /// Normalized Postfix distance in the range [1.0, 0.0] 163 | /// 164 | /// This is calculated as [`distance`]` / max(len1, len2)`. 165 | /// 166 | pub fn normalized_distance(s1: Iter1, s2: Iter2) -> f64 167 | where 168 | Iter1: IntoIterator, 169 | Iter1::IntoIter: DoubleEndedIterator + Clone, 170 | Iter2: IntoIterator, 171 | Iter2::IntoIter: DoubleEndedIterator + Clone, 172 | Iter1::Item: PartialEq + HashableChar + Copy, 173 | Iter2::Item: PartialEq + HashableChar + Copy, 174 | { 175 | normalized_distance_with_args(s1, s2, &Args::default()) 176 | } 177 | 178 | pub fn normalized_distance_with_args( 179 | s1: Iter1, 180 | s2: Iter2, 181 | args: &Args, 182 | ) -> CutoffType::Output 183 | where 184 | Iter1: IntoIterator, 185 | Iter1::IntoIter: DoubleEndedIterator + Clone, 186 | Iter2: IntoIterator, 187 | Iter2::IntoIter: DoubleEndedIterator + Clone, 188 | Iter1::Item: PartialEq + HashableChar + Copy, 189 | Iter2::Item: PartialEq + HashableChar + Copy, 190 | CutoffType: DistanceCutoff, 191 | { 192 | let s1_iter = s1.into_iter(); 193 | let s2_iter = s2.into_iter(); 194 | args.score_cutoff 195 | .score(IndividualComparator {}._normalized_distance( 196 | s1_iter.clone(), 197 | s1_iter.count(), 198 | s2_iter.clone(), 199 | s2_iter.count(), 200 | args.score_cutoff.cutoff(), 201 | args.score_hint, 202 | )) 203 | } 204 | 205 | /// Normalized Postfix similarity in the range [0.0, 1.0] 206 | /// 207 | /// This is calculated as `1.0 - `[`normalized_distance`]. 
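///
/// # Examples
///
/// A small illustrative sketch following from the relation above: the common
/// suffix "fix" gives `1.0 - 4/7 = 3/7`.
///
/// ```
/// use rapidfuzz::distance::postfix;
///
/// let sim = postfix::normalized_similarity("postfix".chars(), "prefix".chars());
/// assert!((sim - 3.0 / 7.0).abs() < 1e-9);
/// ```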
208 | /// 209 | pub fn normalized_similarity(s1: Iter1, s2: Iter2) -> f64 210 | where 211 | Iter1: IntoIterator, 212 | Iter1::IntoIter: DoubleEndedIterator + Clone, 213 | Iter2: IntoIterator, 214 | Iter2::IntoIter: DoubleEndedIterator + Clone, 215 | Iter1::Item: PartialEq + HashableChar + Copy, 216 | Iter2::Item: PartialEq + HashableChar + Copy, 217 | { 218 | normalized_similarity_with_args(s1, s2, &Args::default()) 219 | } 220 | 221 | pub fn normalized_similarity_with_args( 222 | s1: Iter1, 223 | s2: Iter2, 224 | args: &Args, 225 | ) -> CutoffType::Output 226 | where 227 | Iter1: IntoIterator, 228 | Iter1::IntoIter: DoubleEndedIterator + Clone, 229 | Iter2: IntoIterator, 230 | Iter2::IntoIter: DoubleEndedIterator + Clone, 231 | Iter1::Item: PartialEq + HashableChar + Copy, 232 | Iter2::Item: PartialEq + HashableChar + Copy, 233 | CutoffType: SimilarityCutoff, 234 | { 235 | let s1_iter = s1.into_iter(); 236 | let s2_iter = s2.into_iter(); 237 | args.score_cutoff 238 | .score(IndividualComparator {}._normalized_similarity( 239 | s1_iter.clone(), 240 | s1_iter.count(), 241 | s2_iter.clone(), 242 | s2_iter.count(), 243 | args.score_cutoff.cutoff(), 244 | args.score_hint, 245 | )) 246 | } 247 | 248 | /// `One x Many` comparisons using the Postfix similarity 249 | /// 250 | /// # Examples 251 | /// 252 | /// ``` 253 | /// use rapidfuzz::distance::postfix; 254 | /// 255 | /// let scorer = postfix::BatchComparator::new("postfix".chars()); 256 | /// assert_eq!(3, scorer.similarity("prefix".chars())); 257 | /// ``` 258 | #[derive(Clone)] 259 | pub struct BatchComparator { 260 | s1: Vec, 261 | } 262 | 263 | impl BatchComparator 264 | where 265 | Elem1: HashableChar + Clone, 266 | { 267 | pub fn new(s1: Iter1) -> Self 268 | where 269 | Iter1: IntoIterator, 270 | Iter1::IntoIter: Clone, 271 | { 272 | let s1_iter = s1.into_iter(); 273 | Self { 274 | s1: s1_iter.collect(), 275 | } 276 | } 277 | 278 | /// Normalized distance calculated similar to [`normalized_distance`] 279 | pub fn normalized_distance(&self, s2: Iter2) -> f64 280 | where 281 | Iter2: IntoIterator, 282 | Iter2::IntoIter: DoubleEndedIterator + Clone, 283 | Elem1: PartialEq + HashableChar + Copy, 284 | Iter2::Item: PartialEq + HashableChar + Copy, 285 | { 286 | normalized_distance(self.s1.iter().copied(), s2) 287 | } 288 | 289 | pub fn normalized_distance_with_args( 290 | &self, 291 | s2: Iter2, 292 | args: &Args, 293 | ) -> CutoffType::Output 294 | where 295 | Iter2: IntoIterator, 296 | Iter2::IntoIter: DoubleEndedIterator + Clone, 297 | Elem1: PartialEq + HashableChar + Copy, 298 | Iter2::Item: PartialEq + HashableChar + Copy, 299 | CutoffType: DistanceCutoff, 300 | { 301 | normalized_distance_with_args(self.s1.iter().copied(), s2, args) 302 | } 303 | 304 | /// Normalized similarity calculated similar to [`normalized_similarity`] 305 | pub fn normalized_similarity(&self, s2: Iter2) -> f64 306 | where 307 | Iter2: IntoIterator, 308 | Iter2::IntoIter: DoubleEndedIterator + Clone, 309 | Elem1: PartialEq + HashableChar + Copy, 310 | Iter2::Item: PartialEq + HashableChar + Copy, 311 | { 312 | normalized_similarity(self.s1.iter().copied(), s2) 313 | } 314 | 315 | pub fn normalized_similarity_with_args( 316 | &self, 317 | s2: Iter2, 318 | args: &Args, 319 | ) -> CutoffType::Output 320 | where 321 | Iter2: IntoIterator, 322 | Iter2::IntoIter: DoubleEndedIterator + Clone, 323 | Elem1: PartialEq + HashableChar + Copy, 324 | Iter2::Item: PartialEq + HashableChar + Copy, 325 | CutoffType: SimilarityCutoff, 326 | { 327 | 
normalized_similarity_with_args(self.s1.iter().copied(), s2, args) 328 | } 329 | 330 | /// Distance calculated similar to [`distance`] 331 | pub fn distance(&self, s2: Iter2) -> usize 332 | where 333 | Iter2: IntoIterator, 334 | Iter2::IntoIter: DoubleEndedIterator + Clone, 335 | Elem1: PartialEq + HashableChar + Copy, 336 | Iter2::Item: PartialEq + HashableChar + Copy, 337 | { 338 | distance(self.s1.iter().copied(), s2) 339 | } 340 | 341 | pub fn distance_with_args( 342 | &self, 343 | s2: Iter2, 344 | args: &Args, 345 | ) -> CutoffType::Output 346 | where 347 | Iter2: IntoIterator, 348 | Iter2::IntoIter: DoubleEndedIterator + Clone, 349 | Elem1: PartialEq + HashableChar + Copy, 350 | Iter2::Item: PartialEq + HashableChar + Copy, 351 | CutoffType: DistanceCutoff, 352 | { 353 | distance_with_args(self.s1.iter().copied(), s2, args) 354 | } 355 | 356 | /// Similarity calculated similar to [`similarity`] 357 | pub fn similarity(&self, s2: Iter2) -> usize 358 | where 359 | Iter2: IntoIterator, 360 | Iter2::IntoIter: DoubleEndedIterator + Clone, 361 | Elem1: PartialEq + HashableChar + Copy, 362 | Iter2::Item: PartialEq + HashableChar + Copy, 363 | { 364 | similarity(self.s1.iter().copied(), s2) 365 | } 366 | 367 | pub fn similarity_with_args( 368 | &self, 369 | s2: Iter2, 370 | args: &Args, 371 | ) -> CutoffType::Output 372 | where 373 | Iter2: IntoIterator, 374 | Iter2::IntoIter: DoubleEndedIterator + Clone, 375 | Elem1: PartialEq + HashableChar + Copy, 376 | Iter2::Item: PartialEq + HashableChar + Copy, 377 | CutoffType: SimilarityCutoff, 378 | { 379 | similarity_with_args(self.s1.iter().copied(), s2, args) 380 | } 381 | } 382 | -------------------------------------------------------------------------------- /src/distance/prefix.rs: -------------------------------------------------------------------------------- 1 | //! Prefix similarity 2 | //! 3 | //! The Prefix similarity measures the length of the common prefix between two 4 | //! sequences. 5 | //! 6 | 7 | use crate::common::{DistanceCutoff, NoScoreCutoff, SimilarityCutoff, WithScoreCutoff}; 8 | use crate::details::common::find_common_prefix; 9 | use crate::details::distance::MetricUsize; 10 | use crate::HashableChar; 11 | 12 | #[must_use] 13 | #[derive(Copy, Clone, Debug)] 14 | pub struct Args { 15 | score_cutoff: CutoffType, 16 | score_hint: Option, 17 | } 18 | 19 | impl Default for Args { 20 | fn default() -> Args { 21 | Args { 22 | score_cutoff: NoScoreCutoff, 23 | score_hint: None, 24 | } 25 | } 26 | } 27 | 28 | impl Args { 29 | pub fn score_hint(mut self, score_hint: ResultType) -> Self { 30 | self.score_hint = Some(score_hint); 31 | self 32 | } 33 | 34 | pub fn score_cutoff( 35 | self, 36 | score_cutoff: ResultType, 37 | ) -> Args> { 38 | Args { 39 | score_hint: self.score_hint, 40 | score_cutoff: WithScoreCutoff(score_cutoff), 41 | } 42 | } 43 | } 44 | 45 | struct IndividualComparator; 46 | 47 | impl MetricUsize for IndividualComparator { 48 | fn maximum(&self, len1: usize, len2: usize) -> usize { 49 | len1.max(len2) 50 | } 51 | 52 | fn _similarity( 53 | &self, 54 | s1: Iter1, 55 | _len1: usize, 56 | s2: Iter2, 57 | _len2: usize, 58 | _score_cutoff: Option, 59 | _score_hint: Option, 60 | ) -> usize 61 | where 62 | Iter1: Iterator + Clone, 63 | Iter2: Iterator + Clone, 64 | Iter1::Item: PartialEq + HashableChar, 65 | Iter2::Item: PartialEq + HashableChar, 66 | { 67 | find_common_prefix(s1, s2) 68 | } 69 | } 70 | 71 | /// Prefix distance in the range [max, 0]. 
72 | /// 73 | /// This is calculated as `max(len1, len2) - `[`similarity`]. 74 | /// 75 | pub fn distance(s1: Iter1, s2: Iter2) -> usize 76 | where 77 | Iter1: IntoIterator, 78 | Iter1::IntoIter: DoubleEndedIterator + Clone, 79 | Iter2: IntoIterator, 80 | Iter2::IntoIter: DoubleEndedIterator + Clone, 81 | Iter1::Item: PartialEq + HashableChar + Copy, 82 | Iter2::Item: PartialEq + HashableChar + Copy, 83 | { 84 | distance_with_args(s1, s2, &Args::default()) 85 | } 86 | 87 | pub fn distance_with_args( 88 | s1: Iter1, 89 | s2: Iter2, 90 | args: &Args, 91 | ) -> CutoffType::Output 92 | where 93 | Iter1: IntoIterator, 94 | Iter1::IntoIter: DoubleEndedIterator + Clone, 95 | Iter2: IntoIterator, 96 | Iter2::IntoIter: DoubleEndedIterator + Clone, 97 | Iter1::Item: PartialEq + HashableChar + Copy, 98 | Iter2::Item: PartialEq + HashableChar + Copy, 99 | CutoffType: DistanceCutoff, 100 | { 101 | let s1_iter = s1.into_iter(); 102 | let s2_iter = s2.into_iter(); 103 | args.score_cutoff.score(IndividualComparator {}._distance( 104 | s1_iter.clone(), 105 | s1_iter.count(), 106 | s2_iter.clone(), 107 | s2_iter.count(), 108 | args.score_cutoff.cutoff(), 109 | args.score_hint, 110 | )) 111 | } 112 | 113 | /// Prefix similarity 114 | /// 115 | /// Calculates the Prefix similarity. 116 | /// 117 | /// # Examples 118 | /// 119 | /// ``` 120 | /// use rapidfuzz::distance::prefix; 121 | /// 122 | /// assert_eq!(4, prefix::similarity("prefix".chars(), "preference".chars())); 123 | /// ``` 124 | pub fn similarity(s1: Iter1, s2: Iter2) -> usize 125 | where 126 | Iter1: IntoIterator, 127 | Iter1::IntoIter: DoubleEndedIterator + Clone, 128 | Iter2: IntoIterator, 129 | Iter2::IntoIter: DoubleEndedIterator + Clone, 130 | Iter1::Item: PartialEq + HashableChar + Copy, 131 | Iter2::Item: PartialEq + HashableChar + Copy, 132 | { 133 | similarity_with_args(s1, s2, &Args::default()) 134 | } 135 | 136 | pub fn similarity_with_args( 137 | s1: Iter1, 138 | s2: Iter2, 139 | args: &Args, 140 | ) -> CutoffType::Output 141 | where 142 | Iter1: IntoIterator, 143 | Iter1::IntoIter: DoubleEndedIterator + Clone, 144 | Iter2: IntoIterator, 145 | Iter2::IntoIter: DoubleEndedIterator + Clone, 146 | Iter1::Item: PartialEq + HashableChar + Copy, 147 | Iter2::Item: PartialEq + HashableChar + Copy, 148 | CutoffType: SimilarityCutoff, 149 | { 150 | let s1_iter = s1.into_iter(); 151 | let s2_iter = s2.into_iter(); 152 | args.score_cutoff.score(IndividualComparator {}._similarity( 153 | s1_iter.clone(), 154 | s1_iter.count(), 155 | s2_iter.clone(), 156 | s2_iter.count(), 157 | args.score_cutoff.cutoff(), 158 | args.score_hint, 159 | )) 160 | } 161 | 162 | /// Normalized Prefix distance in the range [1.0, 0.0] 163 | /// 164 | /// This is calculated as [`distance`]` / max(len1, len2)`. 
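///
/// # Examples
///
/// A small illustrative sketch: "prefix" and "preference" share the prefix
/// "pref", so the distance is `10 - 4 = 6` and the normalized distance `0.6`.
///
/// ```
/// use rapidfuzz::distance::prefix;
///
/// let norm = prefix::normalized_distance("prefix".chars(), "preference".chars());
/// assert!((norm - 0.6).abs() < 1e-9);
/// ```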
165 | /// 166 | pub fn normalized_distance(s1: Iter1, s2: Iter2) -> f64 167 | where 168 | Iter1: IntoIterator, 169 | Iter1::IntoIter: DoubleEndedIterator + Clone, 170 | Iter2: IntoIterator, 171 | Iter2::IntoIter: DoubleEndedIterator + Clone, 172 | Iter1::Item: PartialEq + HashableChar + Copy, 173 | Iter2::Item: PartialEq + HashableChar + Copy, 174 | { 175 | normalized_distance_with_args(s1, s2, &Args::default()) 176 | } 177 | 178 | pub fn normalized_distance_with_args( 179 | s1: Iter1, 180 | s2: Iter2, 181 | args: &Args, 182 | ) -> CutoffType::Output 183 | where 184 | Iter1: IntoIterator, 185 | Iter1::IntoIter: DoubleEndedIterator + Clone, 186 | Iter2: IntoIterator, 187 | Iter2::IntoIter: DoubleEndedIterator + Clone, 188 | Iter1::Item: PartialEq + HashableChar + Copy, 189 | Iter2::Item: PartialEq + HashableChar + Copy, 190 | CutoffType: DistanceCutoff, 191 | { 192 | let s1_iter = s1.into_iter(); 193 | let s2_iter = s2.into_iter(); 194 | args.score_cutoff 195 | .score(IndividualComparator {}._normalized_distance( 196 | s1_iter.clone(), 197 | s1_iter.count(), 198 | s2_iter.clone(), 199 | s2_iter.count(), 200 | args.score_cutoff.cutoff(), 201 | args.score_hint, 202 | )) 203 | } 204 | 205 | /// Normalized Prefix similarity in the range [0.0, 1.0] 206 | /// 207 | /// This is calculated as `1.0 - `[`normalized_distance`]. 208 | /// 209 | pub fn normalized_similarity(s1: Iter1, s2: Iter2) -> f64 210 | where 211 | Iter1: IntoIterator, 212 | Iter1::IntoIter: DoubleEndedIterator + Clone, 213 | Iter2: IntoIterator, 214 | Iter2::IntoIter: DoubleEndedIterator + Clone, 215 | Iter1::Item: PartialEq + HashableChar + Copy, 216 | Iter2::Item: PartialEq + HashableChar + Copy, 217 | { 218 | normalized_similarity_with_args(s1, s2, &Args::default()) 219 | } 220 | 221 | pub fn normalized_similarity_with_args( 222 | s1: Iter1, 223 | s2: Iter2, 224 | args: &Args, 225 | ) -> CutoffType::Output 226 | where 227 | Iter1: IntoIterator, 228 | Iter1::IntoIter: DoubleEndedIterator + Clone, 229 | Iter2: IntoIterator, 230 | Iter2::IntoIter: DoubleEndedIterator + Clone, 231 | Iter1::Item: PartialEq + HashableChar + Copy, 232 | Iter2::Item: PartialEq + HashableChar + Copy, 233 | CutoffType: SimilarityCutoff, 234 | { 235 | let s1_iter = s1.into_iter(); 236 | let s2_iter = s2.into_iter(); 237 | args.score_cutoff 238 | .score(IndividualComparator {}._normalized_similarity( 239 | s1_iter.clone(), 240 | s1_iter.count(), 241 | s2_iter.clone(), 242 | s2_iter.count(), 243 | args.score_cutoff.cutoff(), 244 | args.score_hint, 245 | )) 246 | } 247 | 248 | /// `One x Many` comparisons using the Prefix similarity 249 | /// 250 | /// # Examples 251 | /// 252 | /// ``` 253 | /// use rapidfuzz::distance::prefix; 254 | /// 255 | /// let scorer = prefix::BatchComparator::new("prefix".chars()); 256 | /// assert_eq!(4, scorer.similarity("preference".chars())); 257 | /// ``` 258 | #[derive(Clone)] 259 | pub struct BatchComparator { 260 | s1: Vec, 261 | } 262 | 263 | impl BatchComparator 264 | where 265 | Elem1: HashableChar + Clone, 266 | { 267 | pub fn new(s1: Iter1) -> Self 268 | where 269 | Iter1: IntoIterator, 270 | Iter1::IntoIter: Clone, 271 | { 272 | let s1_iter = s1.into_iter(); 273 | Self { 274 | s1: s1_iter.collect(), 275 | } 276 | } 277 | 278 | /// Normalized distance calculated similar to [`normalized_distance`] 279 | pub fn normalized_distance(&self, s2: Iter2) -> f64 280 | where 281 | Iter2: IntoIterator, 282 | Iter2::IntoIter: DoubleEndedIterator + Clone, 283 | Elem1: PartialEq + HashableChar + Copy, 284 | Iter2::Item: PartialEq + 
HashableChar + Copy, 285 | { 286 | normalized_distance(self.s1.iter().copied(), s2) 287 | } 288 | 289 | pub fn normalized_distance_with_args( 290 | &self, 291 | s2: Iter2, 292 | args: &Args, 293 | ) -> CutoffType::Output 294 | where 295 | Iter2: IntoIterator, 296 | Iter2::IntoIter: DoubleEndedIterator + Clone, 297 | Elem1: PartialEq + HashableChar + Copy, 298 | Iter2::Item: PartialEq + HashableChar + Copy, 299 | CutoffType: DistanceCutoff, 300 | { 301 | normalized_distance_with_args(self.s1.iter().copied(), s2, args) 302 | } 303 | 304 | /// Normalized similarity calculated similar to [`normalized_similarity`] 305 | pub fn normalized_similarity(&self, s2: Iter2) -> f64 306 | where 307 | Iter2: IntoIterator, 308 | Iter2::IntoIter: DoubleEndedIterator + Clone, 309 | Elem1: PartialEq + HashableChar + Copy, 310 | Iter2::Item: PartialEq + HashableChar + Copy, 311 | { 312 | normalized_similarity(self.s1.iter().copied(), s2) 313 | } 314 | 315 | pub fn normalized_similarity_with_args( 316 | &self, 317 | s2: Iter2, 318 | args: &Args, 319 | ) -> CutoffType::Output 320 | where 321 | Iter2: IntoIterator, 322 | Iter2::IntoIter: DoubleEndedIterator + Clone, 323 | Elem1: PartialEq + HashableChar + Copy, 324 | Iter2::Item: PartialEq + HashableChar + Copy, 325 | CutoffType: SimilarityCutoff, 326 | { 327 | normalized_similarity_with_args(self.s1.iter().copied(), s2, args) 328 | } 329 | 330 | /// Distance calculated similar to [`distance`] 331 | pub fn distance(&self, s2: Iter2) -> usize 332 | where 333 | Iter2: IntoIterator, 334 | Iter2::IntoIter: DoubleEndedIterator + Clone, 335 | Elem1: PartialEq + HashableChar + Copy, 336 | Iter2::Item: PartialEq + HashableChar + Copy, 337 | { 338 | distance(self.s1.iter().copied(), s2) 339 | } 340 | 341 | pub fn distance_with_args( 342 | &self, 343 | s2: Iter2, 344 | args: &Args, 345 | ) -> CutoffType::Output 346 | where 347 | Iter2: IntoIterator, 348 | Iter2::IntoIter: DoubleEndedIterator + Clone, 349 | Elem1: PartialEq + HashableChar + Copy, 350 | Iter2::Item: PartialEq + HashableChar + Copy, 351 | CutoffType: DistanceCutoff, 352 | { 353 | distance_with_args(self.s1.iter().copied(), s2, args) 354 | } 355 | 356 | /// Similarity calculated similar to [`similarity`] 357 | pub fn similarity(&self, s2: Iter2) -> usize 358 | where 359 | Iter2: IntoIterator, 360 | Iter2::IntoIter: DoubleEndedIterator + Clone, 361 | Elem1: PartialEq + HashableChar + Copy, 362 | Iter2::Item: PartialEq + HashableChar + Copy, 363 | { 364 | similarity(self.s1.iter().copied(), s2) 365 | } 366 | 367 | pub fn similarity_with_args( 368 | &self, 369 | s2: Iter2, 370 | args: &Args, 371 | ) -> CutoffType::Output 372 | where 373 | Iter2: IntoIterator, 374 | Iter2::IntoIter: DoubleEndedIterator + Clone, 375 | Elem1: PartialEq + HashableChar + Copy, 376 | Iter2::Item: PartialEq + HashableChar + Copy, 377 | CutoffType: SimilarityCutoff, 378 | { 379 | similarity_with_args(self.s1.iter().copied(), s2, args) 380 | } 381 | } 382 | -------------------------------------------------------------------------------- /src/fuzz.rs: -------------------------------------------------------------------------------- 1 | use crate::common::{NoScoreCutoff, SimilarityCutoff, WithScoreCutoff}; 2 | use crate::details::distance::MetricUsize; 3 | use crate::distance::indel; 4 | use crate::HashableChar; 5 | 6 | #[must_use] 7 | #[derive(Clone, Copy, Debug)] 8 | pub struct Args { 9 | score_cutoff: CutoffType, 10 | score_hint: Option, 11 | } 12 | 13 | impl Default for Args { 14 | fn default() -> Args { 15 | Args { 16 | score_cutoff: 
NoScoreCutoff, 17 | score_hint: None, 18 | } 19 | } 20 | } 21 | 22 | impl Args { 23 | pub fn score_hint(mut self, score_hint: ResultType) -> Self { 24 | self.score_hint = Some(score_hint); 25 | self 26 | } 27 | 28 | pub fn score_cutoff( 29 | self, 30 | score_cutoff: ResultType, 31 | ) -> Args> { 32 | Args { 33 | score_hint: self.score_hint, 34 | score_cutoff: WithScoreCutoff(score_cutoff), 35 | } 36 | } 37 | } 38 | 39 | /// Returns a simple ratio between two strings or `None` if `ratio < score_cutoff` 40 | /// 41 | /// # Example 42 | /// ``` 43 | /// use rapidfuzz::fuzz; 44 | /// /// score is 0.9655 45 | /// let score = fuzz::ratio("this is a test".chars(), "this is a test!".chars()); 46 | /// ``` 47 | /// 48 | pub fn ratio(s1: Iter1, s2: Iter2) -> f64 49 | where 50 | Iter1: IntoIterator, 51 | Iter1::IntoIter: DoubleEndedIterator + Clone, 52 | Iter2: IntoIterator, 53 | Iter2::IntoIter: DoubleEndedIterator + Clone, 54 | Iter1::Item: PartialEq + HashableChar + Copy, 55 | Iter2::Item: PartialEq + HashableChar + Copy, 56 | { 57 | ratio_with_args(s1, s2, &Args::default()) 58 | } 59 | 60 | pub fn ratio_with_args( 61 | s1: Iter1, 62 | s2: Iter2, 63 | args: &Args, 64 | ) -> CutoffType::Output 65 | where 66 | Iter1: IntoIterator, 67 | Iter1::IntoIter: DoubleEndedIterator + Clone, 68 | Iter2: IntoIterator, 69 | Iter2::IntoIter: DoubleEndedIterator + Clone, 70 | Iter1::Item: PartialEq + HashableChar + Copy, 71 | Iter2::Item: PartialEq + HashableChar + Copy, 72 | CutoffType: SimilarityCutoff, 73 | { 74 | let s1_iter = s1.into_iter(); 75 | let s2_iter = s2.into_iter(); 76 | args.score_cutoff 77 | .score(indel::IndividualComparator {}._normalized_similarity( 78 | s1_iter.clone(), 79 | s1_iter.count(), 80 | s2_iter.clone(), 81 | s2_iter.count(), 82 | args.score_cutoff.cutoff(), 83 | args.score_hint, 84 | )) 85 | } 86 | 87 | /// `One x Many` comparisons using `ratio` 88 | /// 89 | /// # Examples 90 | /// 91 | /// ``` 92 | /// use rapidfuzz::fuzz; 93 | /// 94 | /// let scorer = fuzz::RatioBatchComparator::new("this is a test".chars()); 95 | /// /// score is 0.9655 96 | /// let score = scorer.similarity("this is a test!".chars()); 97 | /// ``` 98 | pub struct RatioBatchComparator { 99 | scorer: indel::BatchComparator, 100 | } 101 | 102 | impl RatioBatchComparator 103 | where 104 | Elem1: HashableChar + Clone, 105 | { 106 | pub fn new(s1: Iter1) -> Self 107 | where 108 | Iter1: IntoIterator, 109 | Iter1::IntoIter: Clone, 110 | { 111 | Self { 112 | scorer: indel::BatchComparator::new(s1), 113 | } 114 | } 115 | 116 | /// Similarity calculated similar to [`ratio`] 117 | pub fn similarity(&self, s2: Iter2) -> f64 118 | where 119 | Iter2: IntoIterator, 120 | Iter2::IntoIter: DoubleEndedIterator + Clone, 121 | Elem1: PartialEq + HashableChar + Copy, 122 | Iter2::Item: PartialEq + HashableChar + Copy, 123 | { 124 | self.similarity_with_args(s2, &Args::default()) 125 | } 126 | 127 | pub fn similarity_with_args( 128 | &self, 129 | s2: Iter2, 130 | args: &Args, 131 | ) -> CutoffType::Output 132 | where 133 | Iter2: IntoIterator, 134 | Iter2::IntoIter: DoubleEndedIterator + Clone, 135 | Elem1: PartialEq + HashableChar + Copy, 136 | Iter2::Item: PartialEq + HashableChar + Copy, 137 | CutoffType: SimilarityCutoff, 138 | { 139 | let s2_iter = s2.into_iter(); 140 | args.score_cutoff 141 | .score(self.scorer.scorer._normalized_similarity( 142 | self.scorer.scorer.s1.iter().copied(), 143 | self.scorer.scorer.s1.len(), 144 | s2_iter.clone(), 145 | s2_iter.count(), 146 | args.score_cutoff.cutoff(), 147 | args.score_hint, 148 | 
)) 149 | } 150 | } 151 | 152 | #[cfg(test)] 153 | mod tests { 154 | use super::*; 155 | 156 | static S1: &str = "new york mets"; 157 | static S3: &str = "the wonderful new york mets"; 158 | //static S4: &str = "new york mets vs atlanta braves"; 159 | //static S5: &str = "atlanta braves vs new york mets"; 160 | //static S7: &str = "new york city mets - atlanta braves"; 161 | // test silly corner cases 162 | static S8: &str = "{"; 163 | static S9: &str = "{a"; 164 | //static S10: &str = "a{"; 165 | //static S10A: &str = "{b"; 166 | 167 | macro_rules! assert_delta { 168 | ($x:expr, $y:expr) => { 169 | match ($x, $y) { 170 | (None, None) => {} 171 | (Some(val1), Some(val2)) => { 172 | if (val1 - val2).abs() > 0.0001 { 173 | panic!("{:?} != {:?}", $x, $y); 174 | } 175 | } 176 | (_, _) => panic!("{:?} != {:?}", $x, $y), 177 | } 178 | }; 179 | } 180 | 181 | #[test] 182 | fn test_equal() { 183 | assert_delta!( 184 | Some(1.0), 185 | Some(ratio_with_args(S1.chars(), S1.chars(), &Args::default())) 186 | ); 187 | assert_delta!( 188 | Some(1.0), 189 | Some(ratio_with_args( 190 | "test".chars(), 191 | "test".chars(), 192 | &Args::default() 193 | )) 194 | ); 195 | assert_delta!( 196 | Some(1.0), 197 | Some(ratio_with_args(S8.chars(), S8.chars(), &Args::default())) 198 | ); 199 | assert_delta!( 200 | Some(1.0), 201 | Some(ratio_with_args(S9.chars(), S9.chars(), &Args::default())) 202 | ); 203 | } 204 | 205 | #[test] 206 | fn test_partial_ratio() { 207 | //assert_delta!(Some(1.0), partial_ratio(S1.chars(), S1.chars(), None, None)); 208 | assert_delta!( 209 | Some(0.65), 210 | Some(ratio_with_args(S1.chars(), S3.chars(), &Args::default())) 211 | ); 212 | //assert_delta!(Some(1.0), partial_ratio(S1.chars(), S3.chars(), None, None)); 213 | } 214 | 215 | #[test] 216 | fn two_empty_strings() { 217 | assert_delta!( 218 | Some(1.0), 219 | Some(ratio_with_args("".chars(), "".chars(), &Args::default())) 220 | ); 221 | } 222 | 223 | #[test] 224 | fn first_string_empty() { 225 | assert_delta!( 226 | Some(0.0), 227 | Some(ratio_with_args( 228 | "test".chars(), 229 | "".chars(), 230 | &Args::default() 231 | )) 232 | ); 233 | } 234 | 235 | #[test] 236 | fn second_string_empty() { 237 | assert_delta!( 238 | Some(0.0), 239 | Some(ratio_with_args( 240 | "".chars(), 241 | "test".chars(), 242 | &Args::default() 243 | )) 244 | ); 245 | } 246 | 247 | // https://github.com/rapidfuzz/RapidFuzz/issues/206 248 | #[test] 249 | fn issue206() { 250 | let str1 = "South Korea"; 251 | let str2 = "North Korea"; 252 | 253 | { 254 | let score = ratio(str1.chars(), str2.chars()); 255 | 256 | assert_eq!( 257 | None, 258 | ratio_with_args( 259 | str1.chars(), 260 | str2.chars(), 261 | &Args::default().score_cutoff(score + 0.0001) 262 | ) 263 | ); 264 | assert_delta!( 265 | Some(score), 266 | ratio_with_args( 267 | str1.chars(), 268 | str2.chars(), 269 | &Args::default().score_cutoff(score - 0.0001) 270 | ) 271 | ); 272 | } 273 | } 274 | 275 | // https://github.com/rapidfuzz/RapidFuzz/issues/210 276 | #[test] 277 | fn issue210() { 278 | let str1 = "bc"; 279 | let str2 = "bca"; 280 | 281 | { 282 | let score = ratio(str1.chars(), str2.chars()); 283 | 284 | assert_eq!( 285 | None, 286 | ratio_with_args( 287 | str1.chars(), 288 | str2.chars(), 289 | &Args::default().score_cutoff(score + 0.0001) 290 | ) 291 | ); 292 | assert_delta!( 293 | Some(score), 294 | ratio_with_args( 295 | str1.chars(), 296 | str2.chars(), 297 | &Args::default().score_cutoff(score - 0.0001) 298 | ) 299 | ); 300 | } 301 | } 302 | } 303 | 
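Taken together, `ratio`, `ratio_with_args` and `RatioBatchComparator` cover the common call patterns of this module. The standalone snippet below is an illustrative sketch of those patterns, not a file from the repository; the `fn main` wrapper, the 0.8 cutoff and the tolerance values are arbitrary choices, while the expected scores (about 0.9655 and 0.65) come from the doc comment and tests above.

```rust
use rapidfuzz::fuzz;

fn main() {
    // Indel-based normalized similarity in the range [0.0, 1.0];
    // the doc example above puts this score at roughly 0.9655.
    let score = fuzz::ratio("this is a test".chars(), "this is a test!".chars());
    assert!((score - 0.9655).abs() < 0.0001);

    // With a score_cutoff the result becomes an Option<f64>:
    // None when the similarity falls below the cutoff.
    let filtered = fuzz::ratio_with_args(
        "new york mets".chars(),
        "the wonderful new york mets".chars(),
        &fuzz::Args::default().score_cutoff(0.8),
    );
    assert_eq!(None, filtered); // the tests above put this pair at ~0.65

    // One-to-many comparisons reuse the preprocessed first sequence.
    let scorer = fuzz::RatioBatchComparator::new("this is a test".chars());
    assert!(scorer.similarity("this is a test!".chars()) > 0.9);
}
```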
-------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | //! `RapidFuzz` is a general purpose string matching library with implementations 2 | //! for Rust, C++ and Python. 3 | //! 4 | //! ## Key Features 5 | //! 6 | //! - **Diverse String Metrics**: Offers a variety of string metrics 7 | //! to suit different use cases. These range from the Levenshtein 8 | //! distance for edit-based comparisons to the Jaro-Winkler similarity for 9 | //! more nuanced similarity assessments. 10 | //! - **Optimized for Speed**: The library is designed with performance in mind. 11 | //! Each implementation is carefully tuned to ensure optimal performance, 12 | //! making it suitable for the analysis of large datasets. 13 | //! - **Easy to use**: The API is designed to be simple to use, while still giving 14 | //! the implementation room for optimization. 15 | //! 16 | //! ## Installation 17 | //! 18 | //! The installation is as simple as: 19 | //! ```console 20 | //! $ cargo add rapidfuzz 21 | //! ``` 22 | //! 23 | //! ## Usage 24 | //! 25 | //! The following examples show the usage with the [`Levenshtein`] distance. Other metrics 26 | //! can be found in the [`fuzz`] and [`distance`] modules. 27 | //! 28 | //! ```rust 29 | //! use rapidfuzz::distance::levenshtein; 30 | //! 31 | //! // Perform a simple comparison using the Levenshtein distance 32 | //! assert_eq!( 33 | //! 3, 34 | //! levenshtein::distance("kitten".chars(), "sitting".chars()) 35 | //! ); 36 | //! 37 | //! // If you are sure the input strings are ASCII only, it's usually faster to operate on bytes 38 | //! assert_eq!( 39 | //! 3, 40 | //! levenshtein::distance("kitten".bytes(), "sitting".bytes()) 41 | //! ); 42 | //! 43 | //! // You can provide a score_cutoff value to filter out strings whose distance is worse than 44 | //! // the score_cutoff 45 | //! assert_eq!( 46 | //! None, 47 | //! levenshtein::distance_with_args( 48 | //! "kitten".chars(), 49 | //! "sitting".chars(), 50 | //! &levenshtein::Args::default().score_cutoff(2) 51 | //! ) 52 | //! ); 53 | //! 54 | //! // You can provide a score_hint to tell the implementation about the expected score. 55 | //! // This can be used to select a more performant implementation internally, but might cause 56 | //! // a slowdown in cases where the distance is actually worse than the score_hint 57 | //! assert_eq!( 58 | //! 3, 59 | //! levenshtein::distance_with_args( 60 | //! "kitten".chars(), 61 | //! "sitting".chars(), 62 | //! &levenshtein::Args::default().score_hint(2) 63 | //! ) 64 | //! ); 65 | //! 66 | //! // When comparing a single string to multiple strings you can use the 67 | //! // provided `BatchComparators`. These can cache part of the calculation, 68 | //! // which can provide significant speedups 69 | //! let scorer = levenshtein::BatchComparator::new("kitten".chars()); 70 | //! assert_eq!(3, scorer.distance("sitting".chars())); 71 | //! assert_eq!(0, scorer.distance("kitten".chars())); 72 | //! ``` 73 | //! 74 | //! [`Levenshtein`]: distance/levenshtein/index.html 75 | //! [`fuzz`]: fuzz/index.html 76 | //! [`distance`]: distance/index.html 77 | 78 | #![forbid(unsafe_code)] 79 | #![allow( 80 | // these casts are sometimes needed.
They restrict the length of input iterators 81 | // but there isn't really any way around this except for always working with 82 | // 128 bit types 83 | clippy::cast_possible_truncation, 84 | clippy::cast_possible_wrap, 85 | clippy::cast_sign_loss, 86 | clippy::cast_precision_loss, 87 | // things are often more readable this way 88 | clippy::module_name_repetitions, 89 | // not practical 90 | clippy::needless_pass_by_value, 91 | clippy::similar_names, 92 | clippy::too_many_lines, 93 | // noisy 94 | clippy::missing_errors_doc, 95 | )] 96 | 97 | pub mod common; 98 | pub(crate) mod details; 99 | pub mod distance; 100 | pub mod fuzz; 101 | 102 | /// Hash value in the range `i64::MIN` - `u64::MAX` 103 | #[derive(Debug, Copy, Clone)] 104 | pub enum Hash { 105 | UNSIGNED(u64), 106 | SIGNED(i64), 107 | } 108 | 109 | /// Trait used to map between element types and unique hash values 110 | /// 111 | /// `RapidFuzz` already implements this trait for most primitive types. 112 | /// For custom types this trait can be used to support the internal hashmaps. 113 | /// There are a couple of things to keep in mind when implementing this trait: 114 | /// - hashes have to be unique values in the range `i64::MIN` - `u64::MAX`. 115 | /// If two distinct objects produce the same hash, they will be assumed to be similar 116 | /// by the hashmap. 117 | /// - the hash function should be very fast. For primitive types it can just be the identity 118 | /// function 119 | /// - the hashmaps are optimized for extended ASCII, so values in the range 0-255 generally 120 | /// provide better performance. 121 | /// 122 | /// # Example 123 | /// ``` 124 | /// use rapidfuzz::distance; 125 | /// use rapidfuzz::{Hash, HashableChar}; 126 | /// 127 | /// #[derive(PartialEq)] 128 | /// struct MyType { 129 | /// val: u64, 130 | /// } 131 | /// 132 | /// impl HashableChar for &MyType { 133 | /// fn hash_char(&self) -> Hash { 134 | /// Hash::UNSIGNED(self.val) 135 | /// } 136 | /// } 137 | /// 138 | /// assert_eq!( 139 | /// 1, 140 | /// distance::levenshtein::distance( 141 | /// &[MyType { val: 1 }, MyType { val: 1 }], 142 | /// &[MyType { val: 2 }, MyType { val: 1 }], 143 | /// ) 144 | /// ); 145 | /// ``` 146 | pub trait HashableChar { 147 | fn hash_char(&self) -> Hash; 148 | } 149 | --------------------------------------------------------------------------------
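The doc comments in `src/distance/prefix.rs` state the formulas but only `similarity` carries a doctest, so the illustrative sketch below works them through once: `distance = max(len1, len2) - similarity`, `normalized_distance = distance / max(len1, len2)` and `normalized_similarity = 1.0 - normalized_distance`. The `fn main` wrapper, the cutoff of 5 and the float tolerances are arbitrary choices; the value 4 for the shared prefix of `"prefix"` and `"preference"` is the one from the doctest.

```rust
use rapidfuzz::distance::prefix;

fn main() {
    // "prefix" and "preference" share the 4-character prefix "pref".
    assert_eq!(4, prefix::similarity("prefix".chars(), "preference".chars()));

    // distance = max(6, 10) - 4 = 6
    assert_eq!(6, prefix::distance("prefix".chars(), "preference".chars()));

    // normalized_distance = 6 / 10 = 0.6
    let norm_dist = prefix::normalized_distance("prefix".chars(), "preference".chars());
    assert!((norm_dist - 0.6).abs() < 1e-9);

    // normalized_similarity = 1.0 - 0.6 = 0.4
    let norm_sim = prefix::normalized_similarity("prefix".chars(), "preference".chars());
    assert!((norm_sim - 0.4).abs() < 1e-9);

    // With a score_cutoff the distance variant returns an Option<usize>;
    // a distance of 6 is worse than the cutoff of 5, so the result is None.
    assert_eq!(
        None,
        prefix::distance_with_args(
            "prefix".chars(),
            "preference".chars(),
            &prefix::Args::default().score_cutoff(5),
        )
    );
}
```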