├── .criterion ├── similarity │ └── new │ │ ├── estimates.json │ │ ├── sample.json │ │ └── tukey.json └── string equality │ └── new │ ├── estimates.json │ ├── sample.json │ └── tukey.json ├── .gitignore ├── .travis.yml ├── CHANGELOG.md ├── Cargo.toml ├── LICENSE ├── README.md ├── benches └── similarity.rs ├── examples ├── find_words_iter.rs └── similarity.rs └── src └── lib.rs /.criterion/similarity/new/estimates.json: -------------------------------------------------------------------------------- 1 | {"Mean":{"confidence_interval":{"confidence_level":0.95,"lower_bound":13169.410347377065,"upper_bound":13323.095493519453},"point_estimate":13242.65658001758,"standard_error":39.2551175730277},"Median":{"confidence_interval":{"confidence_level":0.95,"lower_bound":13083.325232198142,"upper_bound":13181.556871345028},"point_estimate":13121.513147633454,"standard_error":31.84038672203396},"MedianAbsDev":{"confidence_interval":{"confidence_level":0.95,"lower_bound":164.73246797052187,"upper_bound":305.3433340856505},"point_estimate":220.8634637479288,"standard_error":36.19781757622659},"Slope":{"confidence_interval":{"confidence_level":0.95,"lower_bound":13191.473018526287,"upper_bound":13358.331412017771},"point_estimate":13268.586683129428,"standard_error":42.753751310681885},"StdDev":{"confidence_interval":{"confidence_level":0.95,"lower_bound":284.3693601986127,"upper_bound":491.46345963193033},"point_estimate":394.5153860255187,"standard_error":53.07819572381374}} -------------------------------------------------------------------------------- /.criterion/similarity/new/sample.json: -------------------------------------------------------------------------------- 1 | 
[[76.0,152.0,228.0,304.0,380.0,456.0,532.0,608.0,684.0,760.0,836.0,912.0,988.0,1064.0,1140.0,1216.0,1292.0,1368.0,1444.0,1520.0,1596.0,1672.0,1748.0,1824.0,1900.0,1976.0,2052.0,2128.0,2204.0,2280.0,2356.0,2432.0,2508.0,2584.0,2660.0,2736.0,2812.0,2888.0,2964.0,3040.0,3116.0,3192.0,3268.0,3344.0,3420.0,3496.0,3572.0,3648.0,3724.0,3800.0,3876.0,3952.0,4028.0,4104.0,4180.0,4256.0,4332.0,4408.0,4484.0,4560.0,4636.0,4712.0,4788.0,4864.0,4940.0,5016.0,5092.0,5168.0,5244.0,5320.0,5396.0,5472.0,5548.0,5624.0,5700.0,5776.0,5852.0,5928.0,6004.0,6080.0,6156.0,6232.0,6308.0,6384.0,6460.0,6536.0,6612.0,6688.0,6764.0,6840.0,6916.0,6992.0,7068.0,7144.0,7220.0,7296.0,7372.0,7448.0,7524.0,7600.0],[1062770.0,1935693.0,2903438.0,3960003.0,4924344.0,5938695.0,6814123.0,8051783.0,8771745.0,10369933.0,12363415.0,11818551.0,13769318.0,13696429.0,15011020.0,15739364.0,16723144.0,17748995.0,18674588.0,21350072.0,21022810.0,21801122.0,22657793.0,23693060.0,24436007.0,26633139.0,26872914.0,27485523.0,30198396.0,30483426.0,30821590.0,31429428.0,32686578.0,33479836.0,34731040.0,35463988.0,36526720.0,38198779.0,38470088.0,39287274.0,41317634.0,41670800.0,44462883.0,44482306.0,44753762.0,45803540.0,49222228.0,47471154.0,50010334.0,50040110.0,50855900.0,51549605.0,55484227.0,53947005.0,54712642.0,55379494.0,64535332.0,59896233.0,59272964.0,60679386.0,60423182.0,63186783.0,63275941.0,62866506.0,64668834.0,66797013.0,68647886.0,72076386.0,70454886.0,71574126.0,71291074.0,72303400.0,75271300.0,82432671.0,77835149.0,75816268.0,77103650.0,77569801.0,78586398.0,80025438.0,79984243.0,82180170.0,83842434.0,82745340.0,84518281.0,86380452.0,86148044.0,87762003.0,87733639.0,90161849.0,90077221.0,90808343.0,92082120.0,99960106.0,95708381.0,95602142.0,96950505.0,99957327.0,101156587.0,101143529.0]] -------------------------------------------------------------------------------- /.criterion/similarity/new/tukey.json: -------------------------------------------------------------------------------- 1 | 
[11978.074420742507,12485.073729620766,13837.071886629456,14344.071195507713] -------------------------------------------------------------------------------- /.criterion/string equality/new/estimates.json: -------------------------------------------------------------------------------- 1 | {"Mean":{"confidence_interval":{"confidence_level":0.95,"lower_bound":1.0792018727740168e-16,"upper_bound":3.028933812539412e-16},"point_estimate":1.881675387412083e-16,"standard_error":5.072246696106432e-17},"Median":{"confidence_interval":{"confidence_level":0.95,"lower_bound":5.3854634369109415e-17,"upper_bound":7.82618112612923e-17},"point_estimate":6.601853207420922e-17,"standard_error":6.744365275979009e-18},"MedianAbsDev":{"confidence_interval":{"confidence_level":0.95,"lower_bound":2.5281971524584458e-17,"upper_bound":5.760749106322024e-17},"point_estimate":4.178822255987432e-17,"standard_error":8.638110010780425e-18},"Slope":{"confidence_interval":{"confidence_level":0.95,"lower_bound":4.5666556063282363e-17,"upper_bound":5.277076264514557e-17},"point_estimate":4.8735212985150286e-17,"standard_error":1.8100676177822e-18},"StdDev":{"confidence_interval":{"confidence_level":0.95,"lower_bound":1.317388208011907e-16,"upper_bound":8.147998176231884e-16},"point_estimate":5.086403228520523e-16,"standard_error":1.9197988358048342e-16}} -------------------------------------------------------------------------------- /.criterion/string equality/new/sample.json: -------------------------------------------------------------------------------- 1 | 
[[6088034136707078.0,1.2176068273414156e16,1.826410241012123e16,2.435213654682831e16,3.044017068353539e16,3.652820482024246e16,4.2616238956949544e16,4.870427309365662e16,5.4792307230363704e16,6.088034136707078e16,6.696837550377786e16,7.305640964048493e16,7.914444377719202e16,8.523247791389909e16,9.132051205060618e16,9.740854618731325e16,1.0349658032402032e17,1.0958461446072741e17,1.1567264859743448e17,1.2176068273414157e17,1.2784871687084864e17,1.3393675100755571e17,1.400247851442628e17,1.4611281928096986e17,1.5220085341767696e17,1.5828888755438403e17,1.643769216910911e17,1.7046495582779818e17,1.7655298996450525e17,1.8264102410121235e17,1.8872905823791942e17,1.948170923746265e17,2.0090512651133357e17,2.0699316064804064e17,2.1308119478474774e17,2.1916922892145482e17,2.252572630581619e17,2.3134529719486896e17,2.3743333133157603e17,2.4352136546828314e17,2.496093996049902e17,2.5569743374169728e17,2.6178546787840435e17,2.6787350201511142e17,2.739615361518185e17,2.800495702885256e17,2.8613760442523267e17,2.922256385619397e17,2.9831367269864685e17,3.044017068353539e17,3.10489740972061e17,3.1657777510876806e17,3.2266580924547514e17,3.287538433821822e17,3.348418775188893e17,3.4092991165559635e17,3.470179457923034e17,3.531059799290105e17,3.591940140657176e17,3.652820482024247e17,3.713700823391318e17,3.7745811647583885e17,3.835461506125459e17,3.89634184749253e17,3.9572221888596006e17,4.0181025302266714e17,4.078982871593742e17,4.139863212960813e17,4.2007435543278835e17,4.261623895694955e17,4.3225042370620256e17,4.3833845784290963e17,4.444264919796167e17,4.505145261163238e17,4.5660256025303085e17,4.626905943897379e17,4.68778628526445e17,4.7486666266315206e17,4.8095469679985914e17,4.870427309365663e17,4.9313076507327334e17,4.992187992099804e17,5.053068333466875e17,5.1139486748339456e17,5.174829016201016e17,5.235709357568087e17,5.296589698935158e17,5.3574700403022285e17,5.418350381669299e17,5.47923072303637e17,5.540111064403441e17,5.600991405770512e17,5.661871747137583e17,5.722752
0885046534e17,5.783632429871724e17,5.844512771238794e17,5.905393112605866e17,5.966273453972937e17,6.027153795340008e17,6.088034136707078e17],[27.0,28.0,20.0,20.0,20.0,22.0,20.0,20.0,20.0,20.0,21.0,20.0,19.0,20.0,18.0,20.0,19.0,20.0,20.0,20.0,19.0,20.0,19.0,20.0,20.0,20.0,19.0,20.0,19.0,20.0,20.0,20.0,19.0,20.0,18.0,20.0,20.0,20.0,18.0,21.0,20.0,20.0,20.0,20.0,20.0,21.0,20.0,20.0,20.0,20.0,19.0,21.0,18.0,20.0,20.0,20.0,20.0,22.0,19.0,20.0,20.0,20.0,19.0,20.0,20.0,20.0,20.0,20.0,20.0,20.0,18.0,20.0,19.0,20.0,21.0,20.0,19.0,20.0,21.0,18.0,20.0,20.0,19.0,22.0,19.0,20.0,20.0,20.0,20.0,20.0,21.0,18.0,20.0,20.0,19.0,22.0,20.0,19.0,19.0,22.0]] -------------------------------------------------------------------------------- /.criterion/string equality/new/tukey.json: -------------------------------------------------------------------------------- 1 | [-2.1014842491437552e-16,-8.34872267712804e-17,2.5427596827697326e-16,3.8093716642006835e-16] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | /target/ 4 | 5 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 6 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 7 | Cargo.lock 8 | 9 | # These are backup files generated by rustfmt 10 | **/*.rs.bk 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: rust 2 | rust: 3 | - stable 4 | install: 5 | - rustup component add rustfmt 6 | - rustup component add clippy 7 | script: 8 | - cargo fmt -- --check 9 | - cargo clippy -- -D warnings 10 | - cargo test 11 | - cargo package --allow-dirty 12 | -------------------------------------------------------------------------------- 
/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ## 0.4.4 2 | * Fix dependencies to work with stable Rust 3 | * Improve doc strings for public functions 4 | 5 | ## 0.4.3 6 | * Fix clippy warnings 7 | 8 | ## 0.4.2 9 | * Run `cargo fmt` 10 | 11 | ## 0.4.1 12 | * Fix a missing doc string 13 | 14 | ## 0.4.0 15 | * Ignore case 16 | * Add examples: `find_words_iter.rs` and `similarity.rs` 17 | * Add public accessors to make `Match` more like `regex::Match` 18 | 19 | ## 0.3.0 20 | * Add `find_words_iter` function to find substring fuzzy matches of words 21 | 22 | ## 0.2.4 23 | * Get the speed back to about where it was before adding Unicode support 24 | 25 | ## 0.2.3 26 | * Add Unicode support 27 | * Switch to Criterion benchmarking to work with stable rust 28 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | authors = ["Issac Trotts "] 3 | description = "Trigram-based string similarity for fuzzy matching." 
4 | documentation = "https://docs.rs/trigram" 5 | edition = "2018" 6 | rust-version = "1.40" 7 | homepage = "https://github.com/ijt/trigram" 8 | keywords = ["trigram", "string", "fuzzy", "matching"] 9 | license = "Apache-2.0" 10 | name = "trigram" 11 | readme = "README.md" 12 | repository = "https://github.com/ijt/trigram" 13 | version = "0.4.4" 14 | 15 | [dependencies] 16 | regex = "1.6" 17 | once_cell = "1.13" 18 | 19 | [dev-dependencies] 20 | criterion = "0.3" 21 | table-test = "0.2" 22 | 23 | [[bench]] 24 | name = "similarity" 25 | harness = false 26 | 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # trigram 2 | 3 | [![Build Status](https://travis-ci.org/ijt/trigram.svg?branch=master)](https://travis-ci.org/ijt/trigram) 4 | [![License](https://img.shields.io/badge/license-Apache-blue.svg)](https://raw.githubusercontent.com/ijt/trigram/master/LICENSE) 5 | [![Documentation](https://docs.rs/trigram/badge.svg)](https://docs.rs/trigram) 6 | 7 | This Rust crate contains functions for fuzzy string matching. 8 | 9 | It exports two functions. The `similarity` function returns the similarity of 10 | two strings, and the `find_words_iter` function returns an iterator of matches 11 | for a smaller string (`needle`) in a larger string (`haystack`). 12 | 13 | The similarity of strings is computed based on their trigrams, meaning their 14 | 3-character substrings: https://en.wikipedia.org/wiki/Trigram. 
15 | 16 | ## Trying it out 17 | 18 | Here is how to run the examples: 19 | 20 | ``` 21 | $ cargo run --example similarity color colour 22 | ... 23 | 0.4444444444444444 24 | 25 | $ cargo run --example find_words_iter 26 | bufalo 27 | buffalow 28 | Bungalo 29 | biffalo 30 | buffaloo 31 | huffalo 32 | snuffalo 33 | fluffalo 34 | ``` 35 | 36 | ## Usage 37 | 38 | Add this to your `Cargo.toml`: 39 | 40 | ```toml 41 | [dependencies] 42 | trigram = "0.4.4" 43 | ``` 44 | 45 | and call it like this: 46 | 47 | ```rust 48 | use trigram::similarity; 49 | 50 | fn main() { 51 | println!("{}", similarity("rustacean", "crustacean")); 52 | } 53 | ``` 54 | 55 | ## Background 56 | The `similarity` function in this crate is a reverse-engineered approximation 57 | of the `similarity` function in the PostgreSQL pg\_trgm extension: 58 | https://www.postgresql.org/docs/9.1/pgtrgm.html. It gives exactly the same 59 | answers in many cases, but may disagree in others (none known). If you find a 60 | case where the answers don't match, please file an issue about it! 61 | 62 | A good introduction to the Postgres version of this is given on Stack Overflow: 63 | https://stackoverflow.com/a/43161051/484529. 64 | -------------------------------------------------------------------------------- /benches/similarity.rs: -------------------------------------------------------------------------------- 1 | use criterion::{criterion_group, criterion_main, Criterion}; 2 | use trigram::similarity; 3 | 4 | pub fn criterion_benchmark(c: &mut Criterion) { 5 | c.bench_function("strings", |b| { 6 | b.iter(|| { 7 | let s1 = "This is a longer string. It contains complete sentences."; 8 | let s2 = "This is a longish string. 
It contains complete sentences."; 9 | similarity(s1, s2) 10 | }) 11 | }); 12 | } 13 | 14 | criterion_group!(benches, criterion_benchmark); 15 | criterion_main!(benches); 16 | -------------------------------------------------------------------------------- /examples/find_words_iter.rs: -------------------------------------------------------------------------------- 1 | use trigram::find_words_iter; 2 | 3 | fn main() { 4 | let haystack = 5 | "Did you know that bufalo buffalow Bungalo biffalo buffaloo huffalo snuffalo fluffalo?"; 6 | let needle = "buffalo"; 7 | for m in find_words_iter(needle, haystack, 0.3) { 8 | println!("{}", m.as_str()); 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /examples/similarity.rs: -------------------------------------------------------------------------------- 1 | use trigram::similarity; 2 | 3 | fn main() { 4 | let args: Vec = ::std::env::args().collect(); 5 | if args.len() != 1 + 2 { 6 | eprintln!("usage: similarity string1 string2"); 7 | ::std::process::exit(1); 8 | } 9 | let a = args[1].as_str(); 10 | let b = args[2].as_str(); 11 | println!("{}", similarity(a, b)); 12 | } 13 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | The trigram library computes the similarity of strings, inspired by the similarity function in the 3 | [Postgresql `pg_trgm` extension](https://www.postgresql.org/docs/9.1/pgtrgm.html). 4 | */ 5 | 6 | use once_cell::sync::Lazy; 7 | use regex::Regex; 8 | use std::collections::HashSet; 9 | use std::hash::Hash; 10 | 11 | /// Iterates over fuzzy matches of one string against the words in another, such 12 | /// that the similarity is over some threshold, for example 0.3. 
13 | pub fn find_words_iter<'n, 'h>( 14 | needle: &'n str, 15 | haystack: &'h str, 16 | threshold: f64, 17 | ) -> Matches<'n, 'h> { 18 | static WORD_RX: Lazy = Lazy::new(|| Regex::new(r"\w+").unwrap()); 19 | let words = WORD_RX.find_iter(haystack); 20 | Matches { 21 | needle, 22 | haystack_words: words, 23 | threshold, 24 | } 25 | } 26 | 27 | /// Iterator over fuzzy word matches. 28 | pub struct Matches<'n, 'h> { 29 | needle: &'n str, 30 | haystack_words: regex::Matches<'static, 'h>, 31 | threshold: f64, 32 | } 33 | 34 | impl<'n, 'h> Iterator for Matches<'n, 'h> { 35 | type Item = Match<'h>; 36 | 37 | fn next(&mut self) -> Option { 38 | for m in self.haystack_words.by_ref() { 39 | let w = m.as_str(); 40 | if similarity(self.needle, w) > self.threshold { 41 | let m2 = Match { 42 | text: w, 43 | start: m.start(), 44 | end: m.end(), 45 | }; 46 | return Some(m2); 47 | } 48 | } 49 | None 50 | } 51 | } 52 | 53 | /// This is the same as `regex::Match`. 54 | #[derive(Copy, Clone, Debug, Eq, PartialEq)] 55 | pub struct Match<'t> { 56 | text: &'t str, 57 | start: usize, 58 | end: usize, 59 | } 60 | 61 | impl<'t> Match<'t> { 62 | #[must_use] 63 | pub fn start(self) -> usize { 64 | self.start 65 | } 66 | #[must_use] 67 | pub fn end(self) -> usize { 68 | self.end 69 | } 70 | #[must_use] 71 | pub fn as_str(self) -> &'t str { 72 | self.text 73 | } 74 | } 75 | 76 | /// Returns the similarity of two strings as the Jaccard similarity of their trigram sets. The 77 | /// returned value is between 0.0 and 1.0, with 1.0 indicating maximum similarity. The input 78 | /// strings are normalized before comparison, so it is possible to get a score of 1.0 between 79 | /// different strings. For example `"figaro"` and `"Figaro?"` have a similarity of 80 | /// 1.0. 
81 | #[must_use] 82 | pub fn similarity(a: &str, b: &str) -> f64 { 83 | static RX: Lazy = Lazy::new(|| Regex::new(r"^|$|\W+").unwrap()); 84 | let a = RX.replace_all(a, " ").to_lowercase(); 85 | let b = RX.replace_all(b, " ").to_lowercase(); 86 | let ta = trigrams(&a); 87 | let tb = trigrams(&b); 88 | jaccard(&ta, &tb) 89 | } 90 | 91 | /// Jaccard similarity between two sets. 92 | /// 93 | fn jaccard(s1: &HashSet, s2: &HashSet) -> f64 94 | where 95 | T: Hash + Eq, 96 | { 97 | let i = s1.intersection(s2).count() as f64; 98 | let u = s1.union(s2).count() as f64; 99 | if u == 0.0 { 100 | 1.0 101 | } else { 102 | i / u 103 | } 104 | } 105 | 106 | /// Returns the set of trigrams found in s, except ones ending in two spaces. 107 | fn trigrams(s: &str) -> HashSet<&str> { 108 | // The filter is to match an idiosyncrasy of the Postgres trigram extension: 109 | // it doesn't count trigrams that end with two spaces. 110 | let idxs = rune_indexes(s); 111 | (0..idxs.len() - 3) 112 | .map(|i| &s[idxs[i]..idxs[i + 3]]) 113 | .filter(|t| !t.ends_with(" ")) 114 | .collect() 115 | } 116 | 117 | /// Returns a vec of all the indexes of characters within the string, plus a 118 | /// sentinel value at the end. 
/// The sentinel is `s.len()`, so consecutive entries always delimit one
/// character and the final entry closes the last one.
fn rune_indexes(s: &str) -> Vec<usize> {
    s.char_indices()
        .map(|(i, _)| i)
        .chain(std::iter::once(s.len()))
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;
    use table_test::table_test;

    #[test]
    fn empty() {
        assert_eq!(similarity("", ""), 1.0, "checking similarity of '' to ''");
    }

    #[test]
    fn same_string() {
        let strs = vec!["", "a", "ab", "abc", "abcd"];
        for a in strs {
            assert_eq!(
                similarity(a, a),
                1.0,
                "checking similarity of '{}' to itself",
                a
            );
        }
    }

    #[test]
    fn zero_similarity_for_nothing_in_common() {
        let va = vec!["abc", "abcd"];
        for a in va {
            let vb = vec!["def", "efgh"];
            for b in vb {
                assert_eq!(
                    similarity(a, b),
                    0.0,
                    "checking that '{}' and '{}' have similarity of zero",
                    a,
                    b
                );
                assert_eq!(
                    similarity(b, a),
                    0.0,
                    "checking that '{}' and '{}' have similarity of zero",
                    b,
                    a
                );
            }
        }
    }

    #[test]
    fn non_ascii_unicode() {
        assert_eq!(similarity("🐕", "🐕"), 1.0, "dog matches dog");
        assert_eq!(
            similarity("ö`üǜ", "asd"),
            0.0,
            "no match between ö`üǜ and asd"
        );
        assert_eq!(
            similarity("ö`üǜ", "ouu"),
            0.0,
            "no match between ö`üǜ… and ouu"
        );
    }

    #[test]
    fn case_ignored() {
        assert_eq!(similarity("A", "a"), 1.0, "A is a");
        assert_eq!(similarity("a", "A"), 1.0, "a is A");
    }

    #[test]
    fn fuzzy_matches() {
        // Check for agreement with answers given by the postgres pg_trgm similarity function.
        assert_eq!(similarity("a", "ab"), 0.25, "checking a and ab");
        assert_eq!(similarity("foo", "food"), 0.5, "checking foo and food");
        assert_eq!(
            similarity("bar", "barred"),
            0.375,
            "checking bar and barred"
        );
        assert_eq!(
            similarity("ing bear", "ing boar"),
            0.5,
            "checking ing bear and ing boar"
        );
        assert_eq!(
            similarity("dancing bear", "dancing boar"),
            0.625,
            "checking dancing bear and dancing boar"
        );
        assert_eq!(
            similarity("sir sly", "srsly"),
            0.3,
            "checking sir sly and srsly"
        );
        assert_eq!(
            similarity("same, but different?", "same but different"),
            1.0,
            "checking same but different"
        );
    }

    #[test]
    fn finding() {
        // Each row: ((needle, haystack), expected (start, end) spans).
        let table = vec![
            (("", ""), vec![]),
            (("a", ""), vec![]),
            (("a", "a"), vec![(0, 1)]),
            (("a", "ab"), vec![]),
            (("a", "ba"), vec![]),
            (("ab", "abc"), vec![(0, 3)]),
            (("a", "ababa"), vec![]),
            (("a", "a b a b a"), vec![(0, 1), (4, 5), (8, 9)]),
            (("riddums", "riddims"), vec![(0, "riddums".len())]),
            (
                ("riddums", "funky riddims"),
                vec![("funky ".len(), "funky riddums".len())],
            ),
        ];

        for (validator, (needle, haystack), expected) in table_test!(table) {
            let threshold = 0.3;
            let actual: Vec<(usize, usize)> = find_words_iter(needle, haystack, threshold)
                .map(|m| (m.start, m.end))
                .collect();
            validator
                .given(&format!("needle = '{}', haystack = '{}'", needle, haystack))
                .when("find_vec")
                .then(&format!("it should return {:?}", expected))
                .assert_eq(expected, actual);
        }
    }
}