├── .github ├── FUNDING.yml ├── dependabot.yml └── workflows │ ├── analyze.yml │ ├── lint.yml │ └── tests.yml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── example_test.go ├── go.mod ├── go.sum ├── internal ├── mathutil │ ├── math.go │ └── mathutil_test.go ├── ngram │ ├── ngram.go │ └── ngram_test.go └── stringutil │ ├── stringutil.go │ └── stringutil_test.go ├── metrics ├── examples_test.go ├── hamming.go ├── jaccard.go ├── jaro.go ├── jaro_winkler.go ├── levenshtein.go ├── match_mismatch.go ├── metrics_test.go ├── overlap_coefficient.go ├── smith_waterman_gotoh.go ├── sorensen_dice.go └── substitution.go └── strutil.go /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | ko_fi: adrg 2 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "gomod" 4 | directory: "/" 5 | schedule: 6 | interval: "daily" 7 | reviewers: 8 | - "adrg" 9 | - package-ecosystem: "github-actions" 10 | directory: "/" 11 | schedule: 12 | interval: "daily" 13 | reviewers: 14 | - "adrg" 15 | -------------------------------------------------------------------------------- /.github/workflows/analyze.yml: -------------------------------------------------------------------------------- 1 | name: analyze 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | branches: [master] 8 | schedule: 9 | - cron: "0 6 * * 1" 10 | 11 | permissions: 12 | actions: read 13 | contents: read 14 | security-events: write 15 | 16 | jobs: 17 | analyze: 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - name: Checkout 22 | uses: actions/checkout@v4 23 | 24 | - name: Initialize CodeQL 25 | uses: github/codeql-action/init@v3 26 | with: 27 | languages: go 28 | queries: security-and-quality 29 | 30 | - name: Run CodeQL analysis 31 | uses: 
github/codeql-action/analyze@v3 32 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | lint: 13 | strategy: 14 | matrix: 15 | go: [stable] 16 | os: [ubuntu-latest] 17 | runs-on: ${{ matrix.os }} 18 | steps: 19 | - name: Setup 20 | uses: actions/setup-go@v5 21 | with: 22 | go-version: ${{ matrix.go }} 23 | 24 | - name: Prepare checkout 25 | run: git config --global core.autocrlf false 26 | 27 | - name: Checkout 28 | uses: actions/checkout@v4 29 | 30 | - name: Lint 31 | uses: golangci/golangci-lint-action@v8.0.0 32 | with: 33 | args: --timeout=5m 34 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | branches: [master] 6 | pull_request: 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | test: 13 | strategy: 14 | matrix: 15 | go: [stable] 16 | os: [ubuntu-latest] 17 | runs-on: ${{ matrix.os }} 18 | steps: 19 | - name: Setup 20 | uses: actions/setup-go@v5 21 | with: 22 | go-version: ${{ matrix.go }} 23 | 24 | - name: Prepare checkout 25 | run: git config --global core.autocrlf false 26 | 27 | - name: Checkout 28 | uses: actions/checkout@v4 29 | 30 | - name: Test 31 | run: go test -v -coverprofile coverage.txt -covermode atomic ./... 
32 | 33 | - name: Coverage 34 | uses: codecov/codecov-action@v5 35 | env: 36 | CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} 37 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, 8 | body size, disability, ethnicity, sex characteristics, gender identity and 9 | expression, level of experience, education, socio-economic status, nationality, 10 | personal appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behaviour that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behaviour by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behaviour and are expected to take appropriate and fair corrective action in 38 | response to any 
instances of unacceptable behaviour. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviour that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behaviour may be 58 | reported by contacting the project team at adrg@epistack.com. All complaints 59 | will be reviewed and investigated and will result in a response that is deemed 60 | necessary and appropriate to the circumstances. The project team is obligated to 61 | maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 71 | version 1.4, available at 72 | https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 73 | 74 | [homepage]: https://www.contributor-covenant.org 75 | 76 | For answers to common questions about this code of conduct, see 77 | https://www.contributor-covenant.org/faq 78 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to this project 2 | 3 | Contributions in the form of pull requests, issues or just general feedback, 4 | are always welcome. Please take a moment to review this document in order to 5 | make the contribution process easy and effective for everyone involved. 6 | 7 | Following these guidelines helps to communicate that you respect the time of 8 | the developers managing and developing this open source project. In return, 9 | they should reciprocate that respect in addressing your issue or assessing 10 | patches and features. 11 | 12 | ## Using the issue tracker 13 | 14 | The issue tracker is the preferred channel for [bug reports](#bugs), 15 | [feature requests](#features) and [submitting pull 16 | requests](#pull-requests), but please respect the following restrictions: 17 | 18 | * Please **do not** use the issue tracker for personal support requests (use 19 | [Stack Overflow](http://stackoverflow.com) or IRC). 20 | * Please **do not** derail or troll issues. Keep the discussion on topic and 21 | respect the opinions of others. 22 | 23 | 24 | ## Bug reports 25 | 26 | A bug is a _demonstrable problem_ that is caused by the code in the repository. 27 | Good bug reports are extremely helpful - thank you! 28 | 29 | Guidelines for bug reports: 30 | 31 | 1. **Use the GitHub issue search** — check if the issue has already been 32 | reported. 33 | 2. 
**Check if the issue has been fixed** — try to reproduce it using the 34 | latest `master` or development branch in the repository. 35 | 3. **Isolate the problem** — create a reduced test case. 36 | 37 | A good bug report shouldn't leave others needing to chase you up for more 38 | information. Please try to be as detailed as possible in your report. What is 39 | your environment? What steps will reproduce the issue? What browser(s) and OS 40 | experience the problem? What would you expect to be the outcome? All these 41 | details will help people to fix any potential bugs. 42 | 43 | Example: 44 | 45 | > Short and descriptive example bug report title 46 | > 47 | > A summary of the issue and the browser/OS environment in which it occurs. If 48 | > suitable, include the steps required to reproduce the bug. 49 | > 50 | > 1. This is the first step 51 | > 2. This is the second step 52 | > 3. Further steps, etc. 53 | > 54 | > `<url>` - a link to the reduced test case 55 | > 56 | > Any other information you want to share that is relevant to the issue being 57 | > reported. This might include the lines of code that you have identified as 58 | > causing the bug, and potential solutions (and your opinions on their 59 | > merits). 60 | 61 | 62 | 63 | ## Feature requests 64 | 65 | Feature requests are welcome. But take a moment to find out whether your idea 66 | fits with the scope and aims of the project. It's up to *you* to make a strong 67 | case to convince the project's developers of the merits of this feature. Please 68 | provide as much detail and context as possible. 69 | 70 | 71 | 72 | ## Pull requests 73 | 74 | Good pull requests - patches, improvements, new features - are a fantastic 75 | help. They should remain focused in scope and avoid containing unrelated 76 | commits. 77 | 78 | **Please ask first** before embarking on any significant pull request (e.g. 
79 | implementing features, refactoring code, porting to a different language), 80 | otherwise you risk spending a lot of time working on something that the 81 | project's developers might not want to merge into the project. 82 | 83 | Please adhere to the coding conventions used throughout a project (indentation, 84 | accurate comments, etc.) and any other requirements (such as test coverage). 85 | 86 | Follow this process if you'd like your work considered for inclusion in the 87 | project: 88 | 89 | 1. [Fork](http://help.github.com/fork-a-repo/) the project, clone your fork, 90 | and configure the remotes: 91 | 92 | ```bash 93 | # Clone your fork of the repo into the current directory 94 | git clone https://github.com/<your-username>/<repo-name> 95 | # Navigate to the newly cloned directory 96 | cd <repo-name> 97 | # Assign the original repo to a remote called "upstream" 98 | git remote add upstream https://github.com/<upstream-owner>/<repo-name> 99 | ``` 100 | 101 | 2. If you cloned a while ago, get the latest changes from upstream: 102 | 103 | ```bash 104 | git checkout <dev-branch> 105 | git pull upstream <dev-branch> 106 | ``` 107 | 108 | 3. Create a new topic branch (off the main project development branch) to 109 | contain your feature, change, or fix: 110 | 111 | ```bash 112 | git checkout -b <topic-branch-name> 113 | ``` 114 | 115 | 4. Commit your changes in logical chunks and use descriptive commit messages. 116 | Use [interactive rebase](https://help.github.com/articles/interactive-rebase) 117 | to tidy up your commits before making them public. 118 | 119 | 5. Locally merge (or rebase) the upstream development branch into your topic branch: 120 | 121 | ```bash 122 | git pull [--rebase] upstream <dev-branch> 123 | ``` 124 | 125 | 6. Push your topic branch up to your fork: 126 | 127 | ```bash 128 | git push origin <topic-branch-name> 129 | ``` 130 | 131 | 7. [Open a Pull Request](https://help.github.com/articles/using-pull-requests/) 132 | with a clear title and description. 
133 | 134 | **IMPORTANT**: By submitting a patch, you agree to allow the project owner to 135 | license your work under the same license as that used by the project. 136 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 Adrian-George Bostan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

strutil

2 | 3 |

4 | 5 | Tests status 6 | 7 | 8 | Code coverage 9 | 10 | 11 | pkg.go.dev documentation 12 | 13 | 14 | MIT license 15 | 16 | 17 | Go report card 18 | 19 | 20 | GitHub issues 21 | 22 | 23 | Buy me a coffee 24 | 25 |

26 | 27 | strutil provides a collection of string metrics for calculating string similarity as well as 28 | other string utility functions. 29 | Full documentation can be found at https://pkg.go.dev/github.com/adrg/strutil. 30 | 31 | ## Installation 32 | 33 | ``` 34 | go get github.com/adrg/strutil 35 | ``` 36 | 37 | ## String metrics 38 | 39 | - [Hamming](#hamming) 40 | - [Levenshtein](#levenshtein) 41 | - [Jaro](#jaro) 42 | - [Jaro-Winkler](#jaro-winkler) 43 | - [Smith-Waterman-Gotoh](#smith-waterman-gotoh) 44 | - [Sorensen-Dice](#sorensen-dice) 45 | - [Jaccard](#jaccard) 46 | - [Overlap Coefficient](#overlap-coefficient) 47 | 48 | The package defines the `StringMetric` interface, which is implemented by all 49 | the string metrics. The interface is used with the `Similarity` function, which 50 | calculates the similarity between the specified strings, using the provided 51 | string metric. 52 | 53 | ```go 54 | type StringMetric interface { 55 | Compare(a, b string) float64 56 | } 57 | 58 | func Similarity(a, b string, metric StringMetric) float64 { 59 | } 60 | ``` 61 | 62 | All defined string metrics can be found in the 63 | [metrics](https://pkg.go.dev/github.com/adrg/strutil/metrics) package. 64 | 65 | #### Hamming 66 | 67 | Calculate similarity. 68 | ```go 69 | similarity := strutil.Similarity("text", "test", metrics.NewHamming()) 70 | fmt.Printf("%.2f\n", similarity) // Output: 0.75 71 | ``` 72 | 73 | Calculate distance. 74 | ```go 75 | ham := metrics.NewHamming() 76 | fmt.Printf("%d\n", ham.Distance("one", "once")) // Output: 2 77 | ``` 78 | 79 | More information and additional examples can be found on 80 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Hamming). 81 | 82 | #### Levenshtein 83 | 84 | Calculate similarity using default options. 85 | ```go 86 | similarity := strutil.Similarity("graph", "giraffe", metrics.NewLevenshtein()) 87 | fmt.Printf("%.2f\n", similarity) // Output: 0.43 88 | ``` 89 | 90 | Configure edit operation costs. 
91 | ```go 92 | lev := metrics.NewLevenshtein() 93 | lev.CaseSensitive = false 94 | lev.InsertCost = 1 95 | lev.ReplaceCost = 2 96 | lev.DeleteCost = 1 97 | 98 | similarity := strutil.Similarity("make", "Cake", lev) 99 | fmt.Printf("%.2f\n", similarity) // Output: 0.50 100 | ``` 101 | 102 | Calculate distance. 103 | ```go 104 | lev := metrics.NewLevenshtein() 105 | fmt.Printf("%d\n", lev.Distance("graph", "giraffe")) // Output: 4 106 | ``` 107 | 108 | More information and additional examples can be found on 109 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Levenshtein). 110 | 111 | #### Jaro 112 | 113 | ```go 114 | similarity := strutil.Similarity("think", "tank", metrics.NewJaro()) 115 | fmt.Printf("%.2f\n", similarity) // Output: 0.78 116 | ``` 117 | 118 | More information and additional examples can be found on 119 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Jaro). 120 | 121 | #### Jaro-Winkler 122 | 123 | ```go 124 | similarity := strutil.Similarity("think", "tank", metrics.NewJaroWinkler()) 125 | fmt.Printf("%.2f\n", similarity) // Output: 0.80 126 | ``` 127 | 128 | More information and additional examples can be found on 129 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#JaroWinkler). 130 | 131 | #### Smith-Waterman-Gotoh 132 | 133 | Calculate similarity using default options. 134 | ```go 135 | swg := metrics.NewSmithWatermanGotoh() 136 | similarity := strutil.Similarity("times roman", "times new roman", swg) 137 | fmt.Printf("%.2f\n", similarity) // Output: 0.82 138 | ``` 139 | 140 | Customize gap penalty and substitution function. 
141 | ```go 142 | swg := metrics.NewSmithWatermanGotoh() 143 | swg.CaseSensitive = false 144 | swg.GapPenalty = -0.1 145 | swg.Substitution = metrics.MatchMismatch { 146 | Match: 1, 147 | Mismatch: -0.5, 148 | } 149 | 150 | similarity := strutil.Similarity("Times Roman", "times new roman", swg) 151 | fmt.Printf("%.2f\n", similarity) // Output: 0.96 152 | ``` 153 | 154 | More information and additional examples can be found on 155 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#SmithWatermanGotoh). 156 | 157 | #### Sorensen-Dice 158 | 159 | Calculate similarity using default options. 160 | ```go 161 | sd := metrics.NewSorensenDice() 162 | similarity := strutil.Similarity("time to make haste", "no time to waste", sd) 163 | fmt.Printf("%.2f\n", similarity) // Output: 0.62 164 | ``` 165 | 166 | Customize n-gram size. 167 | ```go 168 | sd := metrics.NewSorensenDice() 169 | sd.CaseSensitive = false 170 | sd.NgramSize = 3 171 | 172 | similarity := strutil.Similarity("Time to make haste", "no time to waste", sd) 173 | fmt.Printf("%.2f\n", similarity) // Output: 0.53 174 | ``` 175 | 176 | More information and additional examples can be found on 177 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#SorensenDice). 178 | 179 | #### Jaccard 180 | 181 | Calculate similarity using default options. 182 | ```go 183 | j := metrics.NewJaccard() 184 | similarity := strutil.Similarity("time to make haste", "no time to waste", j) 185 | fmt.Printf("%.2f\n", similarity) // Output: 0.45 186 | ``` 187 | 188 | Customize n-gram size. 189 | ```go 190 | j := metrics.NewJaccard() 191 | j.CaseSensitive = false 192 | j.NgramSize = 3 193 | 194 | similarity := strutil.Similarity("Time to make haste", "no time to waste", j) 195 | fmt.Printf("%.2f\n", similarity) // Output: 0.36 196 | ``` 197 | 198 | The input of the Sorensen-Dice example is the same as the one of Jaccard 199 | because the metrics bear a resemblance to each other. 
In fact, each of the 200 | coefficients can be used to calculate the other one. 201 | 202 | Sorensen-Dice to Jaccard. 203 | ``` 204 | J = SD/(2-SD) 205 | 206 | where SD is the Sorensen-Dice coefficient and J is the Jaccard index. 207 | ``` 208 | 209 | Jaccard to Sorensen-Dice. 210 | ``` 211 | SD = 2*J/(1+J) 212 | 213 | where SD is the Sorensen-Dice coefficient and J is the Jaccard index. 214 | ``` 215 | 216 | More information and additional examples can be found on 217 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Jaccard). 218 | 219 | #### Overlap Coefficient 220 | 221 | Calculate similarity using default options. 222 | ```go 223 | oc := metrics.NewOverlapCoefficient() 224 | similarity := strutil.Similarity("time to make haste", "no time to waste", oc) 225 | fmt.Printf("%.2f\n", similarity) // Output: 0.67 226 | ``` 227 | 228 | Customize n-gram size. 229 | ```go 230 | oc := metrics.NewOverlapCoefficient() 231 | oc.CaseSensitive = false 232 | oc.NgramSize = 3 233 | 234 | similarity := strutil.Similarity("Time to make haste", "no time to waste", oc) 235 | fmt.Printf("%.2f\n", similarity) // Output: 0.57 236 | ``` 237 | 238 | More information and additional examples can be found on 239 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#OverlapCoefficient). 
240 | 241 | ## References 242 | 243 | For more information see: 244 | - [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance) 245 | - [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) 246 | - [Jaro-Winkler distance](https://en.wikipedia.org/wiki/Jaro-Winkler_distance) 247 | - [Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith-Waterman_algorithm) 248 | - [Sorensen-Dice coefficient](https://en.wikipedia.org/wiki/Sorensen–Dice_coefficient) 249 | - [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index) 250 | - [Overlap coefficient](https://en.wikipedia.org/wiki/Overlap_coefficient) 251 | 252 | ## Stargazers over time 253 | 254 | [![Stargazers over time](https://starchart.cc/adrg/strutil.svg)](https://starchart.cc/adrg/strutil) 255 | 256 | ## Contributing 257 | 258 | Contributions in the form of pull requests, issues or just general feedback, 259 | are always welcome. 260 | See [CONTRIBUTING.MD](CONTRIBUTING.md). 261 | 262 | ## License 263 | 264 | Copyright (c) 2019 Adrian-George Bostan. 265 | 266 | This project is licensed under the [MIT license](https://opensource.org/licenses/MIT). 267 | See [LICENSE](LICENSE) for more details. 
268 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | package strutil_test 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/adrg/strutil" 7 | "github.com/adrg/strutil/metrics" 8 | ) 9 | 10 | func ExampleSimilarity() { 11 | sim := strutil.Similarity("riddle", "needle", metrics.NewJaroWinkler()) 12 | fmt.Printf("(riddle, needle) similarity: %.2f\n", sim) 13 | 14 | // Output: 15 | // (riddle, needle) similarity: 0.56 16 | } 17 | 18 | func ExampleCommonPrefix() { 19 | fmt.Println("(answer, anvil):", strutil.CommonPrefix("answer", "anvil")) 20 | 21 | // Output: 22 | // (answer, anvil): an 23 | } 24 | 25 | func ExampleUniqueSlice() { 26 | sample := []string{"a", "b", "a", "b", "b", "c"} 27 | fmt.Println("[a b a b b c]:", strutil.UniqueSlice(sample)) 28 | 29 | // Output: 30 | // [a b a b b c]: [a b c] 31 | } 32 | 33 | func ExampleSliceContains() { 34 | terms := []string{"a", "b", "c"} 35 | fmt.Println("([a b c], b):", strutil.SliceContains(terms, "b")) 36 | fmt.Println("([a b c], d):", strutil.SliceContains(terms, "d")) 37 | 38 | // Output: 39 | // ([a b c], b): true 40 | // ([a b c], d): false 41 | } 42 | 43 | func ExampleNgramCount() { 44 | fmt.Println("abbcd n-gram count (size 2):", strutil.NgramCount("abbcd", 2)) 45 | fmt.Println("abbcd n-gram count (size 3):", strutil.NgramCount("abbcd", 3)) 46 | 47 | // Output: 48 | // abbcd n-gram count (size 2): 4 49 | // abbcd n-gram count (size 3): 3 50 | } 51 | 52 | func ExampleNgrams() { 53 | fmt.Println("abbcd n-grams (size 2):", strutil.Ngrams("abbcd", 2)) 54 | fmt.Println("abbcd n-grams (size 3):", strutil.Ngrams("abbcd", 3)) 55 | 56 | // Output: 57 | // abbcd n-grams (size 2): [ab bb bc cd] 58 | // abbcd n-grams (size 3): [abb bbc bcd] 59 | } 60 | 61 | func ExampleNgramMap() { 62 | // 2 character n-gram map. 
63 | ngrams, total := strutil.NgramMap("abbcabb", 2) 64 | fmt.Printf("abbcabb n-gram map (size 2): %v (%d ngrams)\n", ngrams, total) 65 | 66 | // 3 character n-gram map. 67 | ngrams, total = strutil.NgramMap("abbcabb", 3) 68 | fmt.Printf("abbcabb n-gram map (size 3): %v (%d ngrams)\n", ngrams, total) 69 | 70 | // Output: 71 | // abbcabb n-gram map (size 2): map[ab:2 bb:2 bc:1 ca:1] (6 ngrams) 72 | // abbcabb n-gram map (size 3): map[abb:2 bbc:1 bca:1 cab:1] (5 ngrams) 73 | } 74 | 75 | func ExampleNgramIntersection() { 76 | ngrams, common, totalA, totalB := strutil.NgramIntersection("ababc", "ababd", 2) 77 | fmt.Printf("(ababc, ababd) n-gram intersection: %v (%d/%d n-grams)\n", 78 | ngrams, common, totalA+totalB) 79 | 80 | // Output: 81 | // (ababc, ababd) n-gram intersection: map[ab:2 ba:1] (3/8 n-grams) 82 | } 83 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/adrg/strutil 2 | 3 | go 1.19 4 | 5 | require github.com/stretchr/testify v1.10.0 6 | 7 | require ( 8 | github.com/davecgh/go-spew v1.1.1 // indirect 9 | github.com/pmezard/go-difflib v1.0.0 // indirect 10 | gopkg.in/yaml.v3 v3.0.1 // indirect 11 | ) 12 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= 2 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= 3 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= 4 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= 5 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= 6 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= 7 | 
// Min returns the value of the smallest argument,
// or 0 if no arguments are provided.
func Min(args ...int) int {
	if len(args) == 0 {
		return 0
	}

	// Seed with the first argument; for a single argument the loop below
	// does not execute and that argument is returned unchanged.
	smallest := args[0]
	for _, arg := range args[1:] {
		if arg < smallest {
			smallest = arg
		}
	}

	return smallest
}

// Max returns the value of the largest argument,
// or 0 if no arguments are provided.
func Max(args ...int) int {
	if len(args) == 0 {
		return 0
	}

	// Seed with the first argument; for a single argument the loop below
	// does not execute and that argument is returned unchanged.
	largest := args[0]
	for _, arg := range args[1:] {
		if arg > largest {
			largest = arg
		}
	}

	return largest
}

// Minf returns the value of the smallest argument,
// or 0 if no arguments are provided.
//
// Candidates are selected with the < operator, which is always false when an
// operand is NaN. As a result, a NaN argument after the first one is never
// selected, while a NaN passed as the first argument is returned as is.
func Minf(args ...float64) float64 {
	if len(args) == 0 {
		return 0
	}

	smallest := args[0]
	for _, arg := range args[1:] {
		if arg < smallest {
			smallest = arg
		}
	}

	return smallest
}

// Maxf returns the value of the largest argument,
// or 0 if no arguments are provided.
//
// Candidates are selected with the > operator, which is always false when an
// operand is NaN. As a result, a NaN argument after the first one is never
// selected, while a NaN passed as the first argument is returned as is.
func Maxf(args ...float64) float64 {
	if len(args) == 0 {
		return 0
	}

	largest := args[0]
	for _, arg := range args[1:] {
		if arg > largest {
			largest = arg
		}
	}

	return largest
}
mathutil.Maxf(2.0, 1.0, 3.0)}, 58 | {3.0, mathutil.Maxf(3.0, 1.0, 2.0)}, 59 | }) 60 | } 61 | 62 | func requireEqual(t *testing.T, inputs [][2]interface{}) { 63 | t.Helper() 64 | 65 | for _, input := range inputs { 66 | require.Equal(t, input[0], input[1]) 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /internal/ngram/ngram.go: -------------------------------------------------------------------------------- 1 | package ngram 2 | 3 | import "github.com/adrg/strutil/internal/mathutil" 4 | 5 | // Count returns the n-gram count of the specified size for the 6 | // provided term. An n-gram size of 1 is used if the provided size is 7 | // less than or equal to 0. 8 | func Count(runes []rune, size int) int { 9 | return mathutil.Max(len(runes)-(mathutil.Max(size, 1)-1), 0) 10 | } 11 | 12 | // Slice returns all the n-grams of the specified size for the provided term. 13 | // The n-grams in the output slice are in the order in which they occur in the 14 | // input term. An n-gram size of 1 is used if the provided size is less than 15 | // or equal to 0. 16 | func Slice(runes []rune, size int) []string { 17 | // Use an n-gram size of 1 if the provided size is invalid. 18 | size = mathutil.Max(size, 1) 19 | 20 | // Check if term length is too small. 21 | lenRunes := len(runes) 22 | if lenRunes == 0 || lenRunes < size { 23 | return nil 24 | } 25 | 26 | // Generate n-gram slice. 27 | limit := lenRunes - (size - 1) 28 | ngrams := make([]string, limit) 29 | 30 | for i, j := 0, 0; i < limit; i++ { 31 | ngrams[j] = string(runes[i : i+size]) 32 | j++ 33 | } 34 | 35 | return ngrams 36 | } 37 | 38 | // Map returns a map of all n-grams of the specified size for the provided 39 | // term, along with their frequency. The function also returns the total 40 | // number of n-grams, which is the sum of all the values in the output map. 41 | // An n-gram size of 1 is used if the provided size is less than or equal to 0. 
// Map returns the frequency of every n-gram of the specified size found in
// the provided term, together with the total number of n-grams, which is the
// sum of all the values in the returned map. An n-gram size of 1 is used if
// the provided size is less than or equal to 0. An empty, non-nil map is
// returned if the term is empty or shorter than the n-gram size.
func Map(runes []rune, size int) (map[string]int, int) {
	// Fall back to unigrams when the requested size is invalid.
	if size < 1 {
		size = 1
	}

	// A term shorter than the n-gram size yields no n-grams.
	total := len(runes) - size + 1
	if total <= 0 {
		return map[string]int{}, 0
	}

	// Count one occurrence per starting position.
	freq := make(map[string]int, total)
	for start := 0; start < total; start++ {
		freq[string(runes[start:start+size])]++
	}

	return freq, total
}
97 | count = commonNgrams[ngram] 98 | commonNgrams[ngram] = count + 1 99 | } 100 | } 101 | 102 | return commonNgrams, intersection, totalA, totalB 103 | } 104 | -------------------------------------------------------------------------------- /internal/ngram/ngram_test.go: -------------------------------------------------------------------------------- 1 | package ngram_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/adrg/strutil/internal/ngram" 7 | "github.com/stretchr/testify/require" 8 | ) 9 | 10 | func TestNgramCount(t *testing.T) { 11 | requireEqual(t, [][2]interface{}{ 12 | {0, ngram.Count(nil, -1)}, 13 | {0, ngram.Count(nil, 0)}, 14 | {0, ngram.Count(nil, 1)}, 15 | {0, ngram.Count([]rune{}, -1)}, 16 | {0, ngram.Count([]rune{}, 0)}, 17 | {0, ngram.Count([]rune{}, 1)}, 18 | {6, ngram.Count([]rune("abbabb"), -1)}, 19 | {6, ngram.Count([]rune("abbabb"), 0)}, 20 | {6, ngram.Count([]rune("abbabb"), 1)}, 21 | {5, ngram.Count([]rune("abbabb"), 2)}, 22 | {4, ngram.Count([]rune("abbabb"), 3)}, 23 | {3, ngram.Count([]rune("abbabb"), 4)}, 24 | {2, ngram.Count([]rune("abbabb"), 5)}, 25 | {1, ngram.Count([]rune("abbabb"), 6)}, 26 | {0, ngram.Count([]rune("abbabb"), 7)}, 27 | {0, ngram.Count([]rune("abbabb"), 8)}, 28 | }) 29 | } 30 | 31 | func TestNgrams(t *testing.T) { 32 | requireEqual(t, [][2]interface{}{ 33 | {0, len(ngram.Slice(nil, -1))}, 34 | {0, len(ngram.Slice(nil, 0))}, 35 | {0, len(ngram.Slice(nil, 1))}, 36 | {0, len(ngram.Slice([]rune{}, -1))}, 37 | {0, len(ngram.Slice([]rune{}, 0))}, 38 | {0, len(ngram.Slice([]rune{}, 1))}, 39 | { 40 | []string{"a", "b", "c", "d", "e", "f"}, 41 | ngram.Slice([]rune("abcdef"), -1), 42 | }, 43 | { 44 | []string{"a", "b", "c", "d", "e", "f"}, 45 | ngram.Slice([]rune("abcdef"), 0), 46 | }, 47 | { 48 | []string{"a", "b", "c", "d", "e", "f"}, 49 | ngram.Slice([]rune("abcdef"), 1), 50 | }, 51 | { 52 | []string{"ab", "bc", "cd", "de", "ef"}, 53 | ngram.Slice([]rune("abcdef"), 2), 54 | }, 55 | { 56 | []string{"abc", "bcd", 
"cde", "def"}, 57 | ngram.Slice([]rune("abcdef"), 3), 58 | }, 59 | { 60 | []string{"abcd", "bcde", "cdef"}, 61 | ngram.Slice([]rune("abcdef"), 4), 62 | }, 63 | { 64 | []string{"abcde", "bcdef"}, 65 | ngram.Slice([]rune("abcdef"), 5), 66 | }, 67 | { 68 | []string{"abcdef"}, 69 | ngram.Slice([]rune("abcdef"), 6), 70 | }, 71 | { 72 | 0, 73 | len(ngram.Slice([]rune("abcdef"), 7)), 74 | }, 75 | { 76 | 0, 77 | len(ngram.Slice([]rune("abcdef"), 8)), 78 | }, 79 | }) 80 | } 81 | 82 | func TestNgramMap(t *testing.T) { 83 | inputs := []*struct { 84 | term []rune 85 | size int 86 | expMap map[string]int 87 | expTotal int 88 | }{ 89 | { 90 | term: nil, 91 | size: -1, 92 | expMap: map[string]int{}, 93 | }, 94 | { 95 | term: nil, 96 | expMap: map[string]int{}, 97 | }, 98 | { 99 | term: nil, 100 | size: 1, 101 | expMap: map[string]int{}, 102 | }, 103 | { 104 | term: []rune{}, 105 | size: -1, 106 | expMap: map[string]int{}, 107 | }, 108 | { 109 | term: []rune{}, 110 | expMap: map[string]int{}, 111 | }, 112 | { 113 | term: []rune{}, 114 | size: 1, 115 | expMap: map[string]int{}, 116 | }, 117 | { 118 | term: []rune("abbabb"), 119 | size: -1, 120 | expMap: map[string]int{"a": 2, "b": 4}, 121 | expTotal: 6, 122 | }, 123 | { 124 | term: []rune("abbabb"), 125 | expMap: map[string]int{"a": 2, "b": 4}, 126 | expTotal: 6, 127 | }, 128 | { 129 | term: []rune("abbabb"), 130 | size: 1, 131 | expMap: map[string]int{"a": 2, "b": 4}, 132 | expTotal: 6, 133 | }, 134 | { 135 | term: []rune("abbabb"), 136 | size: 2, 137 | expMap: map[string]int{"ab": 2, "bb": 2, "ba": 1}, 138 | expTotal: 5, 139 | }, 140 | { 141 | term: []rune("abbabb"), 142 | size: 3, 143 | expMap: map[string]int{"abb": 2, "bba": 1, "bab": 1}, 144 | expTotal: 4, 145 | }, 146 | { 147 | term: []rune("abbabb"), 148 | size: 4, 149 | expMap: map[string]int{"abba": 1, "bbab": 1, "babb": 1}, 150 | expTotal: 3, 151 | }, 152 | { 153 | term: []rune("abbabb"), 154 | size: 5, 155 | expMap: map[string]int{"abbab": 1, "bbabb": 1}, 156 | expTotal: 
2, 157 | }, 158 | { 159 | term: []rune("abbabb"), 160 | size: 6, 161 | expMap: map[string]int{"abbabb": 1}, 162 | expTotal: 1, 163 | }, 164 | { 165 | term: []rune("abbabb"), 166 | size: 7, 167 | expMap: map[string]int{}, 168 | expTotal: 0, 169 | }, 170 | { 171 | term: []rune("abbabb"), 172 | size: 8, 173 | expMap: map[string]int{}, 174 | expTotal: 0, 175 | }, 176 | } 177 | 178 | for _, input := range inputs { 179 | actMap, actTotal := ngram.Map(input.term, input.size) 180 | require.Equal(t, input.expMap, actMap) 181 | require.Equal(t, input.expTotal, actTotal) 182 | } 183 | } 184 | 185 | func TestNgramIntersection(t *testing.T) { 186 | inputs := []*struct { 187 | a []rune 188 | b []rune 189 | size int 190 | 191 | expMap map[string]int 192 | expTotal int 193 | expTotalA int 194 | expTotalB int 195 | }{ 196 | { 197 | size: 1, 198 | expMap: map[string]int{}, 199 | }, 200 | { 201 | a: []rune{}, 202 | size: 1, 203 | expMap: map[string]int{}, 204 | }, 205 | { 206 | b: []rune{}, 207 | size: 1, 208 | expMap: map[string]int{}, 209 | }, 210 | { 211 | a: []rune{}, 212 | b: []rune{}, 213 | size: 1, 214 | expMap: map[string]int{}, 215 | }, 216 | { 217 | a: []rune("ababbaa"), 218 | b: []rune("aabbaa"), 219 | size: -1, 220 | expMap: map[string]int{"a": 4, "b": 2}, 221 | expTotal: 6, 222 | expTotalA: 7, 223 | expTotalB: 6, 224 | }, 225 | { 226 | a: []rune("aabbaa"), 227 | b: []rune("ababbaa"), 228 | expMap: map[string]int{"a": 4, "b": 2}, 229 | expTotal: 6, 230 | expTotalA: 6, 231 | expTotalB: 7, 232 | }, 233 | { 234 | a: []rune("ababbaa"), 235 | b: []rune("aabbaa"), 236 | size: 1, 237 | expMap: map[string]int{"a": 4, "b": 2}, 238 | expTotal: 6, 239 | expTotalA: 7, 240 | expTotalB: 6, 241 | }, 242 | { 243 | a: []rune("aabbaa"), 244 | b: []rune("ababbaa"), 245 | size: 2, 246 | expMap: map[string]int{"aa": 1, "ab": 1, "ba": 1, "bb": 1}, 247 | expTotal: 4, 248 | expTotalA: 5, 249 | expTotalB: 6, 250 | }, 251 | { 252 | a: []rune("ababbaa"), 253 | b: []rune("aabbaa"), 254 | size: 3, 
255 | expMap: map[string]int{"abb": 1, "bba": 1, "baa": 1}, 256 | expTotal: 3, 257 | expTotalA: 5, 258 | expTotalB: 4, 259 | }, 260 | { 261 | a: []rune("aabbaa"), 262 | b: []rune("ababbaa"), 263 | size: 4, 264 | expMap: map[string]int{"abba": 1, "bbaa": 1}, 265 | expTotal: 2, 266 | expTotalA: 3, 267 | expTotalB: 4, 268 | }, 269 | { 270 | a: []rune("ababbaa"), 271 | b: []rune("aabbaa"), 272 | size: 5, 273 | expMap: map[string]int{"abbaa": 1}, 274 | expTotal: 1, 275 | expTotalA: 3, 276 | expTotalB: 2, 277 | }, 278 | { 279 | a: []rune("aabbaa"), 280 | b: []rune("ababbaa"), 281 | size: 6, 282 | expMap: map[string]int{}, 283 | expTotalA: 1, 284 | expTotalB: 2, 285 | }, 286 | { 287 | a: []rune("ababbaa"), 288 | b: []rune("aabbaa"), 289 | size: 7, 290 | expMap: map[string]int{}, 291 | expTotalA: 1, 292 | }, 293 | { 294 | a: []rune("aabbaa"), 295 | b: []rune("ababbaa"), 296 | size: 7, 297 | expMap: map[string]int{}, 298 | expTotalB: 1, 299 | }, 300 | { 301 | a: []rune("ababbaa"), 302 | b: []rune("aabbaa"), 303 | size: 8, 304 | expMap: map[string]int{}, 305 | }, 306 | { 307 | a: []rune("aabbaa"), 308 | b: []rune("ababbaa"), 309 | size: 8, 310 | expMap: map[string]int{}, 311 | }, 312 | { 313 | a: []rune("ababbaa"), 314 | b: []rune("aabbaa"), 315 | size: 9, 316 | expMap: map[string]int{}, 317 | }, 318 | { 319 | a: []rune("aabbaa"), 320 | b: []rune("ababbaa"), 321 | size: 9, 322 | expMap: map[string]int{}, 323 | }, 324 | } 325 | 326 | for _, input := range inputs { 327 | actMap, actTotal, actTotalA, actTotalB := ngram.Intersection(input.a, input.b, input.size) 328 | require.Equal(t, input.expMap, actMap) 329 | require.Equal(t, input.expTotal, actTotal) 330 | require.Equal(t, input.expTotalA, actTotalA) 331 | require.Equal(t, input.expTotalB, actTotalB) 332 | } 333 | } 334 | 335 | func requireEqual(t *testing.T, inputs [][2]interface{}) { 336 | t.Helper() 337 | 338 | for _, input := range inputs { 339 | require.Equal(t, input[0], input[1]) 340 | } 341 | } 342 | 
// CommonPrefix returns the common prefix of the specified strings. An empty
// string is returned if the parameters have no prefix in common. The strings
// are compared rune by rune.
func CommonPrefix(first, second string) string {
	shorter, longer := []rune(first), []rune(second)
	if len(longer) < len(shorter) {
		shorter, longer = longer, shorter
	}

	// Advance while both terms agree; the shorter term bounds the scan.
	n := 0
	for n < len(shorter) && shorter[n] == longer[n] {
		n++
	}

	return string(longer[:n])
}
// SliceContains reports whether q is one of the items in terms. It returns
// false for a nil or empty slice.
func SliceContains(terms []string, q string) bool {
	for i := range terms {
		if terms[i] == q {
			return true
		}
	}

	return false
}
stringutil.UniqueSlice([]string{"b", "a", "a"})}, 40 | {[]string{"a", "b"}, stringutil.UniqueSlice([]string{"a", "a", "b"})}, 41 | {[]string{"a", "b"}, stringutil.UniqueSlice([]string{"a", "a", "a", "b"})}, 42 | {[]string{"b", "a"}, stringutil.UniqueSlice([]string{"b", "a", "a", "a"})}, 43 | {[]string{"a", "b"}, stringutil.UniqueSlice([]string{"a", "b", "b", "a"})}, 44 | {[]string{"a", "b"}, stringutil.UniqueSlice([]string{"a", "b", "a", "b"})}, 45 | }) 46 | } 47 | 48 | func TestSliceContains(t *testing.T) { 49 | requireEqual(t, [][2]interface{}{ 50 | {false, stringutil.SliceContains(nil, "")}, 51 | {false, stringutil.SliceContains(nil, "a")}, 52 | {false, stringutil.SliceContains([]string{}, "")}, 53 | {false, stringutil.SliceContains([]string{}, "a")}, 54 | {true, stringutil.SliceContains([]string{"a", "b"}, "a")}, 55 | {true, stringutil.SliceContains([]string{"b", "a"}, "a")}, 56 | {false, stringutil.SliceContains([]string{"b", "a"}, "c")}, 57 | }) 58 | } 59 | 60 | func requireEqual(t *testing.T, inputs [][2]interface{}) { 61 | t.Helper() 62 | 63 | for _, input := range inputs { 64 | require.Equal(t, input[0], input[1]) 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /metrics/examples_test.go: -------------------------------------------------------------------------------- 1 | package metrics_test 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/adrg/strutil/metrics" 7 | ) 8 | 9 | func ExampleHamming() { 10 | // Default options. 11 | h := metrics.NewHamming() 12 | 13 | sim := h.Compare("text", "test") 14 | fmt.Printf("(text, test) similarity: %.2f\n", sim) 15 | 16 | dist := h.Distance("text", "test") 17 | fmt.Printf("(text, test) distance: %d\n", dist) 18 | 19 | // Custom options. 
20 | h.CaseSensitive = false 21 | 22 | sim = h.Compare("ONE", "once") 23 | fmt.Printf("(ONE, once) similarity: %.2f\n", sim) 24 | 25 | dist = h.Distance("one", "once") 26 | fmt.Printf("(ONE, once) distance: %d\n", dist) 27 | 28 | // Output: 29 | // (text, test) similarity: 0.75 30 | // (text, test) distance: 1 31 | // (ONE, once) similarity: 0.50 32 | // (ONE, once) distance: 2 33 | } 34 | 35 | func ExampleLevenshtein() { 36 | // Default options. 37 | lev := metrics.NewLevenshtein() 38 | 39 | sim := lev.Compare("book", "brick") 40 | fmt.Printf("(book, brick) similarity: %.2f\n", sim) 41 | 42 | dist := lev.Distance("book", "brick") 43 | fmt.Printf("(book, brick) distance: %d\n", dist) 44 | 45 | // Custom options. 46 | lev.CaseSensitive = false 47 | lev.ReplaceCost = 2 48 | 49 | sim = lev.Compare("HELLO", "jello") 50 | fmt.Printf("(HELLO, jello) similarity: %.2f\n", sim) 51 | 52 | dist = lev.Distance("HELLO", "jello") 53 | fmt.Printf("(HELLO, jello) distance: %d\n", dist) 54 | 55 | // Output: 56 | // (book, brick) similarity: 0.40 57 | // (book, brick) distance: 3 58 | // (HELLO, jello) similarity: 0.60 59 | // (HELLO, jello) distance: 2 60 | } 61 | 62 | func ExampleJaro() { 63 | jaro := metrics.NewJaro() 64 | sim := jaro.Compare("sort", "shirt") 65 | fmt.Printf("(sort, shirt) similarity: %.2f\n", sim) 66 | 67 | // Output: 68 | // (sort, shirt) similarity: 0.78 69 | } 70 | 71 | func ExampleJaroWinkler() { 72 | jw := metrics.NewJaroWinkler() 73 | sim := jw.Compare("sort", "shirt") 74 | fmt.Printf("(sort, shirt) similarity: %.2f\n", sim) 75 | 76 | // Output: 77 | // (sort, shirt) similarity: 0.80 78 | } 79 | 80 | func ExampleSmithWatermanGotoh() { 81 | // Default options. 82 | swg := metrics.NewSmithWatermanGotoh() 83 | 84 | sim := swg.Compare("a pink kitten", "a kitten") 85 | fmt.Printf("(a pink kitten, a kitten) similarity: %.2f\n", sim) 86 | 87 | // Custom options. 
88 | swg.CaseSensitive = false 89 | swg.GapPenalty = -0.1 90 | swg.Substitution = metrics.MatchMismatch{ 91 | Match: 1, 92 | Mismatch: -0.5, 93 | } 94 | 95 | sim = swg.Compare("a pink kitten", "A KITTEN") 96 | fmt.Printf("(a pink kitten, A KITTEN) similarity: %.2f\n", sim) 97 | 98 | // Output: 99 | // (a pink kitten, a kitten) similarity: 0.88 100 | // (a pink kitten, A KITTEN) similarity: 0.94 101 | } 102 | 103 | func ExampleSorensenDice() { 104 | // Default options. 105 | sd := metrics.NewSorensenDice() 106 | sim := sd.Compare("night", "alright") 107 | fmt.Printf("(night, alright) similarity: %.2f\n", sim) 108 | 109 | // Custom options. 110 | sd.CaseSensitive = false 111 | sd.NgramSize = 3 112 | 113 | sim = sd.Compare("night", "alright") 114 | fmt.Printf("(night, alright) similarity: %.2f\n", sim) 115 | 116 | // Output: 117 | // (night, alright) similarity: 0.60 118 | // (night, alright) similarity: 0.50 119 | } 120 | 121 | func ExampleJaccard() { 122 | // Default options. 123 | j := metrics.NewJaccard() 124 | sim := j.Compare("night", "alright") 125 | fmt.Printf("(night, alright) similarity: %.2f\n", sim) 126 | 127 | // Custom options. 128 | j.CaseSensitive = false 129 | j.NgramSize = 3 130 | 131 | sim = j.Compare("night", "alright") 132 | fmt.Printf("(night, alright) similarity: %.2f\n", sim) 133 | 134 | // Output: 135 | // (night, alright) similarity: 0.43 136 | // (night, alright) similarity: 0.33 137 | } 138 | 139 | func ExampleOverlapCoefficient() { 140 | // Default options. 141 | oc := metrics.NewOverlapCoefficient() 142 | sim := oc.Compare("night", "alright") 143 | fmt.Printf("(night, alright) similarity: %.2f\n", sim) 144 | 145 | // Subset comparison. 146 | sim = oc.Compare("aa", "aaaa") 147 | fmt.Printf("(aa, aaaa) similarity: %.2f\n", sim) 148 | 149 | // Custom options. 
// Hamming represents the Hamming metric for measuring the similarity
// between sequences.
// For more information see https://en.wikipedia.org/wiki/Hamming_distance.
type Hamming struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool
}

// NewHamming returns a new Hamming string metric.
//
// Default options:
//
//	CaseSensitive: true
func NewHamming() *Hamming {
	return &Hamming{
		CaseSensitive: true,
	}
}

// Compare returns the Hamming similarity of a and b. The returned
// similarity is a number between 0 and 1. Larger similarity numbers indicate
// closer matches. Two empty strings are considered identical and yield a
// similarity of 1.
func (m *Hamming) Compare(a, b string) float64 {
	distance, maxLen := m.distance(a, b)

	// The maximum length is 0 only when both terms are empty. Return early in
	// that case, as the 0/0 division below would evaluate to NaN.
	if maxLen == 0 {
		return 1
	}

	return 1 - float64(distance)/float64(maxLen)
}

// Distance returns the Hamming distance between a and b. Lower distances
// indicate closer matches. A distance of 0 means the strings are identical.
func (m *Hamming) Distance(a, b string) int {
	distance, _ := m.distance(a, b)
	return distance
}

// distance returns the Hamming distance between a and b, along with the
// length (in runes) of the longer term. Both values are 0 if the terms are
// empty.
func (m *Hamming) distance(a, b string) (int, int) {
	// Lower terms if case insensitive comparison is specified.
	if !m.CaseSensitive {
		a = strings.ToLower(a)
		b = strings.ToLower(b)
	}
	runesA, runesB := []rune(a), []rune(b)

	// Check if both terms are empty.
	lenA, lenB := len(runesA), len(runesB)
	if lenA == 0 && lenB == 0 {
		return 0, 0
	}

	// If the lengths of the sequences are not equal, the distance is
	// initialized to their absolute difference. Otherwise, it is set to 0.
	// After the swap, lenA holds the shorter length and lenB the longer one.
	if lenA > lenB {
		lenA, lenB = lenB, lenA
	}
	distance := lenB - lenA

	// Calculate Hamming distance over the overlapping part of the terms.
	for i := 0; i < lenA; i++ {
		if runesA[i] != runesB[i] {
			distance++
		}
	}

	return distance, lenB
}
37 | func (m *Jaccard) Compare(a, b string) float64 { 38 | // Lower terms if case insensitive comparison is specified. 39 | if !m.CaseSensitive { 40 | a = strings.ToLower(a) 41 | b = strings.ToLower(b) 42 | } 43 | 44 | // Check if both terms are empty. 45 | runesA, runesB := []rune(a), []rune(b) 46 | if len(runesA) == 0 && len(runesB) == 0 { 47 | return 1 48 | } 49 | 50 | size := m.NgramSize 51 | if size <= 0 { 52 | size = 2 53 | } 54 | 55 | // Calculate n-gram intersection and union. 56 | _, common, totalA, totalB := ngram.Intersection(runesA, runesB, size) 57 | 58 | total := totalA + totalB 59 | if total == 0 { 60 | return 0 61 | } 62 | 63 | // Return similarity. 64 | return float64(common) / float64(total-common) 65 | } 66 | -------------------------------------------------------------------------------- /metrics/jaro.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | import ( 4 | "strings" 5 | "unicode/utf8" 6 | 7 | "github.com/adrg/strutil/internal/mathutil" 8 | ) 9 | 10 | // Jaro represents the Jaro metric for measuring the similarity 11 | // between sequences. 12 | // For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance. 13 | type Jaro struct { 14 | // CaseSensitive specifies if the string comparison is case sensitive. 15 | CaseSensitive bool 16 | } 17 | 18 | // NewJaro returns a new Jaro string metric. 19 | // 20 | // Default options: 21 | // CaseSensitive: true 22 | func NewJaro() *Jaro { 23 | return &Jaro{ 24 | CaseSensitive: true, 25 | } 26 | } 27 | 28 | // Compare returns the Jaro similarity of a and b. The returned similarity is 29 | // a number between 0 and 1. Larger similarity numbers indicate closer matches. 30 | func (m *Jaro) Compare(a, b string) float64 { 31 | // Check if both terms are empty. 
// matchingRunes returns the runes of a which also occur in b at most limit
// positions away from their position in a. The matches are returned in the
// order in which they occur in a, and each rune of b is matched at most once.
// An empty, non-nil slice is returned if there are no matches.
func matchingRunes(a, b string, limit int) []rune {
	source, target := []rune(a), []rune(b)
	matched := []rune{}

	for pos, r := range source {
		// Clamp the search window around pos to the bounds of target.
		lo, hi := pos-limit, pos+limit+1
		if lo < 0 {
			lo = 0
		}
		if hi > len(target) {
			hi = len(target)
		}

		for k := lo; k < hi; k++ {
			// Skip the runes consumed by a previous match (marked with -1)
			// and the ones which differ from the current rune.
			if target[k] == -1 || target[k] != r {
				continue
			}

			matched = append(matched, target[k])
			target[k] = -1
			break
		}
	}

	return matched
}
// between sequences. 12 | // For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance. 13 | type JaroWinkler struct { 14 | // CaseSensitive specifies if the string comparison is case sensitive. 15 | CaseSensitive bool 16 | } 17 | 18 | // NewJaroWinkler returns a new Jaro-Winkler string metric. 19 | // 20 | // Default options: 21 | // CaseSensitive: true 22 | func NewJaroWinkler() *JaroWinkler { 23 | return &JaroWinkler{ 24 | CaseSensitive: true, 25 | } 26 | } 27 | 28 | // Compare returns the Jaro-Winkler similarity of a and b. The returned 29 | // similarity is a number between 0 and 1. Larger similarity numbers indicate 30 | // closer matches. 31 | func (m *JaroWinkler) Compare(a, b string) float64 { 32 | // Lower terms if case insensitive comparison is specified. 33 | if !m.CaseSensitive { 34 | a = strings.ToLower(a) 35 | b = strings.ToLower(b) 36 | } 37 | 38 | // Calculate common prefix. 39 | lenPrefix := utf8.RuneCountInString(stringutil.CommonPrefix(a, b)) 40 | if lenPrefix > 4 { 41 | lenPrefix = 4 42 | } 43 | 44 | jaro := NewJaro() 45 | jaro.CaseSensitive = m.CaseSensitive 46 | 47 | // Return similarity. 48 | similarity := jaro.Compare(a, b) 49 | return similarity + (0.1 * float64(lenPrefix) * (1.0 - similarity)) 50 | } 51 | -------------------------------------------------------------------------------- /metrics/levenshtein.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | import ( 4 | "strings" 5 | 6 | "github.com/adrg/strutil/internal/mathutil" 7 | ) 8 | 9 | // Levenshtein represents the Levenshtein metric for measuring the similarity 10 | // between sequences. 11 | // For more information see https://en.wikipedia.org/wiki/Levenshtein_distance. 12 | type Levenshtein struct { 13 | // CaseSensitive specifies if the string comparison is case sensitive. 14 | CaseSensitive bool 15 | 16 | // InsertCost represents the Levenshtein cost of a character insertion. 
17 | InsertCost int 18 | 19 | // InsertCost represents the Levenshtein cost of a character deletion. 20 | DeleteCost int 21 | 22 | // InsertCost represents the Levenshtein cost of a character substitution. 23 | ReplaceCost int 24 | } 25 | 26 | // NewLevenshtein returns a new Levenshtein string metric. 27 | // 28 | // Default options: 29 | // CaseSensitive: true 30 | // InsertCost: 1 31 | // DeleteCost: 1 32 | // ReplaceCost: 1 33 | func NewLevenshtein() *Levenshtein { 34 | return &Levenshtein{ 35 | CaseSensitive: true, 36 | InsertCost: 1, 37 | DeleteCost: 1, 38 | ReplaceCost: 1, 39 | } 40 | } 41 | 42 | // Compare returns the Levenshtein similarity of a and b. The returned 43 | // similarity is a number between 0 and 1. Larger similarity numbers indicate 44 | // closer matches. 45 | func (m *Levenshtein) Compare(a, b string) float64 { 46 | distance, maxLen := m.distance(a, b) 47 | return 1 - float64(distance)/float64(maxLen) 48 | } 49 | 50 | // Distance returns the Levenshtein distance between a and b. Lower distances 51 | // indicate closer matches. A distance of 0 means the strings are identical. 52 | func (m *Levenshtein) Distance(a, b string) int { 53 | distance, _ := m.distance(a, b) 54 | return distance 55 | } 56 | 57 | func (m *Levenshtein) distance(a, b string) (int, int) { 58 | // Lower terms if case insensitive comparison is specified. 59 | if !m.CaseSensitive { 60 | a = strings.ToLower(a) 61 | b = strings.ToLower(b) 62 | } 63 | runesA, runesB := []rune(a), []rune(b) 64 | 65 | // Check if both terms are empty. 66 | lenA, lenB := len(runesA), len(runesB) 67 | if lenA == 0 && lenB == 0 { 68 | return 0, 0 69 | } 70 | 71 | // Check if one of the terms is empty. 72 | maxLen := mathutil.Max(lenA, lenB) 73 | if lenA == 0 { 74 | return m.InsertCost * lenB, maxLen 75 | } 76 | if lenB == 0 { 77 | return m.DeleteCost * lenA, maxLen 78 | } 79 | 80 | // Initialize cost slice. 
81 | prevCol := make([]int, lenB+1) 82 | for i := 0; i <= lenB; i++ { 83 | prevCol[i] = i 84 | } 85 | 86 | // Calculate distance. 87 | col := make([]int, lenB+1) 88 | for i := 0; i < lenA; i++ { 89 | col[0] = i + 1 90 | for j := 0; j < lenB; j++ { 91 | delCost := prevCol[j+1] + m.DeleteCost 92 | insCost := col[j] + m.InsertCost 93 | 94 | subCost := prevCol[j] 95 | if runesA[i] != runesB[j] { 96 | subCost += m.ReplaceCost 97 | } 98 | 99 | col[j+1] = mathutil.Min(delCost, insCost, subCost) 100 | } 101 | 102 | col, prevCol = prevCol, col 103 | } 104 | 105 | return prevCol[lenB], maxLen 106 | } 107 | -------------------------------------------------------------------------------- /metrics/match_mismatch.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | // MatchMismatch represents a substitution function which returns the match or 4 | // mismatch value depeding on the equality of the compared characters. The 5 | // match value must be greater than the mismatch value. 6 | type MatchMismatch struct { 7 | // Match represents the score of equal character substitutions. 8 | Match float64 9 | 10 | // Mismatch represents the score of unequal character substitutions. 11 | Mismatch float64 12 | } 13 | 14 | // Compare returns the match value if a[idxA] is equal to b[idxB] or the 15 | // mismatch value otherwise. 16 | func (m MatchMismatch) Compare(a []rune, idxA int, b []rune, idxB int) float64 { 17 | if a[idxA] == b[idxB] { 18 | return m.Match 19 | } 20 | 21 | return m.Mismatch 22 | } 23 | 24 | // Max returns the match value. 25 | func (m MatchMismatch) Max() float64 { 26 | return m.Match 27 | } 28 | 29 | // Min returns the mismatch value. 
// sf formats a with exactly two decimal places. The tests in this file
// compare these strings instead of raw float64 values, so expectations only
// depend on the first two decimals of each similarity.
func sf(a float64) string {
	return fmt.Sprintf("%.2f", a)
}
// TestJaroWinkler checks the Jaro-Winkler similarity, rounded to two
// decimals, for empty terms, terms containing multi-byte runes and terms
// which share a common prefix, followed by a case insensitive comparison.
func TestJaroWinkler(t *testing.T) {
	j := metrics.NewJaroWinkler()
	require.Equal(t, "1.00", sf(j.Compare("", "")))
	require.Equal(t, "0.00", sf(j.Compare("test", "")))
	require.Equal(t, "1.00", sf(j.Compare("ab\u2019c", "ab\u2019c")))
	require.Equal(t, "0.88", sf(j.Compare("ab\u2019d", "ab\u2019c")))
	require.Equal(t, "0.87", sf(j.Compare("ab\u2018c", "ab\u2019c")))
	require.Equal(t, "0.80", sf(j.Compare("sort", "shirt")))
	require.Equal(t, "0.94", sf(j.Compare("charm", "charmed")))
	// Upper case input must score the same as the lower case pair above once
	// case sensitivity is disabled.
	j.CaseSensitive = false
	require.Equal(t, "0.80", sf(j.Compare("sort", "SHIRT")))
}
// TestOperlapCoefficient checks the overlap coefficient, rounded to two
// decimals, for empty terms, terms containing multi-byte runes and several
// n-gram sizes.
//
// NOTE(review): the function name misspells "Overlap"; consider renaming it
// to TestOverlapCoefficient.
func TestOperlapCoefficient(t *testing.T) {
	o := metrics.NewOverlapCoefficient()
	require.Equal(t, "1.00", sf(o.Compare("", "")))
	require.Equal(t, "0.75", sf(o.Compare("night", "alright")))
	require.Equal(t, "0.00", sf(o.Compare("aa", "")))
	require.Equal(t, "0.00", sf(o.Compare("bb", "")))
	require.Equal(t, "1.00", sf(o.Compare("ab\u2019c", "ab\u2019c")))
	require.Equal(t, "0.67", sf(o.Compare("ab\u2019d", "ab\u2019c")))
	require.Equal(t, "0.33", sf(o.Compare("ab\u2018c", "ab\u2019c")))
	// A non-positive n-gram size falls back to 2, so the result matches the
	// one obtained with the default options above.
	o.NgramSize = 0
	require.Equal(t, "0.75", sf(o.Compare("night", "alright")))
	require.Equal(t, "1.00", sf(o.Compare("aa", "aaaa")))
	o.CaseSensitive = false
	require.Equal(t, "1.00", sf(o.Compare("aa", "AAAA")))
	o.NgramSize = 3
	require.Equal(t, "0.67", sf(o.Compare("night", "alright")))
}
"b"))) 131 | require.Equal(t, "0.60", sf(s.Compare("night", "alright"))) 132 | require.Equal(t, "1.00", sf(s.Compare("ab\u2019c", "ab\u2019c"))) 133 | require.Equal(t, "0.67", sf(s.Compare("ab\u2019d", "ab\u2019c"))) 134 | require.Equal(t, "0.33", sf(s.Compare("ab\u2018c", "ab\u2019c"))) 135 | s.NgramSize = 0 136 | require.Equal(t, "0.60", sf(s.Compare("night", "alright"))) 137 | s.CaseSensitive = false 138 | require.Equal(t, "0.60", sf(s.Compare("night", "ALRIGHT"))) 139 | s.NgramSize = 3 140 | require.Equal(t, "0.50", sf(s.Compare("night", "alright"))) 141 | } 142 | 143 | func TestMatchMismatch(t *testing.T) { 144 | m := metrics.MatchMismatch{ 145 | Match: 2, 146 | Mismatch: 1, 147 | } 148 | require.Equal(t, "1.00", sf(m.Compare([]rune{'a'}, 0, []rune{'b'}, 0))) 149 | require.Equal(t, "2.00", sf(m.Compare([]rune{'a'}, 0, []rune{'a'}, 0))) 150 | require.Equal(t, "1.00", sf(m.Min())) 151 | require.Equal(t, "2.00", sf(m.Max())) 152 | } 153 | -------------------------------------------------------------------------------- /metrics/overlap_coefficient.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | import ( 4 | "strings" 5 | 6 | "github.com/adrg/strutil/internal/mathutil" 7 | "github.com/adrg/strutil/internal/ngram" 8 | ) 9 | 10 | // OverlapCoefficient represents the overlap coefficient for measuring the 11 | // similarity between sequences. The metric is also know as the 12 | // Szymkiewicz-Simpson coefficient. 13 | // For more information see https://en.wikipedia.org/wiki/Overlap_coefficient. 14 | type OverlapCoefficient struct { 15 | // CaseSensitive specifies if the string comparison is case sensitive. 16 | CaseSensitive bool 17 | 18 | // NgramSize represents the size (in characters) of the tokens generated 19 | // when comparing the input sequences. 20 | NgramSize int 21 | } 22 | 23 | // NewOverlapCoefficient returns a new overlap coefficient string metric. 
24 | // 25 | // Default options: 26 | // CaseSensitive: true 27 | // NGramSize: 2 28 | func NewOverlapCoefficient() *OverlapCoefficient { 29 | return &OverlapCoefficient{ 30 | CaseSensitive: true, 31 | NgramSize: 2, 32 | } 33 | } 34 | 35 | // Compare returns the OverlapCoefficient similarity coefficient of a and b. 36 | // The returned similarity is a number between 0 and 1. Larger similarity 37 | // numbers indicate closer matches. 38 | // An n-gram size of 2 is used if the provided size is less than or equal to 0. 39 | func (m *OverlapCoefficient) Compare(a, b string) float64 { 40 | // Lower terms if case insensitive comparison is specified. 41 | if !m.CaseSensitive { 42 | a = strings.ToLower(a) 43 | b = strings.ToLower(b) 44 | } 45 | 46 | // Check if both terms are empty. 47 | runesA, runesB := []rune(a), []rune(b) 48 | if len(runesA) == 0 && len(runesB) == 0 { 49 | return 1 50 | } 51 | 52 | size := m.NgramSize 53 | if size <= 0 { 54 | size = 2 55 | } 56 | 57 | // Calculate n-gram intersection and minimum subset. 58 | _, common, totalA, totalB := ngram.Intersection(runesA, runesB, size) 59 | 60 | min := mathutil.Min(totalA, totalB) 61 | if min == 0 { 62 | return 0 63 | } 64 | 65 | // Return similarity. 66 | return float64(common) / float64(min) 67 | } 68 | -------------------------------------------------------------------------------- /metrics/smith_waterman_gotoh.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | import ( 4 | "strings" 5 | 6 | "github.com/adrg/strutil/internal/mathutil" 7 | ) 8 | 9 | // SmithWatermanGotoh represents the Smith-Waterman-Gotoh metric for measuring 10 | // the similarity between sequences. 11 | // For more information see https://en.wikipedia.org/wiki/Smith-Waterman_algorithm. 12 | type SmithWatermanGotoh struct { 13 | // CaseSensitive specifies if the string comparison is case sensitive. 
14 | CaseSensitive bool 15 | 16 | // GapPenalty defines a score penalty for character insertions or deletions. 17 | // For relevant results, the gap penalty should be a non-positive number. 18 | GapPenalty float64 19 | 20 | // Substitution represents a substitution function which is used to 21 | // calculate a score for character substitutions. 22 | Substitution Substitution 23 | } 24 | 25 | // NewSmithWatermanGotoh returns a new Smith-Waterman-Gotoh string metric. 26 | // 27 | // Default options: 28 | // CaseSensitive: true 29 | // GapPenalty: -0.5 30 | // Substitution: MatchMismatch{ 31 | // Match: 1, 32 | // Mismatch: -2, 33 | // }, 34 | func NewSmithWatermanGotoh() *SmithWatermanGotoh { 35 | return &SmithWatermanGotoh{ 36 | CaseSensitive: true, 37 | GapPenalty: -0.5, 38 | Substitution: MatchMismatch{ 39 | Match: 1, 40 | Mismatch: -2, 41 | }, 42 | } 43 | } 44 | 45 | // Compare returns the Smith-Waterman-Gotoh similarity of a and b. The returned 46 | // similarity is a number between 0 and 1. Larger similarity numbers indicate 47 | // closer matches. 48 | func (m *SmithWatermanGotoh) Compare(a, b string) float64 { 49 | gap := m.GapPenalty 50 | 51 | // Lower terms if case insensitive comparison is specified. 52 | if !m.CaseSensitive { 53 | a = strings.ToLower(a) 54 | b = strings.ToLower(b) 55 | } 56 | runesA, runesB := []rune(a), []rune(b) 57 | 58 | // Check if both terms are empty. 59 | lenA, lenB := len(runesA), len(runesB) 60 | if lenA == 0 && lenB == 0 { 61 | return 1 62 | } 63 | 64 | // Check if one of the terms is empty. 65 | if lenA == 0 || lenB == 0 { 66 | return 0 67 | } 68 | 69 | // Use default substitution, if none is specified. 70 | subst := m.Substitution 71 | if subst == nil { 72 | subst = MatchMismatch{ 73 | Match: 1, 74 | Mismatch: -2, 75 | } 76 | } 77 | 78 | // Calculate max distance. 79 | maxDistance := mathutil.Minf(float64(lenA), float64(lenB)) * mathutil.Maxf(subst.Max(), gap) 80 | 81 | // Calculate distance. 
82 | v0 := make([]float64, lenB) 83 | v1 := make([]float64, lenB) 84 | 85 | distance := mathutil.Maxf(0, gap, subst.Compare(runesA, 0, runesB, 0)) 86 | v0[0] = distance 87 | 88 | for i := 1; i < lenB; i++ { 89 | v0[i] = mathutil.Maxf(0, v0[i-1]+gap, subst.Compare(runesA, 0, runesB, i)) 90 | distance = mathutil.Maxf(distance, v0[i]) 91 | } 92 | 93 | for i := 1; i < lenA; i++ { 94 | v1[0] = mathutil.Maxf(0, v0[0]+gap, subst.Compare(runesA, i, runesB, 0)) 95 | distance = mathutil.Maxf(distance, v1[0]) 96 | 97 | for j := 1; j < lenB; j++ { 98 | v1[j] = mathutil.Maxf(0, v0[j]+gap, v1[j-1]+gap, v0[j-1]+subst.Compare(runesA, i, runesB, j)) 99 | distance = mathutil.Maxf(distance, v1[j]) 100 | } 101 | 102 | for j := 0; j < lenB; j++ { 103 | v0[j] = v1[j] 104 | } 105 | } 106 | 107 | // Return similarity. 108 | return distance / maxDistance 109 | } 110 | -------------------------------------------------------------------------------- /metrics/sorensen_dice.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | import ( 4 | "strings" 5 | 6 | "github.com/adrg/strutil/internal/ngram" 7 | ) 8 | 9 | // SorensenDice represents the Sorensen-Dice metric for measuring the 10 | // similarity between sequences. 11 | // For more information see https://en.wikipedia.org/wiki/Sorensen-Dice_coefficient. 12 | type SorensenDice struct { 13 | // CaseSensitive specifies if the string comparison is case sensitive. 14 | CaseSensitive bool 15 | 16 | // NgramSize represents the size (in characters) of the tokens generated 17 | // when comparing the input sequences. 18 | NgramSize int 19 | } 20 | 21 | // NewSorensenDice returns a new Sorensen-Dice string metric. 
22 | // 23 | // Default options: 24 | // CaseSensitive: true 25 | // NGramSize: 2 26 | func NewSorensenDice() *SorensenDice { 27 | return &SorensenDice{ 28 | CaseSensitive: true, 29 | NgramSize: 2, 30 | } 31 | } 32 | 33 | // Compare returns the Sorensen-Dice similarity coefficient of a and b. The 34 | // returned similarity is a number between 0 and 1. Larger similarity numbers 35 | // indicate closer matches. 36 | // An n-gram size of 2 is used if the provided size is less than or equal to 0. 37 | func (m *SorensenDice) Compare(a, b string) float64 { 38 | // Lower terms if case insensitive comparison is specified. 39 | if !m.CaseSensitive { 40 | a = strings.ToLower(a) 41 | b = strings.ToLower(b) 42 | } 43 | 44 | // Check if both terms are empty. 45 | runesA, runesB := []rune(a), []rune(b) 46 | if len(runesA) == 0 && len(runesB) == 0 { 47 | return 1 48 | } 49 | 50 | size := m.NgramSize 51 | if size <= 0 { 52 | size = 2 53 | } 54 | 55 | // Calculate n-gram intersection and union. 56 | _, common, totalA, totalB := ngram.Intersection(runesA, runesB, size) 57 | 58 | total := totalA + totalB 59 | if total == 0 { 60 | return 0 61 | } 62 | 63 | // Return similarity. 64 | return 2 * float64(common) / float64(total) 65 | } 66 | -------------------------------------------------------------------------------- /metrics/substitution.go: -------------------------------------------------------------------------------- 1 | package metrics 2 | 3 | // Substitution represents a substitution function which is used to 4 | // calculate a score for character substitutions. 5 | type Substitution interface { 6 | // Compare returns the substitution score of characters a[idxA] and b[idxB]. 7 | Compare(a []rune, idxA int, b []rune, idxB int) float64 8 | 9 | // Returns the maximum score of a character substitution operation. 10 | Max() float64 11 | 12 | // Returns the minimum score of a character substitution operation. 
// Similarity returns the similarity of a and b, computed using the specified
// string metric. The returned similarity is a number between 0 and 1. Larger
// similarity numbers indicate closer matches.
//
// The function returns the result of metric.Compare unchanged. The metric
// must not be nil, as calling a method on a nil interface value panics.
func Similarity(a, b string, metric StringMetric) float64 {
	return metric.Compare(a, b)
}
// NgramCount returns the n-gram count of the specified size for the
// provided term. An n-gram size of 1 is used if the provided size is
// less than or equal to 0.
//
// The term is converted to a rune slice before it is passed to ngram.Count.
func NgramCount(term string, size int) int {
	return ngram.Count([]rune(term), size)
}
func NgramIntersection(a, b string, size int) (map[string]int, int, int, int) {
	// Both terms are converted to rune slices before they are passed to
	// ngram.Intersection, whose four results are returned unchanged.
	return ngram.Intersection([]rune(a), []rune(b), size)
}