├── .github
    ├── FUNDING.yml
    ├── dependabot.yml
    └── workflows
    │   ├── analyze.yml
    │   ├── lint.yml
    │   └── tests.yml
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── example_test.go
├── go.mod
├── go.sum
├── internal
    ├── mathutil
    │   ├── math.go
    │   └── mathutil_test.go
    ├── ngram
    │   ├── ngram.go
    │   └── ngram_test.go
    └── stringutil
    │   ├── stringutil.go
    │   └── stringutil_test.go
├── metrics
    ├── examples_test.go
    ├── hamming.go
    ├── jaccard.go
    ├── jaro.go
    ├── jaro_winkler.go
    ├── levenshtein.go
    ├── match_mismatch.go
    ├── metrics_test.go
    ├── overlap_coefficient.go
    ├── smith_waterman_gotoh.go
    ├── sorensen_dice.go
    └── substitution.go
└── strutil.go


/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | ko_fi: adrg
2 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | updates:
 3 |   - package-ecosystem: "gomod"
 4 |     directory: "/"
 5 |     schedule:
 6 |       interval: "daily"
 7 |     reviewers:
 8 |       - "adrg"
 9 |   - package-ecosystem: "github-actions"
10 |     directory: "/"
11 |     schedule:
12 |       interval: "daily"
13 |     reviewers:
14 |       - "adrg"
15 | 


--------------------------------------------------------------------------------
/.github/workflows/analyze.yml:
--------------------------------------------------------------------------------
 1 | name: analyze
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [master]
 6 |   pull_request:
 7 |     branches: [master]
 8 |   schedule:
 9 |     - cron: "0 6 * * 1"
10 | 
11 | permissions:
12 |   actions: read
13 |   contents: read
14 |   security-events: write
15 | 
16 | jobs:
17 |   analyze:
18 |     runs-on: ubuntu-latest
19 | 
20 |     steps:
21 |       - name: Checkout
22 |         uses: actions/checkout@v4
23 | 
24 |       - name: Initialize CodeQL
25 |         uses: github/codeql-action/init@v3
26 |         with:
27 |           languages: go
28 |           queries: security-and-quality
29 | 
30 |       - name: Run CodeQL analysis
31 |         uses: github/codeql-action/analyze@v3
32 | 


--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
 1 | name: lint
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [master]
 6 |   pull_request:
 7 | 
 8 | permissions:
 9 |   contents: read
10 | 
11 | jobs:
12 |   lint:
13 |     strategy:
14 |       matrix:
15 |         go: [stable]
16 |         os: [ubuntu-latest]
17 |     runs-on: ${{ matrix.os }}
18 |     steps:
19 |     - name: Setup
20 |       uses: actions/setup-go@v5
21 |       with:
22 |         go-version: ${{ matrix.go }}
23 | 
24 |     - name: Prepare checkout
25 |       run: git config --global core.autocrlf false
26 | 
27 |     - name: Checkout
28 |       uses: actions/checkout@v4
29 | 
30 |     - name: Lint
31 |       uses: golangci/golangci-lint-action@v8.0.0
32 |       with:
33 |         args: --timeout=5m
34 | 


--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
 1 | name: tests
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [master]
 6 |   pull_request:
 7 | 
 8 | permissions:
 9 |   contents: read
10 | 
11 | jobs:
12 |   test:
13 |     strategy:
14 |       matrix:
15 |         go: [stable]
16 |         os: [ubuntu-latest]
17 |     runs-on: ${{ matrix.os }}
18 |     steps:
19 |     - name: Setup
20 |       uses: actions/setup-go@v5
21 |       with:
22 |         go-version: ${{ matrix.go }}
23 | 
24 |     - name: Prepare checkout
25 |       run: git config --global core.autocrlf false
26 | 
27 |     - name: Checkout
28 |       uses: actions/checkout@v4
29 | 
30 |     - name: Test
31 |       run: go test -v -coverprofile coverage.txt -covermode atomic ./...
32 | 
33 |     - name: Coverage
34 |       uses: codecov/codecov-action@v5
35 |       env:
36 |         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
37 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Contributor Covenant Code of Conduct
 2 | 
 3 | ## Our Pledge
 4 | 
 5 | In the interest of fostering an open and welcoming environment, we as
 6 | contributors and maintainers pledge to making participation in our project and
 7 | our community a harassment-free experience for everyone, regardless of age,
 8 | body size, disability, ethnicity, sex characteristics, gender identity and
 9 | expression, level of experience, education, socio-economic status, nationality,
10 | personal appearance, race, religion, or sexual identity and orientation.
11 | 
12 | ## Our Standards
13 | 
14 | Examples of behaviour that contributes to creating a positive environment
15 | include:
16 | 
17 | * Using welcoming and inclusive language
18 | * Being respectful of differing viewpoints and experiences
19 | * Gracefully accepting constructive criticism
20 | * Focusing on what is best for the community
21 | * Showing empathy towards other community members
22 | 
23 | Examples of unacceptable behaviour by participants include:
24 | 
25 | * The use of sexualized language or imagery and unwelcome sexual attention or
26 |   advances
27 | * Trolling, insulting/derogatory comments, and personal or political attacks
28 | * Public or private harassment
29 | * Publishing others' private information, such as a physical or electronic
30 |   address, without explicit permission
31 | * Other conduct which could reasonably be considered inappropriate in a
32 |   professional setting
33 | 
34 | ## Our Responsibilities
35 | 
36 | Project maintainers are responsible for clarifying the standards of acceptable
37 | behaviour and are expected to take appropriate and fair corrective action in
38 | response to any instances of unacceptable behaviour.
39 | 
40 | Project maintainers have the right and responsibility to remove, edit, or
41 | reject comments, commits, code, wiki edits, issues, and other contributions
42 | that are not aligned to this Code of Conduct, or to ban temporarily or
43 | permanently any contributor for other behaviour that they deem inappropriate,
44 | threatening, offensive, or harmful.
45 | 
46 | ## Scope
47 | 
48 | This Code of Conduct applies both within project spaces and in public spaces
49 | when an individual is representing the project or its community. Examples of
50 | representing a project or community include using an official project e-mail
51 | address, posting via an official social media account, or acting as an appointed
52 | representative at an online or offline event. Representation of a project may be
53 | further defined and clarified by project maintainers.
54 | 
55 | ## Enforcement
56 | 
57 | Instances of abusive, harassing, or otherwise unacceptable behaviour may be
58 | reported by contacting the project team at adrg@epistack.com. All complaints
59 | will be reviewed and investigated and will result in a response that is deemed
60 | necessary and appropriate to the circumstances. The project team is obligated to
61 | maintain confidentiality with regard to the reporter of an incident.
62 | Further details of specific enforcement policies may be posted separately.
63 | 
64 | Project maintainers who do not follow or enforce the Code of Conduct in good
65 | faith may face temporary or permanent repercussions as determined by other
66 | members of the project's leadership.
67 | 
68 | ## Attribution
69 | 
70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
71 | version 1.4, available at
72 | https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
73 | 
74 | [homepage]: https://www.contributor-covenant.org
75 | 
76 | For answers to common questions about this code of conduct, see
77 | https://www.contributor-covenant.org/faq
78 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | # Contributing to this project
  2 | 
  3 | Contributions in the form of pull requests, issues or just general feedback,
  4 | are always welcome. Please take a moment to review this document in order to
  5 | make the contribution process easy and effective for everyone involved.
  6 | 
  7 | Following these guidelines helps to communicate that you respect the time of
  8 | the developers managing and developing this open source project. In return,
  9 | they should reciprocate that respect in addressing your issue or assessing
 10 | patches and features.
 11 | 
 12 | ## Using the issue tracker
 13 | 
 14 | The issue tracker is the preferred channel for [bug reports](#bugs),
 15 | [feature requests](#features) and [submitting pull
 16 | requests](#pull-requests), but please respect the following restrictions:
 17 | 
 18 | * Please **do not** use the issue tracker for personal support requests (use
 19 |   [Stack Overflow](http://stackoverflow.com) or IRC).
 20 | * Please **do not** derail or troll issues. Keep the discussion on topic and
 21 |   respect the opinions of others.
 22 | 
 23 | <a name="bugs"></a>
 24 | ## Bug reports
 25 | 
 26 | A bug is a _demonstrable problem_ that is caused by the code in the repository.
 27 | Good bug reports are extremely helpful - thank you!
 28 | 
 29 | Guidelines for bug reports:
 30 | 
 31 | 1. **Use the GitHub issue search** &mdash; check if the issue has already been
 32 |    reported.
 33 | 2. **Check if the issue has been fixed** &mdash; try to reproduce it using the
 34 |    latest `master` or development branch in the repository.
 35 | 3. **Isolate the problem** &mdash; create a reduced test case.
 36 | 
 37 | A good bug report shouldn't leave others needing to chase you up for more
 38 | information. Please try to be as detailed as possible in your report. What is
 39 | your environment? What steps will reproduce the issue? What browser(s) and OS
 40 | experience the problem? What would you expect to be the outcome? All these
 41 | details will help people to fix any potential bugs.
 42 | 
 43 | Example:
 44 | 
 45 | > Short and descriptive example bug report title
 46 | >
 47 | > A summary of the issue and the browser/OS environment in which it occurs. If
 48 | > suitable, include the steps required to reproduce the bug.
 49 | >
 50 | > 1. This is the first step
 51 | > 2. This is the second step
 52 | > 3. Further steps, etc.
 53 | >
 54 | > `<url>` - a link to the reduced test case
 55 | >
 56 | > Any other information you want to share that is relevant to the issue being
 57 | > reported. This might include the lines of code that you have identified as
 58 | > causing the bug, and potential solutions (and your opinions on their
 59 | > merits).
 60 | 
 61 | 
 62 | <a name="features"></a>
 63 | ## Feature requests
 64 | 
 65 | Feature requests are welcome. But take a moment to find out whether your idea
 66 | fits with the scope and aims of the project. It's up to *you* to make a strong
 67 | case to convince the project's developers of the merits of this feature. Please
 68 | provide as much detail and context as possible.
 69 | 
 70 | 
 71 | <a name="pull-requests"></a>
 72 | ## Pull requests
 73 | 
 74 | Good pull requests - patches, improvements, new features - are a fantastic
 75 | help. They should remain focused in scope and avoid containing unrelated
 76 | commits.
 77 | 
 78 | **Please ask first** before embarking on any significant pull request (e.g.
 79 | implementing features, refactoring code, porting to a different language),
 80 | otherwise you risk spending a lot of time working on something that the
 81 | project's developers might not want to merge into the project.
 82 | 
 83 | Please adhere to the coding conventions used throughout a project (indentation,
 84 | accurate comments, etc.) and any other requirements (such as test coverage).
 85 | 
 86 | Follow this process if you'd like your work considered for inclusion in the
 87 | project:
 88 | 
 89 | 1. [Fork](http://help.github.com/fork-a-repo/) the project, clone your fork,
 90 |    and configure the remotes:
 91 | 
 92 |    ```bash
 93 |    # Clone your fork of the repo into the current directory
 94 |    git clone https://github.com/<your-username>/<repo-name>
 95 |    # Navigate to the newly cloned directory
 96 |    cd <repo-name>
 97 |    # Assign the original repo to a remote called "upstream"
 98 |    git remote add upstream https://github.com/<upstream-owner>/<repo-name>
 99 |    ```
100 | 
101 | 2. If you cloned a while ago, get the latest changes from upstream:
102 | 
103 |    ```bash
104 |    git checkout <dev-branch>
105 |    git pull upstream <dev-branch>
106 |    ```
107 | 
108 | 3. Create a new topic branch (off the main project development branch) to
109 |    contain your feature, change, or fix:
110 | 
111 |    ```bash
112 |    git checkout -b <topic-branch-name>
113 |    ```
114 | 
115 | 4. Commit your changes in logical chunks and use descriptive commit messages.
116 |    Use [interactive rebase](https://help.github.com/articles/interactive-rebase)
117 |    to tidy up your commits before making them public.
118 | 
119 | 5. Locally merge (or rebase) the upstream development branch into your topic branch:
120 | 
121 |    ```bash
122 |    git pull [--rebase] upstream <dev-branch>
123 |    ```
124 | 
125 | 6. Push your topic branch up to your fork:
126 | 
127 |    ```bash
128 |    git push origin <topic-branch-name>
129 |    ```
130 | 
131 | 7. [Open a Pull Request](https://help.github.com/articles/using-pull-requests/)
132 |     with a clear title and description.
133 | 
134 | **IMPORTANT**: By submitting a patch, you agree to allow the project owner to
135 | license your work under the same license as that used by the project.
136 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2019 Adrian-George Bostan <adrg@epistack.com>
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <h1 align="center">strutil</h1>
  2 | 
  3 | <p align="center">
  4 |     <a href="https://github.com/adrg/strutil/actions/workflows/tests.yml">
  5 |         <img alt="Tests status" src="https://github.com/adrg/strutil/actions/workflows/tests.yml/badge.svg">
  6 |     </a>
  7 |     <a href="https://codecov.io/gh/adrg/strutil">
  8 |         <img alt="Code coverage" src="https://codecov.io/gh/adrg/strutil/branch/master/graphs/badge.svg?branch=master" />
  9 |     </a>
 10 |     <a href="https://pkg.go.dev/github.com/adrg/strutil">
 11 |         <img alt="pkg.go.dev documentation" src="https://pkg.go.dev/badge/github.com/adrg/strutil" />
 12 |     </a>
 13 |     <a href="https://opensource.org/licenses/MIT" rel="nofollow">
 14 |         <img alt="MIT license" src="https://img.shields.io/github/license/adrg/strutil" />
 15 |     </a>
 16 |     <a href="https://goreportcard.com/report/github.com/adrg/strutil">
 17 |         <img alt="Go report card" src="https://goreportcard.com/badge/github.com/adrg/strutil" />
 18 |     </a>
 19 |     <a href="https://github.com/adrg/strutil/issues">
 20 |         <img alt="GitHub issues" src="https://img.shields.io/github/issues/adrg/strutil" />
 21 |     </a>
 22 |     <a href="https://ko-fi.com/T6T72WATK">
 23 |         <img alt="Buy me a coffee" src="https://img.shields.io/static/v1.svg?label=%20&message=Buy%20me%20a%20coffee&color=579fbf&logo=buy%20me%20a%20coffee&logoColor=white" />
 24 |     </a>
 25 | </p>
 26 | 
 27 | strutil provides a collection of string metrics for calculating string similarity as well as
 28 | other string utility functions.  
 29 | Full documentation can be found at https://pkg.go.dev/github.com/adrg/strutil.
 30 | 
 31 | ## Installation
 32 | 
 33 | ```
 34 | go get github.com/adrg/strutil
 35 | ```
 36 | 
 37 | ## String metrics
 38 | 
 39 | - [Hamming](#hamming)
 40 | - [Levenshtein](#levenshtein)
 41 | - [Jaro](#jaro)
 42 | - [Jaro-Winkler](#jaro-winkler)
 43 | - [Smith-Waterman-Gotoh](#smith-waterman-gotoh)
 44 | - [Sorensen-Dice](#sorensen-dice)
 45 | - [Jaccard](#jaccard)
 46 | - [Overlap Coefficient](#overlap-coefficient)
 47 | 
 48 | The package defines the `StringMetric` interface, which is implemented by all
 49 | the string metrics. The interface is used with the `Similarity` function, which
 50 | calculates the similarity between the specified strings, using the provided
 51 | string metric.
 52 | 
 53 | ```go
 54 | type StringMetric interface {
 55 |     Compare(a, b string) float64
 56 | }
 57 | 
 58 | func Similarity(a, b string, metric StringMetric) float64 {
 59 | }
 60 | ```
 61 | 
 62 | All defined string metrics can be found in the
 63 | [metrics](https://pkg.go.dev/github.com/adrg/strutil/metrics) package.
 64 | 
 65 | #### Hamming
 66 | 
 67 | Calculate similarity.
 68 | ```go
 69 | similarity := strutil.Similarity("text", "test", metrics.NewHamming())
 70 | fmt.Printf("%.2f\n", similarity) // Output: 0.75
 71 | ```
 72 | 
 73 | Calculate distance.
 74 | ```go
 75 | ham := metrics.NewHamming()
 76 | fmt.Printf("%d\n", ham.Distance("one", "once")) // Output: 2
 77 | ```
 78 | 
 79 | More information and additional examples can be found on
 80 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Hamming).
 81 | 
 82 | #### Levenshtein
 83 | 
 84 | Calculate similarity using default options.
 85 | ```go
 86 | similarity := strutil.Similarity("graph", "giraffe", metrics.NewLevenshtein())
 87 | fmt.Printf("%.2f\n", similarity) // Output: 0.43
 88 | ```
 89 | 
 90 | Configure edit operation costs.
 91 | ```go
 92 | lev := metrics.NewLevenshtein()
 93 | lev.CaseSensitive = false
 94 | lev.InsertCost = 1
 95 | lev.ReplaceCost = 2
 96 | lev.DeleteCost = 1
 97 | 
 98 | similarity := strutil.Similarity("make", "Cake", lev)
 99 | fmt.Printf("%.2f\n", similarity) // Output: 0.50
100 | ```
101 | 
102 | Calculate distance.
103 | ```go
104 | lev := metrics.NewLevenshtein()
105 | fmt.Printf("%d\n", lev.Distance("graph", "giraffe")) // Output: 4
106 | ```
107 | 
108 | More information and additional examples can be found on
109 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Levenshtein).
110 | 
111 | #### Jaro
112 | 
113 | ```go
114 | similarity := strutil.Similarity("think", "tank", metrics.NewJaro())
115 | fmt.Printf("%.2f\n", similarity) // Output: 0.78
116 | ```
117 | 
118 | More information and additional examples can be found on
119 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Jaro).
120 | 
121 | #### Jaro-Winkler
122 | 
123 | ```go
124 | similarity := strutil.Similarity("think", "tank", metrics.NewJaroWinkler())
125 | fmt.Printf("%.2f\n", similarity) // Output: 0.80
126 | ```
127 | 
128 | More information and additional examples can be found on
129 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#JaroWinkler).
130 | 
131 | #### Smith-Waterman-Gotoh
132 | 
133 | Calculate similarity using default options.
134 | ```go
135 | swg := metrics.NewSmithWatermanGotoh()
136 | similarity := strutil.Similarity("times roman", "times new roman", swg)
137 | fmt.Printf("%.2f\n", similarity) // Output: 0.82
138 | ```
139 | 
140 | Customize gap penalty and substitution function.
141 | ```go
142 | swg := metrics.NewSmithWatermanGotoh()
143 | swg.CaseSensitive = false
144 | swg.GapPenalty = -0.1
145 | swg.Substitution = metrics.MatchMismatch {
146 |     Match:    1,
147 |     Mismatch: -0.5,
148 | }
149 | 
150 | similarity := strutil.Similarity("Times Roman", "times new roman", swg)
151 | fmt.Printf("%.2f\n", similarity) // Output: 0.96
152 | ```
153 | 
154 | More information and additional examples can be found on
155 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#SmithWatermanGotoh).
156 | 
157 | #### Sorensen-Dice
158 | 
159 | Calculate similarity using default options.
160 | ```go
161 | sd := metrics.NewSorensenDice()
162 | similarity := strutil.Similarity("time to make haste", "no time to waste", sd)
163 | fmt.Printf("%.2f\n", similarity) // Output: 0.62
164 | ```
165 | 
166 | Customize n-gram size.
167 | ```go
168 | sd := metrics.NewSorensenDice()
169 | sd.CaseSensitive = false
170 | sd.NgramSize = 3
171 | 
172 | similarity := strutil.Similarity("Time to make haste", "no time to waste", sd)
173 | fmt.Printf("%.2f\n", similarity) // Output: 0.53
174 | ```
175 | 
176 | More information and additional examples can be found on
177 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#SorensenDice).
178 | 
179 | #### Jaccard
180 | 
181 | Calculate similarity using default options.
182 | ```go
183 | j := metrics.NewJaccard()
184 | similarity := strutil.Similarity("time to make haste", "no time to waste", j)
185 | fmt.Printf("%.2f\n", similarity) // Output: 0.45
186 | ```
187 | 
188 | Customize n-gram size.
189 | ```go
190 | j := metrics.NewJaccard()
191 | j.CaseSensitive = false
192 | j.NgramSize = 3
193 | 
194 | similarity := strutil.Similarity("Time to make haste", "no time to waste", j)
195 | fmt.Printf("%.2f\n", similarity) // Output: 0.36
196 | ```
197 | 
198 | The input of the Sorensen-Dice example is the same as the one of Jaccard
199 | because the metrics bear a resemblance to each other. In fact, each of the
200 | coefficients can be used to calculate the other one.
201 | 
202 | Sorensen-Dice to Jaccard.
203 | ```
204 | J = SD/(2-SD)
205 | 
206 | where SD is the Sorensen-Dice coefficient and J is the Jaccard index.
207 | ```
208 | 
209 | Jaccard to Sorensen-Dice.
210 | ```
211 | SD = 2*J/(1+J)
212 | 
213 | where SD is the Sorensen-Dice coefficient and J is the Jaccard index.
214 | ```
215 | 
216 | More information and additional examples can be found on
217 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Jaccard).
218 | 
219 | #### Overlap Coefficient
220 | 
221 | Calculate similarity using default options.
222 | ```go
223 | oc := metrics.NewOverlapCoefficient()
224 | similarity := strutil.Similarity("time to make haste", "no time to waste", oc)
225 | fmt.Printf("%.2f\n", similarity) // Output: 0.67
226 | ```
227 | 
228 | Customize n-gram size.
229 | ```go
230 | oc := metrics.NewOverlapCoefficient()
231 | oc.CaseSensitive = false
232 | oc.NgramSize = 3
233 | 
234 | similarity := strutil.Similarity("Time to make haste", "no time to waste", oc)
235 | fmt.Printf("%.2f\n", similarity) // Output: 0.57
236 | ```
237 | 
238 | More information and additional examples can be found on
239 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#OverlapCoefficient).
240 | 
241 | ## References
242 | 
243 | For more information see:
244 | - [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance)
245 | - [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
246 | - [Jaro-Winkler distance](https://en.wikipedia.org/wiki/Jaro-Winkler_distance)
247 | - [Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith-Waterman_algorithm)
248 | - [Sorensen-Dice coefficient](https://en.wikipedia.org/wiki/Sorensen–Dice_coefficient)
249 | - [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index)
250 | - [Overlap coefficient](https://en.wikipedia.org/wiki/Overlap_coefficient)
251 | 
252 | ## Stargazers over time
253 | 
254 | [![Stargazers over time](https://starchart.cc/adrg/strutil.svg)](https://starchart.cc/adrg/strutil)
255 | 
256 | ## Contributing
257 | 
258 | Contributions in the form of pull requests, issues or just general feedback,
259 | are always welcome.  
260 | See [CONTRIBUTING.MD](CONTRIBUTING.md).
261 | 
262 | ## License
263 | 
264 | Copyright (c) 2019 Adrian-George Bostan.
265 | 
266 | This project is licensed under the [MIT license](https://opensource.org/licenses/MIT).
267 | See [LICENSE](LICENSE) for more details.
268 | 


--------------------------------------------------------------------------------
/example_test.go:
--------------------------------------------------------------------------------
 1 | package strutil_test
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 
 6 | 	"github.com/adrg/strutil"
 7 | 	"github.com/adrg/strutil/metrics"
 8 | )
 9 | 
10 | func ExampleSimilarity() {
11 | 	sim := strutil.Similarity("riddle", "needle", metrics.NewJaroWinkler())
12 | 	fmt.Printf("(riddle, needle) similarity: %.2f\n", sim)
13 | 
14 | 	// Output:
15 | 	// (riddle, needle) similarity: 0.56
16 | }
17 | 
18 | func ExampleCommonPrefix() {
19 | 	fmt.Println("(answer, anvil):", strutil.CommonPrefix("answer", "anvil"))
20 | 
21 | 	// Output:
22 | 	// (answer, anvil): an
23 | }
24 | 
25 | func ExampleUniqueSlice() {
26 | 	sample := []string{"a", "b", "a", "b", "b", "c"}
27 | 	fmt.Println("[a b a b b c]:", strutil.UniqueSlice(sample))
28 | 
29 | 	// Output:
30 | 	// [a b a b b c]: [a b c]
31 | }
32 | 
33 | func ExampleSliceContains() {
34 | 	terms := []string{"a", "b", "c"}
35 | 	fmt.Println("([a b c], b):", strutil.SliceContains(terms, "b"))
36 | 	fmt.Println("([a b c], d):", strutil.SliceContains(terms, "d"))
37 | 
38 | 	// Output:
39 | 	// ([a b c], b): true
40 | 	// ([a b c], d): false
41 | }
42 | 
43 | func ExampleNgramCount() {
44 | 	fmt.Println("abbcd n-gram count (size 2):", strutil.NgramCount("abbcd", 2))
45 | 	fmt.Println("abbcd n-gram count (size 3):", strutil.NgramCount("abbcd", 3))
46 | 
47 | 	// Output:
48 | 	// abbcd n-gram count (size 2): 4
49 | 	// abbcd n-gram count (size 3): 3
50 | }
51 | 
52 | func ExampleNgrams() {
53 | 	fmt.Println("abbcd n-grams (size 2):", strutil.Ngrams("abbcd", 2))
54 | 	fmt.Println("abbcd n-grams (size 3):", strutil.Ngrams("abbcd", 3))
55 | 
56 | 	// Output:
57 | 	// abbcd n-grams (size 2): [ab bb bc cd]
58 | 	// abbcd n-grams (size 3): [abb bbc bcd]
59 | }
60 | 
61 | func ExampleNgramMap() {
62 | 	// 2 character n-gram map.
63 | 	ngrams, total := strutil.NgramMap("abbcabb", 2)
64 | 	fmt.Printf("abbcabb n-gram map (size 2): %v (%d ngrams)\n", ngrams, total)
65 | 
66 | 	// 3 character n-gram map.
67 | 	ngrams, total = strutil.NgramMap("abbcabb", 3)
68 | 	fmt.Printf("abbcabb n-gram map (size 3): %v (%d ngrams)\n", ngrams, total)
69 | 
70 | 	// Output:
71 | 	// abbcabb n-gram map (size 2): map[ab:2 bb:2 bc:1 ca:1] (6 ngrams)
72 | 	// abbcabb n-gram map (size 3): map[abb:2 bbc:1 bca:1 cab:1] (5 ngrams)
73 | }
74 | 
75 | func ExampleNgramIntersection() {
76 | 	ngrams, common, totalA, totalB := strutil.NgramIntersection("ababc", "ababd", 2)
77 | 	fmt.Printf("(ababc, ababd) n-gram intersection: %v (%d/%d n-grams)\n",
78 | 		ngrams, common, totalA+totalB)
79 | 
80 | 	// Output:
81 | 	// (ababc, ababd) n-gram intersection: map[ab:2 ba:1] (3/8 n-grams)
82 | }
83 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/adrg/strutil
 2 | 
 3 | go 1.19
 4 | 
 5 | require github.com/stretchr/testify v1.10.0
 6 | 
 7 | require (
 8 | 	github.com/davecgh/go-spew v1.1.1 // indirect
 9 | 	github.com/pmezard/go-difflib v1.0.0 // indirect
10 | 	gopkg.in/yaml.v3 v3.0.1 // indirect
11 | )
12 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 2 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 3 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 4 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 5 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
 6 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
 7 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
 8 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 9 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
10 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
11 | 


--------------------------------------------------------------------------------
/internal/mathutil/math.go:
--------------------------------------------------------------------------------
 1 | package mathutil
 2 | 
 3 | // Min returns the value of the smallest argument,
 4 | // or 0 if no arguments are provided.
 5 | func Min(args ...int) int {
 6 | 	if len(args) == 0 {
 7 | 		return 0
 8 | 	}
 9 | 	if len(args) == 1 {
10 | 		return args[0]
11 | 	}
12 | 
13 | 	min := args[0]
14 | 	for _, arg := range args[1:] {
15 | 		if min > arg {
16 | 			min = arg
17 | 		}
18 | 	}
19 | 
20 | 	return min
21 | }
22 | 
// Max reports the largest of the provided integers.
// It returns 0 when called without arguments.
func Max(args ...int) int {
	if len(args) == 0 {
		return 0
	}

	// Seed with the first argument and scan the rest. With a single
	// argument the loop body never runs and that argument is returned.
	largest := args[0]
	for i := 1; i < len(args); i++ {
		if args[i] > largest {
			largest = args[i]
		}
	}

	return largest
}
42 | 
// Minf reports the smallest of the provided floating point values.
// It returns 0 when called without arguments.
func Minf(args ...float64) float64 {
	if len(args) == 0 {
		return 0
	}

	// Seed with the first argument and scan the rest. With a single
	// argument the loop body never runs and that argument is returned.
	smallest := args[0]
	for i := 1; i < len(args); i++ {
		if args[i] < smallest {
			smallest = args[i]
		}
	}

	return smallest
}
62 | 
// Maxf reports the largest of the provided floating point values.
// It returns 0 when called without arguments.
func Maxf(args ...float64) float64 {
	if len(args) == 0 {
		return 0
	}

	// Seed with the first argument and scan the rest. With a single
	// argument the loop body never runs and that argument is returned.
	largest := args[0]
	for i := 1; i < len(args); i++ {
		if args[i] > largest {
			largest = args[i]
		}
	}

	return largest
}
82 | 


--------------------------------------------------------------------------------
/internal/mathutil/mathutil_test.go:
--------------------------------------------------------------------------------
 1 | package mathutil_test
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/adrg/strutil/internal/mathutil"
 7 | 	"github.com/stretchr/testify/require"
 8 | )
 9 | 
10 | func TestMin(t *testing.T) {
11 | 	requireEqual(t, [][2]interface{}{
12 | 		{0, mathutil.Min()},
13 | 		{1, mathutil.Min(1)},
14 | 		{0, mathutil.Min(0, 1)},
15 | 		{1, mathutil.Min(1, 1)},
16 | 		{1, mathutil.Min(2, 1)},
17 | 		{1, mathutil.Min(1, 2)},
18 | 		{0, mathutil.Min(2, 1, 0)},
19 | 		{0, mathutil.Min(0, 1, 2)},
20 | 	})
21 | }
22 | 
23 | func TestMax(t *testing.T) {
24 | 	requireEqual(t, [][2]interface{}{
25 | 		{0, mathutil.Max()},
26 | 		{1, mathutil.Max(1)},
27 | 		{1, mathutil.Max(0, 1)},
28 | 		{1, mathutil.Max(1, 1)},
29 | 		{2, mathutil.Max(2, 1)},
30 | 		{2, mathutil.Max(1, 2)},
31 | 		{3, mathutil.Max(2, 1, 3)},
32 | 		{3, mathutil.Max(3, 1, 2)},
33 | 	})
34 | }
35 | 
36 | func TestMinf(t *testing.T) {
37 | 	requireEqual(t, [][2]interface{}{
38 | 		{0.0, mathutil.Minf()},
39 | 		{1.0, mathutil.Minf(1.0)},
40 | 		{0.0, mathutil.Minf(0.0, 1.0)},
41 | 		{1.0, mathutil.Minf(1.0, 1.0)},
42 | 		{1.0, mathutil.Minf(2.0, 1.0)},
43 | 		{1.0, mathutil.Minf(1.0, 2.0)},
44 | 		{0.0, mathutil.Minf(2.0, 1.0, 0.0)},
45 | 		{0.0, mathutil.Minf(0.0, 1.0, 2.0)},
46 | 	})
47 | }
48 | 
49 | func TestMaxf(t *testing.T) {
50 | 	requireEqual(t, [][2]interface{}{
51 | 		{0.0, mathutil.Maxf()},
52 | 		{1.0, mathutil.Maxf(1.0)},
53 | 		{1.0, mathutil.Maxf(0.0, 1.0)},
54 | 		{1.0, mathutil.Maxf(1.0, 1.0)},
55 | 		{2.0, mathutil.Maxf(2.0, 1.1, 1.0)},
56 | 		{2.0, mathutil.Maxf(1.1, 1.0, 2.0)},
57 | 		{3.0, mathutil.Maxf(2.0, 1.0, 3.0)},
58 | 		{3.0, mathutil.Maxf(3.0, 1.0, 2.0)},
59 | 	})
60 | }
61 | 
62 | func requireEqual(t *testing.T, inputs [][2]interface{}) {
63 | 	t.Helper()
64 | 
65 | 	for _, input := range inputs {
66 | 		require.Equal(t, input[0], input[1])
67 | 	}
68 | }
69 | 


--------------------------------------------------------------------------------
/internal/ngram/ngram.go:
--------------------------------------------------------------------------------
  1 | package ngram
  2 | 
  3 | import "github.com/adrg/strutil/internal/mathutil"
  4 | 
  5 | // Count returns the n-gram count of the specified size for the
  6 | // provided term. An n-gram size of 1 is used if the provided size is
  7 | // less than or equal to 0.
  8 | func Count(runes []rune, size int) int {
  9 | 	return mathutil.Max(len(runes)-(mathutil.Max(size, 1)-1), 0)
 10 | }
 11 | 
 12 | // Slice returns all the n-grams of the specified size for the provided term.
 13 | // The n-grams in the output slice are in the order in which they occur in the
 14 | // input term. An n-gram size of 1 is used if the provided size is less than
 15 | // or equal to 0.
 16 | func Slice(runes []rune, size int) []string {
 17 | 	// Use an n-gram size of 1 if the provided size is invalid.
 18 | 	size = mathutil.Max(size, 1)
 19 | 
 20 | 	// Check if term length is too small.
 21 | 	lenRunes := len(runes)
 22 | 	if lenRunes == 0 || lenRunes < size {
 23 | 		return nil
 24 | 	}
 25 | 
 26 | 	// Generate n-gram slice.
 27 | 	limit := lenRunes - (size - 1)
 28 | 	ngrams := make([]string, limit)
 29 | 
 30 | 	for i, j := 0, 0; i < limit; i++ {
 31 | 		ngrams[j] = string(runes[i : i+size])
 32 | 		j++
 33 | 	}
 34 | 
 35 | 	return ngrams
 36 | }
 37 | 
 38 | // Map returns a map of all n-grams of the specified size for the provided
 39 | // term, along with their frequency. The function also returns the total
 40 | // number of n-grams, which is the sum of all the values in the output map.
 41 | // An n-gram size of 1 is used if the provided size is less than or equal to 0.
 42 | func Map(runes []rune, size int) (map[string]int, int) {
 43 | 	// Use an n-gram size of 1 if the provided size is invalid.
 44 | 	size = mathutil.Max(size, 1)
 45 | 
 46 | 	// Check if term length is too small.
 47 | 	lenRunes := len(runes)
 48 | 	if lenRunes == 0 || lenRunes < size {
 49 | 		return map[string]int{}, 0
 50 | 	}
 51 | 
 52 | 	// Generate n-gram map.
 53 | 	limit := lenRunes - (size - 1)
 54 | 	ngrams := make(map[string]int, limit)
 55 | 
 56 | 	var ngramCount int
 57 | 	for i := 0; i < limit; i++ {
 58 | 		ngram := string(runes[i : i+size])
 59 | 		count := ngrams[ngram]
 60 | 		ngrams[ngram] = count + 1
 61 | 		ngramCount++
 62 | 	}
 63 | 
 64 | 	return ngrams, ngramCount
 65 | }
 66 | 
 67 | // Intersection returns a map of the n-grams of the specified size found
 68 | // in both terms, along with their frequency. The function also returns the
 69 | // number of common n-grams (the sum of all the values in the output map),
 70 | // the total number of n-grams in the first term and the total number of
 71 | // n-grams in the second term. An n-gram size of 1 is used if the provided
 72 | // size is less than or equal to 0.
 73 | func Intersection(a, b []rune, size int) (map[string]int, int, int, int) {
 74 | 	// Use an n-gram size of 1 if the provided size is invalid.
 75 | 	size = mathutil.Max(size, 1)
 76 | 
 77 | 	// Compute the n-grams of the first term.
 78 | 	ngramsA, totalA := Map(a, size)
 79 | 
 80 | 	// Calculate n-gram intersection with the second term.
 81 | 	limit := len(b) - (size - 1)
 82 | 	commonNgrams := make(map[string]int, mathutil.Max(limit, 0))
 83 | 
 84 | 	var totalB, intersection int
 85 | 	for i := 0; i < limit; i++ {
 86 | 		ngram := string(b[i : i+size])
 87 | 		totalB++
 88 | 
 89 | 		if count, ok := ngramsA[ngram]; ok && count > 0 {
 90 | 			// Decrease frequency of n-gram found in the first term each time
 91 | 			// a successful match is found.
 92 | 			intersection++
 93 | 			ngramsA[ngram] = count - 1
 94 | 
 95 | 			// Update common n-grams map with the matched n-gram and its
 96 | 			// frequency.
 97 | 			count = commonNgrams[ngram]
 98 | 			commonNgrams[ngram] = count + 1
 99 | 		}
100 | 	}
101 | 
102 | 	return commonNgrams, intersection, totalA, totalB
103 | }
104 | 


--------------------------------------------------------------------------------
/internal/ngram/ngram_test.go:
--------------------------------------------------------------------------------
  1 | package ngram_test
  2 | 
  3 | import (
  4 | 	"testing"
  5 | 
  6 | 	"github.com/adrg/strutil/internal/ngram"
  7 | 	"github.com/stretchr/testify/require"
  8 | )
  9 | 
 10 | func TestNgramCount(t *testing.T) {
 11 | 	requireEqual(t, [][2]interface{}{
 12 | 		{0, ngram.Count(nil, -1)},
 13 | 		{0, ngram.Count(nil, 0)},
 14 | 		{0, ngram.Count(nil, 1)},
 15 | 		{0, ngram.Count([]rune{}, -1)},
 16 | 		{0, ngram.Count([]rune{}, 0)},
 17 | 		{0, ngram.Count([]rune{}, 1)},
 18 | 		{6, ngram.Count([]rune("abbabb"), -1)},
 19 | 		{6, ngram.Count([]rune("abbabb"), 0)},
 20 | 		{6, ngram.Count([]rune("abbabb"), 1)},
 21 | 		{5, ngram.Count([]rune("abbabb"), 2)},
 22 | 		{4, ngram.Count([]rune("abbabb"), 3)},
 23 | 		{3, ngram.Count([]rune("abbabb"), 4)},
 24 | 		{2, ngram.Count([]rune("abbabb"), 5)},
 25 | 		{1, ngram.Count([]rune("abbabb"), 6)},
 26 | 		{0, ngram.Count([]rune("abbabb"), 7)},
 27 | 		{0, ngram.Count([]rune("abbabb"), 8)},
 28 | 	})
 29 | }
 30 | 
 31 | func TestNgrams(t *testing.T) {
 32 | 	requireEqual(t, [][2]interface{}{
 33 | 		{0, len(ngram.Slice(nil, -1))},
 34 | 		{0, len(ngram.Slice(nil, 0))},
 35 | 		{0, len(ngram.Slice(nil, 1))},
 36 | 		{0, len(ngram.Slice([]rune{}, -1))},
 37 | 		{0, len(ngram.Slice([]rune{}, 0))},
 38 | 		{0, len(ngram.Slice([]rune{}, 1))},
 39 | 		{
 40 | 			[]string{"a", "b", "c", "d", "e", "f"},
 41 | 			ngram.Slice([]rune("abcdef"), -1),
 42 | 		},
 43 | 		{
 44 | 			[]string{"a", "b", "c", "d", "e", "f"},
 45 | 			ngram.Slice([]rune("abcdef"), 0),
 46 | 		},
 47 | 		{
 48 | 			[]string{"a", "b", "c", "d", "e", "f"},
 49 | 			ngram.Slice([]rune("abcdef"), 1),
 50 | 		},
 51 | 		{
 52 | 			[]string{"ab", "bc", "cd", "de", "ef"},
 53 | 			ngram.Slice([]rune("abcdef"), 2),
 54 | 		},
 55 | 		{
 56 | 			[]string{"abc", "bcd", "cde", "def"},
 57 | 			ngram.Slice([]rune("abcdef"), 3),
 58 | 		},
 59 | 		{
 60 | 			[]string{"abcd", "bcde", "cdef"},
 61 | 			ngram.Slice([]rune("abcdef"), 4),
 62 | 		},
 63 | 		{
 64 | 			[]string{"abcde", "bcdef"},
 65 | 			ngram.Slice([]rune("abcdef"), 5),
 66 | 		},
 67 | 		{
 68 | 			[]string{"abcdef"},
 69 | 			ngram.Slice([]rune("abcdef"), 6),
 70 | 		},
 71 | 		{
 72 | 			0,
 73 | 			len(ngram.Slice([]rune("abcdef"), 7)),
 74 | 		},
 75 | 		{
 76 | 			0,
 77 | 			len(ngram.Slice([]rune("abcdef"), 8)),
 78 | 		},
 79 | 	})
 80 | }
 81 | 
 82 | func TestNgramMap(t *testing.T) {
 83 | 	inputs := []*struct {
 84 | 		term     []rune
 85 | 		size     int
 86 | 		expMap   map[string]int
 87 | 		expTotal int
 88 | 	}{
 89 | 		{
 90 | 			term:   nil,
 91 | 			size:   -1,
 92 | 			expMap: map[string]int{},
 93 | 		},
 94 | 		{
 95 | 			term:   nil,
 96 | 			expMap: map[string]int{},
 97 | 		},
 98 | 		{
 99 | 			term:   nil,
100 | 			size:   1,
101 | 			expMap: map[string]int{},
102 | 		},
103 | 		{
104 | 			term:   []rune{},
105 | 			size:   -1,
106 | 			expMap: map[string]int{},
107 | 		},
108 | 		{
109 | 			term:   []rune{},
110 | 			expMap: map[string]int{},
111 | 		},
112 | 		{
113 | 			term:   []rune{},
114 | 			size:   1,
115 | 			expMap: map[string]int{},
116 | 		},
117 | 		{
118 | 			term:     []rune("abbabb"),
119 | 			size:     -1,
120 | 			expMap:   map[string]int{"a": 2, "b": 4},
121 | 			expTotal: 6,
122 | 		},
123 | 		{
124 | 			term:     []rune("abbabb"),
125 | 			expMap:   map[string]int{"a": 2, "b": 4},
126 | 			expTotal: 6,
127 | 		},
128 | 		{
129 | 			term:     []rune("abbabb"),
130 | 			size:     1,
131 | 			expMap:   map[string]int{"a": 2, "b": 4},
132 | 			expTotal: 6,
133 | 		},
134 | 		{
135 | 			term:     []rune("abbabb"),
136 | 			size:     2,
137 | 			expMap:   map[string]int{"ab": 2, "bb": 2, "ba": 1},
138 | 			expTotal: 5,
139 | 		},
140 | 		{
141 | 			term:     []rune("abbabb"),
142 | 			size:     3,
143 | 			expMap:   map[string]int{"abb": 2, "bba": 1, "bab": 1},
144 | 			expTotal: 4,
145 | 		},
146 | 		{
147 | 			term:     []rune("abbabb"),
148 | 			size:     4,
149 | 			expMap:   map[string]int{"abba": 1, "bbab": 1, "babb": 1},
150 | 			expTotal: 3,
151 | 		},
152 | 		{
153 | 			term:     []rune("abbabb"),
154 | 			size:     5,
155 | 			expMap:   map[string]int{"abbab": 1, "bbabb": 1},
156 | 			expTotal: 2,
157 | 		},
158 | 		{
159 | 			term:     []rune("abbabb"),
160 | 			size:     6,
161 | 			expMap:   map[string]int{"abbabb": 1},
162 | 			expTotal: 1,
163 | 		},
164 | 		{
165 | 			term:     []rune("abbabb"),
166 | 			size:     7,
167 | 			expMap:   map[string]int{},
168 | 			expTotal: 0,
169 | 		},
170 | 		{
171 | 			term:     []rune("abbabb"),
172 | 			size:     8,
173 | 			expMap:   map[string]int{},
174 | 			expTotal: 0,
175 | 		},
176 | 	}
177 | 
178 | 	for _, input := range inputs {
179 | 		actMap, actTotal := ngram.Map(input.term, input.size)
180 | 		require.Equal(t, input.expMap, actMap)
181 | 		require.Equal(t, input.expTotal, actTotal)
182 | 	}
183 | }
184 | 
185 | func TestNgramIntersection(t *testing.T) {
186 | 	inputs := []*struct {
187 | 		a    []rune
188 | 		b    []rune
189 | 		size int
190 | 
191 | 		expMap    map[string]int
192 | 		expTotal  int
193 | 		expTotalA int
194 | 		expTotalB int
195 | 	}{
196 | 		{
197 | 			size:   1,
198 | 			expMap: map[string]int{},
199 | 		},
200 | 		{
201 | 			a:      []rune{},
202 | 			size:   1,
203 | 			expMap: map[string]int{},
204 | 		},
205 | 		{
206 | 			b:      []rune{},
207 | 			size:   1,
208 | 			expMap: map[string]int{},
209 | 		},
210 | 		{
211 | 			a:      []rune{},
212 | 			b:      []rune{},
213 | 			size:   1,
214 | 			expMap: map[string]int{},
215 | 		},
216 | 		{
217 | 			a:         []rune("ababbaa"),
218 | 			b:         []rune("aabbaa"),
219 | 			size:      -1,
220 | 			expMap:    map[string]int{"a": 4, "b": 2},
221 | 			expTotal:  6,
222 | 			expTotalA: 7,
223 | 			expTotalB: 6,
224 | 		},
225 | 		{
226 | 			a:         []rune("aabbaa"),
227 | 			b:         []rune("ababbaa"),
228 | 			expMap:    map[string]int{"a": 4, "b": 2},
229 | 			expTotal:  6,
230 | 			expTotalA: 6,
231 | 			expTotalB: 7,
232 | 		},
233 | 		{
234 | 			a:         []rune("ababbaa"),
235 | 			b:         []rune("aabbaa"),
236 | 			size:      1,
237 | 			expMap:    map[string]int{"a": 4, "b": 2},
238 | 			expTotal:  6,
239 | 			expTotalA: 7,
240 | 			expTotalB: 6,
241 | 		},
242 | 		{
243 | 			a:         []rune("aabbaa"),
244 | 			b:         []rune("ababbaa"),
245 | 			size:      2,
246 | 			expMap:    map[string]int{"aa": 1, "ab": 1, "ba": 1, "bb": 1},
247 | 			expTotal:  4,
248 | 			expTotalA: 5,
249 | 			expTotalB: 6,
250 | 		},
251 | 		{
252 | 			a:         []rune("ababbaa"),
253 | 			b:         []rune("aabbaa"),
254 | 			size:      3,
255 | 			expMap:    map[string]int{"abb": 1, "bba": 1, "baa": 1},
256 | 			expTotal:  3,
257 | 			expTotalA: 5,
258 | 			expTotalB: 4,
259 | 		},
260 | 		{
261 | 			a:         []rune("aabbaa"),
262 | 			b:         []rune("ababbaa"),
263 | 			size:      4,
264 | 			expMap:    map[string]int{"abba": 1, "bbaa": 1},
265 | 			expTotal:  2,
266 | 			expTotalA: 3,
267 | 			expTotalB: 4,
268 | 		},
269 | 		{
270 | 			a:         []rune("ababbaa"),
271 | 			b:         []rune("aabbaa"),
272 | 			size:      5,
273 | 			expMap:    map[string]int{"abbaa": 1},
274 | 			expTotal:  1,
275 | 			expTotalA: 3,
276 | 			expTotalB: 2,
277 | 		},
278 | 		{
279 | 			a:         []rune("aabbaa"),
280 | 			b:         []rune("ababbaa"),
281 | 			size:      6,
282 | 			expMap:    map[string]int{},
283 | 			expTotalA: 1,
284 | 			expTotalB: 2,
285 | 		},
286 | 		{
287 | 			a:         []rune("ababbaa"),
288 | 			b:         []rune("aabbaa"),
289 | 			size:      7,
290 | 			expMap:    map[string]int{},
291 | 			expTotalA: 1,
292 | 		},
293 | 		{
294 | 			a:         []rune("aabbaa"),
295 | 			b:         []rune("ababbaa"),
296 | 			size:      7,
297 | 			expMap:    map[string]int{},
298 | 			expTotalB: 1,
299 | 		},
300 | 		{
301 | 			a:      []rune("ababbaa"),
302 | 			b:      []rune("aabbaa"),
303 | 			size:   8,
304 | 			expMap: map[string]int{},
305 | 		},
306 | 		{
307 | 			a:      []rune("aabbaa"),
308 | 			b:      []rune("ababbaa"),
309 | 			size:   8,
310 | 			expMap: map[string]int{},
311 | 		},
312 | 		{
313 | 			a:      []rune("ababbaa"),
314 | 			b:      []rune("aabbaa"),
315 | 			size:   9,
316 | 			expMap: map[string]int{},
317 | 		},
318 | 		{
319 | 			a:      []rune("aabbaa"),
320 | 			b:      []rune("ababbaa"),
321 | 			size:   9,
322 | 			expMap: map[string]int{},
323 | 		},
324 | 	}
325 | 
326 | 	for _, input := range inputs {
327 | 		actMap, actTotal, actTotalA, actTotalB := ngram.Intersection(input.a, input.b, input.size)
328 | 		require.Equal(t, input.expMap, actMap)
329 | 		require.Equal(t, input.expTotal, actTotal)
330 | 		require.Equal(t, input.expTotalA, actTotalA)
331 | 		require.Equal(t, input.expTotalB, actTotalB)
332 | 	}
333 | }
334 | 
335 | func requireEqual(t *testing.T, inputs [][2]interface{}) {
336 | 	t.Helper()
337 | 
338 | 	for _, input := range inputs {
339 | 		require.Equal(t, input[0], input[1])
340 | 	}
341 | }
342 | 


--------------------------------------------------------------------------------
/internal/stringutil/stringutil.go:
--------------------------------------------------------------------------------
 1 | package stringutil
 2 | 
 3 | // CommonPrefix returns the common prefix of the specified strings. An empty
 4 | // string is returned if the parameters have no prefix in common.
 5 | func CommonPrefix(first, second string) string {
 6 | 	fRunes, sRunes := []rune(first), []rune(second)
 7 | 	if len(fRunes) > len(sRunes) {
 8 | 		fRunes, sRunes = sRunes, fRunes
 9 | 	}
10 | 
11 | 	var commonLen int
12 | 	for i, r := range fRunes {
13 | 		if r != sRunes[i] {
14 | 			break
15 | 		}
16 | 
17 | 		commonLen++
18 | 	}
19 | 
20 | 	return string(sRunes[0:commonLen])
21 | }
22 | 
// UniqueSlice removes duplicates from the provided string slice. Each item
// is kept at the position of its first occurrence, so the output preserves
// the input order. A nil slice is returned if the input has no items.
func UniqueSlice(items []string) []string {
	var (
		seen   = make(map[string]struct{})
		result []string
	)

	for i := range items {
		item := items[i]
		if _, dup := seen[item]; !dup {
			seen[item] = struct{}{}
			result = append(result, item)
		}
	}

	return result
}
41 | 
// SliceContains reports whether q is one of the items in terms.
func SliceContains(terms []string, q string) bool {
	for i := 0; i < len(terms); i++ {
		if terms[i] == q {
			return true
		}
	}

	return false
}
52 | 


--------------------------------------------------------------------------------
/internal/stringutil/stringutil_test.go:
--------------------------------------------------------------------------------
 1 | package stringutil_test
 2 | 
 3 | import (
 4 | 	"testing"
 5 | 
 6 | 	"github.com/adrg/strutil/internal/stringutil"
 7 | 	"github.com/stretchr/testify/require"
 8 | )
 9 | 
10 | func TestCommonPrefix(t *testing.T) {
11 | 	requireEqual(t, [][2]interface{}{
12 | 		{"", stringutil.CommonPrefix("", "")},
13 | 		{"", stringutil.CommonPrefix("a", "")},
14 | 		{"", stringutil.CommonPrefix("", "b")},
15 | 		{"", stringutil.CommonPrefix("a", "b")},
16 | 		{"a", stringutil.CommonPrefix("ab", "aab")},
17 | 		{"a", stringutil.CommonPrefix("aab", "ab")},
18 | 		{"aa", stringutil.CommonPrefix("aab", "aaab")},
19 | 		{"aa", stringutil.CommonPrefix("aaab", "aab")},
20 | 		{"忧郁的乌龟", stringutil.CommonPrefix("忧郁的乌龟", "忧郁的乌龟")},
21 | 		{"忧郁的", stringutil.CommonPrefix("忧郁的", "忧郁的乌龟")},
22 | 		{"忧郁的", stringutil.CommonPrefix("忧郁的乌龟", "忧郁的")},
23 | 		{"", stringutil.CommonPrefix("忧郁的乌龟", "郁的乌龟")},
24 | 		{"", stringutil.CommonPrefix("郁的乌龟", "忧郁的乌龟")},
25 | 		{"\u2019", stringutil.CommonPrefix("\u2019a", "\u2019b")},
26 | 		{"a\u2019bc", stringutil.CommonPrefix("a\u2019bcd", "a\u2019bce")},
27 | 		{"abc", stringutil.CommonPrefix("abc\u2019d", "abc\u2020d")},
28 | 	})
29 | }
30 | 
31 | func TestUniqueSlice(t *testing.T) {
32 | 	requireEqual(t, [][2]interface{}{
33 | 		{0, len(stringutil.UniqueSlice(nil))},
34 | 		{0, len(stringutil.UniqueSlice([]string{}))},
35 | 		{[]string{"a"}, stringutil.UniqueSlice([]string{"a"})},
36 | 		{[]string{"a", "b"}, stringutil.UniqueSlice([]string{"a", "b"})},
37 | 		{[]string{"b", "a"}, stringutil.UniqueSlice([]string{"b", "a"})},
38 | 		{[]string{"a"}, stringutil.UniqueSlice([]string{"a", "a"})},
39 | 		{[]string{"b", "a"}, stringutil.UniqueSlice([]string{"b", "a", "a"})},
40 | 		{[]string{"a", "b"}, stringutil.UniqueSlice([]string{"a", "a", "b"})},
41 | 		{[]string{"a", "b"}, stringutil.UniqueSlice([]string{"a", "a", "a", "b"})},
42 | 		{[]string{"b", "a"}, stringutil.UniqueSlice([]string{"b", "a", "a", "a"})},
43 | 		{[]string{"a", "b"}, stringutil.UniqueSlice([]string{"a", "b", "b", "a"})},
44 | 		{[]string{"a", "b"}, stringutil.UniqueSlice([]string{"a", "b", "a", "b"})},
45 | 	})
46 | }
47 | 
48 | func TestSliceContains(t *testing.T) {
49 | 	requireEqual(t, [][2]interface{}{
50 | 		{false, stringutil.SliceContains(nil, "")},
51 | 		{false, stringutil.SliceContains(nil, "a")},
52 | 		{false, stringutil.SliceContains([]string{}, "")},
53 | 		{false, stringutil.SliceContains([]string{}, "a")},
54 | 		{true, stringutil.SliceContains([]string{"a", "b"}, "a")},
55 | 		{true, stringutil.SliceContains([]string{"b", "a"}, "a")},
56 | 		{false, stringutil.SliceContains([]string{"b", "a"}, "c")},
57 | 	})
58 | }
59 | 
60 | func requireEqual(t *testing.T, inputs [][2]interface{}) {
61 | 	t.Helper()
62 | 
63 | 	for _, input := range inputs {
64 | 		require.Equal(t, input[0], input[1])
65 | 	}
66 | }
67 | 


--------------------------------------------------------------------------------
/metrics/examples_test.go:
--------------------------------------------------------------------------------
  1 | package metrics_test
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 
  6 | 	"github.com/adrg/strutil/metrics"
  7 | )
  8 | 
  9 | func ExampleHamming() {
 10 | 	// Default options.
 11 | 	h := metrics.NewHamming()
 12 | 
 13 | 	sim := h.Compare("text", "test")
 14 | 	fmt.Printf("(text, test) similarity: %.2f\n", sim)
 15 | 
 16 | 	dist := h.Distance("text", "test")
 17 | 	fmt.Printf("(text, test) distance: %d\n", dist)
 18 | 
 19 | 	// Custom options.
 20 | 	h.CaseSensitive = false
 21 | 
 22 | 	sim = h.Compare("ONE", "once")
 23 | 	fmt.Printf("(ONE, once) similarity: %.2f\n", sim)
 24 | 
 25 | 	dist = h.Distance("one", "once")
 26 | 	fmt.Printf("(ONE, once) distance: %d\n", dist)
 27 | 
 28 | 	// Output:
 29 | 	// (text, test) similarity: 0.75
 30 | 	// (text, test) distance: 1
 31 | 	// (ONE, once) similarity: 0.50
 32 | 	// (ONE, once) distance: 2
 33 | }
 34 | 
 35 | func ExampleLevenshtein() {
 36 | 	// Default options.
 37 | 	lev := metrics.NewLevenshtein()
 38 | 
 39 | 	sim := lev.Compare("book", "brick")
 40 | 	fmt.Printf("(book, brick) similarity: %.2f\n", sim)
 41 | 
 42 | 	dist := lev.Distance("book", "brick")
 43 | 	fmt.Printf("(book, brick) distance: %d\n", dist)
 44 | 
 45 | 	// Custom options.
 46 | 	lev.CaseSensitive = false
 47 | 	lev.ReplaceCost = 2
 48 | 
 49 | 	sim = lev.Compare("HELLO", "jello")
 50 | 	fmt.Printf("(HELLO, jello) similarity: %.2f\n", sim)
 51 | 
 52 | 	dist = lev.Distance("HELLO", "jello")
 53 | 	fmt.Printf("(HELLO, jello) distance: %d\n", dist)
 54 | 
 55 | 	// Output:
 56 | 	// (book, brick) similarity: 0.40
 57 | 	// (book, brick) distance: 3
 58 | 	// (HELLO, jello) similarity: 0.60
 59 | 	// (HELLO, jello) distance: 2
 60 | }
 61 | 
 62 | func ExampleJaro() {
 63 | 	jaro := metrics.NewJaro()
 64 | 	sim := jaro.Compare("sort", "shirt")
 65 | 	fmt.Printf("(sort, shirt) similarity: %.2f\n", sim)
 66 | 
 67 | 	// Output:
 68 | 	// (sort, shirt) similarity: 0.78
 69 | }
 70 | 
 71 | func ExampleJaroWinkler() {
 72 | 	jw := metrics.NewJaroWinkler()
 73 | 	sim := jw.Compare("sort", "shirt")
 74 | 	fmt.Printf("(sort, shirt) similarity: %.2f\n", sim)
 75 | 
 76 | 	// Output:
 77 | 	// (sort, shirt) similarity: 0.80
 78 | }
 79 | 
 80 | func ExampleSmithWatermanGotoh() {
 81 | 	// Default options.
 82 | 	swg := metrics.NewSmithWatermanGotoh()
 83 | 
 84 | 	sim := swg.Compare("a pink kitten", "a kitten")
 85 | 	fmt.Printf("(a pink kitten, a kitten) similarity: %.2f\n", sim)
 86 | 
 87 | 	// Custom options.
 88 | 	swg.CaseSensitive = false
 89 | 	swg.GapPenalty = -0.1
 90 | 	swg.Substitution = metrics.MatchMismatch{
 91 | 		Match:    1,
 92 | 		Mismatch: -0.5,
 93 | 	}
 94 | 
 95 | 	sim = swg.Compare("a pink kitten", "A KITTEN")
 96 | 	fmt.Printf("(a pink kitten, A KITTEN) similarity: %.2f\n", sim)
 97 | 
 98 | 	// Output:
 99 | 	// (a pink kitten, a kitten) similarity: 0.88
100 | 	// (a pink kitten, A KITTEN) similarity: 0.94
101 | }
102 | 
103 | func ExampleSorensenDice() {
104 | 	// Default options.
105 | 	sd := metrics.NewSorensenDice()
106 | 	sim := sd.Compare("night", "alright")
107 | 	fmt.Printf("(night, alright) similarity: %.2f\n", sim)
108 | 
109 | 	// Custom options.
110 | 	sd.CaseSensitive = false
111 | 	sd.NgramSize = 3
112 | 
113 | 	sim = sd.Compare("night", "alright")
114 | 	fmt.Printf("(night, alright) similarity: %.2f\n", sim)
115 | 
116 | 	// Output:
117 | 	// (night, alright) similarity: 0.60
118 | 	// (night, alright) similarity: 0.50
119 | }
120 | 
121 | func ExampleJaccard() {
122 | 	// Default options.
123 | 	j := metrics.NewJaccard()
124 | 	sim := j.Compare("night", "alright")
125 | 	fmt.Printf("(night, alright) similarity: %.2f\n", sim)
126 | 
127 | 	// Custom options.
128 | 	j.CaseSensitive = false
129 | 	j.NgramSize = 3
130 | 
131 | 	sim = j.Compare("night", "alright")
132 | 	fmt.Printf("(night, alright) similarity: %.2f\n", sim)
133 | 
134 | 	// Output:
135 | 	// (night, alright) similarity: 0.43
136 | 	// (night, alright) similarity: 0.33
137 | }
138 | 
139 | func ExampleOverlapCoefficient() {
140 | 	// Default options.
141 | 	oc := metrics.NewOverlapCoefficient()
142 | 	sim := oc.Compare("night", "alright")
143 | 	fmt.Printf("(night, alright) similarity: %.2f\n", sim)
144 | 
145 | 	// Subset comparison.
146 | 	sim = oc.Compare("aa", "aaaa")
147 | 	fmt.Printf("(aa, aaaa) similarity: %.2f\n", sim)
148 | 
149 | 	// Custom options.
150 | 	oc.CaseSensitive = false
151 | 	oc.NgramSize = 3
152 | 
153 | 	sim = oc.Compare("night", "alright")
154 | 	fmt.Printf("(night, alright) similarity: %.2f\n", sim)
155 | 
156 | 	// Output:
157 | 	// (night, alright) similarity: 0.75
158 | 	// (aa, aaaa) similarity: 1.00
159 | 	// (night, alright) similarity: 0.67
160 | }
161 | 


--------------------------------------------------------------------------------
/metrics/hamming.go:
--------------------------------------------------------------------------------
 1 | package metrics
 2 | 
 3 | import (
 4 | 	"strings"
 5 | )
 6 | 
 7 | // Hamming represents the Hamming metric for measuring the similarity
 8 | // between sequences.
 9 | //   For more information see https://en.wikipedia.org/wiki/Hamming_distance.
10 | type Hamming struct {
11 | 	// CaseSensitive specifies if the string comparison is case sensitive.
12 | 	CaseSensitive bool
13 | }
14 | 
15 | // NewHamming returns a new Hamming string metric.
16 | //
17 | // Default options:
18 | //   CaseSensitive: true
19 | func NewHamming() *Hamming {
20 | 	return &Hamming{
21 | 		CaseSensitive: true,
22 | 	}
23 | }
24 | 
25 | // Compare returns the Hamming similarity of a and b. The returned
26 | // similarity is a number between 0 and 1. Larger similarity numbers indicate
27 | // closer matches.
28 | func (m *Hamming) Compare(a, b string) float64 {
29 | 	distance, maxLen := m.distance(a, b)
30 | 	return 1 - float64(distance)/float64(maxLen)
31 | }
32 | 
33 | // Distance returns the Hamming distance between a and b. Lower distances
34 | // indicate closer matches. A distance of 0 means the strings are identical.
35 | func (m *Hamming) Distance(a, b string) int {
36 | 	distance, _ := m.distance(a, b)
37 | 	return distance
38 | }
39 | 
40 | func (m *Hamming) distance(a, b string) (int, int) {
41 | 	// Lower terms if case insensitive comparison is specified.
42 | 	if !m.CaseSensitive {
43 | 		a = strings.ToLower(a)
44 | 		b = strings.ToLower(b)
45 | 	}
46 | 	runesA, runesB := []rune(a), []rune(b)
47 | 
48 | 	// Check if both terms are empty.
49 | 	lenA, lenB := len(runesA), len(runesB)
50 | 	if lenA == 0 && lenB == 0 {
51 | 		return 0, 0
52 | 	}
53 | 
54 | 	// If the lengths of the sequences are not equal, the distance is
55 | 	// initialized to their absolute difference. Otherwise, it is set to 0.
56 | 	if lenA > lenB {
57 | 		lenA, lenB = lenB, lenA
58 | 	}
59 | 	distance := lenB - lenA
60 | 
61 | 	// Calculate Hamming distance.
62 | 	for i := 0; i < lenA; i++ {
63 | 		if runesA[i] != runesB[i] {
64 | 			distance++
65 | 		}
66 | 	}
67 | 
68 | 	return distance, lenB
69 | }
70 | 


--------------------------------------------------------------------------------
/metrics/jaccard.go:
--------------------------------------------------------------------------------
 1 | package metrics
 2 | 
 3 | import (
 4 | 	"strings"
 5 | 
 6 | 	"github.com/adrg/strutil/internal/ngram"
 7 | )
 8 | 
 9 | // Jaccard represents the Jaccard index for measuring the similarity
10 | // between sequences.
11 | //   For more information see https://en.wikipedia.org/wiki/Jaccard_index.
12 | type Jaccard struct {
13 | 	// CaseSensitive specifies if the string comparison is case sensitive.
14 | 	CaseSensitive bool
15 | 
16 | 	// NgramSize represents the size (in characters) of the tokens generated
17 | 	// when comparing the input sequences.
18 | 	NgramSize int
19 | }
20 | 
21 | // NewJaccard returns a new Jaccard string metric.
22 | //
23 | // Default options:
24 | //   CaseSensitive: true
25 | //   NGramSize: 2
26 | func NewJaccard() *Jaccard {
27 | 	return &Jaccard{
28 | 		CaseSensitive: true,
29 | 		NgramSize:     2,
30 | 	}
31 | }
32 | 
33 | // Compare returns the Jaccard similarity coefficient of a and b. The
34 | // returned similarity is a number between 0 and 1. Larger similarity numbers
35 | // indicate closer matches.
36 | // An n-gram size of 2 is used if the provided size is less than or equal to 0.
37 | func (m *Jaccard) Compare(a, b string) float64 {
38 | 	// Lower terms if case insensitive comparison is specified.
39 | 	if !m.CaseSensitive {
40 | 		a = strings.ToLower(a)
41 | 		b = strings.ToLower(b)
42 | 	}
43 | 
44 | 	// Check if both terms are empty.
45 | 	runesA, runesB := []rune(a), []rune(b)
46 | 	if len(runesA) == 0 && len(runesB) == 0 {
47 | 		return 1
48 | 	}
49 | 
50 | 	size := m.NgramSize
51 | 	if size <= 0 {
52 | 		size = 2
53 | 	}
54 | 
55 | 	// Calculate n-gram intersection and union.
56 | 	_, common, totalA, totalB := ngram.Intersection(runesA, runesB, size)
57 | 
58 | 	total := totalA + totalB
59 | 	if total == 0 {
60 | 		return 0
61 | 	}
62 | 
63 | 	// Return similarity.
64 | 	return float64(common) / float64(total-common)
65 | }
66 | 


--------------------------------------------------------------------------------
/metrics/jaro.go:
--------------------------------------------------------------------------------
 1 | package metrics
 2 | 
 3 | import (
 4 | 	"strings"
 5 | 	"unicode/utf8"
 6 | 
 7 | 	"github.com/adrg/strutil/internal/mathutil"
 8 | )
 9 | 
10 | // Jaro represents the Jaro metric for measuring the similarity
11 | // between sequences.
12 | //   For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance.
13 | type Jaro struct {
14 | 	// CaseSensitive specifies if the string comparison is case sensitive.
15 | 	CaseSensitive bool
16 | }
17 | 
18 | // NewJaro returns a new Jaro string metric.
19 | //
20 | // Default options:
21 | //   CaseSensitive: true
22 | func NewJaro() *Jaro {
23 | 	return &Jaro{
24 | 		CaseSensitive: true,
25 | 	}
26 | }
27 | 
28 | // Compare returns the Jaro similarity of a and b. The returned similarity is
29 | // a number between 0 and 1. Larger similarity numbers indicate closer matches.
30 | func (m *Jaro) Compare(a, b string) float64 {
31 | 	// Check if both terms are empty.
32 | 	lenA, lenB := utf8.RuneCountInString(a), utf8.RuneCountInString(b)
33 | 	if lenA == 0 && lenB == 0 {
34 | 		return 1
35 | 	}
36 | 
37 | 	// Check if one of the terms is empty.
38 | 	if lenA == 0 || lenB == 0 {
39 | 		return 0
40 | 	}
41 | 
42 | 	// Lower terms if case insensitive comparison is specified.
43 | 	if !m.CaseSensitive {
44 | 		a = strings.ToLower(a)
45 | 		b = strings.ToLower(b)
46 | 	}
47 | 
48 | 	// Get matching runes.
49 | 	halfLen := mathutil.Max(0, mathutil.Max(lenA, lenB)/2)
50 | 	mrA := matchingRunes(a, b, halfLen)
51 | 	mrB := matchingRunes(b, a, halfLen)
52 | 
53 | 	fmLen, smLen := len(mrA), len(mrB)
54 | 	if fmLen == 0 || smLen == 0 {
55 | 		return 0.0
56 | 	}
57 | 
58 | 	// Return similarity.
59 | 	return (float64(fmLen)/float64(lenA) +
60 | 		float64(smLen)/float64(lenB) +
61 | 		float64(fmLen-transpositions(mrA, mrB)/2)/float64(fmLen)) / 3.0
62 | }
63 | 
64 | func matchingRunes(a, b string, limit int) []rune {
65 | 	var (
66 | 		runesA      = []rune(a)
67 | 		runesB      = []rune(b)
68 | 		runesCommon = []rune{}
69 | 		lenB        = len(runesB)
70 | 	)
71 | 
72 | 	for i, r := range runesA {
73 | 		end := mathutil.Min(i+limit+1, lenB)
74 | 		for j := mathutil.Max(0, i-limit); j < end; j++ {
75 | 			if r == runesB[j] && runesB[j] != -1 {
76 | 				runesCommon = append(runesCommon, runesB[j])
77 | 				runesB[j] = -1
78 | 				break
79 | 			}
80 | 		}
81 | 	}
82 | 
83 | 	return runesCommon
84 | }
85 | 
86 | func transpositions(a, b []rune) int {
87 | 	var count int
88 | 
89 | 	minLen := mathutil.Min(len(a), len(b))
90 | 	for i := 0; i < minLen; i++ {
91 | 		if a[i] != b[i] {
92 | 			count++
93 | 		}
94 | 	}
95 | 
96 | 	return count
97 | }
98 | 


--------------------------------------------------------------------------------
/metrics/jaro_winkler.go:
--------------------------------------------------------------------------------
 1 | package metrics
 2 | 
 3 | import (
 4 | 	"strings"
 5 | 	"unicode/utf8"
 6 | 
 7 | 	"github.com/adrg/strutil/internal/stringutil"
 8 | )
 9 | 
10 | // JaroWinkler represents the Jaro-Winkler metric for measuring the similarity
11 | // between sequences.
12 | //   For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance.
13 | type JaroWinkler struct {
14 | 	// CaseSensitive specifies if the string comparison is case sensitive.
15 | 	CaseSensitive bool
16 | }
17 | 
18 | // NewJaroWinkler returns a new Jaro-Winkler string metric.
19 | //
20 | // Default options:
21 | //   CaseSensitive: true
22 | func NewJaroWinkler() *JaroWinkler {
23 | 	return &JaroWinkler{
24 | 		CaseSensitive: true,
25 | 	}
26 | }
27 | 
28 | // Compare returns the Jaro-Winkler similarity of a and b. The returned
29 | // similarity is a number between 0 and 1. Larger similarity numbers indicate
30 | // closer matches.
31 | func (m *JaroWinkler) Compare(a, b string) float64 {
32 | 	// Lower terms if case insensitive comparison is specified.
33 | 	if !m.CaseSensitive {
34 | 		a = strings.ToLower(a)
35 | 		b = strings.ToLower(b)
36 | 	}
37 | 
38 | 	// Calculate common prefix.
39 | 	lenPrefix := utf8.RuneCountInString(stringutil.CommonPrefix(a, b))
40 | 	if lenPrefix > 4 {
41 | 		lenPrefix = 4
42 | 	}
43 | 
44 | 	jaro := NewJaro()
45 | 	jaro.CaseSensitive = m.CaseSensitive
46 | 
47 | 	// Return similarity.
48 | 	similarity := jaro.Compare(a, b)
49 | 	return similarity + (0.1 * float64(lenPrefix) * (1.0 - similarity))
50 | }
51 | 


--------------------------------------------------------------------------------
/metrics/levenshtein.go:
--------------------------------------------------------------------------------
  1 | package metrics
  2 | 
  3 | import (
  4 | 	"strings"
  5 | 
  6 | 	"github.com/adrg/strutil/internal/mathutil"
  7 | )
  8 | 
  9 | // Levenshtein represents the Levenshtein metric for measuring the similarity
 10 | // between sequences.
 11 | //   For more information see https://en.wikipedia.org/wiki/Levenshtein_distance.
 12 | type Levenshtein struct {
 13 | 	// CaseSensitive specifies if the string comparison is case sensitive.
 14 | 	CaseSensitive bool
 15 | 
 16 | 	// InsertCost represents the Levenshtein cost of a character insertion.
 17 | 	InsertCost int
 18 | 
 19 | 	// InsertCost represents the Levenshtein cost of a character deletion.
 20 | 	DeleteCost int
 21 | 
 22 | 	// InsertCost represents the Levenshtein cost of a character substitution.
 23 | 	ReplaceCost int
 24 | }
 25 | 
 26 | // NewLevenshtein returns a new Levenshtein string metric.
 27 | //
 28 | // Default options:
 29 | //   CaseSensitive: true
 30 | //   InsertCost: 1
 31 | //   DeleteCost: 1
 32 | //   ReplaceCost: 1
 33 | func NewLevenshtein() *Levenshtein {
 34 | 	return &Levenshtein{
 35 | 		CaseSensitive: true,
 36 | 		InsertCost:    1,
 37 | 		DeleteCost:    1,
 38 | 		ReplaceCost:   1,
 39 | 	}
 40 | }
 41 | 
 42 | // Compare returns the Levenshtein similarity of a and b. The returned
 43 | // similarity is a number between 0 and 1. Larger similarity numbers indicate
 44 | // closer matches.
 45 | func (m *Levenshtein) Compare(a, b string) float64 {
 46 | 	distance, maxLen := m.distance(a, b)
 47 | 	return 1 - float64(distance)/float64(maxLen)
 48 | }
 49 | 
 50 | // Distance returns the Levenshtein distance between a and b. Lower distances
 51 | // indicate closer matches. A distance of 0 means the strings are identical.
 52 | func (m *Levenshtein) Distance(a, b string) int {
 53 | 	distance, _ := m.distance(a, b)
 54 | 	return distance
 55 | }
 56 | 
 57 | func (m *Levenshtein) distance(a, b string) (int, int) {
 58 | 	// Lower terms if case insensitive comparison is specified.
 59 | 	if !m.CaseSensitive {
 60 | 		a = strings.ToLower(a)
 61 | 		b = strings.ToLower(b)
 62 | 	}
 63 | 	runesA, runesB := []rune(a), []rune(b)
 64 | 
 65 | 	// Check if both terms are empty.
 66 | 	lenA, lenB := len(runesA), len(runesB)
 67 | 	if lenA == 0 && lenB == 0 {
 68 | 		return 0, 0
 69 | 	}
 70 | 
 71 | 	// Check if one of the terms is empty.
 72 | 	maxLen := mathutil.Max(lenA, lenB)
 73 | 	if lenA == 0 {
 74 | 		return m.InsertCost * lenB, maxLen
 75 | 	}
 76 | 	if lenB == 0 {
 77 | 		return m.DeleteCost * lenA, maxLen
 78 | 	}
 79 | 
 80 | 	// Initialize cost slice.
 81 | 	prevCol := make([]int, lenB+1)
 82 | 	for i := 0; i <= lenB; i++ {
 83 | 		prevCol[i] = i
 84 | 	}
 85 | 
 86 | 	// Calculate distance.
 87 | 	col := make([]int, lenB+1)
 88 | 	for i := 0; i < lenA; i++ {
 89 | 		col[0] = i + 1
 90 | 		for j := 0; j < lenB; j++ {
 91 | 			delCost := prevCol[j+1] + m.DeleteCost
 92 | 			insCost := col[j] + m.InsertCost
 93 | 
 94 | 			subCost := prevCol[j]
 95 | 			if runesA[i] != runesB[j] {
 96 | 				subCost += m.ReplaceCost
 97 | 			}
 98 | 
 99 | 			col[j+1] = mathutil.Min(delCost, insCost, subCost)
100 | 		}
101 | 
102 | 		col, prevCol = prevCol, col
103 | 	}
104 | 
105 | 	return prevCol[lenB], maxLen
106 | }
107 | 


--------------------------------------------------------------------------------
/metrics/match_mismatch.go:
--------------------------------------------------------------------------------
 1 | package metrics
 2 | 
 3 | // MatchMismatch represents a substitution function which returns the match or
 4 | // mismatch value depeding on the equality of the compared characters. The
 5 | // match value must be greater than the mismatch value.
 6 | type MatchMismatch struct {
 7 | 	// Match represents the score of equal character substitutions.
 8 | 	Match float64
 9 | 
10 | 	// Mismatch represents the score of unequal character substitutions.
11 | 	Mismatch float64
12 | }
13 | 
14 | // Compare returns the match value if a[idxA] is equal to b[idxB] or the
15 | // mismatch value otherwise.
16 | func (m MatchMismatch) Compare(a []rune, idxA int, b []rune, idxB int) float64 {
17 | 	if a[idxA] == b[idxB] {
18 | 		return m.Match
19 | 	}
20 | 
21 | 	return m.Mismatch
22 | }
23 | 
24 | // Max returns the match value.
25 | func (m MatchMismatch) Max() float64 {
26 | 	return m.Match
27 | }
28 | 
29 | // Min returns the mismatch value.
30 | func (m MatchMismatch) Min() float64 {
31 | 	return m.Mismatch
32 | }
33 | 


--------------------------------------------------------------------------------
/metrics/metrics_test.go:
--------------------------------------------------------------------------------
  1 | package metrics_test
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"testing"
  6 | 
  7 | 	"github.com/adrg/strutil/metrics"
  8 | 	"github.com/stretchr/testify/require"
  9 | )
 10 | 
 11 | func sf(a float64) string {
 12 | 	return fmt.Sprintf("%.2f", a)
 13 | }
 14 | 
 15 | func TestHamming(t *testing.T) {
 16 | 	h := metrics.NewHamming()
 17 | 	require.Equal(t, 0, h.Distance("", ""))
 18 | 	require.Equal(t, "0.75", sf(h.Compare("text", "test")))
 19 | 	require.Equal(t, "0.50", sf(h.Compare("once", "one")))
 20 | 	require.Equal(t, "1.00", sf(h.Compare("ab\u2019c", "ab\u2019c")))
 21 | 	require.Equal(t, "0.75", sf(h.Compare("ab\u2019d", "ab\u2019c")))
 22 | 	require.Equal(t, "0.75", sf(h.Compare("ab\u2018c", "ab\u2019c")))
 23 | 	h.CaseSensitive = false
 24 | 	require.Equal(t, "0.50", sf(h.Compare("one", "ONCE")))
 25 | }
 26 | 
 27 | func TestJaccard(t *testing.T) {
 28 | 	j := metrics.NewJaccard()
 29 | 	require.Equal(t, "1.00", sf(j.Compare("", "")))
 30 | 	require.Equal(t, "0.00", sf(j.Compare("a", "b")))
 31 | 	require.Equal(t, "1.00", sf(j.Compare("ab\u2019c", "ab\u2019c")))
 32 | 	require.Equal(t, "0.50", sf(j.Compare("ab\u2019d", "ab\u2019c")))
 33 | 	require.Equal(t, "0.20", sf(j.Compare("ab\u2018c", "ab\u2019c")))
 34 | 	require.Equal(t, "0.43", sf(j.Compare("night", "alright")))
 35 | 	j.NgramSize = 0
 36 | 	require.Equal(t, "0.43", sf(j.Compare("night", "alright")))
 37 | 	j.CaseSensitive = false
 38 | 	j.NgramSize = 3
 39 | 	require.Equal(t, "0.33", sf(j.Compare("NIGHT", "alright")))
 40 | }
 41 | 
 42 | func TestJaro(t *testing.T) {
 43 | 	j := metrics.NewJaro()
 44 | 	require.Equal(t, "1.00", sf(j.Compare("", "")))
 45 | 	require.Equal(t, "0.00", sf(j.Compare("test", "")))
 46 | 	require.Equal(t, "1.00", sf(j.Compare("ab\u2019c", "ab\u2019c")))
 47 | 	require.Equal(t, "0.83", sf(j.Compare("ab\u2019d", "ab\u2019c")))
 48 | 	require.Equal(t, "0.83", sf(j.Compare("ab\u2018c", "ab\u2019c")))
 49 | 	require.Equal(t, "0.00", sf(j.Compare("a", "b")))
 50 | 	require.Equal(t, "0.78", sf(j.Compare("sort", "shirt")))
 51 | 	require.Equal(t, "0.64", sf(j.Compare("sort", "report")))
 52 | 	j.CaseSensitive = false
 53 | 	require.Equal(t, "0.78", sf(j.Compare("sort", "SHIRT")))
 54 | }
 55 | 
 56 | func TestJaroWinkler(t *testing.T) {
 57 | 	j := metrics.NewJaroWinkler()
 58 | 	require.Equal(t, "1.00", sf(j.Compare("", "")))
 59 | 	require.Equal(t, "0.00", sf(j.Compare("test", "")))
 60 | 	require.Equal(t, "1.00", sf(j.Compare("ab\u2019c", "ab\u2019c")))
 61 | 	require.Equal(t, "0.88", sf(j.Compare("ab\u2019d", "ab\u2019c")))
 62 | 	require.Equal(t, "0.87", sf(j.Compare("ab\u2018c", "ab\u2019c")))
 63 | 	require.Equal(t, "0.80", sf(j.Compare("sort", "shirt")))
 64 | 	require.Equal(t, "0.94", sf(j.Compare("charm", "charmed")))
 65 | 	j.CaseSensitive = false
 66 | 	require.Equal(t, "0.80", sf(j.Compare("sort", "SHIRT")))
 67 | }
 68 | 
 69 | func TestLevenshtein(t *testing.T) {
 70 | 	l := metrics.NewLevenshtein()
 71 | 	require.Equal(t, 0, l.Distance("", ""))
 72 | 	require.Equal(t, 4, l.Distance("test", ""))
 73 | 	require.Equal(t, 4, l.Distance("", "test"))
 74 | 	require.Equal(t, 0, l.Distance("ab\u2019c", "ab\u2019c"))
 75 | 	require.Equal(t, 1, l.Distance("ab\u2019d", "ab\u2019c"))
 76 | 	require.Equal(t, 1, l.Distance("ab\u2018c", "ab\u2019c"))
 77 | 	require.Equal(t, "0.40", sf(l.Compare("book", "brick")))
 78 | 	require.Equal(t, "0.75", sf(l.Compare("ab\u2019d", "ab\u2019c")))
 79 | 	require.Equal(t, "0.75", sf(l.Compare("ab\u2018c", "ab\u2019c")))
 80 | 	l.CaseSensitive = false
 81 | 	require.Equal(t, "0.80", sf(l.Compare("hello", "jello")))
 82 | 	l.ReplaceCost = 2
 83 | 	require.Equal(t, "0.60", sf(l.Compare("hello", "JELLO")))
 84 | 	require.Equal(t, "1.00", sf(l.Compare("ab\u2019c", "ab\u2019c")))
 85 | 	require.Equal(t, "0.50", sf(l.Compare("ab\u2019d", "ab\u2019c")))
 86 | 	require.Equal(t, "0.50", sf(l.Compare("ab\u2018c", "ab\u2019c")))
 87 | }
 88 | 
 89 | func TestOperlapCoefficient(t *testing.T) {
 90 | 	o := metrics.NewOverlapCoefficient()
 91 | 	require.Equal(t, "1.00", sf(o.Compare("", "")))
 92 | 	require.Equal(t, "0.75", sf(o.Compare("night", "alright")))
 93 | 	require.Equal(t, "0.00", sf(o.Compare("aa", "")))
 94 | 	require.Equal(t, "0.00", sf(o.Compare("bb", "")))
 95 | 	require.Equal(t, "1.00", sf(o.Compare("ab\u2019c", "ab\u2019c")))
 96 | 	require.Equal(t, "0.67", sf(o.Compare("ab\u2019d", "ab\u2019c")))
 97 | 	require.Equal(t, "0.33", sf(o.Compare("ab\u2018c", "ab\u2019c")))
 98 | 	o.NgramSize = 0
 99 | 	require.Equal(t, "0.75", sf(o.Compare("night", "alright")))
100 | 	require.Equal(t, "1.00", sf(o.Compare("aa", "aaaa")))
101 | 	o.CaseSensitive = false
102 | 	require.Equal(t, "1.00", sf(o.Compare("aa", "AAAA")))
103 | 	o.NgramSize = 3
104 | 	require.Equal(t, "0.67", sf(o.Compare("night", "alright")))
105 | }
106 | 
107 | func TestSmithWatermanGotoh(t *testing.T) {
108 | 	s := metrics.NewSmithWatermanGotoh()
109 | 	require.Equal(t, "1.00", sf(s.Compare("", "")))
110 | 	require.Equal(t, "0.00", sf(s.Compare("test", "")))
111 | 	require.Equal(t, "0.00", sf(s.Compare("", "test")))
112 | 	require.Equal(t, "0.88", sf(s.Compare("a pink kitten", "a kitten")))
113 | 	require.Equal(t, "1.00", sf(s.Compare("ab\u2019c", "ab\u2019c")))
114 | 	require.Equal(t, "0.75", sf(s.Compare("ab\u2019d", "ab\u2019c")))
115 | 	require.Equal(t, "0.50", sf(s.Compare("ab\u2018c", "ab\u2019c")))
116 | 	s.Substitution = nil
117 | 	require.Equal(t, "0.88", sf(s.Compare("a pink kitten", "a kitten")))
118 | 	s.CaseSensitive = false
119 | 	s.GapPenalty = -0.1
120 | 	s.Substitution = metrics.MatchMismatch{
121 | 		Match:    1,
122 | 		Mismatch: -0.5,
123 | 	}
124 | 	require.Equal(t, "0.94", sf(s.Compare("a pink kitten", "A KITTEN")))
125 | }
126 | 
127 | func TestSorensenDice(t *testing.T) {
128 | 	s := metrics.NewSorensenDice()
129 | 	require.Equal(t, "1.00", sf(s.Compare("", "")))
130 | 	require.Equal(t, "0.00", sf(s.Compare("a", "b")))
131 | 	require.Equal(t, "0.60", sf(s.Compare("night", "alright")))
132 | 	require.Equal(t, "1.00", sf(s.Compare("ab\u2019c", "ab\u2019c")))
133 | 	require.Equal(t, "0.67", sf(s.Compare("ab\u2019d", "ab\u2019c")))
134 | 	require.Equal(t, "0.33", sf(s.Compare("ab\u2018c", "ab\u2019c")))
135 | 	s.NgramSize = 0
136 | 	require.Equal(t, "0.60", sf(s.Compare("night", "alright")))
137 | 	s.CaseSensitive = false
138 | 	require.Equal(t, "0.60", sf(s.Compare("night", "ALRIGHT")))
139 | 	s.NgramSize = 3
140 | 	require.Equal(t, "0.50", sf(s.Compare("night", "alright")))
141 | }
142 | 
143 | func TestMatchMismatch(t *testing.T) {
144 | 	m := metrics.MatchMismatch{
145 | 		Match:    2,
146 | 		Mismatch: 1,
147 | 	}
148 | 	require.Equal(t, "1.00", sf(m.Compare([]rune{'a'}, 0, []rune{'b'}, 0)))
149 | 	require.Equal(t, "2.00", sf(m.Compare([]rune{'a'}, 0, []rune{'a'}, 0)))
150 | 	require.Equal(t, "1.00", sf(m.Min()))
151 | 	require.Equal(t, "2.00", sf(m.Max()))
152 | }
153 | 


--------------------------------------------------------------------------------
/metrics/overlap_coefficient.go:
--------------------------------------------------------------------------------
 1 | package metrics
 2 | 
 3 | import (
 4 | 	"strings"
 5 | 
 6 | 	"github.com/adrg/strutil/internal/mathutil"
 7 | 	"github.com/adrg/strutil/internal/ngram"
 8 | )
 9 | 
10 | // OverlapCoefficient represents the overlap coefficient for measuring the
11 | // similarity between sequences. The metric is also know as the
12 | // Szymkiewicz-Simpson coefficient.
13 | //   For more information see https://en.wikipedia.org/wiki/Overlap_coefficient.
14 | type OverlapCoefficient struct {
15 | 	// CaseSensitive specifies if the string comparison is case sensitive.
16 | 	CaseSensitive bool
17 | 
18 | 	// NgramSize represents the size (in characters) of the tokens generated
19 | 	// when comparing the input sequences.
20 | 	NgramSize int
21 | }
22 | 
23 | // NewOverlapCoefficient returns a new overlap coefficient string metric.
24 | //
25 | // Default options:
26 | //   CaseSensitive: true
27 | //   NGramSize: 2
28 | func NewOverlapCoefficient() *OverlapCoefficient {
29 | 	return &OverlapCoefficient{
30 | 		CaseSensitive: true,
31 | 		NgramSize:     2,
32 | 	}
33 | }
34 | 
35 | // Compare returns the OverlapCoefficient similarity coefficient of a and b.
36 | // The returned similarity is a number between 0 and 1. Larger similarity
37 | // numbers indicate closer matches.
38 | // An n-gram size of 2 is used if the provided size is less than or equal to 0.
39 | func (m *OverlapCoefficient) Compare(a, b string) float64 {
40 | 	// Lower terms if case insensitive comparison is specified.
41 | 	if !m.CaseSensitive {
42 | 		a = strings.ToLower(a)
43 | 		b = strings.ToLower(b)
44 | 	}
45 | 
46 | 	// Check if both terms are empty.
47 | 	runesA, runesB := []rune(a), []rune(b)
48 | 	if len(runesA) == 0 && len(runesB) == 0 {
49 | 		return 1
50 | 	}
51 | 
52 | 	size := m.NgramSize
53 | 	if size <= 0 {
54 | 		size = 2
55 | 	}
56 | 
57 | 	// Calculate n-gram intersection and minimum subset.
58 | 	_, common, totalA, totalB := ngram.Intersection(runesA, runesB, size)
59 | 
60 | 	min := mathutil.Min(totalA, totalB)
61 | 	if min == 0 {
62 | 		return 0
63 | 	}
64 | 
65 | 	// Return similarity.
66 | 	return float64(common) / float64(min)
67 | }
68 | 


--------------------------------------------------------------------------------
/metrics/smith_waterman_gotoh.go:
--------------------------------------------------------------------------------
  1 | package metrics
  2 | 
  3 | import (
  4 | 	"strings"
  5 | 
  6 | 	"github.com/adrg/strutil/internal/mathutil"
  7 | )
  8 | 
  9 | // SmithWatermanGotoh represents the Smith-Waterman-Gotoh metric for measuring
 10 | // the similarity between sequences.
 11 | //   For more information see https://en.wikipedia.org/wiki/Smith-Waterman_algorithm.
 12 | type SmithWatermanGotoh struct {
 13 | 	// CaseSensitive specifies if the string comparison is case sensitive.
 14 | 	CaseSensitive bool
 15 | 
 16 | 	// GapPenalty defines a score penalty for character insertions or deletions.
 17 | 	// For relevant results, the gap penalty should be a non-positive number.
 18 | 	GapPenalty float64
 19 | 
 20 | 	// Substitution represents a substitution function which is used to
 21 | 	// calculate a score for character substitutions.
 22 | 	Substitution Substitution
 23 | }
 24 | 
 25 | // NewSmithWatermanGotoh returns a new Smith-Waterman-Gotoh string metric.
 26 | //
 27 | // Default options:
 28 | //   CaseSensitive: true
 29 | //   GapPenalty: -0.5
 30 | //   Substitution: MatchMismatch{
 31 | //   	Match:    1,
 32 | //   	Mismatch: -2,
 33 | //   },
 34 | func NewSmithWatermanGotoh() *SmithWatermanGotoh {
 35 | 	return &SmithWatermanGotoh{
 36 | 		CaseSensitive: true,
 37 | 		GapPenalty:    -0.5,
 38 | 		Substitution: MatchMismatch{
 39 | 			Match:    1,
 40 | 			Mismatch: -2,
 41 | 		},
 42 | 	}
 43 | }
 44 | 
 45 | // Compare returns the Smith-Waterman-Gotoh similarity of a and b. The returned
 46 | // similarity is a number between 0 and 1. Larger similarity numbers indicate
 47 | // closer matches.
 48 | func (m *SmithWatermanGotoh) Compare(a, b string) float64 {
 49 | 	gap := m.GapPenalty
 50 | 
 51 | 	// Lower terms if case insensitive comparison is specified.
 52 | 	if !m.CaseSensitive {
 53 | 		a = strings.ToLower(a)
 54 | 		b = strings.ToLower(b)
 55 | 	}
 56 | 	runesA, runesB := []rune(a), []rune(b)
 57 | 
 58 | 	// Check if both terms are empty.
 59 | 	lenA, lenB := len(runesA), len(runesB)
 60 | 	if lenA == 0 && lenB == 0 {
 61 | 		return 1
 62 | 	}
 63 | 
 64 | 	// Check if one of the terms is empty.
 65 | 	if lenA == 0 || lenB == 0 {
 66 | 		return 0
 67 | 	}
 68 | 
 69 | 	// Use default substitution, if none is specified.
 70 | 	subst := m.Substitution
 71 | 	if subst == nil {
 72 | 		subst = MatchMismatch{
 73 | 			Match:    1,
 74 | 			Mismatch: -2,
 75 | 		}
 76 | 	}
 77 | 
 78 | 	// Calculate max distance.
 79 | 	maxDistance := mathutil.Minf(float64(lenA), float64(lenB)) * mathutil.Maxf(subst.Max(), gap)
 80 | 
 81 | 	// Calculate distance.
 82 | 	v0 := make([]float64, lenB)
 83 | 	v1 := make([]float64, lenB)
 84 | 
 85 | 	distance := mathutil.Maxf(0, gap, subst.Compare(runesA, 0, runesB, 0))
 86 | 	v0[0] = distance
 87 | 
 88 | 	for i := 1; i < lenB; i++ {
 89 | 		v0[i] = mathutil.Maxf(0, v0[i-1]+gap, subst.Compare(runesA, 0, runesB, i))
 90 | 		distance = mathutil.Maxf(distance, v0[i])
 91 | 	}
 92 | 
 93 | 	for i := 1; i < lenA; i++ {
 94 | 		v1[0] = mathutil.Maxf(0, v0[0]+gap, subst.Compare(runesA, i, runesB, 0))
 95 | 		distance = mathutil.Maxf(distance, v1[0])
 96 | 
 97 | 		for j := 1; j < lenB; j++ {
 98 | 			v1[j] = mathutil.Maxf(0, v0[j]+gap, v1[j-1]+gap, v0[j-1]+subst.Compare(runesA, i, runesB, j))
 99 | 			distance = mathutil.Maxf(distance, v1[j])
100 | 		}
101 | 
102 | 		for j := 0; j < lenB; j++ {
103 | 			v0[j] = v1[j]
104 | 		}
105 | 	}
106 | 
107 | 	// Return similarity.
108 | 	return distance / maxDistance
109 | }
110 | 


--------------------------------------------------------------------------------
/metrics/sorensen_dice.go:
--------------------------------------------------------------------------------
 1 | package metrics
 2 | 
 3 | import (
 4 | 	"strings"
 5 | 
 6 | 	"github.com/adrg/strutil/internal/ngram"
 7 | )
 8 | 
 9 | // SorensenDice represents the Sorensen-Dice metric for measuring the
10 | // similarity between sequences.
11 | //   For more information see https://en.wikipedia.org/wiki/Sorensen-Dice_coefficient.
12 | type SorensenDice struct {
13 | 	// CaseSensitive specifies if the string comparison is case sensitive.
14 | 	CaseSensitive bool
15 | 
16 | 	// NgramSize represents the size (in characters) of the tokens generated
17 | 	// when comparing the input sequences.
18 | 	NgramSize int
19 | }
20 | 
21 | // NewSorensenDice returns a new Sorensen-Dice string metric.
22 | //
23 | // Default options:
24 | //   CaseSensitive: true
25 | //   NGramSize: 2
26 | func NewSorensenDice() *SorensenDice {
27 | 	return &SorensenDice{
28 | 		CaseSensitive: true,
29 | 		NgramSize:     2,
30 | 	}
31 | }
32 | 
33 | // Compare returns the Sorensen-Dice similarity coefficient of a and b. The
34 | // returned similarity is a number between 0 and 1. Larger similarity numbers
35 | // indicate closer matches.
36 | // An n-gram size of 2 is used if the provided size is less than or equal to 0.
37 | func (m *SorensenDice) Compare(a, b string) float64 {
38 | 	// Lower terms if case insensitive comparison is specified.
39 | 	if !m.CaseSensitive {
40 | 		a = strings.ToLower(a)
41 | 		b = strings.ToLower(b)
42 | 	}
43 | 
44 | 	// Check if both terms are empty.
45 | 	runesA, runesB := []rune(a), []rune(b)
46 | 	if len(runesA) == 0 && len(runesB) == 0 {
47 | 		return 1
48 | 	}
49 | 
50 | 	size := m.NgramSize
51 | 	if size <= 0 {
52 | 		size = 2
53 | 	}
54 | 
55 | 	// Calculate n-gram intersection and union.
56 | 	_, common, totalA, totalB := ngram.Intersection(runesA, runesB, size)
57 | 
58 | 	total := totalA + totalB
59 | 	if total == 0 {
60 | 		return 0
61 | 	}
62 | 
63 | 	// Return similarity.
64 | 	return 2 * float64(common) / float64(total)
65 | }
66 | 


--------------------------------------------------------------------------------
/metrics/substitution.go:
--------------------------------------------------------------------------------
 1 | package metrics
 2 | 
// Substitution represents a substitution function which is used to
// calculate a score for character substitutions.
type Substitution interface {
	// Compare returns the substitution score of characters a[idxA] and b[idxB].
	Compare(a []rune, idxA int, b []rune, idxB int) float64

	// Max returns the maximum score of a character substitution operation.
	Max() float64

	// Min returns the minimum score of a character substitution operation.
	Min() float64
}
15 | 


--------------------------------------------------------------------------------
/strutil.go:
--------------------------------------------------------------------------------
 1 | /*
 2 | Package strutil provides string metrics for calculating string similarity as
 3 | well as other string utility functions. Documentation for all the metrics can
 4 | be found at https://pkg.go.dev/github.com/adrg/strutil/metrics.
 5 | 
 6 | Included string metrics:
 7 |   - Hamming
 8 |   - Jaro
 9 |   - Jaro-Winkler
10 |   - Levenshtein
11 |   - Smith-Waterman-Gotoh
12 |   - Sorensen-Dice
13 |   - Jaccard
14 |   - Overlap coefficient
15 | 
16 | */
17 | package strutil
18 | 
19 | import (
20 | 	"github.com/adrg/strutil/internal/ngram"
21 | 	"github.com/adrg/strutil/internal/stringutil"
22 | )
23 | 
// StringMetric represents a metric for measuring the similarity between
// strings. The metrics package implements the following string metrics:
//  - Hamming
//  - Jaro
//  - Jaro-Winkler
//  - Levenshtein
//  - Smith-Waterman-Gotoh
//  - Sorensen-Dice
//  - Jaccard
//  - Overlap coefficient
//
// For more information see https://pkg.go.dev/github.com/adrg/strutil/metrics.
type StringMetric interface {
	// Compare returns the similarity of a and b. Similarity expects the
	// result to be a number between 0 and 1, where larger numbers indicate
	// closer matches.
	Compare(a, b string) float64
}
39 | 
40 | // Similarity returns the similarity of a and b, computed using the specified
41 | // string metric. The returned similarity is a number between 0 and 1. Larger
42 | // similarity numbers indicate closer matches.
43 | func Similarity(a, b string, metric StringMetric) float64 {
44 | 	return metric.Compare(a, b)
45 | }
46 | 
47 | // CommonPrefix returns the common prefix of the specified strings. An empty
48 | // string is returned if the parameters have no prefix in common.
49 | func CommonPrefix(a, b string) string {
50 | 	return stringutil.CommonPrefix(a, b)
51 | }
52 | 
53 | // UniqueSlice returns a slice containing the unique items from the specified
54 | // string slice. The items in the output slice are in the order in which they
55 | // occur in the input slice.
56 | func UniqueSlice(items []string) []string {
57 | 	return stringutil.UniqueSlice(items)
58 | }
59 | 
60 | // SliceContains returns true if terms contains q, or false otherwise.
61 | func SliceContains(terms []string, q string) bool {
62 | 	return stringutil.SliceContains(terms, q)
63 | }
64 | 
65 | // NgramCount returns the n-gram count of the specified size for the
66 | // provided term. An n-gram size of 1 is used if the provided size is
67 | // less than or equal to 0.
68 | func NgramCount(term string, size int) int {
69 | 	return ngram.Count([]rune(term), size)
70 | }
71 | 
72 | // Ngrams returns all the n-grams of the specified size for the provided term.
73 | // The n-grams in the output slice are in the order in which they occur in the
74 | // input term. An n-gram size of 1 is used if the provided size is less than or
75 | // equal to 0.
76 | func Ngrams(term string, size int) []string {
77 | 	return ngram.Slice([]rune(term), size)
78 | }
79 | 
80 | // NgramMap returns a map of all n-grams of the specified size for the provided
81 | // term, along with their frequency. The function also returns the total number
82 | // of n-grams, which is the sum of all the values in the output map.
83 | // An n-gram size of 1 is used if the provided size is less than or equal to 0.
84 | func NgramMap(term string, size int) (map[string]int, int) {
85 | 	return ngram.Map([]rune(term), size)
86 | }
87 | 
88 | // NgramIntersection returns a map of the n-grams of the specified size found
89 | // in both terms, along with their frequency. The function also returns the
90 | // number of common n-grams (the sum of all the values in the output map), the
91 | // total number of n-grams in the first term and the total number of n-grams in
92 | // the second term. An n-gram size of 1 is used if the provided size is less
93 | // than or equal to 0.
94 | func NgramIntersection(a, b string, size int) (map[string]int, int, int, int) {
95 | 	return ngram.Intersection([]rune(a), []rune(b), size)
96 | }
97 | 


--------------------------------------------------------------------------------