4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | strutil
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 | strutil provides a collection of string metrics for calculating string similarity as well as
28 | other string utility functions.
29 | Full documentation can be found at https://pkg.go.dev/github.com/adrg/strutil.
30 |
31 | ## Installation
32 |
33 | ```
34 | go get github.com/adrg/strutil
35 | ```
36 |
37 | ## String metrics
38 |
39 | - [Hamming](#hamming)
40 | - [Levenshtein](#levenshtein)
41 | - [Jaro](#jaro)
42 | - [Jaro-Winkler](#jaro-winkler)
43 | - [Smith-Waterman-Gotoh](#smith-waterman-gotoh)
44 | - [Sorensen-Dice](#sorensen-dice)
45 | - [Jaccard](#jaccard)
46 | - [Overlap Coefficient](#overlap-coefficient)
47 |
48 | The package defines the `StringMetric` interface, which is implemented by all
49 | the string metrics. The interface is used with the `Similarity` function, which
50 | calculates the similarity between the specified strings, using the provided
51 | string metric.
52 |
53 | ```go
54 | type StringMetric interface {
55 | Compare(a, b string) float64
56 | }
57 |
58 | func Similarity(a, b string, metric StringMetric) float64 {
59 | }
60 | ```
61 |
62 | All defined string metrics can be found in the
63 | [metrics](https://pkg.go.dev/github.com/adrg/strutil/metrics) package.
64 |
65 | #### Hamming
66 |
67 | Calculate similarity.
68 | ```go
69 | similarity := strutil.Similarity("text", "test", metrics.NewHamming())
70 | fmt.Printf("%.2f\n", similarity) // Output: 0.75
71 | ```
72 |
73 | Calculate distance.
74 | ```go
75 | ham := metrics.NewHamming()
76 | fmt.Printf("%d\n", ham.Distance("one", "once")) // Output: 2
77 | ```
78 |
79 | More information and additional examples can be found on
80 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Hamming).
81 |
82 | #### Levenshtein
83 |
84 | Calculate similarity using default options.
85 | ```go
86 | similarity := strutil.Similarity("graph", "giraffe", metrics.NewLevenshtein())
87 | fmt.Printf("%.2f\n", similarity) // Output: 0.43
88 | ```
89 |
90 | Configure edit operation costs.
91 | ```go
92 | lev := metrics.NewLevenshtein()
93 | lev.CaseSensitive = false
94 | lev.InsertCost = 1
95 | lev.ReplaceCost = 2
96 | lev.DeleteCost = 1
97 |
98 | similarity := strutil.Similarity("make", "Cake", lev)
99 | fmt.Printf("%.2f\n", similarity) // Output: 0.50
100 | ```
101 |
102 | Calculate distance.
103 | ```go
104 | lev := metrics.NewLevenshtein()
105 | fmt.Printf("%d\n", lev.Distance("graph", "giraffe")) // Output: 4
106 | ```
107 |
108 | More information and additional examples can be found on
109 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Levenshtein).
110 |
111 | #### Jaro
112 |
113 | ```go
114 | similarity := strutil.Similarity("think", "tank", metrics.NewJaro())
115 | fmt.Printf("%.2f\n", similarity) // Output: 0.78
116 | ```
117 |
118 | More information and additional examples can be found on
119 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Jaro).
120 |
121 | #### Jaro-Winkler
122 |
123 | ```go
124 | similarity := strutil.Similarity("think", "tank", metrics.NewJaroWinkler())
125 | fmt.Printf("%.2f\n", similarity) // Output: 0.80
126 | ```
127 |
128 | More information and additional examples can be found on
129 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#JaroWinkler).
130 |
131 | #### Smith-Waterman-Gotoh
132 |
133 | Calculate similarity using default options.
134 | ```go
135 | swg := metrics.NewSmithWatermanGotoh()
136 | similarity := strutil.Similarity("times roman", "times new roman", swg)
137 | fmt.Printf("%.2f\n", similarity) // Output: 0.82
138 | ```
139 |
140 | Customize gap penalty and substitution function.
141 | ```go
142 | swg := metrics.NewSmithWatermanGotoh()
143 | swg.CaseSensitive = false
144 | swg.GapPenalty = -0.1
145 | swg.Substitution = metrics.MatchMismatch {
146 | Match: 1,
147 | Mismatch: -0.5,
148 | }
149 |
150 | similarity := strutil.Similarity("Times Roman", "times new roman", swg)
151 | fmt.Printf("%.2f\n", similarity) // Output: 0.96
152 | ```
153 |
154 | More information and additional examples can be found on
155 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#SmithWatermanGotoh).
156 |
157 | #### Sorensen-Dice
158 |
159 | Calculate similarity using default options.
160 | ```go
161 | sd := metrics.NewSorensenDice()
162 | similarity := strutil.Similarity("time to make haste", "no time to waste", sd)
163 | fmt.Printf("%.2f\n", similarity) // Output: 0.62
164 | ```
165 |
166 | Customize n-gram size.
167 | ```go
168 | sd := metrics.NewSorensenDice()
169 | sd.CaseSensitive = false
170 | sd.NgramSize = 3
171 |
172 | similarity := strutil.Similarity("Time to make haste", "no time to waste", sd)
173 | fmt.Printf("%.2f\n", similarity) // Output: 0.53
174 | ```
175 |
176 | More information and additional examples can be found on
177 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#SorensenDice).
178 |
179 | #### Jaccard
180 |
181 | Calculate similarity using default options.
182 | ```go
183 | j := metrics.NewJaccard()
184 | similarity := strutil.Similarity("time to make haste", "no time to waste", j)
185 | fmt.Printf("%.2f\n", similarity) // Output: 0.45
186 | ```
187 |
188 | Customize n-gram size.
189 | ```go
190 | j := metrics.NewJaccard()
191 | j.CaseSensitive = false
192 | j.NgramSize = 3
193 |
194 | similarity := strutil.Similarity("Time to make haste", "no time to waste", j)
195 | fmt.Printf("%.2f\n", similarity) // Output: 0.36
196 | ```
197 |
198 | The Jaccard examples use the same input as the Sorensen-Dice examples
199 | because the two metrics are closely related. In fact, each coefficient
200 | can be calculated from the other.
201 |
202 | Sorensen-Dice to Jaccard.
203 | ```
204 | J = SD/(2-SD)
205 |
206 | where SD is the Sorensen-Dice coefficient and J is the Jaccard index.
207 | ```
208 |
209 | Jaccard to Sorensen-Dice.
210 | ```
211 | SD = 2*J/(1+J)
212 |
213 | where SD is the Sorensen-Dice coefficient and J is the Jaccard index.
214 | ```
215 |
216 | More information and additional examples can be found on
217 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#Jaccard).
218 |
219 | #### Overlap Coefficient
220 |
221 | Calculate similarity using default options.
222 | ```go
223 | oc := metrics.NewOverlapCoefficient()
224 | similarity := strutil.Similarity("time to make haste", "no time to waste", oc)
225 | fmt.Printf("%.2f\n", similarity) // Output: 0.67
226 | ```
227 |
228 | Customize n-gram size.
229 | ```go
230 | oc := metrics.NewOverlapCoefficient()
231 | oc.CaseSensitive = false
232 | oc.NgramSize = 3
233 |
234 | similarity := strutil.Similarity("Time to make haste", "no time to waste", oc)
235 | fmt.Printf("%.2f\n", similarity) // Output: 0.57
236 | ```
237 |
238 | More information and additional examples can be found on
239 | [pkg.go.dev](https://pkg.go.dev/github.com/adrg/strutil/metrics#OverlapCoefficient).
240 |
241 | ## References
242 |
243 | For more information see:
244 | - [Hamming distance](https://en.wikipedia.org/wiki/Hamming_distance)
245 | - [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance)
246 | - [Jaro-Winkler distance](https://en.wikipedia.org/wiki/Jaro-Winkler_distance)
247 | - [Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith-Waterman_algorithm)
248 | - [Sorensen-Dice coefficient](https://en.wikipedia.org/wiki/Sorensen–Dice_coefficient)
249 | - [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index)
250 | - [Overlap coefficient](https://en.wikipedia.org/wiki/Overlap_coefficient)
251 |
252 | ## Stargazers over time
253 |
254 | [![Stargazers over time](https://starchart.cc/adrg/strutil.svg)](https://starchart.cc/adrg/strutil)
255 |
256 | ## Contributing
257 |
258 | Contributions in the form of pull requests, issues or just general feedback
259 | are always welcome.
260 | See [CONTRIBUTING.MD](CONTRIBUTING.md).
261 |
262 | ## License
263 |
264 | Copyright (c) 2019 Adrian-George Bostan.
265 |
266 | This project is licensed under the [MIT license](https://opensource.org/licenses/MIT).
267 | See [LICENSE](LICENSE) for more details.
268 |
--------------------------------------------------------------------------------
/example_test.go:
--------------------------------------------------------------------------------
1 | package strutil_test
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/adrg/strutil"
7 | "github.com/adrg/strutil/metrics"
8 | )
9 |
10 | func ExampleSimilarity() {
11 | sim := strutil.Similarity("riddle", "needle", metrics.NewJaroWinkler())
12 | fmt.Printf("(riddle, needle) similarity: %.2f\n", sim)
13 |
14 | // Output:
15 | // (riddle, needle) similarity: 0.56
16 | }
17 |
18 | func ExampleCommonPrefix() {
19 | fmt.Println("(answer, anvil):", strutil.CommonPrefix("answer", "anvil"))
20 |
21 | // Output:
22 | // (answer, anvil): an
23 | }
24 |
25 | func ExampleUniqueSlice() {
26 | sample := []string{"a", "b", "a", "b", "b", "c"}
27 | fmt.Println("[a b a b b c]:", strutil.UniqueSlice(sample))
28 |
29 | // Output:
30 | // [a b a b b c]: [a b c]
31 | }
32 |
33 | func ExampleSliceContains() {
34 | terms := []string{"a", "b", "c"}
35 | fmt.Println("([a b c], b):", strutil.SliceContains(terms, "b"))
36 | fmt.Println("([a b c], d):", strutil.SliceContains(terms, "d"))
37 |
38 | // Output:
39 | // ([a b c], b): true
40 | // ([a b c], d): false
41 | }
42 |
43 | func ExampleNgramCount() {
44 | fmt.Println("abbcd n-gram count (size 2):", strutil.NgramCount("abbcd", 2))
45 | fmt.Println("abbcd n-gram count (size 3):", strutil.NgramCount("abbcd", 3))
46 |
47 | // Output:
48 | // abbcd n-gram count (size 2): 4
49 | // abbcd n-gram count (size 3): 3
50 | }
51 |
52 | func ExampleNgrams() {
53 | fmt.Println("abbcd n-grams (size 2):", strutil.Ngrams("abbcd", 2))
54 | fmt.Println("abbcd n-grams (size 3):", strutil.Ngrams("abbcd", 3))
55 |
56 | // Output:
57 | // abbcd n-grams (size 2): [ab bb bc cd]
58 | // abbcd n-grams (size 3): [abb bbc bcd]
59 | }
60 |
61 | func ExampleNgramMap() {
62 | // 2 character n-gram map.
63 | ngrams, total := strutil.NgramMap("abbcabb", 2)
64 | fmt.Printf("abbcabb n-gram map (size 2): %v (%d ngrams)\n", ngrams, total)
65 |
66 | // 3 character n-gram map.
67 | ngrams, total = strutil.NgramMap("abbcabb", 3)
68 | fmt.Printf("abbcabb n-gram map (size 3): %v (%d ngrams)\n", ngrams, total)
69 |
70 | // Output:
71 | // abbcabb n-gram map (size 2): map[ab:2 bb:2 bc:1 ca:1] (6 ngrams)
72 | // abbcabb n-gram map (size 3): map[abb:2 bbc:1 bca:1 cab:1] (5 ngrams)
73 | }
74 |
75 | func ExampleNgramIntersection() {
76 | ngrams, common, totalA, totalB := strutil.NgramIntersection("ababc", "ababd", 2)
77 | fmt.Printf("(ababc, ababd) n-gram intersection: %v (%d/%d n-grams)\n",
78 | ngrams, common, totalA+totalB)
79 |
80 | // Output:
81 | // (ababc, ababd) n-gram intersection: map[ab:2 ba:1] (3/8 n-grams)
82 | }
83 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/adrg/strutil
2 |
3 | go 1.19
4 |
5 | require github.com/stretchr/testify v1.10.0
6 |
7 | require (
8 | github.com/davecgh/go-spew v1.1.1 // indirect
9 | github.com/pmezard/go-difflib v1.0.0 // indirect
10 | gopkg.in/yaml.v3 v3.0.1 // indirect
11 | )
12 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
2 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
3 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
4 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
5 | github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA=
6 | github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
7 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
8 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
9 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
10 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
11 |
--------------------------------------------------------------------------------
/internal/mathutil/math.go:
--------------------------------------------------------------------------------
1 | package mathutil
2 |
// Min reports the smallest of the provided integers.
// It returns 0 when called without arguments.
func Min(args ...int) int {
	if len(args) == 0 {
		return 0
	}

	// Start from the first argument and keep the lowest value seen so far.
	smallest := args[0]
	for i := 1; i < len(args); i++ {
		if args[i] < smallest {
			smallest = args[i]
		}
	}

	return smallest
}
22 |
// Max reports the largest of the provided integers.
// It returns 0 when called without arguments.
func Max(args ...int) int {
	if len(args) == 0 {
		return 0
	}

	// Start from the first argument and keep the highest value seen so far.
	largest := args[0]
	for i := 1; i < len(args); i++ {
		if args[i] > largest {
			largest = args[i]
		}
	}

	return largest
}
42 |
// Minf reports the smallest of the provided floating point values.
// It returns 0 when called without arguments.
func Minf(args ...float64) float64 {
	if len(args) == 0 {
		return 0
	}

	// Start from the first argument and keep the lowest value seen so far.
	smallest := args[0]
	for i := 1; i < len(args); i++ {
		if args[i] < smallest {
			smallest = args[i]
		}
	}

	return smallest
}
62 |
// Maxf reports the largest of the provided floating point values.
// It returns 0 when called without arguments.
func Maxf(args ...float64) float64 {
	if len(args) == 0 {
		return 0
	}

	// Start from the first argument and keep the highest value seen so far.
	largest := args[0]
	for i := 1; i < len(args); i++ {
		if args[i] > largest {
			largest = args[i]
		}
	}

	return largest
}
82 |
--------------------------------------------------------------------------------
/internal/mathutil/mathutil_test.go:
--------------------------------------------------------------------------------
1 | package mathutil_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/adrg/strutil/internal/mathutil"
7 | "github.com/stretchr/testify/require"
8 | )
9 |
10 | func TestMin(t *testing.T) {
11 | requireEqual(t, [][2]interface{}{
12 | {0, mathutil.Min()},
13 | {1, mathutil.Min(1)},
14 | {0, mathutil.Min(0, 1)},
15 | {1, mathutil.Min(1, 1)},
16 | {1, mathutil.Min(2, 1)},
17 | {1, mathutil.Min(1, 2)},
18 | {0, mathutil.Min(2, 1, 0)},
19 | {0, mathutil.Min(0, 1, 2)},
20 | })
21 | }
22 |
23 | func TestMax(t *testing.T) {
24 | requireEqual(t, [][2]interface{}{
25 | {0, mathutil.Max()},
26 | {1, mathutil.Max(1)},
27 | {1, mathutil.Max(0, 1)},
28 | {1, mathutil.Max(1, 1)},
29 | {2, mathutil.Max(2, 1)},
30 | {2, mathutil.Max(1, 2)},
31 | {3, mathutil.Max(2, 1, 3)},
32 | {3, mathutil.Max(3, 1, 2)},
33 | })
34 | }
35 |
36 | func TestMinf(t *testing.T) {
37 | requireEqual(t, [][2]interface{}{
38 | {0.0, mathutil.Minf()},
39 | {1.0, mathutil.Minf(1.0)},
40 | {0.0, mathutil.Minf(0.0, 1.0)},
41 | {1.0, mathutil.Minf(1.0, 1.0)},
42 | {1.0, mathutil.Minf(2.0, 1.0)},
43 | {1.0, mathutil.Minf(1.0, 2.0)},
44 | {0.0, mathutil.Minf(2.0, 1.0, 0.0)},
45 | {0.0, mathutil.Minf(0.0, 1.0, 2.0)},
46 | })
47 | }
48 |
49 | func TestMaxf(t *testing.T) {
50 | requireEqual(t, [][2]interface{}{
51 | {0.0, mathutil.Maxf()},
52 | {1.0, mathutil.Maxf(1.0)},
53 | {1.0, mathutil.Maxf(0.0, 1.0)},
54 | {1.0, mathutil.Maxf(1.0, 1.0)},
55 | {2.0, mathutil.Maxf(2.0, 1.1, 1.0)},
56 | {2.0, mathutil.Maxf(1.1, 1.0, 2.0)},
57 | {3.0, mathutil.Maxf(2.0, 1.0, 3.0)},
58 | {3.0, mathutil.Maxf(3.0, 1.0, 2.0)},
59 | })
60 | }
61 |
62 | func requireEqual(t *testing.T, inputs [][2]interface{}) {
63 | t.Helper()
64 |
65 | for _, input := range inputs {
66 | require.Equal(t, input[0], input[1])
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/internal/ngram/ngram.go:
--------------------------------------------------------------------------------
1 | package ngram
2 |
3 | import "github.com/adrg/strutil/internal/mathutil"
4 |
// Count reports how many n-grams of the given size the provided term
// contains. Sizes less than or equal to 0 are treated as 1. The result is 0
// when the term is shorter than the n-gram size.
func Count(runes []rune, size int) int {
	// Fall back to an n-gram size of 1 if the provided size is invalid.
	if size < 1 {
		size = 1
	}

	if total := len(runes) - size + 1; total > 0 {
		return total
	}

	return 0
}
11 |
// Slice produces the n-grams of the given size for the provided term, in
// the order in which they occur in the term. Sizes less than or equal to 0
// are treated as 1. A nil slice is returned when the term is empty or
// shorter than the n-gram size.
func Slice(runes []rune, size int) []string {
	// Fall back to an n-gram size of 1 if the provided size is invalid.
	if size < 1 {
		size = 1
	}

	// A term of length L has L-size+1 n-grams; none if that is not positive.
	total := len(runes) - size + 1
	if total <= 0 {
		return nil
	}

	// Slide a window of the requested size over the term.
	out := make([]string, 0, total)
	for start := 0; start < total; start++ {
		out = append(out, string(runes[start:start+size]))
	}

	return out
}
37 |
// Map builds a frequency map of the n-grams of the given size found in the
// provided term. The second return value is the total number of n-grams,
// which equals the sum of all the values in the map. Sizes less than or
// equal to 0 are treated as 1. An empty map and a total of 0 are returned
// when the term is empty or shorter than the n-gram size.
func Map(runes []rune, size int) (map[string]int, int) {
	// Fall back to an n-gram size of 1 if the provided size is invalid.
	if size < 1 {
		size = 1
	}

	// A term of length L has L-size+1 n-grams; none if that is not positive.
	total := len(runes) - size + 1
	if total <= 0 {
		return map[string]int{}, 0
	}

	// Slide a window of the requested size over the term, counting each
	// n-gram occurrence.
	freq := make(map[string]int, total)
	for start := 0; start < total; start++ {
		freq[string(runes[start:start+size])]++
	}

	return freq, total
}
66 |
67 | // Intersection returns a map of the n-grams of the specified size found
68 | // in both terms, along with their frequency. The function also returns the
69 | // number of common n-grams (the sum of all the values in the output map),
70 | // the total number of n-grams in the first term and the total number of
71 | // n-grams in the second term. An n-gram size of 1 is used if the provided
72 | // size is less than or equal to 0.
73 | func Intersection(a, b []rune, size int) (map[string]int, int, int, int) {
74 | // Use an n-gram size of 1 if the provided size is invalid.
75 | size = mathutil.Max(size, 1)
76 |
77 | // Compute the n-grams of the first term.
78 | ngramsA, totalA := Map(a, size)
79 |
80 | // Calculate n-gram intersection with the second term.
81 | limit := len(b) - (size - 1)
82 | commonNgrams := make(map[string]int, mathutil.Max(limit, 0))
83 |
84 | var totalB, intersection int
85 | for i := 0; i < limit; i++ {
86 | ngram := string(b[i : i+size])
87 | totalB++
88 |
89 | if count, ok := ngramsA[ngram]; ok && count > 0 {
90 | // Decrease frequency of n-gram found in the first term each time
91 | // a successful match is found.
92 | intersection++
93 | ngramsA[ngram] = count - 1
94 |
95 | // Update common n-grams map with the matched n-gram and its
96 | // frequency.
97 | count = commonNgrams[ngram]
98 | commonNgrams[ngram] = count + 1
99 | }
100 | }
101 |
102 | return commonNgrams, intersection, totalA, totalB
103 | }
104 |
--------------------------------------------------------------------------------
/internal/ngram/ngram_test.go:
--------------------------------------------------------------------------------
1 | package ngram_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/adrg/strutil/internal/ngram"
7 | "github.com/stretchr/testify/require"
8 | )
9 |
10 | func TestNgramCount(t *testing.T) {
11 | requireEqual(t, [][2]interface{}{
12 | {0, ngram.Count(nil, -1)},
13 | {0, ngram.Count(nil, 0)},
14 | {0, ngram.Count(nil, 1)},
15 | {0, ngram.Count([]rune{}, -1)},
16 | {0, ngram.Count([]rune{}, 0)},
17 | {0, ngram.Count([]rune{}, 1)},
18 | {6, ngram.Count([]rune("abbabb"), -1)},
19 | {6, ngram.Count([]rune("abbabb"), 0)},
20 | {6, ngram.Count([]rune("abbabb"), 1)},
21 | {5, ngram.Count([]rune("abbabb"), 2)},
22 | {4, ngram.Count([]rune("abbabb"), 3)},
23 | {3, ngram.Count([]rune("abbabb"), 4)},
24 | {2, ngram.Count([]rune("abbabb"), 5)},
25 | {1, ngram.Count([]rune("abbabb"), 6)},
26 | {0, ngram.Count([]rune("abbabb"), 7)},
27 | {0, ngram.Count([]rune("abbabb"), 8)},
28 | })
29 | }
30 |
31 | func TestNgrams(t *testing.T) {
32 | requireEqual(t, [][2]interface{}{
33 | {0, len(ngram.Slice(nil, -1))},
34 | {0, len(ngram.Slice(nil, 0))},
35 | {0, len(ngram.Slice(nil, 1))},
36 | {0, len(ngram.Slice([]rune{}, -1))},
37 | {0, len(ngram.Slice([]rune{}, 0))},
38 | {0, len(ngram.Slice([]rune{}, 1))},
39 | {
40 | []string{"a", "b", "c", "d", "e", "f"},
41 | ngram.Slice([]rune("abcdef"), -1),
42 | },
43 | {
44 | []string{"a", "b", "c", "d", "e", "f"},
45 | ngram.Slice([]rune("abcdef"), 0),
46 | },
47 | {
48 | []string{"a", "b", "c", "d", "e", "f"},
49 | ngram.Slice([]rune("abcdef"), 1),
50 | },
51 | {
52 | []string{"ab", "bc", "cd", "de", "ef"},
53 | ngram.Slice([]rune("abcdef"), 2),
54 | },
55 | {
56 | []string{"abc", "bcd", "cde", "def"},
57 | ngram.Slice([]rune("abcdef"), 3),
58 | },
59 | {
60 | []string{"abcd", "bcde", "cdef"},
61 | ngram.Slice([]rune("abcdef"), 4),
62 | },
63 | {
64 | []string{"abcde", "bcdef"},
65 | ngram.Slice([]rune("abcdef"), 5),
66 | },
67 | {
68 | []string{"abcdef"},
69 | ngram.Slice([]rune("abcdef"), 6),
70 | },
71 | {
72 | 0,
73 | len(ngram.Slice([]rune("abcdef"), 7)),
74 | },
75 | {
76 | 0,
77 | len(ngram.Slice([]rune("abcdef"), 8)),
78 | },
79 | })
80 | }
81 |
82 | func TestNgramMap(t *testing.T) {
83 | inputs := []*struct {
84 | term []rune
85 | size int
86 | expMap map[string]int
87 | expTotal int
88 | }{
89 | {
90 | term: nil,
91 | size: -1,
92 | expMap: map[string]int{},
93 | },
94 | {
95 | term: nil,
96 | expMap: map[string]int{},
97 | },
98 | {
99 | term: nil,
100 | size: 1,
101 | expMap: map[string]int{},
102 | },
103 | {
104 | term: []rune{},
105 | size: -1,
106 | expMap: map[string]int{},
107 | },
108 | {
109 | term: []rune{},
110 | expMap: map[string]int{},
111 | },
112 | {
113 | term: []rune{},
114 | size: 1,
115 | expMap: map[string]int{},
116 | },
117 | {
118 | term: []rune("abbabb"),
119 | size: -1,
120 | expMap: map[string]int{"a": 2, "b": 4},
121 | expTotal: 6,
122 | },
123 | {
124 | term: []rune("abbabb"),
125 | expMap: map[string]int{"a": 2, "b": 4},
126 | expTotal: 6,
127 | },
128 | {
129 | term: []rune("abbabb"),
130 | size: 1,
131 | expMap: map[string]int{"a": 2, "b": 4},
132 | expTotal: 6,
133 | },
134 | {
135 | term: []rune("abbabb"),
136 | size: 2,
137 | expMap: map[string]int{"ab": 2, "bb": 2, "ba": 1},
138 | expTotal: 5,
139 | },
140 | {
141 | term: []rune("abbabb"),
142 | size: 3,
143 | expMap: map[string]int{"abb": 2, "bba": 1, "bab": 1},
144 | expTotal: 4,
145 | },
146 | {
147 | term: []rune("abbabb"),
148 | size: 4,
149 | expMap: map[string]int{"abba": 1, "bbab": 1, "babb": 1},
150 | expTotal: 3,
151 | },
152 | {
153 | term: []rune("abbabb"),
154 | size: 5,
155 | expMap: map[string]int{"abbab": 1, "bbabb": 1},
156 | expTotal: 2,
157 | },
158 | {
159 | term: []rune("abbabb"),
160 | size: 6,
161 | expMap: map[string]int{"abbabb": 1},
162 | expTotal: 1,
163 | },
164 | {
165 | term: []rune("abbabb"),
166 | size: 7,
167 | expMap: map[string]int{},
168 | expTotal: 0,
169 | },
170 | {
171 | term: []rune("abbabb"),
172 | size: 8,
173 | expMap: map[string]int{},
174 | expTotal: 0,
175 | },
176 | }
177 |
178 | for _, input := range inputs {
179 | actMap, actTotal := ngram.Map(input.term, input.size)
180 | require.Equal(t, input.expMap, actMap)
181 | require.Equal(t, input.expTotal, actTotal)
182 | }
183 | }
184 |
185 | func TestNgramIntersection(t *testing.T) {
186 | inputs := []*struct {
187 | a []rune
188 | b []rune
189 | size int
190 |
191 | expMap map[string]int
192 | expTotal int
193 | expTotalA int
194 | expTotalB int
195 | }{
196 | {
197 | size: 1,
198 | expMap: map[string]int{},
199 | },
200 | {
201 | a: []rune{},
202 | size: 1,
203 | expMap: map[string]int{},
204 | },
205 | {
206 | b: []rune{},
207 | size: 1,
208 | expMap: map[string]int{},
209 | },
210 | {
211 | a: []rune{},
212 | b: []rune{},
213 | size: 1,
214 | expMap: map[string]int{},
215 | },
216 | {
217 | a: []rune("ababbaa"),
218 | b: []rune("aabbaa"),
219 | size: -1,
220 | expMap: map[string]int{"a": 4, "b": 2},
221 | expTotal: 6,
222 | expTotalA: 7,
223 | expTotalB: 6,
224 | },
225 | {
226 | a: []rune("aabbaa"),
227 | b: []rune("ababbaa"),
228 | expMap: map[string]int{"a": 4, "b": 2},
229 | expTotal: 6,
230 | expTotalA: 6,
231 | expTotalB: 7,
232 | },
233 | {
234 | a: []rune("ababbaa"),
235 | b: []rune("aabbaa"),
236 | size: 1,
237 | expMap: map[string]int{"a": 4, "b": 2},
238 | expTotal: 6,
239 | expTotalA: 7,
240 | expTotalB: 6,
241 | },
242 | {
243 | a: []rune("aabbaa"),
244 | b: []rune("ababbaa"),
245 | size: 2,
246 | expMap: map[string]int{"aa": 1, "ab": 1, "ba": 1, "bb": 1},
247 | expTotal: 4,
248 | expTotalA: 5,
249 | expTotalB: 6,
250 | },
251 | {
252 | a: []rune("ababbaa"),
253 | b: []rune("aabbaa"),
254 | size: 3,
255 | expMap: map[string]int{"abb": 1, "bba": 1, "baa": 1},
256 | expTotal: 3,
257 | expTotalA: 5,
258 | expTotalB: 4,
259 | },
260 | {
261 | a: []rune("aabbaa"),
262 | b: []rune("ababbaa"),
263 | size: 4,
264 | expMap: map[string]int{"abba": 1, "bbaa": 1},
265 | expTotal: 2,
266 | expTotalA: 3,
267 | expTotalB: 4,
268 | },
269 | {
270 | a: []rune("ababbaa"),
271 | b: []rune("aabbaa"),
272 | size: 5,
273 | expMap: map[string]int{"abbaa": 1},
274 | expTotal: 1,
275 | expTotalA: 3,
276 | expTotalB: 2,
277 | },
278 | {
279 | a: []rune("aabbaa"),
280 | b: []rune("ababbaa"),
281 | size: 6,
282 | expMap: map[string]int{},
283 | expTotalA: 1,
284 | expTotalB: 2,
285 | },
286 | {
287 | a: []rune("ababbaa"),
288 | b: []rune("aabbaa"),
289 | size: 7,
290 | expMap: map[string]int{},
291 | expTotalA: 1,
292 | },
293 | {
294 | a: []rune("aabbaa"),
295 | b: []rune("ababbaa"),
296 | size: 7,
297 | expMap: map[string]int{},
298 | expTotalB: 1,
299 | },
300 | {
301 | a: []rune("ababbaa"),
302 | b: []rune("aabbaa"),
303 | size: 8,
304 | expMap: map[string]int{},
305 | },
306 | {
307 | a: []rune("aabbaa"),
308 | b: []rune("ababbaa"),
309 | size: 8,
310 | expMap: map[string]int{},
311 | },
312 | {
313 | a: []rune("ababbaa"),
314 | b: []rune("aabbaa"),
315 | size: 9,
316 | expMap: map[string]int{},
317 | },
318 | {
319 | a: []rune("aabbaa"),
320 | b: []rune("ababbaa"),
321 | size: 9,
322 | expMap: map[string]int{},
323 | },
324 | }
325 |
326 | for _, input := range inputs {
327 | actMap, actTotal, actTotalA, actTotalB := ngram.Intersection(input.a, input.b, input.size)
328 | require.Equal(t, input.expMap, actMap)
329 | require.Equal(t, input.expTotal, actTotal)
330 | require.Equal(t, input.expTotalA, actTotalA)
331 | require.Equal(t, input.expTotalB, actTotalB)
332 | }
333 | }
334 |
335 | func requireEqual(t *testing.T, inputs [][2]interface{}) {
336 | t.Helper()
337 |
338 | for _, input := range inputs {
339 | require.Equal(t, input[0], input[1])
340 | }
341 | }
342 |
--------------------------------------------------------------------------------
/internal/stringutil/stringutil.go:
--------------------------------------------------------------------------------
1 | package stringutil
2 |
// CommonPrefix reports the longest prefix shared by the specified strings.
// The comparison is done rune by rune, so multi-byte characters are never
// split. An empty string is returned if the strings share no prefix.
func CommonPrefix(first, second string) string {
	// Only the runes of the shorter string can be part of the prefix.
	shorter, longer := []rune(first), []rune(second)
	if len(shorter) > len(longer) {
		shorter, longer = longer, shorter
	}

	// Advance while the runes at the same position match.
	n := 0
	for n < len(shorter) && shorter[n] == longer[n] {
		n++
	}

	return string(shorter[:n])
}
22 |
// UniqueSlice filters the duplicates out of the specified string slice. The
// first occurrence of each item is kept, so the items of the output slice
// are in the order in which they occur in the input slice. A nil slice is
// returned if the input contains no items.
func UniqueSlice(items []string) []string {
	var (
		result []string
		seen   = make(map[string]struct{})
	)

	for _, item := range items {
		if _, dup := seen[item]; !dup {
			seen[item] = struct{}{}
			result = append(result, item)
		}
	}

	return result
}
41 |
// SliceContains reports whether q is one of the items of terms.
func SliceContains(terms []string, q string) bool {
	for i := range terms {
		if terms[i] == q {
			return true
		}
	}

	return false
}
52 |
--------------------------------------------------------------------------------
/internal/stringutil/stringutil_test.go:
--------------------------------------------------------------------------------
1 | package stringutil_test
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/adrg/strutil/internal/stringutil"
7 | "github.com/stretchr/testify/require"
8 | )
9 |
10 | func TestCommonPrefix(t *testing.T) {
11 | requireEqual(t, [][2]interface{}{
12 | {"", stringutil.CommonPrefix("", "")},
13 | {"", stringutil.CommonPrefix("a", "")},
14 | {"", stringutil.CommonPrefix("", "b")},
15 | {"", stringutil.CommonPrefix("a", "b")},
16 | {"a", stringutil.CommonPrefix("ab", "aab")},
17 | {"a", stringutil.CommonPrefix("aab", "ab")},
18 | {"aa", stringutil.CommonPrefix("aab", "aaab")},
19 | {"aa", stringutil.CommonPrefix("aaab", "aab")},
20 | {"忧郁的乌龟", stringutil.CommonPrefix("忧郁的乌龟", "忧郁的乌龟")},
21 | {"忧郁的", stringutil.CommonPrefix("忧郁的", "忧郁的乌龟")},
22 | {"忧郁的", stringutil.CommonPrefix("忧郁的乌龟", "忧郁的")},
23 | {"", stringutil.CommonPrefix("忧郁的乌龟", "郁的乌龟")},
24 | {"", stringutil.CommonPrefix("郁的乌龟", "忧郁的乌龟")},
25 | {"\u2019", stringutil.CommonPrefix("\u2019a", "\u2019b")},
26 | {"a\u2019bc", stringutil.CommonPrefix("a\u2019bcd", "a\u2019bce")},
27 | {"abc", stringutil.CommonPrefix("abc\u2019d", "abc\u2020d")},
28 | })
29 | }
30 |
31 | func TestUniqueSlice(t *testing.T) {
32 | requireEqual(t, [][2]interface{}{
33 | {0, len(stringutil.UniqueSlice(nil))},
34 | {0, len(stringutil.UniqueSlice([]string{}))},
35 | {[]string{"a"}, stringutil.UniqueSlice([]string{"a"})},
36 | {[]string{"a", "b"}, stringutil.UniqueSlice([]string{"a", "b"})},
37 | {[]string{"b", "a"}, stringutil.UniqueSlice([]string{"b", "a"})},
38 | {[]string{"a"}, stringutil.UniqueSlice([]string{"a", "a"})},
39 | {[]string{"b", "a"}, stringutil.UniqueSlice([]string{"b", "a", "a"})},
40 | {[]string{"a", "b"}, stringutil.UniqueSlice([]string{"a", "a", "b"})},
41 | {[]string{"a", "b"}, stringutil.UniqueSlice([]string{"a", "a", "a", "b"})},
42 | {[]string{"b", "a"}, stringutil.UniqueSlice([]string{"b", "a", "a", "a"})},
43 | {[]string{"a", "b"}, stringutil.UniqueSlice([]string{"a", "b", "b", "a"})},
44 | {[]string{"a", "b"}, stringutil.UniqueSlice([]string{"a", "b", "a", "b"})},
45 | })
46 | }
47 |
48 | func TestSliceContains(t *testing.T) {
49 | requireEqual(t, [][2]interface{}{
50 | {false, stringutil.SliceContains(nil, "")},
51 | {false, stringutil.SliceContains(nil, "a")},
52 | {false, stringutil.SliceContains([]string{}, "")},
53 | {false, stringutil.SliceContains([]string{}, "a")},
54 | {true, stringutil.SliceContains([]string{"a", "b"}, "a")},
55 | {true, stringutil.SliceContains([]string{"b", "a"}, "a")},
56 | {false, stringutil.SliceContains([]string{"b", "a"}, "c")},
57 | })
58 | }
59 |
60 | func requireEqual(t *testing.T, inputs [][2]interface{}) {
61 | t.Helper()
62 |
63 | for _, input := range inputs {
64 | require.Equal(t, input[0], input[1])
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/metrics/examples_test.go:
--------------------------------------------------------------------------------
1 | package metrics_test
2 |
3 | import (
4 | "fmt"
5 |
6 | "github.com/adrg/strutil/metrics"
7 | )
8 |
9 | func ExampleHamming() {
10 | // Default options.
11 | h := metrics.NewHamming()
12 |
13 | sim := h.Compare("text", "test")
14 | fmt.Printf("(text, test) similarity: %.2f\n", sim)
15 |
16 | dist := h.Distance("text", "test")
17 | fmt.Printf("(text, test) distance: %d\n", dist)
18 |
19 | // Custom options.
20 | h.CaseSensitive = false
21 |
22 | sim = h.Compare("ONE", "once")
23 | fmt.Printf("(ONE, once) similarity: %.2f\n", sim)
24 |
25 | dist = h.Distance("one", "once")
26 | fmt.Printf("(ONE, once) distance: %d\n", dist)
27 |
28 | // Output:
29 | // (text, test) similarity: 0.75
30 | // (text, test) distance: 1
31 | // (ONE, once) similarity: 0.50
32 | // (ONE, once) distance: 2
33 | }
34 |
35 | func ExampleLevenshtein() {
36 | // Default options.
37 | lev := metrics.NewLevenshtein()
38 |
39 | sim := lev.Compare("book", "brick")
40 | fmt.Printf("(book, brick) similarity: %.2f\n", sim)
41 |
42 | dist := lev.Distance("book", "brick")
43 | fmt.Printf("(book, brick) distance: %d\n", dist)
44 |
45 | // Custom options.
46 | lev.CaseSensitive = false
47 | lev.ReplaceCost = 2
48 |
49 | sim = lev.Compare("HELLO", "jello")
50 | fmt.Printf("(HELLO, jello) similarity: %.2f\n", sim)
51 |
52 | dist = lev.Distance("HELLO", "jello")
53 | fmt.Printf("(HELLO, jello) distance: %d\n", dist)
54 |
55 | // Output:
56 | // (book, brick) similarity: 0.40
57 | // (book, brick) distance: 3
58 | // (HELLO, jello) similarity: 0.60
59 | // (HELLO, jello) distance: 2
60 | }
61 |
62 | func ExampleJaro() {
63 | jaro := metrics.NewJaro()
64 | sim := jaro.Compare("sort", "shirt")
65 | fmt.Printf("(sort, shirt) similarity: %.2f\n", sim)
66 |
67 | // Output:
68 | // (sort, shirt) similarity: 0.78
69 | }
70 |
71 | func ExampleJaroWinkler() {
72 | jw := metrics.NewJaroWinkler()
73 | sim := jw.Compare("sort", "shirt")
74 | fmt.Printf("(sort, shirt) similarity: %.2f\n", sim)
75 |
76 | // Output:
77 | // (sort, shirt) similarity: 0.80
78 | }
79 |
80 | func ExampleSmithWatermanGotoh() {
81 | // Default options.
82 | swg := metrics.NewSmithWatermanGotoh()
83 |
84 | sim := swg.Compare("a pink kitten", "a kitten")
85 | fmt.Printf("(a pink kitten, a kitten) similarity: %.2f\n", sim)
86 |
87 | // Custom options.
88 | swg.CaseSensitive = false
89 | swg.GapPenalty = -0.1
90 | swg.Substitution = metrics.MatchMismatch{
91 | Match: 1,
92 | Mismatch: -0.5,
93 | }
94 |
95 | sim = swg.Compare("a pink kitten", "A KITTEN")
96 | fmt.Printf("(a pink kitten, A KITTEN) similarity: %.2f\n", sim)
97 |
98 | // Output:
99 | // (a pink kitten, a kitten) similarity: 0.88
100 | // (a pink kitten, A KITTEN) similarity: 0.94
101 | }
102 |
103 | func ExampleSorensenDice() {
104 | // Default options.
105 | sd := metrics.NewSorensenDice()
106 | sim := sd.Compare("night", "alright")
107 | fmt.Printf("(night, alright) similarity: %.2f\n", sim)
108 |
109 | // Custom options.
110 | sd.CaseSensitive = false
111 | sd.NgramSize = 3
112 |
113 | sim = sd.Compare("night", "alright")
114 | fmt.Printf("(night, alright) similarity: %.2f\n", sim)
115 |
116 | // Output:
117 | // (night, alright) similarity: 0.60
118 | // (night, alright) similarity: 0.50
119 | }
120 |
121 | func ExampleJaccard() {
122 | // Default options.
123 | j := metrics.NewJaccard()
124 | sim := j.Compare("night", "alright")
125 | fmt.Printf("(night, alright) similarity: %.2f\n", sim)
126 |
127 | // Custom options.
128 | j.CaseSensitive = false
129 | j.NgramSize = 3
130 |
131 | sim = j.Compare("night", "alright")
132 | fmt.Printf("(night, alright) similarity: %.2f\n", sim)
133 |
134 | // Output:
135 | // (night, alright) similarity: 0.43
136 | // (night, alright) similarity: 0.33
137 | }
138 |
139 | func ExampleOverlapCoefficient() {
140 | // Default options.
141 | oc := metrics.NewOverlapCoefficient()
142 | sim := oc.Compare("night", "alright")
143 | fmt.Printf("(night, alright) similarity: %.2f\n", sim)
144 |
145 | // Subset comparison.
146 | sim = oc.Compare("aa", "aaaa")
147 | fmt.Printf("(aa, aaaa) similarity: %.2f\n", sim)
148 |
149 | // Custom options.
150 | oc.CaseSensitive = false
151 | oc.NgramSize = 3
152 |
153 | sim = oc.Compare("night", "alright")
154 | fmt.Printf("(night, alright) similarity: %.2f\n", sim)
155 |
156 | // Output:
157 | // (night, alright) similarity: 0.75
158 | // (aa, aaaa) similarity: 1.00
159 | // (night, alright) similarity: 0.67
160 | }
161 |
--------------------------------------------------------------------------------
/metrics/hamming.go:
--------------------------------------------------------------------------------
1 | package metrics
2 |
3 | import (
4 | "strings"
5 | )
6 |
// Hamming represents the Hamming metric for measuring the similarity
// between sequences.
// For more information see https://en.wikipedia.org/wiki/Hamming_distance.
type Hamming struct {
	// CaseSensitive specifies if the string comparison is case sensitive.
	CaseSensitive bool
}

// NewHamming returns a new Hamming string metric.
//
// Default options:
//   CaseSensitive: true
func NewHamming() *Hamming {
	return &Hamming{
		CaseSensitive: true,
	}
}

// Compare returns the Hamming similarity of a and b. The returned
// similarity is a number between 0 and 1. Larger similarity numbers indicate
// closer matches. Two empty terms are considered identical and produce a
// similarity of 1.
func (m *Hamming) Compare(a, b string) float64 {
	distance, maxLen := m.distance(a, b)
	if maxLen == 0 {
		// Both terms are empty. Dividing by the zero length would produce
		// NaN instead of a similarity.
		return 1
	}

	return 1 - float64(distance)/float64(maxLen)
}

// Distance returns the Hamming distance between a and b. Lower distances
// indicate closer matches. A distance of 0 means the strings are identical.
func (m *Hamming) Distance(a, b string) int {
	distance, _ := m.distance(a, b)
	return distance
}

// distance returns the Hamming distance between a and b, along with the
// length (in runes) of the longer term. Terms of different lengths are
// supported: the length difference is added to the number of mismatched
// positions. Both returned values are 0 if both terms are empty.
func (m *Hamming) distance(a, b string) (int, int) {
	// Lower terms if case insensitive comparison is specified.
	if !m.CaseSensitive {
		a = strings.ToLower(a)
		b = strings.ToLower(b)
	}
	runesA, runesB := []rune(a), []rune(b)

	// Check if both terms are empty.
	lenA, lenB := len(runesA), len(runesB)
	if lenA == 0 && lenB == 0 {
		return 0, 0
	}

	// If the lengths of the sequences are not equal, the distance is
	// initialized to their absolute difference. Otherwise, it is set to 0.
	// After the swap, lenA is the shorter length and lenB the longer one.
	if lenA > lenB {
		lenA, lenB = lenB, lenA
	}
	distance := lenB - lenA

	// Count the mismatched positions over the length shared by both terms.
	for i := 0; i < lenA; i++ {
		if runesA[i] != runesB[i] {
			distance++
		}
	}

	return distance, lenB
}
70 |
--------------------------------------------------------------------------------
/metrics/jaccard.go:
--------------------------------------------------------------------------------
1 | package metrics
2 |
3 | import (
4 | "strings"
5 |
6 | "github.com/adrg/strutil/internal/ngram"
7 | )
8 |
9 | // Jaccard represents the Jaccard index for measuring the similarity
10 | // between sequences.
11 | // For more information see https://en.wikipedia.org/wiki/Jaccard_index.
12 | type Jaccard struct {
13 | // CaseSensitive specifies if the string comparison is case sensitive.
14 | CaseSensitive bool
15 |
16 | // NgramSize represents the size (in characters) of the tokens generated
17 | // when comparing the input sequences.
18 | NgramSize int
19 | }
20 |
21 | // NewJaccard returns a new Jaccard string metric.
22 | //
23 | // Default options:
24 | // CaseSensitive: true
25 | // NGramSize: 2
26 | func NewJaccard() *Jaccard {
27 | return &Jaccard{
28 | CaseSensitive: true,
29 | NgramSize: 2,
30 | }
31 | }
32 |
33 | // Compare returns the Jaccard similarity coefficient of a and b. The
34 | // returned similarity is a number between 0 and 1. Larger similarity numbers
35 | // indicate closer matches.
36 | // An n-gram size of 2 is used if the provided size is less than or equal to 0.
37 | func (m *Jaccard) Compare(a, b string) float64 {
38 | // Lower terms if case insensitive comparison is specified.
39 | if !m.CaseSensitive {
40 | a = strings.ToLower(a)
41 | b = strings.ToLower(b)
42 | }
43 |
44 | // Check if both terms are empty.
45 | runesA, runesB := []rune(a), []rune(b)
46 | if len(runesA) == 0 && len(runesB) == 0 {
47 | return 1
48 | }
49 |
50 | size := m.NgramSize
51 | if size <= 0 {
52 | size = 2
53 | }
54 |
55 | // Calculate n-gram intersection and union.
56 | _, common, totalA, totalB := ngram.Intersection(runesA, runesB, size)
57 |
58 | total := totalA + totalB
59 | if total == 0 {
60 | return 0
61 | }
62 |
63 | // Return similarity.
64 | return float64(common) / float64(total-common)
65 | }
66 |
--------------------------------------------------------------------------------
/metrics/jaro.go:
--------------------------------------------------------------------------------
1 | package metrics
2 |
3 | import (
4 | "strings"
5 | "unicode/utf8"
6 |
7 | "github.com/adrg/strutil/internal/mathutil"
8 | )
9 |
10 | // Jaro represents the Jaro metric for measuring the similarity
11 | // between sequences.
12 | // For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance.
13 | type Jaro struct {
14 | // CaseSensitive specifies if the string comparison is case sensitive.
15 | CaseSensitive bool
16 | }
17 |
18 | // NewJaro returns a new Jaro string metric.
19 | //
20 | // Default options:
21 | // CaseSensitive: true
22 | func NewJaro() *Jaro {
23 | return &Jaro{
24 | CaseSensitive: true,
25 | }
26 | }
27 |
28 | // Compare returns the Jaro similarity of a and b. The returned similarity is
29 | // a number between 0 and 1. Larger similarity numbers indicate closer matches.
30 | func (m *Jaro) Compare(a, b string) float64 {
31 | // Check if both terms are empty.
32 | lenA, lenB := utf8.RuneCountInString(a), utf8.RuneCountInString(b)
33 | if lenA == 0 && lenB == 0 {
34 | return 1
35 | }
36 |
37 | // Check if one of the terms is empty.
38 | if lenA == 0 || lenB == 0 {
39 | return 0
40 | }
41 |
42 | // Lower terms if case insensitive comparison is specified.
43 | if !m.CaseSensitive {
44 | a = strings.ToLower(a)
45 | b = strings.ToLower(b)
46 | }
47 |
48 | // Get matching runes.
49 | halfLen := mathutil.Max(0, mathutil.Max(lenA, lenB)/2)
50 | mrA := matchingRunes(a, b, halfLen)
51 | mrB := matchingRunes(b, a, halfLen)
52 |
53 | fmLen, smLen := len(mrA), len(mrB)
54 | if fmLen == 0 || smLen == 0 {
55 | return 0.0
56 | }
57 |
58 | // Return similarity.
59 | return (float64(fmLen)/float64(lenA) +
60 | float64(smLen)/float64(lenB) +
61 | float64(fmLen-transpositions(mrA, mrB)/2)/float64(fmLen)) / 3.0
62 | }
63 |
64 | func matchingRunes(a, b string, limit int) []rune {
65 | var (
66 | runesA = []rune(a)
67 | runesB = []rune(b)
68 | runesCommon = []rune{}
69 | lenB = len(runesB)
70 | )
71 |
72 | for i, r := range runesA {
73 | end := mathutil.Min(i+limit+1, lenB)
74 | for j := mathutil.Max(0, i-limit); j < end; j++ {
75 | if r == runesB[j] && runesB[j] != -1 {
76 | runesCommon = append(runesCommon, runesB[j])
77 | runesB[j] = -1
78 | break
79 | }
80 | }
81 | }
82 |
83 | return runesCommon
84 | }
85 |
86 | func transpositions(a, b []rune) int {
87 | var count int
88 |
89 | minLen := mathutil.Min(len(a), len(b))
90 | for i := 0; i < minLen; i++ {
91 | if a[i] != b[i] {
92 | count++
93 | }
94 | }
95 |
96 | return count
97 | }
98 |
--------------------------------------------------------------------------------
/metrics/jaro_winkler.go:
--------------------------------------------------------------------------------
1 | package metrics
2 |
3 | import (
4 | "strings"
5 | "unicode/utf8"
6 |
7 | "github.com/adrg/strutil/internal/stringutil"
8 | )
9 |
10 | // JaroWinkler represents the Jaro-Winkler metric for measuring the similarity
11 | // between sequences.
12 | // For more information see https://en.wikipedia.org/wiki/Jaro-Winkler_distance.
13 | type JaroWinkler struct {
14 | // CaseSensitive specifies if the string comparison is case sensitive.
15 | CaseSensitive bool
16 | }
17 |
18 | // NewJaroWinkler returns a new Jaro-Winkler string metric.
19 | //
20 | // Default options:
21 | // CaseSensitive: true
22 | func NewJaroWinkler() *JaroWinkler {
23 | return &JaroWinkler{
24 | CaseSensitive: true,
25 | }
26 | }
27 |
28 | // Compare returns the Jaro-Winkler similarity of a and b. The returned
29 | // similarity is a number between 0 and 1. Larger similarity numbers indicate
30 | // closer matches.
31 | func (m *JaroWinkler) Compare(a, b string) float64 {
32 | // Lower terms if case insensitive comparison is specified.
33 | if !m.CaseSensitive {
34 | a = strings.ToLower(a)
35 | b = strings.ToLower(b)
36 | }
37 |
38 | // Calculate common prefix.
39 | lenPrefix := utf8.RuneCountInString(stringutil.CommonPrefix(a, b))
40 | if lenPrefix > 4 {
41 | lenPrefix = 4
42 | }
43 |
44 | jaro := NewJaro()
45 | jaro.CaseSensitive = m.CaseSensitive
46 |
47 | // Return similarity.
48 | similarity := jaro.Compare(a, b)
49 | return similarity + (0.1 * float64(lenPrefix) * (1.0 - similarity))
50 | }
51 |
--------------------------------------------------------------------------------
/metrics/levenshtein.go:
--------------------------------------------------------------------------------
1 | package metrics
2 |
3 | import (
4 | "strings"
5 |
6 | "github.com/adrg/strutil/internal/mathutil"
7 | )
8 |
9 | // Levenshtein represents the Levenshtein metric for measuring the similarity
10 | // between sequences.
11 | // For more information see https://en.wikipedia.org/wiki/Levenshtein_distance.
12 | type Levenshtein struct {
13 | // CaseSensitive specifies if the string comparison is case sensitive.
14 | CaseSensitive bool
15 |
16 | // InsertCost represents the Levenshtein cost of a character insertion.
17 | InsertCost int
18 |
19 | // InsertCost represents the Levenshtein cost of a character deletion.
20 | DeleteCost int
21 |
22 | // InsertCost represents the Levenshtein cost of a character substitution.
23 | ReplaceCost int
24 | }
25 |
26 | // NewLevenshtein returns a new Levenshtein string metric.
27 | //
28 | // Default options:
29 | // CaseSensitive: true
30 | // InsertCost: 1
31 | // DeleteCost: 1
32 | // ReplaceCost: 1
33 | func NewLevenshtein() *Levenshtein {
34 | return &Levenshtein{
35 | CaseSensitive: true,
36 | InsertCost: 1,
37 | DeleteCost: 1,
38 | ReplaceCost: 1,
39 | }
40 | }
41 |
42 | // Compare returns the Levenshtein similarity of a and b. The returned
43 | // similarity is a number between 0 and 1. Larger similarity numbers indicate
44 | // closer matches.
45 | func (m *Levenshtein) Compare(a, b string) float64 {
46 | distance, maxLen := m.distance(a, b)
47 | return 1 - float64(distance)/float64(maxLen)
48 | }
49 |
50 | // Distance returns the Levenshtein distance between a and b. Lower distances
51 | // indicate closer matches. A distance of 0 means the strings are identical.
52 | func (m *Levenshtein) Distance(a, b string) int {
53 | distance, _ := m.distance(a, b)
54 | return distance
55 | }
56 |
57 | func (m *Levenshtein) distance(a, b string) (int, int) {
58 | // Lower terms if case insensitive comparison is specified.
59 | if !m.CaseSensitive {
60 | a = strings.ToLower(a)
61 | b = strings.ToLower(b)
62 | }
63 | runesA, runesB := []rune(a), []rune(b)
64 |
65 | // Check if both terms are empty.
66 | lenA, lenB := len(runesA), len(runesB)
67 | if lenA == 0 && lenB == 0 {
68 | return 0, 0
69 | }
70 |
71 | // Check if one of the terms is empty.
72 | maxLen := mathutil.Max(lenA, lenB)
73 | if lenA == 0 {
74 | return m.InsertCost * lenB, maxLen
75 | }
76 | if lenB == 0 {
77 | return m.DeleteCost * lenA, maxLen
78 | }
79 |
80 | // Initialize cost slice.
81 | prevCol := make([]int, lenB+1)
82 | for i := 0; i <= lenB; i++ {
83 | prevCol[i] = i
84 | }
85 |
86 | // Calculate distance.
87 | col := make([]int, lenB+1)
88 | for i := 0; i < lenA; i++ {
89 | col[0] = i + 1
90 | for j := 0; j < lenB; j++ {
91 | delCost := prevCol[j+1] + m.DeleteCost
92 | insCost := col[j] + m.InsertCost
93 |
94 | subCost := prevCol[j]
95 | if runesA[i] != runesB[j] {
96 | subCost += m.ReplaceCost
97 | }
98 |
99 | col[j+1] = mathutil.Min(delCost, insCost, subCost)
100 | }
101 |
102 | col, prevCol = prevCol, col
103 | }
104 |
105 | return prevCol[lenB], maxLen
106 | }
107 |
--------------------------------------------------------------------------------
/metrics/match_mismatch.go:
--------------------------------------------------------------------------------
1 | package metrics
2 |
// MatchMismatch represents a substitution function which returns the match or
// mismatch value depending on the equality of the compared characters. The
// match value must be greater than the mismatch value.
type MatchMismatch struct {
	// Match represents the score of equal character substitutions.
	Match float64

	// Mismatch represents the score of unequal character substitutions.
	Mismatch float64
}

// Compare returns the match value if a[idxA] is equal to b[idxB] or the
// mismatch value otherwise. Both indices must be valid for their slices.
func (m MatchMismatch) Compare(a []rune, idxA int, b []rune, idxB int) float64 {
	if a[idxA] != b[idxB] {
		return m.Mismatch
	}

	return m.Match
}

// Max returns the match value.
func (m MatchMismatch) Max() float64 {
	highest := m.Match
	return highest
}

// Min returns the mismatch value.
func (m MatchMismatch) Min() float64 {
	lowest := m.Mismatch
	return lowest
}
33 |
--------------------------------------------------------------------------------
/metrics/metrics_test.go:
--------------------------------------------------------------------------------
1 | package metrics_test
2 |
3 | import (
4 | "fmt"
5 | "testing"
6 |
7 | "github.com/adrg/strutil/metrics"
8 | "github.com/stretchr/testify/require"
9 | )
10 |
// sf formats a using two decimal places, so that similarity values can be
// compared as strings without being affected by floating point noise.
func sf(a float64) string {
	const twoDecimals = "%.2f"

	return fmt.Sprintf(twoDecimals, a)
}
14 |
15 | func TestHamming(t *testing.T) {
16 | h := metrics.NewHamming()
17 | require.Equal(t, 0, h.Distance("", ""))
18 | require.Equal(t, "0.75", sf(h.Compare("text", "test")))
19 | require.Equal(t, "0.50", sf(h.Compare("once", "one")))
20 | require.Equal(t, "1.00", sf(h.Compare("ab\u2019c", "ab\u2019c")))
21 | require.Equal(t, "0.75", sf(h.Compare("ab\u2019d", "ab\u2019c")))
22 | require.Equal(t, "0.75", sf(h.Compare("ab\u2018c", "ab\u2019c")))
23 | h.CaseSensitive = false
24 | require.Equal(t, "0.50", sf(h.Compare("one", "ONCE")))
25 | }
26 |
27 | func TestJaccard(t *testing.T) {
28 | j := metrics.NewJaccard()
29 | require.Equal(t, "1.00", sf(j.Compare("", "")))
30 | require.Equal(t, "0.00", sf(j.Compare("a", "b")))
31 | require.Equal(t, "1.00", sf(j.Compare("ab\u2019c", "ab\u2019c")))
32 | require.Equal(t, "0.50", sf(j.Compare("ab\u2019d", "ab\u2019c")))
33 | require.Equal(t, "0.20", sf(j.Compare("ab\u2018c", "ab\u2019c")))
34 | require.Equal(t, "0.43", sf(j.Compare("night", "alright")))
35 | j.NgramSize = 0
36 | require.Equal(t, "0.43", sf(j.Compare("night", "alright")))
37 | j.CaseSensitive = false
38 | j.NgramSize = 3
39 | require.Equal(t, "0.33", sf(j.Compare("NIGHT", "alright")))
40 | }
41 |
42 | func TestJaro(t *testing.T) {
43 | j := metrics.NewJaro()
44 | require.Equal(t, "1.00", sf(j.Compare("", "")))
45 | require.Equal(t, "0.00", sf(j.Compare("test", "")))
46 | require.Equal(t, "1.00", sf(j.Compare("ab\u2019c", "ab\u2019c")))
47 | require.Equal(t, "0.83", sf(j.Compare("ab\u2019d", "ab\u2019c")))
48 | require.Equal(t, "0.83", sf(j.Compare("ab\u2018c", "ab\u2019c")))
49 | require.Equal(t, "0.00", sf(j.Compare("a", "b")))
50 | require.Equal(t, "0.78", sf(j.Compare("sort", "shirt")))
51 | require.Equal(t, "0.64", sf(j.Compare("sort", "report")))
52 | j.CaseSensitive = false
53 | require.Equal(t, "0.78", sf(j.Compare("sort", "SHIRT")))
54 | }
55 |
56 | func TestJaroWinkler(t *testing.T) {
57 | j := metrics.NewJaroWinkler()
58 | require.Equal(t, "1.00", sf(j.Compare("", "")))
59 | require.Equal(t, "0.00", sf(j.Compare("test", "")))
60 | require.Equal(t, "1.00", sf(j.Compare("ab\u2019c", "ab\u2019c")))
61 | require.Equal(t, "0.88", sf(j.Compare("ab\u2019d", "ab\u2019c")))
62 | require.Equal(t, "0.87", sf(j.Compare("ab\u2018c", "ab\u2019c")))
63 | require.Equal(t, "0.80", sf(j.Compare("sort", "shirt")))
64 | require.Equal(t, "0.94", sf(j.Compare("charm", "charmed")))
65 | j.CaseSensitive = false
66 | require.Equal(t, "0.80", sf(j.Compare("sort", "SHIRT")))
67 | }
68 |
69 | func TestLevenshtein(t *testing.T) {
70 | l := metrics.NewLevenshtein()
71 | require.Equal(t, 0, l.Distance("", ""))
72 | require.Equal(t, 4, l.Distance("test", ""))
73 | require.Equal(t, 4, l.Distance("", "test"))
74 | require.Equal(t, 0, l.Distance("ab\u2019c", "ab\u2019c"))
75 | require.Equal(t, 1, l.Distance("ab\u2019d", "ab\u2019c"))
76 | require.Equal(t, 1, l.Distance("ab\u2018c", "ab\u2019c"))
77 | require.Equal(t, "0.40", sf(l.Compare("book", "brick")))
78 | require.Equal(t, "0.75", sf(l.Compare("ab\u2019d", "ab\u2019c")))
79 | require.Equal(t, "0.75", sf(l.Compare("ab\u2018c", "ab\u2019c")))
80 | l.CaseSensitive = false
81 | require.Equal(t, "0.80", sf(l.Compare("hello", "jello")))
82 | l.ReplaceCost = 2
83 | require.Equal(t, "0.60", sf(l.Compare("hello", "JELLO")))
84 | require.Equal(t, "1.00", sf(l.Compare("ab\u2019c", "ab\u2019c")))
85 | require.Equal(t, "0.50", sf(l.Compare("ab\u2019d", "ab\u2019c")))
86 | require.Equal(t, "0.50", sf(l.Compare("ab\u2018c", "ab\u2019c")))
87 | }
88 |
89 | func TestOperlapCoefficient(t *testing.T) {
90 | o := metrics.NewOverlapCoefficient()
91 | require.Equal(t, "1.00", sf(o.Compare("", "")))
92 | require.Equal(t, "0.75", sf(o.Compare("night", "alright")))
93 | require.Equal(t, "0.00", sf(o.Compare("aa", "")))
94 | require.Equal(t, "0.00", sf(o.Compare("bb", "")))
95 | require.Equal(t, "1.00", sf(o.Compare("ab\u2019c", "ab\u2019c")))
96 | require.Equal(t, "0.67", sf(o.Compare("ab\u2019d", "ab\u2019c")))
97 | require.Equal(t, "0.33", sf(o.Compare("ab\u2018c", "ab\u2019c")))
98 | o.NgramSize = 0
99 | require.Equal(t, "0.75", sf(o.Compare("night", "alright")))
100 | require.Equal(t, "1.00", sf(o.Compare("aa", "aaaa")))
101 | o.CaseSensitive = false
102 | require.Equal(t, "1.00", sf(o.Compare("aa", "AAAA")))
103 | o.NgramSize = 3
104 | require.Equal(t, "0.67", sf(o.Compare("night", "alright")))
105 | }
106 |
107 | func TestSmithWatermanGotoh(t *testing.T) {
108 | s := metrics.NewSmithWatermanGotoh()
109 | require.Equal(t, "1.00", sf(s.Compare("", "")))
110 | require.Equal(t, "0.00", sf(s.Compare("test", "")))
111 | require.Equal(t, "0.00", sf(s.Compare("", "test")))
112 | require.Equal(t, "0.88", sf(s.Compare("a pink kitten", "a kitten")))
113 | require.Equal(t, "1.00", sf(s.Compare("ab\u2019c", "ab\u2019c")))
114 | require.Equal(t, "0.75", sf(s.Compare("ab\u2019d", "ab\u2019c")))
115 | require.Equal(t, "0.50", sf(s.Compare("ab\u2018c", "ab\u2019c")))
116 | s.Substitution = nil
117 | require.Equal(t, "0.88", sf(s.Compare("a pink kitten", "a kitten")))
118 | s.CaseSensitive = false
119 | s.GapPenalty = -0.1
120 | s.Substitution = metrics.MatchMismatch{
121 | Match: 1,
122 | Mismatch: -0.5,
123 | }
124 | require.Equal(t, "0.94", sf(s.Compare("a pink kitten", "A KITTEN")))
125 | }
126 |
127 | func TestSorensenDice(t *testing.T) {
128 | s := metrics.NewSorensenDice()
129 | require.Equal(t, "1.00", sf(s.Compare("", "")))
130 | require.Equal(t, "0.00", sf(s.Compare("a", "b")))
131 | require.Equal(t, "0.60", sf(s.Compare("night", "alright")))
132 | require.Equal(t, "1.00", sf(s.Compare("ab\u2019c", "ab\u2019c")))
133 | require.Equal(t, "0.67", sf(s.Compare("ab\u2019d", "ab\u2019c")))
134 | require.Equal(t, "0.33", sf(s.Compare("ab\u2018c", "ab\u2019c")))
135 | s.NgramSize = 0
136 | require.Equal(t, "0.60", sf(s.Compare("night", "alright")))
137 | s.CaseSensitive = false
138 | require.Equal(t, "0.60", sf(s.Compare("night", "ALRIGHT")))
139 | s.NgramSize = 3
140 | require.Equal(t, "0.50", sf(s.Compare("night", "alright")))
141 | }
142 |
143 | func TestMatchMismatch(t *testing.T) {
144 | m := metrics.MatchMismatch{
145 | Match: 2,
146 | Mismatch: 1,
147 | }
148 | require.Equal(t, "1.00", sf(m.Compare([]rune{'a'}, 0, []rune{'b'}, 0)))
149 | require.Equal(t, "2.00", sf(m.Compare([]rune{'a'}, 0, []rune{'a'}, 0)))
150 | require.Equal(t, "1.00", sf(m.Min()))
151 | require.Equal(t, "2.00", sf(m.Max()))
152 | }
153 |
--------------------------------------------------------------------------------
/metrics/overlap_coefficient.go:
--------------------------------------------------------------------------------
1 | package metrics
2 |
3 | import (
4 | "strings"
5 |
6 | "github.com/adrg/strutil/internal/mathutil"
7 | "github.com/adrg/strutil/internal/ngram"
8 | )
9 |
10 | // OverlapCoefficient represents the overlap coefficient for measuring the
11 | // similarity between sequences. The metric is also know as the
12 | // Szymkiewicz-Simpson coefficient.
13 | // For more information see https://en.wikipedia.org/wiki/Overlap_coefficient.
14 | type OverlapCoefficient struct {
15 | // CaseSensitive specifies if the string comparison is case sensitive.
16 | CaseSensitive bool
17 |
18 | // NgramSize represents the size (in characters) of the tokens generated
19 | // when comparing the input sequences.
20 | NgramSize int
21 | }
22 |
23 | // NewOverlapCoefficient returns a new overlap coefficient string metric.
24 | //
25 | // Default options:
26 | // CaseSensitive: true
27 | // NGramSize: 2
28 | func NewOverlapCoefficient() *OverlapCoefficient {
29 | return &OverlapCoefficient{
30 | CaseSensitive: true,
31 | NgramSize: 2,
32 | }
33 | }
34 |
35 | // Compare returns the OverlapCoefficient similarity coefficient of a and b.
36 | // The returned similarity is a number between 0 and 1. Larger similarity
37 | // numbers indicate closer matches.
38 | // An n-gram size of 2 is used if the provided size is less than or equal to 0.
39 | func (m *OverlapCoefficient) Compare(a, b string) float64 {
40 | // Lower terms if case insensitive comparison is specified.
41 | if !m.CaseSensitive {
42 | a = strings.ToLower(a)
43 | b = strings.ToLower(b)
44 | }
45 |
46 | // Check if both terms are empty.
47 | runesA, runesB := []rune(a), []rune(b)
48 | if len(runesA) == 0 && len(runesB) == 0 {
49 | return 1
50 | }
51 |
52 | size := m.NgramSize
53 | if size <= 0 {
54 | size = 2
55 | }
56 |
57 | // Calculate n-gram intersection and minimum subset.
58 | _, common, totalA, totalB := ngram.Intersection(runesA, runesB, size)
59 |
60 | min := mathutil.Min(totalA, totalB)
61 | if min == 0 {
62 | return 0
63 | }
64 |
65 | // Return similarity.
66 | return float64(common) / float64(min)
67 | }
68 |
--------------------------------------------------------------------------------
/metrics/smith_waterman_gotoh.go:
--------------------------------------------------------------------------------
1 | package metrics
2 |
3 | import (
4 | "strings"
5 |
6 | "github.com/adrg/strutil/internal/mathutil"
7 | )
8 |
9 | // SmithWatermanGotoh represents the Smith-Waterman-Gotoh metric for measuring
10 | // the similarity between sequences.
11 | // For more information see https://en.wikipedia.org/wiki/Smith-Waterman_algorithm.
12 | type SmithWatermanGotoh struct {
13 | // CaseSensitive specifies if the string comparison is case sensitive.
14 | CaseSensitive bool
15 |
16 | // GapPenalty defines a score penalty for character insertions or deletions.
17 | // For relevant results, the gap penalty should be a non-positive number.
18 | GapPenalty float64
19 |
20 | // Substitution represents a substitution function which is used to
21 | // calculate a score for character substitutions.
22 | Substitution Substitution
23 | }
24 |
25 | // NewSmithWatermanGotoh returns a new Smith-Waterman-Gotoh string metric.
26 | //
27 | // Default options:
28 | // CaseSensitive: true
29 | // GapPenalty: -0.5
30 | // Substitution: MatchMismatch{
31 | // Match: 1,
32 | // Mismatch: -2,
33 | // },
34 | func NewSmithWatermanGotoh() *SmithWatermanGotoh {
35 | return &SmithWatermanGotoh{
36 | CaseSensitive: true,
37 | GapPenalty: -0.5,
38 | Substitution: MatchMismatch{
39 | Match: 1,
40 | Mismatch: -2,
41 | },
42 | }
43 | }
44 |
45 | // Compare returns the Smith-Waterman-Gotoh similarity of a and b. The returned
46 | // similarity is a number between 0 and 1. Larger similarity numbers indicate
47 | // closer matches.
48 | func (m *SmithWatermanGotoh) Compare(a, b string) float64 {
49 | gap := m.GapPenalty
50 |
51 | // Lower terms if case insensitive comparison is specified.
52 | if !m.CaseSensitive {
53 | a = strings.ToLower(a)
54 | b = strings.ToLower(b)
55 | }
56 | runesA, runesB := []rune(a), []rune(b)
57 |
58 | // Check if both terms are empty.
59 | lenA, lenB := len(runesA), len(runesB)
60 | if lenA == 0 && lenB == 0 {
61 | return 1
62 | }
63 |
64 | // Check if one of the terms is empty.
65 | if lenA == 0 || lenB == 0 {
66 | return 0
67 | }
68 |
69 | // Use default substitution, if none is specified.
70 | subst := m.Substitution
71 | if subst == nil {
72 | subst = MatchMismatch{
73 | Match: 1,
74 | Mismatch: -2,
75 | }
76 | }
77 |
78 | // Calculate max distance.
79 | maxDistance := mathutil.Minf(float64(lenA), float64(lenB)) * mathutil.Maxf(subst.Max(), gap)
80 |
81 | // Calculate distance.
82 | v0 := make([]float64, lenB)
83 | v1 := make([]float64, lenB)
84 |
85 | distance := mathutil.Maxf(0, gap, subst.Compare(runesA, 0, runesB, 0))
86 | v0[0] = distance
87 |
88 | for i := 1; i < lenB; i++ {
89 | v0[i] = mathutil.Maxf(0, v0[i-1]+gap, subst.Compare(runesA, 0, runesB, i))
90 | distance = mathutil.Maxf(distance, v0[i])
91 | }
92 |
93 | for i := 1; i < lenA; i++ {
94 | v1[0] = mathutil.Maxf(0, v0[0]+gap, subst.Compare(runesA, i, runesB, 0))
95 | distance = mathutil.Maxf(distance, v1[0])
96 |
97 | for j := 1; j < lenB; j++ {
98 | v1[j] = mathutil.Maxf(0, v0[j]+gap, v1[j-1]+gap, v0[j-1]+subst.Compare(runesA, i, runesB, j))
99 | distance = mathutil.Maxf(distance, v1[j])
100 | }
101 |
102 | for j := 0; j < lenB; j++ {
103 | v0[j] = v1[j]
104 | }
105 | }
106 |
107 | // Return similarity.
108 | return distance / maxDistance
109 | }
110 |
--------------------------------------------------------------------------------
/metrics/sorensen_dice.go:
--------------------------------------------------------------------------------
1 | package metrics
2 |
3 | import (
4 | "strings"
5 |
6 | "github.com/adrg/strutil/internal/ngram"
7 | )
8 |
// SorensenDice is a string metric which measures the similarity between
// sequences using the Sorensen-Dice coefficient.
// For more information see https://en.wikipedia.org/wiki/Sorensen-Dice_coefficient.
type SorensenDice struct {
	// CaseSensitive controls whether the compared strings are matched
	// case sensitively.
	CaseSensitive bool

	// NgramSize is the size (in characters) of the tokens the input
	// sequences are split into when they are compared.
	NgramSize int
}
20 |
21 | // NewSorensenDice returns a new Sorensen-Dice string metric.
22 | //
23 | // Default options:
24 | // CaseSensitive: true
25 | // NGramSize: 2
26 | func NewSorensenDice() *SorensenDice {
27 | return &SorensenDice{
28 | CaseSensitive: true,
29 | NgramSize: 2,
30 | }
31 | }
32 |
33 | // Compare returns the Sorensen-Dice similarity coefficient of a and b. The
34 | // returned similarity is a number between 0 and 1. Larger similarity numbers
35 | // indicate closer matches.
36 | // An n-gram size of 2 is used if the provided size is less than or equal to 0.
37 | func (m *SorensenDice) Compare(a, b string) float64 {
38 | // Lower terms if case insensitive comparison is specified.
39 | if !m.CaseSensitive {
40 | a = strings.ToLower(a)
41 | b = strings.ToLower(b)
42 | }
43 |
44 | // Check if both terms are empty.
45 | runesA, runesB := []rune(a), []rune(b)
46 | if len(runesA) == 0 && len(runesB) == 0 {
47 | return 1
48 | }
49 |
50 | size := m.NgramSize
51 | if size <= 0 {
52 | size = 2
53 | }
54 |
55 | // Calculate n-gram intersection and union.
56 | _, common, totalA, totalB := ngram.Intersection(runesA, runesB, size)
57 |
58 | total := totalA + totalB
59 | if total == 0 {
60 | return 0
61 | }
62 |
63 | // Return similarity.
64 | return 2 * float64(common) / float64(total)
65 | }
66 |
--------------------------------------------------------------------------------
/metrics/substitution.go:
--------------------------------------------------------------------------------
1 | package metrics
2 |
// Substitution is implemented by substitution functions, which assign a
// score to character substitutions.
type Substitution interface {
	// Compare returns the score of substituting character a[idxA] with
	// character b[idxB].
	Compare(a []rune, idxA int, b []rune, idxB int) float64

	// Max returns the maximum score of a character substitution operation.
	Max() float64

	// Min returns the minimum score of a character substitution operation.
	Min() float64
}
15 |
--------------------------------------------------------------------------------
/strutil.go:
--------------------------------------------------------------------------------
1 | /*
2 | Package strutil provides string metrics for calculating string similarity as
3 | well as other string utility functions. Documentation for all the metrics can
4 | be found at https://pkg.go.dev/github.com/adrg/strutil/metrics.
5 |
6 | Included string metrics:
7 | - Hamming
8 | - Jaro
9 | - Jaro-Winkler
10 | - Levenshtein
11 | - Smith-Waterman-Gotoh
12 | - Sorensen-Dice
13 | - Jaccard
14 | - Overlap coefficient
15 |
16 | */
17 | package strutil
18 |
19 | import (
20 | "github.com/adrg/strutil/internal/ngram"
21 | "github.com/adrg/strutil/internal/stringutil"
22 | )
23 |
// StringMetric is implemented by metrics which measure the similarity
// between two strings. The following string metrics are implemented by the
// metrics package:
//   - Hamming
//   - Jaro
//   - Jaro-Winkler
//   - Levenshtein
//   - Smith-Waterman-Gotoh
//   - Sorensen-Dice
//   - Jaccard
//   - Overlap coefficient
//
// For more information see https://pkg.go.dev/github.com/adrg/strutil/metrics.
type StringMetric interface {
	// Compare returns the similarity of a and b.
	Compare(a, b string) float64
}
39 |
40 | // Similarity returns the similarity of a and b, computed using the specified
41 | // string metric. The returned similarity is a number between 0 and 1. Larger
42 | // similarity numbers indicate closer matches.
43 | func Similarity(a, b string, metric StringMetric) float64 {
44 | return metric.Compare(a, b)
45 | }
46 |
47 | // CommonPrefix returns the common prefix of the specified strings. An empty
48 | // string is returned if the parameters have no prefix in common.
49 | func CommonPrefix(a, b string) string {
50 | return stringutil.CommonPrefix(a, b)
51 | }
52 |
53 | // UniqueSlice returns a slice containing the unique items from the specified
54 | // string slice. The items in the output slice are in the order in which they
55 | // occur in the input slice.
56 | func UniqueSlice(items []string) []string {
57 | return stringutil.UniqueSlice(items)
58 | }
59 |
60 | // SliceContains returns true if terms contains q, or false otherwise.
61 | func SliceContains(terms []string, q string) bool {
62 | return stringutil.SliceContains(terms, q)
63 | }
64 |
65 | // NgramCount returns the n-gram count of the specified size for the
66 | // provided term. An n-gram size of 1 is used if the provided size is
67 | // less than or equal to 0.
68 | func NgramCount(term string, size int) int {
69 | return ngram.Count([]rune(term), size)
70 | }
71 |
72 | // Ngrams returns all the n-grams of the specified size for the provided term.
73 | // The n-grams in the output slice are in the order in which they occur in the
74 | // input term. An n-gram size of 1 is used if the provided size is less than or
75 | // equal to 0.
76 | func Ngrams(term string, size int) []string {
77 | return ngram.Slice([]rune(term), size)
78 | }
79 |
80 | // NgramMap returns a map of all n-grams of the specified size for the provided
81 | // term, along with their frequency. The function also returns the total number
82 | // of n-grams, which is the sum of all the values in the output map.
83 | // An n-gram size of 1 is used if the provided size is less than or equal to 0.
84 | func NgramMap(term string, size int) (map[string]int, int) {
85 | return ngram.Map([]rune(term), size)
86 | }
87 |
88 | // NgramIntersection returns a map of the n-grams of the specified size found
89 | // in both terms, along with their frequency. The function also returns the
90 | // number of common n-grams (the sum of all the values in the output map), the
91 | // total number of n-grams in the first term and the total number of n-grams in
92 | // the second term. An n-gram size of 1 is used if the provided size is less
93 | // than or equal to 0.
94 | func NgramIntersection(a, b string, size int) (map[string]int, int, int, int) {
95 | return ngram.Intersection([]rune(a), []rune(b), size)
96 | }
97 |
--------------------------------------------------------------------------------