├── .github
    ├── ISSUE_TEMPLATE
    │   ├── bug-report.md
    │   ├── feature-request.md
    │   └── general-question.md
    └── workflows
    │   └── go.yml
├── .gitignore
├── LICENSE
├── README.md
├── check.go
├── cosine_conf.go
├── default_conf.go
├── dice_distance_conf.go
├── go.mod
├── go.sum
├── hamming_conf.go
├── jaro_conf.go
├── jaro_winkler_conf.go
├── prev_modify.go
├── prev_modify_test.go
├── public_config.go
├── simhash_conf.go
├── similarity
    ├── Cosine.go
    ├── best_result.go
    ├── dice_coefficient.go
    ├── dice_coefficient_test.go
    ├── edit_distance.go
    ├── edit_distance_test.go
    ├── hamming.go
    ├── hamming_test.go
    ├── jaro.go
    ├── jaro_test.go
    ├── jaro_winkler.go
    ├── simhash.go
    └── utils.go
├── strsim.go
├── strsim_priv.go
└── strsim_test.go


/.github/ISSUE_TEMPLATE/bug-report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: "\U0001F91D Bug Report"
 3 | about: As a User, I want to report a Bug.
 4 | labels: type/bug
 5 | ---
 6 | 
 7 | ## Bug Report
 8 | 
 9 | Please answer these questions before submitting your issue. Thanks!
10 | 
 11 | ### 1. Minimal reproduction steps (Required)
12 | 
13 | <!-- a step by step guide for reproducing the bug. -->
14 | 
15 | ### 2. What did you expect to see? (Required)
16 | 
 17 | ### 3. What did you see instead? (Required)
18 | 
19 | ### 4. What is your strsim version? (Required)
20 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: "\U0001F44F Feature Request"
 3 | about: As a user, I want to request a New Feature on the product.
 4 | labels: type/feature-request
 5 | ---
 6 | 
 7 | ## Feature Request
 8 | 
 9 | **Is your feature request related to a problem? Please describe:**
10 | <!-- A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] -->
11 | 
12 | **Describe the feature you'd like:**
13 | <!-- A clear and concise description of what you want to happen. -->
14 | 
15 | **Describe alternatives you've considered:**
16 | <!-- A clear and concise description of any alternative solutions or features you've considered. -->
17 | 
18 | **Teachability, Documentation, Adoption, Migration Strategy:**
19 | <!-- If you can, explain some scenarios how users might use this, situations it would be helpful in. Any API designs, mockups, or diagrams are also helpful. -->
20 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/general-question.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: "\U0001F600 Ask a Question"
 3 | about: I want to ask a question.
 4 | labels: type/question
 5 | ---
 6 | 
 7 | ## General Question
 8 | 
 9 | <!--
10 | 
11 | Before asking a question, make sure you have:
12 | 
13 | - Searched existing Stack Overflow questions.
14 | - Googled your question.
15 | - Searched open and closed [GitHub issues](https://github.com/antlabs/strsim/issues)
16 | - Read the documentation:
17 |   - [strsim Readme](https://github.com/antlabs/strsim/blob/master/README.md)
18 | 
19 | -->


--------------------------------------------------------------------------------
/.github/workflows/go.yml:
--------------------------------------------------------------------------------
 1 | name: Go
 2 | 
 3 | on:
 4 |   push:
 5 |   pull_request:
 6 | 
 7 | jobs:
 8 | 
 9 |   build:
10 |     runs-on: ubuntu-latest
11 |     strategy:
12 |       matrix:
13 |         go: [ '1.13', '1.14']
14 |     name: Go ${{ matrix.go }} sample
15 | 
16 |     steps:
17 | 
 18 |     - name: Set up Go ${{ matrix.go }}
19 |       uses: actions/setup-go@v1
20 |       with:
21 |         go-version: ${{ matrix.go }}
22 |       id: go
23 | 
24 |     - name: Check out code into the Go module directory
25 |       uses: actions/checkout@v1
26 | 
27 |     - name: Get dependencies
28 |       run: |
29 |         go get -v -t -d ./...
30 |         if [ -f Gopkg.toml ]; then
31 |             curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh
32 |             dep ensure
33 |         fi
34 | 
35 |     - name: Build
36 |       run: go build -v .
37 | 
38 |     - name: Test
39 |       run: go test -v -coverprofile='coverage.out' -covermode=count ./...
40 | 
41 |     - name: Upload Coverage report
42 |       uses: codecov/codecov-action@v1
43 |       with:
44 |         token: ${{secrets.CODECOV_TOKEN}}
45 |         file: ./coverage.out
46 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Binaries for programs and plugins
 2 | *swp
 3 | *~
 4 | *.exe
 5 | *.exe~
 6 | *.dll
 7 | *.so
 8 | *.dylib
 9 | 
10 | # Test binary, built with `go test -c`
11 | *.test
12 | 
13 | # Output of the go coverage tool, specifically when used with LiteIDE
14 | *.out
15 | 
16 | # Dependency directories (remove the comment below to include it)
17 | # vendor/
18 | 
19 | .idea/
20 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## strsim
 2 | strsim是golang实现的字符串相似度库，后端集成多种算法，主要解决现有相似度库不能很好地处理中文的问题
 3 | 
 4 | [![Go](https://github.com/antlabs/strsim/workflows/Go/badge.svg)](https://github.com/antlabs/strsim/actions)
 5 | [![codecov](https://codecov.io/gh/antlabs/strsim/branch/master/graph/badge.svg)](https://codecov.io/gh/antlabs/strsim)
 6 | 
 7 | ## 构架
 8 | ![strsim.png](https://github.com/guonaihong/images/blob/master/strsim/strsim.png?raw=true)
 9 | 
10 | 
11 | 
12 | ## 使用方式
13 | 
 14 | ```bash
15 | go get -u github.com/antlabs/strsim
16 | ```
17 | 
18 | 
19 | 
20 | 
21 | 
22 | ## 功能
23 | * 可以忽略空白字符
 24 | * 可以忽略大小写
25 |     ### 多种算法支持
26 |     * 莱文斯坦-编辑距离(Levenshtein)
27 |     * Hamming
28 |     * Dice's coefficient
29 |     * Jaro 
30 |     * JaroWinkler 
31 |     * Cosine 
32 |     * Simhash
33 | 
34 | ## 内容
 35 | - [比较两个字符串相似度](#比较两个字符串相似度)
 36 | - [从字符串数组里面找到相似度最高的字符串](#从数组里找到相似度最高的字符串)
 37 | - [从字符串数组里面找到相似度最高的字符串-带下标](#从数组里找到相似度最高的字符串-带下标)
 38 | - [选择不同算法](#选择不同算法)
 39 |     - [莱文斯坦-编辑距离(Levenshtein)](#莱文斯坦-编辑距离levenshtein)
 40 |     - [选择Dice's coefficient](#选择dices-coefficient)
 41 |     - [选择jaro](#选择jaro)
 42 |     - [选择Hamming](#选择Hamming)
 43 |     - [选择JaroWinkler](#选择JaroWinkler)
 44 |     - [选择Cosine](#选择Cosine)
 45 |     - [选择Simhash](#选择Simhash)
 46 | ## 比较两个字符串相似度
47 | ```go
48 | strsim.Compare("中国人", "中")
49 | // -> 0.333333
50 | ```
51 | 
52 | ## 从数组里找到相似度最高的字符串
53 | ```go
54 | strsim.FindBestMatchOne("海刘", []string{"白日依山尽", "黄河入海流", "欲穷千里目", "更上一层楼"})
55 | ```
56 | ## 从数组里找到相似度最高的字符串-带下标
57 | ```go
58 | strsim.FindBestMatch("海刘", []string{"白日依山尽", "黄河入海流", "欲穷千里目", "更上一层楼"})
59 | ```
60 | 
61 | ## 选择不同算法
62 | ### 莱文斯坦-编辑距离(Levenshtein)
63 | ```go
64 | strsim.Compare("abc", "ab")
65 | // -> 0.6666666666666667
66 | ```
67 | ### 选择Dice's coefficient
68 | ```go
69 | strsim.Compare("abc", "ab", strsim.DiceCoefficient())
70 | //-> 0.6666666666666666
71 | ```
72 | ### 选择jaro
73 | ```go
74 | strsim.Compare("abc", "ab", strsim.Jaro())
75 | ```
76 | ### 选择JaroWinkler 
77 | 
78 | ```go
79 | strsim.Compare("abc", "ab", strsim.JaroWinkler())
80 | ```
81 | 
82 | ### 选择Hamming
83 | ```go
84 | strsim.Compare("abc", "ab", strsim.Hamming())
85 | ```
86 | 
87 | ### 选择Cosine
88 | 
89 | ```go
90 | strsim.Compare("abc", "ab", strsim.Cosine())
91 | ```
92 | 
93 | ### 选择Simhash
94 | 
95 | ```go
96 | strsim.Compare("abc", "ab", strsim.Simhash())
97 | ```
98 | 
99 | 


--------------------------------------------------------------------------------
/check.go:
--------------------------------------------------------------------------------
 1 | package strsim
 2 | 
 3 | func check(s1, s2 string) (score float64, exit bool) {
 4 | 	if s1 == s2 {
 5 | 		return 1.0, true
 6 | 	}
 7 | 
 8 | 	if len(s1) == 0 {
 9 | 		return 0.0, true
10 | 	}
11 | 
12 | 	if len(s2) == 0 {
13 | 		return 0.0, true
14 | 	}
15 | 
16 | 	return 0, false
17 | }
18 | 


--------------------------------------------------------------------------------
/cosine_conf.go:
--------------------------------------------------------------------------------
 1 | package strsim
 2 | 
 3 | import "github.com/antlabs/strsim/similarity"
 4 | 
 5 | // CosineConf is a configuration struct for Cosine similarity.
 6 | 
 7 | func Cosine() OptionFunc {
 8 | 
 9 | 	return OptionFunc(func(o *option) {
10 | 		if o.cmp == nil {
11 | 			l := similarity.Cosine{}
12 | 			o.base64 = true
13 | 			o.cmp = l.CompareUtf8
14 | 			if o.ascii {
15 | 				o.cmp = l.CompareAscii
16 | 			}
17 | 		}
18 | 	})
19 | 
20 | }
21 | 


--------------------------------------------------------------------------------
/default_conf.go:
--------------------------------------------------------------------------------
 1 | package strsim
 2 | 
 3 | import (
 4 | 	"github.com/antlabs/strsim/similarity"
 5 | )
 6 | 
 7 | func Default() OptionFunc {
 8 | 	return OptionFunc(func(o *option) {
 9 | 		if o.cmp == nil {
10 | 			l := similarity.EditDistance{}
11 | 			o.cmp = l.CompareUtf8
12 | 			if o.ascii {
13 | 				o.cmp = l.CompareAscii
14 | 			}
15 | 		}
16 | 	})
17 | }
18 | 


--------------------------------------------------------------------------------
/dice_distance_conf.go:
--------------------------------------------------------------------------------
 1 | package strsim
 2 | 
 3 | import (
 4 | 	"github.com/antlabs/strsim/similarity"
 5 | )
 6 | 
 7 | // ngram 是筛子系数需要用的一个值
 8 | func DiceCoefficient(ngram ...int) OptionFunc {
 9 | 	return OptionFunc(func(o *option) {
10 | 		ngram2 := 2
11 | 		if len(ngram) > 0 {
12 | 			ngram2 = ngram[0]
13 | 		}
14 | 
15 | 		d := &similarity.DiceCoefficient{Ngram: ngram2}
16 | 		o.cmp = d.CompareUtf8
17 | 	})
18 | }
19 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/antlabs/strsim
2 | 
3 | go 1.14
4 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/antlabs/strsim/113a66378916cb00a5b68e728ba52b5783a864dc/go.sum


--------------------------------------------------------------------------------
/hamming_conf.go:
--------------------------------------------------------------------------------
 1 | package strsim
 2 | 
 3 | import (
 4 | 	"github.com/antlabs/strsim/similarity"
 5 | )
 6 | 
 7 | func Hamming() OptionFunc {
 8 | 	return OptionFunc(func(o *option) {
 9 | 
10 | 		h := &similarity.Hamming{}
11 | 		o.cmp = h.CompareUtf8
12 | 		if o.ascii {
13 | 			o.cmp = h.CompareAscii
14 | 		}
15 | 	})
16 | }
17 | 


--------------------------------------------------------------------------------
/jaro_conf.go:
--------------------------------------------------------------------------------
 1 | package strsim
 2 | 
 3 | import (
 4 | 	"github.com/antlabs/strsim/similarity"
 5 | )
 6 | 
 7 | // ngram 是筛子系数需要用的一个值
 8 | func Jaro(matchWindow ...int) OptionFunc {
 9 | 	return OptionFunc(func(o *option) {
10 | 		mw := 0
11 | 		if len(matchWindow) > 0 {
12 | 			mw = matchWindow[0]
13 | 		}
14 | 		d := &similarity.Jaro{MatchWindow: mw}
15 | 		o.cmp = d.CompareUtf8
16 | 	})
17 | }
18 | 


--------------------------------------------------------------------------------
/jaro_winkler_conf.go:
--------------------------------------------------------------------------------
 1 | package strsim
 2 | 
 3 | import "github.com/antlabs/strsim/similarity"
 4 | 
 5 | // JaroWinkler ngram 是筛子系数需要用的一个值
 6 | func JaroWinkler(matchWindow ...int) OptionFunc {
 7 | 	return OptionFunc(func(o *option) {
 8 | 		mw := 0
 9 | 		if len(matchWindow) > 0 {
10 | 			mw = matchWindow[0]
11 | 		}
12 | 		d := &similarity.JaroWinkler{MatchWindow: mw}
13 | 		o.cmp = d.CompareUtf8
14 | 	})
15 | }
16 | 


--------------------------------------------------------------------------------
/prev_modify.go:
--------------------------------------------------------------------------------
 1 | package strsim
 2 | 
 3 | import (
 4 | 	"github.com/antlabs/strsim/similarity"
 5 | 	"strings"
 6 | )
 7 | 
 8 | const (
 9 | 	ignoreCase = 1 << iota
10 | 	ignoreSpace
11 | )
12 | 
13 | var replace = strings.NewReplacer("\r", "", "\n", "", "\t", "", "\f", "", " ", "")
14 | 
15 | var modiyTab = map[int]func(s *string){
16 | 
17 | 	ignoreCase: func(s *string) {
18 | 		*s = strings.ToLower(*s)
19 | 	},
20 | 
21 | 	ignoreSpace: func(s *string) {
22 | 		*s = replace.Replace(*s)
23 | 	},
24 | }
25 | 
26 | func modifyString(o *option, s *string) {
27 | 	for i := 1; i <= ignoreSpace; i <<= 1 {
28 | 		if i&o.ignore > 0 {
29 | 			modiyTab[i](s)
30 | 		}
31 | 	}
32 | }
33 | 
34 | func modifyStrToBase64Str(o *option, s *string) {
35 | 	if o.base64 {
36 | 		// 将字符串转换为base64编码
37 | 		*s = similarity.Base64Encode(*s)
38 | 	}
39 | 
40 | }
41 | 


--------------------------------------------------------------------------------
/prev_modify_test.go:
--------------------------------------------------------------------------------
 1 | package strsim
 2 | 
 3 | import (
 4 | 	"testing"
 5 | )
 6 | 
 7 | type testCase struct {
 8 | 	test string
 9 | 	need string
10 | 	arg1 string
11 | 	arg2 string
12 | 	sim  float64
13 | 
14 | 	opt Option
15 | }
16 | 
17 | func Test_ModifyString(t *testing.T) {
18 | 	var o option
19 | 
20 | 	o.ignore |= ignoreCase
21 | 	o.ignore |= ignoreSpace
22 | 	o.base64 = true
23 | 
24 | 	for _, v := range []testCase{
25 | 		{
26 | 			test: "hello world",
27 | 			need: "helloworld",
28 | 		},
29 | 	} {
30 | 		modifyString(&o, &v.test)
31 | 		if v.test != v.need {
32 | 			t.Fatalf("modifyString: got %q, want %q", v.test, v.need)
33 | 		}
34 | 	}
35 | }
36 | 


--------------------------------------------------------------------------------
/public_config.go:
--------------------------------------------------------------------------------
 1 | package strsim
 2 | 
 3 | type option struct {
 4 | 	ignore int  //
 5 | 	ascii  bool // 设置选用ascii还是utf8方式执行算法
 6 | 	cmp    func(s1, s2 string) float64
 7 | 	base64 bool // 设置是否使用base64算法
 8 | }
 9 | 
10 | // 调用Option接口设置option
11 | func (o *option) fillOption(opts ...Option) {
12 | 	for _, opt := range opts {
13 | 		opt.Apply(o)
14 | 	}
15 | 
16 | 	opt := Default()
17 | 	opt.Apply(o)
18 | }
19 | 
20 | type Option interface {
21 | 	Apply(*option)
22 | }
23 | 
24 | type OptionFunc func(*option)
25 | 
26 | func (o OptionFunc) Apply(opt *option) {
27 | 	o(opt)
28 | }
29 | 
30 | //忽略大小写
31 | func IgnoreCase() OptionFunc {
32 | 	return OptionFunc(func(o *option) {
33 | 		o.ignore |= ignoreCase
34 | 	})
35 | }
36 | 
37 | //忽略空白字符
38 | func IgnoreSpace() OptionFunc {
39 | 	return OptionFunc(func(o *option) {
40 | 		o.ignore |= ignoreSpace
41 | 	})
42 | }
43 | 
44 | //使用ascii编码
45 | func UseASCII() OptionFunc {
46 | 	return OptionFunc(func(o *option) {
47 | 		o.ascii = true
48 | 	})
49 | }
50 | 
51 | // UseBase64 使用base64编码
52 | func UseBase64() OptionFunc {
53 | 	return OptionFunc(func(o *option) {
54 | 		o.base64 = true
55 | 	})
56 | }
57 | 


--------------------------------------------------------------------------------
/simhash_conf.go:
--------------------------------------------------------------------------------
 1 | package strsim
 2 | 
 3 | import "github.com/antlabs/strsim/similarity"
 4 | 
 5 | func Simhash() OptionFunc {
 6 | 	return OptionFunc(func(o *option) {
 7 | 		if o.cmp == nil {
 8 | 			l := similarity.Simhash{}
 9 | 			o.base64 = true
10 | 			o.cmp = l.CompareUtf8
11 | 			if o.ascii {
12 | 				o.cmp = l.CompareAscii
13 | 			}
14 | 		}
15 | 	})
16 | 
17 | }
18 | 


--------------------------------------------------------------------------------
/similarity/Cosine.go:
--------------------------------------------------------------------------------
 1 | package similarity
 2 | 
 3 | import (
 4 | 	"math"
 5 | 	"unicode/utf8"
 6 | )
 7 | 
 8 | // Cosine similarity algorithm implementation.
 9 | type Cosine struct {
10 | }
11 | 
12 | func (c Cosine) CompareAscii(s1, s2 string) float64 {
13 | 	return c.CompareUtf8(s1, s2)
14 | }
15 | 
16 | func (c Cosine) CompareUtf8(utf8Str1, utf8Str2 string) float64 {
17 | 	l1 := utf8.RuneCountInString(utf8Str1)
18 | 	l2 := utf8.RuneCountInString(utf8Str2)
19 | 	//l1 := len(utf8Str1)
20 | 	//l2 := len(utf8Str2)
21 | 	l3 := utf8.RuneCountInString(base64Table)
22 | 	dirts1 := make(map[string]int, l3)
23 | 	dirts2 := make(map[string]int, l3)
24 | 	// 将base64Table转化成[]string
25 | 	base64 := StrToStrs(base64Table, l3)
26 | 	// 遍历base64对dirts1和dirts2进行初始化
27 | 	for _, v := range base64 {
28 | 		dirts1[v] = 0
29 | 		dirts2[v] = 0
30 | 	}
31 | 	// 将s1和s2分别转化成[]string
32 | 	s1s := StrToStrs(utf8Str1, l1)
33 | 	s2s := StrToStrs(utf8Str2, l2)
34 | 	// 遍历s1s和s2s
35 | 	for _, v := range s1s {
36 | 		dirts1[v]++
37 | 	}
38 | 	for _, v := range s2s {
39 | 		dirts2[v]++
40 | 
41 | 	}
42 | 	// 计算s1s和s2s的向量的余弦值
43 | 	var sum1, sum2, sum3 float64
44 | 	for _, v := range base64 {
45 | 		sum1 += float64(dirts1[v]) * float64(dirts1[v])
46 | 		sum2 += float64(dirts2[v]) * float64(dirts2[v])
47 | 		sum3 += float64(dirts1[v]) * float64(dirts2[v])
48 | 	}
49 | 
50 | 	return sum3 / (math.Sqrt(sum1) * math.Sqrt(sum2))
51 | 
52 | }
53 | 


--------------------------------------------------------------------------------
/similarity/best_result.go:
--------------------------------------------------------------------------------
 1 | package similarity
 2 | 
 3 | type Match struct {
 4 | 	S     string
 5 | 	Score float64
 6 | }
 7 | 
 8 | type MatchResult struct {
 9 | 	AllResult []*Match
10 | 	Match     *Match
11 | 	BestIndex int
12 | }
13 | 
14 | type Compare func(s1, s2 string) float64
15 | 
16 | func findBestMatch(s string, targets []string, compare Compare) *MatchResult {
17 | 	match := make([]*Match, 0, len(targets))
18 | 	bestIndex := 0
19 | 	for k, s2 := range targets {
20 | 		score := compare(s, s2)
21 | 		match = append(match, &Match{S: s2, Score: score})
22 | 
23 | 		if k == 0 {
24 | 			continue
25 | 		}
26 | 
27 | 		if score > match[bestIndex].Score {
28 | 			bestIndex = k
29 | 		}
30 | 	}
31 | 
32 | 	return &MatchResult{AllResult: match, Match: match[bestIndex], BestIndex: bestIndex}
33 | }
34 | 


--------------------------------------------------------------------------------
/similarity/dice_coefficient.go:
--------------------------------------------------------------------------------
 1 | package similarity
 2 | 
 3 | import (
 4 | 	"strings"
 5 | 	"unicode/utf8"
 6 | )
 7 | 
 8 | type DiceCoefficient struct {
 9 | 	Ngram int
10 | 
11 | 	//test use
12 | 	l1    int
13 | 	l2    int
14 | 	mixed int
15 | 	key   []string
16 | 	test  bool
17 | }
18 | 
19 | type value struct {
20 | 	s1Count int
21 | 	s2Count int
22 | }
23 | 
24 | func (d *DiceCoefficient) CompareAscii(s1, s2 string) float64 {
25 | 	return d.CompareUtf8(s1, s2)
26 | }
27 | 
28 | func (d *DiceCoefficient) setOrGet(set map[string]value, s string, add bool) (mixed, l int) {
29 | 	var key strings.Builder
30 | 	ngram := d.Ngram
31 | 	if ngram == 0 {
32 | 		ngram = 2
33 | 	}
34 | 
35 | 	for i := 0; i < len(s); {
36 | 		firstSize := 0
37 | 		for j, total := 0, 0; j < ngram && i+total < len(s); j++ {
38 | 			r, size := utf8.DecodeRuneInString(s[i+total:])
39 | 			key.WriteRune(r)
40 | 			total += size
41 | 			if j == 0 {
42 | 				firstSize = size
43 | 			}
44 | 
45 | 		}
46 | 		if utf8.RuneCountInString(key.String()) != ngram {
47 | 			break
48 | 		}
49 | 		val, ok := set[key.String()]
50 | 		if add {
51 | 			if !ok {
52 | 				val = value{}
53 | 			}
54 | 			val.s1Count++
55 | 		} else {
56 | 
57 | 			if !ok {
58 | 				goto next
59 | 			}
60 | 
61 | 			val.s2Count++
62 | 			if val.s1Count >= val.s2Count {
63 | 				mixed++
64 | 			}
65 | 		}
66 | 
67 | 		set[key.String()] = val
68 | 
69 | 	next:
70 | 		if d.test {
71 | 			d.key = append(d.key, key.String())
72 | 		}
73 | 
74 | 		key.Reset()
75 | 		l++
76 | 		i += firstSize
77 | 	}
78 | 
79 | 	return mixed, l
80 | }
81 | 
82 | func (d *DiceCoefficient) CompareUtf8(s1, s2 string) float64 {
83 | 
84 | 	set := make(map[string]value, len(s1)/3)
85 | 	//TODO 边界比如字符长度小于ngram
86 | 
87 | 	mixed, l1 := d.setOrGet(set, s1, true)
88 | 
89 | 	mixed, l2 := d.setOrGet(set, s2, false)
90 | 
91 | 	d.l1 = l1
92 | 	d.l2 = l2
93 | 	d.mixed = mixed
94 | 	return 2.0 * float64(mixed) / float64(l1+l2)
95 | }
96 | 


--------------------------------------------------------------------------------
/similarity/dice_coefficient_test.go:
--------------------------------------------------------------------------------
  1 | package similarity
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"testing"
  6 | 	"unicode/utf8"
  7 | )
  8 | 
  9 | func Test_DiceCoefficient_CompareAscii(t *testing.T) {
 10 | 	// ngram = 1
 11 | 	d := &DiceCoefficient{Ngram: 1}
 12 | 
 13 | 	for k, v := range []testOneCase{
 14 | 		{s1: "ivan1", s2: "ivan2", cost: 0.8},
 15 | 		{s2: "ivan1", s1: "ivan2", cost: 0.8},
 16 | 
 17 | 		{s1: "love", s2: "love", cost: 1},
 18 | 	} {
 19 | 		m := fmt.Sprintf("error case:%d", k)
 20 | 		got := d.CompareAscii(v.s1, v.s2)
 21 | 		if got != v.cost {
 22 | 			t.Fatalf("%s: got %v, want %v", m, got, v.cost)
 23 | 		}
 24 | 		if d.l1 != len(v.s1) {
 25 | 			t.Fatalf("%s: l1 got %v, want %v", m, d.l1, len(v.s1))
 26 | 		}
 27 | 		if d.l2 != len(v.s2) {
 28 | 			t.Fatalf("%s: l2 got %v, want %v", m, d.l2, len(v.s2))
 29 | 		}
 30 | 	}
 31 | 
 32 | }
 33 | 
 34 | func Test_DiceCoefficient_CompareAscii_NgramOrMore(t *testing.T) {
 35 | 	// ngram = 2
 36 | 	d := &DiceCoefficient{Ngram: 2, test: true}
 37 | 	for k, v := range []testOneCase{
 38 | 		{s1: "John Smith", s2: "Smith, John D.", cost: 0.7272727272727273, ngram: 2},
 39 | 		{s2: "John Smith", s1: "Smith, John D.", cost: 0.7272727272727273, ngram: 2},
 40 | 
 41 | 		{s1: "John Smith", s2: "Smith, John D.", cost: 0.6, ngram: 3},
 42 | 		{s2: "John Smith", s1: "Smith, John D.", cost: 0.6, ngram: 3},
 43 | 
 44 | 		{s1: "John Smith", s2: "Smith, John D.", cost: 0.4444444444444444, ngram: 4},
 45 | 		{s2: "John Smith", s1: "Smith, John D.", cost: 0.4444444444444444, ngram: 4},
 46 | 	} {
 47 | 		if v.ngram != 0 {
 48 | 			d.Ngram = v.ngram
 49 | 		}
 50 | 
 51 | 		m := fmt.Sprintf("error case:%d", k)
 52 | 		got := d.CompareAscii(v.s1, v.s2)
 53 | 		if got != v.cost {
 54 | 			t.Fatalf("%s: got %v, want %v", m, got, v.cost)
 55 | 		}
 56 | 		for _, key := range d.key {
 57 | 			if utf8.RuneCountInString(key) != d.Ngram {
 58 | 				t.Fatalf("key is (%s): got ngram=%d, want %d", key, utf8.RuneCountInString(key), d.Ngram)
 59 | 			}
 60 | 		}
 61 | 
 62 | 		d.key = nil
 63 | 
 64 | 	}
 65 | }
 66 | 
 67 | func Test_DiceCoefficient_CompareUtf8(t *testing.T) {
 68 | 	d := &DiceCoefficient{Ngram: 1}
 69 | 
 70 | 	for k, v := range []testOneCase{
 71 | 		{s1: "你好中国", s2: "你好中国", cost: 1},
 72 | 		{s1: "中文也被称为华文、汉文。中文（汉语）有标准语和方言之分，其标准语即汉语普通话", s2: "方块", cost: 0.05},
 73 | 		{s1: "加油，来个", s2: "加油，来吧", cost: 0.8},
 74 | 	} {
 75 | 		got := d.CompareUtf8(v.s1, v.s2)
 76 | 		if got != v.cost {
 77 | 			t.Fatalf("error case:%d: got %v, want %v", k, got, v.cost)
 78 | 		}
 79 | 		//fmt.Printf("mixed:%d, l1:%d, l2:%d, l1:%d\n", d.mixed, d.l1, d.l2, utf8.RuneCountInString(v.s1))
 80 | 	}
 81 | }
 82 | 
 83 | func Test_DiceCoefficient_FindBestMatch(t *testing.T) {
 84 | 	d := &DiceCoefficient{Ngram: 1}
 85 | 
 86 | 	for k, v := range []testBestCase{
 87 | 		{s: "白日依山尽", targets: []string{"白日依山尽", "黄河入海流", "欲穷千里目", "更上一层楼"}, bestIndex: 0},
 88 | 		{s: "黄河流", targets: []string{"白日依山尽", "黄河入海流", "欲穷千里目", "更上一层楼"}, bestIndex: 1},
 89 | 		{s: "一层", targets: []string{"白日依山尽", "黄河入海流", "欲穷千里目", "更上一层楼"}, bestIndex: 3},
 90 | 		{s: "楼", targets: []string{"白日依山尽", "黄河入海流", "欲穷千里目", "更上一层楼"}, bestIndex: 3},
 91 | 		{s: "山近", targets: []string{"白日依山尽", "黄河入海流", "欲穷千里目", "更上一层楼"}, bestIndex: 0},
 92 | 		{s: "海刘", targets: []string{"白日依山尽", "黄河入海流", "欲穷千里目", "更上一层楼"}, bestIndex: 1},
 93 | 	} {
 94 | 		mr := findBestMatch(v.s, v.targets, d.CompareUtf8)
 95 | 		if mr.BestIndex != v.bestIndex {
 96 | 			t.Fatalf("error case:%d: got bestIndex=%v, want %v", k, mr.BestIndex, v.bestIndex)
 97 | 		}
 98 | 	}
 99 | }
100 | 


--------------------------------------------------------------------------------
/similarity/edit_distance.go:
--------------------------------------------------------------------------------
 1 | package similarity
 2 | 
 3 | type EditDistance struct {
 4 | 	// test use
 5 | 	mixed int
 6 | }
 7 | 
 8 | // ascii
 9 | func (e *EditDistance) CompareAscii(s1, s2 string) float64 {
10 | 	cacheX := make([]int, len(s2))
11 | 
12 | 	diagonal := 0
13 | 	for y, yLen := 0, len(s1); y < yLen; y++ {
14 | 		for x, xLen := 0, len(cacheX); x < xLen; x++ {
15 | 			on := x + 1
16 | 			left := y + 1
17 | 			if x == 0 {
18 | 				diagonal = y
19 | 			} else if y == 0 {
20 | 				diagonal = x
21 | 			}
22 | 			if y > 0 {
23 | 				on = cacheX[x]
24 | 			}
25 | 			if x-1 >= 0 {
26 | 				left = cacheX[x-1]
27 | 			}
28 | 
29 | 			same := 0
30 | 			if s1[y] != s2[x] {
31 | 				same = 1
32 | 			}
33 | 
34 | 			oldDiagonal := cacheX[x]
35 | 			cacheX[x] = min(min(on+1, left+1), same+diagonal)
36 | 			diagonal = oldDiagonal
37 | 			//fmt.Printf("left:%d on:%d diagonal:%d (min:%d)#", left, on, oldDiagonal, cacheX[x])
38 | 
39 | 		}
40 | 		//fmt.Println()
41 | 	}
42 | 
43 | 	e.mixed = cacheX[len(cacheX)-1]
44 | 	return 1.0 - float64(cacheX[len(cacheX)-1])/float64(max(len(s1), len(s2)))
45 | }
46 | 
47 | // utf8
48 | func (e *EditDistance) CompareUtf8(utf8Str1, utf8Str2 string) float64 {
49 | 	r1 := []rune(utf8Str1)
50 | 	r2 := []rune(utf8Str2)
51 | 	cacheX := make([]int, len(r2))
52 | 
53 | 	diagonal := 0
54 | 	for y, yLen := 0, len(r1); y < yLen; y++ {
55 | 		for x, xLen := 0, len(cacheX); x < xLen; x++ {
56 | 			on := x + 1
57 | 			left := y + 1
58 | 			if x == 0 {
59 | 				diagonal = y
60 | 			} else if y == 0 {
61 | 				diagonal = x
62 | 			}
63 | 			if y > 0 {
64 | 				on = cacheX[x]
65 | 			}
66 | 			if x-1 >= 0 {
67 | 				left = cacheX[x-1]
68 | 			}
69 | 
70 | 			same := 0
71 | 			if r1[y] != r2[x] {
72 | 				same = 1
73 | 			}
74 | 
75 | 			oldDiagonal := cacheX[x]
76 | 			cacheX[x] = min(min(on+1, left+1), same+diagonal)
77 | 			diagonal = oldDiagonal
78 | 
79 | 		}
80 | 	}
81 | 
82 | 	e.mixed = cacheX[len(cacheX)-1]
83 | 	return 1.0 - float64(cacheX[len(cacheX)-1])/float64(max(len(r1), len(r2)))
84 | }
85 | 


--------------------------------------------------------------------------------
/similarity/edit_distance_test.go:
--------------------------------------------------------------------------------
 1 | package similarity
 2 | 
 3 | import (
 4 | 	"testing"
 5 | )
 6 | 
 7 | type testOneCase struct {
 8 | 	s1    string
 9 | 	s2    string
10 | 	cost  float64
11 | 	ngram int
12 | 	mixed int
13 | 	match int
14 | }
15 | 
16 | type testBestCase struct {
17 | 	s         string
18 | 	targets   []string
19 | 	bestIndex int
20 | }
21 | 
22 | func Test_EditDistance_CompareAscii(t *testing.T) {
23 | 	e := &EditDistance{}
24 | 
25 | 	for k, v := range []testOneCase{
26 | 		{s1: "ivan1", s2: "ivan2", cost: 0.8, mixed: 1},
27 | 		{s1: "love", s2: "love", cost: 1, mixed: 0},
28 | 		{s1: "kitten", s2: "sitting", cost: 1 - 3/7.0, mixed: 3},
29 | 		{s1: "12", s2: "1", cost: 0.5, mixed: 1},
30 | 		{s1: "1", s2: "12", cost: 0.5, mixed: 1},
31 | 		{s1: "123", s2: "1", cost: 0.33333333333333337, mixed: 2},
32 | 		{s1: "1", s2: "123", cost: 0.33333333333333337, mixed: 2},
33 | 		{s1: "1234", s2: "1", cost: 0.25, mixed: 3},
34 | 		{s1: "1", s2: "1234", cost: 0.25, mixed: 3},
35 | 	} {
36 | 		s := e.CompareAscii(v.s1, v.s2)
37 | 		if s != v.cost {
38 | 			t.Fatalf("cost:error case:%d: got %v, want %v", k, s, v.cost)
39 | 		}
40 | 		if e.mixed != v.mixed {
41 | 			t.Fatalf("mixed:error case:%d: got %v, want %v", k, e.mixed, v.mixed)
42 | 		}
43 | 	}
44 | }
45 | 
46 | func Test_EditDistance_CompareUtf8(t *testing.T) {
47 | 	e := &EditDistance{}
48 | 
49 | 	for k, v := range []testOneCase{
50 | 		{s1: "你好中国", s2: "你好中国", cost: 1, mixed: 0},
51 | 		{s1: "加油，来个", s2: "加油，来", cost: 0.8, mixed: 1},
52 | 		{s1: "一二三三四五", s2: "六二三三二五七", cost: 1 - 3/7.0, mixed: 3},
53 | 		{s1: "一二", s2: "一", cost: 0.5, mixed: 1},
54 | 		{s1: "一", s2: "一二", cost: 0.5, mixed: 1},
55 | 		{s1: "一二三", s2: "一", cost: 0.33333333333333337, mixed: 2},
56 | 		{s1: "一", s2: "一二三", cost: 0.33333333333333337, mixed: 2},
57 | 		{s1: "一二三四", s2: "一", cost: 0.25, mixed: 3},
58 | 		{s1: "一", s2: "一二三四", cost: 0.25, mixed: 3},
59 | 		{s1: "中文也被称为华文、汉文。中文（汉语）有标准语和方言之分，其标准语即汉语普通话", s2: "方块", cost: 0.02631578947368418, mixed: 37},
60 | 	} {
61 | 		s := e.CompareUtf8(v.s1, v.s2)
62 | 		if s != v.cost {
63 | 			t.Fatalf("cost:error case:%d: got %v, want %v", k, s, v.cost)
64 | 		}
65 | 		if e.mixed != v.mixed {
66 | 			t.Fatalf("mixed:error case:%d: got %v, want %v", k, e.mixed, v.mixed)
67 | 		}
68 | 	}
69 | }
70 | 
71 | func Test_EditDistance_FindBestMatch(t *testing.T) {
72 | 	e := &EditDistance{}
73 | 
74 | 	for k, v := range []testBestCase{
75 | 		{s: "白日依山尽", targets: []string{"白日依山尽", "黄河入海流", "欲穷千里目", "更上一层楼"}, bestIndex: 0},
76 | 		{s: "黄河流", targets: []string{"白日依山尽", "黄河入海流", "欲穷千里目", "更上一层楼"}, bestIndex: 1},
77 | 		{s: "一层", targets: []string{"白日依山尽", "黄河入海流", "欲穷千里目", "更上一层楼"}, bestIndex: 3},
78 | 		{s: "楼", targets: []string{"白日依山尽", "黄河入海流", "欲穷千里目", "更上一层楼"}, bestIndex: 3},
79 | 		{s: "山近", targets: []string{"白日依山尽", "黄河入海流", "欲穷千里目", "更上一层楼"}, bestIndex: 0},
80 | 		{s: "海刘", targets: []string{"白日依山尽", "黄河入海流", "欲穷千里目", "更上一层楼"}, bestIndex: 1},
81 | 	} {
82 | 		mr := findBestMatch(v.s, v.targets, e.CompareUtf8)
83 | 		if mr.BestIndex != v.bestIndex {
84 | 			t.Fatalf("error case:%d: got bestIndex=%v, want %v", k, mr.BestIndex, v.bestIndex)
85 | 		}
86 | 	}
87 | }
88 | 


--------------------------------------------------------------------------------
/similarity/hamming.go:
--------------------------------------------------------------------------------
 1 | package similarity
 2 | 
 3 | import (
 4 | 	"math"
 5 | 	"unicode/utf8"
 6 | )
 7 | 
 8 | type Hamming struct{}
 9 | 
10 | func (h *Hamming) CompareAscii(s1, s2 string) float64 {
11 | 
12 | 	count := 0
13 | 	max := len(s1)
14 | 	if max < len(s2) {
15 | 		max = len(s2)
16 | 	}
17 | 
18 | 	for i, j := 0, 0; i < len(s1) && j < len(s2); {
19 | 
20 | 		if s1[i] != s2[j] {
21 | 			count++
22 | 		}
23 | 
24 | 		i++
25 | 		j++
26 | 	}
27 | 
28 | 	return 1 - (float64(count)+math.Abs(float64(len(s1)-len(s2))))/float64(max)
29 | }
30 | 
31 | func (h *Hamming) CompareUtf8(utf8Str1, utf8Str2 string) float64 {
32 | 	count := 0
33 | 
34 | 	l1 := utf8.RuneCountInString(utf8Str1)
35 | 	max := l1
36 | 
37 | 	l2 := utf8.RuneCountInString(utf8Str2)
38 | 	if max < l2 {
39 | 		max = l2
40 | 	}
41 | 
42 | 	for i, j := 0, 0; i < len(utf8Str1) && j < len(utf8Str2); {
43 | 		size := 0
44 | 		r1, size := utf8.DecodeRune(StringToBytes(utf8Str1[i:]))
45 | 		i += size
46 | 
47 | 		r2, size := utf8.DecodeRune(StringToBytes(utf8Str2[j:]))
48 | 		j += size
49 | 
50 | 		if r1 != r2 {
51 | 			count++
52 | 		}
53 | 
54 | 	}
55 | 
56 | 	return 1 - (float64(count)+math.Abs(float64(l1-l2)))/float64(max)
57 | }
58 | 


--------------------------------------------------------------------------------
/similarity/hamming_test.go:
--------------------------------------------------------------------------------
 1 | package similarity
 2 | 
 3 | import (
 4 | 	"testing"
 5 | )
 6 | 
 7 | func Test_Hamming_CompareAscii(t *testing.T) {
 8 | 	h := Hamming{}
 9 | 
10 | 	for k, v := range []testOneCase{
11 | 		{s1: "1011101000", s2: "1001001000", cost: 0.8},
12 | 		{s1: "21438960", s2: "22337960", cost: 0.625},
13 | 		{s1: "toned", s2: "roses", cost: 0.4},
14 | 		{s1: "1", s2: "12", cost: 0.5},
15 | 	} {
16 | 		got := h.CompareAscii(v.s1, v.s2)
17 | 		if got != v.cost {
18 | 			t.Fatalf("error case:%d: got %v, want %v", k, got, v.cost)
19 | 		}
20 | 	}
21 | }
22 | 
23 | func Test_Hamming_CompareUtf8(t *testing.T) {
24 | 	h := Hamming{}
25 | 
26 | 	for k, v := range []testOneCase{
27 | 		{s1: "中国嘿嘿", s2: "中国哈哈", cost: 0.5},
28 | 		{s1: "中国嘿嘿1", s2: "中国哈哈", cost: 0.4},
29 | 		{s1: "中国哈哈", s2: "中国嘿嘿1", cost: 0.4},
30 | 	} {
31 | 		got := h.CompareUtf8(v.s1, v.s2)
32 | 		if got != v.cost {
33 | 			t.Fatalf("error case:%d: got %v, want %v", k, got, v.cost)
34 | 		}
35 | 	}
36 | }
37 | 


--------------------------------------------------------------------------------
/similarity/jaro.go:
--------------------------------------------------------------------------------
  1 | package similarity
  2 | 
  3 | import (
  4 | 	"math"
  5 | 	"sort"
  6 | 	"sync"
  7 | 	"unicode/utf8"
  8 | )
  9 | 
 10 | type Jaro struct {
 11 | 	MatchWindow int
 12 | 	// test use
 13 | 	mw int
 14 | 	m  int
 15 | 	t  int
 16 | }
 17 | 
 18 | type check struct {
 19 | 	index int
 20 | 	c     rune
 21 | }
 22 | 
 23 | var checkPool = sync.Pool{
 24 | 	New: func() interface{} {
 25 | 		return &check{}
 26 | 	},
 27 | }
 28 | 
 29 | func (j *Jaro) CompareAscii(s1, s2 string) float64 {
 30 | 	return j.CompareUtf8(s1, s2)
 31 | }
 32 | 
 33 | func (j *Jaro) CompareUtf8(s1, s2 string) float64 {
 34 | 	mw := max(utf8.RuneCountInString(s1), utf8.RuneCountInString(s2))/2 - 1
 35 | 	if j.MatchWindow != 0 {
 36 | 		mw = j.MatchWindow
 37 | 	}
 38 | 
 39 | 	m := 0
 40 | 
 41 | 	matchSet := make(map[rune][]int, len(s1)/3)
 42 | 	l1 := 0
 43 | 	for _, c := range s1 {
 44 | 		matchSet[c] = append(matchSet[c], l1)
 45 | 		l1++
 46 | 	}
 47 | 
 48 | 	t := 0
 49 | 	l2 := 0
 50 | 
 51 | 	indexAndRune1 := make([]*check, 0, 8)
 52 | 	indexAndRune2 := make([]rune, 0, 8)
 53 | 
 54 | 	defer func() {
 55 | 		for _, v := range indexAndRune1 {
 56 | 			checkPool.Put(v)
 57 | 		}
 58 | 	}()
 59 | 
 60 | 	for _, c := range s2 {
 61 | 		indexs, ok := matchSet[c]
 62 | 		l2++
 63 | 		if !ok {
 64 | 			continue
 65 | 		}
 66 | 
 67 | 		for k, i := range indexs {
 68 | 			if i == -1 {
 69 | 				continue
 70 | 			}
 71 | 
 72 | 			//fmt.Printf("_______c %c:%d:%d\n", c, l2-1-i, mw)
 73 | 			if math.Abs(float64(l2-1-i)) <= float64(mw) {
 74 | 				m++
 75 | 
 76 | 				currCheck := checkPool.Get().(*check)
 77 | 				currCheck.index = i
 78 | 				currCheck.c = c
 79 | 
 80 | 				indexAndRune1 = append(indexAndRune1, currCheck)
 81 | 
 82 | 				indexAndRune2 = append(indexAndRune2, c)
 83 | 
 84 | 				indexs[k] = -1
 85 | 				break
 86 | 			}
 87 | 		}
 88 | 	}
 89 | 
 90 | 	m2 := float64(m)
 91 | 
 92 | 	if m2 == 0 {
 93 | 		return 0.0
 94 | 	}
 95 | 
 96 | 	sort.Slice(indexAndRune1, func(i, j int) bool {
 97 | 		return indexAndRune1[i].index < indexAndRune1[j].index
 98 | 	})
 99 | 
100 | 	for i, v := range indexAndRune1 {
101 | 		if v.c != indexAndRune2[i] {
102 | 			t++
103 | 		}
104 | 	}
105 | 
106 | 	j.mw = mw
107 | 	j.m = m
108 | 	j.t = t
109 | 	//fmt.Printf("l1:%d, l2:%d, m:%d, t:%d\n", l1, l2, m, t)
110 | 	return 1.0 / 3.0 * (m2/float64(l1) + m2/float64(l2) + (m2-float64(t)/2.0)/m2)
111 | }
112 | 


--------------------------------------------------------------------------------
/similarity/jaro_test.go:
--------------------------------------------------------------------------------
 1 | package similarity
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"testing"
 6 | )
 7 | 
 8 | func Test_Jaro_CompareAscii(t *testing.T) {
 9 | 	j := &Jaro{}
10 | 
11 | 	for k, v := range []testOneCase{
12 | 
13 | 		{s1: "bacde", s2: "abed", cost: 0.6722222222222222},
14 | 		{s1: "MARTHA", s2: "MARHTA", cost: 0.9444444444444444},
15 | 		{s1: "DIXON", s2: "DICKSONX", cost: 0.7666666666666666},
16 | 
17 | 		{s1: "JELLYFISH", s2: "SMELLYFISH", cost: 0.8962962962962964},
18 | 		{s2: "JELLYFISH", s1: "SMELLYFISH", cost: 0.8962962962962964},
19 | 	} {
20 | 		m := fmt.Sprintf("error case:%d", k)
21 | 		got := j.CompareAscii(v.s1, v.s2)
22 | 		if got != v.cost {
23 | 			t.Fatalf("%s: got %v, want %v", m, got, v.cost)
24 | 		}
25 | 		j.MatchWindow = 0
26 | 	}
27 | 
28 | }
29 | 
30 | func Test_Jaro_CompareUtf8(t *testing.T) {
31 | 	j := &Jaro{}
32 | 
33 | 	// jaro处理两个字符串长串接近的数据会好点
34 | 	for k, v := range []testOneCase{
35 | 
36 | 		{s1: "二一三四五", s2: "一二五四", cost: 0.6722222222222222},
37 | 		{s2: "二一三四五", s1: "一二五四", cost: 0.6722222222222222},
38 | 		{s1: "中文也被称为华文、汉文。中文（汉语）有标准语和方言之分，其标准语即汉语普通话", s2: "中文", cost: 0.6842105263157894, match: 1},
39 | 	} {
40 | 		m := fmt.Sprintf("error case:%d", k)
41 | 		got := j.CompareUtf8(v.s1, v.s2)
42 | 		if got != v.cost {
43 | 			t.Fatalf("%s: got %v, want %v", m, got, v.cost)
44 | 		}
45 | 		j.mw = 0
46 | 	}
47 | 
48 | }
49 | 


--------------------------------------------------------------------------------
/similarity/jaro_winkler.go:
--------------------------------------------------------------------------------
  1 | package similarity
  2 | 
  3 | import (
  4 | 	"math"
  5 | 	"sort"
  6 | 	"unicode/utf8"
  7 | )
  8 | 
  9 | type JaroWinkler struct {
 10 | 	MatchWindow int
 11 | 	// test use
 12 | 	mw int
 13 | 	m  int
 14 | 	t  int
 15 | }
 16 | 
 17 | func (j *JaroWinkler) CompareAscii(s1, s2 string) float64 {
 18 | 	return j.CompareUtf8(s1, s2)
 19 | }
 20 | 
 21 | func (j *JaroWinkler) CompareUtf8(s1, s2 string) float64 {
 22 | 	//matching window max size
 23 | 	mw := max(utf8.RuneCountInString(s1), utf8.RuneCountInString(s2))/2 - 1
 24 | 	if j.MatchWindow != 0 {
 25 | 		mw = j.MatchWindow
 26 | 	}
 27 | 
 28 | 	m := 0
 29 | 
 30 | 	matchSet := make(map[rune][]int, len(s1)/3)
 31 | 	l1 := 0
 32 | 	for _, c := range s1 {
 33 | 		matchSet[c] = append(matchSet[c], l1)
 34 | 		l1++
 35 | 	}
 36 | 
 37 | 	t := 0
 38 | 	l2 := 0
 39 | 
 40 | 	indexAndRune1 := make([]*check, 0, 8)
 41 | 	indexAndRune2 := make([]rune, 0, 8)
 42 | 
 43 | 	defer func() {
 44 | 		for _, v := range indexAndRune1 {
 45 | 			checkPool.Put(v)
 46 | 		}
 47 | 	}()
 48 | 
 49 | 	for _, c := range s2 {
 50 | 		indexs, ok := matchSet[c]
 51 | 		l2++
 52 | 		if !ok {
 53 | 			continue
 54 | 		}
 55 | 
 56 | 		for k, i := range indexs {
 57 | 			if i == -1 {
 58 | 				continue
 59 | 			}
 60 | 
 61 | 			//fmt.Printf("_______c %c:%d:%d\n", c, l2-1-i, mw)
 62 | 			if math.Abs(float64(l2-1-i)) <= float64(mw) {
 63 | 				m++
 64 | 
 65 | 				currCheck := checkPool.Get().(*check)
 66 | 				currCheck.index = i
 67 | 				currCheck.c = c
 68 | 
 69 | 				indexAndRune1 = append(indexAndRune1, currCheck)
 70 | 
 71 | 				indexAndRune2 = append(indexAndRune2, c)
 72 | 
 73 | 				indexs[k] = -1
 74 | 				break
 75 | 			}
 76 | 		}
 77 | 	}
 78 | 
 79 | 	m2 := float64(m)
 80 | 
 81 | 	if m2 == 0 {
 82 | 		return 0.0
 83 | 	}
 84 | 
 85 | 	sort.Slice(indexAndRune1, func(i, j int) bool {
 86 | 		return indexAndRune1[i].index < indexAndRune1[j].index
 87 | 	})
 88 | 
 89 | 	for i, v := range indexAndRune1 {
 90 | 		if v.c != indexAndRune2[i] {
 91 | 			t++
 92 | 		}
 93 | 	}
 94 | 
 95 | 	j.mw = mw
 96 | 	j.m = m
 97 | 	j.t = t
 98 | 	//fmt.Printf("l1:%d, l2:%d, m:%d, t:%d\n", l1, l2, m, t)
 99 | 	// s1 和 s2 的相同前缀长度
100 | 	prefixLength := 0
101 | 	for i := 0; i < min(len(s1), len(s2)); i++ {
102 | 		if s1[i] != s2[i] {
103 | 			break
104 | 		}
105 | 		if prefixLength <= 4 {
106 | 			prefixLength++
107 | 		} else {
108 | 			break
109 | 		}
110 | 	}
111 | 	// 影响因子 p 取值范围[0.1，0.25]，默认值为0.1
112 | 	p := 0.1
113 | 
114 | 	simj := 1.0 / 3.0 * (m2/float64(l1) + m2/float64(l2) + (m2-float64(t)/2.0)/m2)
115 | 
116 | 	return simj + float64(prefixLength)*p*(1.0-simj)
117 | }
118 | 


--------------------------------------------------------------------------------
/similarity/simhash.go:
--------------------------------------------------------------------------------
  1 | package similarity
  2 | 
  3 | import (
  4 | 	"hash/crc32"
  5 | 	"strconv"
  6 | 	"unicode/utf8"
  7 | )
  8 | 
  9 | type Simhash struct {
 10 | }
 11 | 
 12 | func (s Simhash) CompareAscii(s1, s2 string) float64 {
 13 | 	return s.CompareUtf8(s1, s2)
 14 | 
 15 | }
 16 | func (s Simhash) CompareUtf8(utf8Str1, utf8Str2 string) float64 {
 17 | 	// 字符串长度
 18 | 	l1 := utf8.RuneCountInString(utf8Str1)
 19 | 	l2 := utf8.RuneCountInString(utf8Str2)
 20 | 	// 将字符串转换为字符数组
 21 | 	s1s := StrToStrs4(utf8Str1, l1)
 22 | 	s2s := StrToStrs4(utf8Str2, l2)
 23 | 	// 计算每个字符在字符数组中出现的次数
 24 | 	counts1 := make(map[string]int)
 25 | 	counts2 := make(map[string]int)
 26 | 	for _, s := range s1s {
 27 | 		// 如果字符在字符数组中出现过，则计数加1
 28 | 		if _, ok := counts1[s]; ok {
 29 | 			counts1[s]++
 30 | 		} else {
 31 | 			// 如果字符在字符数组中没出现过，则计数设为1
 32 | 			counts1[s] = 1
 33 | 		}
 34 | 	}
 35 | 	for _, s := range s2s {
 36 | 		if _, ok := counts2[s]; ok {
 37 | 			counts2[s]++
 38 | 		} else {
 39 | 			counts2[s] = 1
 40 | 		}
 41 | 	}
 42 | 	h1 := IntsToStr(Dimensionality(merge(hashcodeAndAdd(counts1))))
 43 | 	h2 := IntsToStr(Dimensionality(merge(hashcodeAndAdd(counts2))))
 44 | 
 45 | 	// 计算h1, h2的汉明距离
 46 | 	Hamming := Hamming{}
 47 | 	//fmt.Printf("h1: %s\nh2: %s\n", h1, h2)
 48 | 
 49 | 	return Hamming.CompareUtf8(h1, h2)
 50 | 
 51 | }
 52 | 
 53 | // 降维度
 54 | func Dimensionality(ins []int) []int {
 55 | 	for i := 0; i < len(ins); i++ {
 56 | 		if ins[i] > 0 {
 57 | 			ins[i] = 1
 58 | 		} else {
 59 | 			ins[i] = 0
 60 | 		}
 61 | 
 62 | 	}
 63 | 	return ins
 64 | }
 65 | 
 66 | //合并
 67 | func merge(ins [][]int) []int {
 68 | 	res := make([]int, len(ins[0]))
 69 | 	lens := len(ins)
 70 | 	for i := 0; i < lens; i++ {
 71 | 		for j := 0; j < len(ins[i]); j++ {
 72 | 			res[j] += ins[i][j]
 73 | 		}
 74 | 	}
 75 | 	return res
 76 | }
 77 | 
 78 | // 计算hashcode并加权
 79 | func hashcodeAndAdd(counts map[string]int) [][]int {
 80 | 	// hashmap
 81 | 	lens := len(counts)
 82 | 	h1 := make([][]int, lens)
 83 | 	// 计算counts1,counts2 中每个字符的hash值, 并且将出现的次数分为5个等级, 将每个字符的hash值与出现的次数等级相乘
 84 | 	c1 := (lens - 1) * 4.0
 85 | 	j := 0
 86 | 	//for j := 0; j < lens; j++ {
 87 | 	for k, v := range counts {
 88 | 		////计算每一个字符串的hash
 89 | 		//for i := 0; i < len(h1); i++ {
 90 | 		// 出现的次数除以5
 91 | 		c := strconv.FormatUint(uint64(crc32.ChecksumIEEE([]byte(k))), 2)
 92 | 		// 将字符串转换为数字数组
 93 | 		cs := Int32StrToInts(c)
 94 | 		if v <= c1/5.0 {
 95 | 			// 加权
 96 | 			h1[j] = Add(cs, 1)
 97 | 		} else if v <= c1/5.0*2 {
 98 | 			// 加权
 99 | 			h1[j] = Add(cs, 2)
100 | 		} else if v <= c1/5.0*3 {
101 | 			// 加权
102 | 			h1[j] = Add(cs, 3)
103 | 		} else if v <= c1/5.0*4 {
104 | 			// 加权
105 | 			h1[j] = Add(cs, 4)
106 | 		} else {
107 | 			// 加权
108 | 			h1[j] = Add(cs, 5)
109 | 		}
110 | 		j++
111 | 	}
112 | 
113 | 	return h1
114 | }
115 | 


--------------------------------------------------------------------------------
/similarity/utils.go:
--------------------------------------------------------------------------------
  1 | package similarity
  2 | 
  3 | import (
  4 | 	"encoding/base64"
  5 | 	"reflect"
  6 | 	"strconv"
  7 | 	"unsafe"
  8 | )
  9 | 
// base64Table is the 64-character alphabet handed to base64.NewEncoding:
// upper-case letters, lower-case letters, digits, then '+' and '/'.
const base64Table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
 11 | 
 12 | func min(x, y int) int {
 13 | 	if x < y {
 14 | 		return x
 15 | 	}
 16 | 	return y
 17 | }
 18 | 
 19 | func max(x, y int) int {
 20 | 	if x < y {
 21 | 		return y
 22 | 	}
 23 | 	return x
 24 | }
 25 | 
// StringToBytes returns a byte slice that points at the bytes of s without
// copying them: the slice header's Data is taken from the string header and
// Len and Cap are both set to the string's length.
// NOTE(review): the result aliases the string's storage, so callers
// presumably must treat it as read-only — confirm at call sites.
func StringToBytes(s string) (b []byte) {
	// View the named result b through its slice header so it can be filled in place.
	bh := (*reflect.SliceHeader)(unsafe.Pointer(&b))
	// Copy of the string header of s (data pointer and length).
	sh := *(*reflect.StringHeader)(unsafe.Pointer(&s))
	bh.Data = sh.Data
	bh.Len = sh.Len
	bh.Cap = sh.Len
	return b
}
 34 | 
 35 | // Base64Encode encodes a byte slice to a base64 string.
 36 | func Base64Encode(s string) string {
 37 | 	base := base64.NewEncoding(base64Table)
 38 | 	bytes := StringToBytes(s)
 39 | 	return base.EncodeToString(bytes)
 40 | }
 41 | 
 42 | // StrToStrs 字符串转化字符数组
 43 | func StrToStrs(s string, lenth int) []string {
 44 | 	base := make([]string, lenth)
 45 | 	for i := 0; i < lenth; i++ {
 46 | 		base[i] = string(s[i])
 47 | 	}
 48 | 	return base
 49 | }
 50 | 
 51 | // StrToStrs4 每隔四个字符转换成一个字符串
 52 | func StrToStrs4(s string, lenth int) []string {
 53 | 	base := make([]string, lenth/4)
 54 | 	var j = 0
 55 | 	for i := 0; i < lenth; i += 4 {
 56 | 		//base = append(base, s[i:i+4])
 57 | 		base[j] = s[i : i+4]
 58 | 		j++
 59 | 	}
 60 | 	return base
 61 | }
 62 | 
 63 | // Add 加权
 64 | func Add(uint64 []int, int int) []int {
 65 | 	lens := len(uint64)
 66 | 	for i := 0; i < 32; i++ {
 67 | 		if i < lens {
 68 | 			if uint64[i] == 1 {
 69 | 				uint64[i] = int
 70 | 			} else {
 71 | 				uint64[i] = -int
 72 | 			}
 73 | 		} else {
 74 | 			uint64 = append(uint64, int)
 75 | 		}
 76 | 
 77 | 	}
 78 | 	return uint64
 79 | }
 80 | 
 81 | // Int32StrToInts   将uint64转换成string
 82 | func Int32StrToInts(ins string) []int {
 83 | 	uints := make([]int, 32)
 84 | 
 85 | 	for i := 0; i < len(ins); i++ {
 86 | 		if string(ins[i]) == "1" {
 87 | 			uints[i] = 1
 88 | 		} else if string(ins[i]) == "0" {
 89 | 			uints[i] = 0
 90 | 		}
 91 | 	}
 92 | 	return uints
 93 | 
 94 | }
 95 | 
 96 | // IntsToStr []int 转换成string
 97 | func IntsToStr(ins []int) string {
 98 | 	res := ""
 99 | 	for _, v := range ins {
100 | 		res += strconv.Itoa(v)
101 | 	}
102 | 
103 | 	return res
104 | }
105 | 


--------------------------------------------------------------------------------
/strsim.go:
--------------------------------------------------------------------------------
 1 | package strsim
 2 | 
 3 | import (
 4 | 	"github.com/antlabs/strsim/similarity"
 5 | )
 6 | 
 7 | // 比较两个字符串相似度
 8 | func Compare(s1, s2 string, opts ...Option) float64 {
 9 | 	var o option
10 | 
11 | 	o.fillOption(opts...)
12 | 
13 | 	return compare(s1, s2, &o)
14 | }
15 | 
16 | // 返回相似度最高的那个字符串
17 | func FindBestMatchOne(s string, targets []string, opts ...Option) *similarity.Match {
18 | 	r := findBestMatch(s, targets, opts...)
19 | 	return r.Match
20 | }
21 | 
22 | // 返回相似度最高的那个字符串, 以及索引位置
23 | func FindBestMatch(s string, targets []string, opts ...Option) *similarity.MatchResult {
24 | 	return findBestMatch(s, targets, opts...)
25 | }
26 | 


--------------------------------------------------------------------------------
/strsim_priv.go:
--------------------------------------------------------------------------------
 1 | package strsim
 2 | 
 3 | import "github.com/antlabs/strsim/similarity"
 4 | 
 5 | // 比较两个字符串内部函数
 6 | func compare(s1, s2 string, o *option) float64 {
 7 | 	if s, e := modifyStrAndCheck(o, &s1, &s2); e {
 8 | 		return s
 9 | 	}
10 | 
11 | 	return o.cmp(s1, s2)
12 | }
13 | 
14 | // 前处理主要涉及，修改字符串，和边界判断
15 | func modifyStrAndCheck(o *option, s1, s2 *string) (score float64, exit bool) {
16 | 	modifyString(o, s1)
17 | 	modifyString(o, s2)
18 | 	modifyStrToBase64Str(o, s1)
19 | 	modifyStrToBase64Str(o, s2)
20 | 
21 | 	return check(*s1, *s2)
22 | }
23 | 
24 | // 记录每个targets子串的相似度打分，并且返回相似度最高的那个字符串, 内部函数
25 | func findBestMatch(s string, targets []string, opts ...Option) *similarity.MatchResult {
26 | 
27 | 	var opt option
28 | 	opt.fillOption(opts...)
29 | 
30 | 	match := make([]*similarity.Match, 0, len(targets))
31 | 	bestIndex := 0
32 | 	for k, s2 := range targets {
33 | 
34 | 		score := compare(s, s2, &opt)
35 | 
36 | 		//fmt.Printf("score:%f(%s)(%s)\n", score, s, s2)
37 | 		match = append(match, &similarity.Match{S: s2, Score: score})
38 | 
39 | 		if k == 0 {
40 | 			continue
41 | 		}
42 | 
43 | 		if score > match[bestIndex].Score {
44 | 			bestIndex = k
45 | 		}
46 | 	}
47 | 
48 | 	return &similarity.MatchResult{AllResult: match, Match: match[bestIndex], BestIndex: bestIndex}
49 | }
50 | 


--------------------------------------------------------------------------------
/strsim_test.go:
--------------------------------------------------------------------------------
 1 | package strsim
 2 | 
 3 | import (
 4 | 	"testing"
 5 | )
 6 | 
 7 | func Test_Compare_Special(t *testing.T) {
 8 | 
 9 | 	for _, v := range []testCase{
10 | 		{arg1: "", arg2: "", sim: 1},
11 | 		{arg1: "1", arg2: "", sim: 0},
12 | 		{arg1: "", arg2: "1", sim: 0},
13 | 	} {
14 | 		for _, o := range []Option{
15 | 			Default(),
16 | 			Jaro(),
17 | 			DiceCoefficient(1),
18 | 			Hamming(),
19 | 			Simhash(),
20 | 			Cosine(),
21 | 			JaroWinkler(),
22 | 		} {
23 | 			sim := Compare(v.arg1, v.arg2, o)
24 | 			if sim != v.sim {
25 | 				t.Fatalf("Compare(%q, %q) with option: got %v, want %v", v.arg1, v.arg2, sim, v.sim)
26 | 			}
27 | 		}
28 | 	}
29 | }
30 | 
// bestTest is a table entry for the best-match tests.
type bestTest struct {
	best []string // candidates to search
	key  string   // query string
	need string   // candidate expected to be chosen
}
36 | 
37 | func Test_FindBestMatchOne(t *testing.T) {
38 | 	for _, d := range []bestTest{
39 | 		{best: []string{"朝辞白帝彩云间", "千里江陵一日还", "两岸猿声啼不住", "轻舟已过万重山"}, key: "千里还", need: "千里江陵一日还"},
40 | 	} {
41 | 		for _, o := range []Option{
42 | 			DiceCoefficient(1),
43 | 			Jaro(),
44 | 			Default(),
45 | 			Simhash(),
46 | 			Cosine(),
47 | 			JaroWinkler(),
48 | 		} {
49 | 			m := FindBestMatchOne(d.key, d.best, o)
50 | 			if m.S != d.need {
51 | 				t.Fatalf("FindBestMatchOne(%q, %v) with option: got %q, want %q", d.key, d.best, m.S, d.need)
52 | 			}
53 | 		}
54 | 	}
55 | }
56 | 
57 | func Test_FindBestMatch(t *testing.T) {
58 | 	for _, d := range []bestTest{
59 | 		{best: []string{"朝辞白帝彩云间", "千里江陵一日还", "两岸猿声啼不住", "轻舟已过万重山"}, key: "千里还", need: "千里江陵一日还"},
60 | 	} {
61 | 		for _, o := range []Option{
62 | 			DiceCoefficient(1),
63 | 			Jaro(),
64 | 			Default(),
65 | 			Simhash(),
66 | 			Cosine(),
67 | 			JaroWinkler(),
68 | 		} {
69 | 			m := FindBestMatch(d.key, d.best, o)
70 | 			if m.Match.S != d.need {
71 | 				t.Fatalf("FindBestMatch(%q, %v) with option: got %q, want %q", d.key, d.best, m.Match.S, d.need)
72 | 			}
73 | 		}
74 | 	}
75 | }
76 | 


--------------------------------------------------------------------------------