├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.md │ ├── feature-request.md │ └── general-question.md └── workflows │ └── go.yml ├── .gitignore ├── LICENSE ├── README.md ├── check.go ├── cosine_conf.go ├── default_conf.go ├── dice_distance_conf.go ├── go.mod ├── go.sum ├── hamming_conf.go ├── jaro_conf.go ├── jaro_winkler_conf.go ├── prev_modify.go ├── prev_modify_test.go ├── public_config.go ├── simhash_conf.go ├── similarity ├── Cosine.go ├── best_result.go ├── dice_coefficient.go ├── dice_coefficient_test.go ├── edit_distance.go ├── edit_distance_test.go ├── hamming.go ├── hamming_test.go ├── jaro.go ├── jaro_test.go ├── jaro_winkler.go ├── simhash.go └── utils.go ├── strsim.go ├── strsim_priv.go └── strsim_test.go /.github/ISSUE_TEMPLATE/bug-report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F91D Bug Report" 3 | about: As a User, I want to report a Bug. 4 | labels: type/bug 5 | --- 6 | 7 | ## Bug Report 8 | 9 | Please answer these questions before submitting your issue. Thanks! 10 | 11 | ### 1. Minimal reproduce step (Required) 12 | 13 | 14 | 15 | ### 2. What did you expect to see? (Required) 16 | 17 | ### 3. What did you see instead (Required) 18 | 19 | ### 4. What is your strsim version? (Required) 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F44F Feature Request" 3 | about: As a user, I want to request a New Feature on the product. 4 | labels: type/feature-request 5 | --- 6 | 7 | ## Feature Request 8 | 9 | **Is your feature request related to a problem? 
Please describe:** 10 | 11 | 12 | **Describe the feature you'd like:** 13 | 14 | 15 | **Describe alternatives you've considered:** 16 | 17 | 18 | **Teachability, Documentation, Adoption, Migration Strategy:** 19 | 20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/general-question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: "\U0001F600 Ask a Question" 3 | about: I want to ask a question. 4 | labels: type/question 5 | --- 6 | 7 | ## General Question 8 | 9 | -------------------------------------------------------------------------------- /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | 9 | build: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | go: [ '1.13', '1.14'] 14 | name: Go ${{ matrix.go }} sample 15 | 16 | steps: 17 | 18 | - name: Set up Go 1.13 19 | uses: actions/setup-go@v1 20 | with: 21 | go-version: ${{ matrix.go }} 22 | id: go 23 | 24 | - name: Check out code into the Go module directory 25 | uses: actions/checkout@v1 26 | 27 | - name: Get dependencies 28 | run: | 29 | go get -v -t -d ./... 30 | if [ -f Gopkg.toml ]; then 31 | curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh 32 | dep ensure 33 | fi 34 | 35 | - name: Build 36 | run: go build -v . 37 | 38 | - name: Test 39 | run: go test -v -coverprofile='coverage.out' -covermode=count ./... 
40 | 41 | - name: Upload Coverage report 42 | uses: codecov/codecov-action@v1 43 | with: 44 | token: ${{secrets.CODECOV_TOKEN}} 45 | file: ./coverage.out 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *swp 3 | *~ 4 | *.exe 5 | *.exe~ 6 | *.dll 7 | *.so 8 | *.dylib 9 | 10 | # Test binary, built with `go test -c` 11 | *.test 12 | 13 | # Output of the go coverage tool, specifically when used with LiteIDE 14 | *.out 15 | 16 | # Dependency directories (remove the comment below to include it) 17 | # vendor/ 18 | 19 | .idea/ 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## strsim 2 | strsim是golang实现的字符串相似度库,后端集成多种算法,主要解决现有相似度库不能很好地处理中文的问题 3 | 4 | [![Go](https://github.com/antlabs/strsim/workflows/Go/badge.svg)](https://github.com/antlabs/strsim/actions) 5 | [![codecov](https://codecov.io/gh/antlabs/strsim/branch/master/graph/badge.svg)](https://codecov.io/gh/antlabs/strsim) 6 | 7 | ## 构架 8 | ![strsim.png](https://github.com/guonaihong/images/blob/master/strsim/strsim.png?raw=true) 9 | 10 | 11 | 12 | ## 使用方式 13 | 14 | ```go 15 | go get -u github.com/antlabs/strsim 16 | ``` 17 | 18 | 19 | 20 | 21 | 22 | ## 功能 23 | * 可以忽略空白字符 24 | * 可以忽略大小写 25 | ### 多种算法支持 26 | * 莱文斯坦-编辑距离(Levenshtein) 27 | * Hamming 28 | * Dice's coefficient 29 | * Jaro 30 | * JaroWinkler 31 | * Cosine 32 | * Simhash 33 | 34 | ## 内容 35 | - [比较两个字符串相识度](#比较两个字符串相识度) 36 | - [从字符串数组里面找到相似度最高的字符串](#从数组里找到相似度最高的字符串) 37 | - 
// check handles the trivial comparisons that need no similarity algorithm.
//
// It reports exit == true together with the final score when the answer is
// already known: identical strings score 1.0 (this includes two empty
// strings), and a pair in which exactly one string is empty scores 0.0.
// For every other pair it returns (0, false) so the caller runs the
// configured algorithm.
func check(s1, s2 string) (score float64, exit bool) {
	switch {
	case s1 == s2:
		return 1.0, true
	case s1 == "", s2 == "":
		return 0.0, true
	default:
		return 0, false
	}
}
-------------------------------------------------------------------------------- 1 | package strsim 2 | 3 | import "github.com/antlabs/strsim/similarity" 4 | 5 | // CosineConf is a configuration struct for Cosine similarity. 6 | 7 | func Cosine() OptionFunc { 8 | 9 | return OptionFunc(func(o *option) { 10 | if o.cmp == nil { 11 | l := similarity.Cosine{} 12 | o.base64 = true 13 | o.cmp = l.CompareUtf8 14 | if o.ascii { 15 | o.cmp = l.CompareAscii 16 | } 17 | } 18 | }) 19 | 20 | } 21 | -------------------------------------------------------------------------------- /default_conf.go: -------------------------------------------------------------------------------- 1 | package strsim 2 | 3 | import ( 4 | "github.com/antlabs/strsim/similarity" 5 | ) 6 | 7 | func Default() OptionFunc { 8 | return OptionFunc(func(o *option) { 9 | if o.cmp == nil { 10 | l := similarity.EditDistance{} 11 | o.cmp = l.CompareUtf8 12 | if o.ascii { 13 | o.cmp = l.CompareAscii 14 | } 15 | } 16 | }) 17 | } 18 | -------------------------------------------------------------------------------- /dice_distance_conf.go: -------------------------------------------------------------------------------- 1 | package strsim 2 | 3 | import ( 4 | "github.com/antlabs/strsim/similarity" 5 | ) 6 | 7 | // ngram 是筛子系数需要用的一个值 8 | func DiceCoefficient(ngram ...int) OptionFunc { 9 | return OptionFunc(func(o *option) { 10 | ngram2 := 2 11 | if len(ngram) > 0 { 12 | ngram2 = ngram[0] 13 | } 14 | 15 | d := &similarity.DiceCoefficient{Ngram: ngram2} 16 | o.cmp = d.CompareUtf8 17 | }) 18 | } 19 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/antlabs/strsim 2 | 3 | go 1.14 4 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/antlabs/strsim/113a66378916cb00a5b68e728ba52b5783a864dc/go.sum -------------------------------------------------------------------------------- /hamming_conf.go: -------------------------------------------------------------------------------- 1 | package strsim 2 | 3 | import ( 4 | "github.com/antlabs/strsim/similarity" 5 | ) 6 | 7 | func Hamming() OptionFunc { 8 | return OptionFunc(func(o *option) { 9 | 10 | h := &similarity.Hamming{} 11 | o.cmp = h.CompareUtf8 12 | if o.ascii { 13 | o.cmp = h.CompareAscii 14 | } 15 | }) 16 | } 17 | -------------------------------------------------------------------------------- /jaro_conf.go: -------------------------------------------------------------------------------- 1 | package strsim 2 | 3 | import ( 4 | "github.com/antlabs/strsim/similarity" 5 | ) 6 | 7 | // ngram 是筛子系数需要用的一个值 8 | func Jaro(matchWindow ...int) OptionFunc { 9 | return OptionFunc(func(o *option) { 10 | mw := 0 11 | if len(matchWindow) > 0 { 12 | mw = matchWindow[0] 13 | } 14 | d := &similarity.Jaro{MatchWindow: mw} 15 | o.cmp = d.CompareUtf8 16 | }) 17 | } 18 | -------------------------------------------------------------------------------- /jaro_winkler_conf.go: -------------------------------------------------------------------------------- 1 | package strsim 2 | 3 | import "github.com/antlabs/strsim/similarity" 4 | 5 | // JaroWinkler ngram 是筛子系数需要用的一个值 6 | func JaroWinkler(matchWindow ...int) OptionFunc { 7 | return OptionFunc(func(o *option) { 8 | mw := 0 9 | if len(matchWindow) > 0 { 10 | mw = matchWindow[0] 11 | } 12 | d := &similarity.JaroWinkler{MatchWindow: mw} 13 | o.cmp = d.CompareUtf8 14 | }) 15 | } 16 | -------------------------------------------------------------------------------- /prev_modify.go: -------------------------------------------------------------------------------- 1 | package strsim 2 | 3 | import ( 4 | "github.com/antlabs/strsim/similarity" 5 | "strings" 6 | ) 7 | 8 | const ( 9 | ignoreCase = 1 
<< iota 10 | ignoreSpace 11 | ) 12 | 13 | var replace = strings.NewReplacer("\r", "", "\n", "", "\t", "", "\f", "", " ", "") 14 | 15 | var modiyTab = map[int]func(s *string){ 16 | 17 | ignoreCase: func(s *string) { 18 | *s = strings.ToLower(*s) 19 | }, 20 | 21 | ignoreSpace: func(s *string) { 22 | *s = replace.Replace(*s) 23 | }, 24 | } 25 | 26 | func modifyString(o *option, s *string) { 27 | for i := 1; i <= ignoreSpace; i <<= 1 { 28 | if i&o.ignore > 0 { 29 | modiyTab[i](s) 30 | } 31 | } 32 | } 33 | 34 | func modifyStrToBase64Str(o *option, s *string) { 35 | if o.base64 { 36 | // 将字符串转换为base64编码 37 | *s = similarity.Base64Encode(*s) 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /prev_modify_test.go: -------------------------------------------------------------------------------- 1 | package strsim 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | type testCase struct { 8 | test string 9 | need string 10 | arg1 string 11 | arg2 string 12 | sim float64 13 | 14 | opt Option 15 | } 16 | 17 | func Test_ModifyString(t *testing.T) { 18 | var o option 19 | 20 | o.ignore |= ignoreCase 21 | o.ignore |= ignoreSpace 22 | o.base64 = true 23 | 24 | for _, v := range []testCase{ 25 | { 26 | test: "hello world", 27 | need: "helloworld", 28 | }, 29 | } { 30 | modifyString(&o, &v.test) 31 | if v.test != v.need { 32 | t.Fatalf("modifyString: got %q, want %q", v.test, v.need) 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /public_config.go: -------------------------------------------------------------------------------- 1 | package strsim 2 | 3 | type option struct { 4 | ignore int // 5 | ascii bool // 设置选用ascii还是utf8方式执行算法 6 | cmp func(s1, s2 string) float64 7 | base64 bool // 设置是否使用base64算法 8 | } 9 | 10 | // 调用Option接口设置option 11 | func (o *option) fillOption(opts ...Option) { 12 | for _, opt := range opts { 13 | opt.Apply(o) 14 | } 15 | 16 | opt := Default() 17 | 
opt.Apply(o) 18 | } 19 | 20 | type Option interface { 21 | Apply(*option) 22 | } 23 | 24 | type OptionFunc func(*option) 25 | 26 | func (o OptionFunc) Apply(opt *option) { 27 | o(opt) 28 | } 29 | 30 | //忽略大小写 31 | func IgnoreCase() OptionFunc { 32 | return OptionFunc(func(o *option) { 33 | o.ignore |= ignoreCase 34 | }) 35 | } 36 | 37 | //忽略空白字符 38 | func IgnoreSpace() OptionFunc { 39 | return OptionFunc(func(o *option) { 40 | o.ignore |= ignoreSpace 41 | }) 42 | } 43 | 44 | //使用ascii编码 45 | func UseASCII() OptionFunc { 46 | return OptionFunc(func(o *option) { 47 | o.ascii = true 48 | }) 49 | } 50 | 51 | // UseBase64 使用base64编码 52 | func UseBase64() OptionFunc { 53 | return OptionFunc(func(o *option) { 54 | o.base64 = true 55 | }) 56 | } 57 | -------------------------------------------------------------------------------- /simhash_conf.go: -------------------------------------------------------------------------------- 1 | package strsim 2 | 3 | import "github.com/antlabs/strsim/similarity" 4 | 5 | func Simhash() OptionFunc { 6 | return OptionFunc(func(o *option) { 7 | if o.cmp == nil { 8 | l := similarity.Simhash{} 9 | o.base64 = true 10 | o.cmp = l.CompareUtf8 11 | if o.ascii { 12 | o.cmp = l.CompareAscii 13 | } 14 | } 15 | }) 16 | 17 | } 18 | -------------------------------------------------------------------------------- /similarity/Cosine.go: -------------------------------------------------------------------------------- 1 | package similarity 2 | 3 | import ( 4 | "math" 5 | "unicode/utf8" 6 | ) 7 | 8 | // Cosine similarity algorithm implementation. 
9 | type Cosine struct { 10 | } 11 | 12 | func (c Cosine) CompareAscii(s1, s2 string) float64 { 13 | return c.CompareUtf8(s1, s2) 14 | } 15 | 16 | func (c Cosine) CompareUtf8(utf8Str1, utf8Str2 string) float64 { 17 | l1 := utf8.RuneCountInString(utf8Str1) 18 | l2 := utf8.RuneCountInString(utf8Str2) 19 | //l1 := len(utf8Str1) 20 | //l2 := len(utf8Str2) 21 | l3 := utf8.RuneCountInString(base64Table) 22 | dirts1 := make(map[string]int, l3) 23 | dirts2 := make(map[string]int, l3) 24 | // 将base64Table转化成[]string 25 | base64 := StrToStrs(base64Table, l3) 26 | // 遍历base64对dirts1和dirts2进行初始化 27 | for _, v := range base64 { 28 | dirts1[v] = 0 29 | dirts2[v] = 0 30 | } 31 | // 将s1和s2分别转化成[]string 32 | s1s := StrToStrs(utf8Str1, l1) 33 | s2s := StrToStrs(utf8Str2, l2) 34 | // 遍历s1s和s2s 35 | for _, v := range s1s { 36 | dirts1[v]++ 37 | } 38 | for _, v := range s2s { 39 | dirts2[v]++ 40 | 41 | } 42 | // 计算s1s和s2s的向量的余弦值 43 | var sum1, sum2, sum3 float64 44 | for _, v := range base64 { 45 | sum1 += float64(dirts1[v]) * float64(dirts1[v]) 46 | sum2 += float64(dirts2[v]) * float64(dirts2[v]) 47 | sum3 += float64(dirts1[v]) * float64(dirts2[v]) 48 | } 49 | 50 | return sum3 / (math.Sqrt(sum1) * math.Sqrt(sum2)) 51 | 52 | } 53 | -------------------------------------------------------------------------------- /similarity/best_result.go: -------------------------------------------------------------------------------- 1 | package similarity 2 | 3 | type Match struct { 4 | S string 5 | Score float64 6 | } 7 | 8 | type MatchResult struct { 9 | AllResult []*Match 10 | Match *Match 11 | BestIndex int 12 | } 13 | 14 | type Compare func(s1, s2 string) float64 15 | 16 | func findBestMatch(s string, targets []string, compare Compare) *MatchResult { 17 | match := make([]*Match, 0, len(targets)) 18 | bestIndex := 0 19 | for k, s2 := range targets { 20 | score := compare(s, s2) 21 | match = append(match, &Match{S: s2, Score: score}) 22 | 23 | if k == 0 { 24 | continue 25 | } 26 | 27 | if score > 
// DiceCoefficient compares two strings with the Sørensen–Dice coefficient
// computed over rune n-grams.
type DiceCoefficient struct {
	// Ngram is the number of runes per gram; 0 selects the default of 2.
	Ngram int

	// The fields below only record details of the last comparison so the
	// package's tests can inspect them.
	l1    int
	l2    int
	mixed int
	key   []string
	test  bool
}

// value counts how often one n-gram occurred in the first and in the second
// string.
type value struct {
	s1Count int
	s2Count int
}

// CompareAscii is identical to CompareUtf8.
func (d *DiceCoefficient) CompareAscii(s1, s2 string) float64 {
	return d.CompareUtf8(s1, s2)
}

// setOrGet walks over every n-gram of s.
//
// With add == true each gram's s1Count in set is incremented. With
// add == false only grams already present in set are updated: their s2Count
// is incremented and the gram counts as shared while s2Count does not exceed
// s1Count. It returns the number of shared grams (always 0 when add is true)
// and the total number of grams found in s.
func (d *DiceCoefficient) setOrGet(set map[string]value, s string, add bool) (mixed, l int) {
	ngram := d.Ngram
	if ngram == 0 {
		ngram = 2
	}

	var gram strings.Builder
	for i := 0; i < len(s); {
		gram.Reset()

		// Collect up to ngram runes starting at byte offset i and remember
		// the width of the first one: the window advances by a single rune.
		firstSize, runes := 0, 0
		for total := 0; runes < ngram && i+total < len(s); runes++ {
			r, size := utf8.DecodeRuneInString(s[i+total:])
			gram.WriteRune(r)
			total += size
			if runes == 0 {
				firstSize = size
			}
		}
		// Fewer runes left than a full gram: nothing more to collect.
		if runes != ngram {
			break
		}

		key := gram.String()
		if d.test {
			d.key = append(d.key, key)
		}
		l++
		i += firstSize

		entry, seen := set[key]
		if add {
			entry.s1Count++
			set[key] = entry
			continue
		}
		if !seen {
			continue
		}
		entry.s2Count++
		if entry.s1Count >= entry.s2Count {
			mixed++
		}
		set[key] = entry
	}

	return mixed, l
}

// CompareUtf8 returns 2*shared/(grams(s1)+grams(s2)), a value in [0, 1].
//
// When neither string is long enough to form a single n-gram the formula
// would divide zero by zero; in that case the result is 1 if the strings are
// equal and 0 otherwise, so callers never receive NaN.
func (d *DiceCoefficient) CompareUtf8(s1, s2 string) float64 {
	set := make(map[string]value, len(s1)/3)

	_, l1 := d.setOrGet(set, s1, true)
	mixed, l2 := d.setOrGet(set, s2, false)

	d.l1, d.l2, d.mixed = l1, l2, mixed

	if l1+l2 == 0 {
		if s1 == s2 {
			return 1
		}
		return 0
	}
	return 2.0 * float64(mixed) / float64(l1+l2)
}
// EditDistance scores two strings by their Levenshtein distance: the minimum
// number of single-element insertions, deletions and substitutions needed to
// turn one string into the other.
type EditDistance struct {
	// mixed is the raw edit distance found by the most recent comparison.
	// It is only read by tests.
	mixed int
}

// CompareAscii returns the similarity of s1 and s2 in the range [0, 1],
// comparing the strings byte by byte.
//
// The score is 1 - distance/max(len(s1), len(s2)). Two empty strings are
// identical and score 1; an empty string against a non-empty one scores 0.
func (e *EditDistance) CompareAscii(s1, s2 string) float64 {
	cols := len(s2)

	// row[x] is the distance between the first y bytes of s1 and the first x
	// bytes of s2. It starts out as the y = 0 row, where the distance is x
	// insertions.
	row := make([]int, cols+1)
	for x := range row {
		row[x] = x
	}

	for y := 1; y <= len(s1); y++ {
		diagonal := row[0] // value for (y-1, x-1)
		row[0] = y         // y deletions turn the prefix of s1 into ""
		for x := 1; x <= cols; x++ {
			// Substitution, free when the two bytes already agree.
			cost := diagonal
			if s1[y-1] != s2[x-1] {
				cost++
			}
			// Deletion: row[x] still holds the value for (y-1, x).
			if up := row[x] + 1; up < cost {
				cost = up
			}
			// Insertion: row[x-1] already holds the value for (y, x-1).
			if left := row[x-1] + 1; left < cost {
				cost = left
			}
			diagonal = row[x]
			row[x] = cost
		}
	}

	return e.score(row[cols], len(s1), cols)
}

// CompareUtf8 returns the similarity of utf8Str1 and utf8Str2 in the range
// [0, 1], comparing the strings rune by rune.
//
// The score is 1 - distance/max(rune counts). Two empty strings are identical
// and score 1; an empty string against a non-empty one scores 0.
func (e *EditDistance) CompareUtf8(utf8Str1, utf8Str2 string) float64 {
	r1 := []rune(utf8Str1)
	r2 := []rune(utf8Str2)
	cols := len(r2)

	// Same single-row dynamic programme as CompareAscii, over runes.
	row := make([]int, cols+1)
	for x := range row {
		row[x] = x
	}

	for y := 1; y <= len(r1); y++ {
		diagonal := row[0]
		row[0] = y
		for x := 1; x <= cols; x++ {
			cost := diagonal
			if r1[y-1] != r2[x-1] {
				cost++
			}
			if up := row[x] + 1; up < cost {
				cost = up
			}
			if left := row[x-1] + 1; left < cost {
				cost = left
			}
			diagonal = row[x]
			row[x] = cost
		}
	}

	return e.score(row[cols], len(r1), cols)
}

// score records distance for tests and normalises it by the longer of the two
// operand lengths.
func (e *EditDistance) score(distance, l1, l2 int) float64 {
	e.mixed = distance

	longest := l1
	if l2 > longest {
		longest = l2
	}
	if longest == 0 {
		// Both operands are empty: identical, and dividing would give NaN.
		return 1
	}
	return 1.0 - float64(distance)/float64(longest)
}
type testOneCase struct { 8 | s1 string 9 | s2 string 10 | cost float64 11 | ngram int 12 | mixed int 13 | match int 14 | } 15 | 16 | type testBestCase struct { 17 | s string 18 | targets []string 19 | bestIndex int 20 | } 21 | 22 | func Test_EditDistance_CompareAscii(t *testing.T) { 23 | e := &EditDistance{} 24 | 25 | for k, v := range []testOneCase{ 26 | {s1: "ivan1", s2: "ivan2", cost: 0.8, mixed: 1}, 27 | {s1: "love", s2: "love", cost: 1, mixed: 0}, 28 | {s1: "kitten", s2: "sitting", cost: 1 - 3/7.0, mixed: 3}, 29 | {s1: "12", s2: "1", cost: 0.5, mixed: 1}, 30 | {s1: "1", s2: "12", cost: 0.5, mixed: 1}, 31 | {s1: "123", s2: "1", cost: 0.33333333333333337, mixed: 2}, 32 | {s1: "1", s2: "123", cost: 0.33333333333333337, mixed: 2}, 33 | {s1: "1234", s2: "1", cost: 0.25, mixed: 3}, 34 | {s1: "1", s2: "1234", cost: 0.25, mixed: 3}, 35 | } { 36 | s := e.CompareAscii(v.s1, v.s2) 37 | if s != v.cost { 38 | t.Fatalf("cost:error case:%d: got %v, want %v", k, s, v.cost) 39 | } 40 | if e.mixed != v.mixed { 41 | t.Fatalf("mixed:error case:%d: got %v, want %v", k, e.mixed, v.mixed) 42 | } 43 | } 44 | } 45 | 46 | func Test_EditDistance_CompareUtf8(t *testing.T) { 47 | e := &EditDistance{} 48 | 49 | for k, v := range []testOneCase{ 50 | {s1: "你好中国", s2: "你好中国", cost: 1, mixed: 0}, 51 | {s1: "加油,来个", s2: "加油,来", cost: 0.8, mixed: 1}, 52 | {s1: "一二三三四五", s2: "六二三三二五七", cost: 1 - 3/7.0, mixed: 3}, 53 | {s1: "一二", s2: "一", cost: 0.5, mixed: 1}, 54 | {s1: "一", s2: "一二", cost: 0.5, mixed: 1}, 55 | {s1: "一二三", s2: "一", cost: 0.33333333333333337, mixed: 2}, 56 | {s1: "一", s2: "一二三", cost: 0.33333333333333337, mixed: 2}, 57 | {s1: "一二三四", s2: "一", cost: 0.25, mixed: 3}, 58 | {s1: "一", s2: "一二三四", cost: 0.25, mixed: 3}, 59 | {s1: "中文也被称为华文、汉文。中文(汉语)有标准语和方言之分,其标准语即汉语普通话", s2: "方块", cost: 0.02631578947368418, mixed: 37}, 60 | } { 61 | s := e.CompareUtf8(v.s1, v.s2) 62 | if s != v.cost { 63 | t.Fatalf("cost:error case:%d: got %v, want %v", k, s, v.cost) 64 | } 65 | if e.mixed != v.mixed { 
// Hamming scores two strings by position-wise comparison. Every differing
// position counts as one mismatch, and every position that exists in only one
// of the strings (the length difference) counts as a mismatch as well.
type Hamming struct{}

// CompareAscii returns the similarity of s1 and s2 in the range [0, 1],
// comparing them byte by byte: 1 - (mismatches + |len(s1)-len(s2)|) / longest.
// Two empty strings are identical and score 1.
func (h *Hamming) CompareAscii(s1, s2 string) float64 {
	longest, shortest := len(s1), len(s2)
	if longest < shortest {
		longest, shortest = shortest, longest
	}
	if longest == 0 {
		// Both strings are empty; dividing by the length would yield NaN.
		return 1
	}

	mismatches := 0
	for i := 0; i < shortest; i++ {
		if s1[i] != s2[i] {
			mismatches++
		}
	}

	return 1 - (float64(mismatches)+math.Abs(float64(len(s1)-len(s2))))/float64(longest)
}

// CompareUtf8 returns the similarity of utf8Str1 and utf8Str2 in the range
// [0, 1], comparing them rune by rune:
// 1 - (mismatches + |runes1-runes2|) / longest rune count.
// Two empty strings are identical and score 1.
func (h *Hamming) CompareUtf8(utf8Str1, utf8Str2 string) float64 {
	l1 := utf8.RuneCountInString(utf8Str1)
	l2 := utf8.RuneCountInString(utf8Str2)

	longest := l1
	if longest < l2 {
		longest = l2
	}
	if longest == 0 {
		// Both strings are empty; dividing by the length would yield NaN.
		return 1
	}

	mismatches := 0
	// i and j are byte offsets; each step consumes exactly one rune from each
	// string, so the loop ends when the shorter string is exhausted.
	for i, j := 0, 0; i < len(utf8Str1) && j < len(utf8Str2); {
		r1, n1 := utf8.DecodeRuneInString(utf8Str1[i:])
		r2, n2 := utf8.DecodeRuneInString(utf8Str2[j:])
		i += n1
		j += n2

		if r1 != r2 {
			mismatches++
		}
	}

	return 1 - (float64(mismatches)+math.Abs(float64(l1-l2)))/float64(longest)
}
CompareAscii(s1, s2 string) float64 { 30 | return j.CompareUtf8(s1, s2) 31 | } 32 | 33 | func (j *Jaro) CompareUtf8(s1, s2 string) float64 { 34 | mw := max(utf8.RuneCountInString(s1), utf8.RuneCountInString(s2))/2 - 1 35 | if j.MatchWindow != 0 { 36 | mw = j.MatchWindow 37 | } 38 | 39 | m := 0 40 | 41 | matchSet := make(map[rune][]int, len(s1)/3) 42 | l1 := 0 43 | for _, c := range s1 { 44 | matchSet[c] = append(matchSet[c], l1) 45 | l1++ 46 | } 47 | 48 | t := 0 49 | l2 := 0 50 | 51 | indexAndRune1 := make([]*check, 0, 8) 52 | indexAndRune2 := make([]rune, 0, 8) 53 | 54 | defer func() { 55 | for _, v := range indexAndRune1 { 56 | checkPool.Put(v) 57 | } 58 | }() 59 | 60 | for _, c := range s2 { 61 | indexs, ok := matchSet[c] 62 | l2++ 63 | if !ok { 64 | continue 65 | } 66 | 67 | for k, i := range indexs { 68 | if i == -1 { 69 | continue 70 | } 71 | 72 | //fmt.Printf("_______c %c:%d:%d\n", c, l2-1-i, mw) 73 | if math.Abs(float64(l2-1-i)) <= float64(mw) { 74 | m++ 75 | 76 | currCheck := checkPool.Get().(*check) 77 | currCheck.index = i 78 | currCheck.c = c 79 | 80 | indexAndRune1 = append(indexAndRune1, currCheck) 81 | 82 | indexAndRune2 = append(indexAndRune2, c) 83 | 84 | indexs[k] = -1 85 | break 86 | } 87 | } 88 | } 89 | 90 | m2 := float64(m) 91 | 92 | if m2 == 0 { 93 | return 0.0 94 | } 95 | 96 | sort.Slice(indexAndRune1, func(i, j int) bool { 97 | return indexAndRune1[i].index < indexAndRune1[j].index 98 | }) 99 | 100 | for i, v := range indexAndRune1 { 101 | if v.c != indexAndRune2[i] { 102 | t++ 103 | } 104 | } 105 | 106 | j.mw = mw 107 | j.m = m 108 | j.t = t 109 | //fmt.Printf("l1:%d, l2:%d, m:%d, t:%d\n", l1, l2, m, t) 110 | return 1.0 / 3.0 * (m2/float64(l1) + m2/float64(l2) + (m2-float64(t)/2.0)/m2) 111 | } 112 | -------------------------------------------------------------------------------- /similarity/jaro_test.go: -------------------------------------------------------------------------------- 1 | package similarity 2 | 3 | import ( 4 | "fmt" 5 | 
"testing" 6 | ) 7 | 8 | func Test_Jaro_CompareAscii(t *testing.T) { 9 | j := &Jaro{} 10 | 11 | for k, v := range []testOneCase{ 12 | 13 | {s1: "bacde", s2: "abed", cost: 0.6722222222222222}, 14 | {s1: "MARTHA", s2: "MARHTA", cost: 0.9444444444444444}, 15 | {s1: "DIXON", s2: "DICKSONX", cost: 0.7666666666666666}, 16 | 17 | {s1: "JELLYFISH", s2: "SMELLYFISH", cost: 0.8962962962962964}, 18 | {s2: "JELLYFISH", s1: "SMELLYFISH", cost: 0.8962962962962964}, 19 | } { 20 | m := fmt.Sprintf("error case:%d", k) 21 | got := j.CompareAscii(v.s1, v.s2) 22 | if got != v.cost { 23 | t.Fatalf("%s: got %v, want %v", m, got, v.cost) 24 | } 25 | j.MatchWindow = 0 26 | } 27 | 28 | } 29 | 30 | func Test_Jaro_CompareUtf8(t *testing.T) { 31 | j := &Jaro{} 32 | 33 | // jaro处理两个字符串长串接近的数据会好点 34 | for k, v := range []testOneCase{ 35 | 36 | {s1: "二一三四五", s2: "一二五四", cost: 0.6722222222222222}, 37 | {s2: "二一三四五", s1: "一二五四", cost: 0.6722222222222222}, 38 | {s1: "中文也被称为华文、汉文。中文(汉语)有标准语和方言之分,其标准语即汉语普通话", s2: "中文", cost: 0.6842105263157894, match: 1}, 39 | } { 40 | m := fmt.Sprintf("error case:%d", k) 41 | got := j.CompareUtf8(v.s1, v.s2) 42 | if got != v.cost { 43 | t.Fatalf("%s: got %v, want %v", m, got, v.cost) 44 | } 45 | j.mw = 0 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /similarity/jaro_winkler.go: -------------------------------------------------------------------------------- 1 | package similarity 2 | 3 | import ( 4 | "math" 5 | "sort" 6 | "unicode/utf8" 7 | ) 8 | 9 | type JaroWinkler struct { 10 | MatchWindow int 11 | // test use 12 | mw int 13 | m int 14 | t int 15 | } 16 | 17 | func (j *JaroWinkler) CompareAscii(s1, s2 string) float64 { 18 | return j.CompareUtf8(s1, s2) 19 | } 20 | 21 | func (j *JaroWinkler) CompareUtf8(s1, s2 string) float64 { 22 | //matching window max size 23 | mw := max(utf8.RuneCountInString(s1), utf8.RuneCountInString(s2))/2 - 1 24 | if j.MatchWindow != 0 { 25 | mw = j.MatchWindow 26 | } 27 | 28 | m 
:= 0 29 | 30 | matchSet := make(map[rune][]int, len(s1)/3) 31 | l1 := 0 32 | for _, c := range s1 { 33 | matchSet[c] = append(matchSet[c], l1) 34 | l1++ 35 | } 36 | 37 | t := 0 38 | l2 := 0 39 | 40 | indexAndRune1 := make([]*check, 0, 8) 41 | indexAndRune2 := make([]rune, 0, 8) 42 | 43 | defer func() { 44 | for _, v := range indexAndRune1 { 45 | checkPool.Put(v) 46 | } 47 | }() 48 | 49 | for _, c := range s2 { 50 | indexs, ok := matchSet[c] 51 | l2++ 52 | if !ok { 53 | continue 54 | } 55 | 56 | for k, i := range indexs { 57 | if i == -1 { 58 | continue 59 | } 60 | 61 | //fmt.Printf("_______c %c:%d:%d\n", c, l2-1-i, mw) 62 | if math.Abs(float64(l2-1-i)) <= float64(mw) { 63 | m++ 64 | 65 | currCheck := checkPool.Get().(*check) 66 | currCheck.index = i 67 | currCheck.c = c 68 | 69 | indexAndRune1 = append(indexAndRune1, currCheck) 70 | 71 | indexAndRune2 = append(indexAndRune2, c) 72 | 73 | indexs[k] = -1 74 | break 75 | } 76 | } 77 | } 78 | 79 | m2 := float64(m) 80 | 81 | if m2 == 0 { 82 | return 0.0 83 | } 84 | 85 | sort.Slice(indexAndRune1, func(i, j int) bool { 86 | return indexAndRune1[i].index < indexAndRune1[j].index 87 | }) 88 | 89 | for i, v := range indexAndRune1 { 90 | if v.c != indexAndRune2[i] { 91 | t++ 92 | } 93 | } 94 | 95 | j.mw = mw 96 | j.m = m 97 | j.t = t 98 | //fmt.Printf("l1:%d, l2:%d, m:%d, t:%d\n", l1, l2, m, t) 99 | // s1 和 s2 的相同前缀长度 100 | prefixLength := 0 101 | for i := 0; i < min(len(s1), len(s2)); i++ { 102 | if s1[i] != s2[i] { 103 | break 104 | } 105 | if prefixLength <= 4 { 106 | prefixLength++ 107 | } else { 108 | break 109 | } 110 | } 111 | // 影响因子 p 取值范围[0.1,0.25],默认值为0.1 112 | p := 0.1 113 | 114 | simj := 1.0 / 3.0 * (m2/float64(l1) + m2/float64(l2) + (m2-float64(t)/2.0)/m2) 115 | 116 | return simj + float64(prefixLength)*p*(1.0-simj) 117 | } 118 | -------------------------------------------------------------------------------- /similarity/simhash.go: -------------------------------------------------------------------------------- 
// merge sums the rows of ins column by column and returns the totals.
//
// The result is as long as the longest row; shorter rows contribute nothing to
// the columns they lack. An empty ins yields an empty result.
func merge(ins [][]int) []int {
	width := 0
	for _, row := range ins {
		if len(row) > width {
			width = len(row)
		}
	}

	res := make([]int, width)
	for _, row := range ins {
		for col, v := range row {
			res[col] += v
		}
	}
	return res
}
// StringToBytes returns a byte slice that views the bytes of s without copying
// them.
//
// The slice header of the result is filled in by hand from the string header
// of s: same data pointer, with both length and capacity set to len(s). The
// returned slice therefore aliases the memory of s. Go strings are immutable,
// so callers must treat the result as read-only.
func StringToBytes(s string) (b []byte) {
	// bh points at the header of the named result b, so the writes below
	// change b itself.
	bh := (*reflect.SliceHeader)(unsafe.Pointer(&b))
	// sh is a copy of the header of s (data pointer and length).
	sh := *(*reflect.StringHeader)(unsafe.Pointer(&s))
	bh.Data = sh.Data
	bh.Len = sh.Len
	// Capacity equals length, so an append to the result has to reallocate
	// rather than write past the end of the string data.
	bh.Cap = sh.Len
	return b
}
// StrToStrs4 splits the first lenth bytes of s into consecutive chunks of four
// bytes each.
//
// When lenth is not a multiple of four, the last chunk holds the remaining one
// to three bytes. A lenth larger than len(s) is treated as len(s), and a
// negative lenth as zero.
func StrToStrs4(s string, lenth int) []string {
	limit := lenth
	if limit > len(s) {
		limit = len(s)
	}
	if limit < 0 {
		limit = 0
	}

	base := make([]string, 0, (limit+3)/4)
	for i := 0; i < limit; i += 4 {
		end := i + 4
		if end > limit {
			end = limit
		}
		base = append(base, s[i:end])
	}
	return base
}

// Add applies weight to the bit vector bits in place and returns it.
//
// Among the first 32 entries, an entry equal to 1 becomes weight and any other
// value becomes -weight. When bits has fewer than 32 entries it is extended to
// 32, and every added entry is set to weight. Entries past index 31 are left
// untouched.
func Add(bits []int, weight int) []int {
	lens := len(bits)
	for i := 0; i < 32; i++ {
		switch {
		case i >= lens:
			bits = append(bits, weight)
		case bits[i] == 1:
			bits[i] = weight
		default:
			bits[i] = -weight
		}
	}
	return bits
}

// Int32StrToInts converts a string of binary digits into a slice of 32 ints.
//
// Character i of ins sets element i of the result: '1' gives 1, anything else
// gives 0. Elements past the end of ins stay 0, and characters past the 32nd
// are ignored.
func Int32StrToInts(ins string) []int {
	uints := make([]int, 32)

	for i := 0; i < len(ins) && i < len(uints); i++ {
		if ins[i] == '1' {
			uints[i] = 1
		}
	}
	return uints
}

// IntsToStr concatenates the decimal representation of every element of ins
// into one string, without separators.
func IntsToStr(ins []int) string {
	// One byte per element is exact for the single-digit values this is
	// used with; append grows the buffer for anything longer.
	buf := make([]byte, 0, len(ins))
	for _, v := range ins {
		buf = strconv.AppendInt(buf, int64(v), 10)
	}
	return string(buf)
}
12 | 13 | return compare(s1, s2, &o) 14 | } 15 | 16 | // 返回相似度最高的那个字符串 17 | func FindBestMatchOne(s string, targets []string, opts ...Option) *similarity.Match { 18 | r := findBestMatch(s, targets, opts...) 19 | return r.Match 20 | } 21 | 22 | // 返回相似度最高的那个字符串, 以及索引位置 23 | func FindBestMatch(s string, targets []string, opts ...Option) *similarity.MatchResult { 24 | return findBestMatch(s, targets, opts...) 25 | } 26 | -------------------------------------------------------------------------------- /strsim_priv.go: -------------------------------------------------------------------------------- 1 | package strsim 2 | 3 | import "github.com/antlabs/strsim/similarity" 4 | 5 | // 比较两个字符串内部函数 6 | func compare(s1, s2 string, o *option) float64 { 7 | if s, e := modifyStrAndCheck(o, &s1, &s2); e { 8 | return s 9 | } 10 | 11 | return o.cmp(s1, s2) 12 | } 13 | 14 | // 前处理主要涉及,修改字符串,和边界判断 15 | func modifyStrAndCheck(o *option, s1, s2 *string) (score float64, exit bool) { 16 | modifyString(o, s1) 17 | modifyString(o, s2) 18 | modifyStrToBase64Str(o, s1) 19 | modifyStrToBase64Str(o, s2) 20 | 21 | return check(*s1, *s2) 22 | } 23 | 24 | // 记录每个targets子串的相似度打分,并且返回相似度最高的那个字符串, 内部函数 25 | func findBestMatch(s string, targets []string, opts ...Option) *similarity.MatchResult { 26 | 27 | var opt option 28 | opt.fillOption(opts...) 
29 | 30 | match := make([]*similarity.Match, 0, len(targets)) 31 | bestIndex := 0 32 | for k, s2 := range targets { 33 | 34 | score := compare(s, s2, &opt) 35 | 36 | //fmt.Printf("score:%f(%s)(%s)\n", score, s, s2) 37 | match = append(match, &similarity.Match{S: s2, Score: score}) 38 | 39 | if k == 0 { 40 | continue 41 | } 42 | 43 | if score > match[bestIndex].Score { 44 | bestIndex = k 45 | } 46 | } 47 | 48 | return &similarity.MatchResult{AllResult: match, Match: match[bestIndex], BestIndex: bestIndex} 49 | } 50 | -------------------------------------------------------------------------------- /strsim_test.go: -------------------------------------------------------------------------------- 1 | package strsim 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func Test_Compare_Special(t *testing.T) { 8 | 9 | for _, v := range []testCase{ 10 | {arg1: "", arg2: "", sim: 1}, 11 | {arg1: "1", arg2: "", sim: 0}, 12 | {arg1: "", arg2: "1", sim: 0}, 13 | } { 14 | for _, o := range []Option{ 15 | Default(), 16 | Jaro(), 17 | DiceCoefficient(1), 18 | Hamming(), 19 | Simhash(), 20 | Cosine(), 21 | JaroWinkler(), 22 | } { 23 | sim := Compare(v.arg1, v.arg2, o) 24 | if sim != v.sim { 25 | t.Fatalf("Compare(%q, %q) with option: got %v, want %v", v.arg1, v.arg2, sim, v.sim) 26 | } 27 | } 28 | } 29 | } 30 | 31 | type bestTest struct { 32 | best []string 33 | key string 34 | need string 35 | } 36 | 37 | func Test_FindBestMatchOne(t *testing.T) { 38 | for _, d := range []bestTest{ 39 | {best: []string{"朝辞白帝彩云间", "千里江陵一日还", "两岸猿声啼不住", "轻舟已过万重山"}, key: "千里还", need: "千里江陵一日还"}, 40 | } { 41 | for _, o := range []Option{ 42 | DiceCoefficient(1), 43 | Jaro(), 44 | Default(), 45 | Simhash(), 46 | Cosine(), 47 | JaroWinkler(), 48 | } { 49 | m := FindBestMatchOne(d.key, d.best, o) 50 | if m.S != d.need { 51 | t.Fatalf("FindBestMatchOne(%q, %v) with option: got %q, want %q", d.key, d.best, m.S, d.need) 52 | } 53 | } 54 | } 55 | } 56 | 57 | func Test_FindBestMatch(t *testing.T) { 58 | for _, d := 
range []bestTest{ 59 | {best: []string{"朝辞白帝彩云间", "千里江陵一日还", "两岸猿声啼不住", "轻舟已过万重山"}, key: "千里还", need: "千里江陵一日还"}, 60 | } { 61 | for _, o := range []Option{ 62 | DiceCoefficient(1), 63 | Jaro(), 64 | Default(), 65 | Simhash(), 66 | Cosine(), 67 | JaroWinkler(), 68 | } { 69 | m := FindBestMatch(d.key, d.best, o) 70 | if m.Match.S != d.need { 71 | t.Fatalf("FindBestMatch(%q, %v) with option: got %q, want %q", d.key, d.best, m.Match.S, d.need) 72 | } 73 | } 74 | } 75 | } 76 | --------------------------------------------------------------------------------