├── .gitignore ├── LICENSE ├── README.md ├── goJieba.go ├── similarity.go ├── similarity_test.go └── utils.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | *.exe 3 | *.exe~ 4 | *.dll 5 | *.so 6 | *.dylib 7 | 8 | # Test binary, build with `go test -c` 9 | *.test 10 | 11 | # Output of the go coverage tool, specifically when used with LiteIDE 12 | *.out 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 netkiddy 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Project 2 | A text similarity by simhash 3 | 4 | # About text similarity 5 | https://cloud.tencent.com/developer/article/1389446 6 | 7 | # How to test 8 | $ go test -v -test.run TestSimHashSimilar 9 | 10 | === RUN TestSimHashSimilar 11 | 12 | srcWordsWeight: [{区块链 58.69602153541771} {货币 42.49228769} {分布式 31.1229513822} {比特 30.7892744766} {密码学 26.4150609428} {数字 25.9012790598} {虚拟 25.18603834812} {数据结构 21.285162228} {链式 21.066763644} {利用 20.01581093792} {方式 19.20596931748} {数据 19.12542671356} {顺序 15.15263737918} {来讲 14.73377762808} {基础架构 12.8020653633} {一种 12.394908587969999} {2009 11.739204307083542} {中本聪 11.739204307083542} {最早 11.7133864415} {区块 11.5027823792} {保证 11.32904398058} {不可 10.95866957244} {数据传输 10.604840786} {账本 10.1871055853} {组合成 10.0720362555} {以太 10.0505300503} {篡改 9.96885201925} {莱特 9.87532596124} {编程 9.84023464143} {发明者 9.69598503258}] 13 | 14 | dstWordsWeight: [{区块链 58.69602153541771} {篡改 29.906556057750002} {数据 19.12542671356} {技术 18.87782871428} {节点 18.29417492174} {信息 15.76158207831} {金融 15.752110047990001} {缺陷 15.05813160948} {交易 14.735571450600002} {互联网 14.24378550858} {信任 14.15260796386} {痛点 12.8020653633} {中心化 12.8020653633} {有三大 11.739204307083542} {区块 11.5027823792} {假冒伪劣 11.2616203224} {不可 10.95866957244} {解决目前 10.9049453784} {领域 10.82459108482} {大有裨益 10.604840786} {分布式 10.3743171274} {银团 10.3171587135} {工业 10.1355963834} {记账 10.1164880181} {资产 10.08319666818} {内置 9.96885201925} {数据链 9.89334446674} {生命周期 9.72629038208} {金融业务 9.62401153296} {讲课 9.61021821083}] 15 | 16 | srcWords:[{区块链 58.69602153541771} {货币 42.49228769} {分布式 31.1229513822} {比特 30.7892744766} {密码学 26.4150609428} {数字 25.9012790598} {虚拟 25.18603834812} {数据结构 21.285162228} {链式 21.066763644} {利用 20.01581093792} {方式 19.20596931748} {数据 19.12542671356} {顺序 15.15263737918} {来讲 14.73377762808} 
{基础架构 12.8020653633} {一种 12.394908587969999} {2009 11.739204307083542} {中本聪 11.739204307083542} {最早 11.7133864415} {区块 11.5027823792} {保证 11.32904398058} {不可 10.95866957244} {数据传输 10.604840786} {账本 10.1871055853} {组合成 10.0720362555} {以太 10.0505300503} {篡改 9.96885201925} {莱特 9.87532596124} {编程 9.84023464143} {发明者 9.69598503258}] 17 | 18 | dstWords:[{区块链 58.69602153541771} {篡改 29.906556057750002} {数据 19.12542671356} {技术 18.87782871428} {节点 18.29417492174} {信息 15.76158207831} {金融 15.752110047990001} {缺陷 15.05813160948} {交易 14.735571450600002} {互联网 14.24378550858} {信任 14.15260796386} {痛点 12.8020653633} {中心化 12.8020653633} {有三大 11.739204307083542} {区块 11.5027823792} {假冒伪劣 11.2616203224} {不可 10.95866957244} {解决目前 10.9049453784} {领域 10.82459108482} {大有裨益 10.604840786} {分布式 10.3743171274} {银团 10.3171587135} {工业 10.1355963834} {记账 10.1164880181} {资产 10.08319666818} {内置 9.96885201925} {数据链 9.89334446674} {生命周期 9.72629038208} {金融业务 9.62401153296} {讲课 9.61021821083}] 19 | 20 | srcFingerPrint: [1 0 1 1 0 1 1 0 0 1 1 1 1 0 1 0 1 0 0 1 0 1 1 0 0 0 0 0 0 1 0 1] 21 | 22 | dstFingerPrint: [1 0 1 1 0 1 0 1 0 1 1 0 1 0 0 0 1 1 0 1 1 1 1 0 0 0 1 1 0 1 0 1] 23 | 24 | --- PASS: TestSimHashSimilar (0.85s) 25 | 26 | similarity_test.go:56: SimHashSimilar distance: 8 27 | 28 | PASS 29 | -------------------------------------------------------------------------------- /goJieba.go: -------------------------------------------------------------------------------- 1 | package simhash 2 | 3 | import ( 4 | "github.com/yanyiwu/gojieba" 5 | "sync" 6 | ) 7 | 8 | type GoJieba struct { 9 | C *gojieba.Jieba 10 | } 11 | 12 | var GJB *GoJieba 13 | var one sync.Once 14 | 15 | func NewGoJieba() (*GoJieba) { 16 | one.Do(func() { 17 | GJB = &GoJieba{ 18 | C: gojieba.NewJieba(), 19 | //equals with x := NewJieba(DICT_PATH, HMM_PATH, USER_DICT_PATH) 20 | } 21 | }) 22 | return GJB 23 | } 24 | 25 | func (this *GoJieba) Close() { 26 | this.C.Free() 27 | } 28 | 29 | func (this *GoJieba) AddWords(words []string) { 30 | 
for _, word := range words { 31 | this.C.AddWord(word) 32 | } 33 | } 34 | 35 | func (this *GoJieba) JiebaCut(rawStr string, useHmm bool, cutAll bool) (words []string) { 36 | if cutAll { 37 | words = jiebaCutAll(this.C, &rawStr) 38 | } else { 39 | words = jiebaCut(this.C, &rawStr, useHmm) 40 | } 41 | 42 | return 43 | } 44 | 45 | func (this *GoJieba) JiebaCutWithFrequency(rawStr string, useHmm bool, cutAll bool) (wordsFreqs map[string]int) { 46 | wordsFreqs = make(map[string]int) 47 | if cutAll { 48 | words := jiebaCutAll(this.C, &rawStr) 49 | for _, word := range words { 50 | freq := wordsFreqs[word] 51 | wordsFreqs[word] = freq + 1 52 | } 53 | } else { 54 | words := jiebaCut(this.C, &rawStr, useHmm) 55 | for _, word := range words { 56 | freq := wordsFreqs[word] 57 | wordsFreqs[word] = freq + 1 58 | } 59 | } 60 | 61 | return 62 | } 63 | 64 | func (this *GoJieba) JiebaCutForSearch(rawStr string, useHmm bool) { 65 | jiebaCut4Search(this.C, &rawStr, useHmm) 66 | 67 | } 68 | 69 | func jiebaCutAll(x *gojieba.Jieba, rawStr *string) (words []string) { 70 | words = x.CutAll(*rawStr) 71 | return 72 | } 73 | 74 | func jiebaCut(x *gojieba.Jieba, rawStr *string, useHmm bool) (words []string) { 75 | words = x.Cut(*rawStr, useHmm) 76 | return 77 | } 78 | 79 | func jiebaCut4Search(x *gojieba.Jieba, rawStr *string, useHmm bool) (words []string) { 80 | words = x.CutForSearch(*rawStr, useHmm) 81 | return 82 | } -------------------------------------------------------------------------------- /similarity.go: -------------------------------------------------------------------------------- 1 | package simhash 2 | 3 | import ( 4 | "hash/fnv" 5 | "fmt" 6 | "strings" 7 | ) 8 | 9 | const ( 10 | SIMILAR_DISTANCE = 3 11 | ) 12 | 13 | type WordWeight struct { 14 | Word string 15 | Weight float64 16 | } 17 | 18 | func SimHashSimilar(srcWordWeighs, dstWordWeights []WordWeight) (distance int, err error) { 19 | 20 | srcFingerPrint, err := simhashFingerPrint(srcWordWeighs) 21 | if err != nil { 22 | 
const (
	// SIMILAR_DISTANCE is not referenced by the code in this file.
	// NOTE(review): presumably the Hamming distance at or below which two
	// texts count as similar; confirm with callers.
	SIMILAR_DISTANCE = 3
)

// fingerprintBits is the width of the FNV-32a hash and therefore the length
// of every fingerprint built in this file.
const fingerprintBits = 32

// WordWeight pairs a word with the weight it contributes to a fingerprint.
type WordWeight struct {
	Word   string
	Weight float64
}

// SimHashSimilar builds a SimHash fingerprint for each weighted word list and
// returns the Hamming distance between the two fingerprints.
//
// Both fingerprints are printed to standard output, as in earlier versions.
// On error the returned distance is 0.
func SimHashSimilar(srcWordWeighs, dstWordWeights []WordWeight) (distance int, err error) {
	srcFingerPrint, err := simhashFingerPrint(srcWordWeighs)
	if err != nil {
		return 0, err
	}
	fmt.Println("srcFingerPrint: ", srcFingerPrint)

	dstFingerPrint, err := simhashFingerPrint(dstWordWeights)
	if err != nil {
		return 0, err
	}
	fmt.Println("dstFingerPrint: ", dstFingerPrint)

	return hammingDistance(srcFingerPrint, dstFingerPrint), nil
}

// simhashFingerPrint folds the weighted hash bits of every word into one
// accumulator per bit position and returns "1" for positions whose sum is
// positive and "0" otherwise. An empty input yields all "0".
func simhashFingerPrint(wordWeights []WordWeight) (fingerPrint []string, err error) {
	binaryWeights := make([]float64, fingerprintBits)
	for _, ww := range wordWeights {
		// weights holds +Weight or -Weight for each bit of the word's hash.
		weights := calcWithWeight(strHashBitCode(ww.Word), ww.Weight)
		binaryWeights, err = sliceInnerPlus(binaryWeights, weights)
		if err != nil {
			return nil, err
		}
	}

	fingerPrint = make([]string, 0, len(binaryWeights))
	for _, b := range binaryWeights {
		if b > 0 {
			fingerPrint = append(fingerPrint, "1")
		} else {
			fingerPrint = append(fingerPrint, "0")
		}
	}
	return fingerPrint, nil
}

// strHashBitCode returns the FNV-32a hash of str as a zero-padded string of
// 32 binary digits, most significant bit first.
func strHashBitCode(str string) string {
	h := fnv.New32a()
	h.Write([]byte(str)) // hash.Hash documents that Write never returns an error
	return fmt.Sprintf("%032b", h.Sum32())
}

// calcWithWeight maps each character of bitHash to -weight when it is "0"
// and to +weight otherwise, preserving order.
func calcWithWeight(bitHash string, weight float64) []float64 {
	bits := strings.Split(bitHash, "")
	binarys := make([]float64, 0, len(bits))
	for _, bit := range bits {
		if bit == "0" {
			binarys = append(binarys, -weight)
		} else {
			binarys = append(binarys, weight)
		}
	}
	return binarys
}

// sliceInnerPlus returns the element-wise sum of arr1 and arr2. It reports an
// error, and a nil result, when either slice is nil or the lengths differ.
func sliceInnerPlus(arr1, arr2 []float64) (dstArr []float64, err error) {
	if arr1 == nil || arr2 == nil {
		return nil, fmt.Errorf("sliceInnerPlus array nil")
	}
	if len(arr1) != len(arr2) {
		return nil, fmt.Errorf("sliceInnerPlus array Length NOT match, %v != %v", len(arr1), len(arr2))
	}

	dstArr = make([]float64, len(arr1))
	for i, v1 := range arr1 {
		dstArr[i] = v1 + arr2[i]
	}
	return dstArr, nil
}

// hammingDistance counts the positions at which arr1 and arr2 differ.
// Positions present in only one slice count as differing, so inputs of
// unequal length no longer panic or lose the surplus elements.
func hammingDistance(arr1, arr2 []string) int {
	shorter, longer := arr1, arr2
	if len(shorter) > len(longer) {
		shorter, longer = longer, shorter
	}

	count := len(longer) - len(shorter)
	for i, v := range shorter {
		if v != longer[i] {
			count++
		}
	}
	return count
}
// Patterns used by RemoveHtml, compiled once at package initialisation.
var (
	// htmlTagRe matches anything from "<" up to the nearest following ">".
	htmlTagRe = regexp.MustCompile(`<[\S\s]+?>`)
	// htmlStyleRe matches a lower-case <style ...>...</style> element.
	htmlStyleRe = regexp.MustCompile(`<style[\S\s]+?</style>`)
	// htmlScriptRe matches a lower-case <script ...>...</script> element.
	htmlScriptRe = regexp.MustCompile(`<script[\S\s]+?</script>`)
	// htmlSpaceRe matches a run of two or more whitespace characters.
	htmlSpaceRe = regexp.MustCompile(`\s{2,}`)
)

// RemoveHtml strips HTML markup from src and returns the remaining text.
//
// Earlier versions used raw-string patterns with doubled backslashes, which
// matched literal backslashes rather than tags: no markup was removed and
// every backslash in the input was deleted. The patterns now match what the
// step comments describe.
//
// The matching is purely textual, so any "<" followed later by ">" is treated
// as a tag even in non-HTML text.
func RemoveHtml(src string) string {
	// Lower-case every tag so the style/script patterns match any casing.
	src = htmlTagRe.ReplaceAllStringFunc(src, strings.ToLower)

	// Drop <style> elements together with their content.
	src = htmlStyleRe.ReplaceAllString(src, "")

	// Drop <script> elements together with their content.
	src = htmlScriptRe.ReplaceAllString(src, "")

	// Replace every remaining tag with a newline.
	src = htmlTagRe.ReplaceAllString(src, "\n")

	// Collapse runs of whitespace, including consecutive newlines, into one newline.
	src = htmlSpaceRe.ReplaceAllString(src, "\n")

	return src
}