├── .gitignore ├── README.md ├── tokenisers_test.go ├── simhash_test.go ├── utils.go ├── simhash.go └── tokenisers.go /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | simhash 2 | ======= 3 | 4 | A library to find the percentage of similarity between two given strings (can be expanded to compare every thing!). 5 | 6 | This is a port of C# version SimHash created by ArefKarimi, you can find it here (http://simhash.codeplex.com/). 7 | 8 | Usage 9 | ----- 10 | needle := "Reading bytes into structs using reflection" 11 | hayStack := "Golang - mapping an variable length array to a struct" 12 | 13 | likeness := GetLikenessValue(needle, hayStack) 14 | fmt.Println("Likeness:", likeness) 15 | -------------------------------------------------------------------------------- /tokenisers_test.go: -------------------------------------------------------------------------------- 1 | package simhash 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestFixedSizeStringTokeniser(t *testing.T) { 9 | tokeniser := NewFixedSizeStringTokeniser(5) 10 | 11 | tokens := tokeniser.Tokenise("hello world this is my way to hell") 12 | for _, tk := range tokens { 13 | fmt.Println(tk) 14 | } 15 | } 16 | 17 | func TestOverlappingStringTokeniser(t *testing.T) { 18 | tokeniser := NewOverlappingStringTokeniser(4, 3) 19 | 20 | tokens := tokeniser.Tokenise("hello world this is my way to hell") 21 | for _, tk := range tokens { 22 | fmt.Println(tk) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /simhash_test.go: -------------------------------------------------------------------------------- 1 | package simhash 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestGetLikenessValue(t *testing.T) { 9 | needle := "Reading bytes into structs using reflection" 10 | hayStack := "Golang - mapping an variable length array to a struct" 11 | 12 | likeness := GetLikenessValue(needle, hayStack) 13 | fmt.Println("Likeness:", likeness) 14 | } 15 | 16 | func BenchmarkGetLikenessValue(b *testing.B) { 17 | needle := "Reading bytes into structs using reflection" 18 | hayStack := "Golang - mapping an variable length array to a struct" 19 | 20 | for i := 0; i < b.N; i++ { 21 | GetLikenessValue(needle, hayStack) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /utils.go: -------------------------------------------------------------------------------- 1 | package simhash 2 | 3 | import ( 4 | "hash/crc32" 5 | "io" 6 | ) 7 | 8 | func getHashTokens(tokens []string) []uint32 { 9 | hashedTokens := make([]uint32, len(tokens)) 10 | ieee := crc32.NewIEEE() 11 | for i, token := range tokens { 12 | ieee.Reset() 13 | io.WriteString(ieee, token) 14 | hashedTokens[i] = ieee.Sum32() 15 | } 16 | return hashedTokens 17 | } 18 | 19 | func isBitSet(b, pos uint32) bool { 20 | return (b & (1 << pos)) != 0 21 | } 22 | 23 | func getHammingDistance(firstValue, secondValue int) int { 24 | hammingBits := firstValue ^ secondValue 25 | hammingValue := 0 26 | for i := 0; i < 32; i++ { 27 | if isBitSet(uint32(hammingBits), uint32(i)) { 28 | hammingValue += 1 29 | } 30 | } 31 | return hammingValue 32 | } 33 | -------------------------------------------------------------------------------- /simhash.go: -------------------------------------------------------------------------------- 1 | package simhash 2 | 3 | const ( 4 | HashSize = 32 5 | ) 6 | 7 | func calculateSimHash(input string) int { 8 | tokeniser := NewOverlappingStringTokeniser(4, 3) 9 | hashedTokens := getHashTokens(tokeniser.Tokenise(input)) 10 | vector := make([]int, HashSize) 11 | for i, _ := range vector { 12 | vector[i] = 0 13 | } 14 | 15 | for _, v := range hashedTokens { 16 | for i, _ := range vector { 17 | if isBitSet(uint32(v), uint32(i)) { 18 | vector[i] += 1 19 | } else { 20 | vector[i] -= 1 21 | } 22 | } 23 | } 24 | 25 | fingerprint := 0 26 | for i, v := range vector { 27 | if v > 0 { 28 | fingerprint += 1 << uint32(i) 29 | } 30 | } 31 | 32 | return fingerprint 33 | } 34 | 35 | func GetLikenessValue(needle, haystack string) float64 { 36 | needleSimHash := calculateSimHash(needle) 37 | hayStackSimHash := calculateSimHash(haystack) 38 | return float64(HashSize-getHammingDistance(needleSimHash, hayStackSimHash)) / float64(HashSize) 39 | } 40 | -------------------------------------------------------------------------------- /tokenisers.go: -------------------------------------------------------------------------------- 1 | package simhash 2 | 3 | type Tokeniser interface { 4 | Tokenise(input string) []string 5 | } 6 | 7 | type FixedSizeStringTokeniser struct { 8 | tokensize uint8 9 | } 10 | 11 | func NewFixedSizeStringTokeniser(tokensize uint8) *FixedSizeStringTokeniser { 12 | if tokensize < 2 || tokensize > 127 { 13 | panic("Token size must be between 2 and 127") 14 | } 15 | 16 | return &FixedSizeStringTokeniser{tokensize: tokensize} 17 | } 18 | 19 | func (t *FixedSizeStringTokeniser) Tokenise(input string) []string { 20 | var chunks []string 21 | offset := 0 22 | inputLen := len(input) 23 | 24 | for offset < inputLen { 25 | offset2 := offset + int(t.tokensize) 26 | if offset2 >= inputLen { 27 | chunks = append(chunks, input[offset:]) 28 | } else { 29 | chunks = append(chunks, input[offset:offset2]) 30 | } 31 | offset += int(t.tokensize) 32 | } 33 | 34 | return chunks 35 | } 36 | 37 | type OverlappingStringTokeniser struct { 38 | chunkSize, overlapSize uint8 39 | } 40 | 41 | func NewOverlappingStringTokeniser(chunkSize, overlapSize uint8) *OverlappingStringTokeniser { 42 | if chunkSize <= overlapSize { 43 | panic("Chunk size must be greater than overlap size.") 44 | } 45 | 46 | return &OverlappingStringTokeniser{chunkSize: chunkSize, overlapSize: overlapSize} 47 | } 48 | 49 | func (t *OverlappingStringTokeniser) Tokenise(input string) []string { 50 | var chunks []string 51 | inputLen := len(input) 52 | for position := 0; position < inputLen-int(t.chunkSize); position += int(t.chunkSize - t.overlapSize) { 53 | chunks = append(chunks, input[position:position+int(t.chunkSize)]) 54 | } 55 | return chunks 56 | } 57 | --------------------------------------------------------------------------------