├── COPYING.txt ├── README.md ├── damerau_levenshtein.go ├── damerau_levenshtein_test.go ├── double_metaphone_corpus.txt.gz ├── go.mod ├── hamming.go ├── hamming_test.go ├── jarowinkler.go ├── jarowinkler_test.go ├── levenshtein.go ├── levenshtein_test.go ├── longestcommonsubsequence.go ├── longestcommonsubsequence_test.go ├── metaphone.go ├── metaphone_test.go ├── nysiis.go ├── nysiis_test.go ├── osa.go ├── osa_test.go ├── phonex.go ├── phonex_test.go ├── runestring.go ├── smithwaterman.go ├── smithwaterman_test.go ├── soundex.go ├── soundex_test.go ├── utf8.go └── util.go /COPYING.txt: -------------------------------------------------------------------------------- 1 | Matchr: an approximate string matching library for the Go programming language 2 | 3 | Copyright (C) 2013-2014 Ant Zucaro 4 | 5 | This program is free software; you can redistribute it and/or modify 6 | it under the terms of the GNU General Public License as published by 7 | the Free Software Foundation; either version 2 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU General Public License for more details. 14 | 15 | You should have received a copy of the GNU General Public License 16 | along with this program; if not, write to the Free Software 17 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 18 | 19 | You can contact Ant Zucaro at azucaro at gmail dot com. 
20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # matchr 2 | 3 | [![Go Reference](https://pkg.go.dev/badge/github.com/antzucaro/matchr.svg)](https://pkg.go.dev/github.com/antzucaro/matchr) 4 | 5 | An approximate string matching library for the [Go programming language](http://www.golang.org). 6 | 7 | ## Rationale 8 | 9 | Data used in record linkage can often be of dubious quality. Typographical 10 | errors or changing data elements (to name a few things) make establishing similarity between two sets of data 11 | difficult. Rather than use exact string comparison in such situations, it is 12 | vital to have a means to identify how similar two strings are. Similarity functions can cater 13 | to certain data sets in order to make better matching decisions. The matchr library provides 14 | several of these similarity functions. 15 | -------------------------------------------------------------------------------- /damerau_levenshtein.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | // DamerauLevenshtein computes the Damerau-Levenshtein distance between two 4 | // strings. The returned value - distance - is the number of insertions, 5 | // deletions, substitutions, and transpositions it takes to transform one 6 | // string (s1) into another (s2). Each step in the transformation "costs" 7 | // one distance point. It is similar to the Optimal String Alignment 8 | // algorithm, but is more complex because it allows multiple edits on 9 | // substrings. 10 | // 11 | // This implementation is based off of the one found on Wikipedia at 12 | // http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions 13 | // as well as KevinStern's Java implementation found at 14 | // https://github.com/KevinStern/software-and-algorithms. 
15 | func DamerauLevenshtein(s1 string, s2 string) (distance int) { 16 | // index by code point, not byte 17 | r1 := []rune(s1) 18 | r2 := []rune(s2) 19 | 20 | // the maximum possible distance 21 | inf := len(r1) + len(r2) 22 | 23 | // if one string is blank, we needs insertions 24 | // for all characters in the other one 25 | if len(r1) == 0 { 26 | return len(r2) 27 | } 28 | 29 | if len(r2) == 0 { 30 | return len(r1) 31 | } 32 | 33 | // construct the edit-tracking matrix 34 | matrix := make([][]int, len(r1)) 35 | for i := range matrix { 36 | matrix[i] = make([]int, len(r2)) 37 | } 38 | 39 | // seen characters 40 | seenRunes := make(map[rune]int) 41 | 42 | if r1[0] != r2[0] { 43 | matrix[0][0] = 1 44 | } 45 | 46 | seenRunes[r1[0]] = 0 47 | for i := 1; i < len(r1); i++ { 48 | deleteDist := matrix[i-1][0] + 1 49 | insertDist := (i+1)*1 + 1 50 | var matchDist int 51 | if r1[i] == r2[0] { 52 | matchDist = i 53 | } else { 54 | matchDist = i + 1 55 | } 56 | matrix[i][0] = min(min(deleteDist, insertDist), matchDist) 57 | } 58 | 59 | for j := 1; j < len(r2); j++ { 60 | deleteDist := (j + 1) * 2 61 | insertDist := matrix[0][j-1] + 1 62 | var matchDist int 63 | if r1[0] == r2[j] { 64 | matchDist = j 65 | } else { 66 | matchDist = j + 1 67 | } 68 | 69 | matrix[0][j] = min(min(deleteDist, insertDist), matchDist) 70 | } 71 | 72 | for i := 1; i < len(r1); i++ { 73 | var maxSrcMatchIndex int 74 | if r1[i] == r2[0] { 75 | maxSrcMatchIndex = 0 76 | } else { 77 | maxSrcMatchIndex = -1 78 | } 79 | 80 | for j := 1; j < len(r2); j++ { 81 | swapIndex, ok := seenRunes[r2[j]] 82 | jSwap := maxSrcMatchIndex 83 | deleteDist := matrix[i-1][j] + 1 84 | insertDist := matrix[i][j-1] + 1 85 | matchDist := matrix[i-1][j-1] 86 | if r1[i] != r2[j] { 87 | matchDist += 1 88 | } else { 89 | maxSrcMatchIndex = j 90 | } 91 | 92 | // for transpositions 93 | var swapDist int 94 | if ok && jSwap != -1 { 95 | iSwap := swapIndex 96 | var preSwapCost int 97 | if iSwap == 0 && jSwap == 0 { 98 | preSwapCost = 0 
99 | } else { 100 | preSwapCost = matrix[maxI(0, iSwap-1)][maxI(0, jSwap-1)] 101 | } 102 | swapDist = i + j + preSwapCost - iSwap - jSwap - 1 103 | } else { 104 | swapDist = inf 105 | } 106 | matrix[i][j] = min(min(min(deleteDist, insertDist), matchDist), swapDist) 107 | } 108 | seenRunes[r1[i]] = i 109 | } 110 | 111 | return matrix[len(r1)-1][len(r2)-1] 112 | } 113 | -------------------------------------------------------------------------------- /damerau_levenshtein_test.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | import "testing" 4 | 5 | var damlevtests = []struct { 6 | s1 string 7 | s2 string 8 | dist int 9 | }{ 10 | // insertion 11 | {"car", "cars", 1}, 12 | // substitution 13 | {"library", "librari", 1}, 14 | // deletion 15 | {"library", "librar", 1}, 16 | // transposition 17 | {"library", "librayr", 1}, 18 | // one empty, left 19 | {"", "library", 7}, 20 | // one empty, right 21 | {"library", "", 7}, 22 | // two empties 23 | {"", "", 0}, 24 | // unicode stuff! 25 | {"Schüßler", "Schübler", 1}, 26 | {"Schüßler", "Schußler", 1}, 27 | {"Schüßler", "Schüßler", 0}, 28 | {"Schßüler", "Schüßler", 1}, 29 | {"Schüßler", "Schüler", 1}, 30 | {"Schüßler", "Schüßlers", 1}, 31 | // difference between DL and OSA. This is DL, so it should be 2. 
32 | {"ca", "abc", 2}, 33 | } 34 | 35 | // Damerau-Levenshtein 36 | func TestDamerauLevenshtein(t *testing.T) { 37 | for _, tt := range damlevtests { 38 | dist := DamerauLevenshtein(tt.s1, tt.s2) 39 | if dist != tt.dist { 40 | t.Errorf("DamerauLevenshtein('%s', '%s') = %v, want %v", tt.s1, tt.s2, dist, tt.dist) 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /double_metaphone_corpus.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/antzucaro/matchr/7bed6ef61ef9d9753ace1aded16e9763fa4f7142/double_metaphone_corpus.txt.gz -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/antzucaro/matchr 2 | 3 | go 1.13 4 | -------------------------------------------------------------------------------- /hamming.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | import "errors" 4 | 5 | // Hamming computes the Hamming distance between two equal-length strings. 6 | // This is the number of times the two strings differ between characters at 7 | // the same index. This implementation is based off of the algorithm 8 | // description found at http://en.wikipedia.org/wiki/Hamming_distance. 
func Hamming(s1 string, s2 string) (distance int, err error) {
	// Compare code points rather than bytes so that multi-byte characters
	// occupy a single position.
	left, right := []rune(s1), []rune(s2)

	// The distance is only defined for inputs of equal length.
	if len(left) != len(right) {
		return 0, errors.New("Hamming distance of different sized strings.")
	}

	// Count the positions at which the two inputs disagree.
	for pos := range left {
		if left[pos] != right[pos] {
			distance++
		}
	}
	return distance, nil
}
| } 23 | 24 | searchRange := minLength 25 | searchRange = (searchRange / 2) - 1 26 | if searchRange < 0 { 27 | searchRange = 0 28 | } 29 | var lowLim, hiLim, transCount, commonChars int 30 | var i, j, k int 31 | 32 | r1Flag := make([]bool, r1Length+1) 33 | r2Flag := make([]bool, r2Length+1) 34 | 35 | // find the common chars within the acceptable range 36 | commonChars = 0 37 | for i, _ = range r1 { 38 | if i >= searchRange { 39 | lowLim = i - searchRange 40 | } else { 41 | lowLim = 0 42 | } 43 | 44 | if (i + searchRange) <= (r2Length - 1) { 45 | hiLim = i + searchRange 46 | } else { 47 | hiLim = r2Length - 1 48 | } 49 | 50 | for j := lowLim; j <= hiLim; j++ { 51 | if !r2Flag[j] && r2[j] == r1[i] { 52 | r2Flag[j] = true 53 | r1Flag[i] = true 54 | commonChars++ 55 | 56 | break 57 | } 58 | } 59 | } 60 | 61 | // if we have nothing in common at this point, nothing else can be done 62 | if commonChars == 0 { 63 | return 64 | } 65 | 66 | // otherwise we count the transpositions 67 | k = 0 68 | transCount = 0 69 | for i, _ := range r1 { 70 | if r1Flag[i] { 71 | for j = k; j < r2Length; j++ { 72 | if r2Flag[j] { 73 | k = j + 1 74 | break 75 | } 76 | } 77 | if r1[i] != r2[j] { 78 | transCount++ 79 | } 80 | } 81 | } 82 | transCount /= 2 83 | 84 | // adjust for similarities in nonmatched characters 85 | distance = float64(commonChars)/float64(r1Length) + 86 | float64(commonChars)/float64(r2Length) + 87 | (float64(commonChars-transCount))/float64(commonChars) 88 | distance /= 3.0 89 | 90 | // give more weight to already-similar strings 91 | if winklerize && distance > 0.7 { 92 | 93 | // the first 4 characters in common 94 | if minLength >= 4 { 95 | j = 4 96 | } else { 97 | j = minLength 98 | } 99 | 100 | for i = 0; i < j && len(r1) > i && len(r2) > i && r1[i] == r2[i] && nan(r1[i]); i++ { 101 | } 102 | 103 | if i > 0 { 104 | distance += float64(i) * 0.1 * (1.0 - distance) 105 | } 106 | 107 | if longTolerance && (minLength > 4) && (commonChars > i+1) && 108 | (2*commonChars >= 
minLength+i) { 109 | if nan(r1[0]) { 110 | distance += (1.0 - distance) * (float64(commonChars-i-1) / 111 | (float64(r1Length) + float64(r2Length) - float64(i*2) + 2)) 112 | } 113 | } 114 | } 115 | 116 | return 117 | } 118 | 119 | // Jaro computes the Jaro edit distance between two strings. It represents 120 | // this with a float64 between 0 and 1 inclusive, with 0 indicating the two 121 | // strings are not at all similar and 1 indicating the two strings are exact 122 | // matches. 123 | // 124 | // See http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance for a 125 | // full description. 126 | func Jaro(r1 string, r2 string) (distance float64) { 127 | return jaroWinklerBase(r1, r2, false, false) 128 | } 129 | 130 | // JaroWinkler computes the Jaro-Winkler edit distance between two strings. 131 | // This is a modification of the Jaro algorithm that gives additional weight 132 | // to prefix matches. 133 | func JaroWinkler(r1 string, r2 string, longTolerance bool) (distance float64) { 134 | return jaroWinklerBase(r1, r2, longTolerance, true) 135 | } 136 | -------------------------------------------------------------------------------- /jarowinkler_test.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | import "testing" 4 | 5 | var jarotests = []struct { 6 | s1 string 7 | s2 string 8 | dist float64 9 | }{ 10 | {"", "cars", 0.0}, 11 | {"cars", "", 0.0}, 12 | {"car", "cars", 0.9166666666666666}, 13 | {"dixon", "dicksonx", 0.7666666666666666}, 14 | {"martha", "marhta", 0.9444444444444445}, 15 | {"dwayne", "duane", 0.8222222222222223}, 16 | {"martüa", "marüta", 0.9444444444444445}, 17 | {"dr", "driveway", 0.75}, 18 | } 19 | 20 | // Regular Jaro distance 21 | func TestJaro(t *testing.T) { 22 | for _, tt := range jarotests { 23 | dist := Jaro(tt.s1, tt.s2) 24 | if dist != tt.dist { 25 | t.Errorf("Jaro('%s', '%s') = %v, want %v", tt.s1, tt.s2, dist, tt.dist) 26 | } 27 | } 28 | } 29 | 30 | var jarowtests = 
[]struct { 31 | s1 string 32 | s2 string 33 | dist float64 34 | }{ 35 | {"", "cars", 0.0}, 36 | {"cars", "", 0.0}, 37 | {"dixon", "dicksonx", 0.8133333333333332}, 38 | {"martha", "marhta", 0.9611111111111111}, 39 | {"dwayne", "duane", 0.8400000000000001}, 40 | {"dr", "driveway", 0.8}, 41 | } 42 | 43 | // Jaro-Winkler distance 44 | func TestJaroWinkler(t *testing.T) { 45 | for _, tt := range jarowtests { 46 | dist := JaroWinkler(tt.s1, tt.s2, false) 47 | if dist != tt.dist { 48 | t.Errorf("JaroWinkler('%s', '%s') = %v, want %v", tt.s1, tt.s2, dist, tt.dist) 49 | } 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /levenshtein.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | // Levenshtein computes the Levenshtein distance between two 4 | // strings. The returned value - distance - is the number of insertions, 5 | // deletions, and substitutions it takes to transform one 6 | // string (s1) into another (s2). Each step in the transformation "costs" 7 | // one distance point. 
func Levenshtein(s1 string, s2 string) (distance int) {
	// index by code point, not byte
	r1 := []rune(s1)
	r2 := []rune(s2)

	// Turning a string into (or out of) the empty string takes one
	// insertion or deletion per character.
	if len(r1) == 0 {
		return len(r2)
	}
	if len(r2) == 0 {
		return len(r1)
	}

	// Only the previous row of the edit matrix is ever consulted, so a
	// single rolling row is enough. Before row i is processed, row[j]
	// holds the distance between r1[:i-1] and r2[:j]; afterwards it holds
	// the distance between r1[:i] and r2[:j].
	row := make([]int, len(r2)+1)
	for j := range row {
		row[j] = j
	}

	for i := 1; i <= len(r1); i++ {
		// diag tracks the value diagonally up-left of the current cell,
		// which would otherwise be overwritten as the row is updated.
		diag := row[0]
		row[0] = i

		for j := 1; j <= len(r2); j++ {
			above := row[j]

			if r1[i-1] == r2[j-1] {
				// Matching characters cost nothing extra.
				row[j] = diag
			} else {
				// Cheapest of substitution (diag), deletion (above)
				// and insertion (left neighbour), plus one edit.
				best := diag
				if above < best {
					best = above
				}
				if row[j-1] < best {
					best = row[j-1]
				}
				row[j] = best + 1
			}

			diag = above
		}
	}

	return row[len(r2)]
}
// LongestCommonSubsequence computes the length of the longest common
// subsequence of s1 and s2: the longest run of characters that occurs in
// both strings in the same relative order, though not necessarily
// contiguously. Characters are compared by code point, not by byte.
func LongestCommonSubsequence(s1, s2 string) int {
	r1 := []rune(s1)
	r2 := []rune(s2)

	// table[i][j] is the answer for the suffixes r1[i:] and r2[j:]. The
	// extra row and column stay zero and act as the empty-suffix base case.
	// The table is indexed by rune position, so it is sized by rune count
	// rather than byte length.
	table := make([][]int, len(r1)+1)
	for i := range table {
		table[i] = make([]int, len(r2)+1)
	}

	// Fill from the ends of both strings back towards the start so that
	// every cell only depends on cells that are already computed.
	for i := len(r1) - 1; i >= 0; i-- {
		for j := len(r2) - 1; j >= 0; j-- {
			switch {
			case r1[i] == r2[j]:
				// A shared character extends the best result of the
				// two remaining suffixes.
				table[i][j] = 1 + table[i+1][j+1]
			case table[i+1][j] >= table[i][j+1]:
				// Otherwise drop one character from either side and
				// keep the better outcome.
				table[i][j] = table[i+1][j]
			default:
				table[i][j] = table[i][j+1]
			}
		}
	}
	return table[0][0]
}
// metaphoneresult accumulates a primary and, optionally, an alternate
// phonetic code while an input is being encoded. Both codes are capped at
// maxLength characters when read back through result().
type metaphoneresult struct {
	// the maximum number of code values to calculate
	maxLength int

	// whether to calculate an alternate
	calcAlternate bool

	// no direct modifications - only through add()
	primary   bytes.Buffer
	alternate bytes.Buffer

	// length of the private buffers
	PrimaryLength   int
	AlternateLength int
}

// newMetaphoneresult returns an empty result that keeps at most maxLength
// characters per code and records an alternate code only when calcAlternate
// is true.
func newMetaphoneresult(maxLength int, calcAlternate bool) (r *metaphoneresult) {
	r = &metaphoneresult{maxLength: maxLength, calcAlternate: calcAlternate}
	return
}

// add appends c1 to the primary code and, when an alternate is being
// calculated, c2 to the alternate code. Empty strings are ignored.
func (r *metaphoneresult) add(c1 string, c2 string) {
	if c1 != "" {
		r.primary.WriteString(c1)
		r.PrimaryLength += len(c1)
	}

	if c2 != "" && r.calcAlternate {
		r.alternate.WriteString(c2)
		r.AlternateLength += len(c2)
	}
}

// isComplete reports whether every code being calculated has reached
// maxLength. The alternate code is only taken into account when it is
// actually being calculated; add() never writes to it otherwise, so
// requiring it to fill up would make completion unreachable.
func (r *metaphoneresult) isComplete() bool {
	if r.PrimaryLength < r.maxLength {
		return false
	}
	return !r.calcAlternate || r.AlternateLength >= r.maxLength
}

// result returns the primary and alternate codes, each truncated to at most
// maxLength bytes.
func (r *metaphoneresult) result() (primary string, alternate string) {
	primary = r.primary.String()
	if len(primary) > r.maxLength {
		primary = primary[0:r.maxLength]
	}
	alternate = r.alternate.String()
	if len(alternate) > r.maxLength {
		alternate = alternate[0:r.maxLength]
	}
	return
}
input.Contains(index, 2, "CY") { 115 | if input.Contains(index, 3, "CIO") || 116 | input.Contains(index, 3, "CIE") || 117 | input.Contains(index, 3, "CIA") { 118 | result.add("S", "X") 119 | } else { 120 | result.add("S", "S") 121 | } 122 | index += 2 123 | } else { 124 | result.add("K", "K") 125 | if input.Contains(index+1, 2, " C") || 126 | input.Contains(index+1, 2, " Q") || 127 | input.Contains(index+1, 2, " G") { 128 | index += 3 129 | } else if (input.Contains(index+1, 1, "C") || 130 | input.Contains(index+1, 1, "K") || 131 | input.Contains(index+1, 1, "Q")) && 132 | !(input.Contains(index+1, 2, "CE") || 133 | input.Contains(index+1, 2, "CI")) { 134 | index += 2 135 | } else { 136 | index++ 137 | } 138 | } 139 | 140 | return index 141 | } 142 | 143 | func handleCC(input runestring, result *metaphoneresult, index int) int { 144 | if input.Contains(index+2, 1, "I", "E", "H") && 145 | !input.Contains(index+2, 2, "HU") { 146 | if (index == 1 && input.SafeAt(index-1) == 'A') || 147 | (input.Contains(index-1, 5, "UCCEE", "UCCES")) { 148 | result.add("KS", "KS") 149 | } else { 150 | result.add("X", "X") 151 | } 152 | index += 3 153 | } else { 154 | result.add("K", "K") 155 | index += 2 156 | } 157 | return index 158 | } 159 | 160 | func handleCH(input runestring, result *metaphoneresult, index int) int { 161 | if index > 0 && input.Contains(index, 4, "CHAE") { 162 | result.add("K", "X") 163 | return index + 2 164 | } else if conditionCH0(input, index) { 165 | result.add("K", "K") 166 | return index + 2 167 | // TODO: combine this condition with the one above? 
168 | } else if conditionCH1(input, index) { 169 | result.add("K", "K") 170 | return index + 2 171 | } else { 172 | if index > 0 { 173 | if input.Contains(0, 2, "MC") { 174 | result.add("K", "K") 175 | } else { 176 | result.add("X", "K") 177 | } 178 | } else { 179 | result.add("X", "X") 180 | } 181 | return index + 2 182 | } 183 | } 184 | 185 | func handleD(input runestring, result *metaphoneresult, index int) int { 186 | if input.Contains(index, 2, "DG") { 187 | if input.Contains(index+2, 1, "I", "E", "Y") { 188 | result.add("J", "J") 189 | index += 3 190 | } else { 191 | result.add("TK", "TK") 192 | index += 2 193 | } 194 | } else if input.Contains(index, 2, "DT", "DD") { 195 | result.add("T", "T") 196 | index += 2 197 | } else { 198 | result.add("T", "T") 199 | index++ 200 | } 201 | return index 202 | } 203 | 204 | func handleG(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int { 205 | if input.SafeAt(index+1) == 'H' { 206 | index = handleGH(input, result, index) 207 | } else if input.SafeAt(index+1) == 'N' { 208 | if index == 1 && isVowel(input.SafeAt(0)) && !slavoGermanic { 209 | result.add("KN", "N") 210 | } else if !input.Contains(index+2, 2, "EY") && input.SafeAt(index+1) != 'Y' && !slavoGermanic { 211 | result.add("N", "KN") 212 | } else { 213 | result.add("KN", "KN") 214 | } 215 | index += 2 216 | } else if input.Contains(index+1, 2, "LI") && !slavoGermanic { 217 | result.add("KL", "L") 218 | index += 2 219 | } else if index == 0 && (input.SafeAt(index+1) == 'Y' || 220 | input.Contains(index+1, 2, "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER")) { 221 | result.add("K", "J") 222 | index += 2 223 | } else if (input.Contains(index+1, 2, "ER") || 224 | input.SafeAt(index+1) == 'Y') && 225 | !input.Contains(0, 6, "DANGER", "RANGER", "MANGER") && 226 | !input.Contains(index-1, 1, "E", "I") && 227 | !input.Contains(index-1, 3, "RGY", "OGY") { 228 | result.add("K", "J") 229 | index += 2 230 | } else if 
input.Contains(index+1, 1, "E", "I", "Y") || 231 | input.Contains(index-1, 4, "AGGI", "OGGI") { 232 | if input.Contains(0, 4, "VAN ", "VON ") || 233 | input.Contains(0, 3, "SCH") || 234 | input.Contains(index+1, 2, "ET") { 235 | result.add("K", "K") 236 | } else if input.Contains(index+1, 3, "IER") { 237 | result.add("J", "J") 238 | } else { 239 | result.add("J", "K") 240 | } 241 | index += 2 242 | } else if input.SafeAt(index+1) == 'G' { 243 | result.add("K", "K") 244 | index += 2 245 | } else { 246 | result.add("K", "K") 247 | index++ 248 | } 249 | return index 250 | } 251 | 252 | func handleGH(input runestring, result *metaphoneresult, index int) int { 253 | if index > 0 && !isVowel(input.SafeAt(index-1)) { 254 | result.add("K", "K") 255 | index += 2 256 | } else if index == 0 { 257 | if input.SafeAt(index+2) == 'I' { 258 | result.add("J", "J") 259 | } else { 260 | result.add("K", "K") 261 | } 262 | index += 2 263 | } else if (index > 1 && input.Contains(index-2, 1, "B", "H", "D")) || 264 | (index > 2 && input.Contains(index-3, 1, "B", "H", "D")) || 265 | (index > 3 && input.Contains(index-4, 1, "B", "H")) { 266 | index += 2 267 | } else { 268 | if index > 2 && input.SafeAt(index-1) == 'U' && 269 | input.Contains(index-3, 1, "C", "G", "L", "R", "T") { 270 | result.add("F", "F") 271 | } else if index > 0 && input.SafeAt(index-1) != 'I' { 272 | result.add("K", "K") 273 | } 274 | index += 2 275 | } 276 | return index 277 | } 278 | 279 | func handleH(input runestring, result *metaphoneresult, index int) int { 280 | if (index == 0 || isVowel(input.SafeAt(index-1))) && 281 | isVowel(input.SafeAt(index+1)) { 282 | result.add("H", "H") 283 | index += 2 284 | } else { 285 | index++ 286 | } 287 | return index 288 | } 289 | 290 | func handleJ(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int { 291 | if input.Contains(index, 4, "JOSE") || input.Contains(0, 4, "SAN ") { 292 | if (index == 0 && (input.SafeAt(index+4) == ' ') || 293 | len(input) == 
4) || input.Contains(0, 4, "SAN ") { 294 | result.add("H", "H") 295 | } else { 296 | result.add("J", "H") 297 | } 298 | index++ 299 | } else { 300 | if index == 0 && !input.Contains(index, 4, "JOSE") { 301 | result.add("J", "A") 302 | } else if isVowel(input.SafeAt(index-1)) && !slavoGermanic && 303 | (input.SafeAt(index+1) == 'A' || input.SafeAt(index+1) == 'O') { 304 | result.add("J", "H") 305 | } else if index == (len(input) - 1) { 306 | result.add("J", " ") 307 | } else if !input.Contains(index+1, 1, 308 | "L", "T", "K", "S", "N", "M", "B", "Z") && 309 | !input.Contains(index-1, 1, "S", "K", "L") { 310 | result.add("J", "J") 311 | } 312 | 313 | if input.SafeAt(index+1) == 'J' { 314 | index += 2 315 | } else { 316 | index++ 317 | } 318 | } 319 | return index 320 | } 321 | 322 | func handleL(input runestring, result *metaphoneresult, index int) int { 323 | if input.SafeAt(index+1) == 'L' { 324 | if conditionL0(input, index) { 325 | result.add("L", "") 326 | } else { 327 | result.add("L", "L") 328 | } 329 | index += 2 330 | } else { 331 | result.add("L", "L") 332 | index++ 333 | } 334 | return index 335 | } 336 | 337 | func handleP(input runestring, result *metaphoneresult, index int) int { 338 | if input.SafeAt(index+1) == 'H' { 339 | result.add("F", "F") 340 | index += 2 341 | } else { 342 | result.add("P", "P") 343 | if input.Contains(index+1, 1, "P", "B") { 344 | index += 2 345 | } else { 346 | index++ 347 | } 348 | } 349 | return index 350 | } 351 | 352 | func handleR(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int { 353 | if index == (len(input)-1) && !slavoGermanic && 354 | input.Contains(index-2, 2, "IE") && 355 | !input.Contains(index-4, 2, "ME", "MA") { 356 | result.add("", "R") 357 | } else { 358 | result.add("R", "R") 359 | } 360 | 361 | if input.SafeAt(index+1) == 'R' { 362 | index += 2 363 | } else { 364 | index++ 365 | } 366 | return index 367 | } 368 | 369 | func handleS(input runestring, result *metaphoneresult, index 
int, slavoGermanic bool) int { 370 | if input.Contains(index-1, 3, "ISL", "YSL") { 371 | index++ 372 | } else if index == 0 && input.Contains(index, 5, "SUGAR") { 373 | result.add("X", "S") 374 | index++ 375 | } else if input.Contains(index, 2, "SH") { 376 | if input.Contains(index+1, 4, "HEIM", "HOEK", "HOLM", "HOLZ") { 377 | result.add("S", "S") 378 | } else { 379 | result.add("X", "X") 380 | } 381 | index += 2 382 | } else if input.Contains(index, 3, "SIO", "SIA") || 383 | input.Contains(index, 4, "SIAN") { 384 | if slavoGermanic { 385 | result.add("S", "S") 386 | } else { 387 | result.add("S", "X") 388 | } 389 | index += 3 390 | } else if (index == 0 && input.Contains(index+1, 1, "M", "N", "L", "W")) || 391 | input.Contains(index+1, 1, "Z") { 392 | result.add("S", "X") 393 | if input.Contains(index+1, 1, "Z") { 394 | index += 2 395 | } else { 396 | index++ 397 | } 398 | } else if input.Contains(index, 2, "SC") { 399 | index = handleSC(input, result, index) 400 | } else { 401 | if index == len(input)-1 && 402 | input.Contains(index-2, 2, "AI", "OI") { 403 | result.add("", "S") 404 | } else { 405 | result.add("S", "S") 406 | } 407 | 408 | if input.Contains(index+1, 1, "S", "Z") { 409 | index += 2 410 | } else { 411 | index++ 412 | } 413 | } 414 | return index 415 | } 416 | 417 | func handleSC(input runestring, result *metaphoneresult, index int) int { 418 | if input.SafeAt(index+2) == 'H' { 419 | if input.Contains(index+3, 2, "OO", "ER", "EN", "UY", "ED", "EM") { 420 | if input.Contains(index+3, 2, "ER", "EN") { 421 | result.add("X", "SK") 422 | } else { 423 | result.add("SK", "SK") 424 | } 425 | } else { 426 | if index == 0 && !isVowel(input.SafeAt(3)) && input.SafeAt(3) != 'W' { 427 | result.add("X", "S") 428 | } else { 429 | result.add("X", "X") 430 | } 431 | } 432 | } else if input.Contains(index+2, 1, "I", "E", "Y") { 433 | result.add("S", "S") 434 | } else { 435 | result.add("SK", "SK") 436 | } 437 | index += 3 438 | 439 | return index 440 | } 441 | 442 | 
func handleT(input runestring, result *metaphoneresult, index int) int { 443 | if input.Contains(index, 4, "TION") { 444 | result.add("X", "X") 445 | index += 3 446 | } else if input.Contains(index, 3, "TIA", "TCH") { 447 | result.add("X", "X") 448 | index += 3 449 | } else if input.Contains(index, 2, "TH") || input.Contains(index, 3, "TTH") { 450 | if input.Contains(index+2, 2, "OM", "AM") || 451 | input.Contains(0, 4, "VAN ", "VON ") || 452 | input.Contains(0, 3, "SCH") { 453 | result.add("T", "T") 454 | } else { 455 | result.add("0", "T") 456 | } 457 | index += 2 458 | } else { 459 | result.add("T", "T") 460 | if input.Contains(index+1, 1, "T", "D") { 461 | index += 2 462 | } else { 463 | index++ 464 | } 465 | } 466 | return index 467 | } 468 | 469 | func handleW(input runestring, result *metaphoneresult, index int) int { 470 | if input.Contains(index, 2, "WR") { 471 | result.add("R", "R") 472 | index += 2 473 | } else { 474 | if index == 0 && (isVowel(input.SafeAt(index+1)) || 475 | input.Contains(index, 2, "WH")) { 476 | if isVowel(input.SafeAt(index + 1)) { 477 | result.add("A", "F") 478 | } else { 479 | result.add("A", "A") 480 | } 481 | index++ 482 | } else if (index == len(input)-1 && isVowel(input.SafeAt(index-1))) || 483 | input.Contains(index-1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") || 484 | input.Contains(0, 3, "SCH") { 485 | result.add("", "F") 486 | index++ 487 | } else if input.Contains(index, 4, "WICZ", "WITZ") { 488 | result.add("TS", "FX") 489 | index += 4 490 | } else { 491 | index++ 492 | } 493 | } 494 | return index 495 | } 496 | 497 | func handleX(input runestring, result *metaphoneresult, index int) int { 498 | if index == 0 { 499 | result.add("S", "S") 500 | index++ 501 | } else { 502 | if !((index == len(input)-1) && 503 | (input.Contains(index-3, 3, "IAU", "EAU") || 504 | input.Contains(index-2, 2, "AU", "OU"))) { 505 | result.add("KS", "KS") 506 | } 507 | 508 | if input.Contains(index+1, 1, "C", "X") { 509 | index += 2 510 | } else { 
511 | index++ 512 | } 513 | } 514 | return index 515 | } 516 | 517 | func handleZ(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int { 518 | if input.SafeAt(index+1) == 'H' { 519 | result.add("J", "J") 520 | } else { 521 | if input.Contains(index+1, 2, "ZO", "ZI", "ZA") || 522 | (slavoGermanic && (index > 0 && input.SafeAt(index-1) != 'T')) { 523 | result.add("S", "TS") 524 | } else { 525 | result.add("S", "S") 526 | } 527 | } 528 | 529 | if input.SafeAt(index+1) == 'Z' { 530 | index += 2 531 | } else { 532 | index++ 533 | } 534 | return index 535 | } 536 | 537 | /****************************************************************************** 538 | * Complex conditional handlers for letters 539 | *****************************************************************************/ 540 | func conditionC0(input runestring, index int) bool { 541 | if input.Contains(index, 4, "CHIA") { 542 | return true 543 | } else if index <= 1 { 544 | return false 545 | } else if isVowel(input.SafeAt(index - 2)) { 546 | return false 547 | } else if !input.Contains(index-1, 3, "ACH") { 548 | return false 549 | } else { 550 | c := input.SafeAt(index + 2) 551 | return (c != 'I' && c != 'E') || 552 | (input.Contains(index-2, 6, "BACHER") || 553 | input.Contains(index-2, 6, "MACHER")) 554 | } 555 | } 556 | 557 | func conditionCH0(input runestring, index int) bool { 558 | if index != 0 { 559 | return false 560 | } else if !input.Contains(index+1, 5, "HARAC", "HARIS") && 561 | !input.Contains(index+1, 3, "HOR", "HYM", "HIA", "HEM") { 562 | return false 563 | } else if input.Contains(0, 5, "CHORE") { 564 | return false 565 | } else { 566 | return true 567 | } 568 | } 569 | 570 | func conditionCH1(input runestring, index int) bool { 571 | // good god this is ugly 572 | return (input.Contains(0, 4, "VAN ", "VON ") || input.Contains(0, 3, "SCH")) || 573 | input.Contains(index-2, 6, "ORCHES", "ARCHIT", "ORCHID") || 574 | input.Contains(index+2, 1, "T", "S") || 575 | 
((input.Contains(index-1, 1, "A", "O", "U", "E") || index == 0) && 576 | (input.Contains(index+2, 1, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ") || 577 | index+1 == len(input)-1)) 578 | } 579 | 580 | func conditionL0(input runestring, index int) bool { 581 | if index == (len(input)-3) && 582 | input.Contains(index-1, 4, "ILLO", "ILLA", "ALLE") { 583 | return true 584 | } else if (input.Contains(len(input)-2, 2, "AS", "OS") || 585 | input.Contains(len(input)-1, 1, "A", "O")) && 586 | (input.Contains(index-1, 4, "ALLE")) { 587 | return true 588 | } else { 589 | return false 590 | } 591 | } 592 | 593 | func conditionM0(input runestring, index int) bool { 594 | if input.SafeAt(index+1) == 'M' { 595 | return true 596 | } 597 | 598 | return input.Contains(index-1, 3, "UMB") && 599 | ((index+1) == (len(input)-1) || 600 | input.Contains(index+2, 2, "ER")) 601 | } 602 | 603 | // DoubleMetaphone computes the Double-Metaphone value of the input string. 604 | // This value is a phonetic representation of how the string sounds, with 605 | // affordances for many different language dialects. It was originally 606 | // developed by Lawrence Phillips in the 1990s. 607 | // 608 | // More information about this algorithm can be found on Wikipedia at 609 | // http://en.wikipedia.org/wiki/Metaphone. 
610 | func DoubleMetaphone(s1 string) (string, string) { 611 | // trim, upper space 612 | s1 = cleanInput(s1) 613 | 614 | // structure to traverse the string by code point, not byte 615 | input := runestring(s1) 616 | 617 | slavoGermanic := isSlavoGermanic(s1) 618 | 619 | // where we are in the string 620 | index := 0 621 | 622 | if isSilentStart(input) { 623 | index += 1 624 | } 625 | 626 | result := newMetaphoneresult(4, true) 627 | 628 | for !result.isComplete() && index <= len(input)-1 { 629 | c := rune(input.SafeAt(index)) 630 | switch c { 631 | case 'A', 'E', 'I', 'O', 'U', 'Y': 632 | index = handleVowel(result, index) 633 | case 'B': 634 | result.add("P", "P") 635 | if input.SafeAt(index+1) == 'B' { 636 | index += 2 637 | } else { 638 | index++ 639 | } 640 | case 'Ç': 641 | result.add("S", "S") 642 | index++ 643 | case 'C': 644 | index = handleC(input, result, index) 645 | case 'D': 646 | index = handleD(input, result, index) 647 | case 'F': 648 | result.add("F", "F") 649 | if input.SafeAt(index+1) == 'F' { 650 | index += 2 651 | } else { 652 | index++ 653 | } 654 | case 'G': 655 | index = handleG(input, result, index, slavoGermanic) 656 | case 'H': 657 | index = handleH(input, result, index) 658 | case 'J': 659 | index = handleJ(input, result, index, slavoGermanic) 660 | case 'K': 661 | result.add("K", "K") 662 | if input.SafeAt(index+1) == 'K' { 663 | index += 2 664 | } else { 665 | index++ 666 | } 667 | case 'L': 668 | index = handleL(input, result, index) 669 | case 'M': 670 | result.add("M", "M") 671 | if conditionM0(input, index) { 672 | index += 2 673 | } else { 674 | index++ 675 | } 676 | case 'N': 677 | result.add("N", "N") 678 | if input.SafeAt(index+1) == 'N' { 679 | index += 2 680 | } else { 681 | index++ 682 | } 683 | case 'Ñ': 684 | result.add("N", "N") 685 | index++ 686 | case 'P': 687 | index = handleP(input, result, index) 688 | case 'Q': 689 | result.add("K", "K") 690 | if input.SafeAt(index+1) == 'Q' { 691 | index += 2 692 | } else { 693 | 
index++ 694 | } 695 | case 'R': 696 | index = handleR(input, result, index, slavoGermanic) 697 | case 'S': 698 | index = handleS(input, result, index, slavoGermanic) 699 | case 'T': 700 | index = handleT(input, result, index) 701 | case 'V': 702 | result.add("F", "F") 703 | if input.SafeAt(index+1) == 'V' { 704 | index += 2 705 | } else { 706 | index++ 707 | } 708 | case 'W': 709 | index = handleW(input, result, index) 710 | case 'X': 711 | index = handleX(input, result, index) 712 | case 'Z': 713 | index = handleZ(input, result, index, slavoGermanic) 714 | default: 715 | index++ 716 | } 717 | 718 | } 719 | 720 | return result.result() 721 | } 722 | -------------------------------------------------------------------------------- /metaphone_test.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | import ( 4 | "bufio" 5 | "compress/gzip" 6 | "os" 7 | "strings" 8 | "testing" 9 | ) 10 | 11 | func TestDoubleMetaphone(t *testing.T) { 12 | // load gzipped corpus 13 | f, err := os.Open("double_metaphone_corpus.txt.gz") 14 | if err != nil { 15 | panic("Error opening file double_metaphone_corpus.txt.gz! Exiting.") 16 | } 17 | defer f.Close() 18 | 19 | g, err := gzip.NewReader(f) 20 | if err != nil { 21 | panic("Error with supposedly gzipped file double_metaphone_corpus.txt.gz! 
Exiting.") 22 | } 23 | 24 | r := bufio.NewReader(g) 25 | 26 | line, err := r.ReadString('\n') 27 | for err == nil { 28 | line = strings.TrimRight(line, "\n") 29 | v := strings.Split(line, "|") 30 | 31 | metaphone, alternate := DoubleMetaphone(v[0]) 32 | if metaphone != v[1] || alternate != v[2] { 33 | t.Errorf("DoubleMetaphone('%s') = (%v, %v), want (%v, %v)", v[0], metaphone, alternate, v[1], v[2]) 34 | t.FailNow() 35 | } 36 | 37 | line, err = r.ReadString('\n') 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /nysiis.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | // NYSIIS computes the NYSIIS phonetic encoding of the input string. It is a 4 | // modification of the traditional Soundex algorithm. 5 | func NYSIIS(s1 string) string { 6 | cleans1 := runestring(cleanInput(s1)) 7 | input := runestring(make([]rune, 0, len(s1))) 8 | 9 | // The output can't be larger than the string itself 10 | output := runestring(make([]rune, 0, len(s1))) 11 | 12 | // 0. Remove all non-ASCII characters 13 | for _, v := range cleans1 { 14 | if v >= 65 && v <= 90 { 15 | input = append(input, v) 16 | } 17 | } 18 | 19 | if len(input) == 0 { 20 | return "" 21 | } 22 | 23 | // 1. Transcoding first characters 24 | switch input[0] { 25 | case 'M': 26 | if input.SafeSubstr(0, 3) == "MAC" { 27 | // MAC -> MCC 28 | input[1] = 'C' 29 | } 30 | case 'K': 31 | if input.SafeSubstr(0, 2) == "KN" { 32 | // KN -> NN 33 | input[0] = 'N' 34 | } else { 35 | // K -> C 36 | input[0] = 'C' 37 | } 38 | case 'P': 39 | next := input.SafeAt(1) 40 | if next == 'H' { 41 | // PH -> FF 42 | input[0] = 'F' 43 | input[1] = 'F' 44 | } else if next == 'F' { 45 | // PF -> FF 46 | input[0] = 'F' 47 | } 48 | case 'S': 49 | if input.SafeSubstr(0, 3) == "SCH" { 50 | input[1] = 'S' 51 | input[2] = 'S' 52 | } 53 | } 54 | 55 | // 2. 
Transcoding last characters 56 | switch input.SafeSubstr(len(input)-2, 2) { 57 | case "EE", "IE": 58 | // EE, IE -> Y 59 | input.Del(len(input) - 2) 60 | input[len(input)-1] = 'Y' 61 | case "DT", "RT", "RD", "NT", "ND": 62 | // DT, RT, RD, NT, ND -> D 63 | input.Del(len(input) - 2) 64 | input[len(input)-1] = 'D' 65 | } 66 | 67 | // 3. First character of key = first character of name 68 | output = append(output, input[0]) 69 | last := input[0] 70 | 71 | for i := 1; i < len(input); i++ { 72 | c := input[i] 73 | switch c { 74 | case 'A', 'I', 'O', 'U': 75 | // A, E, I, O, U -> A (E is separate) 76 | input[i] = 'A' 77 | case 'E': 78 | // EV -> AF, else A 79 | if input.SafeAt(i+1) == 'V' { 80 | input[i+1] = 'F' 81 | } 82 | input[i] = 'A' 83 | case 'Q': 84 | // Q -> G 85 | input[i] = 'G' 86 | case 'Z': 87 | // Z -> S 88 | input[i] = 'S' 89 | case 'M': 90 | // M -> N 91 | input[i] = 'N' 92 | case 'K': 93 | // KN -> N, else K -> C 94 | if input.SafeAt(i+1) == 'N' { 95 | input.Del(i) 96 | } else { 97 | input[i] = 'C' 98 | } 99 | case 'S': 100 | // SCH -> SSS 101 | if input.SafeSubstr(i, 3) == "SCH" { 102 | input[i+1] = 'S' 103 | input[i+2] = 'S' 104 | } 105 | case 'P': 106 | // PH -> FF 107 | if input.SafeAt(i+1) == 'H' { 108 | input[i] = 'F' 109 | input[i+1] = 'F' 110 | } 111 | case 'H': 112 | // H -> $(previous character) if previous character or 113 | // next character is a non-vowel 114 | prev := input.SafeAt(i - 1) 115 | next := input.SafeAt(i + 1) 116 | if !isVowelNoY(prev) || !isVowelNoY(next) { 117 | input[i] = prev 118 | } 119 | case 'W': 120 | prev := input.SafeAt(i - 1) 121 | if isVowelNoY(prev) { 122 | input[i] = prev 123 | } 124 | } 125 | 126 | if input[i] != last && input[i] != 0 { 127 | output = append(output, input[i]) 128 | } 129 | last = input[i] 130 | } 131 | 132 | // have to be careful here because we've already added the first 133 | // key value 134 | if len(output) > 1 { 135 | // remove trailing s 136 | if output.SafeAt(len(output)-1) == 'S' { 137 | 
output.Del(len(output) - 1) 138 | } 139 | 140 | // trailing AY -> Y 141 | if len(output) > 2 && output.SafeSubstr(len(output)-2, 2) == "AY" { 142 | output.Del(len(output) - 2) 143 | } 144 | 145 | // trailing A -> remove it 146 | if output.SafeAt(len(output)-1) == 'A' { 147 | output.Del(len(output) - 1) 148 | } 149 | } 150 | 151 | if len(output) > 6 { 152 | return string(output[0:6]) 153 | } else { 154 | return string(output) 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /nysiis_test.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | import "testing" 4 | 5 | var nysiistests = []struct { 6 | s1 string 7 | nysiis string 8 | }{ 9 | {"knight", "NAGT"}, 10 | {"mitchell", "MATCAL"}, 11 | {"o'daniel", "ODANAL"}, 12 | {"brown sr", "BRANSR"}, 13 | {"browne III", "BRAN"}, 14 | {"browne IV", "BRANAV"}, 15 | {"O'Banion", "OBANAN"}, 16 | {"Mclaughlin", "MCLAGL"}, 17 | {"McCormack", "MCARNA"}, 18 | {"Chapman", "CAPNAN"}, 19 | {"Silva", "SALV"}, 20 | {"McDonald", "MCDANA"}, 21 | {"Lawson", "LASAN"}, 22 | {"Jacobs", "JACAB"}, 23 | {"Greene", "GRAN"}, 24 | {"O'Brien", "OBRAN"}, 25 | {"Morrison", "MARASA"}, 26 | {"Larson", "LARSAN"}, 27 | {"Willis", "WAL"}, 28 | {"Mackenzie", "MCANSY"}, 29 | {"Carr", "CAR"}, 30 | {"Lawrence", "LARANC"}, 31 | {"Matthews", "MAT"}, 32 | {"Richards", "RACARD"}, 33 | {"Bishop", "BASAP"}, 34 | {"Franklin", "FRANCL"}, 35 | {"McDaniel", "MCDANA"}, 36 | {"Harper", "HARPAR"}, 37 | {"Lynch", "LYNC"}, 38 | {"Watkins", "WATCAN"}, 39 | {"Carlson", "CARLSA"}, 40 | {"Wheeler", "WALAR"}, 41 | {"Louis XVI", "LASXV"}, 42 | {"2002", ""}, 43 | {"1/2", ""}, 44 | {"", ""}, 45 | } 46 | 47 | // NYSIIS 48 | func TestNYIIS(t *testing.T) { 49 | for _, tt := range nysiistests { 50 | nysiis := NYSIIS(tt.s1) 51 | if nysiis != tt.nysiis { 52 | t.Errorf("NYSIIS('%s') = %v, want %v", tt.s1, nysiis, tt.nysiis) 53 | } 54 | } 55 | } 56 | 
-------------------------------------------------------------------------------- /osa.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | // OSA computes the Optimal String Alignment distance between two 4 | // strings. The returned value - distance - is the number of insertions, 5 | // deletions, substitutions, and transpositions it takes to transform one 6 | // string (s1) into another (s2). Each step in the transformation "costs" 7 | // one distance point. It is similar to Damerau-Levenshtein, but is simpler 8 | // because it does not allow multiple edits on any substring. 9 | func OSA(s1 string, s2 string) (distance int) { 10 | // index by code point, not byte 11 | r1 := []rune(s1) 12 | r2 := []rune(s2) 13 | 14 | rows := len(r1) + 1 15 | cols := len(r2) + 1 16 | 17 | var i, j, d1, d2, d3, d_now, cost int 18 | 19 | dist := make([]int, rows*cols) 20 | 21 | for i = 0; i < rows; i++ { 22 | dist[i*cols] = i 23 | } 24 | 25 | for j = 0; j < cols; j++ { 26 | dist[j] = j 27 | } 28 | 29 | for i = 1; i < rows; i++ { 30 | for j = 1; j < cols; j++ { 31 | if r1[i-1] == r2[j-1] { 32 | cost = 0 33 | } else { 34 | cost = 1 35 | } 36 | 37 | d1 = dist[((i-1)*cols)+j] + 1 38 | d2 = dist[(i*cols)+(j-1)] + 1 39 | d3 = dist[((i-1)*cols)+(j-1)] + cost 40 | 41 | d_now = min(d1, min(d2, d3)) 42 | 43 | if i > 2 && j > 2 && r1[i-1] == r2[j-2] && 44 | r1[i-2] == r2[j-1] { 45 | d1 = dist[((i-2)*cols)+(j-2)] + cost 46 | d_now = min(d_now, d1) 47 | } 48 | 49 | dist[(i*cols)+j] = d_now 50 | } 51 | } 52 | 53 | distance = dist[(cols*rows)-1] 54 | 55 | return 56 | } 57 | -------------------------------------------------------------------------------- /osa_test.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | import "testing" 4 | 5 | var osatests = []struct { 6 | s1 string 7 | s2 string 8 | dist int 9 | }{ 10 | // insertion 11 | {"car", "cars", 1}, 12 | // substitution 13 | 
{"library", "librari", 1}, 14 | // deletion 15 | {"library", "librar", 1}, 16 | // transposition 17 | {"library", "librayr", 1}, 18 | // one empty, left 19 | {"", "library", 7}, 20 | // one empty, right 21 | {"library", "", 7}, 22 | // two empties 23 | {"", "", 0}, 24 | // unicode stuff! 25 | {"Schüßler", "Schübler", 1}, 26 | {"Schüßler", "Schußler", 1}, 27 | {"Schüßler", "Schüßler", 0}, 28 | {"Schßüler", "Schüßler", 1}, 29 | {"Schüßler", "Schüler", 1}, 30 | {"Schüßler", "Schüßlers", 1}, 31 | // difference between DL and OSA. This is OSA, so it should be 3. 32 | {"ca", "abc", 3}, 33 | } 34 | 35 | // OSA (Optimal String Alignment) 36 | func TestOSA(t *testing.T) { 37 | for _, tt := range osatests { 38 | dist := OSA(tt.s1, tt.s2) 39 | if dist != tt.dist { 40 | t.Errorf("OSA('%s', '%s') = %v, want %v", tt.s1, tt.s2, dist, tt.dist) 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /phonex.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | func preProcess(input []rune) []rune { 4 | output := runestring(make([]rune, 0, len(input))) 5 | 6 | // 0. Remove all non-ASCII characters 7 | for _, v := range input { 8 | if v >= 65 && v <= 90 { 9 | output = append(output, v) 10 | } 11 | } 12 | 13 | // 1. Remove all trailing 'S' characters at the end of the name 14 | for i := len(output) - 1; i >= 0 && output[i] == 'S'; i-- { 15 | output.Del(i) 16 | } 17 | 18 | // 2. Convert leading letter pairs as follows 19 | // KN -> N, PH -> F, WR -> R 20 | switch output.SafeSubstr(0, 2) { 21 | case "KN": 22 | output = output[1:] 23 | case "PH": 24 | output[0] = 'F' // H will be ignored anyway 25 | case "WR": 26 | output = output[1:] 27 | } 28 | 29 | // 3a. Convert leading single letters as follows: 30 | // H -> Remove 31 | if output.SafeAt(0) == 'H' { 32 | output = output[1:] 33 | } 34 | 35 | // 3a. 
Convert leading single letters as follows: 36 | // E,I,O,U,Y -> A 37 | // P -> B 38 | // V -> F 39 | // K,Q -> C 40 | // J -> G 41 | // Z -> S 42 | switch output.SafeAt(0) { 43 | case 'E', 'I', 'O', 'U', 'Y': 44 | output[0] = 'A' 45 | case 'P': 46 | output[0] = 'B' 47 | case 'V': 48 | output[0] = 'F' 49 | case 'K', 'Q': 50 | output[0] = 'C' 51 | case 'J': 52 | output[0] = 'G' 53 | case 'Z': 54 | output[0] = 'S' 55 | } 56 | 57 | return output 58 | } 59 | 60 | // Phonex computes the Phonex phonetic encoding of the input string. Phonex is 61 | // a modification of the venerable Soundex algorithm. It accounts for a few 62 | // more letter combinations to improve accuracy on some data sets. 63 | // 64 | // This implementation is based off of the original C implementation by the 65 | // creator - A. J. Lait - as found in his research paper entitled "An 66 | // Assessment of Name Matching Algorithms." 67 | func Phonex(s1 string) string { 68 | 69 | // preprocess 70 | s1 = cleanInput(s1) 71 | 72 | input := runestring(preProcess([]rune(s1))) 73 | 74 | result := make([]rune, 0, len(input)) 75 | 76 | last := rune(0) 77 | code := rune(0) 78 | for i := 0; i < len(input) && 79 | input[i] != ' ' && 80 | input[i] != ',' && 81 | len(result) < 4; i++ { 82 | switch input[i] { 83 | case 'B', 'P', 'F', 'V': 84 | code = '1' 85 | case 'C', 'S', 'K', 'G', 'J', 'Q', 'X', 'Z': 86 | code = '2' 87 | case 'D', 'T': 88 | if input.SafeAt(i+1) != 'C' { 89 | code = '3' 90 | } 91 | case 'L': 92 | if isVowel(input.SafeAt(i+1)) || i == len(input)-1 { 93 | code = '4' 94 | } 95 | case 'M', 'N': 96 | nextChar := input.SafeAt(i + 1) 97 | if nextChar == 'D' || nextChar == 'G' { 98 | // ignore next character 99 | i++ 100 | } 101 | code = '5' 102 | case 'R': 103 | if isVowel(input.SafeAt(i+1)) || i == len(input)-1 { 104 | code = '6' 105 | } 106 | default: 107 | code = 0 108 | } 109 | 110 | if last != code && code != 0 && i != 0 { 111 | result = append(result, code) 112 | } 113 | 114 | // special case for 1st 
character: we use the actual character 115 | if i == 0 { 116 | result = append(result, input[i]) 117 | last = code 118 | } else { 119 | last = result[len(result)-1] 120 | } 121 | } 122 | 123 | for len(result) < 4 { 124 | result = append(result, '0') 125 | } 126 | 127 | return string(result) 128 | } 129 | -------------------------------------------------------------------------------- /phonex_test.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | import "testing" 4 | 5 | // test cases from http://rosettacode.org/wiki/phonex#F.23 6 | var phonextests = []struct { 7 | s1 string 8 | phonex string 9 | }{ 10 | {"123 testsss", "T230"}, 11 | {"24/7 test", "T230"}, 12 | {"A", "A000"}, 13 | {"Lee", "L000"}, 14 | {"Kuhne", "C500"}, 15 | {"Meyer-Lansky", "M452"}, 16 | {"Oepping", "A150"}, 17 | {"Daley", "D400"}, 18 | {"Dalitz", "D432"}, 19 | {"Duhlitz", "D432"}, 20 | {"Dull", "D400"}, 21 | {"De Ledes", "D430"}, 22 | {"Sandemann", "S500"}, 23 | {"Schüßler", "S460"}, 24 | {"Schmidt", "S530"}, 25 | {"Sinatra", "S536"}, 26 | {"Heinrich", "A562"}, 27 | {"Hammerschlag", "A524"}, 28 | {"Williams", "W450"}, 29 | {"Wilms", "W500"}, 30 | {"Wilson", "W250"}, 31 | {"Worms", "W500"}, 32 | {"Zedlitz", "S343"}, 33 | {"Zotteldecke", "S320"}, 34 | {"ZYX test", "S232"}, 35 | {"Scherman", "S500"}, 36 | {"Schurman", "S500"}, 37 | {"Sherman", "S500"}, 38 | {"Shermansss", "S500"}, 39 | {"Shireman", "S650"}, 40 | {"Shurman", "S500"}, 41 | {"Euler", "A460"}, 42 | {"Ellery", "A460"}, 43 | {"Hilbert", "A130"}, 44 | {"Heilbronn", "A165"}, 45 | {"Gauss", "G000"}, 46 | {"Ghosh", "G200"}, 47 | {"Knuth", "N300"}, 48 | {"Kant", "C530"}, 49 | {"Lloyd", "L430"}, 50 | {"Ladd", "L300"}, 51 | {"Lukasiewicz", "L200"}, 52 | {"Lissajous", "L200"}, 53 | {"Ashcraft", "A261"}, 54 | {"Philip", "F410"}, 55 | {"Fripp", "F610"}, 56 | {"Czarkowska", "C200"}, 57 | {"Hornblower", "A514"}, 58 | {"Looser", "L260"}, 59 | {"Wright", "R230"}, 60 | {"Phonic", "F520"}, 61 
| {"Quickening", "C250"}, 62 | {"Kuickening", "C250"}, 63 | {"Joben", "G150"}, 64 | {"Zelda", "S300"}, 65 | {"S", "0000"}, 66 | {"H", "0000"}, 67 | {"", "0000"}, 68 | } 69 | 70 | // phonex 71 | func TestPhonex(t *testing.T) { 72 | for _, tt := range phonextests { 73 | phonex := Phonex(tt.s1) 74 | if phonex != tt.phonex { 75 | t.Errorf("Phonex('%s') = %v, want %v", tt.s1, phonex, tt.phonex) 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /runestring.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | type runestring []rune 4 | 5 | // A safe way to index a runestring. It will return a null rune if you try 6 | // to index outside of the bounds of the runestring. 7 | func (r *runestring) SafeAt(pos int) rune { 8 | if pos < 0 || pos >= len(*r) { 9 | return 0 10 | } else { 11 | return (*r)[pos] 12 | } 13 | } 14 | 15 | // A safe way to obtain a substring of a runestring. It will return a null 16 | // string ("") if you index somewhere outside its bounds. 17 | func (r *runestring) SafeSubstr(pos int, length int) string { 18 | if pos < 0 || pos > len(*r) || (pos+length) > len(*r) { 19 | return "" 20 | } else { 21 | return string((*r)[pos : pos+length]) 22 | } 23 | } 24 | 25 | // Delete characters at positions pos. It will do nothing if you provide 26 | // an index outside the bounds of the runestring. 27 | func (r *runestring) Del(pos ...int) { 28 | for _, i := range pos { 29 | if i >= 0 && i <= len(*r) { 30 | *r = append((*r)[:i], (*r)[i+1:]...) 31 | } 32 | } 33 | } 34 | 35 | // A helper to determine if any substrings exist within the given runestring. 
36 | func (r *runestring) Contains(start int, length int, criteria ...string) bool { 37 | substring := r.SafeSubstr(start, length) 38 | for _, c := range criteria { 39 | if substring == c { 40 | return true 41 | } 42 | } 43 | return false 44 | } 45 | -------------------------------------------------------------------------------- /smithwaterman.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | const GAP_COST = float64(0.5) 4 | 5 | func getCost(r1 []rune, r1Index int, r2 []rune, r2Index int) float64 { 6 | if r1[r1Index] == r2[r2Index] { 7 | return 1.0 8 | } else { 9 | return -2.0 10 | } 11 | } 12 | 13 | // SmithWaterman computes the Smith-Waterman local sequence alignment for the 14 | // two input strings. This was originally designed to find similar regions in 15 | // strings representing DNA or protein sequences. 16 | func SmithWaterman(s1 string, s2 string) float64 { 17 | var cost float64 18 | 19 | // index by code point, not byte 20 | r1 := []rune(s1) 21 | r2 := []rune(s2) 22 | 23 | r1Len := len(r1) 24 | r2Len := len(r2) 25 | 26 | if r1Len == 0 { 27 | return float64(r2Len) 28 | } 29 | 30 | if r2Len == 0 { 31 | return float64(r1Len) 32 | } 33 | 34 | d := make([][]float64, r1Len) 35 | for i := range d { 36 | d[i] = make([]float64, r2Len) 37 | } 38 | 39 | var maxSoFar float64 40 | for i := 0; i < r1Len; i++ { 41 | // substitution cost 42 | cost = getCost(r1, i, r2, 0) 43 | if i == 0 { 44 | d[0][0] = max(0.0, max(-GAP_COST, cost)) 45 | } else { 46 | d[i][0] = max(0.0, max(d[i-1][0]-GAP_COST, cost)) 47 | } 48 | 49 | // save if it is the biggest thus far 50 | if d[i][0] > maxSoFar { 51 | maxSoFar = d[i][0] 52 | } 53 | } 54 | 55 | for j := 0; j < r2Len; j++ { 56 | // substitution cost 57 | cost = getCost(r1, 0, r2, j) 58 | if j == 0 { 59 | d[0][0] = max(0, max(-GAP_COST, cost)) 60 | } else { 61 | d[0][j] = max(0, max(d[0][j-1]-GAP_COST, cost)) 62 | } 63 | 64 | // save if it is the biggest thus far 65 | if 
d[0][j] > maxSoFar { 66 | maxSoFar = d[0][j] 67 | } 68 | } 69 | 70 | for i := 1; i < r1Len; i++ { 71 | for j := 1; j < r2Len; j++ { 72 | cost = getCost(r1, i, r2, j) 73 | 74 | // find the lowest cost 75 | d[i][j] = max( 76 | max(0, d[i-1][j]-GAP_COST), 77 | max(d[i][j-1]-GAP_COST, d[i-1][j-1]+cost)) 78 | 79 | // save if it is the biggest thus far 80 | if d[i][j] > maxSoFar { 81 | maxSoFar = d[i][j] 82 | } 83 | } 84 | } 85 | 86 | return maxSoFar 87 | } 88 | -------------------------------------------------------------------------------- /smithwaterman_test.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | import "testing" 4 | 5 | var swtests = []struct { 6 | s1 string 7 | s2 string 8 | dist float64 9 | }{ 10 | // insertion 11 | {"car", "cars", 3.0}, 12 | // substitution 13 | {"library", "librari", 6.0}, 14 | // deletion 15 | {"library", "librar", 6.0}, 16 | // transposition 17 | {"library", "librayr", 5.5}, 18 | // one empty, left 19 | {"", "library", 7.0}, 20 | // one empty, right 21 | {"library", "", 7.0}, 22 | // two empties 23 | {"", "", 0.0}, 24 | // unicode stuff! 25 | {"Schüßler", "Schübler", 6.0}, 26 | {"Ant Zucaro", "Anthony Zucaro", 8.0}, 27 | {"Schüßler", "Schüßler", 8.0}, 28 | {"Schßüler", "Schüßler", 6.0}, 29 | {"Schüßler", "Schüler", 6.5}, 30 | {"Schüßler", "Schüßlers", 8.0}, 31 | } 32 | 33 | // Smith-Waterman 34 | func TestSmithWaterman(t *testing.T) { 35 | for _, tt := range swtests { 36 | dist := SmithWaterman(tt.s1, tt.s2) 37 | if dist != tt.dist { 38 | t.Errorf("SmithWaterman('%s', '%s') = %v, want %v", tt.s1, tt.s2, dist, tt.dist) 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /soundex.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | import "strings" 4 | 5 | // Soundex computes the Soundex phonetic representation of the input string. 
It 6 | // attempts to encode homophones with the same characters. More information can 7 | // be found at http://en.wikipedia.org/wiki/Soundex. 8 | func Soundex(s1 string) string { 9 | if len(s1) == 0 { 10 | return "" 11 | } 12 | 13 | // we should work with all uppercase 14 | s1 = strings.ToUpper(s1) 15 | 16 | input := NewString(s1) 17 | 18 | // the encoded value 19 | enc := input.Slice(0, 1) 20 | 21 | c := "" 22 | prev := "" 23 | hw := false 24 | 25 | for i := 0; i < input.RuneCount(); i++ { 26 | switch rune(input.At(i)) { 27 | case 'B', 'F', 'P', 'V': 28 | c = "1" 29 | case 'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z': 30 | c = "2" 31 | case 'D', 'T': 32 | c = "3" 33 | case 'L': 34 | c = "4" 35 | case 'M', 'N': 36 | c = "5" 37 | case 'R': 38 | c = "6" 39 | case 'H', 'W': 40 | hw = true 41 | default: 42 | c = "" 43 | } 44 | 45 | // don't encode the first position, but we need its code value 46 | // to prevent repeats 47 | if c != "" && c != prev && i > 0 { 48 | // if the next encoded digit is different, we can add it right away 49 | // if it is the same, though, it must not have been preceded 50 | // by an 'H' or a 'W' 51 | if enc[len(enc)-1:len(enc)] != c || !hw { 52 | enc = enc + c 53 | } 54 | 55 | // we're done when we reach four encoded characters 56 | if len(enc) == 4 { 57 | break 58 | } 59 | } 60 | 61 | prev = c 62 | hw = false 63 | } 64 | 65 | // if we've fallen short of 4 "real" encoded characters, 66 | // it gets padded with zeros 67 | for len(enc) < 4 { 68 | enc = enc + "0" 69 | } 70 | 71 | return enc 72 | } 73 | -------------------------------------------------------------------------------- /soundex_test.go: -------------------------------------------------------------------------------- 1 | package matchr 2 | 3 | import "testing" 4 | 5 | // test cases from http://rosettacode.org/wiki/Soundex#F.23 6 | var soundextests = []struct { 7 | s1 string 8 | soundex string 9 | }{ 10 | {"Ashcraft", "A261"}, 11 | {"Ashhhcraft", "A261"}, 12 | {"Ashcroft", "A261"}, 13 | 
{"Burroughs", "B620"}, 14 | {"Burrows", "B620"}, 15 | {"Ekzampul", "E251"}, 16 | {"Example", "E251"}, 17 | {"Ellery", "E460"}, 18 | {"Euler", "E460"}, 19 | {"Ghosh", "G200"}, 20 | {"Gauss", "G200"}, 21 | {"Gutierrez", "G362"}, 22 | {"Heilbronn", "H416"}, 23 | {"Hilbert", "H416"}, 24 | {"Jackson", "J250"}, 25 | {"Kant", "K530"}, 26 | {"Knuth", "K530"}, 27 | {"Lee", "L000"}, 28 | {"Lukasiewicz", "L222"}, 29 | {"Lissajous", "L222"}, 30 | {"Ladd", "L300"}, 31 | {"Lloyd", "L300"}, 32 | {"Moses", "M220"}, 33 | {"O'Hara", "O600"}, 34 | {"Pfister", "P236"}, 35 | {"Rubin", "R150"}, 36 | {"Robert", "R163"}, 37 | {"Rupert", "R163"}, 38 | {"Soundex", "S532"}, 39 | {"Sownteks", "S532"}, 40 | {"Tymczak", "T522"}, 41 | {"VanDeusen", "V532"}, 42 | {"Washington", "W252"}, 43 | {"Wheaton", "W350"}, 44 | } 45 | 46 | // Soundex 47 | func TestSoundex(t *testing.T) { 48 | for _, tt := range soundextests { 49 | soundex := Soundex(tt.s1) 50 | if soundex != tt.soundex { 51 | t.Errorf("Soundex('%s') = %v, want %v", tt.s1, soundex, tt.soundex) 52 | } 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /utf8.go: -------------------------------------------------------------------------------- 1 | // Copyright 2009 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package matchr 6 | 7 | import ( 8 | "errors" 9 | "unicode/utf8" 10 | ) 11 | 12 | // String wraps a regular string with a small structure that provides more 13 | // efficient indexing by code point index, as opposed to byte index. 14 | // Scanning incrementally forwards or backwards is O(1) per index operation 15 | // (although not as fast a range clause going forwards). Random access is 16 | // O(N) in the length of the string, but the overhead is less than always 17 | // scanning from the beginning. 18 | // If the string is ASCII, random access is O(1). 
// String wraps a regular string with a small structure that provides more
// efficient indexing by code point index, as opposed to byte index.
// Scanning incrementally forwards or backwards is O(1) per index operation
// (although not as fast a range clause going forwards). Random access is
// O(N) in the length of the string, but the overhead is less than always
// scanning from the beginning.
// If the string is ASCII, random access is O(1).
// Unlike the built-in string type, String has internal mutable state and
// is not thread-safe.
type String struct {
	str      string
	numRunes int
	// If width > 0, the rune at runePos starts at bytePos and has the specified width.
	width    int
	bytePos  int
	runePos  int
	nonASCII int // byte index of the first non-ASCII rune.
}

// NewString allocates a String and initializes it with the provided contents.
func NewString(contents string) *String {
	s := &String{}
	return s.Init(contents)
}

// Init (re)initializes s to hold contents and returns s. The cached scan
// position is reset to the start of the string.
func (s *String) Init(contents string) *String {
	s.str, s.bytePos, s.runePos = contents, 0, 0

	for i := 0; i < len(contents); i++ {
		if contents[i] < utf8.RuneSelf {
			continue
		}
		// First non-ASCII byte found: rune and byte indexes diverge from here.
		s.numRunes = utf8.RuneCountInString(contents)
		_, s.width = utf8.DecodeRuneInString(contents)
		s.nonASCII = i
		return s
	}

	// Pure ASCII (this includes the empty string): one byte per rune.
	s.numRunes, s.width, s.nonASCII = len(contents), 0, len(contents)
	return s
}

// String returns the wrapped contents, which also makes a String directly
// printable by fmt.Print.
func (s *String) String() string {
	return s.str
}

// RuneCount returns how many runes (Unicode code points) the String holds.
func (s *String) RuneCount() int {
	return s.numRunes
}

// IsASCII reports whether the String contains only ASCII bytes.
func (s *String) IsASCII() bool {
	return s.width == 0
}

// Slice returns the substring covering rune positions [i:j]. It panics when
// the indexes are out of range.
func (s *String) Slice(i, j int) string {
	// Entirely inside the ASCII prefix: rune indexes are byte indexes, and the
	// ordinary slice bounds check reports a bad index.
	if j < s.nonASCII {
		return s.str[i:j]
	}

	switch {
	case i < 0, j > s.numRunes, i > j:
		panic(errors.New("utf8.String: slice index out of range"))
	case i == j:
		return ""
	}

	// For non-ASCII positions, At leaves bytePos on the indexed character.
	// The lower bound is resolved before the upper bound.
	low := 0
	if i < s.nonASCII {
		low = i
	} else if i == s.numRunes {
		low = len(s.str)
	} else {
		s.At(i)
		low = s.bytePos
	}

	high := len(s.str)
	if j != s.numRunes {
		s.At(j)
		high = s.bytePos
	}

	return s.str[low:high]
}

// At returns the rune with index i in the String. The sequence of runes is the
// same as iterating over the contents with a "for range" clause. It panics
// when i is out of range.
func (s *String) At(i int) int {
	// Inside the ASCII prefix a rune index is a byte index; the ordinary index
	// bounds check reports a bad index.
	if i < s.nonASCII {
		return int(s.str[i])
	}

	// Past the ASCII prefix the index has to be validated explicitly.
	if i < 0 || i >= s.numRunes {
		panic(errors.New("utf8.String: index out of range"))
	}

	var r rune

	// Cheap cases first: one step back, one step ahead, the current position,
	// the very first rune and the very last rune.
	if i == s.runePos-1 {
		r, s.width = utf8.DecodeLastRuneInString(s.str[:s.bytePos])
		s.runePos = i
		s.bytePos -= s.width
		return int(r)
	}
	if i == s.runePos+1 {
		// Step over the current rune, then decode as "current position".
		s.runePos = i
		s.bytePos += s.width
	}
	if i == s.runePos {
		r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
		return int(r)
	}
	if i == 0 {
		r, s.width = utf8.DecodeRuneInString(s.str)
		s.runePos, s.bytePos = 0, 0
		return int(r)
	}
	if i == s.numRunes-1 {
		r, s.width = utf8.DecodeLastRuneInString(s.str)
		s.runePos = i
		s.bytePos = len(s.str) - s.width
		return int(r)
	}

	// Otherwise scan linearly, starting from the first non-ASCII position, the
	// cached position, or the end of the string.
	backward := false
	switch {
	case i < s.runePos && i < (s.runePos-s.nonASCII)/2:
		// Restart at the first non-ASCII rune (byte and rune index coincide
		// there) and scan forward.
		s.bytePos, s.runePos = s.nonASCII, s.nonASCII
	case i < s.runePos:
		// Walk back from the cached position.
		backward = true
	case i-s.runePos < (s.numRunes-s.runePos)/2:
		// Walk forward from the cached position.
	default:
		// Walk back from the end of the string.
		s.bytePos, s.runePos = len(s.str), s.numRunes
		backward = true
	}

	if !backward {
		r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
		for s.runePos != i {
			s.runePos++
			s.bytePos += s.width
			r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
		}
		return int(r)
	}

	for {
		r, s.width = utf8.DecodeLastRuneInString(s.str[:s.bytePos])
		s.runePos--
		s.bytePos -= s.width
		if s.runePos == i {
			return int(r)
		}
	}
}

// We want the panic in At(i) to satisfy os.Error, because that's what
// runtime panics satisfy, but we can't import os. This is our solution.

// error is the type of the error returned if a user calls String.At(i) with i out of range.
// It satisfies os.Error and runtime.Error.
// type error string

/*
func (err error) String() string {
	return string(err)
}

func (err error) RunTimeError() {
}
*/

// min returns the smaller of two integers.
func min(a int, b int) (res int) {
	res = b
	if a < b {
		res = a
	}
	return res
}

// maxI returns the larger of two integers.
func maxI(a int, b int) (res int) {
	res = a
	if a < b {
		res = b
	}
	return res
}

// max returns the larger of two float64s; when the comparison a < b is false
// (including when either value is NaN) it returns a.
func max(a float64, b float64) (res float64) {
	res = a
	if a < b {
		res = b
	}
	return res
}
42 | func nan(c rune) bool { 43 | return ((c > 57) || (c < 48)) 44 | } 45 | 46 | // Round a float64 to the given precision 47 | // 48 | // http://play.golang.org/p/S654PxAe_N 49 | // 50 | // (via Rory McGuire at 51 | // https://groups.google.com/forum/#!topic/golang-nuts/ITZV08gAugI) 52 | func round(x float64, prec int) float64 { 53 | if math.IsNaN(x) || math.IsInf(x, 0) { 54 | return x 55 | } 56 | 57 | sign := 1.0 58 | if x < 0 { 59 | sign = -1 60 | x *= -1 61 | } 62 | 63 | var rounder float64 64 | pow := math.Pow(10, float64(prec)) 65 | intermed := x * pow 66 | _, frac := math.Modf(intermed) 67 | 68 | if frac >= 0.5 { 69 | rounder = math.Ceil(intermed) 70 | } else { 71 | rounder = math.Floor(intermed) 72 | } 73 | 74 | return rounder / pow * sign 75 | } 76 | 77 | // A helper to determine if any substrings exist within the given string 78 | func contains(value *String, start int, length int, criteria ...string) bool { 79 | substring := substring(value, start, length) 80 | for _, c := range criteria { 81 | if substring == c { 82 | return true 83 | } 84 | } 85 | return false 86 | } 87 | 88 | // A fault-tolerant version of Slice. It will return nothing ("") if the index 89 | // is out of bounds. This allows substring-ing without having to bound check 90 | // every time. 
91 | func substring(value *String, start int, length int) string { 92 | if start >= 0 && start+length <= value.RuneCount() { 93 | return value.Slice(start, start+length) 94 | } else { 95 | return "" 96 | } 97 | } 98 | 99 | func isVowel(c rune) bool { 100 | switch c { 101 | case 'A', 'E', 'I', 'O', 'U', 'Y': 102 | return true 103 | default: 104 | return false 105 | } 106 | } 107 | 108 | func isVowelNoY(c rune) bool { 109 | switch c { 110 | case 'A', 'E', 'I', 'O', 'U': 111 | return true 112 | default: 113 | return false 114 | } 115 | } 116 | 117 | func cleanInput(input string) string { 118 | return strings.ToUpper(strings.TrimSpace(input)) 119 | } 120 | --------------------------------------------------------------------------------