├── COPYING.txt
├── README.md
├── damerau_levenshtein.go
├── damerau_levenshtein_test.go
├── double_metaphone_corpus.txt.gz
├── go.mod
├── hamming.go
├── hamming_test.go
├── jarowinkler.go
├── jarowinkler_test.go
├── levenshtein.go
├── levenshtein_test.go
├── longestcommonsubsequence.go
├── longestcommonsubsequence_test.go
├── metaphone.go
├── metaphone_test.go
├── nysiis.go
├── nysiis_test.go
├── osa.go
├── osa_test.go
├── phonex.go
├── phonex_test.go
├── runestring.go
├── smithwaterman.go
├── smithwaterman_test.go
├── soundex.go
├── soundex_test.go
├── utf8.go
└── util.go


/COPYING.txt:
--------------------------------------------------------------------------------
 1 | Matchr: an approximate string matching library for the Go programming language
 2 | 
 3 | Copyright (C) 2013-2014 Ant Zucaro
 4 | 
 5 | This program is free software; you can redistribute it and/or modify
 6 | it under the terms of the GNU General Public License as published by
 7 | the Free Software Foundation; either version 2 of the License, or
 8 | (at your option) any later version.
 9 | 
10 | This program is distributed in the hope that it will be useful,
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13 | GNU General Public License for more details.
14 | 
15 | You should have received a copy of the GNU General Public License
16 | along with this program; if not, write to the Free Software
17 | Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
18 | 
19 | You can contact Ant Zucaro at azucaro at gmail dot com.
20 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # matchr
 2 | 
 3 | [![Go Reference](https://pkg.go.dev/badge/github.com/antzucaro/matchr.svg)](https://pkg.go.dev/github.com/antzucaro/matchr)
 4 | 
 5 | An approximate string matching library for the [Go programming language](http://www.golang.org).
 6 | 
 7 | ## Rationale
 8 | 
 9 | Data used in record linkage can often be of dubious quality. Typographical 
10 | errors or changing data elements (to name a few things) make establishing similarity between two sets of data 
11 | difficult. Rather than use exact string comparison in such situations, it is
12 | vital to have a means to identify how similar two strings are. Similarity functions can cater
13 | to certain data sets in order to make better matching decisions. The matchr library provides
14 | several of these similarity functions.
15 | 


--------------------------------------------------------------------------------
/damerau_levenshtein.go:
--------------------------------------------------------------------------------
  1 | package matchr
  2 | 
  3 | // DamerauLevenshtein computes the Damerau-Levenshtein distance between two
  4 | // strings. The returned value - distance - is the number of insertions,
  5 | // deletions, substitutions, and transpositions it takes to transform one
  6 | // string (s1) into another (s2). Each step in the transformation "costs"
  7 | // one distance point. It is similar to the Optimal String Alignment,
  8 | // algorithm, but is more complex because it allows multiple edits on
  9 | // substrings.
 10 | //
 11 | // This implementation is based off of the one found on Wikipedia at
 12 | // http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Distance_with_adjacent_transpositions
 13 | // as well as KevinStern's Java implementation found at
 14 | // https://github.com/KevinStern/software-and-algorithms.
 15 | func DamerauLevenshtein(s1 string, s2 string) (distance int) {
 16 | 	// index by code point, not byte
 17 | 	r1 := []rune(s1)
 18 | 	r2 := []rune(s2)
 19 | 
 20 | 	// the maximum possible distance
 21 | 	inf := len(r1) + len(r2)
 22 | 
 23 | 	// if one string is blank, we needs insertions
 24 | 	// for all characters in the other one
 25 | 	if len(r1) == 0 {
 26 | 		return len(r2)
 27 | 	}
 28 | 
 29 | 	if len(r2) == 0 {
 30 | 		return len(r1)
 31 | 	}
 32 | 
 33 | 	// construct the edit-tracking matrix
 34 | 	matrix := make([][]int, len(r1))
 35 | 	for i := range matrix {
 36 | 		matrix[i] = make([]int, len(r2))
 37 | 	}
 38 | 
 39 | 	// seen characters
 40 | 	seenRunes := make(map[rune]int)
 41 | 
 42 | 	if r1[0] != r2[0] {
 43 | 		matrix[0][0] = 1
 44 | 	}
 45 | 
 46 | 	seenRunes[r1[0]] = 0
 47 | 	for i := 1; i < len(r1); i++ {
 48 | 		deleteDist := matrix[i-1][0] + 1
 49 | 		insertDist := (i+1)*1 + 1
 50 | 		var matchDist int
 51 | 		if r1[i] == r2[0] {
 52 | 			matchDist = i
 53 | 		} else {
 54 | 			matchDist = i + 1
 55 | 		}
 56 | 		matrix[i][0] = min(min(deleteDist, insertDist), matchDist)
 57 | 	}
 58 | 
 59 | 	for j := 1; j < len(r2); j++ {
 60 | 		deleteDist := (j + 1) * 2
 61 | 		insertDist := matrix[0][j-1] + 1
 62 | 		var matchDist int
 63 | 		if r1[0] == r2[j] {
 64 | 			matchDist = j
 65 | 		} else {
 66 | 			matchDist = j + 1
 67 | 		}
 68 | 
 69 | 		matrix[0][j] = min(min(deleteDist, insertDist), matchDist)
 70 | 	}
 71 | 
 72 | 	for i := 1; i < len(r1); i++ {
 73 | 		var maxSrcMatchIndex int
 74 | 		if r1[i] == r2[0] {
 75 | 			maxSrcMatchIndex = 0
 76 | 		} else {
 77 | 			maxSrcMatchIndex = -1
 78 | 		}
 79 | 
 80 | 		for j := 1; j < len(r2); j++ {
 81 | 			swapIndex, ok := seenRunes[r2[j]]
 82 | 			jSwap := maxSrcMatchIndex
 83 | 			deleteDist := matrix[i-1][j] + 1
 84 | 			insertDist := matrix[i][j-1] + 1
 85 | 			matchDist := matrix[i-1][j-1]
 86 | 			if r1[i] != r2[j] {
 87 | 				matchDist += 1
 88 | 			} else {
 89 | 				maxSrcMatchIndex = j
 90 | 			}
 91 | 
 92 | 			// for transpositions
 93 | 			var swapDist int
 94 | 			if ok && jSwap != -1 {
 95 | 				iSwap := swapIndex
 96 | 				var preSwapCost int
 97 | 				if iSwap == 0 && jSwap == 0 {
 98 | 					preSwapCost = 0
 99 | 				} else {
100 | 					preSwapCost = matrix[maxI(0, iSwap-1)][maxI(0, jSwap-1)]
101 | 				}
102 | 				swapDist = i + j + preSwapCost - iSwap - jSwap - 1
103 | 			} else {
104 | 				swapDist = inf
105 | 			}
106 | 			matrix[i][j] = min(min(min(deleteDist, insertDist), matchDist), swapDist)
107 | 		}
108 | 		seenRunes[r1[i]] = i
109 | 	}
110 | 
111 | 	return matrix[len(r1)-1][len(r2)-1]
112 | }
113 | 


--------------------------------------------------------------------------------
/damerau_levenshtein_test.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | import "testing"
 4 | 
 5 | var damlevtests = []struct {
 6 | 	s1   string
 7 | 	s2   string
 8 | 	dist int
 9 | }{
10 | 	// insertion
11 | 	{"car", "cars", 1},
12 | 	// substitution
13 | 	{"library", "librari", 1},
14 | 	// deletion
15 | 	{"library", "librar", 1},
16 | 	// transposition
17 | 	{"library", "librayr", 1},
18 | 	// one empty, left
19 | 	{"", "library", 7},
20 | 	// one empty, right
21 | 	{"library", "", 7},
22 | 	// two empties
23 | 	{"", "", 0},
24 | 	// unicode stuff!
25 | 	{"Schüßler", "Schübler", 1},
26 | 	{"Schüßler", "Schußler", 1},
27 | 	{"Schüßler", "Schüßler", 0},
28 | 	{"Schßüler", "Schüßler", 1},
29 | 	{"Schüßler", "Schüler", 1},
30 | 	{"Schüßler", "Schüßlers", 1},
31 | 	// difference between DL and OSA. This is DL, so it should be 2.
32 | 	{"ca", "abc", 2},
33 | }
34 | 
35 | // Damerau-Levenshtein
36 | func TestDamerauLevenshtein(t *testing.T) {
37 | 	for _, tt := range damlevtests {
38 | 		dist := DamerauLevenshtein(tt.s1, tt.s2)
39 | 		if dist != tt.dist {
40 | 			t.Errorf("DamerauLevenshtein('%s', '%s') = %v, want %v", tt.s1, tt.s2, dist, tt.dist)
41 | 		}
42 | 	}
43 | }
44 | 


--------------------------------------------------------------------------------
/double_metaphone_corpus.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/antzucaro/matchr/7bed6ef61ef9d9753ace1aded16e9763fa4f7142/double_metaphone_corpus.txt.gz


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/antzucaro/matchr
2 | 
3 | go 1.13
4 | 


--------------------------------------------------------------------------------
/hamming.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | import "errors"
 4 | 
 5 | // Hamming computes the Hamming distance between two equal-length strings.
 6 | // This is the number of times the two strings differ between characters at
 7 | // the same index. This implementation is based off of the algorithm
 8 | // description found at http://en.wikipedia.org/wiki/Hamming_distance.
 9 | func Hamming(s1 string, s2 string) (distance int, err error) {
10 | 	// index by code point, not byte
11 | 	r1 := []rune(s1)
12 | 	r2 := []rune(s2)
13 | 
14 | 	if len(r1) != len(r2) {
15 | 		err = errors.New("Hamming distance of different sized strings.")
16 | 		return
17 | 	}
18 | 
19 | 	for i, v := range r1 {
20 | 		if r2[i] != v {
21 | 			distance += 1
22 | 		}
23 | 	}
24 | 	return
25 | }
26 | 


--------------------------------------------------------------------------------
/hamming_test.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | import "testing"
 4 | 
 5 | var hamtests = []struct {
 6 | 	s1   string
 7 | 	s2   string
 8 | 	dist int
 9 | 	err  bool
10 | }{
11 | 	{"", "", 0, false},
12 | 	{"cat", "cat", 0, false},
13 | 	{"car", "cat", 1, false},
14 | 	{"tar", "car", 1, false},
15 | 	{"xyz", "zyx", 2, false},
16 | 	{"wxyz", "zyx", 0, true},
17 | 	{"Schüßler", "Schübler", 1, false},
18 | 	{"Schüßler", "Schußler", 1, false},
19 | }
20 | 
21 | // Hamming Distance
22 | func TestHamming(t *testing.T) {
23 | 	for _, tt := range hamtests {
24 | 		dist, err := Hamming(tt.s1, tt.s2)
25 | 		if dist != tt.dist {
26 | 			t.Errorf("Hamming('%s', '%s') = %v, want %v", tt.s1, tt.s2, dist, tt.dist)
27 | 		}
28 | 
29 | 		if tt.err && err == nil {
30 | 			t.Errorf("Hamming('%s', '%s') should throw an error", tt.s1, tt.s2)
31 | 		}
32 | 	}
33 | }
34 | 


--------------------------------------------------------------------------------
/jarowinkler.go:
--------------------------------------------------------------------------------
  1 | package matchr
  2 | 
  3 | func jaroWinklerBase(s1 string, s2 string,
  4 | 	longTolerance bool, winklerize bool) (distance float64) {
  5 | 
  6 | 	// index by code point, not byte
  7 | 	r1 := []rune(s1)
  8 | 	r2 := []rune(s2)
  9 | 
 10 | 	r1Length := len(r1)
 11 | 	r2Length := len(r2)
 12 | 
 13 | 	if r1Length == 0 || r2Length == 0 {
 14 | 		return
 15 | 	}
 16 | 
 17 | 	minLength := 0
 18 | 	if r1Length > r2Length {
 19 | 		minLength = r1Length
 20 | 	} else {
 21 | 		minLength = r2Length
 22 | 	}
 23 | 
 24 | 	searchRange := minLength
 25 | 	searchRange = (searchRange / 2) - 1
 26 | 	if searchRange < 0 {
 27 | 		searchRange = 0
 28 | 	}
 29 | 	var lowLim, hiLim, transCount, commonChars int
 30 | 	var i, j, k int
 31 | 
 32 | 	r1Flag := make([]bool, r1Length+1)
 33 | 	r2Flag := make([]bool, r2Length+1)
 34 | 
 35 | 	// find the common chars within the acceptable range
 36 | 	commonChars = 0
 37 | 	for i, _ = range r1 {
 38 | 		if i >= searchRange {
 39 | 			lowLim = i - searchRange
 40 | 		} else {
 41 | 			lowLim = 0
 42 | 		}
 43 | 
 44 | 		if (i + searchRange) <= (r2Length - 1) {
 45 | 			hiLim = i + searchRange
 46 | 		} else {
 47 | 			hiLim = r2Length - 1
 48 | 		}
 49 | 
 50 | 		for j := lowLim; j <= hiLim; j++ {
 51 | 			if !r2Flag[j] && r2[j] == r1[i] {
 52 | 				r2Flag[j] = true
 53 | 				r1Flag[i] = true
 54 | 				commonChars++
 55 | 
 56 | 				break
 57 | 			}
 58 | 		}
 59 | 	}
 60 | 
 61 | 	// if we have nothing in common at this point, nothing else can be done
 62 | 	if commonChars == 0 {
 63 | 		return
 64 | 	}
 65 | 
 66 | 	// otherwise we count the transpositions
 67 | 	k = 0
 68 | 	transCount = 0
 69 | 	for i, _ := range r1 {
 70 | 		if r1Flag[i] {
 71 | 			for j = k; j < r2Length; j++ {
 72 | 				if r2Flag[j] {
 73 | 					k = j + 1
 74 | 					break
 75 | 				}
 76 | 			}
 77 | 			if r1[i] != r2[j] {
 78 | 				transCount++
 79 | 			}
 80 | 		}
 81 | 	}
 82 | 	transCount /= 2
 83 | 
 84 | 	// adjust for similarities in nonmatched characters
 85 | 	distance = float64(commonChars)/float64(r1Length) +
 86 | 		float64(commonChars)/float64(r2Length) +
 87 | 		(float64(commonChars-transCount))/float64(commonChars)
 88 | 	distance /= 3.0
 89 | 
 90 | 	// give more weight to already-similar strings
 91 | 	if winklerize && distance > 0.7 {
 92 | 
 93 | 		// the first 4 characters in common
 94 | 		if minLength >= 4 {
 95 | 			j = 4
 96 | 		} else {
 97 | 			j = minLength
 98 | 		}
 99 | 
100 | 		for i = 0; i < j && len(r1) > i && len(r2) > i && r1[i] == r2[i] && nan(r1[i]); i++ {
101 | 		}
102 | 
103 | 		if i > 0 {
104 | 			distance += float64(i) * 0.1 * (1.0 - distance)
105 | 		}
106 | 
107 | 		if longTolerance && (minLength > 4) && (commonChars > i+1) &&
108 | 			(2*commonChars >= minLength+i) {
109 | 			if nan(r1[0]) {
110 | 				distance += (1.0 - distance) * (float64(commonChars-i-1) /
111 | 					(float64(r1Length) + float64(r2Length) - float64(i*2) + 2))
112 | 			}
113 | 		}
114 | 	}
115 | 
116 | 	return
117 | }
118 | 
119 | // Jaro computes the Jaro edit distance between two strings. It represents
120 | // this with a float64 between 0 and 1 inclusive, with 0 indicating the two
121 | // strings are not at all similar and 1 indicating the two strings are exact
122 | // matches.
123 | //
124 | // See http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance for a
125 | // full description.
126 | func Jaro(r1 string, r2 string) (distance float64) {
127 | 	return jaroWinklerBase(r1, r2, false, false)
128 | }
129 | 
130 | // JaroWinkler computes the Jaro-Winkler edit distance between two strings.
131 | // This is a modification of the Jaro algorithm that gives additional weight
132 | // to prefix matches.
133 | func JaroWinkler(r1 string, r2 string, longTolerance bool) (distance float64) {
134 | 	return jaroWinklerBase(r1, r2, longTolerance, true)
135 | }
136 | 


--------------------------------------------------------------------------------
/jarowinkler_test.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | import "testing"
 4 | 
 5 | var jarotests = []struct {
 6 | 	s1   string
 7 | 	s2   string
 8 | 	dist float64
 9 | }{
10 | 	{"", "cars", 0.0},
11 | 	{"cars", "", 0.0},
12 | 	{"car", "cars", 0.9166666666666666},
13 | 	{"dixon", "dicksonx", 0.7666666666666666},
14 | 	{"martha", "marhta", 0.9444444444444445},
15 | 	{"dwayne", "duane", 0.8222222222222223},
16 | 	{"martüa", "marüta", 0.9444444444444445},
17 | 	{"dr", "driveway", 0.75},
18 | }
19 | 
20 | // Regular Jaro distance
21 | func TestJaro(t *testing.T) {
22 | 	for _, tt := range jarotests {
23 | 		dist := Jaro(tt.s1, tt.s2)
24 | 		if dist != tt.dist {
25 | 			t.Errorf("Jaro('%s', '%s') = %v, want %v", tt.s1, tt.s2, dist, tt.dist)
26 | 		}
27 | 	}
28 | }
29 | 
30 | var jarowtests = []struct {
31 | 	s1   string
32 | 	s2   string
33 | 	dist float64
34 | }{
35 | 	{"", "cars", 0.0},
36 | 	{"cars", "", 0.0},
37 | 	{"dixon", "dicksonx", 0.8133333333333332},
38 | 	{"martha", "marhta", 0.9611111111111111},
39 | 	{"dwayne", "duane", 0.8400000000000001},
40 | 	{"dr", "driveway", 0.8},
41 | }
42 | 
43 | // Jaro-Winkler distance
44 | func TestJaroWinkler(t *testing.T) {
45 | 	for _, tt := range jarowtests {
46 | 		dist := JaroWinkler(tt.s1, tt.s2, false)
47 | 		if dist != tt.dist {
48 | 			t.Errorf("JaroWinkler('%s', '%s') = %v, want %v", tt.s1, tt.s2, dist, tt.dist)
49 | 		}
50 | 	}
51 | }
52 | 


--------------------------------------------------------------------------------
/levenshtein.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | // Levenshtein computes the Levenshtein distance between two
 4 | // strings. The returned value - distance - is the number of insertions,
 5 | // deletions, and substitutions it takes to transform one
 6 | // string (s1) into another (s2). Each step in the transformation "costs"
 7 | // one distance point.
 8 | func Levenshtein(s1 string, s2 string) (distance int) {
 9 | 	// index by code point, not byte
10 | 	r1 := []rune(s1)
11 | 	r2 := []rune(s2)
12 | 
13 | 	rows := len(r1) + 1
14 | 	cols := len(r2) + 1
15 | 
16 | 	var d1 int
17 | 	var d2 int
18 | 	var d3 int
19 | 	var i int
20 | 	var j int
21 | 	dist := make([]int, rows*cols)
22 | 
23 | 	for i = 0; i < rows; i++ {
24 | 		dist[i*cols] = i
25 | 	}
26 | 
27 | 	for j = 0; j < cols; j++ {
28 | 		dist[j] = j
29 | 	}
30 | 
31 | 	for j = 1; j < cols; j++ {
32 | 		for i = 1; i < rows; i++ {
33 | 			if r1[i-1] == r2[j-1] {
34 | 				dist[(i*cols)+j] = dist[((i-1)*cols)+(j-1)]
35 | 			} else {
36 | 				d1 = dist[((i-1)*cols)+j] + 1
37 | 				d2 = dist[(i*cols)+(j-1)] + 1
38 | 				d3 = dist[((i-1)*cols)+(j-1)] + 1
39 | 
40 | 				dist[(i*cols)+j] = min(d1, min(d2, d3))
41 | 			}
42 | 		}
43 | 	}
44 | 
45 | 	distance = dist[(cols*rows)-1]
46 | 
47 | 	return
48 | }
49 | 


--------------------------------------------------------------------------------
/levenshtein_test.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | import "testing"
 4 | 
 5 | var levtests = []struct {
 6 | 	s1   string
 7 | 	s2   string
 8 | 	dist int
 9 | }{
10 | 	// insertion
11 | 	{"car", "cars", 1},
12 | 	// substitution
13 | 	{"library", "librari", 1},
14 | 	// deletion
15 | 	{"library", "librar", 1},
16 | 	// one empty, left
17 | 	{"", "library", 7},
18 | 	// one empty, right
19 | 	{"library", "", 7},
20 | 	// two empties
21 | 	{"", "", 0},
22 | 	// unicode stuff!
23 | 	{"Schüßler", "Schübler", 1},
24 | 	{"Schüßler", "Schußler", 1},
25 | 	{"Schüßler", "Schüßler", 0},
26 | 	{"Schüßler", "Schüler", 1},
27 | 	{"Schüßler", "Schüßlers", 1},
28 | }
29 | 
30 | // Regular Levenshtein
31 | func TestLevenshtein(t *testing.T) {
32 | 	for _, tt := range levtests {
33 | 		dist := Levenshtein(tt.s1, tt.s2)
34 | 		if dist != tt.dist {
35 | 			t.Errorf("Levenshtein('%s', '%s') = %v, want %v", tt.s1, tt.s2, dist, tt.dist)
36 | 		}
37 | 	}
38 | }
39 | 


--------------------------------------------------------------------------------
/longestcommonsubsequence.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | // LongestCommonSubsequence computes the longest substring
 4 | // between two strings. The returned value is the length
 5 | // of the substring, which contains letters from both
 6 | // strings, while maintaining the order of the letters.
 7 | func LongestCommonSubsequence(s1, s2 string) int {
 8 | 	r1 := []rune(s1)
 9 | 	r2 := []rune(s2)
10 | 	table := make([][]int, len(s1)+1)
11 | 
12 | 	// Construct 2D table
13 | 	for i := range table {
14 | 		table[i] = make([]int, len(s2)+1)
15 | 	}
16 | 
17 | 	var i int
18 | 	var j int
19 | 
20 | 	for i = len(r1) - 1; i >= 0; i-- {
21 | 		for j = len(r2) - 1; j >= 0; j-- {
22 | 			if r1[i] == r2[j] {
23 | 				table[i][j] = 1 + table[i+1][j+1]
24 | 			} else {
25 | 				table[i][j] = maxI(table[i+1][j], table[i][j+1])
26 | 			}
27 | 		}
28 | 	}
29 | 	return table[0][0]
30 | }
31 | 


--------------------------------------------------------------------------------
/longestcommonsubsequence_test.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | import "testing"
 4 | 
 5 | var lcstests = []struct {
 6 | 	s1     string
 7 | 	s2     string
 8 | 	length int
 9 | }{
10 | 	// match beginning
11 | 	{"cans", "can", 3},
12 | 	// match end
13 | 	{"ebay", "bay", 3},
14 | 	// gap in the middle
15 | 	{"coins", "cons", 4},
16 | 	// one empty, left
17 | 	{"", "hello", 0},
18 | 	// one empty, right
19 | 	{"goodbye", "", 0},
20 | 	// two empties
21 | 	{"", "", 0},
22 | 	// unicode stuff!
23 | 	{"Schüßler", "Schüßler", 8},
24 | }
25 | 
26 | func TestLongestCommonSubsequence(t *testing.T) {
27 | 	for _, tt := range lcstests {
28 | 		length := LongestCommonSubsequence(tt.s1, tt.s2)
29 | 		if length != tt.length {
30 | 			t.Errorf("LongestCommonSubsequence('%s', '%s') = %v, want %v", tt.s1, tt.s2, length, tt.length)
31 | 		}
32 | 	}
33 | }
34 | 


--------------------------------------------------------------------------------
/metaphone.go:
--------------------------------------------------------------------------------
  1 | package matchr
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"strings"
  6 | )
  7 | 
  8 | type metaphoneresult struct {
  9 | 	// the maximum number of code values to calculate
 10 | 	maxLength int
 11 | 
 12 | 	// whether to calculate an alternate
 13 | 	calcAlternate bool
 14 | 
 15 | 	// no direct modifications - only through add()
 16 | 	primary   bytes.Buffer
 17 | 	alternate bytes.Buffer
 18 | 
 19 | 	// length of the private buffers
 20 | 	PrimaryLength   int
 21 | 	AlternateLength int
 22 | }
 23 | 
 24 | func newMetaphoneresult(maxLength int, calcAlternate bool) (r *metaphoneresult) {
 25 | 	r = &metaphoneresult{maxLength: maxLength, calcAlternate: calcAlternate}
 26 | 	return
 27 | }
 28 | 
 29 | func (r *metaphoneresult) add(c1 string, c2 string) {
 30 | 	if c1 != "" {
 31 | 		r.primary.WriteString(c1)
 32 | 		r.PrimaryLength += len(c1)
 33 | 	}
 34 | 
 35 | 	if c2 != "" && r.calcAlternate {
 36 | 		r.alternate.WriteString(c2)
 37 | 		r.AlternateLength += len(c2)
 38 | 	}
 39 | }
 40 | 
 41 | func (r *metaphoneresult) isComplete() bool {
 42 | 	return r.PrimaryLength >= r.maxLength && r.AlternateLength >= r.maxLength
 43 | }
 44 | 
 45 | func (r *metaphoneresult) result() (primary string, alternate string) {
 46 | 	primary = r.primary.String()
 47 | 	if len(primary) > r.maxLength {
 48 | 		primary = primary[0:r.maxLength]
 49 | 	}
 50 | 	alternate = r.alternate.String()
 51 | 	if len(alternate) > r.maxLength {
 52 | 		alternate = alternate[0:r.maxLength]
 53 | 	}
 54 | 	return
 55 | }
 56 | 
 57 | // utility functions for checking things within a string
 58 | func isSlavoGermanic(value string) bool {
 59 | 	return strings.Contains(value, "W") || strings.Contains(value, "K") ||
 60 | 		strings.Contains(value, "CZ") || strings.Contains(value, "WITZ")
 61 | }
 62 | 
 63 | func isSilentStart(input runestring) bool {
 64 | 	SILENT_START := [...]string{"GN", "KN", "PN", "WR", "PS"}
 65 | 
 66 | 	prefix := input.SafeSubstr(0, 2)
 67 | 
 68 | 	for _, criteria := range SILENT_START {
 69 | 		if prefix == criteria {
 70 | 			return true
 71 | 		}
 72 | 	}
 73 | 
 74 | 	return false
 75 | }
 76 | 
 77 | func handleVowel(result *metaphoneresult, index int) int {
 78 | 	if index == 0 {
 79 | 		result.add("A", "A")
 80 | 	}
 81 | 
 82 | 	return index + 1
 83 | }
 84 | 
 85 | /******************************************************************************
 86 |  * Entry handlers for letters.
 87 |  *****************************************************************************/
 88 | func handleC(input runestring, result *metaphoneresult, index int) int {
 89 | 	if conditionC0(input, index) {
 90 | 		result.add("K", "K")
 91 | 		index += 2
 92 | 	} else if index == 0 && input.Contains(index, 6, "CAESAR") {
 93 | 		result.add("S", "S")
 94 | 		index += 2
 95 | 	} else if input.Contains(index, 2, "CH") {
 96 | 		index = handleCH(input, result, index)
 97 | 	} else if input.Contains(index, 2, "CZ") &&
 98 | 		!input.Contains(index-2, 4, "WICZ") {
 99 | 		result.add("S", "X")
100 | 		index += 2
101 | 	} else if input.Contains(index+1, 3, "CIA") {
102 | 		result.add("X", "X")
103 | 		index += 3
104 | 	} else if input.Contains(index, 2, "CC") &&
105 | 		!(index == 1 && input.SafeAt(0) == 'M') {
106 | 		return handleCC(input, result, index)
107 | 	} else if input.Contains(index, 2, "CK") ||
108 | 		input.Contains(index, 2, "CG") ||
109 | 		input.Contains(index, 2, "CQ") {
110 | 		result.add("K", "K")
111 | 		index += 2
112 | 	} else if input.Contains(index, 2, "CI") ||
113 | 		input.Contains(index, 2, "CE") ||
114 | 		input.Contains(index, 2, "CY") {
115 | 		if input.Contains(index, 3, "CIO") ||
116 | 			input.Contains(index, 3, "CIE") ||
117 | 			input.Contains(index, 3, "CIA") {
118 | 			result.add("S", "X")
119 | 		} else {
120 | 			result.add("S", "S")
121 | 		}
122 | 		index += 2
123 | 	} else {
124 | 		result.add("K", "K")
125 | 		if input.Contains(index+1, 2, " C") ||
126 | 			input.Contains(index+1, 2, " Q") ||
127 | 			input.Contains(index+1, 2, " G") {
128 | 			index += 3
129 | 		} else if (input.Contains(index+1, 1, "C") ||
130 | 			input.Contains(index+1, 1, "K") ||
131 | 			input.Contains(index+1, 1, "Q")) &&
132 | 			!(input.Contains(index+1, 2, "CE") ||
133 | 				input.Contains(index+1, 2, "CI")) {
134 | 			index += 2
135 | 		} else {
136 | 			index++
137 | 		}
138 | 	}
139 | 
140 | 	return index
141 | }
142 | 
143 | func handleCC(input runestring, result *metaphoneresult, index int) int {
144 | 	if input.Contains(index+2, 1, "I", "E", "H") &&
145 | 		!input.Contains(index+2, 2, "HU") {
146 | 		if (index == 1 && input.SafeAt(index-1) == 'A') ||
147 | 			(input.Contains(index-1, 5, "UCCEE", "UCCES")) {
148 | 			result.add("KS", "KS")
149 | 		} else {
150 | 			result.add("X", "X")
151 | 		}
152 | 		index += 3
153 | 	} else {
154 | 		result.add("K", "K")
155 | 		index += 2
156 | 	}
157 | 	return index
158 | }
159 | 
160 | func handleCH(input runestring, result *metaphoneresult, index int) int {
161 | 	if index > 0 && input.Contains(index, 4, "CHAE") {
162 | 		result.add("K", "X")
163 | 		return index + 2
164 | 	} else if conditionCH0(input, index) {
165 | 		result.add("K", "K")
166 | 		return index + 2
167 | 		// TODO: combine this condition with the one above?
168 | 	} else if conditionCH1(input, index) {
169 | 		result.add("K", "K")
170 | 		return index + 2
171 | 	} else {
172 | 		if index > 0 {
173 | 			if input.Contains(0, 2, "MC") {
174 | 				result.add("K", "K")
175 | 			} else {
176 | 				result.add("X", "K")
177 | 			}
178 | 		} else {
179 | 			result.add("X", "X")
180 | 		}
181 | 		return index + 2
182 | 	}
183 | }
184 | 
185 | func handleD(input runestring, result *metaphoneresult, index int) int {
186 | 	if input.Contains(index, 2, "DG") {
187 | 		if input.Contains(index+2, 1, "I", "E", "Y") {
188 | 			result.add("J", "J")
189 | 			index += 3
190 | 		} else {
191 | 			result.add("TK", "TK")
192 | 			index += 2
193 | 		}
194 | 	} else if input.Contains(index, 2, "DT", "DD") {
195 | 		result.add("T", "T")
196 | 		index += 2
197 | 	} else {
198 | 		result.add("T", "T")
199 | 		index++
200 | 	}
201 | 	return index
202 | }
203 | 
204 | func handleG(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {
205 | 	if input.SafeAt(index+1) == 'H' {
206 | 		index = handleGH(input, result, index)
207 | 	} else if input.SafeAt(index+1) == 'N' {
208 | 		if index == 1 && isVowel(input.SafeAt(0)) && !slavoGermanic {
209 | 			result.add("KN", "N")
210 | 		} else if !input.Contains(index+2, 2, "EY") && input.SafeAt(index+1) != 'Y' && !slavoGermanic {
211 | 			result.add("N", "KN")
212 | 		} else {
213 | 			result.add("KN", "KN")
214 | 		}
215 | 		index += 2
216 | 	} else if input.Contains(index+1, 2, "LI") && !slavoGermanic {
217 | 		result.add("KL", "L")
218 | 		index += 2
219 | 	} else if index == 0 && (input.SafeAt(index+1) == 'Y' ||
220 | 		input.Contains(index+1, 2, "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER")) {
221 | 		result.add("K", "J")
222 | 		index += 2
223 | 	} else if (input.Contains(index+1, 2, "ER") ||
224 | 		input.SafeAt(index+1) == 'Y') &&
225 | 		!input.Contains(0, 6, "DANGER", "RANGER", "MANGER") &&
226 | 		!input.Contains(index-1, 1, "E", "I") &&
227 | 		!input.Contains(index-1, 3, "RGY", "OGY") {
228 | 		result.add("K", "J")
229 | 		index += 2
230 | 	} else if input.Contains(index+1, 1, "E", "I", "Y") ||
231 | 		input.Contains(index-1, 4, "AGGI", "OGGI") {
232 | 		if input.Contains(0, 4, "VAN ", "VON ") ||
233 | 			input.Contains(0, 3, "SCH") ||
234 | 			input.Contains(index+1, 2, "ET") {
235 | 			result.add("K", "K")
236 | 		} else if input.Contains(index+1, 3, "IER") {
237 | 			result.add("J", "J")
238 | 		} else {
239 | 			result.add("J", "K")
240 | 		}
241 | 		index += 2
242 | 	} else if input.SafeAt(index+1) == 'G' {
243 | 		result.add("K", "K")
244 | 		index += 2
245 | 	} else {
246 | 		result.add("K", "K")
247 | 		index++
248 | 	}
249 | 	return index
250 | }
251 | 
252 | func handleGH(input runestring, result *metaphoneresult, index int) int {
253 | 	if index > 0 && !isVowel(input.SafeAt(index-1)) {
254 | 		result.add("K", "K")
255 | 		index += 2
256 | 	} else if index == 0 {
257 | 		if input.SafeAt(index+2) == 'I' {
258 | 			result.add("J", "J")
259 | 		} else {
260 | 			result.add("K", "K")
261 | 		}
262 | 		index += 2
263 | 	} else if (index > 1 && input.Contains(index-2, 1, "B", "H", "D")) ||
264 | 		(index > 2 && input.Contains(index-3, 1, "B", "H", "D")) ||
265 | 		(index > 3 && input.Contains(index-4, 1, "B", "H")) {
266 | 		index += 2
267 | 	} else {
268 | 		if index > 2 && input.SafeAt(index-1) == 'U' &&
269 | 			input.Contains(index-3, 1, "C", "G", "L", "R", "T") {
270 | 			result.add("F", "F")
271 | 		} else if index > 0 && input.SafeAt(index-1) != 'I' {
272 | 			result.add("K", "K")
273 | 		}
274 | 		index += 2
275 | 	}
276 | 	return index
277 | }
278 | 
279 | func handleH(input runestring, result *metaphoneresult, index int) int {
280 | 	if (index == 0 || isVowel(input.SafeAt(index-1))) &&
281 | 		isVowel(input.SafeAt(index+1)) {
282 | 		result.add("H", "H")
283 | 		index += 2
284 | 	} else {
285 | 		index++
286 | 	}
287 | 	return index
288 | }
289 | 
290 | func handleJ(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {
291 | 	if input.Contains(index, 4, "JOSE") || input.Contains(0, 4, "SAN ") {
292 | 		if (index == 0 && (input.SafeAt(index+4) == ' ') ||
293 | 			len(input) == 4) || input.Contains(0, 4, "SAN ") {
294 | 			result.add("H", "H")
295 | 		} else {
296 | 			result.add("J", "H")
297 | 		}
298 | 		index++
299 | 	} else {
300 | 		if index == 0 && !input.Contains(index, 4, "JOSE") {
301 | 			result.add("J", "A")
302 | 		} else if isVowel(input.SafeAt(index-1)) && !slavoGermanic &&
303 | 			(input.SafeAt(index+1) == 'A' || input.SafeAt(index+1) == 'O') {
304 | 			result.add("J", "H")
305 | 		} else if index == (len(input) - 1) {
306 | 			result.add("J", " ")
307 | 		} else if !input.Contains(index+1, 1,
308 | 			"L", "T", "K", "S", "N", "M", "B", "Z") &&
309 | 			!input.Contains(index-1, 1, "S", "K", "L") {
310 | 			result.add("J", "J")
311 | 		}
312 | 
313 | 		if input.SafeAt(index+1) == 'J' {
314 | 			index += 2
315 | 		} else {
316 | 			index++
317 | 		}
318 | 	}
319 | 	return index
320 | }
321 | 
322 | func handleL(input runestring, result *metaphoneresult, index int) int {
323 | 	if input.SafeAt(index+1) == 'L' {
324 | 		if conditionL0(input, index) {
325 | 			result.add("L", "")
326 | 		} else {
327 | 			result.add("L", "L")
328 | 		}
329 | 		index += 2
330 | 	} else {
331 | 		result.add("L", "L")
332 | 		index++
333 | 	}
334 | 	return index
335 | }
336 | 
337 | func handleP(input runestring, result *metaphoneresult, index int) int {
338 | 	if input.SafeAt(index+1) == 'H' {
339 | 		result.add("F", "F")
340 | 		index += 2
341 | 	} else {
342 | 		result.add("P", "P")
343 | 		if input.Contains(index+1, 1, "P", "B") {
344 | 			index += 2
345 | 		} else {
346 | 			index++
347 | 		}
348 | 	}
349 | 	return index
350 | }
351 | 
352 | func handleR(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {
353 | 	if index == (len(input)-1) && !slavoGermanic &&
354 | 		input.Contains(index-2, 2, "IE") &&
355 | 		!input.Contains(index-4, 2, "ME", "MA") {
356 | 		result.add("", "R")
357 | 	} else {
358 | 		result.add("R", "R")
359 | 	}
360 | 
361 | 	if input.SafeAt(index+1) == 'R' {
362 | 		index += 2
363 | 	} else {
364 | 		index++
365 | 	}
366 | 	return index
367 | }
368 | 
369 | func handleS(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {
370 | 	if input.Contains(index-1, 3, "ISL", "YSL") {
371 | 		index++
372 | 	} else if index == 0 && input.Contains(index, 5, "SUGAR") {
373 | 		result.add("X", "S")
374 | 		index++
375 | 	} else if input.Contains(index, 2, "SH") {
376 | 		if input.Contains(index+1, 4, "HEIM", "HOEK", "HOLM", "HOLZ") {
377 | 			result.add("S", "S")
378 | 		} else {
379 | 			result.add("X", "X")
380 | 		}
381 | 		index += 2
382 | 	} else if input.Contains(index, 3, "SIO", "SIA") ||
383 | 		input.Contains(index, 4, "SIAN") {
384 | 		if slavoGermanic {
385 | 			result.add("S", "S")
386 | 		} else {
387 | 			result.add("S", "X")
388 | 		}
389 | 		index += 3
390 | 	} else if (index == 0 && input.Contains(index+1, 1, "M", "N", "L", "W")) ||
391 | 		input.Contains(index+1, 1, "Z") {
392 | 		result.add("S", "X")
393 | 		if input.Contains(index+1, 1, "Z") {
394 | 			index += 2
395 | 		} else {
396 | 			index++
397 | 		}
398 | 	} else if input.Contains(index, 2, "SC") {
399 | 		index = handleSC(input, result, index)
400 | 	} else {
401 | 		if index == len(input)-1 &&
402 | 			input.Contains(index-2, 2, "AI", "OI") {
403 | 			result.add("", "S")
404 | 		} else {
405 | 			result.add("S", "S")
406 | 		}
407 | 
408 | 		if input.Contains(index+1, 1, "S", "Z") {
409 | 			index += 2
410 | 		} else {
411 | 			index++
412 | 		}
413 | 	}
414 | 	return index
415 | }
416 | 
417 | func handleSC(input runestring, result *metaphoneresult, index int) int {
418 | 	if input.SafeAt(index+2) == 'H' {
419 | 		if input.Contains(index+3, 2, "OO", "ER", "EN", "UY", "ED", "EM") {
420 | 			if input.Contains(index+3, 2, "ER", "EN") {
421 | 				result.add("X", "SK")
422 | 			} else {
423 | 				result.add("SK", "SK")
424 | 			}
425 | 		} else {
426 | 			if index == 0 && !isVowel(input.SafeAt(3)) && input.SafeAt(3) != 'W' {
427 | 				result.add("X", "S")
428 | 			} else {
429 | 				result.add("X", "X")
430 | 			}
431 | 		}
432 | 	} else if input.Contains(index+2, 1, "I", "E", "Y") {
433 | 		result.add("S", "S")
434 | 	} else {
435 | 		result.add("SK", "SK")
436 | 	}
437 | 	index += 3
438 | 
439 | 	return index
440 | }
441 | 
442 | func handleT(input runestring, result *metaphoneresult, index int) int {
443 | 	if input.Contains(index, 4, "TION") {
444 | 		result.add("X", "X")
445 | 		index += 3
446 | 	} else if input.Contains(index, 3, "TIA", "TCH") {
447 | 		result.add("X", "X")
448 | 		index += 3
449 | 	} else if input.Contains(index, 2, "TH") || input.Contains(index, 3, "TTH") {
450 | 		if input.Contains(index+2, 2, "OM", "AM") ||
451 | 			input.Contains(0, 4, "VAN ", "VON ") ||
452 | 			input.Contains(0, 3, "SCH") {
453 | 			result.add("T", "T")
454 | 		} else {
455 | 			result.add("0", "T")
456 | 		}
457 | 		index += 2
458 | 	} else {
459 | 		result.add("T", "T")
460 | 		if input.Contains(index+1, 1, "T", "D") {
461 | 			index += 2
462 | 		} else {
463 | 			index++
464 | 		}
465 | 	}
466 | 	return index
467 | }
468 | 
469 | func handleW(input runestring, result *metaphoneresult, index int) int {
470 | 	if input.Contains(index, 2, "WR") {
471 | 		result.add("R", "R")
472 | 		index += 2
473 | 	} else {
474 | 		if index == 0 && (isVowel(input.SafeAt(index+1)) ||
475 | 			input.Contains(index, 2, "WH")) {
476 | 			if isVowel(input.SafeAt(index + 1)) {
477 | 				result.add("A", "F")
478 | 			} else {
479 | 				result.add("A", "A")
480 | 			}
481 | 			index++
482 | 		} else if (index == len(input)-1 && isVowel(input.SafeAt(index-1))) ||
483 | 			input.Contains(index-1, 5, "EWSKI", "EWSKY", "OWSKI", "OWSKY") ||
484 | 			input.Contains(0, 3, "SCH") {
485 | 			result.add("", "F")
486 | 			index++
487 | 		} else if input.Contains(index, 4, "WICZ", "WITZ") {
488 | 			result.add("TS", "FX")
489 | 			index += 4
490 | 		} else {
491 | 			index++
492 | 		}
493 | 	}
494 | 	return index
495 | }
496 | 
497 | func handleX(input runestring, result *metaphoneresult, index int) int {
498 | 	if index == 0 {
499 | 		result.add("S", "S")
500 | 		index++
501 | 	} else {
502 | 		if !((index == len(input)-1) &&
503 | 			(input.Contains(index-3, 3, "IAU", "EAU") ||
504 | 				input.Contains(index-2, 2, "AU", "OU"))) {
505 | 			result.add("KS", "KS")
506 | 		}
507 | 
508 | 		if input.Contains(index+1, 1, "C", "X") {
509 | 			index += 2
510 | 		} else {
511 | 			index++
512 | 		}
513 | 	}
514 | 	return index
515 | }
516 | 
517 | func handleZ(input runestring, result *metaphoneresult, index int, slavoGermanic bool) int {
518 | 	if input.SafeAt(index+1) == 'H' {
519 | 		result.add("J", "J")
520 | 	} else {
521 | 		if input.Contains(index+1, 2, "ZO", "ZI", "ZA") ||
522 | 			(slavoGermanic && (index > 0 && input.SafeAt(index-1) != 'T')) {
523 | 			result.add("S", "TS")
524 | 		} else {
525 | 			result.add("S", "S")
526 | 		}
527 | 	}
528 | 
529 | 	if input.SafeAt(index+1) == 'Z' {
530 | 		index += 2
531 | 	} else {
532 | 		index++
533 | 	}
534 | 	return index
535 | }
536 | 
537 | /******************************************************************************
538 |  * Complex conditional handlers for letters
539 |  *****************************************************************************/
540 | func conditionC0(input runestring, index int) bool {
541 | 	if input.Contains(index, 4, "CHIA") {
542 | 		return true
543 | 	} else if index <= 1 {
544 | 		return false
545 | 	} else if isVowel(input.SafeAt(index - 2)) {
546 | 		return false
547 | 	} else if !input.Contains(index-1, 3, "ACH") {
548 | 		return false
549 | 	} else {
550 | 		c := input.SafeAt(index + 2)
551 | 		return (c != 'I' && c != 'E') ||
552 | 			(input.Contains(index-2, 6, "BACHER") ||
553 | 				input.Contains(index-2, 6, "MACHER"))
554 | 	}
555 | }
556 | 
557 | func conditionCH0(input runestring, index int) bool {
558 | 	if index != 0 {
559 | 		return false
560 | 	} else if !input.Contains(index+1, 5, "HARAC", "HARIS") &&
561 | 		!input.Contains(index+1, 3, "HOR", "HYM", "HIA", "HEM") {
562 | 		return false
563 | 	} else if input.Contains(0, 5, "CHORE") {
564 | 		return false
565 | 	} else {
566 | 		return true
567 | 	}
568 | }
569 | 
570 | func conditionCH1(input runestring, index int) bool {
571 | 	// good god this is ugly
572 | 	return (input.Contains(0, 4, "VAN ", "VON ") || input.Contains(0, 3, "SCH")) ||
573 | 		input.Contains(index-2, 6, "ORCHES", "ARCHIT", "ORCHID") ||
574 | 		input.Contains(index+2, 1, "T", "S") ||
575 | 		((input.Contains(index-1, 1, "A", "O", "U", "E") || index == 0) &&
576 | 			(input.Contains(index+2, 1, "L", "R", "N", "M", "B", "H", "F", "V", "W", " ") ||
577 | 				index+1 == len(input)-1))
578 | }
579 | 
580 | func conditionL0(input runestring, index int) bool {
581 | 	if index == (len(input)-3) &&
582 | 		input.Contains(index-1, 4, "ILLO", "ILLA", "ALLE") {
583 | 		return true
584 | 	} else if (input.Contains(len(input)-2, 2, "AS", "OS") ||
585 | 		input.Contains(len(input)-1, 1, "A", "O")) &&
586 | 		(input.Contains(index-1, 4, "ALLE")) {
587 | 		return true
588 | 	} else {
589 | 		return false
590 | 	}
591 | }
592 | 
593 | func conditionM0(input runestring, index int) bool {
594 | 	if input.SafeAt(index+1) == 'M' {
595 | 		return true
596 | 	}
597 | 
598 | 	return input.Contains(index-1, 3, "UMB") &&
599 | 		((index+1) == (len(input)-1) ||
600 | 			input.Contains(index+2, 2, "ER"))
601 | }
602 | 
603 | // DoubleMetaphone computes the Double-Metaphone value of the input string.
604 | // This value is a phonetic representation of how the string sounds, with
605 | // affordances for many different language dialects. It was originally
606 | // developed by Lawrence Phillips in the 1990s.
607 | //
608 | // More information about this algorithm can be found on Wikipedia at
609 | // http://en.wikipedia.org/wiki/Metaphone.
610 | func DoubleMetaphone(s1 string) (string, string) {
611 | 	// trim, upper space
612 | 	s1 = cleanInput(s1)
613 | 
614 | 	// structure to traverse the string by code point, not byte
615 | 	input := runestring(s1)
616 | 
617 | 	slavoGermanic := isSlavoGermanic(s1)
618 | 
619 | 	// where we are in the string
620 | 	index := 0
621 | 
622 | 	if isSilentStart(input) {
623 | 		index += 1
624 | 	}
625 | 
626 | 	result := newMetaphoneresult(4, true)
627 | 
628 | 	for !result.isComplete() && index <= len(input)-1 {
629 | 		c := rune(input.SafeAt(index))
630 | 		switch c {
631 | 		case 'A', 'E', 'I', 'O', 'U', 'Y':
632 | 			index = handleVowel(result, index)
633 | 		case 'B':
634 | 			result.add("P", "P")
635 | 			if input.SafeAt(index+1) == 'B' {
636 | 				index += 2
637 | 			} else {
638 | 				index++
639 | 			}
640 | 		case 'Ç':
641 | 			result.add("S", "S")
642 | 			index++
643 | 		case 'C':
644 | 			index = handleC(input, result, index)
645 | 		case 'D':
646 | 			index = handleD(input, result, index)
647 | 		case 'F':
648 | 			result.add("F", "F")
649 | 			if input.SafeAt(index+1) == 'F' {
650 | 				index += 2
651 | 			} else {
652 | 				index++
653 | 			}
654 | 		case 'G':
655 | 			index = handleG(input, result, index, slavoGermanic)
656 | 		case 'H':
657 | 			index = handleH(input, result, index)
658 | 		case 'J':
659 | 			index = handleJ(input, result, index, slavoGermanic)
660 | 		case 'K':
661 | 			result.add("K", "K")
662 | 			if input.SafeAt(index+1) == 'K' {
663 | 				index += 2
664 | 			} else {
665 | 				index++
666 | 			}
667 | 		case 'L':
668 | 			index = handleL(input, result, index)
669 | 		case 'M':
670 | 			result.add("M", "M")
671 | 			if conditionM0(input, index) {
672 | 				index += 2
673 | 			} else {
674 | 				index++
675 | 			}
676 | 		case 'N':
677 | 			result.add("N", "N")
678 | 			if input.SafeAt(index+1) == 'N' {
679 | 				index += 2
680 | 			} else {
681 | 				index++
682 | 			}
683 | 		case 'Ñ':
684 | 			result.add("N", "N")
685 | 			index++
686 | 		case 'P':
687 | 			index = handleP(input, result, index)
688 | 		case 'Q':
689 | 			result.add("K", "K")
690 | 			if input.SafeAt(index+1) == 'Q' {
691 | 				index += 2
692 | 			} else {
693 | 				index++
694 | 			}
695 | 		case 'R':
696 | 			index = handleR(input, result, index, slavoGermanic)
697 | 		case 'S':
698 | 			index = handleS(input, result, index, slavoGermanic)
699 | 		case 'T':
700 | 			index = handleT(input, result, index)
701 | 		case 'V':
702 | 			result.add("F", "F")
703 | 			if input.SafeAt(index+1) == 'V' {
704 | 				index += 2
705 | 			} else {
706 | 				index++
707 | 			}
708 | 		case 'W':
709 | 			index = handleW(input, result, index)
710 | 		case 'X':
711 | 			index = handleX(input, result, index)
712 | 		case 'Z':
713 | 			index = handleZ(input, result, index, slavoGermanic)
714 | 		default:
715 | 			index++
716 | 		}
717 | 
718 | 	}
719 | 
720 | 	return result.result()
721 | }
722 | 


--------------------------------------------------------------------------------
/metaphone_test.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"compress/gzip"
 6 | 	"os"
 7 | 	"strings"
 8 | 	"testing"
 9 | )
10 | 
11 | func TestDoubleMetaphone(t *testing.T) {
12 | 	// load gzipped corpus
13 | 	f, err := os.Open("double_metaphone_corpus.txt.gz")
14 | 	if err != nil {
15 | 		panic("Error opening file double_metaphone_corpus.txt.gz! Exiting.")
16 | 	}
17 | 	defer f.Close()
18 | 
19 | 	g, err := gzip.NewReader(f)
20 | 	if err != nil {
21 | 		panic("Error with supposedly gzipped file double_metaphone_corpus.txt.gz! Exiting.")
22 | 	}
23 | 
24 | 	r := bufio.NewReader(g)
25 | 
26 | 	line, err := r.ReadString('\n')
27 | 	for err == nil {
28 | 		line = strings.TrimRight(line, "\n")
29 | 		v := strings.Split(line, "|")
30 | 
31 | 		metaphone, alternate := DoubleMetaphone(v[0])
32 | 		if metaphone != v[1] || alternate != v[2] {
33 | 			t.Errorf("DoubleMetaphone('%s') = (%v, %v), want (%v, %v)", v[0], metaphone, alternate, v[1], v[2])
34 | 			t.FailNow()
35 | 		}
36 | 
37 | 		line, err = r.ReadString('\n')
38 | 	}
39 | }
40 | 


--------------------------------------------------------------------------------
/nysiis.go:
--------------------------------------------------------------------------------
  1 | package matchr
  2 | 
  3 | // NYSIIS computes the NYSIIS phonetic encoding of the input string. It is a
  4 | // modification of the traditional Soundex algorithm.
  5 | func NYSIIS(s1 string) string {
  6 | 	cleans1 := runestring(cleanInput(s1))
  7 | 	input := runestring(make([]rune, 0, len(s1)))
  8 | 
  9 | 	// The output can't be larger than the string itself
 10 | 	output := runestring(make([]rune, 0, len(s1)))
 11 | 
 12 | 	// 0. Remove all non-ASCII characters
 13 | 	for _, v := range cleans1 {
 14 | 		if v >= 65 && v <= 90 {
 15 | 			input = append(input, v)
 16 | 		}
 17 | 	}
 18 | 
 19 | 	if len(input) == 0 {
 20 | 		return ""
 21 | 	}
 22 | 
 23 | 	// 1. Transcoding first characters
 24 | 	switch input[0] {
 25 | 	case 'M':
 26 | 		if input.SafeSubstr(0, 3) == "MAC" {
 27 | 			// MAC -> MCC
 28 | 			input[1] = 'C'
 29 | 		}
 30 | 	case 'K':
 31 | 		if input.SafeSubstr(0, 2) == "KN" {
 32 | 			// KN -> NN
 33 | 			input[0] = 'N'
 34 | 		} else {
 35 | 			// K -> C
 36 | 			input[0] = 'C'
 37 | 		}
 38 | 	case 'P':
 39 | 		next := input.SafeAt(1)
 40 | 		if next == 'H' {
 41 | 			// PH -> FF
 42 | 			input[0] = 'F'
 43 | 			input[1] = 'F'
 44 | 		} else if next == 'F' {
 45 | 			// PF -> FF
 46 | 			input[0] = 'F'
 47 | 		}
 48 | 	case 'S':
 49 | 		if input.SafeSubstr(0, 3) == "SCH" {
 50 | 			input[1] = 'S'
 51 | 			input[2] = 'S'
 52 | 		}
 53 | 	}
 54 | 
 55 | 	// 2. Transcoding last characters
 56 | 	switch input.SafeSubstr(len(input)-2, 2) {
 57 | 	case "EE", "IE":
 58 | 		// EE, IE -> Y
 59 | 		input.Del(len(input) - 2)
 60 | 		input[len(input)-1] = 'Y'
 61 | 	case "DT", "RT", "RD", "NT", "ND":
 62 | 		// DT, RT, RD, NT, ND -> D
 63 | 		input.Del(len(input) - 2)
 64 | 		input[len(input)-1] = 'D'
 65 | 	}
 66 | 
 67 | 	// 3. First character of key = first character of name
 68 | 	output = append(output, input[0])
 69 | 	last := input[0]
 70 | 
 71 | 	for i := 1; i < len(input); i++ {
 72 | 		c := input[i]
 73 | 		switch c {
 74 | 		case 'A', 'I', 'O', 'U':
 75 | 			// A, E, I, O, U -> A (E is separate)
 76 | 			input[i] = 'A'
 77 | 		case 'E':
 78 | 			// EV -> AF, else A
 79 | 			if input.SafeAt(i+1) == 'V' {
 80 | 				input[i+1] = 'F'
 81 | 			}
 82 | 			input[i] = 'A'
 83 | 		case 'Q':
 84 | 			// Q -> G
 85 | 			input[i] = 'G'
 86 | 		case 'Z':
 87 | 			// Z -> S
 88 | 			input[i] = 'S'
 89 | 		case 'M':
 90 | 			// M -> N
 91 | 			input[i] = 'N'
 92 | 		case 'K':
 93 | 			// KN -> N, else K -> C
 94 | 			if input.SafeAt(i+1) == 'N' {
 95 | 				input.Del(i)
 96 | 			} else {
 97 | 				input[i] = 'C'
 98 | 			}
 99 | 		case 'S':
100 | 			// SCH -> SSS
101 | 			if input.SafeSubstr(i, 3) == "SCH" {
102 | 				input[i+1] = 'S'
103 | 				input[i+2] = 'S'
104 | 			}
105 | 		case 'P':
106 | 			// PH -> FF
107 | 			if input.SafeAt(i+1) == 'H' {
108 | 				input[i] = 'F'
109 | 				input[i+1] = 'F'
110 | 			}
111 | 		case 'H':
112 | 			// H -> $(previous character) if previous character or
113 | 			// next character is a non-vowel
114 | 			prev := input.SafeAt(i - 1)
115 | 			next := input.SafeAt(i + 1)
116 | 			if !isVowelNoY(prev) || !isVowelNoY(next) {
117 | 				input[i] = prev
118 | 			}
119 | 		case 'W':
120 | 			prev := input.SafeAt(i - 1)
121 | 			if isVowelNoY(prev) {
122 | 				input[i] = prev
123 | 			}
124 | 		}
125 | 
126 | 		if input[i] != last && input[i] != 0 {
127 | 			output = append(output, input[i])
128 | 		}
129 | 		last = input[i]
130 | 	}
131 | 
132 | 	// have to be careful here because we've already added the first
133 | 	// key value
134 | 	if len(output) > 1 {
135 | 		// remove trailing s
136 | 		if output.SafeAt(len(output)-1) == 'S' {
137 | 			output.Del(len(output) - 1)
138 | 		}
139 | 
140 | 		// trailing AY -> Y
141 | 		if len(output) > 2 && output.SafeSubstr(len(output)-2, 2) == "AY" {
142 | 			output.Del(len(output) - 2)
143 | 		}
144 | 
145 | 		// trailing A -> remove it
146 | 		if output.SafeAt(len(output)-1) == 'A' {
147 | 			output.Del(len(output) - 1)
148 | 		}
149 | 	}
150 | 
151 | 	if len(output) > 6 {
152 | 		return string(output[0:6])
153 | 	} else {
154 | 		return string(output)
155 | 	}
156 | }
157 | 


--------------------------------------------------------------------------------
/nysiis_test.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | import "testing"
 4 | 
// nysiistests pairs each input string (s1) with the key (nysiis) that
// NYSIIS is expected to return for it. The last three entries contain no
// letters and expect an empty key.
var nysiistests = []struct {
	s1     string
	nysiis string
}{
	{"knight", "NAGT"},
	{"mitchell", "MATCAL"},
	{"o'daniel", "ODANAL"},
	{"brown sr", "BRANSR"},
	{"browne III", "BRAN"},
	{"browne IV", "BRANAV"},
	{"O'Banion", "OBANAN"},
	{"Mclaughlin", "MCLAGL"},
	{"McCormack", "MCARNA"},
	{"Chapman", "CAPNAN"},
	{"Silva", "SALV"},
	{"McDonald", "MCDANA"},
	{"Lawson", "LASAN"},
	{"Jacobs", "JACAB"},
	{"Greene", "GRAN"},
	{"O'Brien", "OBRAN"},
	{"Morrison", "MARASA"},
	{"Larson", "LARSAN"},
	{"Willis", "WAL"},
	{"Mackenzie", "MCANSY"},
	{"Carr", "CAR"},
	{"Lawrence", "LARANC"},
	{"Matthews", "MAT"},
	{"Richards", "RACARD"},
	{"Bishop", "BASAP"},
	{"Franklin", "FRANCL"},
	{"McDaniel", "MCDANA"},
	{"Harper", "HARPAR"},
	{"Lynch", "LYNC"},
	{"Watkins", "WATCAN"},
	{"Carlson", "CARLSA"},
	{"Wheeler", "WALAR"},
	{"Louis XVI", "LASXV"},
	{"2002", ""},
	{"1/2", ""},
	{"", ""},
}
46 | 
47 | // NYSIIS
48 | func TestNYIIS(t *testing.T) {
49 | 	for _, tt := range nysiistests {
50 | 		nysiis := NYSIIS(tt.s1)
51 | 		if nysiis != tt.nysiis {
52 | 			t.Errorf("NYSIIS('%s') = %v, want %v", tt.s1, nysiis, tt.nysiis)
53 | 		}
54 | 	}
55 | }
56 | 


--------------------------------------------------------------------------------
/osa.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | // OSA computes the Optimal String Alignment distance between two
 4 | // strings. The returned value - distance - is the number of insertions,
 5 | // deletions, substitutions, and transpositions it takes to transform one
 6 | // string (s1) into another (s2). Each step in the transformation "costs"
 7 | // one distance point. It is similar to Damerau-Levenshtein, but is simpler
 8 | // because it does not allow multiple edits on any substring.
 9 | func OSA(s1 string, s2 string) (distance int) {
10 | 	// index by code point, not byte
11 | 	r1 := []rune(s1)
12 | 	r2 := []rune(s2)
13 | 
14 | 	rows := len(r1) + 1
15 | 	cols := len(r2) + 1
16 | 
17 | 	var i, j, d1, d2, d3, d_now, cost int
18 | 
19 | 	dist := make([]int, rows*cols)
20 | 
21 | 	for i = 0; i < rows; i++ {
22 | 		dist[i*cols] = i
23 | 	}
24 | 
25 | 	for j = 0; j < cols; j++ {
26 | 		dist[j] = j
27 | 	}
28 | 
29 | 	for i = 1; i < rows; i++ {
30 | 		for j = 1; j < cols; j++ {
31 | 			if r1[i-1] == r2[j-1] {
32 | 				cost = 0
33 | 			} else {
34 | 				cost = 1
35 | 			}
36 | 
37 | 			d1 = dist[((i-1)*cols)+j] + 1
38 | 			d2 = dist[(i*cols)+(j-1)] + 1
39 | 			d3 = dist[((i-1)*cols)+(j-1)] + cost
40 | 
41 | 			d_now = min(d1, min(d2, d3))
42 | 
43 | 			if i > 2 && j > 2 && r1[i-1] == r2[j-2] &&
44 | 				r1[i-2] == r2[j-1] {
45 | 				d1 = dist[((i-2)*cols)+(j-2)] + cost
46 | 				d_now = min(d_now, d1)
47 | 			}
48 | 
49 | 			dist[(i*cols)+j] = d_now
50 | 		}
51 | 	}
52 | 
53 | 	distance = dist[(cols*rows)-1]
54 | 
55 | 	return
56 | }
57 | 


--------------------------------------------------------------------------------
/osa_test.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | import "testing"
 4 | 
// osatests pairs two input strings (s1, s2) with the distance (dist) that
// OSA is expected to return for them.
var osatests = []struct {
	s1   string
	s2   string
	dist int
}{
	// insertion
	{"car", "cars", 1},
	// substitution
	{"library", "librari", 1},
	// deletion
	{"library", "librar", 1},
	// transposition
	{"library", "librayr", 1},
	// one empty, left
	{"", "library", 7},
	// one empty, right
	{"library", "", 7},
	// two empties
	{"", "", 0},
	// unicode stuff!
	{"Schüßler", "Schübler", 1},
	{"Schüßler", "Schußler", 1},
	{"Schüßler", "Schüßler", 0},
	{"Schßüler", "Schüßler", 1},
	{"Schüßler", "Schüler", 1},
	{"Schüßler", "Schüßlers", 1},
	// difference between DL and OSA. This is OSA, so it should be 3.
	{"ca", "abc", 3},
}
34 | 
35 | // OSA (Optimal String Alignment)
36 | func TestOSA(t *testing.T) {
37 | 	for _, tt := range osatests {
38 | 		dist := OSA(tt.s1, tt.s2)
39 | 		if dist != tt.dist {
40 | 			t.Errorf("OSA('%s', '%s') = %v, want %v", tt.s1, tt.s2, dist, tt.dist)
41 | 		}
42 | 	}
43 | }
44 | 


--------------------------------------------------------------------------------
/phonex.go:
--------------------------------------------------------------------------------
  1 | package matchr
  2 | 
  3 | func preProcess(input []rune) []rune {
  4 | 	output := runestring(make([]rune, 0, len(input)))
  5 | 
  6 | 	// 0. Remove all non-ASCII characters
  7 | 	for _, v := range input {
  8 | 		if v >= 65 && v <= 90 {
  9 | 			output = append(output, v)
 10 | 		}
 11 | 	}
 12 | 
 13 | 	// 1. Remove all trailing 'S' characters at the end of the name
 14 | 	for i := len(output) - 1; i >= 0 && output[i] == 'S'; i-- {
 15 | 		output.Del(i)
 16 | 	}
 17 | 
 18 | 	// 2. Convert leading letter pairs as follows
 19 | 	//    KN -> N, PH -> F, WR -> R
 20 | 	switch output.SafeSubstr(0, 2) {
 21 | 	case "KN":
 22 | 		output = output[1:]
 23 | 	case "PH":
 24 | 		output[0] = 'F' // H will be ignored anyway
 25 | 	case "WR":
 26 | 		output = output[1:]
 27 | 	}
 28 | 
 29 | 	// 3a. Convert leading single letters as follows:
 30 | 	//    H         -> Remove
 31 | 	if output.SafeAt(0) == 'H' {
 32 | 		output = output[1:]
 33 | 	}
 34 | 
 35 | 	// 3a. Convert leading single letters as follows:
 36 | 	//    E,I,O,U,Y -> A
 37 | 	//    P         -> B
 38 | 	//    V         -> F
 39 | 	//    K,Q       -> C
 40 | 	//    J         -> G
 41 | 	//    Z         -> S
 42 | 	switch output.SafeAt(0) {
 43 | 	case 'E', 'I', 'O', 'U', 'Y':
 44 | 		output[0] = 'A'
 45 | 	case 'P':
 46 | 		output[0] = 'B'
 47 | 	case 'V':
 48 | 		output[0] = 'F'
 49 | 	case 'K', 'Q':
 50 | 		output[0] = 'C'
 51 | 	case 'J':
 52 | 		output[0] = 'G'
 53 | 	case 'Z':
 54 | 		output[0] = 'S'
 55 | 	}
 56 | 
 57 | 	return output
 58 | }
 59 | 
 60 | // Phonex computes the Phonex phonetic encoding of the input string. Phonex is
 61 | // a modification of the venerable Soundex algorithm. It accounts for a few
 62 | // more letter combinations to improve accuracy on some data sets.
 63 | //
 64 | // This implementation is based off of the original C implementation by the
 65 | // creator - A. J. Lait - as found in his research paper entitled "An
 66 | // Assessment of Name Matching Algorithms."
 67 | func Phonex(s1 string) string {
 68 | 
 69 | 	// preprocess
 70 | 	s1 = cleanInput(s1)
 71 | 
 72 | 	input := runestring(preProcess([]rune(s1)))
 73 | 
 74 | 	result := make([]rune, 0, len(input))
 75 | 
 76 | 	last := rune(0)
 77 | 	code := rune(0)
 78 | 	for i := 0; i < len(input) &&
 79 | 		input[i] != ' ' &&
 80 | 		input[i] != ',' &&
 81 | 		len(result) < 4; i++ {
 82 | 		switch input[i] {
 83 | 		case 'B', 'P', 'F', 'V':
 84 | 			code = '1'
 85 | 		case 'C', 'S', 'K', 'G', 'J', 'Q', 'X', 'Z':
 86 | 			code = '2'
 87 | 		case 'D', 'T':
 88 | 			if input.SafeAt(i+1) != 'C' {
 89 | 				code = '3'
 90 | 			}
 91 | 		case 'L':
 92 | 			if isVowel(input.SafeAt(i+1)) || i == len(input)-1 {
 93 | 				code = '4'
 94 | 			}
 95 | 		case 'M', 'N':
 96 | 			nextChar := input.SafeAt(i + 1)
 97 | 			if nextChar == 'D' || nextChar == 'G' {
 98 | 				// ignore next character
 99 | 				i++
100 | 			}
101 | 			code = '5'
102 | 		case 'R':
103 | 			if isVowel(input.SafeAt(i+1)) || i == len(input)-1 {
104 | 				code = '6'
105 | 			}
106 | 		default:
107 | 			code = 0
108 | 		}
109 | 
110 | 		if last != code && code != 0 && i != 0 {
111 | 			result = append(result, code)
112 | 		}
113 | 
114 | 		// special case for 1st character: we use the actual character
115 | 		if i == 0 {
116 | 			result = append(result, input[i])
117 | 			last = code
118 | 		} else {
119 | 			last = result[len(result)-1]
120 | 		}
121 | 	}
122 | 
123 | 	for len(result) < 4 {
124 | 		result = append(result, '0')
125 | 	}
126 | 
127 | 	return string(result)
128 | }
129 | 


--------------------------------------------------------------------------------
/phonex_test.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | import "testing"
 4 | 
// phonextests pairs each input string (s1) with the four-character code
// (phonex) that Phonex is expected to return for it.
//
// test cases from http://rosettacode.org/wiki/phonex#F.23
var phonextests = []struct {
	s1     string
	phonex string
}{
	{"123 testsss", "T230"},
	{"24/7 test", "T230"},
	{"A", "A000"},
	{"Lee", "L000"},
	{"Kuhne", "C500"},
	{"Meyer-Lansky", "M452"},
	{"Oepping", "A150"},
	{"Daley", "D400"},
	{"Dalitz", "D432"},
	{"Duhlitz", "D432"},
	{"Dull", "D400"},
	{"De Ledes", "D430"},
	{"Sandemann", "S500"},
	{"Schüßler", "S460"},
	{"Schmidt", "S530"},
	{"Sinatra", "S536"},
	{"Heinrich", "A562"},
	{"Hammerschlag", "A524"},
	{"Williams", "W450"},
	{"Wilms", "W500"},
	{"Wilson", "W250"},
	{"Worms", "W500"},
	{"Zedlitz", "S343"},
	{"Zotteldecke", "S320"},
	{"ZYX test", "S232"},
	{"Scherman", "S500"},
	{"Schurman", "S500"},
	{"Sherman", "S500"},
	{"Shermansss", "S500"},
	{"Shireman", "S650"},
	{"Shurman", "S500"},
	{"Euler", "A460"},
	{"Ellery", "A460"},
	{"Hilbert", "A130"},
	{"Heilbronn", "A165"},
	{"Gauss", "G000"},
	{"Ghosh", "G200"},
	{"Knuth", "N300"},
	{"Kant", "C530"},
	{"Lloyd", "L430"},
	{"Ladd", "L300"},
	{"Lukasiewicz", "L200"},
	{"Lissajous", "L200"},
	{"Ashcraft", "A261"},
	{"Philip", "F410"},
	{"Fripp", "F610"},
	{"Czarkowska", "C200"},
	{"Hornblower", "A514"},
	{"Looser", "L260"},
	{"Wright", "R230"},
	{"Phonic", "F520"},
	{"Quickening", "C250"},
	{"Kuickening", "C250"},
	{"Joben", "G150"},
	{"Zelda", "S300"},
	{"S", "0000"},
	{"H", "0000"},
	{"", "0000"},
}
69 | 
70 | // phonex
71 | func TestPhonex(t *testing.T) {
72 | 	for _, tt := range phonextests {
73 | 		phonex := Phonex(tt.s1)
74 | 		if phonex != tt.phonex {
75 | 			t.Errorf("Phonex('%s') = %v, want %v", tt.s1, phonex, tt.phonex)
76 | 		}
77 | 	}
78 | }
79 | 


--------------------------------------------------------------------------------
/runestring.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | type runestring []rune
 4 | 
 5 | // A safe way to index a runestring. It will return a null rune if you try
 6 | // to index outside of the bounds of the runestring.
 7 | func (r *runestring) SafeAt(pos int) rune {
 8 | 	if pos < 0 || pos >= len(*r) {
 9 | 		return 0
10 | 	} else {
11 | 		return (*r)[pos]
12 | 	}
13 | }
14 | 
15 | // A safe way to obtain a substring of a runestring. It will return a null
16 | // string ("") if you index somewhere outside its bounds.
17 | func (r *runestring) SafeSubstr(pos int, length int) string {
18 | 	if pos < 0 || pos > len(*r) || (pos+length) > len(*r) {
19 | 		return ""
20 | 	} else {
21 | 		return string((*r)[pos : pos+length])
22 | 	}
23 | }
24 | 
25 | // Delete characters at positions pos. It will do nothing if you provide
26 | // an index outside the bounds of the runestring.
27 | func (r *runestring) Del(pos ...int) {
28 | 	for _, i := range pos {
29 | 		if i >= 0 && i <= len(*r) {
30 | 			*r = append((*r)[:i], (*r)[i+1:]...)
31 | 		}
32 | 	}
33 | }
34 | 
35 | // A helper to determine if any substrings exist within the given runestring.
36 | func (r *runestring) Contains(start int, length int, criteria ...string) bool {
37 | 	substring := r.SafeSubstr(start, length)
38 | 	for _, c := range criteria {
39 | 		if substring == c {
40 | 			return true
41 | 		}
42 | 	}
43 | 	return false
44 | }
45 | 


--------------------------------------------------------------------------------
/smithwaterman.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
// GAP_COST is the amount SmithWaterman subtracts from a neighbouring cell's
// score when an alignment is extended through a gap.
const GAP_COST = float64(0.5)
 4 | 
 5 | func getCost(r1 []rune, r1Index int, r2 []rune, r2Index int) float64 {
 6 | 	if r1[r1Index] == r2[r2Index] {
 7 | 		return 1.0
 8 | 	} else {
 9 | 		return -2.0
10 | 	}
11 | }
12 | 
13 | // SmithWaterman computes the Smith-Waterman local sequence alignment for the
14 | // two input strings. This was originally designed to find similar regions in
15 | // strings representing DNA or protein sequences.
16 | func SmithWaterman(s1 string, s2 string) float64 {
17 | 	var cost float64
18 | 
19 | 	// index by code point, not byte
20 | 	r1 := []rune(s1)
21 | 	r2 := []rune(s2)
22 | 
23 | 	r1Len := len(r1)
24 | 	r2Len := len(r2)
25 | 
26 | 	if r1Len == 0 {
27 | 		return float64(r2Len)
28 | 	}
29 | 
30 | 	if r2Len == 0 {
31 | 		return float64(r1Len)
32 | 	}
33 | 
34 | 	d := make([][]float64, r1Len)
35 | 	for i := range d {
36 | 		d[i] = make([]float64, r2Len)
37 | 	}
38 | 
39 | 	var maxSoFar float64
40 | 	for i := 0; i < r1Len; i++ {
41 | 		// substitution cost
42 | 		cost = getCost(r1, i, r2, 0)
43 | 		if i == 0 {
44 | 			d[0][0] = max(0.0, max(-GAP_COST, cost))
45 | 		} else {
46 | 			d[i][0] = max(0.0, max(d[i-1][0]-GAP_COST, cost))
47 | 		}
48 | 
49 | 		// save if it is the biggest thus far
50 | 		if d[i][0] > maxSoFar {
51 | 			maxSoFar = d[i][0]
52 | 		}
53 | 	}
54 | 
55 | 	for j := 0; j < r2Len; j++ {
56 | 		// substitution cost
57 | 		cost = getCost(r1, 0, r2, j)
58 | 		if j == 0 {
59 | 			d[0][0] = max(0, max(-GAP_COST, cost))
60 | 		} else {
61 | 			d[0][j] = max(0, max(d[0][j-1]-GAP_COST, cost))
62 | 		}
63 | 
64 | 		// save if it is the biggest thus far
65 | 		if d[0][j] > maxSoFar {
66 | 			maxSoFar = d[0][j]
67 | 		}
68 | 	}
69 | 
70 | 	for i := 1; i < r1Len; i++ {
71 | 		for j := 1; j < r2Len; j++ {
72 | 			cost = getCost(r1, i, r2, j)
73 | 
74 | 			// find the lowest cost
75 | 			d[i][j] = max(
76 | 				max(0, d[i-1][j]-GAP_COST),
77 | 				max(d[i][j-1]-GAP_COST, d[i-1][j-1]+cost))
78 | 
79 | 			// save if it is the biggest thus far
80 | 			if d[i][j] > maxSoFar {
81 | 				maxSoFar = d[i][j]
82 | 			}
83 | 		}
84 | 	}
85 | 
86 | 	return maxSoFar
87 | }
88 | 


--------------------------------------------------------------------------------
/smithwaterman_test.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | import "testing"
 4 | 
// swtests pairs two input strings (s1, s2) with the score (dist) that
// SmithWaterman is expected to return for them.
var swtests = []struct {
	s1   string
	s2   string
	dist float64
}{
	// insertion
	{"car", "cars", 3.0},
	// substitution
	{"library", "librari", 6.0},
	// deletion
	{"library", "librar", 6.0},
	// transposition
	{"library", "librayr", 5.5},
	// one empty, left
	{"", "library", 7.0},
	// one empty, right
	{"library", "", 7.0},
	// two empties
	{"", "", 0.0},
	// unicode stuff!
	{"Schüßler", "Schübler", 6.0},
	{"Ant Zucaro", "Anthony Zucaro", 8.0},
	{"Schüßler", "Schüßler", 8.0},
	{"Schßüler", "Schüßler", 6.0},
	{"Schüßler", "Schüler", 6.5},
	{"Schüßler", "Schüßlers", 8.0},
}
32 | 
33 | // Smith-Waterman
34 | func TestSmithWaterman(t *testing.T) {
35 | 	for _, tt := range swtests {
36 | 		dist := SmithWaterman(tt.s1, tt.s2)
37 | 		if dist != tt.dist {
38 | 			t.Errorf("SmithWaterman('%s', '%s') = %v, want %v", tt.s1, tt.s2, dist, tt.dist)
39 | 		}
40 | 	}
41 | }
42 | 


--------------------------------------------------------------------------------
/soundex.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | import "strings"
 4 | 
 5 | // Soundex computes the Soundex phonetic representation of the input string. It
 6 | // attempts to encode homophones with the same characters. More information can
 7 | // be found at http://en.wikipedia.org/wiki/Soundex.
 8 | func Soundex(s1 string) string {
 9 | 	if len(s1) == 0 {
10 | 		return ""
11 | 	}
12 | 
13 | 	// we should work with all uppercase
14 | 	s1 = strings.ToUpper(s1)
15 | 
16 | 	input := NewString(s1)
17 | 
18 | 	// the encoded value
19 | 	enc := input.Slice(0, 1)
20 | 
21 | 	c := ""
22 | 	prev := ""
23 | 	hw := false
24 | 
25 | 	for i := 0; i < input.RuneCount(); i++ {
26 | 		switch rune(input.At(i)) {
27 | 		case 'B', 'F', 'P', 'V':
28 | 			c = "1"
29 | 		case 'C', 'G', 'J', 'K', 'Q', 'S', 'X', 'Z':
30 | 			c = "2"
31 | 		case 'D', 'T':
32 | 			c = "3"
33 | 		case 'L':
34 | 			c = "4"
35 | 		case 'M', 'N':
36 | 			c = "5"
37 | 		case 'R':
38 | 			c = "6"
39 | 		case 'H', 'W':
40 | 			hw = true
41 | 		default:
42 | 			c = ""
43 | 		}
44 | 
45 | 		// don't encode the first position, but we need its code value
46 | 		// to prevent repeats
47 | 		if c != "" && c != prev && i > 0 {
48 | 			// if the next encoded digit is different, we can add it right away
49 | 			// if it is the same, though, it must not have been preceded
50 | 			// by an 'H' or a 'W'
51 | 			if enc[len(enc)-1:len(enc)] != c || !hw {
52 | 				enc = enc + c
53 | 			}
54 | 
55 | 			// we're done when we reach four encoded characters
56 | 			if len(enc) == 4 {
57 | 				break
58 | 			}
59 | 		}
60 | 
61 | 		prev = c
62 | 		hw = false
63 | 	}
64 | 
65 | 	// if we've fallen short of 4 "real" encoded characters,
66 | 	// it gets padded with zeros
67 | 	for len(enc) < 4 {
68 | 		enc = enc + "0"
69 | 	}
70 | 
71 | 	return enc
72 | }
73 | 


--------------------------------------------------------------------------------
/soundex_test.go:
--------------------------------------------------------------------------------
 1 | package matchr
 2 | 
 3 | import "testing"
 4 | 
// soundextests is the table driving TestSoundex: each entry holds an input
// string (s1) and the code (soundex) that Soundex(s1) is expected to return.
//
// test cases from http://rosettacode.org/wiki/Soundex#F.23
var soundextests = []struct {
	s1      string
	soundex string
}{
	{"Ashcraft", "A261"},
	{"Ashhhcraft", "A261"},
	{"Ashcroft", "A261"},
	{"Burroughs", "B620"},
	{"Burrows", "B620"},
	{"Ekzampul", "E251"},
	{"Example", "E251"},
	{"Ellery", "E460"},
	{"Euler", "E460"},
	{"Ghosh", "G200"},
	{"Gauss", "G200"},
	{"Gutierrez", "G362"},
	{"Heilbronn", "H416"},
	{"Hilbert", "H416"},
	{"Jackson", "J250"},
	{"Kant", "K530"},
	{"Knuth", "K530"},
	{"Lee", "L000"},
	{"Lukasiewicz", "L222"},
	{"Lissajous", "L222"},
	{"Ladd", "L300"},
	{"Lloyd", "L300"},
	{"Moses", "M220"},
	{"O'Hara", "O600"},
	{"Pfister", "P236"},
	{"Rubin", "R150"},
	{"Robert", "R163"},
	{"Rupert", "R163"},
	{"Soundex", "S532"},
	{"Sownteks", "S532"},
	{"Tymczak", "T522"},
	{"VanDeusen", "V532"},
	{"Washington", "W252"},
	{"Wheaton", "W350"},
}
45 | 
46 | // Soundex
47 | func TestSoundex(t *testing.T) {
48 | 	for _, tt := range soundextests {
49 | 		soundex := Soundex(tt.s1)
50 | 		if soundex != tt.soundex {
51 | 			t.Errorf("Soundex('%s') = %v, want %v", tt.s1, soundex, tt.soundex)
52 | 		}
53 | 	}
54 | }
55 | 


--------------------------------------------------------------------------------
/utf8.go:
--------------------------------------------------------------------------------
  1 | // Copyright 2009 The Go Authors. All rights reserved.
  2 | // Use of this source code is governed by a BSD-style
  3 | // license that can be found in the LICENSE file.
  4 | 
  5 | package matchr
  6 | 
  7 | import (
  8 | 	"errors"
  9 | 	"unicode/utf8"
 10 | )
 11 | 
// String wraps a regular string with a small structure that provides more
// efficient indexing by code point index, as opposed to byte index.
// Scanning incrementally forwards or backwards is O(1) per index operation
// (although not as fast a range clause going forwards).  Random access is
// O(N) in the length of the string, but the overhead is less than always
// scanning from the beginning.
// If the string is ASCII, random access is O(1).
// Unlike the built-in string type, String has internal mutable state and
// is not thread-safe.
type String struct {
	// str is the wrapped string, exactly as passed to Init.
	str      string
	// numRunes is the number of runes in str, computed once by Init.
	numRunes int
	// If width > 0, the rune at runePos starts at bytePos and has the specified width.
	// Init leaves width at 0 when str contains only ASCII bytes.
	width    int
	// bytePos and runePos form the cursor that At moves while decoding:
	// the byte offset and the rune index of the most recently decoded rune.
	bytePos  int
	runePos  int
	nonASCII int // byte index of the first non-ASCII rune; len(str) if there is none.
}
 30 | 
 31 | // NewString returns a new UTF-8 string with the provided contents.
 32 | func NewString(contents string) *String {
 33 | 	return new(String).Init(contents)
 34 | }
 35 | 
 36 | // Init initializes an existing String to hold the provided contents.
 37 | // It returns a pointer to the initialized String.
 38 | func (s *String) Init(contents string) *String {
 39 | 	s.str = contents
 40 | 	s.bytePos = 0
 41 | 	s.runePos = 0
 42 | 	for i := 0; i < len(contents); i++ {
 43 | 		if contents[i] >= utf8.RuneSelf {
 44 | 			// Not ASCII.
 45 | 			s.numRunes = utf8.RuneCountInString(contents)
 46 | 			_, s.width = utf8.DecodeRuneInString(contents)
 47 | 			s.nonASCII = i
 48 | 			return s
 49 | 		}
 50 | 	}
 51 | 	// ASCII is simple.  Also, the empty string is ASCII.
 52 | 	s.numRunes = len(contents)
 53 | 	s.width = 0
 54 | 	s.nonASCII = len(contents)
 55 | 	return s
 56 | }
 57 | 
// String returns the wrapped contents unchanged, as a regular Go string.
// Implementing this method also makes the String directly printable by
// fmt.Print.
func (s *String) String() string {
	return s.str
}
 63 | 
// RuneCount returns the number of runes (Unicode code points) in the String.
// The count is computed once by Init, so this call is O(1).
func (s *String) RuneCount() int {
	return s.numRunes
}
 68 | 
// IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
// It relies on Init leaving width at 0 only when no non-ASCII byte was found.
func (s *String) IsASCII() bool {
	return s.width == 0
}
 73 | 
 74 | // Slice returns the string sliced at rune positions [i:j].
 75 | func (s *String) Slice(i, j int) string {
 76 | 	// ASCII is easy.  Let the compiler catch the indexing error if there is one.
 77 | 	if j < s.nonASCII {
 78 | 		return s.str[i:j]
 79 | 	}
 80 | 	if i < 0 || j > s.numRunes || i > j {
 81 | 		panic(errors.New("utf8.String: slice index out of range"))
 82 | 	}
 83 | 	if i == j {
 84 | 		return ""
 85 | 	}
 86 | 	// For non-ASCII, after At(i), bytePos is always the position of the indexed character.
 87 | 	var low, high int
 88 | 	switch {
 89 | 	case i < s.nonASCII:
 90 | 		low = i
 91 | 	case i == s.numRunes:
 92 | 		low = len(s.str)
 93 | 	default:
 94 | 		s.At(i)
 95 | 		low = s.bytePos
 96 | 	}
 97 | 	switch {
 98 | 	case j == s.numRunes:
 99 | 		high = len(s.str)
100 | 	default:
101 | 		s.At(j)
102 | 		high = s.bytePos
103 | 	}
104 | 	return s.str[low:high]
105 | }
106 | 
// At returns the rune with index i in the String.  The sequence of runes is the same
// as iterating over the contents with a "for range" clause.
//
// The rune is returned as an int. At panics if i is out of range: with a
// runtime index error on the ASCII fast path, or with an explicit error
// otherwise. For indices at or beyond the first non-ASCII byte, At updates
// the cursor (runePos, bytePos, width) so that it describes rune i on return.
func (s *String) At(i int) int {
	// ASCII is easy.  Let the runtime catch the indexing error if there is one.
	// Before the first non-ASCII byte, rune index and byte index coincide.
	if i < s.nonASCII {
		return int(s.str[i])
	}

	// Now we do need to know the index is valid.
	if i < 0 || i >= s.numRunes {
		panic(errors.New("utf8.String: index out of range"))
	}

	var r rune

	// Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
	// With these cases, all scans from beginning or end work in O(1) time per rune.
	switch {

	case i == s.runePos-1: // backing up one rune
		// Decode the rune that ends right before the cursor and step back over it.
		r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
		s.runePos = i
		s.bytePos -= s.width
		return int(r)
	case i == s.runePos+1: // moving ahead one rune
		// Skip over the rune at the cursor using its cached width, then decode
		// the new current rune in the next case.
		s.runePos = i
		s.bytePos += s.width
		fallthrough
	case i == s.runePos:
		r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
		return int(r)
	case i == 0: // start of string
		r, s.width = utf8.DecodeRuneInString(s.str)
		s.runePos = 0
		s.bytePos = 0
		return int(r)

	case i == s.numRunes-1: // last rune in string
		r, s.width = utf8.DecodeLastRuneInString(s.str)
		s.runePos = i
		s.bytePos = len(s.str) - s.width
		return int(r)
	}

	// We need to do a linear scan.  There are three places to start from:
	// 1) The beginning
	// 2) bytePos/runePos.
	// 3) The end
	// Choose the closest in rune count, scanning backwards if necessary.
	forward := true
	if i < s.runePos {
		// Between beginning and pos.  Which is closer?
		// Since both i and runePos are guaranteed >= nonASCII, that's the
		// lowest location we need to start from.
		if i < (s.runePos-s.nonASCII)/2 {
			// Scan forward from beginning
			// (rune index == byte index at nonASCII, as everything before it is ASCII)
			s.bytePos, s.runePos = s.nonASCII, s.nonASCII
		} else {
			// Scan backwards from where we are
			forward = false
		}
	} else {
		// Between pos and end.  Which is closer?
		if i-s.runePos < (s.numRunes-s.runePos)/2 {
			// Scan forward from pos
		} else {
			// Scan backwards from end
			// (the cursor is parked one past the last rune; the loop below steps back first)
			s.bytePos, s.runePos = len(s.str), s.numRunes
			forward = false
		}
	}
	if forward {
		// TODO: Is it much faster to use a range loop for this scan?
		// Decode at the cursor, and only advance while rune i has not been reached,
		// so the cursor ends up on rune i with its width cached.
		for {
			r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
			if s.runePos == i {
				break
			}
			s.runePos++
			s.bytePos += s.width
		}
	} else {
		// Decode the rune ending just before the cursor and step back onto it,
		// until the cursor sits on rune i.
		for {
			r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
			s.runePos--
			s.bytePos -= s.width
			if s.runePos == i {
				break
			}
		}
	}
	return int(r)
}
200 | 
201 | // We want the panic in At(i) to satisfy os.Error, because that's what
202 | // runtime panics satisfy, but we can't import os.  This is our solution.
203 | 
204 | // error is the type of the error returned if a user calls String.At(i) with i out of range.
205 | // It satisfies os.Error and runtime.Error.
206 | // type error string
207 | 
208 | /*
209 | func (err error) String() string {
210 | 	return string(err)
211 | }
212 | 
213 | func (err error) RunTimeError() {
214 | }
215 | */
216 | 


--------------------------------------------------------------------------------
/util.go:
--------------------------------------------------------------------------------
  1 | package matchr
  2 | 
  3 | import (
  4 | 	"math"
  5 | 	"strings"
  6 | )
  7 | 
  8 | // min of two integers
  9 | func min(a int, b int) (res int) {
 10 | 	if a < b {
 11 | 		res = a
 12 | 	} else {
 13 | 		res = b
 14 | 	}
 15 | 
 16 | 	return
 17 | }
 18 | 
 19 | // max of two integers
 20 | func maxI(a int, b int) (res int) {
 21 | 	if a < b {
 22 | 		res = b
 23 | 	} else {
 24 | 		res = a
 25 | 	}
 26 | 
 27 | 	return
 28 | }
 29 | 
 30 | // max of two float64s
 31 | func max(a float64, b float64) (res float64) {
 32 | 	if a < b {
 33 | 		res = b
 34 | 	} else {
 35 | 		res = a
 36 | 	}
 37 | 
 38 | 	return
 39 | }
 40 | 
 41 | // is this string index outside of the ASCII numeric code points?
 42 | func nan(c rune) bool {
 43 | 	return ((c > 57) || (c < 48))
 44 | }
 45 | 
 46 | // Round a float64 to the given precision
 47 | //
 48 | // http://play.golang.org/p/S654PxAe_N
 49 | //
 50 | // (via Rory McGuire at
 51 | // https://groups.google.com/forum/#!topic/golang-nuts/ITZV08gAugI)
 52 | func round(x float64, prec int) float64 {
 53 | 	if math.IsNaN(x) || math.IsInf(x, 0) {
 54 | 		return x
 55 | 	}
 56 | 
 57 | 	sign := 1.0
 58 | 	if x < 0 {
 59 | 		sign = -1
 60 | 		x *= -1
 61 | 	}
 62 | 
 63 | 	var rounder float64
 64 | 	pow := math.Pow(10, float64(prec))
 65 | 	intermed := x * pow
 66 | 	_, frac := math.Modf(intermed)
 67 | 
 68 | 	if frac >= 0.5 {
 69 | 		rounder = math.Ceil(intermed)
 70 | 	} else {
 71 | 		rounder = math.Floor(intermed)
 72 | 	}
 73 | 
 74 | 	return rounder / pow * sign
 75 | }
 76 | 
 77 | // A helper to determine if any substrings exist within the given string
 78 | func contains(value *String, start int, length int, criteria ...string) bool {
 79 | 	substring := substring(value, start, length)
 80 | 	for _, c := range criteria {
 81 | 		if substring == c {
 82 | 			return true
 83 | 		}
 84 | 	}
 85 | 	return false
 86 | }
 87 | 
 88 | // A fault-tolerant version of Slice. It will return nothing ("") if the index
 89 | // is out of bounds. This allows substring-ing without having to bound check
 90 | // every time.
 91 | func substring(value *String, start int, length int) string {
 92 | 	if start >= 0 && start+length <= value.RuneCount() {
 93 | 		return value.Slice(start, start+length)
 94 | 	} else {
 95 | 		return ""
 96 | 	}
 97 | }
 98 | 
 99 | func isVowel(c rune) bool {
100 | 	switch c {
101 | 	case 'A', 'E', 'I', 'O', 'U', 'Y':
102 | 		return true
103 | 	default:
104 | 		return false
105 | 	}
106 | }
107 | 
// isVowelNoY reports whether c is one of the upper-case letters A, E, I, O
// or U. Unlike isVowel, it does not count Y; lower-case letters are not
// recognized.
func isVowelNoY(c rune) bool {
	return c == 'A' || c == 'E' || c == 'I' || c == 'O' || c == 'U'
}
116 | 
117 | func cleanInput(input string) string {
118 | 	return strings.ToUpper(strings.TrimSpace(input))
119 | }
120 | 


--------------------------------------------------------------------------------