├── go.mod ├── .gitignore ├── README.md ├── LICENSE └── levenshtein ├── levenshtein_test.go └── levenshtein.go /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/texttheater/golang-levenshtein 2 | 3 | go 1.13 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | golang-levenshtein 2 | ================== 3 | 4 | An implementation of the Levenshtein algorithm in Go. Provides edit distances, 5 | edit scripts and ratios for strings (slices of runes). 6 | 7 | Installation 8 | ------------ 9 | 10 | $ go get github.com/texttheater/golang-levenshtein/levenshtein 11 | 12 | Documentation 13 | ------------- 14 | 15 | The documentation can be viewed online here: 16 | https://godoc.org/github.com/texttheater/golang-levenshtein/levenshtein 17 | 18 | See also 19 | -------- 20 | 21 | For a package that is similar but more generic and provides more control, 22 | check out Daniël de Kok’s 23 | [editdistance](https://github.com/danieldk/editdistance). 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2013 Kilian Evang and contributors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /levenshtein/levenshtein_test.go: -------------------------------------------------------------------------------- 1 | package levenshtein 2 | 3 | import ( 4 | "fmt" 5 | "os" 6 | "testing" 7 | ) 8 | 9 | var testCases = []struct { 10 | source string 11 | target string 12 | options Options 13 | distance int 14 | ratio float64 15 | script EditScript 16 | }{ 17 | { 18 | source: "", 19 | target: "a", 20 | options: DefaultOptions, 21 | distance: 1, 22 | ratio: 0.0, 23 | script: EditScript{Ins}, 24 | }, 25 | { 26 | source: "a", 27 | target: "aa", 28 | options: DefaultOptions, 29 | distance: 1, 30 | ratio: 0.6666666666666666, 31 | script: EditScript{Match, Ins}, 32 | }, 33 | { 34 | source: "a", 35 | target: "aaa", 36 | options: DefaultOptions, 37 | distance: 2, 38 | ratio: 0.5, 39 | script: EditScript{Match, Ins, Ins}, 40 | }, 41 | { 42 | source: "", 43 | target: "", 44 | options: DefaultOptions, 45 | distance: 0, 46 | ratio: 0, 47 | script: EditScript{}, 48 | }, 49 | { 50 | source: "a", 51 | target: "b", 52 | options: DefaultOptions, 53 | distance: 2, 54 | ratio: 0, 55 | script: EditScript{Ins, Del}, 56 | }, 57 | { 58 | source: "aaa", 59 | target: "aba", 60 | options: DefaultOptions, 61 | distance: 2, 62 | ratio: 0.6666666666666666, 63 | script: EditScript{Match, Ins, Match, Del}, 64 | }, 65 | { 66 | source: "aaa", 67 | target: "ab", 68 | options: DefaultOptions, 69 | distance: 3, 70 | ratio: 0.4, 71 | script: EditScript{Match, Ins, Del, Del}, 72 | }, 73 | { 74 | source: "a", 75 | target: "a", 76 | options: DefaultOptions, 77 | distance: 0, 78 | ratio: 1, 79 | script: EditScript{Match}, 80 | }, 81 | { 82 | source: "ab", 83 | target: "ab", 84 | options: DefaultOptions, 85 | distance: 0, 86 | ratio: 1, 87 | script: EditScript{Match, Match}, 88 | }, 89 | { 90 | source: "a", 91 | target: "", 92 | options: DefaultOptions, 93 | distance: 1, 94 | ratio: 0, 95 | script: EditScript{Del}, 96 | }, 97 | { 98 | source: "aa", 99 | target: "a", 100 | options: DefaultOptions, 101 | distance: 1, 102 | ratio: 0.6666666666666666, 103 | script: EditScript{Match, Del}, 104 | }, 105 | { 106 | source: "aaa", 107 | target: "a", 108 | options: DefaultOptions, 109 | distance: 2, 110 | ratio: 0.5, 111 | script: EditScript{Match, Del, Del}, 112 | }, 113 | { 114 | source: "kitten", 115 | target: "sitting", 116 | options: DefaultOptions, 117 | distance: 5, 118 | ratio: 0.6153846153846154, 119 | script: EditScript{ 120 | Ins, 121 | Del, 122 | Match, 123 | Match, 124 | Match, 125 | Ins, 126 | Del, 127 | Match, 128 | Ins, 129 | }, 130 | }, 131 | { 132 | source: "kitten", 133 | target: "sitting", 134 | options: DefaultOptionsWithSub, 135 | distance: 3, 136 | ratio: 0.7692307692307693, 137 | script: EditScript{ 138 | Sub, 139 | Match, 140 | Match, 141 | Match, 142 | Sub, 143 | Match, 144 | Ins, 145 | }, 146 | }, 147 | { 148 | source: "Orange", 149 | target: "Apple", 150 | options: DefaultOptionsWithSub, 151 | distance: 5, 152 | ratio: 0.5454545454545454, 153 | script: EditScript{ 154 | Sub, 155 | Sub, 156 | Sub, 157 | Sub, 158 | Del, 159 | Match, 160 | }, 161 | }, 162 | { 163 | source: "ab", 164 | target: "bc", 165 | options: DefaultOptionsWithSub, 166 | distance: 2, 167 | ratio: 0.5, 168 | script: EditScript{ 169 | Del, 170 | Match, 171 | Ins, 172 | }, 173 | }, 174 | { 175 | source: "abd", 176 | target: "bec", 177 | options: DefaultOptionsWithSub, 178 | distance: 3, 179 | ratio: 0.5, 180 | script: EditScript{ 181 | Del, 182 | Match, 183 | Sub, 184 | Ins, 185 | }, 186 | }, 187 | { 188 | source: "me", 189 | target: "meme", 190 | options: Options{ 191 | InsCost: 2, 192 | DelCost: 1, 193 | SubCost: 3, 194 | Matches: IdenticalRunes, 195 | }, 196 | distance: 4, 197 | ratio: 0.3333333333333333, 198 | script: EditScript{ 199 | Match, 200 | Match, 201 | Ins, 202 | Ins, 203 | }, 204 | }, 205 | } 206 | 207 | func TestDistanceForStrings(t *testing.T) { 208 | for _, testCase := range testCases { 209 | distance := DistanceForStrings( 210 | []rune(testCase.source), 211 | []rune(testCase.target), 212 | testCase.options) 213 | if distance != testCase.distance { 214 | t.Log( 215 | "Distance between", 216 | testCase.source, 217 | "and", 218 | testCase.target, 219 | "computed as", 220 | distance, 221 | ", should be", 222 | testCase.distance) 223 | t.Fail() 224 | } 225 | // DistanceForMatrix(MatrixForStrings()) should calculate the same 226 | // value as DistanceForStrings. 227 | distance = DistanceForMatrix(MatrixForStrings( 228 | []rune(testCase.source), 229 | []rune(testCase.target), 230 | testCase.options)) 231 | if distance != testCase.distance { 232 | t.Log( 233 | "Distance between", 234 | testCase.source, 235 | "and", 236 | testCase.target, 237 | "computed as", 238 | distance, 239 | ", should be", 240 | testCase.distance) 241 | t.Fail() 242 | } 243 | } 244 | } 245 | 246 | func TestRatio(t *testing.T) { 247 | for _, testCase := range testCases { 248 | ratio := RatioForStrings( 249 | []rune(testCase.source), 250 | []rune(testCase.target), 251 | testCase.options) 252 | if ratio != testCase.ratio { 253 | t.Log( 254 | "Ratio between", 255 | testCase.source, 256 | "and", 257 | testCase.target, 258 | "computed as", 259 | ratio, 260 | ", should be", 261 | testCase.ratio) 262 | t.Fail() 263 | } 264 | } 265 | } 266 | 267 | func TestEditScriptForStrings(t *testing.T) { 268 | for _, testCase := range testCases { 269 | script := EditScriptForStrings( 270 | []rune(testCase.source), 271 | []rune(testCase.target), 272 | testCase.options) 273 | if !equal(script, testCase.script) { 274 | t.Log( 275 | "Edit script from", 276 | testCase.source, 277 | "to", 278 | testCase.target, 279 | "computed as", 280 | script, 281 | ", should be", 282 | testCase.script) 283 | t.Fail() 284 | } 285 | } 286 | } 287 | 288 | func equal(a, b EditScript) bool { 289 | for i := range a { 290 | if a[i] != b[i] { 291 | return false 292 | } 293 | } 294 | return true 295 | } 296 | 297 | func ExampleDistanceForStrings() { 298 | source := "a" 299 | target := "aa" 300 | distance := DistanceForStrings([]rune(source), []rune(target), DefaultOptions) 301 | fmt.Printf(`Distance between "%s" and "%s" computed as %d`, source, target, distance) 302 | // Output: Distance between "a" and "aa" computed as 1 303 | } 304 | 305 | func ExampleWriteMatrix() { 306 | source := []rune("neighbor") 307 | target := []rune("Neighbour") 308 | matrix := MatrixForStrings(source, target, DefaultOptions) 309 | WriteMatrix(source, target, matrix, os.Stdout) 310 | // Output: 311 | // N e i g h b o u r 312 | // 0 1 2 3 4 5 6 7 8 9 313 | // n 1 2 3 4 5 6 7 8 9 10 314 | // e 2 3 2 3 4 5 6 7 8 9 315 | // i 3 4 3 2 3 4 5 6 7 8 316 | // g 4 5 4 3 2 3 4 5 6 7 317 | // h 5 6 5 4 3 2 3 4 5 6 318 | // b 6 7 6 5 4 3 2 3 4 5 319 | // o 7 8 7 6 5 4 3 2 3 4 320 | // r 8 9 8 7 6 5 4 3 4 3 321 | } 322 | -------------------------------------------------------------------------------- /levenshtein/levenshtein.go: -------------------------------------------------------------------------------- 1 | // This package implements the Levenshtein algorithm for computing the 2 | // similarity between two strings. The central function is MatrixForStrings, 3 | // which computes the Levenshtein matrix. The functions DistanceForMatrix, 4 | // EditScriptForMatrix and RatioForMatrix read various interesting properties 5 | // off the matrix. The package also provides the convenience functions 6 | // DistanceForStrings, EditScriptForStrings and RatioForStrings for going 7 | // directly from two strings to the property of interest. 8 | package levenshtein 9 | 10 | import ( 11 | "fmt" 12 | "io" 13 | "os" 14 | ) 15 | 16 | type EditOperation int 17 | 18 | const ( 19 | Ins = iota 20 | Del 21 | Sub 22 | Match 23 | ) 24 | 25 | type EditScript []EditOperation 26 | 27 | type MatchFunction func(rune, rune) bool 28 | 29 | // IdenticalRunes is the default MatchFunction: it checks whether two runes are 30 | // identical. 31 | func IdenticalRunes(a rune, b rune) bool { 32 | return a == b 33 | } 34 | 35 | type Options struct { 36 | InsCost int 37 | DelCost int 38 | SubCost int 39 | Matches MatchFunction 40 | } 41 | 42 | // DefaultOptions is the default options without substitution: insertion cost 43 | // is 1, deletion cost is 1, substitution cost is 2 (meaning insert and delete 44 | // will be used instead), and two runes match iff they are identical. 45 | var DefaultOptions Options = Options{ 46 | InsCost: 1, 47 | DelCost: 1, 48 | SubCost: 2, 49 | Matches: IdenticalRunes, 50 | } 51 | 52 | // DefaultOptionsWithSub is the default options with substitution: insertion 53 | // cost is 1, deletion cost is 1, substitution cost is 1, and two runes match 54 | // iff they are identical. 55 | var DefaultOptionsWithSub Options = Options{ 56 | InsCost: 1, 57 | DelCost: 1, 58 | SubCost: 1, 59 | Matches: IdenticalRunes, 60 | } 61 | 62 | func (operation EditOperation) String() string { 63 | if operation == Match { 64 | return "match" 65 | } else if operation == Ins { 66 | return "ins" 67 | } else if operation == Sub { 68 | return "sub" 69 | } 70 | return "del" 71 | } 72 | 73 | // DistanceForStrings returns the edit distance between source and target. 74 | // 75 | // It has a runtime proportional to len(source) * len(target) and memory use 76 | // proportional to len(target). 77 | func DistanceForStrings(source []rune, target []rune, op Options) int { 78 | // Note: This algorithm is a specialization of MatrixForStrings. 79 | // MatrixForStrings returns the full edit matrix. However, we only need a 80 | // single value (see DistanceForMatrix) and the main loop of the algorithm 81 | // only uses the current and previous row. As such we create a 2D matrix, 82 | // but with height 2 (enough to store current and previous row). 83 | height := len(source) + 1 84 | width := len(target) + 1 85 | matrix := make([][]int, 2) 86 | 87 | // Initialize trivial distances (from/to empty string). That is, fill 88 | // the left column and the top row with row/column indices multiplied 89 | // by deletion/insertion cost. 90 | for i := 0; i < 2; i++ { 91 | matrix[i] = make([]int, width) 92 | matrix[i][0] = i * op.DelCost 93 | } 94 | for j := 1; j < width; j++ { 95 | matrix[0][j] = j * op.InsCost 96 | } 97 | 98 | // Fill in the remaining cells: for each prefix pair, choose the 99 | // (edit history, operation) pair with the lowest cost. 100 | for i := 1; i < height; i++ { 101 | cur := matrix[i%2] 102 | prev := matrix[(i-1)%2] 103 | cur[0] = i * op.DelCost 104 | for j := 1; j < width; j++ { 105 | delCost := prev[j] + op.DelCost 106 | matchSubCost := prev[j-1] 107 | if !op.Matches(source[i-1], target[j-1]) { 108 | matchSubCost += op.SubCost 109 | } 110 | insCost := cur[j-1] + op.InsCost 111 | cur[j] = min(delCost, min(matchSubCost, insCost)) 112 | } 113 | } 114 | return matrix[(height-1)%2][width-1] 115 | } 116 | 117 | // DistanceForMatrix reads the edit distance off the given Levenshtein matrix. 118 | func DistanceForMatrix(matrix [][]int) int { 119 | return matrix[len(matrix)-1][len(matrix[0])-1] 120 | } 121 | 122 | // RatioForStrings returns the Levenshtein ratio for the given strings. The 123 | // ratio is computed as follows: 124 | // 125 | // (sourceLength + targetLength - distance) / (sourceLength + targetLength) 126 | func RatioForStrings(source []rune, target []rune, op Options) float64 { 127 | matrix := MatrixForStrings(source, target, op) 128 | return RatioForMatrix(matrix) 129 | } 130 | 131 | // RatioForMatrix returns the Levenshtein ratio for the given matrix. The ratio 132 | // is computed as follows: 133 | // 134 | // (sourceLength + targetLength - distance) / (sourceLength + targetLength) 135 | func RatioForMatrix(matrix [][]int) float64 { 136 | sourcelength := len(matrix) - 1 137 | targetlength := len(matrix[0]) - 1 138 | sum := sourcelength + targetlength 139 | 140 | if sum == 0 { 141 | return 0 142 | } 143 | 144 | dist := DistanceForMatrix(matrix) 145 | return float64(sum-dist) / float64(sum) 146 | } 147 | 148 | // MatrixForStrings generates a 2-D array representing the dynamic programming 149 | // table used by the Levenshtein algorithm, as described e.g. here: 150 | // http://www.let.rug.nl/kleiweg/lev/ 151 | // The reason for putting the creation of the table into a separate function is 152 | // that it cannot only be used for reading of the edit distance between two 153 | // strings, but also e.g. to backtrace an edit script that provides an 154 | // alignment between the characters of both strings. 155 | func MatrixForStrings(source []rune, target []rune, op Options) [][]int { 156 | // Make a 2-D matrix. Rows correspond to prefixes of source, columns to 157 | // prefixes of target. Cells will contain edit distances. 158 | // Cf. http://www.let.rug.nl/~kleiweg/lev/levenshtein.html 159 | height := len(source) + 1 160 | width := len(target) + 1 161 | matrix := make([][]int, height) 162 | 163 | // Initialize trivial distances (from/to empty string). That is, fill 164 | // the left column and the top row with row/column indices multiplied 165 | // by deletion/insertion cost. 166 | for i := 0; i < height; i++ { 167 | matrix[i] = make([]int, width) 168 | matrix[i][0] = i * op.DelCost 169 | } 170 | for j := 1; j < width; j++ { 171 | matrix[0][j] = j * op.InsCost 172 | } 173 | 174 | // Fill in the remaining cells: for each prefix pair, choose the 175 | // (edit history, operation) pair with the lowest cost. 176 | for i := 1; i < height; i++ { 177 | for j := 1; j < width; j++ { 178 | delCost := matrix[i-1][j] + op.DelCost 179 | matchSubCost := matrix[i-1][j-1] 180 | if !op.Matches(source[i-1], target[j-1]) { 181 | matchSubCost += op.SubCost 182 | } 183 | insCost := matrix[i][j-1] + op.InsCost 184 | matrix[i][j] = min(delCost, min(matchSubCost, 185 | insCost)) 186 | } 187 | } 188 | //LogMatrix(source, target, matrix) 189 | return matrix 190 | } 191 | 192 | // EditScriptForStrings returns an optimal edit script to turn source into 193 | // target. 194 | func EditScriptForStrings(source []rune, target []rune, op Options) EditScript { 195 | return backtrace(len(source), len(target), 196 | MatrixForStrings(source, target, op), op) 197 | } 198 | 199 | // EditScriptForMatrix returns an optimal edit script based on the given 200 | // Levenshtein matrix. 201 | func EditScriptForMatrix(matrix [][]int, op Options) EditScript { 202 | return backtrace(len(matrix)-1, len(matrix[0])-1, matrix, op) 203 | } 204 | 205 | // WriteMatrix writes a visual representation of the given matrix for the given 206 | // strings to the given writer. 207 | func WriteMatrix(source []rune, target []rune, matrix [][]int, writer io.Writer) { 208 | fmt.Fprintf(writer, " ") 209 | for _, targetRune := range target { 210 | fmt.Fprintf(writer, " %c", targetRune) 211 | } 212 | fmt.Fprintf(writer, "\n") 213 | fmt.Fprintf(writer, " %2d", matrix[0][0]) 214 | for j, _ := range target { 215 | fmt.Fprintf(writer, " %2d", matrix[0][j+1]) 216 | } 217 | fmt.Fprintf(writer, "\n") 218 | for i, sourceRune := range source { 219 | fmt.Fprintf(writer, "%c %2d", sourceRune, matrix[i+1][0]) 220 | for j, _ := range target { 221 | fmt.Fprintf(writer, " %2d", matrix[i+1][j+1]) 222 | } 223 | fmt.Fprintf(writer, "\n") 224 | } 225 | } 226 | 227 | // LogMatrix writes a visual representation of the given matrix for the given 228 | // strings to os.Stderr. This function is deprecated, use 229 | // WriteMatrix(source, target, matrix, os.Stderr) instead. 230 | func LogMatrix(source []rune, target []rune, matrix [][]int) { 231 | WriteMatrix(source, target, matrix, os.Stderr) 232 | } 233 | 234 | func backtrace(i int, j int, matrix [][]int, op Options) EditScript { 235 | if i > 0 && matrix[i-1][j]+op.DelCost == matrix[i][j] { 236 | return append(backtrace(i-1, j, matrix, op), Del) 237 | } 238 | if j > 0 && matrix[i][j-1]+op.InsCost == matrix[i][j] { 239 | return append(backtrace(i, j-1, matrix, op), Ins) 240 | } 241 | if i > 0 && j > 0 && matrix[i-1][j-1]+op.SubCost == matrix[i][j] { 242 | return append(backtrace(i-1, j-1, matrix, op), Sub) 243 | } 244 | if i > 0 && j > 0 && matrix[i-1][j-1] == matrix[i][j] { 245 | return append(backtrace(i-1, j-1, matrix, op), Match) 246 | } 247 | return []EditOperation{} 248 | } 249 | 250 | func min(a int, b int) int { 251 | if b < a { 252 | return b 253 | } 254 | return a 255 | } 256 | 257 | func max(a int, b int) int { 258 | if b > a { 259 | return b 260 | } 261 | return a 262 | } 263 | --------------------------------------------------------------------------------