├── .travis.yml ├── LICENSE ├── old.go ├── data └── test.dict ├── README.md ├── fuzzy.go └── fuzzy_test.go /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: go 3 | go: 4 | - 1.13 5 | - 1.14 6 | - tip 7 | notifications: 8 | email: 9 | - infra@sajari.com 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Sajari Pty Ltd 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /old.go: -------------------------------------------------------------------------------- 1 | // Eventually this should be removed. 
Currently it gives backwards compatibility to old 2 | // versions that did not store the query count, which is now used for autocomplete. 3 | package fuzzy 4 | 5 | import ( 6 | "encoding/json" 7 | "os" 8 | ) 9 | 10 | type OldModel struct { 11 | Data map[string]int `json:"data"` 12 | Maxcount int `json:"maxcount"` 13 | Suggest map[string][]string `json:"suggest"` 14 | Depth int `json:"depth"` 15 | Threshold int `json:"threshold"` 16 | UseAutocomplete bool `json:"autocomplete"` 17 | } 18 | 19 | // Converts the old model format to the new version 20 | func (model *Model) convertOldFormat(filename string) error { 21 | oldmodel := new(OldModel) 22 | f, err := os.Open(filename) 23 | if err != nil { 24 | return err 25 | } 26 | defer f.Close() 27 | d := json.NewDecoder(f) 28 | err = d.Decode(oldmodel) 29 | if err != nil { 30 | return err 31 | } 32 | 33 | // Correct for old models pre divergence measure 34 | if model.SuffDivergenceThreshold == 0 { 35 | model.SuffDivergenceThreshold = SuffDivergenceThresholdDefault 36 | } 37 | 38 | // Convert fields 39 | model.Maxcount = oldmodel.Maxcount 40 | model.Suggest = oldmodel.Suggest 41 | model.Depth = oldmodel.Depth 42 | model.Threshold = oldmodel.Threshold 43 | model.UseAutocomplete = oldmodel.UseAutocomplete 44 | 45 | // Convert the old counts 46 | if len(oldmodel.Data) > 0 { 47 | model.Data = make(map[string]*Counts, len(oldmodel.Data)) 48 | for term, cc := range oldmodel.Data { 49 | model.Data[term] = &Counts{cc, 0} 50 | } 51 | } 52 | return nil 53 | } 54 | -------------------------------------------------------------------------------- /data/test.dict: -------------------------------------------------------------------------------- 1 | 
{"data":{"aunty":1,"big":1,"bigger":1,"biggest":1,"bob":1,"delicate":1,"dynamite":1,"uncle":1,"you're":1,"your":1},"maxcount":1,"suggest":{"ant":["aunty"],"anty":["aunty"],"any":["aunty"],"aty":["aunty"],"aun":["aunty"],"aunt":["aunty"],"aunty":["aunty"],"auny":["aunty"],"aut":["aunty"],"auty":["aunty"],"auy":["aunty"],"bb":["bob"],"bg":["big"],"bger":["bigger"],"bgest":["biggest"],"bgge":["bigger"],"bgger":["bigger"],"bgges":["biggest"],"bggest":["biggest"],"bgget":["biggest"],"bggr":["bigger"],"bggst":["biggest"],"bi":["big"],"bier":["bigger"],"biest":["biggest"],"big":["big"],"bige":["bigger"],"biger":["bigger"],"biges":["biggest"],"bigest":["biggest"],"biget":["biggest"],"bigg":["bigger"],"bigge":["biggest","bigger"],"bigger":["bigger"],"bigges":["biggest"],"biggest":["biggest"],"bigget":["biggest"],"biggr":["bigger"],"biggs":["biggest"],"biggst":["biggest"],"biggt":["biggest"],"bigr":["bigger"],"bigst":["biggest"],"bo":["bob"],"bob":["bob"],"cle":["uncle"],"damite":["dynamite"],"decate":["delicate"],"deiate":["delicate"],"deicae":["delicate"],"deicat":["delicate"],"deicate":["delicate"],"deicte":["delicate"],"delate":["delicate"],"delcae":["delicate"],"delcat":["delicate"],"delcate":["delicate"],"delcte":["delicate"],"deliae":["delicate"],"deliat":["delicate"],"deliate":["delicate"],"delica":["delicate"],"delicae":["delicate"],"delicat":["delicate"],"delicate":["delicate"],"delice":["delicate"],"delict":["delicate"],"delicte":["delicate"],"delite":["delicate"],"dicate":["delicate"],"dlcate":["delicate"],"dliate":["delicate"],"dlicae":["delicate"],"dlicat":["delicate"],"dlicate":["delicate"],"dlicte":["delicate"],"dnaite":["dynamite"],"dnamie":["dynamite"],"dnamit":["dynamite"],"dnamite":["dynamite"],"dnamte":["dynamite"],"dnmite":["dynamite"],"dyaite":["dynamite"],"dyamie":["dynamite"],"dyamit":["dynamite"],"dyamite":["dynamite"],"dyamte":["dynamite"],"dymite":["dynamite"],"dynaie":["dynamite"],"dynait":["dynamite"],"dynaite":["dynamite"],"dyname":["dynamite"],
"dynami":["dynamite"],"dynamie":["dynamite"],"dynamit":["dynamite"],"dynamite":["dynamite"],"dynamt":["dynamite"],"dynamte":["dynamite"],"dynate":["dynamite"],"dynite":["dynamite"],"dynmie":["dynamite"],"dynmit":["dynamite"],"dynmite":["dynamite"],"dynmte":["dynamite"],"eicate":["delicate"],"elcate":["delicate"],"eliate":["delicate"],"elicae":["delicate"],"elicat":["delicate"],"elicate":["delicate"],"elicte":["delicate"],"gger":["bigger"],"ggest":["biggest"],"ig":["big"],"iger":["bigger"],"igest":["biggest"],"igge":["bigger"],"igger":["bigger"],"igges":["biggest"],"iggest":["biggest"],"igget":["biggest"],"iggr":["bigger"],"iggst":["biggest"],"licate":["delicate"],"namite":["dynamite"],"nce":["uncle"],"ncl":["uncle"],"ncle":["uncle"],"nle":["uncle"],"nty":["aunty"],"o're":["you're"],"ob":["bob"],"or":["your"],"ou":["your"],"ou'e":["you're"],"ou'r":["you're"],"ou're":["you're"],"our":["your"],"oure":["you're"],"u're":["you're"],"uce":["uncle"],"ucl":["uncle"],"ucle":["uncle"],"ule":["uncle"],"unc":["uncle"],"unce":["uncle"],"uncl":["uncle"],"uncle":["uncle"],"une":["uncle"],"unl":["uncle"],"unle":["uncle"],"unt":["aunty"],"unty":["aunty"],"uny":["aunty"],"ur":["your"],"uty":["aunty"],"y're":["you're"],"yamite":["dynamite"],"ynaite":["dynamite"],"ynamie":["dynamite"],"ynamit":["dynamite"],"ynamite":["dynamite"],"ynamte":["dynamite"],"ynmite":["dynamite"],"yo":["your"],"yo'e":["you're"],"yo'r":["you're"],"yo're":["you're"],"yor":["your"],"yore":["you're"],"you":["your"],"you'":["you're"],"you'e":["you're"],"you'r":["you're"],"you're":["you're"],"youe":["you're"],"your":["your","you're"],"youre":["you're"],"yr":["your"],"yu":["your"],"yu'e":["you're"],"yu'r":["you're"],"yu're":["you're"],"yur":["your"],"yure":["you're"]},"depth":2,"threshold":1,"autocomplete":true} 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fuzzy 2 | [![Build 
Status](https://travis-ci.org/sajari/fuzzy.svg?branch=master)](https://travis-ci.org/sajari/fuzzy) 3 | 4 | Fuzzy is a very fast spell checker and query suggester written in Golang. 5 | 6 | Motivation: 7 | - Sajari uses very large queries (hundreds of words) but needs to respond sub-second to these queries where possible. Common spell check algorithms are quite slow or very resource intensive. 8 | - The aim was to achieve spell checks in sub 100usec per word (10,000 / second single core) with at least 60% accuracy and multi-language support. 9 | - Currently we see sub 40usec per word and ~70% accuracy for a Levenshtein distance of 2 chars on a 2012 macbook pro (english test set comes from Peter Norvig's article, see http://norvig.com/spell-correct.html). 10 | - A 500 word query can be spell checked in ~0.02 sec / cpu cores, which is good enough for us. 11 | 12 | Notes: 13 | - It is currently executed as a single goroutine per lookup, so undoubtedly this could be much faster using multiple cores, but currently the speed is quite good. 14 | - Accuracy is hit slightly because several correct words don't appear at all in the training text (data/big.txt). 15 | - Fuzzy is a "Symmetric Delete Spelling Corrector", which relates to some blogs by Wolf Garbe at Faroo.com (see http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/) 16 | 17 | Config: 18 | - Generally no config is required, but you can tweak the model for your application. 19 | - `"threshold"` is the trigger point when a word becomes popular enough to build lookup keys for it. Setting this to "1" means any instance of a given word makes it a legitimate spelling. This typically corrects the most errors, but can also cause false positives if incorrect spellings exist in the training data. It also causes a much larger index to be built. By default this is set to 4. 20 | - `"depth"` is the Levenshtein distance the model builds lookup keys for. 
For spelling correction, a setting of "2" is typically very good. At a distance of "3" the potential number of words is much, much larger, but adds little benefit to accuracy. For query prediction a larger number can be useful, but again is much more expensive. **A depth of "1" and threshold of "1" for the 1st Norvig test set gives ~70% correction accuracy at ~5usec per check (e.g. ~200kHz)**, for many applications this will be good enough. At depths > 2, the false positives begin to hurt the accuracy. 21 | 22 | Future improvements: 23 | - Make some of the expensive processes concurrent. 24 | - Add spelling checks for different languages. If you have misspellings in different languages please add them or send to us. 25 | - Allow the term-score map to be read from an external term set (e.g. integrating this currently may double up on keeping a term count). 26 | - Currently there is no method to delete lookup keys, so potentially this may cause bloating over time if the dictionary changes significantly. 27 | - Add right to left deletion beyond Levenshtein config depth (e.g. don't process all deletes except for query predictors). 28 | 29 | Usage: 30 | - Below is some example code showing how to use the package. 31 | - An example showing how to train with a static set of words is contained in the fuzzy_test.go file, which uses the "big.txt" file to create an English dictionary. 32 | - To integrate with your application (e.g. custom dictionary / word popularity), use the single word and multiword training functions shown in the example below. Each time you add a new instance of a given word, pass it to this function. The model will keep a count and will create suggestion keys for the word once it reaches the popularity threshold. 33 | - We haven't tested with other languages, but this should work fine. Please let us know how you go? 
`support@sajari.com` 34 | 35 | 36 | ```go 37 | package main 38 | 39 | import( 40 | "github.com/sajari/fuzzy" 41 | "fmt" 42 | ) 43 | 44 | func main() { 45 | model := fuzzy.NewModel() 46 | 47 | // For testing only, this is not advisable on production 48 | model.SetThreshold(1) 49 | 50 | // This expands the distance searched, but costs more resources (memory and time). 51 | // For spell checking, "2" is typically enough, for query suggestions this can be higher 52 | model.SetDepth(5) 53 | 54 | // Train multiple words simultaneously by passing an array of strings to the "Train" function 55 | words := []string{"bob", "your", "uncle", "dynamite", "delicate", "biggest", "big", "bigger", "aunty", "you're"} 56 | model.Train(words) 57 | 58 | // Train word by word (typically triggered in your application once a given word is popular enough) 59 | model.TrainWord("single") 60 | 61 | // Check Spelling 62 | fmt.Println("\nSPELL CHECKS") 63 | fmt.Println(" Deletion test (yor) : ", model.SpellCheck("yor")) 64 | fmt.Println(" Swap test (uncel) : ", model.SpellCheck("uncel")) 65 | fmt.Println(" Replace test (dynemite) : ", model.SpellCheck("dynemite")) 66 | fmt.Println(" Insert test (dellicate) : ", model.SpellCheck("dellicate")) 67 | fmt.Println(" Two char test (dellicade) : ", model.SpellCheck("dellicade")) 68 | 69 | // Suggest completions 70 | fmt.Println("\nQUERY SUGGESTIONS") 71 | fmt.Println(" \"bigge\". Did you mean?: ", model.Suggestions("bigge", false)) 72 | fmt.Println(" \"bo\". Did you mean?: ", model.Suggestions("bo", false)) 73 | fmt.Println(" \"dyn\". Did you mean?: ", model.Suggestions("dyn", false)) 74 | 75 | // Autocomplete suggestions 76 | suggested, _ := model.Autocomplete("bi") 77 | fmt.Printf(" \"bi\". 
Suggestions: %v", suggested) 78 | 79 | } 80 | ``` -------------------------------------------------------------------------------- /fuzzy.go: -------------------------------------------------------------------------------- 1 | package fuzzy 2 | 3 | import ( 4 | "bufio" 5 | "encoding/json" 6 | "errors" 7 | "fmt" 8 | "index/suffixarray" 9 | "io" 10 | "log" 11 | "os" 12 | "regexp" 13 | "sort" 14 | "strings" 15 | "sync" 16 | ) 17 | 18 | const ( 19 | SpellDepthDefault = 2 20 | SpellThresholdDefault = 5 21 | SuffDivergenceThresholdDefault = 100 22 | ) 23 | 24 | type Pair struct { 25 | str1 string 26 | str2 string 27 | } 28 | 29 | type Method int 30 | 31 | const ( 32 | MethodIsWord Method = 0 33 | MethodSuggestMapsToInput = 1 34 | MethodInputDeleteMapsToDict = 2 35 | MethodInputDeleteMapsToSuggest = 3 36 | ) 37 | 38 | type Potential struct { 39 | Term string // Potential term string 40 | Score int // Score 41 | Leven int // Levenstein distance from the suggestion to the input 42 | Method Method // How this potential was matched 43 | } 44 | 45 | type Counts struct { 46 | Corpus int `json:"corpus"` 47 | Query int `json:"query"` 48 | } 49 | 50 | type Model struct { 51 | Data map[string]*Counts `json:"data"` 52 | Maxcount int `json:"maxcount"` 53 | Suggest map[string][]string `json:"suggest"` 54 | Depth int `json:"depth"` 55 | Threshold int `json:"threshold"` 56 | UseAutocomplete bool `json:"autocomplete"` 57 | SuffDivergence int `json:"-"` 58 | SuffDivergenceThreshold int `json:"suff_threshold"` 59 | SuffixArr *suffixarray.Index `json:"-"` 60 | SuffixArrConcat string `json:"-"` 61 | sync.RWMutex 62 | } 63 | 64 | // For sorting autocomplete suggestions 65 | // to bias the most popular first 66 | type Autos struct { 67 | Results []string 68 | Model *Model 69 | } 70 | 71 | func (a Autos) Len() int { return len(a.Results) } 72 | func (a Autos) Swap(i, j int) { a.Results[i], a.Results[j] = a.Results[j], a.Results[i] } 73 | 74 | func (a Autos) Less(i, j int) bool { 75 | icc := 
a.Model.Data[a.Results[i]].Corpus 76 | jcc := a.Model.Data[a.Results[j]].Corpus 77 | icq := a.Model.Data[a.Results[i]].Query 78 | jcq := a.Model.Data[a.Results[j]].Query 79 | if icq == jcq { 80 | if icc == jcc { 81 | return a.Results[i] > a.Results[j] 82 | } 83 | return icc > jcc 84 | } 85 | return icq > jcq 86 | } 87 | 88 | func (m Method) String() string { 89 | switch m { 90 | case MethodIsWord: 91 | return "Input in dictionary" 92 | case MethodSuggestMapsToInput: 93 | return "Suggest maps to input" 94 | case MethodInputDeleteMapsToDict: 95 | return "Input delete maps to dictionary" 96 | case MethodInputDeleteMapsToSuggest: 97 | return "Input delete maps to suggest key" 98 | } 99 | return "unknown" 100 | } 101 | 102 | func (pot *Potential) String() string { 103 | return fmt.Sprintf("Term: %v\n\tScore: %v\n\tLeven: %v\n\tMethod: %v\n\n", pot.Term, pot.Score, pot.Leven, pot.Method) 104 | } 105 | 106 | // Create and initialise a new model 107 | func NewModel() *Model { 108 | model := new(Model) 109 | return model.Init() 110 | } 111 | 112 | func (model *Model) Init() *Model { 113 | model.Data = make(map[string]*Counts) 114 | model.Suggest = make(map[string][]string) 115 | model.Depth = SpellDepthDefault 116 | model.Threshold = SpellThresholdDefault // Setting this to 1 is most accurate, but "1" is 5x more memory and 30x slower processing than "4". 
This is a big performance tuning knob 117 | model.UseAutocomplete = true // Default is to include Autocomplete 118 | model.updateSuffixArr() 119 | model.SuffDivergenceThreshold = SuffDivergenceThresholdDefault 120 | return model 121 | } 122 | 123 | // WriteTo writes a model to a Writer 124 | func (model *Model) WriteTo(w io.Writer) (int64, error) { 125 | model.RLock() 126 | defer model.RUnlock() 127 | b, err := json.Marshal(model) 128 | if err != nil { 129 | return 0, err 130 | } 131 | n, err := w.Write(b) 132 | if err != nil { 133 | return int64(n), err 134 | } 135 | return int64(n), nil 136 | } 137 | 138 | // Save a spelling model to disk 139 | func (model *Model) Save(filename string) error { 140 | f, err := os.Create(filename) 141 | if err != nil { 142 | log.Println("Fuzzy model:", err) 143 | return err 144 | } 145 | defer f.Close() 146 | _, err = model.WriteTo(f) 147 | if err != nil { 148 | log.Println("Fuzzy model:", err) 149 | return err 150 | } 151 | return nil 152 | } 153 | 154 | // Save a spelling model to disk, but discard all 155 | // entries less than the threshold number of occurences 156 | // Much smaller and all that is used when generated 157 | // as a once off, but not useful for incremental usage 158 | func (model *Model) SaveLight(filename string) error { 159 | model.Lock() 160 | for term, count := range model.Data { 161 | if count.Corpus < model.Threshold { 162 | delete(model.Data, term) 163 | } 164 | } 165 | model.Unlock() 166 | return model.Save(filename) 167 | } 168 | 169 | // FromReader loads a model from a Reader 170 | func FromReader(r io.Reader) (*Model, error) { 171 | model := new(Model) 172 | d := json.NewDecoder(r) 173 | err := d.Decode(model) 174 | if err != nil { 175 | return nil, err 176 | } 177 | model.updateSuffixArr() 178 | return model, nil 179 | } 180 | 181 | // Load a saved model from disk 182 | func Load(filename string) (*Model, error) { 183 | f, err := os.Open(filename) 184 | if err != nil { 185 | return nil, err 186 | } 
187 | defer f.Close() 188 | model, err := FromReader(f) 189 | if err != nil { 190 | model = new(Model) 191 | if err1 := model.convertOldFormat(filename); err1 != nil { 192 | return model, err1 193 | } 194 | return model, nil 195 | } 196 | return model, nil 197 | } 198 | 199 | // Change the default depth value of the model. This sets how many 200 | // character differences are indexed. The default is 2. 201 | func (model *Model) SetDepth(val int) { 202 | model.Lock() 203 | model.Depth = val 204 | model.Unlock() 205 | } 206 | 207 | // Change the default threshold of the model. This is how many times 208 | // a term must be seen before suggestions are created for it 209 | func (model *Model) SetThreshold(val int) { 210 | model.Lock() 211 | model.Threshold = val 212 | model.Unlock() 213 | } 214 | 215 | // Optionally disabled suffixarray based autocomplete support 216 | func (model *Model) SetUseAutocomplete(val bool) { 217 | model.Lock() 218 | old := model.UseAutocomplete 219 | model.Unlock() 220 | model.UseAutocomplete = val 221 | if !old && val { 222 | model.updateSuffixArr() 223 | } 224 | } 225 | 226 | // Optionally set the suffix array divergence threshold. This is 227 | // the number of query training steps between rebuilds of the 228 | // suffix array. A low number will be more accurate but will use 229 | // resources and create more garbage. 
230 | func (model *Model) SetDivergenceThreshold(val int) { 231 | model.Lock() 232 | model.SuffDivergenceThreshold = val 233 | model.Unlock() 234 | } 235 | 236 | // Calculate the Levenshtein distance between two strings 237 | func Levenshtein(a, b *string) int { 238 | la := len(*a) 239 | lb := len(*b) 240 | d := make([]int, la+1) 241 | var lastdiag, olddiag, temp int 242 | 243 | for i := 1; i <= la; i++ { 244 | d[i] = i 245 | } 246 | for i := 1; i <= lb; i++ { 247 | d[0] = i 248 | lastdiag = i - 1 249 | for j := 1; j <= la; j++ { 250 | olddiag = d[j] 251 | min := d[j] + 1 252 | if (d[j-1] + 1) < min { 253 | min = d[j-1] + 1 254 | } 255 | if (*a)[j-1] == (*b)[i-1] { 256 | temp = 0 257 | } else { 258 | temp = 1 259 | } 260 | if (lastdiag + temp) < min { 261 | min = lastdiag + temp 262 | } 263 | d[j] = min 264 | lastdiag = olddiag 265 | } 266 | } 267 | return d[la] 268 | } 269 | 270 | // Add an array of words to train the model in bulk 271 | func (model *Model) Train(terms []string) { 272 | for _, term := range terms { 273 | model.TrainWord(term) 274 | } 275 | model.updateSuffixArr() 276 | } 277 | 278 | // Manually set the count of a word. Optionally trigger the 279 | // creation of suggestion keys for the term. This function lets 280 | // you build a model from an existing dictionary with word popularity 281 | // counts without needing to run "TrainWord" repeatedly 282 | func (model *Model) SetCount(term string, count int, suggest bool) { 283 | model.Lock() 284 | model.Data[term] = &Counts{count, 0} // Note: This may reset a query count? TODO 285 | if suggest { 286 | model.createSuggestKeys(term) 287 | } 288 | model.Unlock() 289 | } 290 | 291 | // Train the model word by word. This is corpus training as opposed 292 | // to query training. 
Word counts from this type of training are not 293 | // likely to correlate with those of search queries 294 | func (model *Model) TrainWord(term string) { 295 | model.Lock() 296 | if t, ok := model.Data[term]; ok { 297 | t.Corpus++ 298 | } else { 299 | model.Data[term] = &Counts{1, 0} 300 | } 301 | // Set the max 302 | if model.Data[term].Corpus > model.Maxcount { 303 | model.Maxcount = model.Data[term].Corpus 304 | model.SuffDivergence++ 305 | } 306 | // If threshold is triggered, store delete suggestion keys 307 | if model.Data[term].Corpus == model.Threshold { 308 | model.createSuggestKeys(term) 309 | } 310 | model.Unlock() 311 | } 312 | 313 | // Train using a search query term. This builds a second popularity 314 | // index of terms used to search, as opposed to generally occurring 315 | // in corpus text 316 | func (model *Model) TrainQuery(term string) { 317 | model.Lock() 318 | if t, ok := model.Data[term]; ok { 319 | t.Query++ 320 | } else { 321 | model.Data[term] = &Counts{0, 1} 322 | } 323 | model.SuffDivergence++ 324 | update := model.SuffDivergence > model.SuffDivergenceThreshold 325 | model.Unlock() 326 | if update { 327 | model.updateSuffixArr() 328 | } 329 | } 330 | 331 | // For a given term, create the partially deleted lookup keys 332 | func (model *Model) createSuggestKeys(term string) { 333 | edits := model.EditsMulti(term, model.Depth) 334 | for _, edit := range edits { 335 | if len(edit) <= 1 { 336 | continue 337 | } 338 | skip := false 339 | for _, hit := range model.Suggest[edit] { 340 | if hit == term { 341 | // Already know about this one 342 | skip = true 343 | break 344 | } 345 | } 346 | if !skip { 347 | model.Suggest[edit] = append(model.Suggest[edit], term) 348 | } 349 | } 350 | } 351 | 352 | // Edits at any depth for a given term. 
The depth of the model is used 353 | func (model *Model) EditsMulti(term string, depth int) []string { 354 | edits := Edits1(term) 355 | for { 356 | depth-- 357 | if depth <= 0 { 358 | break 359 | } 360 | for _, edit := range edits { 361 | edits = append(edits, Edits1(edit)...) 362 | } 363 | } 364 | return edits 365 | } 366 | 367 | // Edits1 creates a set of terms that are 1 char delete from the input term 368 | func Edits1(word string) []string { 369 | total_set := make([]string, 0, len(word)+2) 370 | for i := 0; i < len(word); i++ { 371 | // delete ith character 372 | total_set = append(total_set, word[:i]+word[i+1:]) 373 | } 374 | 375 | total_set = append(total_set, word) 376 | 377 | // Special case ending in "ies" or "ys" 378 | if strings.HasSuffix(word, "ies") { 379 | total_set = append(total_set, word[:len(word)-3]+"ys") 380 | } 381 | if strings.HasSuffix(word, "ys") { 382 | total_set = append(total_set, word[:len(word)-2]+"ies") 383 | } 384 | 385 | return total_set 386 | } 387 | 388 | func (model *Model) corpusCount(input string) int { 389 | if score, ok := model.Data[input]; ok { 390 | return score.Corpus 391 | } 392 | return 0 393 | } 394 | 395 | // From a group of potentials, work out the most likely result 396 | func best(input string, potential map[string]*Potential) string { 397 | var best string 398 | var bestcalc, bonus int 399 | for i := 0; i < 4; i++ { 400 | for _, pot := range potential { 401 | if pot.Leven == 0 { 402 | return pot.Term 403 | } else if pot.Leven == i { 404 | bonus = 0 405 | // If the first letter is the same, that's a good sign. 
Bias these potentials 406 | if pot.Term[0] == input[0] { 407 | bonus += 100 408 | } 409 | if pot.Score+bonus > bestcalc { 410 | bestcalc = pot.Score + bonus 411 | best = pot.Term 412 | } 413 | } 414 | } 415 | if bestcalc > 0 { 416 | return best 417 | } 418 | } 419 | return best 420 | } 421 | 422 | // From a group of potentials, work out the most likely results, in order of 423 | // best to worst 424 | func bestn(input string, potential map[string]*Potential, n int) []string { 425 | var output []string 426 | for i := 0; i < n; i++ { 427 | if len(potential) == 0 { 428 | break 429 | } 430 | b := best(input, potential) 431 | output = append(output, b) 432 | delete(potential, b) 433 | } 434 | return output 435 | } 436 | 437 | // Test an input, if we get it wrong, look at why it is wrong. This 438 | // function returns a bool indicating if the guess was correct as well 439 | // as the term it is suggesting. Typically this function would be used 440 | // for testing, not for production 441 | func (model *Model) CheckKnown(input string, correct string) bool { 442 | model.RLock() 443 | defer model.RUnlock() 444 | suggestions := model.suggestPotential(input, true) 445 | best := best(input, suggestions) 446 | if best == correct { 447 | // This guess is correct 448 | fmt.Printf("Input correctly maps to correct term") 449 | return true 450 | } 451 | if pot, ok := suggestions[correct]; !ok { 452 | 453 | if model.corpusCount(correct) > 0 { 454 | fmt.Printf("\"%v\" - %v (%v) not in the suggestions. (%v) best option.\n", input, correct, model.corpusCount(correct), best) 455 | for _, sugg := range suggestions { 456 | fmt.Printf(" %v\n", sugg) 457 | } 458 | } else { 459 | fmt.Printf("\"%v\" - Not in dictionary\n", correct) 460 | } 461 | } else { 462 | fmt.Printf("\"%v\" - (%v) suggested, should however be (%v).\n", input, suggestions[best], pot) 463 | } 464 | return false 465 | } 466 | 467 | // For a given input term, suggest some alternatives. 
If exhaustive, each of the 4 468 | // cascading checks will be performed and all potentials will be sorted accordingly 469 | func (model *Model) suggestPotential(input string, exhaustive bool) map[string]*Potential { 470 | input = strings.ToLower(input) 471 | suggestions := make(map[string]*Potential, 20) 472 | 473 | // 0 - If this is a dictionary term we're all good, no need to go further 474 | if model.corpusCount(input) > model.Threshold { 475 | suggestions[input] = &Potential{Term: input, Score: model.corpusCount(input), Leven: 0, Method: MethodIsWord} 476 | if !exhaustive { 477 | return suggestions 478 | } 479 | } 480 | 481 | // 1 - See if the input matches a "suggest" key 482 | if sugg, ok := model.Suggest[input]; ok { 483 | for _, pot := range sugg { 484 | if _, ok := suggestions[pot]; !ok { 485 | suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: Levenshtein(&input, &pot), Method: MethodSuggestMapsToInput} 486 | } 487 | } 488 | 489 | if !exhaustive { 490 | return suggestions 491 | } 492 | } 493 | 494 | // 2 - See if edit1 matches input 495 | max := 0 496 | edits := model.EditsMulti(input, model.Depth) 497 | for _, edit := range edits { 498 | score := model.corpusCount(edit) 499 | if score > 0 && len(edit) > 2 { 500 | if _, ok := suggestions[edit]; !ok { 501 | suggestions[edit] = &Potential{Term: edit, Score: score, Leven: Levenshtein(&input, &edit), Method: MethodInputDeleteMapsToDict} 502 | } 503 | if score > max { 504 | max = score 505 | } 506 | } 507 | } 508 | if max > 0 { 509 | if !exhaustive { 510 | return suggestions 511 | } 512 | } 513 | 514 | // 3 - No hits on edit1 distance, look for transposes and replaces 515 | // Note: these are more complex, we need to check the guesses 516 | // more thoroughly, e.g. levals=[valves] in a raw sense, which 517 | // is incorrect 518 | for _, edit := range edits { 519 | if sugg, ok := model.Suggest[edit]; ok { 520 | // Is this a real transpose or replace? 
521 | for _, pot := range sugg { 522 | lev := Levenshtein(&input, &pot) 523 | if lev <= model.Depth+1 { // The +1 doesn't seem to impact speed, but has greater coverage when the depth is not sufficient to make suggestions 524 | if _, ok := suggestions[pot]; !ok { 525 | suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: lev, Method: MethodInputDeleteMapsToSuggest} 526 | } 527 | } 528 | } 529 | } 530 | } 531 | return suggestions 532 | } 533 | 534 | // Return the raw potential terms so they can be ranked externally 535 | // to this package 536 | func (model *Model) Potentials(input string, exhaustive bool) map[string]*Potential { 537 | model.RLock() 538 | defer model.RUnlock() 539 | return model.suggestPotential(input, exhaustive) 540 | } 541 | 542 | // For a given input string, suggests potential replacements 543 | func (model *Model) Suggestions(input string, exhaustive bool) []string { 544 | model.RLock() 545 | suggestions := model.suggestPotential(input, exhaustive) 546 | model.RUnlock() 547 | output := make([]string, 0, 10) 548 | for _, suggestion := range suggestions { 549 | output = append(output, suggestion.Term) 550 | } 551 | return output 552 | } 553 | 554 | // Return the most likely correction for the input term 555 | func (model *Model) SpellCheck(input string) string { 556 | model.RLock() 557 | suggestions := model.suggestPotential(input, false) 558 | model.RUnlock() 559 | return best(input, suggestions) 560 | } 561 | 562 | // Return the most likely corrections in order from best to worst 563 | func (model *Model) SpellCheckSuggestions(input string, n int) []string { 564 | model.RLock() 565 | suggestions := model.suggestPotential(input, true) 566 | model.RUnlock() 567 | return bestn(input, suggestions, n) 568 | } 569 | 570 | func SampleEnglish() []string { 571 | var out []string 572 | file, err := os.Open("data/big.txt") 573 | if err != nil { 574 | fmt.Println(err) 575 | return out 576 | } 577 | reader := bufio.NewReader(file) 
578 | scanner := bufio.NewScanner(reader) 579 | scanner.Split(bufio.ScanLines) 580 | // Count the words. 581 | count := 0 582 | for scanner.Scan() { 583 | exp, _ := regexp.Compile("[a-zA-Z]+") 584 | words := exp.FindAll([]byte(scanner.Text()), -1) 585 | for _, word := range words { 586 | if len(word) > 1 { 587 | out = append(out, strings.ToLower(string(word))) 588 | count++ 589 | } 590 | } 591 | } 592 | if err := scanner.Err(); err != nil { 593 | fmt.Fprintln(os.Stderr, "reading input:", err) 594 | } 595 | 596 | return out 597 | } 598 | 599 | // Takes the known dictionary listing and creates a suffix array 600 | // model for these terms. If a model already existed, it is discarded 601 | func (model *Model) updateSuffixArr() { 602 | if !model.UseAutocomplete { 603 | return 604 | } 605 | model.RLock() 606 | termArr := make([]string, 0, 1000) 607 | for term, count := range model.Data { 608 | if count.Corpus > model.Threshold || count.Query > 0 { // TODO: query threshold? 609 | termArr = append(termArr, term) 610 | } 611 | } 612 | model.SuffixArrConcat = "\x00" + strings.Join(termArr, "\x00") + "\x00" 613 | model.SuffixArr = suffixarray.New([]byte(model.SuffixArrConcat)) 614 | model.SuffDivergence = 0 615 | model.RUnlock() 616 | } 617 | 618 | // For a given string, autocomplete using the suffix array model 619 | func (model *Model) Autocomplete(input string) ([]string, error) { 620 | model.RLock() 621 | defer model.RUnlock() 622 | if !model.UseAutocomplete { 623 | return []string{}, errors.New("Autocomplete is disabled") 624 | } 625 | if len(input) == 0 { 626 | return []string{}, errors.New("Input cannot have length zero") 627 | } 628 | express := "\x00" + input + "[^\x00]*" 629 | match, err := regexp.Compile(express) 630 | if err != nil { 631 | return []string{}, err 632 | } 633 | matches := model.SuffixArr.FindAllIndex(match, -1) 634 | a := &Autos{Results: make([]string, 0, len(matches)), Model: model} 635 | for _, m := range matches { 636 | str := 
strings.Trim(model.SuffixArrConcat[m[0]:m[1]], "\x00") 637 | if count, ok := model.Data[str]; ok { 638 | if count.Corpus > model.Threshold || count.Query > 0 { 639 | a.Results = append(a.Results, str) 640 | } 641 | } 642 | } 643 | sort.Sort(a) 644 | if len(a.Results) >= 10 { 645 | return a.Results[:10], nil 646 | } 647 | return a.Results, nil 648 | } 649 | -------------------------------------------------------------------------------- /fuzzy_test.go: -------------------------------------------------------------------------------- 1 | package fuzzy 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "runtime" 7 | "strings" 8 | "sync" 9 | "testing" 10 | "time" 11 | ) 12 | 13 | var sampleEnglish []string 14 | 15 | func init() { 16 | sampleEnglish = SampleEnglish() 17 | } 18 | 19 | func TestSpelling(t *testing.T) { 20 | model := NewModel() 21 | 22 | // For testing only, this is not advisable on production 23 | model.SetThreshold(1) 24 | 25 | // Train multiple words simultaneously 26 | words := []string{"bob", "your", "uncle", "dynamite", "delicate", "biggest", "big", "bigger", "aunty", "you're", "bob", "your"} 27 | model.Train(words) 28 | 29 | // Check Spelling 30 | if model.SpellCheck("yor") != "your" { 31 | t.Errorf("Spell check: Single char delete failed") 32 | } 33 | if model.SpellCheck("uncel") != "uncle" { 34 | t.Errorf("Spell check: Single char transpose failed") 35 | } 36 | if model.SpellCheck("dynemite") != "dynamite" { 37 | t.Errorf("Spell check: Single char swap failed") 38 | } 39 | if model.SpellCheck("dellicate") != "delicate" { 40 | t.Errorf("Spell check: Single char insertion failed") 41 | } 42 | if model.SpellCheck("dellicade") != "delicate" { 43 | t.Errorf("Spell check: Two char change failed") 44 | } 45 | } 46 | 47 | func TestSpellingSuggestions(t *testing.T) { 48 | model := NewModel() 49 | 50 | // For testing only, this is not advisable on production 51 | model.SetThreshold(1) 52 | 53 | // Train multiple words simultaneously 54 | words := []string{"bob", 
"your", "uncle", "dynamite", "delicate", "biggest", "biggest", "big", "bigger", "aunty", "you're", "bob", "your"} 55 | model.Train(words) 56 | 57 | // Check Spelling 58 | if model.SpellCheckSuggestions("yor", 2)[0] != "your" { 59 | t.Errorf("Spell check: Single char delete failed") 60 | } 61 | if model.SpellCheckSuggestions("uncel", 2)[0] != "uncle" { 62 | t.Errorf("Spell check: Single char transpose failed") 63 | } 64 | if model.SpellCheckSuggestions("dynemite", 2)[0] != "dynamite" { 65 | t.Errorf("Spell check: Single char swap failed") 66 | } 67 | if model.SpellCheckSuggestions("dellicate", 2)[0] != "delicate" { 68 | t.Errorf("Spell check: Single char insertion failed") 69 | } 70 | if model.SpellCheckSuggestions("dellicade", 2)[0] != "delicate" { 71 | t.Errorf("Spell check: Two char change failed") 72 | } 73 | 74 | suggestions := model.SpellCheckSuggestions("bigge", 2) 75 | if suggestions[0] != "bigger" { 76 | t.Errorf("Spell check suggestions, Single char delete is closest") 77 | } 78 | // "biggest" should be before "big" since it appears twice 79 | if suggestions[1] != "biggest" { 80 | t.Errorf("Spell check suggestions, Double char delete 2nd closest") 81 | } 82 | } 83 | 84 | func TestSuggestions(t *testing.T) { 85 | model := NewModel() 86 | 87 | // For testing only, this is not advisable on production 88 | model.SetThreshold(1) 89 | 90 | // Train multiple words simultaneously 91 | words := []string{"bob", "your", "uncle", "dynamite", "delicate", "biggest", "big", "bigger", "aunty", "you're"} 92 | model.Train(words) 93 | 94 | // Train word by word 95 | model.TrainWord("single") 96 | 97 | // Suggest completions 98 | potential := model.Suggestions("bigge", false) 99 | bigger := false 100 | biggest := false 101 | for _, term := range potential { 102 | if term == "bigger" { 103 | bigger = true 104 | } 105 | if term == "biggest" { 106 | biggest = true 107 | } 108 | } 109 | if !biggest || !bigger { 110 | t.Errorf("Suggestions are missing values that should be there") 
111 | } 112 | } 113 | 114 | func TestManualTermAddition(t *testing.T) { 115 | model := NewModel() 116 | model.SetThreshold(4) 117 | 118 | model.SetCount("elephant", 10, true) 119 | 120 | if model.SpellCheck("elphant") != "elephant" { 121 | t.Errorf("Spell check: manual term addition didn't work") 122 | } 123 | } 124 | 125 | // Not exhaustive, but shows training and spell checks can run concurrently 126 | func TestConcurrency(t *testing.T) { 127 | cpu := runtime.NumCPU() 128 | runtime.GOMAXPROCS(cpu) 129 | model := NewModel() 130 | 131 | piece := len(sampleEnglish) / cpu 132 | 133 | var wg sync.WaitGroup 134 | // Train concurrently 135 | for i := 0; i < cpu; i++ { 136 | wg.Add(1) 137 | go func(i int) { 138 | begin := i * piece 139 | end := (i+1)*piece - 1 140 | model.Train(sampleEnglish[begin:end]) 141 | wg.Done() 142 | }(i) 143 | } 144 | wg.Wait() 145 | 146 | // Test concurrently 147 | words := []string{"bob", "your", "uncle", "dynmite", "delidate", "bgigest", "bigr", "biger", "arnty", "you're"} 148 | for i := 0; i < cpu; i++ { 149 | wg.Add(1) 150 | go func() { 151 | for _, word := range words { 152 | model.SpellCheck(word) 153 | } 154 | wg.Done() 155 | }() 156 | } 157 | wg.Wait() 158 | } 159 | 160 | func TestColdInit(t *testing.T) { 161 | model := NewModel() 162 | _, err := model.Autocomplete("a") 163 | if err != nil { 164 | t.Errorf("Failed to init and autocomplete: %v", err) 165 | } 166 | } 167 | 168 | // Accuracy test sets come from Peter Norvig's set 169 | // The big.txt file is also from Peter Norvig's set. 
This helps to define a decent 170 | // dictionary, although it is still missing some of the common words in the test sets 171 | // We aim for > 60% correction success at a rate of > 5000Hz (single threaded) 172 | func TestAccuracy(t *testing.T) { 173 | const test2AccuracyThreshold = .60 174 | 175 | tests1 := map[string]string{"access": "acess", "accessing": "accesing", "accommodation": "accomodation acommodation acomodation", "account": "acount", "address": "adress adres", "addressable": "addresable", "arranged": "aranged arrainged", 176 | "arranging": "aranging", "arrangement": "arragment", "articles": "articals", 177 | "aunt": "annt anut arnt", "auxiliary": "auxillary", "available": "avaible", 178 | "awful": "awfall afful", "basically": "basicaly", "beginning": "begining", 179 | "benefit": "benifit", "benefits": "benifits", "between": "beetween", "bicycle": "bicycal bycicle bycycle", "biscuits": "biscits biscutes biscuts bisquits buiscits buiscuts", "built": "biult", 180 | "cake": "cak", "career": "carrer", 181 | "cemetery": "cemetary semetary", "centrally": "centraly", "certain": "cirtain", 182 | "challenges": "chalenges chalenges", "chapter": "chaper chaphter chaptur", 183 | "choice": "choise", "choosing": "chosing", "clerical": "clearical", 184 | "committee": "comittee", "compare": "compair", "completely": "completly", 185 | "consider": "concider", "considerable": "conciderable", "contented": "contenpted contende contended contentid", "curtains": "cartains certans courtens cuaritains curtans curtians curtions", "decide": "descide", "decided": "descided", "definitely": "definately difinately", "definition": "defenition", 186 | "definitions": "defenitions", "description": "discription", "desiccate": "desicate dessicate dessiccate", "diagrammatically": "diagrammaticaally", 187 | "different": "diffrent", "driven": "dirven", "ecstasy": "exstacy ecstacy", 188 | "embarrass": "embaras embarass", "establishing": "astablishing establising", 189 | "experience": 
"experance experiance", "experiences": "experances", "extended": "extented", "extremely": "extreamly", "fails": "failes", "families": "familes", 190 | "february": "febuary", "further": "futher", "gallery": "galery gallary gallerry gallrey", 191 | "hierarchal": "hierachial", "hierarchy": "hierchy", "inconvenient": "inconvienient inconvient inconvinient", "independent": "independant independant", 192 | "initial": "intial", "initials": "inetials inistals initails initals intials", 193 | "juice": "guic juce jucie juise juse", "latest": "lates latets latiest latist", 194 | "laugh": "lagh lauf laught lugh", "level": "leval", 195 | "levels": "levals", "liaison": "liaision liason", "lieu": "liew", "literature": "litriture", "loans": "lones", "locally": "localy", "magnificent": "magnificnet magificent magnifcent magnifecent magnifiscant magnifisent magnificant", 196 | "management": "managment", "meant": "ment", "minuscule": "miniscule", 197 | "minutes": "muinets", "monitoring": "monitering", "necessary": "neccesary necesary neccesary necassary necassery neccasary", "occurrence": "occurence occurence", "often": "ofen offen offten ofton", "opposite": "opisite oppasite oppesite oppisit oppisite opposit oppossite oppossitte", "parallel": "paralel paralell parrallel parralell parrallell", "particular": "particulaur", 198 | "perhaps": "perhapse", "personnel": "personnell", "planned": "planed", "poem": "poame", "poems": "poims pomes", "poetry": "poartry poertry poetre poety powetry", 199 | "position": "possition", "possible": "possable", "pretend": "pertend protend prtend pritend", "problem": "problam proble promblem proplen", 200 | "pronunciation": "pronounciation", "purple": "perple perpul poarple", 201 | "questionnaire": "questionaire", "really": "realy relley relly", "receipt": "receit receite reciet recipt", "receive": "recieve", "refreshment": "reafreshment refreshmant refresment refressmunt", "remember": "rember remeber rememmer rermember", 202 | "remind": "remine remined", 
"scarcely": "scarcly scarecly scarely scarsely", 203 | "scissors": "scisors sissors", "separate": "seperate", 204 | "singular": "singulaur", "someone": "somone", "sources": "sorces", "southern": "southen", "special": "speaical specail specal speical", "splendid": "spledid splended splened splended", "standardizing": "stanerdizing", "stomach": "stomac stomache stomec stumache", "supersede": "supercede superceed", "there": "ther", 205 | "totally": "totaly", "transferred": "transfred", "transportability": "transportibility", "triangular": "triangulaur", "understand": "undersand undistand", 206 | "unexpected": "unexpcted unexpeted unexspected", "unfortunately": "unfortunatly", "unique": "uneque", "useful": "usefull", "valuable": "valubale valuble", 207 | "variable": "varable", "variant": "vairiant", "various": "vairious", 208 | "visited": "fisited viseted vistid vistied", "visitors": "vistors", 209 | "voluntary": "volantry", "voting": "voteing", "wanted": "wantid wonted", 210 | "whether": "wether", "wrote": "rote wote"} 211 | 212 | tests2 := map[string]string{"forbidden": "forbiden", "decisions": "deciscions descisions", 213 | "supposedly": "supposidly", "embellishing": "embelishing", "technique": "tecnique", "permanently": "perminantly", "confirmation": "confermation", 214 | "appointment": "appoitment", "progression": "progresion", "accompanying": "acompaning", "applicable": "aplicable", "regained": "regined", "guidelines": "guidlines", "surrounding": "serounding", "titles": "tittles", "unavailable": "unavailble", "advantageous": "advantageos", "brief": "brif", "appeal": "apeal", "consisting": "consisiting", "clerk": "cleark clerck", "component": "componant", "favourable": "faverable", "separation": "seperation", "search": "serch", "receive": "recieve", "employees": "emploies", "prior": "piror", 215 | "resulting": "reulting", "suggestion": "sugestion", "opinion": "oppinion", 216 | "cancellation": "cancelation", "criticism": "citisum", "useful": "usful", 217 | 
"humour": "humor", "anomalies": "anomolies", "would": "whould", "doubt": "doupt", "examination": "eximination", "therefore": "therefoe", "recommend": "recomend", "separated": "seperated", "successful": "sucssuful succesful", 218 | "apparent": "apparant", "occurred": "occureed", "particular": "paerticulaur", 219 | "pivoting": "pivting", "announcing": "anouncing", "challenge": "chalange", 220 | "arrangements": "araingements", "proportions": "proprtions", "organized": "oranised", "accept": "acept", "dependence": "dependance", "unequalled": "unequaled", "numbers": "numbuers", "sense": "sence", "conversely": "conversly", "provide": "provid", "arrangement": "arrangment", 221 | "responsibilities": "responsiblities", "fourth": "forth", "ordinary": "ordenary", "description": "desription descvription desacription", 222 | "inconceivable": "inconcievable", "data": "dsata", "register": "rgister", 223 | "supervision": "supervison", "encompassing": "encompasing", "negligible": "negligable", "allow": "alow", "operations": "operatins", "executed": "executted", "interpretation": "interpritation", "hierarchy": "heiarky", 224 | "indeed": "indead", "years": "yesars", "through": "throut", "committee": "committe", "inquiries": "equiries", "before": "befor", "continued": "contuned", "permanent": "perminant", "choose": "chose", "virtually": "vertually", "correspondence": "correspondance", "eventually": "eventully", 225 | "lonely": "lonley", "profession": "preffeson", "they": "thay", "now": "noe", 226 | "desperately": "despratly", "university": "unversity", "adjournment": "adjurnment", "possibilities": "possablities", "stopped": "stoped", "mean": "meen", "weighted": "wagted", "adequately": "adequattly", "shown": "hown", 227 | "matrix": "matriiix", "profit": "proffit", "encourage": "encorage", "collate": "colate", "disaggregate": "disaggreagte disaggreaget", "receiving": "recieving reciving", "proviso": "provisoe", "umbrella": "umberalla", "approached": "aproached", "pleasant": "plesent", 
"difficulty": "dificulty", "appointments": "apointments", "base": "basse", "conditioning": "conditining", "earliest": "earlyest", "beginning": "begining", "universally": "universaly", 228 | "unresolved": "unresloved", "length": "lengh", "exponentially": "exponentualy", "utilized": "utalised", "set": "et", "surveys": "servays", 229 | "families": "familys", "system": "sysem", "approximately": "aproximatly", 230 | "their": "ther", "scheme": "scheem", "speaking": "speeking", "repetitive": "repetative", "inefficient": "ineffiect", "geneva": "geniva", "exactly": "exsactly", "immediate": "imediate", "appreciation": "apreciation", "luckily": "luckeley", "eliminated": "elimiated", "believe": "belive", "appreciated": "apreciated", "readjusted": "reajusted", "were": "wer where", "feeling": "fealing", "and": "anf", "false": "faulse", "seen": "seeen", "interrogating": "interogationg", "academically": "academicly", "relatively": "relativly relitivly", 231 | "traditionally": "traditionaly", "studying": "studing", 232 | "majority": "majorty", "build": "biuld", "aggravating": "agravating", 233 | "transactions": "trasactions", "arguing": "aurguing", "sheets": "sheertes", 234 | "successive": "sucsesive sucessive", "segment": "segemnt", "especially": "especaily", "later": "latter", "senior": "sienior", "dragged": "draged", 235 | "atmosphere": "atmospher", "drastically": "drasticaly", "particularly": "particulary", "visitor": "vistor", "session": "sesion", "continually": "contually", "availability": "avaiblity", "busy": "buisy", "parameters": "perametres", "surroundings": "suroundings seroundings", "employed": "emploied", "adequate": "adiquate", "handle": "handel", "means": "meens", 236 | "familiar": "familer", "between": "beeteen", "overall": "overal", "timing": "timeing", "committees": "comittees commitees", "queries": "quies", 237 | "econometric": "economtric", "erroneous": "errounous", "decides": "descides", 238 | "reference": "refereence refference", "intelligence": "inteligence", 
239 | "edition": "ediion ediition", "are": "arte", "apologies": "appologies", 240 | "thermawear": "thermawere thermawhere", "techniques": "tecniques", 241 | "voluntary": "volantary", "subsequent": "subsequant subsiquent", "currently": "curruntly", "forecast": "forcast", "weapons": "wepons", "routine": "rouint", 242 | "neither": "niether", "approach": "aproach", "available": "availble", 243 | "recently": "reciently", "ability": "ablity", "nature": "natior", 244 | "commercial": "comersial", "agencies": "agences", "however": "howeverr", 245 | "suggested": "sugested", "career": "carear", "many": "mony", "annual": "anual", "according": "acording", "receives": "recives recieves", 246 | "interesting": "intresting", "expense": "expence", "relevant": "relavent relevaant", "table": "tasble", "throughout": "throuout", "conference": "conferance", "sensible": "sensable", "described": "discribed describd", 247 | "union": "unioun", "interest": "intrest", "flexible": "flexable", "refered": "reffered", "controlled": "controled", "sufficient": "suficient", 248 | "dissension": "desention", "adaptable": "adabtable", "representative": "representitive", "irrelevant": "irrelavent", "unnecessarily": "unessasarily", 249 | "applied": "upplied", "apologised": "appologised", "these": "thees thess", 250 | "choices": "choises", "will": "wil", "procedure": "proceduer", "shortened": "shortend", "manually": "manualy", "disappointing": "dissapoiting", 251 | "excessively": "exessively", "comments": "coments", "containing": "containg", 252 | "develop": "develope", "credit": "creadit", "government": "goverment", 253 | "acquaintances": "aquantences", "orientated": "orentated", "widely": "widly", 254 | "advise": "advice", "difficult": "dificult", "investigated": "investegated", 255 | "bonus": "bonas", "conceived": "concieved", "nationally": "nationaly", 256 | "compared": "comppared compased", "moving": "moveing", "necessity": "nessesity", "opportunity": "oppertunity oppotunity opperttunity", "thoughts": 
"thorts", "equalled": "equaled", "variety": "variatry", "analysis": "analiss analsis analisis", "patterns": "pattarns", "qualities": "quaties", "easily": "easyly", "organization": "oranisation oragnisation", "the": "thw hte thi", 257 | "corporate": "corparate", "composed": "compossed", "enormously": "enomosly", 258 | "financially": "financialy", "functionally": "functionaly", "discipline": "disiplin", "announcement": "anouncement", "progresses": "progressess", 259 | "except": "excxept", "recommending": "recomending", "mathematically": "mathematicaly", "source": "sorce", "combine": "comibine", "input": "inut", 260 | "careers": "currers carrers", "resolved": "resoved", "demands": "diemands", 261 | "unequivocally": "unequivocaly", "suffering": "suufering", "immediately": "imidatly imediatly", "accepted": "acepted", "projects": "projeccts", 262 | "necessary": "necasery nessasary nessisary neccassary", "journalism": "journaism", "unnecessary": "unessessay", "night": "nite", "output": "oputput", "security": "seurity", "essential": "esential", "beneficial": "benificial benficial", "explaining": "explaning", "supplementary": "suplementary", "questionnaire": "questionare", "employment": "empolyment", 263 | "proceeding": "proceding", "decision": "descisions descision", "per": "pere", 264 | "discretion": "discresion", "reaching": "reching", "analysed": "analised", 265 | "expansion": "expanion", "although": "athough", "subtract": "subtrcat", 266 | "analysing": "aalysing", "comparison": "comparrison", "months": "monthes", 267 | "hierarchal": "hierachial", "misleading": "missleading", "commit": "comit", 268 | "auguments": "aurgument", "within": "withing", "obtaining": "optaning", 269 | "accounts": "acounts", "primarily": "pimarily", "operator": "opertor", 270 | "accumulated": "acumulated", "extremely": "extreemly", "there": "thear", 271 | "summarys": "sumarys", "analyse": "analiss", "understandable": "understadable", "safeguard": "safegaurd", "consist": "consisit", 272 | 
"declarations": "declaratrions", "minutes": "muinutes muiuets", "associated": "assosiated", "accessibility": "accessability", "examine": "examin", 273 | "surveying": "servaying", "politics": "polatics", "annoying": "anoying", 274 | "again": "agiin", "assessing": "accesing", "ideally": "idealy", "scrutinized": "scrutiniesed", "simular": "similar", "personnel": "personel", "whereas": "wheras", "when": "whn", "geographically": "goegraphicaly", "gaining": "ganing", "requested": "rquested", "separate": "seporate", "students": "studens", "prepared": "prepaired", "generated": "generataed", "graphically": "graphicaly", "suited": "suted", "variable": "varible vaiable", "building": "biulding", "required": "reequired", "necessitates": "nessisitates", 275 | "together": "togehter", "profits": "proffits"} 276 | 277 | model := NewModel() 278 | model.SetThreshold(1) // This ensures a more complete dictionary at the expense of size/speed. 279 | model.Train(sampleEnglish) 280 | 281 | // Look at test sets 282 | // SET 1 283 | count, correct, incorrect := 0, 0, 0 284 | t2 := time.Now() 285 | for target, testwords := range tests1 { 286 | testwordarr := strings.Split(testwords, " ") 287 | for _, testword := range testwordarr { 288 | if model.SpellCheck(testword) == target { 289 | correct++ 290 | } else { 291 | incorrect++ 292 | } 293 | count++ 294 | } 295 | } 296 | t3 := time.Now() 297 | 298 | fmt.Printf("Spell test1 count: %v, Correct: %v, Incorrect: %v, Ratio: %f, Total time: %v \n\n", count, correct, incorrect, float32(correct)/float32(count), t3.Sub(t2)) 299 | 300 | successrate := float32(correct) / float32(count) 301 | if successrate < 0.60 { 302 | t.Errorf("Unacceptable correction rate for set test1 (%v). e.g. below 60 percent.", successrate) 303 | } 304 | 305 | // 5000Hz is our aim 306 | maxtime := time.Duration(count) * 200 * time.Microsecond 307 | 308 | if t3.Sub(t2) > maxtime { 309 | t.Errorf("Unacceptable completion time for set test1 (%v). e.g. 
%v corrections took greater than %v.", t3.Sub(t2), count, maxtime) 310 | } 311 | 312 | // SET 2 313 | count, correct, incorrect = 0, 0, 0 314 | t2 = time.Now() 315 | for target, testwords := range tests2 { 316 | testwordarr := strings.Split(testwords, " ") 317 | for _, testword := range testwordarr { 318 | if model.SpellCheck(testword) == target { 319 | correct++ 320 | } else { 321 | incorrect++ 322 | } 323 | count++ 324 | } 325 | } 326 | t3 = time.Now() 327 | 328 | fmt.Printf("Spell test2 count: %v, Correct: %v, Incorrect: %v, Ratio: %f, Total time: %v \n\n", count, correct, incorrect, float32(correct)/float32(count), t3.Sub(t2)) 329 | 330 | successrate = float32(correct) / float32(count) 331 | if successrate < test2AccuracyThreshold { 332 | t.Errorf("Unacceptable correction rate for set test2 (%v). e.g. below %v.", successrate, test2AccuracyThreshold) 333 | } 334 | 335 | // 5000Hz is our aim 336 | maxtime = time.Duration(count) * 200 * time.Microsecond 337 | 338 | if t3.Sub(t2) > maxtime { 339 | t.Errorf("Unacceptable completion time for set test2 (%v). e.g. 
%v corrections took greater than %v", t3.Sub(t2), count, maxtime)
	}

}

// Quick test to make sure we're picking up the right stuff
func TestAutocomplete(t *testing.T) {
	model := NewModel()
	model.Train(sampleEnglish)
	out, err := model.Autocomplete("accoun")
	if err != nil {
		t.Errorf("Autocomplete() returned an error: %s", err)
	}
	// Every returned completion must be one of these known terms.
	expected := map[string]bool{
		"account":    true,
		"accountant": true,
		"accounts":   true,
		"accounted":  true,
	}
	for _, got := range out {
		if seen, ok := expected[got]; !ok {
			t.Errorf("Expected to find %v (%v), but didn't", got, seen)
		}
	}
}

// Test to ensure query training begins to dominate over
// corpus training when autocompleting
func TestAutocompleteFromQueries(t *testing.T) {
	model := NewModel()
	// Changing defaults for testing only, this is not advisable on production
	model.SetThreshold(1)
	model.SetDivergenceThreshold(1)

	model.Train([]string{"every", "every", "every", "every", "every", "every", "everest", "eveready", "eveready", "everything", "everything"})
	// Simulate queries: "everest" twice, "eveready" once.
	for _, q := range []string{"everest", "everest", "eveready"} {
		model.TrainQuery(q)
	}

	out, err := model.Autocomplete("eve")
	if err != nil {
		t.Errorf("Autocomplete() returned an error: %s", err)
	}
	// Query-trained terms must rank ahead of purely corpus-trained ones.
	for i, want := range []string{"everest", "eveready"} {
		if out[i] != want {
			t.Errorf("Autocomplete failed to account for query training")
		}
	}
}

// TestLoadOldModel ensures the legacy on-disk model format still loads.
func TestLoadOldModel(t *testing.T) {
	if _, err := Load("data/test.dict"); err != nil {
		t.Errorf("Couldn't load old model format: %v", err)
	}
}

// TestEditsMulti pins the exact multi-depth edit expansion of a term.
func TestEditsMulti(t *testing.T) {
	model := NewModel()
	got :=
model.EditsMulti("elephant", model.Depth) 399 | want := []string{ 400 | "lephant", "eephant", "elphant", "elehant", "elepant", "elephnt", "elephat", "elephan", "elephant", 401 | "ephant", "lphant", "lehant", "lepant", "lephnt", "lephat", "lephan", "lephant", 402 | "ephant", "ephant", "eehant", "eepant", "eephnt", "eephat", "eephan", "eephant", 403 | "lphant", "ephant", "elhant", "elpant", "elphnt", "elphat", "elphan", "elphant", 404 | "lehant", "eehant", "elhant", "eleant", "elehnt", "elehat", "elehan", "elehant", 405 | "lepant", "eepant", "elpant", "eleant", "elepnt", "elepat", "elepan", "elepant", 406 | "lephnt", "eephnt", "elphnt", "elehnt", "elepnt", "elepht", "elephn", "elephnt", 407 | "lephat", "eephat", "elphat", "elehat", "elepat", "elepht", "elepha", "elephat", 408 | "lephan", "eephan", "elphan", "elehan", "elepan", "elephn", "elepha", "elephan", 409 | "lephant", "eephant", "elphant", "elehant", "elepant", "elephnt", "elephat", "elephan", "elephant"} 410 | 411 | if !reflect.DeepEqual(got, want) { /* Order and duplicates are significant: DeepEqual compares the exact slice, so any reordering of EditsMulti output fails this test. */ 412 | t.Errorf("EditsMulti didn't match:\nGot: %v\nWant: %v", got, want) 413 | } 414 | } 415 | 416 | var result []string // prevent the benchmark from getting optimized out 417 | 418 | func BenchmarkEditsMulti(b *testing.B) { /* Assigning into the package-level sink keeps the call from being dead-code eliminated. */ 419 | model := NewModel() 420 | for i := 0; i < b.N; i++ { 421 | result = model.EditsMulti("elephant", model.Depth) 422 | } 423 | } 424 |