├── .travis.yml ├── LICENSE ├── old.go ├── data └── test.dict ├── README.md ├── fuzzy.go └── fuzzy_test.go /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: go 3 | go: 4 | - 1.13 5 | - 1.14 6 | - tip 7 | notifications: 8 | email: 9 | - infra@sajari.com 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2014 Sajari Pty Ltd 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /old.go: -------------------------------------------------------------------------------- 1 | // Eventually this should be removed. 
Currently it gives backwards compatibility to old 2 | // versions that did not store the query count, which is now used for autocomplete. 3 | package fuzzy 4 | 5 | import ( 6 | "encoding/json" 7 | "os" 8 | ) 9 | 10 | type OldModel struct { 11 | Data map[string]int `json:"data"` 12 | Maxcount int `json:"maxcount"` 13 | Suggest map[string][]string `json:"suggest"` 14 | Depth int `json:"depth"` 15 | Threshold int `json:"threshold"` 16 | UseAutocomplete bool `json:"autocomplete"` 17 | } 18 | 19 | // Converts the old model format to the new version 20 | func (model *Model) convertOldFormat(filename string) error { 21 | oldmodel := new(OldModel) 22 | f, err := os.Open(filename) 23 | if err != nil { 24 | return err 25 | } 26 | defer f.Close() 27 | d := json.NewDecoder(f) 28 | err = d.Decode(oldmodel) 29 | if err != nil { 30 | return err 31 | } 32 | 33 | // Correct for old models pre divergence measure 34 | if model.SuffDivergenceThreshold == 0 { 35 | model.SuffDivergenceThreshold = SuffDivergenceThresholdDefault 36 | } 37 | 38 | // Convert fields 39 | model.Maxcount = oldmodel.Maxcount 40 | model.Suggest = oldmodel.Suggest 41 | model.Depth = oldmodel.Depth 42 | model.Threshold = oldmodel.Threshold 43 | model.UseAutocomplete = oldmodel.UseAutocomplete 44 | 45 | // Convert the old counts 46 | if len(oldmodel.Data) > 0 { 47 | model.Data = make(map[string]*Counts, len(oldmodel.Data)) 48 | for term, cc := range oldmodel.Data { 49 | model.Data[term] = &Counts{cc, 0} 50 | } 51 | } 52 | return nil 53 | } 54 | -------------------------------------------------------------------------------- /data/test.dict: -------------------------------------------------------------------------------- 1 | 
{"data":{"aunty":1,"big":1,"bigger":1,"biggest":1,"bob":1,"delicate":1,"dynamite":1,"uncle":1,"you're":1,"your":1},"maxcount":1,"suggest":{"ant":["aunty"],"anty":["aunty"],"any":["aunty"],"aty":["aunty"],"aun":["aunty"],"aunt":["aunty"],"aunty":["aunty"],"auny":["aunty"],"aut":["aunty"],"auty":["aunty"],"auy":["aunty"],"bb":["bob"],"bg":["big"],"bger":["bigger"],"bgest":["biggest"],"bgge":["bigger"],"bgger":["bigger"],"bgges":["biggest"],"bggest":["biggest"],"bgget":["biggest"],"bggr":["bigger"],"bggst":["biggest"],"bi":["big"],"bier":["bigger"],"biest":["biggest"],"big":["big"],"bige":["bigger"],"biger":["bigger"],"biges":["biggest"],"bigest":["biggest"],"biget":["biggest"],"bigg":["bigger"],"bigge":["biggest","bigger"],"bigger":["bigger"],"bigges":["biggest"],"biggest":["biggest"],"bigget":["biggest"],"biggr":["bigger"],"biggs":["biggest"],"biggst":["biggest"],"biggt":["biggest"],"bigr":["bigger"],"bigst":["biggest"],"bo":["bob"],"bob":["bob"],"cle":["uncle"],"damite":["dynamite"],"decate":["delicate"],"deiate":["delicate"],"deicae":["delicate"],"deicat":["delicate"],"deicate":["delicate"],"deicte":["delicate"],"delate":["delicate"],"delcae":["delicate"],"delcat":["delicate"],"delcate":["delicate"],"delcte":["delicate"],"deliae":["delicate"],"deliat":["delicate"],"deliate":["delicate"],"delica":["delicate"],"delicae":["delicate"],"delicat":["delicate"],"delicate":["delicate"],"delice":["delicate"],"delict":["delicate"],"delicte":["delicate"],"delite":["delicate"],"dicate":["delicate"],"dlcate":["delicate"],"dliate":["delicate"],"dlicae":["delicate"],"dlicat":["delicate"],"dlicate":["delicate"],"dlicte":["delicate"],"dnaite":["dynamite"],"dnamie":["dynamite"],"dnamit":["dynamite"],"dnamite":["dynamite"],"dnamte":["dynamite"],"dnmite":["dynamite"],"dyaite":["dynamite"],"dyamie":["dynamite"],"dyamit":["dynamite"],"dyamite":["dynamite"],"dyamte":["dynamite"],"dymite":["dynamite"],"dynaie":["dynamite"],"dynait":["dynamite"],"dynaite":["dynamite"],"dyname":["dynamite"],
"dynami":["dynamite"],"dynamie":["dynamite"],"dynamit":["dynamite"],"dynamite":["dynamite"],"dynamt":["dynamite"],"dynamte":["dynamite"],"dynate":["dynamite"],"dynite":["dynamite"],"dynmie":["dynamite"],"dynmit":["dynamite"],"dynmite":["dynamite"],"dynmte":["dynamite"],"eicate":["delicate"],"elcate":["delicate"],"eliate":["delicate"],"elicae":["delicate"],"elicat":["delicate"],"elicate":["delicate"],"elicte":["delicate"],"gger":["bigger"],"ggest":["biggest"],"ig":["big"],"iger":["bigger"],"igest":["biggest"],"igge":["bigger"],"igger":["bigger"],"igges":["biggest"],"iggest":["biggest"],"igget":["biggest"],"iggr":["bigger"],"iggst":["biggest"],"licate":["delicate"],"namite":["dynamite"],"nce":["uncle"],"ncl":["uncle"],"ncle":["uncle"],"nle":["uncle"],"nty":["aunty"],"o're":["you're"],"ob":["bob"],"or":["your"],"ou":["your"],"ou'e":["you're"],"ou'r":["you're"],"ou're":["you're"],"our":["your"],"oure":["you're"],"u're":["you're"],"uce":["uncle"],"ucl":["uncle"],"ucle":["uncle"],"ule":["uncle"],"unc":["uncle"],"unce":["uncle"],"uncl":["uncle"],"uncle":["uncle"],"une":["uncle"],"unl":["uncle"],"unle":["uncle"],"unt":["aunty"],"unty":["aunty"],"uny":["aunty"],"ur":["your"],"uty":["aunty"],"y're":["you're"],"yamite":["dynamite"],"ynaite":["dynamite"],"ynamie":["dynamite"],"ynamit":["dynamite"],"ynamite":["dynamite"],"ynamte":["dynamite"],"ynmite":["dynamite"],"yo":["your"],"yo'e":["you're"],"yo'r":["you're"],"yo're":["you're"],"yor":["your"],"yore":["you're"],"you":["your"],"you'":["you're"],"you'e":["you're"],"you'r":["you're"],"you're":["you're"],"youe":["you're"],"your":["your","you're"],"youre":["you're"],"yr":["your"],"yu":["your"],"yu'e":["you're"],"yu'r":["you're"],"yu're":["you're"],"yur":["your"],"yure":["you're"]},"depth":2,"threshold":1,"autocomplete":true} 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fuzzy 2 | [![Build 
Status](https://travis-ci.org/sajari/fuzzy.svg?branch=master)](https://travis-ci.org/sajari/fuzzy) 3 | 4 | Fuzzy is a very fast spell checker and query suggester written in Golang. 5 | 6 | Motivation: 7 | - Sajari uses very large queries (hundreds of words) but needs to respond sub-second to these queries where possible. Common spell check algorithms are quite slow or very resource intensive. 8 | - The aim was to achieve spell checks in sub 100usec per word (10,000 / second single core) with at least 60% accuracy and multi-language support. 9 | - Currently we see sub 40usec per word and ~70% accuracy for a Levenshtein distance of 2 chars on a 2012 macbook pro (english test set comes from Peter Norvig's article, see http://norvig.com/spell-correct.html). 10 | - A 500 word query can be spell checked in ~0.02 sec / cpu cores, which is good enough for us. 11 | 12 | Notes: 13 | - It is currently executed as a single goroutine per lookup, so undoubtedly this could be much faster using multiple cores, but currently the speed is quite good. 14 | - Accuracy is hit slightly because several correct words don't appear at all in the training text (data/big.txt). 15 | - Fuzzy is a "Symmetric Delete Spelling Corrector", which relates to some blogs by Wolf Garbe at Faroo.com (see http://blog.faroo.com/2012/06/07/improved-edit-distance-based-spelling-correction/) 16 | 17 | Config: 18 | - Generally no config is required, but you can tweak the model for your application. 19 | - `"threshold"` is the trigger point when a word becomes popular enough to build lookup keys for it. Setting this to "1" means any instance of a given word makes it a legitimate spelling. This typically corrects the most errors, but can also cause false positives if incorrect spellings exist in the training data. It also causes a much larger index to be built. By default this is set to 4. 20 | - `"depth"` is the Levenshtein distance the model builds lookup keys for. 
For spelling correction, a setting of "2" is typically very good. At a distance of "3" the potential number of words is much, much larger, but adds little benefit to accuracy. For query prediction a larger number can be useful, but again is much more expensive. **A depth of "1" and threshold of "1" for the 1st Norvig test set gives ~70% correction accuracy at ~5usec per check (e.g. ~200kHz)**, for many applications this will be good enough. At depths > 2, the false positives begin to hurt the accuracy. 21 | 22 | Future improvements: 23 | - Make some of the expensive processes concurrent. 24 | - Add spelling checks for different languages. If you have misspellings in different languages please add them or send to us. 25 | - Allow the term-score map to be read from an external term set (e.g. integrating this currently may double up on keeping a term count). 26 | - Currently there is no method to delete lookup keys, so potentially this may cause bloating over time if the dictionary changes significantly. 27 | - Add right to left deletion beyond Levenshtein config depth (e.g. don't process all deletes except for query predictors). 28 | 29 | Usage: 30 | - Below is some example code showing how to use the package. 31 | - An example showing how to train with a static set of words is contained in the fuzzy_test.go file, which uses the "big.txt" file to create an English dictionary. 32 | - To integrate with your application (e.g. custom dictionary / word popularity), use the single word and multiword training functions shown in the example below. Each time you add a new instance of a given word, pass it to this function. The model will keep a count and will create suggestion keys for the word once it reaches the popularity threshold. 33 | - We haven't tested with other languages, but this should work fine. Please let us know how you go? 
`support@sajari.com` 34 | 35 | 36 | ```go 37 | package main 38 | 39 | import( 40 | "github.com/sajari/fuzzy" 41 | "fmt" 42 | ) 43 | 44 | func main() { 45 | model := fuzzy.NewModel() 46 | 47 | // For testing only, this is not advisable on production 48 | model.SetThreshold(1) 49 | 50 | // This expands the distance searched, but costs more resources (memory and time). 51 | // For spell checking, "2" is typically enough, for query suggestions this can be higher 52 | model.SetDepth(5) 53 | 54 | // Train multiple words simultaneously by passing an array of strings to the "Train" function 55 | words := []string{"bob", "your", "uncle", "dynamite", "delicate", "biggest", "big", "bigger", "aunty", "you're"} 56 | model.Train(words) 57 | 58 | // Train word by word (typically triggered in your application once a given word is popular enough) 59 | model.TrainWord("single") 60 | 61 | // Check Spelling 62 | fmt.Println("\nSPELL CHECKS") 63 | fmt.Println(" Deletion test (yor) : ", model.SpellCheck("yor")) 64 | fmt.Println(" Swap test (uncel) : ", model.SpellCheck("uncel")) 65 | fmt.Println(" Replace test (dynemite) : ", model.SpellCheck("dynemite")) 66 | fmt.Println(" Insert test (dellicate) : ", model.SpellCheck("dellicate")) 67 | fmt.Println(" Two char test (dellicade) : ", model.SpellCheck("dellicade")) 68 | 69 | // Suggest completions 70 | fmt.Println("\nQUERY SUGGESTIONS") 71 | fmt.Println(" \"bigge\". Did you mean?: ", model.Suggestions("bigge", false)) 72 | fmt.Println(" \"bo\". Did you mean?: ", model.Suggestions("bo", false)) 73 | fmt.Println(" \"dyn\". Did you mean?: ", model.Suggestions("dyn", false)) 74 | 75 | // Autocomplete suggestions 76 | suggested, _ := model.Autocomplete("bi") 77 | fmt.Printf(" \"bi\". 
Suggestions: %v", suggested) 78 | 79 | } 80 | ``` -------------------------------------------------------------------------------- /fuzzy.go: -------------------------------------------------------------------------------- 1 | package fuzzy 2 | 3 | import ( 4 | "bufio" 5 | "encoding/json" 6 | "errors" 7 | "fmt" 8 | "index/suffixarray" 9 | "io" 10 | "log" 11 | "os" 12 | "regexp" 13 | "sort" 14 | "strings" 15 | "sync" 16 | ) 17 | 18 | const ( 19 | SpellDepthDefault = 2 20 | SpellThresholdDefault = 5 21 | SuffDivergenceThresholdDefault = 100 22 | ) 23 | 24 | type Pair struct { 25 | str1 string 26 | str2 string 27 | } 28 | 29 | type Method int 30 | 31 | const ( 32 | MethodIsWord Method = 0 33 | MethodSuggestMapsToInput = 1 34 | MethodInputDeleteMapsToDict = 2 35 | MethodInputDeleteMapsToSuggest = 3 36 | ) 37 | 38 | type Potential struct { 39 | Term string // Potential term string 40 | Score int // Score 41 | Leven int // Levenstein distance from the suggestion to the input 42 | Method Method // How this potential was matched 43 | } 44 | 45 | type Counts struct { 46 | Corpus int `json:"corpus"` 47 | Query int `json:"query"` 48 | } 49 | 50 | type Model struct { 51 | Data map[string]*Counts `json:"data"` 52 | Maxcount int `json:"maxcount"` 53 | Suggest map[string][]string `json:"suggest"` 54 | Depth int `json:"depth"` 55 | Threshold int `json:"threshold"` 56 | UseAutocomplete bool `json:"autocomplete"` 57 | SuffDivergence int `json:"-"` 58 | SuffDivergenceThreshold int `json:"suff_threshold"` 59 | SuffixArr *suffixarray.Index `json:"-"` 60 | SuffixArrConcat string `json:"-"` 61 | sync.RWMutex 62 | } 63 | 64 | // For sorting autocomplete suggestions 65 | // to bias the most popular first 66 | type Autos struct { 67 | Results []string 68 | Model *Model 69 | } 70 | 71 | func (a Autos) Len() int { return len(a.Results) } 72 | func (a Autos) Swap(i, j int) { a.Results[i], a.Results[j] = a.Results[j], a.Results[i] } 73 | 74 | func (a Autos) Less(i, j int) bool { 75 | icc := 
a.Model.Data[a.Results[i]].Corpus 76 | jcc := a.Model.Data[a.Results[j]].Corpus 77 | icq := a.Model.Data[a.Results[i]].Query 78 | jcq := a.Model.Data[a.Results[j]].Query 79 | if icq == jcq { 80 | if icc == jcc { 81 | return a.Results[i] > a.Results[j] 82 | } 83 | return icc > jcc 84 | } 85 | return icq > jcq 86 | } 87 | 88 | func (m Method) String() string { 89 | switch m { 90 | case MethodIsWord: 91 | return "Input in dictionary" 92 | case MethodSuggestMapsToInput: 93 | return "Suggest maps to input" 94 | case MethodInputDeleteMapsToDict: 95 | return "Input delete maps to dictionary" 96 | case MethodInputDeleteMapsToSuggest: 97 | return "Input delete maps to suggest key" 98 | } 99 | return "unknown" 100 | } 101 | 102 | func (pot *Potential) String() string { 103 | return fmt.Sprintf("Term: %v\n\tScore: %v\n\tLeven: %v\n\tMethod: %v\n\n", pot.Term, pot.Score, pot.Leven, pot.Method) 104 | } 105 | 106 | // Create and initialise a new model 107 | func NewModel() *Model { 108 | model := new(Model) 109 | return model.Init() 110 | } 111 | 112 | func (model *Model) Init() *Model { 113 | model.Data = make(map[string]*Counts) 114 | model.Suggest = make(map[string][]string) 115 | model.Depth = SpellDepthDefault 116 | model.Threshold = SpellThresholdDefault // Setting this to 1 is most accurate, but "1" is 5x more memory and 30x slower processing than "4". 
This is a big performance tuning knob 117 | model.UseAutocomplete = true // Default is to include Autocomplete 118 | model.updateSuffixArr() 119 | model.SuffDivergenceThreshold = SuffDivergenceThresholdDefault 120 | return model 121 | } 122 | 123 | // WriteTo writes a model to a Writer 124 | func (model *Model) WriteTo(w io.Writer) (int64, error) { 125 | model.RLock() 126 | defer model.RUnlock() 127 | b, err := json.Marshal(model) 128 | if err != nil { 129 | return 0, err 130 | } 131 | n, err := w.Write(b) 132 | if err != nil { 133 | return int64(n), err 134 | } 135 | return int64(n), nil 136 | } 137 | 138 | // Save a spelling model to disk 139 | func (model *Model) Save(filename string) error { 140 | f, err := os.Create(filename) 141 | if err != nil { 142 | log.Println("Fuzzy model:", err) 143 | return err 144 | } 145 | defer f.Close() 146 | _, err = model.WriteTo(f) 147 | if err != nil { 148 | log.Println("Fuzzy model:", err) 149 | return err 150 | } 151 | return nil 152 | } 153 | 154 | // Save a spelling model to disk, but discard all 155 | // entries less than the threshold number of occurences 156 | // Much smaller and all that is used when generated 157 | // as a once off, but not useful for incremental usage 158 | func (model *Model) SaveLight(filename string) error { 159 | model.Lock() 160 | for term, count := range model.Data { 161 | if count.Corpus < model.Threshold { 162 | delete(model.Data, term) 163 | } 164 | } 165 | model.Unlock() 166 | return model.Save(filename) 167 | } 168 | 169 | // FromReader loads a model from a Reader 170 | func FromReader(r io.Reader) (*Model, error) { 171 | model := new(Model) 172 | d := json.NewDecoder(r) 173 | err := d.Decode(model) 174 | if err != nil { 175 | return nil, err 176 | } 177 | model.updateSuffixArr() 178 | return model, nil 179 | } 180 | 181 | // Load a saved model from disk 182 | func Load(filename string) (*Model, error) { 183 | f, err := os.Open(filename) 184 | if err != nil { 185 | return nil, err 186 | } 
187 | defer f.Close() 188 | model, err := FromReader(f) 189 | if err != nil { 190 | model = new(Model) 191 | if err1 := model.convertOldFormat(filename); err1 != nil { 192 | return model, err1 193 | } 194 | return model, nil 195 | } 196 | return model, nil 197 | } 198 | 199 | // Change the default depth value of the model. This sets how many 200 | // character differences are indexed. The default is 2. 201 | func (model *Model) SetDepth(val int) { 202 | model.Lock() 203 | model.Depth = val 204 | model.Unlock() 205 | } 206 | 207 | // Change the default threshold of the model. This is how many times 208 | // a term must be seen before suggestions are created for it 209 | func (model *Model) SetThreshold(val int) { 210 | model.Lock() 211 | model.Threshold = val 212 | model.Unlock() 213 | } 214 | 215 | // Optionally disabled suffixarray based autocomplete support 216 | func (model *Model) SetUseAutocomplete(val bool) { 217 | model.Lock() 218 | old := model.UseAutocomplete 219 | model.Unlock() 220 | model.UseAutocomplete = val 221 | if !old && val { 222 | model.updateSuffixArr() 223 | } 224 | } 225 | 226 | // Optionally set the suffix array divergence threshold. This is 227 | // the number of query training steps between rebuilds of the 228 | // suffix array. A low number will be more accurate but will use 229 | // resources and create more garbage. 
230 | func (model *Model) SetDivergenceThreshold(val int) { 231 | model.Lock() 232 | model.SuffDivergenceThreshold = val 233 | model.Unlock() 234 | } 235 | 236 | // Calculate the Levenshtein distance between two strings 237 | func Levenshtein(a, b *string) int { 238 | la := len(*a) 239 | lb := len(*b) 240 | d := make([]int, la+1) 241 | var lastdiag, olddiag, temp int 242 | 243 | for i := 1; i <= la; i++ { 244 | d[i] = i 245 | } 246 | for i := 1; i <= lb; i++ { 247 | d[0] = i 248 | lastdiag = i - 1 249 | for j := 1; j <= la; j++ { 250 | olddiag = d[j] 251 | min := d[j] + 1 252 | if (d[j-1] + 1) < min { 253 | min = d[j-1] + 1 254 | } 255 | if (*a)[j-1] == (*b)[i-1] { 256 | temp = 0 257 | } else { 258 | temp = 1 259 | } 260 | if (lastdiag + temp) < min { 261 | min = lastdiag + temp 262 | } 263 | d[j] = min 264 | lastdiag = olddiag 265 | } 266 | } 267 | return d[la] 268 | } 269 | 270 | // Add an array of words to train the model in bulk 271 | func (model *Model) Train(terms []string) { 272 | for _, term := range terms { 273 | model.TrainWord(term) 274 | } 275 | model.updateSuffixArr() 276 | } 277 | 278 | // Manually set the count of a word. Optionally trigger the 279 | // creation of suggestion keys for the term. This function lets 280 | // you build a model from an existing dictionary with word popularity 281 | // counts without needing to run "TrainWord" repeatedly 282 | func (model *Model) SetCount(term string, count int, suggest bool) { 283 | model.Lock() 284 | model.Data[term] = &Counts{count, 0} // Note: This may reset a query count? TODO 285 | if suggest { 286 | model.createSuggestKeys(term) 287 | } 288 | model.Unlock() 289 | } 290 | 291 | // Train the model word by word. This is corpus training as opposed 292 | // to query training. 
Word counts from this type of training are not 293 | // likely to correlate with those of search queries 294 | func (model *Model) TrainWord(term string) { 295 | model.Lock() 296 | if t, ok := model.Data[term]; ok { 297 | t.Corpus++ 298 | } else { 299 | model.Data[term] = &Counts{1, 0} 300 | } 301 | // Set the max 302 | if model.Data[term].Corpus > model.Maxcount { 303 | model.Maxcount = model.Data[term].Corpus 304 | model.SuffDivergence++ 305 | } 306 | // If threshold is triggered, store delete suggestion keys 307 | if model.Data[term].Corpus == model.Threshold { 308 | model.createSuggestKeys(term) 309 | } 310 | model.Unlock() 311 | } 312 | 313 | // Train using a search query term. This builds a second popularity 314 | // index of terms used to search, as opposed to generally occurring 315 | // in corpus text 316 | func (model *Model) TrainQuery(term string) { 317 | model.Lock() 318 | if t, ok := model.Data[term]; ok { 319 | t.Query++ 320 | } else { 321 | model.Data[term] = &Counts{0, 1} 322 | } 323 | model.SuffDivergence++ 324 | update := model.SuffDivergence > model.SuffDivergenceThreshold 325 | model.Unlock() 326 | if update { 327 | model.updateSuffixArr() 328 | } 329 | } 330 | 331 | // For a given term, create the partially deleted lookup keys 332 | func (model *Model) createSuggestKeys(term string) { 333 | edits := model.EditsMulti(term, model.Depth) 334 | for _, edit := range edits { 335 | if len(edit) <= 1 { 336 | continue 337 | } 338 | skip := false 339 | for _, hit := range model.Suggest[edit] { 340 | if hit == term { 341 | // Already know about this one 342 | skip = true 343 | break 344 | } 345 | } 346 | if !skip { 347 | model.Suggest[edit] = append(model.Suggest[edit], term) 348 | } 349 | } 350 | } 351 | 352 | // Edits at any depth for a given term. 
The depth of the model is used 353 | func (model *Model) EditsMulti(term string, depth int) []string { 354 | edits := Edits1(term) 355 | for { 356 | depth-- 357 | if depth <= 0 { 358 | break 359 | } 360 | for _, edit := range edits { 361 | edits = append(edits, Edits1(edit)...) 362 | } 363 | } 364 | return edits 365 | } 366 | 367 | // Edits1 creates a set of terms that are 1 char delete from the input term 368 | func Edits1(word string) []string { 369 | total_set := make([]string, 0, len(word)+2) 370 | for i := 0; i < len(word); i++ { 371 | // delete ith character 372 | total_set = append(total_set, word[:i]+word[i+1:]) 373 | } 374 | 375 | total_set = append(total_set, word) 376 | 377 | // Special case ending in "ies" or "ys" 378 | if strings.HasSuffix(word, "ies") { 379 | total_set = append(total_set, word[:len(word)-3]+"ys") 380 | } 381 | if strings.HasSuffix(word, "ys") { 382 | total_set = append(total_set, word[:len(word)-2]+"ies") 383 | } 384 | 385 | return total_set 386 | } 387 | 388 | func (model *Model) corpusCount(input string) int { 389 | if score, ok := model.Data[input]; ok { 390 | return score.Corpus 391 | } 392 | return 0 393 | } 394 | 395 | // From a group of potentials, work out the most likely result 396 | func best(input string, potential map[string]*Potential) string { 397 | var best string 398 | var bestcalc, bonus int 399 | for i := 0; i < 4; i++ { 400 | for _, pot := range potential { 401 | if pot.Leven == 0 { 402 | return pot.Term 403 | } else if pot.Leven == i { 404 | bonus = 0 405 | // If the first letter is the same, that's a good sign. 
Bias these potentials 406 | if pot.Term[0] == input[0] { 407 | bonus += 100 408 | } 409 | if pot.Score+bonus > bestcalc { 410 | bestcalc = pot.Score + bonus 411 | best = pot.Term 412 | } 413 | } 414 | } 415 | if bestcalc > 0 { 416 | return best 417 | } 418 | } 419 | return best 420 | } 421 | 422 | // From a group of potentials, work out the most likely results, in order of 423 | // best to worst 424 | func bestn(input string, potential map[string]*Potential, n int) []string { 425 | var output []string 426 | for i := 0; i < n; i++ { 427 | if len(potential) == 0 { 428 | break 429 | } 430 | b := best(input, potential) 431 | output = append(output, b) 432 | delete(potential, b) 433 | } 434 | return output 435 | } 436 | 437 | // Test an input, if we get it wrong, look at why it is wrong. This 438 | // function returns a bool indicating if the guess was correct as well 439 | // as the term it is suggesting. Typically this function would be used 440 | // for testing, not for production 441 | func (model *Model) CheckKnown(input string, correct string) bool { 442 | model.RLock() 443 | defer model.RUnlock() 444 | suggestions := model.suggestPotential(input, true) 445 | best := best(input, suggestions) 446 | if best == correct { 447 | // This guess is correct 448 | fmt.Printf("Input correctly maps to correct term") 449 | return true 450 | } 451 | if pot, ok := suggestions[correct]; !ok { 452 | 453 | if model.corpusCount(correct) > 0 { 454 | fmt.Printf("\"%v\" - %v (%v) not in the suggestions. (%v) best option.\n", input, correct, model.corpusCount(correct), best) 455 | for _, sugg := range suggestions { 456 | fmt.Printf(" %v\n", sugg) 457 | } 458 | } else { 459 | fmt.Printf("\"%v\" - Not in dictionary\n", correct) 460 | } 461 | } else { 462 | fmt.Printf("\"%v\" - (%v) suggested, should however be (%v).\n", input, suggestions[best], pot) 463 | } 464 | return false 465 | } 466 | 467 | // For a given input term, suggest some alternatives. 
If exhaustive, each of the 4 468 | // cascading checks will be performed and all potentials will be sorted accordingly 469 | func (model *Model) suggestPotential(input string, exhaustive bool) map[string]*Potential { 470 | input = strings.ToLower(input) 471 | suggestions := make(map[string]*Potential, 20) 472 | 473 | // 0 - If this is a dictionary term we're all good, no need to go further 474 | if model.corpusCount(input) > model.Threshold { 475 | suggestions[input] = &Potential{Term: input, Score: model.corpusCount(input), Leven: 0, Method: MethodIsWord} 476 | if !exhaustive { 477 | return suggestions 478 | } 479 | } 480 | 481 | // 1 - See if the input matches a "suggest" key 482 | if sugg, ok := model.Suggest[input]; ok { 483 | for _, pot := range sugg { 484 | if _, ok := suggestions[pot]; !ok { 485 | suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: Levenshtein(&input, &pot), Method: MethodSuggestMapsToInput} 486 | } 487 | } 488 | 489 | if !exhaustive { 490 | return suggestions 491 | } 492 | } 493 | 494 | // 2 - See if edit1 matches input 495 | max := 0 496 | edits := model.EditsMulti(input, model.Depth) 497 | for _, edit := range edits { 498 | score := model.corpusCount(edit) 499 | if score > 0 && len(edit) > 2 { 500 | if _, ok := suggestions[edit]; !ok { 501 | suggestions[edit] = &Potential{Term: edit, Score: score, Leven: Levenshtein(&input, &edit), Method: MethodInputDeleteMapsToDict} 502 | } 503 | if score > max { 504 | max = score 505 | } 506 | } 507 | } 508 | if max > 0 { 509 | if !exhaustive { 510 | return suggestions 511 | } 512 | } 513 | 514 | // 3 - No hits on edit1 distance, look for transposes and replaces 515 | // Note: these are more complex, we need to check the guesses 516 | // more thoroughly, e.g. levals=[valves] in a raw sense, which 517 | // is incorrect 518 | for _, edit := range edits { 519 | if sugg, ok := model.Suggest[edit]; ok { 520 | // Is this a real transpose or replace? 
521 | for _, pot := range sugg { 522 | lev := Levenshtein(&input, &pot) 523 | if lev <= model.Depth+1 { // The +1 doesn't seem to impact speed, but has greater coverage when the depth is not sufficient to make suggestions 524 | if _, ok := suggestions[pot]; !ok { 525 | suggestions[pot] = &Potential{Term: pot, Score: model.corpusCount(pot), Leven: lev, Method: MethodInputDeleteMapsToSuggest} 526 | } 527 | } 528 | } 529 | } 530 | } 531 | return suggestions 532 | } 533 | 534 | // Return the raw potential terms so they can be ranked externally 535 | // to this package 536 | func (model *Model) Potentials(input string, exhaustive bool) map[string]*Potential { 537 | model.RLock() 538 | defer model.RUnlock() 539 | return model.suggestPotential(input, exhaustive) 540 | } 541 | 542 | // For a given input string, suggests potential replacements 543 | func (model *Model) Suggestions(input string, exhaustive bool) []string { 544 | model.RLock() 545 | suggestions := model.suggestPotential(input, exhaustive) 546 | model.RUnlock() 547 | output := make([]string, 0, 10) 548 | for _, suggestion := range suggestions { 549 | output = append(output, suggestion.Term) 550 | } 551 | return output 552 | } 553 | 554 | // Return the most likely correction for the input term 555 | func (model *Model) SpellCheck(input string) string { 556 | model.RLock() 557 | suggestions := model.suggestPotential(input, false) 558 | model.RUnlock() 559 | return best(input, suggestions) 560 | } 561 | 562 | // Return the most likely corrections in order from best to worst 563 | func (model *Model) SpellCheckSuggestions(input string, n int) []string { 564 | model.RLock() 565 | suggestions := model.suggestPotential(input, true) 566 | model.RUnlock() 567 | return bestn(input, suggestions, n) 568 | } 569 | 570 | func SampleEnglish() []string { 571 | var out []string 572 | file, err := os.Open("data/big.txt") 573 | if err != nil { 574 | fmt.Println(err) 575 | return out 576 | } 577 | reader := bufio.NewReader(file) 
578 | scanner := bufio.NewScanner(reader) 579 | scanner.Split(bufio.ScanLines) 580 | // Count the words. 581 | count := 0 582 | for scanner.Scan() { 583 | exp, _ := regexp.Compile("[a-zA-Z]+") 584 | words := exp.FindAll([]byte(scanner.Text()), -1) 585 | for _, word := range words { 586 | if len(word) > 1 { 587 | out = append(out, strings.ToLower(string(word))) 588 | count++ 589 | } 590 | } 591 | } 592 | if err := scanner.Err(); err != nil { 593 | fmt.Fprintln(os.Stderr, "reading input:", err) 594 | } 595 | 596 | return out 597 | } 598 | 599 | // Takes the known dictionary listing and creates a suffix array 600 | // model for these terms. If a model already existed, it is discarded 601 | func (model *Model) updateSuffixArr() { 602 | if !model.UseAutocomplete { 603 | return 604 | } 605 | model.RLock() 606 | termArr := make([]string, 0, 1000) 607 | for term, count := range model.Data { 608 | if count.Corpus > model.Threshold || count.Query > 0 { // TODO: query threshold? 609 | termArr = append(termArr, term) 610 | } 611 | } 612 | model.SuffixArrConcat = "\x00" + strings.Join(termArr, "\x00") + "\x00" 613 | model.SuffixArr = suffixarray.New([]byte(model.SuffixArrConcat)) 614 | model.SuffDivergence = 0 615 | model.RUnlock() 616 | } 617 | 618 | // For a given string, autocomplete using the suffix array model 619 | func (model *Model) Autocomplete(input string) ([]string, error) { 620 | model.RLock() 621 | defer model.RUnlock() 622 | if !model.UseAutocomplete { 623 | return []string{}, errors.New("Autocomplete is disabled") 624 | } 625 | if len(input) == 0 { 626 | return []string{}, errors.New("Input cannot have length zero") 627 | } 628 | express := "\x00" + input + "[^\x00]*" 629 | match, err := regexp.Compile(express) 630 | if err != nil { 631 | return []string{}, err 632 | } 633 | matches := model.SuffixArr.FindAllIndex(match, -1) 634 | a := &Autos{Results: make([]string, 0, len(matches)), Model: model} 635 | for _, m := range matches { 636 | str := 
strings.Trim(model.SuffixArrConcat[m[0]:m[1]], "\x00") 637 | if count, ok := model.Data[str]; ok { 638 | if count.Corpus > model.Threshold || count.Query > 0 { 639 | a.Results = append(a.Results, str) 640 | } 641 | } 642 | } 643 | sort.Sort(a) 644 | if len(a.Results) >= 10 { 645 | return a.Results[:10], nil 646 | } 647 | return a.Results, nil 648 | } 649 | -------------------------------------------------------------------------------- /fuzzy_test.go: -------------------------------------------------------------------------------- 1 | package fuzzy 2 | 3 | import ( 4 | "fmt" 5 | "reflect" 6 | "runtime" 7 | "strings" 8 | "sync" 9 | "testing" 10 | "time" 11 | ) 12 | 13 | var sampleEnglish []string 14 | 15 | func init() { 16 | sampleEnglish = SampleEnglish() 17 | } 18 | 19 | func TestSpelling(t *testing.T) { 20 | model := NewModel() 21 | 22 | // For testing only, this is not advisable on production 23 | model.SetThreshold(1) 24 | 25 | // Train multiple words simultaneously 26 | words := []string{"bob", "your", "uncle", "dynamite", "delicate", "biggest", "big", "bigger", "aunty", "you're", "bob", "your"} 27 | model.Train(words) 28 | 29 | // Check Spelling 30 | if model.SpellCheck("yor") != "your" { 31 | t.Errorf("Spell check: Single char delete failed") 32 | } 33 | if model.SpellCheck("uncel") != "uncle" { 34 | t.Errorf("Spell check: Single char transpose failed") 35 | } 36 | if model.SpellCheck("dynemite") != "dynamite" { 37 | t.Errorf("Spell check: Single char swap failed") 38 | } 39 | if model.SpellCheck("dellicate") != "delicate" { 40 | t.Errorf("Spell check: Single char insertion failed") 41 | } 42 | if model.SpellCheck("dellicade") != "delicate" { 43 | t.Errorf("Spell check: Two char change failed") 44 | } 45 | } 46 | 47 | func TestSpellingSuggestions(t *testing.T) { 48 | model := NewModel() 49 | 50 | // For testing only, this is not advisable on production 51 | model.SetThreshold(1) 52 | 53 | // Train multiple words simultaneously 54 | words := []string{"bob", 
"your", "uncle", "dynamite", "delicate", "biggest", "biggest", "big", "bigger", "aunty", "you're", "bob", "your"} 55 | model.Train(words) 56 | 57 | // Check Spelling 58 | if model.SpellCheckSuggestions("yor", 2)[0] != "your" { 59 | t.Errorf("Spell check: Single char delete failed") 60 | } 61 | if model.SpellCheckSuggestions("uncel", 2)[0] != "uncle" { 62 | t.Errorf("Spell check: Single char transpose failed") 63 | } 64 | if model.SpellCheckSuggestions("dynemite", 2)[0] != "dynamite" { 65 | t.Errorf("Spell check: Single char swap failed") 66 | } 67 | if model.SpellCheckSuggestions("dellicate", 2)[0] != "delicate" { 68 | t.Errorf("Spell check: Single char insertion failed") 69 | } 70 | if model.SpellCheckSuggestions("dellicade", 2)[0] != "delicate" { 71 | t.Errorf("Spell check: Two char change failed") 72 | } 73 | 74 | suggestions := model.SpellCheckSuggestions("bigge", 2) 75 | if suggestions[0] != "bigger" { 76 | t.Errorf("Spell check suggestions, Single char delete is closest") 77 | } 78 | // "biggest" should be before "big" since it appears twice 79 | if suggestions[1] != "biggest" { 80 | t.Errorf("Spell check suggestions, Double char delete 2nd closest") 81 | } 82 | } 83 | 84 | func TestSuggestions(t *testing.T) { 85 | model := NewModel() 86 | 87 | // For testing only, this is not advisable on production 88 | model.SetThreshold(1) 89 | 90 | // Train multiple words simultaneously 91 | words := []string{"bob", "your", "uncle", "dynamite", "delicate", "biggest", "big", "bigger", "aunty", "you're"} 92 | model.Train(words) 93 | 94 | // Train word by word 95 | model.TrainWord("single") 96 | 97 | // Suggest completions 98 | potential := model.Suggestions("bigge", false) 99 | bigger := false 100 | biggest := false 101 | for _, term := range potential { 102 | if term == "bigger" { 103 | bigger = true 104 | } 105 | if term == "biggest" { 106 | biggest = true 107 | } 108 | } 109 | if !biggest || !bigger { 110 | t.Errorf("Suggestions are missing values that should be there") 
111 | } 112 | } 113 | 114 | func TestManualTermAddition(t *testing.T) { 115 | model := NewModel() 116 | model.SetThreshold(4) 117 | 118 | model.SetCount("elephant", 10, true) 119 | 120 | if model.SpellCheck("elphant") != "elephant" { 121 | t.Errorf("Spell check: manual term addition didn't work") 122 | } 123 | } 124 | 125 | // Not exhaustive, but shows training and spell checks can run concurrently 126 | func TestConcurrency(t *testing.T) { 127 | cpu := runtime.NumCPU() 128 | runtime.GOMAXPROCS(cpu) 129 | model := NewModel() 130 | 131 | piece := len(sampleEnglish) / cpu 132 | 133 | var wg sync.WaitGroup 134 | // Train concurrently 135 | for i := 0; i < cpu; i++ { 136 | wg.Add(1) 137 | go func(i int) { 138 | begin := i * piece 139 | end := (i+1)*piece - 1 140 | model.Train(sampleEnglish[begin:end]) 141 | wg.Done() 142 | }(i) 143 | } 144 | wg.Wait() 145 | 146 | // Test concurrently 147 | words := []string{"bob", "your", "uncle", "dynmite", "delidate", "bgigest", "bigr", "biger", "arnty", "you're"} 148 | for i := 0; i < cpu; i++ { 149 | wg.Add(1) 150 | go func() { 151 | for _, word := range words { 152 | model.SpellCheck(word) 153 | } 154 | wg.Done() 155 | }() 156 | } 157 | wg.Wait() 158 | } 159 | 160 | func TestColdInit(t *testing.T) { 161 | model := NewModel() 162 | _, err := model.Autocomplete("a") 163 | if err != nil { 164 | t.Errorf("Failed to init and autocomplete: %v", err) 165 | } 166 | } 167 | 168 | // Accuracy test sets come from Peter Norvig's set 169 | // The big.txt file is also from Peter Norvig's set. 
This helps to define a decent 170 | // dictionary, although it is still missing some of the common words in the test sets 171 | // We aim for > 60% correction success at a rate of > 5000Hz (single threaded) 172 | func TestAccuracy(t *testing.T) { 173 | const test2AccuracyThreshold = .60 174 | 175 | tests1 := map[string]string{"access": "acess", "accessing": "accesing", "accommodation": "accomodation acommodation acomodation", "account": "acount", "address": "adress adres", "addressable": "addresable", "arranged": "aranged arrainged", 176 | "arranging": "aranging", "arrangement": "arragment", "articles": "articals", 177 | "aunt": "annt anut arnt", "auxiliary": "auxillary", "available": "avaible", 178 | "awful": "awfall afful", "basically": "basicaly", "beginning": "begining", 179 | "benefit": "benifit", "benefits": "benifits", "between": "beetween", "bicycle": "bicycal bycicle bycycle", "biscuits": "biscits biscutes biscuts bisquits buiscits buiscuts", "built": "biult", 180 | "cake": "cak", "career": "carrer", 181 | "cemetery": "cemetary semetary", "centrally": "centraly", "certain": "cirtain", 182 | "challenges": "chalenges chalenges", "chapter": "chaper chaphter chaptur", 183 | "choice": "choise", "choosing": "chosing", "clerical": "clearical", 184 | "committee": "comittee", "compare": "compair", "completely": "completly", 185 | "consider": "concider", "considerable": "conciderable", "contented": "contenpted contende contended contentid", "curtains": "cartains certans courtens cuaritains curtans curtians curtions", "decide": "descide", "decided": "descided", "definitely": "definately difinately", "definition": "defenition", 186 | "definitions": "defenitions", "description": "discription", "desiccate": "desicate dessicate dessiccate", "diagrammatically": "diagrammaticaally", 187 | "different": "diffrent", "driven": "dirven", "ecstasy": "exstacy ecstacy", 188 | "embarrass": "embaras embarass", "establishing": "astablishing establising", 189 | "experience": 
"experance experiance", "experiences": "experances", "extended": "extented", "extremely": "extreamly", "fails": "failes", "families": "familes", 190 | "february": "febuary", "further": "futher", "gallery": "galery gallary gallerry gallrey", 191 | "hierarchal": "hierachial", "hierarchy": "hierchy", "inconvenient": "inconvienient inconvient inconvinient", "independent": "independant independant", 192 | "initial": "intial", "initials": "inetials inistals initails initals intials", 193 | "juice": "guic juce jucie juise juse", "latest": "lates latets latiest latist", 194 | "laugh": "lagh lauf laught lugh", "level": "leval", 195 | "levels": "levals", "liaison": "liaision liason", "lieu": "liew", "literature": "litriture", "loans": "lones", "locally": "localy", "magnificent": "magnificnet magificent magnifcent magnifecent magnifiscant magnifisent magnificant", 196 | "management": "managment", "meant": "ment", "minuscule": "miniscule", 197 | "minutes": "muinets", "monitoring": "monitering", "necessary": "neccesary necesary neccesary necassary necassery neccasary", "occurrence": "occurence occurence", "often": "ofen offen offten ofton", "opposite": "opisite oppasite oppesite oppisit oppisite opposit oppossite oppossitte", "parallel": "paralel paralell parrallel parralell parrallell", "particular": "particulaur", 198 | "perhaps": "perhapse", "personnel": "personnell", "planned": "planed", "poem": "poame", "poems": "poims pomes", "poetry": "poartry poertry poetre poety powetry", 199 | "position": "possition", "possible": "possable", "pretend": "pertend protend prtend pritend", "problem": "problam proble promblem proplen", 200 | "pronunciation": "pronounciation", "purple": "perple perpul poarple", 201 | "questionnaire": "questionaire", "really": "realy relley relly", "receipt": "receit receite reciet recipt", "receive": "recieve", "refreshment": "reafreshment refreshmant refresment refressmunt", "remember": "rember remeber rememmer rermember", 202 | "remind": "remine remined", 
"scarcely": "scarcly scarecly scarely scarsely", 203 | "scissors": "scisors sissors", "separate": "seperate", 204 | "singular": "singulaur", "someone": "somone", "sources": "sorces", "southern": "southen", "special": "speaical specail specal speical", "splendid": "spledid splended splened splended", "standardizing": "stanerdizing", "stomach": "stomac stomache stomec stumache", "supersede": "supercede superceed", "there": "ther", 205 | "totally": "totaly", "transferred": "transfred", "transportability": "transportibility", "triangular": "triangulaur", "understand": "undersand undistand", 206 | "unexpected": "unexpcted unexpeted unexspected", "unfortunately": "unfortunatly", "unique": "uneque", "useful": "usefull", "valuable": "valubale valuble", 207 | "variable": "varable", "variant": "vairiant", "various": "vairious", 208 | "visited": "fisited viseted vistid vistied", "visitors": "vistors", 209 | "voluntary": "volantry", "voting": "voteing", "wanted": "wantid wonted", 210 | "whether": "wether", "wrote": "rote wote"} 211 | 212 | tests2 := map[string]string{"forbidden": "forbiden", "decisions": "deciscions descisions", 213 | "supposedly": "supposidly", "embellishing": "embelishing", "technique": "tecnique", "permanently": "perminantly", "confirmation": "confermation", 214 | "appointment": "appoitment", "progression": "progresion", "accompanying": "acompaning", "applicable": "aplicable", "regained": "regined", "guidelines": "guidlines", "surrounding": "serounding", "titles": "tittles", "unavailable": "unavailble", "advantageous": "advantageos", "brief": "brif", "appeal": "apeal", "consisting": "consisiting", "clerk": "cleark clerck", "component": "componant", "favourable": "faverable", "separation": "seperation", "search": "serch", "receive": "recieve", "employees": "emploies", "prior": "piror", 215 | "resulting": "reulting", "suggestion": "sugestion", "opinion": "oppinion", 216 | "cancellation": "cancelation", "criticism": "citisum", "useful": "usful", 217 | 
"humour": "humor", "anomalies": "anomolies", "would": "whould", "doubt": "doupt", "examination": "eximination", "therefore": "therefoe", "recommend": "recomend", "separated": "seperated", "successful": "sucssuful succesful", 218 | "apparent": "apparant", "occurred": "occureed", "particular": "paerticulaur", 219 | "pivoting": "pivting", "announcing": "anouncing", "challenge": "chalange", 220 | "arrangements": "araingements", "proportions": "proprtions", "organized": "oranised", "accept": "acept", "dependence": "dependance", "unequalled": "unequaled", "numbers": "numbuers", "sense": "sence", "conversely": "conversly", "provide": "provid", "arrangement": "arrangment", 221 | "responsibilities": "responsiblities", "fourth": "forth", "ordinary": "ordenary", "description": "desription descvription desacription", 222 | "inconceivable": "inconcievable", "data": "dsata", "register": "rgister", 223 | "supervision": "supervison", "encompassing": "encompasing", "negligible": "negligable", "allow": "alow", "operations": "operatins", "executed": "executted", "interpretation": "interpritation", "hierarchy": "heiarky", 224 | "indeed": "indead", "years": "yesars", "through": "throut", "committee": "committe", "inquiries": "equiries", "before": "befor", "continued": "contuned", "permanent": "perminant", "choose": "chose", "virtually": "vertually", "correspondence": "correspondance", "eventually": "eventully", 225 | "lonely": "lonley", "profession": "preffeson", "they": "thay", "now": "noe", 226 | "desperately": "despratly", "university": "unversity", "adjournment": "adjurnment", "possibilities": "possablities", "stopped": "stoped", "mean": "meen", "weighted": "wagted", "adequately": "adequattly", "shown": "hown", 227 | "matrix": "matriiix", "profit": "proffit", "encourage": "encorage", "collate": "colate", "disaggregate": "disaggreagte disaggreaget", "receiving": "recieving reciving", "proviso": "provisoe", "umbrella": "umberalla", "approached": "aproached", "pleasant": "plesent", 
"difficulty": "dificulty", "appointments": "apointments", "base": "basse", "conditioning": "conditining", "earliest": "earlyest", "beginning": "begining", "universally": "universaly", 228 | "unresolved": "unresloved", "length": "lengh", "exponentially": "exponentualy", "utilized": "utalised", "set": "et", "surveys": "servays", 229 | "families": "familys", "system": "sysem", "approximately": "aproximatly", 230 | "their": "ther", "scheme": "scheem", "speaking": "speeking", "repetitive": "repetative", "inefficient": "ineffiect", "geneva": "geniva", "exactly": "exsactly", "immediate": "imediate", "appreciation": "apreciation", "luckily": "luckeley", "eliminated": "elimiated", "believe": "belive", "appreciated": "apreciated", "readjusted": "reajusted", "were": "wer where", "feeling": "fealing", "and": "anf", "false": "faulse", "seen": "seeen", "interrogating": "interogationg", "academically": "academicly", "relatively": "relativly relitivly", 231 | "traditionally": "traditionaly", "studying": "studing", 232 | "majority": "majorty", "build": "biuld", "aggravating": "agravating", 233 | "transactions": "trasactions", "arguing": "aurguing", "sheets": "sheertes", 234 | "successive": "sucsesive sucessive", "segment": "segemnt", "especially": "especaily", "later": "latter", "senior": "sienior", "dragged": "draged", 235 | "atmosphere": "atmospher", "drastically": "drasticaly", "particularly": "particulary", "visitor": "vistor", "session": "sesion", "continually": "contually", "availability": "avaiblity", "busy": "buisy", "parameters": "perametres", "surroundings": "suroundings seroundings", "employed": "emploied", "adequate": "adiquate", "handle": "handel", "means": "meens", 236 | "familiar": "familer", "between": "beeteen", "overall": "overal", "timing": "timeing", "committees": "comittees commitees", "queries": "quies", 237 | "econometric": "economtric", "erroneous": "errounous", "decides": "descides", 238 | "reference": "refereence refference", "intelligence": "inteligence", 
239 | "edition": "ediion ediition", "are": "arte", "apologies": "appologies", 240 | "thermawear": "thermawere thermawhere", "techniques": "tecniques", 241 | "voluntary": "volantary", "subsequent": "subsequant subsiquent", "currently": "curruntly", "forecast": "forcast", "weapons": "wepons", "routine": "rouint", 242 | "neither": "niether", "approach": "aproach", "available": "availble", 243 | "recently": "reciently", "ability": "ablity", "nature": "natior", 244 | "commercial": "comersial", "agencies": "agences", "however": "howeverr", 245 | "suggested": "sugested", "career": "carear", "many": "mony", "annual": "anual", "according": "acording", "receives": "recives recieves", 246 | "interesting": "intresting", "expense": "expence", "relevant": "relavent relevaant", "table": "tasble", "throughout": "throuout", "conference": "conferance", "sensible": "sensable", "described": "discribed describd", 247 | "union": "unioun", "interest": "intrest", "flexible": "flexable", "refered": "reffered", "controlled": "controled", "sufficient": "suficient", 248 | "dissension": "desention", "adaptable": "adabtable", "representative": "representitive", "irrelevant": "irrelavent", "unnecessarily": "unessasarily", 249 | "applied": "upplied", "apologised": "appologised", "these": "thees thess", 250 | "choices": "choises", "will": "wil", "procedure": "proceduer", "shortened": "shortend", "manually": "manualy", "disappointing": "dissapoiting", 251 | "excessively": "exessively", "comments": "coments", "containing": "containg", 252 | "develop": "develope", "credit": "creadit", "government": "goverment", 253 | "acquaintances": "aquantences", "orientated": "orentated", "widely": "widly", 254 | "advise": "advice", "difficult": "dificult", "investigated": "investegated", 255 | "bonus": "bonas", "conceived": "concieved", "nationally": "nationaly", 256 | "compared": "comppared compased", "moving": "moveing", "necessity": "nessesity", "opportunity": "oppertunity oppotunity opperttunity", "thoughts": 
"thorts", "equalled": "equaled", "variety": "variatry", "analysis": "analiss analsis analisis", "patterns": "pattarns", "qualities": "quaties", "easily": "easyly", "organization": "oranisation oragnisation", "the": "thw hte thi", 257 | "corporate": "corparate", "composed": "compossed", "enormously": "enomosly", 258 | "financially": "financialy", "functionally": "functionaly", "discipline": "disiplin", "announcement": "anouncement", "progresses": "progressess", 259 | "except": "excxept", "recommending": "recomending", "mathematically": "mathematicaly", "source": "sorce", "combine": "comibine", "input": "inut", 260 | "careers": "currers carrers", "resolved": "resoved", "demands": "diemands", 261 | "unequivocally": "unequivocaly", "suffering": "suufering", "immediately": "imidatly imediatly", "accepted": "acepted", "projects": "projeccts", 262 | "necessary": "necasery nessasary nessisary neccassary", "journalism": "journaism", "unnecessary": "unessessay", "night": "nite", "output": "oputput", "security": "seurity", "essential": "esential", "beneficial": "benificial benficial", "explaining": "explaning", "supplementary": "suplementary", "questionnaire": "questionare", "employment": "empolyment", 263 | "proceeding": "proceding", "decision": "descisions descision", "per": "pere", 264 | "discretion": "discresion", "reaching": "reching", "analysed": "analised", 265 | "expansion": "expanion", "although": "athough", "subtract": "subtrcat", 266 | "analysing": "aalysing", "comparison": "comparrison", "months": "monthes", 267 | "hierarchal": "hierachial", "misleading": "missleading", "commit": "comit", 268 | "auguments": "aurgument", "within": "withing", "obtaining": "optaning", 269 | "accounts": "acounts", "primarily": "pimarily", "operator": "opertor", 270 | "accumulated": "acumulated", "extremely": "extreemly", "there": "thear", 271 | "summarys": "sumarys", "analyse": "analiss", "understandable": "understadable", "safeguard": "safegaurd", "consist": "consisit", 272 | 
"declarations": "declaratrions", "minutes": "muinutes muiuets", "associated": "assosiated", "accessibility": "accessability", "examine": "examin", 273 | "surveying": "servaying", "politics": "polatics", "annoying": "anoying", 274 | "again": "agiin", "assessing": "accesing", "ideally": "idealy", "scrutinized": "scrutiniesed", "simular": "similar", "personnel": "personel", "whereas": "wheras", "when": "whn", "geographically": "goegraphicaly", "gaining": "ganing", "requested": "rquested", "separate": "seporate", "students": "studens", "prepared": "prepaired", "generated": "generataed", "graphically": "graphicaly", "suited": "suted", "variable": "varible vaiable", "building": "biulding", "required": "reequired", "necessitates": "nessisitates", 275 | "together": "togehter", "profits": "proffits"} 276 | 277 | model := NewModel() 278 | model.SetThreshold(1) // This ensures a more complete dictionary at the expense of size/speed. 279 | model.Train(sampleEnglish) 280 | 281 | // Look at test sets 282 | // SET 1 283 | count, correct, incorrect := 0, 0, 0 284 | t2 := time.Now() 285 | for target, testwords := range tests1 { 286 | testwordarr := strings.Split(testwords, " ") 287 | for _, testword := range testwordarr { 288 | if model.SpellCheck(testword) == target { 289 | correct++ 290 | } else { 291 | incorrect++ 292 | } 293 | count++ 294 | } 295 | } 296 | t3 := time.Now() 297 | 298 | fmt.Printf("Spell test1 count: %v, Correct: %v, Incorrect: %v, Ratio: %f, Total time: %v \n\n", count, correct, incorrect, float32(correct)/float32(count), t3.Sub(t2)) 299 | 300 | successrate := float32(correct) / float32(count) 301 | if successrate < 0.60 { 302 | t.Errorf("Unacceptable correction rate for set test1 (%v). e.g. below 60 percent.", successrate) 303 | } 304 | 305 | // 5000Hz is our aim 306 | maxtime := time.Duration(count) * 200 * time.Microsecond 307 | 308 | if t3.Sub(t2) > maxtime { 309 | t.Errorf("Unacceptable completion time for set test1 (%v). e.g. 
%v corrections took greater than %v.", t3.Sub(t2), count, maxtime) 310 | } 311 | 312 | // SET 2 313 | count, correct, incorrect = 0, 0, 0 314 | t2 = time.Now() 315 | for target, testwords := range tests2 { 316 | testwordarr := strings.Split(testwords, " ") 317 | for _, testword := range testwordarr { 318 | if model.SpellCheck(testword) == target { 319 | correct++ 320 | } else { 321 | incorrect++ 322 | } 323 | count++ 324 | } 325 | } 326 | t3 = time.Now() 327 | 328 | fmt.Printf("Spell test2 count: %v, Correct: %v, Incorrect: %v, Ratio: %f, Total time: %v \n\n", count, correct, incorrect, float32(correct)/float32(count), t3.Sub(t2)) 329 | 330 | successrate = float32(correct) / float32(count) 331 | if successrate < test2AccuracyThreshold { 332 | t.Errorf("Unacceptable correction rate for set test2 (%v). e.g. below %v.", successrate, test2AccuracyThreshold) 333 | } 334 | 335 | // 5000Hz is our aim 336 | maxtime = time.Duration(count) * 200 * time.Microsecond 337 | 338 | if t3.Sub(t2) > maxtime { 339 | t.Errorf("Unacceptable completion time for set test2 (%v). e.g. 
%v corrections took greater than %v", t3.Sub(t2), count, maxtime)
	}

}

// Quick test to make sure we're picking up the right stuff
func TestAutocomplete(t *testing.T) {
	model := NewModel()
	model.Train(sampleEnglish)
	out, err := model.Autocomplete("accoun")
	if err != nil {
		t.Errorf("Autocomplete() returned an error: %s", err)
	}
	// Every returned completion must be one of these known terms.
	expected := map[string]bool{
		"account":    true,
		"accountant": true,
		"accounts":   true,
		"accounted":  true,
	}
	for _, got := range out {
		if seen, ok := expected[got]; !ok {
			t.Errorf("Expected to find %v (%v), but didn't", got, seen)
		}
	}
}

// Test to ensure query training begins to dominate over
// corpus training when autocompleting
func TestAutocompleteFromQueries(t *testing.T) {
	model := NewModel()
	// Changing defaults for testing only, this is not advisable on production
	model.SetThreshold(1)
	model.SetDivergenceThreshold(1)

	model.Train([]string{"every", "every", "every", "every", "every", "every", "everest", "eveready", "eveready", "everything", "everything"})
	// Simulate queries: "everest" twice, "eveready" once.
	for _, q := range []string{"everest", "everest", "eveready"} {
		model.TrainQuery(q)
	}

	out, err := model.Autocomplete("eve")
	if err != nil {
		t.Errorf("Autocomplete() returned an error: %s", err)
	}
	// Query-trained terms must rank ahead of purely corpus-trained ones.
	for i, want := range []string{"everest", "eveready"} {
		if out[i] != want {
			t.Errorf("Autocomplete failed to account for query training")
		}
	}
}

// TestLoadOldModel ensures the legacy on-disk model format still loads.
func TestLoadOldModel(t *testing.T) {
	if _, err := Load("data/test.dict"); err != nil {
		t.Errorf("Couldn't load old model format: %v", err)
	}
}

// TestEditsMulti pins the exact multi-depth edit expansion of a term.
func TestEditsMulti(t *testing.T) {
	model := NewModel()
	got :=
model.EditsMulti("elephant", model.Depth) 399 | want := []string{ 400 | "lephant", "eephant", "elphant", "elehant", "elepant", "elephnt", "elephat", "elephan", "elephant", 401 | "ephant", "lphant", "lehant", "lepant", "lephnt", "lephat", "lephan", "lephant", 402 | "ephant", "ephant", "eehant", "eepant", "eephnt", "eephat", "eephan", "eephant", 403 | "lphant", "ephant", "elhant", "elpant", "elphnt", "elphat", "elphan", "elphant", 404 | "lehant", "eehant", "elhant", "eleant", "elehnt", "elehat", "elehan", "elehant", 405 | "lepant", "eepant", "elpant", "eleant", "elepnt", "elepat", "elepan", "elepant", 406 | "lephnt", "eephnt", "elphnt", "elehnt", "elepnt", "elepht", "elephn", "elephnt", 407 | "lephat", "eephat", "elphat", "elehat", "elepat", "elepht", "elepha", "elephat", 408 | "lephan", "eephan", "elphan", "elehan", "elepan", "elephn", "elepha", "elephan", 409 | "lephant", "eephant", "elphant", "elehant", "elepant", "elephnt", "elephat", "elephan", "elephant"} 410 | 411 | if !reflect.DeepEqual(got, want) { /* Order and duplicates are significant: DeepEqual compares the exact slice, so any reordering of EditsMulti output fails this test. */ 412 | t.Errorf("EditsMulti didn't match:\nGot: %v\nWant: %v", got, want) 413 | } 414 | } 415 | 416 | var result []string // prevent the benchmark from getting optimized out 417 | 418 | func BenchmarkEditsMulti(b *testing.B) { /* Assigning into the package-level sink keeps the call from being dead-code eliminated. */ 419 | model := NewModel() 420 | for i := 0; i < b.N; i++ { 421 | result = model.EditsMulti("elephant", model.Depth) 422 | } 423 | } 424 |