├── misspelled_words
├── misspeller.py
├── README.md
└── spellchecker.py


/misspelled_words:
--------------------------------------------------------------------------------
1 | feshe
2 | menstir
3 | epply
4 | syint
5 | putyti
6 | mith
7 | 


--------------------------------------------------------------------------------
/misspeller.py:
--------------------------------------------------------------------------------
 1 | # Nick Sweeting 2014
 2 | # python misspeller for disqus
 3 | 
 4 | from itertools import product
 5 | import random
 6 | 
# Letters that get_vowelswaps() treats as interchangeable vowels ("y" included).
vowels = {"a","e","i","o","u","y"}
 8 | 
 9 | def get_inflations(word):
10 |     """return flat option list of all possible variations of the word by adding duplicate letters"""
11 |     word = list(word)
12 |     for idx, l in enumerate(word):
13 |         if random.random() * 100 > 60:
14 |             word[idx] = word[idx]*int(random.random()*10)
15 |     # ['h','i', 'i', 'i'] becomes ['h', ['i', 'ii', 'iii']]
16 |     return word
17 | 
def get_vowelswaps(word):
    """Build an option list in which every vowel of `word` may be any vowel.

    Elements that are already lists are left untouched; every other element
    found in `vowels` is replaced by a fresh list of all vowels.
    """
    # e.g. ['h', 'i'] -> ['h', ['a', 'e', 'i', 'o', 'u', 'y']] (order follows the set)
    return [
        list(vowels) if type(item) is not list and item in vowels else item
        for item in word
    ]
29 | 
def flatten(options):
    """Expand a compact option list into the set of all spelled-out strings.

    Every element of `options` is iterated for its alternatives, and one
    alternative per element is joined into each resulting string.
    """
    # ['h', ['i', 'ii', 'iii']] -> {'hi', 'hii', 'hiii'}
    return {''.join(combo) for combo in product(*options)}
37 | 
def misspell(word):
    """Pick one string at random from the vowel-swapped and inflated forms of `word`."""
    swapped = flatten(get_vowelswaps(word))
    inflated = flatten(get_inflations(word))
    candidates = list(swapped | inflated)
    return random.choice(candidates)
42 | 
if __name__ == "__main__":
    # Print one random misspelling per sample word, one per line.
    words = [
        "fishy",
        "monster",
        "apple",
        "saint",
        "potato",
        "moth",
    ]
    for word in words:
        print(misspell(word))
48 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Simple Python Spell-Checker
 2 | 
 3 | ## Quickstart
 4 | 
 5 | ```bash
 6 | git clone https://github.com/pirate/spellchecker
 7 | cd spellchecker/
 8 | python spellchecker.py
 9 | 
10 | # type interactively to get suggestions
11 | Total Word Set: 285750
12 | Model Precision: 1.62249168854
13 | >manster
14 | [('monster', 2), ('minster', 2)]
15 | 
16 | # or try some preset misspelled words
17 | python misspeller.py | python spellchecker.py 
18 | ```
19 | You can edit `spellchecker.py` and add more files to the training list to increase the word-frequency model precision.
20 | 
21 | ## Background
22 | 
23 | 
24 | Peter Norvig wrote an amazing article titled [How to Write a Spelling Corrector](http://norvig.com/spell-correct.html) detailing a basic approach to this deceptively simple problem.
25 | I had to write a spellchecker as an interview question for [Disqus](https://disqus.com/), and this repo details my efforts.
26 | 
27 | The core code that I borrow from [Darius Bacon](https://github.com/darius) & Norvig is this beautiful block:
28 | ```python
29 | def variants(word):
30 |     """get all possible variants for a word"""
31 |     splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
32 |     deletes    = [a + b[1:] for a, b in splits if b]
33 |     transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
34 |     replaces   = [a + c + b[1:] for a, b in splits for c in alphabet if b]
35 |     inserts    = [a + c + b for a, b in splits for c in alphabet]
36 |     return set(deletes + transposes + replaces + inserts)
37 | ```
38 | 
39 | Of course, that wasn't my only code; I added a lot more on top of Norvig's implementation.
40 | 
41 | My additions are:
42 |   - short-circuiting options for faster checking
43 |   - hamming distance and word-frequency model based chooser for suggestions
44 |   - double word variants for catching more complex multi-typos
45 |   - vowel-swapping detection
46 |   - a reductions function to efficiently store word variants like `monster: ['m',['o', 'a'], 'n', 's', 't', 'e', 'r']`
47 | 
48 | ## Faster Algorithm
49 | 
50 | A faster spellchecking algorithm exists, see: https://github.com/wolfgarbe/SymSpell
51 | 


--------------------------------------------------------------------------------
/spellchecker.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python2
  2 | # Nick Sweeting 2014
  3 | # python spellchecker
  4 | 
  5 | import re
  6 | import collections
  7 | from itertools import product, imap
  8 | 
# When true, log() prints its arguments and the interactive loop shows extra diagnostics.
VERBOSE = True
# Word list read at startup to seed the word model.
SYSTEM_DICTIONARY = '/usr/share/dict/words'
# Letters vowelswaps() treats as interchangeable ('y' included).
vowels = set('aeiouy')
# Letters variants() uses when generating replacements and insertions.
alphabet = set('abcdefghijklmnopqrstuvwxyz')
 13 | 
 14 | 
 15 | ### IO
 16 | 
 17 | def log(*args):
 18 |     if VERBOSE: print ''.join([str(x) for x in args])
 19 | 
 20 | def words(text):
 21 |     """filter body of text for words"""
 22 |     return re.findall('[a-z]+', text.lower())
 23 | 
 24 | def train(text, model=None):
 25 |     """generate or update a word model (dictionary of word:frequency)"""
 26 |     model = collections.defaultdict(lambda: 0) if model is None else model
 27 |     for word in words(text):
 28 |         model[word] += 1
 29 |     return model
 30 | 
 31 | def train_from_files(file_list, model=None):
 32 |     for f in file_list:
 33 |         model = train(file(f).read(), model)
 34 |     return model
 35 | 
 36 | ### UTILITY FUNCTIONS
 37 | 
 38 | def numberofdupes(string, idx):
 39 |     """return the number of times in a row the letter at index idx is duplicated"""
 40 |     # "abccdefgh", 2  returns 1
 41 |     initial_idx = idx
 42 |     last = string[idx]
 43 |     while idx+1 < len(string) and string[idx+1] == last:
 44 |         idx += 1
 45 |     return idx-initial_idx
 46 | 
 47 | def hamming_distance(word1, word2):
 48 |     if word1 == word2:
 49 |         return 0
 50 |     dist = sum(imap(str.__ne__, word1[:len(word2)], word2[:len(word1)]))
 51 |     dist = max([word2, word1]) if not dist else dist+abs(len(word2)-len(word1))
 52 |     return dist
 53 | 
 54 | def frequency(word, word_model):
 55 |     return word_model.get(word, 0)
 56 | 
 57 | ### POSSIBILITIES ANALYSIS
 58 | 
 59 | def variants(word):
 60 |     """get all possible variants for a word"""
 61 |     splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
 62 |     deletes    = [a + b[1:] for a, b in splits if b]
 63 |     transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
 64 |     replaces   = [a + c + b[1:] for a, b in splits for c in alphabet if b]
 65 |     inserts    = [a + c + b for a, b in splits for c in alphabet]
 66 |     return set(deletes + transposes + replaces + inserts)
 67 | 
 68 | def double_variants(word):
 69 |     """get variants for the variants for a word"""
 70 |     return set(s for w in variants(word) for s in variants(w))
 71 | 
 72 | def reductions(word):
 73 |     """return flat option list of all possible variations of the word by removing duplicate letters"""
 74 |     word = list(word)
 75 |     # ['h','i', 'i', 'i'] becomes ['h', ['i', 'ii', 'iii']]
 76 |     for idx, l in enumerate(word):
 77 |         n = numberofdupes(word, idx)
 78 |         # if letter appears more than once in a row
 79 |         if n:
 80 |             # generate a flat list of options ('hhh' becomes ['h','hh','hhh'])
 81 |             flat_dupes = [l*(r+1) for r in xrange(n+1)][:3] # only take up to 3, there are no 4 letter repetitions in english
 82 |             # remove duplicate letters in original word
 83 |             for _ in range(n):
 84 |                 word.pop(idx+1)
 85 |             # replace original letter with flat list
 86 |             word[idx] = flat_dupes
 87 | 
 88 |     # ['h',['i','ii','iii']] becomes 'hi','hii','hiii'
 89 |     for p in product(*word):
 90 |         yield ''.join(p)
 91 | 
 92 | def vowelswaps(word):
 93 |     """return flat option list of all possible variations of the word by swapping vowels"""
 94 |     word = list(word)
 95 |     # ['h','i'] becomes ['h', ['a', 'e', 'i', 'o', 'u', 'y']]
 96 |     for idx, l in enumerate(word):
 97 |         if type(l) == list:
 98 |             pass                        # dont mess with the reductions
 99 |         elif l in vowels:
100 |             word[idx] = list(vowels)    # if l is a vowel, replace with all possible vowels
101 | 
102 |     # ['h',['i','ii','iii']] becomes 'hi','hii','hiii'
103 |     for p in product(*word):
104 |         yield ''.join(p)
105 | 
def both(word):
    """Yield the vowel-swapped forms of every reduction of `word`."""
    for deduped in reductions(word):
        swapped = vowelswaps(deduped)
        for candidate in swapped:
            yield candidate
111 | 
112 | ### POSSIBILITY CHOOSING
113 | 
def suggestions(word, real_words, short_circuit=True):
    """Return the set of known words that `word` could be a misspelling of.

    The lowercased word itself is returned (as a one-element set) when it is in
    `real_words`.  Otherwise candidate generators are tried in this order:
    reductions, vowelswaps, variants, both, double_variants.  With
    `short_circuit` true the matches of the first generator that produces any
    are returned (faster, but less accurate in some cases); with it false the
    matches of all generators combined are returned.  {"NO SUGGESTION"} is
    returned when nothing matches.
    """
    word = word.lower()

    # caps: "inSIDE" => "inside"
    exact = {word} & real_words
    if exact:
        return exact

    generators = (
        reductions,       # repeats  "jjoobbb" => "job"
        vowelswaps,       # vowels   "weke" => "wake"
        variants,         # one edit "nonster" => "monster"
        both,             # repeats and vowels together
        double_variants,  # two edits
    )
    if short_circuit:
        for generate in generators:
            found = set(generate(word)) & real_words
            if found:
                return found
    else:
        pool = set()
        for generate in generators:
            pool.update(generate(word))
        found = pool & real_words
        if found:
            return found
    return {"NO SUGGESTION"}
130 | 
def best(inputted_word, suggestions, word_model=None):
    """Return the single best suggestion for `inputted_word`.

    Without a `word_model` the suggestion with the lowest hamming distance
    from `inputted_word` wins.  With a `word_model` the most frequent
    suggestion wins and hamming distance breaks ties.  Returns '' when
    `suggestions` is empty.  The top-ten rankings are reported through log().
    """
    suggestions = list(suggestions)
    if not suggestions:
        return ''

    def distance(candidate):
        return hamming_distance(inputted_word, candidate)  # lower is better

    # sorted() is ascending, so the best candidates come first and [:10]
    # keeps the top ten (the original [10:] slice threw exactly those away).
    hamming_sorted = sorted(suggestions, key=distance)[:10]

    if word_model is None:
        # No frequencies available: rank on distance alone instead of
        # crashing on None.get().
        log('HAM ', hamming_sorted)
        return hamming_sorted[0]

    def rank(candidate):
        # negated so that a higher frequency sorts first
        return (-frequency(candidate, word_model), distance(candidate))

    freq_sorted = sorted(suggestions, key=rank)[:10]
    log('FREQ ', freq_sorted)
    log('HAM ', hamming_sorted)
    return freq_sorted[0]
151 | 
if __name__ == '__main__':
    # Seed the word frequency model with the system word list, so every
    # dictionary word is known.  `with` closes the file once it has been read.
    with open(SYSTEM_DICTIONARY) as dictionary_file:
        word_model = train(dictionary_file.read())
    real_words = set(word_model)

    # add other texts here, they are used to train the word frequency model
    texts = [
        'sherlockholmes.txt',
        'lemmas.txt',
    ]
    # enhance the model with real bodies of english so we know which words are more common than others
    word_model = train_from_files(texts, word_model)

    log('Total Word Set: ', len(word_model))
    log('Model Precision: %s' % (float(sum(word_model.values()))/len(word_model)))
    try:
        while True:
            word = str(raw_input('>'))

            possibilities = suggestions(word, real_words, short_circuit=False)
            short_circuit_result = suggestions(word, real_words, short_circuit=True)
            # frequency() is used for the lookups because indexing the
            # defaultdict model would insert unknown entries such as
            # "NO SUGGESTION" into it.
            if VERBOSE:
                print([(x, frequency(x, word_model)) for x in possibilities])
                print(best(word, possibilities, word_model))
                print('---')
            print([(x, frequency(x, word_model)) for x in short_circuit_result])
            if VERBOSE:
                print(best(word, short_circuit_result, word_model))

    except (EOFError, KeyboardInterrupt):
        # end of piped input or Ctrl-C: leave quietly
        exit(0)
183 | 


--------------------------------------------------------------------------------