# Nick Sweeting 2014
# python misspeller for disqus
"""Generate random misspellings of words (vowel swaps and repeated letters).

A word is represented as a "compact options list": one entry per letter
position, where an entry is either a plain string (a fixed choice) or a list
of alternative strings for that position.  `flatten` expands such a list into
every concrete spelling.
"""

from itertools import product
import random

vowels = {"a", "e", "i", "o", "u", "y"}


def get_inflations(word, chance=0.4, max_repeat=3):
    """Return a compact options list where random letters may be repeated.

    Each letter is independently selected with probability `chance`; a
    selected letter is replaced by the options
    [letter, letter*2, ..., letter*max_repeat].  Unselected letters stay as
    plain strings.

    'hi' may become ['h', ['i', 'ii', 'iii']]
    """
    options = []
    for letter in word:
        if random.random() < chance:
            # Start at 1 repetition: a factor of 0 would delete the letter
            # and make the whole product empty.
            options.append([letter * count for count in range(1, max_repeat + 1)])
        else:
            options.append(letter)
    return options


def get_vowelswaps(word):
    """Return a compact options list where every vowel may be any vowel.

    'hi' becomes ['h', ['a', 'e', 'i', 'o', 'u', 'y']]
    """
    swap_choices = sorted(vowels)  # sorted so the output is deterministic
    return [list(swap_choices) if letter in vowels else letter for letter in word]


def flatten(options):
    """Expand a compact options list into the set of all concrete strings.

    ['h', ['i', 'ii', 'iii']] becomes {'hi', 'hii', 'hiii'}
    """
    # A bare string is ONE choice for its position.  Without wrapping it,
    # product() would iterate it character by character, so 'iii' would
    # silently collapse back to 'i'.
    choices = [[option] if isinstance(option, str) else option for option in options]
    return {''.join(combo) for combo in product(*choices)}


def misspell(word):
    """Return a randomly misspelled version of the inputted word.

    Falls back to returning `word` unchanged only when no alternative
    spelling could be generated (e.g. an empty word).
    """
    candidates = flatten(get_vowelswaps(word)) | flatten(get_inflations(word))
    candidates.discard(word)  # the correct spelling is not a misspelling
    if not candidates:
        return word
    # Sort first so results are reproducible under random.seed().
    return random.choice(sorted(candidates))


if __name__ == "__main__":

    words = ["fishy", "monster", "apple", "saint", "potato", "moth"]
    for word in words:
        print(misspell(word))
-------------------------------------------------------------------------------- 1 | # Simple Python Spell-Checker 2 | 3 | ## Quickstart 4 | 5 | ```bash 6 | git clone https://github.com/pirate/spellchecker 7 | cd spellchecker/ 8 | python spellchecker.py 9 | 10 | # type interactively to get suggestions 11 | Total Word Set: 285750 12 | Model Precision: 1.62249168854 13 | >manster 14 | [('monster', 2), ('minster', 2)] 15 | 16 | # or try some preset misspelled words 17 | python misspeller.py | python spellchecker.py 18 | ``` 19 | You can edit `spellchecker.py` and add more files to the training list to increase the word-frequency model precision. 20 | 21 | ## Background 22 | 23 | 24 | Peter Norvig wrote an amazing article titled [How to Write a Spelling Corrector](http://norvig.com/spell-correct.html) detailing a basic approach to this deceptively simple problem. 25 | I had to write a spellchecker as an interview question for [Disqus](https://disqus.com/), and this repo details my efforts. 26 | 27 | The core code that I borrowed from [Darius Bacon](https://github.com/darius) & Norvig is this beautiful block: 28 | ```python 29 | def variants(word): 30 | """get all possible variants for a word""" 31 | splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] 32 | deletes = [a + b[1:] for a, b in splits if b] 33 | transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1] 34 | replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b] 35 | inserts = [a + c + b for a, b in splits for c in alphabet] 36 | return set(deletes + transposes + replaces + inserts) 37 | ``` 38 | 39 | Of course, that wasn't my only code; I added a lot more on top of Norvig's implementation.
#!/usr/bin/env python
# Nick Sweeting 2014
# python spellchecker
"""Interactive spell-checker built on Norvig-style edit variants.

Candidate spellings are generated by collapsing repeated letters, swapping
vowels and applying single/double edits, then filtered against a set of real
words and ranked by word frequency and Hamming distance.

The code runs unchanged on Python 2.7 and Python 3.
"""

import collections
import re
import sys
from itertools import groupby, product

VERBOSE = True
SYSTEM_DICTIONARY = '/usr/share/dict/words'
vowels = set('aeiouy')
alphabet = set('abcdefghijklmnopqrstuvwxyz')

# Sentinel returned by suggestions() when nothing matches.
NO_SUGGESTION = "NO SUGGESTION"

_WORD_RE = re.compile('[a-z]+')


### IO

def log(*args):
    """Print all arguments concatenated (no separator) when VERBOSE is set."""
    if VERBOSE:
        print(''.join(str(x) for x in args))


def words(text):
    """filter body of text for words (lower-cased runs of a-z)"""
    return _WORD_RE.findall(text.lower())


def train(text, model=None):
    """generate or update a word model (dictionary of word:frequency)

    A new defaultdict is created when `model` is None; otherwise `model` is
    updated in place and returned.
    """
    if model is None:
        model = collections.defaultdict(int)
    for word in words(text):
        # .get() keeps this working when a plain dict is passed in.
        model[word] = model.get(word, 0) + 1
    return model


def _read_text(path):
    """Return the contents of `path` as native text, closing the file."""
    with open(path, 'rb') as handle:
        data = handle.read()
    if not isinstance(data, str):
        # Python 3 gives bytes; decode leniently since only a-z is used.
        data = data.decode('utf-8', 'ignore')
    return data


def train_from_files(file_list, model=None):
    """Train (or update) a word model from every file path in `file_list`.

    Always returns a model, even when `file_list` is empty.
    Raises IOError/OSError if a file cannot be read.
    """
    if model is None:
        model = collections.defaultdict(int)
    for path in file_list:
        model = train(_read_text(path), model)
    return model


### UTILITY FUNCTIONS

def numberofdupes(string, idx):
    """return the number of times in a row the letter at index idx is duplicated"""
    # "abccdefgh", 2 returns 1
    initial_idx = idx
    last = string[idx]
    while idx + 1 < len(string) and string[idx + 1] == last:
        idx += 1
    return idx - initial_idx


def hamming_distance(word1, word2):
    """Return the number of differing positions between two words.

    Positions past the end of the shorter word each count as one difference,
    so the result is always an integer (0 for identical words).
    """
    mismatches = sum(1 for a, b in zip(word1, word2) if a != b)
    return mismatches + abs(len(word1) - len(word2))


def frequency(word, word_model):
    """Return how often `word` occurs in `word_model` (0 if unknown/no model)."""
    if word_model is None:
        return 0
    return word_model.get(word, 0)


### POSSIBILITIES ANALYSIS

def variants(word):
    """get all possible variants for a word"""
    splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    deletes = [a + b[1:] for a, b in splits if b]
    transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
    replaces = [a + c + b[1:] for a, b in splits for c in alphabet if b]
    inserts = [a + c + b for a, b in splits for c in alphabet]
    return set(deletes + transposes + replaces + inserts)


def double_variants(word):
    """get variants for the variants for a word"""
    return set(s for w in variants(word) for s in variants(w))


def reductions(word, max_repeat=3):
    """yield all variations of the word made by removing duplicate letters

    Every run of a repeated letter is replaced by 1..min(run, max_repeat)
    copies of that letter.  The default of 3 reflects that English has no
    4-letter repetitions.
    """
    # 'hiii' becomes [['h'], ['i', 'ii', 'iii']]
    options = []
    for letter, run in groupby(word):
        run_length = sum(1 for _ in run)
        keep = min(run_length, max_repeat)
        options.append([letter * count for count in range(1, keep + 1)])

    # [['h'], ['i','ii','iii']] becomes 'hi','hii','hiii'
    for combo in product(*options):
        yield ''.join(combo)


def vowelswaps(word):
    """yield all variations of the word made by swapping vowels"""
    # 'hi' becomes ['h', ['a', 'e', 'i', 'o', 'u', 'y']]
    swap_choices = sorted(vowels)  # sorted for a deterministic order
    options = [swap_choices if letter in vowels else letter for letter in word]

    for combo in product(*options):
        yield ''.join(combo)


def both(word):
    """permute all combinations of reductions and vowelswaps"""
    for reduction in reductions(word):
        for variant in vowelswaps(reduction):
            yield variant


### POSSIBILITY CHOOSING

def suggestions(word, real_words, short_circuit=True):
    """get best spelling suggestion for word
    return on first match if short_circuit is true, otherwise collect all possible suggestions

    Returns a set of real words, or {"NO SUGGESTION"} when nothing matches.
    """
    word = word.lower()
    if word in real_words:                  # caps     "inSIDE" => "inside"
        return {word}

    # Cheapest and most specific generators first.
    generators = (
        reductions,         # repeats  "jjoobbb" => "job"
        vowelswaps,         # vowels   "weke" => "wake"
        variants,           # other    "nonster" => "monster"
        both,               # both     "CUNsperrICY" => "conspiracy"
        double_variants,    # other    "nmnster" => "manster"
    )

    if short_circuit:
        # much faster, but less accurate in some cases
        for generate in generators:
            matches = set(generate(word)).intersection(real_words)
            if matches:
                return matches
        return {NO_SUGGESTION}

    candidates = set()
    for generate in generators:
        candidates.update(generate(word))
    return candidates.intersection(real_words) or {NO_SUGGESTION}


def best(inputted_word, suggestions, word_model=None):
    """choose the best suggestion in a list based on lowest hamming distance from original word, or based on frequency if word_model is provided

    Ties are broken by hamming distance and then alphabetically, so the
    result is deterministic.  Returns '' when there are no suggestions.
    """
    candidates = list(suggestions)
    if not candidates:
        return ''

    def by_distance(candidate):
        # lower is better
        return (hamming_distance(inputted_word, candidate), candidate)

    def by_frequency(candidate):
        # higher frequency is better, hence the negation
        return (-frequency(candidate, word_model),) + by_distance(candidate)

    hamming_sorted = sorted(candidates, key=by_distance)[:10]    # take the top 10
    freq_sorted = sorted(candidates, key=by_frequency)[:10]      # take the top 10
    log('FREQ ', freq_sorted)
    log('HAM ', hamming_sorted)

    if word_model is None:
        return hamming_sorted[0]
    return freq_sorted[0]


def main():
    """Train the model, then read words from stdin and print suggestions."""
    # init the word frequency model with a simple list of all possible words
    word_model = train(_read_text(SYSTEM_DICTIONARY))
    real_words = set(word_model)

    # add other texts here, they are used to train the word frequency model
    texts = [
        'sherlockholmes.txt',
        'lemmas.txt',
    ]
    # enhance the model with real bodies of english so we know which words are more common than others
    for path in texts:
        try:
            word_model = train_from_files([path], word_model)
        except IOError:
            # Training texts are optional; without them every word just
            # keeps its dictionary frequency.
            log('Skipping unreadable training text: ', path)

    log('Total Word Set: ', len(word_model))
    if word_model:
        log('Model Precision: %s' % (float(sum(word_model.values())) / len(word_model)))

    try:
        read_line = raw_input   # Python 2
    except NameError:
        read_line = input       # Python 3

    try:
        while True:
            word = str(read_line('>'))

            possibilities = suggestions(word, real_words, short_circuit=False)
            short_circuit_result = suggestions(word, real_words, short_circuit=True)
            if VERBOSE:
                print([(x, frequency(x, word_model)) for x in possibilities])
                print(best(word, possibilities, word_model))
                print('---')
            print([(x, frequency(x, word_model)) for x in short_circuit_result])
            if VERBOSE:
                print(best(word, short_circuit_result, word_model))

    except (EOFError, KeyboardInterrupt):
        sys.exit(0)


if __name__ == '__main__':
    main()