├── MANIFEST.in ├── spelchek ├── __init__.py └── checker.py ├── .gitignore ├── LICENSE ├── setup.py └── README.rst /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include spelchek/corpus.txt 2 | -------------------------------------------------------------------------------- /spelchek/__init__.py: -------------------------------------------------------------------------------- 1 | from .checker import * 2 | __version__ = 0.54 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Steve Theodore 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | import os 3 | import sys 4 | 5 | _here = os.path.abspath(os.path.dirname(__file__)) 6 | 7 | if sys.version_info[0] < 3: 8 | with open(os.path.join(_here, 'README.rst')) as f: 9 | long_description = f.read() 10 | else: 11 | with open(os.path.join(_here, 'README.rst'), encoding='utf-8') as f: 12 | long_description = f.read() 13 | 14 | 15 | setup(name='spelchek', 16 | version='0.54', 17 | description='A pure-python Bayesian spellchecker', 18 | long_description=long_description, 19 | url='https://github.com/theodox/spelchek', 20 | author='Steve Theodore', 21 | author_email='steve@theodox.com', 22 | license='MIT', 23 | packages=['spelchek'], 24 | include_package_data=True, 25 | classifiers=[ 26 | 'Development Status :: 5 - Production/Stable', 27 | 'Intended Audience :: Developers', 28 | 'Topic :: Text Processing', 29 | 'Programming Language :: Python :: 2.7', 30 | 'Programming Language :: Python :: 3', 31 | 'Programming Language :: Python :: Implementation' 32 | ], 33 | install_requires=[]) 34 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | spelchek 2 | -------- 3 | 4 | A cheap-ass, pure-python spellchecker based on `Peter Norvig's Python 5 | Bayes demo `__ All the interesting 6 | work is his. 
7 | 8 | The interesting external methods are 9 | 10 | - ``known()`` filters a list of words and returns only those in the 11 | dictionary, 12 | - ``correct()`` returns the best guess for the supplied word 13 | - ``guesses()`` returns all guesses for the supplied word 14 | - ``add()`` adds a word to the dictionary, with an optional priority 15 | value 16 | 17 | So simple uses would be something like 18 | 19 | :: 20 | 21 | import spelchek 22 | print spelchek.correct('eaxmple') 23 | # 'example' 24 | 25 | The current corpus of words includes about 75,000 entries. It does not 26 | include punctuation such as hyphens, apostrophes or spaces. The module also 27 | supports optional user-supplied dictionaries; see the documentation of 28 | ``spelchek/checker.py`` for details. 29 | 30 | Important Caveat 31 | ================= 32 | 33 | The heart of a spell checker is the dictionary, and the dictionary here 34 | is cadged together out of a bunch of free online sources. No real effort 35 | has been made to check it for accuracy, and although it's trivially 36 | correct, with several tens of thousands of words involved, errors are 37 | pretty much inevitable (if you find one, feel free to submit a pull 38 | request and I'll update ``corpus.txt`` as needed). 39 | 40 | The algorithm is language agnostic so it should be easy to create 41 | dictionaries for languages other than English. If you come up with a 42 | non-English dictionary, submit a pull request and we can extend the module 43 | to support language choice. 44 | 45 | Installation 46 | ============ 47 | 48 | The module is a simple Python module with no binary dependencies. 49 | The default dictionary is the file `corpus.txt` which lives inside 50 | the spelchek package. 51 | 52 | You can extend the built in dictionary in two ways. 53 | 54 | 1. You can add words to the corpus.txt file; it's a plain text file 55 | with words and frequency scores separated by a comma. 
High frequency 56 | scores make a word more likely to be suggested as a correction, where 57 | low frequencies are 'rarer' and so less likely to be suggested. This 58 | method is easiest if you are working with a source distribution from 59 | the GitHub repository. 60 | 2. You can add a custom dictionary of your own using the same ``word,score`` format 61 | and point to it by setting an environment variable called SPELCHEK. These 62 | entries will be added to the default dictionary at import time (note that 63 | they will replace the assigned priorities of existing words). This is a 64 | low-friction way to try adding non-English language support. 65 | 66 | -------------------------------------------------------------------------------- /spelchek/checker.py: -------------------------------------------------------------------------------- 1 | """ 2 | spelchek 3 | -------- 4 | 5 | A cheap-ass, pure-python spellchecker based on Peter Norvig's python bayes demo at http://norvig.com/spell-correct.html 6 | 7 | The interesting external methods are 8 | * known() filters a list of words and returns only those in the dictionary, 9 | * correct() returns the best guess for the supplied word 10 | * guesses() returns all guesses for the supplied word 11 | 12 | The dictionary is stored in corpus.txt. It's not very scientific or exact, I kludged it together from a variety of 13 | public domain sources. Values over 5 are from the [GSL word list](http://jbauman.com/aboutgsl.html), the rest are 14 | guesstimated from other word lists. It's not guaranteed to be error free! If you discover mistakes, feel free to 15 | submit a pull request. 16 | 17 | Still, it works as is. Do remember to double check that the result of 'correct' is 'known': the `correct()` will return 18 | the original word unchanged if it finds no candidates! 19 | 20 | Installation 21 | ============ 22 | the module is a single file python module with no binary dependencies.
You do, however, need to keep the `corpus.txt` 23 | file in the same location as `spelchek.py`. 24 | 25 | You can extend the built in dictionary in two ways. 26 | 27 | 1. You can add words to the corpus.txt file; its's a plain text file with words and frequency scores separated by a 28 | comma. High frequency scores make a word more likely to be suggested as a correction, where low frequencies are 29 | 'rarer' and so less likely to be suggested. 30 | 31 | 2. You can add a custom dictionary of your own using the same , format and point to it be setting an 32 | environment variable called SPELCHEK. 33 | 34 | """ 35 | __author__ = 'stevet' 36 | 37 | import os 38 | import pkgutil 39 | import sys 40 | import warnings 41 | 42 | _ALPHABET = 'abcdefghijklmnopqrstuvwxyz' 43 | 44 | # this is the bayes dictionary, which is auto-populated using the comma-delimited list in `corpus.txt' 45 | # this version is hardly scientific; the top 2000 words from the GSL list have good values, 46 | # everything else is cadged together from random word list sources with an arbitrary values of 4 for 47 | # 'ordinary' and 3 for 'plurals, adjectives, and participials' 48 | _DICTIONARY = {} 49 | 50 | 51 | def update_dictionary(corpus): 52 | """ 53 | given an iterable of strings in the format , add the words to the dictionary with the corresponding score. 
Typical usage: 54 | 55 | with open("custom_dict.txt", "rt") as new_dict: 56 | parse(new_dict) 57 | """ 58 | for line in corpus: 59 | name, val = line.split(",") 60 | val = int(val) 61 | _DICTIONARY[name] = val 62 | 63 | 64 | def first_order_variants(word): 65 | """ 66 | return the obvious spelling variants of with missing words, transpositions, or misplaced characters 67 | """ 68 | splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] 69 | deletes = [a + b[1:] for a, b in splits if b] 70 | transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1] 71 | replaces = [a + c + b[1:] for a, b in splits for c in _ALPHABET if b] 72 | inserts = [a + c + b for a, b in splits for c in _ALPHABET] 73 | return set(deletes + transposes + replaces + inserts) 74 | 75 | 76 | def second_order_variants(word): 77 | "return second-order candidates" 78 | return set(e2 for e1 in first_order_variants(word) for e2 in first_order_variants(e1) if e2 in _DICTIONARY) 79 | 80 | 81 | def known(*words): 82 | """ 83 | Return all the words in *words which are in the dictionary 84 | """ 85 | return set(w for w in words if w in _DICTIONARY) 86 | 87 | 88 | def correct(word): 89 | """ 90 | pick the 'best' candidate based on stored score of the possibilities. If nothing else is close 91 | returns the original word, so don't assume its always right! 92 | """ 93 | candidates = known(word) or known(*first_order_variants(word)) or second_order_variants(word) or [word] 94 | return max(candidates, key=_DICTIONARY.get) 95 | 96 | 97 | def guesses(word): 98 | """ 99 | return all of the first and second order guesses for this word 100 | """ 101 | result = list(known(*first_order_variants(word))) 102 | result.sort() 103 | return result 104 | 105 | 106 | def add(word, priority=4): 107 | """ 108 | Adds to the dictionary with the specified priority (default is 4). 109 | 110 | IMPORTANT NOTE: this is temporary! The addition is not saved to disk, so it won't persist between loads! 
111 | """ 112 | _DICTIONARY[word.lower().strip()] = priority 113 | 114 | 115 | # ----------------------------------------------------------------------------------- 116 | # import time initializations 117 | # 118 | # the dictionary is populated on module import with the context of corpus.txt in this package 119 | if sys.version_info.major >= 3: 120 | _corpus = (i.decode("utf-8") for i in pkgutil.get_data("spelchek", "corpus.txt").splitlines()) 121 | else: 122 | _corpus = (i for i in pkgutil.get_data("spelchek", "corpus.txt").splitlines()) 123 | 124 | update_dictionary(_corpus) 125 | del _corpus 126 | 127 | # if an environment variable with a corpus file is provided, 128 | # try to load that file too: 129 | 130 | if os.environ.get('spelchek'): 131 | abs = os.path.abspath(os.path.expandvars(os.environ['spelchek'])) 132 | if os.path.exists(abs): 133 | with open(abs, 'rt') as user_dictionary: 134 | update_dictionary(user_dictionary) 135 | else: 136 | warnings.warn("could not find local user dictionary '{}'".format(abs)) 137 | --------------------------------------------------------------------------------