├── MANIFEST.in
├── spelchek
    ├── __init__.py
    └── checker.py
├── .gitignore
├── LICENSE
├── setup.py
└── README.rst


/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include spelchek/corpus.txt
2 | 


--------------------------------------------------------------------------------
/spelchek/__init__.py:
--------------------------------------------------------------------------------
1 | from .checker import *
2 | __version__ = 0.54
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | 
 5 | # C extensions
 6 | *.so
 7 | 
 8 | # Distribution / packaging
 9 | .Python
10 | env/
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | *.egg-info/
23 | .installed.cfg
24 | *.egg
25 | 
26 | # PyInstaller
27 | #  Usually these files are written by a python script from a template
28 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
29 | *.manifest
30 | *.spec
31 | 
32 | # Installer logs
33 | pip-log.txt
34 | pip-delete-this-directory.txt
35 | 
36 | # Unit test / coverage reports
37 | htmlcov/
38 | .tox/
39 | .coverage
40 | .coverage.*
41 | .cache
42 | nosetests.xml
43 | coverage.xml
44 | *,cover
45 | 
46 | # Translations
47 | *.mo
48 | *.pot
49 | 
50 | # Django stuff:
51 | *.log
52 | 
53 | # Sphinx documentation
54 | docs/_build/
55 | 
56 | # PyBuilder
57 | target/
58 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2015 Steve Theodore
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | import os
 3 | import sys
 4 | 
 5 | _here = os.path.abspath(os.path.dirname(__file__))
 6 | 
 7 | if sys.version_info[0] < 3:
 8 |     with open(os.path.join(_here, 'README.rst')) as f:
 9 |         long_description = f.read()
10 | else:
11 |     with open(os.path.join(_here, 'README.rst'), encoding='utf-8') as f:
12 |         long_description = f.read()
13 | 
14 | 
15 | setup(name='spelchek',
16 |       version='0.54',
17 |       description='A pure-python Bayesian spellchecker',
18 |       long_description=long_description,
19 |       url='https://github.com/theodox/spelchek',
20 |       author='Steve Theodore',
21 |       author_email='steve@theodox.com',
22 |       license='MIT',
23 |       packages=['spelchek'],
24 |       include_package_data=True,
25 |       classifiers=[
26 |           'Development Status :: 5 - Production/Stable',
27 |           'Intended Audience :: Developers',
28 |           'Topic :: Text Processing',
29 |           'Programming Language :: Python :: 2.7',
30 |           'Programming Language :: Python :: 3',
31 |           'Programming Language :: Python :: Implementation'
32 |       ],
33 |       install_requires=[])
34 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | spelchek
 2 | --------
 3 | 
 4 | A cheap-ass, pure-python spellchecker based on `Peter Norvig's Python
 5 | Bayes demo <http://norvig.com/spell-correct.html>`__. All the interesting
 6 | work is his.
 7 | 
 8 | The interesting external methods are
 9 | 
10 | -  ``known()`` filters a list of words and returns only those in the
11 |    dictionary,
12 | -  ``correct()`` returns the best guess for the supplied word
13 | -  ``guesses()`` returns all guesses for the supplied word
14 | -  ``add()`` adds a word to the dictionary, with an optional priority
15 |    value
16 | 
17 | So simple uses would be something like
18 | 
19 | ::
20 | 
21 |     import spelchek
22 |     print(spelchek.correct('eaxmple'))
23 |     # 'example'
24 | 
25 | The current corpus of words includes about 75,000 entries. It does not
26 | include punctuation such as hyphens, apostrophes or spaces. The module also
27 | supports optional user-supplied dictionaries; see the documentation of
28 | ``spelchek/checker.py`` for details.
29 | 
30 | Important Caveat
31 | =================
32 | 
33 | The heart of a spell checker is the dictionary, and the dictionary here
34 | is cadged together out of a bunch of free online sources. No real effort
35 | has been made to check it for accuracy, and although errors are trivial to
36 | correct, with several tens of thousands of words involved they are
37 | pretty much inevitable (if you find one, feel free to submit a pull
38 | request and I'll update ``corpus.txt`` as needed).
39 | 
40 | The algorithm is language agnostic so it should be easy to create 
41 | dictionaries for languages other than English.  If you come up with a 
42 | non-English dictionary submit a pull request and we can extend the module
43 | to support language choice.
44 | 
45 | Installation
46 | ============
47 | 
48 | The module is a simple Python module with no binary dependencies.
49 | The default dictionary is the file ``corpus.txt``, which lives inside
50 | the spelchek package.
51 | 
52 | You can extend the built in dictionary in two ways.
53 | 
54 | 1. You can add words to the corpus.txt file; it's a plain text file
55 |    with words and frequency scores separated by a comma. High frequency
56 |    scores make a word more likely to be suggested as a correction, whereas
57 |    low frequencies are 'rarer' and so less likely to be suggested.  This
58 |    method is easiest if you are working with a source distribution from
59 |    the GitHub repository.
60 | 2. You can add a custom dictionary of your own using the same ``<word>,<score>`` format
61 |    and point to it by setting an environment variable called SPELCHEK. These
62 |    entries will be added to the default dictionary at import time (note that
63 |    they will replace the assigned priorities of existing words).  This is a
64 |    low-friction way to try adding non-English language support.
65 | 
66 | 


--------------------------------------------------------------------------------
/spelchek/checker.py:
--------------------------------------------------------------------------------
  1 | """
  2 | spelchek
  3 | --------
  4 | 
  5 | A cheap-ass, pure-python spellchecker based on Peter Norvig's python bayes demo at http://norvig.com/spell-correct.html
  6 | 
  7 | The interesting external methods are
  8 |     * known() filters a list of words and returns only those in the dictionary,
  9 |     * correct() returns the best guess for the supplied word
 10 |     * guesses() returns all guesses for the supplied word
 11 | 
 12 | The dictionary is stored in corpus.txt. It's not very scientific or exact, I kludged it together from a variety of
 13 | public domain sources. Values over 5 are from the [GSL word list](http://jbauman.com/aboutgsl.html), the rest are
 14 | guesstimated from other word lists.  It's not guaranteed to be error free! If you discover mistakes, feel free to
 15 | submit a pull request.
 16 | 
 17 | Still, it works as is. Do remember to double check that the result of 'correct' is 'known': `correct()` will return
 18 | the original word unchanged if it finds no candidates!
 19 | 
 20 | Installation
 21 | ============
 22 | The module is pure Python with no binary dependencies. You do, however, need to keep the `corpus.txt`
 23 | file inside the `spelchek` package, alongside `checker.py`.
 24 | 
 25 | You can extend the built in dictionary in two ways.
 26 | 
 27 | 1. You can add words to the corpus.txt file; it's a plain text file with words and frequency scores separated by a
 28 |    comma.  High frequency scores make a word more likely to be suggested as a correction, where low frequencies are
 29 |    'rarer' and so less likely to be suggested.
 30 | 
 31 | 2. You can add a custom dictionary of your own using the same <word>,<score> format and point to it by setting an
 32 |    environment variable called SPELCHEK.
 33 | 
 34 | """
 35 | __author__ = 'stevet'
 36 | 
 37 | import os
 38 | import pkgutil
 39 | import sys
 40 | import warnings
 41 | 
 42 | _ALPHABET = 'abcdefghijklmnopqrstuvwxyz'  # letters tried when generating replacements and insertions
 43 | 
 44 | # This is the Bayes dictionary, mapping each word to its score; it is auto-populated from the comma-delimited
 45 | # list in `corpus.txt`.  This version is hardly scientific; the top 2000 words from the GSL list have good values,
 46 | # everything else is cadged together from random word list sources with arbitrary values of 4 for
 47 | # 'ordinary' and 3 for 'plurals, adjectives, and participles'.
 48 | _DICTIONARY = {}
 49 | 
 50 | 
 51 | def update_dictionary(corpus):
 52 |     """
 53 |     given an iterable of strings in the format <word>,<score> add the words to the dictionary with the corresponding score.  Typical usage:
 54 | 
 55 |          with open("custom_dict.txt", "rt") as new_dict:
 56 |             parse(new_dict)
 57 |     """
 58 |     for line in corpus:
 59 |         name, val = line.split(",")
 60 |         val = int(val)
 61 |         _DICTIONARY[name] = val
 62 | 
 63 | 
 64 | def first_order_variants(word):
 65 |     """
 66 |     return the obvious spelling variants of <word> with missing words, transpositions, or misplaced characters
 67 |     """
 68 |     splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
 69 |     deletes = [a + b[1:] for a, b in splits if b]
 70 |     transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
 71 |     replaces = [a + c + b[1:] for a, b in splits for c in _ALPHABET if b]
 72 |     inserts = [a + c + b for a, b in splits for c in _ALPHABET]
 73 |     return set(deletes + transposes + replaces + inserts)
 74 | 
 75 | 
 76 | def second_order_variants(word):
 77 |     "return second-order candidates"
 78 |     return set(e2 for e1 in first_order_variants(word) for e2 in first_order_variants(e1) if e2 in _DICTIONARY)
 79 | 
 80 | 
 81 | def known(*words):
 82 |     """
 83 |     Return all the words in *words which are in the dictionary
 84 |     """
 85 |     return set(w for w in words if w in _DICTIONARY)
 86 | 
 87 | 
 88 | def correct(word):
 89 |     """
 90 |     pick the 'best' candidate based on stored score of the possibilities.  If nothing else is close
 91 |     returns the original word, so don't assume its always right!
 92 |     """
 93 |     candidates = known(word) or known(*first_order_variants(word)) or second_order_variants(word) or [word]
 94 |     return max(candidates, key=_DICTIONARY.get)
 95 | 
 96 | 
 97 | def guesses(word):
 98 |     """
 99 |     return all of the first and second order guesses for this word
100 |     """
101 |     result = list(known(*first_order_variants(word)))
102 |     result.sort()
103 |     return result
104 | 
105 | 
106 | def add(word, priority=4):
107 |     """
108 |     Adds <word> to the dictionary with the specified priority (default is 4).
109 | 
110 |     IMPORTANT NOTE: this is temporary! The addition is not saved to disk, so it won't persist between loads!
111 |     """
112 |     _DICTIONARY[word.lower().strip()] = priority
113 | 
114 | 
115 | # -----------------------------------------------------------------------------------
116 | # import time initializations
117 | #
118 | # the dictionary is populated on module import with the context of corpus.txt in this package
119 | if sys.version_info.major >= 3:
120 |     _corpus = (i.decode("utf-8") for i in pkgutil.get_data("spelchek", "corpus.txt").splitlines())
121 | else:
122 |     _corpus = (i for i in pkgutil.get_data("spelchek", "corpus.txt").splitlines())
123 | 
124 | update_dictionary(_corpus)
125 | del _corpus
126 | 
127 | # if an environment variable with a corpus file is provided,
128 | # try to load that file too:
129 | 
130 | if os.environ.get('spelchek'):
131 |     abs = os.path.abspath(os.path.expandvars(os.environ['spelchek']))
132 |     if os.path.exists(abs):
133 |         with open(abs, 'rt') as user_dictionary:
134 |             update_dictionary(user_dictionary)
135 |     else:
136 |         warnings.warn("could not find local user dictionary '{}'".format(abs))
137 | 


--------------------------------------------------------------------------------