├── .coveragerc
├── .git-blame-ignore-revs
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── dependabot.yml
│   └── workflows
│       ├── publish.yml
│       ├── tests.yml
│       └── weekly.yml
├── .gitignore
├── .readthedocs.yaml
├── CHANGELOG.md
├── INSTALL.rst
├── LICENSE
├── README.md
├── docs
│   ├── Makefile
│   ├── _templates
│   │   └── layout.html
│   ├── api
│   │   ├── abstract_distance_comparer.rst
│   │   ├── editdistance.rst
│   │   ├── helpers.rst
│   │   ├── index.rst
│   │   └── symspellpy.rst
│   ├── conf.py
│   ├── examples
│   │   ├── custom_distance_comparer.rst
│   │   ├── dictionary.rst
│   │   ├── index.rst
│   │   ├── lookup.rst
│   │   ├── lookup_compound.rst
│   │   └── word_segmentation.rst
│   ├── index.rst
│   ├── make.bat
│   ├── requirements.txt
│   └── users
│       └── installing.rst
├── pyproject.toml
├── requirements.txt
├── symspellpy
│   ├── __init__.py
│   ├── abstract_distance_comparer.py
│   ├── composition.py
│   ├── editdistance.py
│   ├── frequency_bigramdictionary_en_243_342.txt
│   ├── frequency_dictionary_en_82_765.txt
│   ├── helpers.py
│   ├── logging.py
│   ├── pickle_mixin.py
│   ├── suggest_item.py
│   ├── symspellpy.py
│   └── verbosity.py
└── tests
    ├── __init__.py
    ├── benchmarks.ipynb
    ├── conftest.py
    ├── fortests
    │   ├── bad_dict.txt
    │   ├── below_threshold_dict.txt
    │   ├── big_modified.txt
    │   ├── big_words.txt
    │   ├── lookup_compound_data.json
    │   ├── lookup_compound_ignore_non_words_data.json
    │   ├── lookup_compound_replaced_words_data.json
    │   ├── lookup_compound_transfer_casing_data.json
    │   ├── lookup_compound_transfer_casing_ignore_nonwords_data.json
    │   ├── noisy_query_en_1000.txt
    │   ├── non_en_dict.txt
    │   ├── separator_dict.txt
    │   └── word_segmentation_data.json
    ├── test_compatibility.py
    ├── test_editdistance.py
    ├── test_helpers.py
    ├── test_suggest_item.py
    ├── test_symspellpy.py
    ├── test_symspellpy_edge_cases.py
    ├── test_symspellpy_lookup.py
    ├── test_symspellpy_lookup_compound.py
    ├── test_symspellpy_pickle.py
    └── test_symspellpy_word_segmentation.py
/.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = true 3 | source = symspellpy 4 | 5 | [report] 6 | exclude_lines = 7 | pragma: no cover 8 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # format all 2 | b0abc5ed3a37b05848ca1e2de790321d7c07fd75 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.py eol=lf 2 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | github: mammothb 3 | ko_fi: mammothb 4 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "github-actions" 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | day: "friday" 13 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to TestPyPI and PyPI 2 | 3 | on: 4 | workflow_dispatch: 5 | release: 6 | types: [published] 7 | 8 | jobs: 9 | publish-test-pypi: 10 | name: Build and publish to TestPyPI 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Set up Python 3.10 16 | uses: actions/setup-python@v5.6.0 17 | with: 18 | python-version: "3.10" 19 | 20 | - name: Build 21 | run: | 22 | echo "Building ..." 23 | python -m pip install --upgrade pip 24 | python -m pip install build 25 | python -m build 26 | 27 | - name: Publish to TestPyPI 28 | uses: pypa/gh-action-pypi-publish@v1.12.4 29 | with: 30 | user: __token__ 31 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 32 | repository-url: https://test.pypi.org/legacy/ 33 | 34 | - name: Publish to PyPI 35 | if: github.event_name == 'release' && startsWith(github.ref, 'refs/tags/v') 36 | uses: pypa/gh-action-pypi-publish@v1.12.4 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | name: "Python ${{ matrix.python-version }} on ${{ matrix.os }}" 8 | runs-on: ${{ matrix.os }} 9 | environment: Development 10 | 11 | strategy: 12 | matrix: 13 | os: [ubuntu-latest] 14 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v5.6.0 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install -r requirements.txt 28 | 29 | - name: Run pytest 30 | run: python -m pytest --cov-report=xml --cov=symspellpy 31 | 32 | - name: Upload code coverage 33 | uses: codecov/codecov-action@v5 34 | with: 35 | token: ${{ secrets.CODECOV_TOKEN }} 36 | -------------------------------------------------------------------------------- /.github/workflows/weekly.yml: -------------------------------------------------------------------------------- 1 | name: Weekly Tests 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | # Runs every friday 7 | - cron: "0 0 * * 5" 8 | 9 | jobs: 10 | test: 11 | name: "Python ${{ matrix.python-version }} on ${{ matrix.os }}" 12 | runs-on: ${{ matrix.os }} 13 | 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | os: [ubuntu-latest, macos-latest, windows-latest] 18 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5.6.0 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | pip install 
-r requirements.txt -v 32 | 33 | - name: Run pytest 34 | run: python -m pytest 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | .vscode/ 106 | 107 | #pycharm files 108 | .idea/ -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-24.04 11 | tools: 12 | python: "3.13" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements.txt 22 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | CHANGELOG
2 | ============== 3 | 4 | ## 6.9.0 (2025-03-09) 5 | 6 | - Specify that frequency count must be 64-bit int [#180](https://github.com/mammothb/symspellpy/pull/180) 7 | - Rename `string1` and `string2` argument names [#181](https://github.com/mammothb/symspellpy/pull/181) 8 | 9 | ## 6.8.0 (2025-03-09) 10 | - Allow file object as corpus of load_dictionary [#176](https://github.com/mammothb/symspellpy/pull/176) 11 | - Bump supported Python version to 3.9 - 3.13 [#177](https://github.com/mammothb/symspellpy/pull/177) 12 | 13 | ## 6.7.8 (2024-08-31) 14 | - Handle encoding errors [#149](https://github.com/mammothb/symspellpy/pull/149) 15 | - Bump supported Python version to 3.8 - 3.12 [#151](https://github.com/mammothb/symspellpy/pull/151) 16 | - Remove numpy dependency [#156](https://github.com/mammothb/symspellpy/pull/156) 17 | - Feature: distance comparer interface [#159](https://github.com/mammothb/symspellpy/pull/159) 18 | 19 | ## 6.7.7 (2022-10-24) 20 | - Remove support for Python 3.6 21 | - Use compiled regex expression in `create_dictionary()` ([#129](https://github.com/mammothb/symspellpy/pull/129)) 22 | - Configure module logger instead of modifying root logger ([#132](https://github.com/mammothb/symspellpy/pull/132), [#133](https://github.com/mammothb/symspellpy/pull/133)) 23 | 24 | ## 6.7.6 (2021-12-19) 25 | - Fix suggestion `count` in `lookup_compound` when `ignore_words=True` ([#108](https://github.com/mammothb/symspellpy/pull/108)) 26 | - Log error message when loading dictionary fails ([#109](https://github.com/mammothb/symspellpy/pull/109)) 27 | 28 | ## 6.7.5 (2021-12-02) 29 | - Fix `replaced_words` not being updated when best match is a combi (closes [#103](https://github.com/mammothb/symspellpy/issues/103)) 30 | - Implement a way to change the edit distance comparer algorithm via `distance_algorithm` property. Available values are found in [`DistanceAlgorithm`](https://symspellpy.readthedocs.io/en/latest/api/editdistance.html#symspellpy.editdistance.DistanceAlgorithm) 31 | 32 | ## 6.7.4 (2021-11-29) 33 | - Update `editdistpy` dependency version 34 | - Update `LevenshteinFast` and `DamerauOsaFast` to match the functionality of the `editdistpy` library 35 | 36 | ## 6.7.3 (2021-11-27) 37 | - Update `editdistpy` dependency version 38 | 39 | ## 6.7.2 (2021-11-25) 40 | - Fix typo of Dameruau to Damerau in various places. Can potentially break some setups that explicitly set `_distance_algorithm` 41 | - Implement fast distance comparers with [editdistpy](https://github.com/mammothb/editdistpy) 42 | - Set `DamerauOsaFast` as the default distance comparer 43 | 44 | ## 6.7.1 (2021-11-21) 45 | - Updated `frequency_dictionary_en_82_765.txt` dictionary with common contractions 46 | - Added `_below_threshold_words`, `_bigrams`, `_count_threshold`, `_max_dictionary_edit_distance`, and `_prefix_length` when saving to pickle. (closes [#93](https://github.com/mammothb/symspellpy/issues/93)) 47 | - Implemented `to_bytes` and `from_bytes` options to save and load pickle with bytes string 48 | - Updated data_version to 3 49 | - Removed Python 3.4 and Python 3.5 support 50 | 51 | ## 6.7.0 (2020-08-28) 52 | - Removed numpy dependency 53 | - `word_segmentation` now retains/preserves case. 54 | - `word_segmentation` now keeps punctuation or apostrophe adjacent to previous 55 | word. 56 | - `word_segmentation` now normalizes ligatures: "scientiﬁc" -> "scientific". 57 | - `word_segmentation` now removes hyphens prior to word segmentation 58 | (untested). 
59 | - American English word forms added to dictionary in addition to British 60 | English e.g. favourable & favorable. 61 | 62 | ## 6.5.2 (2019-10-23) 63 | - Modified `load_bigram_dictionary` to allow dictionary entries to be split 64 | into only 2 parts when using a custom separator 65 | - Added dictionary files to wheels so `pkg_resources` could be used to access 66 | them 67 | 68 | ## 6.5.1 (2019-10-08) 69 | - Added `separator` argument to allow user to choose custom separator for `load_dictionary` 70 | 71 | ## 6.5.0 (2019-09-21) 72 | - Added `load_bigram_dictionary` and bigram dictionary `frequency_bigramdictionary_en_243_342.txt` 73 | - Updated `lookup_compound` algorithm 74 | - Added `Levenshtein` to compute edit distance 75 | - Added `save_pickle_stream` and `load_pickle_stream` to save/load SymSpell data alongside other structures (contribution by [marcoffee](https://github.com/marcoffee)) 76 | 77 | ## 6.3.9 (2019-08-06) 78 | - Added `transfer_casing` to `lookup` and `lookup_compound` 79 | - Fixed prefix length check in `_edits_prefix` 80 | 81 | ## 6.3.8 (2019-03-21) 82 | - Implemented `delete_dictionary_entry` 83 | - Improved performance by using python builtin hashing 84 | - Added versioning of the pickle 85 | 86 | ## 6.3.7 (2019-02-18) 87 | - Fixed `include_unknown` in `lookup` 88 | - Removed unused `initial_capacity` argument 89 | - Improved `_get_str_hash` performance 90 | - Implemented `save_pickle` and `load_pickle` to avoid having to create the 91 | dictionary every time 92 | 93 | ## 6.3.6 (2019-02-11) 94 | - Added `create_dictionary()` feature 95 | 96 | ## 6.3.5 (2019-01-14) 97 | - Fixed `lookup_compound()` to return the correct `distance` 98 | 99 | ## 6.3.4 (2019-01-04) 100 | - Added `replaced_words` to track number of misspelled words 101 | - Added `ignore_token` to `word_segmentation()` to ignore words matching a regular expression 102 | 103 | ## 6.3.3 (2018-12-05) 104 | - Added `word_segmentation()` feature 105 | 106 | ## 6.3.2 (2018-10-23) 107 | - Added `encoding` option to `load_dictionary()` 108 | 109 | ## 6.3.1 (2018-08-30) 110 | - Create a package for `symspellpy` 111 | 112 | ## 6.3.0 (2018-08-13) 113 | - Ported [SymSpell](https://github.com/wolfgarbe/SymSpell) v6.3 114 | -------------------------------------------------------------------------------- /INSTALL.rst: -------------------------------------------------------------------------------- 1 | ********** 2 | Installing 3 | ********** 4 | 5 | Installing an official release 6 | ============================== 7 | 8 | symspellpy and its dependencies are available as wheel packages for macOS, 9 | Windows and Linux distributions:: 10 | 11 | python -m pip install -U symspellpy 12 | 13 | **NOTE**: symspellpy has only been tested on Windows and Linux systems and is 
14 | assumed to work on macOS. 
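As a quick smoke test after installing (a minimal sketch that only checks the
package imports and constructs with default settings)::

    python -c "from symspellpy import SymSpell; SymSpell()"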
15 | 16 | Dictionary data 17 | =============== 18 | 19 | The dictionary files that are shipped with symspellpy can be accessed using 20 | `importlib.resources`:: 21 | 22 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 23 | bigram_path = importlib.resources.files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt" 24 | 25 | Alternatively, you can download the dictionary files from the repository and 26 | add them to your project directory:: 27 | 28 | curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_dictionary_en_82_765.txt 29 | curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_bigramdictionary_en_243_342.txt 30 | 31 | You could end up with a project directory layout like:: 32 | 33 | project_dir 34 | +-frequency_bigramdictionary_en_243_342.txt 35 | +-frequency_dictionary_en_82_765.txt 36 | \-project.py 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 mmb L (Python port https://github.com/mammothb/symspellpy) 4 | Copyright (c) 2021 Wolf Garbe (Original C# implementation https://github.com/wolfgarbe/SymSpell) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | symspellpy
2 | [![PyPI version](https://badge.fury.io/py/symspellpy.svg)](https://badge.fury.io/py/symspellpy) 3 | [![Tests](https://github.com/mammothb/symspellpy/actions/workflows/tests.yml/badge.svg)](https://github.com/mammothb/symspellpy/actions/workflows/tests.yml) 4 | [![Documentation Status](https://readthedocs.org/projects/symspellpy/badge/?version=latest)](https://symspellpy.readthedocs.io/en/latest/?badge=latest) 5 | [![codecov](https://codecov.io/gh/mammothb/symspellpy/branch/master/graph/badge.svg)](https://codecov.io/gh/mammothb/symspellpy) 6 | ======== 7 | 8 | symspellpy is a Python port of [SymSpell](https://github.com/wolfgarbe/SymSpell) v6.7.2, which provides much higher speed and lower memory consumption. Unit tests 9 | from the original project are implemented to ensure the accuracy of the port. 10 | 11 | Please note that the port has not been optimized for speed. 12 | 13 | Notable Changes 14 | =============== 15 | v6.7.2: Implemented fast distance comparer with [editdistpy](https://github.com/mammothb/editdistpy). Approximately 2x speed up for usage under default settings, benchmarks found [here](https://github.com/mammothb/symspellpy/blob/master/tests/benchmarks.ipynb). 16 | 17 | Install 18 | ======= 19 | For installation instructions, see the `INSTALL.rst` file or the [install](https://symspellpy.readthedocs.io/en/latest/users/installing.html) documentation. 20 | 21 | Usage 22 | ===== 23 | Check out the [examples](https://symspellpy.readthedocs.io/en/latest/examples/index.html) provided for sample usage. 24 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | 3 | {%- block rootrellink %} 4 |
<li><a href="{{ pathto('index') }}">Home</a></li> 5 | <li><a href="{{ pathto('examples/index') }}">Examples</a></li> 6 | <li><a href="{{ pathto('api/index') }}">API</a></li> 7 | {%- endblock %} 8 | -------------------------------------------------------------------------------- /docs/api/abstract_distance_comparer.rst: -------------------------------------------------------------------------------- 1 | ************************** 2 | abstract_distance_comparer 3 | ************************** 4 | 5 | Distance comparer interface 6 | =========================== 7 | 8 | .. autoclass:: symspellpy.abstract_distance_comparer.AbstractDistanceComparer 9 | :members: 10 | -------------------------------------------------------------------------------- /docs/api/editdistance.rst: -------------------------------------------------------------------------------- 1 | ************ 2 | editdistance 3 | ************ 4 | 5 | Enum class 6 | ========== 7 | 8 | .. autoclass:: symspellpy.editdistance.DistanceAlgorithm 9 | :members: 10 | :member-order: bysource 11 | 12 | EditDistance class 13 | ================== 14 | 15 | .. autoclass:: symspellpy.editdistance.EditDistance 16 | :members: 17 | 18 | Distance comparer classes 19 | ========================= 20 | 21 | .. autoclass:: symspellpy.editdistance.DamerauOsa 22 | :members: 23 | 24 | .. autoclass:: symspellpy.editdistance.Levenshtein 25 | :members: 26 | 27 | .. autoclass:: symspellpy.editdistance.DamerauOsaFast 28 | :members: 29 | 30 | .. autoclass:: symspellpy.editdistance.LevenshteinFast 31 | :members: 32 | -------------------------------------------------------------------------------- /docs/api/helpers.rst: -------------------------------------------------------------------------------- 1 | ******* 2 | helpers 3 | ******* 4 | 5 | Helpers for `editdistance` 6 | ========================== 7 | 8 | .. autofunction:: symspellpy.helpers.null_distance_results 9 | 10 | .. autofunction:: symspellpy.helpers.prefix_suffix_prep 11 | 12 | Helpers for `symspellpy` 13 | ======================== 14 | 15 | .. autoclass:: symspellpy.helpers.DictIO 16 | 17 | .. autofunction:: symspellpy.helpers.case_transfer_matching 18 | 19 | .. autofunction:: symspellpy.helpers.case_transfer_similar 20 | 21 | .. autofunction:: symspellpy.helpers.increment_count 22 | 23 | .. autofunction:: symspellpy.helpers.is_acronym 24 | 25 | .. autofunction:: symspellpy.helpers.parse_words 26 | 27 | .. autofunction:: symspellpy.helpers.try_parse_int64 28 | 29 | Misc 30 | ==== 31 | 32 | .. autofunction:: symspellpy.helpers.to_similarity 33 | 34 | -------------------------------------------------------------------------------- /docs/api/index.rst: -------------------------------------------------------------------------------- 1 | ************ 2 | API Overview 3 | ************ 4 | 5 | Modules 6 | ======= 7 | 8 | .. only:: html 9 | 10 | .. toctree:: 11 | :maxdepth: 2 12 | 13 | helpers.rst 14 | abstract_distance_comparer.rst 15 | editdistance.rst 16 | symspellpy.rst 17 | -------------------------------------------------------------------------------- /docs/api/symspellpy.rst: -------------------------------------------------------------------------------- 1 | ********** 2 | symspellpy 3 | ********** 4 | 5 | Enum class 6 | ========== 7 | 8 | .. autoclass:: symspellpy.verbosity.Verbosity 9 | :members: 10 | :member-order: bysource 11 | 12 | Data class 13 | ========== 14 | 15 | .. autoclass:: symspellpy.suggest_item.SuggestItem 16 | :members: 17 | :special-members: __eq__, __lt__, __str__ 18 | 19 | .. 
autoclass:: symspellpy.composition.Composition 20 | :members: 21 | :exclude-members: corrected_string, distance_sum, log_prob_sum, segmented_string 22 | 23 | Utility class 24 | ============= 25 | 26 | .. autoclass:: symspellpy.pickle_mixin.PickleMixin 27 | :members: 28 | :private-members: 29 | 30 | SymSpell 31 | ======== 32 | 33 | .. autoclass:: symspellpy.symspellpy.SymSpell 34 | :members: 35 | :private-members: 36 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | 15 | import os.path 16 | import sys 17 | 18 | sys.path.insert(0, os.path.abspath("..")) 19 | 20 | from pathlib import Path 21 | 22 | import tomllib 23 | 24 | # -- Project information ----------------------------------------------------- 25 | 26 | project = "symspellpy" 27 | copyright = "2025, mmb L, Wolf Garbe" 28 | author = "mmb L, Wolf Garbe" 29 | 30 | # The short X.Y version 31 | version = "" 32 | # The full version, including alpha/beta/rc tags 33 | with open(Path(__file__).parents[1] / "pyproject.toml", "rb") as infile: 34 | data = tomllib.load(infile) 35 | release = data["project"]["version"] 36 | 37 | 38 | # -- General configuration --------------------------------------------------- 39 | 40 | # If your documentation needs a minimal Sphinx version, state it here. 41 | # 42 | # needs_sphinx = '1.0' 43 | 44 | # Add any Sphinx extension module names here, as strings. They can be 45 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 46 | # ones. 47 | extensions = [ 48 | "sphinx.ext.autodoc", 49 | "sphinx.ext.napoleon", 50 | "sphinx.ext.viewcode", 51 | "sphinx_autodoc_typehints", 52 | ] 53 | # numpydoc_class_members_toctree = False 54 | # numpydoc_show_inherited_class_members = False 55 | highlight_language = "none" 56 | 57 | # Add any paths that contain templates here, relative to this directory. 58 | templates_path = ["_templates"] 59 | 60 | # The suffix(es) of source filenames. 61 | # You can specify multiple suffix as a list of string: 62 | # 63 | # source_suffix = ['.rst', '.md'] 64 | source_suffix = ".rst" 65 | 66 | # The master toctree document. 67 | master_doc = "index" 68 | 69 | # The language for content autogenerated by Sphinx. Refer to documentation 70 | # for a list of supported languages. 71 | # 72 | # This is also used if you do content translation via gettext catalogs. 73 | # Usually you set "language" from the command line for these cases. 74 | language = "en" 75 | 76 | # List of patterns, relative to source directory, that match files and 77 | # directories to ignore when looking for source files. 78 | # This pattern also affects html_static_path and html_extra_path. 79 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 80 | 81 | # The name of the Pygments (syntax highlighting) style to use. 
82 | pygments_style = None 83 | 84 | 85 | # -- Options for HTML output ------------------------------------------------- 86 | 87 | # The theme to use for HTML and HTML Help pages. See the documentation for 88 | # a list of builtin themes. 89 | # 90 | html_theme = "sphinxdoc" 91 | 92 | # Theme options are theme-specific and customize the look and feel of a theme 93 | # further. For a list of options available for each theme, see the 94 | # documentation. 95 | # 96 | # html_theme_options = {} 97 | 98 | # Add any paths that contain custom static files (such as style sheets) here, 99 | # relative to this directory. They are copied after the builtin static files, 100 | # so a file named "default.css" will overwrite the builtin "default.css". 101 | # html_static_path = ["_static"] 102 | html_static_path = [] 103 | 104 | # Custom sidebar templates, must be a dictionary that maps document names 105 | # to template names. 106 | # 107 | # The default sidebars (for documents that don't match any pattern) are 108 | # defined by theme itself. Builtin themes are using these templates by 109 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 110 | # 'searchbox.html']``. 111 | # 112 | html_sidebars = {"**": ["globaltoc.html", "searchbox.html"]} 113 | 114 | 115 | # -- Options for HTMLHelp output --------------------------------------------- 116 | 117 | # Output file base name for HTML help builder. 118 | htmlhelp_basename = "symspellpydoc" 119 | -------------------------------------------------------------------------------- /docs/examples/custom_distance_comparer.rst: -------------------------------------------------------------------------------- 1 | ************************ 2 | Custom distance comparer 3 | ************************ 4 | 5 | Basic usage 6 | =========== 7 | 8 | Create a comparer class which satisfies the interface specified by 9 | :class:`~symspellpy.abstract_distance_comparer.AbstractDistanceComparer`: 10 | 11 | .. code-block:: python 12 | 13 | import importlib.resources 14 | from itertools import islice 15 | 16 | from symspellpy import SymSpell 17 | from symspellpy.abstract_distance_comparer import AbstractDistanceComparer 18 | from symspellpy.editdistance import DistanceAlgorithm, EditDistance 19 | 20 | class CustomComparer(AbstractDistanceComparer): 21 | def distance(self, string_1, string_2, max_distance): 22 | distance = abs(len(string_1) - len(string_2))  # illustrative only: length difference as the distance 23 | return -1 if distance > max_distance else distance 24 | 25 | custom_comparer = EditDistance(DistanceAlgorithm.USER_PROVIDED, CustomComparer()) 26 | sym_spell = SymSpell(distance_comparer=custom_comparer) 27 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt" 28 | sym_spell.load_bigram_dictionary(dictionary_path, 0, 2) 29 | 30 | # Print out first 5 elements to demonstrate that dictionary is 31 | # successfully loaded 32 | print(list(islice(sym_spell.bigrams.items(), 5))) 33 | -------------------------------------------------------------------------------- /docs/examples/dictionary.rst: -------------------------------------------------------------------------------- 1 | ********** 2 | Dictionary 3 | ********** 4 | 5 | Load frequency dictionary 6 | ========================= 7 | 8 | `load_dictionary` 9 | ----------------- 10 | 11 | Given a dictionary file like:: 12 | 13 | <term> <count> 14 | <term> <count> 15 | ... 16 | <term> <count> 17 | 18 | We can use :meth:`~symspellpy.symspellpy.SymSpell.load_dictionary`: 19 | 20 | .. 
code-block:: python 21 | :emphasize-lines: 8 22 | 23 | import importlib.resources 24 | from itertools import islice 25 | 26 | from symspellpy import SymSpell 27 | 28 | sym_spell = SymSpell() 29 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 30 | sym_spell.load_dictionary(dictionary_path, 0, 1) 31 | 32 | # Print out first 5 elements to demonstrate that dictionary is 33 | # successfully loaded 34 | print(list(islice(sym_spell.words.items(), 5))) 35 | 36 | Output:: 37 | 38 | [('the', 23135851162), ('of', 13151942776), ('and', 12997637966), ('to', 12136980858), ('a', 9081174698)] 39 | 40 | `load_bigram_dictionary` 41 | ------------------------ 42 | 43 | Given a bigram dictionary file like:: 44 | 45 | <term_part_1> <term_part_2> <count> 46 | <term_part_1> <term_part_2> <count> 47 | ... 48 | <term_part_1> <term_part_2> <count> 49 | 50 | We can use :meth:`~symspellpy.symspellpy.SymSpell.load_bigram_dictionary`: 51 | 52 | .. code-block:: python 53 | :emphasize-lines: 8 54 | 55 | import importlib.resources 56 | from itertools import islice 57 | 58 | from symspellpy import SymSpell 59 | 60 | sym_spell = SymSpell() 61 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt" 62 | sym_spell.load_bigram_dictionary(dictionary_path, 0, 2) 63 | 64 | # Print out first 5 elements to demonstrate that dictionary is 65 | # successfully loaded 66 | print(list(islice(sym_spell.bigrams.items(), 5))) 67 | 68 | Output:: 69 | 70 | [('abcs of', 10956800), ('aaron and', 10721728), ('abbott and', 7861376), ('abbreviations and', 13518272), ('aberdeen and', 7347776)] 71 | 72 | Load frequency dictionary with custom separator 73 | =============================================== 74 | 75 | `load_dictionary` 76 | ----------------- 77 | 78 | It is also possible to specify a custom `separator` so that dictionaries can 79 | contain space separated terms. For example, given a dictionary file like:: 80 | 81 | the$23135851162 82 | abcs of$10956800 83 | of$13151942776 84 | aaron and$10721728 85 | abbott and$7861376 86 | abbreviations and$13518272 87 | aberdeen and$7347776 88 | 89 | We can specify "$" as the custom `separator` in 90 | :meth:`~symspellpy.symspellpy.SymSpell.load_dictionary` like: 91 | 92 | .. code-block:: python 93 | :emphasize-lines: 7 94 | 95 | from itertools import islice 96 | 97 | from symspellpy import SymSpell 98 | 99 | sym_spell = SymSpell() 100 | dictionary_path = <path/to/dictionary> 101 | sym_spell.load_dictionary(dictionary_path, 0, 1, separator="$") 102 | 103 | # Print out first 5 elements to demonstrate that dictionary is 104 | # successfully loaded 105 | print(list(islice(sym_spell.words.items(), 5))) 106 | 107 | Output:: 108 | 109 | [('the', 23135851162), ('abcs of', 10956800), ('of', 13151942776), ('aaron and', 10721728), ('abbott and', 7861376)] 110 | 111 | Note that space separated terms such as "abcs of", "aaron and", and 112 | "abbott and" can now be found in `words` instead of `bigrams`. 113 | 114 | `load_bigram_dictionary` 115 | ------------------------ 116 | 117 | We can also specify "$" as the custom `separator` in 118 | :meth:`~symspellpy.symspellpy.SymSpell.load_bigram_dictionary` like 119 | (note that we changed `count_index` from 2 to 1): 120 | 121 | .. 
code-block:: python 122 | :emphasize-lines: 7 123 | 124 | from itertools import islice 125 | 126 | from symspellpy import SymSpell 127 | 128 | sym_spell = SymSpell() 129 | dictionary_path = <path/to/dictionary> 130 | sym_spell.load_bigram_dictionary(dictionary_path, 0, 1, separator="$") 131 | 132 | # Print out first 5 elements to demonstrate that dictionary is 133 | # successfully loaded 134 | print(list(islice(sym_spell.bigrams.items(), 5))) 135 | 136 | Output:: 137 | 138 | [('the', 23135851162), ('abcs of', 10956800), ('of', 13151942776), ('aaron and', 10721728), ('abbott and', 7861376)] 139 | 140 | Note that `bigrams` now **erroneously** contains monograms. Precautions 141 | should be taken when creating a bigram dictionary with a custom separator. 142 | 143 | Create dictionary from plain text file 144 | ====================================== 145 | 146 | Given a plain text file like:: 147 | 148 | abc abc-def abc_def abc'def abc qwe qwe1 1qwe q1we 1234 1234 149 | 150 | We can create a dictionary from the file using 151 | :meth:`~symspellpy.symspellpy.SymSpell.create_dictionary` like: 152 | 153 | .. code-block:: python 154 | :emphasize-lines: 5 155 | 156 | from symspellpy import SymSpell 157 | 158 | sym_spell = SymSpell() 159 | corpus_path = <path/to/plain/text/file> 160 | sym_spell.create_dictionary(corpus_path) 161 | 162 | print(sym_spell.words) 163 | 164 | Output:: 165 | 166 | {'abc': 4, 'def': 2, "abc'def": 1, 'qwe': 1, 'qwe1': 1, '1qwe': 1, 'q1we': 1, '1234': 2} 167 | 168 | Note that :meth:`~symspellpy.symspellpy.SymSpell.create_dictionary` did not 169 | split words at apostrophes and did not check if the words contained numbers. 170 | -------------------------------------------------------------------------------- /docs/examples/index.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Examples 3 | ======== 4 | 5 | .. only:: html 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | 10 | dictionary.rst 11 | custom_distance_comparer.rst 12 | lookup.rst 13 | lookup_compound.rst 14 | word_segmentation.rst 15 | -------------------------------------------------------------------------------- /docs/examples/lookup.rst: -------------------------------------------------------------------------------- 1 | ****** 2 | lookup 3 | ****** 4 | 5 | Basic usage 6 | =========== 7 | 8 | .. code-block:: python 9 | :emphasize-lines: 15 10 | 11 | import importlib.resources 12 | 13 | from symspellpy import SymSpell, Verbosity 14 | 15 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) 16 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 17 | # term_index is the column of the term and count_index is the 18 | # column of the term frequency 19 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) 20 | 21 | # lookup suggestions for single-word input strings 22 | input_term = "memebers" # misspelling of "members" 23 | # max edit distance per lookup 24 | # (max_edit_distance_lookup <= max_dictionary_edit_distance) 25 | suggestions = sym_spell.lookup(input_term, Verbosity.CLOSEST, max_edit_distance=2) 26 | # display suggestion term, edit distance, and term frequency 27 | for suggestion in suggestions: 28 | print(suggestion) 29 | 30 | Output:: 31 | 32 | members, 1, 226656153 33 | 34 | Return original word if no correction within edit distance is found 35 | =================================================================== 36 | 37 | .. 
code-block:: python 38 | :emphasize-lines: 15,16,17 39 | 40 | import importlib.resources 41 | 42 | from symspellpy import SymSpell, Verbosity 43 | 44 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) 45 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 46 | # term_index is the column of the term and count_index is the 47 | # column of the term frequency 48 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) 49 | 50 | # lookup suggestions for single-word input strings 51 | input_term = "apastraphee" # misspelling of "apostrophe" 52 | # max edit distance per lookup 53 | # (max_edit_distance_lookup <= max_dictionary_edit_distance) 54 | suggestions = sym_spell.lookup( 55 | input_term, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True 56 | ) 57 | # display suggestion term, edit distance, and term frequency 58 | for suggestion in suggestions: 59 | print(suggestion) 60 | 61 | Output:: 62 | 63 | apastraphee, 3, 0 64 | 65 | Note that `suggestions` would have been empty if `include_unknown` was 66 | `False`. 67 | 68 | Avoid correcting phrases matching regex 69 | ======================================= 70 | 71 | .. code-block:: python 72 | :emphasize-lines: 14,15,16 73 | 74 | import importlib.resources 75 | 76 | from symspellpy import SymSpell, Verbosity 77 | 78 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) 79 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 80 | # term_index is the column of the term and count_index is the column of the term frequency 81 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) 82 | 83 | # lookup suggestions for single-word input strings 84 | input_term = "members1" 85 | # max edit distance per lookup 86 | # (max_edit_distance_lookup <= max_dictionary_edit_distance) 87 | suggestions = sym_spell.lookup( 88 | input_term, Verbosity.CLOSEST, max_edit_distance=2, ignore_token=r"\w+\d" 89 | ) 90 | # display suggestion term, edit distance, and term frequency 91 | for suggestion in suggestions: 92 | print(suggestion) 93 | 94 | Output:: 95 | 96 | members1, 0, 1 97 | 98 | Note that `members, 1, 226656153` would be returned if `ignore_token` wasn't 99 | specified. 100 | 101 | Keep original casing 102 | ==================== 103 | 104 | .. code-block:: python 105 | :emphasize-lines: 15,16,17 106 | 107 | import importlib.resources 108 | 109 | from symspellpy import SymSpell, Verbosity 110 | 111 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) 112 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 113 | # term_index is the column of the term and count_index is the 114 | # column of the term frequency 115 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) 116 | 117 | # lookup suggestions for single-word input strings 118 | input_term = "mEmEbers" 119 | # max edit distance per lookup 120 | # (max_edit_distance_lookup <= max_dictionary_edit_distance) 121 | suggestions = sym_spell.lookup( 122 | input_term, Verbosity.CLOSEST, max_edit_distance=2, transfer_casing=True 123 | ) 124 | # display suggestion term, edit distance, and term frequency 125 | for suggestion in suggestions: 126 | print(suggestion) 127 | 128 | 129 | Output:: 130 | 131 | mEmbers, 1, 226656153 132 | 133 | Note that the uppercase of the second "E" was not passed on to "b" in the 134 | corrected word. 
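List all suggestions within the maximum edit distance
=====================================================

`Verbosity.CLOSEST` returns only the suggestions with the smallest edit
distance found. The sketch below reuses the basic-usage setup but passes
`Verbosity.ALL`, which returns every dictionary match within
`max_edit_distance` (the exact suggestion list depends on the loaded
dictionary):

.. code-block:: python

    import importlib.resources

    from symspellpy import SymSpell, Verbosity

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt"
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

    input_term = "memebers"  # misspelling of "members"
    # Verbosity.ALL returns all matches within max_edit_distance,
    # not just the closest ones
    suggestions = sym_spell.lookup(input_term, Verbosity.ALL, max_edit_distance=2)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print(suggestion)

Suggestions are ordered by edit distance and then by term frequency, so the
closest and most frequent candidates come first.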
135 | -------------------------------------------------------------------------------- /docs/examples/lookup_compound.rst: -------------------------------------------------------------------------------- 1 | *************** 2 | lookup_compound 3 | *************** 4 | 5 | Basic usage 6 | =========== 7 | 8 | .. code-block:: python 9 | :emphasize-lines: 20 10 | 11 | import importlib.resources 12 | 13 | from symspellpy import SymSpell 14 | 15 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) 16 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 17 | bigram_path = importlib.resources.files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt" 18 | # term_index is the column of the term and count_index is the 19 | # column of the term frequency 20 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) 21 | sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2) 22 | 23 | # lookup suggestions for multi-word input strings (supports compound 24 | # splitting & merging) 25 | input_term = ( 26 | "whereis th elove hehad dated forImuch of thepast who " 27 | "couqdn'tread in sixtgrade and ins pired him" 28 | ) 29 | # max edit distance per lookup (per single word, not per whole input string) 30 | suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2) 31 | # display suggestion term, edit distance, and term frequency 32 | for suggestion in suggestions: 33 | print(suggestion) 34 | 35 | Output:: 36 | 37 | where is the love he had dated for much of the past who couldn't read in six grade and inspired him, 9, 0 38 | 39 | Keep original casing 40 | ==================== 41 | 42 | .. code-block:: python 43 | :emphasize-lines: 20,21,22 44 | 45 | import importlib.resources 46 | 47 | from symspellpy import SymSpell 48 | 49 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) 50 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 51 | bigram_path = importlib.resources.files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt" 52 | # term_index is the column of the term and count_index is the 53 | # column of the term frequency 54 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) 55 | sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2) 56 | 57 | # lookup suggestions for multi-word input strings (supports compound 58 | # splitting & merging) 59 | input_term = ( 60 | "whereis th elove heHAd dated forImuch of thEPast who " 61 | "couqdn'tread in sixtgrade and ins pired him" 62 | ) 63 | # max edit distance per lookup (per single word, not per whole input string) 64 | suggestions = sym_spell.lookup_compound( 65 | input_term, max_edit_distance=2, transfer_casing=True 66 | ) 67 | # display suggestion term, edit distance, and term frequency 68 | for suggestion in suggestions: 69 | print(suggestion) 70 | 71 | Output:: 72 | 73 | where is the love he HAd dated for much of thE Past who couldn't read in six grade and inspired him, 9, 0 74 | -------------------------------------------------------------------------------- /docs/examples/word_segmentation.rst: -------------------------------------------------------------------------------- 1 | ***************** 2 | word_segmentation 3 | ***************** 4 | 5 | Basic usage 6 | =========== 7 | 8 | .. 
code-block:: python 9 | :emphasize-lines: 14 10 | 11 | import importlib.resources 12 | 13 | from symspellpy.symspellpy import SymSpell 14 | 15 | # Set max_dictionary_edit_distance to avoid spelling correction 16 | sym_spell = SymSpell(max_dictionary_edit_distance=0, prefix_length=7) 17 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 18 | # term_index is the column of the term and count_index is the 19 | # column of the term frequency 20 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) 21 | 22 | # a sentence without any spaces 23 | input_term = "thequickbrownfoxjumpsoverthelazydog" 24 | result = sym_spell.word_segmentation(input_term) 25 | print(f"{result.corrected_string}, {result.distance_sum}, {result.log_prob_sum}") 26 | 27 | Output:: 28 | 29 | the quick brown fox jumps over the lazy dog, 8, -34.491167981910635 30 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. symspellpy documentation master file, created by 2 | sphinx-quickstart on Tue Feb 19 09:03:54 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | :orphan: 7 | 8 | .. title:: symspellpy: a SymSpell Python port 9 | 10 | .. toctree:: 11 | :hidden: 12 | 13 | users/installing 14 | examples/index 15 | api/index 16 | 17 | ********** 18 | symspellpy 19 | ********** 20 | 21 | symspellpy is a Python port of SymSpell_ v6.7.2, a Symmetric Delete 22 | spelling correction algorithm which provides much higher speed and lower 23 | memory consumption. 24 | 25 | .. _SymSpell: https://github.com/wolfgarbe/SymSpell 26 | 27 | Unit tests from the original project are implemented to ensure the accuracy 28 | of the port. Please note that the port has tried to replicate the code 29 | structure of the original project and has not been optimized for speed. 30 | 31 | Installation 32 | ============ 33 | 34 | Visit the :doc:`symspellpy installation instructions </users/installing>`. 35 | 36 | Usage examples 37 | ============== 38 | 39 | Check out :doc:`examples </examples/index>` to learn how to use symspellpy. 40 | 41 | Documentation 42 | ============= 43 | 44 | Check out the :doc:`documentation </api/index>`. 45 | 46 | Indices and tables 47 | ------------------ 48 | 49 | * :ref:`genindex` 50 | * :ref:`modindex` 51 | * :ref:`search` 52 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | editdistpy>=0.1.3 2 | numpydoc==1.8.0 3 | sphinx==8.2.3 4 | sphinx-autodoc-typehints==3.1.0 5 | -------------------------------------------------------------------------------- /docs/users/installing.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../../INSTALL.rst 2 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "symspellpy" 7 | version = "6.9.0" 8 | dependencies = [ 9 | "editdistpy>=0.1.3", 10 | ] 11 | requires-python = ">=3.9" 12 | authors = [ 13 | {name = "mmb L"}, 14 | ] 15 | description = "Python SymSpell" 16 | readme = "README.md" 17 | license = {file = "LICENSE"} 18 | keywords = ["spellchecker", "symspell", "word segmentation"] 19 | classifiers = [ 20 | "Development Status :: 4 - Beta", 21 | "Intended Audience :: Developers", 22 | "Intended Audience :: Education", 23 | "Natural Language :: English", 24 | "License :: OSI Approved :: MIT License", 25 | "Programming Language :: Python", 26 | "Programming Language :: Python :: 3", 27 | "Programming Language :: Python :: 3.9", 28 | "Programming Language :: Python :: 3.10", 29 | "Programming Language :: Python :: 3.11", 30 | "Programming Language :: Python :: 3.12", 31 | "Programming Language :: Python :: 3.13", 32 | ] 33 | 34 | [project.urls] 35 | Repository = "https://github.com/mammothb/symspellpy" 36 | Documentation = "https://symspellpy.readthedocs.io/en/latest" 37 | Changelog = "https://github.com/mammothb/symspellpy/blob/master/CHANGELOG.md" 38 | 39 | [tool.basedpyright] 40 | ignore = ["tests"] 41 | pythonVersion = "3.9" 42 | 43 | reportUnusedCallResult = "none" 44 | 45 | [tool.ruff] 46 | line-length = 88 47 | indent-width = 4 48 | 49 | [tool.ruff.format] 50 | docstring-code-format = false 51 | indent-style = "space" 52 | line-ending = "auto" 53 | quote-style = "double" 54 | skip-magic-trailing-comma = false 55 | 56 | [tool.setuptools.dynamic] 57 | version = {attr = "symspellpy.__version__"} 58 | 59 | [tool.setuptools.packages.find] 60 | where = ["."] 61 | include = ["symspellpy"] 62 | 63 | [tool.setuptools.package-data] 64 | symspellpy = ["frequency_*.txt"] 65 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | editdistpy>=0.1.3 2 | 3 | # For testing 4 | importlib-resources>=6.3.2 5 | pytest==8.3.4 6 | pytest-cov==6.0.0 7 | -------------------------------------------------------------------------------- /symspellpy/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2025 mmb L (Python port) 4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation) 5 | # 6 | # Permission is hereby granted, free of 
charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | 16 | """symspellpy 17 | 18 | .. moduleauthor:: mmb L 19 | .. moduleauthor:: Wolf Garbe 20 | """ 21 | 22 | from . import editdistance, helpers, logging 23 | from .symspellpy import SymSpell 24 | from .verbosity import Verbosity 25 | -------------------------------------------------------------------------------- /symspellpy/abstract_distance_comparer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Optional 3 | 4 | 5 | class AbstractDistanceComparer(ABC): 6 | """An interface to compute relative distance between two strings.""" 7 | 8 | @abstractmethod 9 | def distance( 10 | self, string_1: Optional[str], string_2: Optional[str], max_distance: int 11 | ) -> int: 12 | """Returns a measure of the distance between two strings. 13 | 14 | Args: 15 | string_1: One of the strings to compare. 16 | string_2: The other string to compare. 17 | max_distance: The maximum distance that is of interest. 18 | 19 | Returns: 20 | -1 if the distance is greater than the max_distance, 0 if the strings 21 | are equivalent, otherwise a positive number whose magnitude 22 | increases as difference between the strings increases. 23 | """ 24 | -------------------------------------------------------------------------------- /symspellpy/composition.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2025 mmb L (Python port) 4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation) 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | 16 | """ 17 | .. module:: composition 18 | :synopsis: Data class for :meth:`symspellpy.symspellpy.word_segmentation`. 19 | """ 20 | 21 | from typing import NamedTuple 22 | 23 | 24 | class Composition(NamedTuple): 25 | """Used by :meth:`word_segmentation`. 26 | 27 | Attributes: 28 | segmented_string: The word segmented string. 29 | corrected_string: The spelling corrected string. 30 | distance_sum: The sum of edit distance between input string and 31 | corrected string. 32 | log_prob_sum: The sum of word occurrence probabilities in log 33 | scale (a measure of how common and probable the corrected 34 | segmentation is). 
35 | """ 36 | 37 | segmented_string: str = "" 38 | corrected_string: str = "" 39 | distance_sum: int = 0 40 | log_prob_sum: float = 0 41 | 42 | @classmethod 43 | def create( 44 | cls, 45 | composition: "Composition", 46 | segmented_part: str, 47 | corrected_part: str, 48 | distance: int, 49 | log_prob: float, 50 | ) -> "Composition": 51 | """Creates a Composition by appending to an existing Composition.""" 52 | return cls( 53 | composition.segmented_string + segmented_part, 54 | composition.corrected_string + corrected_part, 55 | composition.distance_sum + distance, 56 | composition.log_prob_sum + log_prob, 57 | ) 58 | -------------------------------------------------------------------------------- /symspellpy/editdistance.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2025 mmb L (Python port) 4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation) 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | 16 | """ 17 | .. module:: editdistance 18 | :synopsis: Module for edit distance algorithms. 19 | """ 20 | 21 | import warnings 22 | from enum import Enum 23 | from typing import Optional 24 | 25 | from editdistpy import damerau_osa, levenshtein 26 | 27 | from symspellpy import helpers 28 | from symspellpy.abstract_distance_comparer import AbstractDistanceComparer 29 | 30 | 31 | class DistanceAlgorithm(Enum): 32 | """Supported edit distance algorithms.""" 33 | 34 | LEVENSHTEIN = 0 #: Levenshtein algorithm. 35 | DAMERAU_OSA = 1 #: Damerau optimal string alignment algorithm 36 | LEVENSHTEIN_FAST = 2 #: Fast Levenshtein algorithm. 37 | DAMERAU_OSA_FAST = 3 #: Fast Damerau optimal string alignment algorithm 38 | USER_PROVIDED = 4 #: User provided custom edit distance algorithm 39 | 40 | 41 | class EditDistance: 42 | """Edit distance algorithms. 43 | 44 | Args: 45 | algorithm: The distance algorithm to use. 46 | 47 | Attributes: 48 | _algorithm (:class:`DistanceAlgorithm`): The edit distance algorithm to 49 | use. 50 | _distance_comparer (:class:`AbstractDistanceComparer`): An object to 51 | compute the relative distance between two strings. The concrete 52 | object will be chosen based on the value of :attr:`_algorithm`. 53 | 54 | Raises: 55 | ValueError: If `algorithm` specifies an invalid distance algorithm. 56 | """ 57 | 58 | def __init__( 59 | self, 60 | algorithm: DistanceAlgorithm, 61 | comparer: Optional[AbstractDistanceComparer] = None, 62 | ) -> None: 63 | if algorithm != DistanceAlgorithm.USER_PROVIDED and comparer is not None: 64 | warnings.warn( 65 | f"A comparer is passed in but algorithm is not {DistanceAlgorithm.USER_PROVIDED.value}. A built-in comparer will be used." 
66 | ) 67 | 68 | self._distance_comparer: AbstractDistanceComparer 69 | self._algorithm = algorithm 70 | if algorithm == DistanceAlgorithm.LEVENSHTEIN: 71 | self._distance_comparer = Levenshtein() 72 | elif algorithm == DistanceAlgorithm.DAMERAU_OSA: 73 | self._distance_comparer = DamerauOsa() 74 | elif algorithm == DistanceAlgorithm.LEVENSHTEIN_FAST: 75 | self._distance_comparer = LevenshteinFast() 76 | elif algorithm == DistanceAlgorithm.DAMERAU_OSA_FAST: 77 | self._distance_comparer = DamerauOsaFast() 78 | elif algorithm == DistanceAlgorithm.USER_PROVIDED: 79 | if not isinstance(comparer, AbstractDistanceComparer): 80 | raise ValueError( 81 | f"{algorithm.value} selected but no comparer passed in." 82 | ) 83 | self._distance_comparer = comparer 84 | else: 85 | raise ValueError("unknown distance algorithm") 86 | 87 | def compare(self, string_1: str, string_2: str, max_distance: int) -> int: 88 | """Compares a string to the base string to determine the edit distance, 89 | using the previously selected algorithm. 90 | 91 | Args: 92 | string_1: Base string. 93 | string_2: The string to compare. 94 | max_distance: The maximum distance allowed. 95 | 96 | Returns: 97 | The edit distance (or -1 if `max_distance` exceeded). 98 | """ 99 | return self._distance_comparer.distance(string_1, string_2, max_distance) 100 | 101 | 102 | class Levenshtein(AbstractDistanceComparer): 103 | """Provides Levenshtein algorithm for computing edit distance metric between 104 | two strings. 105 | 106 | Attributes: 107 | _base_char_1_costs (list[int]): 108 | """ 109 | 110 | def __init__(self): 111 | self._base_char_1_costs: list[int] = [] 112 | 113 | def distance( 114 | self, string_1: Optional[str], string_2: Optional[str], max_distance: int 115 | ) -> int: 116 | """Computes the Levenshtein edit distance between two strings. 117 | 118 | Args: 119 | string_1: One of the strings to compare. 120 | string_2: The other string to compare. 121 | max_distance: The maximum distance that is of interest. 122 | 123 | Returns: 124 | -1 if the distance is greater than the max_distance, 0 if the strings 125 | are equivalent, otherwise a positive number whose magnitude 126 | increases as difference between the strings increases. 127 | """ 128 | if string_1 is None or string_2 is None: 129 | return helpers.null_distance_results(string_1, string_2, max_distance) 130 | if max_distance <= 0: 131 | return 0 if string_1 == string_2 else -1 132 | max_distance = int(min(2**31 - 1, max_distance)) 133 | # if strings of different lengths, ensure shorter string is in string_1. 134 | # This can result in a little faster speed by spending more time spinning 135 | # just the inner loop during the main processing. 
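        # e.g. distance("abcdef", "abc", 2) swaps the inputs so string_1 is
        # "abc"; the length-difference check below then returns -1 early,
        # since 6 - 3 > 2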
136 | if len(string_1) > len(string_2): 137 | string_2, string_1 = string_1, string_2 138 | if len(string_2) - len(string_1) > max_distance: 139 | return -1 140 | # identify common suffix and/or prefix that can be ignored 141 | len_1, len_2, start = helpers.prefix_suffix_prep(string_1, string_2) 142 | if len_1 == 0: 143 | return len_2 if len_2 <= max_distance else -1 144 | 145 | if len_2 > len(self._base_char_1_costs): 146 | self._base_char_1_costs = [0 for _ in range(len_2)] 147 | if max_distance < len_2: 148 | return self._distance_max( 149 | string_1, 150 | string_2, 151 | len_1, 152 | len_2, 153 | start, 154 | max_distance, 155 | self._base_char_1_costs, 156 | ) 157 | return self._distance( 158 | string_1, string_2, len_1, len_2, start, self._base_char_1_costs 159 | ) 160 | 161 | @staticmethod 162 | def _distance( 163 | string_1: str, 164 | string_2: str, 165 | len_1: int, 166 | len_2: int, 167 | start: int, 168 | char_1_costs: list[int], 169 | ) -> int: 170 | """Internal implementation of the core Levenshtein algorithm. 171 | 172 | **From**: https://github.com/softwx/SoftWx.Match 173 | """ 174 | char_1_costs = [j + 1 for j in range(len_2)] 175 | current_cost = 0 176 | for i in range(len_1): 177 | left_char_cost = above_char_cost = i 178 | char_1 = string_1[start + i] 179 | for j in range(len_2): 180 | # cost of diagonal (substitution) 181 | current_cost = left_char_cost 182 | left_char_cost = char_1_costs[j] 183 | if string_2[start + j] != char_1: 184 | # substitution if neither of the two conditions below 185 | if above_char_cost < current_cost: 186 | current_cost = above_char_cost 187 | if left_char_cost < current_cost: 188 | current_cost = left_char_cost 189 | current_cost += 1 190 | char_1_costs[j] = above_char_cost = current_cost 191 | return current_cost 192 | 193 | @staticmethod 194 | def _distance_max( 195 | string_1: str, 196 | string_2: str, 197 | len_1: int, 198 | len_2: int, 199 | start: int, 200 | max_distance: int, 201 | char_1_costs: list[int], 202 | ) -> int: 203 | """Internal implementation of the core Levenshtein algorithm that accepts 204 | a max_distance. 
205 | 206 | **From**: https://github.com/softwx/SoftWx.Match 207 | """ 208 | char_1_costs = [ 209 | j + 1 if j < max_distance else max_distance + 1 for j in range(len_2) 210 | ] 211 | len_diff = len_2 - len_1 212 | j_start_offset = max_distance - len_diff 213 | j_start = 0 214 | j_end = max_distance 215 | current_cost = 0 216 | for i in range(len_1): 217 | char_1 = string_1[start + i] 218 | prev_char_1_cost = above_char_cost = i 219 | # no need to look beyond window of lower right diagonal - 220 | # max_distance cells (lower right diag is i - lenDiff) and the upper 221 | # left diagonal + max_distance cells (upper left is i) 222 | j_start += 1 if i > j_start_offset else 0 223 | j_end += 1 if j_end < len_2 else 0 224 | for j in range(j_start, j_end): 225 | # cost of diagonal (substitution) 226 | current_cost = prev_char_1_cost 227 | prev_char_1_cost = char_1_costs[j] 228 | if string_2[start + j] != char_1: 229 | # substitution if neither of the two conditions below 230 | if above_char_cost < current_cost: 231 | current_cost = above_char_cost 232 | if prev_char_1_cost < current_cost: 233 | current_cost = prev_char_1_cost 234 | current_cost += 1 235 | char_1_costs[j] = above_char_cost = current_cost 236 | if char_1_costs[i + len_diff] > max_distance: 237 | return -1 238 | return current_cost if current_cost <= max_distance else -1 239 | 240 | 241 | class DamerauOsa(AbstractDistanceComparer): 242 | """Provides optimized methods for computing Damerau-Levenshtein Optimal 243 | String Alignment (OSA) comparisons between two strings. 244 | 245 | Attributes: 246 | _base_char_1_costs (list[int]): 247 | _base_prev_char_1_costs (list[int]): 248 | """ 249 | 250 | def __init__(self) -> None: 251 | self._base_char_1_costs: list[int] = [] 252 | self._base_prev_char_1_costs: list[int] = [] 253 | 254 | def distance( 255 | self, string_1: Optional[str], string_2: Optional[str], max_distance: int 256 | ) -> int: 257 | """Computes the Damerau-Levenshtein optimal string alignment edit 258 | distance between two strings. 259 | 260 | Args: 261 | string_1: One of the strings to compare. 262 | string_2: The other string to compare. 263 | max_distance: The maximum distance that is of interest. 264 | 265 | Returns: 266 | -1 if the distance is greater than the max_distance, 0 if the strings 267 | are equivalent, otherwise a positive number whose magnitude 268 | increases as difference between the strings increases. 269 | """ 270 | if string_1 is None or string_2 is None: 271 | return helpers.null_distance_results(string_1, string_2, max_distance) 272 | if max_distance <= 0: 273 | return 0 if string_1 == string_2 else -1 274 | max_distance = int(min(2**31 - 1, max_distance)) 275 | # if strings of different lengths, ensure shorter string is in string_1. 276 | # This can result in a little faster speed by spending more time spinning 277 | # just the inner loop during the main processing. 
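        # same normalization as in Levenshtein.distance: keep the shorter
        # string in string_1 so that only the inner loop runs over the longer
        # string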
278 | if len(string_1) > len(string_2): 279 | string_2, string_1 = string_1, string_2 280 | if len(string_2) - len(string_1) > max_distance: 281 | return -1 282 | # identify common suffix and/or prefix that can be ignored 283 | len_1, len_2, start = helpers.prefix_suffix_prep(string_1, string_2) 284 | if len_1 == 0: 285 | return len_2 if len_2 <= max_distance else -1 286 | 287 | if len_2 > len(self._base_char_1_costs): 288 | self._base_char_1_costs = [0 for _ in range(len_2)] 289 | self._base_prev_char_1_costs = [0 for _ in range(len_2)] 290 | if max_distance < len_2: 291 | return self._distance_max( 292 | string_1, 293 | string_2, 294 | len_1, 295 | len_2, 296 | start, 297 | max_distance, 298 | self._base_char_1_costs, 299 | self._base_prev_char_1_costs, 300 | ) 301 | return self._distance( 302 | string_1, 303 | string_2, 304 | len_1, 305 | len_2, 306 | start, 307 | self._base_char_1_costs, 308 | self._base_prev_char_1_costs, 309 | ) 310 | 311 | @staticmethod 312 | def _distance( 313 | string_1: str, 314 | string_2: str, 315 | len_1: int, 316 | len_2: int, 317 | start: int, 318 | char_1_costs: list[int], 319 | prev_char_1_costs: list[int], 320 | ) -> int: 321 | """Internal implementation of the core Damerau-Levenshtein, optimal 322 | string alignment algorithm. 323 | 324 | **From**: https://github.com/softwx/SoftWx.Match 325 | """ 326 | char_1_costs = [j + 1 for j in range(len_2)] 327 | char_1 = " " 328 | current_cost = 0 329 | for i in range(len_1): 330 | prev_char_1 = char_1 331 | char_1 = string_1[start + i] 332 | char_2 = " " 333 | left_char_cost = above_char_cost = i 334 | next_trans_cost = 0 335 | for j in range(len_2): 336 | this_trans_cost = next_trans_cost 337 | next_trans_cost = prev_char_1_costs[j] 338 | # cost of diagonal (substitution) 339 | prev_char_1_costs[j] = current_cost = left_char_cost 340 | # left now equals current cost (which will be diagonal 341 | # at next iteration) 342 | left_char_cost = char_1_costs[j] 343 | prev_char_2 = char_2 344 | char_2 = string_2[start + j] 345 | if char_1 != char_2: 346 | # substitution if neither of two conditions below 347 | if above_char_cost < current_cost: 348 | current_cost = above_char_cost 349 | if left_char_cost < current_cost: 350 | current_cost = left_char_cost 351 | current_cost += 1 352 | if ( 353 | i != 0 354 | and j != 0 355 | and char_1 == prev_char_2 356 | and prev_char_1 == char_2 357 | and this_trans_cost + 1 < current_cost 358 | ): 359 | # transposition 360 | current_cost = this_trans_cost + 1 361 | char_1_costs[j] = above_char_cost = current_cost 362 | return current_cost 363 | 364 | @staticmethod 365 | def _distance_max( 366 | string_1: str, 367 | string_2: str, 368 | len_1: int, 369 | len_2: int, 370 | start: int, 371 | max_distance: int, 372 | char_1_costs: list[int], 373 | prev_char_1_costs: list[int], 374 | ) -> int: 375 | """Internal implementation of the core Damerau-Levenshtein, optimal 376 | string alignment algorithm that accepts a max_distance. 
377 | 378 | **From**: https://github.com/softwx/SoftWx.Match 379 | """ 380 | char_1_costs = [ 381 | j + 1 if j < max_distance else max_distance + 1 for j in range(len_2) 382 | ] 383 | len_diff = len_2 - len_1 384 | j_start_offset = max_distance - len_diff 385 | j_start = 0 386 | j_end = max_distance 387 | char_1 = " " 388 | current_cost = 0 389 | for i in range(len_1): 390 | prev_char_1 = char_1 391 | char_1 = string_1[start + i] 392 | char_2 = " " 393 | left_char_cost = above_char_cost = i 394 | next_trans_cost = 0 395 | # no need to look beyond window of lower right diagonal - 396 | # max_distance cells (lower right diag is i - len_diff) and the upper 397 | # left diagonal + max_distance cells (upper left is i) 398 | j_start += 1 if i > j_start_offset else 0 399 | j_end += 1 if j_end < len_2 else 0 400 | for j in range(j_start, j_end): 401 | this_trans_cost = next_trans_cost 402 | next_trans_cost = prev_char_1_costs[j] 403 | # cost of diagonal (substitution) 404 | prev_char_1_costs[j] = current_cost = left_char_cost 405 | # left now equals current cost (which will be diagonal at next 406 | # iteration) 407 | left_char_cost = char_1_costs[j] 408 | prev_char_2 = char_2 409 | char_2 = string_2[start + j] 410 | if char_1 != char_2: 411 | # substitution if neither of two conditions below 412 | if above_char_cost < current_cost: 413 | current_cost = above_char_cost 414 | if left_char_cost < current_cost: 415 | current_cost = left_char_cost 416 | current_cost += 1 417 | if ( 418 | i != 0 419 | and j != 0 420 | and char_1 == prev_char_2 421 | and prev_char_1 == char_2 422 | and this_trans_cost + 1 < current_cost 423 | ): 424 | # transposition 425 | current_cost = this_trans_cost + 1 426 | char_1_costs[j] = above_char_cost = current_cost 427 | if char_1_costs[i + len_diff] > max_distance: 428 | return -1 429 | return current_cost if current_cost <= max_distance else -1 430 | 431 | 432 | class LevenshteinFast(AbstractDistanceComparer): 433 | """Provides an interface for computing edit distance metric between two 434 | strings using the fast Levenshtein algorithm. 435 | """ 436 | 437 | def distance( 438 | self, string_1: Optional[str], string_2: Optional[str], max_distance: int 439 | ) -> int: 440 | """Computes the Levenshtein edit distance between two strings. 441 | 442 | Args: 443 | string_1: One of the strings to compare. 444 | string_2: The other string to compare. 445 | max_distance: The maximum distance that is of interest. 446 | 447 | Returns: 448 | -1 if the distance is greater than the max_distance, 0 if the strings 449 | are equivalent, otherwise a positive number whose magnitude 450 | increases as difference between the strings increases. 451 | """ 452 | return levenshtein.distance(string_1, string_2, max_distance) 453 | 454 | 455 | class DamerauOsaFast(AbstractDistanceComparer): 456 | """Provides an interface for computing edit distance metric between two 457 | strings using the fast Damerau-Levenshtein Optimal String Alignment (OSA) 458 | algorithm. 459 | """ 460 | 461 | def distance( 462 | self, string_1: Optional[str], string_2: Optional[str], max_distance: int 463 | ) -> int: 464 | """Computes the Damerau-Levenshtein optimal string alignment edit 465 | distance between two strings. 466 | 467 | Args: 468 | string_1: One of the strings to compare. 469 | string_2: The other string to compare. 470 | max_distance: The maximum distance that is of interest. 
471 | 
472 |         Returns:
473 |             -1 if the distance is greater than the max_distance, 0 if the strings
474 |             are equivalent, otherwise a positive number whose magnitude
475 |             increases as difference between the strings increases.
476 |         """
477 |         return damerau_osa.distance(string_1, string_2, max_distance)
478 | 
--------------------------------------------------------------------------------
/symspellpy/helpers.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | #
 3 | # Copyright (c) 2025 mmb L (Python port)
 4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation)
 5 | #
 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | # of this software and associated documentation files (the "Software"), to deal
 8 | # in the Software without restriction, including without limitation the rights
 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 | 
16 | """
17 | .. module:: helpers
18 |    :synopsis: Helper functions
19 | """
20 | 
21 | import re
22 | import sys
23 | import warnings
24 | from difflib import SequenceMatcher
25 | from typing import Optional
26 | 
27 | 
28 | def _rename_args(kwargs_map: dict[str, str], version: str):
29 |     def decorator(func):
30 |         def wrapped(*args, **kwargs):
31 |             new_kwargs = {}
32 |             for k, v in kwargs.items():
33 |                 if k in kwargs_map:
34 |                     warnings.warn(
35 |                         f"Keyword argument '{k}' is deprecated and will be removed in {version}. Use '{kwargs_map[k]}' instead.",
36 |                         DeprecationWarning,
37 |                     )
38 |                 new_kwargs[kwargs_map.get(k, k)] = v
39 |             return func(*args, **new_kwargs)
40 | 
41 |         return wrapped
42 | 
43 |     return decorator
44 | 
45 | 
46 | def case_transfer_matching(cased_text: str, uncased_text: str) -> str:
47 |     """Transfers the casing from one text to another - assuming that they are
48 |     'matching' texts, that is, they have the same length.
49 | 
50 |     Args:
51 |         cased_text: Text with varied casing.
52 |         uncased_text: Text that is in lowercase only.
53 | 
54 |     Returns:
55 |         Text with the content of `uncased_text` and the casing of `cased_text`.
56 | 
57 |     Raises:
58 |         ValueError: If the input texts have different lengths.
59 |     """
60 |     if len(cased_text) != len(uncased_text):
61 |         raise ValueError(
62 |             "'cased_text' and 'uncased_text' don't have the same length, use case_transfer_similar() instead"
63 |         )
64 | 
65 |     return "".join(
66 |         [
67 |             y.upper() if x.isupper() else y.lower()
68 |             for x, y in zip(cased_text, uncased_text)
69 |         ]
70 |     )
71 | 
72 | 
73 | def case_transfer_similar(cased_text: str, uncased_text: str) -> str:
74 |     """Transfers the casing from one text to another - for similar (not matching)
75 |     texts.
76 | 
77 |     Use `difflib.SequenceMatcher` to identify the different types of changes
78 |     needed to turn `cased_text` into `uncased_text`.
79 | 
80 |     - For inserted sections: transfer the casing from the prior character. If
81 |       there is no character before, or the character before is a space, transfer
82 |       the casing from the following character.
83 |     - For deleted sections: no case transfer is required.
84 |     - For equal sections: swap out the text with the original, cased one, as
85 |       apart from the casing the two are the same.
86 |     - For replaced sections: transfer the casing using
87 |       :meth:`case_transfer_matching` if the two have the same length, otherwise
88 |       transfer character-by-character and carry the last casing over to any
89 |       additional characters.
90 | 
91 |     Args:
92 |         cased_text: Text with varied casing.
93 |         uncased_text: Text in lowercase.
94 | 
95 |     Returns:
96 |         Text with the content of `uncased_text` but the casing of `cased_text`.
97 | 
98 |     Raises:
99 |         ValueError: If `cased_text` is empty.
100 |     """
101 |     if not uncased_text:
102 |         return uncased_text
103 | 
104 |     if not cased_text:
105 |         raise ValueError("'cased_text' cannot be empty")
106 | 
107 |     matcher = SequenceMatcher(a=cased_text.lower(), b=uncased_text)
108 |     result = ""
109 | 
110 |     for tag, i1, i2, j1, j2 in matcher.get_opcodes():
111 |         if tag == "delete":
112 |             continue
113 |         if tag == "insert":
114 |             # For the first character or a space on the left, take the casing from
115 |             # the following character. Else take the casing from the prior character
116 |             ia_ref = i1 if i1 == 0 or cased_text[i1 - 1] == " " else i1 - 1
117 |             if cased_text[ia_ref].isupper():
118 |                 result += uncased_text[j1:j2].upper()
119 |             else:
120 |                 result += uncased_text[j1:j2].lower()
121 |         elif tag == "equal":
122 |             # Transfer the text from the cased_text, as anyhow they are equal
123 |             # (without the casing)
124 |             result += cased_text[i1:i2]
125 |         else:
126 |             cased_seq = cased_text[i1:i2]
127 |             uncased_seq = uncased_text[j1:j2]
128 | 
129 |             if len(cased_seq) == len(uncased_seq):
130 |                 result += case_transfer_matching(cased_seq, uncased_seq)
131 |             else:
132 |                 # transfer the casing character-by-character, using the last
133 |                 # casing to continue if we run out of the sequence
134 |                 for cased, uncased in zip(cased_seq, uncased_seq):
135 |                     result += uncased.upper() if cased.isupper() else uncased.lower()
136 |                 # Apply casing from the last character of cased_seq to the rest
137 |                 # of the uncased_seq
138 |                 if len(cased_seq) < len(uncased_seq):
139 |                     upper = cased_seq[-1].isupper()
140 |                     idx = len(cased_seq)
141 |                     result += "".join(
142 |                         map(str.upper if upper else str.lower, uncased_seq[idx:])
143 |                     )
144 |     return result
145 | 
146 | 
147 | def increment_count(count: int, count_previous: int) -> int:
148 |     """Increments count up to ``sys.maxsize``."""
149 |     return (
150 |         count_previous + count if sys.maxsize - count_previous > count else sys.maxsize
151 |     )
152 | 
153 | 
154 | def is_acronym(word: str, contain_digits: bool = False) -> bool:
155 |     """Checks if the word is all caps (an acronym) and/or contains numbers.
156 | 
157 |     Args:
158 |         word: The word to check.
159 |         contain_digits: A flag to determine whether any term with digits can be
160 |             considered an acronym.
161 | 
162 |     Returns:
163 |         True if the word is all caps and/or contains numbers (with
164 |         ``contain_digits``), e.g., ABCDE, AB12C, abc12, ab12c. False if the
165 |         word contains lower case letters, e.g., abcde, ABCde, abcDE, abCDe.
166 |     """
167 |     return re.match(r"\b[A-Z0-9]{2,}\b", word) is not None or (
168 |         contain_digits and any(i.isdigit() for i in word)
169 |     )
170 | 
171 | 
172 | @_rename_args({"string1": "string_1", "string2": "string_2"}, "v7.0.0")
173 | def null_distance_results(
174 |     string_1: Optional[str], string_2: Optional[str], max_distance: int
175 | ) -> int:
176 |     """Determines the proper return value of an edit distance function when one
177 |     or both strings are null.
178 | 
179 |     Args:
180 |         string_1: Base string.
181 |         string_2: The string to compare.
182 |         max_distance: The maximum distance allowed.
183 | 
184 |     Returns:
185 |         -1 if the distance is greater than the max_distance, 0 if the strings are
186 |         equivalent (both are None), otherwise a positive number whose
187 |         magnitude is the length of the string which is not None.
188 |     """
189 |     if string_1 is None:
190 |         if string_2 is None:
191 |             return 0
192 |         return len(string_2) if len(string_2) <= max_distance else -1
193 |     return len(string_1) if len(string_1) <= max_distance else -1
194 | 
195 | 
196 | def parse_words(
197 |     phrase: str, preserve_case: bool = False, split_by_space: bool = False
198 | ) -> list[str]:
199 |     """Creates a non-unique wordlist from sample text. Language independent
200 |     (e.g. works with Chinese characters).
201 | 
202 |     Args:
203 |         phrase: Sample text that could contain one or more words.
204 |         preserve_case: A flag to determine if we want to preserve the casing or
205 |             convert all to lowercase.
206 |         split_by_space: Splits the phrase into words simply based on space.
207 | 
208 |     Returns:
209 |         A list of words.
210 |     """
211 |     if split_by_space:
212 |         if preserve_case:
213 |             return phrase.split()
214 |         return phrase.lower().split()
215 |     # \W non-words, use negated set to ignore non-words and "_" (underscore).
216 |     # Compatible with non-latin characters, does not split words at apostrophes
217 |     if preserve_case:
218 |         return re.findall(r"([^\W_]+['’]*[^\W_]*)", phrase)
219 |     return re.findall(r"([^\W_]+['’]*[^\W_]*)", phrase.lower())
220 | 
221 | 
222 | @_rename_args({"string1": "string_1", "string2": "string_2"}, "v7.0.0")
223 | def prefix_suffix_prep(string_1: str, string_2: str) -> tuple[int, int, int]:
224 |     """Calculates starting position and lengths of two strings such that common
225 |     prefix and suffix substrings are excluded.
226 |     Expects len(string_1) <= len(string_2).
227 | 
228 |     Args:
229 |         string_1: Base string.
230 |         string_2: The string to compare.
231 | 
232 |     Returns:
233 |         A tuple of lengths of the part excluding common prefix and suffix, and
234 |         the starting position.
235 |     """
236 |     # this is also the minimum length of the two strings
237 |     len_1 = len(string_1)
238 |     len_2 = len(string_2)
239 |     # suffix common to both strings can be ignored
240 |     while len_1 != 0 and string_1[len_1 - 1] == string_2[len_2 - 1]:
241 |         len_1 -= 1
242 |         len_2 -= 1
243 |     # prefix common to both strings can be ignored
244 |     start = 0
245 |     while start != len_1 and string_1[start] == string_2[start]:
246 |         start += 1
247 |     if start != 0:
248 |         len_1 -= start
249 |         # length of the part excluding common prefix and suffix
250 |         len_2 -= start
251 |     return len_1, len_2, start
252 | 
253 | 
254 | def to_similarity(distance: int, length: int) -> float:
255 |     """Calculates a similarity measure from an edit distance.
256 | 
257 |     Args:
258 |         distance: The edit distance between two strings.
259 |         length: The length of the longer of the two strings the edit distance is
260 |             from.
261 | 
262 |     Returns:
263 |         A similarity value from 0 to 1.0 (1 - (distance / length)), or -1 if
264 |         distance is negative.
265 |     """
266 |     return -1 if distance < 0 else 1.0 - distance / length
267 | 
268 | 
269 | def try_parse_int64(string: str) -> Optional[int]:
270 |     """Converts the string representation of a number to its 64-bit signed
271 |     integer equivalent.
272 | 
273 |     Args:
274 |         string: String representation of a number.
275 | 
276 |     Returns:
277 |         The 64-bit signed integer equivalent, or None if conversion failed or if
278 |         the number is less than the min value or greater than the max value
279 |         of a 64-bit signed integer.
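
    Example (values follow from the range check below):
        >>> try_parse_int64("123")
        123
        >>> try_parse_int64("not a number") is None
        True
        >>> try_parse_int64(str(2**64)) is None
        True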
280 |     """
281 |     try:
282 |         ret = int(string)
283 |     except ValueError:
284 |         return None
285 |     return ret if -(2**63) <= ret <= 2**63 - 1 else None
286 | 
287 | 
288 | class DictIO:
289 |     """An iterator wrapper for a Python dictionary to format the output as required
290 |     by :meth:`load_dictionary_stream` and :meth:`load_dictionary_bigram_stream`.
291 | 
292 |     Args:
293 |         dictionary: Dictionary with words as keys and frequency counts as values.
294 |         separator: Separator characters between term(s) and count.
295 | 
296 |     Attributes:
297 |         iteritems: An iterator object of dictionary.items().
298 |         separator: Separator characters between term(s) and count.
299 |     """
300 | 
301 |     def __init__(self, dictionary: dict[str, int], separator: str = " ") -> None:
302 |         self.iteritems = iter(dictionary.items())
303 |         self.separator = separator
304 | 
305 |     def __iter__(self) -> "DictIO":
306 |         return self
307 | 
308 |     def __next__(self) -> str:
309 |         return self.separator.join(map(str, next(self.iteritems)))
310 | 
--------------------------------------------------------------------------------
/symspellpy/logging.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | #
 3 | # Copyright (c) 2025 mmb L (Python port)
 4 | #
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | import logging
16 | import sys
17 | 
18 | logger = logging.getLogger("symspellpy")
19 | 
20 | handler = logging.StreamHandler(sys.stderr)
21 | handler.setFormatter(
22 |     logging.Formatter(fmt="%(asctime)s: %(levelname).1s %(name)s] %(message)s")
23 | )
24 | 
25 | logger.addHandler(handler)
26 | 
--------------------------------------------------------------------------------
/symspellpy/pickle_mixin.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | #
 3 | # Copyright (c) 2025 mmb L (Python port)
 4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation)
 5 | #
 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | # of this software and associated documentation files (the "Software"), to deal
 8 | # in the Software without restriction, including without limitation the rights
 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 | 
16 | """
17 | .. module:: pickle_mixin
18 |    :synopsis: Mixin to provide pickle loading and saving functionalities.
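
Example (illustrative sketch; :class:`~symspellpy.symspellpy.SymSpell` mixes in
this class, so the methods below are called on a ``SymSpell`` instance)::

    from pathlib import Path

    from symspellpy import SymSpell

    sym_spell = SymSpell()
    # ... load a frequency dictionary with load_dictionary() ...
    sym_spell.save_pickle(Path("dictionary.pickle"))

    restored = SymSpell()
    restored.load_pickle(Path("dictionary.pickle"))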
19 | """
20 | 
21 | import gzip
22 | import logging
23 | import pickle
24 | from operator import itemgetter
25 | from pathlib import Path
26 | from typing import IO, Optional, Union, cast
27 | 
28 | logger = logging.getLogger(__name__)
29 | 
30 | 
31 | # Protocol only available in py38
32 | # class SymSpellProtocol(Protocol):
33 | #     data_version: int
34 | #     _count_threshold: int
35 | #     _max_dictionary_edit_distance: int
36 | #     _prefix_length: int
37 | #     _deletes: dict[str, list[str]]
38 | #     _words: dict[str, int]
39 | #     _max_length: int
40 | 
41 | 
42 | class PickleMixin:
43 |     """Implements saving and loading pickle functionality for SymSpell."""
44 | 
45 |     data_version: int
46 |     _below_threshold_words: dict[str, int]
47 |     _bigrams: dict[str, int]
48 |     _deletes: dict[str, list[str]]
49 |     _words: dict[str, int]
50 | 
51 |     _count_threshold: int
52 |     _max_dictionary_edit_distance: int
53 |     _max_length: int
54 |     _prefix_length: int
55 | 
56 |     def load_pickle(
57 |         self,
58 |         data: Union[bytes, Path],
59 |         compressed: bool = True,
60 |         from_bytes: bool = False,
61 |     ) -> bool:
62 |         """Loads delete combinations from file as pickle. This will reduce the
63 |         loading time compared to running :meth:`load_dictionary` again.
64 | 
65 |         Args:
66 |             data: Either a bytes string to be used with ``from_bytes=True`` or the
67 |                 path+filename of the pickle file to be used with
68 |                 ``from_bytes=False``.
69 |             compressed: A flag to determine whether to read the pickled data as
70 |                 compressed data.
71 |             from_bytes: Flag to determine if we are loading from bytes or file.
72 | 
73 |         Returns:
74 |             ``True`` if delete combinations are successfully loaded.
75 |         """
76 |         if from_bytes:
77 |             assert isinstance(data, bytes)
78 |             return self._load_pickle_stream(data, from_bytes)
79 |         if compressed:
80 |             with gzip.open(data, "rb") as gzip_infile:
81 |                 return self._load_pickle_stream(cast(IO[bytes], gzip_infile))
82 |         else:
83 |             with open(data, "rb") as infile:
84 |                 return self._load_pickle_stream(infile)
85 | 
86 |     def save_pickle(
87 |         self,
88 |         filename: Optional[Path] = None,
89 |         compressed: bool = True,
90 |         to_bytes: bool = False,
91 |     ) -> Optional[bytes]:
92 |         """Pickles :attr:`_deletes`, :attr:`_words`, and :attr:`_max_length` into
93 |         a stream for quicker loading later.
94 | 
95 |         Args:
96 |             filename: The path+filename of the pickle file.
97 |             compressed: A flag to determine whether to compress the pickled data.
98 |             to_bytes: Flag to determine whether a bytes string should be returned
99 |                 instead of writing to file.
100 | 
101 |         Returns:
102 |             A byte string of the pickled data if ``to_bytes=True``.
103 |         """
104 |         if to_bytes:
105 |             return self._save_pickle_stream(to_bytes=to_bytes)
106 |         assert filename is not None
107 |         if compressed:
108 |             with gzip.open(filename, "wb") as gzip_outfile:
109 |                 self._save_pickle_stream(cast(IO[bytes], gzip_outfile))
110 |         else:
111 |             with open(filename, "wb") as outfile:
112 |                 self._save_pickle_stream(outfile)
113 |         return None
114 | 
115 |     def _load_pickle_stream(
116 |         self, stream: Union[bytes, IO[bytes]], from_bytes: bool = False
117 |     ) -> bool:
118 |         """Loads delete combinations from stream as pickle. This will reduce the
119 |         loading time compared to running :meth:`load_dictionary` again.
120 | 
121 |         **NOTE**: Prints a warning if the current settings `count_threshold`,
122 |         `max_dictionary_edit_distance`, and `prefix_length` are different from
123 |         the loaded settings, and overwrites the current settings with the loaded ones.
124 | 
125 |         Args:
126 |             stream: The stream from which the pickle data is loaded.
127 |             from_bytes: Flag to determine if we are loading from bytes or file.
128 | 
129 |         Returns:
130 |             ``True`` if delete combinations are successfully loaded.
131 |         """
132 |         if from_bytes:
133 |             assert isinstance(stream, bytes)
134 |             pickle_data = pickle.loads(stream)  # nosec
135 |         else:
136 |             assert not isinstance(stream, bytes)
137 |             pickle_data = pickle.load(stream)  # nosec
138 |         if pickle_data.get("data_version", None) != self.data_version:
139 |             return False
140 |         settings = ("count_threshold", "max_dictionary_edit_distance", "prefix_length")
141 |         if itemgetter(*settings)(pickle_data) != (
142 |             self._count_threshold,
143 |             self._max_dictionary_edit_distance,
144 |             self._prefix_length,
145 |         ):
146 |             logger.warning(
147 |                 f"Loading data which was created using different {settings} settings. Overwriting current SymSpell instance with loaded settings ..."
148 |             )
149 |         # dictionary entries related variables
150 |         self._below_threshold_words = pickle_data["below_threshold_words"]
151 |         self._bigrams = pickle_data["bigrams"]
152 |         self._deletes = pickle_data["deletes"]
153 |         self._words = pickle_data["words"]
154 |         self._max_length = pickle_data["max_length"]
155 |         # SymSpell settings used to generate the above
156 |         self._count_threshold = pickle_data["count_threshold"]
157 |         self._max_dictionary_edit_distance = pickle_data["max_dictionary_edit_distance"]
158 |         self._prefix_length = pickle_data["prefix_length"]
159 |         return True
160 | 
161 |     def _save_pickle_stream(
162 |         self, stream: Optional[IO[bytes]] = None, to_bytes: bool = False
163 |     ) -> Optional[bytes]:
164 |         """Pickles :attr:`_below_threshold_words`, :attr:`_bigrams`,
165 |         :attr:`_deletes`, :attr:`_words`, and :attr:`_max_length` into
166 |         a stream for quicker loading later.
167 | 
168 |         Pickles :attr:`_count_threshold`, :attr:`_max_dictionary_edit_distance`,
169 |         and :attr:`_prefix_length` to ensure consistent behavior.
170 | 
171 |         Args:
172 |             stream: The stream to store the pickle data.
173 |             to_bytes: Flag to determine whether a bytes string should be returned
174 |                 instead of writing to file.
175 | 
176 |         Returns:
177 |             A byte string of the pickled data if ``to_bytes=True``.
178 |         """
179 |         pickle_data = {
180 |             # Dictionary entries related variables
181 |             "below_threshold_words": self._below_threshold_words,
182 |             "bigrams": self._bigrams,
183 |             "deletes": self._deletes,
184 |             "words": self._words,
185 |             "max_length": self._max_length,
186 |             # SymSpell settings used to generate the above
187 |             "count_threshold": self._count_threshold,
188 |             "max_dictionary_edit_distance": self._max_dictionary_edit_distance,
189 |             "prefix_length": self._prefix_length,
190 |             # Version to ensure compatibility
191 |             "data_version": self.data_version,
192 |         }
193 |         if to_bytes:
194 |             return pickle.dumps(pickle_data)
195 |         assert stream is not None
196 |         pickle.dump(pickle_data, stream)
197 |         return None
198 | 
--------------------------------------------------------------------------------
/symspellpy/suggest_item.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | #
 3 | # Copyright (c) 2025 mmb L (Python port)
 4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation)
 5 | #
 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | # of this software and associated documentation files (the "Software"), to deal
 8 | # in the Software without restriction, including without limitation the rights
 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 | 
16 | """
17 | .. module:: suggest_item
18 |    :synopsis: Data class for :meth:`symspellpy.symspellpy.lookup`.
19 | """
20 | 
21 | 
22 | class SuggestItem:
23 |     """Spelling suggestion returned from :meth:`lookup`.
24 | 
25 |     Args:
26 |         term: The suggested word.
27 |         distance: Edit distance from search word.
28 |         count: Frequency of suggestion in dictionary or Naive Bayes probability
29 |             of the individual suggestion parts.
30 |     """
31 | 
32 |     def __init__(self, term: str, distance: int, count: int) -> None:
33 |         self._term = term
34 |         self._distance = distance
35 |         self._count = count
36 | 
37 |     def __eq__(self, other: object) -> bool:
38 |         """
39 |         Returns:
40 |             ``True`` if both distance and frequency count are the same.
41 |         """
42 |         if not isinstance(other, SuggestItem):
43 |             return NotImplemented
44 |         if self._distance == other.distance:
45 |             return self._count == other.count
46 |         return self._distance == other.distance
47 | 
48 |     def __lt__(self, other: object) -> bool:
49 |         """
50 |         Returns:
51 |             Order by distance ascending, then by frequency count descending.
52 |         """
53 |         if not isinstance(other, SuggestItem):
54 |             return NotImplemented
55 |         if self._distance == other.distance:
56 |             return self._count > other.count
57 |         return self._distance < other.distance
58 | 
59 |     def __str__(self) -> str:
60 |         """
61 |         Returns:
62 |             Displays attributes as "term, distance, count".
63 |         """
64 |         return f"{self._term}, {self._distance}, {self._count}"
65 | 
66 |     @property
67 |     def count(self) -> int:
68 |         """Frequency of suggestion in the dictionary (a measure of how common the
69 |         word is) or Naive Bayes probability of the individual suggestion parts in
70 |         :meth:`lookup_compound`.
71 | """ 72 | return self._count 73 | 74 | @count.setter 75 | def count(self, count: int) -> None: 76 | self._count = count 77 | 78 | @property 79 | def distance(self) -> int: 80 | """Edit distance between searched for word and suggestion.""" 81 | return self._distance 82 | 83 | @distance.setter 84 | def distance(self, distance: int) -> None: 85 | self._distance = distance 86 | 87 | @property 88 | def term(self) -> str: 89 | """The suggested correctly spelled word.""" 90 | return self._term 91 | 92 | @term.setter 93 | def term(self, term: str) -> None: 94 | self._term = term 95 | 96 | @classmethod 97 | def create_with_probability(cls, term: str, distance: int) -> "SuggestItem": 98 | """Creates a SuggestItem with Naive Bayes probability as the count.""" 99 | return cls(term, distance, 10 // 10 ** len(term)) 100 | -------------------------------------------------------------------------------- /symspellpy/verbosity.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2025 mmb L (Python port) 4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation) 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | 16 | """ 17 | .. module:: verbosity 18 | :synopsis: Enum for lookup results verbosity. 19 | """ 20 | 21 | from enum import Enum 22 | 23 | 24 | class Verbosity(Enum): 25 | """Controls the closeness/quantity of returned spelling suggestions. 26 | 27 | Attributes: 28 | TOP: Top suggestion with the highest term frequency of the suggestions of 29 | smallest edit distance found. 30 | CLOSEST: All suggestions of smallest edit distance found, suggestions 31 | ordered by term frequency. 32 | ALL: All suggestions within maxEditDistance, suggestions ordered by edit 33 | distance, then by term frequency (slower, no early termination). 
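
    Example (illustrative sketch; ``sym_spell`` is assumed to be a
    :class:`~symspellpy.symspellpy.SymSpell` instance with a loaded
    dictionary)::

        # single best suggestion only
        best = sym_spell.lookup("tepmperamet", Verbosity.TOP)
        # all suggestions within max_edit_distance, ordered by edit distance,
        # then by term frequency
        every = sym_spell.lookup("tepmperamet", Verbosity.ALL)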
34 | """ 35 | 36 | TOP = 0 37 | CLOSEST = 1 38 | ALL = 2 39 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mammothb/symspellpy/f4d1531a686038975370be3db4c19685564c2efe/tests/__init__.py -------------------------------------------------------------------------------- /tests/benchmarks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import importlib.resources\n", 10 | "import sys\n", 11 | "from pathlib import Path\n", 12 | "\n", 13 | "sys.path.append(str(Path.cwd().parent))\n", 14 | "\n", 15 | "from symspellpy import SymSpell, Verbosity\n", 16 | "from symspellpy.editdistance import DistanceAlgorithm, EditDistance" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "True" 28 | ] 29 | }, 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "bigram_path = importlib.resources.files(\"symspellpy\") / \"frequency_bigramdictionary_en_243_342.txt\"\n", 37 | "\n", 38 | "dictionary_path = importlib.resources.files(\"symspellpy\") / \"frequency_dictionary_en_82_765.txt\"\n", 39 | "\n", 40 | "sym_spell_damerau_osa = SymSpell(distance_comparer=EditDistance(DistanceAlgorithm.DAMERAU_OSA))\n", 41 | "sym_spell_damerau_osa.load_bigram_dictionary(bigram_path, 0, 2)\n", 42 | "sym_spell_damerau_osa.load_dictionary(dictionary_path, 0, 1)\n", 43 | "\n", 44 | "sym_spell_damerau_osa_fast = SymSpell(distance_comparer=EditDistance(DistanceAlgorithm.DAMERAU_OSA_FAST))\n", 45 | "sym_spell_damerau_osa_fast.load_bigram_dictionary(bigram_path, 0, 2)\n", 46 | "sym_spell_damerau_osa_fast.load_dictionary(dictionary_path, 0, 1)\n", 47 | "\n", 48 | "sym_spell_levenshtein = SymSpell(distance_comparer=EditDistance(DistanceAlgorithm.LEVENSHTEIN))\n", 49 | "sym_spell_levenshtein.load_bigram_dictionary(bigram_path, 0, 2)\n", 50 | "sym_spell_levenshtein.load_dictionary(dictionary_path, 0, 1)\n", 51 | "\n", 52 | "sym_spell_levenshtein_fast = SymSpell(distance_comparer=EditDistance(DistanceAlgorithm.LEVENSHTEIN_FAST))\n", 53 | "sym_spell_levenshtein_fast.load_bigram_dictionary(bigram_path, 0, 2)\n", 54 | "sym_spell_levenshtein_fast.load_dictionary(dictionary_path, 0, 1)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "def lookup_damerau_osa():\n", 64 | " sym_spell_damerau_osa.lookup(\"tepmperamet\", Verbosity.ALL)\n", 65 | "\n", 66 | "def lookup_damerau_osa_fast():\n", 67 | " sym_spell_damerau_osa_fast.lookup(\"tepmperamet\", Verbosity.ALL)\n", 68 | "\n", 69 | "def lookup_levenshtein():\n", 70 | " sym_spell_levenshtein.lookup(\"tepmperamet\", Verbosity.ALL)\n", 71 | "\n", 72 | "def lookup_levenshtein_fast():\n", 73 | " sym_spell_levenshtein_fast.lookup(\"tepmperamet\", Verbosity.ALL)\n", 74 | "\n", 75 | "def lookup_compound_damerau_osa():\n", 76 | " sym_spell_damerau_osa.lookup_compound(\"whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him\", 2)\n", 77 | "\n", 78 | "def lookup_compound_damerau_osa_fast():\n", 79 | " sym_spell_damerau_osa_fast.lookup_compound(\"whereis th elove 
hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him\", 2)\n", 80 | "\n", 81 | "def lookup_compound_levenshtein():\n", 82 | " sym_spell_levenshtein.lookup_compound(\"whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him\", 2)\n", 83 | "\n", 84 | "def lookup_compound_levenshtein_fast():\n", 85 | " sym_spell_levenshtein_fast.lookup_compound(\"whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him\", 2)\n", 86 | "\n", 87 | "def word_segmentation_damerau_osa():\n", 88 | " sym_spell_damerau_osa.word_segmentation(\"thequickbrownfoxjumpsoverthelazydog\", 0)\n", 89 | "\n", 90 | "def word_segmentation_damerau_osa_fast():\n", 91 | " sym_spell_damerau_osa_fast.word_segmentation(\"thequickbrownfoxjumpsoverthelazydog\", 0)\n", 92 | "\n", 93 | "def word_segmentation_levenshtein():\n", 94 | " sym_spell_levenshtein.word_segmentation(\"thequickbrownfoxjumpsoverthelazydog\", 0)\n", 95 | "\n", 96 | "def word_segmentation_levenshtein_fast():\n", 97 | " sym_spell_levenshtein_fast.word_segmentation(\"thequickbrownfoxjumpsoverthelazydog\", 0)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "107 μs ± 356 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n", 110 | "67.6 μs ± 319 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n", 111 | "95.4 μs ± 563 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n", 112 | "66.7 μs ± 295 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "%timeit lookup_damerau_osa()\n", 118 | "%timeit lookup_damerau_osa_fast()\n", 119 | "%timeit lookup_levenshtein()\n", 120 | "%timeit lookup_levenshtein_fast()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 5, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "9.89 ms ± 65.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", 133 | "5.1 ms ± 13.1 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", 134 | "8.68 ms ± 46.6 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", 135 | "4.95 ms ± 13.2 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "%timeit lookup_compound_damerau_osa()\n", 141 | "%timeit lookup_compound_damerau_osa_fast()\n", 142 | "%timeit lookup_compound_levenshtein()\n", 143 | "%timeit lookup_compound_levenshtein_fast()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 6, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "1.13 ms ± 1.36 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n", 156 | "1.14 ms ± 2.94 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n", 157 | "1.14 ms ± 3.56 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n", 158 | "1.14 ms ± 1.6 μs per loop (mean ± std. dev. 
of 7 runs, 1,000 loops each)\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "%timeit word_segmentation_damerau_osa()\n", 164 | "%timeit word_segmentation_damerau_osa_fast()\n", 165 | "%timeit word_segmentation_levenshtein()\n", 166 | "%timeit word_segmentation_levenshtein_fast()" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "**Note**: Result for `word_segmentation` is expected since we are passing `max_edit_distance=0`." 174 | ] 175 | } 176 | ], 177 | "metadata": { 178 | "interpreter": { 179 | "hash": "d83327bb218665ef1f16f1956a0b9fb217f4e8f6e80f84663e37ea0a49e5699a" 180 | }, 181 | "kernelspec": { 182 | "display_name": "Python 3 (ipykernel)", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.13.2" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 4 201 | } 202 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | import importlib_resources 5 | import pytest 6 | 7 | from symspellpy import SymSpell 8 | 9 | FORTESTS_DIR = Path(__file__).resolve().parent / "fortests" 10 | 11 | 12 | ####################################################################### 13 | # Paths 14 | ####################################################################### 15 | @pytest.fixture 16 | def bigram_path(): 17 | ref = ( 18 | importlib_resources.files("symspellpy") 19 | / "frequency_bigramdictionary_en_243_342.txt" 20 | ) 21 | with importlib_resources.as_file(ref) as path: 22 | yield path 23 | 24 | 25 | @pytest.fixture 26 | def dictionary_path(): 27 | ref = importlib_resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 28 | with importlib_resources.as_file(ref) as path: 29 | yield path 30 | 31 | 32 | @pytest.fixture 33 | def pickle_path(): 34 | return FORTESTS_DIR / "dictionary.pickle" 35 | 36 | 37 | @pytest.fixture 38 | def query_path(): 39 | return FORTESTS_DIR / "noisy_query_en_1000.txt" 40 | 41 | 42 | ####################################################################### 43 | # Misc 44 | ####################################################################### 45 | @pytest.fixture 46 | def get_same_word_and_count(): 47 | word = "hello" 48 | return [(word, 11), (word, 3)] 49 | 50 | 51 | @pytest.fixture 52 | def get_fortests_data(request): 53 | with open(FORTESTS_DIR / request.param) as infile: 54 | return json.load(infile)["data"] 55 | 56 | 57 | ####################################################################### 58 | # symspells 59 | ####################################################################### 60 | @pytest.fixture 61 | def symspell_default(): 62 | return SymSpell() 63 | 64 | 65 | @pytest.fixture 66 | def symspell_default_entry(symspell_default, request): 67 | for entry in request.param: 68 | symspell_default.create_dictionary_entry(entry[0], entry[1]) 69 | return symspell_default 70 | 71 | 72 | @pytest.fixture 73 | def symspell_default_load(symspell_default, dictionary_path, bigram_path, request): 74 | symspell_default.load_dictionary(dictionary_path, 0, 1) 75 | if request.param == "bigram": 76 | 
symspell_default.load_bigram_dictionary(bigram_path, 0, 2) 77 | return symspell_default, request.param 78 | 79 | 80 | @pytest.fixture 81 | def symspell_long(): 82 | return SymSpell(5) 83 | 84 | 85 | @pytest.fixture 86 | def symspell_long_entry(symspell_long, request): 87 | for entry in request.param: 88 | symspell_long.create_dictionary_entry(entry, 2) 89 | return symspell_long, request.param 90 | 91 | 92 | @pytest.fixture 93 | def symspell_short(request): 94 | if request.param is None: 95 | return SymSpell(1, 3) 96 | return SymSpell(1, 3, count_threshold=request.param) 97 | -------------------------------------------------------------------------------- /tests/fortests/bad_dict.txt: -------------------------------------------------------------------------------- 1 | qwer 2 | wert 3 | erty 4 | rtyu tyui 12 5 | yuio uiop 13 6 | asdf 10 7 | sdfg 12 -------------------------------------------------------------------------------- /tests/fortests/below_threshold_dict.txt: -------------------------------------------------------------------------------- 1 | below 8 2 | threshold 10 3 | word 10 4 | -------------------------------------------------------------------------------- /tests/fortests/lookup_compound_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "typo": "whereis th elove", 5 | "bigram": { 6 | "num_results": 1, 7 | "term": "where is the love", 8 | "distance": 2, 9 | "count": 585 10 | }, 11 | "unigram": { 12 | "num_results": 1, 13 | "term": "whereas the love", 14 | "distance": 2, 15 | "count": 64 16 | } 17 | }, 18 | { 19 | "typo": "the bigjest playrs", 20 | "bigram": { 21 | "num_results": 1, 22 | "term": "the biggest players", 23 | "distance": 2, 24 | "count": 34 25 | }, 26 | "unigram": { 27 | "num_results": 1, 28 | "term": "the biggest players", 29 | "distance": 2, 30 | "count": 34 31 | } 32 | }, 33 | { 34 | "typo": "Can yu readthis", 35 | "bigram": { 36 | "num_results": 1, 37 | "term": "can you read this", 38 | "distance": 3, 39 | "count": 11440 40 | }, 41 | "unigram": { 42 | "num_results": 1, 43 | "term": "can you read this", 44 | "distance": 3, 45 | "count": 3 46 | } 47 | }, 48 | { 49 | "typo": "whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him", 50 | "bigram": { 51 | "num_results": 1, 52 | "term": "where is the love he had dated for much of the past who couldn't read in sixth grade and inspired him", 53 | "distance": 9, 54 | "count": 0 55 | }, 56 | "unigram": { 57 | "num_results": 1, 58 | "term": "whereas the love head dated for much of the past who couldn't read in sixth grade and inspired him", 59 | "distance": 9, 60 | "count": 0 61 | } 62 | }, 63 | { 64 | "typo": "in te dhird qarter oflast jear he hadlearned ofca sekretplan", 65 | "bigram": { 66 | "num_results": 1, 67 | "term": "in the third quarter of last year he had learned of a secret plan", 68 | "distance": 9, 69 | "count": 0 70 | }, 71 | "unigram": { 72 | "num_results": 1, 73 | "term": "in the third quarter of last year he had learned of a secret plan", 74 | "distance": 9, 75 | "count": 0 76 | } 77 | }, 78 | { 79 | "typo": "the bigjest playrs in te strogsommer film slatew ith plety of funn", 80 | "bigram": { 81 | "num_results": 1, 82 | "term": "the biggest players in the strong summer film slate with plenty of fun", 83 | "distance": 9, 84 | "count": 0 85 | }, 86 | "unigram": { 87 | "num_results": 1, 88 | "term": "the biggest players in the strong summer film slate with plenty of fun", 89 | "distance": 9, 
90 | "count": 0 91 | } 92 | }, 93 | { 94 | "typo": "Can yu readthis messa ge despite thehorible sppelingmsitakes", 95 | "bigram": { 96 | "num_results": 1, 97 | "term": "can you read this message despite the horrible spelling mistakes", 98 | "distance": 10, 99 | "count": 0 100 | }, 101 | "unigram": { 102 | "num_results": 1, 103 | "term": "can you read this message despite the horrible spelling mistakes", 104 | "distance": 10, 105 | "count": 0 106 | } 107 | } 108 | ] 109 | } 110 | -------------------------------------------------------------------------------- /tests/fortests/lookup_compound_ignore_non_words_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "typo": "whereis th elove 123 hehad dated forImuch of THEPAST who couqdn'tread in SIXTHgrade and ins pired him", 5 | "bigram": { 6 | "term": "where is the love 123 he had dated for much of THEPAST who couldn't read in sixth grade and inspired him" 7 | }, 8 | "unigram": { 9 | "term": "whereas the love 123 head dated for much of THEPAST who couldn't read in sixth grade and inspired him" 10 | } 11 | }, 12 | { 13 | "typo": "in te DHIRD 1 qarter oflast jear he hadlearned ofca sekretplan", 14 | "bigram": { 15 | "term": "in the DHIRD 1 quarter of last year he had learned of a secret plan" 16 | }, 17 | "unigram": { 18 | "term": "in the DHIRD 1 quarter of last year he had learned of a secret plan" 19 | } 20 | }, 21 | { 22 | "typo": "the bigjest playrs in te stroGSOmmer film slatew ith PLETY of 12 funn", 23 | "bigram": { 24 | "term": "the biggest players in the strong summer film slate with PLETY of 12 fun" 25 | }, 26 | "unigram": { 27 | "term": "the biggest players in the strong summer film slate with PLETY of 12 fun" 28 | } 29 | }, 30 | { 31 | "typo": "Can yu readtHIS messa ge despite thehorible 1234 sppelingmsitakes", 32 | "bigram": { 33 | "term": "can you read this message despite the horrible 1234 spelling mistakes" 34 | }, 35 | "unigram": { 36 | "term": "can you read this message despite the horrible 1234 spelling mistakes" 37 | } 38 | }, 39 | { 40 | "typo": "Can yu readtHIS messa ge despite thehorible AB1234 sppelingmsitakes", 41 | "bigram": { 42 | "term": "can you read this message despite the horrible AB1234 spelling mistakes" 43 | }, 44 | "unigram": { 45 | "term": "can you read this message despite the horrible AB1234 spelling mistakes" 46 | } 47 | }, 48 | { 49 | "typo": "PI on leave, arrange Co-I to do screening", 50 | "bigram": { "term": "PI on leave arrange co i to do screening" }, 51 | "unigram": { "term": "PI on leave arrange co i to do screening" } 52 | } 53 | ] 54 | } 55 | -------------------------------------------------------------------------------- /tests/fortests/lookup_compound_replaced_words_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "typo": "whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him", 5 | "bigram": { 6 | "term": "where is the love he had dated for much of the past who couldn't read in sixth grade and inspired him", 7 | "replacement": { 8 | "whereis": "where is", 9 | "th": "the", 10 | "elove": "love", 11 | "hehad": "he had", 12 | "forimuch": "for much", 13 | "thepast": "the past", 14 | "couqdn'tread": "couldn't read", 15 | "sixthgrade": "sixth grade", 16 | "ins": "inspired" 17 | } 18 | }, 19 | "unigram": { 20 | "term": "whereas the love head dated for much of the past who couldn't read in sixth grade and inspired him", 21 
| "replacement": { 22 | "whereis": "whereas", 23 | "th": "the", 24 | "elove": "love", 25 | "hehad": "head", 26 | "forimuch": "for much", 27 | "thepast": "the past", 28 | "couqdn'tread": "couldn't read", 29 | "sixthgrade": "sixth grade", 30 | "ins": "inspired" 31 | } 32 | } 33 | }, 34 | { 35 | "typo": "in te dhird qarter oflast jear he hadlearned ofca sekretplan", 36 | "bigram": { 37 | "term": "in the third quarter of last year he had learned of a secret plan", 38 | "replacement": { 39 | "te": "the", 40 | "dhird": "third", 41 | "qarter": "quarter", 42 | "oflast": "of last", 43 | "jear": "year", 44 | "hadlearned": "had learned", 45 | "ofca": "of a", 46 | "sekretplan": "secret plan" 47 | } 48 | }, 49 | "unigram": { 50 | "term": "in the third quarter of last year he had learned of a secret plan", 51 | "replacement": { 52 | "te": "the", 53 | "dhird": "third", 54 | "qarter": "quarter", 55 | "oflast": "of last", 56 | "jear": "year", 57 | "hadlearned": "had learned", 58 | "ofca": "of a", 59 | "sekretplan": "secret plan" 60 | } 61 | } 62 | }, 63 | { 64 | "typo": "the bigjest playrs in te strogsommer film slatew ith plety of funn", 65 | "bigram": { 66 | "term": "the biggest players in the strong summer film slate with plenty of fun", 67 | "replacement": { 68 | "bigjest": "biggest", 69 | "playrs": "players", 70 | "strogsommer": "strong summer", 71 | "slatew": "slate", 72 | "ith": "with", 73 | "plety": "plenty", 74 | "funn": "fun" 75 | } 76 | }, 77 | "unigram": { 78 | "term": "the biggest players in the strong summer film slate with plenty of fun", 79 | "replacement": { 80 | "bigjest": "biggest", 81 | "playrs": "players", 82 | "strogsommer": "strong summer", 83 | "slatew": "slate", 84 | "ith": "with", 85 | "plety": "plenty", 86 | "funn": "fun" 87 | } 88 | } 89 | } 90 | ] 91 | } 92 | -------------------------------------------------------------------------------- /tests/fortests/lookup_compound_transfer_casing_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "typo": "Whereis th elove hehaD Dated forImuch of thepast who couqdn'tread in sixthgrade AND ins pired him", 5 | "bigram": { 6 | "term": "Where is the love he haD Dated for much of the past who couldn't read in sixth grade AND inspired him" 7 | }, 8 | "unigram": { 9 | "term": "Whereas the love heaD Dated for much of the past who couldn't read in sixth grade AND inspired him" 10 | } 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /tests/fortests/lookup_compound_transfer_casing_ignore_nonwords_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "typo": "Whereis th elove hehaD Dated FOREEVER forImuch of thepast who couqdn'tread in sixthgrade AND ins pired him", 5 | "bigram": { 6 | "term": "Where is the love he haD Dated FOREEVER for much of the past who couldn't read in sixth grade AND inspired him" 7 | }, 8 | "unigram": { 9 | "term": "Whereas the love heaD Dated FOREEVER for much of the past who couldn't read in sixth grade AND inspired him" 10 | } 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /tests/fortests/noisy_query_en_1000.txt: -------------------------------------------------------------------------------- 1 | te the 1 2 | aojecm project 3 3 | gutenberg gutenberg 0 4 | eboo ebook 1 5 | yof of 1 6 | adventures adventures 0 7 | sherlock sherlock 0 8 | polxs holmes 3 9 | si sir 1 10 | 
arthur arthur 0 11 | conn conan 1 12 | doyle doyle 0 13 | in in 0 14 | our our 0 15 | aeries series 1 16 | copyrgt copyright 2 17 | laws laws 0 18 | are are 0 19 | changng changing 1 20 | all all 0 21 | over over 0 22 | world world 0 23 | re sure 2 24 | to to 0 25 | check check 0 26 | qor for 1 27 | youbr your 1 28 | countrfy country 1 29 | before before 0 30 | dwnoadingg downloading 3 31 | or or 0 32 | redistributing redistributing 0 33 | ntis this 2 34 | any any 0 35 | other other 0 36 | jekler header 3 37 | shoflg should 2 38 | first first 0 39 | hng thing 2 40 | seen seen 0 41 | when when 0 42 | vkewing viewing 1 43 | cle file 2 44 | pleare please 1 45 | do do 0 46 | not not 0 47 | reovef remove 2 48 | it it 0 49 | change change 0 50 | ei edit 2 51 | ywthout without 2 52 | ritten written 1 53 | wermission permission 1 54 | rad read 1 55 | legailb legal 2 56 | smll small 1 57 | prinj print 1 58 | and and 0 59 | informatin information 1 60 | about about 0 61 | ct at 1 62 | abcttom bottom 2 63 | incluqed included 1 64 | if is 1 65 | important important 0 66 | speciic specific 1 67 | rights rights 0 68 | eetrnictions restrictions 3 69 | how how 0 70 | may may 0 71 | udsd used 2 72 | ou you 1 73 | can can 0 74 | also also 0 75 | itnd find 2 76 | ut out 1 77 | eake make 1 78 | donation donation 0 79 | get get 0 80 | nvotlved involved 2 81 | wellcome welcome 1 82 | free free 0 83 | pain plain 1 84 | vanla vanilla 2 85 | electronic electronic 0 86 | tets texts 1 87 | ebooks ebooks 0 88 | readable readable 0 89 | bt both 2 90 | umns humans 2 91 | compurters computers 1 92 | sikce since 1 93 | thseoe these 2 94 | iere were 1 95 | preard prepared 2 96 | thusans thousands 2 97 | volutrers volunteers 2 98 | tle title 2 99 | athor author 1 100 | release release 0 101 | daxe date 1 102 | marq march 2 103 | most most 0 104 | recently recently 0 105 | updayd updated 2 106 | dncovaember november 3 107 | edition edition 0 108 | lafnguage language 1 109 | engih english 2 110 | chactr character 3 111 | set set 0 112 | encoding encoding 0 113 | asci ascii 1 114 | styrt start 1 115 | additional additional 0 116 | edoing editing 2 117 | josex jose 1 118 | menendez menendez 0 119 | cntems contents 3 120 | scgndial scandal 2 121 | bohemja bohemia 1 122 | ii ii 0 123 | red red 0 124 | hdted headed 3 125 | leagm league 2 126 | iii iii 0 127 | csfe case 2 128 | identity identity 0 129 | iv iv 0 130 | boscombe boscombe 0 131 | vallejy valley 1 132 | mystery mystery 0 133 | five five 0 134 | orfnge orange 1 135 | pips pips 0 136 | vi vi 0 137 | mn man 1 138 | with with 0 139 | tisged twisted 2 140 | lip lip 0 141 | adventu adventure 2 142 | blue blue 0 143 | qcarbuncle carbuncle 1 144 | viii viii 0 145 | pmpeckld speckled 3 146 | bad band 1 147 | ix ix 0 148 | enginebr engineer 1 149 | thub thumb 1 150 | noble noble 0 151 | ahelr bachelor 3 152 | xi xi 0 153 | beryl beryl 0 154 | coixet coronet 3 155 | xii xii 0 156 | coer copper 2 157 | beeches beeches 0 158 | sge she 1 159 | aklways always 1 160 | wovmn woman 2 161 | hanve have 1 162 | setldom seldom 1 163 | herh heard 2 164 | him him 0 165 | mention mention 0 166 | her her 0 167 | ude under 2 168 | hme name 2 169 | hisw his 1 170 | eyes eyes 0 171 | ecipos eclipses 3 172 | predomiates predominates 1 173 | whole whole 0 174 | seyx sex 1 175 | ws was 1 176 | th that 2 177 | he he 0 178 | celt felt 1 179 | emtiobn emotion 2 180 | ainh akin 2 181 | love love 0 182 | ibne irene 2 183 | yadler adler 1 184 | eorgones emotions 4 185 | ne one 1 186 | pcarticulry 
particularly 3 187 | abajrrevnt abhorrent 3 188 | cold cold 0 189 | preise precise 1 190 | but but 0 191 | admirasly admirably 1 192 | oalpanced balanced 2 193 | mikd mind 1 194 | take take 0 195 | prct perfect 3 196 | reasng reasoning 3 197 | oservng observing 2 198 | mlacine machine 2 199 | has has 0 200 | as as 0 201 | ver lover 2 202 | ouqld would 2 203 | xplacd placed 2 204 | hiqslf himself 2 205 | false false 0 206 | posiin position 2 207 | nover never 1 208 | sspokep spoke 2 209 | sjofter softer 1 210 | talpsions passions 3 211 | ave save 1 212 | abe gibe 2 213 | ser sneer 2 214 | hey they 1 215 | admirbae admirable 2 216 | thingn things 1 217 | obaerver observer 1 218 | ezcvielltnt excellent 4 219 | rawijg drawing 2 220 | veigl veil 1 221 | frsm from 1 222 | men men 0 223 | qoives motives 2 224 | cions actions 2 225 | trained trained 0 226 | asvoner reasoner 3 227 | admit admit 0 228 | ch such 2 229 | intrusions intrusions 0 230 | ito into 1 231 | olwn own 1 232 | delcatee delicate 2 233 | fne finely 3 234 | fjusxed adjusted 3 235 | tepmperamet temperament 2 236 | vitroduce introduce 2 237 | dsntracting distracting 2 238 | factor factor 0 239 | wihicth which 2 240 | might might 0 241 | throw throw 0 242 | doubt doubt 0 243 | pot upon 2 244 | mentl mental 1 245 | requls results 2 246 | grit grit 0 247 | ensiiuw sensitive 4 248 | nstrumnn instrument 3 249 | crack crack 0 250 | hsgh high 1 251 | powe power 1 252 | clnses lenses 2 253 | more more 0 254 | vdisjturbing disturbing 2 255 | ezhan than 2 256 | stqngz strong 3 257 | notre nature 2 258 | yet yet 0 259 | tee there 2 260 | late late 0 261 | bubiofs dubious 2 262 | questionale questionable 1 263 | memtry memory 1 264 | hd had 1 265 | ittle little 1 266 | laiey lately 2 267 | my my 0 268 | mjrriajzbe marriage 4 269 | rifted drifted 1 270 | aaway away 1 271 | ach each 1 272 | vcympnlee complete 4 273 | happiness happiness 0 274 | home home 0 275 | enteredr centred 3 276 | interests interests 0 277 | rise rise 0 278 | uap up 1 279 | qroun around 2 280 | whoc who 1 281 | findtf finds 2 282 | maer master 2 283 | yeatabshment establishment 4 284 | sufficient sufficient 0 285 | absorb absorb 0 286 | etteantion attention 2 287 | whle while 1 288 | loatzhegd loathed 2 289 | every every 0 290 | om form 2 291 | sokcity society 2 292 | bohetmin bohemian 2 293 | souml soul 1 294 | remineu remained 2 295 | rogings lodgings 2 296 | aer baker 2 297 | trt street 3 298 | urild buried 2 299 | ang among 2 300 | old old 0 301 | books books 0 302 | aplternatinp alternating 2 303 | wek week 1 304 | ketween between 1 305 | cocainre cocaine 1 306 | ambition ambition 0 307 | drowsiness drowsiness 0 308 | dug drug 1 309 | fmieae fierce 3 310 | eergy energy 1 311 | een keen 1 312 | silr still 2 313 | ever ever 0 314 | deeply deeply 0 315 | axtrahtqed attracted 3 316 | study study 0 317 | crime crime 0 318 | ocijpied occupied 2 319 | iwmenfe immense 2 320 | ftcults faculties 3 321 | exttnordinaac extraordinary 4 322 | powers powers 0 323 | sprvation observation 3 324 | following following 0 325 | mose those 2 326 | clue clues 1 327 | cjaring clearing 2 328 | ystewies mysteries 2 329 | lzeen been 2 330 | abandoned abandoned 0 331 | hales hopeless 4 332 | ofmiciaz official 2 333 | police police 0 334 | tcim time 2 335 | come some 1 336 | vagje vague 1 337 | acocn account 3 338 | doitgs doings 1 339 | aumqmoxs summons 3 340 | dessa odessa 1 341 | trepoff trepoff 0 342 | mxumrper murder 3 343 | singular singular 0 344 | tragedpyk tragedy 2 345 | tkinson atkinson 
1 346 | xbrphers brothers 3 347 | txincoale trincomalee 3 348 | fqnally finally 1 349 | mission mission 0 350 | jcomplished accomplished 2 351 | sow so 1 352 | dctely delicately 4 353 | sucycessfully successfully 1 354 | reigning reigning 0 355 | faifl family 3 356 | honlad holland 2 357 | beyond beyond 0 358 | signh signs 1 359 | lctivity activity 1 360 | hoer however 3 361 | merely merely 0 362 | swarcd shared 2 363 | readersj readers 1 364 | daily daily 0 365 | pvqess press 2 366 | xknw knew 2 367 | former former 0 368 | fieni friend 2 369 | cjompanion companion 1 370 | night night 0 371 | twneth twentieth 3 372 | returnig returning 1 373 | joaurnhey journey 2 374 | patient patient 0 375 | gow now 1 376 | returned returned 0 377 | iil civil 2 378 | pqratice practice 2 379 | way way 0 380 | ed led 1 381 | trough through 1 382 | passed passed 0 383 | wll well 1 384 | zrembere remembered 4 385 | cdoom door 2 386 | must must 0 387 | associated associated 0 388 | woonl wooing 2 389 | da dark 2 390 | miycidents incidents 2 391 | scarfelt scarlet 2 392 | seizedh seized 1 393 | djsie desire 2 394 | se see 1 395 | again again 0 396 | know know 0 397 | eplhlying employing 3 398 | rkoos rooms 2 399 | brilliantly brilliantly 0 400 | lit lit 0 401 | eve even 1 402 | ood looked 3 403 | saw saw 0 404 | tall tall 0 405 | spae spare 1 406 | figurj figure 1 407 | pas pass 1 408 | twice twice 0 409 | silgouette silhouette 1 410 | goains against 3 411 | bind blind 1 412 | acypng pacing 3 413 | room room 0 414 | swiyky swiftly 3 415 | eerly eagerly 2 416 | had head 1 417 | sunk sunk 0 418 | cdest chest 1 419 | hands hands 0 420 | clasped clasped 0 421 | behnd behind 1 422 | mod mood 1 423 | habi habit 1 424 | attityade attitude 2 425 | mlannedr manner 2 426 | thein their 1 427 | story story 0 428 | lworke work 2 429 | qisenn risen 2 430 | coretedu created 3 431 | drjzma dreams 3 432 | hot hot 0 433 | cent scent 1 434 | ew new 1 435 | problem problem 0 436 | arasng rang 2 437 | bl bell 2 438 | showvn shown 1 439 | hamber chamber 1 440 | formely formerly 1 441 | agt part 2 442 | efsisea effusive 4 443 | gld glad 1 444 | ink think 2 445 | hrdby hardly 2 446 | wodrd word 1 447 | spoken spoken 0 448 | kikdfy kindly 2 449 | ye eye 1 450 | wave waved 1 451 | an an 0 452 | rmchair armchair 1 453 | thew threw 1 454 | acrcss across 1 455 | cars cigars 2 456 | mndicaoed indicated 2 457 | crit spirit 3 458 | gasogene gasogene 0 459 | orne corner 2 460 | then then 0 461 | stood stood 0 462 | fire fire 0 463 | irosectve introspective 4 464 | fsiin fashion 3 465 | wwdloc wedlock 2 466 | sitt suits 2 467 | rqmrkedo remarked 3 468 | wtsn watson 2 469 | seoven seven 1 470 | hf half 2 471 | pouhnds pounds 1 472 | angwerd answered 2 473 | tsneed indeed 3 474 | thought thought 0 475 | ckjust just 2 476 | trilx trifle 2 477 | fkancy fancy 1 478 | obrve observe 2 479 | dnd did 1 480 | tl tell 2 481 | intended intended 0 482 | gw go 1 483 | harnes harness 1 484 | dduce deduce 1 485 | getxin getting 2 486 | yurself yourself 1 487 | ery very 1 488 | wet wet 0 489 | cluumsy clumsy 1 490 | faralesi careless 3 491 | servaqt servant 1 492 | irl girl 1 493 | dear dear 0 494 | saxdq said 2 495 | too too 0 496 | mchg much 2 497 | certainly certainly 0 498 | burned burned 0 499 | liea lived 2 500 | fewt few 1 501 | penntduris centuries 4 502 | fgo ago 1 503 | true true 0 504 | walk walk 0 505 | thursday thursday 0 506 | came came 0 507 | dreadul dreadful 1 508 | esbs mess 2 509 | chagd changed 2 510 | clthe clothes 2 511 | imagine imagine 0 
512 | mary mary 0 513 | jane jane 0 514 | innorigile incorrigible 3 515 | wif wife 1 516 | gven given 1 517 | notice notice 0 518 | chucd chuckled 3 519 | rubbed rubbed 0 520 | long long 0 521 | nervos nervous 1 522 | otjer together 4 523 | simlicity simplicity 1 524 | iislfr itself 3 525 | inside inside 0 526 | leb left 2 527 | so shoe 2 528 | dwher where 2 529 | fireslght firelight 2 530 | strkhs strikes 2 531 | eather leather 1 532 | sored scored 1 533 | six six 0 534 | almost almost 0 535 | paalll parallel 2 536 | cuts cuts 0 537 | obviously obviously 0 538 | qausdd caused 2 539 | oxneone someone 3 540 | arelessly carelessly 1 541 | scraped scraped 0 542 | roun round 1 543 | dges edges 1 544 | sole sole 0 545 | orger order 1 546 | creusted crusted 1 547 | mudi mud 1 548 | hence hence 0 549 | adoblje double 3 550 | deduction deduction 0 551 | vse vile 2 552 | wether weather 1 553 | magnhnat malignant 4 554 | hbot boot 2 555 | plittijkg slitting 3 556 | spoecme specimen 3 557 | london london 0 558 | slfey slavey 2 559 | ief if 1 560 | eneman gentleman 3 561 | esmellzng smelling 2 562 | ioforc iodoform 3 563 | black black 0 564 | ak mark 2 565 | nitrate nitrate 0 566 | silsver silver 1 567 | righ right 1 568 | foefifgr forefinger 3 569 | blge bulge 1 570 | siydez side 2 571 | top top 0 572 | chat hat 1 573 | show show 0 574 | ecretd secreted 2 575 | stethoscope stethoscope 0 576 | dull dull 0 577 | proounce pronounce 1 578 | acivb active 2 579 | mepmber member 1 580 | vepcal medical 3 581 | profession profession 0 582 | could could 0 583 | besp help 2 584 | lalughing laughing 1 585 | eyyse ease 2 586 | emxplained explained 1 587 | process process 0 588 | hear hear 0 589 | ive give 1 590 | ueass reasons 3 591 | apears appears 1 592 | ridiculously ridiculously 0 593 | snigmle simple 3 594 | esivly easily 2 595 | yseylf myself 2 596 | athough though 1 597 | sccwssive successive 2 598 | nstance instance 1 599 | bafmlled baffled 2 600 | untl until 1 601 | explain explain 0 602 | beleve believe 1 603 | good good 0 604 | your yours 1 605 | que quite 2 606 | lightig lighting 1 607 | cgarete cigarette 2 608 | hrowwing throwing 2 609 | dowu down 1 610 | distinttion distinction 1 611 | clear clear 0 612 | emple example 2 613 | feqetly frequently 3 614 | stps steps 1 615 | xeaq lead 2 616 | hall hall 0 617 | often often 0 618 | hundrqeds hundreds 1 619 | times times 0 620 | qaly many 2 621 | don don 0 622 | oblsere observed 3 623 | poib point 2 624 | seveneene seventeen 2 625 | pecase because 2 626 | interested interested 0 627 | wobms problems 4 628 | ckougd enough 3 629 | chronicle chronicle 0 630 | two two 0 631 | rifnling trifling 2 632 | experencef experiences 2 633 | shet sheet 1 634 | thick thick 0 635 | pink pink 0 636 | tinted tinted 0 637 | noitepaper notepaper 1 638 | lyng lying 1 639 | opn open 1 640 | tbled table 2 641 | last last 0 642 | ot post 2 643 | aloud aloud 0 644 | xnotte note 2 645 | undzate undated 2 646 | either either 0 647 | saigatue signature 3 648 | wress address 3 649 | will will 0 650 | call call 0 651 | quarter quarter 0 652 | eight eight 0 653 | clock clock 0 654 | desires desires 0 655 | consl consult 2 656 | macttr matter 2 657 | deeupest deepest 1 658 | momnt moment 1 659 | recent recent 0 660 | cevice services 3 661 | yyal royal 2 662 | hpoufjs houses 3 663 | euroe europe 1 664 | safely safely 0 665 | tkrushted trusted 2 666 | mtttxs matters 3 667 | importace importance 1 668 | exaggeratbd exaggerated 1 669 | we we 0 670 | quartes quarters 1 671 | receiyd received 
2 672 | aur hour 2 673 | zmiss amiss 1 674 | viositgr visitor 2 675 | wear wear 0 676 | masik mask 1 677 | what what 0 678 | means means 0 679 | no no 0 680 | aa data 2 681 | capital capital 0 682 | moisoake mistake 2 683 | theoxise theorise 1 684 | insensibly insensibly 0 685 | begnsj begins 2 686 | twst twist 1 687 | facte facts 1 688 | uiu suit 2 689 | theories theories 0 690 | yinstewad instead 2 691 | carfult carefully 3 692 | exlaqmined examined 2 693 | writig writing 1 694 | paper paper 0 695 | wroto wrote 1 696 | pesuhably presumably 2 697 | edeavouring endeavouring 1 698 | imitare imitate 1 699 | psocsse processes 3 700 | bough bought 1 701 | row crown 2 702 | wawket packet 2 703 | pcueiarly peculiarly 2 704 | iff stiff 2 705 | ecular peculiar 2 706 | hovldi hold 2 707 | light light 0 708 | large large 0 709 | woven woven 0 710 | zextzure texture 2 711 | asked asked 0 712 | nmaker maker 1 713 | mmoogaa monogram 4 714 | raher rather 1 715 | stanks stands 1 716 | geellsckaft gesellschaft 2 717 | german german 0 718 | cpany company 2 719 | csmary customary 3 720 | ycontraon contraction 4 721 | like like 0 722 | co co 0 723 | ckurhe course 2 724 | papienr papier 1 725 | eg eg 0 726 | lev let 1 727 | glance glance 0 728 | pcotinentalk continental 3 729 | gazcetter gazetteer 2 730 | took took 0 731 | havny heavy 2 732 | brown brown 0 733 | volumea volume 1 734 | shelmves shelves 1 735 | eglow eglow 0 736 | eglonitz eglonitz 0 737 | hrje here 2 738 | egeria egria 1 739 | sefkiang speaking 3 740 | rar far 1 741 | crlsbad carlsbad 1 742 | rqarabye remarkable 4 743 | being being 0 744 | scee scene 1 745 | death death 0 746 | wallenstein wallenstein 0 747 | its its 0 748 | nmeous numerous 2 749 | gvlass glass 1 750 | qctries factories 3 751 | milsm mills 2 752 | ha ha 0 753 | body boy 1 754 | sqrkled sparkled 2 755 | sqenzt sent 2 756 | gret great 1 757 | tdrumphant triumphant 2 758 | clud cloud 1 759 | made made 0 760 | precikey precisely 2 761 | construction construction 0 762 | sentence sentence 0 763 | frenczhmmn frenchman 2 764 | rssian russian 1 765 | uncgurteous uncourteous 1 766 | erbs verbs 1 767 | onlyd only 1 768 | remains remains 0 769 | therefore therefore 0 770 | dmmscovr discover 3 771 | wqnted wanted 1 772 | writes writes 0 773 | pffrs prefers 3 774 | eainq wearing 3 775 | showig showing 1 776 | face face 0 777 | comes comes 0 778 | msstaken mistaken 1 779 | xresoe resolve 3 780 | doubts doubts 0 781 | bsharp sharp 1 782 | souno sound 1 783 | horss horses 1 784 | zuoofs hoofs 2 785 | pgrating grating 1 786 | hels wheels 2 787 | curb curb 0 788 | nollowed followed 1 789 | ul pull 2 790 | whjistled whistled 1 791 | par pair 1 792 | yes yes 0 793 | continued continued 0 794 | gancing glancing 1 795 | windo window 1 796 | nicu nice 1 797 | bvoughawmr brougham 3 798 | euties beauties 2 799 | unded hundred 2 800 | fift fifty 1 801 | tineams guineas 3 802 | apiee apiece 1 803 | mone money 1 804 | taing nothing 3 805 | ejse else 1 806 | ett better 3 807 | bit bit 0 808 | doctor doctor 0 809 | lstvy stay 2 810 | mlost lost 1 811 | boswel boswell 1 812 | proasu promises 4 813 | uinterestig interesting 2 814 | py pity 2 815 | iss miss 1 816 | ldeny client 3 817 | want want 0 818 | beht best 1 819 | sgow slow 1 820 | utdep step 2 821 | tars stairs 2 822 | pasage passage 1 823 | paued paused 1 824 | immdiately immediately 1 825 | ousi outside 3 826 | loud loud 0 827 | authoritative authoritative 0 828 | tap tap 0 829 | enered entered 1 830 | leps less 1 831 | feet feet 0 832 | inwh inches 
3 833 | hreigqhtd height 3 834 | libr limbs 2 835 | qercsuej hercules 4 836 | res dress 2 837 | richr rich 1 838 | richnesdsj richness 2 839 | eglnd england 2 840 | bd bad 1 841 | taste taste 0 842 | andsw bands 2 843 | sirakhqn astrakhan 3 844 | slashehd slashed 1 845 | sleeves sleeves 0 846 | fronts fronts 0 847 | hbreasted breasted 1 848 | cwat coat 1 849 | deep deep 0 850 | cloak cloak 0 851 | thrown thrown 0 852 | shoudrs shoulders 2 853 | gned lined 2 854 | fame flame 1 855 | txloure coloured 3 856 | slk silk 1 857 | secred secured 1 858 | nemk neck 1 859 | xbrooh brooch 2 860 | onsistea consisted 2 861 | igle single 2 862 | flaming flaming 0 863 | bdoots boots 1 864 | exended extended 1 865 | hlfwbay halfway 2 866 | adves calves 2 867 | trimeo trimmed 2 868 | tps tops 1 869 | furt fur 1 870 | cfopleted completed 2 871 | mprfpssiof impression 4 872 | baobro barbaric 4 873 | opundnzce opulence 3 874 | suggested suggested 0 875 | auppearancne appearance 2 876 | caried carried 1 877 | broaxd broad 1 878 | brimwed brimmed 1 879 | ad hand 2 880 | woe wore 1 881 | upper upper 0 882 | xtennz extending 4 883 | past past 0 884 | lchekboxs cheekbones 4 885 | vad vizard 3 886 | appaienwtly apparently 2 887 | rasod raised 2 888 | lowur lower 1 889 | appeared appeared 0 890 | hanging hanging 0 891 | ssragh straight 3 892 | cyhin chin 1 893 | suggestive suggestive 0 894 | resozlution resolution 1 895 | pushd pushed 1 896 | lengzth length 1 897 | otstinacy obstinacy 1 898 | harusch harsh 2 899 | voice voice 0 900 | srojgqy strongly 3 901 | marked marked 0 902 | accet accent 1 903 | uncamin uncertain 4 904 | pray pray 0 905 | olleeague colleague 2 906 | ocasgonaly occasionally 3 907 | cases cases 0 908 | uhgm whom 2 909 | xhonxr honour 3 910 | cokt count 2 911 | von von 0 912 | ramxm kramm 2 913 | noean nobleman 3 914 | nderman understand 4 915 | jisoretin discretion 3 916 | ettemse extreme 3 917 | eer prefer 3 918 | communicate communicate 0 919 | aone alone 1 920 | rgse rose 1 921 | caught caught 0 922 | wrist wrist 0 923 | bttk back 2 924 | chaim chair 1 925 | none none 0 926 | say say 0 927 | anyttzing anything 2 928 | sruggen shrugged 2 929 | bei begin 2 930 | incdingf binding 3 931 | atoltme absolute 4 932 | secrgcg secrecy 2 933 | years years 0 934 | end end 0 935 | pkrsent present 2 936 | weightp weight 1 937 | influence influence 0 938 | european european 0 939 | historyk history 1 940 | promisxe promise 1 941 | excbe excuse 2 942 | stracxnge strange 2 943 | augysd august 2 944 | persn person 1 945 | empmosd employs 3 946 | wises wishes 1 947 | aet agent 2 948 | nknow unknown 2 949 | onfes confess 2 950 | once once 0 951 | callvd called 1 952 | eixcdtly exactly 3 953 | aware aware 0 954 | dry dryly 2 955 | ciyrumstances circumstances 2 956 | decacw delicacy 3 957 | precaution precaution 0 958 | takn taken 1 959 | quench quench 0 960 | grow grow 0 961 | seriously seriously 0 962 | comproomutse compromise 3 963 | famiclies families 1 964 | speak speak 0 965 | lanly plainly 2 966 | impicates implicates 1 967 | hvuse house 1 968 | ormsqpek ormstein 4 969 | redotary hereditary 3 970 | kings kings 0 971 | muzmurd murmured 2 972 | settlinpg settling 1 973 | closing closing 0 974 | gancerd glanced 2 975 | appnt apparent 3 976 | surrise surprise 1 977 | layngid languid 2 978 | ljouunming lounging 3 979 | dpicuted depicted 2 980 | wnciqsive incisive 2 981 | enwergtic energetic 2 982 | xlowly slowly 1 983 | rekpewned reopened 2 984 | impaienty impatiently 2 985 | ggac gigantic 4 986 | maygty majesty 3 
987 | cndeced condescend 3 988 | state state 0 989 | le able 2 990 | adeipej advise 3 991 | sprgm sprang 3 992 | pacedp paced 1 993 | uncontrollable uncontrollable 0 994 | gitton agitation 3 995 | gxesturew gesture 2 996 | desperation desperation 0 997 | zurd hurled 3 998 | grocnd ground 1 999 | criedm cried 1 1000 | kig king 1 -------------------------------------------------------------------------------- /tests/fortests/non_en_dict.txt: -------------------------------------------------------------------------------- 1 | АБИ 10 2 | И 1 3 | Б 2 -------------------------------------------------------------------------------- /tests/fortests/separator_dict.txt: -------------------------------------------------------------------------------- 1 | the$23135851162 2 | of$13151942776 3 | abcs of$10956800 4 | aaron and$10721728 5 | and$12997637966 -------------------------------------------------------------------------------- /tests/fortests/word_segmentation_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "typo": "thequickbrownfoxjumpsoverthelazydog", 5 | "0": { "term": "the quick brown fox jumps over the lazy dog" } 6 | }, 7 | { 8 | "typo": "itwasabrightcolddayinaprilandtheclockswerestrikingthirteen", 9 | "0": { 10 | "term": "it was a bright cold day in april and the clocks were striking thirteen" 11 | } 12 | }, 13 | { 14 | "typo": "itwasthebestoftimesitwastheworstoftimesitwastheageofwisdomitwastheageoffoolishness", 15 | "0": { 16 | "term": "it was the best of times it was the worst of times it was the age of wisdom it was the age of foolishness" 17 | } 18 | } 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /tests/test_compatibility.py: -------------------------------------------------------------------------------- 1 | from symspellpy.helpers import null_distance_results, prefix_suffix_prep 2 | 3 | 4 | def test_null_distance_results(): 5 | assert null_distance_results(None, None, 1) == 0 6 | assert null_distance_results(None, string2=None, max_distance=1) == 0 7 | assert null_distance_results(string1=None, string2=None, max_distance=1) == 0 8 | assert null_distance_results(string_1=None, string_2=None, max_distance=1) == 0 9 | 10 | 11 | def test_prefix_suffix_prep(): 12 | assert prefix_suffix_prep("dabca", "ddca") == (2, 1, 1) 13 | assert prefix_suffix_prep("dabca", string2="ddca") == (2, 1, 1) 14 | assert prefix_suffix_prep(string1="dabca", string2="ddca") == (2, 1, 1) 15 | assert prefix_suffix_prep(string_1="dabca", string_2="ddca") == (2, 1, 1) 16 | -------------------------------------------------------------------------------- /tests/test_editdistance.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from itertools import combinations, permutations 3 | 4 | import pytest 5 | 6 | from symspellpy.abstract_distance_comparer import AbstractDistanceComparer 7 | from symspellpy.editdistance import ( 8 | DamerauOsa, 9 | DamerauOsaFast, 10 | DistanceAlgorithm, 11 | EditDistance, 12 | Levenshtein, 13 | LevenshteinFast, 14 | ) 15 | 16 | SHORT_STRING = "string" 17 | LONG_STRING = "long_string" 18 | VERY_LONG_STRING = "very_long_string" 19 | 20 | 21 | def expected_levenshtein(string_1, string_2, max_distance): 22 | max_distance = int(min(2**31 - 1, max_distance)) 23 | len_1 = len(string_1) 24 | len_2 = len(string_2) 25 | d = [[0] * (len_2 + 1) for _ in range(len_1 + 1)] 26 | for i in range(len_1 + 1): 27 | d[i][0] = i 28 | 
for i in range(len_2 + 1): 29 | d[0][i] = i 30 | for j in range(1, len_2 + 1): 31 | for i in range(1, len_1 + 1): 32 | if string_1[i - 1] == string_2[j - 1]: 33 | # no operation 34 | d[i][j] = d[i - 1][j - 1] 35 | else: 36 | d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + 1) 37 | distance = d[len_1][len_2] 38 | return distance if distance <= max_distance else -1 39 | 40 | 41 | def expected_damerau_osa(string_1, string_2, max_distance): 42 | max_distance = int(min(2**31 - 1, max_distance)) 43 | len_1 = len(string_1) 44 | len_2 = len(string_2) 45 | d = [[0] * (len_2 + 1) for _ in range(len_1 + 1)] 46 | for i in range(len_1 + 1): 47 | d[i][0] = i 48 | for i in range(len_2 + 1): 49 | d[0][i] = i 50 | for i in range(1, len_1 + 1): 51 | for j in range(1, len_2 + 1): 52 | cost = 0 if string_1[i - 1] == string_2[j - 1] else 1 53 | d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost) 54 | if ( 55 | i > 1 56 | and j > 1 57 | and string_1[i - 1] == string_2[j - 2] 58 | and string_1[i - 2] == string_2[j - 1] 59 | ): 60 | d[i][j] = min(d[i][j], d[i - 2][j - 2] + cost) 61 | distance = d[len_1][len_2] 62 | return distance if distance <= max_distance else -1 63 | 64 | 65 | class CustomDistanceComparer(AbstractDistanceComparer): 66 | def distance(self, string_1: str, string_2: str, max_distance: int) -> int: 67 | return -2 68 | 69 | 70 | @pytest.fixture( 71 | params=["damerau_osa", "levenshtein", "damerau_osa_fast", "levenshtein_fast"] 72 | ) 73 | def get_comparer(request): 74 | comparer_dict = { 75 | "damerau_osa": {"actual": DamerauOsa(), "expected": expected_damerau_osa}, 76 | "levenshtein": {"actual": Levenshtein(), "expected": expected_levenshtein}, 77 | "damerau_osa_fast": { 78 | "actual": DamerauOsaFast(), 79 | "expected": expected_damerau_osa, 80 | }, 81 | "levenshtein_fast": { 82 | "actual": LevenshteinFast(), 83 | "expected": expected_levenshtein, 84 | }, 85 | } 86 | yield ( 87 | comparer_dict[request.param]["actual"], 88 | comparer_dict[request.param]["expected"], 89 | ) 90 | 91 | 92 | @pytest.fixture( 93 | params=["damerau_osa", "levenshtein", "damerau_osa_fast", "levenshtein_fast"] 94 | ) 95 | def get_edit_distance(request): 96 | comparer_dict = { 97 | "damerau_osa": { 98 | "actual": EditDistance(DistanceAlgorithm.DAMERAU_OSA), 99 | "expected": DamerauOsa, 100 | }, 101 | "levenshtein": { 102 | "actual": EditDistance(DistanceAlgorithm.LEVENSHTEIN), 103 | "expected": Levenshtein, 104 | }, 105 | "damerau_osa_fast": { 106 | "actual": EditDistance(DistanceAlgorithm.DAMERAU_OSA_FAST), 107 | "expected": DamerauOsaFast, 108 | }, 109 | "levenshtein_fast": { 110 | "actual": EditDistance(DistanceAlgorithm.LEVENSHTEIN_FAST), 111 | "expected": LevenshteinFast, 112 | }, 113 | } 114 | yield ( 115 | comparer_dict[request.param]["actual"], 116 | comparer_dict[request.param]["expected"], 117 | ) 118 | 119 | 120 | @pytest.fixture 121 | def get_short_and_long_strings(): 122 | return [ 123 | (SHORT_STRING, None, {"null": len(SHORT_STRING), "zero": -1, "neg": -1}), 124 | (LONG_STRING, None, {"null": -1, "zero": -1, "neg": -1}), 125 | (None, SHORT_STRING, {"null": len(SHORT_STRING), "zero": -1, "neg": -1}), 126 | (None, LONG_STRING, {"null": -1, "zero": -1, "neg": -1}), 127 | (SHORT_STRING, SHORT_STRING, {"null": 0, "zero": 0, "neg": 0}), 128 | (None, None, {"null": 0, "zero": 0, "neg": 0}), 129 | ] 130 | 131 | 132 | @pytest.fixture(params=[0, 1, 3, sys.maxsize]) 133 | def get_strings(request): 134 | alphabet = "abcd" 135 | strings = [""] 136 | for i in range(1, len(alphabet) + 
1):
137 |         for combi in combinations(alphabet, i):
138 |             strings += ["".join(p) for p in permutations(combi)]
139 |     yield strings, request.param
140 |
141 |
142 | class TestEditDistance:
143 |     def test_unknown_distance_algorithm(self):
144 |         with pytest.raises(ValueError) as excinfo:
145 |             _ = EditDistance(2)
146 |         assert "unknown distance algorithm" == str(excinfo.value)
147 |
148 |     def test_missing_custom_comparer(self):
149 |         with pytest.raises(ValueError) as excinfo:
150 |             _ = EditDistance(DistanceAlgorithm.USER_PROVIDED)
151 |         assert "no comparer passed in" in str(excinfo.value)
152 |
153 |     def test_abstract_distance_comparer(self):
154 |         with pytest.raises(TypeError) as excinfo:
155 |             comparer = AbstractDistanceComparer()
156 |             _ = comparer.distance("string_1", "string_2", 10)
157 |         assert str(excinfo.value).startswith(
158 |             "Can't instantiate abstract class AbstractDistanceComparer"
159 |         )
160 |
161 |     def test_warn_when_builtin_comparer_override_custom_comparer(self):
162 |         with pytest.warns(UserWarning, match="A built-in comparer will be used.$"):
163 |             comparer = CustomDistanceComparer()
164 |             edit_distance = EditDistance(DistanceAlgorithm.LEVENSHTEIN, comparer)
165 |
166 |     def test_internal_distance_comparer(self, get_edit_distance):
167 |         edit_distance, expected = get_edit_distance
168 |         assert isinstance(edit_distance._distance_comparer, expected)
169 |
170 |     def test_comparer_match_ref(self, get_comparer, get_strings):
171 |         comparer, expected = get_comparer
172 |         strings, max_distance = get_strings
173 |
174 |         for s1 in strings:
175 |             for s2 in strings:
176 |                 assert expected(s1, s2, max_distance) == comparer.distance(
177 |                     s1, s2, max_distance
178 |                 )
179 |
180 |     def test_editdistance_use_custom_comparer(self, get_strings):
181 |         strings, max_distance = get_strings
182 |         comparer = CustomDistanceComparer()
183 |         edit_distance = EditDistance(DistanceAlgorithm.USER_PROVIDED, comparer)
184 |
185 |         for s1 in strings:
186 |             for s2 in strings:
187 |                 assert -2 == edit_distance.compare(s1, s2, max_distance)  # wrapper delegates to the custom comparer
188 |
189 |     def test_comparer_null_distance(self, get_comparer, get_short_and_long_strings):
190 |         comparer, _ = get_comparer
191 |
192 |         for s1, s2, expected in get_short_and_long_strings:
193 |             distance = comparer.distance(s1, s2, 10)
194 |             assert expected["null"] == distance
195 |
196 |     def test_comparer_negative_max_distance(
197 |         self, get_comparer, get_short_and_long_strings
198 |     ):
199 |         comparer, _ = get_comparer
200 |
201 |         for s1, s2, expected in get_short_and_long_strings:
202 |             distance = comparer.distance(s1, s2, 0)
203 |             assert expected["zero"] == distance
204 |
205 |         for s1, s2, expected in get_short_and_long_strings:
206 |             distance = comparer.distance(s1, s2, -1)  # negative max_distance
207 |             assert expected["neg"] == distance
208 |
209 |     def test_comparer_very_long_string(self, get_comparer):
210 |         comparer, _ = get_comparer
211 |         distance = comparer.distance(SHORT_STRING, VERY_LONG_STRING, 5)
212 |
213 |         assert -1 == distance
214 |
--------------------------------------------------------------------------------
/tests/test_helpers.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from symspellpy.helpers import (
4 |     case_transfer_matching,
5 |     case_transfer_similar,
6 |     is_acronym,
7 |     to_similarity,
8 | )
9 |
10 |
11 | @pytest.fixture
12 | def get_acronyms():
13 |     return [
14 |         ("ABCDE", {"default": True, "digits": True}),
15 |         ("AB12E", {"default": True, "digits": True}),
16 |         ("abcde", {"default": False, "digits": False}),
17 | ("ABCde", {"default": False, "digits": False}), 18 | ("abcDE", {"default": False, "digits": False}), 19 | ("abCDe", {"default": False, "digits": False}), 20 | ("abc12", {"default": False, "digits": True}), 21 | ("ab12e", {"default": False, "digits": True}), 22 | ] 23 | 24 | 25 | @pytest.fixture 26 | def get_similar_texts(): 27 | return [ 28 | ( 29 | "Haaw is the weeather in New York?", 30 | "how is the weather in new york?", 31 | "How is the weather in New York?", 32 | ), 33 | ("Wethr in New Yoork", "weather in new york", "Weather in New York"), 34 | ("Efthr in New Yoork", "weather in new york", "WEather in New York"), 35 | ("efthr in New Yoork", "weather in new york", "weather in New York"), 36 | ("eTr in New Yoork", "weather in new york", "weaTHEr in New York"), 37 | ("hoW eqr", "Haaaw er", "haaaW er"), 38 | ("hOW eqr", "Haaaw er", "hAAAW er"), 39 | ] 40 | 41 | 42 | class TestHelpers: 43 | def test_to_similarity(self): 44 | length = 20.0 45 | 46 | assert pytest.approx(0.7) == to_similarity(6.0, length) 47 | assert -1 == to_similarity(-1.0, length) 48 | 49 | def test_is_acronym(self, get_acronyms): 50 | for word, expected in get_acronyms: 51 | assert expected["default"] == is_acronym(word) 52 | assert expected["digits"] == is_acronym(word, True) 53 | 54 | def test_case_transfer_matching_diff_lengths(self): 55 | with pytest.raises(ValueError) as excinfo: 56 | case_transfer_matching("abc", "abcd") 57 | assert ( 58 | "'cased_text' and 'uncased_text' don't have the same length, use " 59 | "case_transfer_similar() instead" 60 | ) == str(excinfo.value) 61 | 62 | def test_case_transfer_matching(self): 63 | cased_text = "Haw is the eeather in New York?" 64 | uncased_text = "how is the weather in new york?" 65 | 66 | # the uncased_text text with the casing transferred from 67 | # the cased_text text 68 | assert "How is the weather in New York?" == case_transfer_matching( 69 | cased_text, uncased_text 70 | ) 71 | 72 | def test_case_transfer_similar_empty_wo_casing(self): 73 | cased_text = "Haw is the eeather in New York?" 
74 | uncased_text = "" 75 | 76 | assert uncased_text == case_transfer_similar(cased_text, uncased_text) 77 | 78 | def test_case_transfer_similar_empty_w_casing(self): 79 | with pytest.raises(ValueError) as excinfo: 80 | case_transfer_similar("", "abcd") 81 | assert "'cased_text' cannot be empty" == str(excinfo.value) 82 | 83 | def test_case_transfer_similar(self, get_similar_texts): 84 | for cased_text, uncased_text, expected in get_similar_texts: 85 | assert expected == case_transfer_similar(cased_text, uncased_text) 86 | -------------------------------------------------------------------------------- /tests/test_suggest_item.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from symspellpy.suggest_item import SuggestItem 4 | 5 | 6 | @pytest.fixture 7 | def suggest_item(): 8 | return SuggestItem("term", 0, 0) 9 | 10 | 11 | class TestSuggestItem: 12 | def test_invalid_equal_to(self, suggest_item): 13 | assert suggest_item.__eq__(0) is NotImplemented 14 | assert not suggest_item == 0 15 | 16 | def test_invalid_less_than(self, suggest_item): 17 | assert suggest_item.__lt__(0) is NotImplemented 18 | with pytest.raises(TypeError) as excinfo: 19 | suggest_item < 0 20 | assert "'<' not supported between instances of 'SuggestItem' and 'int'" == str( 21 | excinfo.value 22 | ) 23 | 24 | def test_suggest_item(self): 25 | si_1 = SuggestItem("asdf", 12, 34) 26 | si_2 = SuggestItem("sdfg", 12, 34) 27 | si_3 = SuggestItem("dfgh", 56, 78) 28 | 29 | assert si_1 == si_2 30 | assert si_2 != si_3 31 | 32 | assert "asdf" == si_1.term 33 | si_1.term = "qwer" 34 | assert "qwer" == si_1.term 35 | 36 | assert 34 == si_1.count 37 | si_1.count = 78 38 | assert 78 == si_1.count 39 | 40 | assert "qwer, 12, 78" == str(si_1) 41 | -------------------------------------------------------------------------------- /tests/test_symspellpy.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from pathlib import Path 3 | from unittest import TestCase 4 | 5 | import pytest 6 | 7 | from symspellpy import SymSpell, Verbosity 8 | from symspellpy.abstract_distance_comparer import AbstractDistanceComparer 9 | from symspellpy.editdistance import DistanceAlgorithm, EditDistance 10 | from symspellpy.helpers import DictIO 11 | 12 | FORTESTS_DIR = Path(__file__).resolve().parent / "fortests" 13 | BAD_DICT_PATH = FORTESTS_DIR / "bad_dict.txt" 14 | BELOW_THRESHOLD_DICT_PATH = FORTESTS_DIR / "below_threshold_dict.txt" 15 | BIG_MODIFIED_PATH = FORTESTS_DIR / "big_modified.txt" 16 | BIG_WORDS_PATH = FORTESTS_DIR / "big_words.txt" 17 | NON_EN_DICT_PATH = FORTESTS_DIR / "non_en_dict.txt" 18 | SEPARATOR_DICT_PATH = FORTESTS_DIR / "separator_dict.txt" 19 | 20 | INVALID_PATH = "invalid/dictionary/path.txt" 21 | SEPARATOR = "$" 22 | 23 | 24 | @pytest.fixture 25 | def get_dictionary_stream(request): 26 | dictionary = { 27 | "the": 23135851162, 28 | "of": 13151942776, 29 | "abcs of": 10956800, 30 | "aaron and": 10721728, 31 | "and": 12997637966, 32 | "large count": 92233720368547758081, 33 | } 34 | if request.param is None: 35 | dict_stream = DictIO(dictionary) 36 | else: 37 | dict_stream = DictIO(dictionary, request.param) 38 | yield dict_stream, request.param 39 | 40 | 41 | class CustomDistanceComparer(AbstractDistanceComparer): 42 | def distance(self, string_1: str, string_2: str, max_distance: int) -> int: 43 | return 0 44 | 45 | 46 | class TestSymSpellPy: 47 | def test_negative_max_dictionary_edit_distance(self): 48 | with 
pytest.raises(ValueError) as excinfo: 49 | _ = SymSpell(-1, 3) 50 | assert "max_dictionary_edit_distance cannot be negative" == str(excinfo.value) 51 | 52 | def test_invalid_prefix_length(self): 53 | # prefix_length < 1 54 | with pytest.raises(ValueError) as excinfo: 55 | _ = SymSpell(1, 0) 56 | assert "prefix_length cannot be less than 1" == str(excinfo.value) 57 | 58 | with pytest.raises(ValueError) as excinfo: 59 | _ = SymSpell(1, -1) 60 | assert "prefix_length cannot be less than 1" == str(excinfo.value) 61 | 62 | # prefix_length <= max_dictionary_edit_distance 63 | with pytest.raises(ValueError) as excinfo: 64 | _ = SymSpell(2, 2) 65 | assert "prefix_length must be greater than max_dictionary_edit_distance" == str( 66 | excinfo.value 67 | ) 68 | 69 | def test_negative_count_threshold(self): 70 | with pytest.raises(ValueError) as excinfo: 71 | _ = SymSpell(1, 3, -1) 72 | assert "count_threshold cannot be negative" == str(excinfo.value) 73 | 74 | def test_set_distance_comparer(self): 75 | distance_comparer = EditDistance( 76 | DistanceAlgorithm.USER_PROVIDED, CustomDistanceComparer() 77 | ) 78 | sym_spell = SymSpell(distance_comparer=distance_comparer) 79 | 80 | assert distance_comparer == sym_spell.distance_comparer 81 | 82 | @pytest.mark.parametrize("symspell_short", [None, 0], indirect=True) 83 | def test_create_dictionary_entry_negative_count(self, symspell_short): 84 | assert ( 85 | symspell_short._count_threshold == 0 86 | ) == symspell_short.create_dictionary_entry("pipe", 0) 87 | assert not symspell_short.create_dictionary_entry("pipe", -1) 88 | 89 | @pytest.mark.parametrize("symspell_short", [10], indirect=True) 90 | def test_create_dictionary_entry_below_threshold(self, symspell_short): 91 | symspell_short.create_dictionary_entry("pipe", 4) 92 | assert 1 == len(symspell_short.below_threshold_words) 93 | assert 4 == symspell_short.below_threshold_words["pipe"] 94 | 95 | symspell_short.create_dictionary_entry("pipe", 4) 96 | assert 1 == len(symspell_short.below_threshold_words) 97 | assert 8 == symspell_short.below_threshold_words["pipe"] 98 | 99 | symspell_short.create_dictionary_entry("pipe", 4) 100 | assert 0 == len(symspell_short.below_threshold_words) 101 | 102 | def test_add_additional_counts_should_not_add_word_again( 103 | self, symspell_default, get_same_word_and_count 104 | ): 105 | for word, count in get_same_word_and_count: 106 | symspell_default.create_dictionary_entry(word, count) 107 | assert 1 == symspell_default.word_count 108 | 109 | def test_add_additional_counts_should_increase_count( 110 | self, symspell_default, get_same_word_and_count 111 | ): 112 | expected_count = 0 113 | for word, count in get_same_word_and_count: 114 | expected_count += count 115 | symspell_default.create_dictionary_entry(word, count) 116 | result = symspell_default.lookup(word, Verbosity.TOP) 117 | assert expected_count == result[0].count 118 | 119 | def test_load_bigram_dictionary_invalid_path(self, symspell_default): 120 | with TestCase.assertLogs("symspellpy.symspellpy.logger", level="ERROR") as cm: 121 | assert not symspell_default.load_bigram_dictionary(INVALID_PATH, 0, 2) 122 | assert ( 123 | f"Bigram dictionary file not found at {Path(INVALID_PATH)}." 
124 |             == cm.records[0].getMessage()
125 |         )
126 |
127 |     def test_loading_dictionary_from_fileobject(self, symspell_default):
128 |         with open(BIG_WORDS_PATH, "r", encoding="utf8") as infile:
129 |             assert symspell_default.create_dictionary(infile)
130 |
131 |     def test_load_bigram_dictionary_bad_dict(self, symspell_default):
132 |         assert symspell_default.load_bigram_dictionary(BAD_DICT_PATH, 0, 2)
133 |         assert 2 == len(symspell_default.bigrams)
134 |         assert 12 == symspell_default.bigrams["rtyu tyui"]
135 |         assert 13 == symspell_default.bigrams["yuio uiop"]
136 |
137 |     def test_load_bigram_dictionary_separator(self, symspell_default):
138 |         assert symspell_default.load_bigram_dictionary(
139 |             SEPARATOR_DICT_PATH, 0, 1, SEPARATOR
140 |         )
141 |         assert 5 == len(symspell_default.bigrams)
142 |         assert 23135851162 == symspell_default.bigrams["the"]
143 |         assert 13151942776 == symspell_default.bigrams["of"]
144 |         assert 10956800 == symspell_default.bigrams["abcs of"]
145 |         assert 10721728 == symspell_default.bigrams["aaron and"]
146 |         assert 12997637966 == symspell_default.bigrams["and"]
147 |
148 |     @pytest.mark.parametrize("get_dictionary_stream", [None], indirect=True)
149 |     def test_load_bigram_dictionary_stream(
150 |         self, symspell_default, get_dictionary_stream
151 |     ):
152 |         dict_stream, _ = get_dictionary_stream
153 |         assert symspell_default._load_bigram_dictionary_stream(dict_stream, 0, 2)
154 |         assert 2 == len(symspell_default.bigrams)
155 |         assert 10956800 == symspell_default.bigrams["abcs of"]
156 |         assert 10721728 == symspell_default.bigrams["aaron and"]
157 |         assert "large count" not in symspell_default.bigrams
158 |
159 |     @pytest.mark.parametrize("get_dictionary_stream", [SEPARATOR], indirect=True)
160 |     def test_load_bigram_dictionary_stream_separator(
161 |         self, symspell_default, get_dictionary_stream
162 |     ):
163 |         dict_stream, separator = get_dictionary_stream
164 |         assert symspell_default._load_bigram_dictionary_stream(
165 |             dict_stream, 0, 1, separator
166 |         )
167 |         assert 5 == len(symspell_default.bigrams)
168 |         assert 23135851162 == symspell_default.bigrams["the"]
169 |         assert 13151942776 == symspell_default.bigrams["of"]
170 |         assert 10956800 == symspell_default.bigrams["abcs of"]
171 |         assert 10721728 == symspell_default.bigrams["aaron and"]
172 |         assert 12997637966 == symspell_default.bigrams["and"]
173 |
174 |     def test_load_dictionary_invalid_path(self, symspell_default):
175 |         with TestCase.assertLogs("symspellpy.symspellpy.logger", level="ERROR") as cm:
176 |             assert not symspell_default.load_dictionary(INVALID_PATH, 0, 1)
177 |         assert (
178 |             f"Dictionary file not found at {Path(INVALID_PATH)}."
179 | == cm.records[0].getMessage() 180 | ) 181 | 182 | def test_load_dictionary_bad_dictionary(self, symspell_default): 183 | assert symspell_default.load_dictionary(BAD_DICT_PATH, 0, 1) 184 | assert 2 == symspell_default.word_count 185 | assert 10 == symspell_default.words["asdf"] 186 | assert 12 == symspell_default.words["sdfg"] 187 | 188 | def test_load_dictionary_count(self, symspell_default, dictionary_path): 189 | symspell_default.load_dictionary(dictionary_path, 0, 1) 190 | 191 | assert 82834 == symspell_default.word_count 192 | assert 676094 == symspell_default.entry_count 193 | 194 | @pytest.mark.parametrize("symspell_short", [10], indirect=True) 195 | def test_load_dictionary_below_threshold(self, symspell_short): 196 | symspell_short.load_dictionary(BELOW_THRESHOLD_DICT_PATH, 0, 1) 197 | 198 | assert 1 == len(symspell_short.below_threshold_words) 199 | assert 8 == symspell_short.below_threshold_words["below"] 200 | 201 | assert 2 == symspell_short.word_count 202 | 203 | def test_load_dictionary_separator(self, symspell_default): 204 | assert symspell_default.load_dictionary(SEPARATOR_DICT_PATH, 0, 1, SEPARATOR) 205 | assert 5 == symspell_default.word_count 206 | assert 23135851162 == symspell_default.words["the"] 207 | assert 13151942776 == symspell_default.words["of"] 208 | assert 10956800 == symspell_default.words["abcs of"] 209 | assert 10721728 == symspell_default.words["aaron and"] 210 | assert 12997637966 == symspell_default.words["and"] 211 | 212 | @pytest.mark.parametrize("get_dictionary_stream", [None], indirect=True) 213 | def test_load_dictionary_stream(self, symspell_default, get_dictionary_stream): 214 | # keys with space in them don't get parsed properly when using 215 | # the default separator=" " 216 | dict_stream, _ = get_dictionary_stream 217 | assert symspell_default._load_dictionary_stream(dict_stream, 0, 1) 218 | assert 3 == symspell_default.word_count 219 | assert 23135851162 == symspell_default.words["the"] 220 | assert 13151942776 == symspell_default.words["of"] 221 | assert 12997637966 == symspell_default.words["and"] 222 | 223 | @pytest.mark.parametrize("get_dictionary_stream", [SEPARATOR], indirect=True) 224 | def test_load_dictionary_stream_separator( 225 | self, symspell_default, get_dictionary_stream 226 | ): 227 | dict_stream, separator = get_dictionary_stream 228 | assert symspell_default._load_dictionary_stream(dict_stream, 0, 1, separator) 229 | assert 5 == symspell_default.word_count 230 | assert 23135851162 == symspell_default.words["the"] 231 | assert 13151942776 == symspell_default.words["of"] 232 | assert 10956800 == symspell_default.words["abcs of"] 233 | assert 10721728 == symspell_default.words["aaron and"] 234 | assert 12997637966 == symspell_default.words["and"] 235 | 236 | def test_load_dictionary_encoding(self, symspell_default): 237 | symspell_default.load_dictionary(NON_EN_DICT_PATH, 0, 1, encoding="utf-8") 238 | 239 | result = symspell_default.lookup("АБ", Verbosity.TOP, 2) 240 | assert 1 == len(result) 241 | assert "АБИ" == result[0].term 242 | 243 | def test_load_dictionary_from_string_io(self, symspell_default, dictionary_path): 244 | with open(dictionary_path, "r") as f: 245 | symspell_default.load_dictionary(StringIO(f.read()), 0, 1) 246 | assert 82834 == symspell_default.word_count 247 | assert 676094 == symspell_default.entry_count 248 | 249 | def test_load_dictionary_from_text_io_wrapper(self, symspell_default, dictionary_path): 250 | with open(dictionary_path, "r") as f: 251 | symspell_default.load_dictionary(f, 0, 1) 
252 | assert 82834 == symspell_default.word_count 253 | assert 676094 == symspell_default.entry_count 254 | 255 | def test_create_dictionary_invalid_path(self, symspell_default): 256 | with TestCase.assertLogs("symspellpy.symspellpy.logger", level="ERROR") as cm: 257 | assert not symspell_default.create_dictionary(INVALID_PATH) 258 | assert ( 259 | f"Corpus not found at {Path(INVALID_PATH)}." == cm.records[0].getMessage() 260 | ) 261 | 262 | def test_create_dictionary(self, symspell_default): 263 | symspell_default.create_dictionary(BIG_MODIFIED_PATH, encoding="utf-8") 264 | 265 | num_lines = 0 266 | with open(BIG_WORDS_PATH, "r") as infile: 267 | for line in infile: 268 | key, count = line.rstrip().split(" ") 269 | assert int(count) == symspell_default.words[key] 270 | num_lines += 1 271 | assert num_lines == symspell_default.word_count 272 | 273 | @pytest.mark.parametrize( 274 | "symspell_default_entry", 275 | [[("stea", 1), ("steama", 2), ("steem", 3)]], 276 | indirect=True, 277 | ) 278 | def test_delete_dictionary_entry(self, symspell_default_entry): 279 | result = symspell_default_entry.lookup("steama", Verbosity.TOP, 2) 280 | assert 1 == len(result) 281 | assert "steama" == result[0].term 282 | assert len("steama") == symspell_default_entry._max_length 283 | 284 | assert symspell_default_entry.delete_dictionary_entry("steama") 285 | assert "steama" not in symspell_default_entry.words 286 | assert len("steem") == symspell_default_entry._max_length 287 | 288 | result = symspell_default_entry.lookup("steama", Verbosity.TOP, 2) 289 | assert 1 == len(result) 290 | assert "steem" == result[0].term 291 | 292 | assert symspell_default_entry.delete_dictionary_entry("stea") 293 | assert "stea" not in symspell_default_entry.words 294 | assert len("steem") == symspell_default_entry._max_length 295 | 296 | result = symspell_default_entry.lookup("steama", Verbosity.TOP, 2) 297 | assert 1 == len(result) 298 | assert "steem" == result[0].term 299 | 300 | @pytest.mark.parametrize( 301 | "symspell_default_entry", 302 | [[("stea", 1), ("steama", 2), ("steem", 3)]], 303 | indirect=True, 304 | ) 305 | def test_delete_dictionary_entry_invalid_word(self, symspell_default_entry): 306 | result = symspell_default_entry.lookup("steama", Verbosity.TOP, 2) 307 | assert 1 == len(result) 308 | assert "steama" == result[0].term 309 | assert len("steama") == symspell_default_entry._max_length 310 | 311 | assert not symspell_default_entry.delete_dictionary_entry("steamab") 312 | result = symspell_default_entry.lookup("steama", Verbosity.TOP, 2) 313 | assert 1 == len(result) 314 | assert "steama" == result[0].term 315 | assert len("steama") == symspell_default_entry._max_length 316 | -------------------------------------------------------------------------------- /tests/test_symspellpy_edge_cases.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from symspellpy import Verbosity 4 | 5 | ENTRIES = ["baked", "ax", "lake", "", "slaked"] 6 | 7 | 8 | class TestSymSpellPyEdgeCases: 9 | @pytest.mark.parametrize("symspell_long_entry", [ENTRIES], indirect=True) 10 | def test_empty_string_has_all_short_deletes(self, symspell_long_entry): 11 | sym_spell, entries = symspell_long_entry 12 | 13 | assert len(entries[:-1]) == len(sym_spell.deletes[""]) 14 | assert all(entry in sym_spell.deletes[""] for entry in entries[:-1]) 15 | assert "abc" not in sym_spell.deletes[""] 16 | 17 | def test_split_correction_part_of_single_term_correction(self, symspell_default): 18 | 
symspell_default.create_dictionary_entry("where", 2) 19 | symspell_default.create_dictionary_entry("is", 2) 20 | symspell_default.create_dictionary_entry("whereas", 2) 21 | symspell_default._bigrams["where is"] = 10 22 | 23 | suggestions = symspell_default.lookup_compound("whereiz", 2) 24 | assert "where is" == suggestions[0].term 25 | assert 2 == suggestions[0].distance 26 | assert 10 == suggestions[0].count 27 | 28 | @pytest.mark.parametrize("symspell_long_entry", [["bank", "bink"]], indirect=True) 29 | def test_no_common_char_with_phrase(self, symspell_long_entry): 30 | sym_spell, _ = symspell_long_entry 31 | results = sym_spell.lookup("knab", Verbosity.ALL, 4) 32 | 33 | assert 2 == len(results) 34 | assert "bank" == results[0].term 35 | assert 3 == results[0].distance 36 | assert "bink" == results[1].term 37 | assert 4 == results[1].distance 38 | -------------------------------------------------------------------------------- /tests/test_symspellpy_lookup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pytest 4 | 5 | from symspellpy import SymSpell, Verbosity 6 | 7 | 8 | @pytest.fixture 9 | def symspell_high_thres(): 10 | return SymSpell(2, 7, 10) 11 | 12 | 13 | @pytest.fixture 14 | def symspell_high_thres_flame(symspell_high_thres): 15 | symspell_high_thres.create_dictionary_entry("flame", 20) 16 | symspell_high_thres.create_dictionary_entry("flam", 1) 17 | return symspell_high_thres 18 | 19 | 20 | class TestSymSpellPyLookup: 21 | @pytest.mark.parametrize( 22 | "symspell_default_entry", 23 | [[("steama", 4), ("steamb", 6), ("steamc", 2)]], 24 | indirect=True, 25 | ) 26 | def test_deletes(self, symspell_default_entry): 27 | result = symspell_default_entry.lookup("stream", Verbosity.TOP, 2) 28 | assert 1 == len(result) 29 | assert "steamb" == result[0].term 30 | assert 6 == result[0].count 31 | assert symspell_default_entry.deletes 32 | 33 | @pytest.mark.parametrize("symspell_short", [None], indirect=True) 34 | def test_words_with_shared_prefix_should_retain_counts(self, symspell_short): 35 | symspell_short.create_dictionary_entry("pipe", 5) 36 | symspell_short.create_dictionary_entry("pips", 10) 37 | 38 | result = symspell_short.lookup("pipe", Verbosity.ALL, 1) 39 | assert 2 == len(result) 40 | assert "pipe" == result[0].term 41 | assert 5 == result[0].count 42 | assert "pips" == result[1].term 43 | assert 10 == result[1].count 44 | 45 | result = symspell_short.lookup("pips", Verbosity.ALL, 1) 46 | assert 2 == len(result) 47 | assert "pips" == result[0].term 48 | assert 10 == result[0].count 49 | assert "pipe" == result[1].term 50 | assert 5 == result[1].count 51 | 52 | result = symspell_short.lookup("pip", Verbosity.ALL, 1) 53 | assert 2 == len(result) 54 | assert "pips" == result[0].term 55 | assert 10 == result[0].count 56 | assert "pipe" == result[1].term 57 | assert 5 == result[1].count 58 | 59 | def test_add_additional_counts_should_not_overflow( 60 | self, symspell_default, get_same_word_and_count 61 | ): 62 | for i, (word, count) in enumerate(get_same_word_and_count): 63 | symspell_default.create_dictionary_entry( 64 | word, sys.maxsize - 1 if i == 0 else count 65 | ) 66 | result = symspell_default.lookup(word, Verbosity.TOP) 67 | assert (sys.maxsize - 1 if i == 0 else sys.maxsize) == result[0].count 68 | 69 | @pytest.mark.parametrize( 70 | "verbosity, num_results", 71 | [(Verbosity.TOP, 1), (Verbosity.CLOSEST, 2), (Verbosity.ALL, 3)], 72 | ) 73 | def test_verbosity_should_control_lookup_results( 74 | self, 
symspell_default, verbosity, num_results 75 | ): 76 | symspell_default.create_dictionary_entry("steam", 1) 77 | symspell_default.create_dictionary_entry("steams", 2) 78 | symspell_default.create_dictionary_entry("steem", 3) 79 | 80 | result = symspell_default.lookup("steems", verbosity, 2) 81 | assert num_results == len(result) 82 | 83 | @pytest.mark.parametrize( 84 | "symspell_default_entry", 85 | [[("steama", 4), ("steamb", 6), ("steamc", 2)]], 86 | indirect=True, 87 | ) 88 | def test_should_return_most_frequent(self, symspell_default_entry): 89 | result = symspell_default_entry.lookup("stream", Verbosity.TOP, 2) 90 | assert 1 == len(result) 91 | assert "steamb" == result[0].term 92 | assert 6 == result[0].count 93 | 94 | @pytest.mark.parametrize( 95 | "symspell_default_entry", 96 | [[("steama", 4), ("steamb", 6), ("steamc", 2)]], 97 | indirect=True, 98 | ) 99 | def test_should_find_exact_match(self, symspell_default_entry): 100 | result = symspell_default_entry.lookup("streama", Verbosity.TOP, 2) 101 | assert 1 == len(result) 102 | assert "steama" == result[0].term 103 | 104 | @pytest.mark.parametrize("term", ["paw", "awn"]) 105 | def test_should_not_return_non_word_delete(self, symspell_high_thres, term): 106 | symspell_high_thres.create_dictionary_entry("pawn", 10) 107 | result = symspell_high_thres.lookup(term, Verbosity.TOP, 0) 108 | assert not result 109 | 110 | def test_should_not_return_low_count_word(self, symspell_high_thres): 111 | symspell_high_thres.create_dictionary_entry("pawn", 1) 112 | result = symspell_high_thres.lookup("pawn", Verbosity.TOP, 0) 113 | assert not result 114 | 115 | def test_should_not_return_low_count_word_that_are_also_delete_word( 116 | self, symspell_high_thres_flame 117 | ): 118 | result = symspell_high_thres_flame.lookup("flam", Verbosity.TOP, 0) 119 | assert not result 120 | 121 | def test_max_edit_distance_too_large(self, symspell_high_thres_flame): 122 | with pytest.raises(ValueError) as excinfo: 123 | _ = symspell_high_thres_flame.lookup("flam", Verbosity.TOP, 3) 124 | assert "distance too large" == str(excinfo.value) 125 | 126 | def test_include_unknown(self, symspell_high_thres_flame): 127 | result = symspell_high_thres_flame.lookup("flam", Verbosity.TOP, 0, True) 128 | assert 1 == len(result) 129 | assert "flam" == result[0].term 130 | 131 | def test_avoid_exact_match_early_exit(self, symspell_high_thres_flame): 132 | result = symspell_high_thres_flame.lookup( 133 | "24th", Verbosity.ALL, 2, ignore_token=r"\d{2}\w*\b" 134 | ) 135 | assert 1 == len(result) 136 | assert "24th" == result[0].term 137 | 138 | def test_should_replicate_noisy_results( 139 | self, dictionary_path, query_path, symspell_default 140 | ): 141 | symspell_default.load_dictionary(dictionary_path, 0, 1) 142 | 143 | with open(query_path, "r") as infile: 144 | test_phrases = [ 145 | parts[0] 146 | for parts in map(lambda x: x.strip().split(), infile.readlines()) 147 | if len(parts) >= 2 148 | ] 149 | 150 | result_sum = 0 151 | for phrase in test_phrases: 152 | result_sum += len(symspell_default.lookup(phrase, Verbosity.CLOSEST, 2)) 153 | 154 | assert 4955 == result_sum 155 | 156 | @pytest.mark.parametrize( 157 | "symspell_default_entry, typo, correction", 158 | [ 159 | ([("steam", 4)], "Stream", "Steam"), 160 | ([("steam", 4)], "StreaM", "SteaM"), 161 | ([("steam", 4)], "STREAM", "STEAM"), 162 | ([("i", 4)], "I", "I"), 163 | ], 164 | indirect=["symspell_default_entry"], 165 | ) 166 | def test_transfer_casing(self, symspell_default_entry, typo, correction): 167 | result = 
symspell_default_entry.lookup( 168 | typo, Verbosity.TOP, 2, transfer_casing=True 169 | ) 170 | assert correction == result[0].term 171 | -------------------------------------------------------------------------------- /tests/test_symspellpy_lookup_compound.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | class TestSymSpellPyLookupCompound: 5 | @pytest.mark.parametrize( 6 | "symspell_default_load, get_fortests_data", 7 | [ 8 | ("bigram", "lookup_compound_data.json"), 9 | ("unigram", "lookup_compound_data.json"), 10 | ], 11 | indirect=True, 12 | ) 13 | def test_lookup_compound(self, symspell_default_load, get_fortests_data): 14 | sym_spell, dictionary = symspell_default_load 15 | for entry in get_fortests_data: 16 | results = sym_spell.lookup_compound(entry["typo"], 2) 17 | assert entry[dictionary]["num_results"] == len(results) 18 | assert entry[dictionary]["term"] == results[0].term 19 | assert entry[dictionary]["distance"] == results[0].distance 20 | assert entry[dictionary]["count"] == results[0].count 21 | 22 | @pytest.mark.parametrize( 23 | "symspell_default_entry", [[("steam", 1), ("machine", 1)]], indirect=True 24 | ) 25 | def test_lookup_compound_only_combi(self, symspell_default_entry): 26 | typo = "ste am machie" 27 | correction = "steam machine" 28 | results = symspell_default_entry.lookup_compound(typo, 2) 29 | assert 1 == len(results) 30 | assert correction == results[0].term 31 | 32 | @pytest.mark.parametrize( 33 | "symspell_default_entry", [[("steam", 1), ("machine", 1)]], indirect=True 34 | ) 35 | def test_lookup_compound_no_suggestion(self, symspell_default_entry): 36 | typo = "qwer erty ytui a" 37 | results = symspell_default_entry.lookup_compound(typo, 2) 38 | assert 1 == len(results) 39 | assert typo == results[0].term 40 | 41 | @pytest.mark.parametrize( 42 | "symspell_default_load, get_fortests_data", 43 | [ 44 | ("bigram", "lookup_compound_replaced_words_data.json"), 45 | ("unigram", "lookup_compound_replaced_words_data.json"), 46 | ], 47 | indirect=True, 48 | ) 49 | def test_lookup_compound_replaced_words( 50 | self, symspell_default_load, get_fortests_data 51 | ): 52 | sym_spell, dictionary = symspell_default_load 53 | num_replaced_words = 0 54 | for entry in get_fortests_data: 55 | num_replaced_words += len(entry[dictionary]["replacement"]) 56 | results = sym_spell.lookup_compound(entry["typo"], 2) 57 | assert num_replaced_words == len(sym_spell.replaced_words) 58 | assert entry[dictionary]["term"] == results[0].term 59 | for k, v in entry[dictionary]["replacement"].items(): 60 | assert v == sym_spell.replaced_words[k].term 61 | 62 | @pytest.mark.parametrize( 63 | "symspell_default_load, get_fortests_data", 64 | [ 65 | ("bigram", "lookup_compound_ignore_non_words_data.json"), 66 | ("unigram", "lookup_compound_ignore_non_words_data.json"), 67 | ], 68 | indirect=True, 69 | ) 70 | def test_lookup_compound_ignore_non_words( 71 | self, symspell_default_load, get_fortests_data 72 | ): 73 | sym_spell, dictionary = symspell_default_load 74 | for entry in get_fortests_data: 75 | results = sym_spell.lookup_compound(entry["typo"], 2, True) 76 | assert 1 == len(results) 77 | assert entry[dictionary]["term"] == results[0].term 78 | 79 | @pytest.mark.parametrize( 80 | "symspell_default_load", ["bigram", "unigram"], indirect=True 81 | ) 82 | def test_lookup_compound_ignore_non_words_ignore_digits( 83 | self, symspell_default_load 84 | ): 85 | sym_spell, _ = symspell_default_load 86 | 87 | typo = "is the officeon 
1st floor oepn 24/7" 88 | correction = "is the office on 1st floor open 24/7" 89 | results = sym_spell.lookup_compound( 90 | typo, 91 | 2, 92 | True, 93 | split_by_space=True, 94 | ignore_term_with_digits=True, 95 | ) 96 | assert 1 == len(results) 97 | assert correction == results[0].term 98 | assert 2 == results[0].distance 99 | assert 0 == results[0].count 100 | 101 | @pytest.mark.parametrize( 102 | "symspell_default_load, get_fortests_data", 103 | [ 104 | ("bigram", "lookup_compound_transfer_casing_data.json"), 105 | ("unigram", "lookup_compound_transfer_casing_data.json"), 106 | ], 107 | indirect=True, 108 | ) 109 | def test_lookup_compound_transfer_casing( 110 | self, symspell_default_load, get_fortests_data 111 | ): 112 | sym_spell, dictionary = symspell_default_load 113 | for entry in get_fortests_data: 114 | results = sym_spell.lookup_compound(entry["typo"], 2, transfer_casing=True) 115 | assert entry[dictionary]["term"] == results[0].term 116 | 117 | @pytest.mark.parametrize( 118 | "symspell_default_load, get_fortests_data", 119 | [ 120 | ("bigram", "lookup_compound_transfer_casing_ignore_nonwords_data.json"), 121 | ("unigram", "lookup_compound_transfer_casing_ignore_nonwords_data.json"), 122 | ], 123 | indirect=True, 124 | ) 125 | def test_lookup_compound_transfer_casing_ignore_nonwords( 126 | self, symspell_default_load, get_fortests_data 127 | ): 128 | sym_spell, dictionary = symspell_default_load 129 | for entry in get_fortests_data: 130 | results = sym_spell.lookup_compound(entry["typo"], 2, True, True) 131 | assert entry[dictionary]["term"] == results[0].term 132 | -------------------------------------------------------------------------------- /tests/test_symspellpy_pickle.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from unittest import TestCase 4 | 5 | import pytest 6 | 7 | from symspellpy import SymSpell 8 | 9 | 10 | class TestSymSpellPyPickle: 11 | @pytest.mark.parametrize( 12 | "symspell_default_load, is_compressed", 13 | [("unigram", True), ("bigram", True), ("unigram", False), ("bigram", False)], 14 | indirect=["symspell_default_load"], 15 | ) 16 | def test_pickle(self, pickle_path, symspell_default_load, is_compressed): 17 | sym_spell, _ = symspell_default_load 18 | sym_spell.save_pickle(pickle_path, is_compressed) 19 | 20 | sym_spell_2 = SymSpell(123, 456, 789) 21 | 22 | assert sym_spell._count_threshold != sym_spell_2._count_threshold 23 | assert ( 24 | sym_spell._max_dictionary_edit_distance 25 | != sym_spell_2._max_dictionary_edit_distance 26 | ) 27 | assert sym_spell._prefix_length != sym_spell_2._prefix_length 28 | 29 | with TestCase.assertLogs("symspellpy.symspellpy.logger", level="WARNING") as cm: 30 | sym_spell_2.load_pickle(pickle_path, is_compressed) 31 | assert ( 32 | "Loading data which was created using different ('count_threshold', " 33 | "'max_dictionary_edit_distance', 'prefix_length') settings. Overwriting " 34 | "current SymSpell instance with loaded settings ..." 
35 | ) == cm.records[0].getMessage() 36 | assert sym_spell.below_threshold_words == sym_spell_2.below_threshold_words 37 | assert sym_spell.bigrams == sym_spell_2.bigrams 38 | assert sym_spell.deletes == sym_spell_2.deletes 39 | assert sym_spell.words == sym_spell_2.words 40 | assert sym_spell._max_length == sym_spell_2._max_length 41 | assert sym_spell._count_threshold == sym_spell_2._count_threshold 42 | assert ( 43 | sym_spell._max_dictionary_edit_distance 44 | == sym_spell_2._max_dictionary_edit_distance 45 | ) 46 | assert sym_spell._prefix_length == sym_spell_2._prefix_length 47 | os.remove(pickle_path) 48 | 49 | @pytest.mark.parametrize( 50 | "symspell_default_load, is_compressed", 51 | [("unigram", True), ("bigram", True), ("unigram", False), ("bigram", False)], 52 | indirect=["symspell_default_load"], 53 | ) 54 | def test_pickle_same_settings( 55 | self, pickle_path, symspell_default_load, is_compressed 56 | ): 57 | sym_spell, _ = symspell_default_load 58 | sym_spell.save_pickle(pickle_path, is_compressed) 59 | 60 | sym_spell_2 = SymSpell() 61 | sym_spell_2.load_pickle(pickle_path, is_compressed) 62 | 63 | assert sym_spell.below_threshold_words == sym_spell_2.below_threshold_words 64 | assert sym_spell.bigrams == sym_spell_2.bigrams 65 | assert sym_spell.deletes == sym_spell_2.deletes 66 | assert sym_spell.words == sym_spell_2.words 67 | assert sym_spell._max_length == sym_spell_2._max_length 68 | assert sym_spell._count_threshold == sym_spell_2._count_threshold 69 | assert ( 70 | sym_spell._max_dictionary_edit_distance 71 | == sym_spell_2._max_dictionary_edit_distance 72 | ) 73 | assert sym_spell._prefix_length == sym_spell_2._prefix_length 74 | os.remove(pickle_path) 75 | 76 | @pytest.mark.parametrize( 77 | "symspell_default_load", ["unigram", "bigram"], indirect=True 78 | ) 79 | def test_pickle_bytes(self, symspell_default_load): 80 | sym_spell, _ = symspell_default_load 81 | sym_spell_2 = SymSpell(123, 456, 789) 82 | 83 | assert sym_spell._count_threshold != sym_spell_2._count_threshold 84 | assert ( 85 | sym_spell._max_dictionary_edit_distance 86 | != sym_spell_2._max_dictionary_edit_distance 87 | ) 88 | assert sym_spell._prefix_length != sym_spell_2._prefix_length 89 | 90 | with TestCase.assertLogs("symspellpy.symspellpy.logger", level="WARNING") as cm: 91 | sym_spell_2.load_pickle( 92 | sym_spell.save_pickle(to_bytes=True), from_bytes=True 93 | ) 94 | assert ( 95 | "Loading data which was created using different ('count_threshold', " 96 | "'max_dictionary_edit_distance', 'prefix_length') settings. Overwriting " 97 | "current SymSpell instance with loaded settings ..." 
98 | ) == cm.records[0].getMessage() 99 | assert sym_spell.below_threshold_words == sym_spell_2.below_threshold_words 100 | assert sym_spell.bigrams == sym_spell_2.bigrams 101 | assert sym_spell.deletes == sym_spell_2.deletes 102 | assert sym_spell.words == sym_spell_2.words 103 | assert sym_spell._max_length == sym_spell_2._max_length 104 | assert sym_spell._count_threshold == sym_spell_2._count_threshold 105 | assert ( 106 | sym_spell._max_dictionary_edit_distance 107 | == sym_spell_2._max_dictionary_edit_distance 108 | ) 109 | assert sym_spell._prefix_length == sym_spell_2._prefix_length 110 | 111 | def test_pickle_invalid(self, pickle_path, symspell_default): 112 | pickle_data = {"deletes": {}, "words": {}, "max_length": 0, "data_version": -1} 113 | with open(pickle_path, "wb") as f: 114 | pickle.dump(pickle_data, f) 115 | assert not symspell_default.load_pickle(pickle_path, False) 116 | os.remove(pickle_path) 117 | 118 | pickle_data = {"deletes": {}, "words": {}, "max_length": 0} 119 | with open(pickle_path, "wb") as f: 120 | pickle.dump(pickle_data, f) 121 | assert not symspell_default.load_pickle(pickle_path, False) 122 | os.remove(pickle_path) 123 | -------------------------------------------------------------------------------- /tests/test_symspellpy_word_segmentation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from symspellpy import SymSpell 4 | 5 | 6 | @pytest.fixture 7 | def symspell_edit_distance_load(dictionary_path, request): 8 | sym_spell = SymSpell(request.param) 9 | sym_spell.load_dictionary(dictionary_path, 0, 1) 10 | return sym_spell, request.param 11 | 12 | 13 | class TestSymSpellPyWordSegmentation: 14 | @pytest.mark.parametrize("symspell_default_load", ["unigram"], indirect=True) 15 | def test_word_segmentation_ignore_token(self, symspell_default_load): 16 | sym_spell, _ = symspell_default_load 17 | typo = "24th december" 18 | result = sym_spell.word_segmentation(typo, ignore_token=r"\d{2}\w*\b") 19 | assert typo == result.corrected_string 20 | 21 | @pytest.mark.parametrize( 22 | "symspell_edit_distance_load, get_fortests_data, with_arguments, capitalize", 23 | [ 24 | (0, "word_segmentation_data.json", False, False), 25 | (0, "word_segmentation_data.json", True, False), 26 | (0, "word_segmentation_data.json", False, True), 27 | ], 28 | indirect=["symspell_edit_distance_load", "get_fortests_data"], 29 | ) 30 | def test_word_segmentation( 31 | self, 32 | symspell_edit_distance_load, 33 | get_fortests_data, 34 | with_arguments, 35 | capitalize, 36 | ): 37 | sym_spell, edit_distance = symspell_edit_distance_load 38 | for entry in get_fortests_data: 39 | if capitalize: 40 | typo = entry["typo"].capitalize() 41 | correction = entry[str(edit_distance)]["term"].capitalize() 42 | else: 43 | typo = entry["typo"] 44 | correction = entry[str(edit_distance)]["term"] 45 | if with_arguments: 46 | result = sym_spell.word_segmentation(typo, edit_distance, 11) 47 | else: 48 | result = sym_spell.word_segmentation(typo) 49 | assert correction == result.corrected_string 50 | 51 | @pytest.mark.parametrize("symspell_edit_distance_load", [0], indirect=True) 52 | def test_word_segmentation_apostrophe(self, symspell_edit_distance_load): 53 | sym_spell, _ = symspell_edit_distance_load 54 | 55 | typo = "There'resomewords" 56 | correction = "There' re some words" 57 | result = sym_spell.word_segmentation(typo) 58 | assert correction == result[1] 59 | 60 | @pytest.mark.parametrize("symspell_edit_distance_load", [0], indirect=True) 
61 | def test_word_segmentation_ligature(self, symspell_edit_distance_load): 62 | sym_spell, _ = symspell_edit_distance_load 63 | 64 | typo = "Therearesomescientiﬁcwords" 65 | correction = "There are some scientific words" 66 | result = sym_spell.word_segmentation(typo) 67 | assert correction == result[1] 68 | --------------------------------------------------------------------------------
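The test suite above exercises the whole public surface of SymSpell: lookup for single-token correction, lookup_compound for multi-token phrases, word_segmentation for run-together text, and save_pickle/load_pickle for persistence. For orientation, here is a minimal usage sketch of that same API against the frequency dictionaries bundled under symspellpy/ in the tree above. The sample input strings are illustrative assumptions rather than fixture data, and the importlib.resources path lookup assumes a regular on-disk install of the package.

from importlib.resources import files

from symspellpy import SymSpell, Verbosity

# Defaults mirrored by most fixtures: max edit distance 2, prefix length 7.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Unigram dictionary: term in column 0, count in column 1.
sym_spell.load_dictionary(
    files("symspellpy") / "frequency_dictionary_en_82_765.txt", 0, 1
)
# Bigram dictionary: the two-word term spans columns 0-1, so the count is column 2.
sym_spell.load_bigram_dictionary(
    files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt", 0, 2
)

# Single-token correction; Verbosity.CLOSEST keeps all suggestions at the
# smallest edit distance found, ordered by frequency.
print(sym_spell.lookup("memebers", Verbosity.CLOSEST, max_edit_distance=2)[0].term)

# Whole-phrase correction, including splitting and merging of tokens.
print(sym_spell.lookup_compound("whereis th elove", max_edit_distance=2)[0].term)

# Reinsert spaces into run-together text; word_segmentation returns a
# Composition namedtuple, so result[1] in the tests above is .corrected_string.
print(sym_spell.word_segmentation("thequickbrownfoxjumpsoverthelazydog").corrected_string)

Each lookup call returns SuggestItem objects ordered by distance and then count, which is the ordering the asserts in the tests above pin down.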