├── .coveragerc
├── .git-blame-ignore-revs
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── dependabot.yml
│   └── workflows
│       ├── publish.yml
│       ├── tests.yml
│       └── weekly.yml
├── .gitignore
├── .readthedocs.yaml
├── CHANGELOG.md
├── INSTALL.rst
├── LICENSE
├── README.md
├── docs
│   ├── Makefile
│   ├── _templates
│   │   └── layout.html
│   ├── api
│   │   ├── abstract_distance_comparer.rst
│   │   ├── editdistance.rst
│   │   ├── helpers.rst
│   │   ├── index.rst
│   │   └── symspellpy.rst
│   ├── conf.py
│   ├── examples
│   │   ├── custom_distance_comparer.rst
│   │   ├── dictionary.rst
│   │   ├── index.rst
│   │   ├── lookup.rst
│   │   ├── lookup_compound.rst
│   │   └── word_segmentation.rst
│   ├── index.rst
│   ├── make.bat
│   ├── requirements.txt
│   └── users
│       └── installing.rst
├── pyproject.toml
├── requirements.txt
├── symspellpy
│   ├── __init__.py
│   ├── abstract_distance_comparer.py
│   ├── composition.py
│   ├── editdistance.py
│   ├── frequency_bigramdictionary_en_243_342.txt
│   ├── frequency_dictionary_en_82_765.txt
│   ├── helpers.py
│   ├── logging.py
│   ├── pickle_mixin.py
│   ├── suggest_item.py
│   ├── symspellpy.py
│   └── verbosity.py
└── tests
    ├── __init__.py
    ├── benchmarks.ipynb
    ├── conftest.py
    ├── fortests
    │   ├── bad_dict.txt
    │   ├── below_threshold_dict.txt
    │   ├── big_modified.txt
    │   ├── big_words.txt
    │   ├── lookup_compound_data.json
    │   ├── lookup_compound_ignore_non_words_data.json
    │   ├── lookup_compound_replaced_words_data.json
    │   ├── lookup_compound_transfer_casing_data.json
    │   ├── lookup_compound_transfer_casing_ignore_nonwords_data.json
    │   ├── noisy_query_en_1000.txt
    │   ├── non_en_dict.txt
    │   ├── separator_dict.txt
    │   └── word_segmentation_data.json
    ├── test_compatibility.py
    ├── test_editdistance.py
    ├── test_helpers.py
    ├── test_suggest_item.py
    ├── test_symspellpy.py
    ├── test_symspellpy_edge_cases.py
    ├── test_symspellpy_lookup.py
    ├── test_symspellpy_lookup_compound.py
    ├── test_symspellpy_pickle.py
    └── test_symspellpy_word_segmentation.py
/.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = true 3 | source = symspellpy 4 | 5 | [report] 6 | exclude_lines = 7 | pragma: no cover 8 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # format all 2 | b0abc5ed3a37b05848ca1e2de790321d7c07fd75 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.py eol=lf 2 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | github: mammothb 3 | ko_fi: mammothb 4 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "github-actions" 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | day: "friday" 13 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to TestPyPI and PyPI 2 | 3 | on: 4 | workflow_dispatch: 5 | release: 6 | types: [published] 7 | 8 | jobs: 9 | publish-test-pypi: 10 | name: Build and publish to TestPyPI 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Set up Python 3.10 16 | uses: actions/setup-python@v5.6.0 17 | with: 18 | python-version: "3.10" 19 | 20 | - name: Build 21 | run: | 22 | echo "Building ..." 23 | python -m pip install --upgrade pip 24 | python -m pip install build 25 | python -m build 26 | 27 | - name: Publish to TestPyPI 28 | uses: pypa/gh-action-pypi-publish@v1.12.4 29 | with: 30 | user: __token__ 31 | password: ${{ secrets.TEST_PYPI_API_TOKEN }} 32 | repository-url: https://test.pypi.org/legacy/ 33 | 34 | - name: Publish to PyPI 35 | if: github.event_name == 'release' && startsWith(github.ref, 'refs/tags/v') 36 | uses: pypa/gh-action-pypi-publish@v1.12.4 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | name: "Python ${{ matrix.python-version }} on ${{ matrix.os }}" 8 | runs-on: ${{ matrix.os }} 9 | environment: Development 10 | 11 | strategy: 12 | matrix: 13 | os: [ubuntu-latest] 14 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v5.6.0 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install -r requirements.txt 28 | 29 | - name: Run pytest 30 | run: python -m pytest --cov-report=xml --cov=symspellpy 31 | 32 | - name: Upload code coverage 33 | uses: codecov/codecov-action@v5 34 | with: 35 | token: ${{ secrets.CODECOV_TOKEN }} 36 | -------------------------------------------------------------------------------- /.github/workflows/weekly.yml: -------------------------------------------------------------------------------- 1 | name: Weekly Tests 2 | 3 | on: 4 | workflow_dispatch: 5 | schedule: 6 | # Runs every friday 7 | - cron: "0 0 * * 5" 8 | 9 | jobs: 10 | test: 11 | name: "Python ${{ matrix.python-version }} on ${{ matrix.os }}" 12 | runs-on: ${{ matrix.os }} 13 | 14 | strategy: 15 | fail-fast: false 16 | matrix: 17 | os: [ubuntu-latest, macos-latest, windows-latest] 18 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] 19 | 20 | steps: 21 | - uses: actions/checkout@v4 22 | 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v5.6.0 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | 28 | - name: Install dependencies 29 | run: | 30 | python -m pip install --upgrade pip 31 | pip install 
-r requirements.txt -v 32 | 33 | - name: Run pytest 34 | run: python -m pytest 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | .vscode/ 106 | 107 | #pycharm files 108 | .idea/ -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-24.04 11 | tools: 12 | python: "3.13" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements.txt 22 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | CHANGELOG
2 | ============== 3 | 4 | ## 6.9.0 (2025-03-09) 5 | 6 | - Specify that frequency count must be 64-bit int [#180](https://github.com/mammothb/symspellpy/pull/180) 7 | - Rename `string1` and `string2` argument names [#181](https://github.com/mammothb/symspellpy/pull/181) 8 | 9 | ## 6.8.0 (2025-03-09) 10 | - Allow file object as corpus of load_dictionary [#176](https://github.com/mammothb/symspellpy/pull/176) 11 | - Bump supported Python version to 3.9 - 3.13 [#177](https://github.com/mammothb/symspellpy/pull/177) 12 | 13 | ## 6.7.8 (2024-08-31) 14 | - Handle encoding errors [#149](https://github.com/mammothb/symspellpy/pull/149) 15 | - Bump supported Python version to 3.8 - 3.12 [#151](https://github.com/mammothb/symspellpy/pull/151) 16 | - Remove numpy dependency [#156](https://github.com/mammothb/symspellpy/pull/156) 17 | - Feature: distance comparer interface [#159](https://github.com/mammothb/symspellpy/pull/159) 18 | 19 | ## 6.7.7 (2022-10-24) 20 | - Remove support for Python 3.6 21 | - Use compiled regex expression in `create_dictionary()` ([#129](https://github.com/mammothb/symspellpy/pull/129)) 22 | - Configure module logger instead of modifying root logger ([#132](https://github.com/mammothb/symspellpy/pull/132), [#133](https://github.com/mammothb/symspellpy/pull/133)) 23 | 24 | ## 6.7.6 (2021-12-19) 25 | - Fix suggestion `count` in `lookup_compound` when `ignore_words=True` ([#108](https://github.com/mammothb/symspellpy/pull/108)) 26 | - Log error message when loading dictionary fails ([#109](https://github.com/mammothb/symspellpy/pull/109)) 27 | 28 | ## 6.7.5 (2021-12-02) 29 | - Fix `replaced_words` not being updated when best match is a combi (closes [#103](https://github.com/mammothb/symspellpy/issues/103)) 30 | - Implement a way to change the edit distance comparer algorithm via `distance_algorithm` property. Available values are found in [`DistanceAlgorithm`](https://symspellpy.readthedocs.io/en/latest/api/editdistance.html#symspellpy.editdistance.DistanceAlgorithm) 31 | 32 | ## 6.7.4 (2021-11-29) 33 | - Update `editdistpy` dependency version 34 | - Update `LevenshteinFast` and `DamerauOsaFast` to match the functionality of the `editdistpy` library 35 | 36 | ## 6.7.3 (2021-11-27) 37 | - Update `editdistpy` dependency version 38 | 39 | ## 6.7.2 (2021-11-25) 40 | - Fix typo of Dameruau to Damerau in various places. Can potentially break some setups that explicitly set `_distance_algorithm` 41 | - Implement fast distance comparers with [editdistpy](https://github.com/mammothb/editdistpy) 42 | - Set `DamerauOsaFast` as the default distance comparer 43 | 44 | ## 6.7.1 (2021-11-21) 45 | - Updated `frequency_dictionary_en_82_765.txt` dictionary with common contractions 46 | - Added `_below_threshold_words`, `_bigrams`, `_count_threshold`, `_max_dictionary_edit_distance`, and `_prefix_length` when saving to pickle. (closes [#93](https://github.com/mammothb/symspellpy/issues/93)) 47 | - Implemented `to_bytes` and `from_bytes` options to save and load pickle with bytes string 48 | - Updated data_version to 3 49 | - Removed Python 3.4 and Python 3.5 support 50 | 51 | ## 6.7.0 (2020-08-28) 52 | - Removed numpy dependency 53 | - `word_segmentation` now retains/preserves case. 54 | - `word_segmentation` now keeps punctuation or apostrophe adjacent to previous 55 | word. 56 | - `word_segmentation` now normalizes ligatures: "scientiﬁc" -> "scientific". 57 | - `word_segmentation` now removes hyphens prior to word segmentation 58 | (untested). 
59 | - American English word forms added to dictionary in addition to British 60 | English e.g. favourable & favorable. 61 | 62 | ## 6.5.2 (2019-10-23) 63 | - Modified `load_bigram_dictionary` to allow dictionary entries to be split 64 | into only 2 parts when using a custom separator 65 | - Added dictionary files to wheels so `pkg_resources` could be used to access 66 | them 67 | 68 | ## 6.5.1 (2019-10-08) 69 | - Added `separator` argument to allow user to choose custom separator for `load_dictionary` 70 | 71 | ## 6.5.0 (2019-09-21) 72 | - Added `load_bigram_dictionary` and bigram dictionary `frequency_bigramdictionary_en_243_342.txt` 73 | - Updated `lookup_compound` algorithm 74 | - Added `Levenshtein` to compute edit distance 75 | - Added `save_pickle_stream` and `load_pickle_stream` to save/load SymSpell data alongside other structures (contribution by [marcoffee](https://github.com/marcoffee)) 76 | 77 | ## 6.3.9 (2019-08-06) 78 | - Added `transfer_casing` to `lookup` and `lookup_compound` 79 | - Fixed prefix length check in `_edits_prefix` 80 | 81 | ## 6.3.8 (2019-03-21) 82 | - Implemented `delete_dictionary_entry` 83 | - Improved performance by using python builtin hashing 84 | - Added versioning of the pickle 85 | 86 | ## 6.3.7 (2019-02-18) 87 | - Fixed `include_unknown` in `lookup` 88 | - Removed unused `initial_capacity` argument 89 | - Improved `_get_str_hash` performance 90 | - Implemented `save_pickle` and `load_pickle` to avoid having to create the 91 | dictionary every time 92 | 93 | ## 6.3.6 (2019-02-11) 94 | - Added `create_dictionary()` feature 95 | 96 | ## 6.3.5 (2019-01-14) 97 | - Fixed `lookup_compound()` to return the correct `distance` 98 | 99 | ## 6.3.4 (2019-01-04) 100 | - Added `replaced_words` to track number of misspelled words 101 | - Added `ignore_token` to `word_segmentation()` to ignore words matching a regular expression 102 | 103 | ## 6.3.3 (2018-12-05) 104 | - Added `word_segmentation()` feature 105 | 106 | ## 6.3.2 (2018-10-23) 107 | - Added `encoding` option to `load_dictionary()` 108 | 109 | ## 6.3.1 (2018-08-30) 110 | - Create a package for `symspellpy` 111 | 112 | ## 6.3.0 (2018-08-13) 113 | - Ported [SymSpell](https://github.com/wolfgarbe/SymSpell) v6.3 114 | -------------------------------------------------------------------------------- /INSTALL.rst: -------------------------------------------------------------------------------- 1 | ********** 2 | Installing 3 | ********** 4 | 5 | Installing an official release 6 | ============================== 7 | 8 | symspellpy and its dependencies are available as wheel packages for macOS, 9 | Windows and Linux distributions:: 10 | 11 | python -m pip install -U symspellpy 12 | 13 | **NOTE**: symspellpy has only been tested on Windows and Linux systems and is 
14 | assumed to work on macOS. 
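As a quick smoke test after installing (a minimal sketch that only checks the
package imports and constructs with default settings)::

    python -c "from symspellpy import SymSpell; SymSpell()"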
15 | 16 | Dictionary data 17 | =============== 18 | 19 | The dictionary files that are shipped with symspellpy can be accessed using 20 | `importlib.resources`:: 21 | 22 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 23 | bigram_path = importlib.resources.files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt" 24 | 25 | Alternatively, you can download the dictionary files from the repository and 26 | add them to your project directory:: 27 | 28 | curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_dictionary_en_82_765.txt 29 | curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_bigramdictionary_en_243_342.txt 30 | 31 | You could end up with a project directory layout like:: 32 | 33 | project_dir 34 | +-frequency_bigramdictionary_en_243_342.txt 35 | +-frequency_dictionary_en_82_765.txt 36 | \-project.py 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 mmb L (Python port https://github.com/mammothb/symspellpy) 4 | Copyright (c) 2021 Wolf Garbe (Original C# implementation https://github.com/wolfgarbe/SymSpell) 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | symspellpy
2 | [![PyPI version](https://badge.fury.io/py/symspellpy.svg)](https://badge.fury.io/py/symspellpy) 3 | [![Tests](https://github.com/mammothb/symspellpy/actions/workflows/tests.yml/badge.svg)](https://github.com/mammothb/symspellpy/actions/workflows/tests.yml) 4 | [![Documentation Status](https://readthedocs.org/projects/symspellpy/badge/?version=latest)](https://symspellpy.readthedocs.io/en/latest/?badge=latest) 5 | [![codecov](https://codecov.io/gh/mammothb/symspellpy/branch/master/graph/badge.svg)](https://codecov.io/gh/mammothb/symspellpy) 6 | ======== 7 | 8 | symspellpy is a Python port of [SymSpell](https://github.com/wolfgarbe/SymSpell) v6.7.2, which provides much higher speed and lower memory consumption. Unit tests 9 | from the original project are implemented to ensure the accuracy of the port. 10 | 11 | Please note that the port has not been optimized for speed. 12 | 13 | Notable Changes 14 | =============== 15 | v6.7.2: Implemented fast distance comparer with [editdistpy](https://github.com/mammothb/editdistpy). Approximately 2x speed up for usage under default settings, benchmarks found [here](https://github.com/mammothb/symspellpy/blob/master/tests/benchmarks.ipynb). 16 | 17 | Install 18 | ======= 19 | For installation instructions, see the `INSTALL.rst` file or the [install](https://symspellpy.readthedocs.io/en/latest/users/installing.html) documentation. 20 | 21 | Usage 22 | ===== 23 | Check out the [examples](https://symspellpy.readthedocs.io/en/latest/examples/index.html) provided for sample usage. 24 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | .PHONY: help Makefile 15 | 16 | # Catch-all target: route all unknown targets to Sphinx using the new 17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 18 | %: Makefile 19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | 3 | {%- block rootrellink %} 4 |
<li><a href="{{ pathto('index') }}">Home</a></li> 5 | <li><a href="{{ pathto('examples/index') }}">Examples</a></li> 6 | <li><a href="{{ pathto('api/index') }}">API</a></li> 7 | {%- endblock %} 8 | -------------------------------------------------------------------------------- /docs/api/abstract_distance_comparer.rst: -------------------------------------------------------------------------------- 1 | ************************** 2 | abstract_distance_comparer 3 | ************************** 4 | 5 | Distance comparer interface 6 | =========================== 7 | 8 | .. autoclass:: symspellpy.abstract_distance_comparer.AbstractDistanceComparer 9 | :members: 10 | -------------------------------------------------------------------------------- /docs/api/editdistance.rst: -------------------------------------------------------------------------------- 1 | ************ 2 | editdistance 3 | ************ 4 | 5 | Enum class 6 | ========== 7 | 8 | .. autoclass:: symspellpy.editdistance.DistanceAlgorithm 9 | :members: 10 | :member-order: bysource 11 | 12 | EditDistance class 13 | ================== 14 | 15 | .. autoclass:: symspellpy.editdistance.EditDistance 16 | :members: 17 | 18 | Distance comparer classes 19 | ========================= 20 | 21 | .. autoclass:: symspellpy.editdistance.DamerauOsa 22 | :members: 23 | 24 | .. autoclass:: symspellpy.editdistance.Levenshtein 25 | :members: 26 | 27 | .. autoclass:: symspellpy.editdistance.DamerauOsaFast 28 | :members: 29 | 30 | .. autoclass:: symspellpy.editdistance.LevenshteinFast 31 | :members: 32 | -------------------------------------------------------------------------------- /docs/api/helpers.rst: -------------------------------------------------------------------------------- 1 | ******* 2 | helpers 3 | ******* 4 | 5 | Helpers for `editdistance` 6 | ========================== 7 | 8 | .. autofunction:: symspellpy.helpers.null_distance_results 9 | 10 | .. autofunction:: symspellpy.helpers.prefix_suffix_prep 11 | 12 | Helpers for `symspellpy` 13 | ======================== 14 | 15 | .. autoclass:: symspellpy.helpers.DictIO 16 | 17 | .. autofunction:: symspellpy.helpers.case_transfer_matching 18 | 19 | .. autofunction:: symspellpy.helpers.case_transfer_similar 20 | 21 | .. autofunction:: symspellpy.helpers.increment_count 22 | 23 | .. autofunction:: symspellpy.helpers.is_acronym 24 | 25 | .. autofunction:: symspellpy.helpers.parse_words 26 | 27 | .. autofunction:: symspellpy.helpers.try_parse_int64 28 | 29 | Misc 30 | ==== 31 | 32 | .. autofunction:: symspellpy.helpers.to_similarity 33 | 34 | -------------------------------------------------------------------------------- /docs/api/index.rst: -------------------------------------------------------------------------------- 1 | ************ 2 | API Overview 3 | ************ 4 | 5 | Modules 6 | ======= 7 | 8 | .. only:: html 9 | 10 | .. toctree:: 11 | :maxdepth: 2 12 | 13 | helpers.rst 14 | abstract_distance_comparer.rst 15 | editdistance.rst 16 | symspellpy.rst 17 | -------------------------------------------------------------------------------- /docs/api/symspellpy.rst: -------------------------------------------------------------------------------- 1 | ********** 2 | symspellpy 3 | ********** 4 | 5 | Enum class 6 | ========== 7 | 8 | .. autoclass:: symspellpy.verbosity.Verbosity 9 | :members: 10 | :member-order: bysource 11 | 12 | Data class 13 | ========== 14 | 15 | .. autoclass:: symspellpy.suggest_item.SuggestItem 16 | :members: 17 | :special-members: __eq__, __lt__, __str__ 18 | 19 | .. 
autoclass:: symspellpy.composition.Composition 20 | :members: 21 | :exclude-members: corrected_string, distance_sum, log_prob_sum, segmented_string 22 | 23 | Utility class 24 | ============= 25 | 26 | .. autoclass:: symspellpy.pickle_mixin.PickleMixin 27 | :members: 28 | :private-members: 29 | 30 | SymSpell 31 | ======== 32 | 33 | .. autoclass:: symspellpy.symspellpy.SymSpell 34 | :members: 35 | :private-members: 36 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | 15 | import os.path 16 | import sys 17 | 18 | sys.path.insert(0, os.path.abspath("..")) 19 | 20 | from pathlib import Path 21 | 22 | import tomllib 23 | 24 | # -- Project information ----------------------------------------------------- 25 | 26 | project = "symspellpy" 27 | copyright = "2025, mmb L, Wolf Garbe" 28 | author = "mmb L, Wolf Garbe" 29 | 30 | # The short X.Y version 31 | version = "" 32 | # The full version, including alpha/beta/rc tags 33 | with open(Path(__file__).parents[1] / "pyproject.toml", "rb") as infile: 34 | data = tomllib.load(infile) 35 | release = data["project"]["version"] 36 | 37 | 38 | # -- General configuration --------------------------------------------------- 39 | 40 | # If your documentation needs a minimal Sphinx version, state it here. 41 | # 42 | # needs_sphinx = '1.0' 43 | 44 | # Add any Sphinx extension module names here, as strings. They can be 45 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 46 | # ones. 47 | extensions = [ 48 | "sphinx.ext.autodoc", 49 | "sphinx.ext.napoleon", 50 | "sphinx.ext.viewcode", 51 | "sphinx_autodoc_typehints", 52 | ] 53 | # numpydoc_class_members_toctree = False 54 | # numpydoc_show_inherited_class_members = False 55 | highlight_language = "none" 56 | 57 | # Add any paths that contain templates here, relative to this directory. 58 | templates_path = ["_templates"] 59 | 60 | # The suffix(es) of source filenames. 61 | # You can specify multiple suffix as a list of string: 62 | # 63 | # source_suffix = ['.rst', '.md'] 64 | source_suffix = ".rst" 65 | 66 | # The master toctree document. 67 | master_doc = "index" 68 | 69 | # The language for content autogenerated by Sphinx. Refer to documentation 70 | # for a list of supported languages. 71 | # 72 | # This is also used if you do content translation via gettext catalogs. 73 | # Usually you set "language" from the command line for these cases. 74 | language = "en" 75 | 76 | # List of patterns, relative to source directory, that match files and 77 | # directories to ignore when looking for source files. 78 | # This pattern also affects html_static_path and html_extra_path. 79 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 80 | 81 | # The name of the Pygments (syntax highlighting) style to use. 
82 | pygments_style = None 83 | 84 | 85 | # -- Options for HTML output ------------------------------------------------- 86 | 87 | # The theme to use for HTML and HTML Help pages. See the documentation for 88 | # a list of builtin themes. 89 | # 90 | html_theme = "sphinxdoc" 91 | 92 | # Theme options are theme-specific and customize the look and feel of a theme 93 | # further. For a list of options available for each theme, see the 94 | # documentation. 95 | # 96 | # html_theme_options = {} 97 | 98 | # Add any paths that contain custom static files (such as style sheets) here, 99 | # relative to this directory. They are copied after the builtin static files, 100 | # so a file named "default.css" will overwrite the builtin "default.css". 101 | # html_static_path = ["_static"] 102 | html_static_path = [] 103 | 104 | # Custom sidebar templates, must be a dictionary that maps document names 105 | # to template names. 106 | # 107 | # The default sidebars (for documents that don't match any pattern) are 108 | # defined by theme itself. Builtin themes are using these templates by 109 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 110 | # 'searchbox.html']``. 111 | # 112 | html_sidebars = {"**": ["globaltoc.html", "searchbox.html"]} 113 | 114 | 115 | # -- Options for HTMLHelp output --------------------------------------------- 116 | 117 | # Output file base name for HTML help builder. 118 | htmlhelp_basename = "symspellpydoc" 119 | -------------------------------------------------------------------------------- /docs/examples/custom_distance_comparer.rst: -------------------------------------------------------------------------------- 1 | ************************ 2 | Custom distance comparer 3 | ************************ 4 | 5 | Basic usage 6 | =========== 7 | 8 | Create a comparer class which satisfies the interface specified by 9 | :class:`~symspellpy.abstract_distance_comparer.AbstractDistanceComparer`: 10 | 11 | .. code-block:: python 12 | 13 | import importlib.resources 14 | from itertools import islice 15 | 16 | from symspellpy import SymSpell 17 | from symspellpy.abstract_distance_comparer import AbstractDistanceComparer 18 | from symspellpy.editdistance import DistanceAlgorithm, EditDistance 19 | 20 | class CustomComparer(AbstractDistanceComparer): 21 | def distance(self, string_1, string_2, max_distance): 22 | distance = abs(len(string_1) - len(string_2))  # illustrative only: length difference as the distance 23 | return -1 if distance > max_distance else distance 24 | 25 | custom_comparer = EditDistance(DistanceAlgorithm.USER_PROVIDED, CustomComparer()) 26 | sym_spell = SymSpell(distance_comparer=custom_comparer) 27 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt" 28 | sym_spell.load_bigram_dictionary(dictionary_path, 0, 2) 29 | 30 | # Print out first 5 elements to demonstrate that dictionary is 31 | # successfully loaded 32 | print(list(islice(sym_spell.bigrams.items(), 5))) 33 | -------------------------------------------------------------------------------- /docs/examples/dictionary.rst: -------------------------------------------------------------------------------- 1 | ********** 2 | Dictionary 3 | ********** 4 | 5 | Load frequency dictionary 6 | ========================= 7 | 8 | `load_dictionary` 9 | ----------------- 10 | 11 | Given a dictionary file like:: 12 | 13 | <term> <count> 14 | <term> <count> 15 | ... 16 | <term> <count> 17 | 18 | We can use :meth:`~symspellpy.symspellpy.SymSpell.load_dictionary`: 19 | 20 | .. 
code-block:: python 21 | :emphasize-lines: 8 22 | 23 | import importlib.resources 24 | from itertools import islice 25 | 26 | from symspellpy import SymSpell 27 | 28 | sym_spell = SymSpell() 29 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 30 | sym_spell.load_dictionary(dictionary_path, 0, 1) 31 | 32 | # Print out first 5 elements to demonstrate that dictionary is 33 | # successfully loaded 34 | print(list(islice(sym_spell.words.items(), 5))) 35 | 36 | Output:: 37 | 38 | [('the', 23135851162), ('of', 13151942776), ('and', 12997637966), ('to', 12136980858), ('a', 9081174698)] 39 | 40 | `load_bigram_dictionary` 41 | ------------------------ 42 | 43 | Given a bigram dictionary file like:: 44 | 45 | <term_part_1> <term_part_2> <count> 46 | <term_part_1> <term_part_2> <count> 47 | ... 48 | <term_part_1> <term_part_2> <count> 49 | 50 | We can use :meth:`~symspellpy.symspellpy.SymSpell.load_bigram_dictionary`: 51 | 52 | .. code-block:: python 53 | :emphasize-lines: 8 54 | 55 | import importlib.resources 56 | from itertools import islice 57 | 58 | from symspellpy import SymSpell 59 | 60 | sym_spell = SymSpell() 61 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt" 62 | sym_spell.load_bigram_dictionary(dictionary_path, 0, 2) 63 | 64 | # Print out first 5 elements to demonstrate that dictionary is 65 | # successfully loaded 66 | print(list(islice(sym_spell.bigrams.items(), 5))) 67 | 68 | Output:: 69 | 70 | [('abcs of', 10956800), ('aaron and', 10721728), ('abbott and', 7861376), ('abbreviations and', 13518272), ('aberdeen and', 7347776)] 71 | 72 | Load frequency dictionary with custom separator 73 | =============================================== 74 | 75 | `load_dictionary` 76 | ----------------- 77 | 78 | It is also possible to specify a custom `separator` so that dictionaries can 79 | contain space separated terms. For example, given a dictionary file like:: 80 | 81 | the$23135851162 82 | abcs of$10956800 83 | of$13151942776 84 | aaron and$10721728 85 | abbott and$7861376 86 | abbreviations and$13518272 87 | aberdeen and$7347776 88 | 89 | We can specify "$" as the custom `separator` in 90 | :meth:`~symspellpy.symspellpy.SymSpell.load_dictionary` like: 91 | 92 | .. code-block:: python 93 | :emphasize-lines: 7 94 | 95 | from itertools import islice 96 | 97 | from symspellpy import SymSpell 98 | 99 | sym_spell = SymSpell() 100 | dictionary_path = <path/to/dictionary> 101 | sym_spell.load_dictionary(dictionary_path, 0, 1, separator="$") 102 | 103 | # Print out first 5 elements to demonstrate that dictionary is 104 | # successfully loaded 105 | print(list(islice(sym_spell.words.items(), 5))) 106 | 107 | Output:: 108 | 109 | [('the', 23135851162), ('abcs of', 10956800), ('of', 13151942776), ('aaron and', 10721728), ('abbott and', 7861376)] 110 | 111 | Note that space separated terms such as "abcs of", "aaron and", and 112 | "abbott and" can now be found in `words` instead of `bigrams`. 113 | 114 | `load_bigram_dictionary` 115 | ------------------------ 116 | 117 | We can also specify "$" as the custom `separator` in 118 | :meth:`~symspellpy.symspellpy.SymSpell.load_bigram_dictionary` like 119 | (note that we changed `count_index` from 2 to 1): 120 | 121 | .. 
code-block:: python 122 | :emphasize-lines: 7 123 | 124 | from itertools import islice 125 | 126 | from symspellpy import SymSpell 127 | 128 | sym_spell = SymSpell() 129 | dictionary_path = <path/to/dictionary> 130 | sym_spell.load_bigram_dictionary(dictionary_path, 0, 1, separator="$") 131 | 132 | # Print out first 5 elements to demonstrate that dictionary is 133 | # successfully loaded 134 | print(list(islice(sym_spell.bigrams.items(), 5))) 135 | 136 | Output:: 137 | 138 | [('the', 23135851162), ('abcs of', 10956800), ('of', 13151942776), ('aaron and', 10721728), ('abbott and', 7861376)] 139 | 140 | Note that `bigrams` now **erroneously** contains monograms. Precautions 141 | should be taken when creating a bigram dictionary with a custom separator. 142 | 143 | Create dictionary from plain text file 144 | ====================================== 145 | 146 | Given a plain text file like:: 147 | 148 | abc abc-def abc_def abc'def abc qwe qwe1 1qwe q1we 1234 1234 149 | 150 | We can create a dictionary from the file using 151 | :meth:`~symspellpy.symspellpy.SymSpell.create_dictionary` like: 152 | 153 | .. code-block:: python 154 | :emphasize-lines: 5 155 | 156 | from symspellpy import SymSpell 157 | 158 | sym_spell = SymSpell() 159 | corpus_path = <path/to/plain/text/file> 160 | sym_spell.create_dictionary(corpus_path) 161 | 162 | print(sym_spell.words) 163 | 164 | Output:: 165 | 166 | {'abc': 4, 'def': 2, "abc'def": 1, 'qwe': 1, 'qwe1': 1, '1qwe': 1, 'q1we': 1, '1234': 2} 167 | 168 | Note that :meth:`~symspellpy.symspellpy.SymSpell.create_dictionary` did not 169 | split words at apostrophes and did not check if the words contained numbers. 170 | -------------------------------------------------------------------------------- /docs/examples/index.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Examples 3 | ======== 4 | 5 | .. only:: html 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | 10 | dictionary.rst 11 | custom_distance_comparer.rst 12 | lookup.rst 13 | lookup_compound.rst 14 | word_segmentation.rst 15 | -------------------------------------------------------------------------------- /docs/examples/lookup.rst: -------------------------------------------------------------------------------- 1 | ****** 2 | lookup 3 | ****** 4 | 5 | Basic usage 6 | =========== 7 | 8 | .. code-block:: python 9 | :emphasize-lines: 15 10 | 11 | import importlib.resources 12 | 13 | from symspellpy import SymSpell, Verbosity 14 | 15 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) 16 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 17 | # term_index is the column of the term and count_index is the 18 | # column of the term frequency 19 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) 20 | 21 | # lookup suggestions for single-word input strings 22 | input_term = "memebers" # misspelling of "members" 23 | # max edit distance per lookup 24 | # (max_edit_distance_lookup <= max_dictionary_edit_distance) 25 | suggestions = sym_spell.lookup(input_term, Verbosity.CLOSEST, max_edit_distance=2) 26 | # display suggestion term, edit distance, and term frequency 27 | for suggestion in suggestions: 28 | print(suggestion) 29 | 30 | Output:: 31 | 32 | members, 1, 226656153 33 | 34 | Return original word if no correction within edit distance is found 35 | =================================================================== 36 | 37 | .. 
code-block:: python 38 | :emphasize-lines: 15,16,17 39 | 40 | import importlib.resources 41 | 42 | from symspellpy import SymSpell, Verbosity 43 | 44 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) 45 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 46 | # term_index is the column of the term and count_index is the 47 | # column of the term frequency 48 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) 49 | 50 | # lookup suggestions for single-word input strings 51 | input_term = "apastraphee" # misspelling of "apostrophe" 52 | # max edit distance per lookup 53 | # (max_edit_distance_lookup <= max_dictionary_edit_distance) 54 | suggestions = sym_spell.lookup( 55 | input_term, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True 56 | ) 57 | # display suggestion term, edit distance, and term frequency 58 | for suggestion in suggestions: 59 | print(suggestion) 60 | 61 | Output:: 62 | 63 | apastraphee, 3, 0 64 | 65 | Note that `suggestions` would have been empty if `include_unknown` was 66 | `False`. 67 | 68 | Avoid correcting phrases matching regex 69 | ======================================= 70 | 71 | .. code-block:: python 72 | :emphasize-lines: 14,15,16 73 | 74 | import importlib.resources 75 | 76 | from symspellpy import SymSpell, Verbosity 77 | 78 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) 79 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 80 | # term_index is the column of the term and count_index is the column of the term frequency 81 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) 82 | 83 | # lookup suggestions for single-word input strings 84 | input_term = "members1" 85 | # max edit distance per lookup 86 | # (max_edit_distance_lookup <= max_dictionary_edit_distance) 87 | suggestions = sym_spell.lookup( 88 | input_term, Verbosity.CLOSEST, max_edit_distance=2, ignore_token=r"\w+\d" 89 | ) 90 | # display suggestion term, edit distance, and term frequency 91 | for suggestion in suggestions: 92 | print(suggestion) 93 | 94 | Output:: 95 | 96 | members1, 0, 1 97 | 98 | Note that `members, 1, 226656153` would be returned if `ignore_token` wasn't 99 | specified. 100 | 101 | Keep original casing 102 | ==================== 103 | 104 | .. code-block:: python 105 | :emphasize-lines: 15,16,17 106 | 107 | import importlib.resources 108 | 109 | from symspellpy import SymSpell, Verbosity 110 | 111 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) 112 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 113 | # term_index is the column of the term and count_index is the 114 | # column of the term frequency 115 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) 116 | 117 | # lookup suggestions for single-word input strings 118 | input_term = "mEmEbers" 119 | # max edit distance per lookup 120 | # (max_edit_distance_lookup <= max_dictionary_edit_distance) 121 | suggestions = sym_spell.lookup( 122 | input_term, Verbosity.CLOSEST, max_edit_distance=2, transfer_casing=True 123 | ) 124 | # display suggestion term, edit distance, and term frequency 125 | for suggestion in suggestions: 126 | print(suggestion) 127 | 128 | 129 | Output:: 130 | 131 | mEmbers, 1, 226656153 132 | 133 | Note that the uppercase of the second "E" was not passed on to "b" in the 134 | corrected word. 
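List all suggestions within the maximum edit distance
=====================================================

`Verbosity.CLOSEST` returns only the suggestions with the smallest edit
distance found. The sketch below reuses the basic-usage setup but passes
`Verbosity.ALL`, which returns every dictionary match within
`max_edit_distance` (the exact suggestion list depends on the loaded
dictionary):

.. code-block:: python

    import importlib.resources

    from symspellpy import SymSpell, Verbosity

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt"
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

    input_term = "memebers"  # misspelling of "members"
    # Verbosity.ALL returns all matches within max_edit_distance,
    # not just the closest ones
    suggestions = sym_spell.lookup(input_term, Verbosity.ALL, max_edit_distance=2)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print(suggestion)

Suggestions are ordered by edit distance and then by term frequency, so the
closest and most frequent candidates come first.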
135 | -------------------------------------------------------------------------------- /docs/examples/lookup_compound.rst: -------------------------------------------------------------------------------- 1 | *************** 2 | lookup_compound 3 | *************** 4 | 5 | Basic usage 6 | =========== 7 | 8 | .. code-block:: python 9 | :emphasize-lines: 20 10 | 11 | import importlib.resources 12 | 13 | from symspellpy import SymSpell 14 | 15 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) 16 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 17 | bigram_path = importlib.resources.files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt" 18 | # term_index is the column of the term and count_index is the 19 | # column of the term frequency 20 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) 21 | sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2) 22 | 23 | # lookup suggestions for multi-word input strings (supports compound 24 | # splitting & merging) 25 | input_term = ( 26 | "whereis th elove hehad dated forImuch of thepast who " 27 | "couqdn'tread in sixtgrade and ins pired him" 28 | ) 29 | # max edit distance per lookup (per single word, not per whole input string) 30 | suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2) 31 | # display suggestion term, edit distance, and term frequency 32 | for suggestion in suggestions: 33 | print(suggestion) 34 | 35 | Output:: 36 | 37 | where is the love he had dated for much of the past who couldn't read in six grade and inspired him, 9, 0 38 | 39 | Keep original casing 40 | ==================== 41 | 42 | .. code-block:: python 43 | :emphasize-lines: 20,21,22 44 | 45 | import importlib.resources 46 | 47 | from symspellpy import SymSpell 48 | 49 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7) 50 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 51 | bigram_path = importlib.resources.files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt" 52 | # term_index is the column of the term and count_index is the 53 | # column of the term frequency 54 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) 55 | sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2) 56 | 57 | # lookup suggestions for multi-word input strings (supports compound 58 | # splitting & merging) 59 | input_term = ( 60 | "whereis th elove heHAd dated forImuch of thEPast who " 61 | "couqdn'tread in sixtgrade and ins pired him" 62 | ) 63 | # max edit distance per lookup (per single word, not per whole input string) 64 | suggestions = sym_spell.lookup_compound( 65 | input_term, max_edit_distance=2, transfer_casing=True 66 | ) 67 | # display suggestion term, edit distance, and term frequency 68 | for suggestion in suggestions: 69 | print(suggestion) 70 | 71 | Output:: 72 | 73 | where is the love he HAd dated for much of thE Past who couldn't read in six grade and inspired him, 9, 0 74 | -------------------------------------------------------------------------------- /docs/examples/word_segmentation.rst: -------------------------------------------------------------------------------- 1 | ***************** 2 | word_segmentation 3 | ***************** 4 | 5 | Basic usage 6 | =========== 7 | 8 | .. 
code-block:: python 9 | :emphasize-lines: 14 10 | 11 | import importlib.resources 12 | 13 | from symspellpy.symspellpy import SymSpell 14 | 15 | # Set max_dictionary_edit_distance to avoid spelling correction 16 | sym_spell = SymSpell(max_dictionary_edit_distance=0, prefix_length=7) 17 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 18 | # term_index is the column of the term and count_index is the 19 | # column of the term frequency 20 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1) 21 | 22 | # a sentence without any spaces 23 | input_term = "thequickbrownfoxjumpsoverthelazydog" 24 | result = sym_spell.word_segmentation(input_term) 25 | print(f"{result.corrected_string}, {result.distance_sum}, {result.log_prob_sum}") 26 | 27 | Output:: 28 | 29 | the quick brown fox jumps over the lazy dog, 8, -34.491167981910635 30 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. symspellpy documentation master file, created by 2 | sphinx-quickstart on Tue Feb 19 09:03:54 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | :orphan: 7 | 8 | .. title:: symspellpy: a SymSpell Python port 9 | 10 | .. toctree:: 11 | :hidden: 12 | 13 | users/installing 14 | examples/index 15 | api/index 16 | 17 | ********** 18 | symspellpy 19 | ********** 20 | 21 | symspellpy is a Python port of SymSpell_ v6.7.2, a Symmetric Delete 22 | spelling correction algorithm which provides much higher speed and lower 23 | memory consumption. 24 | 25 | .. _SymSpell: https://github.com/wolfgarbe/SymSpell 26 | 27 | Unit tests from the original project are implemented to ensure the accuracy 28 | of the port. Please note that the port has tried to replicate the code 29 | structure of the original project and has not been optimized for speed. 30 | 31 | Installation 32 | ============ 33 | 34 | Visit the :doc:`symspellpy installation instructions </users/installing>`. 35 | 36 | Usage examples 37 | ============== 38 | 39 | Check out :doc:`examples </examples/index>` to learn how to use symspellpy. 40 | 41 | Documentation 42 | ============= 43 | 44 | Check out the :doc:`documentation </api/index>`. 45 | 46 | Indices and tables 47 | ------------------ 48 | 49 | * :ref:`genindex` 50 | * :ref:`modindex` 51 | * :ref:`search` 52 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | editdistpy>=0.1.3 2 | numpydoc==1.8.0 3 | sphinx==8.2.3 4 | sphinx-autodoc-typehints==3.1.0 5 | -------------------------------------------------------------------------------- /docs/users/installing.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../../INSTALL.rst 2 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "symspellpy" 7 | version = "6.9.0" 8 | dependencies = [ 9 | "editdistpy>=0.1.3", 10 | ] 11 | requires-python = ">=3.9" 12 | authors = [ 13 | {name = "mmb L"}, 14 | ] 15 | description = "Python SymSpell" 16 | readme = "README.md" 17 | license = {file = "LICENSE"} 18 | keywords = ["spellchecker", "symspell", "word segmentation"] 19 | classifiers = [ 20 | "Development Status :: 4 - Beta", 21 | "Intended Audience :: Developers", 22 | "Intended Audience :: Education", 23 | "Natural Language :: English", 24 | "License :: OSI Approved :: MIT License", 25 | "Programming Language :: Python", 26 | "Programming Language :: Python :: 3", 27 | "Programming Language :: Python :: 3.9", 28 | "Programming Language :: Python :: 3.10", 29 | "Programming Language :: Python :: 3.11", 30 | "Programming Language :: Python :: 3.12", 31 | "Programming Language :: Python :: 3.13", 32 | ] 33 | 34 | [project.urls] 35 | Repository = "https://github.com/mammothb/symspellpy" 36 | Documentation = "https://symspellpy.readthedocs.io/en/latest" 37 | Changelog = "https://github.com/mammothb/symspellpy/blob/master/CHANGELOG.md" 38 | 39 | [tool.basedpyright] 40 | ignore = ["tests"] 41 | pythonVersion = "3.9" 42 | 43 | reportUnusedCallResult = "none" 44 | 45 | [tool.ruff] 46 | line-length = 88 47 | indent-width = 4 48 | 49 | [tool.ruff.format] 50 | docstring-code-format = false 51 | indent-style = "space" 52 | line-ending = "auto" 53 | quote-style = "double" 54 | skip-magic-trailing-comma = false 55 | 56 | [tool.setuptools.dynamic] 57 | version = {attr = "symspellpy.__version__"} 58 | 59 | [tool.setuptools.packages.find] 60 | where = ["."] 61 | include = ["symspellpy"] 62 | 63 | [tool.setuptools.package-data] 64 | symspellpy = ["frequency_*.txt"] 65 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | editdistpy>=0.1.3 2 | 3 | # For testing 4 | importlib-resources>=6.3.2 5 | pytest==8.3.4 6 | pytest-cov==6.0.0 7 | -------------------------------------------------------------------------------- /symspellpy/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2025 mmb L (Python port) 4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation) 5 | # 6 | # Permission is hereby granted, free of 
charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | 16 | """symspellpy 17 | 18 | .. moduleauthor:: mmb L 19 | .. moduleauthor:: Wolf Garbe 20 | """ 21 | 22 | from . import editdistance, helpers, logging 23 | from .symspellpy import SymSpell 24 | from .verbosity import Verbosity 25 | -------------------------------------------------------------------------------- /symspellpy/abstract_distance_comparer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Optional 3 | 4 | 5 | class AbstractDistanceComparer(ABC): 6 | """An interface to compute relative distance between two strings.""" 7 | 8 | @abstractmethod 9 | def distance( 10 | self, string_1: Optional[str], string_2: Optional[str], max_distance: int 11 | ) -> int: 12 | """Returns a measure of the distance between two strings. 13 | 14 | Args: 15 | string_1: One of the strings to compare. 16 | string_2: The other string to compare. 17 | max_distance: The maximum distance that is of interest. 18 | 19 | Returns: 20 | -1 if the distance is greater than the max_distance, 0 if the strings 21 | are equivalent, otherwise a positive number whose magnitude 22 | increases as difference between the strings increases. 23 | """ 24 | -------------------------------------------------------------------------------- /symspellpy/composition.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2025 mmb L (Python port) 4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation) 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | 16 | """ 17 | .. module:: composition 18 | :synopsis: Data class for :meth:`symspellpy.symspellpy.word_segmentation`. 19 | """ 20 | 21 | from typing import NamedTuple 22 | 23 | 24 | class Composition(NamedTuple): 25 | """Used by :meth:`word_segmentation`. 26 | 27 | Attributes: 28 | segmented_string: The word segmented string. 29 | corrected_string: The spelling corrected string. 30 | distance_sum: The sum of edit distance between input string and 31 | corrected string. 32 | log_prob_sum: The sum of word occurrence probabilities in log 33 | scale (a measure of how common and probable the corrected 34 | segmentation is). 
35 | """ 36 | 37 | segmented_string: str = "" 38 | corrected_string: str = "" 39 | distance_sum: int = 0 40 | log_prob_sum: float = 0 41 | 42 | @classmethod 43 | def create( 44 | cls, 45 | composition: "Composition", 46 | segmented_part: str, 47 | corrected_part: str, 48 | distance: int, 49 | log_prob: float, 50 | ) -> "Composition": 51 | """Creates a Composition by appending to an existing Composition.""" 52 | return cls( 53 | composition.segmented_string + segmented_part, 54 | composition.corrected_string + corrected_part, 55 | composition.distance_sum + distance, 56 | composition.log_prob_sum + log_prob, 57 | ) 58 | -------------------------------------------------------------------------------- /symspellpy/editdistance.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2025 mmb L (Python port) 4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation) 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | 16 | """ 17 | .. module:: editdistance 18 | :synopsis: Module for edit distance algorithms. 19 | """ 20 | 21 | import warnings 22 | from enum import Enum 23 | from typing import Optional 24 | 25 | from editdistpy import damerau_osa, levenshtein 26 | 27 | from symspellpy import helpers 28 | from symspellpy.abstract_distance_comparer import AbstractDistanceComparer 29 | 30 | 31 | class DistanceAlgorithm(Enum): 32 | """Supported edit distance algorithms.""" 33 | 34 | LEVENSHTEIN = 0 #: Levenshtein algorithm. 35 | DAMERAU_OSA = 1 #: Damerau optimal string alignment algorithm 36 | LEVENSHTEIN_FAST = 2 #: Fast Levenshtein algorithm. 37 | DAMERAU_OSA_FAST = 3 #: Fast Damerau optimal string alignment algorithm 38 | USER_PROVIDED = 4 #: User provided custom edit distance algorithm 39 | 40 | 41 | class EditDistance: 42 | """Edit distance algorithms. 43 | 44 | Args: 45 | algorithm: The distance algorithm to use. 46 | 47 | Attributes: 48 | _algorithm (:class:`DistanceAlgorithm`): The edit distance algorithm to 49 | use. 50 | _distance_comparer (:class:`AbstractDistanceComparer`): An object to 51 | compute the relative distance between two strings. The concrete 52 | object will be chosen based on the value of :attr:`_algorithm`. 53 | 54 | Raises: 55 | ValueError: If `algorithm` specifies an invalid distance algorithm. 56 | """ 57 | 58 | def __init__( 59 | self, 60 | algorithm: DistanceAlgorithm, 61 | comparer: Optional[AbstractDistanceComparer] = None, 62 | ) -> None: 63 | if algorithm != DistanceAlgorithm.USER_PROVIDED and comparer is not None: 64 | warnings.warn( 65 | f"A comparer is passed in but algorithm is not {DistanceAlgorithm.USER_PROVIDED.value}. A built-in comparer will be used." 
66 | ) 67 | 68 | self._distance_comparer: AbstractDistanceComparer 69 | self._algorithm = algorithm 70 | if algorithm == DistanceAlgorithm.LEVENSHTEIN: 71 | self._distance_comparer = Levenshtein() 72 | elif algorithm == DistanceAlgorithm.DAMERAU_OSA: 73 | self._distance_comparer = DamerauOsa() 74 | elif algorithm == DistanceAlgorithm.LEVENSHTEIN_FAST: 75 | self._distance_comparer = LevenshteinFast() 76 | elif algorithm == DistanceAlgorithm.DAMERAU_OSA_FAST: 77 | self._distance_comparer = DamerauOsaFast() 78 | elif algorithm == DistanceAlgorithm.USER_PROVIDED: 79 | if not isinstance(comparer, AbstractDistanceComparer): 80 | raise ValueError( 81 | f"{algorithm.value} selected but no comparer passed in." 82 | ) 83 | self._distance_comparer = comparer 84 | else: 85 | raise ValueError("unknown distance algorithm") 86 | 87 | def compare(self, string_1: str, string_2: str, max_distance: int) -> int: 88 | """Compares a string to the base string to determine the edit distance, 89 | using the previously selected algorithm. 90 | 91 | Args: 92 | string_1: Base string. 93 | string_2: The string to compare. 94 | max_distance: The maximum distance allowed. 95 | 96 | Returns: 97 | The edit distance (or -1 if `max_distance` exceeded). 98 | """ 99 | return self._distance_comparer.distance(string_1, string_2, max_distance) 100 | 101 | 102 | class Levenshtein(AbstractDistanceComparer): 103 | """Provides Levenshtein algorithm for computing edit distance metric between 104 | two strings. 105 | 106 | Attributes: 107 | _base_char_1_costs (list[int]): 108 | """ 109 | 110 | def __init__(self): 111 | self._base_char_1_costs: list[int] = [] 112 | 113 | def distance( 114 | self, string_1: Optional[str], string_2: Optional[str], max_distance: int 115 | ) -> int: 116 | """Computes the Levenshtein edit distance between two strings. 117 | 118 | Args: 119 | string_1: One of the strings to compare. 120 | string_2: The other string to compare. 121 | max_distance: The maximum distance that is of interest. 122 | 123 | Returns: 124 | -1 if the distance is greater than the max_distance, 0 if the strings 125 | are equivalent, otherwise a positive number whose magnitude 126 | increases as difference between the strings increases. 127 | """ 128 | if string_1 is None or string_2 is None: 129 | return helpers.null_distance_results(string_1, string_2, max_distance) 130 | if max_distance <= 0: 131 | return 0 if string_1 == string_2 else -1 132 | max_distance = int(min(2**31 - 1, max_distance)) 133 | # if strings of different lengths, ensure shorter string is in string_1. 134 | # This can result in a little faster speed by spending more time spinning 135 | # just the inner loop during the main processing. 
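        # e.g. distance("abcdef", "abc", 2) swaps the inputs so string_1 is
        # "abc"; the length-difference check below then returns -1 early,
        # since 6 - 3 > 2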
136 | if len(string_1) > len(string_2): 137 | string_2, string_1 = string_1, string_2 138 | if len(string_2) - len(string_1) > max_distance: 139 | return -1 140 | # identify common suffix and/or prefix that can be ignored 141 | len_1, len_2, start = helpers.prefix_suffix_prep(string_1, string_2) 142 | if len_1 == 0: 143 | return len_2 if len_2 <= max_distance else -1 144 | 145 | if len_2 > len(self._base_char_1_costs): 146 | self._base_char_1_costs = [0 for _ in range(len_2)] 147 | if max_distance < len_2: 148 | return self._distance_max( 149 | string_1, 150 | string_2, 151 | len_1, 152 | len_2, 153 | start, 154 | max_distance, 155 | self._base_char_1_costs, 156 | ) 157 | return self._distance( 158 | string_1, string_2, len_1, len_2, start, self._base_char_1_costs 159 | ) 160 | 161 | @staticmethod 162 | def _distance( 163 | string_1: str, 164 | string_2: str, 165 | len_1: int, 166 | len_2: int, 167 | start: int, 168 | char_1_costs: list[int], 169 | ) -> int: 170 | """Internal implementation of the core Levenshtein algorithm. 171 | 172 | **From**: https://github.com/softwx/SoftWx.Match 173 | """ 174 | char_1_costs = [j + 1 for j in range(len_2)] 175 | current_cost = 0 176 | for i in range(len_1): 177 | left_char_cost = above_char_cost = i 178 | char_1 = string_1[start + i] 179 | for j in range(len_2): 180 | # cost of diagonal (substitution) 181 | current_cost = left_char_cost 182 | left_char_cost = char_1_costs[j] 183 | if string_2[start + j] != char_1: 184 | # substitution if neither of the two conditions below 185 | if above_char_cost < current_cost: 186 | current_cost = above_char_cost 187 | if left_char_cost < current_cost: 188 | current_cost = left_char_cost 189 | current_cost += 1 190 | char_1_costs[j] = above_char_cost = current_cost 191 | return current_cost 192 | 193 | @staticmethod 194 | def _distance_max( 195 | string_1: str, 196 | string_2: str, 197 | len_1: int, 198 | len_2: int, 199 | start: int, 200 | max_distance: int, 201 | char_1_costs: list[int], 202 | ) -> int: 203 | """Internal implementation of the core Levenshtein algorithm that accepts 204 | a max_distance. 
205 | 206 | **From**: https://github.com/softwx/SoftWx.Match 207 | """ 208 | char_1_costs = [ 209 | j + 1 if j < max_distance else max_distance + 1 for j in range(len_2) 210 | ] 211 | len_diff = len_2 - len_1 212 | j_start_offset = max_distance - len_diff 213 | j_start = 0 214 | j_end = max_distance 215 | current_cost = 0 216 | for i in range(len_1): 217 | char_1 = string_1[start + i] 218 | prev_char_1_cost = above_char_cost = i 219 | # no need to look beyond window of lower right diagonal - 220 | # max_distance cells (lower right diag is i - lenDiff) and the upper 221 | # left diagonal + max_distance cells (upper left is i) 222 | j_start += 1 if i > j_start_offset else 0 223 | j_end += 1 if j_end < len_2 else 0 224 | for j in range(j_start, j_end): 225 | # cost of diagonal (substitution) 226 | current_cost = prev_char_1_cost 227 | prev_char_1_cost = char_1_costs[j] 228 | if string_2[start + j] != char_1: 229 | # substitution if neither of the two conditions below 230 | if above_char_cost < current_cost: 231 | current_cost = above_char_cost 232 | if prev_char_1_cost < current_cost: 233 | current_cost = prev_char_1_cost 234 | current_cost += 1 235 | char_1_costs[j] = above_char_cost = current_cost 236 | if char_1_costs[i + len_diff] > max_distance: 237 | return -1 238 | return current_cost if current_cost <= max_distance else -1 239 | 240 | 241 | class DamerauOsa(AbstractDistanceComparer): 242 | """Provides optimized methods for computing Damerau-Levenshtein Optimal 243 | String Alignment (OSA) comparisons between two strings. 244 | 245 | Attributes: 246 | _base_char_1_costs (list[int]): 247 | _base_prev_char_1_costs (list[int]): 248 | """ 249 | 250 | def __init__(self) -> None: 251 | self._base_char_1_costs: list[int] = [] 252 | self._base_prev_char_1_costs: list[int] = [] 253 | 254 | def distance( 255 | self, string_1: Optional[str], string_2: Optional[str], max_distance: int 256 | ) -> int: 257 | """Computes the Damerau-Levenshtein optimal string alignment edit 258 | distance between two strings. 259 | 260 | Args: 261 | string_1: One of the strings to compare. 262 | string_2: The other string to compare. 263 | max_distance: The maximum distance that is of interest. 264 | 265 | Returns: 266 | -1 if the distance is greater than the max_distance, 0 if the strings 267 | are equivalent, otherwise a positive number whose magnitude 268 | increases as difference between the strings increases. 269 | """ 270 | if string_1 is None or string_2 is None: 271 | return helpers.null_distance_results(string_1, string_2, max_distance) 272 | if max_distance <= 0: 273 | return 0 if string_1 == string_2 else -1 274 | max_distance = int(min(2**31 - 1, max_distance)) 275 | # if strings of different lengths, ensure shorter string is in string_1. 276 | # This can result in a little faster speed by spending more time spinning 277 | # just the inner loop during the main processing. 
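        # same normalization as in Levenshtein.distance: keep the shorter
        # string in string_1 so that only the inner loop runs over the longer
        # string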
278 | if len(string_1) > len(string_2): 279 | string_2, string_1 = string_1, string_2 280 | if len(string_2) - len(string_1) > max_distance: 281 | return -1 282 | # identify common suffix and/or prefix that can be ignored 283 | len_1, len_2, start = helpers.prefix_suffix_prep(string_1, string_2) 284 | if len_1 == 0: 285 | return len_2 if len_2 <= max_distance else -1 286 | 287 | if len_2 > len(self._base_char_1_costs): 288 | self._base_char_1_costs = [0 for _ in range(len_2)] 289 | self._base_prev_char_1_costs = [0 for _ in range(len_2)] 290 | if max_distance < len_2: 291 | return self._distance_max( 292 | string_1, 293 | string_2, 294 | len_1, 295 | len_2, 296 | start, 297 | max_distance, 298 | self._base_char_1_costs, 299 | self._base_prev_char_1_costs, 300 | ) 301 | return self._distance( 302 | string_1, 303 | string_2, 304 | len_1, 305 | len_2, 306 | start, 307 | self._base_char_1_costs, 308 | self._base_prev_char_1_costs, 309 | ) 310 | 311 | @staticmethod 312 | def _distance( 313 | string_1: str, 314 | string_2: str, 315 | len_1: int, 316 | len_2: int, 317 | start: int, 318 | char_1_costs: list[int], 319 | prev_char_1_costs: list[int], 320 | ) -> int: 321 | """Internal implementation of the core Damerau-Levenshtein, optimal 322 | string alignment algorithm. 323 | 324 | **From**: https://github.com/softwx/SoftWx.Match 325 | """ 326 | char_1_costs = [j + 1 for j in range(len_2)] 327 | char_1 = " " 328 | current_cost = 0 329 | for i in range(len_1): 330 | prev_char_1 = char_1 331 | char_1 = string_1[start + i] 332 | char_2 = " " 333 | left_char_cost = above_char_cost = i 334 | next_trans_cost = 0 335 | for j in range(len_2): 336 | this_trans_cost = next_trans_cost 337 | next_trans_cost = prev_char_1_costs[j] 338 | # cost of diagonal (substitution) 339 | prev_char_1_costs[j] = current_cost = left_char_cost 340 | # left now equals current cost (which will be diagonal 341 | # at next iteration) 342 | left_char_cost = char_1_costs[j] 343 | prev_char_2 = char_2 344 | char_2 = string_2[start + j] 345 | if char_1 != char_2: 346 | # substitution if neither of two conditions below 347 | if above_char_cost < current_cost: 348 | current_cost = above_char_cost 349 | if left_char_cost < current_cost: 350 | current_cost = left_char_cost 351 | current_cost += 1 352 | if ( 353 | i != 0 354 | and j != 0 355 | and char_1 == prev_char_2 356 | and prev_char_1 == char_2 357 | and this_trans_cost + 1 < current_cost 358 | ): 359 | # transposition 360 | current_cost = this_trans_cost + 1 361 | char_1_costs[j] = above_char_cost = current_cost 362 | return current_cost 363 | 364 | @staticmethod 365 | def _distance_max( 366 | string_1: str, 367 | string_2: str, 368 | len_1: int, 369 | len_2: int, 370 | start: int, 371 | max_distance: int, 372 | char_1_costs: list[int], 373 | prev_char_1_costs: list[int], 374 | ) -> int: 375 | """Internal implementation of the core Damerau-Levenshtein, optimal 376 | string alignment algorithm that accepts a max_distance. 
377 | 378 | **From**: https://github.com/softwx/SoftWx.Match 379 | """ 380 | char_1_costs = [ 381 | j + 1 if j < max_distance else max_distance + 1 for j in range(len_2) 382 | ] 383 | len_diff = len_2 - len_1 384 | j_start_offset = max_distance - len_diff 385 | j_start = 0 386 | j_end = max_distance 387 | char_1 = " " 388 | current_cost = 0 389 | for i in range(len_1): 390 | prev_char_1 = char_1 391 | char_1 = string_1[start + i] 392 | char_2 = " " 393 | left_char_cost = above_char_cost = i 394 | next_trans_cost = 0 395 | # no need to look beyond window of lower right diagonal - 396 | # max_distance cells (lower right diag is i - len_diff) and the upper 397 | # left diagonal + max_distance cells (upper left is i) 398 | j_start += 1 if i > j_start_offset else 0 399 | j_end += 1 if j_end < len_2 else 0 400 | for j in range(j_start, j_end): 401 | this_trans_cost = next_trans_cost 402 | next_trans_cost = prev_char_1_costs[j] 403 | # cost of diagonal (substitution) 404 | prev_char_1_costs[j] = current_cost = left_char_cost 405 | # left now equals current cost (which will be diagonal at next 406 | # iteration) 407 | left_char_cost = char_1_costs[j] 408 | prev_char_2 = char_2 409 | char_2 = string_2[start + j] 410 | if char_1 != char_2: 411 | # substitution if neither of two conditions below 412 | if above_char_cost < current_cost: 413 | current_cost = above_char_cost 414 | if left_char_cost < current_cost: 415 | current_cost = left_char_cost 416 | current_cost += 1 417 | if ( 418 | i != 0 419 | and j != 0 420 | and char_1 == prev_char_2 421 | and prev_char_1 == char_2 422 | and this_trans_cost + 1 < current_cost 423 | ): 424 | # transposition 425 | current_cost = this_trans_cost + 1 426 | char_1_costs[j] = above_char_cost = current_cost 427 | if char_1_costs[i + len_diff] > max_distance: 428 | return -1 429 | return current_cost if current_cost <= max_distance else -1 430 | 431 | 432 | class LevenshteinFast(AbstractDistanceComparer): 433 | """Provides an interface for computing edit distance metric between two 434 | strings using the fast Levenshtein algorithm. 435 | """ 436 | 437 | def distance( 438 | self, string_1: Optional[str], string_2: Optional[str], max_distance: int 439 | ) -> int: 440 | """Computes the Levenshtein edit distance between two strings. 441 | 442 | Args: 443 | string_1: One of the strings to compare. 444 | string_2: The other string to compare. 445 | max_distance: The maximum distance that is of interest. 446 | 447 | Returns: 448 | -1 if the distance is greater than the max_distance, 0 if the strings 449 | are equivalent, otherwise a positive number whose magnitude 450 | increases as difference between the strings increases. 451 | """ 452 | return levenshtein.distance(string_1, string_2, max_distance) 453 | 454 | 455 | class DamerauOsaFast(AbstractDistanceComparer): 456 | """Provides an interface for computing edit distance metric between two 457 | strings using the fast Damerau-Levenshtein Optimal String Alignment (OSA) 458 | algorithm. 459 | """ 460 | 461 | def distance( 462 | self, string_1: Optional[str], string_2: Optional[str], max_distance: int 463 | ) -> int: 464 | """Computes the Damerau-Levenshtein optimal string alignment edit 465 | distance between two strings. 466 | 467 | Args: 468 | string_1: One of the strings to compare. 469 | string_2: The other string to compare. 470 | max_distance: The maximum distance that is of interest. 
471 | 
472 |         Returns:
473 |             -1 if the distance is greater than the max_distance, 0 if the strings
474 |             are equivalent, otherwise a positive number whose magnitude
475 |             increases as difference between the strings increases.
476 |         """
477 |         return damerau_osa.distance(string_1, string_2, max_distance)
478 | 
--------------------------------------------------------------------------------
/symspellpy/helpers.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | #
 3 | # Copyright (c) 2025 mmb L (Python port)
 4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation)
 5 | #
 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | # of this software and associated documentation files (the "Software"), to deal
 8 | # in the Software without restriction, including without limitation the rights
 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 | 
16 | """
17 | .. module:: helpers
18 |    :synopsis: Helper functions
19 | """
20 | 
21 | import re
22 | import sys
23 | import warnings
24 | from difflib import SequenceMatcher
25 | from typing import Optional
26 | 
27 | 
28 | def _rename_args(kwargs_map: dict[str, str], version: str):
29 |     def decorator(func):
30 |         def wrapped(*args, **kwargs):
31 |             new_kwargs = {}
32 |             for k, v in kwargs.items():
33 |                 if k in kwargs_map:
34 |                     warnings.warn(
35 |                         f"Keyword argument '{k}' is deprecated and will be removed in {version}. Use '{kwargs_map[k]}' instead.",
36 |                         DeprecationWarning,
37 |                     )
38 |                 new_kwargs[kwargs_map.get(k, k)] = v
39 |             return func(*args, **new_kwargs)
40 | 
41 |         return wrapped
42 | 
43 |     return decorator
44 | 
45 | 
46 | def case_transfer_matching(cased_text: str, uncased_text: str) -> str:
47 |     """Transfers the casing from one text to another - assuming that they are
48 |     'matching' texts, that is, they have the same length.
49 | 
50 |     Args:
51 |         cased_text: Text with varied casing.
52 |         uncased_text: Text that is in lowercase only.
53 | 
54 |     Returns:
55 |         Text with the content of `uncased_text` and the casing of `cased_text`.
56 | 
57 |     Raises:
58 |         ValueError: If the input texts have different lengths.
59 |     """
60 |     if len(cased_text) != len(uncased_text):
61 |         raise ValueError(
62 |             "'cased_text' and 'uncased_text' don't have the same length, use case_transfer_similar() instead"
63 |         )
64 | 
65 |     return "".join(
66 |         [
67 |             y.upper() if x.isupper() else y.lower()
68 |             for x, y in zip(cased_text, uncased_text)
69 |         ]
70 |     )
71 | 
72 | 
73 | def case_transfer_similar(cased_text: str, uncased_text: str) -> str:
74 |     """Transfers the casing from one text to another - for similar (not matching)
75 |     texts.
76 | 
77 |     Use `difflib.SequenceMatcher` to identify the different types of changes
78 |     needed to turn `cased_text` into `uncased_text`.
79 | 
80 |     - For inserted sections: transfer the casing from the prior character. If
81 |       there is no character before, or the character before is a space, transfer
82 |       the casing from the following character.
83 |     - For deleted sections: no case transfer is required.
84 |     - For equal sections: swap out the text with the original, cased one, as
85 |       apart from the casing the two are the same.
86 |     - For replaced sections: transfer the casing using
87 |       :meth:`case_transfer_matching` if the two have the same length, otherwise
88 |       transfer character-by-character and carry the last casing over to any
89 |       additional characters.
90 | 
91 |     Args:
92 |         cased_text: Text with varied casing.
93 |         uncased_text: Text in lowercase.
94 | 
95 |     Returns:
96 |         Text with the content of `uncased_text` but the casing of `cased_text`.
97 | 
98 |     Raises:
99 |         ValueError: If `cased_text` is empty.
100 |     """
101 |     if not uncased_text:
102 |         return uncased_text
103 | 
104 |     if not cased_text:
105 |         raise ValueError("'cased_text' cannot be empty")
106 | 
107 |     matcher = SequenceMatcher(a=cased_text.lower(), b=uncased_text)
108 |     result = ""
109 | 
110 |     for tag, i1, i2, j1, j2 in matcher.get_opcodes():
111 |         if tag == "delete":
112 |             continue
113 |         if tag == "insert":
114 |             # For the first character or a space on the left, take the casing from
115 |             # the following character. Else take the casing from the prior character
116 |             ia_ref = i1 if i1 == 0 or cased_text[i1 - 1] == " " else i1 - 1
117 |             if cased_text[ia_ref].isupper():
118 |                 result += uncased_text[j1:j2].upper()
119 |             else:
120 |                 result += uncased_text[j1:j2].lower()
121 |         elif tag == "equal":
122 |             # Transfer the text from the cased_text, as anyhow they are equal
123 |             # (without the casing)
124 |             result += cased_text[i1:i2]
125 |         else:
126 |             cased_seq = cased_text[i1:i2]
127 |             uncased_seq = uncased_text[j1:j2]
128 | 
129 |             if len(cased_seq) == len(uncased_seq):
130 |                 result += case_transfer_matching(cased_seq, uncased_seq)
131 |             else:
132 |                 # transfer the casing character-by-character, using the last
133 |                 # casing to continue if we run out of the sequence
134 |                 for cased, uncased in zip(cased_seq, uncased_seq):
135 |                     result += uncased.upper() if cased.isupper() else uncased.lower()
136 |                 # Apply casing from the last character of cased_seq to the rest
137 |                 # of the uncased_seq
138 |                 if len(cased_seq) < len(uncased_seq):
139 |                     upper = cased_seq[-1].isupper()
140 |                     idx = len(cased_seq)
141 |                     result += "".join(
142 |                         map(str.upper if upper else str.lower, uncased_seq[idx:])
143 |                     )
144 |     return result
145 | 
146 | 
147 | def increment_count(count: int, count_previous: int) -> int:
148 |     """Increments count up to ``sys.maxsize``."""
149 |     return (
150 |         count_previous + count if sys.maxsize - count_previous > count else sys.maxsize
151 |     )
152 | 
153 | 
154 | def is_acronym(word: str, contain_digits: bool = False) -> bool:
155 |     """Checks if the word is all caps (an acronym) and/or contains numbers.
156 | 
157 |     Args:
158 |         word: The word to check.
159 |         contain_digits: A flag to determine whether any term with digits can be
160 |             considered an acronym.
161 | 
162 |     Returns:
163 |         True if the word is all caps and/or contains numbers (with
164 |         ``contain_digits``), e.g., ABCDE, AB12C, abc12, ab12c. False if the
165 |         word contains lower case letters, e.g., abcde, ABCde, abcDE, abCDe.
166 |     """
167 |     return re.match(r"\b[A-Z0-9]{2,}\b", word) is not None or (
168 |         contain_digits and any(i.isdigit() for i in word)
169 |     )
170 | 
171 | 
172 | @_rename_args({"string1": "string_1", "string2": "string_2"}, "v7.0.0")
173 | def null_distance_results(
174 |     string_1: Optional[str], string_2: Optional[str], max_distance: int
175 | ) -> int:
176 |     """Determines the proper return value of an edit distance function when one
177 |     or both strings are null.
178 | 
179 |     Args:
180 |         string_1: Base string.
181 |         string_2: The string to compare.
182 |         max_distance: The maximum distance allowed.
183 | 
184 |     Returns:
185 |         -1 if the distance is greater than the max_distance, 0 if the strings are
186 |         equivalent (both are None), otherwise a positive number whose
187 |         magnitude is the length of the string which is not None.
188 |     """
189 |     if string_1 is None:
190 |         if string_2 is None:
191 |             return 0
192 |         return len(string_2) if len(string_2) <= max_distance else -1
193 |     return len(string_1) if len(string_1) <= max_distance else -1
194 | 
195 | 
196 | def parse_words(
197 |     phrase: str, preserve_case: bool = False, split_by_space: bool = False
198 | ) -> list[str]:
199 |     """Creates a non-unique wordlist from sample text. Language independent
200 |     (e.g. works with Chinese characters).
201 | 
202 |     Args:
203 |         phrase: Sample text that could contain one or more words.
204 |         preserve_case: A flag to determine if we want to preserve the casing or
205 |             convert all to lowercase.
206 |         split_by_space: Splits the phrase into words simply based on space.
207 | 
208 |     Returns:
209 |         A list of words.
210 |     """
211 |     if split_by_space:
212 |         if preserve_case:
213 |             return phrase.split()
214 |         return phrase.lower().split()
215 |     # \W non-words, use negated set to ignore non-words and "_" (underscore).
216 |     # Compatible with non-latin characters, does not split words at apostrophes
217 |     if preserve_case:
218 |         return re.findall(r"([^\W_]+['’]*[^\W_]*)", phrase)
219 |     return re.findall(r"([^\W_]+['’]*[^\W_]*)", phrase.lower())
220 | 
221 | 
222 | @_rename_args({"string1": "string_1", "string2": "string_2"}, "v7.0.0")
223 | def prefix_suffix_prep(string_1: str, string_2: str) -> tuple[int, int, int]:
224 |     """Calculates starting position and lengths of two strings such that common
225 |     prefix and suffix substrings are excluded.
226 |     Expects len(string_1) <= len(string_2).
227 | 
228 |     Args:
229 |         string_1: Base string.
230 |         string_2: The string to compare.
231 | 
232 |     Returns:
233 |         A tuple of lengths of the part excluding common prefix and suffix, and
234 |         the starting position.
235 |     """
236 |     # this is also the minimum length of the two strings
237 |     len_1 = len(string_1)
238 |     len_2 = len(string_2)
239 |     # suffix common to both strings can be ignored
240 |     while len_1 != 0 and string_1[len_1 - 1] == string_2[len_2 - 1]:
241 |         len_1 -= 1
242 |         len_2 -= 1
243 |     # prefix common to both strings can be ignored
244 |     start = 0
245 |     while start != len_1 and string_1[start] == string_2[start]:
246 |         start += 1
247 |     if start != 0:
248 |         len_1 -= start
249 |         # length of the part excluding common prefix and suffix
250 |         len_2 -= start
251 |     return len_1, len_2, start
252 | 
253 | 
254 | def to_similarity(distance: int, length: int) -> float:
255 |     """Calculates a similarity measure from an edit distance.
256 | 
257 |     Args:
258 |         distance: The edit distance between two strings.
259 |         length: The length of the longer of the two strings the edit distance is
260 |             from.
261 | 
262 |     Returns:
263 |         A similarity value from 0 to 1.0 (1 - (distance / length)), or -1 if
264 |         distance is negative.
265 |     """
266 |     return -1 if distance < 0 else 1.0 - distance / length
267 | 
268 | 
269 | def try_parse_int64(string: str) -> Optional[int]:
270 |     """Converts the string representation of a number to its 64-bit signed
271 |     integer equivalent.
272 | 
273 |     Args:
274 |         string: String representation of a number.
275 | 
276 |     Returns:
277 |         The 64-bit signed integer equivalent, or None if conversion failed or if
278 |         the number is less than the min value or greater than the max value
279 |         of a 64-bit signed integer.
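
    Example (values follow from the range check below):
        >>> try_parse_int64("123")
        123
        >>> try_parse_int64("not a number") is None
        True
        >>> try_parse_int64(str(2**64)) is None
        True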
280 |     """
281 |     try:
282 |         ret = int(string)
283 |     except ValueError:
284 |         return None
285 |     return ret if -(2**63) <= ret <= 2**63 - 1 else None
286 | 
287 | 
288 | class DictIO:
289 |     """An iterator wrapper for a Python dictionary to format the output as required
290 |     by :meth:`load_dictionary_stream` and :meth:`load_dictionary_bigram_stream`.
291 | 
292 |     Args:
293 |         dictionary: Dictionary with words as keys and frequency counts as values.
294 |         separator: Separator characters between term(s) and count.
295 | 
296 |     Attributes:
297 |         iteritems: An iterator object of dictionary.items().
298 |         separator: Separator characters between term(s) and count.
299 |     """
300 | 
301 |     def __init__(self, dictionary: dict[str, int], separator: str = " ") -> None:
302 |         self.iteritems = iter(dictionary.items())
303 |         self.separator = separator
304 | 
305 |     def __iter__(self) -> "DictIO":
306 |         return self
307 | 
308 |     def __next__(self) -> str:
309 |         return self.separator.join(map(str, next(self.iteritems)))
310 | 
--------------------------------------------------------------------------------
/symspellpy/logging.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | #
 3 | # Copyright (c) 2025 mmb L (Python port)
 4 | #
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | import logging
16 | import sys
17 | 
18 | logger = logging.getLogger("symspellpy")
19 | 
20 | handler = logging.StreamHandler(sys.stderr)
21 | handler.setFormatter(
22 |     logging.Formatter(fmt="%(asctime)s: %(levelname).1s %(name)s] %(message)s")
23 | )
24 | 
25 | logger.addHandler(handler)
26 | 
--------------------------------------------------------------------------------
/symspellpy/pickle_mixin.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | #
 3 | # Copyright (c) 2025 mmb L (Python port)
 4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation)
 5 | #
 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | # of this software and associated documentation files (the "Software"), to deal
 8 | # in the Software without restriction, including without limitation the rights
 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 | 
16 | """
17 | .. module:: pickle_mixin
18 |    :synopsis: Mixin to provide pickle loading and saving functionalities.
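
Example (illustrative sketch; :class:`~symspellpy.symspellpy.SymSpell` mixes in
this class, so the methods below are called on a ``SymSpell`` instance)::

    from pathlib import Path

    from symspellpy import SymSpell

    sym_spell = SymSpell()
    # ... load a frequency dictionary with load_dictionary() ...
    sym_spell.save_pickle(Path("dictionary.pickle"))

    restored = SymSpell()
    restored.load_pickle(Path("dictionary.pickle"))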
19 | """
20 | 
21 | import gzip
22 | import logging
23 | import pickle
24 | from operator import itemgetter
25 | from pathlib import Path
26 | from typing import IO, Optional, Union, cast
27 | 
28 | logger = logging.getLogger(__name__)
29 | 
30 | 
31 | # Protocol only available in py38
32 | # class SymSpellProtocol(Protocol):
33 | #     data_version: int
34 | #     _count_threshold: int
35 | #     _max_dictionary_edit_distance: int
36 | #     _prefix_length: int
37 | #     _deletes: dict[str, list[str]]
38 | #     _words: dict[str, int]
39 | #     _max_length: int
40 | 
41 | 
42 | class PickleMixin:
43 |     """Implements saving and loading pickle functionality for SymSpell."""
44 | 
45 |     data_version: int
46 |     _below_threshold_words: dict[str, int]
47 |     _bigrams: dict[str, int]
48 |     _deletes: dict[str, list[str]]
49 |     _words: dict[str, int]
50 | 
51 |     _count_threshold: int
52 |     _max_dictionary_edit_distance: int
53 |     _max_length: int
54 |     _prefix_length: int
55 | 
56 |     def load_pickle(
57 |         self,
58 |         data: Union[bytes, Path],
59 |         compressed: bool = True,
60 |         from_bytes: bool = False,
61 |     ) -> bool:
62 |         """Loads delete combinations from file as pickle. This will reduce the
63 |         loading time compared to running :meth:`load_dictionary` again.
64 | 
65 |         Args:
66 |             data: Either a bytes string to be used with ``from_bytes=True`` or the
67 |                 path+filename of the pickle file to be used with
68 |                 ``from_bytes=False``.
69 |             compressed: A flag to determine whether to read the pickled data as
70 |                 compressed data.
71 |             from_bytes: Flag to determine if we are loading from bytes or file.
72 | 
73 |         Returns:
74 |             ``True`` if delete combinations are successfully loaded.
75 |         """
76 |         if from_bytes:
77 |             assert isinstance(data, bytes)
78 |             return self._load_pickle_stream(data, from_bytes)
79 |         if compressed:
80 |             with gzip.open(data, "rb") as gzip_infile:
81 |                 return self._load_pickle_stream(cast(IO[bytes], gzip_infile))
82 |         else:
83 |             with open(data, "rb") as infile:
84 |                 return self._load_pickle_stream(infile)
85 | 
86 |     def save_pickle(
87 |         self,
88 |         filename: Optional[Path] = None,
89 |         compressed: bool = True,
90 |         to_bytes: bool = False,
91 |     ) -> Optional[bytes]:
92 |         """Pickles :attr:`_deletes`, :attr:`_words`, and :attr:`_max_length` into
93 |         a stream for quicker loading later.
94 | 
95 |         Args:
96 |             filename: The path+filename of the pickle file.
97 |             compressed: A flag to determine whether to compress the pickled data.
98 |             to_bytes: Flag to determine whether a bytes string should be returned
99 |                 instead of writing to file.
100 | 
101 |         Returns:
102 |             A byte string of the pickled data if ``to_bytes=True``.
103 |         """
104 |         if to_bytes:
105 |             return self._save_pickle_stream(to_bytes=to_bytes)
106 |         assert filename is not None
107 |         if compressed:
108 |             with gzip.open(filename, "wb") as gzip_outfile:
109 |                 self._save_pickle_stream(cast(IO[bytes], gzip_outfile))
110 |         else:
111 |             with open(filename, "wb") as outfile:
112 |                 self._save_pickle_stream(outfile)
113 |         return None
114 | 
115 |     def _load_pickle_stream(
116 |         self, stream: Union[bytes, IO[bytes]], from_bytes: bool = False
117 |     ) -> bool:
118 |         """Loads delete combinations from stream as pickle. This will reduce the
119 |         loading time compared to running :meth:`load_dictionary` again.
120 | 
121 |         **NOTE**: Prints a warning if the current settings `count_threshold`,
122 |         `max_dictionary_edit_distance`, and `prefix_length` are different from
123 |         the loaded settings, and overwrites the current settings with the loaded ones.
124 | 
125 |         Args:
126 |             stream: The stream from which the pickle data is loaded.
127 |             from_bytes: Flag to determine if we are loading from bytes or file.
128 | 
129 |         Returns:
130 |             ``True`` if delete combinations are successfully loaded.
131 |         """
132 |         if from_bytes:
133 |             assert isinstance(stream, bytes)
134 |             pickle_data = pickle.loads(stream)  # nosec
135 |         else:
136 |             assert not isinstance(stream, bytes)
137 |             pickle_data = pickle.load(stream)  # nosec
138 |         if pickle_data.get("data_version", None) != self.data_version:
139 |             return False
140 |         settings = ("count_threshold", "max_dictionary_edit_distance", "prefix_length")
141 |         if itemgetter(*settings)(pickle_data) != (
142 |             self._count_threshold,
143 |             self._max_dictionary_edit_distance,
144 |             self._prefix_length,
145 |         ):
146 |             logger.warning(
147 |                 f"Loading data which was created using different {settings} settings. Overwriting current SymSpell instance with loaded settings ..."
148 |             )
149 |         # dictionary entries related variables
150 |         self._below_threshold_words = pickle_data["below_threshold_words"]
151 |         self._bigrams = pickle_data["bigrams"]
152 |         self._deletes = pickle_data["deletes"]
153 |         self._words = pickle_data["words"]
154 |         self._max_length = pickle_data["max_length"]
155 |         # SymSpell settings used to generate the above
156 |         self._count_threshold = pickle_data["count_threshold"]
157 |         self._max_dictionary_edit_distance = pickle_data["max_dictionary_edit_distance"]
158 |         self._prefix_length = pickle_data["prefix_length"]
159 |         return True
160 | 
161 |     def _save_pickle_stream(
162 |         self, stream: Optional[IO[bytes]] = None, to_bytes: bool = False
163 |     ) -> Optional[bytes]:
164 |         """Pickles :attr:`_below_threshold_words`, :attr:`_bigrams`,
165 |         :attr:`_deletes`, :attr:`_words`, and :attr:`_max_length` into
166 |         a stream for quicker loading later.
167 | 
168 |         Pickles :attr:`_count_threshold`, :attr:`_max_dictionary_edit_distance`,
169 |         and :attr:`_prefix_length` to ensure consistent behavior.
170 | 
171 |         Args:
172 |             stream: The stream to store the pickle data.
173 |             to_bytes: Flag to determine whether a bytes string should be returned
174 |                 instead of writing to file.
175 | 
176 |         Returns:
177 |             A byte string of the pickled data if ``to_bytes=True``.
178 |         """
179 |         pickle_data = {
180 |             # Dictionary entries related variables
181 |             "below_threshold_words": self._below_threshold_words,
182 |             "bigrams": self._bigrams,
183 |             "deletes": self._deletes,
184 |             "words": self._words,
185 |             "max_length": self._max_length,
186 |             # SymSpell settings used to generate the above
187 |             "count_threshold": self._count_threshold,
188 |             "max_dictionary_edit_distance": self._max_dictionary_edit_distance,
189 |             "prefix_length": self._prefix_length,
190 |             # Version to ensure compatibility
191 |             "data_version": self.data_version,
192 |         }
193 |         if to_bytes:
194 |             return pickle.dumps(pickle_data)
195 |         assert stream is not None
196 |         pickle.dump(pickle_data, stream)
197 |         return None
198 | 
--------------------------------------------------------------------------------
/symspellpy/suggest_item.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | #
 3 | # Copyright (c) 2025 mmb L (Python port)
 4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation)
 5 | #
 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 7 | # of this software and associated documentation files (the "Software"), to deal
 8 | # in the Software without restriction, including without limitation the rights
 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 | 
16 | """
17 | .. module:: suggest_item
18 |    :synopsis: Data class for :meth:`symspellpy.symspellpy.lookup`.
19 | """
20 | 
21 | 
22 | class SuggestItem:
23 |     """Spelling suggestion returned from :meth:`lookup`.
24 | 
25 |     Args:
26 |         term: The suggested word.
27 |         distance: Edit distance from search word.
28 |         count: Frequency of suggestion in dictionary or Naive Bayes probability
29 |             of the individual suggestion parts.
30 |     """
31 | 
32 |     def __init__(self, term: str, distance: int, count: int) -> None:
33 |         self._term = term
34 |         self._distance = distance
35 |         self._count = count
36 | 
37 |     def __eq__(self, other: object) -> bool:
38 |         """
39 |         Returns:
40 |             ``True`` if both distance and frequency count are the same.
41 |         """
42 |         if not isinstance(other, SuggestItem):
43 |             return NotImplemented
44 |         if self._distance == other.distance:
45 |             return self._count == other.count
46 |         return self._distance == other.distance
47 | 
48 |     def __lt__(self, other: object) -> bool:
49 |         """
50 |         Returns:
51 |             Order by distance ascending, then by frequency count descending.
52 |         """
53 |         if not isinstance(other, SuggestItem):
54 |             return NotImplemented
55 |         if self._distance == other.distance:
56 |             return self._count > other.count
57 |         return self._distance < other.distance
58 | 
59 |     def __str__(self) -> str:
60 |         """
61 |         Returns:
62 |             Displays attributes as "term, distance, count".
63 |         """
64 |         return f"{self._term}, {self._distance}, {self._count}"
65 | 
66 |     @property
67 |     def count(self) -> int:
68 |         """Frequency of suggestion in the dictionary (a measure of how common the
69 |         word is) or Naive Bayes probability of the individual suggestion parts in
70 |         :meth:`lookup_compound`.
71 | """ 72 | return self._count 73 | 74 | @count.setter 75 | def count(self, count: int) -> None: 76 | self._count = count 77 | 78 | @property 79 | def distance(self) -> int: 80 | """Edit distance between searched for word and suggestion.""" 81 | return self._distance 82 | 83 | @distance.setter 84 | def distance(self, distance: int) -> None: 85 | self._distance = distance 86 | 87 | @property 88 | def term(self) -> str: 89 | """The suggested correctly spelled word.""" 90 | return self._term 91 | 92 | @term.setter 93 | def term(self, term: str) -> None: 94 | self._term = term 95 | 96 | @classmethod 97 | def create_with_probability(cls, term: str, distance: int) -> "SuggestItem": 98 | """Creates a SuggestItem with Naive Bayes probability as the count.""" 99 | return cls(term, distance, 10 // 10 ** len(term)) 100 | -------------------------------------------------------------------------------- /symspellpy/verbosity.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2025 mmb L (Python port) 4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation) 5 | # 6 | # Permission is hereby granted, free of charge, to any person obtaining a copy 7 | # of this software and associated documentation files (the "Software"), to deal 8 | # in the Software without restriction, including without limitation the rights 9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | # copies of the Software, and to permit persons to whom the Software is 11 | # furnished to do so, subject to the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included in all 14 | # copies or substantial portions of the Software. 15 | 16 | """ 17 | .. module:: verbosity 18 | :synopsis: Enum for lookup results verbosity. 19 | """ 20 | 21 | from enum import Enum 22 | 23 | 24 | class Verbosity(Enum): 25 | """Controls the closeness/quantity of returned spelling suggestions. 26 | 27 | Attributes: 28 | TOP: Top suggestion with the highest term frequency of the suggestions of 29 | smallest edit distance found. 30 | CLOSEST: All suggestions of smallest edit distance found, suggestions 31 | ordered by term frequency. 32 | ALL: All suggestions within maxEditDistance, suggestions ordered by edit 33 | distance, then by term frequency (slower, no early termination). 
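
    Example (illustrative sketch; ``sym_spell`` is assumed to be a
    :class:`~symspellpy.symspellpy.SymSpell` instance with a loaded
    dictionary)::

        # single best suggestion only
        best = sym_spell.lookup("tepmperamet", Verbosity.TOP)
        # all suggestions within max_edit_distance, ordered by edit distance,
        # then by term frequency
        every = sym_spell.lookup("tepmperamet", Verbosity.ALL)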
34 | """ 35 | 36 | TOP = 0 37 | CLOSEST = 1 38 | ALL = 2 39 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mammothb/symspellpy/f4d1531a686038975370be3db4c19685564c2efe/tests/__init__.py -------------------------------------------------------------------------------- /tests/benchmarks.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import importlib.resources\n", 10 | "import sys\n", 11 | "from pathlib import Path\n", 12 | "\n", 13 | "sys.path.append(str(Path.cwd().parent))\n", 14 | "\n", 15 | "from symspellpy import SymSpell, Verbosity\n", 16 | "from symspellpy.editdistance import DistanceAlgorithm, EditDistance" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/plain": [ 27 | "True" 28 | ] 29 | }, 30 | "execution_count": 2, 31 | "metadata": {}, 32 | "output_type": "execute_result" 33 | } 34 | ], 35 | "source": [ 36 | "bigram_path = importlib.resources.files(\"symspellpy\") / \"frequency_bigramdictionary_en_243_342.txt\"\n", 37 | "\n", 38 | "dictionary_path = importlib.resources.files(\"symspellpy\") / \"frequency_dictionary_en_82_765.txt\"\n", 39 | "\n", 40 | "sym_spell_damerau_osa = SymSpell(distance_comparer=EditDistance(DistanceAlgorithm.DAMERAU_OSA))\n", 41 | "sym_spell_damerau_osa.load_bigram_dictionary(bigram_path, 0, 2)\n", 42 | "sym_spell_damerau_osa.load_dictionary(dictionary_path, 0, 1)\n", 43 | "\n", 44 | "sym_spell_damerau_osa_fast = SymSpell(distance_comparer=EditDistance(DistanceAlgorithm.DAMERAU_OSA_FAST))\n", 45 | "sym_spell_damerau_osa_fast.load_bigram_dictionary(bigram_path, 0, 2)\n", 46 | "sym_spell_damerau_osa_fast.load_dictionary(dictionary_path, 0, 1)\n", 47 | "\n", 48 | "sym_spell_levenshtein = SymSpell(distance_comparer=EditDistance(DistanceAlgorithm.LEVENSHTEIN))\n", 49 | "sym_spell_levenshtein.load_bigram_dictionary(bigram_path, 0, 2)\n", 50 | "sym_spell_levenshtein.load_dictionary(dictionary_path, 0, 1)\n", 51 | "\n", 52 | "sym_spell_levenshtein_fast = SymSpell(distance_comparer=EditDistance(DistanceAlgorithm.LEVENSHTEIN_FAST))\n", 53 | "sym_spell_levenshtein_fast.load_bigram_dictionary(bigram_path, 0, 2)\n", 54 | "sym_spell_levenshtein_fast.load_dictionary(dictionary_path, 0, 1)" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "def lookup_damerau_osa():\n", 64 | " sym_spell_damerau_osa.lookup(\"tepmperamet\", Verbosity.ALL)\n", 65 | "\n", 66 | "def lookup_damerau_osa_fast():\n", 67 | " sym_spell_damerau_osa_fast.lookup(\"tepmperamet\", Verbosity.ALL)\n", 68 | "\n", 69 | "def lookup_levenshtein():\n", 70 | " sym_spell_levenshtein.lookup(\"tepmperamet\", Verbosity.ALL)\n", 71 | "\n", 72 | "def lookup_levenshtein_fast():\n", 73 | " sym_spell_levenshtein_fast.lookup(\"tepmperamet\", Verbosity.ALL)\n", 74 | "\n", 75 | "def lookup_compound_damerau_osa():\n", 76 | " sym_spell_damerau_osa.lookup_compound(\"whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him\", 2)\n", 77 | "\n", 78 | "def lookup_compound_damerau_osa_fast():\n", 79 | " sym_spell_damerau_osa_fast.lookup_compound(\"whereis th elove 
hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him\", 2)\n", 80 | "\n", 81 | "def lookup_compound_levenshtein():\n", 82 | " sym_spell_levenshtein.lookup_compound(\"whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him\", 2)\n", 83 | "\n", 84 | "def lookup_compound_levenshtein_fast():\n", 85 | " sym_spell_levenshtein_fast.lookup_compound(\"whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him\", 2)\n", 86 | "\n", 87 | "def word_segmentation_damerau_osa():\n", 88 | " sym_spell_damerau_osa.word_segmentation(\"thequickbrownfoxjumpsoverthelazydog\", 0)\n", 89 | "\n", 90 | "def word_segmentation_damerau_osa_fast():\n", 91 | " sym_spell_damerau_osa_fast.word_segmentation(\"thequickbrownfoxjumpsoverthelazydog\", 0)\n", 92 | "\n", 93 | "def word_segmentation_levenshtein():\n", 94 | " sym_spell_levenshtein.word_segmentation(\"thequickbrownfoxjumpsoverthelazydog\", 0)\n", 95 | "\n", 96 | "def word_segmentation_levenshtein_fast():\n", 97 | " sym_spell_levenshtein_fast.word_segmentation(\"thequickbrownfoxjumpsoverthelazydog\", 0)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 4, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "107 μs ± 356 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n", 110 | "67.6 μs ± 319 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n", 111 | "95.4 μs ± 563 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n", 112 | "66.7 μs ± 295 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n" 113 | ] 114 | } 115 | ], 116 | "source": [ 117 | "%timeit lookup_damerau_osa()\n", 118 | "%timeit lookup_damerau_osa_fast()\n", 119 | "%timeit lookup_levenshtein()\n", 120 | "%timeit lookup_levenshtein_fast()" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 5, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "9.89 ms ± 65.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", 133 | "5.1 ms ± 13.1 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", 134 | "8.68 ms ± 46.6 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n", 135 | "4.95 ms ± 13.2 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "%timeit lookup_compound_damerau_osa()\n", 141 | "%timeit lookup_compound_damerau_osa_fast()\n", 142 | "%timeit lookup_compound_levenshtein()\n", 143 | "%timeit lookup_compound_levenshtein_fast()" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 6, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "1.13 ms ± 1.36 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n", 156 | "1.14 ms ± 2.94 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n", 157 | "1.14 ms ± 3.56 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n", 158 | "1.14 ms ± 1.6 μs per loop (mean ± std. dev. 
of 7 runs, 1,000 loops each)\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "%timeit word_segmentation_damerau_osa()\n", 164 | "%timeit word_segmentation_damerau_osa_fast()\n", 165 | "%timeit word_segmentation_levenshtein()\n", 166 | "%timeit word_segmentation_levenshtein_fast()" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": {}, 172 | "source": [ 173 | "**Note**: Result for `word_segmentation` is expected since we are passing `max_edit_distance=0`." 174 | ] 175 | } 176 | ], 177 | "metadata": { 178 | "interpreter": { 179 | "hash": "d83327bb218665ef1f16f1956a0b9fb217f4e8f6e80f84663e37ea0a49e5699a" 180 | }, 181 | "kernelspec": { 182 | "display_name": "Python 3 (ipykernel)", 183 | "language": "python", 184 | "name": "python3" 185 | }, 186 | "language_info": { 187 | "codemirror_mode": { 188 | "name": "ipython", 189 | "version": 3 190 | }, 191 | "file_extension": ".py", 192 | "mimetype": "text/x-python", 193 | "name": "python", 194 | "nbconvert_exporter": "python", 195 | "pygments_lexer": "ipython3", 196 | "version": "3.13.2" 197 | } 198 | }, 199 | "nbformat": 4, 200 | "nbformat_minor": 4 201 | } 202 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | import importlib_resources 5 | import pytest 6 | 7 | from symspellpy import SymSpell 8 | 9 | FORTESTS_DIR = Path(__file__).resolve().parent / "fortests" 10 | 11 | 12 | ####################################################################### 13 | # Paths 14 | ####################################################################### 15 | @pytest.fixture 16 | def bigram_path(): 17 | ref = ( 18 | importlib_resources.files("symspellpy") 19 | / "frequency_bigramdictionary_en_243_342.txt" 20 | ) 21 | with importlib_resources.as_file(ref) as path: 22 | yield path 23 | 24 | 25 | @pytest.fixture 26 | def dictionary_path(): 27 | ref = importlib_resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt" 28 | with importlib_resources.as_file(ref) as path: 29 | yield path 30 | 31 | 32 | @pytest.fixture 33 | def pickle_path(): 34 | return FORTESTS_DIR / "dictionary.pickle" 35 | 36 | 37 | @pytest.fixture 38 | def query_path(): 39 | return FORTESTS_DIR / "noisy_query_en_1000.txt" 40 | 41 | 42 | ####################################################################### 43 | # Misc 44 | ####################################################################### 45 | @pytest.fixture 46 | def get_same_word_and_count(): 47 | word = "hello" 48 | return [(word, 11), (word, 3)] 49 | 50 | 51 | @pytest.fixture 52 | def get_fortests_data(request): 53 | with open(FORTESTS_DIR / request.param) as infile: 54 | return json.load(infile)["data"] 55 | 56 | 57 | ####################################################################### 58 | # symspells 59 | ####################################################################### 60 | @pytest.fixture 61 | def symspell_default(): 62 | return SymSpell() 63 | 64 | 65 | @pytest.fixture 66 | def symspell_default_entry(symspell_default, request): 67 | for entry in request.param: 68 | symspell_default.create_dictionary_entry(entry[0], entry[1]) 69 | return symspell_default 70 | 71 | 72 | @pytest.fixture 73 | def symspell_default_load(symspell_default, dictionary_path, bigram_path, request): 74 | symspell_default.load_dictionary(dictionary_path, 0, 1) 75 | if request.param == "bigram": 76 | 
symspell_default.load_bigram_dictionary(bigram_path, 0, 2) 77 | return symspell_default, request.param 78 | 79 | 80 | @pytest.fixture 81 | def symspell_long(): 82 | return SymSpell(5) 83 | 84 | 85 | @pytest.fixture 86 | def symspell_long_entry(symspell_long, request): 87 | for entry in request.param: 88 | symspell_long.create_dictionary_entry(entry, 2) 89 | return symspell_long, request.param 90 | 91 | 92 | @pytest.fixture 93 | def symspell_short(request): 94 | if request.param is None: 95 | return SymSpell(1, 3) 96 | return SymSpell(1, 3, count_threshold=request.param) 97 | -------------------------------------------------------------------------------- /tests/fortests/bad_dict.txt: -------------------------------------------------------------------------------- 1 | qwer 2 | wert 3 | erty 4 | rtyu tyui 12 5 | yuio uiop 13 6 | asdf 10 7 | sdfg 12 -------------------------------------------------------------------------------- /tests/fortests/below_threshold_dict.txt: -------------------------------------------------------------------------------- 1 | below 8 2 | threshold 10 3 | word 10 4 | -------------------------------------------------------------------------------- /tests/fortests/lookup_compound_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "typo": "whereis th elove", 5 | "bigram": { 6 | "num_results": 1, 7 | "term": "where is the love", 8 | "distance": 2, 9 | "count": 585 10 | }, 11 | "unigram": { 12 | "num_results": 1, 13 | "term": "whereas the love", 14 | "distance": 2, 15 | "count": 64 16 | } 17 | }, 18 | { 19 | "typo": "the bigjest playrs", 20 | "bigram": { 21 | "num_results": 1, 22 | "term": "the biggest players", 23 | "distance": 2, 24 | "count": 34 25 | }, 26 | "unigram": { 27 | "num_results": 1, 28 | "term": "the biggest players", 29 | "distance": 2, 30 | "count": 34 31 | } 32 | }, 33 | { 34 | "typo": "Can yu readthis", 35 | "bigram": { 36 | "num_results": 1, 37 | "term": "can you read this", 38 | "distance": 3, 39 | "count": 11440 40 | }, 41 | "unigram": { 42 | "num_results": 1, 43 | "term": "can you read this", 44 | "distance": 3, 45 | "count": 3 46 | } 47 | }, 48 | { 49 | "typo": "whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him", 50 | "bigram": { 51 | "num_results": 1, 52 | "term": "where is the love he had dated for much of the past who couldn't read in sixth grade and inspired him", 53 | "distance": 9, 54 | "count": 0 55 | }, 56 | "unigram": { 57 | "num_results": 1, 58 | "term": "whereas the love head dated for much of the past who couldn't read in sixth grade and inspired him", 59 | "distance": 9, 60 | "count": 0 61 | } 62 | }, 63 | { 64 | "typo": "in te dhird qarter oflast jear he hadlearned ofca sekretplan", 65 | "bigram": { 66 | "num_results": 1, 67 | "term": "in the third quarter of last year he had learned of a secret plan", 68 | "distance": 9, 69 | "count": 0 70 | }, 71 | "unigram": { 72 | "num_results": 1, 73 | "term": "in the third quarter of last year he had learned of a secret plan", 74 | "distance": 9, 75 | "count": 0 76 | } 77 | }, 78 | { 79 | "typo": "the bigjest playrs in te strogsommer film slatew ith plety of funn", 80 | "bigram": { 81 | "num_results": 1, 82 | "term": "the biggest players in the strong summer film slate with plenty of fun", 83 | "distance": 9, 84 | "count": 0 85 | }, 86 | "unigram": { 87 | "num_results": 1, 88 | "term": "the biggest players in the strong summer film slate with plenty of fun", 89 | "distance": 9, 
90 | "count": 0 91 | } 92 | }, 93 | { 94 | "typo": "Can yu readthis messa ge despite thehorible sppelingmsitakes", 95 | "bigram": { 96 | "num_results": 1, 97 | "term": "can you read this message despite the horrible spelling mistakes", 98 | "distance": 10, 99 | "count": 0 100 | }, 101 | "unigram": { 102 | "num_results": 1, 103 | "term": "can you read this message despite the horrible spelling mistakes", 104 | "distance": 10, 105 | "count": 0 106 | } 107 | } 108 | ] 109 | } 110 | -------------------------------------------------------------------------------- /tests/fortests/lookup_compound_ignore_non_words_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "typo": "whereis th elove 123 hehad dated forImuch of THEPAST who couqdn'tread in SIXTHgrade and ins pired him", 5 | "bigram": { 6 | "term": "where is the love 123 he had dated for much of THEPAST who couldn't read in sixth grade and inspired him" 7 | }, 8 | "unigram": { 9 | "term": "whereas the love 123 head dated for much of THEPAST who couldn't read in sixth grade and inspired him" 10 | } 11 | }, 12 | { 13 | "typo": "in te DHIRD 1 qarter oflast jear he hadlearned ofca sekretplan", 14 | "bigram": { 15 | "term": "in the DHIRD 1 quarter of last year he had learned of a secret plan" 16 | }, 17 | "unigram": { 18 | "term": "in the DHIRD 1 quarter of last year he had learned of a secret plan" 19 | } 20 | }, 21 | { 22 | "typo": "the bigjest playrs in te stroGSOmmer film slatew ith PLETY of 12 funn", 23 | "bigram": { 24 | "term": "the biggest players in the strong summer film slate with PLETY of 12 fun" 25 | }, 26 | "unigram": { 27 | "term": "the biggest players in the strong summer film slate with PLETY of 12 fun" 28 | } 29 | }, 30 | { 31 | "typo": "Can yu readtHIS messa ge despite thehorible 1234 sppelingmsitakes", 32 | "bigram": { 33 | "term": "can you read this message despite the horrible 1234 spelling mistakes" 34 | }, 35 | "unigram": { 36 | "term": "can you read this message despite the horrible 1234 spelling mistakes" 37 | } 38 | }, 39 | { 40 | "typo": "Can yu readtHIS messa ge despite thehorible AB1234 sppelingmsitakes", 41 | "bigram": { 42 | "term": "can you read this message despite the horrible AB1234 spelling mistakes" 43 | }, 44 | "unigram": { 45 | "term": "can you read this message despite the horrible AB1234 spelling mistakes" 46 | } 47 | }, 48 | { 49 | "typo": "PI on leave, arrange Co-I to do screening", 50 | "bigram": { "term": "PI on leave arrange co i to do screening" }, 51 | "unigram": { "term": "PI on leave arrange co i to do screening" } 52 | } 53 | ] 54 | } 55 | -------------------------------------------------------------------------------- /tests/fortests/lookup_compound_replaced_words_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "typo": "whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him", 5 | "bigram": { 6 | "term": "where is the love he had dated for much of the past who couldn't read in sixth grade and inspired him", 7 | "replacement": { 8 | "whereis": "where is", 9 | "th": "the", 10 | "elove": "love", 11 | "hehad": "he had", 12 | "forimuch": "for much", 13 | "thepast": "the past", 14 | "couqdn'tread": "couldn't read", 15 | "sixthgrade": "sixth grade", 16 | "ins": "inspired" 17 | } 18 | }, 19 | "unigram": { 20 | "term": "whereas the love head dated for much of the past who couldn't read in sixth grade and inspired him", 21 
| "replacement": { 22 | "whereis": "whereas", 23 | "th": "the", 24 | "elove": "love", 25 | "hehad": "head", 26 | "forimuch": "for much", 27 | "thepast": "the past", 28 | "couqdn'tread": "couldn't read", 29 | "sixthgrade": "sixth grade", 30 | "ins": "inspired" 31 | } 32 | } 33 | }, 34 | { 35 | "typo": "in te dhird qarter oflast jear he hadlearned ofca sekretplan", 36 | "bigram": { 37 | "term": "in the third quarter of last year he had learned of a secret plan", 38 | "replacement": { 39 | "te": "the", 40 | "dhird": "third", 41 | "qarter": "quarter", 42 | "oflast": "of last", 43 | "jear": "year", 44 | "hadlearned": "had learned", 45 | "ofca": "of a", 46 | "sekretplan": "secret plan" 47 | } 48 | }, 49 | "unigram": { 50 | "term": "in the third quarter of last year he had learned of a secret plan", 51 | "replacement": { 52 | "te": "the", 53 | "dhird": "third", 54 | "qarter": "quarter", 55 | "oflast": "of last", 56 | "jear": "year", 57 | "hadlearned": "had learned", 58 | "ofca": "of a", 59 | "sekretplan": "secret plan" 60 | } 61 | } 62 | }, 63 | { 64 | "typo": "the bigjest playrs in te strogsommer film slatew ith plety of funn", 65 | "bigram": { 66 | "term": "the biggest players in the strong summer film slate with plenty of fun", 67 | "replacement": { 68 | "bigjest": "biggest", 69 | "playrs": "players", 70 | "strogsommer": "strong summer", 71 | "slatew": "slate", 72 | "ith": "with", 73 | "plety": "plenty", 74 | "funn": "fun" 75 | } 76 | }, 77 | "unigram": { 78 | "term": "the biggest players in the strong summer film slate with plenty of fun", 79 | "replacement": { 80 | "bigjest": "biggest", 81 | "playrs": "players", 82 | "strogsommer": "strong summer", 83 | "slatew": "slate", 84 | "ith": "with", 85 | "plety": "plenty", 86 | "funn": "fun" 87 | } 88 | } 89 | } 90 | ] 91 | } 92 | -------------------------------------------------------------------------------- /tests/fortests/lookup_compound_transfer_casing_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "typo": "Whereis th elove hehaD Dated forImuch of thepast who couqdn'tread in sixthgrade AND ins pired him", 5 | "bigram": { 6 | "term": "Where is the love he haD Dated for much of the past who couldn't read in sixth grade AND inspired him" 7 | }, 8 | "unigram": { 9 | "term": "Whereas the love heaD Dated for much of the past who couldn't read in sixth grade AND inspired him" 10 | } 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /tests/fortests/lookup_compound_transfer_casing_ignore_nonwords_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "typo": "Whereis th elove hehaD Dated FOREEVER forImuch of thepast who couqdn'tread in sixthgrade AND ins pired him", 5 | "bigram": { 6 | "term": "Where is the love he haD Dated FOREEVER for much of the past who couldn't read in sixth grade AND inspired him" 7 | }, 8 | "unigram": { 9 | "term": "Whereas the love heaD Dated FOREEVER for much of the past who couldn't read in sixth grade AND inspired him" 10 | } 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /tests/fortests/noisy_query_en_1000.txt: -------------------------------------------------------------------------------- 1 | te the 1 2 | aojecm project 3 3 | gutenberg gutenberg 0 4 | eboo ebook 1 5 | yof of 1 6 | adventures adventures 0 7 | sherlock sherlock 0 8 | polxs holmes 3 9 | si sir 1 10 | 
arthur arthur 0 11 | conn conan 1 12 | doyle doyle 0 13 | in in 0 14 | our our 0 15 | aeries series 1 16 | copyrgt copyright 2 17 | laws laws 0 18 | are are 0 19 | changng changing 1 20 | all all 0 21 | over over 0 22 | world world 0 23 | re sure 2 24 | to to 0 25 | check check 0 26 | qor for 1 27 | youbr your 1 28 | countrfy country 1 29 | before before 0 30 | dwnoadingg downloading 3 31 | or or 0 32 | redistributing redistributing 0 33 | ntis this 2 34 | any any 0 35 | other other 0 36 | jekler header 3 37 | shoflg should 2 38 | first first 0 39 | hng thing 2 40 | seen seen 0 41 | when when 0 42 | vkewing viewing 1 43 | cle file 2 44 | pleare please 1 45 | do do 0 46 | not not 0 47 | reovef remove 2 48 | it it 0 49 | change change 0 50 | ei edit 2 51 | ywthout without 2 52 | ritten written 1 53 | wermission permission 1 54 | rad read 1 55 | legailb legal 2 56 | smll small 1 57 | prinj print 1 58 | and and 0 59 | informatin information 1 60 | about about 0 61 | ct at 1 62 | abcttom bottom 2 63 | incluqed included 1 64 | if is 1 65 | important important 0 66 | speciic specific 1 67 | rights rights 0 68 | eetrnictions restrictions 3 69 | how how 0 70 | may may 0 71 | udsd used 2 72 | ou you 1 73 | can can 0 74 | also also 0 75 | itnd find 2 76 | ut out 1 77 | eake make 1 78 | donation donation 0 79 | get get 0 80 | nvotlved involved 2 81 | wellcome welcome 1 82 | free free 0 83 | pain plain 1 84 | vanla vanilla 2 85 | electronic electronic 0 86 | tets texts 1 87 | ebooks ebooks 0 88 | readable readable 0 89 | bt both 2 90 | umns humans 2 91 | compurters computers 1 92 | sikce since 1 93 | thseoe these 2 94 | iere were 1 95 | preard prepared 2 96 | thusans thousands 2 97 | volutrers volunteers 2 98 | tle title 2 99 | athor author 1 100 | release release 0 101 | daxe date 1 102 | marq march 2 103 | most most 0 104 | recently recently 0 105 | updayd updated 2 106 | dncovaember november 3 107 | edition edition 0 108 | lafnguage language 1 109 | engih english 2 110 | chactr character 3 111 | set set 0 112 | encoding encoding 0 113 | asci ascii 1 114 | styrt start 1 115 | additional additional 0 116 | edoing editing 2 117 | josex jose 1 118 | menendez menendez 0 119 | cntems contents 3 120 | scgndial scandal 2 121 | bohemja bohemia 1 122 | ii ii 0 123 | red red 0 124 | hdted headed 3 125 | leagm league 2 126 | iii iii 0 127 | csfe case 2 128 | identity identity 0 129 | iv iv 0 130 | boscombe boscombe 0 131 | vallejy valley 1 132 | mystery mystery 0 133 | five five 0 134 | orfnge orange 1 135 | pips pips 0 136 | vi vi 0 137 | mn man 1 138 | with with 0 139 | tisged twisted 2 140 | lip lip 0 141 | adventu adventure 2 142 | blue blue 0 143 | qcarbuncle carbuncle 1 144 | viii viii 0 145 | pmpeckld speckled 3 146 | bad band 1 147 | ix ix 0 148 | enginebr engineer 1 149 | thub thumb 1 150 | noble noble 0 151 | ahelr bachelor 3 152 | xi xi 0 153 | beryl beryl 0 154 | coixet coronet 3 155 | xii xii 0 156 | coer copper 2 157 | beeches beeches 0 158 | sge she 1 159 | aklways always 1 160 | wovmn woman 2 161 | hanve have 1 162 | setldom seldom 1 163 | herh heard 2 164 | him him 0 165 | mention mention 0 166 | her her 0 167 | ude under 2 168 | hme name 2 169 | hisw his 1 170 | eyes eyes 0 171 | ecipos eclipses 3 172 | predomiates predominates 1 173 | whole whole 0 174 | seyx sex 1 175 | ws was 1 176 | th that 2 177 | he he 0 178 | celt felt 1 179 | emtiobn emotion 2 180 | ainh akin 2 181 | love love 0 182 | ibne irene 2 183 | yadler adler 1 184 | eorgones emotions 4 185 | ne one 1 186 | pcarticulry 
particularly 3 187 | abajrrevnt abhorrent 3 188 | cold cold 0 189 | preise precise 1 190 | but but 0 191 | admirasly admirably 1 192 | oalpanced balanced 2 193 | mikd mind 1 194 | take take 0 195 | prct perfect 3 196 | reasng reasoning 3 197 | oservng observing 2 198 | mlacine machine 2 199 | has has 0 200 | as as 0 201 | ver lover 2 202 | ouqld would 2 203 | xplacd placed 2 204 | hiqslf himself 2 205 | false false 0 206 | posiin position 2 207 | nover never 1 208 | sspokep spoke 2 209 | sjofter softer 1 210 | talpsions passions 3 211 | ave save 1 212 | abe gibe 2 213 | ser sneer 2 214 | hey they 1 215 | admirbae admirable 2 216 | thingn things 1 217 | obaerver observer 1 218 | ezcvielltnt excellent 4 219 | rawijg drawing 2 220 | veigl veil 1 221 | frsm from 1 222 | men men 0 223 | qoives motives 2 224 | cions actions 2 225 | trained trained 0 226 | asvoner reasoner 3 227 | admit admit 0 228 | ch such 2 229 | intrusions intrusions 0 230 | ito into 1 231 | olwn own 1 232 | delcatee delicate 2 233 | fne finely 3 234 | fjusxed adjusted 3 235 | tepmperamet temperament 2 236 | vitroduce introduce 2 237 | dsntracting distracting 2 238 | factor factor 0 239 | wihicth which 2 240 | might might 0 241 | throw throw 0 242 | doubt doubt 0 243 | pot upon 2 244 | mentl mental 1 245 | requls results 2 246 | grit grit 0 247 | ensiiuw sensitive 4 248 | nstrumnn instrument 3 249 | crack crack 0 250 | hsgh high 1 251 | powe power 1 252 | clnses lenses 2 253 | more more 0 254 | vdisjturbing disturbing 2 255 | ezhan than 2 256 | stqngz strong 3 257 | notre nature 2 258 | yet yet 0 259 | tee there 2 260 | late late 0 261 | bubiofs dubious 2 262 | questionale questionable 1 263 | memtry memory 1 264 | hd had 1 265 | ittle little 1 266 | laiey lately 2 267 | my my 0 268 | mjrriajzbe marriage 4 269 | rifted drifted 1 270 | aaway away 1 271 | ach each 1 272 | vcympnlee complete 4 273 | happiness happiness 0 274 | home home 0 275 | enteredr centred 3 276 | interests interests 0 277 | rise rise 0 278 | uap up 1 279 | qroun around 2 280 | whoc who 1 281 | findtf finds 2 282 | maer master 2 283 | yeatabshment establishment 4 284 | sufficient sufficient 0 285 | absorb absorb 0 286 | etteantion attention 2 287 | whle while 1 288 | loatzhegd loathed 2 289 | every every 0 290 | om form 2 291 | sokcity society 2 292 | bohetmin bohemian 2 293 | souml soul 1 294 | remineu remained 2 295 | rogings lodgings 2 296 | aer baker 2 297 | trt street 3 298 | urild buried 2 299 | ang among 2 300 | old old 0 301 | books books 0 302 | aplternatinp alternating 2 303 | wek week 1 304 | ketween between 1 305 | cocainre cocaine 1 306 | ambition ambition 0 307 | drowsiness drowsiness 0 308 | dug drug 1 309 | fmieae fierce 3 310 | eergy energy 1 311 | een keen 1 312 | silr still 2 313 | ever ever 0 314 | deeply deeply 0 315 | axtrahtqed attracted 3 316 | study study 0 317 | crime crime 0 318 | ocijpied occupied 2 319 | iwmenfe immense 2 320 | ftcults faculties 3 321 | exttnordinaac extraordinary 4 322 | powers powers 0 323 | sprvation observation 3 324 | following following 0 325 | mose those 2 326 | clue clues 1 327 | cjaring clearing 2 328 | ystewies mysteries 2 329 | lzeen been 2 330 | abandoned abandoned 0 331 | hales hopeless 4 332 | ofmiciaz official 2 333 | police police 0 334 | tcim time 2 335 | come some 1 336 | vagje vague 1 337 | acocn account 3 338 | doitgs doings 1 339 | aumqmoxs summons 3 340 | dessa odessa 1 341 | trepoff trepoff 0 342 | mxumrper murder 3 343 | singular singular 0 344 | tragedpyk tragedy 2 345 | tkinson atkinson 
1 346 | xbrphers brothers 3 347 | txincoale trincomalee 3 348 | fqnally finally 1 349 | mission mission 0 350 | jcomplished accomplished 2 351 | sow so 1 352 | dctely delicately 4 353 | sucycessfully successfully 1 354 | reigning reigning 0 355 | faifl family 3 356 | honlad holland 2 357 | beyond beyond 0 358 | signh signs 1 359 | lctivity activity 1 360 | hoer however 3 361 | merely merely 0 362 | swarcd shared 2 363 | readersj readers 1 364 | daily daily 0 365 | pvqess press 2 366 | xknw knew 2 367 | former former 0 368 | fieni friend 2 369 | cjompanion companion 1 370 | night night 0 371 | twneth twentieth 3 372 | returnig returning 1 373 | joaurnhey journey 2 374 | patient patient 0 375 | gow now 1 376 | returned returned 0 377 | iil civil 2 378 | pqratice practice 2 379 | way way 0 380 | ed led 1 381 | trough through 1 382 | passed passed 0 383 | wll well 1 384 | zrembere remembered 4 385 | cdoom door 2 386 | must must 0 387 | associated associated 0 388 | woonl wooing 2 389 | da dark 2 390 | miycidents incidents 2 391 | scarfelt scarlet 2 392 | seizedh seized 1 393 | djsie desire 2 394 | se see 1 395 | again again 0 396 | know know 0 397 | eplhlying employing 3 398 | rkoos rooms 2 399 | brilliantly brilliantly 0 400 | lit lit 0 401 | eve even 1 402 | ood looked 3 403 | saw saw 0 404 | tall tall 0 405 | spae spare 1 406 | figurj figure 1 407 | pas pass 1 408 | twice twice 0 409 | silgouette silhouette 1 410 | goains against 3 411 | bind blind 1 412 | acypng pacing 3 413 | room room 0 414 | swiyky swiftly 3 415 | eerly eagerly 2 416 | had head 1 417 | sunk sunk 0 418 | cdest chest 1 419 | hands hands 0 420 | clasped clasped 0 421 | behnd behind 1 422 | mod mood 1 423 | habi habit 1 424 | attityade attitude 2 425 | mlannedr manner 2 426 | thein their 1 427 | story story 0 428 | lworke work 2 429 | qisenn risen 2 430 | coretedu created 3 431 | drjzma dreams 3 432 | hot hot 0 433 | cent scent 1 434 | ew new 1 435 | problem problem 0 436 | arasng rang 2 437 | bl bell 2 438 | showvn shown 1 439 | hamber chamber 1 440 | formely formerly 1 441 | agt part 2 442 | efsisea effusive 4 443 | gld glad 1 444 | ink think 2 445 | hrdby hardly 2 446 | wodrd word 1 447 | spoken spoken 0 448 | kikdfy kindly 2 449 | ye eye 1 450 | wave waved 1 451 | an an 0 452 | rmchair armchair 1 453 | thew threw 1 454 | acrcss across 1 455 | cars cigars 2 456 | mndicaoed indicated 2 457 | crit spirit 3 458 | gasogene gasogene 0 459 | orne corner 2 460 | then then 0 461 | stood stood 0 462 | fire fire 0 463 | irosectve introspective 4 464 | fsiin fashion 3 465 | wwdloc wedlock 2 466 | sitt suits 2 467 | rqmrkedo remarked 3 468 | wtsn watson 2 469 | seoven seven 1 470 | hf half 2 471 | pouhnds pounds 1 472 | angwerd answered 2 473 | tsneed indeed 3 474 | thought thought 0 475 | ckjust just 2 476 | trilx trifle 2 477 | fkancy fancy 1 478 | obrve observe 2 479 | dnd did 1 480 | tl tell 2 481 | intended intended 0 482 | gw go 1 483 | harnes harness 1 484 | dduce deduce 1 485 | getxin getting 2 486 | yurself yourself 1 487 | ery very 1 488 | wet wet 0 489 | cluumsy clumsy 1 490 | faralesi careless 3 491 | servaqt servant 1 492 | irl girl 1 493 | dear dear 0 494 | saxdq said 2 495 | too too 0 496 | mchg much 2 497 | certainly certainly 0 498 | burned burned 0 499 | liea lived 2 500 | fewt few 1 501 | penntduris centuries 4 502 | fgo ago 1 503 | true true 0 504 | walk walk 0 505 | thursday thursday 0 506 | came came 0 507 | dreadul dreadful 1 508 | esbs mess 2 509 | chagd changed 2 510 | clthe clothes 2 511 | imagine imagine 0 
512 | mary mary 0 513 | jane jane 0 514 | innorigile incorrigible 3 515 | wif wife 1 516 | gven given 1 517 | notice notice 0 518 | chucd chuckled 3 519 | rubbed rubbed 0 520 | long long 0 521 | nervos nervous 1 522 | otjer together 4 523 | simlicity simplicity 1 524 | iislfr itself 3 525 | inside inside 0 526 | leb left 2 527 | so shoe 2 528 | dwher where 2 529 | fireslght firelight 2 530 | strkhs strikes 2 531 | eather leather 1 532 | sored scored 1 533 | six six 0 534 | almost almost 0 535 | paalll parallel 2 536 | cuts cuts 0 537 | obviously obviously 0 538 | qausdd caused 2 539 | oxneone someone 3 540 | arelessly carelessly 1 541 | scraped scraped 0 542 | roun round 1 543 | dges edges 1 544 | sole sole 0 545 | orger order 1 546 | creusted crusted 1 547 | mudi mud 1 548 | hence hence 0 549 | adoblje double 3 550 | deduction deduction 0 551 | vse vile 2 552 | wether weather 1 553 | magnhnat malignant 4 554 | hbot boot 2 555 | plittijkg slitting 3 556 | spoecme specimen 3 557 | london london 0 558 | slfey slavey 2 559 | ief if 1 560 | eneman gentleman 3 561 | esmellzng smelling 2 562 | ioforc iodoform 3 563 | black black 0 564 | ak mark 2 565 | nitrate nitrate 0 566 | silsver silver 1 567 | righ right 1 568 | foefifgr forefinger 3 569 | blge bulge 1 570 | siydez side 2 571 | top top 0 572 | chat hat 1 573 | show show 0 574 | ecretd secreted 2 575 | stethoscope stethoscope 0 576 | dull dull 0 577 | proounce pronounce 1 578 | acivb active 2 579 | mepmber member 1 580 | vepcal medical 3 581 | profession profession 0 582 | could could 0 583 | besp help 2 584 | lalughing laughing 1 585 | eyyse ease 2 586 | emxplained explained 1 587 | process process 0 588 | hear hear 0 589 | ive give 1 590 | ueass reasons 3 591 | apears appears 1 592 | ridiculously ridiculously 0 593 | snigmle simple 3 594 | esivly easily 2 595 | yseylf myself 2 596 | athough though 1 597 | sccwssive successive 2 598 | nstance instance 1 599 | bafmlled baffled 2 600 | untl until 1 601 | explain explain 0 602 | beleve believe 1 603 | good good 0 604 | your yours 1 605 | que quite 2 606 | lightig lighting 1 607 | cgarete cigarette 2 608 | hrowwing throwing 2 609 | dowu down 1 610 | distinttion distinction 1 611 | clear clear 0 612 | emple example 2 613 | feqetly frequently 3 614 | stps steps 1 615 | xeaq lead 2 616 | hall hall 0 617 | often often 0 618 | hundrqeds hundreds 1 619 | times times 0 620 | qaly many 2 621 | don don 0 622 | oblsere observed 3 623 | poib point 2 624 | seveneene seventeen 2 625 | pecase because 2 626 | interested interested 0 627 | wobms problems 4 628 | ckougd enough 3 629 | chronicle chronicle 0 630 | two two 0 631 | rifnling trifling 2 632 | experencef experiences 2 633 | shet sheet 1 634 | thick thick 0 635 | pink pink 0 636 | tinted tinted 0 637 | noitepaper notepaper 1 638 | lyng lying 1 639 | opn open 1 640 | tbled table 2 641 | last last 0 642 | ot post 2 643 | aloud aloud 0 644 | xnotte note 2 645 | undzate undated 2 646 | either either 0 647 | saigatue signature 3 648 | wress address 3 649 | will will 0 650 | call call 0 651 | quarter quarter 0 652 | eight eight 0 653 | clock clock 0 654 | desires desires 0 655 | consl consult 2 656 | macttr matter 2 657 | deeupest deepest 1 658 | momnt moment 1 659 | recent recent 0 660 | cevice services 3 661 | yyal royal 2 662 | hpoufjs houses 3 663 | euroe europe 1 664 | safely safely 0 665 | tkrushted trusted 2 666 | mtttxs matters 3 667 | importace importance 1 668 | exaggeratbd exaggerated 1 669 | we we 0 670 | quartes quarters 1 671 | receiyd received 
2 672 | aur hour 2 673 | zmiss amiss 1 674 | viositgr visitor 2 675 | wear wear 0 676 | masik mask 1 677 | what what 0 678 | means means 0 679 | no no 0 680 | aa data 2 681 | capital capital 0 682 | moisoake mistake 2 683 | theoxise theorise 1 684 | insensibly insensibly 0 685 | begnsj begins 2 686 | twst twist 1 687 | facte facts 1 688 | uiu suit 2 689 | theories theories 0 690 | yinstewad instead 2 691 | carfult carefully 3 692 | exlaqmined examined 2 693 | writig writing 1 694 | paper paper 0 695 | wroto wrote 1 696 | pesuhably presumably 2 697 | edeavouring endeavouring 1 698 | imitare imitate 1 699 | psocsse processes 3 700 | bough bought 1 701 | row crown 2 702 | wawket packet 2 703 | pcueiarly peculiarly 2 704 | iff stiff 2 705 | ecular peculiar 2 706 | hovldi hold 2 707 | light light 0 708 | large large 0 709 | woven woven 0 710 | zextzure texture 2 711 | asked asked 0 712 | nmaker maker 1 713 | mmoogaa monogram 4 714 | raher rather 1 715 | stanks stands 1 716 | geellsckaft gesellschaft 2 717 | german german 0 718 | cpany company 2 719 | csmary customary 3 720 | ycontraon contraction 4 721 | like like 0 722 | co co 0 723 | ckurhe course 2 724 | papienr papier 1 725 | eg eg 0 726 | lev let 1 727 | glance glance 0 728 | pcotinentalk continental 3 729 | gazcetter gazetteer 2 730 | took took 0 731 | havny heavy 2 732 | brown brown 0 733 | volumea volume 1 734 | shelmves shelves 1 735 | eglow eglow 0 736 | eglonitz eglonitz 0 737 | hrje here 2 738 | egeria egria 1 739 | sefkiang speaking 3 740 | rar far 1 741 | crlsbad carlsbad 1 742 | rqarabye remarkable 4 743 | being being 0 744 | scee scene 1 745 | death death 0 746 | wallenstein wallenstein 0 747 | its its 0 748 | nmeous numerous 2 749 | gvlass glass 1 750 | qctries factories 3 751 | milsm mills 2 752 | ha ha 0 753 | body boy 1 754 | sqrkled sparkled 2 755 | sqenzt sent 2 756 | gret great 1 757 | tdrumphant triumphant 2 758 | clud cloud 1 759 | made made 0 760 | precikey precisely 2 761 | construction construction 0 762 | sentence sentence 0 763 | frenczhmmn frenchman 2 764 | rssian russian 1 765 | uncgurteous uncourteous 1 766 | erbs verbs 1 767 | onlyd only 1 768 | remains remains 0 769 | therefore therefore 0 770 | dmmscovr discover 3 771 | wqnted wanted 1 772 | writes writes 0 773 | pffrs prefers 3 774 | eainq wearing 3 775 | showig showing 1 776 | face face 0 777 | comes comes 0 778 | msstaken mistaken 1 779 | xresoe resolve 3 780 | doubts doubts 0 781 | bsharp sharp 1 782 | souno sound 1 783 | horss horses 1 784 | zuoofs hoofs 2 785 | pgrating grating 1 786 | hels wheels 2 787 | curb curb 0 788 | nollowed followed 1 789 | ul pull 2 790 | whjistled whistled 1 791 | par pair 1 792 | yes yes 0 793 | continued continued 0 794 | gancing glancing 1 795 | windo window 1 796 | nicu nice 1 797 | bvoughawmr brougham 3 798 | euties beauties 2 799 | unded hundred 2 800 | fift fifty 1 801 | tineams guineas 3 802 | apiee apiece 1 803 | mone money 1 804 | taing nothing 3 805 | ejse else 1 806 | ett better 3 807 | bit bit 0 808 | doctor doctor 0 809 | lstvy stay 2 810 | mlost lost 1 811 | boswel boswell 1 812 | proasu promises 4 813 | uinterestig interesting 2 814 | py pity 2 815 | iss miss 1 816 | ldeny client 3 817 | want want 0 818 | beht best 1 819 | sgow slow 1 820 | utdep step 2 821 | tars stairs 2 822 | pasage passage 1 823 | paued paused 1 824 | immdiately immediately 1 825 | ousi outside 3 826 | loud loud 0 827 | authoritative authoritative 0 828 | tap tap 0 829 | enered entered 1 830 | leps less 1 831 | feet feet 0 832 | inwh inches 
3 833 | hreigqhtd height 3 834 | libr limbs 2 835 | qercsuej hercules 4 836 | res dress 2 837 | richr rich 1 838 | richnesdsj richness 2 839 | eglnd england 2 840 | bd bad 1 841 | taste taste 0 842 | andsw bands 2 843 | sirakhqn astrakhan 3 844 | slashehd slashed 1 845 | sleeves sleeves 0 846 | fronts fronts 0 847 | hbreasted breasted 1 848 | cwat coat 1 849 | deep deep 0 850 | cloak cloak 0 851 | thrown thrown 0 852 | shoudrs shoulders 2 853 | gned lined 2 854 | fame flame 1 855 | txloure coloured 3 856 | slk silk 1 857 | secred secured 1 858 | nemk neck 1 859 | xbrooh brooch 2 860 | onsistea consisted 2 861 | igle single 2 862 | flaming flaming 0 863 | bdoots boots 1 864 | exended extended 1 865 | hlfwbay halfway 2 866 | adves calves 2 867 | trimeo trimmed 2 868 | tps tops 1 869 | furt fur 1 870 | cfopleted completed 2 871 | mprfpssiof impression 4 872 | baobro barbaric 4 873 | opundnzce opulence 3 874 | suggested suggested 0 875 | auppearancne appearance 2 876 | caried carried 1 877 | broaxd broad 1 878 | brimwed brimmed 1 879 | ad hand 2 880 | woe wore 1 881 | upper upper 0 882 | xtennz extending 4 883 | past past 0 884 | lchekboxs cheekbones 4 885 | vad vizard 3 886 | appaienwtly apparently 2 887 | rasod raised 2 888 | lowur lower 1 889 | appeared appeared 0 890 | hanging hanging 0 891 | ssragh straight 3 892 | cyhin chin 1 893 | suggestive suggestive 0 894 | resozlution resolution 1 895 | pushd pushed 1 896 | lengzth length 1 897 | otstinacy obstinacy 1 898 | harusch harsh 2 899 | voice voice 0 900 | srojgqy strongly 3 901 | marked marked 0 902 | accet accent 1 903 | uncamin uncertain 4 904 | pray pray 0 905 | olleeague colleague 2 906 | ocasgonaly occasionally 3 907 | cases cases 0 908 | uhgm whom 2 909 | xhonxr honour 3 910 | cokt count 2 911 | von von 0 912 | ramxm kramm 2 913 | noean nobleman 3 914 | nderman understand 4 915 | jisoretin discretion 3 916 | ettemse extreme 3 917 | eer prefer 3 918 | communicate communicate 0 919 | aone alone 1 920 | rgse rose 1 921 | caught caught 0 922 | wrist wrist 0 923 | bttk back 2 924 | chaim chair 1 925 | none none 0 926 | say say 0 927 | anyttzing anything 2 928 | sruggen shrugged 2 929 | bei begin 2 930 | incdingf binding 3 931 | atoltme absolute 4 932 | secrgcg secrecy 2 933 | years years 0 934 | end end 0 935 | pkrsent present 2 936 | weightp weight 1 937 | influence influence 0 938 | european european 0 939 | historyk history 1 940 | promisxe promise 1 941 | excbe excuse 2 942 | stracxnge strange 2 943 | augysd august 2 944 | persn person 1 945 | empmosd employs 3 946 | wises wishes 1 947 | aet agent 2 948 | nknow unknown 2 949 | onfes confess 2 950 | once once 0 951 | callvd called 1 952 | eixcdtly exactly 3 953 | aware aware 0 954 | dry dryly 2 955 | ciyrumstances circumstances 2 956 | decacw delicacy 3 957 | precaution precaution 0 958 | takn taken 1 959 | quench quench 0 960 | grow grow 0 961 | seriously seriously 0 962 | comproomutse compromise 3 963 | famiclies families 1 964 | speak speak 0 965 | lanly plainly 2 966 | impicates implicates 1 967 | hvuse house 1 968 | ormsqpek ormstein 4 969 | redotary hereditary 3 970 | kings kings 0 971 | muzmurd murmured 2 972 | settlinpg settling 1 973 | closing closing 0 974 | gancerd glanced 2 975 | appnt apparent 3 976 | surrise surprise 1 977 | layngid languid 2 978 | ljouunming lounging 3 979 | dpicuted depicted 2 980 | wnciqsive incisive 2 981 | enwergtic energetic 2 982 | xlowly slowly 1 983 | rekpewned reopened 2 984 | impaienty impatiently 2 985 | ggac gigantic 4 986 | maygty majesty 3 
987 | cndeced condescend 3 988 | state state 0 989 | le able 2 990 | adeipej advise 3 991 | sprgm sprang 3 992 | pacedp paced 1 993 | uncontrollable uncontrollable 0 994 | gitton agitation 3 995 | gxesturew gesture 2 996 | desperation desperation 0 997 | zurd hurled 3 998 | grocnd ground 1 999 | criedm cried 1 1000 | kig king 1 -------------------------------------------------------------------------------- /tests/fortests/non_en_dict.txt: -------------------------------------------------------------------------------- 1 | АБИ 10 2 | И 1 3 | Б 2 -------------------------------------------------------------------------------- /tests/fortests/separator_dict.txt: -------------------------------------------------------------------------------- 1 | the$23135851162 2 | of$13151942776 3 | abcs of$10956800 4 | aaron and$10721728 5 | and$12997637966 -------------------------------------------------------------------------------- /tests/fortests/word_segmentation_data.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": [ 3 | { 4 | "typo": "thequickbrownfoxjumpsoverthelazydog", 5 | "0": { "term": "the quick brown fox jumps over the lazy dog" } 6 | }, 7 | { 8 | "typo": "itwasabrightcolddayinaprilandtheclockswerestrikingthirteen", 9 | "0": { 10 | "term": "it was a bright cold day in april and the clocks were striking thirteen" 11 | } 12 | }, 13 | { 14 | "typo": "itwasthebestoftimesitwastheworstoftimesitwastheageofwisdomitwastheageoffoolishness", 15 | "0": { 16 | "term": "it was the best of times it was the worst of times it was the age of wisdom it was the age of foolishness" 17 | } 18 | } 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /tests/test_compatibility.py: -------------------------------------------------------------------------------- 1 | from symspellpy.helpers import null_distance_results, prefix_suffix_prep 2 | 3 | 4 | def test_null_distance_results(): 5 | assert null_distance_results(None, None, 1) == 0 6 | assert null_distance_results(None, string2=None, max_distance=1) == 0 7 | assert null_distance_results(string1=None, string2=None, max_distance=1) == 0 8 | assert null_distance_results(string_1=None, string_2=None, max_distance=1) == 0 9 | 10 | 11 | def test_prefix_suffix_prep(): 12 | assert prefix_suffix_prep("dabca", "ddca") == (2, 1, 1) 13 | assert prefix_suffix_prep("dabca", string2="ddca") == (2, 1, 1) 14 | assert prefix_suffix_prep(string1="dabca", string2="ddca") == (2, 1, 1) 15 | assert prefix_suffix_prep(string_1="dabca", string_2="ddca") == (2, 1, 1) 16 | -------------------------------------------------------------------------------- /tests/test_editdistance.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from itertools import combinations, permutations 3 | 4 | import pytest 5 | 6 | from symspellpy.abstract_distance_comparer import AbstractDistanceComparer 7 | from symspellpy.editdistance import ( 8 | DamerauOsa, 9 | DamerauOsaFast, 10 | DistanceAlgorithm, 11 | EditDistance, 12 | Levenshtein, 13 | LevenshteinFast, 14 | ) 15 | 16 | SHORT_STRING = "string" 17 | LONG_STRING = "long_string" 18 | VERY_LONG_STRING = "very_long_string" 19 | 20 | 21 | def expected_levenshtein(string_1, string_2, max_distance): 22 | max_distance = int(min(2**31 - 1, max_distance)) 23 | len_1 = len(string_1) 24 | len_2 = len(string_2) 25 | d = [[0] * (len_2 + 1) for _ in range(len_1 + 1)] 26 | for i in range(len_1 + 1): 27 | d[i][0] = i 28 | 
for i in range(len_2 + 1): 29 | d[0][i] = i 30 | for j in range(1, len_2 + 1): 31 | for i in range(1, len_1 + 1): 32 | if string_1[i - 1] == string_2[j - 1]: 33 | # no operation 34 | d[i][j] = d[i - 1][j - 1] 35 | else: 36 | d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + 1) 37 | distance = d[len_1][len_2] 38 | return distance if distance <= max_distance else -1 39 | 40 | 41 | def expected_damerau_osa(string_1, string_2, max_distance): 42 | max_distance = int(min(2**31 - 1, max_distance)) 43 | len_1 = len(string_1) 44 | len_2 = len(string_2) 45 | d = [[0] * (len_2 + 1) for _ in range(len_1 + 1)] 46 | for i in range(len_1 + 1): 47 | d[i][0] = i 48 | for i in range(len_2 + 1): 49 | d[0][i] = i 50 | for i in range(1, len_1 + 1): 51 | for j in range(1, len_2 + 1): 52 | cost = 0 if string_1[i - 1] == string_2[j - 1] else 1 53 | d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost) 54 | if ( 55 | i > 1 56 | and j > 1 57 | and string_1[i - 1] == string_2[j - 2] 58 | and string_1[i - 2] == string_2[j - 1] 59 | ): 60 | d[i][j] = min(d[i][j], d[i - 2][j - 2] + cost) 61 | distance = d[len_1][len_2] 62 | return distance if distance <= max_distance else -1 63 | 64 | 65 | class CustomDistanceComparer(AbstractDistanceComparer): 66 | def distance(self, string_1: str, string_2: str, max_distance: int) -> int: 67 | return -2 68 | 69 | 70 | @pytest.fixture( 71 | params=["damerau_osa", "levenshtein", "damerau_osa_fast", "levenshtein_fast"] 72 | ) 73 | def get_comparer(request): 74 | comparer_dict = { 75 | "damerau_osa": {"actual": DamerauOsa(), "expected": expected_damerau_osa}, 76 | "levenshtein": {"actual": Levenshtein(), "expected": expected_levenshtein}, 77 | "damerau_osa_fast": { 78 | "actual": DamerauOsaFast(), 79 | "expected": expected_damerau_osa, 80 | }, 81 | "levenshtein_fast": { 82 | "actual": LevenshteinFast(), 83 | "expected": expected_levenshtein, 84 | }, 85 | } 86 | yield ( 87 | comparer_dict[request.param]["actual"], 88 | comparer_dict[request.param]["expected"], 89 | ) 90 | 91 | 92 | @pytest.fixture( 93 | params=["damerau_osa", "levenshtein", "damerau_osa_fast", "levenshtein_fast"] 94 | ) 95 | def get_edit_distance(request): 96 | comparer_dict = { 97 | "damerau_osa": { 98 | "actual": EditDistance(DistanceAlgorithm.DAMERAU_OSA), 99 | "expected": DamerauOsa, 100 | }, 101 | "levenshtein": { 102 | "actual": EditDistance(DistanceAlgorithm.LEVENSHTEIN), 103 | "expected": Levenshtein, 104 | }, 105 | "damerau_osa_fast": { 106 | "actual": EditDistance(DistanceAlgorithm.DAMERAU_OSA_FAST), 107 | "expected": DamerauOsaFast, 108 | }, 109 | "levenshtein_fast": { 110 | "actual": EditDistance(DistanceAlgorithm.LEVENSHTEIN_FAST), 111 | "expected": LevenshteinFast, 112 | }, 113 | } 114 | yield ( 115 | comparer_dict[request.param]["actual"], 116 | comparer_dict[request.param]["expected"], 117 | ) 118 | 119 | 120 | @pytest.fixture 121 | def get_short_and_long_strings(): 122 | return [ 123 | (SHORT_STRING, None, {"null": len(SHORT_STRING), "zero": -1, "neg": -1}), 124 | (LONG_STRING, None, {"null": -1, "zero": -1, "neg": -1}), 125 | (None, SHORT_STRING, {"null": len(SHORT_STRING), "zero": -1, "neg": -1}), 126 | (None, LONG_STRING, {"null": -1, "zero": -1, "neg": -1}), 127 | (SHORT_STRING, SHORT_STRING, {"null": 0, "zero": 0, "neg": 0}), 128 | (None, None, {"null": 0, "zero": 0, "neg": 0}), 129 | ] 130 | 131 | 132 | @pytest.fixture(params=[0, 1, 3, sys.maxsize]) 133 | def get_strings(request): 134 | alphabet = "abcd" 135 | strings = [""] 136 | for i in range(1, len(alphabet) + 
1):
137 |         for combi in combinations(alphabet, i):
138 |             strings += ["".join(p) for p in permutations(combi)]
139 |     yield strings, request.param
140 |
141 |
142 | class TestEditDistance:
143 |     def test_unknown_distance_algorithm(self):
144 |         with pytest.raises(ValueError) as excinfo:
145 |             _ = EditDistance(2)
146 |         assert "unknown distance algorithm" == str(excinfo.value)
147 |
148 |     def test_missing_custom_comparer(self):
149 |         with pytest.raises(ValueError) as excinfo:
150 |             _ = EditDistance(DistanceAlgorithm.USER_PROVIDED)
151 |         assert "no comparer passed in" in str(excinfo.value)
152 |
153 |     def test_abstract_distance_comparer(self):
154 |         with pytest.raises(TypeError) as excinfo:
155 |             comparer = AbstractDistanceComparer()
156 |             _ = comparer.distance("string_1", "string_2", 10)
157 |         assert str(excinfo.value).startswith(
158 |             "Can't instantiate abstract class AbstractDistanceComparer"
159 |         )
160 |
161 |     def test_warn_when_builtin_comparer_override_custom_comparer(self):
162 |         with pytest.warns(UserWarning, match="A built-in comparer will be used.$"):
163 |             comparer = CustomDistanceComparer()
164 |             edit_distance = EditDistance(DistanceAlgorithm.LEVENSHTEIN, comparer)
165 |
166 |     def test_internal_distance_comparer(self, get_edit_distance):
167 |         edit_distance, expected = get_edit_distance
168 |         assert isinstance(edit_distance._distance_comparer, expected)
169 |
170 |     def test_comparer_match_ref(self, get_comparer, get_strings):
171 |         comparer, expected = get_comparer
172 |         strings, max_distance = get_strings
173 |
174 |         for s1 in strings:
175 |             for s2 in strings:
176 |                 assert expected(s1, s2, max_distance) == comparer.distance(
177 |                     s1, s2, max_distance
178 |                 )
179 |
180 |     def test_editdistance_use_custom_comparer(self, get_strings):
181 |         strings, max_distance = get_strings
182 |         comparer = CustomDistanceComparer()
183 |         edit_distance = EditDistance(DistanceAlgorithm.USER_PROVIDED, comparer)
184 |
185 |         for s1 in strings:
186 |             for s2 in strings:
187 |                 assert -2 == edit_distance.compare(s1, s2, max_distance)  # wrapper delegates to the custom comparer
188 |
189 |     def test_comparer_null_distance(self, get_comparer, get_short_and_long_strings):
190 |         comparer, _ = get_comparer
191 |
192 |         for s1, s2, expected in get_short_and_long_strings:
193 |             distance = comparer.distance(s1, s2, 10)
194 |             assert expected["null"] == distance
195 |
196 |     def test_comparer_negative_max_distance(
197 |         self, get_comparer, get_short_and_long_strings
198 |     ):
199 |         comparer, _ = get_comparer
200 |
201 |         for s1, s2, expected in get_short_and_long_strings:
202 |             distance = comparer.distance(s1, s2, 0)
203 |             assert expected["zero"] == distance
204 |
205 |         for s1, s2, expected in get_short_and_long_strings:
206 |             distance = comparer.distance(s1, s2, -1)  # negative max_distance
207 |             assert expected["neg"] == distance
208 |
209 |     def test_comparer_very_long_string(self, get_comparer):
210 |         comparer, _ = get_comparer
211 |         distance = comparer.distance(SHORT_STRING, VERY_LONG_STRING, 5)
212 |
213 |         assert -1 == distance
214 |
--------------------------------------------------------------------------------
/tests/test_helpers.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from symspellpy.helpers import (
4 |     case_transfer_matching,
5 |     case_transfer_similar,
6 |     is_acronym,
7 |     to_similarity,
8 | )
9 |
10 |
11 | @pytest.fixture
12 | def get_acronyms():
13 |     return [
14 |         ("ABCDE", {"default": True, "digits": True}),
15 |         ("AB12E", {"default": True, "digits": True}),
16 |         ("abcde", {"default": False, "digits": False}),
17 | ("ABCde", {"default": False, "digits": False}), 18 | ("abcDE", {"default": False, "digits": False}), 19 | ("abCDe", {"default": False, "digits": False}), 20 | ("abc12", {"default": False, "digits": True}), 21 | ("ab12e", {"default": False, "digits": True}), 22 | ] 23 | 24 | 25 | @pytest.fixture 26 | def get_similar_texts(): 27 | return [ 28 | ( 29 | "Haaw is the weeather in New York?", 30 | "how is the weather in new york?", 31 | "How is the weather in New York?", 32 | ), 33 | ("Wethr in New Yoork", "weather in new york", "Weather in New York"), 34 | ("Efthr in New Yoork", "weather in new york", "WEather in New York"), 35 | ("efthr in New Yoork", "weather in new york", "weather in New York"), 36 | ("eTr in New Yoork", "weather in new york", "weaTHEr in New York"), 37 | ("hoW eqr", "Haaaw er", "haaaW er"), 38 | ("hOW eqr", "Haaaw er", "hAAAW er"), 39 | ] 40 | 41 | 42 | class TestHelpers: 43 | def test_to_similarity(self): 44 | length = 20.0 45 | 46 | assert pytest.approx(0.7) == to_similarity(6.0, length) 47 | assert -1 == to_similarity(-1.0, length) 48 | 49 | def test_is_acronym(self, get_acronyms): 50 | for word, expected in get_acronyms: 51 | assert expected["default"] == is_acronym(word) 52 | assert expected["digits"] == is_acronym(word, True) 53 | 54 | def test_case_transfer_matching_diff_lengths(self): 55 | with pytest.raises(ValueError) as excinfo: 56 | case_transfer_matching("abc", "abcd") 57 | assert ( 58 | "'cased_text' and 'uncased_text' don't have the same length, use " 59 | "case_transfer_similar() instead" 60 | ) == str(excinfo.value) 61 | 62 | def test_case_transfer_matching(self): 63 | cased_text = "Haw is the eeather in New York?" 64 | uncased_text = "how is the weather in new york?" 65 | 66 | # the uncased_text text with the casing transferred from 67 | # the cased_text text 68 | assert "How is the weather in New York?" == case_transfer_matching( 69 | cased_text, uncased_text 70 | ) 71 | 72 | def test_case_transfer_similar_empty_wo_casing(self): 73 | cased_text = "Haw is the eeather in New York?" 
74 | uncased_text = "" 75 | 76 | assert uncased_text == case_transfer_similar(cased_text, uncased_text) 77 | 78 | def test_case_transfer_similar_empty_w_casing(self): 79 | with pytest.raises(ValueError) as excinfo: 80 | case_transfer_similar("", "abcd") 81 | assert "'cased_text' cannot be empty" == str(excinfo.value) 82 | 83 | def test_case_transfer_similar(self, get_similar_texts): 84 | for cased_text, uncased_text, expected in get_similar_texts: 85 | assert expected == case_transfer_similar(cased_text, uncased_text) 86 | -------------------------------------------------------------------------------- /tests/test_suggest_item.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from symspellpy.suggest_item import SuggestItem 4 | 5 | 6 | @pytest.fixture 7 | def suggest_item(): 8 | return SuggestItem("term", 0, 0) 9 | 10 | 11 | class TestSuggestItem: 12 | def test_invalid_equal_to(self, suggest_item): 13 | assert suggest_item.__eq__(0) is NotImplemented 14 | assert not suggest_item == 0 15 | 16 | def test_invalid_less_than(self, suggest_item): 17 | assert suggest_item.__lt__(0) is NotImplemented 18 | with pytest.raises(TypeError) as excinfo: 19 | suggest_item < 0 20 | assert "'<' not supported between instances of 'SuggestItem' and 'int'" == str( 21 | excinfo.value 22 | ) 23 | 24 | def test_suggest_item(self): 25 | si_1 = SuggestItem("asdf", 12, 34) 26 | si_2 = SuggestItem("sdfg", 12, 34) 27 | si_3 = SuggestItem("dfgh", 56, 78) 28 | 29 | assert si_1 == si_2 30 | assert si_2 != si_3 31 | 32 | assert "asdf" == si_1.term 33 | si_1.term = "qwer" 34 | assert "qwer" == si_1.term 35 | 36 | assert 34 == si_1.count 37 | si_1.count = 78 38 | assert 78 == si_1.count 39 | 40 | assert "qwer, 12, 78" == str(si_1) 41 | -------------------------------------------------------------------------------- /tests/test_symspellpy.py: -------------------------------------------------------------------------------- 1 | from io import StringIO 2 | from pathlib import Path 3 | from unittest import TestCase 4 | 5 | import pytest 6 | 7 | from symspellpy import SymSpell, Verbosity 8 | from symspellpy.abstract_distance_comparer import AbstractDistanceComparer 9 | from symspellpy.editdistance import DistanceAlgorithm, EditDistance 10 | from symspellpy.helpers import DictIO 11 | 12 | FORTESTS_DIR = Path(__file__).resolve().parent / "fortests" 13 | BAD_DICT_PATH = FORTESTS_DIR / "bad_dict.txt" 14 | BELOW_THRESHOLD_DICT_PATH = FORTESTS_DIR / "below_threshold_dict.txt" 15 | BIG_MODIFIED_PATH = FORTESTS_DIR / "big_modified.txt" 16 | BIG_WORDS_PATH = FORTESTS_DIR / "big_words.txt" 17 | NON_EN_DICT_PATH = FORTESTS_DIR / "non_en_dict.txt" 18 | SEPARATOR_DICT_PATH = FORTESTS_DIR / "separator_dict.txt" 19 | 20 | INVALID_PATH = "invalid/dictionary/path.txt" 21 | SEPARATOR = "$" 22 | 23 | 24 | @pytest.fixture 25 | def get_dictionary_stream(request): 26 | dictionary = { 27 | "the": 23135851162, 28 | "of": 13151942776, 29 | "abcs of": 10956800, 30 | "aaron and": 10721728, 31 | "and": 12997637966, 32 | "large count": 92233720368547758081, 33 | } 34 | if request.param is None: 35 | dict_stream = DictIO(dictionary) 36 | else: 37 | dict_stream = DictIO(dictionary, request.param) 38 | yield dict_stream, request.param 39 | 40 | 41 | class CustomDistanceComparer(AbstractDistanceComparer): 42 | def distance(self, string_1: str, string_2: str, max_distance: int) -> int: 43 | return 0 44 | 45 | 46 | class TestSymSpellPy: 47 | def test_negative_max_dictionary_edit_distance(self): 48 | with 
pytest.raises(ValueError) as excinfo: 49 | _ = SymSpell(-1, 3) 50 | assert "max_dictionary_edit_distance cannot be negative" == str(excinfo.value) 51 | 52 | def test_invalid_prefix_length(self): 53 | # prefix_length < 1 54 | with pytest.raises(ValueError) as excinfo: 55 | _ = SymSpell(1, 0) 56 | assert "prefix_length cannot be less than 1" == str(excinfo.value) 57 | 58 | with pytest.raises(ValueError) as excinfo: 59 | _ = SymSpell(1, -1) 60 | assert "prefix_length cannot be less than 1" == str(excinfo.value) 61 | 62 | # prefix_length <= max_dictionary_edit_distance 63 | with pytest.raises(ValueError) as excinfo: 64 | _ = SymSpell(2, 2) 65 | assert "prefix_length must be greater than max_dictionary_edit_distance" == str( 66 | excinfo.value 67 | ) 68 | 69 | def test_negative_count_threshold(self): 70 | with pytest.raises(ValueError) as excinfo: 71 | _ = SymSpell(1, 3, -1) 72 | assert "count_threshold cannot be negative" == str(excinfo.value) 73 | 74 | def test_set_distance_comparer(self): 75 | distance_comparer = EditDistance( 76 | DistanceAlgorithm.USER_PROVIDED, CustomDistanceComparer() 77 | ) 78 | sym_spell = SymSpell(distance_comparer=distance_comparer) 79 | 80 | assert distance_comparer == sym_spell.distance_comparer 81 | 82 | @pytest.mark.parametrize("symspell_short", [None, 0], indirect=True) 83 | def test_create_dictionary_entry_negative_count(self, symspell_short): 84 | assert ( 85 | symspell_short._count_threshold == 0 86 | ) == symspell_short.create_dictionary_entry("pipe", 0) 87 | assert not symspell_short.create_dictionary_entry("pipe", -1) 88 | 89 | @pytest.mark.parametrize("symspell_short", [10], indirect=True) 90 | def test_create_dictionary_entry_below_threshold(self, symspell_short): 91 | symspell_short.create_dictionary_entry("pipe", 4) 92 | assert 1 == len(symspell_short.below_threshold_words) 93 | assert 4 == symspell_short.below_threshold_words["pipe"] 94 | 95 | symspell_short.create_dictionary_entry("pipe", 4) 96 | assert 1 == len(symspell_short.below_threshold_words) 97 | assert 8 == symspell_short.below_threshold_words["pipe"] 98 | 99 | symspell_short.create_dictionary_entry("pipe", 4) 100 | assert 0 == len(symspell_short.below_threshold_words) 101 | 102 | def test_add_additional_counts_should_not_add_word_again( 103 | self, symspell_default, get_same_word_and_count 104 | ): 105 | for word, count in get_same_word_and_count: 106 | symspell_default.create_dictionary_entry(word, count) 107 | assert 1 == symspell_default.word_count 108 | 109 | def test_add_additional_counts_should_increase_count( 110 | self, symspell_default, get_same_word_and_count 111 | ): 112 | expected_count = 0 113 | for word, count in get_same_word_and_count: 114 | expected_count += count 115 | symspell_default.create_dictionary_entry(word, count) 116 | result = symspell_default.lookup(word, Verbosity.TOP) 117 | assert expected_count == result[0].count 118 | 119 | def test_load_bigram_dictionary_invalid_path(self, symspell_default): 120 | with TestCase.assertLogs("symspellpy.symspellpy.logger", level="ERROR") as cm: 121 | assert not symspell_default.load_bigram_dictionary(INVALID_PATH, 0, 2) 122 | assert ( 123 | f"Bigram dictionary file not found at {Path(INVALID_PATH)}." 
124 |             == cm.records[0].getMessage()
125 |         )
126 |
127 |     def test_loading_dictionary_from_fileobject(self, symspell_default):
128 |         with open(BIG_WORDS_PATH, "r", encoding="utf8") as infile:
129 |             assert symspell_default.create_dictionary(infile)
130 |
131 |     def test_load_bigram_dictionary_bad_dict(self, symspell_default):
132 |         assert symspell_default.load_bigram_dictionary(BAD_DICT_PATH, 0, 2)
133 |         assert 2 == len(symspell_default.bigrams)
134 |         assert 12 == symspell_default.bigrams["rtyu tyui"]
135 |         assert 13 == symspell_default.bigrams["yuio uiop"]
136 |
137 |     def test_load_bigram_dictionary_separator(self, symspell_default):
138 |         assert symspell_default.load_bigram_dictionary(
139 |             SEPARATOR_DICT_PATH, 0, 1, SEPARATOR
140 |         )
141 |         assert 5 == len(symspell_default.bigrams)
142 |         assert 23135851162 == symspell_default.bigrams["the"]
143 |         assert 13151942776 == symspell_default.bigrams["of"]
144 |         assert 10956800 == symspell_default.bigrams["abcs of"]
145 |         assert 10721728 == symspell_default.bigrams["aaron and"]
146 |         assert 12997637966 == symspell_default.bigrams["and"]
147 |
148 |     @pytest.mark.parametrize("get_dictionary_stream", [None], indirect=True)
149 |     def test_load_bigram_dictionary_stream(
150 |         self, symspell_default, get_dictionary_stream
151 |     ):
152 |         dict_stream, _ = get_dictionary_stream
153 |         assert symspell_default._load_bigram_dictionary_stream(dict_stream, 0, 2)
154 |         assert 2 == len(symspell_default.bigrams)
155 |         assert 10956800 == symspell_default.bigrams["abcs of"]
156 |         assert 10721728 == symspell_default.bigrams["aaron and"]
157 |         assert "large count" not in symspell_default.bigrams
158 |
159 |     @pytest.mark.parametrize("get_dictionary_stream", [SEPARATOR], indirect=True)
160 |     def test_load_bigram_dictionary_stream_separator(
161 |         self, symspell_default, get_dictionary_stream
162 |     ):
163 |         dict_stream, separator = get_dictionary_stream
164 |         assert symspell_default._load_bigram_dictionary_stream(
165 |             dict_stream, 0, 1, separator
166 |         )
167 |         assert 5 == len(symspell_default.bigrams)
168 |         assert 23135851162 == symspell_default.bigrams["the"]
169 |         assert 13151942776 == symspell_default.bigrams["of"]
170 |         assert 10956800 == symspell_default.bigrams["abcs of"]
171 |         assert 10721728 == symspell_default.bigrams["aaron and"]
172 |         assert 12997637966 == symspell_default.bigrams["and"]
173 |
174 |     def test_load_dictionary_invalid_path(self, symspell_default):
175 |         with TestCase.assertLogs("symspellpy.symspellpy.logger", level="ERROR") as cm:
176 |             assert not symspell_default.load_dictionary(INVALID_PATH, 0, 1)
177 |         assert (
178 |             f"Dictionary file not found at {Path(INVALID_PATH)}."
179 | == cm.records[0].getMessage() 180 | ) 181 | 182 | def test_load_dictionary_bad_dictionary(self, symspell_default): 183 | assert symspell_default.load_dictionary(BAD_DICT_PATH, 0, 1) 184 | assert 2 == symspell_default.word_count 185 | assert 10 == symspell_default.words["asdf"] 186 | assert 12 == symspell_default.words["sdfg"] 187 | 188 | def test_load_dictionary_count(self, symspell_default, dictionary_path): 189 | symspell_default.load_dictionary(dictionary_path, 0, 1) 190 | 191 | assert 82834 == symspell_default.word_count 192 | assert 676094 == symspell_default.entry_count 193 | 194 | @pytest.mark.parametrize("symspell_short", [10], indirect=True) 195 | def test_load_dictionary_below_threshold(self, symspell_short): 196 | symspell_short.load_dictionary(BELOW_THRESHOLD_DICT_PATH, 0, 1) 197 | 198 | assert 1 == len(symspell_short.below_threshold_words) 199 | assert 8 == symspell_short.below_threshold_words["below"] 200 | 201 | assert 2 == symspell_short.word_count 202 | 203 | def test_load_dictionary_separator(self, symspell_default): 204 | assert symspell_default.load_dictionary(SEPARATOR_DICT_PATH, 0, 1, SEPARATOR) 205 | assert 5 == symspell_default.word_count 206 | assert 23135851162 == symspell_default.words["the"] 207 | assert 13151942776 == symspell_default.words["of"] 208 | assert 10956800 == symspell_default.words["abcs of"] 209 | assert 10721728 == symspell_default.words["aaron and"] 210 | assert 12997637966 == symspell_default.words["and"] 211 | 212 | @pytest.mark.parametrize("get_dictionary_stream", [None], indirect=True) 213 | def test_load_dictionary_stream(self, symspell_default, get_dictionary_stream): 214 | # keys with space in them don't get parsed properly when using 215 | # the default separator=" " 216 | dict_stream, _ = get_dictionary_stream 217 | assert symspell_default._load_dictionary_stream(dict_stream, 0, 1) 218 | assert 3 == symspell_default.word_count 219 | assert 23135851162 == symspell_default.words["the"] 220 | assert 13151942776 == symspell_default.words["of"] 221 | assert 12997637966 == symspell_default.words["and"] 222 | 223 | @pytest.mark.parametrize("get_dictionary_stream", [SEPARATOR], indirect=True) 224 | def test_load_dictionary_stream_separator( 225 | self, symspell_default, get_dictionary_stream 226 | ): 227 | dict_stream, separator = get_dictionary_stream 228 | assert symspell_default._load_dictionary_stream(dict_stream, 0, 1, separator) 229 | assert 5 == symspell_default.word_count 230 | assert 23135851162 == symspell_default.words["the"] 231 | assert 13151942776 == symspell_default.words["of"] 232 | assert 10956800 == symspell_default.words["abcs of"] 233 | assert 10721728 == symspell_default.words["aaron and"] 234 | assert 12997637966 == symspell_default.words["and"] 235 | 236 | def test_load_dictionary_encoding(self, symspell_default): 237 | symspell_default.load_dictionary(NON_EN_DICT_PATH, 0, 1, encoding="utf-8") 238 | 239 | result = symspell_default.lookup("АБ", Verbosity.TOP, 2) 240 | assert 1 == len(result) 241 | assert "АБИ" == result[0].term 242 | 243 | def test_load_dictionary_from_string_io(self, symspell_default, dictionary_path): 244 | with open(dictionary_path, "r") as f: 245 | symspell_default.load_dictionary(StringIO(f.read()), 0, 1) 246 | assert 82834 == symspell_default.word_count 247 | assert 676094 == symspell_default.entry_count 248 | 249 | def test_load_dictionary_from_text_io_wrapper(self, symspell_default, dictionary_path): 250 | with open(dictionary_path, "r") as f: 251 | symspell_default.load_dictionary(f, 0, 1) 
252 | assert 82834 == symspell_default.word_count 253 | assert 676094 == symspell_default.entry_count 254 | 255 | def test_create_dictionary_invalid_path(self, symspell_default): 256 | with TestCase.assertLogs("symspellpy.symspellpy.logger", level="ERROR") as cm: 257 | assert not symspell_default.create_dictionary(INVALID_PATH) 258 | assert ( 259 | f"Corpus not found at {Path(INVALID_PATH)}." == cm.records[0].getMessage() 260 | ) 261 | 262 | def test_create_dictionary(self, symspell_default): 263 | symspell_default.create_dictionary(BIG_MODIFIED_PATH, encoding="utf-8") 264 | 265 | num_lines = 0 266 | with open(BIG_WORDS_PATH, "r") as infile: 267 | for line in infile: 268 | key, count = line.rstrip().split(" ") 269 | assert int(count) == symspell_default.words[key] 270 | num_lines += 1 271 | assert num_lines == symspell_default.word_count 272 | 273 | @pytest.mark.parametrize( 274 | "symspell_default_entry", 275 | [[("stea", 1), ("steama", 2), ("steem", 3)]], 276 | indirect=True, 277 | ) 278 | def test_delete_dictionary_entry(self, symspell_default_entry): 279 | result = symspell_default_entry.lookup("steama", Verbosity.TOP, 2) 280 | assert 1 == len(result) 281 | assert "steama" == result[0].term 282 | assert len("steama") == symspell_default_entry._max_length 283 | 284 | assert symspell_default_entry.delete_dictionary_entry("steama") 285 | assert "steama" not in symspell_default_entry.words 286 | assert len("steem") == symspell_default_entry._max_length 287 | 288 | result = symspell_default_entry.lookup("steama", Verbosity.TOP, 2) 289 | assert 1 == len(result) 290 | assert "steem" == result[0].term 291 | 292 | assert symspell_default_entry.delete_dictionary_entry("stea") 293 | assert "stea" not in symspell_default_entry.words 294 | assert len("steem") == symspell_default_entry._max_length 295 | 296 | result = symspell_default_entry.lookup("steama", Verbosity.TOP, 2) 297 | assert 1 == len(result) 298 | assert "steem" == result[0].term 299 | 300 | @pytest.mark.parametrize( 301 | "symspell_default_entry", 302 | [[("stea", 1), ("steama", 2), ("steem", 3)]], 303 | indirect=True, 304 | ) 305 | def test_delete_dictionary_entry_invalid_word(self, symspell_default_entry): 306 | result = symspell_default_entry.lookup("steama", Verbosity.TOP, 2) 307 | assert 1 == len(result) 308 | assert "steama" == result[0].term 309 | assert len("steama") == symspell_default_entry._max_length 310 | 311 | assert not symspell_default_entry.delete_dictionary_entry("steamab") 312 | result = symspell_default_entry.lookup("steama", Verbosity.TOP, 2) 313 | assert 1 == len(result) 314 | assert "steama" == result[0].term 315 | assert len("steama") == symspell_default_entry._max_length 316 | -------------------------------------------------------------------------------- /tests/test_symspellpy_edge_cases.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from symspellpy import Verbosity 4 | 5 | ENTRIES = ["baked", "ax", "lake", "", "slaked"] 6 | 7 | 8 | class TestSymSpellPyEdgeCases: 9 | @pytest.mark.parametrize("symspell_long_entry", [ENTRIES], indirect=True) 10 | def test_empty_string_has_all_short_deletes(self, symspell_long_entry): 11 | sym_spell, entries = symspell_long_entry 12 | 13 | assert len(entries[:-1]) == len(sym_spell.deletes[""]) 14 | assert all(entry in sym_spell.deletes[""] for entry in entries[:-1]) 15 | assert "abc" not in sym_spell.deletes[""] 16 | 17 | def test_split_correction_part_of_single_term_correction(self, symspell_default): 18 | 
symspell_default.create_dictionary_entry("where", 2) 19 | symspell_default.create_dictionary_entry("is", 2) 20 | symspell_default.create_dictionary_entry("whereas", 2) 21 | symspell_default._bigrams["where is"] = 10 22 | 23 | suggestions = symspell_default.lookup_compound("whereiz", 2) 24 | assert "where is" == suggestions[0].term 25 | assert 2 == suggestions[0].distance 26 | assert 10 == suggestions[0].count 27 | 28 | @pytest.mark.parametrize("symspell_long_entry", [["bank", "bink"]], indirect=True) 29 | def test_no_common_char_with_phrase(self, symspell_long_entry): 30 | sym_spell, _ = symspell_long_entry 31 | results = sym_spell.lookup("knab", Verbosity.ALL, 4) 32 | 33 | assert 2 == len(results) 34 | assert "bank" == results[0].term 35 | assert 3 == results[0].distance 36 | assert "bink" == results[1].term 37 | assert 4 == results[1].distance 38 | -------------------------------------------------------------------------------- /tests/test_symspellpy_lookup.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pytest 4 | 5 | from symspellpy import SymSpell, Verbosity 6 | 7 | 8 | @pytest.fixture 9 | def symspell_high_thres(): 10 | return SymSpell(2, 7, 10) 11 | 12 | 13 | @pytest.fixture 14 | def symspell_high_thres_flame(symspell_high_thres): 15 | symspell_high_thres.create_dictionary_entry("flame", 20) 16 | symspell_high_thres.create_dictionary_entry("flam", 1) 17 | return symspell_high_thres 18 | 19 | 20 | class TestSymSpellPyLookup: 21 | @pytest.mark.parametrize( 22 | "symspell_default_entry", 23 | [[("steama", 4), ("steamb", 6), ("steamc", 2)]], 24 | indirect=True, 25 | ) 26 | def test_deletes(self, symspell_default_entry): 27 | result = symspell_default_entry.lookup("stream", Verbosity.TOP, 2) 28 | assert 1 == len(result) 29 | assert "steamb" == result[0].term 30 | assert 6 == result[0].count 31 | assert symspell_default_entry.deletes 32 | 33 | @pytest.mark.parametrize("symspell_short", [None], indirect=True) 34 | def test_words_with_shared_prefix_should_retain_counts(self, symspell_short): 35 | symspell_short.create_dictionary_entry("pipe", 5) 36 | symspell_short.create_dictionary_entry("pips", 10) 37 | 38 | result = symspell_short.lookup("pipe", Verbosity.ALL, 1) 39 | assert 2 == len(result) 40 | assert "pipe" == result[0].term 41 | assert 5 == result[0].count 42 | assert "pips" == result[1].term 43 | assert 10 == result[1].count 44 | 45 | result = symspell_short.lookup("pips", Verbosity.ALL, 1) 46 | assert 2 == len(result) 47 | assert "pips" == result[0].term 48 | assert 10 == result[0].count 49 | assert "pipe" == result[1].term 50 | assert 5 == result[1].count 51 | 52 | result = symspell_short.lookup("pip", Verbosity.ALL, 1) 53 | assert 2 == len(result) 54 | assert "pips" == result[0].term 55 | assert 10 == result[0].count 56 | assert "pipe" == result[1].term 57 | assert 5 == result[1].count 58 | 59 | def test_add_additional_counts_should_not_overflow( 60 | self, symspell_default, get_same_word_and_count 61 | ): 62 | for i, (word, count) in enumerate(get_same_word_and_count): 63 | symspell_default.create_dictionary_entry( 64 | word, sys.maxsize - 1 if i == 0 else count 65 | ) 66 | result = symspell_default.lookup(word, Verbosity.TOP) 67 | assert (sys.maxsize - 1 if i == 0 else sys.maxsize) == result[0].count 68 | 69 | @pytest.mark.parametrize( 70 | "verbosity, num_results", 71 | [(Verbosity.TOP, 1), (Verbosity.CLOSEST, 2), (Verbosity.ALL, 3)], 72 | ) 73 | def test_verbosity_should_control_lookup_results( 74 | self, 
symspell_default, verbosity, num_results 75 | ): 76 | symspell_default.create_dictionary_entry("steam", 1) 77 | symspell_default.create_dictionary_entry("steams", 2) 78 | symspell_default.create_dictionary_entry("steem", 3) 79 | 80 | result = symspell_default.lookup("steems", verbosity, 2) 81 | assert num_results == len(result) 82 | 83 | @pytest.mark.parametrize( 84 | "symspell_default_entry", 85 | [[("steama", 4), ("steamb", 6), ("steamc", 2)]], 86 | indirect=True, 87 | ) 88 | def test_should_return_most_frequent(self, symspell_default_entry): 89 | result = symspell_default_entry.lookup("stream", Verbosity.TOP, 2) 90 | assert 1 == len(result) 91 | assert "steamb" == result[0].term 92 | assert 6 == result[0].count 93 | 94 | @pytest.mark.parametrize( 95 | "symspell_default_entry", 96 | [[("steama", 4), ("steamb", 6), ("steamc", 2)]], 97 | indirect=True, 98 | ) 99 | def test_should_find_exact_match(self, symspell_default_entry): 100 | result = symspell_default_entry.lookup("streama", Verbosity.TOP, 2) 101 | assert 1 == len(result) 102 | assert "steama" == result[0].term 103 | 104 | @pytest.mark.parametrize("term", ["paw", "awn"]) 105 | def test_should_not_return_non_word_delete(self, symspell_high_thres, term): 106 | symspell_high_thres.create_dictionary_entry("pawn", 10) 107 | result = symspell_high_thres.lookup(term, Verbosity.TOP, 0) 108 | assert not result 109 | 110 | def test_should_not_return_low_count_word(self, symspell_high_thres): 111 | symspell_high_thres.create_dictionary_entry("pawn", 1) 112 | result = symspell_high_thres.lookup("pawn", Verbosity.TOP, 0) 113 | assert not result 114 | 115 | def test_should_not_return_low_count_word_that_are_also_delete_word( 116 | self, symspell_high_thres_flame 117 | ): 118 | result = symspell_high_thres_flame.lookup("flam", Verbosity.TOP, 0) 119 | assert not result 120 | 121 | def test_max_edit_distance_too_large(self, symspell_high_thres_flame): 122 | with pytest.raises(ValueError) as excinfo: 123 | _ = symspell_high_thres_flame.lookup("flam", Verbosity.TOP, 3) 124 | assert "distance too large" == str(excinfo.value) 125 | 126 | def test_include_unknown(self, symspell_high_thres_flame): 127 | result = symspell_high_thres_flame.lookup("flam", Verbosity.TOP, 0, True) 128 | assert 1 == len(result) 129 | assert "flam" == result[0].term 130 | 131 | def test_avoid_exact_match_early_exit(self, symspell_high_thres_flame): 132 | result = symspell_high_thres_flame.lookup( 133 | "24th", Verbosity.ALL, 2, ignore_token=r"\d{2}\w*\b" 134 | ) 135 | assert 1 == len(result) 136 | assert "24th" == result[0].term 137 | 138 | def test_should_replicate_noisy_results( 139 | self, dictionary_path, query_path, symspell_default 140 | ): 141 | symspell_default.load_dictionary(dictionary_path, 0, 1) 142 | 143 | with open(query_path, "r") as infile: 144 | test_phrases = [ 145 | parts[0] 146 | for parts in map(lambda x: x.strip().split(), infile.readlines()) 147 | if len(parts) >= 2 148 | ] 149 | 150 | result_sum = 0 151 | for phrase in test_phrases: 152 | result_sum += len(symspell_default.lookup(phrase, Verbosity.CLOSEST, 2)) 153 | 154 | assert 4955 == result_sum 155 | 156 | @pytest.mark.parametrize( 157 | "symspell_default_entry, typo, correction", 158 | [ 159 | ([("steam", 4)], "Stream", "Steam"), 160 | ([("steam", 4)], "StreaM", "SteaM"), 161 | ([("steam", 4)], "STREAM", "STEAM"), 162 | ([("i", 4)], "I", "I"), 163 | ], 164 | indirect=["symspell_default_entry"], 165 | ) 166 | def test_transfer_casing(self, symspell_default_entry, typo, correction): 167 | result = 
symspell_default_entry.lookup( 168 | typo, Verbosity.TOP, 2, transfer_casing=True 169 | ) 170 | assert correction == result[0].term 171 | -------------------------------------------------------------------------------- /tests/test_symspellpy_lookup_compound.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | 4 | class TestSymSpellPyLookupCompound: 5 | @pytest.mark.parametrize( 6 | "symspell_default_load, get_fortests_data", 7 | [ 8 | ("bigram", "lookup_compound_data.json"), 9 | ("unigram", "lookup_compound_data.json"), 10 | ], 11 | indirect=True, 12 | ) 13 | def test_lookup_compound(self, symspell_default_load, get_fortests_data): 14 | sym_spell, dictionary = symspell_default_load 15 | for entry in get_fortests_data: 16 | results = sym_spell.lookup_compound(entry["typo"], 2) 17 | assert entry[dictionary]["num_results"] == len(results) 18 | assert entry[dictionary]["term"] == results[0].term 19 | assert entry[dictionary]["distance"] == results[0].distance 20 | assert entry[dictionary]["count"] == results[0].count 21 | 22 | @pytest.mark.parametrize( 23 | "symspell_default_entry", [[("steam", 1), ("machine", 1)]], indirect=True 24 | ) 25 | def test_lookup_compound_only_combi(self, symspell_default_entry): 26 | typo = "ste am machie" 27 | correction = "steam machine" 28 | results = symspell_default_entry.lookup_compound(typo, 2) 29 | assert 1 == len(results) 30 | assert correction == results[0].term 31 | 32 | @pytest.mark.parametrize( 33 | "symspell_default_entry", [[("steam", 1), ("machine", 1)]], indirect=True 34 | ) 35 | def test_lookup_compound_no_suggestion(self, symspell_default_entry): 36 | typo = "qwer erty ytui a" 37 | results = symspell_default_entry.lookup_compound(typo, 2) 38 | assert 1 == len(results) 39 | assert typo == results[0].term 40 | 41 | @pytest.mark.parametrize( 42 | "symspell_default_load, get_fortests_data", 43 | [ 44 | ("bigram", "lookup_compound_replaced_words_data.json"), 45 | ("unigram", "lookup_compound_replaced_words_data.json"), 46 | ], 47 | indirect=True, 48 | ) 49 | def test_lookup_compound_replaced_words( 50 | self, symspell_default_load, get_fortests_data 51 | ): 52 | sym_spell, dictionary = symspell_default_load 53 | num_replaced_words = 0 54 | for entry in get_fortests_data: 55 | num_replaced_words += len(entry[dictionary]["replacement"]) 56 | results = sym_spell.lookup_compound(entry["typo"], 2) 57 | assert num_replaced_words == len(sym_spell.replaced_words) 58 | assert entry[dictionary]["term"] == results[0].term 59 | for k, v in entry[dictionary]["replacement"].items(): 60 | assert v == sym_spell.replaced_words[k].term 61 | 62 | @pytest.mark.parametrize( 63 | "symspell_default_load, get_fortests_data", 64 | [ 65 | ("bigram", "lookup_compound_ignore_non_words_data.json"), 66 | ("unigram", "lookup_compound_ignore_non_words_data.json"), 67 | ], 68 | indirect=True, 69 | ) 70 | def test_lookup_compound_ignore_non_words( 71 | self, symspell_default_load, get_fortests_data 72 | ): 73 | sym_spell, dictionary = symspell_default_load 74 | for entry in get_fortests_data: 75 | results = sym_spell.lookup_compound(entry["typo"], 2, True) 76 | assert 1 == len(results) 77 | assert entry[dictionary]["term"] == results[0].term 78 | 79 | @pytest.mark.parametrize( 80 | "symspell_default_load", ["bigram", "unigram"], indirect=True 81 | ) 82 | def test_lookup_compound_ignore_non_words_ignore_digits( 83 | self, symspell_default_load 84 | ): 85 | sym_spell, _ = symspell_default_load 86 | 87 | typo = "is the officeon 
1st floor oepn 24/7" 88 | correction = "is the office on 1st floor open 24/7" 89 | results = sym_spell.lookup_compound( 90 | typo, 91 | 2, 92 | True, 93 | split_by_space=True, 94 | ignore_term_with_digits=True, 95 | ) 96 | assert 1 == len(results) 97 | assert correction == results[0].term 98 | assert 2 == results[0].distance 99 | assert 0 == results[0].count 100 | 101 | @pytest.mark.parametrize( 102 | "symspell_default_load, get_fortests_data", 103 | [ 104 | ("bigram", "lookup_compound_transfer_casing_data.json"), 105 | ("unigram", "lookup_compound_transfer_casing_data.json"), 106 | ], 107 | indirect=True, 108 | ) 109 | def test_lookup_compound_transfer_casing( 110 | self, symspell_default_load, get_fortests_data 111 | ): 112 | sym_spell, dictionary = symspell_default_load 113 | for entry in get_fortests_data: 114 | results = sym_spell.lookup_compound(entry["typo"], 2, transfer_casing=True) 115 | assert entry[dictionary]["term"] == results[0].term 116 | 117 | @pytest.mark.parametrize( 118 | "symspell_default_load, get_fortests_data", 119 | [ 120 | ("bigram", "lookup_compound_transfer_casing_ignore_nonwords_data.json"), 121 | ("unigram", "lookup_compound_transfer_casing_ignore_nonwords_data.json"), 122 | ], 123 | indirect=True, 124 | ) 125 | def test_lookup_compound_transfer_casing_ignore_nonwords( 126 | self, symspell_default_load, get_fortests_data 127 | ): 128 | sym_spell, dictionary = symspell_default_load 129 | for entry in get_fortests_data: 130 | results = sym_spell.lookup_compound(entry["typo"], 2, True, True) 131 | assert entry[dictionary]["term"] == results[0].term 132 | -------------------------------------------------------------------------------- /tests/test_symspellpy_pickle.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pickle 3 | from unittest import TestCase 4 | 5 | import pytest 6 | 7 | from symspellpy import SymSpell 8 | 9 | 10 | class TestSymSpellPyPickle: 11 | @pytest.mark.parametrize( 12 | "symspell_default_load, is_compressed", 13 | [("unigram", True), ("bigram", True), ("unigram", False), ("bigram", False)], 14 | indirect=["symspell_default_load"], 15 | ) 16 | def test_pickle(self, pickle_path, symspell_default_load, is_compressed): 17 | sym_spell, _ = symspell_default_load 18 | sym_spell.save_pickle(pickle_path, is_compressed) 19 | 20 | sym_spell_2 = SymSpell(123, 456, 789) 21 | 22 | assert sym_spell._count_threshold != sym_spell_2._count_threshold 23 | assert ( 24 | sym_spell._max_dictionary_edit_distance 25 | != sym_spell_2._max_dictionary_edit_distance 26 | ) 27 | assert sym_spell._prefix_length != sym_spell_2._prefix_length 28 | 29 | with TestCase.assertLogs("symspellpy.symspellpy.logger", level="WARNING") as cm: 30 | sym_spell_2.load_pickle(pickle_path, is_compressed) 31 | assert ( 32 | "Loading data which was created using different ('count_threshold', " 33 | "'max_dictionary_edit_distance', 'prefix_length') settings. Overwriting " 34 | "current SymSpell instance with loaded settings ..." 
35 | ) == cm.records[0].getMessage() 36 | assert sym_spell.below_threshold_words == sym_spell_2.below_threshold_words 37 | assert sym_spell.bigrams == sym_spell_2.bigrams 38 | assert sym_spell.deletes == sym_spell_2.deletes 39 | assert sym_spell.words == sym_spell_2.words 40 | assert sym_spell._max_length == sym_spell_2._max_length 41 | assert sym_spell._count_threshold == sym_spell_2._count_threshold 42 | assert ( 43 | sym_spell._max_dictionary_edit_distance 44 | == sym_spell_2._max_dictionary_edit_distance 45 | ) 46 | assert sym_spell._prefix_length == sym_spell_2._prefix_length 47 | os.remove(pickle_path) 48 | 49 | @pytest.mark.parametrize( 50 | "symspell_default_load, is_compressed", 51 | [("unigram", True), ("bigram", True), ("unigram", False), ("bigram", False)], 52 | indirect=["symspell_default_load"], 53 | ) 54 | def test_pickle_same_settings( 55 | self, pickle_path, symspell_default_load, is_compressed 56 | ): 57 | sym_spell, _ = symspell_default_load 58 | sym_spell.save_pickle(pickle_path, is_compressed) 59 | 60 | sym_spell_2 = SymSpell() 61 | sym_spell_2.load_pickle(pickle_path, is_compressed) 62 | 63 | assert sym_spell.below_threshold_words == sym_spell_2.below_threshold_words 64 | assert sym_spell.bigrams == sym_spell_2.bigrams 65 | assert sym_spell.deletes == sym_spell_2.deletes 66 | assert sym_spell.words == sym_spell_2.words 67 | assert sym_spell._max_length == sym_spell_2._max_length 68 | assert sym_spell._count_threshold == sym_spell_2._count_threshold 69 | assert ( 70 | sym_spell._max_dictionary_edit_distance 71 | == sym_spell_2._max_dictionary_edit_distance 72 | ) 73 | assert sym_spell._prefix_length == sym_spell_2._prefix_length 74 | os.remove(pickle_path) 75 | 76 | @pytest.mark.parametrize( 77 | "symspell_default_load", ["unigram", "bigram"], indirect=True 78 | ) 79 | def test_pickle_bytes(self, symspell_default_load): 80 | sym_spell, _ = symspell_default_load 81 | sym_spell_2 = SymSpell(123, 456, 789) 82 | 83 | assert sym_spell._count_threshold != sym_spell_2._count_threshold 84 | assert ( 85 | sym_spell._max_dictionary_edit_distance 86 | != sym_spell_2._max_dictionary_edit_distance 87 | ) 88 | assert sym_spell._prefix_length != sym_spell_2._prefix_length 89 | 90 | with TestCase.assertLogs("symspellpy.symspellpy.logger", level="WARNING") as cm: 91 | sym_spell_2.load_pickle( 92 | sym_spell.save_pickle(to_bytes=True), from_bytes=True 93 | ) 94 | assert ( 95 | "Loading data which was created using different ('count_threshold', " 96 | "'max_dictionary_edit_distance', 'prefix_length') settings. Overwriting " 97 | "current SymSpell instance with loaded settings ..." 
98 | ) == cm.records[0].getMessage() 99 | assert sym_spell.below_threshold_words == sym_spell_2.below_threshold_words 100 | assert sym_spell.bigrams == sym_spell_2.bigrams 101 | assert sym_spell.deletes == sym_spell_2.deletes 102 | assert sym_spell.words == sym_spell_2.words 103 | assert sym_spell._max_length == sym_spell_2._max_length 104 | assert sym_spell._count_threshold == sym_spell_2._count_threshold 105 | assert ( 106 | sym_spell._max_dictionary_edit_distance 107 | == sym_spell_2._max_dictionary_edit_distance 108 | ) 109 | assert sym_spell._prefix_length == sym_spell_2._prefix_length 110 | 111 | def test_pickle_invalid(self, pickle_path, symspell_default): 112 | pickle_data = {"deletes": {}, "words": {}, "max_length": 0, "data_version": -1} 113 | with open(pickle_path, "wb") as f: 114 | pickle.dump(pickle_data, f) 115 | assert not symspell_default.load_pickle(pickle_path, False) 116 | os.remove(pickle_path) 117 | 118 | pickle_data = {"deletes": {}, "words": {}, "max_length": 0} 119 | with open(pickle_path, "wb") as f: 120 | pickle.dump(pickle_data, f) 121 | assert not symspell_default.load_pickle(pickle_path, False) 122 | os.remove(pickle_path) 123 | -------------------------------------------------------------------------------- /tests/test_symspellpy_word_segmentation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from symspellpy import SymSpell 4 | 5 | 6 | @pytest.fixture 7 | def symspell_edit_distance_load(dictionary_path, request): 8 | sym_spell = SymSpell(request.param) 9 | sym_spell.load_dictionary(dictionary_path, 0, 1) 10 | return sym_spell, request.param 11 | 12 | 13 | class TestSymSpellPyWordSegmentation: 14 | @pytest.mark.parametrize("symspell_default_load", ["unigram"], indirect=True) 15 | def test_word_segmentation_ignore_token(self, symspell_default_load): 16 | sym_spell, _ = symspell_default_load 17 | typo = "24th december" 18 | result = sym_spell.word_segmentation(typo, ignore_token=r"\d{2}\w*\b") 19 | assert typo == result.corrected_string 20 | 21 | @pytest.mark.parametrize( 22 | "symspell_edit_distance_load, get_fortests_data, with_arguments, capitalize", 23 | [ 24 | (0, "word_segmentation_data.json", False, False), 25 | (0, "word_segmentation_data.json", True, False), 26 | (0, "word_segmentation_data.json", False, True), 27 | ], 28 | indirect=["symspell_edit_distance_load", "get_fortests_data"], 29 | ) 30 | def test_word_segmentation( 31 | self, 32 | symspell_edit_distance_load, 33 | get_fortests_data, 34 | with_arguments, 35 | capitalize, 36 | ): 37 | sym_spell, edit_distance = symspell_edit_distance_load 38 | for entry in get_fortests_data: 39 | if capitalize: 40 | typo = entry["typo"].capitalize() 41 | correction = entry[str(edit_distance)]["term"].capitalize() 42 | else: 43 | typo = entry["typo"] 44 | correction = entry[str(edit_distance)]["term"] 45 | if with_arguments: 46 | result = sym_spell.word_segmentation(typo, edit_distance, 11) 47 | else: 48 | result = sym_spell.word_segmentation(typo) 49 | assert correction == result.corrected_string 50 | 51 | @pytest.mark.parametrize("symspell_edit_distance_load", [0], indirect=True) 52 | def test_word_segmentation_apostrophe(self, symspell_edit_distance_load): 53 | sym_spell, _ = symspell_edit_distance_load 54 | 55 | typo = "There'resomewords" 56 | correction = "There' re some words" 57 | result = sym_spell.word_segmentation(typo) 58 | assert correction == result[1] 59 | 60 | @pytest.mark.parametrize("symspell_edit_distance_load", [0], indirect=True) 
61 | def test_word_segmentation_ligature(self, symspell_edit_distance_load): 62 | sym_spell, _ = symspell_edit_distance_load 63 | 64 | typo = "Therearesomescientiﬁcwords" 65 | correction = "There are some scientific words" 66 | result = sym_spell.word_segmentation(typo) 67 | assert correction == result[1] 68 | --------------------------------------------------------------------------------
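The test suite above exercises the whole public surface of SymSpell: lookup for single-token correction, lookup_compound for multi-token phrases, word_segmentation for run-together text, and save_pickle/load_pickle for persistence. For orientation, here is a minimal usage sketch of that same API against the frequency dictionaries bundled under symspellpy/ in the tree above. The sample input strings are illustrative assumptions rather than fixture data, and the importlib.resources path lookup assumes a regular on-disk install of the package.

from importlib.resources import files

from symspellpy import SymSpell, Verbosity

# Defaults mirrored by most fixtures: max edit distance 2, prefix length 7.
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Unigram dictionary: term in column 0, count in column 1.
sym_spell.load_dictionary(
    files("symspellpy") / "frequency_dictionary_en_82_765.txt", 0, 1
)
# Bigram dictionary: the two-word term spans columns 0-1, so the count is column 2.
sym_spell.load_bigram_dictionary(
    files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt", 0, 2
)

# Single-token correction; Verbosity.CLOSEST keeps all suggestions at the
# smallest edit distance found, ordered by frequency.
print(sym_spell.lookup("memebers", Verbosity.CLOSEST, max_edit_distance=2)[0].term)

# Whole-phrase correction, including splitting and merging of tokens.
print(sym_spell.lookup_compound("whereis th elove", max_edit_distance=2)[0].term)

# Reinsert spaces into run-together text; word_segmentation returns a
# Composition namedtuple, so result[1] in the tests above is .corrected_string.
print(sym_spell.word_segmentation("thequickbrownfoxjumpsoverthelazydog").corrected_string)

Each lookup call returns SuggestItem objects ordered by distance and then count, which is the ordering the asserts in the tests above pin down.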