├── .coveragerc
├── .git-blame-ignore-revs
├── .gitattributes
├── .github
│   ├── FUNDING.yml
│   ├── dependabot.yml
│   └── workflows
│       ├── publish.yml
│       ├── tests.yml
│       └── weekly.yml
├── .gitignore
├── .readthedocs.yaml
├── CHANGELOG.md
├── INSTALL.rst
├── LICENSE
├── README.md
├── docs
│   ├── Makefile
│   ├── _templates
│   │   └── layout.html
│   ├── api
│   │   ├── abstract_distance_comparer.rst
│   │   ├── editdistance.rst
│   │   ├── helpers.rst
│   │   ├── index.rst
│   │   └── symspellpy.rst
│   ├── conf.py
│   ├── examples
│   │   ├── custom_distance_comparer.rst
│   │   ├── dictionary.rst
│   │   ├── index.rst
│   │   ├── lookup.rst
│   │   ├── lookup_compound.rst
│   │   └── word_segmentation.rst
│   ├── index.rst
│   ├── make.bat
│   ├── requirements.txt
│   └── users
│       └── installing.rst
├── pyproject.toml
├── requirements.txt
├── symspellpy
│   ├── __init__.py
│   ├── abstract_distance_comparer.py
│   ├── composition.py
│   ├── editdistance.py
│   ├── frequency_bigramdictionary_en_243_342.txt
│   ├── frequency_dictionary_en_82_765.txt
│   ├── helpers.py
│   ├── logging.py
│   ├── pickle_mixin.py
│   ├── suggest_item.py
│   ├── symspellpy.py
│   └── verbosity.py
└── tests
    ├── __init__.py
    ├── benchmarks.ipynb
    ├── conftest.py
    ├── fortests
    │   ├── bad_dict.txt
    │   ├── below_threshold_dict.txt
    │   ├── big_modified.txt
    │   ├── big_words.txt
    │   ├── lookup_compound_data.json
    │   ├── lookup_compound_ignore_non_words_data.json
    │   ├── lookup_compound_replaced_words_data.json
    │   ├── lookup_compound_transfer_casing_data.json
    │   ├── lookup_compound_transfer_casing_ignore_nonwords_data.json
    │   ├── noisy_query_en_1000.txt
    │   ├── non_en_dict.txt
    │   ├── separator_dict.txt
    │   └── word_segmentation_data.json
    ├── test_compatibility.py
    ├── test_editdistance.py
    ├── test_helpers.py
    ├── test_suggest_item.py
    ├── test_symspellpy.py
    ├── test_symspellpy_edge_cases.py
    ├── test_symspellpy_lookup.py
    ├── test_symspellpy_lookup_compound.py
    ├── test_symspellpy_pickle.py
    └── test_symspellpy_word_segmentation.py
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | branch = true
3 | source = symspellpy
4 |
5 | [report]
6 | exclude_lines =
7 | pragma: no cover
8 |
--------------------------------------------------------------------------------
/.git-blame-ignore-revs:
--------------------------------------------------------------------------------
1 | # format all
2 | b0abc5ed3a37b05848ca1e2de790321d7c07fd75
3 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.py eol=lf
2 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | # These are supported funding model platforms
2 | github: mammothb
3 | ko_fi: mammothb
4 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 |
6 | version: 2
7 | updates:
8 | - package-ecosystem: "github-actions"
9 | directory: "/" # Location of package manifests
10 | schedule:
11 | interval: "weekly"
12 | day: "friday"
13 |
--------------------------------------------------------------------------------
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | name: Publish to TestPyPI and PyPI
2 |
3 | on:
4 | workflow_dispatch:
5 | release:
6 | types: [published]
7 |
8 | jobs:
9 | publish-test-pypi:
10 | name: Build and publish to TestPyPI
11 | runs-on: ubuntu-latest
12 | steps:
13 | - uses: actions/checkout@v4
14 |
15 | - name: Set up Python 3.10
16 | uses: actions/setup-python@v5.6.0
17 | with:
18 | python-version: "3.10"
19 |
20 | - name: Build
21 | run: |
22 | echo "Building ..."
23 | python -m pip install --upgrade pip
24 | python -m pip install build
25 | python -m build
26 |
27 | - name: Publish to TestPyPI
28 | uses: pypa/gh-action-pypi-publish@v1.12.4
29 | with:
30 | user: __token__
31 | password: ${{ secrets.TEST_PYPI_API_TOKEN }}
32 | repository-url: https://test.pypi.org/legacy/
33 |
34 | - name: Publish to PyPI
35 | if: github.event_name == 'release' && startsWith(github.ref, 'refs/tags/v')
36 | uses: pypa/gh-action-pypi-publish@v1.12.4
37 | with:
38 | user: __token__
39 | password: ${{ secrets.PYPI_API_TOKEN }}
40 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 | name: Tests
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | test:
7 | name: "Python ${{ matrix.python-version }} on ${{ matrix.os }}"
8 | runs-on: ${{ matrix.os }}
9 | environment: Development
10 |
11 | strategy:
12 | matrix:
13 | os: [ubuntu-latest]
14 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
15 |
16 | steps:
17 | - uses: actions/checkout@v4
18 |
19 | - name: Set up Python ${{ matrix.python-version }}
20 | uses: actions/setup-python@v5.6.0
21 | with:
22 | python-version: ${{ matrix.python-version }}
23 |
24 | - name: Install dependencies
25 | run: |
26 | python -m pip install --upgrade pip
27 | pip install -r requirements.txt
28 |
29 | - name: Run pytest
30 | run: python -m pytest --cov-report=xml --cov=symspellpy
31 |
32 | - name: Upload code coverage
33 | uses: codecov/codecov-action@v5
34 | with:
35 | token: ${{ secrets.CODECOV_TOKEN }}
36 |
--------------------------------------------------------------------------------
/.github/workflows/weekly.yml:
--------------------------------------------------------------------------------
1 | name: Weekly Tests
2 |
3 | on:
4 | workflow_dispatch:
5 | schedule:
6 | # Runs every friday
7 | - cron: "0 0 * * 5"
8 |
9 | jobs:
10 | test:
11 | name: "Python ${{ matrix.python-version }} on ${{ matrix.os }}"
12 | runs-on: ${{ matrix.os }}
13 |
14 | strategy:
15 | fail-fast: false
16 | matrix:
17 | os: [ubuntu-latest, macos-latest, windows-latest]
18 | python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"]
19 |
20 | steps:
21 | - uses: actions/checkout@v4
22 |
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v5.6.0
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 |
28 | - name: Install dependencies
29 | run: |
30 | python -m pip install --upgrade pip
31 | pip install -r requirements.txt -v
32 |
33 | - name: Run pytest
34 | run: python -m pytest
35 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 | .vscode/
106 |
107 | # PyCharm files
108 | .idea/
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # .readthedocs.yaml
2 | # Read the Docs configuration file
3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
4 |
5 | # Required
6 | version: 2
7 |
8 | # Set the version of Python and other tools you might need
9 | build:
10 | os: ubuntu-24.04
11 | tools:
12 | python: "3.13"
13 |
14 | # Build documentation in the docs/ directory with Sphinx
15 | sphinx:
16 | configuration: docs/conf.py
17 |
18 | # Optionally declare the Python requirements required to build your docs
19 | python:
20 | install:
21 | - requirements: docs/requirements.txt
22 |
--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | CHANGELOG
2 | ==============
3 |
4 | ## 6.9.0 (2025-03-09)
5 |
6 | - Specify that frequency count must be 64-bit int [#180](https://github.com/mammothb/symspellpy/pull/180)
7 | - Rename `string1` and `string2` argument names [#181](https://github.com/mammothb/symspellpy/pull/181)
8 |
9 | ## 6.8.0 (2025-03-09)
10 | - Allow file object as corpus of load_dictionary [#176](https://github.com/mammothb/symspellpy/pull/176)
11 | - Bump supported Python version to 3.9 - 3.13 [#177](https://github.com/mammothb/symspellpy/pull/177)
12 |
13 | ## 6.7.8 (2024-08-31)
14 | - Handle encoding errors [#149](https://github.com/mammothb/symspellpy/pull/149)
15 | - Bump supported Python version to 3.8 - 3.12 [#151](https://github.com/mammothb/symspellpy/pull/151)
16 | - Remove numpy dependency [#156](https://github.com/mammothb/symspellpy/pull/156)
17 | - Feature: distance comparer interface [#159](https://github.com/mammothb/symspellpy/pull/159)
18 |
19 | ## 6.7.7 (2022-10-24)
20 | - Remove support for Python 3.6
21 | - Use compiled regex expression in `create_dictionary()` ([#129](https://github.com/mammothb/symspellpy/pull/129))
22 | - Configure module logger instead of modifying root logger ([#132](https://github.com/mammothb/symspellpy/pull/132), [#133](https://github.com/mammothb/symspellpy/pull/133))
23 |
24 | ## 6.7.6 (2021-12-19)
25 | - Fix suggestion `count` in `lookup_compound` when `ignore_words=True` ([#108](https://github.com/mammothb/symspellpy/pull/108))
26 | - Log error message when loading dictionary fails ([#109](https://github.com/mammothb/symspellpy/pull/109))
27 |
28 | ## 6.7.5 (2021-12-02)
29 | - Fix `replaced_words` not being updated when best match is a combi (closes [#103](https://github.com/mammothb/symspellpy/issues/103))
30 | - Implement a way to change the edit distance comparer algorithm via the `distance_algorithm` property. Available values are found in [`DistanceAlgorithm`](https://symspellpy.readthedocs.io/en/latest/api/editdistance.html#symspellpy.editdistance.DistanceAlgorithm)
31 |
32 | ## 6.7.4 (2021-11-29)
33 | - Update `editdistpy` dependency version
34 | - Update `LevenshteinFast` and `DamerauOsaFast` to match the functionality of the `editdistpy` library
35 |
36 | ## 6.7.3 (2021-11-27)
37 | - Update `editdistpy` dependency version
38 |
39 | ## 6.7.2 (2021-11-25)
40 | - Fix typo of Dameruau to Damerau in various places. Can potentially break some setups that explicitly set `_distance_algorithm`
41 | - Implement fast distance comparers with [editdistpy](https://github.com/mammothb/editdistpy)
42 | - Set `DamerauOsaFast` as the default distance comparer
43 |
44 | ## 6.7.1 (2021-11-21)
45 | - Updated `frequency_dictionary_en_82_765.txt` dictionary with common contractions
46 | - Added `_below_threshold_words`, `_bigrams`, `_count_threshold`, `_max_dictionary_edit_distance`, and `_prefix_length` when saving to pickle. (closes [#93](https://github.com/mammothb/symspellpy/issues/93))
47 | - Implemented `to_bytes` and `from_bytes` options to save and load pickle with bytes string
48 | - Updated data_version to 3
49 | - Removed Python 3.4 and Python 3.5 support
50 |
51 | ## 6.7.0 (2020-08-28)
52 | - Removed numpy dependency
53 | - `word_segmentation` now preserves case.
54 | - `word_segmentation` now keeps punctuation or apostrophe adjacent to previous
55 | word.
56 | - `word_segmentation` now normalizes ligatures: "scientiﬁc" -> "scientific".
57 | - `word_segmentation` now removes hyphens prior to word segmentation
58 | (untested).
59 | - American English word forms added to dictionary in addition to British
60 | English e.g. favourable & favorable.
61 |
62 | ## 6.5.2 (2019-10-23)
63 | - Modified `load_bigram_dictionary` to allow dictionary entries to be split
64 | into only 2 parts when using a custom separator
65 | - Added dictionary files to wheels so `pkg_resources` could be used to access
66 | them
67 |
68 | ## 6.5.1 (2019-10-08)
69 | - Added `separator` argument to allow user to choose custom separator for `load_dictionary`
70 |
71 | ## 6.5.0 (2019-09-21)
72 | - Added `load_bigram_dictionary` and bigram dictionary `frequency_bigramdictionary_en_243_342.txt`
73 | - Updated `lookup_compound` algorithm
74 | - Added `Levenshtein` to compute edit distance
75 | - Added `save_pickle_stream` and `load_pickle_stream` to save/load SymSpell data alongside other structure (contribution by [marcoffee](https://github.com/marcoffee))
76 |
77 | ## 6.3.9 (2019-08-06)
78 | - Added `transfer_casing` to `lookup` and `lookup_compound`
79 | - Fixed prefix length check in `_edits_prefix`
80 |
81 | ## 6.3.8 (2019-03-21)
82 | - Implemented `delete_dictionary_entry`
83 | - Improved performance by using python builtin hashing
84 | - Added versioning of the pickle
85 |
86 | ## 6.3.7 (2019-02-18)
87 | - Fixed `include_unknown` in `lookup`
88 | - Removed unused `initial_capacity` argument
89 | - Improved `_get_str_hash` performance
90 | - Implemented `save_pickle` and `load_pickle` to avoid having to create the
91 | dictionary every time
92 |
93 | ## 6.3.6 (2019-02-11)
94 | - Added `create_dictionary()` feature
95 |
96 | ## 6.3.5 (2019-01-14)
97 | - Fixed `lookup_compound()` to return the correct `distance`
98 |
99 | ## 6.3.4 (2019-01-04)
100 | - Added `replaced_words` to track number of misspelled words
101 | - Added `ignore_token` to `word_segmentation()` to ignore words with regular expression
102 |
103 | ## 6.3.3 (2018-12-05)
104 | - Added `word_segmentation()` feature
105 |
106 | ## 6.3.2 (2018-10-23)
107 | - Added `encoding` option to `load_dictionary()`
108 |
109 | ## 6.3.1 (2018-08-30)
110 | - Create a package for `symspellpy`
111 |
112 | ## 6.3.0 (2018-08-13)
113 | - Ported [SymSpell](https://github.com/wolfgarbe/SymSpell) v6.3
114 |
--------------------------------------------------------------------------------
/INSTALL.rst:
--------------------------------------------------------------------------------
1 | **********
2 | Installing
3 | **********
4 |
5 | Installing an official release
6 | ==============================
7 |
8 | symspellpy and its dependencies are available as wheel packages for macOS,
9 | Windows and Linux distributions::
10 |
11 | python -m pip install -U symspellpy
12 |
13 | **NOTE**: symspellpy has only been tested on Windows and Linux systems and is
14 | assumed to work on macOS.
15 |
16 | Dictionary data
17 | ===============
18 |
19 | The dictionary files that are shipped with symspellpy can be accessed using
20 | `importlib.resources`::
21 |
22 |     import importlib.resources
23 |
24 |     dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt"
25 |     bigram_path = importlib.resources.files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt"
26 |
27 | Alternatively, you can download the dictionary files from the repository and
28 | add them to your project directory::
29 |
30 |     curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_dictionary_en_82_765.txt
31 |     curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_bigramdictionary_en_243_342.txt
32 |
33 | You could end up with a project directory layout like::
34 |
35 |     project_dir
36 |     +-frequency_bigramdictionary_en_243_342.txt
37 |     +-frequency_dictionary_en_82_765.txt
38 |     \-project.py
39 |
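40 | With that layout, `project.py` can load the local copies by relative path. A
41 | minimal sketch (`0, 1` and `0, 2` are the term and count column indices used
42 | throughout these docs)::
43 |
44 |     from symspellpy import SymSpell
45 |
46 |     sym_spell = SymSpell()
47 |     sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt", 0, 1)
48 |     sym_spell.load_bigram_dictionary("frequency_bigramdictionary_en_243_342.txt", 0, 2)
49 |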
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 mmb L (Python port https://github.com/mammothb/symspellpy)
4 | Copyright (c) 2021 Wolf Garbe (Original C# implementation https://github.com/wolfgarbe/SymSpell)
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | symspellpy
2 | [](https://badge.fury.io/py/symspellpy)
3 | [](https://github.com/mammothb/symspellpy/actions/workflows/tests.yml)
4 | [](https://symspellpy.readthedocs.io/en/latest/?badge=latest)
5 | [](https://codecov.io/gh/mammothb/symspellpy)
6 | ========
7 |
8 | symspellpy is a Python port of [SymSpell](https://github.com/wolfgarbe/SymSpell) v6.7.2, a Symmetric Delete spelling correction algorithm which provides much higher speed and lower memory consumption. Unit tests
9 | from the original project are implemented to ensure the accuracy of the port.
10 |
11 | Please note that the port has not been optimized for speed.
12 |
13 | Notable Changes
14 | ===============
15 | v6.7.2: Implemented fast distance comparer with [editdistpy](https://github.com/mammothb/editdistpy). Approximately 2x speed up under default settings; benchmarks can be found [here](https://github.com/mammothb/symspellpy/blob/master/tests/benchmarks.ipynb).
16 |
17 | Install
18 | =======
19 | For installation instructions, see the `INSTALL.rst` file or the [install](https://symspellpy.readthedocs.io/en/latest/users/installing.html) documentation.
20 |
21 | Usage
22 | =====
23 | Check out the [examples](https://symspellpy.readthedocs.io/en/latest/examples/index.html) provided for sample usage.
24 |
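25 | A minimal lookup, adapted from the bundled examples and using the shipped
26 | English dictionary, looks like:
27 |
28 | ```python
29 | import importlib.resources
30 |
31 | from symspellpy import SymSpell, Verbosity
32 |
33 | sym_spell = SymSpell()
34 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt"
35 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
36 |
37 | # Print the closest suggestion for a misspelled word
38 | for suggestion in sym_spell.lookup("memebers", Verbosity.CLOSEST, max_edit_distance=2):
39 |     print(suggestion)  # members, 1, 226656153
40 | ```
41 |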
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line.
5 | SPHINXOPTS =
6 | SPHINXBUILD = sphinx-build
7 | SOURCEDIR = .
8 | BUILDDIR = _build
9 |
10 | # Put it first so that "make" without argument is like "make help".
11 | help:
12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
13 |
14 | .PHONY: help Makefile
15 |
16 | # Catch-all target: route all unknown targets to Sphinx using the new
17 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
18 | %: Makefile
19 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--------------------------------------------------------------------------------
/docs/_templates/layout.html:
--------------------------------------------------------------------------------
1 | {% extends "!layout.html" %}
2 |
3 | {%- block rootrellink %}
4 |     <li><a href="{{ pathto('index') }}">Home</a>|</li>
5 |     <li><a href="{{ pathto('examples/index') }}">Examples</a>|</li>
6 |     <li><a href="{{ pathto('api/index') }}">API</a>|</li>
7 | {%- endblock %}
8 |
--------------------------------------------------------------------------------
/docs/api/abstract_distance_comparer.rst:
--------------------------------------------------------------------------------
1 | **************************
2 | abstract_distance_comparer
3 | **************************
4 |
5 | Distance comparer interface
6 | ===========================
7 |
8 | .. autoclass:: symspellpy.abstract_distance_comparer.AbstractDistanceComparer
9 | :members:
10 |
--------------------------------------------------------------------------------
/docs/api/editdistance.rst:
--------------------------------------------------------------------------------
1 | ************
2 | editdistance
3 | ************
4 |
5 | Enum class
6 | ==========
7 |
8 | .. autoclass:: symspellpy.editdistance.DistanceAlgorithm
9 | :members:
10 | :member-order: bysource
11 |
12 | EditDistance class
13 | ==================
14 |
15 | .. autoclass:: symspellpy.editdistance.EditDistance
16 | :members:
17 |
18 | Distance comparer classes
19 | =========================
20 |
21 | .. autoclass:: symspellpy.editdistance.DamerauOsa
22 | :members:
23 |
24 | .. autoclass:: symspellpy.editdistance.Levenshtein
25 | :members:
26 |
27 | .. autoclass:: symspellpy.editdistance.DamerauOsaFast
28 | :members:
29 |
30 | .. autoclass:: symspellpy.editdistance.LevenshteinFast
31 | :members:
32 |
--------------------------------------------------------------------------------
/docs/api/helpers.rst:
--------------------------------------------------------------------------------
1 | *******
2 | helpers
3 | *******
4 |
5 | Helpers for `editdistance`
6 | ==========================
7 |
8 | .. autofunction:: symspellpy.helpers.null_distance_results
9 |
10 | .. autofunction:: symspellpy.helpers.prefix_suffix_prep
11 |
12 | Helpers for `symspellpy`
13 | ========================
14 |
15 | .. autoclass:: symspellpy.helpers.DictIO
16 |
17 | .. autofunction:: symspellpy.helpers.case_transfer_matching
18 |
19 | .. autofunction:: symspellpy.helpers.case_transfer_similar
20 |
21 | .. autofunction:: symspellpy.helpers.increment_count
22 |
23 | .. autofunction:: symspellpy.helpers.is_acronym
24 |
25 | .. autofunction:: symspellpy.helpers.parse_words
26 |
27 | .. autofunction:: symspellpy.helpers.try_parse_int64
28 |
29 | Misc
30 | ====
31 |
32 | .. autofunction:: symspellpy.helpers.to_similarity
33 |
34 |
--------------------------------------------------------------------------------
/docs/api/index.rst:
--------------------------------------------------------------------------------
1 | ************
2 | API Overview
3 | ************
4 |
5 | Modules
6 | =======
7 |
8 | .. only:: html
9 |
10 | .. toctree::
11 | :maxdepth: 2
12 |
13 | helpers.rst
14 | abstract_distance_comparer.rst
15 | editdistance.rst
16 | symspellpy.rst
17 |
--------------------------------------------------------------------------------
/docs/api/symspellpy.rst:
--------------------------------------------------------------------------------
1 | **********
2 | symspellpy
3 | **********
4 |
5 | Enum class
6 | ==========
7 |
8 | .. autoclass:: symspellpy.verbosity.Verbosity
9 | :members:
10 | :member-order: bysource
11 |
12 | Data class
13 | ==========
14 |
15 | .. autoclass:: symspellpy.suggest_item.SuggestItem
16 | :members:
17 | :special-members: __eq__, __lt__, __str__
18 |
19 | .. autoclass:: symspellpy.composition.Composition
20 | :members:
21 | :exclude-members: corrected_string, distance_sum, log_prob_sum, segmented_string
22 |
23 | Utility class
24 | =============
25 |
26 | .. autoclass:: symspellpy.pickle_mixin.PickleMixin
27 | :members:
28 | :private-members:
29 |
30 | SymSpell
31 | ========
32 |
33 | .. autoclass:: symspellpy.symspellpy.SymSpell
34 | :members:
35 | :private-members:
36 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # Configuration file for the Sphinx documentation builder.
4 | #
5 | # This file does only contain a selection of the most common options. For a
6 | # full list see the documentation:
7 | # http://www.sphinx-doc.org/en/master/config
8 |
9 | # -- Path setup --------------------------------------------------------------
10 |
11 | # If extensions (or modules to document with autodoc) are in another directory,
12 | # add these directories to sys.path here. If the directory is relative to the
13 | # documentation root, use os.path.abspath to make it absolute, like shown here.
14 |
15 | import os.path
16 | import sys
17 |
18 | sys.path.insert(0, os.path.abspath(".."))
19 |
20 | from pathlib import Path
21 |
22 | import tomllib
23 |
24 | # -- Project information -----------------------------------------------------
25 |
26 | project = "symspellpy"
27 | copyright = "2025, mmb L, Wolf Garbe"
28 | author = "mmb L, Wolf Garbe"
29 |
30 | # The short X.Y version
31 | version = ""
32 | # The full version, including alpha/beta/rc tags
33 | with open(Path(__file__).parents[1] / "pyproject.toml", "rb") as infile:
34 | data = tomllib.load(infile)
35 | release = data["project"]["version"]
36 |
37 |
38 | # -- General configuration ---------------------------------------------------
39 |
40 | # If your documentation needs a minimal Sphinx version, state it here.
41 | #
42 | # needs_sphinx = '1.0'
43 |
44 | # Add any Sphinx extension module names here, as strings. They can be
45 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
46 | # ones.
47 | extensions = [
48 | "sphinx.ext.autodoc",
49 | "sphinx.ext.napoleon",
50 | "sphinx.ext.viewcode",
51 | "sphinx_autodoc_typehints",
52 | ]
53 | # numpydoc_class_members_toctree = False
54 | # numpydoc_show_inherited_class_members = False
55 | highlight_language = "none"
56 |
57 | # Add any paths that contain templates here, relative to this directory.
58 | templates_path = ["_templates"]
59 |
60 | # The suffix(es) of source filenames.
61 | # You can specify multiple suffix as a list of string:
62 | #
63 | # source_suffix = ['.rst', '.md']
64 | source_suffix = ".rst"
65 |
66 | # The master toctree document.
67 | master_doc = "index"
68 |
69 | # The language for content autogenerated by Sphinx. Refer to documentation
70 | # for a list of supported languages.
71 | #
72 | # This is also used if you do content translation via gettext catalogs.
73 | # Usually you set "language" from the command line for these cases.
74 | language = "en"
75 |
76 | # List of patterns, relative to source directory, that match files and
77 | # directories to ignore when looking for source files.
78 | # This pattern also affects html_static_path and html_extra_path.
79 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
80 |
81 | # The name of the Pygments (syntax highlighting) style to use.
82 | pygments_style = None
83 |
84 |
85 | # -- Options for HTML output -------------------------------------------------
86 |
87 | # The theme to use for HTML and HTML Help pages. See the documentation for
88 | # a list of builtin themes.
89 | #
90 | html_theme = "sphinxdoc"
91 |
92 | # Theme options are theme-specific and customize the look and feel of a theme
93 | # further. For a list of options available for each theme, see the
94 | # documentation.
95 | #
96 | # html_theme_options = {}
97 |
98 | # Add any paths that contain custom static files (such as style sheets) here,
99 | # relative to this directory. They are copied after the builtin static files,
100 | # so a file named "default.css" will overwrite the builtin "default.css".
101 | # html_static_path = ["_static"]
102 | html_static_path = []
103 |
104 | # Custom sidebar templates, must be a dictionary that maps document names
105 | # to template names.
106 | #
107 | # The default sidebars (for documents that don't match any pattern) are
108 | # defined by theme itself. Builtin themes are using these templates by
109 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
110 | # 'searchbox.html']``.
111 | #
112 | html_sidebars = {"**": ["globaltoc.html", "searchbox.html"]}
113 |
114 |
115 | # -- Options for HTMLHelp output ---------------------------------------------
116 |
117 | # Output file base name for HTML help builder.
118 | htmlhelp_basename = "symspellpydoc"
119 |
--------------------------------------------------------------------------------
/docs/examples/custom_distance_comparer.rst:
--------------------------------------------------------------------------------
1 | ************************
2 | Custom distance comparer
3 | ************************
4 |
5 | Basic usage
6 | ===========
7 |
8 | Create a comparer class which satisfies the interface specified by
9 | :class:`~symspellpy.abstract_distance_comparer.AbstractDistanceComparer`:
10 |
11 | .. code-block:: python
12 |
13 | import importlib.resources
14 | from itertools import islice
15 |
16 | from symspellpy import SymSpell
17 | from symspellpy.abstract_distance_comparer import AbstractDistanceComparer
18 | from symspellpy.editdistance import DistanceAlgorithm, EditDistance
19 |
20 |     class CustomComparer(AbstractDistanceComparer):
21 |         def distance(self, string_1, string_2, max_distance):
22 |             # Any distance measure works here; this toy example uses the
23 |             # difference in length between string_1 and string_2
24 |             distance = abs(len(string_1 or "") - len(string_2 or ""))
25 |             return -1 if distance > max_distance else distance
26 |
27 |     custom_comparer = EditDistance(DistanceAlgorithm.USER_PROVIDED, CustomComparer())
28 |     sym_spell = SymSpell(distance_comparer=custom_comparer)
29 |     dictionary_path = importlib.resources.files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt"
30 |     sym_spell.load_bigram_dictionary(dictionary_path, 0, 2)
31 |
32 |     # Print out first 5 elements to demonstrate that dictionary is
33 |     # successfully loaded
34 |     print(list(islice(sym_spell.bigrams.items(), 5)))
35 |
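36 | The custom comparer is then used transparently by the lookup methods. A
37 | minimal sketch, assuming a unigram dictionary has also been loaded via
38 | :meth:`~symspellpy.symspellpy.SymSpell.load_dictionary`:
39 |
40 | .. code-block:: python
41 |
42 |     from symspellpy import Verbosity
43 |
44 |     suggestions = sym_spell.lookup("memebers", Verbosity.CLOSEST, max_edit_distance=2)
45 |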
--------------------------------------------------------------------------------
/docs/examples/dictionary.rst:
--------------------------------------------------------------------------------
1 | **********
2 | Dictionary
3 | **********
4 |
5 | Load frequency dictionary
6 | =========================
7 |
8 | `load_dictionary`
9 | -----------------
10 |
11 | Given a dictionary file like::
12 |
13 |     <term> <count>
14 |     <term> <count>
15 |     ...
16 |     <term> <count>
17 |
18 | We can use :meth:`~symspellpy.symspellpy.SymSpell.load_dictionary`:
19 |
20 | .. code-block:: python
21 | :emphasize-lines: 8
22 |
23 | import importlib.resources
24 | from itertools import islice
25 |
26 | from symspellpy import SymSpell
27 |
28 | sym_spell = SymSpell()
29 |     dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt"
30 | sym_spell.load_dictionary(dictionary_path, 0, 1)
31 |
32 | # Print out first 5 elements to demonstrate that dictionary is
33 | # successfully loaded
34 | print(list(islice(sym_spell.words.items(), 5)))
35 |
36 | Output::
37 |
38 | [('the', 23135851162), ('of', 13151942776), ('and', 12997637966), ('to', 12136980858), ('a', 9081174698)]
39 |
40 | `load_bigram_dictionary`
41 | ------------------------
42 |
43 | Given a bigram dictionary file like::
44 |
45 |     <term_part_1> <term_part_2> <count>
46 |     <term_part_1> <term_part_2> <count>
47 |     ...
48 |     <term_part_1> <term_part_2> <count>
49 |
50 | We can use :meth:`~symspellpy.symspellpy.SymSpell.load_bigram_dictionary`:
51 |
52 | .. code-block:: python
53 | :emphasize-lines: 8
54 |
55 | import importlib.resources
56 | from itertools import islice
57 |
58 | from symspellpy import SymSpell
59 |
60 | sym_spell = SymSpell()
61 | dictionary_path = importlib.resources.files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt"
62 | sym_spell.load_bigram_dictionary(dictionary_path, 0, 2)
63 |
64 | # Print out first 5 elements to demonstrate that dictionary is
65 | # successfully loaded
66 | print(list(islice(sym_spell.bigrams.items(), 5)))
67 |
68 | Output::
69 |
70 | [('abcs of', 10956800), ('aaron and', 10721728), ('abbott and', 7861376), ('abbreviations and', 13518272), ('aberdeen and', 7347776)]
71 |
72 | Load frequency dictionary with custom separator
73 | ===============================================
74 |
75 | `load_dictionary`
76 | -----------------
77 |
78 | It is also possible to specify a custom `separator` so that dictionaries can
79 | contain space separated terms. For example, given a dictionary file like::
80 |
81 | the$23135851162
82 | abcs of$10956800
83 | of$13151942776
84 | aaron and$10721728
85 | abbott and$7861376
86 | abbreviations and$13518272
87 | aberdeen and$7347776
88 |
89 | We can specify "$" as the custom `separator` in
90 | :meth:`~symspellpy.symspellpy.SymSpell.load_dictionary` like:
91 |
92 | .. code-block:: python
93 | :emphasize-lines: 7
94 |
95 | from itertools import islice
96 |
97 | from symspellpy import SymSpell
98 |
99 | sym_spell = SymSpell()
100 |     dictionary_path = <path/to/dictionary>
101 | sym_spell.load_dictionary(dictionary_path, 0, 1, separator="$")
102 |
103 | # Print out first 5 elements to demonstrate that dictionary is
104 | # successfully loaded
105 | print(list(islice(sym_spell.words.items(), 5)))
106 |
107 | Output::
108 |
109 | [('the', 23135851162), ('abcs of', 10956800), ('of', 13151942776), ('aaron and', 10721728), ('abbott and', 7861376)]
110 |
111 | Note that space separated terms such as "abcs of", "aaron and", and
112 | "abbott and" can now be found in `words` instead of `bigrams`.
113 |
114 | `load_bigram_dictionary`
115 | ------------------------
116 |
117 | We can also specify "$" as the custom `separator` in
118 | :meth:`~symspellpy.symspellpy.SymSpell.load_bigram_dictionary` like
119 | (note that we changed `count_index` from 2 to 1):
120 |
121 | .. code-block:: python
122 | :emphasize-lines: 7
123 |
124 | from itertools import islice
125 |
126 | from symspellpy import SymSpell
127 |
128 | sym_spell = SymSpell()
129 |     dictionary_path = <path/to/dictionary>
130 | sym_spell.load_bigram_dictionary(dictionary_path, 0, 1, separator="$")
131 |
132 | # Print out first 5 elements to demonstrate that dictionary is
133 | # successfully loaded
134 | print(list(islice(sym_spell.bigrams.items(), 5)))
135 |
136 | Output::
137 |
138 | [('the', 23135851162), ('abcs of', 10956800), ('of', 13151942776), ('aaron and', 10721728), ('abbott and', 7861376)]
139 |
140 | Note that `bigrams` now **erroneously** contains monograms. Precautions
141 | should be taken when creating a bigram dictionary with a custom separator.
142 |
143 | Create dictionary from plain text file
144 | ======================================
145 |
146 | Given a plain text file like::
147 |
148 | abc abc-def abc_def abc'def abc qwe qwe1 1qwe q1we 1234 1234
149 |
150 | We can create a dictionary from the file using
151 | :meth:`~symspellpy.symspellpy.SymSpell.create_dictionary` like:
152 |
153 | .. code-block:: python
154 | :emphasize-lines: 5
155 |
156 | from symspellpy import SymSpell
157 |
158 | sym_spell = SymSpell()
159 |     corpus_path = <path/to/corpus>
160 | sym_spell.create_dictionary(corpus_path)
161 |
162 | print(sym_spell.words)
163 |
164 | Output::
165 |
166 | {'abc': 4, 'def': 2, "abc'def": 1, 'qwe': 1, 'qwe1': 1, '1qwe': 1, 'q1we': 1, '1234': 2}
167 |
168 | Note that :meth:`~symspellpy.symspellpy.SymSpell.create_dictionary` did not
169 | split words at apostrophes and did not check if the words contained numbers.
170 |
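171 | If such entries are unwanted, they can be removed after the fact; a minimal
172 | sketch using :meth:`~symspellpy.symspellpy.SymSpell.delete_dictionary_entry`:
173 |
174 | .. code-block:: python
175 |
176 |     # Drop every dictionary entry that contains a digit
177 |     for word in [word for word in sym_spell.words if any(ch.isdigit() for ch in word)]:
178 |         sym_spell.delete_dictionary_entry(word)
179 |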
--------------------------------------------------------------------------------
/docs/examples/index.rst:
--------------------------------------------------------------------------------
1 | ========
2 | Examples
3 | ========
4 |
5 | .. only:: html
6 |
7 | .. toctree::
8 | :maxdepth: 2
9 |
10 | dictionary.rst
11 | custom_distance_comparer.rst
12 | lookup.rst
13 | lookup_compound.rst
14 | word_segmentation.rst
15 |
--------------------------------------------------------------------------------
/docs/examples/lookup.rst:
--------------------------------------------------------------------------------
1 | ******
2 | lookup
3 | ******
4 |
5 | Basic usage
6 | ===========
7 |
8 | .. code-block:: python
9 | :emphasize-lines: 15
10 |
11 | import importlib.resources
12 |
13 | from symspellpy import SymSpell, Verbosity
14 |
15 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
16 |     dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt"
17 | # term_index is the column of the term and count_index is the
18 | # column of the term frequency
19 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
20 |
21 | # lookup suggestions for single-word input strings
22 | input_term = "memebers" # misspelling of "members"
23 | # max edit distance per lookup
24 | # (max_edit_distance_lookup <= max_dictionary_edit_distance)
25 | suggestions = sym_spell.lookup(input_term, Verbosity.CLOSEST, max_edit_distance=2)
26 | # display suggestion term, edit distance, and term frequency
27 | for suggestion in suggestions:
28 | print(suggestion)
29 |
30 | Output::
31 |
32 | members, 1, 226656153
33 |
34 | Return original word if no correction within edit distance is found
35 | ===================================================================
36 |
37 | .. code-block:: python
38 | :emphasize-lines: 15,16,17
39 |
40 | import importlib.resources
41 |
42 | from symspellpy import SymSpell, Verbosity
43 |
44 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
45 |     dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt"
46 | # term_index is the column of the term and count_index is the
47 | # column of the term frequency
48 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
49 |
50 | # lookup suggestions for single-word input strings
51 | input_term = "apastraphee" # misspelling of "apostrophe"
52 | # max edit distance per lookup
53 | # (max_edit_distance_lookup <= max_dictionary_edit_distance)
54 | suggestions = sym_spell.lookup(
55 | input_term, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True
56 | )
57 | # display suggestion term, edit distance, and term frequency
58 | for suggestion in suggestions:
59 | print(suggestion)
60 |
61 | Output::
62 |
63 | apastraphee, 3, 0
64 |
65 | Note that `suggestions` would have been empty if `include_unknown` was
66 | `False`.
67 |
68 | Avoid correcting phrases matching regex
69 | =======================================
70 |
71 | .. code-block:: python
72 | :emphasize-lines: 14,15,16
73 |
74 | import importlib.resources
75 |
76 | from symspellpy import SymSpell, Verbosity
77 |
78 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
79 |     dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt"
80 |     # term_index: column of the term; count_index: column of the term frequency
81 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
82 |
83 | # lookup suggestions for single-word input strings
84 | input_term = "members1"
85 | # max edit distance per lookup
86 | # (max_edit_distance_lookup <= max_dictionary_edit_distance)
87 | suggestions = sym_spell.lookup(
88 | input_term, Verbosity.CLOSEST, max_edit_distance=2, ignore_token=r"\w+\d"
89 | )
90 | # display suggestion term, edit distance, and term frequency
91 | for suggestion in suggestions:
92 | print(suggestion)
93 |
94 | Output::
95 |
96 | members1, 0, 1
97 |
98 | Note that `members, 1, 226656153` would be returned if `ignore_token` wasn't
99 | specified.
100 |
101 | Keep original casing
102 | ====================
103 |
104 | .. code-block:: python
105 | :emphasize-lines: 15,16,17
106 |
107 | import importlib.resources
108 |
109 | from symspellpy import SymSpell, Verbosity
110 |
111 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
112 |     dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt"
113 | # term_index is the column of the term and count_index is the
114 | # column of the term frequency
115 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
116 |
117 | # lookup suggestions for single-word input strings
118 | input_term = "mEmEbers"
119 | # max edit distance per lookup
120 | # (max_edit_distance_lookup <= max_dictionary_edit_distance)
121 | suggestions = sym_spell.lookup(
122 | input_term, Verbosity.CLOSEST, max_edit_distance=2, transfer_casing=True
123 | )
124 | # display suggestion term, edit distance, and term frequency
125 | for suggestion in suggestions:
126 | print(suggestion)
127 |
128 |
129 | Output::
130 |
131 | mEmbers, 1, 226656153
132 |
133 | Note that the uppercase of the second "E" was not passed on to "b" in the
134 | corrected word.
135 |
--------------------------------------------------------------------------------
/docs/examples/lookup_compound.rst:
--------------------------------------------------------------------------------
1 | ***************
2 | lookup_compound
3 | ***************
4 |
5 | Basic usage
6 | ===========
7 |
8 | .. code-block:: python
9 | :emphasize-lines: 20
10 |
11 | import importlib.resources
12 |
13 | from symspellpy import SymSpell
14 |
15 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
16 |     dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt"
17 |     bigram_path = importlib.resources.files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt"
18 | # term_index is the column of the term and count_index is the
19 | # column of the term frequency
20 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
21 | sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
22 |
23 | # lookup suggestions for multi-word input strings (supports compound
24 | # splitting & merging)
25 | input_term = (
26 | "whereis th elove hehad dated forImuch of thepast who "
27 | "couqdn'tread in sixtgrade and ins pired him"
28 | )
29 | # max edit distance per lookup (per single word, not per whole input string)
30 | suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)
31 | # display suggestion term, edit distance, and term frequency
32 | for suggestion in suggestions:
33 | print(suggestion)
34 |
35 | Output::
36 |
37 | where is the love he had dated for much of the past who couldn't read in six grade and inspired him, 9, 0
38 |
39 | Keep original casing
40 | ====================
41 |
42 | .. code-block:: python
43 | :emphasize-lines: 20,21,22
44 |
45 | import importlib.resources
46 |
47 | from symspellpy import SymSpell
48 |
49 | sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
50 |     dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt"
51 |     bigram_path = importlib.resources.files("symspellpy") / "frequency_bigramdictionary_en_243_342.txt"
52 | # term_index is the column of the term and count_index is the
53 | # column of the term frequency
54 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
55 | sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
56 |
57 | # lookup suggestions for multi-word input strings (supports compound
58 | # splitting & merging)
59 | input_term = (
60 | "whereis th elove heHAd dated forImuch of thEPast who "
61 | "couqdn'tread in sixtgrade and ins pired him"
62 | )
63 | # max edit distance per lookup (per single word, not per whole input string)
64 | suggestions = sym_spell.lookup_compound(
65 | input_term, max_edit_distance=2, transfer_casing=True
66 | )
67 | # display suggestion term, edit distance, and term frequency
68 | for suggestion in suggestions:
69 | print(suggestion)
70 |
71 | Output::
72 |
73 | where is the love he HAd dated for much of thE Past who couldn't read in six grade and inspired him, 9, 0
74 |
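75 | The replacements made during the last call are recorded in the
76 | `replaced_words` mapping (see the 6.3.4 and 6.7.5 changelog entries). A
77 | minimal sketch, assuming each misspelled token maps to its suggestion:
78 |
79 | .. code-block:: python
80 |
81 |     for misspelled, suggestion in sym_spell.replaced_words.items():
82 |         print(f"{misspelled} -> {suggestion}")
83 |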
--------------------------------------------------------------------------------
/docs/examples/word_segmentation.rst:
--------------------------------------------------------------------------------
1 | *****************
2 | word_segmentation
3 | *****************
4 |
5 | Basic usage
6 | ===========
7 |
8 | .. code-block:: python
9 | :emphasize-lines: 14
10 |
11 | import importlib.resources
12 |
13 | from symspellpy.symspellpy import SymSpell
14 |
15 |     # Set max_dictionary_edit_distance to 0 to avoid spelling correction
16 |     sym_spell = SymSpell(max_dictionary_edit_distance=0, prefix_length=7)
17 |     dictionary_path = importlib.resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt"
18 | # term_index is the column of the term and count_index is the
19 | # column of the term frequency
20 | sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
21 |
22 | # a sentence without any spaces
23 | input_term = "thequickbrownfoxjumpsoverthelazydog"
24 | result = sym_spell.word_segmentation(input_term)
25 | print(f"{result.corrected_string}, {result.distance_sum}, {result.log_prob_sum}")
26 |
27 | Output::
28 |
29 | the quick brown fox jumps over the lazy dog, 8, -34.491167981910635
30 |
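31 | The returned :class:`~symspellpy.composition.Composition` also exposes
32 | `segmented_string`, the segmentation before any spelling correction; with
33 | `max_dictionary_edit_distance=0` as above it matches `corrected_string`::
34 |
35 |     print(result.segmented_string)
36 |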
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | .. symspellpy documentation master file, created by
2 | sphinx-quickstart on Tue Feb 19 09:03:54 2019.
3 | You can adapt this file completely to your liking, but it should at least
4 | contain the root `toctree` directive.
5 |
6 | :orphan:
7 |
8 | .. title:: symspellpy: a SymSpell Python port
9 |
10 | .. toctree::
11 | :hidden:
12 |
13 | users/installing
14 | examples/index
15 | api/index
16 |
17 | **********
18 | symspellpy
19 | **********
20 |
21 | symspellpy is a Python port of SymSpell_ v6.7.2, a Symmetric Delete
22 | spelling correction algorithm which provides much higher speed and lower
23 | memory consumption.
24 |
25 | .. _SymSpell: https://github.com/wolfgarbe/SymSpell
26 |
27 | Unit tests from the original project are implemented to ensure the accuracy
28 | of the port. Please note that the port has tried to replicate the code
29 | structure of the original project and has not been optimized for speed.
30 |
31 | Installation
32 | ============
33 |
34 | Visit the :doc:`symspellpy installation instructions </users/installing>`.
35 |
36 | Usage examples
37 | ==============
38 |
39 | Check out :doc:`examples </examples/index>` to learn how to use symspellpy.
40 |
41 | Documentation
42 | =============
43 |
44 | Check out the :doc:`documentation </api/index>`.
45 |
46 | Indices and tables
47 | ------------------
48 |
49 | * :ref:`genindex`
50 | * :ref:`modindex`
51 | * :ref:`search`
52 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | editdistpy>=0.1.3
2 | numpydoc==1.8.0
3 | sphinx==8.2.3
4 | sphinx-autodoc-typehints==3.1.0
5 |
--------------------------------------------------------------------------------
/docs/users/installing.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../../INSTALL.rst
2 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0.0", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "symspellpy"
7 | version = "6.9.0"
8 | dependencies = [
9 | "editdistpy>=0.1.3",
10 | ]
11 | requires-python = ">=3.9"
12 | authors = [
13 | {name = "mmb L"},
14 | ]
15 | description = "Python SymSpell"
16 | readme = "README.md"
17 | license = {file = "LICENSE"}
18 | keywords = ["spellchecker", "symspell", "word segmentation"]
19 | classifiers = [
20 | "Development Status :: 4 - Beta",
21 | "Intended Audience :: Developers",
22 | "Intended Audience :: Education",
23 | "Natural Language :: English",
24 | "License :: OSI Approved :: MIT License",
25 | "Programming Language :: Python",
26 | "Programming Language :: Python :: 3",
27 | "Programming Language :: Python :: 3.9",
28 | "Programming Language :: Python :: 3.10",
29 | "Programming Language :: Python :: 3.11",
30 | "Programming Language :: Python :: 3.12",
31 | "Programming Language :: Python :: 3.13",
32 | ]
33 |
34 | [project.urls]
35 | Repository = "https://github.com/mammothb/symspellpy"
36 | Documentation = "https://symspellpy.readthedocs.io/en/latest"
37 | Changelog = "https://github.com/mammothb/symspellpy/blob/master/CHANGELOG.md"
38 |
39 | [tool.basedpyright]
40 | ignore = ["tests"]
41 | pythonVersion = "3.9"
42 |
43 | reportUnusedCallResult = "none"
44 |
45 | [tool.ruff]
46 | line-length = 88
47 | indent-width = 4
48 |
49 | [tool.ruff.format]
50 | docstring-code-format = false
51 | indent-style = "space"
52 | line-ending = "auto"
53 | quote-style = "double"
54 | skip-magic-trailing-comma = false
55 |
56 | [tool.setuptools.packages.find]
57 | where = ["."]
58 | include = ["symspellpy"]
59 |
60 | [tool.setuptools.package-data]
61 | symspellpy = ["frequency_*.txt"]
62 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | editdistpy>=0.1.3
2 |
3 | # For testing
4 | importlib-resources>=6.3.2
5 | pytest==8.3.4
6 | pytest-cov==6.0.0
7 |
--------------------------------------------------------------------------------
/symspellpy/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2025 mmb L (Python port)
4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation)
5 | #
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
7 | # of this software and associated documentation files (the "Software"), to deal
8 | # in the Software without restriction, including without limitation the rights
9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 |
16 | """symspellpy
17 |
18 | .. moduleauthor:: mmb L
19 | .. moduleauthor:: Wolf Garbe
20 | """
21 |
22 | from . import editdistance, helpers, logging
23 | from .symspellpy import SymSpell
24 | from .verbosity import Verbosity
25 |
--------------------------------------------------------------------------------
/symspellpy/abstract_distance_comparer.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Optional
3 |
4 |
5 | class AbstractDistanceComparer(ABC):
6 | """An interface to compute relative distance between two strings."""
7 |
8 | @abstractmethod
9 | def distance(
10 | self, string_1: Optional[str], string_2: Optional[str], max_distance: int
11 | ) -> int:
12 | """Returns a measure of the distance between two strings.
13 |
14 | Args:
15 | string_1: One of the strings to compare.
16 | string_2: The other string to compare.
17 | max_distance: The maximum distance that is of interest.
18 |
19 | Returns:
20 | -1 if the distance is greater than the max_distance, 0 if the strings
21 | are equivalent, otherwise a positive number whose magnitude
22 | increases as difference between the strings increases.
23 | """
24 |
--------------------------------------------------------------------------------
/symspellpy/composition.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2025 mmb L (Python port)
4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation)
5 | #
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
7 | # of this software and associated documentation files (the "Software"), to deal
8 | # in the Software without restriction, including without limitation the rights
9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 |
16 | """
17 | .. module:: composition
18 |    :synopsis: Data class for :meth:`symspellpy.symspellpy.SymSpell.word_segmentation`.
19 | """
20 |
21 | from typing import NamedTuple
22 |
23 |
24 | class Composition(NamedTuple):
25 | """Used by :meth:`word_segmentation`.
26 |
27 | Attributes:
28 | segmented_string: The word segmented string.
29 | corrected_string: The spelling corrected string.
30 |         distance_sum: The sum of edit distances between the input string and
31 |             the corrected string.
32 | log_prob_sum: The sum of word occurrence probabilities in log
33 | scale (a measure of how common and probable the corrected
34 | segmentation is).
35 | """
36 |
37 | segmented_string: str = ""
38 | corrected_string: str = ""
39 | distance_sum: int = 0
40 | log_prob_sum: float = 0
41 |
42 | @classmethod
43 | def create(
44 | cls,
45 | composition: "Composition",
46 | segmented_part: str,
47 | corrected_part: str,
48 | distance: int,
49 | log_prob: float,
50 | ) -> "Composition":
51 | """Creates a Composition by appending to an existing Composition."""
52 | return cls(
53 | composition.segmented_string + segmented_part,
54 | composition.corrected_string + corrected_part,
55 | composition.distance_sum + distance,
56 | composition.log_prob_sum + log_prob,
57 | )
58 |
--------------------------------------------------------------------------------
/symspellpy/editdistance.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2025 mmb L (Python port)
4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation)
5 | #
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
7 | # of this software and associated documentation files (the "Software"), to deal
8 | # in the Software without restriction, including without limitation the rights
9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 |
16 | """
17 | .. module:: editdistance
18 | :synopsis: Module for edit distance algorithms.
19 | """
20 |
21 | import warnings
22 | from enum import Enum
23 | from typing import Optional
24 |
25 | from editdistpy import damerau_osa, levenshtein
26 |
27 | from symspellpy import helpers
28 | from symspellpy.abstract_distance_comparer import AbstractDistanceComparer
29 |
30 |
31 | class DistanceAlgorithm(Enum):
32 | """Supported edit distance algorithms."""
33 |
34 | LEVENSHTEIN = 0 #: Levenshtein algorithm.
35 |     DAMERAU_OSA = 1  #: Damerau optimal string alignment algorithm.
36 | LEVENSHTEIN_FAST = 2 #: Fast Levenshtein algorithm.
37 |     DAMERAU_OSA_FAST = 3  #: Fast Damerau optimal string alignment algorithm.
38 |     USER_PROVIDED = 4  #: User-provided custom edit distance algorithm.
39 |
40 |
41 | class EditDistance:
42 | """Edit distance algorithms.
43 |
44 | Args:
45 | algorithm: The distance algorithm to use.
46 |
47 | Attributes:
48 | _algorithm (:class:`DistanceAlgorithm`): The edit distance algorithm to
49 | use.
50 | _distance_comparer (:class:`AbstractDistanceComparer`): An object to
51 | compute the relative distance between two strings. The concrete
52 | object will be chosen based on the value of :attr:`_algorithm`.
53 |
54 | Raises:
55 | ValueError: If `algorithm` specifies an invalid distance algorithm.
56 | """
57 |
58 | def __init__(
59 | self,
60 | algorithm: DistanceAlgorithm,
61 | comparer: Optional[AbstractDistanceComparer] = None,
62 | ) -> None:
63 | if algorithm != DistanceAlgorithm.USER_PROVIDED and comparer is not None:
64 | warnings.warn(
65 | f"A comparer is passed in but algorithm is not {DistanceAlgorithm.USER_PROVIDED.value}. A built-in comparer will be used."
66 | )
67 |
68 | self._distance_comparer: AbstractDistanceComparer
69 | self._algorithm = algorithm
70 | if algorithm == DistanceAlgorithm.LEVENSHTEIN:
71 | self._distance_comparer = Levenshtein()
72 | elif algorithm == DistanceAlgorithm.DAMERAU_OSA:
73 | self._distance_comparer = DamerauOsa()
74 | elif algorithm == DistanceAlgorithm.LEVENSHTEIN_FAST:
75 | self._distance_comparer = LevenshteinFast()
76 | elif algorithm == DistanceAlgorithm.DAMERAU_OSA_FAST:
77 | self._distance_comparer = DamerauOsaFast()
78 | elif algorithm == DistanceAlgorithm.USER_PROVIDED:
79 | if not isinstance(comparer, AbstractDistanceComparer):
80 | raise ValueError(
81 | f"{algorithm.value} selected but no comparer passed in."
82 | )
83 | self._distance_comparer = comparer
84 | else:
85 | raise ValueError("unknown distance algorithm")
86 |
87 | def compare(self, string_1: str, string_2: str, max_distance: int) -> int:
88 | """Compares a string to the base string to determine the edit distance,
89 | using the previously selected algorithm.
90 |
91 | Args:
92 | string_1: Base string.
93 | string_2: The string to compare.
94 | max_distance: The maximum distance allowed.
95 |
96 | Returns:
97 | The edit distance (or -1 if `max_distance` exceeded).
98 | """
99 | return self._distance_comparer.distance(string_1, string_2, max_distance)
100 |
101 |
102 | class Levenshtein(AbstractDistanceComparer):
103 | """Provides Levenshtein algorithm for computing edit distance metric between
104 | two strings.
105 |
106 | Attributes:
107 |         _base_char_1_costs (list[int]): Reusable cost list, grown as needed.
108 | """
109 |
110 | def __init__(self):
111 | self._base_char_1_costs: list[int] = []
112 |
113 | def distance(
114 | self, string_1: Optional[str], string_2: Optional[str], max_distance: int
115 | ) -> int:
116 | """Computes the Levenshtein edit distance between two strings.
117 |
118 | Args:
119 | string_1: One of the strings to compare.
120 | string_2: The other string to compare.
121 | max_distance: The maximum distance that is of interest.
122 |
123 | Returns:
124 | -1 if the distance is greater than the max_distance, 0 if the strings
125 | are equivalent, otherwise a positive number whose magnitude
126 | increases as difference between the strings increases.
127 | """
128 | if string_1 is None or string_2 is None:
129 | return helpers.null_distance_results(string_1, string_2, max_distance)
130 | if max_distance <= 0:
131 | return 0 if string_1 == string_2 else -1
132 | max_distance = int(min(2**31 - 1, max_distance))
133 | # if strings of different lengths, ensure shorter string is in string_1.
134 | # This can result in a little faster speed by spending more time spinning
135 | # just the inner loop during the main processing.
136 | if len(string_1) > len(string_2):
137 | string_2, string_1 = string_1, string_2
138 | if len(string_2) - len(string_1) > max_distance:
139 | return -1
140 | # identify common suffix and/or prefix that can be ignored
141 | len_1, len_2, start = helpers.prefix_suffix_prep(string_1, string_2)
142 | if len_1 == 0:
143 | return len_2 if len_2 <= max_distance else -1
144 |
145 | if len_2 > len(self._base_char_1_costs):
146 | self._base_char_1_costs = [0 for _ in range(len_2)]
147 | if max_distance < len_2:
148 | return self._distance_max(
149 | string_1,
150 | string_2,
151 | len_1,
152 | len_2,
153 | start,
154 | max_distance,
155 | self._base_char_1_costs,
156 | )
157 | return self._distance(
158 | string_1, string_2, len_1, len_2, start, self._base_char_1_costs
159 | )
160 |
161 | @staticmethod
162 | def _distance(
163 | string_1: str,
164 | string_2: str,
165 | len_1: int,
166 | len_2: int,
167 | start: int,
168 | char_1_costs: list[int],
169 | ) -> int:
170 | """Internal implementation of the core Levenshtein algorithm.
171 |
172 | **From**: https://github.com/softwx/SoftWx.Match
173 | """
174 | char_1_costs = [j + 1 for j in range(len_2)]
175 | current_cost = 0
176 | for i in range(len_1):
177 | left_char_cost = above_char_cost = i
178 | char_1 = string_1[start + i]
179 | for j in range(len_2):
180 | # cost of diagonal (substitution)
181 | current_cost = left_char_cost
182 | left_char_cost = char_1_costs[j]
183 | if string_2[start + j] != char_1:
184 | # substitution if neither of the two conditions below
185 | if above_char_cost < current_cost:
186 | current_cost = above_char_cost
187 | if left_char_cost < current_cost:
188 | current_cost = left_char_cost
189 | current_cost += 1
190 | char_1_costs[j] = above_char_cost = current_cost
191 | return current_cost
192 |
193 | @staticmethod
194 | def _distance_max(
195 | string_1: str,
196 | string_2: str,
197 | len_1: int,
198 | len_2: int,
199 | start: int,
200 | max_distance: int,
201 | char_1_costs: list[int],
202 | ) -> int:
203 | """Internal implementation of the core Levenshtein algorithm that accepts
204 | a max_distance.
205 |
206 | **From**: https://github.com/softwx/SoftWx.Match
207 | """
208 | char_1_costs = [
209 | j + 1 if j < max_distance else max_distance + 1 for j in range(len_2)
210 | ]
211 | len_diff = len_2 - len_1
212 | j_start_offset = max_distance - len_diff
213 | j_start = 0
214 | j_end = max_distance
215 | current_cost = 0
216 | for i in range(len_1):
217 | char_1 = string_1[start + i]
218 | prev_char_1_cost = above_char_cost = i
219 | # no need to look beyond window of lower right diagonal -
220 |         # max_distance cells (lower right diag is i - len_diff) and the upper
221 | # left diagonal + max_distance cells (upper left is i)
222 | j_start += 1 if i > j_start_offset else 0
223 | j_end += 1 if j_end < len_2 else 0
224 | for j in range(j_start, j_end):
225 | # cost of diagonal (substitution)
226 | current_cost = prev_char_1_cost
227 | prev_char_1_cost = char_1_costs[j]
228 | if string_2[start + j] != char_1:
229 | # substitution if neither of the two conditions below
230 | if above_char_cost < current_cost:
231 | current_cost = above_char_cost
232 | if prev_char_1_cost < current_cost:
233 | current_cost = prev_char_1_cost
234 | current_cost += 1
235 | char_1_costs[j] = above_char_cost = current_cost
236 | if char_1_costs[i + len_diff] > max_distance:
237 | return -1
238 | return current_cost if current_cost <= max_distance else -1
239 |
240 |
241 | class DamerauOsa(AbstractDistanceComparer):
242 | """Provides optimized methods for computing Damerau-Levenshtein Optimal
243 | String Alignment (OSA) comparisons between two strings.
244 |
245 | Attributes:
246 |         _base_char_1_costs (list[int]): Reusable cost list, grown as needed.
247 |         _base_prev_char_1_costs (list[int]): Reusable previous-row cost list used for transposition tracking, grown as needed.
248 | """
249 |
250 | def __init__(self) -> None:
251 | self._base_char_1_costs: list[int] = []
252 | self._base_prev_char_1_costs: list[int] = []
253 |
254 | def distance(
255 | self, string_1: Optional[str], string_2: Optional[str], max_distance: int
256 | ) -> int:
257 | """Computes the Damerau-Levenshtein optimal string alignment edit
258 | distance between two strings.
259 |
260 | Args:
261 | string_1: One of the strings to compare.
262 | string_2: The other string to compare.
263 | max_distance: The maximum distance that is of interest.
264 |
265 | Returns:
266 | -1 if the distance is greater than the max_distance, 0 if the strings
267 | are equivalent, otherwise a positive number whose magnitude
268 | increases as difference between the strings increases.
269 | """
270 | if string_1 is None or string_2 is None:
271 | return helpers.null_distance_results(string_1, string_2, max_distance)
272 | if max_distance <= 0:
273 | return 0 if string_1 == string_2 else -1
274 | max_distance = int(min(2**31 - 1, max_distance))
275 | # if strings of different lengths, ensure shorter string is in string_1.
276 | # This can result in a little faster speed by spending more time spinning
277 | # just the inner loop during the main processing.
278 | if len(string_1) > len(string_2):
279 | string_2, string_1 = string_1, string_2
280 | if len(string_2) - len(string_1) > max_distance:
281 | return -1
282 | # identify common suffix and/or prefix that can be ignored
283 | len_1, len_2, start = helpers.prefix_suffix_prep(string_1, string_2)
284 | if len_1 == 0:
285 | return len_2 if len_2 <= max_distance else -1
286 |
287 | if len_2 > len(self._base_char_1_costs):
288 | self._base_char_1_costs = [0 for _ in range(len_2)]
289 | self._base_prev_char_1_costs = [0 for _ in range(len_2)]
290 | if max_distance < len_2:
291 | return self._distance_max(
292 | string_1,
293 | string_2,
294 | len_1,
295 | len_2,
296 | start,
297 | max_distance,
298 | self._base_char_1_costs,
299 | self._base_prev_char_1_costs,
300 | )
301 | return self._distance(
302 | string_1,
303 | string_2,
304 | len_1,
305 | len_2,
306 | start,
307 | self._base_char_1_costs,
308 | self._base_prev_char_1_costs,
309 | )
310 |
311 | @staticmethod
312 | def _distance(
313 | string_1: str,
314 | string_2: str,
315 | len_1: int,
316 | len_2: int,
317 | start: int,
318 | char_1_costs: list[int],
319 | prev_char_1_costs: list[int],
320 | ) -> int:
321 | """Internal implementation of the core Damerau-Levenshtein, optimal
322 | string alignment algorithm.
323 |
324 | **From**: https://github.com/softwx/SoftWx.Match
325 | """
326 | char_1_costs = [j + 1 for j in range(len_2)]
327 | char_1 = " "
328 | current_cost = 0
329 | for i in range(len_1):
330 | prev_char_1 = char_1
331 | char_1 = string_1[start + i]
332 | char_2 = " "
333 | left_char_cost = above_char_cost = i
334 | next_trans_cost = 0
335 | for j in range(len_2):
336 | this_trans_cost = next_trans_cost
337 | next_trans_cost = prev_char_1_costs[j]
338 | # cost of diagonal (substitution)
339 | prev_char_1_costs[j] = current_cost = left_char_cost
340 | # left now equals current cost (which will be diagonal
341 | # at next iteration)
342 | left_char_cost = char_1_costs[j]
343 | prev_char_2 = char_2
344 | char_2 = string_2[start + j]
345 | if char_1 != char_2:
346 | # substitution if neither of two conditions below
347 | if above_char_cost < current_cost:
348 | current_cost = above_char_cost
349 | if left_char_cost < current_cost:
350 | current_cost = left_char_cost
351 | current_cost += 1
352 | if (
353 | i != 0
354 | and j != 0
355 | and char_1 == prev_char_2
356 | and prev_char_1 == char_2
357 | and this_trans_cost + 1 < current_cost
358 | ):
359 | # transposition
360 | current_cost = this_trans_cost + 1
361 | char_1_costs[j] = above_char_cost = current_cost
362 | return current_cost
363 |
364 | @staticmethod
365 | def _distance_max(
366 | string_1: str,
367 | string_2: str,
368 | len_1: int,
369 | len_2: int,
370 | start: int,
371 | max_distance: int,
372 | char_1_costs: list[int],
373 | prev_char_1_costs: list[int],
374 | ) -> int:
375 | """Internal implementation of the core Damerau-Levenshtein, optimal
376 | string alignment algorithm that accepts a max_distance.
377 |
378 | **From**: https://github.com/softwx/SoftWx.Match
379 | """
380 | char_1_costs = [
381 | j + 1 if j < max_distance else max_distance + 1 for j in range(len_2)
382 | ]
383 | len_diff = len_2 - len_1
384 | j_start_offset = max_distance - len_diff
385 | j_start = 0
386 | j_end = max_distance
387 | char_1 = " "
388 | current_cost = 0
389 | for i in range(len_1):
390 | prev_char_1 = char_1
391 | char_1 = string_1[start + i]
392 | char_2 = " "
393 | left_char_cost = above_char_cost = i
394 | next_trans_cost = 0
395 | # no need to look beyond window of lower right diagonal -
396 | # max_distance cells (lower right diag is i - len_diff) and the upper
397 | # left diagonal + max_distance cells (upper left is i)
398 | j_start += 1 if i > j_start_offset else 0
399 | j_end += 1 if j_end < len_2 else 0
400 | for j in range(j_start, j_end):
401 | this_trans_cost = next_trans_cost
402 | next_trans_cost = prev_char_1_costs[j]
403 | # cost of diagonal (substitution)
404 | prev_char_1_costs[j] = current_cost = left_char_cost
405 | # left now equals current cost (which will be diagonal at next
406 | # iteration)
407 | left_char_cost = char_1_costs[j]
408 | prev_char_2 = char_2
409 | char_2 = string_2[start + j]
410 | if char_1 != char_2:
411 | # substitution if neither of two conditions below
412 | if above_char_cost < current_cost:
413 | current_cost = above_char_cost
414 | if left_char_cost < current_cost:
415 | current_cost = left_char_cost
416 | current_cost += 1
417 | if (
418 | i != 0
419 | and j != 0
420 | and char_1 == prev_char_2
421 | and prev_char_1 == char_2
422 | and this_trans_cost + 1 < current_cost
423 | ):
424 | # transposition
425 | current_cost = this_trans_cost + 1
426 | char_1_costs[j] = above_char_cost = current_cost
427 | if char_1_costs[i + len_diff] > max_distance:
428 | return -1
429 | return current_cost if current_cost <= max_distance else -1
430 |
431 |
432 | class LevenshteinFast(AbstractDistanceComparer):
433 | """Provides an interface for computing edit distance metric between two
434 | strings using the fast Levenshtein algorithm.
435 | """
436 |
437 | def distance(
438 | self, string_1: Optional[str], string_2: Optional[str], max_distance: int
439 | ) -> int:
440 | """Computes the Levenshtein edit distance between two strings.
441 |
442 | Args:
443 | string_1: One of the strings to compare.
444 | string_2: The other string to compare.
445 | max_distance: The maximum distance that is of interest.
446 |
447 | Returns:
448 | -1 if the distance is greater than the max_distance, 0 if the strings
449 | are equivalent, otherwise a positive number whose magnitude
450 | increases as difference between the strings increases.
451 | """
452 | return levenshtein.distance(string_1, string_2, max_distance)
453 |
454 |
455 | class DamerauOsaFast(AbstractDistanceComparer):
456 | """Provides an interface for computing edit distance metric between two
457 | strings using the fast Damerau-Levenshtein Optimal String Alignment (OSA)
458 | algorithm.
459 | """
460 |
461 | def distance(
462 | self, string_1: Optional[str], string_2: Optional[str], max_distance: int
463 | ) -> int:
464 | """Computes the Damerau-Levenshtein optimal string alignment edit
465 | distance between two strings.
466 |
467 | Args:
468 | string_1: One of the strings to compare.
469 | string_2: The other string to compare.
470 | max_distance: The maximum distance that is of interest.
471 |
472 | Returns:
473 | -1 if the distance is greater than the max_distance, 0 if the strings
474 | are equivalent, otherwise a positive number whose magnitude
475 | increases as difference between the strings increases.
476 | """
477 | return damerau_osa.distance(string_1, string_2, max_distance)
478 |
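A minimal sketch of the `EditDistance` API above. The strings are arbitrary, and `HammingComparer` is a hypothetical comparer written here only to demonstrate `DistanceAlgorithm.USER_PROVIDED`; it is not part of the library.

```python
from symspellpy.abstract_distance_comparer import AbstractDistanceComparer
from symspellpy.editdistance import DistanceAlgorithm, EditDistance

# OSA counts an adjacent transposition as a single edit ...
osa = EditDistance(DistanceAlgorithm.DAMERAU_OSA)
print(osa.compare("caht", "chat", max_distance=2))  # 1

# ... while plain Levenshtein needs two substitutions.
lev = EditDistance(DistanceAlgorithm.LEVENSHTEIN)
print(lev.compare("caht", "chat", max_distance=2))  # 2

# -1 signals that the true distance exceeds max_distance.
print(lev.compare("kitten", "sitting", max_distance=2))  # -1

# A toy user-provided comparer. Real comparers should likewise return -1
# when the distance exceeds max_distance, as documented above.
class HammingComparer(AbstractDistanceComparer):
    def distance(self, string_1, string_2, max_distance):
        if string_1 is None or string_2 is None or len(string_1) != len(string_2):
            return -1
        dist = sum(a != b for a, b in zip(string_1, string_2))
        return dist if dist <= max_distance else -1

custom = EditDistance(DistanceAlgorithm.USER_PROVIDED, HammingComparer())
print(custom.compare("caht", "chat", max_distance=2))  # 2
```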
--------------------------------------------------------------------------------
/symspellpy/helpers.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2025 mmb L (Python port)
4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation)
5 | #
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
7 | # of this software and associated documentation files (the "Software"), to deal
8 | # in the Software without restriction, including without limitation the rights
9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 |
16 | """
17 | .. module:: helpers
18 | :synopsis: Helper functions
19 | """
20 |
21 | import re
22 | import sys
23 | import warnings
24 | from difflib import SequenceMatcher
25 | from typing import Optional
26 |
27 |
28 | def _rename_args(kwargs_map: dict[str, str], version: str):
29 | def decorator(func):
30 | def wrapped(*args, **kwargs):
31 | new_kwargs = {}
32 | for k, v in kwargs.items():
33 | if k in kwargs_map:
34 | warnings.warn(
35 | f"Keyword argument '{k}' is deprecated and will be removed in {version}. Use '{kwargs_map[k]}' instead.",
36 | DeprecationWarning,
37 | )
38 | new_kwargs[kwargs_map.get(k, k)] = v
39 | return func(*args, **new_kwargs)
40 |
41 | return wrapped
42 |
43 | return decorator
44 |
45 |
46 | def case_transfer_matching(cased_text: str, uncased_text: str) -> str:
47 |     """Transfers the casing from one text to another, assuming that they are
48 |     'matching' texts, i.e., they have the same length.
49 |
50 | Args:
51 | cased_text: Text with varied casing.
52 | uncased_text: Text that is in lowercase only.
53 |
54 | Returns:
55 | Text with the content of `uncased_text` and the casing of `cased_text`.
56 |
57 | Raises:
58 | ValueError: If the input texts have different lengths.
59 | """
60 | if len(cased_text) != len(uncased_text):
61 | raise ValueError(
62 | "'cased_text' and 'uncased_text' don't have the same length, use case_transfer_similar() instead"
63 | )
64 |
65 | return "".join(
66 | [
67 | y.upper() if x.isupper() else y.lower()
68 | for x, y in zip(cased_text, uncased_text)
69 | ]
70 | )
71 |
72 |
73 | def case_transfer_similar(cased_text: str, uncased_text: str) -> str:
74 | """Transfers the casing from one text to another - for similar (not matching)
75 | text.
76 |
77 |     Uses `difflib.SequenceMatcher` to identify the different types of changes
78 | needed to turn `cased_text` into `uncased_text`.
79 |
80 |     - For inserted sections: transfer the casing from the prior character. If
81 |       there is no prior character, or the prior character is a space, transfer
82 |       the casing from the following character.
83 | - For deleted sections: no case transfer is required.
84 |     - For equal sections: copy the text from the original, cased version, as
85 |       apart from casing the two are the same.
86 | - For replaced sections: transfer the casing using
87 |       :meth:`case_transfer_matching` if the two have the same length, otherwise
88 | transfer character-by-character and carry the last casing over to any
89 | additional characters.
90 |
91 | Args:
92 | cased_text: Text with varied casing.
93 | uncased_text: Text in lowercase.
94 |
95 | Returns:
96 | Text with the content of `uncased_text` but the casing of `cased_text`.
97 |
98 | Raises:
99 | ValueError: If `cased_text` is empty.
100 | """
101 | if not uncased_text:
102 | return uncased_text
103 |
104 | if not cased_text:
105 | raise ValueError("'cased_text' cannot be empty")
106 |
107 | matcher = SequenceMatcher(a=cased_text.lower(), b=uncased_text)
108 | result = ""
109 |
110 | for tag, i1, i2, j1, j2 in matcher.get_opcodes():
111 | if tag == "delete":
112 | continue
113 | if tag == "insert":
114 | # For the first character or space on the left, take the casing from
115 |             # the following character. Else take the casing from the prior character.
116 | ia_ref = i1 if i1 == 0 or cased_text[i1 - 1] == " " else i1 - 1
117 | if cased_text[ia_ref].isupper():
118 | result += uncased_text[j1:j2].upper()
119 | else:
120 | result += uncased_text[j1:j2].lower()
121 | elif tag == "equal":
122 | # Transfer the text from the cased_text, as anyhow they are equal
123 | # (without the casing)
124 | result += cased_text[i1:i2]
125 | else:
126 | cased_seq = cased_text[i1:i2]
127 | uncased_seq = uncased_text[j1:j2]
128 |
129 | if len(cased_seq) == len(uncased_seq):
130 | result += case_transfer_matching(cased_seq, uncased_seq)
131 | else:
132 | # transfer the casing character-by-character and using the last
133 | # casing to continue if we run out of the sequence
134 | for cased, uncased in zip(cased_seq, uncased_seq):
135 | result += uncased.upper() if cased.isupper() else uncased.lower()
136 | # Apply casing from the last character of cased_seq to the rest
137 | # of the uncased_seq
138 | if len(cased_seq) < len(uncased_seq):
139 | upper = cased_seq[-1].isupper()
140 | idx = len(cased_seq)
141 | result += "".join(
142 | map(str.upper if upper else str.lower, uncased_seq[idx:])
143 | )
144 | return result
145 |
146 |
147 | def increment_count(count: int, count_previous: int) -> int:
148 | """Increments count up to ``sys.maxsize``."""
149 | return (
150 | count_previous + count if sys.maxsize - count_previous > count else sys.maxsize
151 | )
152 |
153 |
154 | def is_acronym(word: str, contain_digits: bool = False) -> bool:
155 |     """Checks if the word is all caps (an acronym) and/or contains digits.
156 |
157 |     Args:
158 |         word: The word to check.
159 |         contain_digits: A flag to determine whether any term with digits can
160 |             be considered an acronym.
161 |
162 |     Returns:
163 |         True if the word is all caps and/or contains digits, e.g., ABCDE,
164 |         AB12C, abc12, ab12c. False if the word contains lower case letters,
165 |         e.g., abcde, ABCde, abcDE, abCDe.
166 | """
167 | return re.match(r"\b[A-Z0-9]{2,}\b", word) is not None or (
168 | contain_digits and any(i.isdigit() for i in word)
169 | )
170 |
171 |
172 | @_rename_args({"string1": "string_1", "string2": "string_2"}, "v7.0.0")
173 | def null_distance_results(
174 | string_1: Optional[str], string_2: Optional[str], max_distance: int
175 | ) -> int:
176 | """Determines the proper return value of an edit distance function when one
177 | or both strings are null.
178 |
179 | Args:
180 | string_1: Base string.
181 | string_2: The string to compare.
182 | max_distance: The maximum distance allowed.
183 |
184 | Returns:
185 | -1 if the distance is greater than the max_distance, 0 if the strings are
186 | equivalent (both are None), otherwise a positive number whose
187 | magnitude is the length of the string which is not None.
188 | """
189 | if string_1 is None:
190 | if string_2 is None:
191 | return 0
192 | return len(string_2) if len(string_2) <= max_distance else -1
193 | return len(string_1) if len(string_1) <= max_distance else -1
194 |
195 |
196 | def parse_words(
197 | phrase: str, preserve_case: bool = False, split_by_space: bool = False
198 | ) -> list[str]:
199 | """Creates a non-unique wordlist from sample text. Language independent
200 | (e.g. works with Chinese characters)
201 |
202 | Args:
203 | phrase: Sample text that could contain one or more words.
204 |         preserve_case: A flag to determine if we want to preserve case or
205 |             convert all to lowercase.
206 | split_by_space: Splits the phrase into words simply based on space.
207 |
208 | Returns:
209 |         A list of words.
210 | """
211 | if split_by_space:
212 | if preserve_case:
213 | return phrase.split()
214 | return phrase.lower().split()
215 | # \W non-words, use negated set to ignore non-words and "_" (underscore).
216 | # Compatible with non-latin characters, does not split words at apostrophes
217 | if preserve_case:
218 | return re.findall(r"([^\W_]+['’]*[^\W_]*)", phrase)
219 | return re.findall(r"([^\W_]+['’]*[^\W_]*)", phrase.lower())
220 |
221 |
222 | @_rename_args({"string1": "string_1", "string2": "string_2"}, "v7.0.0")
223 | def prefix_suffix_prep(string_1: str, string_2: str) -> tuple[int, int, int]:
224 | """Calculates starting position and lengths of two strings such that common
225 | prefix and suffix substrings are excluded.
226 | Expects len(string_1) <= len(string_2).
227 |
228 | Args:
229 | string_1: Base string.
230 | string_2: The string to compare.
231 |
232 | Returns:
233 | A tuple of lengths of the part excluding common prefix and suffix, and
234 | the starting position.
235 | """
236 |     # this is also the minimum length of the two strings
237 | len_1 = len(string_1)
238 | len_2 = len(string_2)
239 | # suffix common to both strings can be ignored
240 | while len_1 != 0 and string_1[len_1 - 1] == string_2[len_2 - 1]:
241 | len_1 -= 1
242 | len_2 -= 1
243 | # prefix common to both strings can be ignored
244 | start = 0
245 | while start != len_1 and string_1[start] == string_2[start]:
246 | start += 1
247 | if start != 0:
248 | len_1 -= start
249 | # length of the part excluding common prefix and suffix
250 | len_2 -= start
251 | return len_1, len_2, start
252 |
253 |
254 | def to_similarity(distance: int, length: int) -> float:
255 | """Calculates a similarity measure from an edit distance.
256 |
257 | Args:
258 | distance: The edit distance between two strings.
259 | length: The length of the longer of the two strings the edit distance is
260 | from.
261 |
262 | Returns:
263 |         A similarity value from 0 to 1.0 (1 - (distance / length)), or -1 if
264 |         distance is negative.
265 | """
266 | return -1 if distance < 0 else 1.0 - distance / length
267 |
268 |
269 | def try_parse_int64(string: str) -> Optional[int]:
270 | """Converts the string representation of a number to its 64-bit signed
271 | integer equivalent.
272 |
273 | Args:
274 | string: String representation of a number.
275 |
276 | Returns:
277 | The 64-bit signed integer equivalent, or None if conversion failed or if
278 | the number is less than the min value or greater than the max value
279 | of a 64-bit signed integer.
280 | """
281 | try:
282 | ret = int(string)
283 | except ValueError:
284 | return None
285 | return ret if -(2**63) <= ret <= 2**63 - 1 else None
286 |
287 |
288 | class DictIO:
289 |     """An iterator wrapper for a Python dictionary that formats the output as required
290 | by :meth:`load_dictionary_stream` and :meth:`load_dictionary_bigram_stream`.
291 |
292 | Args:
293 | dictionary: dictionary with words as keys and frequency count as values.
294 | separator: Separator characters between term(s) and count.
295 |
296 | Attributes:
297 | iteritems: An iterator object of dictionary.items().
298 | separator: Separator characters between term(s) and count.
299 | """
300 |
301 | def __init__(self, dictionary: dict[str, int], separator: str = " ") -> None:
302 | self.iteritems = iter(dictionary.items())
303 | self.separator = separator
304 |
305 | def __iter__(self) -> "DictIO":
306 | return self
307 |
308 | def __next__(self) -> str:
309 | return self.separator.join(map(str, next(self.iteritems)))
310 |
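A few illustrative calls against the helpers above; the inputs are made up for the example.

```python
from symspellpy import helpers

# Casing is carried over even when the two texts differ in length.
print(helpers.case_transfer_similar("Haw is the weeather", "how is the weather"))
# How is the weather

# The common prefix/suffix is stripped before the O(len_1 * len_2) core
# loops run: only the trailing "es" of "branches" remains to be compared.
print(helpers.prefix_suffix_prep("branch", "branches"))  # (0, 2, 6)

# Convert an edit distance into a 0..1 similarity (or -1 for "no match").
print(helpers.to_similarity(2, 8))   # 0.75
print(helpers.to_similarity(-1, 8))  # -1
```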
--------------------------------------------------------------------------------
/symspellpy/logging.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2025 mmb L (Python port)
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | #
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | import logging
16 | import sys
17 |
18 | logger = logging.getLogger("symspellpy")
19 |
20 | handler = logging.StreamHandler(sys.stderr)
21 | handler.setFormatter(
22 | logging.Formatter(fmt="%(asctime)s: %(levelname).1s %(name)s] %(message)s")
23 | )
24 |
25 | logger.addHandler(handler)
26 |
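Since the handler above is attached at import time, applications can tune the library's verbosity through the standard logging API; a minimal sketch:

```python
import logging

import symspellpy  # noqa: F401  (importing attaches the handler above)

# Silence informational messages from the library; keep warnings and errors.
logging.getLogger("symspellpy").setLevel(logging.WARNING)
```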
--------------------------------------------------------------------------------
/symspellpy/pickle_mixin.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2025 mmb L (Python port)
4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation)
5 | #
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
7 | # of this software and associated documentation files (the "Software"), to deal
8 | # in the Software without restriction, including without limitation the rights
9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 |
16 | """
17 | .. module:: pickle_mixin
18 | :synopsis: Mixin to provide pickle loading and saving functionalities.
19 | """
20 |
21 | import gzip
22 | import logging
23 | import pickle
24 | from operator import itemgetter
25 | from pathlib import Path
26 | from typing import IO, Optional, Union, cast
27 |
28 | logger = logging.getLogger(__name__)
29 |
30 |
31 | # Protocol only available in py38
32 | # class SymSpellProtocol(Protocol):
33 | # data_version: int
34 | # _count_threshold: int
35 | # _max_dictionary_edit_distance: int
36 | # _prefix_length: int
37 | # _deletes: dict[str, list[str]]
38 | # _words: dict[str, int]
39 | # _max_length: int
40 |
41 |
42 | class PickleMixin:
43 | """Implements saving and loading pickle functionality for SymSpell."""
44 |
45 | data_version: int
46 | _below_threshold_words: dict[str, int]
47 | _bigrams: dict[str, int]
48 | _deletes: dict[str, list[str]]
49 | _words: dict[str, int]
50 |
51 | _count_threshold: int
52 | _max_dictionary_edit_distance: int
53 | _max_length: int
54 | _prefix_length: int
55 |
56 | def load_pickle(
57 | self,
58 | data: Union[bytes, Path],
59 | compressed: bool = True,
60 | from_bytes: bool = False,
61 | ) -> bool:
62 |         """Loads delete combinations from file as pickle. This will reduce the
63 | loading time compared to running :meth:`load_dictionary` again.
64 |
65 | Args:
66 | data: Either bytes string to be used with ``from_bytes=True`` or the
67 | path+filename of the pickle file to be used with
68 | ``from_bytes=False``.
69 | compressed: A flag to determine whether to read the pickled data as
70 | compressed data.
71 | from_bytes: Flag to determine if we are loading from bytes or file.
72 |
73 | Returns:
74 | ``True`` if delete combinations are successfully loaded.
75 | """
76 | if from_bytes:
77 | assert isinstance(data, bytes)
78 | return self._load_pickle_stream(data, from_bytes)
79 | if compressed:
80 | with gzip.open(data, "rb") as gzip_infile:
81 | return self._load_pickle_stream(cast(IO[bytes], gzip_infile))
82 | else:
83 | with open(data, "rb") as infile:
84 | return self._load_pickle_stream(infile)
85 |
86 | def save_pickle(
87 | self,
88 | filename: Optional[Path] = None,
89 | compressed: bool = True,
90 | to_bytes: bool = False,
91 | ) -> Optional[bytes]:
92 |         """Pickles :attr:`_below_threshold_words`, :attr:`_bigrams`, :attr:`_deletes`,
93 |         :attr:`_words`, and :attr:`_max_length` into a stream for quicker loading later.
94 |
95 | Args:
96 | filename: The path+filename of the pickle file.
97 | compressed: A flag to determine whether to compress the pickled data.
98 |             to_bytes: Flag to determine if a bytes string should be returned
99 |                 instead of writing to file.
100 |
101 | Returns:
102 | A byte string of the pickled data if ``to_bytes=True``.
103 | """
104 | if to_bytes:
105 | return self._save_pickle_stream(to_bytes=to_bytes)
106 | assert filename is not None
107 | if compressed:
108 | with gzip.open(filename, "wb") as gzip_outfile:
109 | self._save_pickle_stream(cast(IO[bytes], gzip_outfile))
110 | else:
111 | with open(filename, "wb") as outfile:
112 | self._save_pickle_stream(outfile)
113 | return None
114 |
115 | def _load_pickle_stream(
116 | self, stream: Union[bytes, IO[bytes]], from_bytes: bool = False
117 | ) -> bool:
118 |         """Loads delete combinations from stream as pickle. This will reduce the
119 | loading time compared to running :meth:`load_dictionary` again.
120 |
121 |         **NOTE**: Logs a warning if the current settings `count_threshold`,
122 |         `max_dictionary_edit_distance`, and `prefix_length` are different from
123 |         the loaded settings. Overwrites the current settings with the loaded ones.
124 |
125 | Args:
126 | stream: The stream from which the pickle data is loaded.
127 | from_bytes: Flag to determine if we are loading from bytes or file.
128 |
129 | Returns:
130 | ``True`` if delete combinations are successfully loaded.
131 | """
132 | if from_bytes:
133 | assert isinstance(stream, bytes)
134 | pickle_data = pickle.loads(stream) # nosec
135 | else:
136 | assert not isinstance(stream, bytes)
137 | pickle_data = pickle.load(stream) # nosec
138 | if pickle_data.get("data_version", None) != self.data_version:
139 | return False
140 | settings = ("count_threshold", "max_dictionary_edit_distance", "prefix_length")
141 | if itemgetter(*settings)(pickle_data) != (
142 | self._count_threshold,
143 | self._max_dictionary_edit_distance,
144 | self._prefix_length,
145 | ):
146 | logger.warning(
147 | f"Loading data which was created using different {settings} settings. Overwriting current SymSpell instance with loaded settings ..."
148 | )
152 | # dictionary entries related variables
153 | self._below_threshold_words = pickle_data["below_threshold_words"]
154 | self._bigrams = pickle_data["bigrams"]
155 | self._deletes = pickle_data["deletes"]
156 | self._words = pickle_data["words"]
157 | self._max_length = pickle_data["max_length"]
158 | # SymSpell settings used to generate the above
159 | self._count_threshold = pickle_data["count_threshold"]
160 | self._max_dictionary_edit_distance = pickle_data["max_dictionary_edit_distance"]
161 | self._prefix_length = pickle_data["prefix_length"]
162 | return True
163 |
164 | def _save_pickle_stream(
165 | self, stream: Optional[IO[bytes]] = None, to_bytes: bool = False
166 | ) -> Optional[bytes]:
167 | """Pickles :attr:`_below_threshold_words`, :attr:`_bigrams`,
168 | :attr:`_deletes`, :attr:`_words`, and :attr:`_max_length` into
169 | a stream for quicker loading later.
170 |
171 | Pickles :attr:`_count_threshold`, :attr:`_max_dictionary_edit_distance`,
172 | and :attr:`_prefix_length` to ensure consistent behavior.
173 |
174 | Args:
175 | stream: The stream to store the pickle data.
176 |             to_bytes: Flag to determine if a bytes string should be returned
177 |                 instead of writing to file.
178 |
179 | Returns:
180 | A byte string of the pickled data if ``to_bytes=True``.
181 | """
182 | pickle_data = {
183 | # Dictionary entries related variables
184 | "below_threshold_words": self._below_threshold_words,
185 | "bigrams": self._bigrams,
186 | "deletes": self._deletes,
187 | "words": self._words,
188 | "max_length": self._max_length,
189 | # SymSpell settings used to generate the above
190 | "count_threshold": self._count_threshold,
191 | "max_dictionary_edit_distance": self._max_dictionary_edit_distance,
192 | "prefix_length": self._prefix_length,
193 | # Version to ensure compatibility
194 | "data_version": self.data_version,
195 | }
196 | if to_bytes:
197 | return pickle.dumps(pickle_data)
198 | assert stream is not None
199 | pickle.dump(pickle_data, stream)
200 | return None
201 |
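A minimal save/load sketch. It assumes `SymSpell` mixes this class in (it does in this package) and uses a placeholder dictionary path, `my_dict.txt`.

```python
from pathlib import Path

from symspellpy import SymSpell

sym_spell = SymSpell()
# "my_dict.txt" is a placeholder frequency dictionary path.
sym_spell.load_dictionary("my_dict.txt", 0, 1)

# Round-trip through a gzip-compressed pickle file ...
sym_spell.save_pickle(Path("symspell.pickle.gz"))
restored = SymSpell()
restored.load_pickle(Path("symspell.pickle.gz"))

# ... or through an in-memory bytes string.
blob = sym_spell.save_pickle(to_bytes=True)
restored_from_bytes = SymSpell()
restored_from_bytes.load_pickle(blob, from_bytes=True)
```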
--------------------------------------------------------------------------------
/symspellpy/suggest_item.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2025 mmb L (Python port)
4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation)
5 | #
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
7 | # of this software and associated documentation files (the "Software"), to deal
8 | # in the Software without restriction, including without limitation the rights
9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 |
16 | """
17 | .. module:: suggest_item
18 | :synopsis: Data class for :meth:`symspellpy.symspellpy.lookup`.
19 | """
20 |
21 |
22 | class SuggestItem:
23 | """Spelling suggestion returned from :meth:`lookup`.
24 |
25 | Args:
26 | term: The suggested word.
27 | distance: Edit distance from search word.
28 | count: Frequency of suggestion in dictionary or Naive Bayes probability
29 | of the individual suggestion parts.
30 | """
31 |
32 | def __init__(self, term: str, distance: int, count: int) -> None:
33 | self._term = term
34 | self._distance = distance
35 | self._count = count
36 |
37 | def __eq__(self, other: object) -> bool:
38 | """
39 | Returns:
40 | ``True`` if both distance and frequency count are the same.
41 | """
42 | if not isinstance(other, SuggestItem):
43 | return NotImplemented
44 | if self._distance == other.distance:
45 | return self._count == other.count
46 |         return False
47 |
48 | def __lt__(self, other: object) -> bool:
49 | """
50 | Returns:
51 | Order by distance ascending, then by frequency count descending.
52 | """
53 | if not isinstance(other, SuggestItem):
54 | return NotImplemented
55 | if self._distance == other.distance:
56 | return self._count > other.count
57 | return self._distance < other.distance
58 |
59 | def __str__(self) -> str:
60 | """
61 | Returns:
62 | Displays attributes as "term, distance, count".
63 | """
64 | return f"{self._term}, {self._distance}, {self._count}"
65 |
66 | @property
67 | def count(self) -> int:
68 | """Frequency of suggestion in the dictionary (a measure of how common the
69 | word is) or Naive Bayes probability of the individual suggestion parts in
70 | :meth:`lookup_compound`.
71 | """
72 | return self._count
73 |
74 | @count.setter
75 | def count(self, count: int) -> None:
76 | self._count = count
77 |
78 | @property
79 | def distance(self) -> int:
80 | """Edit distance between searched for word and suggestion."""
81 | return self._distance
82 |
83 | @distance.setter
84 | def distance(self, distance: int) -> None:
85 | self._distance = distance
86 |
87 | @property
88 | def term(self) -> str:
89 | """The suggested correctly spelled word."""
90 | return self._term
91 |
92 | @term.setter
93 | def term(self, term: str) -> None:
94 | self._term = term
95 |
96 | @classmethod
97 | def create_with_probability(cls, term: str, distance: int) -> "SuggestItem":
98 | """Creates a SuggestItem with Naive Bayes probability as the count."""
99 | return cls(term, distance, 10 // 10 ** len(term))
100 |
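A small sketch of the ordering semantics defined by `__lt__` above; the terms and counts are made up.

```python
from symspellpy.suggest_item import SuggestItem

items = [
    SuggestItem("their", 2, 100),
    SuggestItem("there", 1, 50),
    SuggestItem("these", 1, 300),
]

# Smaller distance sorts first; at equal distance, the higher count wins.
for item in sorted(items):
    print(item)
# these, 1, 300
# there, 1, 50
# their, 2, 100
```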
--------------------------------------------------------------------------------
/symspellpy/verbosity.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 | #
3 | # Copyright (c) 2025 mmb L (Python port)
4 | # Copyright (c) 2021 Wolf Garbe (Original C# implementation)
5 | #
6 | # Permission is hereby granted, free of charge, to any person obtaining a copy
7 | # of this software and associated documentation files (the "Software"), to deal
8 | # in the Software without restriction, including without limitation the rights
9 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | # copies of the Software, and to permit persons to whom the Software is
11 | # furnished to do so, subject to the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included in all
14 | # copies or substantial portions of the Software.
15 |
16 | """
17 | .. module:: verbosity
18 | :synopsis: Enum for lookup results verbosity.
19 | """
20 |
21 | from enum import Enum
22 |
23 |
24 | class Verbosity(Enum):
25 | """Controls the closeness/quantity of returned spelling suggestions.
26 |
27 | Attributes:
28 | TOP: Top suggestion with the highest term frequency of the suggestions of
29 | smallest edit distance found.
30 | CLOSEST: All suggestions of smallest edit distance found, suggestions
31 | ordered by term frequency.
32 |         ALL: All suggestions within max_edit_distance, suggestions ordered by
33 |             edit distance, then by term frequency (slower, no early termination).
34 | """
35 |
36 | TOP = 0
37 | CLOSEST = 1
38 | ALL = 2
39 |
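A sketch of how the three levels differ, using a tiny hand-built dictionary rather than the bundled frequency files; the terms and counts are invented for the example.

```python
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell()
sym_spell.create_dictionary_entry("there", 200)
sym_spell.create_dictionary_entry("these", 100)
sym_spell.create_dictionary_entry("theres", 10)

for verbosity in (Verbosity.TOP, Verbosity.CLOSEST, Verbosity.ALL):
    suggestions = sym_spell.lookup("thre", verbosity, max_edit_distance=2)
    print(verbosity, [str(s) for s in suggestions])
# Verbosity.TOP ['there, 1, 200']
# Verbosity.CLOSEST ['there, 1, 200']
# Verbosity.ALL ['there, 1, 200', 'these, 2, 100', 'theres, 2, 10']
```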
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mammothb/symspellpy/f4d1531a686038975370be3db4c19685564c2efe/tests/__init__.py
--------------------------------------------------------------------------------
/tests/benchmarks.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import importlib.resources\n",
10 | "import sys\n",
11 | "from pathlib import Path\n",
12 | "\n",
13 | "sys.path.append(str(Path.cwd().parent))\n",
14 | "\n",
15 | "from symspellpy import SymSpell, Verbosity\n",
16 | "from symspellpy.editdistance import DistanceAlgorithm, EditDistance"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 2,
22 | "metadata": {},
23 | "outputs": [
24 | {
25 | "data": {
26 | "text/plain": [
27 | "True"
28 | ]
29 | },
30 | "execution_count": 2,
31 | "metadata": {},
32 | "output_type": "execute_result"
33 | }
34 | ],
35 | "source": [
36 | "bigram_path = importlib.resources.files(\"symspellpy\") / \"frequency_bigramdictionary_en_243_342.txt\"\n",
37 | "\n",
38 | "dictionary_path = importlib.resources.files(\"symspellpy\") / \"frequency_dictionary_en_82_765.txt\"\n",
39 | "\n",
40 | "sym_spell_damerau_osa = SymSpell(distance_comparer=EditDistance(DistanceAlgorithm.DAMERAU_OSA))\n",
41 | "sym_spell_damerau_osa.load_bigram_dictionary(bigram_path, 0, 2)\n",
42 | "sym_spell_damerau_osa.load_dictionary(dictionary_path, 0, 1)\n",
43 | "\n",
44 | "sym_spell_damerau_osa_fast = SymSpell(distance_comparer=EditDistance(DistanceAlgorithm.DAMERAU_OSA_FAST))\n",
45 | "sym_spell_damerau_osa_fast.load_bigram_dictionary(bigram_path, 0, 2)\n",
46 | "sym_spell_damerau_osa_fast.load_dictionary(dictionary_path, 0, 1)\n",
47 | "\n",
48 | "sym_spell_levenshtein = SymSpell(distance_comparer=EditDistance(DistanceAlgorithm.LEVENSHTEIN))\n",
49 | "sym_spell_levenshtein.load_bigram_dictionary(bigram_path, 0, 2)\n",
50 | "sym_spell_levenshtein.load_dictionary(dictionary_path, 0, 1)\n",
51 | "\n",
52 | "sym_spell_levenshtein_fast = SymSpell(distance_comparer=EditDistance(DistanceAlgorithm.LEVENSHTEIN_FAST))\n",
53 | "sym_spell_levenshtein_fast.load_bigram_dictionary(bigram_path, 0, 2)\n",
54 | "sym_spell_levenshtein_fast.load_dictionary(dictionary_path, 0, 1)"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 3,
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "def lookup_damerau_osa():\n",
64 | " sym_spell_damerau_osa.lookup(\"tepmperamet\", Verbosity.ALL)\n",
65 | "\n",
66 | "def lookup_damerau_osa_fast():\n",
67 | " sym_spell_damerau_osa_fast.lookup(\"tepmperamet\", Verbosity.ALL)\n",
68 | "\n",
69 | "def lookup_levenshtein():\n",
70 | " sym_spell_levenshtein.lookup(\"tepmperamet\", Verbosity.ALL)\n",
71 | "\n",
72 | "def lookup_levenshtein_fast():\n",
73 | " sym_spell_levenshtein_fast.lookup(\"tepmperamet\", Verbosity.ALL)\n",
74 | "\n",
75 | "def lookup_compound_damerau_osa():\n",
76 | " sym_spell_damerau_osa.lookup_compound(\"whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him\", 2)\n",
77 | "\n",
78 | "def lookup_compound_damerau_osa_fast():\n",
79 | " sym_spell_damerau_osa_fast.lookup_compound(\"whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him\", 2)\n",
80 | "\n",
81 | "def lookup_compound_levenshtein():\n",
82 | " sym_spell_levenshtein.lookup_compound(\"whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him\", 2)\n",
83 | "\n",
84 | "def lookup_compound_levenshtein_fast():\n",
85 | " sym_spell_levenshtein_fast.lookup_compound(\"whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him\", 2)\n",
86 | "\n",
87 | "def word_segmentation_damerau_osa():\n",
88 | " sym_spell_damerau_osa.word_segmentation(\"thequickbrownfoxjumpsoverthelazydog\", 0)\n",
89 | "\n",
90 | "def word_segmentation_damerau_osa_fast():\n",
91 | " sym_spell_damerau_osa_fast.word_segmentation(\"thequickbrownfoxjumpsoverthelazydog\", 0)\n",
92 | "\n",
93 | "def word_segmentation_levenshtein():\n",
94 | " sym_spell_levenshtein.word_segmentation(\"thequickbrownfoxjumpsoverthelazydog\", 0)\n",
95 | "\n",
96 | "def word_segmentation_levenshtein_fast():\n",
97 | " sym_spell_levenshtein_fast.word_segmentation(\"thequickbrownfoxjumpsoverthelazydog\", 0)"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 4,
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "name": "stdout",
107 | "output_type": "stream",
108 | "text": [
109 | "107 μs ± 356 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n",
110 | "67.6 μs ± 319 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n",
111 | "95.4 μs ± 563 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n",
112 | "66.7 μs ± 295 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)\n"
113 | ]
114 | }
115 | ],
116 | "source": [
117 | "%timeit lookup_damerau_osa()\n",
118 | "%timeit lookup_damerau_osa_fast()\n",
119 | "%timeit lookup_levenshtein()\n",
120 | "%timeit lookup_levenshtein_fast()"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 5,
126 | "metadata": {},
127 | "outputs": [
128 | {
129 | "name": "stdout",
130 | "output_type": "stream",
131 | "text": [
132 | "9.89 ms ± 65.3 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
133 | "5.1 ms ± 13.1 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
134 | "8.68 ms ± 46.6 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n",
135 | "4.95 ms ± 13.2 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
136 | ]
137 | }
138 | ],
139 | "source": [
140 | "%timeit lookup_compound_damerau_osa()\n",
141 | "%timeit lookup_compound_damerau_osa_fast()\n",
142 | "%timeit lookup_compound_levenshtein()\n",
143 | "%timeit lookup_compound_levenshtein_fast()"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 6,
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "name": "stdout",
153 | "output_type": "stream",
154 | "text": [
155 | "1.13 ms ± 1.36 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n",
156 | "1.14 ms ± 2.94 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n",
157 | "1.14 ms ± 3.56 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n",
158 | "1.14 ms ± 1.6 μs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)\n"
159 | ]
160 | }
161 | ],
162 | "source": [
163 | "%timeit word_segmentation_damerau_osa()\n",
164 | "%timeit word_segmentation_damerau_osa_fast()\n",
165 | "%timeit word_segmentation_levenshtein()\n",
166 | "%timeit word_segmentation_levenshtein_fast()"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 |     "**Note**: The near-identical timings for `word_segmentation` are expected since we pass `max_edit_distance=0`, so the choice of edit distance algorithm has no effect."
174 | ]
175 | }
176 | ],
177 | "metadata": {
178 | "interpreter": {
179 | "hash": "d83327bb218665ef1f16f1956a0b9fb217f4e8f6e80f84663e37ea0a49e5699a"
180 | },
181 | "kernelspec": {
182 | "display_name": "Python 3 (ipykernel)",
183 | "language": "python",
184 | "name": "python3"
185 | },
186 | "language_info": {
187 | "codemirror_mode": {
188 | "name": "ipython",
189 | "version": 3
190 | },
191 | "file_extension": ".py",
192 | "mimetype": "text/x-python",
193 | "name": "python",
194 | "nbconvert_exporter": "python",
195 | "pygments_lexer": "ipython3",
196 | "version": "3.13.2"
197 | }
198 | },
199 | "nbformat": 4,
200 | "nbformat_minor": 4
201 | }
202 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import json
2 | from pathlib import Path
3 |
4 | import importlib_resources
5 | import pytest
6 |
7 | from symspellpy import SymSpell
8 |
9 | FORTESTS_DIR = Path(__file__).resolve().parent / "fortests"
10 |
11 |
12 | #######################################################################
13 | # Paths
14 | #######################################################################
15 | @pytest.fixture
16 | def bigram_path():
17 | ref = (
18 | importlib_resources.files("symspellpy")
19 | / "frequency_bigramdictionary_en_243_342.txt"
20 | )
21 | with importlib_resources.as_file(ref) as path:
22 | yield path
23 |
24 |
25 | @pytest.fixture
26 | def dictionary_path():
27 | ref = importlib_resources.files("symspellpy") / "frequency_dictionary_en_82_765.txt"
28 | with importlib_resources.as_file(ref) as path:
29 | yield path
30 |
31 |
32 | @pytest.fixture
33 | def pickle_path():
34 | return FORTESTS_DIR / "dictionary.pickle"
35 |
36 |
37 | @pytest.fixture
38 | def query_path():
39 | return FORTESTS_DIR / "noisy_query_en_1000.txt"
40 |
41 |
42 | #######################################################################
43 | # Misc
44 | #######################################################################
45 | @pytest.fixture
46 | def get_same_word_and_count():
47 | word = "hello"
48 | return [(word, 11), (word, 3)]
49 |
50 |
51 | @pytest.fixture
52 | def get_fortests_data(request):
53 | with open(FORTESTS_DIR / request.param) as infile:
54 | return json.load(infile)["data"]
55 |
56 |
57 | #######################################################################
58 | # symspells
59 | #######################################################################
60 | @pytest.fixture
61 | def symspell_default():
62 | return SymSpell()
63 |
64 |
65 | @pytest.fixture
66 | def symspell_default_entry(symspell_default, request):
67 | for entry in request.param:
68 | symspell_default.create_dictionary_entry(entry[0], entry[1])
69 | return symspell_default
70 |
71 |
72 | @pytest.fixture
73 | def symspell_default_load(symspell_default, dictionary_path, bigram_path, request):
74 | symspell_default.load_dictionary(dictionary_path, 0, 1)
75 | if request.param == "bigram":
76 | symspell_default.load_bigram_dictionary(bigram_path, 0, 2)
77 | return symspell_default, request.param
78 |
79 |
80 | @pytest.fixture
81 | def symspell_long():
82 | return SymSpell(5)
83 |
84 |
85 | @pytest.fixture
86 | def symspell_long_entry(symspell_long, request):
87 | for entry in request.param:
88 | symspell_long.create_dictionary_entry(entry, 2)
89 | return symspell_long, request.param
90 |
91 |
92 | @pytest.fixture
93 | def symspell_short(request):
94 | if request.param is None:
95 | return SymSpell(1, 3)
96 | return SymSpell(1, 3, count_threshold=request.param)
97 |
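The `request.param`-driven fixtures above are meant for indirect parametrization. A hypothetical test (the test name is invented; `word_count` is SymSpell's dictionary entry count) would drive them like this:

```python
import pytest

# request.param inside symspell_default_entry receives this list, and each
# (term, count) pair is passed to create_dictionary_entry.
@pytest.mark.parametrize(
    "symspell_default_entry", [[("hello", 11), ("world", 3)]], indirect=True
)
def test_entries_are_loaded(symspell_default_entry):
    assert symspell_default_entry.word_count == 2
```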
--------------------------------------------------------------------------------
/tests/fortests/bad_dict.txt:
--------------------------------------------------------------------------------
1 | qwer
2 | wert
3 | erty
4 | rtyu tyui 12
5 | yuio uiop 13
6 | asdf 10
7 | sdfg 12
--------------------------------------------------------------------------------
/tests/fortests/below_threshold_dict.txt:
--------------------------------------------------------------------------------
1 | below 8
2 | threshold 10
3 | word 10
4 |
--------------------------------------------------------------------------------
/tests/fortests/lookup_compound_data.json:
--------------------------------------------------------------------------------
1 | {
2 | "data": [
3 | {
4 | "typo": "whereis th elove",
5 | "bigram": {
6 | "num_results": 1,
7 | "term": "where is the love",
8 | "distance": 2,
9 | "count": 585
10 | },
11 | "unigram": {
12 | "num_results": 1,
13 | "term": "whereas the love",
14 | "distance": 2,
15 | "count": 64
16 | }
17 | },
18 | {
19 | "typo": "the bigjest playrs",
20 | "bigram": {
21 | "num_results": 1,
22 | "term": "the biggest players",
23 | "distance": 2,
24 | "count": 34
25 | },
26 | "unigram": {
27 | "num_results": 1,
28 | "term": "the biggest players",
29 | "distance": 2,
30 | "count": 34
31 | }
32 | },
33 | {
34 | "typo": "Can yu readthis",
35 | "bigram": {
36 | "num_results": 1,
37 | "term": "can you read this",
38 | "distance": 3,
39 | "count": 11440
40 | },
41 | "unigram": {
42 | "num_results": 1,
43 | "term": "can you read this",
44 | "distance": 3,
45 | "count": 3
46 | }
47 | },
48 | {
49 | "typo": "whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him",
50 | "bigram": {
51 | "num_results": 1,
52 | "term": "where is the love he had dated for much of the past who couldn't read in sixth grade and inspired him",
53 | "distance": 9,
54 | "count": 0
55 | },
56 | "unigram": {
57 | "num_results": 1,
58 | "term": "whereas the love head dated for much of the past who couldn't read in sixth grade and inspired him",
59 | "distance": 9,
60 | "count": 0
61 | }
62 | },
63 | {
64 | "typo": "in te dhird qarter oflast jear he hadlearned ofca sekretplan",
65 | "bigram": {
66 | "num_results": 1,
67 | "term": "in the third quarter of last year he had learned of a secret plan",
68 | "distance": 9,
69 | "count": 0
70 | },
71 | "unigram": {
72 | "num_results": 1,
73 | "term": "in the third quarter of last year he had learned of a secret plan",
74 | "distance": 9,
75 | "count": 0
76 | }
77 | },
78 | {
79 | "typo": "the bigjest playrs in te strogsommer film slatew ith plety of funn",
80 | "bigram": {
81 | "num_results": 1,
82 | "term": "the biggest players in the strong summer film slate with plenty of fun",
83 | "distance": 9,
84 | "count": 0
85 | },
86 | "unigram": {
87 | "num_results": 1,
88 | "term": "the biggest players in the strong summer film slate with plenty of fun",
89 | "distance": 9,
90 | "count": 0
91 | }
92 | },
93 | {
94 | "typo": "Can yu readthis messa ge despite thehorible sppelingmsitakes",
95 | "bigram": {
96 | "num_results": 1,
97 | "term": "can you read this message despite the horrible spelling mistakes",
98 | "distance": 10,
99 | "count": 0
100 | },
101 | "unigram": {
102 | "num_results": 1,
103 | "term": "can you read this message despite the horrible spelling mistakes",
104 | "distance": 10,
105 | "count": 0
106 | }
107 | }
108 | ]
109 | }
110 |
--------------------------------------------------------------------------------
/tests/fortests/lookup_compound_ignore_non_words_data.json:
--------------------------------------------------------------------------------
1 | {
2 | "data": [
3 | {
4 | "typo": "whereis th elove 123 hehad dated forImuch of THEPAST who couqdn'tread in SIXTHgrade and ins pired him",
5 | "bigram": {
6 | "term": "where is the love 123 he had dated for much of THEPAST who couldn't read in sixth grade and inspired him"
7 | },
8 | "unigram": {
9 | "term": "whereas the love 123 head dated for much of THEPAST who couldn't read in sixth grade and inspired him"
10 | }
11 | },
12 | {
13 | "typo": "in te DHIRD 1 qarter oflast jear he hadlearned ofca sekretplan",
14 | "bigram": {
15 | "term": "in the DHIRD 1 quarter of last year he had learned of a secret plan"
16 | },
17 | "unigram": {
18 | "term": "in the DHIRD 1 quarter of last year he had learned of a secret plan"
19 | }
20 | },
21 | {
22 | "typo": "the bigjest playrs in te stroGSOmmer film slatew ith PLETY of 12 funn",
23 | "bigram": {
24 | "term": "the biggest players in the strong summer film slate with PLETY of 12 fun"
25 | },
26 | "unigram": {
27 | "term": "the biggest players in the strong summer film slate with PLETY of 12 fun"
28 | }
29 | },
30 | {
31 | "typo": "Can yu readtHIS messa ge despite thehorible 1234 sppelingmsitakes",
32 | "bigram": {
33 | "term": "can you read this message despite the horrible 1234 spelling mistakes"
34 | },
35 | "unigram": {
36 | "term": "can you read this message despite the horrible 1234 spelling mistakes"
37 | }
38 | },
39 | {
40 | "typo": "Can yu readtHIS messa ge despite thehorible AB1234 sppelingmsitakes",
41 | "bigram": {
42 | "term": "can you read this message despite the horrible AB1234 spelling mistakes"
43 | },
44 | "unigram": {
45 | "term": "can you read this message despite the horrible AB1234 spelling mistakes"
46 | }
47 | },
48 | {
49 | "typo": "PI on leave, arrange Co-I to do screening",
50 | "bigram": { "term": "PI on leave arrange co i to do screening" },
51 | "unigram": { "term": "PI on leave arrange co i to do screening" }
52 | }
53 | ]
54 | }
55 |
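Note: this fixture exercises lookup_compound with ignore_non_words=True, which leaves digit tokens ("1", "1234") and acronym-like all-caps tokens ("THEPAST", "DHIRD", "AB1234") uncorrected. A minimal sketch, again with an illustrative dictionary path:

    from symspellpy import SymSpell

    sym_spell = SymSpell()
    sym_spell.load_dictionary("symspellpy/frequency_dictionary_en_82_765.txt", 0, 1)

    typo = "in te DHIRD 1 qarter oflast jear he hadlearned ofca sekretplan"
    suggestions = sym_spell.lookup_compound(
        typo, max_edit_distance=2, ignore_non_words=True
    )
    print(suggestions[0].term)  # per the fixture: "in the DHIRD 1 quarter ..."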
--------------------------------------------------------------------------------
/tests/fortests/lookup_compound_replaced_words_data.json:
--------------------------------------------------------------------------------
1 | {
2 | "data": [
3 | {
4 | "typo": "whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him",
5 | "bigram": {
6 | "term": "where is the love he had dated for much of the past who couldn't read in sixth grade and inspired him",
7 | "replacement": {
8 | "whereis": "where is",
9 | "th": "the",
10 | "elove": "love",
11 | "hehad": "he had",
12 | "forimuch": "for much",
13 | "thepast": "the past",
14 | "couqdn'tread": "couldn't read",
15 | "sixthgrade": "sixth grade",
16 | "ins": "inspired"
17 | }
18 | },
19 | "unigram": {
20 | "term": "whereas the love head dated for much of the past who couldn't read in sixth grade and inspired him",
21 | "replacement": {
22 | "whereis": "whereas",
23 | "th": "the",
24 | "elove": "love",
25 | "hehad": "head",
26 | "forimuch": "for much",
27 | "thepast": "the past",
28 | "couqdn'tread": "couldn't read",
29 | "sixthgrade": "sixth grade",
30 | "ins": "inspired"
31 | }
32 | }
33 | },
34 | {
35 | "typo": "in te dhird qarter oflast jear he hadlearned ofca sekretplan",
36 | "bigram": {
37 | "term": "in the third quarter of last year he had learned of a secret plan",
38 | "replacement": {
39 | "te": "the",
40 | "dhird": "third",
41 | "qarter": "quarter",
42 | "oflast": "of last",
43 | "jear": "year",
44 | "hadlearned": "had learned",
45 | "ofca": "of a",
46 | "sekretplan": "secret plan"
47 | }
48 | },
49 | "unigram": {
50 | "term": "in the third quarter of last year he had learned of a secret plan",
51 | "replacement": {
52 | "te": "the",
53 | "dhird": "third",
54 | "qarter": "quarter",
55 | "oflast": "of last",
56 | "jear": "year",
57 | "hadlearned": "had learned",
58 | "ofca": "of a",
59 | "sekretplan": "secret plan"
60 | }
61 | }
62 | },
63 | {
64 | "typo": "the bigjest playrs in te strogsommer film slatew ith plety of funn",
65 | "bigram": {
66 | "term": "the biggest players in the strong summer film slate with plenty of fun",
67 | "replacement": {
68 | "bigjest": "biggest",
69 | "playrs": "players",
70 | "strogsommer": "strong summer",
71 | "slatew": "slate",
72 | "ith": "with",
73 | "plety": "plenty",
74 | "funn": "fun"
75 | }
76 | },
77 | "unigram": {
78 | "term": "the biggest players in the strong summer film slate with plenty of fun",
79 | "replacement": {
80 | "bigjest": "biggest",
81 | "playrs": "players",
82 | "strogsommer": "strong summer",
83 | "slatew": "slate",
84 | "ith": "with",
85 | "plety": "plenty",
86 | "funn": "fun"
87 | }
88 | }
89 | }
90 | ]
91 | }
92 |
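Note: the "replacement" maps record which input token became which correction. A minimal sketch of the bookkeeping they are presumably asserted against, SymSpell.replaced_words, which is repopulated by each lookup_compound call; treat that attribute usage as an assumption here, since the consuming test lives elsewhere under tests/:

    from symspellpy import SymSpell

    sym_spell = SymSpell()
    sym_spell.load_dictionary("symspellpy/frequency_dictionary_en_82_765.txt", 0, 1)

    sym_spell.lookup_compound("in te dhird qarter oflast jear", max_edit_distance=2)
    # replaced_words maps each replaced token to its SuggestItem, e.g. "dhird" -> "third".
    for typo, suggestion in sym_spell.replaced_words.items():
        print(f"{typo} -> {suggestion.term}")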
--------------------------------------------------------------------------------
/tests/fortests/lookup_compound_transfer_casing_data.json:
--------------------------------------------------------------------------------
1 | {
2 | "data": [
3 | {
4 | "typo": "Whereis th elove hehaD Dated forImuch of thepast who couqdn'tread in sixthgrade AND ins pired him",
5 | "bigram": {
6 | "term": "Where is the love he haD Dated for much of the past who couldn't read in sixth grade AND inspired him"
7 | },
8 | "unigram": {
9 | "term": "Whereas the love heaD Dated for much of the past who couldn't read in sixth grade AND inspired him"
10 | }
11 | }
12 | ]
13 | }
14 |
--------------------------------------------------------------------------------
/tests/fortests/lookup_compound_transfer_casing_ignore_nonwords_data.json:
--------------------------------------------------------------------------------
1 | {
2 | "data": [
3 | {
4 | "typo": "Whereis th elove hehaD Dated FOREEVER forImuch of thepast who couqdn'tread in sixthgrade AND ins pired him",
5 | "bigram": {
6 | "term": "Where is the love he haD Dated FOREEVER for much of the past who couldn't read in sixth grade AND inspired him"
7 | },
8 | "unigram": {
9 | "term": "Whereas the love heaD Dated FOREEVER for much of the past who couldn't read in sixth grade AND inspired him"
10 | }
11 | }
12 | ]
13 | }
14 |
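Note: this variant combines transfer_casing=True (the corrected text re-acquires the input's casing, hence "Where is the love he haD ...") with ignore_non_words=True (FOREEVER passes through untouched). A minimal sketch with an illustrative dictionary path:

    from symspellpy import SymSpell

    sym_spell = SymSpell()
    sym_spell.load_dictionary("symspellpy/frequency_dictionary_en_82_765.txt", 0, 1)

    typo = "Whereis th elove hehaD Dated FOREEVER forImuch of thepast"
    suggestions = sym_spell.lookup_compound(
        typo, max_edit_distance=2, ignore_non_words=True, transfer_casing=True
    )
    print(suggestions[0].term)  # leading capital and FOREEVER preserved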
--------------------------------------------------------------------------------
/tests/fortests/noisy_query_en_1000.txt:
--------------------------------------------------------------------------------
1 | te the 1
2 | aojecm project 3
3 | gutenberg gutenberg 0
4 | eboo ebook 1
5 | yof of 1
6 | adventures adventures 0
7 | sherlock sherlock 0
8 | polxs holmes 3
9 | si sir 1
10 | arthur arthur 0
11 | conn conan 1
12 | doyle doyle 0
13 | in in 0
14 | our our 0
15 | aeries series 1
16 | copyrgt copyright 2
17 | laws laws 0
18 | are are 0
19 | changng changing 1
20 | all all 0
21 | over over 0
22 | world world 0
23 | re sure 2
24 | to to 0
25 | check check 0
26 | qor for 1
27 | youbr your 1
28 | countrfy country 1
29 | before before 0
30 | dwnoadingg downloading 3
31 | or or 0
32 | redistributing redistributing 0
33 | ntis this 2
34 | any any 0
35 | other other 0
36 | jekler header 3
37 | shoflg should 2
38 | first first 0
39 | hng thing 2
40 | seen seen 0
41 | when when 0
42 | vkewing viewing 1
43 | cle file 2
44 | pleare please 1
45 | do do 0
46 | not not 0
47 | reovef remove 2
48 | it it 0
49 | change change 0
50 | ei edit 2
51 | ywthout without 2
52 | ritten written 1
53 | wermission permission 1
54 | rad read 1
55 | legailb legal 2
56 | smll small 1
57 | prinj print 1
58 | and and 0
59 | informatin information 1
60 | about about 0
61 | ct at 1
62 | abcttom bottom 2
63 | incluqed included 1
64 | if is 1
65 | important important 0
66 | speciic specific 1
67 | rights rights 0
68 | eetrnictions restrictions 3
69 | how how 0
70 | may may 0
71 | udsd used 2
72 | ou you 1
73 | can can 0
74 | also also 0
75 | itnd find 2
76 | ut out 1
77 | eake make 1
78 | donation donation 0
79 | get get 0
80 | nvotlved involved 2
81 | wellcome welcome 1
82 | free free 0
83 | pain plain 1
84 | vanla vanilla 2
85 | electronic electronic 0
86 | tets texts 1
87 | ebooks ebooks 0
88 | readable readable 0
89 | bt both 2
90 | umns humans 2
91 | compurters computers 1
92 | sikce since 1
93 | thseoe these 2
94 | iere were 1
95 | preard prepared 2
96 | thusans thousands 2
97 | volutrers volunteers 2
98 | tle title 2
99 | athor author 1
100 | release release 0
101 | daxe date 1
102 | marq march 2
103 | most most 0
104 | recently recently 0
105 | updayd updated 2
106 | dncovaember november 3
107 | edition edition 0
108 | lafnguage language 1
109 | engih english 2
110 | chactr character 3
111 | set set 0
112 | encoding encoding 0
113 | asci ascii 1
114 | styrt start 1
115 | additional additional 0
116 | edoing editing 2
117 | josex jose 1
118 | menendez menendez 0
119 | cntems contents 3
120 | scgndial scandal 2
121 | bohemja bohemia 1
122 | ii ii 0
123 | red red 0
124 | hdted headed 3
125 | leagm league 2
126 | iii iii 0
127 | csfe case 2
128 | identity identity 0
129 | iv iv 0
130 | boscombe boscombe 0
131 | vallejy valley 1
132 | mystery mystery 0
133 | five five 0
134 | orfnge orange 1
135 | pips pips 0
136 | vi vi 0
137 | mn man 1
138 | with with 0
139 | tisged twisted 2
140 | lip lip 0
141 | adventu adventure 2
142 | blue blue 0
143 | qcarbuncle carbuncle 1
144 | viii viii 0
145 | pmpeckld speckled 3
146 | bad band 1
147 | ix ix 0
148 | enginebr engineer 1
149 | thub thumb 1
150 | noble noble 0
151 | ahelr bachelor 3
152 | xi xi 0
153 | beryl beryl 0
154 | coixet coronet 3
155 | xii xii 0
156 | coer copper 2
157 | beeches beeches 0
158 | sge she 1
159 | aklways always 1
160 | wovmn woman 2
161 | hanve have 1
162 | setldom seldom 1
163 | herh heard 2
164 | him him 0
165 | mention mention 0
166 | her her 0
167 | ude under 2
168 | hme name 2
169 | hisw his 1
170 | eyes eyes 0
171 | ecipos eclipses 3
172 | predomiates predominates 1
173 | whole whole 0
174 | seyx sex 1
175 | ws was 1
176 | th that 2
177 | he he 0
178 | celt felt 1
179 | emtiobn emotion 2
180 | ainh akin 2
181 | love love 0
182 | ibne irene 2
183 | yadler adler 1
184 | eorgones emotions 4
185 | ne one 1
186 | pcarticulry particularly 3
187 | abajrrevnt abhorrent 3
188 | cold cold 0
189 | preise precise 1
190 | but but 0
191 | admirasly admirably 1
192 | oalpanced balanced 2
193 | mikd mind 1
194 | take take 0
195 | prct perfect 3
196 | reasng reasoning 3
197 | oservng observing 2
198 | mlacine machine 2
199 | has has 0
200 | as as 0
201 | ver lover 2
202 | ouqld would 2
203 | xplacd placed 2
204 | hiqslf himself 2
205 | false false 0
206 | posiin position 2
207 | nover never 1
208 | sspokep spoke 2
209 | sjofter softer 1
210 | talpsions passions 3
211 | ave save 1
212 | abe gibe 2
213 | ser sneer 2
214 | hey they 1
215 | admirbae admirable 2
216 | thingn things 1
217 | obaerver observer 1
218 | ezcvielltnt excellent 4
219 | rawijg drawing 2
220 | veigl veil 1
221 | frsm from 1
222 | men men 0
223 | qoives motives 2
224 | cions actions 2
225 | trained trained 0
226 | asvoner reasoner 3
227 | admit admit 0
228 | ch such 2
229 | intrusions intrusions 0
230 | ito into 1
231 | olwn own 1
232 | delcatee delicate 2
233 | fne finely 3
234 | fjusxed adjusted 3
235 | tepmperamet temperament 2
236 | vitroduce introduce 2
237 | dsntracting distracting 2
238 | factor factor 0
239 | wihicth which 2
240 | might might 0
241 | throw throw 0
242 | doubt doubt 0
243 | pot upon 2
244 | mentl mental 1
245 | requls results 2
246 | grit grit 0
247 | ensiiuw sensitive 4
248 | nstrumnn instrument 3
249 | crack crack 0
250 | hsgh high 1
251 | powe power 1
252 | clnses lenses 2
253 | more more 0
254 | vdisjturbing disturbing 2
255 | ezhan than 2
256 | stqngz strong 3
257 | notre nature 2
258 | yet yet 0
259 | tee there 2
260 | late late 0
261 | bubiofs dubious 2
262 | questionale questionable 1
263 | memtry memory 1
264 | hd had 1
265 | ittle little 1
266 | laiey lately 2
267 | my my 0
268 | mjrriajzbe marriage 4
269 | rifted drifted 1
270 | aaway away 1
271 | ach each 1
272 | vcympnlee complete 4
273 | happiness happiness 0
274 | home home 0
275 | enteredr centred 3
276 | interests interests 0
277 | rise rise 0
278 | uap up 1
279 | qroun around 2
280 | whoc who 1
281 | findtf finds 2
282 | maer master 2
283 | yeatabshment establishment 4
284 | sufficient sufficient 0
285 | absorb absorb 0
286 | etteantion attention 2
287 | whle while 1
288 | loatzhegd loathed 2
289 | every every 0
290 | om form 2
291 | sokcity society 2
292 | bohetmin bohemian 2
293 | souml soul 1
294 | remineu remained 2
295 | rogings lodgings 2
296 | aer baker 2
297 | trt street 3
298 | urild buried 2
299 | ang among 2
300 | old old 0
301 | books books 0
302 | aplternatinp alternating 2
303 | wek week 1
304 | ketween between 1
305 | cocainre cocaine 1
306 | ambition ambition 0
307 | drowsiness drowsiness 0
308 | dug drug 1
309 | fmieae fierce 3
310 | eergy energy 1
311 | een keen 1
312 | silr still 2
313 | ever ever 0
314 | deeply deeply 0
315 | axtrahtqed attracted 3
316 | study study 0
317 | crime crime 0
318 | ocijpied occupied 2
319 | iwmenfe immense 2
320 | ftcults faculties 3
321 | exttnordinaac extraordinary 4
322 | powers powers 0
323 | sprvation observation 3
324 | following following 0
325 | mose those 2
326 | clue clues 1
327 | cjaring clearing 2
328 | ystewies mysteries 2
329 | lzeen been 2
330 | abandoned abandoned 0
331 | hales hopeless 4
332 | ofmiciaz official 2
333 | police police 0
334 | tcim time 2
335 | come some 1
336 | vagje vague 1
337 | acocn account 3
338 | doitgs doings 1
339 | aumqmoxs summons 3
340 | dessa odessa 1
341 | trepoff trepoff 0
342 | mxumrper murder 3
343 | singular singular 0
344 | tragedpyk tragedy 2
345 | tkinson atkinson 1
346 | xbrphers brothers 3
347 | txincoale trincomalee 3
348 | fqnally finally 1
349 | mission mission 0
350 | jcomplished accomplished 2
351 | sow so 1
352 | dctely delicately 4
353 | sucycessfully successfully 1
354 | reigning reigning 0
355 | faifl family 3
356 | honlad holland 2
357 | beyond beyond 0
358 | signh signs 1
359 | lctivity activity 1
360 | hoer however 3
361 | merely merely 0
362 | swarcd shared 2
363 | readersj readers 1
364 | daily daily 0
365 | pvqess press 2
366 | xknw knew 2
367 | former former 0
368 | fieni friend 2
369 | cjompanion companion 1
370 | night night 0
371 | twneth twentieth 3
372 | returnig returning 1
373 | joaurnhey journey 2
374 | patient patient 0
375 | gow now 1
376 | returned returned 0
377 | iil civil 2
378 | pqratice practice 2
379 | way way 0
380 | ed led 1
381 | trough through 1
382 | passed passed 0
383 | wll well 1
384 | zrembere remembered 4
385 | cdoom door 2
386 | must must 0
387 | associated associated 0
388 | woonl wooing 2
389 | da dark 2
390 | miycidents incidents 2
391 | scarfelt scarlet 2
392 | seizedh seized 1
393 | djsie desire 2
394 | se see 1
395 | again again 0
396 | know know 0
397 | eplhlying employing 3
398 | rkoos rooms 2
399 | brilliantly brilliantly 0
400 | lit lit 0
401 | eve even 1
402 | ood looked 3
403 | saw saw 0
404 | tall tall 0
405 | spae spare 1
406 | figurj figure 1
407 | pas pass 1
408 | twice twice 0
409 | silgouette silhouette 1
410 | goains against 3
411 | bind blind 1
412 | acypng pacing 3
413 | room room 0
414 | swiyky swiftly 3
415 | eerly eagerly 2
416 | had head 1
417 | sunk sunk 0
418 | cdest chest 1
419 | hands hands 0
420 | clasped clasped 0
421 | behnd behind 1
422 | mod mood 1
423 | habi habit 1
424 | attityade attitude 2
425 | mlannedr manner 2
426 | thein their 1
427 | story story 0
428 | lworke work 2
429 | qisenn risen 2
430 | coretedu created 3
431 | drjzma dreams 3
432 | hot hot 0
433 | cent scent 1
434 | ew new 1
435 | problem problem 0
436 | arasng rang 2
437 | bl bell 2
438 | showvn shown 1
439 | hamber chamber 1
440 | formely formerly 1
441 | agt part 2
442 | efsisea effusive 4
443 | gld glad 1
444 | ink think 2
445 | hrdby hardly 2
446 | wodrd word 1
447 | spoken spoken 0
448 | kikdfy kindly 2
449 | ye eye 1
450 | wave waved 1
451 | an an 0
452 | rmchair armchair 1
453 | thew threw 1
454 | acrcss across 1
455 | cars cigars 2
456 | mndicaoed indicated 2
457 | crit spirit 3
458 | gasogene gasogene 0
459 | orne corner 2
460 | then then 0
461 | stood stood 0
462 | fire fire 0
463 | irosectve introspective 4
464 | fsiin fashion 3
465 | wwdloc wedlock 2
466 | sitt suits 2
467 | rqmrkedo remarked 3
468 | wtsn watson 2
469 | seoven seven 1
470 | hf half 2
471 | pouhnds pounds 1
472 | angwerd answered 2
473 | tsneed indeed 3
474 | thought thought 0
475 | ckjust just 2
476 | trilx trifle 2
477 | fkancy fancy 1
478 | obrve observe 2
479 | dnd did 1
480 | tl tell 2
481 | intended intended 0
482 | gw go 1
483 | harnes harness 1
484 | dduce deduce 1
485 | getxin getting 2
486 | yurself yourself 1
487 | ery very 1
488 | wet wet 0
489 | cluumsy clumsy 1
490 | faralesi careless 3
491 | servaqt servant 1
492 | irl girl 1
493 | dear dear 0
494 | saxdq said 2
495 | too too 0
496 | mchg much 2
497 | certainly certainly 0
498 | burned burned 0
499 | liea lived 2
500 | fewt few 1
501 | penntduris centuries 4
502 | fgo ago 1
503 | true true 0
504 | walk walk 0
505 | thursday thursday 0
506 | came came 0
507 | dreadul dreadful 1
508 | esbs mess 2
509 | chagd changed 2
510 | clthe clothes 2
511 | imagine imagine 0
512 | mary mary 0
513 | jane jane 0
514 | innorigile incorrigible 3
515 | wif wife 1
516 | gven given 1
517 | notice notice 0
518 | chucd chuckled 3
519 | rubbed rubbed 0
520 | long long 0
521 | nervos nervous 1
522 | otjer together 4
523 | simlicity simplicity 1
524 | iislfr itself 3
525 | inside inside 0
526 | leb left 2
527 | so shoe 2
528 | dwher where 2
529 | fireslght firelight 2
530 | strkhs strikes 2
531 | eather leather 1
532 | sored scored 1
533 | six six 0
534 | almost almost 0
535 | paalll parallel 2
536 | cuts cuts 0
537 | obviously obviously 0
538 | qausdd caused 2
539 | oxneone someone 3
540 | arelessly carelessly 1
541 | scraped scraped 0
542 | roun round 1
543 | dges edges 1
544 | sole sole 0
545 | orger order 1
546 | creusted crusted 1
547 | mudi mud 1
548 | hence hence 0
549 | adoblje double 3
550 | deduction deduction 0
551 | vse vile 2
552 | wether weather 1
553 | magnhnat malignant 4
554 | hbot boot 2
555 | plittijkg slitting 3
556 | spoecme specimen 3
557 | london london 0
558 | slfey slavey 2
559 | ief if 1
560 | eneman gentleman 3
561 | esmellzng smelling 2
562 | ioforc iodoform 3
563 | black black 0
564 | ak mark 2
565 | nitrate nitrate 0
566 | silsver silver 1
567 | righ right 1
568 | foefifgr forefinger 3
569 | blge bulge 1
570 | siydez side 2
571 | top top 0
572 | chat hat 1
573 | show show 0
574 | ecretd secreted 2
575 | stethoscope stethoscope 0
576 | dull dull 0
577 | proounce pronounce 1
578 | acivb active 2
579 | mepmber member 1
580 | vepcal medical 3
581 | profession profession 0
582 | could could 0
583 | besp help 2
584 | lalughing laughing 1
585 | eyyse ease 2
586 | emxplained explained 1
587 | process process 0
588 | hear hear 0
589 | ive give 1
590 | ueass reasons 3
591 | apears appears 1
592 | ridiculously ridiculously 0
593 | snigmle simple 3
594 | esivly easily 2
595 | yseylf myself 2
596 | athough though 1
597 | sccwssive successive 2
598 | nstance instance 1
599 | bafmlled baffled 2
600 | untl until 1
601 | explain explain 0
602 | beleve believe 1
603 | good good 0
604 | your yours 1
605 | que quite 2
606 | lightig lighting 1
607 | cgarete cigarette 2
608 | hrowwing throwing 2
609 | dowu down 1
610 | distinttion distinction 1
611 | clear clear 0
612 | emple example 2
613 | feqetly frequently 3
614 | stps steps 1
615 | xeaq lead 2
616 | hall hall 0
617 | often often 0
618 | hundrqeds hundreds 1
619 | times times 0
620 | qaly many 2
621 | don don 0
622 | oblsere observed 3
623 | poib point 2
624 | seveneene seventeen 2
625 | pecase because 2
626 | interested interested 0
627 | wobms problems 4
628 | ckougd enough 3
629 | chronicle chronicle 0
630 | two two 0
631 | rifnling trifling 2
632 | experencef experiences 2
633 | shet sheet 1
634 | thick thick 0
635 | pink pink 0
636 | tinted tinted 0
637 | noitepaper notepaper 1
638 | lyng lying 1
639 | opn open 1
640 | tbled table 2
641 | last last 0
642 | ot post 2
643 | aloud aloud 0
644 | xnotte note 2
645 | undzate undated 2
646 | either either 0
647 | saigatue signature 3
648 | wress address 3
649 | will will 0
650 | call call 0
651 | quarter quarter 0
652 | eight eight 0
653 | clock clock 0
654 | desires desires 0
655 | consl consult 2
656 | macttr matter 2
657 | deeupest deepest 1
658 | momnt moment 1
659 | recent recent 0
660 | cevice services 3
661 | yyal royal 2
662 | hpoufjs houses 3
663 | euroe europe 1
664 | safely safely 0
665 | tkrushted trusted 2
666 | mtttxs matters 3
667 | importace importance 1
668 | exaggeratbd exaggerated 1
669 | we we 0
670 | quartes quarters 1
671 | receiyd received 2
672 | aur hour 2
673 | zmiss amiss 1
674 | viositgr visitor 2
675 | wear wear 0
676 | masik mask 1
677 | what what 0
678 | means means 0
679 | no no 0
680 | aa data 2
681 | capital capital 0
682 | moisoake mistake 2
683 | theoxise theorise 1
684 | insensibly insensibly 0
685 | begnsj begins 2
686 | twst twist 1
687 | facte facts 1
688 | uiu suit 2
689 | theories theories 0
690 | yinstewad instead 2
691 | carfult carefully 3
692 | exlaqmined examined 2
693 | writig writing 1
694 | paper paper 0
695 | wroto wrote 1
696 | pesuhably presumably 2
697 | edeavouring endeavouring 1
698 | imitare imitate 1
699 | psocsse processes 3
700 | bough bought 1
701 | row crown 2
702 | wawket packet 2
703 | pcueiarly peculiarly 2
704 | iff stiff 2
705 | ecular peculiar 2
706 | hovldi hold 2
707 | light light 0
708 | large large 0
709 | woven woven 0
710 | zextzure texture 2
711 | asked asked 0
712 | nmaker maker 1
713 | mmoogaa monogram 4
714 | raher rather 1
715 | stanks stands 1
716 | geellsckaft gesellschaft 2
717 | german german 0
718 | cpany company 2
719 | csmary customary 3
720 | ycontraon contraction 4
721 | like like 0
722 | co co 0
723 | ckurhe course 2
724 | papienr papier 1
725 | eg eg 0
726 | lev let 1
727 | glance glance 0
728 | pcotinentalk continental 3
729 | gazcetter gazetteer 2
730 | took took 0
731 | havny heavy 2
732 | brown brown 0
733 | volumea volume 1
734 | shelmves shelves 1
735 | eglow eglow 0
736 | eglonitz eglonitz 0
737 | hrje here 2
738 | egeria egria 1
739 | sefkiang speaking 3
740 | rar far 1
741 | crlsbad carlsbad 1
742 | rqarabye remarkable 4
743 | being being 0
744 | scee scene 1
745 | death death 0
746 | wallenstein wallenstein 0
747 | its its 0
748 | nmeous numerous 2
749 | gvlass glass 1
750 | qctries factories 3
751 | milsm mills 2
752 | ha ha 0
753 | body boy 1
754 | sqrkled sparkled 2
755 | sqenzt sent 2
756 | gret great 1
757 | tdrumphant triumphant 2
758 | clud cloud 1
759 | made made 0
760 | precikey precisely 2
761 | construction construction 0
762 | sentence sentence 0
763 | frenczhmmn frenchman 2
764 | rssian russian 1
765 | uncgurteous uncourteous 1
766 | erbs verbs 1
767 | onlyd only 1
768 | remains remains 0
769 | therefore therefore 0
770 | dmmscovr discover 3
771 | wqnted wanted 1
772 | writes writes 0
773 | pffrs prefers 3
774 | eainq wearing 3
775 | showig showing 1
776 | face face 0
777 | comes comes 0
778 | msstaken mistaken 1
779 | xresoe resolve 3
780 | doubts doubts 0
781 | bsharp sharp 1
782 | souno sound 1
783 | horss horses 1
784 | zuoofs hoofs 2
785 | pgrating grating 1
786 | hels wheels 2
787 | curb curb 0
788 | nollowed followed 1
789 | ul pull 2
790 | whjistled whistled 1
791 | par pair 1
792 | yes yes 0
793 | continued continued 0
794 | gancing glancing 1
795 | windo window 1
796 | nicu nice 1
797 | bvoughawmr brougham 3
798 | euties beauties 2
799 | unded hundred 2
800 | fift fifty 1
801 | tineams guineas 3
802 | apiee apiece 1
803 | mone money 1
804 | taing nothing 3
805 | ejse else 1
806 | ett better 3
807 | bit bit 0
808 | doctor doctor 0
809 | lstvy stay 2
810 | mlost lost 1
811 | boswel boswell 1
812 | proasu promises 4
813 | uinterestig interesting 2
814 | py pity 2
815 | iss miss 1
816 | ldeny client 3
817 | want want 0
818 | beht best 1
819 | sgow slow 1
820 | utdep step 2
821 | tars stairs 2
822 | pasage passage 1
823 | paued paused 1
824 | immdiately immediately 1
825 | ousi outside 3
826 | loud loud 0
827 | authoritative authoritative 0
828 | tap tap 0
829 | enered entered 1
830 | leps less 1
831 | feet feet 0
832 | inwh inches 3
833 | hreigqhtd height 3
834 | libr limbs 2
835 | qercsuej hercules 4
836 | res dress 2
837 | richr rich 1
838 | richnesdsj richness 2
839 | eglnd england 2
840 | bd bad 1
841 | taste taste 0
842 | andsw bands 2
843 | sirakhqn astrakhan 3
844 | slashehd slashed 1
845 | sleeves sleeves 0
846 | fronts fronts 0
847 | hbreasted breasted 1
848 | cwat coat 1
849 | deep deep 0
850 | cloak cloak 0
851 | thrown thrown 0
852 | shoudrs shoulders 2
853 | gned lined 2
854 | fame flame 1
855 | txloure coloured 3
856 | slk silk 1
857 | secred secured 1
858 | nemk neck 1
859 | xbrooh brooch 2
860 | onsistea consisted 2
861 | igle single 2
862 | flaming flaming 0
863 | bdoots boots 1
864 | exended extended 1
865 | hlfwbay halfway 2
866 | adves calves 2
867 | trimeo trimmed 2
868 | tps tops 1
869 | furt fur 1
870 | cfopleted completed 2
871 | mprfpssiof impression 4
872 | baobro barbaric 4
873 | opundnzce opulence 3
874 | suggested suggested 0
875 | auppearancne appearance 2
876 | caried carried 1
877 | broaxd broad 1
878 | brimwed brimmed 1
879 | ad hand 2
880 | woe wore 1
881 | upper upper 0
882 | xtennz extending 4
883 | past past 0
884 | lchekboxs cheekbones 4
885 | vad vizard 3
886 | appaienwtly apparently 2
887 | rasod raised 2
888 | lowur lower 1
889 | appeared appeared 0
890 | hanging hanging 0
891 | ssragh straight 3
892 | cyhin chin 1
893 | suggestive suggestive 0
894 | resozlution resolution 1
895 | pushd pushed 1
896 | lengzth length 1
897 | otstinacy obstinacy 1
898 | harusch harsh 2
899 | voice voice 0
900 | srojgqy strongly 3
901 | marked marked 0
902 | accet accent 1
903 | uncamin uncertain 4
904 | pray pray 0
905 | olleeague colleague 2
906 | ocasgonaly occasionally 3
907 | cases cases 0
908 | uhgm whom 2
909 | xhonxr honour 3
910 | cokt count 2
911 | von von 0
912 | ramxm kramm 2
913 | noean nobleman 3
914 | nderman understand 4
915 | jisoretin discretion 3
916 | ettemse extreme 3
917 | eer prefer 3
918 | communicate communicate 0
919 | aone alone 1
920 | rgse rose 1
921 | caught caught 0
922 | wrist wrist 0
923 | bttk back 2
924 | chaim chair 1
925 | none none 0
926 | say say 0
927 | anyttzing anything 2
928 | sruggen shrugged 2
929 | bei begin 2
930 | incdingf binding 3
931 | atoltme absolute 4
932 | secrgcg secrecy 2
933 | years years 0
934 | end end 0
935 | pkrsent present 2
936 | weightp weight 1
937 | influence influence 0
938 | european european 0
939 | historyk history 1
940 | promisxe promise 1
941 | excbe excuse 2
942 | stracxnge strange 2
943 | augysd august 2
944 | persn person 1
945 | empmosd employs 3
946 | wises wishes 1
947 | aet agent 2
948 | nknow unknown 2
949 | onfes confess 2
950 | once once 0
951 | callvd called 1
952 | eixcdtly exactly 3
953 | aware aware 0
954 | dry dryly 2
955 | ciyrumstances circumstances 2
956 | decacw delicacy 3
957 | precaution precaution 0
958 | takn taken 1
959 | quench quench 0
960 | grow grow 0
961 | seriously seriously 0
962 | comproomutse compromise 3
963 | famiclies families 1
964 | speak speak 0
965 | lanly plainly 2
966 | impicates implicates 1
967 | hvuse house 1
968 | ormsqpek ormstein 4
969 | redotary hereditary 3
970 | kings kings 0
971 | muzmurd murmured 2
972 | settlinpg settling 1
973 | closing closing 0
974 | gancerd glanced 2
975 | appnt apparent 3
976 | surrise surprise 1
977 | layngid languid 2
978 | ljouunming lounging 3
979 | dpicuted depicted 2
980 | wnciqsive incisive 2
981 | enwergtic energetic 2
982 | xlowly slowly 1
983 | rekpewned reopened 2
984 | impaienty impatiently 2
985 | ggac gigantic 4
986 | maygty majesty 3
987 | cndeced condescend 3
988 | state state 0
989 | le able 2
990 | adeipej advise 3
991 | sprgm sprang 3
992 | pacedp paced 1
993 | uncontrollable uncontrollable 0
994 | gitton agitation 3
995 | gxesturew gesture 2
996 | desperation desperation 0
997 | zurd hurled 3
998 | grocnd ground 1
999 | criedm cried 1
1000 | kig king 1
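Note: each line is "noisy expected edit_distance"; the vocabulary tracks the opening of The Adventures of Sherlock Holmes. A minimal sketch of the accuracy loop a benchmark (e.g. tests/benchmarks.ipynb) might run over this file, assuming the bundled frequency dictionary:

    from symspellpy import SymSpell, Verbosity

    sym_spell = SymSpell()
    sym_spell.load_dictionary("symspellpy/frequency_dictionary_en_82_765.txt", 0, 1)

    with open("tests/fortests/noisy_query_en_1000.txt", encoding="utf8") as infile:
        rows = [line.split() for line in infile]

    correct = 0
    for noisy, expected, _distance in rows:
        results = sym_spell.lookup(noisy, Verbosity.TOP, max_edit_distance=2)
        if results and results[0].term == expected:
            correct += 1
    print(f"top-1 accuracy: {correct / len(rows):.3f}")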
--------------------------------------------------------------------------------
/tests/fortests/non_en_dict.txt:
--------------------------------------------------------------------------------
1 | АБИ 10
2 | И 1
3 | Б 2
--------------------------------------------------------------------------------
/tests/fortests/separator_dict.txt:
--------------------------------------------------------------------------------
1 | the$23135851162
2 | of$13151942776
3 | abcs of$10956800
4 | aaron and$10721728
5 | and$12997637966
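Note: "$" replaces the default space separator so that terms containing spaces ("abcs of", "aaron and") survive parsing as single keys; test_symspellpy.py loads this file with SEPARATOR = "$". A minimal sketch:

    from symspellpy import SymSpell

    sym_spell = SymSpell()
    # With separator="$" every line splits into exactly two fields, so
    # count_index stays 1 even for multi-word terms.
    sym_spell.load_dictionary("tests/fortests/separator_dict.txt", 0, 1, separator="$")
    print(sym_spell.words["abcs of"])  # 10956800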
--------------------------------------------------------------------------------
/tests/fortests/word_segmentation_data.json:
--------------------------------------------------------------------------------
1 | {
2 | "data": [
3 | {
4 | "typo": "thequickbrownfoxjumpsoverthelazydog",
5 | "0": { "term": "the quick brown fox jumps over the lazy dog" }
6 | },
7 | {
8 | "typo": "itwasabrightcolddayinaprilandtheclockswerestrikingthirteen",
9 | "0": {
10 | "term": "it was a bright cold day in april and the clocks were striking thirteen"
11 | }
12 | },
13 | {
14 | "typo": "itwasthebestoftimesitwastheworstoftimesitwastheageofwisdomitwastheageoffoolishness",
15 | "0": {
16 | "term": "it was the best of times it was the worst of times it was the age of wisdom it was the age of foolishness"
17 | }
18 | }
19 | ]
20 | }
21 |
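Note: the "0" key presumably records the max_edit_distance each expected segmentation was produced with. A minimal sketch of running the first fixture, assuming word_segmentation returns the Composition named tuple defined in symspellpy/composition.py:

    from symspellpy import SymSpell

    sym_spell = SymSpell()
    sym_spell.load_dictionary("symspellpy/frequency_dictionary_en_82_765.txt", 0, 1)

    result = sym_spell.word_segmentation(
        "thequickbrownfoxjumpsoverthelazydog", max_edit_distance=0
    )
    print(result.corrected_string)  # "the quick brown fox jumps over the lazy dog"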
--------------------------------------------------------------------------------
/tests/test_compatibility.py:
--------------------------------------------------------------------------------
1 | from symspellpy.helpers import null_distance_results, prefix_suffix_prep
2 |
3 |
4 | def test_null_distance_results():
5 | assert null_distance_results(None, None, 1) == 0
6 | assert null_distance_results(None, string2=None, max_distance=1) == 0
7 | assert null_distance_results(string1=None, string2=None, max_distance=1) == 0
8 | assert null_distance_results(string_1=None, string_2=None, max_distance=1) == 0
9 |
10 |
11 | def test_prefix_suffix_prep():
12 | assert prefix_suffix_prep("dabca", "ddca") == (2, 1, 1)
13 | assert prefix_suffix_prep("dabca", string2="ddca") == (2, 1, 1)
14 | assert prefix_suffix_prep(string1="dabca", string2="ddca") == (2, 1, 1)
15 | assert prefix_suffix_prep(string_1="dabca", string_2="ddca") == (2, 1, 1)
16 |
--------------------------------------------------------------------------------
/tests/test_editdistance.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from itertools import combinations, permutations
3 |
4 | import pytest
5 |
6 | from symspellpy.abstract_distance_comparer import AbstractDistanceComparer
7 | from symspellpy.editdistance import (
8 | DamerauOsa,
9 | DamerauOsaFast,
10 | DistanceAlgorithm,
11 | EditDistance,
12 | Levenshtein,
13 | LevenshteinFast,
14 | )
15 |
16 | SHORT_STRING = "string"
17 | LONG_STRING = "long_string"
18 | VERY_LONG_STRING = "very_long_string"
19 |
20 |
21 | def expected_levenshtein(string_1, string_2, max_distance):
22 | max_distance = int(min(2**31 - 1, max_distance))
23 | len_1 = len(string_1)
24 | len_2 = len(string_2)
25 | d = [[0] * (len_2 + 1) for _ in range(len_1 + 1)]
26 | for i in range(len_1 + 1):
27 | d[i][0] = i
28 | for i in range(len_2 + 1):
29 | d[0][i] = i
30 | for j in range(1, len_2 + 1):
31 | for i in range(1, len_1 + 1):
32 | if string_1[i - 1] == string_2[j - 1]:
33 | # no operation
34 | d[i][j] = d[i - 1][j - 1]
35 | else:
36 | d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + 1)
37 | distance = d[len_1][len_2]
38 | return distance if distance <= max_distance else -1
39 |
40 |
41 | def expected_damerau_osa(string_1, string_2, max_distance):
42 | max_distance = int(min(2**31 - 1, max_distance))
43 | len_1 = len(string_1)
44 | len_2 = len(string_2)
45 | d = [[0] * (len_2 + 1) for _ in range(len_1 + 1)]
46 | for i in range(len_1 + 1):
47 | d[i][0] = i
48 | for i in range(len_2 + 1):
49 | d[0][i] = i
50 | for i in range(1, len_1 + 1):
51 | for j in range(1, len_2 + 1):
52 | cost = 0 if string_1[i - 1] == string_2[j - 1] else 1
53 | d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost)
54 | if (
55 | i > 1
56 | and j > 1
57 | and string_1[i - 1] == string_2[j - 2]
58 | and string_1[i - 2] == string_2[j - 1]
59 | ):
60 | d[i][j] = min(d[i][j], d[i - 2][j - 2] + cost)
61 | distance = d[len_1][len_2]
62 | return distance if distance <= max_distance else -1
63 |
64 |
65 | class CustomDistanceComparer(AbstractDistanceComparer):
66 | def distance(self, string_1: str, string_2: str, max_distance: int) -> int:
67 | return -2
68 |
69 |
70 | @pytest.fixture(
71 | params=["damerau_osa", "levenshtein", "damerau_osa_fast", "levenshtein_fast"]
72 | )
73 | def get_comparer(request):
74 | comparer_dict = {
75 | "damerau_osa": {"actual": DamerauOsa(), "expected": expected_damerau_osa},
76 | "levenshtein": {"actual": Levenshtein(), "expected": expected_levenshtein},
77 | "damerau_osa_fast": {
78 | "actual": DamerauOsaFast(),
79 | "expected": expected_damerau_osa,
80 | },
81 | "levenshtein_fast": {
82 | "actual": LevenshteinFast(),
83 | "expected": expected_levenshtein,
84 | },
85 | }
86 | yield (
87 | comparer_dict[request.param]["actual"],
88 | comparer_dict[request.param]["expected"],
89 | )
90 |
91 |
92 | @pytest.fixture(
93 | params=["damerau_osa", "levenshtein", "damerau_osa_fast", "levenshtein_fast"]
94 | )
95 | def get_edit_distance(request):
96 | comparer_dict = {
97 | "damerau_osa": {
98 | "actual": EditDistance(DistanceAlgorithm.DAMERAU_OSA),
99 | "expected": DamerauOsa,
100 | },
101 | "levenshtein": {
102 | "actual": EditDistance(DistanceAlgorithm.LEVENSHTEIN),
103 | "expected": Levenshtein,
104 | },
105 | "damerau_osa_fast": {
106 | "actual": EditDistance(DistanceAlgorithm.DAMERAU_OSA_FAST),
107 | "expected": DamerauOsaFast,
108 | },
109 | "levenshtein_fast": {
110 | "actual": EditDistance(DistanceAlgorithm.LEVENSHTEIN_FAST),
111 | "expected": LevenshteinFast,
112 | },
113 | }
114 | yield (
115 | comparer_dict[request.param]["actual"],
116 | comparer_dict[request.param]["expected"],
117 | )
118 |
119 |
120 | @pytest.fixture
121 | def get_short_and_long_strings():
122 | return [
123 | (SHORT_STRING, None, {"null": len(SHORT_STRING), "zero": -1, "neg": -1}),
124 | (LONG_STRING, None, {"null": -1, "zero": -1, "neg": -1}),
125 | (None, SHORT_STRING, {"null": len(SHORT_STRING), "zero": -1, "neg": -1}),
126 | (None, LONG_STRING, {"null": -1, "zero": -1, "neg": -1}),
127 | (SHORT_STRING, SHORT_STRING, {"null": 0, "zero": 0, "neg": 0}),
128 | (None, None, {"null": 0, "zero": 0, "neg": 0}),
129 | ]
130 |
131 |
132 | @pytest.fixture(params=[0, 1, 3, sys.maxsize])
133 | def get_strings(request):
134 | alphabet = "abcd"
135 | strings = [""]
136 | for i in range(1, len(alphabet) + 1):
137 | for combi in combinations(alphabet, i):
138 | strings += ["".join(p) for p in permutations(combi)]
139 | yield strings, request.param
140 |
141 |
142 | class TestEditDistance:
143 | def test_unknown_distance_algorithm(self):
144 | with pytest.raises(ValueError) as excinfo:
145 | _ = EditDistance(2)
146 | assert "unknown distance algorithm" == str(excinfo.value)
147 |
148 | def test_missing_custom_comparer(self):
149 | with pytest.raises(ValueError) as excinfo:
150 | _ = EditDistance(DistanceAlgorithm.USER_PROVIDED)
151 | assert "no comparer passed in" in str(excinfo.value)
152 |
153 | def test_abstract_distance_comparer(self):
154 | with pytest.raises(TypeError) as excinfo:
155 | comparer = AbstractDistanceComparer()
156 | _ = comparer.distance("string_1", "string_2", 10)
157 | assert str(excinfo.value).startswith(
158 | "Can't instantiate abstract class AbstractDistanceComparer"
159 | )
160 |
161 | def test_warn_when_builtin_comparer_override_custom_comparer(self):
162 | with pytest.warns(UserWarning, match="A built-in comparer will be used.$"):
163 | comparer = CustomDistanceComparer()
164 | edit_distance = EditDistance(DistanceAlgorithm.LEVENSHTEIN, comparer)
165 |
166 | def test_internal_distance_comparer(self, get_edit_distance):
167 | edit_distance, expected = get_edit_distance
168 | assert isinstance(edit_distance._distance_comparer, expected)
169 |
170 | def test_comparer_match_ref(self, get_comparer, get_strings):
171 | comparer, expected = get_comparer
172 | strings, max_distance = get_strings
173 |
174 | for s1 in strings:
175 | for s2 in strings:
176 | assert expected(s1, s2, max_distance) == comparer.distance(
177 | s1, s2, max_distance
178 | )
179 |
180 | def test_editdistance_use_custom_comparer(self, get_strings):
181 | strings, max_distance = get_strings
182 | comparer = CustomDistanceComparer()
183 | edit_distance = EditDistance(DistanceAlgorithm.USER_PROVIDED, comparer)
184 |
185 | for s1 in strings:
186 | for s2 in strings:
187 |                 assert -2 == edit_distance.compare(s1, s2, max_distance)
188 |
189 | def test_comparer_null_distance(self, get_comparer, get_short_and_long_strings):
190 | comparer, _ = get_comparer
191 |
192 | for s1, s2, expected in get_short_and_long_strings:
193 | distance = comparer.distance(s1, s2, 10)
194 | assert expected["null"] == distance
195 |
196 | def test_comparer_negative_max_distance(
197 | self, get_comparer, get_short_and_long_strings
198 | ):
199 | comparer, _ = get_comparer
200 |
201 | for s1, s2, expected in get_short_and_long_strings:
202 | distance = comparer.distance(s1, s2, 0)
203 | assert expected["zero"] == distance
204 |
205 | for s1, s2, expected in get_short_and_long_strings:
206 |             distance = comparer.distance(s1, s2, -1)
207 | assert expected["neg"] == distance
208 |
209 | def test_comparer_very_long_string(self, get_comparer):
210 | comparer, _ = get_comparer
211 | distance = comparer.distance(SHORT_STRING, VERY_LONG_STRING, 5)
212 |
213 | assert -1 == distance
214 |
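Note: a minimal sketch of the EditDistance facade these tests exercise, via its compare method. The "CA"/"ABC" pair illustrates the restricted (optimal string alignment) behaviour checked by expected_damerau_osa: OSA reports 3, where unrestricted Damerau-Levenshtein would report 2 (transpose, then insert):

    from symspellpy.abstract_distance_comparer import AbstractDistanceComparer
    from symspellpy.editdistance import DistanceAlgorithm, EditDistance

    osa = EditDistance(DistanceAlgorithm.DAMERAU_OSA)
    print(osa.compare("CA", "ABC", 3))  # 3: OSA allows no edit after a transposition

    # Routing a custom comparer through the facade, as
    # test_editdistance_use_custom_comparer does with DistanceAlgorithm.USER_PROVIDED.
    class ConstantComparer(AbstractDistanceComparer):
        def distance(self, string_1: str, string_2: str, max_distance: int) -> int:
            return 0  # hypothetical comparer that treats every pair as identical

    custom = EditDistance(DistanceAlgorithm.USER_PROVIDED, ConstantComparer())
    print(custom.compare("abc", "xyz", 2))  # 0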
--------------------------------------------------------------------------------
/tests/test_helpers.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from symspellpy.helpers import (
4 | case_transfer_matching,
5 | case_transfer_similar,
6 | is_acronym,
7 | to_similarity,
8 | )
9 |
10 |
11 | @pytest.fixture
12 | def get_acronyms():
13 | return [
14 | ("ABCDE", {"default": True, "digits": True}),
15 | ("AB12E", {"default": True, "digits": True}),
16 | ("abcde", {"default": False, "digits": False}),
17 | ("ABCde", {"default": False, "digits": False}),
18 | ("abcDE", {"default": False, "digits": False}),
19 | ("abCDe", {"default": False, "digits": False}),
20 | ("abc12", {"default": False, "digits": True}),
21 | ("ab12e", {"default": False, "digits": True}),
22 | ]
23 |
24 |
25 | @pytest.fixture
26 | def get_similar_texts():
27 | return [
28 | (
29 | "Haaw is the weeather in New York?",
30 | "how is the weather in new york?",
31 | "How is the weather in New York?",
32 | ),
33 | ("Wethr in New Yoork", "weather in new york", "Weather in New York"),
34 | ("Efthr in New Yoork", "weather in new york", "WEather in New York"),
35 | ("efthr in New Yoork", "weather in new york", "weather in New York"),
36 | ("eTr in New Yoork", "weather in new york", "weaTHEr in New York"),
37 | ("hoW eqr", "Haaaw er", "haaaW er"),
38 | ("hOW eqr", "Haaaw er", "hAAAW er"),
39 | ]
40 |
41 |
42 | class TestHelpers:
43 | def test_to_similarity(self):
44 | length = 20.0
45 |
46 | assert pytest.approx(0.7) == to_similarity(6.0, length)
47 | assert -1 == to_similarity(-1.0, length)
48 |
49 | def test_is_acronym(self, get_acronyms):
50 | for word, expected in get_acronyms:
51 | assert expected["default"] == is_acronym(word)
52 | assert expected["digits"] == is_acronym(word, True)
53 |
54 | def test_case_transfer_matching_diff_lengths(self):
55 | with pytest.raises(ValueError) as excinfo:
56 | case_transfer_matching("abc", "abcd")
57 | assert (
58 | "'cased_text' and 'uncased_text' don't have the same length, use "
59 | "case_transfer_similar() instead"
60 | ) == str(excinfo.value)
61 |
62 | def test_case_transfer_matching(self):
63 | cased_text = "Haw is the eeather in New York?"
64 | uncased_text = "how is the weather in new york?"
65 |
66 | # the uncased_text text with the casing transferred from
67 | # the cased_text text
68 | assert "How is the weather in New York?" == case_transfer_matching(
69 | cased_text, uncased_text
70 | )
71 |
72 | def test_case_transfer_similar_empty_wo_casing(self):
73 | cased_text = "Haw is the eeather in New York?"
74 | uncased_text = ""
75 |
76 | assert uncased_text == case_transfer_similar(cased_text, uncased_text)
77 |
78 | def test_case_transfer_similar_empty_w_casing(self):
79 | with pytest.raises(ValueError) as excinfo:
80 | case_transfer_similar("", "abcd")
81 | assert "'cased_text' cannot be empty" == str(excinfo.value)
82 |
83 | def test_case_transfer_similar(self, get_similar_texts):
84 | for cased_text, uncased_text, expected in get_similar_texts:
85 | assert expected == case_transfer_similar(cased_text, uncased_text)
86 |
--------------------------------------------------------------------------------
/tests/test_suggest_item.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from symspellpy.suggest_item import SuggestItem
4 |
5 |
6 | @pytest.fixture
7 | def suggest_item():
8 | return SuggestItem("term", 0, 0)
9 |
10 |
11 | class TestSuggestItem:
12 | def test_invalid_equal_to(self, suggest_item):
13 | assert suggest_item.__eq__(0) is NotImplemented
14 | assert not suggest_item == 0
15 |
16 | def test_invalid_less_than(self, suggest_item):
17 | assert suggest_item.__lt__(0) is NotImplemented
18 | with pytest.raises(TypeError) as excinfo:
19 | suggest_item < 0
20 | assert "'<' not supported between instances of 'SuggestItem' and 'int'" == str(
21 | excinfo.value
22 | )
23 |
24 | def test_suggest_item(self):
25 | si_1 = SuggestItem("asdf", 12, 34)
26 | si_2 = SuggestItem("sdfg", 12, 34)
27 | si_3 = SuggestItem("dfgh", 56, 78)
28 |
29 | assert si_1 == si_2
30 | assert si_2 != si_3
31 |
32 | assert "asdf" == si_1.term
33 | si_1.term = "qwer"
34 | assert "qwer" == si_1.term
35 |
36 | assert 34 == si_1.count
37 | si_1.count = 78
38 | assert 78 == si_1.count
39 |
40 | assert "qwer, 12, 78" == str(si_1)
41 |
--------------------------------------------------------------------------------
/tests/test_symspellpy.py:
--------------------------------------------------------------------------------
1 | from io import StringIO
2 | from pathlib import Path
3 | from unittest import TestCase
4 |
5 | import pytest
6 |
7 | from symspellpy import SymSpell, Verbosity
8 | from symspellpy.abstract_distance_comparer import AbstractDistanceComparer
9 | from symspellpy.editdistance import DistanceAlgorithm, EditDistance
10 | from symspellpy.helpers import DictIO
11 |
12 | FORTESTS_DIR = Path(__file__).resolve().parent / "fortests"
13 | BAD_DICT_PATH = FORTESTS_DIR / "bad_dict.txt"
14 | BELOW_THRESHOLD_DICT_PATH = FORTESTS_DIR / "below_threshold_dict.txt"
15 | BIG_MODIFIED_PATH = FORTESTS_DIR / "big_modified.txt"
16 | BIG_WORDS_PATH = FORTESTS_DIR / "big_words.txt"
17 | NON_EN_DICT_PATH = FORTESTS_DIR / "non_en_dict.txt"
18 | SEPARATOR_DICT_PATH = FORTESTS_DIR / "separator_dict.txt"
19 |
20 | INVALID_PATH = "invalid/dictionary/path.txt"
21 | SEPARATOR = "$"
22 |
23 |
24 | @pytest.fixture
25 | def get_dictionary_stream(request):
26 | dictionary = {
27 | "the": 23135851162,
28 | "of": 13151942776,
29 | "abcs of": 10956800,
30 | "aaron and": 10721728,
31 | "and": 12997637966,
32 | "large count": 92233720368547758081,
33 | }
34 | if request.param is None:
35 | dict_stream = DictIO(dictionary)
36 | else:
37 | dict_stream = DictIO(dictionary, request.param)
38 | yield dict_stream, request.param
39 |
40 |
41 | class CustomDistanceComparer(AbstractDistanceComparer):
42 | def distance(self, string_1: str, string_2: str, max_distance: int) -> int:
43 | return 0
44 |
45 |
46 | class TestSymSpellPy:
47 | def test_negative_max_dictionary_edit_distance(self):
48 | with pytest.raises(ValueError) as excinfo:
49 | _ = SymSpell(-1, 3)
50 | assert "max_dictionary_edit_distance cannot be negative" == str(excinfo.value)
51 |
52 | def test_invalid_prefix_length(self):
53 | # prefix_length < 1
54 | with pytest.raises(ValueError) as excinfo:
55 | _ = SymSpell(1, 0)
56 | assert "prefix_length cannot be less than 1" == str(excinfo.value)
57 |
58 | with pytest.raises(ValueError) as excinfo:
59 | _ = SymSpell(1, -1)
60 | assert "prefix_length cannot be less than 1" == str(excinfo.value)
61 |
62 | # prefix_length <= max_dictionary_edit_distance
63 | with pytest.raises(ValueError) as excinfo:
64 | _ = SymSpell(2, 2)
65 | assert "prefix_length must be greater than max_dictionary_edit_distance" == str(
66 | excinfo.value
67 | )
68 |
69 | def test_negative_count_threshold(self):
70 | with pytest.raises(ValueError) as excinfo:
71 | _ = SymSpell(1, 3, -1)
72 | assert "count_threshold cannot be negative" == str(excinfo.value)
73 |
74 | def test_set_distance_comparer(self):
75 | distance_comparer = EditDistance(
76 | DistanceAlgorithm.USER_PROVIDED, CustomDistanceComparer()
77 | )
78 | sym_spell = SymSpell(distance_comparer=distance_comparer)
79 |
80 | assert distance_comparer == sym_spell.distance_comparer
81 |
82 | @pytest.mark.parametrize("symspell_short", [None, 0], indirect=True)
83 | def test_create_dictionary_entry_negative_count(self, symspell_short):
84 | assert (
85 | symspell_short._count_threshold == 0
86 | ) == symspell_short.create_dictionary_entry("pipe", 0)
87 | assert not symspell_short.create_dictionary_entry("pipe", -1)
88 |
89 | @pytest.mark.parametrize("symspell_short", [10], indirect=True)
90 | def test_create_dictionary_entry_below_threshold(self, symspell_short):
91 | symspell_short.create_dictionary_entry("pipe", 4)
92 | assert 1 == len(symspell_short.below_threshold_words)
93 | assert 4 == symspell_short.below_threshold_words["pipe"]
94 |
95 | symspell_short.create_dictionary_entry("pipe", 4)
96 | assert 1 == len(symspell_short.below_threshold_words)
97 | assert 8 == symspell_short.below_threshold_words["pipe"]
98 |
99 | symspell_short.create_dictionary_entry("pipe", 4)
100 | assert 0 == len(symspell_short.below_threshold_words)
101 |
102 | def test_add_additional_counts_should_not_add_word_again(
103 | self, symspell_default, get_same_word_and_count
104 | ):
105 | for word, count in get_same_word_and_count:
106 | symspell_default.create_dictionary_entry(word, count)
107 | assert 1 == symspell_default.word_count
108 |
109 | def test_add_additional_counts_should_increase_count(
110 | self, symspell_default, get_same_word_and_count
111 | ):
112 | expected_count = 0
113 | for word, count in get_same_word_and_count:
114 | expected_count += count
115 | symspell_default.create_dictionary_entry(word, count)
116 | result = symspell_default.lookup(word, Verbosity.TOP)
117 | assert expected_count == result[0].count
118 |
119 | def test_load_bigram_dictionary_invalid_path(self, symspell_default):
120 | with TestCase.assertLogs("symspellpy.symspellpy.logger", level="ERROR") as cm:
121 | assert not symspell_default.load_bigram_dictionary(INVALID_PATH, 0, 2)
122 | assert (
123 | f"Bigram dictionary file not found at {Path(INVALID_PATH)}."
124 | == cm.records[0].getMessage()
125 | )
126 |
127 | def test_loading_dictionary_from_fileobject(self, symspell_default):
128 | with open(BIG_WORDS_PATH, "r", encoding="utf8") as infile:
129 | assert symspell_default.create_dictionary(infile)
130 |
131 | def test_load_bigram_dictionary_bad_dict(self, symspell_default):
132 | assert symspell_default.load_bigram_dictionary(BAD_DICT_PATH, 0, 2)
133 | assert 2 == len(symspell_default.bigrams)
134 | assert 12 == symspell_default.bigrams["rtyu tyui"]
135 | assert 13 == symspell_default.bigrams["yuio uiop"]
136 |
137 | def test_load_bigram_dictionary_separator(self, symspell_default):
138 | assert symspell_default.load_bigram_dictionary(
139 | SEPARATOR_DICT_PATH, 0, 1, SEPARATOR
140 | )
141 | assert 5 == len(symspell_default.bigrams)
142 | assert 23135851162 == symspell_default.bigrams["the"]
143 | assert 13151942776 == symspell_default.bigrams["of"]
144 | assert 10956800 == symspell_default.bigrams["abcs of"]
145 |         assert 10721728 == symspell_default.bigrams["aaron and"]
146 | assert 12997637966 == symspell_default.bigrams["and"]
147 |
148 | @pytest.mark.parametrize("get_dictionary_stream", [None], indirect=True)
149 | def test_load_bigram_dictionary_stream(
150 | self, symspell_default, get_dictionary_stream
151 | ):
152 | dict_stream, _ = get_dictionary_stream
153 | assert symspell_default._load_bigram_dictionary_stream(dict_stream, 0, 2)
154 | assert 2 == len(symspell_default.bigrams)
155 | assert 10956800 == symspell_default.bigrams["abcs of"]
156 | assert 10721728 == symspell_default.bigrams["aaron and"]
157 | assert "large count" not in symspell_default.bigrams
158 |
159 | @pytest.mark.parametrize("get_dictionary_stream", [SEPARATOR], indirect=True)
160 | def test_load_bigram_dictionary_stream_separator(
161 | self, symspell_default, get_dictionary_stream
162 | ):
163 | dict_stream, separator = get_dictionary_stream
164 | assert symspell_default._load_bigram_dictionary_stream(
165 | dict_stream, 0, 1, separator
166 | )
167 | assert 5 == len(symspell_default.bigrams)
168 | assert 23135851162 == symspell_default.bigrams["the"]
169 | assert 13151942776 == symspell_default.bigrams["of"]
170 | assert 10956800 == symspell_default.bigrams["abcs of"]
171 | assert 10721728 == symspell_default.bigrams["aaron and"]
172 | assert 12997637966 == symspell_default.bigrams["and"]
173 |
174 | def test_load_dictionary_invalid_path(self, symspell_default):
175 | with TestCase.assertLogs("symspellpy.symspellpy.logger", level="ERROR") as cm:
176 | assert not symspell_default.load_dictionary(INVALID_PATH, 0, 1)
177 | assert (
178 | f"Dictionary file not found at {Path(INVALID_PATH)}."
179 | == cm.records[0].getMessage()
180 | )
181 |
182 | def test_load_dictionary_bad_dictionary(self, symspell_default):
183 | assert symspell_default.load_dictionary(BAD_DICT_PATH, 0, 1)
184 | assert 2 == symspell_default.word_count
185 | assert 10 == symspell_default.words["asdf"]
186 | assert 12 == symspell_default.words["sdfg"]
187 |
188 | def test_load_dictionary_count(self, symspell_default, dictionary_path):
189 | symspell_default.load_dictionary(dictionary_path, 0, 1)
190 |
191 | assert 82834 == symspell_default.word_count
192 | assert 676094 == symspell_default.entry_count
193 |
194 | @pytest.mark.parametrize("symspell_short", [10], indirect=True)
195 | def test_load_dictionary_below_threshold(self, symspell_short):
196 | symspell_short.load_dictionary(BELOW_THRESHOLD_DICT_PATH, 0, 1)
197 |
198 | assert 1 == len(symspell_short.below_threshold_words)
199 | assert 8 == symspell_short.below_threshold_words["below"]
200 |
201 | assert 2 == symspell_short.word_count
202 |
203 | def test_load_dictionary_separator(self, symspell_default):
204 | assert symspell_default.load_dictionary(SEPARATOR_DICT_PATH, 0, 1, SEPARATOR)
205 | assert 5 == symspell_default.word_count
206 | assert 23135851162 == symspell_default.words["the"]
207 | assert 13151942776 == symspell_default.words["of"]
208 | assert 10956800 == symspell_default.words["abcs of"]
209 | assert 10721728 == symspell_default.words["aaron and"]
210 | assert 12997637966 == symspell_default.words["and"]
211 |
212 | @pytest.mark.parametrize("get_dictionary_stream", [None], indirect=True)
213 | def test_load_dictionary_stream(self, symspell_default, get_dictionary_stream):
214 | # keys with space in them don't get parsed properly when using
215 | # the default separator=" "
216 | dict_stream, _ = get_dictionary_stream
217 | assert symspell_default._load_dictionary_stream(dict_stream, 0, 1)
218 | assert 3 == symspell_default.word_count
219 | assert 23135851162 == symspell_default.words["the"]
220 | assert 13151942776 == symspell_default.words["of"]
221 | assert 12997637966 == symspell_default.words["and"]
222 |
223 | @pytest.mark.parametrize("get_dictionary_stream", [SEPARATOR], indirect=True)
224 | def test_load_dictionary_stream_separator(
225 | self, symspell_default, get_dictionary_stream
226 | ):
227 | dict_stream, separator = get_dictionary_stream
228 | assert symspell_default._load_dictionary_stream(dict_stream, 0, 1, separator)
229 | assert 5 == symspell_default.word_count
230 | assert 23135851162 == symspell_default.words["the"]
231 | assert 13151942776 == symspell_default.words["of"]
232 | assert 10956800 == symspell_default.words["abcs of"]
233 | assert 10721728 == symspell_default.words["aaron and"]
234 | assert 12997637966 == symspell_default.words["and"]
235 |
236 | def test_load_dictionary_encoding(self, symspell_default):
237 | symspell_default.load_dictionary(NON_EN_DICT_PATH, 0, 1, encoding="utf-8")
238 |
239 | result = symspell_default.lookup("АБ", Verbosity.TOP, 2)
240 | assert 1 == len(result)
241 | assert "АБИ" == result[0].term
242 |
243 | def test_load_dictionary_from_string_io(self, symspell_default, dictionary_path):
244 | with open(dictionary_path, "r") as f:
245 | symspell_default.load_dictionary(StringIO(f.read()), 0, 1)
246 | assert 82834 == symspell_default.word_count
247 | assert 676094 == symspell_default.entry_count
248 |
249 | def test_load_dictionary_from_text_io_wrapper(self, symspell_default, dictionary_path):
250 | with open(dictionary_path, "r") as f:
251 | symspell_default.load_dictionary(f, 0, 1)
252 | assert 82834 == symspell_default.word_count
253 | assert 676094 == symspell_default.entry_count
254 |
255 | def test_create_dictionary_invalid_path(self, symspell_default):
256 | with TestCase.assertLogs("symspellpy.symspellpy.logger", level="ERROR") as cm:
257 | assert not symspell_default.create_dictionary(INVALID_PATH)
258 | assert (
259 | f"Corpus not found at {Path(INVALID_PATH)}." == cm.records[0].getMessage()
260 | )
261 |
262 | def test_create_dictionary(self, symspell_default):
263 | symspell_default.create_dictionary(BIG_MODIFIED_PATH, encoding="utf-8")
264 |
265 | num_lines = 0
266 | with open(BIG_WORDS_PATH, "r") as infile:
267 | for line in infile:
268 | key, count = line.rstrip().split(" ")
269 | assert int(count) == symspell_default.words[key]
270 | num_lines += 1
271 | assert num_lines == symspell_default.word_count
272 |
273 | @pytest.mark.parametrize(
274 | "symspell_default_entry",
275 | [[("stea", 1), ("steama", 2), ("steem", 3)]],
276 | indirect=True,
277 | )
278 | def test_delete_dictionary_entry(self, symspell_default_entry):
279 | result = symspell_default_entry.lookup("steama", Verbosity.TOP, 2)
280 | assert 1 == len(result)
281 | assert "steama" == result[0].term
282 | assert len("steama") == symspell_default_entry._max_length
283 |
284 | assert symspell_default_entry.delete_dictionary_entry("steama")
285 | assert "steama" not in symspell_default_entry.words
286 | assert len("steem") == symspell_default_entry._max_length
287 |
288 | result = symspell_default_entry.lookup("steama", Verbosity.TOP, 2)
289 | assert 1 == len(result)
290 | assert "steem" == result[0].term
291 |
292 | assert symspell_default_entry.delete_dictionary_entry("stea")
293 | assert "stea" not in symspell_default_entry.words
294 | assert len("steem") == symspell_default_entry._max_length
295 |
296 | result = symspell_default_entry.lookup("steama", Verbosity.TOP, 2)
297 | assert 1 == len(result)
298 | assert "steem" == result[0].term
299 |
300 | @pytest.mark.parametrize(
301 | "symspell_default_entry",
302 | [[("stea", 1), ("steama", 2), ("steem", 3)]],
303 | indirect=True,
304 | )
305 | def test_delete_dictionary_entry_invalid_word(self, symspell_default_entry):
306 | result = symspell_default_entry.lookup("steama", Verbosity.TOP, 2)
307 | assert 1 == len(result)
308 | assert "steama" == result[0].term
309 | assert len("steama") == symspell_default_entry._max_length
310 |
311 | assert not symspell_default_entry.delete_dictionary_entry("steamab")
312 | result = symspell_default_entry.lookup("steama", Verbosity.TOP, 2)
313 | assert 1 == len(result)
314 | assert "steama" == result[0].term
315 | assert len("steama") == symspell_default_entry._max_length
316 |
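Note: the stream-based tests above show that load_dictionary accepts file objects as well as paths (and DictIO from symspellpy.helpers wraps a plain dict as such a stream). A minimal sketch loading from memory:

    from io import StringIO

    from symspellpy import SymSpell

    sym_spell = SymSpell()
    # Any text stream of "term count" lines works, not just on-disk files.
    sym_spell.load_dictionary(StringIO("the 23135851162\nof 13151942776\n"), 0, 1)
    print(sym_spell.word_count)  # 2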
--------------------------------------------------------------------------------
/tests/test_symspellpy_edge_cases.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from symspellpy import Verbosity
4 |
5 | ENTRIES = ["baked", "ax", "lake", "", "slaked"]
6 |
7 |
8 | class TestSymSpellPyEdgeCases:
9 | @pytest.mark.parametrize("symspell_long_entry", [ENTRIES], indirect=True)
10 | def test_empty_string_has_all_short_deletes(self, symspell_long_entry):
11 | sym_spell, entries = symspell_long_entry
12 |
13 | assert len(entries[:-1]) == len(sym_spell.deletes[""])
14 | assert all(entry in sym_spell.deletes[""] for entry in entries[:-1])
15 | assert "abc" not in sym_spell.deletes[""]
16 |
17 | def test_split_correction_part_of_single_term_correction(self, symspell_default):
18 | symspell_default.create_dictionary_entry("where", 2)
19 | symspell_default.create_dictionary_entry("is", 2)
20 | symspell_default.create_dictionary_entry("whereas", 2)
21 | symspell_default._bigrams["where is"] = 10
22 |
23 | suggestions = symspell_default.lookup_compound("whereiz", 2)
24 | assert "where is" == suggestions[0].term
25 | assert 2 == suggestions[0].distance
26 | assert 10 == suggestions[0].count
27 |
28 | @pytest.mark.parametrize("symspell_long_entry", [["bank", "bink"]], indirect=True)
29 | def test_no_common_char_with_phrase(self, symspell_long_entry):
30 | sym_spell, _ = symspell_long_entry
31 | results = sym_spell.lookup("knab", Verbosity.ALL, 4)
32 |
33 | assert 2 == len(results)
34 | assert "bank" == results[0].term
35 | assert 3 == results[0].distance
36 | assert "bink" == results[1].term
37 | assert 4 == results[1].distance
38 |
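Note: test_split_correction_part_of_single_term_correction seeds the private _bigrams dict directly to keep the fixture tiny; outside the tests the same state comes from the public load_bigram_dictionary. A minimal sketch with illustrative paths (whether "where is" actually outranks "whereas" depends on the loaded counts):

    from symspellpy import SymSpell

    sym_spell = SymSpell()
    sym_spell.load_dictionary("symspellpy/frequency_dictionary_en_82_765.txt", 0, 1)
    sym_spell.load_bigram_dictionary(
        "symspellpy/frequency_bigramdictionary_en_243_342.txt", 0, 2
    )

    # With bigram counts available, a two-word split ("where is") can beat a
    # close single-term correction ("whereas") for an input like "whereiz".
    suggestions = sym_spell.lookup_compound("whereiz", max_edit_distance=2)
    print(suggestions[0].term, suggestions[0].count)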
--------------------------------------------------------------------------------
/tests/test_symspellpy_lookup.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import pytest
4 |
5 | from symspellpy import SymSpell, Verbosity
6 |
7 |
8 | @pytest.fixture
9 | def symspell_high_thres():
10 | return SymSpell(2, 7, 10)
11 |
12 |
13 | @pytest.fixture
14 | def symspell_high_thres_flame(symspell_high_thres):
15 | symspell_high_thres.create_dictionary_entry("flame", 20)
16 | symspell_high_thres.create_dictionary_entry("flam", 1)
17 | return symspell_high_thres
18 |
19 |
20 | class TestSymSpellPyLookup:
21 | @pytest.mark.parametrize(
22 | "symspell_default_entry",
23 | [[("steama", 4), ("steamb", 6), ("steamc", 2)]],
24 | indirect=True,
25 | )
26 | def test_deletes(self, symspell_default_entry):
27 | result = symspell_default_entry.lookup("stream", Verbosity.TOP, 2)
28 | assert 1 == len(result)
29 | assert "steamb" == result[0].term
30 | assert 6 == result[0].count
31 | assert symspell_default_entry.deletes
32 |
33 | @pytest.mark.parametrize("symspell_short", [None], indirect=True)
34 | def test_words_with_shared_prefix_should_retain_counts(self, symspell_short):
35 | symspell_short.create_dictionary_entry("pipe", 5)
36 | symspell_short.create_dictionary_entry("pips", 10)
37 |
38 | result = symspell_short.lookup("pipe", Verbosity.ALL, 1)
39 | assert 2 == len(result)
40 | assert "pipe" == result[0].term
41 | assert 5 == result[0].count
42 | assert "pips" == result[1].term
43 | assert 10 == result[1].count
44 |
45 | result = symspell_short.lookup("pips", Verbosity.ALL, 1)
46 | assert 2 == len(result)
47 | assert "pips" == result[0].term
48 | assert 10 == result[0].count
49 | assert "pipe" == result[1].term
50 | assert 5 == result[1].count
51 |
52 | result = symspell_short.lookup("pip", Verbosity.ALL, 1)
53 | assert 2 == len(result)
54 | assert "pips" == result[0].term
55 | assert 10 == result[0].count
56 | assert "pipe" == result[1].term
57 | assert 5 == result[1].count
58 |
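    |     # Re-adding a word whose count is already near sys.maxsize should
    |     # saturate at sys.maxsize instead of overflowing.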
59 | def test_add_additional_counts_should_not_overflow(
60 | self, symspell_default, get_same_word_and_count
61 | ):
62 | for i, (word, count) in enumerate(get_same_word_and_count):
63 | symspell_default.create_dictionary_entry(
64 | word, sys.maxsize - 1 if i == 0 else count
65 | )
66 | result = symspell_default.lookup(word, Verbosity.TOP)
67 | assert (sys.maxsize - 1 if i == 0 else sys.maxsize) == result[0].count
68 |
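    |     # TOP returns the single best suggestion, CLOSEST every suggestion at
    |     # the smallest edit distance, and ALL everything within max distance.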
69 | @pytest.mark.parametrize(
70 | "verbosity, num_results",
71 | [(Verbosity.TOP, 1), (Verbosity.CLOSEST, 2), (Verbosity.ALL, 3)],
72 | )
73 | def test_verbosity_should_control_lookup_results(
74 | self, symspell_default, verbosity, num_results
75 | ):
76 | symspell_default.create_dictionary_entry("steam", 1)
77 | symspell_default.create_dictionary_entry("steams", 2)
78 | symspell_default.create_dictionary_entry("steem", 3)
79 |
80 | result = symspell_default.lookup("steems", verbosity, 2)
81 | assert num_results == len(result)
82 |
83 | @pytest.mark.parametrize(
84 | "symspell_default_entry",
85 | [[("steama", 4), ("steamb", 6), ("steamc", 2)]],
86 | indirect=True,
87 | )
88 | def test_should_return_most_frequent(self, symspell_default_entry):
89 | result = symspell_default_entry.lookup("stream", Verbosity.TOP, 2)
90 | assert 1 == len(result)
91 | assert "steamb" == result[0].term
92 | assert 6 == result[0].count
93 |
94 | @pytest.mark.parametrize(
95 | "symspell_default_entry",
96 | [[("steama", 4), ("steamb", 6), ("steamc", 2)]],
97 | indirect=True,
98 | )
99 | def test_should_find_exact_match(self, symspell_default_entry):
100 | result = symspell_default_entry.lookup("streama", Verbosity.TOP, 2)
101 | assert 1 == len(result)
102 | assert "steama" == result[0].term
103 |
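    |     # "paw" and "awn" exist only as deletes of "pawn", not as words, so a
    |     # distance-0 lookup must return nothing.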
104 | @pytest.mark.parametrize("term", ["paw", "awn"])
105 | def test_should_not_return_non_word_delete(self, symspell_high_thres, term):
106 | symspell_high_thres.create_dictionary_entry("pawn", 10)
107 | result = symspell_high_thres.lookup(term, Verbosity.TOP, 0)
108 | assert not result
109 |
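    |     # A count of 1 is below count_threshold=10, so the entry lands in
    |     # below_threshold_words and is never suggested.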
110 | def test_should_not_return_low_count_word(self, symspell_high_thres):
111 | symspell_high_thres.create_dictionary_entry("pawn", 1)
112 | result = symspell_high_thres.lookup("pawn", Verbosity.TOP, 0)
113 | assert not result
114 |
115 |     def test_should_not_return_low_count_word_that_is_also_delete_word(
116 | self, symspell_high_thres_flame
117 | ):
118 | result = symspell_high_thres_flame.lookup("flam", Verbosity.TOP, 0)
119 | assert not result
120 |
121 | def test_max_edit_distance_too_large(self, symspell_high_thres_flame):
122 | with pytest.raises(ValueError) as excinfo:
123 | _ = symspell_high_thres_flame.lookup("flam", Verbosity.TOP, 3)
124 | assert "distance too large" == str(excinfo.value)
125 |
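    |     # With include_unknown=True the input itself is returned when no
    |     # suggestion exists within the max edit distance.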
126 | def test_include_unknown(self, symspell_high_thres_flame):
127 | result = symspell_high_thres_flame.lookup("flam", Verbosity.TOP, 0, True)
128 | assert 1 == len(result)
129 | assert "flam" == result[0].term
130 |
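    |     # Tokens matching ignore_token are returned unchanged, even though
    |     # "24th" is not in the dictionary.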
131 | def test_avoid_exact_match_early_exit(self, symspell_high_thres_flame):
132 | result = symspell_high_thres_flame.lookup(
133 | "24th", Verbosity.ALL, 2, ignore_token=r"\d{2}\w*\b"
134 | )
135 | assert 1 == len(result)
136 | assert "24th" == result[0].term
137 |
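    |     # Regression test: the total number of suggestions over the 1000 noisy
    |     # queries is pinned to a known value.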
138 | def test_should_replicate_noisy_results(
139 | self, dictionary_path, query_path, symspell_default
140 | ):
141 | symspell_default.load_dictionary(dictionary_path, 0, 1)
142 |
143 | with open(query_path, "r") as infile:
144 | test_phrases = [
145 | parts[0]
146 |                 for parts in (line.strip().split() for line in infile)
147 | if len(parts) >= 2
148 | ]
149 |
150 | result_sum = 0
151 | for phrase in test_phrases:
152 | result_sum += len(symspell_default.lookup(phrase, Verbosity.CLOSEST, 2))
153 |
154 | assert 4955 == result_sum
155 |
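    |     # transfer_casing=True maps the casing pattern of the typo onto the
    |     # corrected term, e.g. "Stream" -> "Steam".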
156 | @pytest.mark.parametrize(
157 | "symspell_default_entry, typo, correction",
158 | [
159 | ([("steam", 4)], "Stream", "Steam"),
160 | ([("steam", 4)], "StreaM", "SteaM"),
161 | ([("steam", 4)], "STREAM", "STEAM"),
162 | ([("i", 4)], "I", "I"),
163 | ],
164 | indirect=["symspell_default_entry"],
165 | )
166 | def test_transfer_casing(self, symspell_default_entry, typo, correction):
167 | result = symspell_default_entry.lookup(
168 | typo, Verbosity.TOP, 2, transfer_casing=True
169 | )
170 | assert correction == result[0].term
171 |
--------------------------------------------------------------------------------
/tests/test_symspellpy_lookup_compound.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 |
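    | # Most cases run against both the bigram and the unigram dictionary; the
    | # expected results live in the JSON files under tests/fortests.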
4 | class TestSymSpellPyLookupCompound:
5 | @pytest.mark.parametrize(
6 | "symspell_default_load, get_fortests_data",
7 | [
8 | ("bigram", "lookup_compound_data.json"),
9 | ("unigram", "lookup_compound_data.json"),
10 | ],
11 | indirect=True,
12 | )
13 | def test_lookup_compound(self, symspell_default_load, get_fortests_data):
14 | sym_spell, dictionary = symspell_default_load
15 | for entry in get_fortests_data:
16 | results = sym_spell.lookup_compound(entry["typo"], 2)
17 | assert entry[dictionary]["num_results"] == len(results)
18 | assert entry[dictionary]["term"] == results[0].term
19 | assert entry[dictionary]["distance"] == results[0].distance
20 | assert entry[dictionary]["count"] == results[0].count
21 |
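    |     # lookup_compound should merge the wrongly split "ste am" into "steam"
    |     # while also correcting "machie" to "machine".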
22 | @pytest.mark.parametrize(
23 | "symspell_default_entry", [[("steam", 1), ("machine", 1)]], indirect=True
24 | )
25 | def test_lookup_compound_only_combi(self, symspell_default_entry):
26 | typo = "ste am machie"
27 | correction = "steam machine"
28 | results = symspell_default_entry.lookup_compound(typo, 2)
29 | assert 1 == len(results)
30 | assert correction == results[0].term
31 |
32 | @pytest.mark.parametrize(
33 | "symspell_default_entry", [[("steam", 1), ("machine", 1)]], indirect=True
34 | )
35 | def test_lookup_compound_no_suggestion(self, symspell_default_entry):
36 | typo = "qwer erty ytui a"
37 | results = symspell_default_entry.lookup_compound(typo, 2)
38 | assert 1 == len(results)
39 | assert typo == results[0].term
40 |
41 | @pytest.mark.parametrize(
42 | "symspell_default_load, get_fortests_data",
43 | [
44 | ("bigram", "lookup_compound_replaced_words_data.json"),
45 | ("unigram", "lookup_compound_replaced_words_data.json"),
46 | ],
47 | indirect=True,
48 | )
49 | def test_lookup_compound_replaced_words(
50 | self, symspell_default_load, get_fortests_data
51 | ):
52 | sym_spell, dictionary = symspell_default_load
53 | num_replaced_words = 0
54 | for entry in get_fortests_data:
55 | num_replaced_words += len(entry[dictionary]["replacement"])
56 | results = sym_spell.lookup_compound(entry["typo"], 2)
57 | assert num_replaced_words == len(sym_spell.replaced_words)
58 | assert entry[dictionary]["term"] == results[0].term
59 | for k, v in entry[dictionary]["replacement"].items():
60 | assert v == sym_spell.replaced_words[k].term
61 |
62 | @pytest.mark.parametrize(
63 | "symspell_default_load, get_fortests_data",
64 | [
65 | ("bigram", "lookup_compound_ignore_non_words_data.json"),
66 | ("unigram", "lookup_compound_ignore_non_words_data.json"),
67 | ],
68 | indirect=True,
69 | )
70 | def test_lookup_compound_ignore_non_words(
71 | self, symspell_default_load, get_fortests_data
72 | ):
73 | sym_spell, dictionary = symspell_default_load
74 | for entry in get_fortests_data:
75 | results = sym_spell.lookup_compound(entry["typo"], 2, True)
76 | assert 1 == len(results)
77 | assert entry[dictionary]["term"] == results[0].term
78 |
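    |     # With ignore_non_words and ignore_term_with_digits set, digit-bearing
    |     # tokens such as "1st" and "24/7" pass through uncorrected.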
79 | @pytest.mark.parametrize(
80 | "symspell_default_load", ["bigram", "unigram"], indirect=True
81 | )
82 | def test_lookup_compound_ignore_non_words_ignore_digits(
83 | self, symspell_default_load
84 | ):
85 | sym_spell, _ = symspell_default_load
86 |
87 | typo = "is the officeon 1st floor oepn 24/7"
88 | correction = "is the office on 1st floor open 24/7"
89 | results = sym_spell.lookup_compound(
90 | typo,
91 | 2,
92 | True,
93 | split_by_space=True,
94 | ignore_term_with_digits=True,
95 | )
96 | assert 1 == len(results)
97 | assert correction == results[0].term
98 | assert 2 == results[0].distance
99 | assert 0 == results[0].count
100 |
101 | @pytest.mark.parametrize(
102 | "symspell_default_load, get_fortests_data",
103 | [
104 | ("bigram", "lookup_compound_transfer_casing_data.json"),
105 | ("unigram", "lookup_compound_transfer_casing_data.json"),
106 | ],
107 | indirect=True,
108 | )
109 | def test_lookup_compound_transfer_casing(
110 | self, symspell_default_load, get_fortests_data
111 | ):
112 | sym_spell, dictionary = symspell_default_load
113 | for entry in get_fortests_data:
114 | results = sym_spell.lookup_compound(entry["typo"], 2, transfer_casing=True)
115 | assert entry[dictionary]["term"] == results[0].term
116 |
117 | @pytest.mark.parametrize(
118 | "symspell_default_load, get_fortests_data",
119 | [
120 | ("bigram", "lookup_compound_transfer_casing_ignore_nonwords_data.json"),
121 | ("unigram", "lookup_compound_transfer_casing_ignore_nonwords_data.json"),
122 | ],
123 | indirect=True,
124 | )
125 | def test_lookup_compound_transfer_casing_ignore_nonwords(
126 | self, symspell_default_load, get_fortests_data
127 | ):
128 | sym_spell, dictionary = symspell_default_load
129 | for entry in get_fortests_data:
130 | results = sym_spell.lookup_compound(entry["typo"], 2, True, True)
131 | assert entry[dictionary]["term"] == results[0].term
132 |
--------------------------------------------------------------------------------
/tests/test_symspellpy_pickle.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pickle
3 | from unittest import TestCase
4 |
5 | import pytest
6 |
7 | from symspellpy import SymSpell
8 |
9 |
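    | # Round-trip tests for save_pickle/load_pickle, compressed and uncompressed,
    | # via file path and via bytes. TestCase.assertLogs is borrowed as a plain
    | # context manager here; an equivalent pytest-native sketch (untested, using
    | # the built-in caplog fixture) would look like:
    | #
    | #     import logging
    | #
    | #     def test_pickle(self, caplog, pickle_path, ...):
    | #         with caplog.at_level(logging.WARNING, "symspellpy.symspellpy.logger"):
    | #             sym_spell_2.load_pickle(pickle_path, is_compressed)
    | #         assert "Loading data which was created" in caplog.text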
10 | class TestSymSpellPyPickle:
11 | @pytest.mark.parametrize(
12 | "symspell_default_load, is_compressed",
13 | [("unigram", True), ("bigram", True), ("unigram", False), ("bigram", False)],
14 | indirect=["symspell_default_load"],
15 | )
16 | def test_pickle(self, pickle_path, symspell_default_load, is_compressed):
17 | sym_spell, _ = symspell_default_load
18 | sym_spell.save_pickle(pickle_path, is_compressed)
19 |
20 | sym_spell_2 = SymSpell(123, 456, 789)
21 |
22 | assert sym_spell._count_threshold != sym_spell_2._count_threshold
23 | assert (
24 | sym_spell._max_dictionary_edit_distance
25 | != sym_spell_2._max_dictionary_edit_distance
26 | )
27 | assert sym_spell._prefix_length != sym_spell_2._prefix_length
28 |
29 | with TestCase.assertLogs("symspellpy.symspellpy.logger", level="WARNING") as cm:
30 | sym_spell_2.load_pickle(pickle_path, is_compressed)
31 | assert (
32 | "Loading data which was created using different ('count_threshold', "
33 | "'max_dictionary_edit_distance', 'prefix_length') settings. Overwriting "
34 | "current SymSpell instance with loaded settings ..."
35 | ) == cm.records[0].getMessage()
36 | assert sym_spell.below_threshold_words == sym_spell_2.below_threshold_words
37 | assert sym_spell.bigrams == sym_spell_2.bigrams
38 | assert sym_spell.deletes == sym_spell_2.deletes
39 | assert sym_spell.words == sym_spell_2.words
40 | assert sym_spell._max_length == sym_spell_2._max_length
41 | assert sym_spell._count_threshold == sym_spell_2._count_threshold
42 | assert (
43 | sym_spell._max_dictionary_edit_distance
44 | == sym_spell_2._max_dictionary_edit_distance
45 | )
46 | assert sym_spell._prefix_length == sym_spell_2._prefix_length
47 | os.remove(pickle_path)
48 |
49 | @pytest.mark.parametrize(
50 | "symspell_default_load, is_compressed",
51 | [("unigram", True), ("bigram", True), ("unigram", False), ("bigram", False)],
52 | indirect=["symspell_default_load"],
53 | )
54 | def test_pickle_same_settings(
55 | self, pickle_path, symspell_default_load, is_compressed
56 | ):
57 | sym_spell, _ = symspell_default_load
58 | sym_spell.save_pickle(pickle_path, is_compressed)
59 |
60 | sym_spell_2 = SymSpell()
61 | sym_spell_2.load_pickle(pickle_path, is_compressed)
62 |
63 | assert sym_spell.below_threshold_words == sym_spell_2.below_threshold_words
64 | assert sym_spell.bigrams == sym_spell_2.bigrams
65 | assert sym_spell.deletes == sym_spell_2.deletes
66 | assert sym_spell.words == sym_spell_2.words
67 | assert sym_spell._max_length == sym_spell_2._max_length
68 | assert sym_spell._count_threshold == sym_spell_2._count_threshold
69 | assert (
70 | sym_spell._max_dictionary_edit_distance
71 | == sym_spell_2._max_dictionary_edit_distance
72 | )
73 | assert sym_spell._prefix_length == sym_spell_2._prefix_length
74 | os.remove(pickle_path)
75 |
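    |     # save_pickle(to_bytes=True) returns the pickled payload as bytes, and
    |     # load_pickle(..., from_bytes=True) consumes it without touching disk.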
76 | @pytest.mark.parametrize(
77 | "symspell_default_load", ["unigram", "bigram"], indirect=True
78 | )
79 | def test_pickle_bytes(self, symspell_default_load):
80 | sym_spell, _ = symspell_default_load
81 | sym_spell_2 = SymSpell(123, 456, 789)
82 |
83 | assert sym_spell._count_threshold != sym_spell_2._count_threshold
84 | assert (
85 | sym_spell._max_dictionary_edit_distance
86 | != sym_spell_2._max_dictionary_edit_distance
87 | )
88 | assert sym_spell._prefix_length != sym_spell_2._prefix_length
89 |
90 | with TestCase.assertLogs("symspellpy.symspellpy.logger", level="WARNING") as cm:
91 | sym_spell_2.load_pickle(
92 | sym_spell.save_pickle(to_bytes=True), from_bytes=True
93 | )
94 | assert (
95 | "Loading data which was created using different ('count_threshold', "
96 | "'max_dictionary_edit_distance', 'prefix_length') settings. Overwriting "
97 | "current SymSpell instance with loaded settings ..."
98 | ) == cm.records[0].getMessage()
99 | assert sym_spell.below_threshold_words == sym_spell_2.below_threshold_words
100 | assert sym_spell.bigrams == sym_spell_2.bigrams
101 | assert sym_spell.deletes == sym_spell_2.deletes
102 | assert sym_spell.words == sym_spell_2.words
103 | assert sym_spell._max_length == sym_spell_2._max_length
104 | assert sym_spell._count_threshold == sym_spell_2._count_threshold
105 | assert (
106 | sym_spell._max_dictionary_edit_distance
107 | == sym_spell_2._max_dictionary_edit_distance
108 | )
109 | assert sym_spell._prefix_length == sym_spell_2._prefix_length
110 |
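    |     # A pickle with a wrong or missing data_version is rejected:
    |     # load_pickle returns False.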
111 | def test_pickle_invalid(self, pickle_path, symspell_default):
112 | pickle_data = {"deletes": {}, "words": {}, "max_length": 0, "data_version": -1}
113 | with open(pickle_path, "wb") as f:
114 | pickle.dump(pickle_data, f)
115 | assert not symspell_default.load_pickle(pickle_path, False)
116 | os.remove(pickle_path)
117 |
118 | pickle_data = {"deletes": {}, "words": {}, "max_length": 0}
119 | with open(pickle_path, "wb") as f:
120 | pickle.dump(pickle_data, f)
121 | assert not symspell_default.load_pickle(pickle_path, False)
122 | os.remove(pickle_path)
123 |
--------------------------------------------------------------------------------
/tests/test_symspellpy_word_segmentation.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from symspellpy import SymSpell
4 |
5 |
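    | # request.param is the max_dictionary_edit_distance used to build the
    | # SymSpell instance; tests receive both the instance and the distance.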
6 | @pytest.fixture
7 | def symspell_edit_distance_load(dictionary_path, request):
8 | sym_spell = SymSpell(request.param)
9 | sym_spell.load_dictionary(dictionary_path, 0, 1)
10 | return sym_spell, request.param
11 |
12 |
13 | class TestSymSpellPyWordSegmentation:
14 | @pytest.mark.parametrize("symspell_default_load", ["unigram"], indirect=True)
15 | def test_word_segmentation_ignore_token(self, symspell_default_load):
16 | sym_spell, _ = symspell_default_load
17 | typo = "24th december"
18 | result = sym_spell.word_segmentation(typo, ignore_token=r"\d{2}\w*\b")
19 | assert typo == result.corrected_string
20 |
21 | @pytest.mark.parametrize(
22 | "symspell_edit_distance_load, get_fortests_data, with_arguments, capitalize",
23 | [
24 | (0, "word_segmentation_data.json", False, False),
25 | (0, "word_segmentation_data.json", True, False),
26 | (0, "word_segmentation_data.json", False, True),
27 | ],
28 | indirect=["symspell_edit_distance_load", "get_fortests_data"],
29 | )
30 | def test_word_segmentation(
31 | self,
32 | symspell_edit_distance_load,
33 | get_fortests_data,
34 | with_arguments,
35 | capitalize,
36 | ):
37 | sym_spell, edit_distance = symspell_edit_distance_load
38 | for entry in get_fortests_data:
39 | if capitalize:
40 | typo = entry["typo"].capitalize()
41 | correction = entry[str(edit_distance)]["term"].capitalize()
42 | else:
43 | typo = entry["typo"]
44 | correction = entry[str(edit_distance)]["term"]
45 | if with_arguments:
46 | result = sym_spell.word_segmentation(typo, edit_distance, 11)
47 | else:
48 | result = sym_spell.word_segmentation(typo)
49 | assert correction == result.corrected_string
50 |
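    |     # Documents the current behaviour around apostrophes: the split lands
    |     # right after "There'". result[1] is the corrected_string field of the
    |     # returned Composition.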
51 | @pytest.mark.parametrize("symspell_edit_distance_load", [0], indirect=True)
52 | def test_word_segmentation_apostrophe(self, symspell_edit_distance_load):
53 | sym_spell, _ = symspell_edit_distance_load
54 |
55 | typo = "There'resomewords"
56 | correction = "There' re some words"
57 | result = sym_spell.word_segmentation(typo)
58 | assert correction == result[1]
59 |
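    |     # The typo embeds the "ﬁ" ligature (U+FB01); segmentation is expected
    |     # to normalize it to plain "fi" in the corrected output.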
60 | @pytest.mark.parametrize("symspell_edit_distance_load", [0], indirect=True)
61 | def test_word_segmentation_ligature(self, symspell_edit_distance_load):
62 | sym_spell, _ = symspell_edit_distance_load
63 |
64 |         typo = "Therearesomescientiﬁcwords"
65 | correction = "There are some scientific words"
66 | result = sym_spell.word_segmentation(typo)
67 | assert correction == result[1]
68 |
--------------------------------------------------------------------------------