├── .coveragerc ├── .github ├── FUNDING.yml ├── dependabot.yml └── workflows │ └── CI.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── Cargo.toml ├── Justfile ├── LICENSE ├── README.md ├── benchmarks ├── compare.ipynb ├── timedruns-new.csv ├── timedruns-old.csv └── timedruns.py ├── docs ├── assets │ └── white-jellyfish.svg ├── changelog.md ├── functions.md └── index.md ├── mkdocs.yml ├── pyproject.toml ├── python └── jellyfish │ ├── __init__.py │ ├── __init__.pyi │ ├── _jellyfish.py │ └── py.typed ├── run-cov.sh ├── src ├── common.rs ├── hamming.rs ├── jaccard.rs ├── jaro.rs ├── levenshtein.rs ├── lib.rs ├── match_rating.rs ├── metaphone.rs ├── nysiis.rs ├── rustyfish.rs ├── soundex.rs └── testutils.rs ├── testdata ├── README.md ├── damerau_levenshtein.csv ├── hamming.csv ├── jaccard.csv ├── jaro_distance.csv ├── jaro_winkler.csv ├── jaro_winkler_longtol.csv ├── levenshtein.csv ├── match_rating_codex.csv ├── match_rating_comparison.csv ├── metaphone.csv ├── nysiis.csv ├── porter.csv ├── soundex.csv └── wagner_fischer.csv └── tests └── test_jellyfish.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | omit = jellyfish/compat.py 3 | jellyfish/test.py 4 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: jamesturk 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | 
otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file 2 | version: 2 3 | updates: 4 | - package-ecosystem: "pip" 5 | directory: "/" 6 | schedule: 7 | interval: "daily" 8 | time: "10:00" 9 | open-pull-requests-limit: 10 10 | - package-ecosystem: "github-actions" 11 | directory: "/" 12 | schedule: 13 | interval: "weekly" 14 | -------------------------------------------------------------------------------- /.github/workflows/CI.yml: -------------------------------------------------------------------------------- 1 | # This file was edited manually to add 2 | # The original was autogenerated by maturin v0.14.15 3 | on: 4 | push: 5 | branches: 6 | - "main" 7 | tags: 8 | - "*" 9 | pull_request: 10 | workflow_dispatch: 11 | 12 | permissions: 13 | contents: read 14 | 15 | jobs: 16 | lint_and_test: 17 | runs-on: ubuntu-latest 18 | strategy: 19 | matrix: 20 | python-version: ["3.9", "3.13", "pypy3.11"] 21 | steps: 22 | - uses: actions/checkout@v4 23 | with: 24 | submodules: recursive 25 | - uses: actions/setup-python@v5 26 | with: 27 | python-version: ${{ matrix.python-version }} 28 | - name: Build wheels 29 | uses: PyO3/maturin-action@v1 30 | with: 31 | target: ${{ matrix.target }} 32 | args: --release --out dist -i ${{ matrix.python-version }} 33 | sccache: "true" 34 | - name: Install Just 35 | uses: extractions/setup-just@v3 36 | - name: Run Cargo Tests 37 | run: | 38 | cargo test 39 | - name: Run pytest 40 | run: | 41 | # just venv pytest 42 | rm -rf .venv 43 | python3 -m venv .venv 44 | . 
.venv/bin/activate 45 | .venv/bin/pip install wheel pytest mkdocs-material 46 | maturin develop 47 | .venv/bin/pytest 48 | 49 | linux: 50 | runs-on: ubuntu-latest 51 | needs: lint_and_test 52 | strategy: 53 | matrix: 54 | platform: 55 | - target: x64 56 | interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11 57 | - target: aarch64 58 | interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11 59 | - target: armv7 60 | interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11 61 | steps: 62 | - uses: actions/checkout@v4 63 | with: 64 | submodules: recursive 65 | - name: Build wheels 66 | uses: PyO3/maturin-action@v1 67 | with: 68 | target: ${{ matrix.platform.target }} 69 | args: --release --out dist -i ${{ matrix.platform.interpreter }} 70 | sccache: "true" 71 | manylinux: auto 72 | - name: Upload wheels 73 | uses: actions/upload-artifact@v4 74 | with: 75 | name: wheels-linux-${{ strategy.job-index }} 76 | path: dist 77 | musllinux: 78 | runs-on: ubuntu-latest 79 | needs: lint_and_test 80 | strategy: 81 | matrix: 82 | platform: 83 | - target: x86_64-unknown-linux-musl 84 | arch: x86_64 85 | interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11 86 | - target: i686-unknown-linux-musl 87 | arch: x86 88 | interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11 89 | - target: aarch64-unknown-linux-musl 90 | arch: aarch64 91 | interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11 92 | # all values: [x86_64, x86, aarch64, armhf, armv7, ppc64le, riscv64, s390x] 93 | # { target: "armv7-unknown-linux-musleabihf", image_tag: "armv7" }, 94 | # { target: "powerpc64le-unknown-linux-musl", image_tag: "ppc64le" }, 95 | steps: 96 | - uses: actions/checkout@v4 97 | with: 98 | submodules: recursive 99 | - name: Setup QEMU 100 | uses: docker/setup-qemu-action@v3 101 | - name: Build wheels 102 | uses: PyO3/maturin-action@v1 103 | with: 104 | target: ${{ matrix.platform.target }} 105 | args: --release --out dist -i ${{ 
matrix.platform.interpreter }} 106 | sccache: "true" 107 | manylinux: musllinux_1_1 108 | - name: Upload wheels 109 | uses: actions/upload-artifact@v4 110 | with: 111 | name: wheels-musl-${{ strategy.job-index }} 112 | path: dist 113 | 114 | windows: 115 | runs-on: windows-latest 116 | needs: lint_and_test 117 | strategy: 118 | matrix: 119 | target: [x64, x86] 120 | interpreter: [3.9, "3.10", "3.11", "3.12", "3.13"] 121 | steps: 122 | - uses: actions/checkout@v4 123 | with: 124 | submodules: recursive 125 | - uses: actions/setup-python@v5 126 | with: 127 | python-version: ${{ matrix.interpreter }} 128 | - name: Build wheels 129 | uses: PyO3/maturin-action@v1 130 | with: 131 | target: ${{ matrix.target }} 132 | args: --release --out dist -i ${{ matrix.interpreter }} 133 | sccache: "true" 134 | - name: Upload wheels 135 | uses: actions/upload-artifact@v4 136 | with: 137 | path: dist 138 | name: wheels-win-${{ strategy.job-index }} 139 | 140 | macos: 141 | runs-on: macos-latest 142 | needs: lint_and_test 143 | strategy: 144 | matrix: 145 | platform: 146 | - target: x64 147 | interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11 148 | - target: aarch64 149 | interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11 150 | steps: 151 | - uses: actions/checkout@v4 152 | with: 153 | submodules: recursive 154 | - name: Build wheels 155 | uses: PyO3/maturin-action@v1 156 | with: 157 | target: ${{ matrix.platform.target }} 158 | args: --release --out dist -i ${{ matrix.platform.interpreter }} 159 | sccache: "true" 160 | - name: Upload wheels 161 | uses: actions/upload-artifact@v4 162 | with: 163 | name: wheels-mac-${{ strategy.job-index }} 164 | path: dist 165 | 166 | sdist: 167 | runs-on: ubuntu-latest 168 | needs: lint_and_test 169 | steps: 170 | - uses: actions/checkout@v4 171 | with: 172 | submodules: recursive 173 | - name: Build sdist 174 | uses: PyO3/maturin-action@v1 175 | with: 176 | command: sdist 177 | args: --out dist 178 | - name: Upload sdist 179 | 
uses: actions/upload-artifact@v4 180 | with: 181 | name: wheels-sdist-${{ strategy.job-index }} 182 | path: dist 183 | 184 | release: 185 | name: Release 186 | runs-on: ubuntu-latest 187 | if: "startsWith(github.ref, 'refs/tags/')" 188 | needs: [linux, windows, macos, sdist, musllinux] 189 | steps: 190 | - uses: actions/download-artifact@v4 191 | with: 192 | pattern: wheels-* 193 | merge-multiple: true 194 | - name: Publish to PyPI 195 | uses: PyO3/maturin-action@v1 196 | env: 197 | MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} 198 | with: 199 | command: upload 200 | args: --skip-existing * 201 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | jellyfish.egg-info/ 4 | *.so 5 | *.swp 6 | *.pyc 7 | *.DS_Store 8 | *~ 9 | .tox/ 10 | .coverage 11 | htmlcov/ 12 | .ropeproject/ 13 | _build/ 14 | .ipynb_checkpoints/ 15 | .cache 16 | wheelhouse/ 17 | site/ 18 | target/ 19 | Cargo.lock 20 | .venv 21 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | default_language_version: 2 | python: python3.8 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v2.5.0 # Use the ref you want to point at 6 | hooks: 7 | - id: check-merge-conflict 8 | - id: debug-statements 9 | - id: flake8 10 | args: ["--ignore=E203,E501,W503"] 11 | - repo: https://github.com/ambv/black 12 | rev: 19.10b0 13 | hooks: 14 | - id: black 15 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Turk" 5 | given-names: "James" 6 | orcid: https://orcid.org/0000-0003-1762-1420 7 | title: "jellyfish" 8 | version: 1.0.0 9 | date-released: 2023-06-21 10 | url: "https://github.com/jamesturk/jellyfish" -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "jellyfish" 3 | version = "1.2.0" 4 | edition = "2021" 5 | description = "Approximate and phonetic matching of strings." 6 | authors = ["James Turk "] 7 | repository = "https://github.com/jamesturk/jellyfish/" 8 | license = "MIT" 9 | readme = "README.md" 10 | 11 | 12 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 13 | [lib] 14 | name = "jellyfish" 15 | crate-type = ["cdylib"] 16 | 17 | [dependencies] 18 | pyo3 = { version = "0.24.0", features = [] } 19 | unicode-segmentation = "^1.6.0" 20 | unicode-normalization = "^0.1" 21 | smallvec = "^1.13" 22 | ahash = "^0.8" 23 | num-traits = "0.2.19" 24 | 25 | [dev-dependencies] 26 | csv = "1.1" 27 | 28 | [features] 29 | python = [] 30 | -------------------------------------------------------------------------------- /Justfile: -------------------------------------------------------------------------------- 1 | pytest: 2 | maturin develop 3 | .venv/bin/pytest 4 | 5 | test: pytest 6 | cargo test 7 | 8 | deploy-docs: 9 | . .venv/bin/activate 10 | mkdocs gh-deploy 11 | 12 | venv: 13 | rm -rf .venv 14 | python3 -m venv .venv 15 | . .venv/bin/activate 16 | .venv/bin/pip install wheel pytest mkdocs-material 17 | .venv/bin/pip install jupyter pandas seaborn 18 | 19 | 20 | timedruns-old: 21 | .venv/bin/pip install jellyfish==0.10.0 # last C version 22 | .venv/bin/python benchmarks/timedruns.py old > benchmarks/timedruns-old.csv 23 | 24 | timedruns-new: 25 | .venv/bin/pip uninstall jellyfish 26 | .venv/bin/pip install -e . 
27 | #.venv/bin/pip install --pre jellyfish # latest Rust version 28 | .venv/bin/python benchmarks/timedruns.py new >> benchmarks/timedruns-new.csv 29 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2015 James Turk 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | **jellyfish** is a library for approximate & phonetic matching of strings. 
4 | 5 | Source: [https://github.com/jamesturk/jellyfish](https://github.com/jamesturk/jellyfish) 6 | 7 | Documentation: [https://jamesturk.github.io/jellyfish/](https://jamesturk.github.io/jellyfish/) 8 | 9 | Issues: [https://github.com/jamesturk/jellyfish/issues](https://github.com/jamesturk/jellyfish/issues) 10 | 11 | [![PyPI badge](https://badge.fury.io/py/jellyfish.svg)](https://badge.fury.io/py/jellyfish) 12 | [![Test badge](https://github.com/jamesturk/jellyfish/workflows/Python%20package/badge.svg)](https://github.com/jamesturk/jellyfish/actions?query=workflow%3A%22Python+package) 13 | [![Coveralls](https://coveralls.io/repos/jamesturk/jellyfish/badge.png?branch=master)](https://coveralls.io/r/jamesturk/jellyfish) 14 | ![Test Rust](https://github.com/jamesturk/rust-jellyfish/workflows/Test%20Rust/badge.svg) 15 | 16 | ## Included Algorithms 17 | 18 | String comparison: 19 | 20 | * Levenshtein Distance 21 | * Damerau-Levenshtein Distance 22 | * Jaccard Index 23 | * Jaro Distance 24 | * Jaro-Winkler Distance 25 | * Match Rating Approach Comparison 26 | * Hamming Distance 27 | 28 | Phonetic encoding: 29 | 30 | * American Soundex 31 | * Metaphone 32 | * NYSIIS (New York State Identification and Intelligence System) 33 | * Match Rating Codex 34 | 35 | ## Example Usage 36 | 37 | ``` python 38 | >>> import jellyfish 39 | >>> jellyfish.levenshtein_distance('jellyfish', 'smellyfish') 40 | 2 41 | >>> jellyfish.jaro_similarity('jellyfish', 'smellyfish') 42 | 0.89629629629629637 43 | >>> jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs') 44 | 1 45 | 46 | >>> jellyfish.metaphone('Jellyfish') 47 | 'JLFX' 48 | >>> jellyfish.soundex('Jellyfish') 49 | 'J412' 50 | >>> jellyfish.nysiis('Jellyfish') 51 | 'JALYF' 52 | >>> jellyfish.match_rating_codex('Jellyfish') 53 | 'JLLFSH' 54 | ``` 55 | -------------------------------------------------------------------------------- /benchmarks/timedruns-new.csv: 
-------------------------------------------------------------------------------- 1 | 3.10.7,0.11a1,rust,damerau_levenshtein_distance,2.955821124999602e-06 2 | 3.10.7,0.11a1,rust,hamming_distance,2.637990829998671e-07 3 | 3.10.7,0.11a1,rust,jaro_similarity,8.923487499996554e-07 4 | 3.10.7,0.11a1,rust,jaro_winkler_similarity,5.265191250000499e-07 5 | 3.10.7,0.11a1,rust,levenshtein_distance,5.327967920002266e-07 6 | 3.10.7,0.11a1,rust,match_rating_codex,3.9641191699956834e-07 7 | 3.10.7,0.11a1,rust,match_rating_comparison,7.64051959000426e-07 8 | 3.10.7,0.11a1,rust,metaphone,4.791485000005196e-07 9 | 3.10.7,0.11a1,rust,nysiis,6.270804579999094e-07 10 | 3.10.7,0.11a1,rust,soundex,3.9677620900056354e-07 11 | 3.10.7,0.11.0,rust,damerau_levenshtein_distance,2.200372166997113e-06 12 | 3.10.7,0.11.0,rust,hamming_distance,1.723820409970358e-07 13 | 3.10.7,0.11.0,rust,jaro_similarity,6.059524590018554e-07 14 | 3.10.7,0.11.0,rust,jaro_winkler_similarity,2.81896541993774e-07 15 | 3.10.7,0.11.0,rust,levenshtein_distance,2.6762129200506027e-07 16 | 3.10.7,0.11.0,rust,match_rating_codex,3.020092500009923e-07 17 | 3.10.7,0.11.0,rust,match_rating_comparison,4.794018750035321e-07 18 | 3.10.7,0.11.0,rust,metaphone,3.206092919936054e-07 19 | 3.10.7,0.11.0,rust,nysiis,3.3875070799695096e-07 20 | 3.10.7,0.11.0,rust,soundex,2.549132920030388e-07 21 | 3.10.7,dev,rust,damerau_levenshtein_distance,1.2226207920029991e-06 22 | 3.10.7,dev,rust,hamming_distance,1.7096670799946878e-07 23 | 3.10.7,dev,rust,jaro_similarity,6.012054580060067e-07 24 | 3.10.7,dev,rust,jaro_winkler_similarity,2.8654966699832583e-07 25 | 3.10.7,dev,rust,levenshtein_distance,2.7065066699287856e-07 26 | 3.10.7,dev,rust,match_rating_codex,2.96483124999213e-07 27 | 3.10.7,dev,rust,match_rating_comparison,4.7412966699630485e-07 28 | 3.10.7,dev,rust,metaphone,3.101041250047274e-07 29 | 3.10.7,dev,rust,nysiis,3.454310419911053e-07 30 | 3.10.7,dev,rust,soundex,2.5703445900580847e-07 
-------------------------------------------------------------------------------- /benchmarks/timedruns-old.csv: -------------------------------------------------------------------------------- 1 | 3.10.7,0.10-classic,c,damerau_levenshtein_distance,4.3809779200000775e-07 2 | 3.10.7,0.10-classic,c,hamming_distance,8.937791700009256e-08 3 | 3.10.7,0.10-classic,c,jaro_similarity,2.503094580006291e-07 4 | 3.10.7,0.10-classic,c,jaro_winkler_similarity,1.972025830000348e-07 5 | 3.10.7,0.10-classic,c,levenshtein_distance,1.5478662499936037e-07 6 | 3.10.7,0.10-classic,c,match_rating_codex,2.1903375000056258e-07 7 | 3.10.7,0.10-classic,c,match_rating_comparison,3.148877909998191e-07 8 | 3.10.7,0.10-classic,c,metaphone,3.495554169994648e-07 9 | 3.10.7,0.10-classic,c,nysiis,2.2051829199972418e-07 10 | 3.10.7,0.10-classic,c,soundex,2.6794874999995953e-07 11 | 3.10.7,0.10-classic,python,damerau_levenshtein_distance,3.269755224999972e-05 12 | 3.10.7,0.10-classic,python,hamming_distance,4.6421708400066563e-07 13 | 3.10.7,0.10-classic,python,jaro_similarity,8.32981374999963e-06 14 | 3.10.7,0.10-classic,python,jaro_winkler_similarity,3.957727625000189e-06 15 | 3.10.7,0.10-classic,python,levenshtein_distance,4.634622290999687e-06 16 | 3.10.7,0.10-classic,python,match_rating_codex,6.073832079991917e-07 17 | 3.10.7,0.10-classic,python,match_rating_comparison,2.1926620000003824e-06 18 | 3.10.7,0.10-classic,python,metaphone,2.464329958000235e-06 19 | 3.10.7,0.10-classic,python,nysiis,1.960830291000093e-06 20 | 3.10.7,0.10-classic,python,soundex,1.4157104160003654e-06 21 | -------------------------------------------------------------------------------- /benchmarks/timedruns.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import timeit 3 | import csv 4 | 5 | open_kwargs = {"encoding": "utf8"} 6 | 7 | 8 | def _load_data(name): 9 | with open("./testdata/{}.csv".format(name), **open_kwargs) as f: 10 | yield from csv.reader(f) 11 | 12 | 
def _load_n(name, n):
    """Return exactly ``n`` rows from ``./testdata/<name>.csv``.

    The rows are read through ``_load_data``; when the file holds fewer than
    ``n`` rows it is re-read from the start as many times as needed, so the
    result cycles through the file's rows in order.

    Args:
        name: base name of the CSV file (passed straight to ``_load_data``).
        n: number of rows wanted; ``n <= 0`` yields an empty list.

    Raises:
        ValueError: if a full pass over the file produces no rows at all
            (previously this case looped forever).
    """
    data = []
    while len(data) < n:
        count_before_pass = len(data)
        for row in _load_data(name):
            data.append(row)
            if len(data) == n:
                break
        if len(data) == count_before_pass:
            # An empty file can never satisfy the request; bail out instead
            # of re-opening it endlessly.
            raise ValueError("no rows found in testdata file {!r}".format(name))

    return data


def time_func(funcname, name, params, ftype):
    """Time one jellyfish function and return the mean seconds per call.

    Args:
        funcname: name of the function imported from the jellyfish submodule.
        name: base name of the testdata CSV handed to ``_load_n``.
        params: how many leading CSV columns are passed to the function
            (1 or 2); rows are unpacked as ``x, y`` or ``x, y, z``.
        ftype: which implementation to import: ``"python"``, ``"c"`` or
            ``"rust"``.

    Returns:
        Total ``timeit`` time divided by ``TEST_N * TEST_ITERATIONS``.

    Raises:
        ValueError: if ``params`` or ``ftype`` is not one of the supported
            values (previously this surfaced as an UnboundLocalError).
    """
    TEST_N = 100
    TEST_ITERATIONS = 10000

    # Statement templates keyed by the number of arguments the function takes.
    statements = {
        1: "[{}(x) for x, y in data]",
        2: "[{}(x, y) for x, y, z in data]",
    }
    # Submodule of the jellyfish package that provides each implementation.
    modules = {
        "python": "_jellyfish",
        "c": "cjellyfish",
        "rust": "_rustyfish",
    }

    # Validate up front so a bad argument fails before any import or timing.
    if params not in statements:
        raise ValueError("params must be 1 or 2, got {!r}".format(params))
    if ftype not in modules:
        raise ValueError(
            "ftype must be one of 'python', 'c' or 'rust', got {!r}".format(ftype)
        )

    run = statements[params].format(funcname)
    path = modules[ftype]

    return (
        timeit.timeit(
            run,
            setup="""from __main__ import _load_n
from jellyfish.{} import {}
data = _load_n('{}', {})
""".format(
                path, funcname, name, TEST_N
            ),
            number=TEST_ITERATIONS,
        )
        / (TEST_N * TEST_ITERATIONS)
    )


# (function name, testdata file name, number of arguments) for each benchmark.
testing = [
    ("damerau_levenshtein_distance", "damerau_levenshtein", 2),
    ("hamming_distance", "hamming", 2),
    ("jaro_similarity", "jaro_distance", 2),
    ("jaro_winkler_similarity", "jaro_winkler", 2),
    ("levenshtein_distance", "levenshtein", 2),
    ("match_rating_codex", "match_rating_codex", 1),
    ("match_rating_comparison", "match_rating_comparison", 2),
    ("metaphone", "metaphone", 1),
    ("nysiis", "nysiis", 1),
    ("soundex", "soundex", 1),
]


def main(argv=None):
    """Run every benchmark in ``testing`` and print one CSV row per result.

    Each row is ``python_version,jellyfish_version,ftype,funcname,seconds``.

    Args:
        argv: command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument selects the mode: ``"old"``
            times the ``c`` and ``python`` implementations, ``"new"`` times
            the ``rust`` implementation. Extra arguments are ignored.

    Raises:
        SystemExit: with a usage message when the mode is missing or unknown
            (previously an IndexError / UnboundLocalError).
    """
    if argv is None:
        argv = sys.argv[1:]

    # mode -> (version label written to the CSV, implementations to time)
    modes = {
        "old": ("0.10", ("c", "python")),
        "new": ("dev", ("rust",)),
    }
    if len(argv) < 1 or argv[0] not in modes:
        # Goes to stderr, so it never pollutes a redirected CSV on stdout.
        sys.exit("usage: timedruns.py {old|new}")

    jf_version, ftypes = modes[argv[0]]
    py_version = "{}.{}.{}".format(*sys.version_info[0:3])

    for ftype in ftypes:
        for funcname, name, params in testing:
            result = time_func(funcname, name, params, ftype)
            print(f"{py_version},{jf_version},{ftype},{funcname},{result}")


if __name__ == "__main__":
    main()
-------------------------------------------------------------------------------- /docs/assets/white-jellyfish.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /docs/changelog.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 1.2.0 - 31 March 2025 4 | 5 | - drop support for Python 3.8, add support for Python 3.13 and PyPy 3.11 (PyO3 0.24 bump) 6 | 7 | ## 1.1.2 - 3 December 2024 8 | 9 | - release that supports Python 3.13 on all supported platforms (thanks @energynumbers for help with Windows!) 10 | 11 | ## 1.1.0 - 28 July 2024 12 | 13 | - add jaccard_similarity thanks to Niklas von Moers (@NiklasvonM) (#214) 14 | - update to PyO3 0.22 15 | 16 | ## 1.0.4 - 28 May 2024 17 | 18 | - `match_rating_codex` now returns consistent results for Unicode characters regardless of position (#210) 19 | - adds prebuilt wheels for Alpine (#209) 20 | 21 | ## 1.0.3 - 17 November 2023 22 | 23 | - `match_rating_codex` now raises a `ValueError` when passed non-alpha characters (#200) 24 | - adds prebuilt wheels for Python 3.12 25 | 26 | ## 1.0.1 - 18 September 2023 27 | 28 | - fully remove deprecated names 29 | - add armv7 linux builds 30 | - fully drop Python 3.7 support 31 | 32 | ## 1.0.0 - 21 June 2023 33 | 34 | - bump to 1.0 (no notable changes from 0.11.2) 35 | 36 | ## 0.11.2 - 2 April 2023 37 | 38 | - fix to Rust build process to build more wheels, thanks @MartinoMensio! 
39 | - switch to using `ahash` for Damerau-Levenshtein for speed gains 40 | 41 | ## 0.11.1 - 30 March 2023 42 | 43 | - fix missing testdata in packages 44 | 45 | ## 0.11.0 - 27 March 2023 46 | 47 | - switched to using Rust implementation for all algorithms 48 | 49 | ## 0.10.0 - 25 March 2023 50 | 51 | - removed rarely-used `porter_stem` function, better implementations exist 52 | 53 | ## 0.9.0 - 7 January 2022 54 | 55 | - updated documentation available at <https://jamesturk.github.io/jellyfish/> 56 | - support for Python 3.10+ 57 | - handle spaces correctly in MRA algorithm 58 | 59 | ## 0.8.9 - 26 October 2021 60 | 61 | - fix buffer overflow in NYSIIS 62 | - remove unnecessary/undocumented special casing of digits in Jaro-Winkler 63 | 64 | ## 0.8.8 - 17 August 2021 65 | 66 | - release fix to fix Linux wheel issue 67 | 68 | ## 0.8.7 - 16 August 2021 69 | 70 | - safer allocations from CJellyfish 71 | - include aarch64 wheels 72 | 73 | ## 0.8.4 - 4 August 2021 74 | 75 | - fix for jaro winkler (cjellyfish#8) 76 | 77 | ## 0.8.3 - 11 March 2021 78 | 79 | - build changes 80 | - include OSX and Windows wheels 81 | 82 | ## 0.8.2 - 21 May 2020 83 | 84 | - fix jaro_winkler/jaro_winkler_similarity mix-up 85 | - deprecate jaro_distance in favor of jaro_similarity 86 | backwards compatible shim left in place, will be removed in 1.0 87 | - (note: 0.8.1 was a broken release without proper C libraries) 88 | 89 | ## 0.8.0 - 21 May 2020 90 | 91 | - rename jaro_winkler to jaro_winkler_similarity to match other functions 92 | backwards compatible shim added, but will be removed in 1.0 93 | - fix soundex bug with W/H cases, #83 94 | - fix metaphone bug with WH prefix, #108 95 | - fix C match rating codex bug with duplicate letters, #121 96 | - fix metaphone bug with leading vowels and 'kn' pair, #123 97 | - fix Python jaro_winkler bug #124 98 | - fix Python 3.9 deprecation warning 99 | - add manylinux wheels 100 | 101 | ## 0.7.2 - 5 June 2019 102 | 103 | - fix CJellyfish damerau_levenshtein w/ unicode, thanks to immerrr 104 | 
- fix final H in NYSIIS 105 | - fix issue w/ trailing W in metaphone 106 | 107 | ## 0.7.1 - 10 January 2019 108 | 109 | - restrict install to Python >= 3.4 110 | 111 | ## 0.7.0 - 10 January 2019 112 | 113 | - drop Python 2 compatibility & legacy code 114 | - add bugfix for NYSIIS for words starting with PF 115 | 116 | ## 0.6.1 - April 16 2018 117 | 118 | - fixed wheel release issue 119 | 120 | ## 0.6.0 - April 7 2018 121 | 122 | - fix quite a few bugs & differences between C/Py implementations 123 | - add wagner-fischer testdata 124 | - uppercase soundex result 125 | - better error handling in nysiis, soundex, and jaro 126 | 127 | ## 0.5.6 - June 23 2016 128 | 129 | - bugfix for metaphone & soundex raising unexpected TypeErrors on Windows (#54) 130 | 131 | ## 0.5.5 - June 21 2016 132 | 133 | - bugfix for metaphone WH case 134 | 135 | ## 0.5.4 - May 13 2016 136 | 137 | - bugfix for C version of damerau_levenshtein thanks to Tyler Sellon 138 | 139 | ## 0.5.3 - March 15 2016 140 | 141 | - style/packaging changes 142 | 143 | ## 0.5.2 - February 3 2016 144 | 145 | - testing fixes for Python 3.5 146 | - bugfix for Metaphone w/ silent H thanks to Jeremy Carbaugh 147 | 148 | ## 0.5.1 - July 12 2015 149 | 150 | - bugfixes for NYSIIS 151 | - bugfixes for metaphone 152 | - bugfix for C version of jaro_winkler 153 | 154 | ## 0.5.0 - April 23 2015 155 | 156 | - consistent unicode behavior, all functions take unicode and reject bytes on Py2 and 3, C and Python 157 | - parametrize tests 158 | - Windows compiler support 159 | 160 | ## 0.4.0 - March 27 2015 161 | 162 | - tons of new tests 163 | - documentation 164 | - split out cjellyfish 165 | - test all w/ unicode and plenty of fixes to accommodate 166 | - 100% test coverage 167 | 168 | ## 0.3.4 - February 4 2015 169 | 170 | - fix segfaults and memory leaks via Danrich Parrol 171 | 172 | ## 0.3.3 - November 20 2014 173 | 174 | - fix bugs in damerau and NYSIIS 175 | 176 | ## 0.3.2 - August 11 2014 177 | 178 | - fix for 
jaro-winkler from David McKean 179 | - more packaging fixes 180 | 181 | ## 0.3.1 - July 16 2014 182 | 183 | - packaging fix for C/Python alternative 184 | 185 | ## 0.3.0 - July 15 2014 186 | 187 | - python alternatives where C isn't available 188 | 189 | ## 0.2.2 - March 14 2014 190 | 191 | - testing fixes 192 | - assorted bugfixes in NYSIIS 193 | 194 | ## 0.2.0 - January 26 2012 195 | 196 | - incorporate some speed changes from Peter Scott 197 | - segfault bugfixes. 198 | 199 | ## 0.1.2 - September 16 2010 200 | 201 | - initial working release 202 | -------------------------------------------------------------------------------- /docs/functions.md: -------------------------------------------------------------------------------- 1 | # Functions 2 | 3 | Jellyfish provides a variety of functions for string comparison and phonetic encoding. 4 | 5 | ## String Comparison 6 | 7 | These methods are all measures of the difference (aka edit distance) between two strings. 8 | 9 | ### Levenshtein Distance 10 | 11 | ``` python 12 | def levenshtein_distance(s1: str, s2: str) 13 | ``` 14 | 15 | Compute the Levenshtein distance between s1 and s2. 16 | 17 | Levenshtein distance represents the minimum number of insertions, deletions, and substitutions required to change one word into another. 18 | 19 | For example: ``levenshtein_distance('berne', 'born') == 2`` representing the transformation of the first e to o and the deletion of the second e. 20 | 21 | See the [Levenshtein distance article at Wikipedia](http://en.wikipedia.org/wiki/Levenshtein_distance) for more details. 22 | 23 | ### Damerau-Levenshtein Distance 24 | 25 | ``` python 26 | def damerau_levenshtein_distance(s1: str, s2: str) 27 | ``` 28 | 29 | Compute the Damerau-Levenshtein distance between s1 and s2. 30 | 31 | A modification of Levenshtein distance, Damerau-Levenshtein distance counts transpositions (such as ifsh for fish) as a single edit. 
32 | 33 | Where ``levenshtein_distance('fish', 'ifsh') == 2`` as it would require a deletion and an insertion, 34 | though ``damerau_levenshtein_distance('fish', 'ifsh') == 1`` as this counts as a transposition. 35 | 36 | See the [Damerau-Levenshtein distance article at Wikipedia](http://en.wikipedia.org/wiki/Damerau-Levenshtein_distance) for more details. 37 | 38 | ### Hamming Distance 39 | 40 | ``` python 41 | def hamming_distance(s1: str, s2: str) 42 | ``` 43 | 44 | Compute the Hamming distance between s1 and s2. 45 | 46 | Hamming distance is the measure of the number of characters that differ between two strings. 47 | 48 | Typically Hamming distance is undefined when strings are of different length, but this implementation 49 | considers extra characters as differing. For example ``hamming_distance('abc', 'abcd') == 1``. 50 | 51 | See the [Hamming distance article at Wikipedia](http://en.wikipedia.org/wiki/Hamming_distance) for more details. 52 | 53 | ### Jaccard Similarity 54 | 55 | ``` python 56 | def jaccard_similarity(s1: str, s2: str, ngram_size: Optional[int] = None) -> float 57 | ``` 58 | 59 | Compute the Jaccard index between s1 and s2. 60 | 61 | The Jaccard index between two sets is defined as the number of elements of the intersection divided by the number of elements of the union of the two sets. The elements of the sets are ngrams (the substrings of length `ngram_size`) or words if `ngram_size` is `None`. The strings are split by whitespace. 62 | 63 | The Jaccard index does not consider order of words/ngrams. Hence "hello world" and "world hello" have a Jaccard similarity of 1. 64 | 65 | ### Jaro Similarity 66 | 67 | ``` python 68 | def jaro_similarity(s1: str, s2: str) 69 | ``` 70 | 71 | Compute the Jaro similarity between s1 and s2. 72 | 73 | Jaro distance is a string-edit distance that gives a floating point response in [0,1] where 0 represents two completely dissimilar strings and 1 represents identical strings. 74 | 75 | !!! 
warning 76 | 77 | Prior to 0.8.1 this function was named jaro_distance. The old name was removed in 1.0. 78 | 79 | ### Jaro-Winkler Similarity 80 | 81 | ``` python 82 | def jaro_winkler_similarity(s1: str, s2: str) 83 | ``` 84 | 85 | Compute the Jaro-Winkler similarity between s1 and s2. 86 | 87 | Jaro-Winkler is a modification/improvement to Jaro distance; like Jaro, it gives a floating point response in [0,1] where 0 represents two completely dissimilar strings and 1 represents identical strings. 88 | 89 | !!! warning 90 | 91 | Prior to 0.8.1 this function was named jaro_winkler. That name was deprecated and is no longer available: 92 | deprecated names were fully removed in 1.0.1 (see the changelog). 93 | 94 | See the [Jaro-Winkler distance article at Wikipedia](http://en.wikipedia.org/wiki/Jaro-Winkler_distance) for more details. 95 | 96 | ### Match Rating Approach (comparison) 97 | 98 | ``` python 99 | def match_rating_comparison(s1, s2) 100 | ``` 101 | 102 | Compare s1 and s2 using the match rating approach algorithm; returns ``True`` if strings are considered equivalent or ``False`` if not. Can also return ``None`` if s1 and s2 are not comparable (length differs by more than 3). 103 | 104 | The Match rating approach algorithm is an algorithm for determining whether or not two names are 105 | pronounced similarly. Strings are first encoded using `match_rating_codex` then compared according to the MRA algorithm. 106 | 107 | See the [Match Rating Approach article at Wikipedia](http://en.wikipedia.org/wiki/Match_rating_approach) for more details. 108 | 109 | ## Phonetic Encoding 110 | 111 | These algorithms convert a string to a normalized phonetic encoding, converting a word to a representation of its pronunciation. Each takes a single string and returns a coded representation. 112 | 113 | 114 | ### American Soundex 115 | 116 | ``` python 117 | def soundex(s: str) 118 | ``` 119 | 120 | Calculate the American Soundex of the string s. 
121 | 122 | Soundex is an algorithm to convert a word (typically a name) to a four-character code in the form 123 | 'A123' where 'A' is the first letter of the name and the digits represent similar sounds. 124 | 125 | For example ``soundex('Ann') == soundex('Anne') == 'A500'`` and 126 | ``soundex('Rupert') == soundex('Robert') == 'R163'``. 127 | 128 | See the [Soundex article at Wikipedia](http://en.wikipedia.org/wiki/Soundex) for more details. 129 | 130 | 131 | ### Metaphone 132 | 133 | ``` python 134 | def metaphone(s: str) 135 | ``` 136 | 137 | Calculate the metaphone code for the string s. 138 | 139 | The metaphone algorithm was designed as an improvement on Soundex. It transforms a word into a 140 | string consisting of '0BFHJKLMNPRSTWXY' where '0' is pronounced 'th' and 'X' is a '[sc]h' sound. 141 | 142 | For example ``metaphone('Klumpz') == metaphone('Clumps') == 'KLMPS'``. 143 | 144 | See the [Metaphone article at Wikipedia](http://en.wikipedia.org/wiki/Metaphone) for more details. 145 | 146 | 147 | ### NYSIIS 148 | 149 | ``` python 150 | def nysiis(s: str) 151 | ``` 152 | 153 | Calculate the NYSIIS code for the string s. 154 | 155 | The NYSIIS algorithm is an algorithm developed by the New York State Identification and Intelligence System. It transforms a word into a phonetic code. Like soundex and metaphone, it is primarily intended for use on names (as they would be pronounced in English). 156 | 157 | For example ``nysiis('John') == nysiis('Jan') == 'JAN'``. 158 | 159 | See the [NYSIIS article at Wikipedia](http://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System) for more details. 160 | 161 | ### Match Rating Approach (codex) 162 | 163 | ``` python 164 | def match_rating_codex(s: str) 165 | ``` 166 | 167 | Calculate the match rating approach value (also called PNI) for the string s. 168 | 169 | The Match rating approach algorithm is an algorithm for determining whether or not two names are 170 | pronounced similarly. 
The algorithm consists of an encoding function (similar to soundex or nysiis), 171 | which is implemented here, as well as `match_rating_comparison`, which does the actual comparison. 172 | 173 | See the [Match Rating Approach article at Wikipedia](http://en.wikipedia.org/wiki/Match_rating_approach) for more details. 174 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Overview 2 | 3 | **jellyfish** is a library for approximate & phonetic matching of strings. 4 | 5 | Source: [https://github.com/jamesturk/jellyfish](https://github.com/jamesturk/jellyfish) 6 | 7 | Documentation: [https://jamesturk.github.io/jellyfish/](https://jamesturk.github.io/jellyfish/) 8 | 9 | Issues: [https://github.com/jamesturk/jellyfish/issues](https://github.com/jamesturk/jellyfish/issues) 10 | 11 | [![PyPI badge](https://badge.fury.io/py/jellyfish.svg)](https://badge.fury.io/py/jellyfish) 12 | [![Test badge](https://github.com/jamesturk/jellyfish/workflows/Python%20package/badge.svg)](https://github.com/jamesturk/jellyfish/actions?query=workflow%3A%22Python+package) 13 | [![Coveralls](https://coveralls.io/repos/jamesturk/jellyfish/badge.png?branch=master)](https://coveralls.io/r/jamesturk/jellyfish) 14 | 15 | ## Included Algorithms 16 | 17 | String comparison: 18 | 19 | * Levenshtein Distance 20 | * Damerau-Levenshtein Distance 21 | * Jaccard Similarity 22 | * Jaro Similarity 23 | * Jaro-Winkler Similarity 24 | * Match Rating Approach Comparison 25 | * Hamming Distance 26 | 27 | Phonetic encoding: 28 | 29 | * American Soundex 30 | * Metaphone 31 | * NYSIIS (New York State Identification and Intelligence System) 32 | * Match Rating Codex 33 | 34 | ## Implementations 35 | 36 | Each algorithm has Rust and Python implementations. 37 | 38 | The Rust implementations are used by default. 
The Python 39 | implementations are a remnant of an early version of 40 | the library and will probably be removed in 1.0. 41 | 42 | To explicitly use a specific implementation, refer to the appropriate module: 43 | 44 | ``` python 45 | import jellyfish._jellyfish as pyjellyfish 46 | import jellyfish._rustyfish as rustyfish 47 | ``` 48 | 49 | If you've already imported jellyfish and are not sure what implementation you 50 | are using, you can check by querying `jellyfish.library`. 51 | 52 | ``` python 53 | if jellyfish.library == 'Python': 54 | # Python implementation 55 | elif jellyfish.library == 'Rust': 56 | # Rust implementation 57 | ``` 58 | 59 | ## Example Usage 60 | 61 | ``` python 62 | >>> import jellyfish 63 | >>> jellyfish.levenshtein_distance('jellyfish', 'smellyfish') 64 | 2 65 | >>> jellyfish.jaro_similarity('jellyfish', 'smellyfish') 66 | 0.89629629629629637 67 | >>> jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs') 68 | 1 69 | 70 | >>> jellyfish.metaphone('Jellyfish') 71 | 'JLFX' 72 | >>> jellyfish.soundex('Jellyfish') 73 | 'J412' 74 | >>> jellyfish.nysiis('Jellyfish') 75 | 'JALYF' 76 | >>> jellyfish.match_rating_codex('Jellyfish') 77 | 'JLLFSH' 78 | ``` 79 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: jellyfish 2 | site_url: https://jamesturk.github.io/jellyfish/ 3 | site_author: James Turk 4 | site_description: A python library for approximate and phonetic matching of strings. 
5 | copyright: Copyright © 2011 James Turk 6 | repo_url: https://github.com/jamesturk/jellyfish 7 | repo_name: jamesturk/jellyfish 8 | edit_uri: edit/main/docs/ 9 | 10 | theme: 11 | logo: assets/white-jellyfish.svg 12 | name: material 13 | palette: 14 | - scheme: default 15 | primary: teal 16 | accent: teal 17 | toggle: 18 | icon: material/toggle-switch-off-outline 19 | name: Switch to dark mode 20 | - scheme: slate 21 | primary: teal 22 | accent: teal 23 | toggle: 24 | icon: material/toggle-switch 25 | name: Switch to light mode 26 | 27 | features: 28 | #- navigation.tabs 29 | - navigation.sections 30 | - navigation.top 31 | - content.tabs.link 32 | icon: 33 | repo: fontawesome/brands/github 34 | markdown_extensions: 35 | - admonition 36 | - def_list 37 | - pymdownx.highlight 38 | - pymdownx.tabbed 39 | - pymdownx.superfences 40 | - toc: 41 | permalink: true 42 | plugins: 43 | - search 44 | 45 | extra_css: 46 | - assets/extra.css 47 | nav: 48 | - 'index.md' 49 | - 'functions.md' 50 | - 'changelog.md' 51 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["maturin>=0.14,<2"] 3 | build-backend = "maturin" 4 | 5 | [project] 6 | name = "jellyfish" 7 | dynamic = ["version"] 8 | requires-python = ">=3.9" 9 | classifiers = [ 10 | "Programming Language :: Rust", 11 | "Programming Language :: Python :: Implementation :: CPython", 12 | "Programming Language :: Python :: Implementation :: PyPy", 13 | "License :: OSI Approved :: MIT License", 14 | "Operating System :: OS Independent", 15 | "Development Status :: 5 - Production/Stable", 16 | "Intended Audience :: Developers", 17 | ] 18 | 19 | [project.urls] 20 | homepage = "https://jamesturk.github.io/jellyfish/" 21 | documentation = "https://jamesturk.github.io/jellyfish/" 22 | repository = "https://github.com/jamesturk/jellyfish/" 23 | 24 | 25 | [tool.maturin] 26 
| features = ["pyo3/extension-module", "python"] 27 | python-source = "python" 28 | module-name = "jellyfish._rustyfish" 29 | -------------------------------------------------------------------------------- /python/jellyfish/__init__.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | from ._rustyfish import * 4 | from . import _jellyfish 5 | -------------------------------------------------------------------------------- /python/jellyfish/__init__.pyi: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | def levenshtein_distance(s1: str, s2: str) -> int: ... 4 | def jaccard_similarity(s1: str, s2: str, ngram_size: Optional[int] = None) -> float: ... 5 | def jaro_similarity(s1: str, s2: str) -> float: ... 6 | def jaro_winkler_similarity(s1: str, s2: str, long_tolerance: bool = ...) -> float: ... 7 | def damerau_levenshtein_distance(s1: str, s2: str) -> int: ... 8 | def soundex(s: str) -> str: ... 9 | def hamming_distance(s1: str, s2: str) -> int: ... 10 | def nysiis(s: str) -> str: ... 11 | def match_rating_codex(s: str) -> str: ... 12 | def match_rating_comparison(s1: str, s2: str) -> bool: ... 13 | def metaphone(s: str) -> str: ... 
# --------------------------------------------------------------------------------
# /python/jellyfish/_jellyfish.py:
# --------------------------------------------------------------------------------
"""Pure-Python implementations of the jellyfish string-comparison and
phonetic-encoding functions.

Every public function validates its arguments through ``_check_type``, which
also emits a ``DeprecationWarning`` on each call.
"""
import unicodedata
from collections import defaultdict
from itertools import zip_longest
import warnings


def _normalize(s):
    """Return ``s`` in Unicode NFKD (compatibility-decomposed) form."""
    return unicodedata.normalize("NFKD", s)


def _check_type(s):
    """Warn that this module is deprecated, then require ``s`` to be a ``str``.

    Raises:
        TypeError: if ``s`` is not a ``str``.
    """
    # warn here since each function will call this
    warnings.warn(
        "The jellyfish._jellyfish module is deprecated and will be removed in jellyfish 1.0.",
        DeprecationWarning,
    )
    if not isinstance(s, str):
        raise TypeError("expected str or unicode, got %s" % type(s).__name__)


def levenshtein_distance(s1, s2):
    """Return the Levenshtein edit distance between ``s1`` and ``s2``.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions turning one string into the other.

    Raises:
        TypeError: if either argument is not a ``str``.
    """
    _check_type(s1)
    _check_type(s2)

    if s1 == s2:
        return 0
    # With one side empty the distance is simply the other side's length.
    if not s1:
        return len(s2)
    if not s2:
        return len(s1)

    cols = len(s2) + 1

    # Only two rows of the DP table are kept alive: the previous and the
    # current one.
    cur = list(range(cols))
    for r, ch1 in enumerate(s1, start=1):
        prev, cur = cur, [r] + [0] * (cols - 1)
        for c in range(1, cols):
            deletion = prev[c] + 1
            insertion = cur[c - 1] + 1
            edit = prev[c - 1] + (0 if ch1 == s2[c - 1] else 1)
            cur[c] = min(edit, deletion, insertion)

    return cur[-1]


def _jaro_winkler(s1, s2, long_tolerance, winklerize):
    """Shared implementation of the Jaro and Jaro-Winkler similarities.

    Args:
        s1, s2: strings to compare.
        long_tolerance: when true (and ``winklerize`` is true), apply the
            extra adjustment for long strings.
        winklerize: when true, boost the score for a shared prefix of up to
            four characters.

    Returns:
        A float; ``0.0`` when either string is empty or no characters match.

    Raises:
        TypeError: if either string argument is not a ``str``.
    """
    _check_type(s1)
    _check_type(s2)

    s1_len = len(s1)
    s2_len = len(s2)

    if not s1_len or not s2_len:
        return 0.0

    min_len = min(s1_len, s2_len)
    # Matching window: half the longer length, minus one, clamped at zero.
    search_range = max((max(s1_len, s2_len) // 2) - 1, 0)

    s1_flags = [False] * s1_len
    s2_flags = [False] * s2_len

    # looking only within search range, count & flag matched pairs
    common_chars = 0
    for i, s1_ch in enumerate(s1):
        low = max(0, i - search_range)
        hi = min(i + search_range, s2_len - 1)
        for j in range(low, hi + 1):
            if not s2_flags[j] and s2[j] == s1_ch:
                s1_flags[i] = s2_flags[j] = True
                common_chars += 1
                break

    # short circuit if no characters match
    if not common_chars:
        return 0.0

    # count transpositions: walk the flagged characters of both strings in
    # order and count the positions where they disagree
    k = trans_count = 0
    for i, s1_f in enumerate(s1_flags):
        if s1_f:
            for j in range(k, s2_len):
                if s2_flags[j]:
                    k = j + 1
                    break
            if s1[i] != s2[j]:
                trans_count += 1
    trans_count //= 2

    # adjust for similarities in nonmatched characters
    common_chars = float(common_chars)
    weight = (
        common_chars / s1_len
        + common_chars / s2_len
        + (common_chars - trans_count) / common_chars
    ) / 3

    # winkler modification: continue to boost if strings are similar
    if winklerize and weight > 0.7:
        # adjust for up to first 4 chars in common
        j = min(min_len, 4)
        i = 0
        while i < j and s1[i] == s2[i]:
            i += 1
        if i:
            weight += i * 0.1 * (1.0 - weight)

        # optionally adjust for long strings
        # after agreeing beginning chars, at least two or more must agree and
        # agreed characters must be > half of remaining characters
        if (
            long_tolerance
            and min_len > 4
            and common_chars > i + 1
            and 2 * common_chars >= min_len + i
        ):
            weight += (1.0 - weight) * (
                float(common_chars - i - 1) / float(s1_len + s2_len - i * 2 + 2)
            )

    return weight


def jaro_similarity(s1, s2):
    """Return the Jaro similarity of ``s1`` and ``s2`` (no Winkler boost)."""
    return _jaro_winkler(s1, s2, False, False)  # noqa


def jaro_winkler_similarity(s1, s2, long_tolerance=False):
    """Return the Jaro-Winkler similarity of ``s1`` and ``s2``.

    Args:
        long_tolerance: forwarded to ``_jaro_winkler``; enables the extra
            adjustment for long strings.
    """
    return _jaro_winkler(s1, s2, long_tolerance, True)  # noqa


def damerau_levenshtein_distance(s1, s2):
    """Return the Damerau-Levenshtein distance between ``s1`` and ``s2``.

    Like Levenshtein distance, but a transposition of characters is also
    considered as a single candidate operation.

    Raises:
        TypeError: if either argument is not a ``str``.
    """
    _check_type(s1)
    _check_type(s2)

    len1 = len(s1)
    len2 = len(s2)
    # Larger than any reachable distance; used as a border sentinel.
    infinite = len1 + len2

    # character array: last row (1-based) in which each character of s1 was
    # seen; defaults to 0 for characters not seen yet
    da = defaultdict(int)

    # distance matrix, with an extra sentinel row and column
    score = [[0] * (len2 + 2) for _ in range(len1 + 2)]

    score[0][0] = infinite
    for i in range(0, len1 + 1):
        score[i + 1][0] = infinite
        score[i + 1][1] = i
    for i in range(0, len2 + 1):
        score[0][i + 1] = infinite
        score[1][i + 1] = i

    for i in range(1, len1 + 1):
        # last column (1-based) in this row where the characters matched
        db = 0
        for j in range(1, len2 + 1):
            i1 = da[s2[j - 1]]
            j1 = db
            cost = 1
            if s1[i - 1] == s2[j - 1]:
                cost = 0
                db = j

            score[i + 1][j + 1] = min(
                score[i][j] + cost,  # substitution (or match)
                score[i + 1][j] + 1,  # insertion
                score[i][j + 1] + 1,  # deletion
                score[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1),  # transposition
            )
        da[s1[i - 1]] = i

    return score[len1 + 1][len2 + 1]


def soundex(s):
    """Return the American Soundex code of ``s``.

    The input is NFKD-normalized and upper-cased. The result is the first
    character followed by up to three digits, right-padded with ``"0"`` to
    four characters. An empty input yields ``""``.

    Raises:
        TypeError: if ``s`` is not a ``str``.
    """
    _check_type(s)

    if not s:
        return ""

    s = _normalize(s)
    s = s.upper()

    replacements = (
        ("BFPV", "1"),
        ("CGJKQSXZ", "2"),
        ("DT", "3"),
        ("L", "4"),
        ("MN", "5"),
        ("R", "6"),
    )
    result = [s[0]]
    count = 1

    # find would-be replacement for first character
    for lset, sub in replacements:
        if s[0] in lset:
            last = sub
            break
    else:
        last = None

    for letter in s[1:]:
        for lset, sub in replacements:
            if letter in lset:
                # only emit a digit when it differs from the previous code
                if sub != last:
                    result.append(sub)
                    count += 1
                last = sub
                break
        else:
            if letter != "H" and letter != "W":
                # leave last alone if middle letter is H or W
                last = None
        if count == 4:
            break

    # pad with zeros up to four characters
    return "".join(result) + "0" * (4 - count)


def hamming_distance(s1, s2):
    """Return the Hamming distance between ``s1`` and ``s2``.

    Strings of different length are allowed: every extra character of the
    longer string counts as one difference.

    Raises:
        TypeError: if either argument is not a ``str``.
    """
    _check_type(s1)
    _check_type(s2)

    # ensure length of s1 >= s2
    if len(s2) > len(s1):
        s1, s2 = s2, s1

    # distance is difference in length + differing chars
    distance = len(s1) - len(s2)
    distance += sum(1 for short_ch, long_ch in zip(s2, s1) if short_ch != long_ch)

    return distance


def nysiis(s):
    """Return the NYSIIS phonetic code of ``s``.

    The input is upper-cased before encoding. An empty input yields ``""``.

    Raises:
        TypeError: if ``s`` is not a ``str``.
    """
    _check_type(s)

    if not s:
        return ""

    s = s.upper()
    key = []

    # step 1 - prefixes
    if s.startswith("MAC"):
        s = "MCC" + s[3:]
    elif s.startswith("KN"):
        s = s[1:]
    elif s.startswith("K"):
        s = "C" + s[1:]
    elif s.startswith(("PH", "PF")):
        s = "FF" + s[2:]
    elif s.startswith("SCH"):
        s = "SSS" + s[3:]

    # step 2 - suffixes
    if s.endswith(("IE", "EE")):
        s = s[:-2] + "Y"
    elif s.endswith(("DT", "RT", "RD", "NT", "ND")):
        s = s[:-2] + "D"

    # step 3 - first character of key comes from name
    key.append(s[0])

    # step 4 - translate remaining chars
    i = 1
    len_s = len(s)
    while i < len_s:
        ch = s[i]
        if ch == "E" and i + 1 < len_s and s[i + 1] == "V":
            ch = "AF"
            i += 1
        elif ch in "AEIOU":
            ch = "A"
        elif ch == "Q":
            ch = "G"
        elif ch == "Z":
            ch = "S"
        elif ch == "M":
            ch = "N"
        elif ch == "K":
            if i + 1 < len_s and s[i + 1] == "N":
                ch = "N"
            else:
                ch = "C"
        elif ch == "S" and s[i + 1 : i + 3] == "CH":
            ch = "SS"
            i += 2
        elif ch == "P" and i + 1 < len_s and s[i + 1] == "H":
            ch = "F"
            i += 1
        elif ch == "H" and (
            s[i - 1] not in "AEIOU"
            or (i + 1 < len_s and s[i + 1] not in "AEIOU")
            or (i + 1 == len_s)
        ):
            if s[i - 1] in "AEIOU":
                ch = "A"
            else:
                ch = s[i - 1]
        elif ch == "W" and s[i - 1] in "AEIOU":
            ch = s[i - 1]

        # append only when it does not repeat the last character of the key
        if ch[-1] != key[-1][-1]:
            key.append(ch)

        i += 1

    key = "".join(key)

    # step 5 - remove trailing S
    if key.endswith("S") and key != "S":
        key = key[:-1]

    # step 6 - replace AY w/ Y
    if key.endswith("AY"):
        key = key[:-2] + "Y"

    # step 7 - remove trailing A
    if key.endswith("A") and key != "A":
        key = key[:-1]

    # step 8 was already done

    return key


def match_rating_codex(s):
    """Return the match rating approach codex of ``s``.

    Spaces are ignored and the input is upper-cased. The codex keeps the
    first character plus every later consonant that does not repeat the
    character immediately before it; when more than six characters remain,
    only the first three and last three are kept.

    Raises:
        TypeError: if ``s`` is not a ``str``.
        ValueError: if, after removing spaces, ``s`` is not purely alphabetic
            (this includes the empty string).
    """
    _check_type(s)

    # we ignore spaces
    s = s.upper().replace(" ", "")
    # any remaining non-alphabetic characters are invalid
    if not s.isalpha():
        raise ValueError("string must be alphabetic")

    codex = []

    prev = None
    first = True
    for c in s:
        # starting character
        # or consonant not preceded by same consonant
        if first or (c not in "AEIOU" and c != prev):
            codex.append(c)

        prev = c
        first = False

    # just use first/last 3
    if len(codex) > 6:
        return "".join(codex[:3] + codex[-3:])
    return "".join(codex)


def match_rating_comparison(s1, s2):
    """Compare ``s1`` and ``s2`` with the match rating approach.

    Returns:
        ``True`` or ``False`` depending on whether the similarity rating
        reaches the minimum required for the combined codex length, or
        ``None`` when the two codex lengths differ by 3 or more.

    Raises:
        TypeError, ValueError: propagated from ``match_rating_codex``.
    """
    codex1 = match_rating_codex(s1)
    codex2 = match_rating_codex(s2)
    len1 = len(codex1)
    len2 = len(codex2)
    res1 = []
    res2 = []

    # length differs by 3 or more, no result
    if abs(len1 - len2) >= 3:
        return None

    # get minimum rating based on sums of codexes
    lensum = len1 + len2
    if lensum <= 4:
        min_rating = 5
    elif lensum <= 7:
        min_rating = 4
    elif lensum <= 11:
        min_rating = 3
    else:
        min_rating = 2

    # left-to-right pass: drop characters that are equal at the same
    # position in both codexes, keep everything else
    for c1, c2 in zip_longest(codex1, codex2):
        if c1 != c2:
            if c1:
                res1.append(c1)
            if c2:
                res2.append(c2)

    # right-to-left pass over the leftovers: count what still does not match
    unmatched_count1 = unmatched_count2 = 0
    for c1, c2 in zip_longest(reversed(res1), reversed(res2)):
        if c1 != c2:
            if c1:
                unmatched_count1 += 1
            if c2:
                unmatched_count2 += 1

    return (6 - max(unmatched_count1, unmatched_count2)) >= min_rating


def metaphone(s):
    """Return the Metaphone code of ``s``.

    The input is lower-cased and NFKD-normalized before encoding; the result
    is upper-cased.

    Raises:
        TypeError: if ``s`` is not a ``str``.
    """
    _check_type(s)

    result = []

    s = _normalize(s.lower())

    # skip first character if s starts with these
    if s.startswith(("kn", "gn", "pn", "wr", "ae")):
        s = s[1:]

    i = 0

    while i < len(s):
        c = s[i]
        # Look-ahead characters; past the end they hold the sentinel "*****".
        # NOTE(review): the sentinel is a non-empty string, so the plain
        # truthiness tests on nxt / nxt2 below are always true. Kept as-is to
        # preserve existing output.
        nxt = s[i + 1] if i < len(s) - 1 else "*****"
        nxt2 = s[i + 2] if i < len(s) - 2 else "*****"

        # skip doubles except for cc
        if c == nxt and c != "c":
            i += 1
            continue

        if c in "aeiou":
            # vowels are kept only at the start of the string or after a space
            if i == 0 or s[i - 1] == " ":
                result.append(c)
        elif c == "b":
            if (not (i != 0 and s[i - 1] == "m")) or nxt:
                result.append("b")
        elif c == "c":
            if nxt == "i" and nxt2 == "a" or nxt == "h":
                result.append("x")
                i += 1
            elif nxt in "iey":
                result.append("s")
                i += 1
            else:
                result.append("k")
        elif c == "d":
            if nxt == "g" and nxt2 in "iey":
                result.append("j")
                i += 2
            else:
                result.append("t")
        elif c in "fjlmnr":
            result.append(c)
        elif c == "g":
            if nxt in "iey":
                result.append("j")
            elif nxt == "h" and nxt2 and nxt2 not in "aeiou":
                i += 1
            elif nxt == "n" and not nxt2:
                i += 1
            else:
                result.append("k")
        elif c == "h":
            if i == 0 or nxt in "aeiou" or s[i - 1] not in "aeiou":
                result.append("h")
        elif c == "k":
            if i == 0 or s[i - 1] != "c":
                result.append("k")
        elif c == "p":
            if nxt == "h":
                result.append("f")
                i += 1
            else:
                result.append("p")
        elif c == "q":
            result.append("k")
        elif c == "s":
            if nxt == "h":
                result.append("x")
                i += 1
            elif nxt == "i" and nxt2 in "oa":
                result.append("x")
                i += 2
            else:
                result.append("s")
        elif c == "t":
            if nxt == "i" and nxt2 in "oa":
                result.append("x")
            elif nxt == "h":
                result.append("0")
                i += 1
            elif nxt != "c" or nxt2 != "h":
                result.append("t")
        elif c == "v":
            result.append("f")
        elif c == "w":
            if i == 0 and nxt == "h":
                i += 1
                result.append("w")
            elif nxt in "aeiou":
                result.append("w")
        elif c == "x":
            if i == 0:
                if nxt == "h" or (nxt == "i" and nxt2 in "oa"):
                    result.append("x")
                else:
                    result.append("s")
            else:
                result.append("k")
                result.append("s")
        elif c == "y":
            if nxt in "aeiou":
                result.append("y")
        elif c == "z":
            result.append("s")
        elif c == " ":
            # collapse runs of spaces and never start the code with one
            if len(result) > 0 and result[-1] != " ":
                result.append(" ")

        i += 1

    return "".join(result).upper()


# Remaining entries of the repository dump that shared these lines, retained
# verbatim as comments:
# --------------------------------------------------------------------------------
# /python/jellyfish/py.typed:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/jamesturk/jellyfish/846fae4b210db8ff4ab9dfaed7e2ec9f372728a7/python/jellyfish/py.typed
# --------------------------------------------------------------------------------
# /run-cov.sh:
# --------------------------------------------------------------------------------
# 1 | #!/bin/sh
# 2 |
# 3 | export PYTHONPATH=.;
# 4 | pip install -e .
5 | py.test jellyfish/test.py --cov jellyfish --cov-report html 6 | -------------------------------------------------------------------------------- /src/common.rs: -------------------------------------------------------------------------------- 1 | use smallvec::SmallVec; 2 | // most strings are short, so we can use a fixed-size array 3 | const VEC_SIZE: usize = 32; 4 | 5 | pub type FastVec = SmallVec<[T; VEC_SIZE]>; 6 | -------------------------------------------------------------------------------- /src/hamming.rs: -------------------------------------------------------------------------------- 1 | use crate::common::FastVec; 2 | use unicode_segmentation::UnicodeSegmentation; 3 | 4 | pub fn vec_hamming_distance(s1: &FastVec, s2: &FastVec) -> usize { 5 | let (longer, shorter) = if s1.len() > s2.len() { 6 | (s1, s2) 7 | } else { 8 | (s2, s1) 9 | }; 10 | 11 | // distance is difference in length + differing chars 12 | let mut distance = longer.len() - shorter.len(); 13 | for (i, c) in shorter.iter().enumerate() { 14 | if *c != longer[i] { 15 | distance += 1 16 | } 17 | } 18 | 19 | distance 20 | } 21 | 22 | pub fn hamming_distance(s1: &str, s2: &str) -> usize { 23 | let us1 = UnicodeSegmentation::graphemes(s1, true).collect::>(); 24 | let us2 = UnicodeSegmentation::graphemes(s2, true).collect::>(); 25 | 26 | vec_hamming_distance(&us1, &us2) 27 | } 28 | 29 | #[cfg(test)] 30 | mod test { 31 | use super::*; 32 | use crate::testutils::testutils; 33 | #[test] 34 | fn test_hamming() { 35 | testutils::test_distance_func("testdata/hamming.csv", hamming_distance); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/jaccard.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Cow; 2 | use std::collections::HashSet; 3 | 4 | pub fn jaccard_similarity(s1: &str, s2: &str, ngram_size: Option) -> f64 { 5 | // 1. 
Tokenize into ngrams 6 | let grams1: HashSet = get_ngrams(s1, ngram_size) 7 | .into_iter() 8 | .map(|cow| cow.into_owned()) 9 | .collect(); 10 | let grams2: HashSet = get_ngrams(s2, ngram_size) 11 | .into_iter() 12 | .map(|cow| cow.into_owned()) 13 | .collect(); 14 | 15 | // 2. Calculate intersection and union sizes 16 | let intersection_size: usize = grams1.iter().filter(|gram| grams2.contains(*gram)).count(); 17 | let union_size: usize = grams1.len() + grams2.len() - intersection_size; 18 | 19 | // 3. Calculate Jaccard index 20 | if union_size == 0 { 21 | 0.0 22 | } else { 23 | intersection_size as f64 / union_size as f64 24 | } 25 | } 26 | 27 | fn get_ngrams(s: &str, n: Option) -> Vec> { 28 | if let Some(size) = n { 29 | // Non-overlapping character-level n-grams 30 | s.chars() 31 | .collect::>() 32 | .chunks(size) // Use chunks() for non-overlapping groups 33 | .map(|chunk| Cow::from(chunk.iter().collect::())) 34 | .collect() 35 | } else { 36 | // Word-level "n-grams" (i.e., words) 37 | s.split_whitespace() 38 | .map(Cow::from) 39 | .collect() 40 | } 41 | } 42 | 43 | 44 | 45 | #[cfg(test)] 46 | mod test { 47 | use super::*; // Import the Jaccard functions 48 | use crate::testutils::testutils; // Import the test utils 49 | 50 | #[test] 51 | fn test_jaccard_similarity() { 52 | testutils::test_similarity_func_three_args("testdata/jaccard.csv", jaccard_similarity); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/jaro.rs: -------------------------------------------------------------------------------- 1 | use crate::common::FastVec; 2 | use smallvec::smallvec; 3 | use std::cmp; 4 | use unicode_segmentation::UnicodeSegmentation; 5 | 6 | enum JaroVersion { 7 | Pure, 8 | Winkler, 9 | WinklerLongTolerance, 10 | } 11 | 12 | fn vec_jaro_or_winkler( 13 | s1: &FastVec, 14 | s2: &FastVec, 15 | version: JaroVersion, 16 | ) -> f64 { 17 | let s1_len = s1.len(); 18 | let s2_len = s2.len(); 19 | 20 | if s1_len == 0 || 
s2_len == 0 { 21 | return 0.0; 22 | } 23 | 24 | let min_len = cmp::min(s1_len, s2_len); 25 | let mut search_range = cmp::max(s1_len, s2_len); 26 | search_range = (search_range / 2).saturating_sub(1); 27 | 28 | let mut s1_flags: FastVec = smallvec![false; s1_len]; 29 | let mut s2_flags: FastVec = smallvec![false; s2_len]; 30 | let mut common_chars = 0; 31 | 32 | // looking only within search range, count & flag matched pairs 33 | for (i, s1_ch) in s1.iter().enumerate() { 34 | // avoid underflow on i - search_range 35 | let low = i.saturating_sub(search_range); 36 | let hi = cmp::min(i + search_range, s2_len - 1); 37 | for j in low..hi + 1 { 38 | if !s2_flags[j] && s2[j] == *s1_ch { 39 | s1_flags[i] = true; 40 | s2_flags[j] = true; 41 | common_chars += 1; 42 | break; 43 | } 44 | } 45 | } 46 | 47 | // no characters match 48 | if common_chars == 0 { 49 | return 0.0; 50 | } 51 | 52 | // count transpositions 53 | let mut k = 0; 54 | let mut trans_count = 0; 55 | for (i, s1_f) in s1_flags.iter().enumerate() { 56 | if *s1_f { 57 | let mut j = k; 58 | while j < s2_len { 59 | if s2_flags[j] { 60 | k = j + 1; 61 | break; 62 | } 63 | j += 1; 64 | } 65 | if s1[i] != s2[j] { 66 | trans_count += 1 67 | } 68 | } 69 | } 70 | // need to do floor division then cast to float 71 | let trans_count = (trans_count / 2) as f64; 72 | let common_charsf = common_chars as f64; 73 | let s1_lenf = s1_len as f64; 74 | let s2_lenf = s2_len as f64; 75 | 76 | // adjust for similarities in nonmatched characters 77 | let mut weight = (common_charsf / s1_lenf 78 | + common_charsf / s2_lenf 79 | + (common_charsf - trans_count) / common_charsf) 80 | / 3.0; 81 | 82 | // check which version to run 83 | let (winklerize, long_tolerance) = match version { 84 | JaroVersion::Pure => (false, false), 85 | JaroVersion::Winkler => (true, false), 86 | JaroVersion::WinklerLongTolerance => (true, true), 87 | }; 88 | 89 | // winkler modification: continue to boost similar strings 90 | if winklerize && weight > 0.7 { 91 
| let mut i = 0; 92 | let j = cmp::min(min_len, 4); 93 | while i < j && s1[i] == s2[i] { 94 | // TODO: also had s1[i] in Python, necessary? 95 | i += 1; 96 | } 97 | let fi = i as f64; 98 | if i > 0 { 99 | weight += fi * 0.1 * (1.0 - weight); 100 | } 101 | 102 | // optional adjustment for long strings 103 | // after agreeing beginning items, at least two or more must agree 104 | // and agreed items must be more than half of remaining items 105 | if long_tolerance && min_len > 4 && common_chars > i + 1 && 2 * common_chars >= min_len + i 106 | { 107 | weight += 108 | (1.0 - weight) * (common_charsf - fi - 1.0) / (s1_lenf + s2_lenf - fi * 2.0 + 2.0); 109 | } 110 | } 111 | 112 | weight 113 | } 114 | 115 | pub fn vec_jaro_similarity(s1: &FastVec, s2: &FastVec) -> f64 { 116 | vec_jaro_or_winkler(s1, s2, JaroVersion::Pure) 117 | } 118 | 119 | pub fn vec_jaro_winkler_similarity(s1: &FastVec, s2: &FastVec) -> f64 { 120 | vec_jaro_or_winkler(s1, s2, JaroVersion::Winkler) 121 | } 122 | 123 | pub fn vec_jaro_winkler_similarity_longtol(s1: &FastVec, s2: &FastVec) -> f64 { 124 | vec_jaro_or_winkler(s1, s2, JaroVersion::WinklerLongTolerance) 125 | } 126 | 127 | pub fn jaro_similarity(s1: &str, s2: &str) -> f64 { 128 | let us1 = UnicodeSegmentation::graphemes(s1, true).collect::>(); 129 | let us2 = UnicodeSegmentation::graphemes(s2, true).collect::>(); 130 | vec_jaro_similarity(&us1, &us2) 131 | } 132 | 133 | pub fn jaro_winkler_similarity(s1: &str, s2: &str) -> f64 { 134 | let us1 = UnicodeSegmentation::graphemes(s1, true).collect::>(); 135 | let us2 = UnicodeSegmentation::graphemes(s2, true).collect::>(); 136 | vec_jaro_winkler_similarity(&us1, &us2) 137 | } 138 | 139 | pub fn jaro_winkler_similarity_longtol(s1: &str, s2: &str) -> f64 { 140 | let us1 = UnicodeSegmentation::graphemes(s1, true).collect::>(); 141 | let us2 = UnicodeSegmentation::graphemes(s2, true).collect::>(); 142 | vec_jaro_winkler_similarity_longtol(&us1, &us2) 143 | } 144 | 145 | #[cfg(test)] 146 | mod test { 
147 | use super::*; 148 | use crate::testutils::testutils; 149 | #[test] 150 | fn test_jaro() { 151 | testutils::test_similarity_func_two_args("testdata/jaro_distance.csv", jaro_similarity); 152 | } 153 | 154 | #[test] 155 | fn test_jaro_winkler() { 156 | testutils::test_similarity_func_two_args("testdata/jaro_winkler.csv", jaro_winkler_similarity); 157 | } 158 | 159 | #[test] 160 | fn test_jaro_winkler_longtol() { 161 | testutils::test_similarity_func_two_args( 162 | "testdata/jaro_winkler_longtol.csv", 163 | jaro_winkler_similarity_longtol, 164 | ); 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/levenshtein.rs: -------------------------------------------------------------------------------- 1 | use crate::common::FastVec; 2 | use ahash::AHashMap; 3 | use smallvec::smallvec; 4 | use std::cmp; 5 | use unicode_segmentation::UnicodeSegmentation; 6 | 7 | fn range_vec(size: usize) -> FastVec { 8 | let mut vec = FastVec::new(); 9 | let mut p: usize = 0; 10 | vec.resize_with(size, || { 11 | p += 1; 12 | p - 1 13 | }); 14 | vec 15 | } 16 | 17 | pub fn vec_levenshtein_distance(v1: &FastVec, v2: &FastVec) -> usize { 18 | let rows = v1.len() + 1; 19 | let cols = v2.len() + 1; 20 | 21 | if rows == 1 { 22 | return cols - 1; 23 | } else if cols == 1 { 24 | return rows - 1; 25 | } 26 | 27 | let mut cur = range_vec(cols); 28 | 29 | for r in 1..rows { 30 | // make a copy of the previous row so we can edit cur 31 | let prev = cur.clone(); 32 | cur = smallvec![0; cols]; 33 | cur[0] = r; 34 | for c in 1..cols { 35 | // deletion cost or insertion cost 36 | let del_or_ins = cmp::min(prev[c] + 1, cur[c - 1] + 1); 37 | let edit = prev[c - 1] + (if v1[r - 1] == v2[c - 1] { 0 } else { 1 }); 38 | cur[c] = cmp::min(del_or_ins, edit); 39 | } 40 | } 41 | 42 | // last element of bottom row 43 | cur[cols - 1] 44 | } 45 | 46 | pub fn vec_damerau_levenshtein_distance( 47 | v1: &FastVec, 48 | v2: &FastVec, 49 | ) -> usize { 50 | let len1 
= v1.len(); 51 | let len2 = v2.len(); 52 | let infinite = len1 + len2; 53 | 54 | let mut item_position = AHashMap::with_capacity(cmp::max(len1, len2)); 55 | // distance matrix 56 | // try using a flat array instead of a 2d vec for speed 57 | let mut score: Vec = vec![0; (len1 + 2) * (len2 + 2)]; 58 | let idx = |i: usize, j: usize| (len2 + 2) * i + j; 59 | //let mut score: FastVec> = smallvec![smallvec![0; len2 + 2]; len1 + 2]; 60 | 61 | score[0] = infinite; 62 | for i in 0..=len1 { 63 | score[idx(i + 1, 0)] = infinite; 64 | score[idx(i + 1, 1)] = i; 65 | } 66 | for i in 0..=len2 { 67 | score[idx(0, i + 1)] = infinite; 68 | score[idx(1, i + 1)] = i; 69 | } 70 | 71 | for i in 1..len1 + 1 { 72 | let mut db = 0; 73 | for j in 1..len2 + 1 { 74 | let i1 = item_position.entry(&v2[j - 1]).or_insert(0); 75 | let j1 = db; 76 | let mut cost = 1; 77 | if v1[i - 1] == v2[j - 1] { 78 | cost = 0; 79 | db = j; 80 | } 81 | 82 | // min of the four options 83 | score[idx(i + 1, j + 1)] = cmp::min( 84 | // substitution & insertion 85 | cmp::min(score[idx(i, j)] + cost, score[idx(i + 1, j)] + 1), 86 | cmp::min( 87 | // deletion & transposition 88 | score[idx(i, j + 1)] + 1, 89 | score[idx(*i1, j1)] + (i - *i1 - 1) + 1 + (j - j1 - 1), 90 | ), 91 | ) 92 | } 93 | // store the position of this character for transpositions 94 | item_position.insert(&v1[i - 1], i); 95 | } 96 | 97 | score[idx(len1 + 1, len2 + 1)] 98 | } 99 | 100 | pub fn levenshtein_distance(s1: &str, s2: &str) -> usize { 101 | if s1 == s2 { 102 | return 0; 103 | } 104 | 105 | let us1 = UnicodeSegmentation::graphemes(s1, true).collect::>(); 106 | let us2 = UnicodeSegmentation::graphemes(s2, true).collect::>(); 107 | 108 | vec_levenshtein_distance(&us1, &us2) 109 | } 110 | 111 | pub fn damerau_levenshtein_distance(s1: &str, s2: &str) -> usize { 112 | if s1 == s2 { 113 | return 0; 114 | } 115 | 116 | let us1 = UnicodeSegmentation::graphemes(s1, true).collect::>(); 117 | let us2 = UnicodeSegmentation::graphemes(s2, 
true).collect::>(); 118 | 119 | vec_damerau_levenshtein_distance(&us1, &us2) 120 | } 121 | 122 | #[cfg(test)] 123 | mod test { 124 | use super::*; 125 | use crate::testutils::testutils; 126 | #[test] 127 | fn test_levenshtein() { 128 | testutils::test_distance_func("testdata/levenshtein.csv", levenshtein_distance); 129 | } 130 | 131 | #[test] 132 | fn test_damerau_levenshtein() { 133 | testutils::test_distance_func( 134 | "testdata/damerau_levenshtein.csv", 135 | damerau_levenshtein_distance, 136 | ); 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | mod common; 2 | mod hamming; 3 | mod jaccard; 4 | mod jaro; 5 | mod levenshtein; 6 | mod match_rating; 7 | mod metaphone; 8 | mod nysiis; 9 | mod soundex; 10 | mod testutils; 11 | 12 | pub use hamming::{hamming_distance, vec_hamming_distance}; 13 | pub use jaccard::jaccard_similarity; 14 | pub use jaro::{ 15 | jaro_similarity, jaro_winkler_similarity, jaro_winkler_similarity_longtol, vec_jaro_similarity, 16 | vec_jaro_winkler_similarity, vec_jaro_winkler_similarity_longtol, 17 | }; 18 | pub use levenshtein::{ 19 | damerau_levenshtein_distance, levenshtein_distance, vec_damerau_levenshtein_distance, 20 | vec_levenshtein_distance, 21 | }; 22 | pub use match_rating::{match_rating_codex, match_rating_comparison}; 23 | pub use metaphone::metaphone; 24 | pub use nysiis::nysiis; 25 | pub use soundex::soundex; 26 | 27 | #[cfg(feature = "python")] 28 | mod rustyfish; 29 | #[cfg(feature = "python")] 30 | pub use rustyfish::_rustyfish; 31 | -------------------------------------------------------------------------------- /src/match_rating.rs: -------------------------------------------------------------------------------- 1 | use crate::common::FastVec; 2 | use std::cmp; 3 | use unicode_segmentation::UnicodeSegmentation; 4 | 5 | pub fn match_rating_codex(s: &str) -> Result { 6 | // 
match rating only really makes sense on strings 7 | 8 | let s = &s.to_uppercase()[..]; 9 | let v = UnicodeSegmentation::graphemes(s, true).collect::>(); 10 | let mut codex = String::new(); 11 | let mut prev = "~tmp~"; 12 | let is_alpha = s.chars().all(|c| c.is_alphabetic() || c == ' '); 13 | 14 | if !is_alpha { 15 | return Err(String::from("Strings must only contain alphabetical characters")); 16 | } 17 | 18 | for (i, c) in v.iter().enumerate() { 19 | let vowel = *c == "A" || *c == "E" || *c == "I" || *c == "O" || *c == "U"; 20 | // not a space || starting char & vowel || non-double consonant 21 | if *c != " " && (i == 0 && vowel) || (!vowel && *c != prev) { 22 | codex.push_str(c); 23 | } 24 | prev = c; 25 | } 26 | 27 | if codex.len() > 6 { 28 | // not safe to take a slice without conversion to chars() since there 29 | // can be unicode left, this implementation matches the Python one 30 | // even though MRC really shouldn't be used with unicode chars 31 | let first_three: String = codex.chars().take(3).collect(); 32 | let last_three: String = codex.chars().rev().take(3).collect::().chars().rev().collect(); 33 | return Ok(first_three + &last_three); 34 | } 35 | 36 | Ok(codex) 37 | } 38 | 39 | pub fn match_rating_comparison(s1: &str, s2: &str) -> Result { 40 | let codex1 = match_rating_codex(s1)?; 41 | let codex2 = match_rating_codex(s2)?; 42 | 43 | // need to know which is longer for comparisons later 44 | let (longer, shorter) = if codex1.len() > codex2.len() { 45 | (codex1, codex2) 46 | } else { 47 | (codex2, codex1) 48 | }; 49 | 50 | let lensum = longer.len() + shorter.len(); 51 | 52 | // can't do a comparison when difference is 3 or greater 53 | if longer.len() - shorter.len() >= 3 { 54 | return Err(String::from("strings differ in length by more than 2")); 55 | } 56 | 57 | // remove matching characters going forward 58 | let mut res1 = FastVec::new(); 59 | let mut res2 = FastVec::new(); 60 | let mut iter1 = longer.chars(); 61 | let mut iter2 = shorter.chars(); 
62 | loop { 63 | match (iter1.next(), iter2.next()) { 64 | (Some(x), Some(y)) => { 65 | if x != y { 66 | res1.push(x); 67 | res2.push(y) 68 | } 69 | } 70 | (Some(x), None) => res1.push(x), 71 | (None, Some(y)) => res2.push(y), 72 | (None, None) => break, 73 | }; 74 | } 75 | 76 | // count unmatched characters going backwards 77 | let mut unmatched_count1 = 0; 78 | let mut unmatched_count2 = 0; 79 | let mut iter1 = res1.iter().rev(); 80 | let mut iter2 = res2.iter().rev(); 81 | loop { 82 | match (iter1.next(), iter2.next()) { 83 | (Some(x), Some(y)) => { 84 | if x != y { 85 | unmatched_count1 += 1; 86 | unmatched_count2 += 1; 87 | } 88 | } 89 | (Some(_), None) => unmatched_count1 += 1, 90 | (None, Some(_)) => unmatched_count2 += 1, 91 | (None, None) => break, 92 | }; 93 | } 94 | 95 | let score = 6 - cmp::max(unmatched_count1, unmatched_count2); 96 | match lensum { 97 | 0..=4 => Ok(score >= 5), 98 | 5..=7 => Ok(score >= 4), 99 | 8..=11 => Ok(score >= 3), 100 | _ => Ok(score >= 2), 101 | } 102 | } 103 | 104 | #[cfg(test)] 105 | mod test { 106 | use super::*; 107 | use crate::testutils::testutils; 108 | pub fn mrc_unwrapped(s: &str) -> String { 109 | return match_rating_codex(s).unwrap(); 110 | } 111 | 112 | #[test] 113 | fn test_match_rating() { 114 | testutils::test_str_func("testdata/match_rating_codex.csv", mrc_unwrapped); 115 | } 116 | 117 | #[test] 118 | fn test_match_rating_comparison() { 119 | // TODO: switch to using CSV 120 | assert!(match_rating_comparison("Bryne", "Boern").unwrap()); 121 | assert!(match_rating_comparison("Smith", "Smyth").unwrap()); 122 | assert!(match_rating_comparison("Ed", "Ad").unwrap()); 123 | assert!(match_rating_comparison("Catherine", "Kathryn").unwrap()); 124 | assert!(!match_rating_comparison("Michael", "Mike").unwrap()); 125 | } 126 | 127 | #[test] 128 | fn test_match_rating_comparison_err() { 129 | let result = match_rating_comparison("Tim", "Timothy"); 130 | assert_eq!(result.is_err(), true); 131 | } 132 | 133 | #[test] 134 | fn 
test_match_rating_codex_bad_str() { 135 | let result = match_rating_codex("i’m going home"); 136 | assert!(result.is_err()); 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/metaphone.rs: -------------------------------------------------------------------------------- 1 | use crate::common::FastVec; 2 | use unicode_normalization::UnicodeNormalization; 3 | 4 | pub fn isvowel(s: char) -> bool { 5 | matches!(s, 'A' | 'E' | 'I' | 'O' | 'U') 6 | } 7 | 8 | fn is_iey(s: char) -> bool { 9 | matches!(s, 'I' | 'E' | 'Y') 10 | } 11 | 12 | pub fn metaphone(s: &str) -> String { 13 | if s.is_empty() { 14 | return String::from(""); 15 | } 16 | 17 | let s = &s.to_uppercase()[..]; 18 | let mut v = s.nfkd().collect::>(); 19 | let mut ret = FastVec::new(); 20 | 21 | // skip first character if s starts with these 22 | if s.starts_with("KN") 23 | || s.starts_with("GN") 24 | || s.starts_with("PN") 25 | || s.starts_with("WR") 26 | || s.starts_with("AE") 27 | { 28 | v.remove(0); 29 | } 30 | 31 | let mut i = 0; 32 | 33 | while i < v.len() { 34 | let c = v[i]; 35 | let next = if i + 1 < v.len() { v[i + 1] } else { '*' }; 36 | let nextnext = if i + 2 < v.len() { v[i + 2] } else { '*' }; 37 | 38 | // skip doubles except for CC 39 | if c == next && c != 'C' { 40 | i += 1; 41 | continue; 42 | } 43 | 44 | match c { 45 | 'A' | 'E' | 'I' | 'O' | 'U' => { 46 | if i == 0 || v[i - 1] == ' ' { 47 | ret.push(c); 48 | } 49 | } 50 | 'B' => { 51 | if (i == 0 || v[i - 1] != 'M') || next != '*' { 52 | ret.push('B'); 53 | } 54 | } 55 | 'C' => { 56 | if next == 'I' && nextnext == 'A' || next == 'H' { 57 | i += 1; 58 | ret.push('X'); 59 | } else if is_iey(next) { 60 | i += 1; 61 | ret.push('S'); 62 | } else { 63 | ret.push('K'); 64 | } 65 | } 66 | 'D' => { 67 | if next == 'G' && is_iey(nextnext) { 68 | i += 2; 69 | ret.push('J'); 70 | } else { 71 | ret.push('T'); 72 | } 73 | } 74 | 'F' | 'J' | 'L' | 'M' | 'N' | 'R' => { 75 | ret.push(c); 76 | } 77 | 
'G' => { 78 | if is_iey(next) { 79 | ret.push('J'); 80 | } else if (next == 'H' && nextnext != '*' && !isvowel(nextnext)) 81 | || (next == 'N' && nextnext == '*') 82 | { 83 | i += 1; 84 | } else { 85 | ret.push('K'); 86 | } 87 | } 88 | 'H' => { 89 | if i == 0 || isvowel(next) || !isvowel(v[i - 1]) { 90 | ret.push('H'); 91 | } 92 | } 93 | 'K' => { 94 | if i == 0 || v[i - 1] != 'C' { 95 | ret.push('K'); 96 | } 97 | } 98 | 'P' => { 99 | if next == 'H' { 100 | i += 1; 101 | ret.push('F'); 102 | } else { 103 | ret.push('P'); 104 | } 105 | } 106 | 'Q' => { 107 | ret.push('K'); 108 | } 109 | 'S' => { 110 | if next == 'H' { 111 | i += 1; 112 | ret.push('X'); 113 | } else if next == 'I' && (nextnext == 'O' || nextnext == 'A') { 114 | i += 2; 115 | ret.push('X'); 116 | } else { 117 | ret.push('S'); 118 | } 119 | } 120 | 'T' => { 121 | if next == 'I' && (nextnext == 'O' || nextnext == 'A') { 122 | ret.push('X'); 123 | } else if next == 'H' { 124 | i += 1; 125 | ret.push('0'); 126 | } else if next != 'C' || nextnext != 'H' { 127 | ret.push('T'); 128 | } 129 | } 130 | 'V' => { 131 | ret.push('F'); 132 | } 133 | 'W' => { 134 | if i == 0 && next == 'H' { 135 | i += 1; 136 | ret.push('W'); 137 | } else if isvowel(next) { 138 | ret.push('W'); 139 | } 140 | } 141 | 'X' => { 142 | if i == 0 { 143 | if next == 'H' || (next == 'I' && (nextnext == 'O' || nextnext == 'A')) { 144 | ret.push('X'); 145 | } else { 146 | ret.push('S'); 147 | } 148 | } else { 149 | ret.push('K'); 150 | ret.push('S'); 151 | } 152 | } 153 | 'Y' => { 154 | if isvowel(next) { 155 | ret.push('Y'); 156 | } 157 | } 158 | 'Z' => { 159 | ret.push('S'); 160 | } 161 | ' ' => { 162 | if !ret.is_empty() && ret[ret.len() - 1] != ' ' { 163 | ret.push(' '); 164 | } 165 | } 166 | _ => {} 167 | }; 168 | i += 1; 169 | } 170 | 171 | let mut str_key = String::new(); 172 | for k in ret { 173 | str_key.push(k); 174 | } 175 | 176 | str_key 177 | } 178 | 179 | #[cfg(test)] 180 | mod test { 181 | use super::*; 182 | use 
crate::testutils::testutils; 183 | #[test] 184 | fn test_metaphone() { 185 | testutils::test_str_func("testdata/metaphone.csv", metaphone); 186 | } 187 | } 188 | -------------------------------------------------------------------------------- /src/nysiis.rs: -------------------------------------------------------------------------------- 1 | use crate::common::FastVec; 2 | use smallvec::{smallvec, SmallVec}; 3 | use unicode_segmentation::UnicodeSegmentation; 4 | 5 | fn isvowel(s: &str) -> bool { 6 | matches!(s, "A" | "E" | "I" | "O" | "U") 7 | } 8 | 9 | pub fn nysiis(s: &str) -> String { 10 | if s.is_empty() { 11 | return String::from(""); 12 | } 13 | 14 | let s = &s.to_uppercase()[..]; 15 | let mut v = UnicodeSegmentation::graphemes(s, true).collect::>(); 16 | 17 | // step 1: handle prefixes 18 | if s.starts_with("MAC") { 19 | v[1] = "C"; // switch MAC to MCC 20 | } else if s.starts_with("KN") { 21 | v.remove(0); // strip leading K from KN 22 | } else if s.starts_with('K') { 23 | v[0] = "C"; // switch K to C 24 | } else if s.starts_with("PH") || s.starts_with("PF") { 25 | v[0] = "F"; 26 | v[1] = "F"; // switch these to FF 27 | } else if s.starts_with("SCH") { 28 | v[1] = "S"; 29 | v[2] = "S"; // switch SCH to SSS 30 | } 31 | 32 | // step 2: suffixes 33 | if s.ends_with("IE") || s.ends_with("EE") { 34 | v.pop(); 35 | v.pop(); 36 | v.push("Y"); 37 | } else if s.ends_with("DT") 38 | || s.ends_with("RT") 39 | || s.ends_with("RD") 40 | || s.ends_with("NT") 41 | || s.ends_with("ND") 42 | { 43 | v.pop(); 44 | v.pop(); 45 | v.push("D"); 46 | } 47 | 48 | // step 3: key starts with first character of name 49 | let mut key = FastVec::new(); 50 | key.push(v[0]); 51 | 52 | // step 4: translate remaining characters 53 | let mut i = 1; 54 | 55 | while i < v.len() { 56 | let chars: SmallVec<[&str; 3]> = match v[i] { 57 | "E" if i + 1 < v.len() && v[i + 1] == "V" => { 58 | i += 1; 59 | smallvec!["A", "F"] 60 | } 61 | "A" | "E" | "I" | "O" | "U" => smallvec!["A"], 62 | "Q" => 
smallvec!["G"], 63 | "Z" => smallvec!["S"], 64 | "M" => smallvec!["N"], 65 | "K" => { 66 | if i + 1 < v.len() && v[i + 1] == "N" { 67 | smallvec!["N"] 68 | } else { 69 | smallvec!["C"] 70 | } 71 | } 72 | "S" if i + 2 < v.len() && v[i + 1] == "C" && v[i + 2] == "H" => { 73 | i += 2; 74 | smallvec!["S", "S"] 75 | } 76 | "P" if i + 1 < v.len() && v[i + 1] == "H" => { 77 | i += 1; 78 | smallvec!["F"] 79 | } 80 | "H" if !isvowel(v[i - 1]) 81 | || (i + 1 < v.len() && !isvowel(v[i + 1])) 82 | || (i + 1 == v.len()) => 83 | { 84 | if isvowel(v[i - 1]) { 85 | smallvec!["A"] 86 | } else { 87 | smallvec![v[i - 1]] 88 | } 89 | } 90 | "W" if isvowel(v[i - 1]) => smallvec![v[i - 1]], 91 | _ => smallvec![v[i]], 92 | }; 93 | 94 | if !chars.is_empty() && chars[chars.len() - 1] != key[key.len() - 1] { 95 | for c in chars { 96 | key.push(c); 97 | } 98 | } 99 | 100 | i += 1; 101 | } 102 | 103 | // step 5 remove trailing S 104 | if key[key.len() - 1] == "S" && key.len() > 1 { 105 | key.pop(); 106 | } 107 | 108 | // step 6 replace AY w/ Y 109 | if key.ends_with(&["A", "Y"]) { 110 | key.remove(key.len() - 2); 111 | } 112 | 113 | // step 7 remove trailing A 114 | if key[key.len() - 1] == "A" && key.len() > 1 { 115 | key.pop(); 116 | } 117 | 118 | let mut str_key = String::new(); 119 | for k in key { 120 | str_key.push_str(k); 121 | } 122 | 123 | str_key 124 | } 125 | 126 | #[cfg(test)] 127 | mod test { 128 | use super::*; 129 | use crate::testutils::testutils; 130 | #[test] 131 | fn test_nysiis() { 132 | testutils::test_str_func("testdata/nysiis.csv", nysiis); 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /src/rustyfish.rs: -------------------------------------------------------------------------------- 1 | use crate::damerau_levenshtein_distance as _damerau; 2 | use crate::hamming_distance as _hamming; 3 | use crate::jaccard_similarity as _jaccard; 4 | use crate::jaro_similarity as _jaro; 5 | use crate::jaro_winkler_similarity as 
_jaro_winkler; 6 | use crate::jaro_winkler_similarity_longtol as _jaro_winkler_long; 7 | use crate::levenshtein_distance as _lev; 8 | use crate::match_rating_codex as _mr_codex; 9 | use crate::match_rating_comparison as _mr_comparison; 10 | use crate::metaphone as _metaphone; 11 | use crate::nysiis as _nysiis; 12 | use crate::soundex as _soundex; 13 | use pyo3::exceptions::PyValueError; 14 | use pyo3::prelude::*; 15 | 16 | /// Calculates the Damerau-Levenshtein distance between two strings. 17 | #[pyfunction] 18 | fn damerau_levenshtein_distance(a: &str, b: &str) -> PyResult { 19 | Ok(_damerau(a, b)) 20 | } 21 | 22 | // Calculates the Hamming distance between two strings. 23 | #[pyfunction] 24 | fn hamming_distance(a: &str, b: &str) -> PyResult { 25 | Ok(_hamming(a, b)) 26 | } 27 | 28 | // Calculates the Jaccard index between two strings. 29 | #[pyfunction] 30 | #[pyo3(signature=(a, b, ngram_size=None))] 31 | fn jaccard_similarity(a: &str, b: &str, ngram_size: Option) -> PyResult { 32 | Ok(_jaccard(a, b, ngram_size)) 33 | } 34 | 35 | // Calculates the Jaro similarity between two strings. 36 | #[pyfunction] 37 | fn jaro_similarity(a: &str, b: &str) -> PyResult { 38 | Ok(_jaro(a, b)) 39 | } 40 | 41 | // Calculates the Jaro-Winkler similarity between two strings. 42 | #[pyfunction] 43 | #[pyo3(signature=(a, b, long_tolerance=None))] 44 | fn jaro_winkler_similarity(a: &str, b: &str, long_tolerance: Option) -> PyResult { 45 | match long_tolerance { 46 | Some(true) => Ok(_jaro_winkler_long(a, b)), 47 | _ => Ok(_jaro_winkler(a, b)), 48 | } 49 | } 50 | 51 | // Calculates the Levenshtein distance between two strings. 52 | #[pyfunction] 53 | fn levenshtein_distance(a: &str, b: &str) -> PyResult { 54 | Ok(_lev(a, b)) 55 | } 56 | 57 | // Calculates the Match Rating Approach code for a string. 
58 | #[pyfunction] 59 | fn match_rating_codex(a: &str) -> PyResult { 60 | // convert to ValueError 61 | _mr_codex(a).map_err(|e| PyErr::new::(format!("{}", e))) 62 | } 63 | 64 | // Calculates the Match Rating Approach comparison for two strings. 65 | #[pyfunction] 66 | fn match_rating_comparison(a: &str, b: &str) -> Option { 67 | match _mr_comparison(a, b) { 68 | Ok(value) => Some(value), 69 | Err(_) => None, 70 | } 71 | } 72 | 73 | /// Calculates the NYSIIS phonetic encoding of a string. 74 | #[pyfunction] 75 | fn nysiis(a: &str) -> PyResult { 76 | Ok(_nysiis(a)) 77 | } 78 | 79 | /// Calculates the phonetic encoding of a string using the Soundex algorithm. 80 | #[pyfunction] 81 | fn soundex(a: &str) -> PyResult { 82 | Ok(_soundex(a)) 83 | } 84 | 85 | /// Calculates the phonetic encoding of a string using the Metaphone algorithm. 86 | #[pyfunction] 87 | fn metaphone(a: &str) -> PyResult { 88 | Ok(_metaphone(a)) 89 | } 90 | 91 | /// A Python module implemented in Rust. 92 | #[pymodule] 93 | pub fn _rustyfish(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { 94 | m.add_function(wrap_pyfunction!(damerau_levenshtein_distance, m)?)?; 95 | m.add_function(wrap_pyfunction!(hamming_distance, m)?)?; 96 | m.add_function(wrap_pyfunction!(jaccard_similarity, m)?)?; 97 | m.add_function(wrap_pyfunction!(jaro_similarity, m)?)?; 98 | m.add_function(wrap_pyfunction!(jaro_winkler_similarity, m)?)?; 99 | m.add_function(wrap_pyfunction!(levenshtein_distance, m)?)?; 100 | m.add_function(wrap_pyfunction!(match_rating_codex, m)?)?; 101 | m.add_function(wrap_pyfunction!(match_rating_comparison, m)?)?; 102 | m.add_function(wrap_pyfunction!(nysiis, m)?)?; 103 | m.add_function(wrap_pyfunction!(soundex, m)?)?; 104 | m.add_function(wrap_pyfunction!(metaphone, m)?)?; 105 | 106 | Ok(()) 107 | } 108 | -------------------------------------------------------------------------------- /src/soundex.rs: -------------------------------------------------------------------------------- 1 | use 
crate::common::FastVec; 2 | use unicode_normalization::UnicodeNormalization; 3 | 4 | pub fn soundex(s: &str) -> String { 5 | if s.is_empty() { 6 | return String::from(""); 7 | } 8 | 9 | let v = &s.to_uppercase().nfkd().collect::>(); 10 | 11 | let mut result = FastVec::new(); 12 | result.push(v[0]); 13 | 14 | let replacement = |ch| match ch { 15 | 'B' | 'F' | 'P' | 'V' => '1', 16 | 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => '2', 17 | 'D' | 'T' => '3', 18 | 'L' => '4', 19 | 'M' | 'N' => '5', 20 | 'R' => '6', 21 | _ => '*', 22 | }; 23 | 24 | // find would be replacement for first character 25 | let mut last = replacement(v[0]); 26 | 27 | // loop over remaining letters 28 | for letter in v.iter().skip(1) { 29 | let sub = replacement(*letter); 30 | if sub != '*' { 31 | if sub != last { 32 | result.push(sub); 33 | if result.len() == 4 { 34 | break; 35 | } 36 | } 37 | last = sub; 38 | } else if *letter != 'H' && *letter != 'W' { 39 | last = '*'; 40 | } 41 | } 42 | 43 | while result.len() < 4 { 44 | result.push('0'); 45 | } 46 | let mut str_key = String::new(); 47 | for k in result { 48 | str_key.push(k); 49 | } 50 | str_key 51 | } 52 | 53 | #[cfg(test)] 54 | mod test { 55 | use super::*; 56 | use crate::testutils::testutils; 57 | #[test] 58 | fn test_soundex() { 59 | testutils::test_str_func("testdata/soundex.csv", soundex); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/testutils.rs: -------------------------------------------------------------------------------- 1 | #[cfg(test)] 2 | pub mod testutils { 3 | use csv; 4 | use num_traits::{Float, FromPrimitive}; 5 | 6 | fn test_generic_func(filename: &str, func: F) 7 | where 8 | F: Fn(&str, &str, Option) -> T, // Signature for functions with ngram_size 9 | T: PartialEq + std::fmt::Debug + std::str::FromStr + Float + FromPrimitive, 10 | ::Err: std::fmt::Debug, 11 | { 12 | let mut reader = csv::ReaderBuilder::new() 13 | .has_headers(false) 14 | 
.from_path(filename) 15 | .unwrap(); 16 | let mut num_tested = 0; 17 | for result in reader.records() { 18 | let rec = result.unwrap(); 19 | let input1 = &rec[0]; 20 | let input2 = &rec[1]; 21 | let ngram_size = rec.get(3).and_then(|s| s.parse().ok()); 22 | 23 | let expected: T = rec[2].parse().expect("Failed to parse expected value"); 24 | let output = func(input1, input2, ngram_size); 25 | 26 | let abs_diff = (output.to_f64().unwrap() - expected.to_f64().unwrap()).abs(); 27 | assert!( 28 | abs_diff < 0.001, 29 | "comparing {} to {} (ngram_size: {:?}), expected {:?}, got {:?} (diff {:?})", 30 | input1, 31 | input2, 32 | ngram_size, 33 | expected, 34 | output, 35 | abs_diff 36 | ); 37 | 38 | num_tested += 1; 39 | } 40 | assert!(num_tested > 0); 41 | } 42 | 43 | pub fn test_distance_func(filename: &str, func: fn(&str, &str) -> usize) { 44 | let mut reader = csv::ReaderBuilder::new() 45 | .has_headers(false) 46 | .from_path(filename) 47 | .unwrap(); 48 | let mut num_tested = 0; 49 | for result in reader.records() { 50 | let rec = result.unwrap(); 51 | let input1 = &rec[0]; 52 | let input2 = &rec[1]; 53 | let expected: usize = rec[2].parse().expect("Failed to parse expected value"); 54 | let output = func(input1, input2); 55 | 56 | println!( 57 | "comparing {} to {}, expecting {:?}, got {:?}", 58 | input1, input2, expected, output 59 | ); 60 | assert_eq!(output, expected); 61 | num_tested += 1; 62 | } 63 | assert!(num_tested > 0); 64 | } 65 | 66 | // For functions with two string arguments 67 | pub fn test_similarity_func_two_args(filename: &str, func: fn(&str, &str) -> f64) { 68 | test_generic_func::(filename, |a, b, _| func(a, b)); 69 | } 70 | 71 | // For functions with three arguments (including the optional usize) 72 | pub fn test_similarity_func_three_args(filename: &str, func: fn(&str, &str, Option) -> f64) { 73 | test_generic_func::(filename, |a, b, n| func(a, b, n)); 74 | } 75 | 76 | pub fn test_str_func(filename: &str, func: fn(&str) -> String) { 77 | let mut 
reader = csv::ReaderBuilder::new() 78 | .has_headers(false) 79 | .from_path(filename) 80 | .unwrap(); 81 | let mut num_tested = 0; 82 | for result in reader.records() { 83 | let rec = result.unwrap(); 84 | let input1 = &rec[0]; 85 | let expected = rec[1].to_string(); 86 | 87 | let output = func(input1); 88 | 89 | println!( 90 | "comparing {}, expecting {:?}, got {:?}", 91 | input1, expected, output 92 | ); 93 | assert_eq!(output, expected); 94 | num_tested += 1; 95 | } 96 | assert!(num_tested > 0); 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /testdata/README.md: -------------------------------------------------------------------------------- 1 | Test data for jellyfish string comparison and phonetic encoding algorithms. 2 | -------------------------------------------------------------------------------- /testdata/damerau_levenshtein.csv: -------------------------------------------------------------------------------- 1 | ,,0 2 | abc,,3 3 | bc,abc,1 4 | fuor,four,1 5 | abcd,acb,2 6 | cape sand recycling ,edith ann graham,17 7 | jellyifhs,jellyfish,2 8 | ifhs,fish,2 9 | "Hello, world!","Hello, world!",2 10 | -------------------------------------------------------------------------------- /testdata/hamming.csv: -------------------------------------------------------------------------------- 1 | ,,0 2 | ,abc,3 3 | abc,abc,0 4 | acc,abc,1 5 | abcd,abc,1 6 | abc,abcd,1 7 | testing,this is a test,13 8 | Saturday,Sunday,7 9 | -------------------------------------------------------------------------------- /testdata/jaccard.csv: -------------------------------------------------------------------------------- 1 | abc,xyz,0.0, 2 | abc,abc,1.0, 3 | abc,abcd,0.0, 4 | abcd,abce,0.0, 5 | abcd,abcde,0.0, 6 | french,quebec,0.0, 7 | france,quebec,0.0, 8 | france,france,1.0, 9 | The quick brown fox jumps over the lazy dog,The quick brown fox jumps over the lazy cat,0.8, 10 | The quick brown fox jumps over the lazy dog,The slow 
green turtle crawls under the lazy cat,0.2, 11 | John Smith,Smith; John,0.33333, 12 | John Smith,Smith John,1.0, 13 | John Smith,John Jacob Smith,0.666667, 14 | night,nacht,0.0, 15 | night,nacht,0.2,2 16 | night,nacht,0.33333,3 -------------------------------------------------------------------------------- /testdata/jaro_distance.csv: -------------------------------------------------------------------------------- 1 | dixon,dicksonx,0.767 2 | martha,marhta,0.944 3 | dwayne,duane,0.822 4 | 0ð00,0ð00,1 5 | "Sint-Pietersplein 6, 9000 Gent","Test 10, 1010 Brussel",0.518 6 | -------------------------------------------------------------------------------- /testdata/jaro_winkler.csv: -------------------------------------------------------------------------------- 1 | dixon,dicksonx,0.813 2 | martha,marhta,0.961 3 | dwayne,duane,0.84 4 | William,Williams,0.975 5 | ,foo,0 6 | a,a,1 7 | abc,xyz,0 8 | aaaa,aaaaa,0.96 9 | orangutan-kumquat,orangutan kumquat,0.976 10 | jaz,jal,0.822 11 | @,@@,0.85 12 | 0,0@,0.85 13 | a,ab,0.85 14 | 012345,0123456,0.971 15 | 012abc,012abcd,0.971 16 | 012abc,013abcd,0.879 17 | a1bc,a1be,0.883 18 | -------------------------------------------------------------------------------- /testdata/jaro_winkler_longtol.csv: -------------------------------------------------------------------------------- 1 | dixon,dicksonx,0.830 2 | martha,marhta,0.971 3 | dwayne,duane,0.869 4 | William,Williams,0.980 5 | ,foo,0 6 | a,a,1 7 | abc,xyz,0 8 | aaaa,aaaaa,0.96 9 | orangutan-kumquat,orangutan kumquat,0.986 10 | 1abcdefg,1abcdefh,0.96 11 | -------------------------------------------------------------------------------- /testdata/levenshtein.csv: -------------------------------------------------------------------------------- 1 | ,,0 2 | abc,,3 3 | ,abc,3 4 | bc,abc,1 5 | kitten,sitting,3 6 | Saturday,Sunday,3 7 | -------------------------------------------------------------------------------- /testdata/match_rating_codex.csv: 
-------------------------------------------------------------------------------- 1 | Byrne,BYRN 2 | Boern,BRN 3 | Smith,SMTH 4 | Smyth,SMYTH 5 | Catherine,CTHRN 6 | Kathryn,KTHRYN 7 | Kathrynoglin,KTHGLN 8 | Ad,AD 9 | Ed,ED 10 | William,WLM 11 | ä,Ä 12 | Frédéric,FRÉÉRC 13 | -------------------------------------------------------------------------------- /testdata/match_rating_comparison.csv: -------------------------------------------------------------------------------- 1 | Bryne,Boern,True 2 | Smith,Smyth,True 3 | Catherine,Kathryn,True 4 | Michael,Mike,False 5 | Tim,Timothy,None 6 | Ed,Ad,True 7 | Marie Helene,Maria Rio,True 8 | -------------------------------------------------------------------------------- /testdata/metaphone.csv: -------------------------------------------------------------------------------- 1 | DGIB,JB 2 | metaphone,MTFN 3 | wHErE,WR 4 | shell,XL 5 | this is a difficult string,0S IS A TFKLT STRNK 6 | aeromancy,ERMNS 7 | Antidisestablishmentarianism,ANTTSSTBLXMNTRNSM 8 | sunlight labs,SNLT LBS 9 | sonlite laabz,SNLT LBS 10 | Çáŕẗéř,KRTR 11 | kentucky,KNTK 12 | KENTUCKY,KNTK 13 | NXNXNX,NKSNKSNKS 14 | Aapti,PT 15 | Aarti,RT 16 | CIAB,XB 17 | NQ,NK 18 | sian,XN 19 | gek,JK 20 | Hb,HB 21 | Bho,BH 22 | Tiavyi,XFY 23 | Xhot,XHT 24 | Xnot,SNT 25 | g,K 26 | 8 queens,KNS 27 | Utah,UT 28 | WH,W 29 | walt,WLT 30 | ANDREW,ANTR 31 | why,W 32 | whynot,WNT 33 | acceptingness,AKSPTNKNS 34 | -------------------------------------------------------------------------------- /testdata/nysiis.csv: -------------------------------------------------------------------------------- 1 | Worthy,WARTY 2 | Ogata,OGAT 3 | montgomery,MANTGANARY 4 | Costales,CASTAL 5 | Tu,T 6 | martincevic,MARTANCAFAC 7 | Catherine,CATARAN 8 | Katherine,CATARAN 9 | Katerina,CATARAN 10 | Johnathan,JANATAN 11 | Jonathan,JANATAN 12 | John,JAN 13 | Teresa,TARAS 14 | Theresa,TARAS 15 | Jessica,JASAC 16 | Joshua,JAS 17 | Bosch,BAS 18 | Lapher,LAFAR 19 | wiyh,WY 20 | MacArthur,MCARTAR 21 | 
def assertAlmostEqual(a, b, places=3):
    """Assert that ``a`` and ``b`` differ by less than ``0.1 ** places``."""
    tolerance = 0.1**places
    difference = abs(a - b)
    assert difference < tolerance
@pytest.mark.parametrize("s1,s2,value", _load_data("jaro_distance"), ids=str)
def test_jaro_similarity(jf, s1, s2, value):
    """Check jaro_similarity against each row of testdata/jaro_distance.csv."""
    expected = float(value)
    actual = jf.jaro_similarity(s1, s2)
    assertAlmostEqual(actual, expected, places=3)
jf.match_rating_comparison(s1, s2) is value 88 | 89 | 90 | def test_jaro_winkler_long_tolerance(jf): 91 | no_lt = jf.jaro_winkler_similarity( 92 | "two long strings", "two long stringz", long_tolerance=False 93 | ) 94 | with_lt = jf.jaro_winkler_similarity( 95 | "two long strings", "two long stringz", long_tolerance=True 96 | ) 97 | # make sure long_tolerance does something 98 | assertAlmostEqual(no_lt, 0.975) 99 | assertAlmostEqual(with_lt, 0.984) 100 | 101 | 102 | def test_damerau_levenshtein_distance_type(jf): 103 | jf.damerau_levenshtein_distance("abc", "abc") 104 | with pytest.raises(TypeError) as exc: 105 | jf.damerau_levenshtein_distance(b"abc", b"abc") 106 | 107 | 108 | def test_levenshtein_distance_type(jf): 109 | assert jf.levenshtein_distance("abc", "abc") == 0 110 | with pytest.raises(TypeError) as exc: 111 | jf.levenshtein_distance(b"abc", b"abc") 112 | 113 | 114 | def test_jaro_similarity_type(jf): 115 | assert jf.jaro_similarity("abc", "abc") == 1 116 | with pytest.raises(TypeError) as exc: 117 | jf.jaro_similarity(b"abc", b"abc") 118 | 119 | 120 | def test_jaro_winkler_type(jf): 121 | assert jf.jaro_winkler_similarity("abc", "abc") == 1 122 | with pytest.raises(TypeError) as exc: 123 | jf.jaro_winkler_similarity(b"abc", b"abc") 124 | 125 | 126 | def test_mra_comparison_type(jf): 127 | assert jf.match_rating_comparison("abc", "abc") is True 128 | with pytest.raises(TypeError) as exc: 129 | jf.match_rating_comparison(b"abc", b"abc") 130 | 131 | 132 | def test_hamming_type(jf): 133 | assert jf.hamming_distance("abc", "abc") == 0 134 | with pytest.raises(TypeError) as exc: 135 | jf.hamming_distance(b"abc", b"abc") 136 | 137 | 138 | def test_soundex_type(jf): 139 | assert jf.soundex("ABC") == "A120" 140 | with pytest.raises(TypeError) as exc: 141 | jf.soundex(b"ABC") 142 | 143 | 144 | def test_metaphone_type(jf): 145 | assert jf.metaphone("abc") == "ABK" 146 | with pytest.raises(TypeError) as exc: 147 | jf.metaphone(b"abc") 148 | 149 | 150 | def 
test_nysiis_type(jf): 151 | assert jf.nysiis("abc") == "ABC" 152 | with pytest.raises(TypeError) as exc: 153 | jf.nysiis(b"abc") 154 | 155 | 156 | def test_mr_codex_type(jf): 157 | assert jf.match_rating_codex("abc") == "ABC" 158 | with pytest.raises(TypeError) as exc: 159 | jf.match_rating_codex(b"abc") 160 | 161 | 162 | def test_mr_codex_bad_string(jf): 163 | with pytest.raises(ValueError) as exc: 164 | res = jf.match_rating_codex("i’m") 165 | print(res) 166 | --------------------------------------------------------------------------------