├── .coveragerc
├── .github
    ├── FUNDING.yml
    ├── dependabot.yml
    └── workflows
    │   └── CI.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CITATION.cff
├── Cargo.toml
├── Justfile
├── LICENSE
├── README.md
├── benchmarks
    ├── compare.ipynb
    ├── timedruns-new.csv
    ├── timedruns-old.csv
    └── timedruns.py
├── docs
    ├── assets
    │   └── white-jellyfish.svg
    ├── changelog.md
    ├── functions.md
    └── index.md
├── mkdocs.yml
├── pyproject.toml
├── python
    └── jellyfish
    │   ├── __init__.py
    │   ├── __init__.pyi
    │   ├── _jellyfish.py
    │   └── py.typed
├── run-cov.sh
├── src
    ├── common.rs
    ├── hamming.rs
    ├── jaccard.rs
    ├── jaro.rs
    ├── levenshtein.rs
    ├── lib.rs
    ├── match_rating.rs
    ├── metaphone.rs
    ├── nysiis.rs
    ├── rustyfish.rs
    ├── soundex.rs
    └── testutils.rs
├── testdata
    ├── README.md
    ├── damerau_levenshtein.csv
    ├── hamming.csv
    ├── jaccard.csv
    ├── jaro_distance.csv
    ├── jaro_winkler.csv
    ├── jaro_winkler_longtol.csv
    ├── levenshtein.csv
    ├── match_rating_codex.csv
    ├── match_rating_comparison.csv
    ├── metaphone.csv
    ├── nysiis.csv
    ├── porter.csv
    ├── soundex.csv
    └── wagner_fischer.csv
└── tests
    └── test_jellyfish.py


/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | omit = jellyfish/compat.py
3 |        jellyfish/test.py
4 | 


--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
 1 | # These are supported funding model platforms
 2 | 
 3 | github: jamesturk
 4 | patreon: # Replace with a single Patreon username
 5 | open_collective: # Replace with a single Open Collective username
 6 | ko_fi: # Replace with a single Ko-fi username
 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
 9 | liberapay: # Replace with a single Liberapay username
10 | issuehunt: # Replace with a single IssueHunt username
11 | otechie: # Replace with a single Otechie username
12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # https://docs.github.com/en/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
 2 | version: 2
 3 | updates:
 4 |   - package-ecosystem: "pip"
 5 |     directory: "/"
 6 |     schedule:
 7 |       interval: "daily"
 8 |       time: "10:00"
 9 |     open-pull-requests-limit: 10
10 |   - package-ecosystem: "github-actions"
11 |     directory: "/"
12 |     schedule:
13 |       interval: "weekly"
14 | 


--------------------------------------------------------------------------------
/.github/workflows/CI.yml:
--------------------------------------------------------------------------------
  1 | # This file has been edited manually since it was generated.
  2 | # The original was autogenerated by maturin v0.14.15
  3 | on:
  4 |   push:
  5 |     branches:
  6 |       - "main"
  7 |     tags:
  8 |       - "*"
  9 |   pull_request:
 10 |   workflow_dispatch:
 11 | 
 12 | permissions:
 13 |   contents: read
 14 | 
 15 | jobs:
 16 |   lint_and_test:
 17 |     runs-on: ubuntu-latest
 18 |     strategy:
 19 |       matrix:
 20 |         python-version: ["3.9", "3.13", "pypy3.11"]
 21 |     steps:
 22 |       - uses: actions/checkout@v4
 23 |         with:
 24 |           submodules: recursive
 25 |       - uses: actions/setup-python@v5
 26 |         with:
 27 |           python-version: ${{ matrix.python-version }}
 28 |       - name: Build wheels
 29 |         uses: PyO3/maturin-action@v1
 30 |         with:
 31 |           # `target` is omitted: this job's matrix only defines python-version, so build for the host.
 32 |           args: --release --out dist -i ${{ matrix.python-version }}
 33 |           sccache: "true"
 34 |       - name: Install Just
 35 |         uses: extractions/setup-just@v3
 36 |       - name: Run Cargo Tests
 37 |         run: |
 38 |           cargo test
 39 |       - name: Run pytest
 40 |         run: |
 41 |           # just venv pytest
 42 |           rm -rf .venv
 43 |           python3 -m venv .venv
 44 |           . .venv/bin/activate
 45 |           .venv/bin/pip install wheel pytest mkdocs-material
 46 |           maturin develop
 47 |           .venv/bin/pytest
 48 | 
 49 |   linux:
 50 |     runs-on: ubuntu-latest
 51 |     needs: lint_and_test
 52 |     strategy:
 53 |       matrix:
 54 |         platform:
 55 |           - target: x64
 56 |             interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
 57 |           - target: aarch64
 58 |             interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
 59 |           - target: armv7
 60 |             interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
 61 |     steps:
 62 |       - uses: actions/checkout@v4
 63 |         with:
 64 |           submodules: recursive
 65 |       - name: Build wheels
 66 |         uses: PyO3/maturin-action@v1
 67 |         with:
 68 |           target: ${{ matrix.platform.target }}
 69 |           args: --release --out dist -i ${{ matrix.platform.interpreter }}
 70 |           sccache: "true"
 71 |           manylinux: auto
 72 |       - name: Upload wheels
 73 |         uses: actions/upload-artifact@v4
 74 |         with:
 75 |           name: wheels-linux-${{ strategy.job-index }}
 76 |           path: dist
 77 |   musllinux:
 78 |     runs-on: ubuntu-latest
 79 |     needs: lint_and_test
 80 |     strategy:
 81 |       matrix:
 82 |         platform:
 83 |           - target: x86_64-unknown-linux-musl
 84 |             arch: x86_64
 85 |             interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
 86 |           - target: i686-unknown-linux-musl
 87 |             arch: x86
 88 |             interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
 89 |           - target: aarch64-unknown-linux-musl
 90 |             arch: aarch64
 91 |             interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
 92 |         # all values: [x86_64, x86, aarch64, armhf, armv7, ppc64le, riscv64, s390x]
 93 |         # { target: "armv7-unknown-linux-musleabihf", image_tag: "armv7" },
 94 |         # { target: "powerpc64le-unknown-linux-musl", image_tag: "ppc64le" },
 95 |     steps:
 96 |       - uses: actions/checkout@v4
 97 |         with:
 98 |           submodules: recursive
 99 |       - name: Setup QEMU
100 |         uses: docker/setup-qemu-action@v3
101 |       - name: Build wheels
102 |         uses: PyO3/maturin-action@v1
103 |         with:
104 |           target: ${{ matrix.platform.target }}
105 |           args: --release --out dist -i ${{ matrix.platform.interpreter }}
106 |           sccache: "true"
107 |           manylinux: musllinux_1_1
108 |       - name: Upload wheels
109 |         uses: actions/upload-artifact@v4
110 |         with:
111 |           name: wheels-musl-${{ strategy.job-index }}
112 |           path: dist
113 | 
114 |   windows:
115 |     runs-on: windows-latest
116 |     needs: lint_and_test
117 |     strategy:
118 |       matrix:  # versions are quoted so YAML never reads them as floats (3.10 would become 3.1)
119 |         target: [x64, x86]
120 |         interpreter: ["3.9", "3.10", "3.11", "3.12", "3.13"]
121 |     steps:
122 |       - uses: actions/checkout@v4
123 |         with:
124 |           submodules: recursive
125 |       - uses: actions/setup-python@v5
126 |         with:
127 |           python-version: ${{ matrix.interpreter }}
128 |       - name: Build wheels
129 |         uses: PyO3/maturin-action@v1
130 |         with:
131 |           target: ${{ matrix.target }}
132 |           args: --release --out dist -i ${{ matrix.interpreter }}
133 |           sccache: "true"
134 |       - name: Upload wheels
135 |         uses: actions/upload-artifact@v4
136 |         with:
137 |           path: dist
138 |           name: wheels-win-${{ strategy.job-index }}
139 | 
140 |   macos:
141 |     runs-on: macos-latest
142 |     needs: lint_and_test
143 |     strategy:
144 |       matrix:
145 |         platform:
146 |           - target: x64
147 |             interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
148 |           - target: aarch64
149 |             interpreter: 3.9 3.10 3.11 3.12 3.13 pypy3.9 pypy3.10 pypy3.11
150 |     steps:
151 |       - uses: actions/checkout@v4
152 |         with:
153 |           submodules: recursive
154 |       - name: Build wheels
155 |         uses: PyO3/maturin-action@v1
156 |         with:
157 |           target: ${{ matrix.platform.target }}
158 |           args: --release --out dist -i ${{ matrix.platform.interpreter }}
159 |           sccache: "true"
160 |       - name: Upload wheels
161 |         uses: actions/upload-artifact@v4
162 |         with:
163 |           name: wheels-mac-${{ strategy.job-index }}
164 |           path: dist
165 | 
166 |   sdist:
167 |     runs-on: ubuntu-latest
168 |     needs: lint_and_test
169 |     steps:
170 |       - uses: actions/checkout@v4
171 |         with:
172 |           submodules: recursive
173 |       - name: Build sdist
174 |         uses: PyO3/maturin-action@v1
175 |         with:
176 |           command: sdist
177 |           args: --out dist
178 |       - name: Upload sdist
179 |         uses: actions/upload-artifact@v4
180 |         with:
181 |           name: wheels-sdist-${{ strategy.job-index }}
182 |           path: dist
183 | 
184 |   release:
185 |     name: Release
186 |     runs-on: ubuntu-latest
187 |     if: "startsWith(github.ref, 'refs/tags/')"
188 |     needs: [linux, windows, macos, sdist, musllinux]
189 |     steps:
190 |       - uses: actions/download-artifact@v4
191 |         with:
192 |           pattern: wheels-*
193 |           merge-multiple: true
194 |       - name: Publish to PyPI
195 |         uses: PyO3/maturin-action@v1
196 |         env:
197 |           MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
198 |         with:
199 |           command: upload
200 |           args: --skip-existing *
201 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | build/
 2 | dist/
 3 | jellyfish.egg-info/
 4 | *.so
 5 | *.swp
 6 | *.pyc
 7 | *.DS_Store
 8 | *~
 9 | .tox/
10 | .coverage
11 | htmlcov/
12 | .ropeproject/
13 | _build/
14 | .ipynb_checkpoints/
15 | .cache
16 | wheelhouse/
17 | site/
18 | target/
19 | Cargo.lock
20 | .venv
21 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | default_language_version:
 2 |     python: python3.8
 3 | repos:
 4 | -   repo: https://github.com/pre-commit/pre-commit-hooks
 5 |     rev: v2.5.0  # Use the ref you want to point at
 6 |     hooks:
 7 |       - id: check-merge-conflict
 8 |       - id: debug-statements
 9 |       - id: flake8
10 |         args: ["--ignore=E203,E501,W503"]
11 | -   repo: https://github.com/ambv/black
12 |     rev: 19.10b0
13 |     hooks:
14 |       - id: black
15 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.0
 2 | message: "If you use this software, please cite it as below."
 3 | authors:
 4 | - family-names: "Turk"
 5 |   given-names: "James"
 6 |   orcid: https://orcid.org/0000-0003-1762-1420
 7 | title: "jellyfish"
 8 | version: 1.2.0  # keep in sync with the package version in Cargo.toml
 9 | date-released: 2025-03-31  # release date of that version per docs/changelog.md
10 | url: "https://github.com/jamesturk/jellyfish"


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "jellyfish"
 3 | version = "1.2.0"
 4 | edition = "2021"
 5 | description = "Approximate and phonetic matching of strings."
 6 | authors = ["James Turk <dev@jamesturk.net>"]
 7 | repository = "https://github.com/jamesturk/jellyfish/"
 8 | license = "MIT"
 9 | readme = "README.md"
10 | 
11 | 
12 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
13 | [lib]
14 | name = "jellyfish"
15 | crate-type = ["cdylib"]
16 | 
17 | [dependencies]
18 | pyo3 = { version = "0.24.0", features = [] }
19 | unicode-segmentation = "^1.6.0"
20 | unicode-normalization = "^0.1"
21 | smallvec = "^1.13"
22 | ahash = "^0.8"
23 | num-traits = "0.2.19"
24 | 
25 | [dev-dependencies]
26 | csv = "1.1"
27 | 
28 | [features]
29 | python = []
30 | 


--------------------------------------------------------------------------------
/Justfile:
--------------------------------------------------------------------------------
 1 | pytest:
 2 |     maturin develop
 3 |     .venv/bin/pytest
 4 | 
 5 | test: pytest
 6 |     cargo test
 7 | 
 8 | # Publish the docs. Each recipe line runs in its own shell, so call the venv's mkdocs directly.
 9 | deploy-docs:
10 |     .venv/bin/mkdocs gh-deploy
11 | 
12 | venv:
13 |     rm -rf .venv
14 |     python3 -m venv .venv
15 |     . .venv/bin/activate
16 |     .venv/bin/pip install wheel pytest mkdocs-material
17 |     .venv/bin/pip install jupyter pandas seaborn
18 | 
19 | 
20 | timedruns-old:
21 |     .venv/bin/pip install jellyfish==0.10.0 # last C version
22 |     .venv/bin/python benchmarks/timedruns.py old > benchmarks/timedruns-old.csv
23 | 
24 | timedruns-new:
25 |     .venv/bin/pip uninstall jellyfish
26 |     .venv/bin/pip install -e .
27 |     #.venv/bin/pip install --pre jellyfish # latest Rust version
28 |     .venv/bin/python benchmarks/timedruns.py new >> benchmarks/timedruns-new.csv
29 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2015 James Turk
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Overview
 2 | 
 3 | **jellyfish** is a library for approximate & phonetic matching of strings.
 4 | 
 5 | Source: [https://github.com/jamesturk/jellyfish](https://github.com/jamesturk/jellyfish)
 6 | 
 7 | Documentation: [https://jamesturk.github.io/jellyfish/](https://jamesturk.github.io/jellyfish/)
 8 | 
 9 | Issues: [https://github.com/jamesturk/jellyfish/issues](https://github.com/jamesturk/jellyfish/issues)
10 | 
11 | [![PyPI badge](https://badge.fury.io/py/jellyfish.svg)](https://badge.fury.io/py/jellyfish)
12 | [![Test badge](https://github.com/jamesturk/jellyfish/workflows/Python%20package/badge.svg)](https://github.com/jamesturk/jellyfish/actions?query=workflow%3A%22Python+package)
13 | [![Coveralls](https://coveralls.io/repos/jamesturk/jellyfish/badge.png?branch=master)](https://coveralls.io/r/jamesturk/jellyfish)
14 | ![Test Rust](https://github.com/jamesturk/rust-jellyfish/workflows/Test%20Rust/badge.svg)
15 | 
16 | ## Included Algorithms
17 | 
18 | String comparison:
19 | 
20 | * Levenshtein Distance
21 | * Damerau-Levenshtein Distance
22 | * Jaccard Index
23 | * Jaro Distance
24 | * Jaro-Winkler Distance
25 | * Match Rating Approach Comparison
26 | * Hamming Distance
27 | 
28 | Phonetic encoding:
29 | 
30 | * American Soundex
31 | * Metaphone
32 | * NYSIIS (New York State Identification and Intelligence System)
33 | * Match Rating Codex
34 | 
35 | ## Example Usage
36 | 
37 | ``` python
38 | >>> import jellyfish
39 | >>> jellyfish.levenshtein_distance('jellyfish', 'smellyfish')
40 | 2
41 | >>> jellyfish.jaro_similarity('jellyfish', 'smellyfish')
42 | 0.89629629629629637
43 | >>> jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')
44 | 1
45 | 
46 | >>> jellyfish.metaphone('Jellyfish')
47 | 'JLFX'
48 | >>> jellyfish.soundex('Jellyfish')
49 | 'J412'
50 | >>> jellyfish.nysiis('Jellyfish')
51 | 'JALYF'
52 | >>> jellyfish.match_rating_codex('Jellyfish')
53 | 'JLLFSH'
54 | ```
55 | 


--------------------------------------------------------------------------------
/benchmarks/timedruns-new.csv:
--------------------------------------------------------------------------------
 1 | 3.10.7,0.11a1,rust,damerau_levenshtein_distance,2.955821124999602e-06
 2 | 3.10.7,0.11a1,rust,hamming_distance,2.637990829998671e-07
 3 | 3.10.7,0.11a1,rust,jaro_similarity,8.923487499996554e-07
 4 | 3.10.7,0.11a1,rust,jaro_winkler_similarity,5.265191250000499e-07
 5 | 3.10.7,0.11a1,rust,levenshtein_distance,5.327967920002266e-07
 6 | 3.10.7,0.11a1,rust,match_rating_codex,3.9641191699956834e-07
 7 | 3.10.7,0.11a1,rust,match_rating_comparison,7.64051959000426e-07
 8 | 3.10.7,0.11a1,rust,metaphone,4.791485000005196e-07
 9 | 3.10.7,0.11a1,rust,nysiis,6.270804579999094e-07
10 | 3.10.7,0.11a1,rust,soundex,3.9677620900056354e-07
11 | 3.10.7,0.11.0,rust,damerau_levenshtein_distance,2.200372166997113e-06
12 | 3.10.7,0.11.0,rust,hamming_distance,1.723820409970358e-07
13 | 3.10.7,0.11.0,rust,jaro_similarity,6.059524590018554e-07
14 | 3.10.7,0.11.0,rust,jaro_winkler_similarity,2.81896541993774e-07
15 | 3.10.7,0.11.0,rust,levenshtein_distance,2.6762129200506027e-07
16 | 3.10.7,0.11.0,rust,match_rating_codex,3.020092500009923e-07
17 | 3.10.7,0.11.0,rust,match_rating_comparison,4.794018750035321e-07
18 | 3.10.7,0.11.0,rust,metaphone,3.206092919936054e-07
19 | 3.10.7,0.11.0,rust,nysiis,3.3875070799695096e-07
20 | 3.10.7,0.11.0,rust,soundex,2.549132920030388e-07
21 | 3.10.7,dev,rust,damerau_levenshtein_distance,1.2226207920029991e-06
22 | 3.10.7,dev,rust,hamming_distance,1.7096670799946878e-07
23 | 3.10.7,dev,rust,jaro_similarity,6.012054580060067e-07
24 | 3.10.7,dev,rust,jaro_winkler_similarity,2.8654966699832583e-07
25 | 3.10.7,dev,rust,levenshtein_distance,2.7065066699287856e-07
26 | 3.10.7,dev,rust,match_rating_codex,2.96483124999213e-07
27 | 3.10.7,dev,rust,match_rating_comparison,4.7412966699630485e-07
28 | 3.10.7,dev,rust,metaphone,3.101041250047274e-07
29 | 3.10.7,dev,rust,nysiis,3.454310419911053e-07
30 | 3.10.7,dev,rust,soundex,2.5703445900580847e-07


--------------------------------------------------------------------------------
/benchmarks/timedruns-old.csv:
--------------------------------------------------------------------------------
 1 | 3.10.7,0.10-classic,c,damerau_levenshtein_distance,4.3809779200000775e-07
 2 | 3.10.7,0.10-classic,c,hamming_distance,8.937791700009256e-08
 3 | 3.10.7,0.10-classic,c,jaro_similarity,2.503094580006291e-07
 4 | 3.10.7,0.10-classic,c,jaro_winkler_similarity,1.972025830000348e-07
 5 | 3.10.7,0.10-classic,c,levenshtein_distance,1.5478662499936037e-07
 6 | 3.10.7,0.10-classic,c,match_rating_codex,2.1903375000056258e-07
 7 | 3.10.7,0.10-classic,c,match_rating_comparison,3.148877909998191e-07
 8 | 3.10.7,0.10-classic,c,metaphone,3.495554169994648e-07
 9 | 3.10.7,0.10-classic,c,nysiis,2.2051829199972418e-07
10 | 3.10.7,0.10-classic,c,soundex,2.6794874999995953e-07
11 | 3.10.7,0.10-classic,python,damerau_levenshtein_distance,3.269755224999972e-05
12 | 3.10.7,0.10-classic,python,hamming_distance,4.6421708400066563e-07
13 | 3.10.7,0.10-classic,python,jaro_similarity,8.32981374999963e-06
14 | 3.10.7,0.10-classic,python,jaro_winkler_similarity,3.957727625000189e-06
15 | 3.10.7,0.10-classic,python,levenshtein_distance,4.634622290999687e-06
16 | 3.10.7,0.10-classic,python,match_rating_codex,6.073832079991917e-07
17 | 3.10.7,0.10-classic,python,match_rating_comparison,2.1926620000003824e-06
18 | 3.10.7,0.10-classic,python,metaphone,2.464329958000235e-06
19 | 3.10.7,0.10-classic,python,nysiis,1.960830291000093e-06
20 | 3.10.7,0.10-classic,python,soundex,1.4157104160003654e-06
21 | 


--------------------------------------------------------------------------------
/benchmarks/timedruns.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import timeit
 3 | import csv
 4 | 
 5 | open_kwargs = {"encoding": "utf8"}
 6 | 
 7 | 
 8 | def _load_data(name):
 9 |     with open("./testdata/{}.csv".format(name), **open_kwargs) as f:
10 |         yield from csv.reader(f)
11 | 
12 | 
13 | def _load_n(name, n):
14 |     data = []
15 |     iterator = _load_data(name)
16 |     while n > 0:
17 |         try:
18 |             data.append(next(iterator))
19 |             n -= 1
20 |         except StopIteration:
21 |             iterator = _load_data(name)
22 | 
23 |     return data
24 | 
25 | 
26 | def time_func(funcname, name, params, ftype):
27 |     TEST_N = 100
28 |     TEST_ITERATIONS = 10000
29 |     if params == 1:
30 |         run = "[{}(x) for x, y in data]".format(funcname)
31 |     elif params == 2:
32 |         run = "[{}(x, y) for x, y, z in data]".format(funcname)
33 | 
34 |     if ftype == "python":
35 |         path = "_jellyfish"
36 |     elif ftype == "c":
37 |         path = "cjellyfish"
38 |     elif ftype == "rust":
39 |         path = "_rustyfish"
40 | 
41 |     return (
42 |         timeit.timeit(
43 |             run,
44 |             setup="""from __main__ import _load_n
45 | from jellyfish.{} import {}
46 | data = _load_n('{}', {})
47 | """.format(
48 |                 path, funcname, name, TEST_N
49 |             ),
50 |             number=TEST_ITERATIONS,
51 |         )
52 |         / (TEST_N * TEST_ITERATIONS)
53 |     )
54 | 
55 | 
56 | testing = [  # (jellyfish function name, testdata CSV stem, number of string arguments)
57 |     ("damerau_levenshtein_distance", "damerau_levenshtein", 2),
58 |     ("hamming_distance", "hamming", 2),
59 |     ("jaro_similarity", "jaro_distance", 2),
60 |     ("jaro_winkler_similarity", "jaro_winkler", 2),
61 |     ("levenshtein_distance", "levenshtein", 2),
62 |     ("match_rating_codex", "match_rating_codex", 1),
63 |     ("match_rating_comparison", "match_rating_comparison", 2),
64 |     ("metaphone", "metaphone", 1),
65 |     ("nysiis", "nysiis", 1),
66 |     ("soundex", "soundex", 1),
67 | ]
68 | 
69 | 
70 | def main():
71 |     py_version = "{}.{}.{}".format(*sys.version_info[0:3])
72 |     if sys.argv[1] == "old":
73 |         jf_version = "0.10"
74 |         ftypes = ("c", "python")
75 |     elif sys.argv[1] == "new":
76 |         jf_version = "dev"
77 |         ftypes = ("rust",)
78 | 
79 |     for ftype in ftypes:
80 |         for funcname, name, params in testing:
81 |             result = time_func(funcname, name, params, ftype)
82 |             print(f"{py_version},{jf_version},{ftype},{funcname},{result}")
83 | 
84 | 
85 | if __name__ == "__main__":
86 |     main()
87 | 


--------------------------------------------------------------------------------
/docs/assets/white-jellyfish.svg:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <svg fill="#FFFFFF" width="75pt" height="75pt" version="1.1" viewBox="0 0 75 75" xmlns="http://www.w3.org/2000/svg">
 3 |  <g id="surface73569" fill-rule="evenodd">
 4 |   <path d="m55.078 34.898v19.602c0 1.3086 1.0352 2.3398 2.3438 2.3398s2.3438-1.0312 2.3438-2.3398v-1.1719c-0.007813-0.65234 0.51953-1.1875 1.1719-1.1875s1.1797 0.53516 1.1719 1.1875v1.1719c0 2.5742-2.1133 4.6836-4.6875 4.6836s-4.6875-2.1094-4.6875-4.6836v-19.602z"/>
 5 |   <path d="m19.922 34.895v19.602c0 1.3086-1.0352 2.3398-2.3438 2.3398s-2.3438-1.0312-2.3438-2.3398v-1.1719c0.007813-0.65234-0.51953-1.1875-1.1719-1.1875s-1.1797 0.53516-1.1719 1.1875v1.1719c0 2.5742 2.1133 4.6836 4.6875 4.6836s4.6875-2.1094 4.6875-4.6836v-19.602z"/>
 6 |   <path d="m36.055 5.8594c-8.5938 0-15.984 6.1523-17.57 14.617l-0.51172 2.7539c-0.71875 3.8516-1.8164 7.6289-3.2695 11.27l-0.55859 1.3945c-0.37891 0.95703 0.57031 1.9023 1.5234 1.5195l5.4023-2.1602 4.8203 2.1406c0.30469 0.13281 0.64844 0.13281 0.95312 0l4.8203-2.1406 5.4023 2.1602c0.27734 0.11328 0.58984 0.11328 0.87109 0l5.4023-2.1602 4.8203 2.1406c0.30469 0.13281 0.64844 0.13281 0.95312 0l4.8203-2.1406 5.4023 2.1602c0.95312 0.38281 1.9023-0.5625 1.5234-1.5195l-0.55859-1.3945c-1.4531-3.6406-2.5469-7.418-3.2695-11.27l-0.51172-2.7539c-1.582-8.4648-8.9766-14.617-17.57-14.617zm0 2.3438h2.8945c7.4766 0 13.883 5.332 15.262 12.703l0.51562 2.7617c0.67969 3.6328 1.7734 7.1641 3.082 10.621l-3.4648-1.3867c-0.29297-0.11719-0.62109-0.10547-0.91016 0.023437l-4.7969 2.1289-4.7969-2.1289c-0.28906-0.12891-0.61719-0.14062-0.91016-0.023437l-5.4297 2.1641-5.4258-2.168c-0.29297-0.11719-0.62109-0.10547-0.91016 0.023437l-4.7969 2.1289-4.7969-2.1289c-0.28906-0.12891-0.61719-0.14062-0.91016-0.023437l-3.4648 1.3867c1.3086-3.4531 2.4023-6.9883 3.082-10.621l0.51562-2.7617c1.3789-7.3711 7.7852-12.703 15.262-12.703z"/>
 7 |   <path d="m31.621 33.98c-0.64844 0.011719-1.1641 0.54297-1.1523 1.1914v29.285c0 1.3086-1.0352 2.3398-2.3438 2.3398s-2.3438-1.0312-2.3438-2.3398v-1.1719c0.007812-0.65234-0.51953-1.1875-1.1719-1.1875s-1.1797 0.53516-1.1719 1.1875v1.1719c0 2.5742 2.1133 4.6836 4.6875 4.6836s4.6875-2.1094 4.6875-4.6836v-29.285c0.011719-0.66016-0.52734-1.1992-1.1914-1.1914z"/>
 8 |   <path d="m43.34 33.98c-0.64844 0.011719-1.1641 0.54297-1.1523 1.1914v29.285c0 2.5742 2.1133 4.6836 4.6875 4.6836s4.6875-2.1094 4.6875-4.6836v-1.1719c0.007812-0.65234-0.51953-1.1875-1.1719-1.1875s-1.1797 0.53516-1.1719 1.1875v1.1719c0 1.3086-1.0352 2.3398-2.3438 2.3398s-2.3438-1.0312-2.3438-2.3398v-29.285c0.011719-0.66016-0.52734-1.1992-1.1914-1.1914z"/>
 9 |   <path d="m36.586 11.246c-5.6406 0-10.492 4.0352-11.531 9.5898-0.14453 0.64844 0.27344 1.2891 0.92969 1.4102 0.65234 0.12109 1.2734-0.32422 1.3711-0.98047 0.83594-4.4609 4.7031-7.6758 9.2266-7.6758 0.65234 0.007812 1.1875-0.51953 1.1875-1.1719 0-0.65234-0.53516-1.1797-1.1875-1.1719z"/>
10 |  </g>
11 | </svg>
12 | 


--------------------------------------------------------------------------------
/docs/changelog.md:
--------------------------------------------------------------------------------
  1 | # Changelog
  2 | 
  3 | ## 1.2.0 - 31 March 2025
  4 | 
  5 | - drop support for Python 3.8, add support for Python 3.13 and PyPy 3.11 (PyO3 0.24 bump)
  6 | 
  7 | ## 1.1.2 - 3 December 2024
  8 | 
  9 | - release that supports Python 3.13 on all supported platforms (thanks @energynumbers for help with Windows!)
 10 | 
 11 | ## 1.1.0 - 28 July 2024
 12 | 
 13 | - add jaccard_similarity thanks to Niklas von Moers (@NiklasvonM) (#214)
 14 | - update to PyO3 0.22
 15 | 
 16 | ## 1.0.4 - 28 May 2024
 17 | 
 18 | - `match_rating_codex` now returns consistent results for Unicode characters regardless of position (#210)
 19 | - adds prebuilt wheels for Alpine (#209)
 20 | 
 21 | ## 1.0.3 - 17 November 2023
 22 | 
 23 | - `match_rating_codex` now raises a `ValueError` when passed non-alpha characters (#200)
 24 | - adds prebuilt wheels for Python 3.12
 25 | 
 26 | ## 1.0.1 - 18 September 2023
 27 | 
 28 | - fully remove deprecated names
 29 | - add armv7 linux builds
 30 | - fully drop Python 3.7 support
 31 | 
 32 | ## 1.0.0 - 21 June 2023
 33 | 
 34 | - bump to 1.0 (no notable changes from 0.11.2)
 35 | 
 36 | ## 0.11.2 - 2 April 2023
 37 | 
 38 | - fix to Rust build process to build more wheels, thanks @MartinoMensio!
 39 | - switch to using `ahash` for Damerau-Levenshtein for speed gains
 40 | 
 41 | ## 0.11.1 - 30 March 2023
 42 | 
 43 | - fix missing testdata in packages
 44 | 
 45 | ## 0.11.0 - 27 March 2023
 46 | 
 47 | - switched to using Rust implementation for all algorithms
 48 | 
 49 | ## 0.10.0 - 25 March 2023
 50 | 
 51 | - removed rarely-used `porter_stem` function, better implementations exist
 52 | 
 53 | ## 0.9.0 - 7 January 2022
 54 | 
 55 | - updated documentation available at <https://jamesturk.github.io/jellyfish/>
 56 | - support for Python 3.10+
 57 | - handle spaces correctly in MRA algorithm
 58 | 
 59 | ## 0.8.9 - 26 October 2021
 60 | 
 61 | - fix buffer overflow in NYSIIS
 62 | - remove unnecessary/undocumented special casing of digits in Jaro-Winkler
 63 | 
 64 | ## 0.8.8 - 17 August 2021
 65 | 
 66 | - release fix to fix Linux wheel issue
 67 | 
 68 | ## 0.8.7 - 16 August 2021
 69 | 
 70 | - safer allocations from CJellyfish
 71 | - include aarch64 wheels
 72 | 
 73 | ## 0.8.4 - 4 August 2021
 74 | 
 75 | - fix for jaro winkler (cjellyfish#8)
 76 | 
 77 | ## 0.8.3 - 11 March 2021
 78 | 
 79 | - build changes
 80 | - include OSX and Windows wheels
 81 | 
 82 | ## 0.8.2 - 21 May 2020
 83 | 
 84 | - fix jaro_winkler/jaro_winkler_similarity mix-up
 85 | - deprecate jaro_distance in favor of jaro_similarity
 86 |   backwards compatible shim left in place, will be removed in 1.0
 87 | - (note: 0.8.1 was a broken release without proper C libraries)
 88 | 
 89 | ## 0.8.0 - 21 May 2020
 90 | 
 91 | - rename jaro_winkler to jaro_winkler_similarity to match other functions
 92 |   backwards compatible shim added, but will be removed in 1.0
 93 | - fix soundex bug with W/H cases, #83
 94 | - fix metaphone bug with WH prefix, #108
 95 | - fix C match rating codex bug with duplicate letters, #121
 96 | - fix metaphone bug with leading vowels and 'kn' pair, #123
 97 | - fix Python jaro_winkler bug #124
 98 | - fix Python 3.9 deprecation warning
 99 | - add manylinux wheels
100 | 
101 | ## 0.7.2 - 5 June 2019
102 | 
103 | - fix CJellyfish damerau_levenshtein w/ unicode, thanks to immerrr
104 | - fix final H in NYSIIS
105 | - fix issue w/ trailing W in metaphone
106 | 
107 | ## 0.7.1 - 10 January 2019
108 | 
109 | - restrict install to Python >= 3.4
110 | 
111 | ## 0.7.0 - 10 January 2019
112 | 
113 | - drop Python 2 compatibility & legacy code
114 | - add bugfix for NYSIIS for words starting with PF
115 | 
116 | ## 0.6.1 - April 16 2018
117 | 
118 | - fixed wheel release issue
119 | 
120 | ## 0.6.0 - April 7 2018
121 | 
122 | - fix quite a few bugs & differences between C/Py implementations
123 | - add wagner-fischer testdata
124 | - uppercase soundex result
125 | - better error handling in nysiis, soundex, and jaro
126 | 
127 | ## 0.5.6 - June 23 2016
128 | 
129 | - bugfix for metaphone & soundex raising unexpected TypeErrors on Windows (#54)
130 | 
131 | ## 0.5.5 - June 21 2016
132 | 
133 | - bugfix for metaphone WH case
134 | 
135 | ## 0.5.4 - May 13 2016
136 | 
137 | - bugfix for C version of damerau_levenshtein thanks to Tyler Sellon
138 | 
139 | ## 0.5.3 - March 15 2016
140 | 
141 | - style/packaging changes
142 | 
143 | ## 0.5.2 - February 3 2016
144 | 
145 | - testing fixes for Python 3.5
146 | - bugfix for Metaphone w/ silent H thanks to Jeremy Carbaugh
147 | 
148 | ## 0.5.1 - July 12 2015
149 | 
150 | - bugfixes for NYSIIS
151 | - bugfixes for metaphone
152 | - bugfix for C version of jaro_winkler
153 | 
154 | ## 0.5.0 - April 23 2015
155 | 
156 | - consistent unicode behavior, all functions take unicode and reject bytes on Py2 and 3, C and Python
157 | - parametrize tests
158 | - Windows compiler support
159 | 
160 | ## 0.4.0 - March 27 2015
161 | 
162 | - tons of new tests
163 | - documentation
164 | - split out cjellyfish
165 | - test all w/ unicode and plenty of fixes to accommodate
166 | - 100% test coverage
167 | 
168 | ## 0.3.4 - February 4 2015
169 | 
170 | - fix segfaults and memory leaks via Danrich Parrol
171 | 
172 | ## 0.3.3 - November 20 2014
173 | 
174 | - fix bugs in damerau and NYSIIS
175 | 
176 | ## 0.3.2 - August 11 2014
177 | 
178 | - fix for jaro-winkler from David McKean
179 | - more packaging fixes
180 | 
181 | ## 0.3.1 - July 16 2014
182 | 
183 | - packaging fix for C/Python alternative
184 | 
185 | ## 0.3.0 - July 15 2014
186 | 
187 | - python alternatives where C isn't available
188 | 
189 | ## 0.2.2 - March 14 2014
190 | 
191 | - testing fixes
192 | - assorted bugfixes in NYSIIS
193 | 
194 | ## 0.2.0 - January 26 2012
195 | 
196 | - incorporate some speed changes from Peter Scott
197 | - segfault bugfixes.
198 | 
199 | ## 0.1.2 - September 16 2010
200 | 
201 | - initial working release
202 | 


--------------------------------------------------------------------------------
/docs/functions.md:
--------------------------------------------------------------------------------
  1 | # Functions
  2 | 
  3 | Jellyfish provides a variety of functions for string comparison and phonetic encoding.
  4 | 
  5 | ## String Comparison
  6 | 
  7 | These functions measure either the difference (edit distance) or the similarity between two strings.
  8 | 
  9 | ### Levenshtein Distance
 10 | 
 11 | ``` python
 12 | def levenshtein_distance(s1: str, s2: str)
 13 | ```
 14 | 
 15 | Compute the Levenshtein distance between s1 and s2.
 16 | 
 17 | Levenshtein distance represents the number of insertions, deletions, and substitutions required to change one word to another.
 18 | 
 19 | For example: ``levenshtein_distance('berne', 'born') == 2`` representing the transformation of the first e to o and the deletion of the second e.
 20 | 
 21 | See the [Levenshtein distance article at Wikipedia](http://en.wikipedia.org/wiki/Levenshtein_distance) for more details.
 22 | 
 23 | ### Damerau-Levenshtein Distance
 24 | 
 25 | ``` python
 26 | def damerau_levenshtein_distance(s1: str, s2: str)
 27 | ```
 28 | 
 29 | Compute the Damerau-Levenshtein distance between s1 and s2.
 30 | 
 31 | A modification of Levenshtein distance, Damerau-Levenshtein distance counts transpositions (such as ifsh for fish) as a single edit.
 32 | 
 33 | For example, ``levenshtein_distance('fish', 'ifsh') == 2`` because it requires a deletion and an insertion,
 34 | whereas ``damerau_levenshtein_distance('fish', 'ifsh') == 1`` because the swap counts as a single transposition.
 35 | 
 36 | See the [Damerau-Levenshtein distance article at Wikipedia](http://en.wikipedia.org/wiki/Damerau-Levenshtein_distance) for more details.
 37 | 
 38 | ### Hamming Distance
 39 | 
 40 | ``` python
 41 | def hamming_distance(s1: str, s2: str)
 42 | ```
 43 | 
 44 | Compute the Hamming distance between s1 and s2.
 45 | 
 46 | Hamming distance is the measure of the number of characters that differ between two strings.
 47 | 
 48 | Typically Hamming distance is undefined when strings are of different length, but this implementation
 49 | considers extra characters as differing.  For example ``hamming_distance('abc', 'abcd') == 1``.
 50 | 
 51 | See the [Hamming distance article at Wikipedia](http://en.wikipedia.org/wiki/Hamming_distance) for more details.
 52 | 
 53 | ### Jaccard Similarity
 54 | 
 55 | ``` python
 56 | def jaccard_similarity(s1: str, s2: str, ngram_size: Optional[int] = None) -> float
 57 | ```
 58 | 
 59 | Compute the Jaccard index between s1 and s2.
 60 | 
 61 | The Jaccard index between two sets is defined as the number of elements of the intersection divided by the number of elements of the union of the two sets. The elements of the sets are ngrams (the substrings of length `ngram_size`) or words if `ngram_size` is `None`. The strings are split by whitespace.
 62 | 
 63 | The Jaccard index does not consider order of words/ngrams. Hence "hello world" and "world hello" have a Jaccard similarity of 1.
 64 | 
 65 | ### Jaro Similarity
 66 | 
 67 | ``` python
 68 | def jaro_similarity(s1: str, s2: str)
 69 | ```
 70 | 
 71 | Compute the Jaro similarity between s1 and s2.
 72 | 
 73 | Jaro similarity is a string similarity measure that gives a floating point response in [0,1] where 0 represents two completely dissimilar strings and 1 represents identical strings.
 74 | 
 75 | !!! warning
 76 | 
 77 |     Prior to 0.8.1 this function was named jaro_distance.  The old name was removed in 1.0.
 78 | 
 79 | ### Jaro-Winkler Similarity
 80 | 
 81 | ``` python
 82 | def jaro_winkler_similarity(s1: str, s2: str, long_tolerance: bool = False)
 83 | ```
 84 | 
 85 | Compute the Jaro-Winkler similarity between s1 and s2.
 86 | 
 87 | Jaro-Winkler is a modification/improvement of Jaro similarity; like Jaro, it gives a floating point response in [0,1] where 0 represents two completely dissimilar strings and 1 represents identical strings.  If `long_tolerance` is true, an additional adjustment for similarities in longer strings is applied.
 88 | 
 89 | !!! warning
 90 | 
 91 |     Prior to 0.8.1 this function was named jaro_winkler.  That name is still available, but is no longer recommended.
 92 |     It will be replaced in 1.0 with a correct version.
 93 | 
 94 | See the [Jaro-Winkler distance article at Wikipedia](http://en.wikipedia.org/wiki/Jaro-Winkler_distance) for more details.
 95 | 
 96 | ### Match Rating Approach (comparison)
 97 | 
 98 | ``` python
 99 | def match_rating_comparison(s1, s2)
100 | ```
101 | 
102 | Compare s1 and s2 using the match rating approach algorithm, returns ``True`` if strings are considered equivalent or ``False`` if not.  Can also return ``None`` if s1 and s2 are not comparable (the lengths of their codexes differ by 3 or more).
103 | 
104 | The Match rating approach algorithm is an algorithm for determining whether or not two names are
105 | pronounced similarly.  Strings are first encoded using `match_rating_codex` then compared according to the MRA algorithm.
106 | 
107 | See the [Match Rating Approach article at Wikipedia](http://en.wikipedia.org/wiki/Match_rating_approach) for more details.
108 | 
109 | ## Phonetic Encoding
110 | 
111 | These algorithms convert a string to a normalized phonetic encoding, converting a word to a representation of its pronunciation.  Each takes a single string and returns a coded representation.
112 | 
113 | 
114 | ### American Soundex
115 | 
116 | ``` python
117 | def soundex(s: str)
118 | ```
119 | 
120 | Calculate the American Soundex of the string s.
121 | 
122 | Soundex is an algorithm to convert a word (typically a name) to a four-character code in the form
123 | 'A123' where 'A' is the first letter of the name and the digits represent similar sounds.
124 | 
125 | For example ``soundex('Ann') == soundex('Anne') == 'A500'`` and
126 | ``soundex('Rupert') == soundex('Robert') == 'R163'``.
127 | 
128 | See the [Soundex article at Wikipedia](http://en.wikipedia.org/wiki/Soundex) for more details.
129 | 
130 | 
131 | ### Metaphone
132 | 
133 | ``` python
134 | def metaphone(s: str)
135 | ```
136 | 
137 | Calculate the metaphone code for the string s.
138 | 
139 | The metaphone algorithm was designed as an improvement on Soundex.  It transforms a word into a
140 | string consisting of '0BFHJKLMNPRSTWXY' where '0' is pronounced 'th' and 'X' is a '[sc]h' sound.
141 | 
142 | For example ``metaphone('Klumpz') == metaphone('Clumps') == 'KLMPS'``.
143 | 
144 | See the [Metaphone article at Wikipedia](http://en.wikipedia.org/wiki/Metaphone) for more details.
145 | 
146 | 
147 | ### NYSIIS
148 | 
149 | ``` python
150 | def nysiis(s: str)
151 | ```
152 | 
153 | Calculate the NYSIIS code for the string s.
154 | 
155 | The NYSIIS algorithm is an algorithm developed by the New York State Identification and Intelligence System.  It transforms a word into a phonetic code.  Like soundex and metaphone it is primarily intended for use on names (as they would be pronounced in English).
156 | 
157 | For example ``nysiis('John') == nysiis('Jan') == 'JAN'``.
158 | 
159 | See the [NYSIIS article at Wikipedia](http://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System) for more details.
160 | 
161 | ### Match Rating Approach (codex)
162 | 
163 | ``` python
164 | def match_rating_codex(s: str)
165 | ```
166 | 
167 | Calculate the match rating approach value (also called PNI) for the string s.
168 | 
169 | The Match rating approach algorithm is an algorithm for determining whether or not two names are
170 | pronounced similarly.  The algorithm consists of an encoding function (similar to soundex or nysiis)
171 | which is implemented here as well as `match_rating_comparison` which does the actual comparison.
172 | 
173 | See the [Match Rating Approach article at Wikipedia](http://en.wikipedia.org/wiki/Match_rating_approach) for more details.
174 | 


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
 1 | # Overview
 2 | 
 3 | **jellyfish** is a library for approximate & phonetic matching of strings.
 4 | 
 5 | Source: [https://github.com/jamesturk/jellyfish](https://github.com/jamesturk/jellyfish)
 6 | 
 7 | Documentation: [https://jamesturk.github.io/jellyfish/](https://jamesturk.github.io/jellyfish/)
 8 | 
 9 | Issues: [https://github.com/jamesturk/jellyfish/issues](https://github.com/jamesturk/jellyfish/issues)
10 | 
11 | [![PyPI badge](https://badge.fury.io/py/jellyfish.svg)](https://badge.fury.io/py/jellyfish)
12 | [![Test badge](https://github.com/jamesturk/jellyfish/workflows/Python%20package/badge.svg)](https://github.com/jamesturk/jellyfish/actions?query=workflow%3A%22Python+package%22)
13 | [![Coveralls](https://coveralls.io/repos/jamesturk/jellyfish/badge.png?branch=master)](https://coveralls.io/r/jamesturk/jellyfish)
14 | 
15 | ## Included Algorithms
16 | 
17 | String comparison:
18 | 
19 | * Levenshtein Distance
20 | * Damerau-Levenshtein Distance
21 | * Jaccard Similarity
22 | * Jaro Similarity
23 | * Jaro-Winkler Similarity
24 | * Match Rating Approach Comparison
25 | * Hamming Distance
26 | 
27 | Phonetic encoding:
28 | 
29 | * American Soundex
30 | * Metaphone
31 | * NYSIIS (New York State Identification and Intelligence System)
32 | * Match Rating Codex
33 | 
34 | ## Implementations
35 | 
36 | Each algorithm has Rust and Python implementations.
37 | 
38 | The Rust implementations are used by default. The Python
39 | implementations are a remnant of an early version of
40 | the library and will probably be removed in 1.0.
41 | 
42 | To explicitly use a specific implementation, refer to the appropriate module:
43 | 
44 | ``` python
45 | import jellyfish._jellyfish as pyjellyfish
46 | import jellyfish._rustyfish as rustyfish
47 | ```
48 | 
49 | If you've already imported jellyfish and are not sure what implementation you
50 | are using, you can check by querying `jellyfish.library`.
51 | 
52 | ``` python
53 |   if jellyfish.library == 'Python':
54 |       # Python implementation
55 |   elif jellyfish.library == 'Rust':
56 |       # Rust implementation
57 | ```
58 | 
59 | ## Example Usage
60 | 
61 | ``` python
62 | >>> import jellyfish
63 | >>> jellyfish.levenshtein_distance('jellyfish', 'smellyfish')
64 | 2
65 | >>> jellyfish.jaro_similarity('jellyfish', 'smellyfish')
66 | 0.89629629629629637
67 | >>> jellyfish.damerau_levenshtein_distance('jellyfish', 'jellyfihs')
68 | 1
69 | 
70 | >>> jellyfish.metaphone('Jellyfish')
71 | 'JLFX'
72 | >>> jellyfish.soundex('Jellyfish')
73 | 'J412'
74 | >>> jellyfish.nysiis('Jellyfish')
75 | 'JALYF'
76 | >>> jellyfish.match_rating_codex('Jellyfish')
77 | 'JLLFSH'
78 | ```
79 | 


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: jellyfish
 2 | site_url: https://jamesturk.github.io/jellyfish/
 3 | site_author: James Turk
 4 | site_description: A python library for approximate and phonetic matching of strings.
 5 | copyright: Copyright &copy; 2011 James Turk
 6 | repo_url: https://github.com/jamesturk/jellyfish
 7 | repo_name: jamesturk/jellyfish
 8 | edit_uri: edit/main/docs/
 9 | 
10 | theme:
11 |   logo: assets/white-jellyfish.svg
12 |   name: material
13 |   palette:
14 |     - scheme: default
15 |       primary: teal
16 |       accent: teal
17 |       toggle:
18 |         icon: material/toggle-switch-off-outline
19 |         name: Switch to dark mode
20 |     - scheme: slate
21 |       primary: teal
22 |       accent: teal
23 |       toggle:
24 |         icon: material/toggle-switch
25 |         name: Switch to light mode
26 | 
27 |   features:
28 |     #- navigation.tabs
29 |     - navigation.sections
30 |     - navigation.top
31 |     - content.tabs.link
32 |   icon:
33 |     repo: fontawesome/brands/github
34 | markdown_extensions:
35 |   - admonition
36 |   - def_list
37 |   - pymdownx.highlight
38 |   - pymdownx.tabbed
39 |   - pymdownx.superfences
40 |   - toc:
41 |       permalink: true
42 | plugins:  # MkDocs plugins to enable
43 |   - search
44 | 
45 | extra_css:
46 |   - assets/extra.css
47 | nav:
48 |   - 'index.md'
49 |   - 'functions.md'
50 |   - 'changelog.md'
51 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["maturin>=0.14,<2"]
 3 | build-backend = "maturin"
 4 | 
 5 | [project]
 6 | name = "jellyfish"
 7 | dynamic = ["version"]
 8 | requires-python = ">=3.9"
 9 | classifiers = [
10 |   "Programming Language :: Rust",
11 |   "Programming Language :: Python :: Implementation :: CPython",
12 |   "Programming Language :: Python :: Implementation :: PyPy",
13 |   "License :: OSI Approved :: MIT License",
14 |   "Operating System :: OS Independent",
15 |   "Development Status :: 5 - Production/Stable",
16 |   "Intended Audience :: Developers",
17 | ]
18 | 
19 | [project.urls]
20 | homepage = "https://jamesturk.github.io/jellyfish/"
21 | documentation = "https://jamesturk.github.io/jellyfish/"
22 | repository = "https://github.com/jamesturk/jellyfish/"
23 | 
24 | 
25 | [tool.maturin]
26 | features = ["pyo3/extension-module", "python"]
27 | python-source = "python"
28 | module-name = "jellyfish._rustyfish"
29 | 


--------------------------------------------------------------------------------
/python/jellyfish/__init__.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | 
3 | from ._rustyfish import *
4 | from . import _jellyfish
5 | 


--------------------------------------------------------------------------------
/python/jellyfish/__init__.pyi:
--------------------------------------------------------------------------------
 1 | from typing import Optional
 2 | 
 3 | def levenshtein_distance(s1: str, s2: str) -> int: ...
 4 | def jaccard_similarity(s1: str, s2: str, ngram_size: Optional[int] = None) -> float: ...
 5 | def jaro_similarity(s1: str, s2: str) -> float: ...
 6 | def jaro_winkler_similarity(s1: str, s2: str, long_tolerance: bool = ...) -> float: ...
 7 | def damerau_levenshtein_distance(s1: str, s2: str) -> int: ...
 8 | def soundex(s: str) -> str: ...
 9 | def hamming_distance(s1: str, s2: str) -> int: ...
10 | def nysiis(s: str) -> str: ...
11 | def match_rating_codex(s: str) -> str: ...
12 | def match_rating_comparison(s1: str, s2: str) -> bool: ...
13 | def metaphone(s: str) -> str: ...
14 | 


--------------------------------------------------------------------------------
/python/jellyfish/_jellyfish.py:
--------------------------------------------------------------------------------
  1 | import unicodedata
  2 | from collections import defaultdict
  3 | from itertools import zip_longest
  4 | import warnings
  5 | 
  6 | 
  7 | def _normalize(s):  # return s in Unicode NFKD (compatibility-decomposed) normal form
  8 |     return unicodedata.normalize("NFKD", s)
  9 | 
 10 | 
 11 | def _check_type(s):
 12 |     # warn here since each function will call this
 13 |     warnings.warn(
 14 |         "The jellyfish._jellyfish module is deprecated and will be removed in jellyfish 1.0.",
 15 |         DeprecationWarning,
 16 |     )
 17 |     if not isinstance(s, str):
 18 |         raise TypeError("expected str or unicode, got %s" % type(s).__name__)
 19 | 
 20 | 
 21 | def levenshtein_distance(s1, s2):
 22 |     _check_type(s1)
 23 |     _check_type(s2)
 24 | 
 25 |     if s1 == s2:
 26 |         return 0
 27 |     rows = len(s1) + 1
 28 |     cols = len(s2) + 1
 29 | 
 30 |     if not s1:
 31 |         return cols - 1
 32 |     if not s2:
 33 |         return rows - 1
 34 | 
 35 |     prev = None
 36 |     cur = range(cols)
 37 |     for r in range(1, rows):
 38 |         prev, cur = cur, [r] + [0] * (cols - 1)
 39 |         for c in range(1, cols):
 40 |             deletion = prev[c] + 1
 41 |             insertion = cur[c - 1] + 1
 42 |             edit = prev[c - 1] + (0 if s1[r - 1] == s2[c - 1] else 1)
 43 |             cur[c] = min(edit, deletion, insertion)
 44 | 
 45 |     return cur[-1]
 46 | 
 47 | 
 48 | def _jaro_winkler(s1, s2, long_tolerance, winklerize):  # shared core of jaro_similarity / jaro_winkler_similarity
 49 |     _check_type(s1)
 50 |     _check_type(s2)
 51 | 
 52 |     s1_len = len(s1)
 53 |     s2_len = len(s2)
 54 | 
 55 |     if not s1_len or not s2_len:  # either string empty -> similarity 0.0
 56 |         return 0.0
 57 | 
 58 |     min_len = min(s1_len, s2_len)
 59 |     search_range = max(s1_len, s2_len)
 60 |     search_range = (search_range // 2) - 1  # match window: half the longer length, minus one
 61 |     if search_range < 0:
 62 |         search_range = 0
 63 | 
 64 |     s1_flags = [False] * s1_len  # True at positions of s1 that found a match
 65 |     s2_flags = [False] * s2_len  # True at positions of s2 already claimed by a match
 66 | 
 67 |     # looking only within search range, count & flag matched pairs
 68 |     common_chars = 0
 69 |     for i, s1_ch in enumerate(s1):
 70 |         low = max(0, i - search_range)
 71 |         hi = min(i + search_range, s2_len - 1)
 72 |         for j in range(low, hi + 1):
 73 |             if not s2_flags[j] and s2[j] == s1_ch:
 74 |                 s1_flags[i] = s2_flags[j] = True
 75 |                 common_chars += 1
 76 |                 break  # each character of s1 claims at most one partner in s2
 77 | 
 78 |     # short circuit if no characters match
 79 |     if not common_chars:
 80 |         return 0.0
 81 | 
 82 |     # count transpositions
 83 |     k = trans_count = 0  # k: where to resume scanning s2 for its next flagged character
 84 |     for i, s1_f in enumerate(s1_flags):
 85 |         if s1_f:
 86 |             for j in range(k, s2_len):
 87 |                 if s2_flags[j]:
 88 |                     k = j + 1
 89 |                     break
 90 |             if s1[i] != s2[j]:  # j is the flagged position of s2 found just above
 91 |                 trans_count += 1
 92 |     trans_count //= 2  # half the out-of-order matches (integer division)
 93 | 
 94 |     # Jaro weight: mean of the matched fraction of each string and the fraction of matches that are in order
 95 |     common_chars = float(common_chars)
 96 |     weight = (
 97 |         (
 98 |             common_chars / s1_len
 99 |             + common_chars / s2_len
100 |             + (common_chars - trans_count) / common_chars
101 |         )
102 |     ) / 3
103 | 
104 |     # winkler modification: continue to boost if strings are similar
105 |     if winklerize and weight > 0.7:
106 |         # adjust for up to first 4 chars in common
107 |         j = min(min_len, 4)
108 |         i = 0
109 |         while i < j and s1[i] == s2[i]:
110 |             i += 1
111 |         if i:  # i is now the length of the common prefix (at most 4)
112 |             weight += i * 0.1 * (1.0 - weight)
113 | 
114 |         # optionally adjust for long strings
115 |         # after agreeing beginning chars, at least two or more must agree and
116 |         # agreed characters must be > half of remaining characters
117 |         if (
118 |             long_tolerance
119 |             and min_len > 4
120 |             and common_chars > i + 1
121 |             and 2 * common_chars >= min_len + i
122 |         ):
123 |             weight += (1.0 - weight) * (
124 |                 float(common_chars - i - 1) / float(s1_len + s2_len - i * 2 + 2)
125 |             )
126 | 
127 |     return weight
128 | 
129 | 
130 | def jaro_similarity(s1, s2):
131 |     return _jaro_winkler(s1, s2, False, False)  # noqa
132 | 
133 | 
134 | def jaro_winkler_similarity(s1, s2, long_tolerance=False):
135 |     return _jaro_winkler(s1, s2, long_tolerance, True)  # noqa
136 | 
137 | 
138 | def damerau_levenshtein_distance(s1, s2):  # edit distance in which a transposition counts as a single edit
139 |     _check_type(s1)
140 |     _check_type(s2)
141 | 
142 |     len1 = len(s1)
143 |     len2 = len(s2)
144 |     infinite = len1 + len2  # value written into the outer border (row 0 / column 0) of the matrix
145 | 
146 |     # last row (1-based) in which each character of s1 was seen; 0 if not seen yet
147 |     da = defaultdict(int)
148 | 
149 |     # distance matrix
150 |     score = [[0] * (len2 + 2) for x in range(len1 + 2)]
151 | 
152 |     score[0][0] = infinite
153 |     for i in range(0, len1 + 1):
154 |         score[i + 1][0] = infinite
155 |         score[i + 1][1] = i  # distance from s1[:i] to the empty string
156 |     for i in range(0, len2 + 1):
157 |         score[0][i + 1] = infinite
158 |         score[1][i + 1] = i  # distance from the empty string to s2[:i]
159 | 
160 |     for i in range(1, len1 + 1):
161 |         db = 0  # last column of this row where s1[i - 1] equalled the s2 character; 0 if none yet
162 |         for j in range(1, len2 + 1):
163 |             i1 = da[s2[j - 1]]
164 |             j1 = db
165 |             cost = 1
166 |             if s1[i - 1] == s2[j - 1]:
167 |                 cost = 0
168 |                 db = j
169 | 
170 |             score[i + 1][j + 1] = min(
171 |                 score[i][j] + cost,  # substitution, or a free match
172 |                 score[i + 1][j] + 1,  # insertion
173 |                 score[i][j + 1] + 1,  # deletion
174 |                 score[i1][j1] + (i - i1 - 1) + 1 + (j - j1 - 1),  # transposition plus the edits between the swapped pair
175 |             )
176 |         da[s1[i - 1]] = i
177 | 
178 |     return score[len1 + 1][len2 + 1]
179 | 
180 | 
181 | def soundex(s):  # American Soundex code of s: first letter plus three digits ("" for empty input)
182 |     _check_type(s)
183 | 
184 |     if not s:
185 |         return ""
186 | 
187 |     s = _normalize(s)
188 |     s = s.upper()
189 | 
190 |     replacements = (  # (letters, digit) groups; letters in the same group share a digit
191 |         ("BFPV", "1"),
192 |         ("CGJKQSXZ", "2"),
193 |         ("DT", "3"),
194 |         ("L", "4"),
195 |         ("MN", "5"),
196 |         ("R", "6"),
197 |     )
198 |     result = [s[0]]  # the first character is kept as-is
199 |     count = 1  # number of characters emitted so far
200 | 
201 |     # find would-be replacement for first character
202 |     for lset, sub in replacements:
203 |         if s[0] in lset:
204 |             last = sub
205 |             break
206 |     else:
207 |         last = None  # first character belongs to no digit group
208 | 
209 |     for letter in s[1:]:
210 |         for lset, sub in replacements:
211 |             if letter in lset:
212 |                 if sub != last:  # a run of same-digit letters is emitted only once
213 |                     result.append(sub)
214 |                     count += 1
215 |                 last = sub
216 |                 break
217 |         else:
218 |             if letter != "H" and letter != "W":
219 |                 # leave last alone if middle letter is H or W
220 |                 last = None
221 |         if count == 4:  # the code is full
222 |             break
223 | 
224 |     result += "0" * (4 - count)  # pad short codes with zeros up to four characters
225 |     return "".join(result)
226 | 
227 | 
228 | def hamming_distance(s1, s2):
229 |     _check_type(s1)
230 |     _check_type(s2)
231 | 
232 |     # ensure length of s1 >= s2
233 |     if len(s2) > len(s1):
234 |         s1, s2 = s2, s1
235 | 
236 |     # distance is difference in length + differing chars
237 |     distance = len(s1) - len(s2)
238 |     for i, c in enumerate(s2):
239 |         if c != s1[i]:
240 |             distance += 1
241 | 
242 |     return distance
243 | 
244 | 
245 | def nysiis(s):  # NYSIIS phonetic key of s ("" for empty input)
246 |     _check_type(s)
247 | 
248 |     if not s:
249 |         return ""
250 | 
251 |     s = s.upper()
252 |     key = []  # pieces of the key; each piece is one or two characters
253 | 
254 |     # step 1 - prefixes
255 |     if s.startswith("MAC"):
256 |         s = "MCC" + s[3:]
257 |     elif s.startswith("KN"):
258 |         s = s[1:]
259 |     elif s.startswith("K"):
260 |         s = "C" + s[1:]
261 |     elif s.startswith(("PH", "PF")):
262 |         s = "FF" + s[2:]
263 |     elif s.startswith("SCH"):
264 |         s = "SSS" + s[3:]
265 | 
266 |     # step 2 - suffixes
267 |     if s.endswith(("IE", "EE")):
268 |         s = s[:-2] + "Y"
269 |     elif s.endswith(("DT", "RT", "RD", "NT", "ND")):
270 |         s = s[:-2] + "D"
271 | 
272 |     # step 3 - first character of key comes from name
273 |     key.append(s[0])
274 | 
275 |     # step 4 - translate remaining chars
276 |     i = 1
277 |     len_s = len(s)
278 |     while i < len_s:
279 |         ch = s[i]
280 |         if ch == "E" and i + 1 < len_s and s[i + 1] == "V":
281 |             ch = "AF"
282 |             i += 1  # the V has been consumed as well
283 |         elif ch in "AEIOU":
284 |             ch = "A"
285 |         elif ch == "Q":
286 |             ch = "G"
287 |         elif ch == "Z":
288 |             ch = "S"
289 |         elif ch == "M":
290 |             ch = "N"
291 |         elif ch == "K":
292 |             if i + 1 < len(s) and s[i + 1] == "N":
293 |                 ch = "N"
294 |             else:
295 |                 ch = "C"
296 |         elif ch == "S" and s[i + 1 : i + 3] == "CH":
297 |             ch = "SS"
298 |             i += 2  # the C and H have been consumed as well
299 |         elif ch == "P" and i + 1 < len(s) and s[i + 1] == "H":
300 |             ch = "F"
301 |             i += 1  # the H has been consumed as well
302 |         elif ch == "H" and (  # H that follows a non-vowel, precedes a non-vowel, or ends the string
303 |             s[i - 1] not in "AEIOU"
304 |             or (i + 1 < len(s) and s[i + 1] not in "AEIOU")
305 |             or (i + 1 == len(s))
306 |         ):
307 |             if s[i - 1] in "AEIOU":
308 |                 ch = "A"
309 |             else:
310 |                 ch = s[i - 1]
311 |         elif ch == "W" and s[i - 1] in "AEIOU":
312 |             ch = s[i - 1]
313 | 
314 |         if ch[-1] != key[-1][-1]:  # skip a piece whose last character repeats the key's last character
315 |             key.append(ch)
316 | 
317 |         i += 1
318 | 
319 |     key = "".join(key)
320 | 
321 |     # step 5 - remove trailing S
322 |     if key.endswith("S") and key != "S":
323 |         key = key[:-1]
324 | 
325 |     # step 6 - replace AY w/ Y
326 |     if key.endswith("AY"):
327 |         key = key[:-2] + "Y"
328 | 
329 |     # step 7 - remove trailing A
330 |     if key.endswith("A") and key != "A":
331 |         key = key[:-1]
332 | 
333 |     # step 8 was already done
334 | 
335 |     return key
336 | 
337 | 
338 | def match_rating_codex(s):
339 |     _check_type(s)
340 | 
341 |     # we ignore spaces
342 |     s = s.upper().replace(" ", "")
343 |     # any remaining non-alphabetic characters are invalid
344 |     if not s.isalpha():
345 |         raise ValueError("string must be alphabetic")
346 | 
347 |     codex = []
348 | 
349 |     prev = None
350 |     first = True
351 |     for c in s:
352 |         # starting character
353 |         # or consonant not preceded by same consonant
354 |         if first or (c not in "AEIOU" and c != prev):
355 |             codex.append(c)
356 | 
357 |         prev = c
358 |         first = False
359 | 
360 |     # just use first/last 3
361 |     if len(codex) > 6:
362 |         return "".join(codex[:3] + codex[-3:])
363 |     else:
364 |         return "".join(codex)
365 | 
366 | 
367 | def match_rating_comparison(s1, s2):  # True/False per the match rating approach; None if the codexes are not comparable
368 |     codex1 = match_rating_codex(s1)
369 |     codex2 = match_rating_codex(s2)
370 |     len1 = len(codex1)
371 |     len2 = len(codex2)
372 |     res1 = []  # characters of codex1 left after removing position-wise matches
373 |     res2 = []  # characters of codex2 left after removing position-wise matches
374 | 
375 |     # length differs by 3 or more, no result
376 |     if abs(len1 - len2) >= 3:
377 |         return None
378 | 
379 |     # get minimum rating based on sums of codexes
380 |     lensum = len1 + len2
381 |     if lensum <= 4:
382 |         min_rating = 5
383 |     elif lensum <= 7:
384 |         min_rating = 4
385 |     elif lensum <= 11:
386 |         min_rating = 3
387 |     else:
388 |         min_rating = 2
389 | 
390 |     # left to right: drop every position where both codexes hold the same character, keep the rest
391 |     for c1, c2 in zip_longest(codex1, codex2):
392 |         if c1 != c2:
393 |             if c1:  # None marks the padding zip_longest adds past the shorter codex
394 |                 res1.append(c1)
395 |             if c2:
396 |                 res2.append(c2)
397 | 
398 |     unmatched_count1 = unmatched_count2 = 0  # leftovers that also fail to pair up right to left
399 |     for c1, c2 in zip_longest(reversed(res1), reversed(res2)):
400 |         if c1 != c2:
401 |             if c1:
402 |                 unmatched_count1 += 1
403 |             if c2:
404 |                 unmatched_count2 += 1
405 | 
406 |     return (6 - max(unmatched_count1, unmatched_count2)) >= min_rating  # rating = 6 minus the larger unmatched count
407 | 
408 | 
409 | def metaphone(s):  # Metaphone code of s, returned in upper case
410 |     _check_type(s)
411 | 
412 |     result = []  # emitted code characters, in lower case until the final upper()
413 | 
414 |     s = _normalize(s.lower())
415 | 
416 |     # skip first character if s starts with these
417 |     if s.startswith(("kn", "gn", "pn", "wr", "ae")):
418 |         s = s[1:]
419 | 
420 |     i = 0
421 | 
422 |     while i < len(s):
423 |         c = s[i]
424 |         next = s[i + 1] if i < len(s) - 1 else "*****"  # one-char lookahead (shadows the builtin); "*****" past the end
425 |         nextnext = s[i + 2] if i < len(s) - 2 else "*****"  # two-char lookahead; the "*****" sentinel is a non-empty, truthy string
426 | 
427 |         # skip doubles except for cc
428 |         if c == next and c != "c":
429 |             i += 1
430 |             continue
431 | 
432 |         if c in "aeiou":
433 |             if i == 0 or s[i - 1] == " ":  # vowels are kept only at the start of a word
434 |                 result.append(c)
435 |         elif c == "b":
436 |             if (not (i != 0 and s[i - 1] == "m")) or next:  # NOTE(review): next is never falsy (see sentinel), so this always holds
437 |                 result.append("b")
438 |         elif c == "c":
439 |             if next == "i" and nextnext == "a" or next == "h":
440 |                 result.append("x")
441 |                 i += 1
442 |             elif next in "iey":
443 |                 result.append("s")
444 |                 i += 1
445 |             else:
446 |                 result.append("k")
447 |         elif c == "d":
448 |             if next == "g" and nextnext in "iey":
449 |                 result.append("j")
450 |                 i += 2
451 |             else:
452 |                 result.append("t")
453 |         elif c in "fjlmnr":
454 |             result.append(c)
455 |         elif c == "g":
456 |             if next in "iey":
457 |                 result.append("j")
458 |             elif next == "h" and nextnext and nextnext not in "aeiou":  # NOTE(review): the bare nextnext test is always true
459 |                 i += 1
460 |             elif next == "n" and not nextnext:  # NOTE(review): unreachable as written, nextnext is never falsy
461 |                 i += 1
462 |             else:
463 |                 result.append("k")
464 |         elif c == "h":
465 |             if i == 0 or next in "aeiou" or s[i - 1] not in "aeiou":
466 |                 result.append("h")
467 |         elif c == "k":
468 |             if i == 0 or s[i - 1] != "c":  # the k of "ck" adds nothing
469 |                 result.append("k")
470 |         elif c == "p":
471 |             if next == "h":
472 |                 result.append("f")
473 |                 i += 1
474 |             else:
475 |                 result.append("p")
476 |         elif c == "q":
477 |             result.append("k")
478 |         elif c == "s":
479 |             if next == "h":
480 |                 result.append("x")
481 |                 i += 1
482 |             elif next == "i" and nextnext in "oa":
483 |                 result.append("x")
484 |                 i += 2
485 |             else:
486 |                 result.append("s")
487 |         elif c == "t":
488 |             if next == "i" and nextnext in "oa":
489 |                 result.append("x")
490 |             elif next == "h":
491 |                 result.append("0")  # "0" is the code used for "th"
492 |                 i += 1
493 |             elif next != "c" or nextnext != "h":  # the t of "tch" adds nothing
494 |                 result.append("t")
495 |         elif c == "v":
496 |             result.append("f")
497 |         elif c == "w":
498 |             if i == 0 and next == "h":
499 |                 i += 1
500 |                 result.append("w")
501 |             elif next in "aeiou":
502 |                 result.append("w")
503 |         elif c == "x":
504 |             if i == 0:
505 |                 if next == "h" or (next == "i" and nextnext in "oa"):
506 |                     result.append("x")
507 |                 else:
508 |                     result.append("s")
509 |             else:
510 |                 result.append("k")
511 |                 result.append("s")
512 |         elif c == "y":
513 |             if next in "aeiou":
514 |                 result.append("y")
515 |         elif c == "z":
516 |             result.append("s")
517 |         elif c == " ":
518 |             if len(result) > 0 and result[-1] != " ":  # at most one separating space, never a leading one
519 |                 result.append(" ")
520 | 
521 |         i += 1
522 | 
523 |     return "".join(result).upper()
524 | 


--------------------------------------------------------------------------------
/python/jellyfish/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jamesturk/jellyfish/846fae4b210db8ff4ab9dfaed7e2ec9f372728a7/python/jellyfish/py.typed


--------------------------------------------------------------------------------
/run-cov.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Install the package in editable mode, then run the Python test suite and
# write an HTML coverage report for the `jellyfish` package.

# Stop at the first failing command so that a broken install is not hidden
# behind a test run against a stale build.
set -e

export PYTHONPATH=.
pip install -e .
# The test suite lives under tests/ (there is no jellyfish/test.py in the tree).
pytest tests/test_jellyfish.py --cov jellyfish --cov-report html
6 | 


--------------------------------------------------------------------------------
/src/common.rs:
--------------------------------------------------------------------------------
1 | use smallvec::SmallVec;
// most strings are short, so we can use a fixed-size array
// VEC_SIZE is the length of the array type that backs every `FastVec`.
const VEC_SIZE: usize = 32;

/// Crate-wide sequence type: a `SmallVec` backed by a `[T; VEC_SIZE]` array.
// NOTE(review): per the smallvec documentation, longer sequences spill to the
// heap rather than failing -- confirm against the pinned smallvec version.
pub type FastVec<T> = SmallVec<[T; VEC_SIZE]>;
6 | 


--------------------------------------------------------------------------------
/src/hamming.rs:
--------------------------------------------------------------------------------
 1 | use crate::common::FastVec;
 2 | use unicode_segmentation::UnicodeSegmentation;
 3 | 
 4 | pub fn vec_hamming_distance<T: PartialEq>(s1: &FastVec<T>, s2: &FastVec<T>) -> usize {
 5 |     let (longer, shorter) = if s1.len() > s2.len() {
 6 |         (s1, s2)
 7 |     } else {
 8 |         (s2, s1)
 9 |     };
10 | 
11 |     // distance is difference in length + differing chars
12 |     let mut distance = longer.len() - shorter.len();
13 |     for (i, c) in shorter.iter().enumerate() {
14 |         if *c != longer[i] {
15 |             distance += 1
16 |         }
17 |     }
18 | 
19 |     distance
20 | }
21 | 
22 | pub fn hamming_distance(s1: &str, s2: &str) -> usize {
23 |     let us1 = UnicodeSegmentation::graphemes(s1, true).collect::<FastVec<&str>>();
24 |     let us2 = UnicodeSegmentation::graphemes(s2, true).collect::<FastVec<&str>>();
25 | 
26 |     vec_hamming_distance(&us1, &us2)
27 | }
28 | 
#[cfg(test)]
mod test {
    use super::hamming_distance;
    use crate::testutils::testutils::test_distance_func;

    /// Runs `hamming_distance` against the cases in the shared CSV file.
    #[test]
    fn test_hamming() {
        test_distance_func("testdata/hamming.csv", hamming_distance);
    }
}
38 | 


--------------------------------------------------------------------------------
/src/jaccard.rs:
--------------------------------------------------------------------------------
 1 | use std::borrow::Cow;
 2 | use std::collections::HashSet;
 3 | 
 4 | pub fn jaccard_similarity(s1: &str, s2: &str, ngram_size: Option<usize>) -> f64 {
 5 |     // 1. Tokenize into ngrams
 6 |     let grams1: HashSet<String> = get_ngrams(s1, ngram_size)
 7 |         .into_iter()
 8 |         .map(|cow| cow.into_owned())
 9 |         .collect();
10 |     let grams2: HashSet<String> = get_ngrams(s2, ngram_size)
11 |         .into_iter()
12 |         .map(|cow| cow.into_owned())
13 |         .collect();
14 | 
15 |     // 2. Calculate intersection and union sizes
16 |     let intersection_size: usize = grams1.iter().filter(|gram| grams2.contains(*gram)).count();
17 |     let union_size: usize = grams1.len() + grams2.len() - intersection_size;
18 | 
19 |     // 3. Calculate Jaccard index
20 |     if union_size == 0 {
21 |         0.0
22 |     } else {
23 |         intersection_size as f64 / union_size as f64
24 |     }
25 | }
26 | 
/// Split `s` into the tokens used by `jaccard_similarity`.
///
/// * `Some(size)` with `size > 0`: consecutive, non-overlapping groups of
///   `size` characters (the final group may be shorter).
/// * `Some(0)`: no tokens. A zero-width n-gram carries no information, and
///   `slice::chunks(0)` would panic, so an empty list is returned instead.
/// * `None`: whitespace-separated words, borrowed from `s`.
fn get_ngrams(s: &str, n: Option<usize>) -> Vec<Cow<'_, str>> {
    match n {
        // Guard against the panic in `chunks(0)`.
        Some(0) => Vec::new(),
        Some(size) => s
            .chars()
            .collect::<Vec<char>>()
            .chunks(size) // non-overlapping groups
            .map(|chunk| Cow::from(chunk.iter().collect::<String>()))
            .collect(),
        // Word-level "n-grams" (i.e., words)
        None => s.split_whitespace().map(Cow::from).collect(),
    }
}
42 | 
43 | 
44 | 
#[cfg(test)]
mod test {
    use super::jaccard_similarity;
    use crate::testutils::testutils::test_similarity_func_three_args;

    /// Runs `jaccard_similarity` against the cases in the shared CSV file.
    #[test]
    fn test_jaccard_similarity() {
        test_similarity_func_three_args("testdata/jaccard.csv", jaccard_similarity);
    }
}
55 | 


--------------------------------------------------------------------------------
/src/jaro.rs:
--------------------------------------------------------------------------------
  1 | use crate::common::FastVec;
  2 | use smallvec::smallvec;
  3 | use std::cmp;
  4 | use unicode_segmentation::UnicodeSegmentation;
  5 | 
/// Selects which adjustments `vec_jaro_or_winkler` applies to the base score.
enum JaroVersion {
    /// Base score only; no prefix boost and no long-string adjustment.
    Pure,
    /// Base score plus the common-prefix boost.
    Winkler,
    /// Prefix boost plus the optional adjustment for long strings.
    WinklerLongTolerance,
}
 11 | 
 12 | fn vec_jaro_or_winkler<T: PartialEq>(
 13 |     s1: &FastVec<T>,
 14 |     s2: &FastVec<T>,
 15 |     version: JaroVersion,
 16 | ) -> f64 {
 17 |     let s1_len = s1.len();
 18 |     let s2_len = s2.len();
 19 | 
 20 |     if s1_len == 0 || s2_len == 0 {
 21 |         return 0.0;
 22 |     }
 23 | 
 24 |     let min_len = cmp::min(s1_len, s2_len);
 25 |     let mut search_range = cmp::max(s1_len, s2_len);
 26 |     search_range = (search_range / 2).saturating_sub(1);
 27 | 
 28 |     let mut s1_flags: FastVec<bool> = smallvec![false; s1_len];
 29 |     let mut s2_flags: FastVec<bool> = smallvec![false; s2_len];
 30 |     let mut common_chars = 0;
 31 | 
 32 |     // looking only within search range, count & flag matched pairs
 33 |     for (i, s1_ch) in s1.iter().enumerate() {
 34 |         // avoid underflow on i - search_range
 35 |         let low = i.saturating_sub(search_range);
 36 |         let hi = cmp::min(i + search_range, s2_len - 1);
 37 |         for j in low..hi + 1 {
 38 |             if !s2_flags[j] && s2[j] == *s1_ch {
 39 |                 s1_flags[i] = true;
 40 |                 s2_flags[j] = true;
 41 |                 common_chars += 1;
 42 |                 break;
 43 |             }
 44 |         }
 45 |     }
 46 | 
 47 |     // no characters match
 48 |     if common_chars == 0 {
 49 |         return 0.0;
 50 |     }
 51 | 
 52 |     // count transpositions
 53 |     let mut k = 0;
 54 |     let mut trans_count = 0;
 55 |     for (i, s1_f) in s1_flags.iter().enumerate() {
 56 |         if *s1_f {
 57 |             let mut j = k;
 58 |             while j < s2_len {
 59 |                 if s2_flags[j] {
 60 |                     k = j + 1;
 61 |                     break;
 62 |                 }
 63 |                 j += 1;
 64 |             }
 65 |             if s1[i] != s2[j] {
 66 |                 trans_count += 1
 67 |             }
 68 |         }
 69 |     }
 70 |     // need to do floor division then cast to float
 71 |     let trans_count = (trans_count / 2) as f64;
 72 |     let common_charsf = common_chars as f64;
 73 |     let s1_lenf = s1_len as f64;
 74 |     let s2_lenf = s2_len as f64;
 75 | 
 76 |     // adjust for similarities in nonmatched characters
 77 |     let mut weight = (common_charsf / s1_lenf
 78 |         + common_charsf / s2_lenf
 79 |         + (common_charsf - trans_count) / common_charsf)
 80 |         / 3.0;
 81 | 
 82 |     // check which version to run
 83 |     let (winklerize, long_tolerance) = match version {
 84 |         JaroVersion::Pure => (false, false),
 85 |         JaroVersion::Winkler => (true, false),
 86 |         JaroVersion::WinklerLongTolerance => (true, true),
 87 |     };
 88 | 
 89 |     // winkler modification: continue to boost similar strings
 90 |     if winklerize && weight > 0.7 {
 91 |         let mut i = 0;
 92 |         let j = cmp::min(min_len, 4);
 93 |         while i < j && s1[i] == s2[i] {
 94 |             // TODO: also had s1[i] in Python, necessary?
 95 |             i += 1;
 96 |         }
 97 |         let fi = i as f64;
 98 |         if i > 0 {
 99 |             weight += fi * 0.1 * (1.0 - weight);
100 |         }
101 | 
102 |         // optional adjustment for long strings
103 |         // after agreeing beginning items, at least two or more must agree
104 |         // and agreed items must be more than half of remaining items
105 |         if long_tolerance && min_len > 4 && common_chars > i + 1 && 2 * common_chars >= min_len + i
106 |         {
107 |             weight +=
108 |                 (1.0 - weight) * (common_charsf - fi - 1.0) / (s1_lenf + s2_lenf - fi * 2.0 + 2.0);
109 |         }
110 |     }
111 | 
112 |     weight
113 | }
114 | 
/// Jaro similarity of two item sequences.
///
/// Delegates to `vec_jaro_or_winkler` with `JaroVersion::Pure`, i.e. without
/// any of the Winkler adjustments.
pub fn vec_jaro_similarity<T: PartialEq>(s1: &FastVec<T>, s2: &FastVec<T>) -> f64 {
    vec_jaro_or_winkler(s1, s2, JaroVersion::Pure)
}
118 | 
/// Jaro-Winkler similarity of two item sequences.
///
/// Delegates to `vec_jaro_or_winkler` with `JaroVersion::Winkler`, which adds
/// the common-prefix boost but not the long-string adjustment.
pub fn vec_jaro_winkler_similarity<T: PartialEq>(s1: &FastVec<T>, s2: &FastVec<T>) -> f64 {
    vec_jaro_or_winkler(s1, s2, JaroVersion::Winkler)
}
122 | 
/// Jaro-Winkler similarity of two item sequences with long-string tolerance.
///
/// Delegates to `vec_jaro_or_winkler` with `JaroVersion::WinklerLongTolerance`,
/// which enables both the prefix boost and the long-string adjustment.
pub fn vec_jaro_winkler_similarity_longtol<T: PartialEq>(s1: &FastVec<T>, s2: &FastVec<T>) -> f64 {
    vec_jaro_or_winkler(s1, s2, JaroVersion::WinklerLongTolerance)
}
126 | 
127 | pub fn jaro_similarity(s1: &str, s2: &str) -> f64 {
128 |     let us1 = UnicodeSegmentation::graphemes(s1, true).collect::<FastVec<&str>>();
129 |     let us2 = UnicodeSegmentation::graphemes(s2, true).collect::<FastVec<&str>>();
130 |     vec_jaro_similarity(&us1, &us2)
131 | }
132 | 
133 | pub fn jaro_winkler_similarity(s1: &str, s2: &str) -> f64 {
134 |     let us1 = UnicodeSegmentation::graphemes(s1, true).collect::<FastVec<&str>>();
135 |     let us2 = UnicodeSegmentation::graphemes(s2, true).collect::<FastVec<&str>>();
136 |     vec_jaro_winkler_similarity(&us1, &us2)
137 | }
138 | 
139 | pub fn jaro_winkler_similarity_longtol(s1: &str, s2: &str) -> f64 {
140 |     let us1 = UnicodeSegmentation::graphemes(s1, true).collect::<FastVec<&str>>();
141 |     let us2 = UnicodeSegmentation::graphemes(s2, true).collect::<FastVec<&str>>();
142 |     vec_jaro_winkler_similarity_longtol(&us1, &us2)
143 | }
144 | 
#[cfg(test)]
mod test {
    use super::{jaro_similarity, jaro_winkler_similarity, jaro_winkler_similarity_longtol};
    use crate::testutils::testutils::test_similarity_func_two_args as check_csv;

    /// Plain Jaro against the shared CSV cases.
    #[test]
    fn test_jaro() {
        check_csv("testdata/jaro_distance.csv", jaro_similarity);
    }

    /// Jaro-Winkler against the shared CSV cases.
    #[test]
    fn test_jaro_winkler() {
        check_csv("testdata/jaro_winkler.csv", jaro_winkler_similarity);
    }

    /// Long-tolerance Jaro-Winkler against the shared CSV cases.
    #[test]
    fn test_jaro_winkler_longtol() {
        check_csv(
            "testdata/jaro_winkler_longtol.csv",
            jaro_winkler_similarity_longtol,
        );
    }
}
167 | 


--------------------------------------------------------------------------------
/src/levenshtein.rs:
--------------------------------------------------------------------------------
  1 | use crate::common::FastVec;
  2 | use ahash::AHashMap;
  3 | use smallvec::smallvec;
  4 | use std::cmp;
  5 | use unicode_segmentation::UnicodeSegmentation;
  6 | 
  7 | fn range_vec(size: usize) -> FastVec<usize> {
  8 |     let mut vec = FastVec::new();
  9 |     let mut p: usize = 0;
 10 |     vec.resize_with(size, || {
 11 |         p += 1;
 12 |         p - 1
 13 |     });
 14 |     vec
 15 | }
 16 | 
 17 | pub fn vec_levenshtein_distance<T: PartialEq>(v1: &FastVec<T>, v2: &FastVec<T>) -> usize {
 18 |     let rows = v1.len() + 1;
 19 |     let cols = v2.len() + 1;
 20 | 
 21 |     if rows == 1 {
 22 |         return cols - 1;
 23 |     } else if cols == 1 {
 24 |         return rows - 1;
 25 |     }
 26 | 
 27 |     let mut cur = range_vec(cols);
 28 | 
 29 |     for r in 1..rows {
 30 |         // make a copy of the previous row so we can edit cur
 31 |         let prev = cur.clone();
 32 |         cur = smallvec![0; cols];
 33 |         cur[0] = r;
 34 |         for c in 1..cols {
 35 |             // deletion cost or insertion cost
 36 |             let del_or_ins = cmp::min(prev[c] + 1, cur[c - 1] + 1);
 37 |             let edit = prev[c - 1] + (if v1[r - 1] == v2[c - 1] { 0 } else { 1 });
 38 |             cur[c] = cmp::min(del_or_ins, edit);
 39 |         }
 40 |     }
 41 | 
 42 |     // last element of bottom row
 43 |     cur[cols - 1]
 44 | }
 45 | 
/// Edit distance between two item sequences where, besides insertion,
/// deletion and substitution, a transposition of two items is a candidate
/// edit (the fourth option in the `min` below).
///
/// The table `score` has `(len1 + 2) x (len2 + 2)` cells stored row-major in
/// one flat `Vec`; `idx(i, j)` maps a (row, column) pair to its flat index.
/// Row 0 and column 0 hold the sentinel `infinite`; row 1 and column 1 hold
/// the distances against the empty prefix.
pub fn vec_damerau_levenshtein_distance<T: Eq + std::hash::Hash>(
    v1: &FastVec<T>,
    v2: &FastVec<T>,
) -> usize {
    let len1 = v1.len();
    let len2 = v2.len();
    // larger than any real distance between the two inputs
    let infinite = len1 + len2;

    // item -> 1-based row of v1 in which it was last seen (0 = not seen yet)
    let mut item_position = AHashMap::with_capacity(cmp::max(len1, len2));
    // distance matrix
    // try using a flat array instead of a 2d vec for speed
    let mut score: Vec<usize> = vec![0; (len1 + 2) * (len2 + 2)];
    let idx = |i: usize, j: usize| (len2 + 2) * i + j;
    //let mut score: FastVec<FastVec<usize>> = smallvec![smallvec![0; len2 + 2]; len1 + 2];

    // sentinel border plus the "empty prefix" row and column
    score[0] = infinite;
    for i in 0..=len1 {
        score[idx(i + 1, 0)] = infinite;
        score[idx(i + 1, 1)] = i;
    }
    for i in 0..=len2 {
        score[idx(0, i + 1)] = infinite;
        score[idx(1, i + 1)] = i;
    }

    for i in 1..len1 + 1 {
        // column of the most recent match within the current row (0 = none)
        let mut db = 0;
        for j in 1..len2 + 1 {
            // last row of v1 that held the current item of v2
            let i1 = item_position.entry(&v2[j - 1]).or_insert(0);
            let j1 = db;
            let mut cost = 1;
            if v1[i - 1] == v2[j - 1] {
                cost = 0;
                db = j;
            }

            // min of the four options
            score[idx(i + 1, j + 1)] = cmp::min(
                // substitution & insertion
                cmp::min(score[idx(i, j)] + cost, score[idx(i + 1, j)] + 1),
                cmp::min(
                    // deletion & transposition
                    score[idx(i, j + 1)] + 1,
                    // if i1 or j1 is 0 this reads the `infinite` border
                    score[idx(*i1, j1)] + (i - *i1 - 1) + 1 + (j - j1 - 1),
                ),
            )
        }
        // store the position of this character for transpositions
        item_position.insert(&v1[i - 1], i);
    }

    // bottom-right cell: distance between the complete sequences
    score[idx(len1 + 1, len2 + 1)]
}
 99 | 
100 | pub fn levenshtein_distance(s1: &str, s2: &str) -> usize {
101 |     if s1 == s2 {
102 |         return 0;
103 |     }
104 | 
105 |     let us1 = UnicodeSegmentation::graphemes(s1, true).collect::<FastVec<&str>>();
106 |     let us2 = UnicodeSegmentation::graphemes(s2, true).collect::<FastVec<&str>>();
107 | 
108 |     vec_levenshtein_distance(&us1, &us2)
109 | }
110 | 
111 | pub fn damerau_levenshtein_distance(s1: &str, s2: &str) -> usize {
112 |     if s1 == s2 {
113 |         return 0;
114 |     }
115 | 
116 |     let us1 = UnicodeSegmentation::graphemes(s1, true).collect::<FastVec<&str>>();
117 |     let us2 = UnicodeSegmentation::graphemes(s2, true).collect::<FastVec<&str>>();
118 | 
119 |     vec_damerau_levenshtein_distance(&us1, &us2)
120 | }
121 | 
#[cfg(test)]
mod test {
    use super::{damerau_levenshtein_distance, levenshtein_distance};
    use crate::testutils::testutils::test_distance_func;

    /// Levenshtein against the shared CSV cases.
    #[test]
    fn test_levenshtein() {
        test_distance_func("testdata/levenshtein.csv", levenshtein_distance);
    }

    /// Damerau-Levenshtein against the shared CSV cases.
    #[test]
    fn test_damerau_levenshtein() {
        test_distance_func(
            "testdata/damerau_levenshtein.csv",
            damerau_levenshtein_distance,
        );
    }
}
139 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
 1 | mod common;
 2 | mod hamming;
 3 | mod jaccard;
 4 | mod jaro;
 5 | mod levenshtein;
 6 | mod match_rating;
 7 | mod metaphone;
 8 | mod nysiis;
 9 | mod soundex;
10 | mod testutils;
11 | 
12 | pub use hamming::{hamming_distance, vec_hamming_distance};
13 | pub use jaccard::jaccard_similarity;
14 | pub use jaro::{
15 |     jaro_similarity, jaro_winkler_similarity, jaro_winkler_similarity_longtol, vec_jaro_similarity,
16 |     vec_jaro_winkler_similarity, vec_jaro_winkler_similarity_longtol,
17 | };
18 | pub use levenshtein::{
19 |     damerau_levenshtein_distance, levenshtein_distance, vec_damerau_levenshtein_distance,
20 |     vec_levenshtein_distance,
21 | };
22 | pub use match_rating::{match_rating_codex, match_rating_comparison};
23 | pub use metaphone::metaphone;
24 | pub use nysiis::nysiis;
25 | pub use soundex::soundex;
26 | 
27 | #[cfg(feature = "python")]
28 | mod rustyfish;
29 | #[cfg(feature = "python")]
30 | pub use rustyfish::_rustyfish;
31 | 


--------------------------------------------------------------------------------
/src/match_rating.rs:
--------------------------------------------------------------------------------
  1 | use crate::common::FastVec;
  2 | use std::cmp;
  3 | use unicode_segmentation::UnicodeSegmentation;
  4 | 
  5 | pub fn match_rating_codex(s: &str) -> Result<String, String> {
  6 |     // match rating only really makes sense on strings
  7 | 
  8 |     let s = &s.to_uppercase()[..];
  9 |     let v = UnicodeSegmentation::graphemes(s, true).collect::<FastVec<&str>>();
 10 |     let mut codex = String::new();
 11 |     let mut prev = "~tmp~";
 12 |     let is_alpha = s.chars().all(|c| c.is_alphabetic() || c == ' ');
 13 | 
 14 |     if !is_alpha {
 15 |         return Err(String::from("Strings must only contain alphabetical characters"));
 16 |     }
 17 | 
 18 |     for (i, c) in v.iter().enumerate() {
 19 |         let vowel = *c == "A" || *c == "E" || *c == "I" || *c == "O" || *c == "U";
 20 |         // not a space || starting char & vowel || non-double consonant
 21 |         if *c != " " && (i == 0 && vowel) || (!vowel && *c != prev) {
 22 |             codex.push_str(c);
 23 |         }
 24 |         prev = c;
 25 |     }
 26 | 
 27 |     if codex.len() > 6 {
 28 |         // not safe to take a slice without conversion to chars() since there
 29 |         // can be unicode left, this implementation matches the Python one
 30 |         // even though MRC really shouldn't be used with unicode chars
 31 |         let first_three: String = codex.chars().take(3).collect();
 32 |         let last_three: String = codex.chars().rev().take(3).collect::<String>().chars().rev().collect();
 33 |         return Ok(first_three + &last_three);
 34 |     }
 35 | 
 36 |     Ok(codex)
 37 | }
 38 | 
 39 | pub fn match_rating_comparison(s1: &str, s2: &str) -> Result<bool, String> {
 40 |     let codex1 = match_rating_codex(s1)?;
 41 |     let codex2 = match_rating_codex(s2)?;
 42 | 
 43 |     // need to know which is longer for comparisons later
 44 |     let (longer, shorter) = if codex1.len() > codex2.len() {
 45 |         (codex1, codex2)
 46 |     } else {
 47 |         (codex2, codex1)
 48 |     };
 49 | 
 50 |     let lensum = longer.len() + shorter.len();
 51 | 
 52 |     // can't do a comparison when difference is 3 or greater
 53 |     if longer.len() - shorter.len() >= 3 {
 54 |         return Err(String::from("strings differ in length by more than 2"));
 55 |     }
 56 | 
 57 |     // remove matching characters going forward
 58 |     let mut res1 = FastVec::new();
 59 |     let mut res2 = FastVec::new();
 60 |     let mut iter1 = longer.chars();
 61 |     let mut iter2 = shorter.chars();
 62 |     loop {
 63 |         match (iter1.next(), iter2.next()) {
 64 |             (Some(x), Some(y)) => {
 65 |                 if x != y {
 66 |                     res1.push(x);
 67 |                     res2.push(y)
 68 |                 }
 69 |             }
 70 |             (Some(x), None) => res1.push(x),
 71 |             (None, Some(y)) => res2.push(y),
 72 |             (None, None) => break,
 73 |         };
 74 |     }
 75 | 
 76 |     // count unmatched characters going backwards
 77 |     let mut unmatched_count1 = 0;
 78 |     let mut unmatched_count2 = 0;
 79 |     let mut iter1 = res1.iter().rev();
 80 |     let mut iter2 = res2.iter().rev();
 81 |     loop {
 82 |         match (iter1.next(), iter2.next()) {
 83 |             (Some(x), Some(y)) => {
 84 |                 if x != y {
 85 |                     unmatched_count1 += 1;
 86 |                     unmatched_count2 += 1;
 87 |                 }
 88 |             }
 89 |             (Some(_), None) => unmatched_count1 += 1,
 90 |             (None, Some(_)) => unmatched_count2 += 1,
 91 |             (None, None) => break,
 92 |         };
 93 |     }
 94 | 
 95 |     let score = 6 - cmp::max(unmatched_count1, unmatched_count2);
 96 |     match lensum {
 97 |         0..=4 => Ok(score >= 5),
 98 |         5..=7 => Ok(score >= 4),
 99 |         8..=11 => Ok(score >= 3),
100 |         _ => Ok(score >= 2),
101 |     }
102 | }
103 | 
#[cfg(test)]
mod test {
    use super::{match_rating_codex, match_rating_comparison};
    use crate::testutils::testutils::test_str_func;

    /// Infallible adapter so the codex can be driven by the CSV helper.
    pub fn mrc_unwrapped(s: &str) -> String {
        match_rating_codex(s).unwrap()
    }

    #[test]
    fn test_match_rating() {
        test_str_func("testdata/match_rating_codex.csv", mrc_unwrapped);
    }

    #[test]
    fn test_match_rating_comparison() {
        // TODO: switch to using CSV
        let matching = [
            ("Bryne", "Boern"),
            ("Smith", "Smyth"),
            ("Ed", "Ad"),
            ("Catherine", "Kathryn"),
        ];
        for &(left, right) in matching.iter() {
            assert!(match_rating_comparison(left, right).unwrap());
        }
        assert!(!match_rating_comparison("Michael", "Mike").unwrap());
    }

    #[test]
    fn test_match_rating_comparison_err() {
        // codex lengths too far apart to compare
        assert!(match_rating_comparison("Tim", "Timothy").is_err());
    }

    #[test]
    fn test_match_rating_codex_bad_str() {
        // the apostrophe is not alphabetic
        assert!(match_rating_codex("i’m going home").is_err());
    }
}
139 | 


--------------------------------------------------------------------------------
/src/metaphone.rs:
--------------------------------------------------------------------------------
  1 | use crate::common::FastVec;
  2 | use unicode_normalization::UnicodeNormalization;
  3 | 
  4 | pub fn isvowel(s: char) -> bool {
  5 |     matches!(s, 'A' | 'E' | 'I' | 'O' | 'U')
  6 | }
  7 | 
  8 | fn is_iey(s: char) -> bool {
  9 |     matches!(s, 'I' | 'E' | 'Y')
 10 | }
 11 | 
/// Compute a Metaphone-style phonetic key for `s`.
///
/// The input is upper-cased and decomposed with NFKD, then scanned one `char`
/// at a time. Each arm of the `match` below emits zero or more key characters
/// and may advance `i` past characters it has consumed. Characters without an
/// arm are dropped. An empty input yields an empty key.
pub fn metaphone(s: &str) -> String {
    if s.is_empty() {
        return String::from("");
    }

    let s = &s.to_uppercase()[..];
    let mut v = s.nfkd().collect::<FastVec<char>>();
    let mut ret = FastVec::new();

    // skip first character if s starts with these
    if s.starts_with("KN")
        || s.starts_with("GN")
        || s.starts_with("PN")
        || s.starts_with("WR")
        || s.starts_with("AE")
    {
        v.remove(0);
    }

    let mut i = 0;

    while i < v.len() {
        let c = v[i];
        // '*' stands in for "past the end of the input"
        let next = if i + 1 < v.len() { v[i + 1] } else { '*' };
        let nextnext = if i + 2 < v.len() { v[i + 2] } else { '*' };

        // skip doubles except for CC
        if c == next && c != 'C' {
            i += 1;
            continue;
        }

        match c {
            // vowels are kept only at the start of the input or after a space
            'A' | 'E' | 'I' | 'O' | 'U' => {
                if i == 0 || v[i - 1] == ' ' {
                    ret.push(c);
                }
            }
            // B is dropped only when it follows M at the very end
            'B' => {
                if (i == 0 || v[i - 1] != 'M') || next != '*' {
                    ret.push('B');
                }
            }
            // CIA / CH -> X, C before I/E/Y -> S, otherwise K
            'C' => {
                if next == 'I' && nextnext == 'A' || next == 'H' {
                    i += 1;
                    ret.push('X');
                } else if is_iey(next) {
                    i += 1;
                    ret.push('S');
                } else {
                    ret.push('K');
                }
            }
            // DG before I/E/Y -> J (consuming all three), otherwise T
            'D' => {
                if next == 'G' && is_iey(nextnext) {
                    i += 2;
                    ret.push('J');
                } else {
                    ret.push('T');
                }
            }
            // these letters map to themselves
            'F' | 'J' | 'L' | 'M' | 'N' | 'R' => {
                ret.push(c);
            }
            // G before I/E/Y -> J; GH before a non-vowel and a final GN emit
            // nothing; otherwise K
            'G' => {
                if is_iey(next) {
                    ret.push('J');
                } else if (next == 'H' && nextnext != '*' && !isvowel(nextnext))
                    || (next == 'N' && nextnext == '*')
                {
                    i += 1;
                } else {
                    ret.push('K');
                }
            }
            // H is dropped only when it follows a vowel and is not itself
            // followed by a vowel
            'H' => {
                if i == 0 || isvowel(next) || !isvowel(v[i - 1]) {
                    ret.push('H');
                }
            }
            // K after C is dropped (the C already produced the sound)
            'K' => {
                if i == 0 || v[i - 1] != 'C' {
                    ret.push('K');
                }
            }
            // PH -> F, otherwise P
            'P' => {
                if next == 'H' {
                    i += 1;
                    ret.push('F');
                } else {
                    ret.push('P');
                }
            }
            'Q' => {
                ret.push('K');
            }
            // SH, SIO, SIA -> X (consuming the matched letters), otherwise S
            'S' => {
                if next == 'H' {
                    i += 1;
                    ret.push('X');
                } else if next == 'I' && (nextnext == 'O' || nextnext == 'A') {
                    i += 2;
                    ret.push('X');
                } else {
                    ret.push('S');
                }
            }
            // TIO / TIA -> X, TH -> '0', T before CH emits nothing,
            // otherwise T
            'T' => {
                if next == 'I' && (nextnext == 'O' || nextnext == 'A') {
                    ret.push('X');
                } else if next == 'H' {
                    i += 1;
                    ret.push('0');
                } else if next != 'C' || nextnext != 'H' {
                    ret.push('T');
                }
            }
            'V' => {
                ret.push('F');
            }
            // leading WH -> W; otherwise W is kept only before a vowel
            'W' => {
                if i == 0 && next == 'H' {
                    i += 1;
                    ret.push('W');
                } else if isvowel(next) {
                    ret.push('W');
                }
            }
            // leading X -> X before H / IO / IA, else S; elsewhere X -> KS
            'X' => {
                if i == 0 {
                    if next == 'H' || (next == 'I' && (nextnext == 'O' || nextnext == 'A')) {
                        ret.push('X');
                    } else {
                        ret.push('S');
                    }
                } else {
                    ret.push('K');
                    ret.push('S');
                }
            }
            // Y is kept only before a vowel
            'Y' => {
                if isvowel(next) {
                    ret.push('Y');
                }
            }
            'Z' => {
                ret.push('S');
            }
            // keep a space only after some output, and never two in a row
            ' ' => {
                if !ret.is_empty() && ret[ret.len() - 1] != ' ' {
                    ret.push(' ');
                }
            }
            // anything else contributes nothing to the key
            _ => {}
        };
        i += 1;
    }

    // assemble the collected characters into the returned key
    let mut str_key = String::new();
    for k in ret {
        str_key.push(k);
    }

    str_key
}
178 | 
#[cfg(test)]
mod test {
    use super::metaphone;
    use crate::testutils::testutils::test_str_func;

    /// Runs `metaphone` against the cases in the shared CSV file.
    #[test]
    fn test_metaphone() {
        test_str_func("testdata/metaphone.csv", metaphone);
    }
}
188 | 


--------------------------------------------------------------------------------
/src/nysiis.rs:
--------------------------------------------------------------------------------
  1 | use crate::common::FastVec;
  2 | use smallvec::{smallvec, SmallVec};
  3 | use unicode_segmentation::UnicodeSegmentation;
  4 | 
  5 | fn isvowel(s: &str) -> bool {
  6 |     matches!(s, "A" | "E" | "I" | "O" | "U")
  7 | }
  8 | 
  9 | pub fn nysiis(s: &str) -> String {
 10 |     if s.is_empty() {
 11 |         return String::from("");
 12 |     }
 13 | 
 14 |     let s = &s.to_uppercase()[..];
 15 |     let mut v = UnicodeSegmentation::graphemes(s, true).collect::<FastVec<&str>>();
 16 | 
 17 |     // step 1: handle prefixes
 18 |     if s.starts_with("MAC") {
 19 |         v[1] = "C"; // switch MAC to MCC
 20 |     } else if s.starts_with("KN") {
 21 |         v.remove(0); // strip leading K from KN
 22 |     } else if s.starts_with('K') {
 23 |         v[0] = "C"; // switch K to C
 24 |     } else if s.starts_with("PH") || s.starts_with("PF") {
 25 |         v[0] = "F";
 26 |         v[1] = "F"; // switch these to FF
 27 |     } else if s.starts_with("SCH") {
 28 |         v[1] = "S";
 29 |         v[2] = "S"; // switch SCH to SSS
 30 |     }
 31 | 
 32 |     // step 2: suffixes
 33 |     if s.ends_with("IE") || s.ends_with("EE") {
 34 |         v.pop();
 35 |         v.pop();
 36 |         v.push("Y");
 37 |     } else if s.ends_with("DT")
 38 |         || s.ends_with("RT")
 39 |         || s.ends_with("RD")
 40 |         || s.ends_with("NT")
 41 |         || s.ends_with("ND")
 42 |     {
 43 |         v.pop();
 44 |         v.pop();
 45 |         v.push("D");
 46 |     }
 47 | 
 48 |     // step 3: key starts with first character of name
 49 |     let mut key = FastVec::new();
 50 |     key.push(v[0]);
 51 | 
 52 |     // step 4: translate remaining characters
 53 |     let mut i = 1;
 54 | 
 55 |     while i < v.len() {
 56 |         let chars: SmallVec<[&str; 3]> = match v[i] {
 57 |             "E" if i + 1 < v.len() && v[i + 1] == "V" => {
 58 |                 i += 1;
 59 |                 smallvec!["A", "F"]
 60 |             }
 61 |             "A" | "E" | "I" | "O" | "U" => smallvec!["A"],
 62 |             "Q" => smallvec!["G"],
 63 |             "Z" => smallvec!["S"],
 64 |             "M" => smallvec!["N"],
 65 |             "K" => {
 66 |                 if i + 1 < v.len() && v[i + 1] == "N" {
 67 |                     smallvec!["N"]
 68 |                 } else {
 69 |                     smallvec!["C"]
 70 |                 }
 71 |             }
 72 |             "S" if i + 2 < v.len() && v[i + 1] == "C" && v[i + 2] == "H" => {
 73 |                 i += 2;
 74 |                 smallvec!["S", "S"]
 75 |             }
 76 |             "P" if i + 1 < v.len() && v[i + 1] == "H" => {
 77 |                 i += 1;
 78 |                 smallvec!["F"]
 79 |             }
 80 |             "H" if !isvowel(v[i - 1])
 81 |                 || (i + 1 < v.len() && !isvowel(v[i + 1]))
 82 |                 || (i + 1 == v.len()) =>
 83 |             {
 84 |                 if isvowel(v[i - 1]) {
 85 |                     smallvec!["A"]
 86 |                 } else {
 87 |                     smallvec![v[i - 1]]
 88 |                 }
 89 |             }
 90 |             "W" if isvowel(v[i - 1]) => smallvec![v[i - 1]],
 91 |             _ => smallvec![v[i]],
 92 |         };
 93 | 
 94 |         if !chars.is_empty() && chars[chars.len() - 1] != key[key.len() - 1] {
 95 |             for c in chars {
 96 |                 key.push(c);
 97 |             }
 98 |         }
 99 | 
100 |         i += 1;
101 |     }
102 | 
103 |     // step 5 remove trailing S
104 |     if key[key.len() - 1] == "S" && key.len() > 1 {
105 |         key.pop();
106 |     }
107 | 
108 |     // step 6 replace AY w/ Y
109 |     if key.ends_with(&["A", "Y"]) {
110 |         key.remove(key.len() - 2);
111 |     }
112 | 
113 |     // step 7 remove trailing A
114 |     if key[key.len() - 1] == "A" && key.len() > 1 {
115 |         key.pop();
116 |     }
117 | 
118 |     let mut str_key = String::new();
119 |     for k in key {
120 |         str_key.push_str(k);
121 |     }
122 | 
123 |     str_key
124 | }
125 | 
#[cfg(test)]
mod test {
    use super::nysiis;
    use crate::testutils::testutils::test_str_func;

    /// Checks `nysiis` against every input/expected pair in `testdata/nysiis.csv`.
    #[test]
    fn test_nysiis() {
        test_str_func("testdata/nysiis.csv", nysiis);
    }
}
135 | 


--------------------------------------------------------------------------------
/src/rustyfish.rs:
--------------------------------------------------------------------------------
  1 | use crate::damerau_levenshtein_distance as _damerau;
  2 | use crate::hamming_distance as _hamming;
  3 | use crate::jaccard_similarity as _jaccard;
  4 | use crate::jaro_similarity as _jaro;
  5 | use crate::jaro_winkler_similarity as _jaro_winkler;
  6 | use crate::jaro_winkler_similarity_longtol as _jaro_winkler_long;
  7 | use crate::levenshtein_distance as _lev;
  8 | use crate::match_rating_codex as _mr_codex;
  9 | use crate::match_rating_comparison as _mr_comparison;
 10 | use crate::metaphone as _metaphone;
 11 | use crate::nysiis as _nysiis;
 12 | use crate::soundex as _soundex;
 13 | use pyo3::exceptions::PyValueError;
 14 | use pyo3::prelude::*;
 15 | 
 16 | /// Calculates the Damerau-Levenshtein distance between two strings.
 17 | #[pyfunction]
 18 | fn damerau_levenshtein_distance(a: &str, b: &str) -> PyResult<usize> {
 19 |     Ok(_damerau(a, b))
 20 | }
 21 | 
 22 | // Calculates the Hamming distance between two strings.
 23 | #[pyfunction]
 24 | fn hamming_distance(a: &str, b: &str) -> PyResult<usize> {
 25 |     Ok(_hamming(a, b))
 26 | }
 27 | 
 28 | // Calculates the Jaccard index between two strings.
 29 | #[pyfunction]
 30 | #[pyo3(signature=(a, b, ngram_size=None))]
 31 | fn jaccard_similarity(a: &str, b: &str, ngram_size: Option<usize>) -> PyResult<f64> {
 32 |     Ok(_jaccard(a, b, ngram_size))
 33 | }
 34 | 
 35 | // Calculates the Jaro similarity between two strings.
 36 | #[pyfunction]
 37 | fn jaro_similarity(a: &str, b: &str) -> PyResult<f64> {
 38 |     Ok(_jaro(a, b))
 39 | }
 40 | 
 41 | // Calculates the Jaro-Winkler similarity between two strings.
 42 | #[pyfunction]
 43 | #[pyo3(signature=(a, b, long_tolerance=None))]
 44 | fn jaro_winkler_similarity(a: &str, b: &str, long_tolerance: Option<bool>) -> PyResult<f64> {
 45 |     match long_tolerance {
 46 |         Some(true) => Ok(_jaro_winkler_long(a, b)),
 47 |         _ => Ok(_jaro_winkler(a, b)),
 48 |     }
 49 | }
 50 | 
 51 | // Calculates the Levenshtein distance between two strings.
 52 | #[pyfunction]
 53 | fn levenshtein_distance(a: &str, b: &str) -> PyResult<usize> {
 54 |     Ok(_lev(a, b))
 55 | }
 56 | 
 57 | // Calculates the Match Rating Approach code for a string.
 58 | #[pyfunction]
 59 | fn match_rating_codex(a: &str) -> PyResult<String> {
 60 |     // convert to ValueError
 61 |     _mr_codex(a).map_err(|e| PyErr::new::<PyValueError, _>(format!("{}", e)))
 62 | }
 63 | 
 64 | // Calculates the Match Rating Approach comparison for two strings.
 65 | #[pyfunction]
 66 | fn match_rating_comparison(a: &str, b: &str) -> Option<bool> {
 67 |     match _mr_comparison(a, b) {
 68 |         Ok(value) => Some(value),
 69 |         Err(_) => None,
 70 |     }
 71 | }
 72 | 
 73 | /// Calculates the NYSIIS phonetic encoding of a string.
 74 | #[pyfunction]
 75 | fn nysiis(a: &str) -> PyResult<String> {
 76 |     Ok(_nysiis(a))
 77 | }
 78 | 
 79 | /// Calculates the phonetic encoding of a string using the Soundex algorithm.
 80 | #[pyfunction]
 81 | fn soundex(a: &str) -> PyResult<String> {
 82 |     Ok(_soundex(a))
 83 | }
 84 | 
 85 | /// Calculates the phonetic encoding of a string using the Metaphone algorithm.
 86 | #[pyfunction]
 87 | fn metaphone(a: &str) -> PyResult<String> {
 88 |     Ok(_metaphone(a))
 89 | }
 90 | 
/// A Python module implemented in Rust.
///
/// Registers every `#[pyfunction]` wrapper defined in this file on the
/// module object `m`. Any registration failure is propagated to the caller.
#[pymodule]
pub fn _rustyfish(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
    // Edit distances and similarity measures.
    m.add_function(wrap_pyfunction!(damerau_levenshtein_distance, m)?)?;
    m.add_function(wrap_pyfunction!(hamming_distance, m)?)?;
    m.add_function(wrap_pyfunction!(jaccard_similarity, m)?)?;
    m.add_function(wrap_pyfunction!(jaro_similarity, m)?)?;
    m.add_function(wrap_pyfunction!(jaro_winkler_similarity, m)?)?;
    m.add_function(wrap_pyfunction!(levenshtein_distance, m)?)?;
    // Match Rating Approach helpers.
    m.add_function(wrap_pyfunction!(match_rating_codex, m)?)?;
    m.add_function(wrap_pyfunction!(match_rating_comparison, m)?)?;
    // Phonetic encodings.
    m.add_function(wrap_pyfunction!(nysiis, m)?)?;
    m.add_function(wrap_pyfunction!(soundex, m)?)?;
    m.add_function(wrap_pyfunction!(metaphone, m)?)?;

    Ok(())
}
108 | 


--------------------------------------------------------------------------------
/src/soundex.rs:
--------------------------------------------------------------------------------
 1 | use crate::common::FastVec;
 2 | use unicode_normalization::UnicodeNormalization;
 3 | 
 4 | pub fn soundex(s: &str) -> String {
 5 |     if s.is_empty() {
 6 |         return String::from("");
 7 |     }
 8 | 
 9 |     let v = &s.to_uppercase().nfkd().collect::<FastVec<char>>();
10 | 
11 |     let mut result = FastVec::new();
12 |     result.push(v[0]);
13 | 
14 |     let replacement = |ch| match ch {
15 |         'B' | 'F' | 'P' | 'V' => '1',
16 |         'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => '2',
17 |         'D' | 'T' => '3',
18 |         'L' => '4',
19 |         'M' | 'N' => '5',
20 |         'R' => '6',
21 |         _ => '*',
22 |     };
23 | 
24 |     // find would be replacement for first character
25 |     let mut last = replacement(v[0]);
26 | 
27 |     // loop over remaining letters
28 |     for letter in v.iter().skip(1) {
29 |         let sub = replacement(*letter);
30 |         if sub != '*' {
31 |             if sub != last {
32 |                 result.push(sub);
33 |                 if result.len() == 4 {
34 |                     break;
35 |                 }
36 |             }
37 |             last = sub;
38 |         } else if *letter != 'H' && *letter != 'W' {
39 |             last = '*';
40 |         }
41 |     }
42 | 
43 |     while result.len() < 4 {
44 |         result.push('0');
45 |     }
46 |     let mut str_key = String::new();
47 |     for k in result {
48 |         str_key.push(k);
49 |     }
50 |     str_key
51 | }
52 | 
#[cfg(test)]
mod test {
    use super::soundex;
    use crate::testutils::testutils::test_str_func;

    /// Checks `soundex` against every input/expected pair in `testdata/soundex.csv`.
    #[test]
    fn test_soundex() {
        test_str_func("testdata/soundex.csv", soundex);
    }
}
62 | 


--------------------------------------------------------------------------------
/src/testutils.rs:
--------------------------------------------------------------------------------
 1 | #[cfg(test)]
 2 | pub mod testutils {
 3 |     use csv;
 4 |     use num_traits::{Float, FromPrimitive};
 5 | 
 6 |     fn test_generic_func<T, F>(filename: &str, func: F)
 7 |     where
 8 |         F: Fn(&str, &str, Option<usize>) -> T, // Signature for functions with ngram_size
 9 |         T: PartialEq + std::fmt::Debug + std::str::FromStr + Float + FromPrimitive,
10 |         <T as std::str::FromStr>::Err: std::fmt::Debug,
11 |     {
12 |         let mut reader = csv::ReaderBuilder::new()
13 |             .has_headers(false)
14 |             .from_path(filename)
15 |             .unwrap();
16 |         let mut num_tested = 0;
17 |         for result in reader.records() {
18 |             let rec = result.unwrap();
19 |             let input1 = &rec[0];
20 |             let input2 = &rec[1];
21 |             let ngram_size = rec.get(3).and_then(|s| s.parse().ok());
22 | 
23 |             let expected: T = rec[2].parse().expect("Failed to parse expected value");
24 |             let output = func(input1, input2, ngram_size);
25 | 
26 |             let abs_diff = (output.to_f64().unwrap() - expected.to_f64().unwrap()).abs();
27 |             assert!(
28 |                 abs_diff < 0.001,
29 |                 "comparing {} to {} (ngram_size: {:?}), expected {:?}, got {:?} (diff {:?})",
30 |                 input1,
31 |                 input2,
32 |                 ngram_size,
33 |                 expected,
34 |                 output,
35 |                 abs_diff
36 |             );
37 | 
38 |             num_tested += 1;
39 |         }
40 |         assert!(num_tested > 0);
41 |     }
42 | 
43 |     pub fn test_distance_func(filename: &str, func: fn(&str, &str) -> usize) {
44 |         let mut reader = csv::ReaderBuilder::new()
45 |             .has_headers(false)
46 |             .from_path(filename)
47 |             .unwrap();
48 |         let mut num_tested = 0;
49 |         for result in reader.records() {
50 |             let rec = result.unwrap();
51 |             let input1 = &rec[0];
52 |             let input2 = &rec[1];
53 |             let expected: usize = rec[2].parse().expect("Failed to parse expected value");
54 |             let output = func(input1, input2);
55 | 
56 |             println!(
57 |                 "comparing {} to {}, expecting {:?}, got {:?}",
58 |                 input1, input2, expected, output
59 |             );
60 |             assert_eq!(output, expected);
61 |             num_tested += 1;
62 |         }
63 |         assert!(num_tested > 0);
64 |     }
65 | 
66 |     // For functions with two string arguments
67 |     pub fn test_similarity_func_two_args(filename: &str, func: fn(&str, &str) -> f64) {
68 |         test_generic_func::<f64, _>(filename, |a, b, _| func(a, b));
69 |     }
70 | 
71 |     // For functions with three arguments (including the optional usize)
72 |     pub fn test_similarity_func_three_args(filename: &str, func: fn(&str, &str, Option<usize>) -> f64) {
73 |         test_generic_func::<f64, _>(filename, |a, b, n| func(a, b, n));
74 |     }
75 | 
76 |     pub fn test_str_func(filename: &str, func: fn(&str) -> String) {
77 |         let mut reader = csv::ReaderBuilder::new()
78 |             .has_headers(false)
79 |             .from_path(filename)
80 |             .unwrap();
81 |         let mut num_tested = 0;
82 |         for result in reader.records() {
83 |             let rec = result.unwrap();
84 |             let input1 = &rec[0];
85 |             let expected = rec[1].to_string();
86 | 
87 |             let output = func(input1);
88 | 
89 |             println!(
90 |                 "comparing {}, expecting {:?}, got {:?}",
91 |                 input1, expected, output
92 |             );
93 |             assert_eq!(output, expected);
94 |             num_tested += 1;
95 |         }
96 |         assert!(num_tested > 0);
97 |     }
98 | }
99 | 


--------------------------------------------------------------------------------
/testdata/README.md:
--------------------------------------------------------------------------------
1 | Test data for jellyfish string comparison and phonetic encoding algorithms.
2 | 


--------------------------------------------------------------------------------
/testdata/damerau_levenshtein.csv:
--------------------------------------------------------------------------------
 1 | ,,0
 2 | abc,,3
 3 | bc,abc,1
 4 | fuor,four,1
 5 | abcd,acb,2
 6 | cape sand recycling ,edith ann graham,17
 7 | jellyifhs,jellyfish,2
 8 | ifhs,fish,2
 9 | "Hello, world!","Hello,Â world!",2
10 | 


--------------------------------------------------------------------------------
/testdata/hamming.csv:
--------------------------------------------------------------------------------
1 | ,,0
2 | ,abc,3
3 | abc,abc,0
4 | acc,abc,1
5 | abcd,abc,1
6 | abc,abcd,1
7 | testing,this is a test,13
8 | Saturday,Sunday,7
9 | 


--------------------------------------------------------------------------------
/testdata/jaccard.csv:
--------------------------------------------------------------------------------
 1 | abc,xyz,0.0,
 2 | abc,abc,1.0,
 3 | abc,abcd,0.0,
 4 | abcd,abce,0.0,
 5 | abcd,abcde,0.0,
 6 | french,quebec,0.0,
 7 | france,quebec,0.0,
 8 | france,france,1.0,
 9 | The quick brown fox jumps over the lazy dog,The quick brown fox jumps over the lazy cat,0.8,
10 | The quick brown fox jumps over the lazy dog,The slow green turtle crawls under the lazy cat,0.2,
11 | John Smith,Smith; John,0.33333,
12 | John Smith,Smith John,1.0,
13 | John Smith,John Jacob Smith,0.666667,
14 | night,nacht,0.0,
15 | night,nacht,0.2,2
16 | night,nacht,0.33333,3


--------------------------------------------------------------------------------
/testdata/jaro_distance.csv:
--------------------------------------------------------------------------------
1 | dixon,dicksonx,0.767
2 | martha,marhta,0.944
3 | dwayne,duane,0.822
4 | 0ð00,0ð00,1
5 | "Sint-Pietersplein 6, 9000 Gent","Test 10, 1010 Brussel",0.518
6 | 


--------------------------------------------------------------------------------
/testdata/jaro_winkler.csv:
--------------------------------------------------------------------------------
 1 | dixon,dicksonx,0.813
 2 | martha,marhta,0.961
 3 | dwayne,duane,0.84
 4 | William,Williams,0.975
 5 | ,foo,0
 6 | a,a,1
 7 | abc,xyz,0
 8 | aaaa,aaaaa,0.96
 9 | orangutan-kumquat,orangutan kumquat,0.976
10 | jaz,jal,0.822
11 | @,@@,0.85
12 | 0,0@,0.85
13 | a,ab,0.85
14 | 012345,0123456,0.971
15 | 012abc,012abcd,0.971
16 | 012abc,013abcd,0.879
17 | a1bc,a1be,0.883
18 | 


--------------------------------------------------------------------------------
/testdata/jaro_winkler_longtol.csv:
--------------------------------------------------------------------------------
 1 | dixon,dicksonx,0.830
 2 | martha,marhta,0.971
 3 | dwayne,duane,0.869
 4 | William,Williams,0.980
 5 | ,foo,0
 6 | a,a,1
 7 | abc,xyz,0
 8 | aaaa,aaaaa,0.96
 9 | orangutan-kumquat,orangutan kumquat,0.986
10 | 1abcdefg,1abcdefh,0.96
11 | 


--------------------------------------------------------------------------------
/testdata/levenshtein.csv:
--------------------------------------------------------------------------------
1 | ,,0
2 | abc,,3
3 | ,abc,3
4 | bc,abc,1
5 | kitten,sitting,3
6 | Saturday,Sunday,3
7 | 


--------------------------------------------------------------------------------
/testdata/match_rating_codex.csv:
--------------------------------------------------------------------------------
 1 | Byrne,BYRN
 2 | Boern,BRN
 3 | Smith,SMTH
 4 | Smyth,SMYTH
 5 | Catherine,CTHRN
 6 | Kathryn,KTHRYN
 7 | Kathrynoglin,KTHGLN
 8 | Ad,AD
 9 | Ed,ED
10 | William,WLM
11 | ä,Ä
12 | Frédéric,FRÉÉRC
13 | 


--------------------------------------------------------------------------------
/testdata/match_rating_comparison.csv:
--------------------------------------------------------------------------------
1 | Bryne,Boern,True
2 | Smith,Smyth,True
3 | Catherine,Kathryn,True
4 | Michael,Mike,False
5 | Tim,Timothy,None
6 | Ed,Ad,True
7 | Marie Helene,Maria Rio,True
8 | 


--------------------------------------------------------------------------------
/testdata/metaphone.csv:
--------------------------------------------------------------------------------
 1 | DGIB,JB
 2 | metaphone,MTFN
 3 | wHErE,WR
 4 | shell,XL
 5 | this is a difficult string,0S IS A TFKLT STRNK
 6 | aeromancy,ERMNS
 7 | Antidisestablishmentarianism,ANTTSSTBLXMNTRNSM
 8 | sunlight labs,SNLT LBS
 9 | sonlite laabz,SNLT LBS
10 | Çáŕẗéř,KRTR
11 | kentucky,KNTK
12 | KENTUCKY,KNTK
13 | NXNXNX,NKSNKSNKS
14 | Aapti,PT
15 | Aarti,RT
16 | CIAB,XB
17 | NQ,NK
18 | sian,XN
19 | gek,JK
20 | Hb,HB
21 | Bho,BH
22 | Tiavyi,XFY
23 | Xhot,XHT
24 | Xnot,SNT
25 | g,K
26 | 8 queens,KNS
27 | Utah,UT
28 | WH,W
29 | walt,WLT
30 | ANDREW,ANTR
31 | why,W
32 | whynot,WNT
33 | acceptingness,AKSPTNKNS
34 | 


--------------------------------------------------------------------------------
/testdata/nysiis.csv:
--------------------------------------------------------------------------------
 1 | Worthy,WARTY
 2 | Ogata,OGAT
 3 | montgomery,MANTGANARY
 4 | Costales,CASTAL
 5 | Tu,T
 6 | martincevic,MARTANCAFAC
 7 | Catherine,CATARAN
 8 | Katherine,CATARAN
 9 | Katerina,CATARAN
10 | Johnathan,JANATAN
11 | Jonathan,JANATAN
12 | John,JAN
13 | Teresa,TARAS
14 | Theresa,TARAS
15 | Jessica,JASAC
16 | Joshua,JAS
17 | Bosch,BAS
18 | Lapher,LAFAR
19 | wiyh,WY
20 | MacArthur,MCARTAR
21 | Pheenard,FANAD
22 | Schmittie,SNATY
23 | Knaqze,NAGS
24 | Knokno,NAN
25 | Knoko,NAC
26 | Macaw,MC
27 | ,
28 | T,T
29 | S,S
30 | P,P
31 | K,C
32 | M,M
33 | E,E
34 | PFEISTER,FASTAR
35 | SARAH,SAR
36 | ç,Ç


--------------------------------------------------------------------------------
/testdata/soundex.csv:
--------------------------------------------------------------------------------
 1 | Washington,W252
 2 | Lee,L000
 3 | Gutierrez,G362
 4 | Pfister,P236
 5 | Jackson,J250
 6 | Tymczak,T522
 7 | ,
 8 | A,A000
 9 | Çáŕẗéř,C636
10 | Ashcroft,A261
11 | ¿,¿000


--------------------------------------------------------------------------------
/testdata/wagner_fischer.csv:
--------------------------------------------------------------------------------
1 | ,,0,
2 | abc,,3,~~~
3 | ,abc,3,~~~
4 | bc,abc,1,~bc
5 | kitten,sitting,3,~itt~n~
6 | Saturday,Sunday,3,s~~u~day
7 | 


--------------------------------------------------------------------------------
/tests/test_jellyfish.py:
--------------------------------------------------------------------------------
  1 | import csv
  2 | import pytest
  3 | 
# Keyword arguments passed to open() when reading the test-data CSV files.
open_kwargs = {"encoding": "utf8"}
  5 | 
  6 | 
  7 | def assertAlmostEqual(a, b, places=3):
  8 |     assert abs(a - b) < (0.1**places)
  9 | 
 10 | 
# Names of the backends every test is run against (see the `jf` fixture).
implementations = ["python", "rust"]
 12 | 
 13 | 
 14 | @pytest.fixture(params=implementations)
 15 | def jf(request):
 16 |     if request.param == "python":
 17 |         import jellyfish._jellyfish as jf
 18 |     elif request.param == "rust":
 19 |         from jellyfish import _rustyfish as jf
 20 |     return jf
 21 | 
 22 | 
 23 | def _load_data(name):
 24 |     with open("testdata/{}.csv".format(name), **open_kwargs) as f:
 25 |         yield from csv.reader(f)
 26 | 
 27 | 
 28 | @pytest.mark.parametrize("s1,s2,value", _load_data("jaro_winkler"), ids=str)
 29 | def test_jaro_winkler_similarity(jf, s1, s2, value):
 30 |     value = float(value)
 31 |     assertAlmostEqual(jf.jaro_winkler_similarity(s1, s2), value, places=3)
 32 | 
 33 | 
 34 | @pytest.mark.parametrize("s1,s2,value", _load_data("jaro_winkler_longtol"), ids=str)
 35 | def test_jaro_winkler_similarity_longtol(jf, s1, s2, value):
 36 |     value = float(value)
 37 |     assertAlmostEqual(jf.jaro_winkler_similarity(s1, s2, True), value, places=3)
 38 | 
 39 | 
 40 | @pytest.mark.parametrize("s1,s2,value", _load_data("jaro_distance"), ids=str)
 41 | def test_jaro_similarity(jf, s1, s2, value):
 42 |     value = float(value)
 43 |     assertAlmostEqual(jf.jaro_similarity(s1, s2), value, places=3)
 44 | 
 45 | 
 46 | @pytest.mark.parametrize("s1,s2,value", _load_data("hamming"), ids=str)
 47 | def test_hamming_distance(jf, s1, s2, value):
 48 |     value = int(value)
 49 |     assert jf.hamming_distance(s1, s2) == value
 50 | 
 51 | 
 52 | @pytest.mark.parametrize("s1,s2,value", _load_data("levenshtein"), ids=str)
 53 | def test_levenshtein_distance(jf, s1, s2, value):
 54 |     value = int(value)
 55 |     assert jf.levenshtein_distance(s1, s2) == value
 56 | 
 57 | 
 58 | @pytest.mark.parametrize("s1,s2,value", _load_data("damerau_levenshtein"), ids=str)
 59 | def test_damerau_levenshtein_distance(jf, s1, s2, value):
 60 |     value = int(value)
 61 |     assert jf.damerau_levenshtein_distance(s1, s2) == value
 62 | 
 63 | 
 64 | @pytest.mark.parametrize("s1,code", _load_data("soundex"), ids=str)
 65 | def test_soundex(jf, s1, code):
 66 |     assert jf.soundex(s1) == code
 67 | 
 68 | 
 69 | @pytest.mark.parametrize("s1,code", _load_data("metaphone"), ids=str)
 70 | def test_metaphone(jf, s1, code):
 71 |     assert jf.metaphone(s1) == code
 72 | 
 73 | 
 74 | @pytest.mark.parametrize("s1,s2", _load_data("nysiis"), ids=str)
 75 | def test_nysiis(jf, s1, s2):
 76 |     assert jf.nysiis(s1) == s2
 77 | 
 78 | 
 79 | @pytest.mark.parametrize("s1,s2", _load_data("match_rating_codex"), ids=str)
 80 | def test_match_rating_codex(jf, s1, s2):
 81 |     assert jf.match_rating_codex(s1) == s2
 82 | 
 83 | 
 84 | @pytest.mark.parametrize("s1,s2,value", _load_data("match_rating_comparison"), ids=str)
 85 | def test_match_rating_comparison(jf, s1, s2, value):
 86 |     value = {"True": True, "False": False, "None": None}[value]
 87 |     assert jf.match_rating_comparison(s1, s2) is value
 88 | 
 89 | 
 90 | def test_jaro_winkler_long_tolerance(jf):
 91 |     no_lt = jf.jaro_winkler_similarity(
 92 |         "two long strings", "two long stringz", long_tolerance=False
 93 |     )
 94 |     with_lt = jf.jaro_winkler_similarity(
 95 |         "two long strings", "two long stringz", long_tolerance=True
 96 |     )
 97 |     # make sure long_tolerance does something
 98 |     assertAlmostEqual(no_lt, 0.975)
 99 |     assertAlmostEqual(with_lt, 0.984)
100 | 
101 | 
def test_damerau_levenshtein_distance_type(jf):
    """``str`` input is accepted; ``bytes`` input raises ``TypeError``."""
    # Identical strings are at distance zero; checked for parity with the
    # other *_type tests, which all assert the result of the valid call.
    assert jf.damerau_levenshtein_distance("abc", "abc") == 0
    with pytest.raises(TypeError):
        jf.damerau_levenshtein_distance(b"abc", b"abc")
106 | 
107 | 
def test_levenshtein_distance_type(jf):
    """``str`` input is accepted; ``bytes`` input raises ``TypeError``."""
    assert jf.levenshtein_distance("abc", "abc") == 0
    with pytest.raises(TypeError):
        jf.levenshtein_distance(b"abc", b"abc")
112 | 
113 | 
def test_jaro_similarity_type(jf):
    """``str`` input is accepted; ``bytes`` input raises ``TypeError``."""
    assert jf.jaro_similarity("abc", "abc") == 1
    with pytest.raises(TypeError):
        jf.jaro_similarity(b"abc", b"abc")
118 | 
119 | 
def test_jaro_winkler_type(jf):
    """``str`` input is accepted; ``bytes`` input raises ``TypeError``."""
    assert jf.jaro_winkler_similarity("abc", "abc") == 1
    with pytest.raises(TypeError):
        jf.jaro_winkler_similarity(b"abc", b"abc")
124 | 
125 | 
def test_mra_comparison_type(jf):
    """``str`` input is accepted; ``bytes`` input raises ``TypeError``."""
    assert jf.match_rating_comparison("abc", "abc") is True
    with pytest.raises(TypeError):
        jf.match_rating_comparison(b"abc", b"abc")
130 | 
131 | 
def test_hamming_type(jf):
    """``str`` input is accepted; ``bytes`` input raises ``TypeError``."""
    assert jf.hamming_distance("abc", "abc") == 0
    with pytest.raises(TypeError):
        jf.hamming_distance(b"abc", b"abc")
136 | 
137 | 
def test_soundex_type(jf):
    """``str`` input is accepted; ``bytes`` input raises ``TypeError``."""
    assert jf.soundex("ABC") == "A120"
    with pytest.raises(TypeError):
        jf.soundex(b"ABC")
142 | 
143 | 
def test_metaphone_type(jf):
    """``str`` input is accepted; ``bytes`` input raises ``TypeError``."""
    assert jf.metaphone("abc") == "ABK"
    with pytest.raises(TypeError):
        jf.metaphone(b"abc")
148 | 
149 | 
def test_nysiis_type(jf):
    """``str`` input is accepted; ``bytes`` input raises ``TypeError``."""
    assert jf.nysiis("abc") == "ABC"
    with pytest.raises(TypeError):
        jf.nysiis(b"abc")
154 | 
155 | 
def test_mr_codex_type(jf):
    """``str`` input is accepted; ``bytes`` input raises ``TypeError``."""
    assert jf.match_rating_codex("abc") == "ABC"
    with pytest.raises(TypeError):
        jf.match_rating_codex(b"abc")
160 | 
161 | 
def test_mr_codex_bad_string(jf):
    """``match_rating_codex`` raises ``ValueError`` for the input ``"i’m"``."""
    # Only the raising call belongs inside the context manager: anything
    # placed after it would never execute once the exception is raised.
    with pytest.raises(ValueError):
        jf.match_rating_codex("i’m")
166 | 


--------------------------------------------------------------------------------