├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── _config.yml ├── docs └── MinimumEditDistance.html ├── pytest.ini ├── setup.cfg ├── setup.py ├── string_distance ├── __init__.py ├── bm25.py ├── cost.pxd ├── edit_distance.pyx ├── float_cost.pxd ├── float_edit_distance.pyx ├── float_minimum_edit_distance.pxd ├── float_minimum_edit_distance.pyx ├── maximum_edit_distance.pxd ├── maximum_edit_distance.pyx ├── minimum_edit_distance.pxd ├── minimum_edit_distance.pyx ├── sequence_distance.pxd ├── sequence_distance.pyx ├── spike │ └── edit_distance_parallel.pyx ├── token_distance.pxd └── token_distance.pyx └── tests ├── bm25_test.py ├── edit_distance_test.py ├── sequence_distance_test.py ├── setup.py ├── token_distance_test.py └── wrapper.pyx /.gitignore: -------------------------------------------------------------------------------- 1 | .python-version 2 | dist/* 3 | build/* 4 | *.egg* 5 | *.pyc 6 | *.so 7 | __pycache__/ 8 | */.ipynb_checkpoints/ 9 | *.c 10 | *.html 11 | .pytest_cache/ 12 | tests/build 13 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | - "3.5" 5 | install: 6 | - "pip install -e .[test]" 7 | script: 8 | - cd tests 9 | - python setup.py build_ext --inplace 10 | - pytest 11 | notifications: 12 | email: false 13 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2018 Brian Lester 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.md 2 | include string_distance/cost.pxd 3 | include string_distance/edit_distance.c 4 | include string_distance/minimum_edit_distance.c 5 | include string_distance/minimum_edit_distance.pxd 6 | include string_distance/maximum_edit_distance.c 7 | include string_distance/maximum_edit_distance.pxd 8 | include string_distance/float_cost.pxd 9 | include string_distance/float_edit_distance.c 10 | include string_distance/float_minimum_edit_distance.c 11 | include string_distance/float_minimum_edit_distance.pxd 12 | include string_distance/sequence_distance.c 13 | include string_distance/sequence_distance.pxd 14 | include string_distance/token_distance.c 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Minimum Edit Distance in Cython 2 | 3 | [![Build Status](https://travis-ci.com/blester125/string_distance.svg?branch=master)](https://travis-ci.com/blester125/string_distance) 4 | 5 | This provides String Distance functions in Cython. 6 | 7 | ### Edit Based 8 | 9 | With these metrics smaller is better. 10 | 11 | * `levenshtein` (1 for insert, 1 for delete, and 1 for substitution) 12 | * `levenshtein_no_sub` (1 for insert, 1 for delete, 2 for substitution) 13 | * `brew` (0.1 for insert, 15 for delete, and 1 for substitution) 14 | * `damerau_levenshtein` (1 for insert, 1 for delete, 1 for substitution, 1 for transposition) 15 | * `jaro_winkler` 16 | 17 | ### Token Based 18 | 19 | * `cosine_distance` 20 | * `binary_cosine_distance` 21 | * `jaccard_distance` 22 | 23 | ### Sequence Based 24 | 25 | With these metrics Larger is better. 26 | 27 | * `longest_common_subsequence` 28 | * `longest_common_substring` 29 | * `ratcliff_obershelft` 30 | 31 | 32 | ### Extending and rolling your own cost functions 33 | 34 | There are 2 kinds of functions used to define costs for the dynamic programming minimum edit distance algorithm. The first is `ctypedef int (*cmp_func)(int c1, int c2)` which is used to compare two characters and return a cost. The second is `ctypedef int (*char_func)(int c1)`. By implementing your own versions of these functions (I would recommend doing it in cost.pxd and inline'ing the function) you can pass them to the distance solver to implement your own weighting scheme. The `cmp_func` can be used to weight a substitution (for example a low cost to letter next to each other on the keyboard like `w` and `e` and high cost to far keys like `z` and `p`). The `char_func` can can be used to weight the insert or delete, for example you could weight inserts by their scrabble scores. 35 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-minimal -------------------------------------------------------------------------------- /docs/MinimumEditDistance.html: -------------------------------------------------------------------------------- 1 |

Minimum Edit Distance

2 | 3 |

I plan to put some examples here

4 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | python_files=*_test.py 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [build_ext] 2 | inplace=1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import re 2 | from setuptools import setup, find_packages, Extension 3 | 4 | def get_version(project_name): 5 | regex = re.compile(r"^__version__ = '(\d+\.\d+\.\d+(?:a|b|rc)?(?:\d)*?)'$") 6 | with open("{}/__init__.py".format(project_name)) as f: 7 | for line in f: 8 | m = regex.match(line) 9 | if m is not None: 10 | return m.groups(1)[0] 11 | 12 | 13 | def convert_images(text): 14 | image_regex = re.compile(r"!\[(.*?)\]\((.*?)\)") 15 | return image_regex.sub(r'\1', text) 16 | 17 | class About(object): 18 | NAME='string_distance' 19 | VERSION=get_version(NAME) 20 | AUTHOR='blester125' 21 | EMAIL='{}@gmail.com'.format(AUTHOR) 22 | URL='https://github.com/{}/{}'.format(AUTHOR, NAME) 23 | DL_URL='{}/archive/{}.tar.gz'.format(URL, VERSION) 24 | LICENSE='MIT' 25 | 26 | 27 | ext_modules = [ 28 | Extension( 29 | "string_distance.minimum_edit_distance", 30 | ["string_distance/minimum_edit_distance.pyx"] 31 | ), 32 | Extension( 33 | "string_distance.maximum_edit_distance", 34 | ["string_distance/maximum_edit_distance.pyx"] 35 | ), 36 | Extension( 37 | "string_distance.edit_distance", 38 | ["string_distance/edit_distance.pyx"], 39 | ), 40 | Extension( 41 | "string_distance.float_minimum_edit_distance", 42 | ["string_distance/float_minimum_edit_distance.pyx"] 43 | ), 44 | Extension( 45 | "string_distance.float_edit_distance", 46 | ["string_distance/float_edit_distance.pyx"] 47 | ), 48 | Extension( 49 | "string_distance.sequence_distance", 50 | ["string_distance/sequence_distance.pyx"] 51 | ), 52 | Extension( 53 | "string_distance.token_distance", 54 | ["string_distance/token_distance.pyx"] 55 | ), 56 | ] 57 | 58 | 59 | setup( 60 | name=About.NAME, 61 | version=About.VERSION, 62 | description="Minimum Edit Distance", 63 | long_description=convert_images(open('README.md').read()), 64 | long_description_content_type="text/markdown", 65 | author=About.AUTHOR, 66 | author_email=About.EMAIL, 67 | url=About.URL, 68 | download_url=About.DL_URL, 69 | license=About.LICENSE, 70 | python_requires='>=3.5', 71 | packages=['string_distance'], 72 | package_data={ 73 | 'string_distance': [ 74 | 'README.md', 75 | 'string_distance/cost.pxd', 76 | 'string_distance/edit_distance.c', 77 | 'string_distance/minimum_edit_distance.c', 78 | 'string_distance/minimum_edit_distance.pxd', 79 | 'string_distance/maximum_edit_distance.c', 80 | 'string_distance/maximum_edit_distance.pxd', 81 | 'string_distance/float_cost.pxd', 82 | 'string_distance/float_edit_distance.c', 83 | 'string_distance/float_minimum_edit_distance.c', 84 | 'string_distance/float_minimum_edit_distance.pxd', 85 | 'string_distance/sequence_distance.c' 86 | 'string_distance/sequence_distance.pxd', 87 | 'string_distance/token_distance.c' 88 | ], 89 | }, 90 | include_package_data=True, 91 | install_requires=[ 92 | 'cython', 93 | ], 94 | setup_requires=[ 95 | 'cython', 96 | ], 97 | extras_require={ 98 | 'test': ['pytest'], 99 | }, 100 | keywords=[], 101 | ext_modules=ext_modules 102 | ) 103 | -------------------------------------------------------------------------------- /string_distance/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.0.0' 2 | 3 | from string_distance.edit_distance import ( 4 | levenshtein, 5 | levenshtein_no_sub, 6 | damerau_levenshtein, 7 | hamming, 8 | jaro, 9 | jaro_winkler, 10 | ) 11 | from string_distance.edit_distance import ( 12 | levenshteins, 13 | levenshtein_no_subs, 14 | damerau_levenshteins, 15 | hammings, 16 | jaros, 17 | jaro_winklers, 18 | ) 19 | from string_distance.float_edit_distance import brew, brews 20 | from string_distance.token_distance import ( 21 | cosine_distance, 22 | binary_cosine_distance, 23 | jaccard_distance, 24 | ) 25 | from string_distance.sequence_distance import ( 26 | longest_common_subsequence, 27 | longest_common_substring, 28 | longest_common_substring_string, 29 | ratcliff_obershelp, 30 | ) 31 | from string_distance.sequence_distance import ( 32 | longest_common_subsequences, 33 | longest_common_substrings, 34 | longest_common_substring_strings, 35 | ratcliff_obershelps, 36 | ) 37 | from string_distance.bm25 import BM25, bm25_scores 38 | -------------------------------------------------------------------------------- /string_distance/bm25.py: -------------------------------------------------------------------------------- 1 | import math 2 | from collections import Counter, defaultdict 3 | 4 | def BM25(corpus, k=1.5, b=0.75): 5 | corpus_size = len(corpus) 6 | doc_len = [len(doc) for doc in corpus] 7 | avg_len = sum(doc_len) / corpus_size 8 | freqs = [] 9 | df = Counter() 10 | idf = defaultdict(int) 11 | epsilon = 0.25 12 | 13 | for document in corpus: 14 | f = Counter(document) 15 | freqs.append(f) 16 | 17 | for token, freq in f.items(): 18 | df[token] += 1 19 | 20 | for token, freq in df.items(): 21 | # This is the weird IDF that BM25 uses 22 | idf[token] = math.log((corpus_size - freq + 0.5) / (freq + 0.5)) 23 | 24 | avg_idf = sum(x for x in idf.values()) / len(idf) 25 | 26 | def score(query, d): 27 | if isinstance(d, int): 28 | freq = freqs[d] 29 | dlen = doc_len[d] 30 | else: 31 | freq = Counter(d) 32 | dlen = len(d) 33 | val = 0 34 | for token in query: 35 | if token not in freq: 36 | continue 37 | idf_ = idf[token] if idf[token] >= 0 else epsilon * avg_idf 38 | val += idf_ * (freq[token] * (k + 1)) / (freq[token] + k * (1 - b + b * (dlen / avg_len))) 39 | return val 40 | 41 | return score 42 | 43 | def bm25_scores(corpus): 44 | bm = BM25(corpus) 45 | scores = [] 46 | for doc in corpus: 47 | s = [] 48 | for i in range(len(corpus)): 49 | s.append(bm(doc, i)) 50 | scores.append(s) 51 | return scores 52 | -------------------------------------------------------------------------------- /string_distance/cost.pxd: -------------------------------------------------------------------------------- 1 | # cdef unicode vowels = u'aeiouAEIOU' 2 | # cdef int[10] VOWELS 3 | # cdef int VOWEL_SIZE = 10 4 | # cdef int i 5 | # for i in range(VOWEL_SIZE): 6 | # VOWELS[i] = vowels[i] 7 | 8 | 9 | ctypedef int (*cmp_func)(int c1, int c2) 10 | ctypedef int (*char_func)(int c) 11 | ctypedef int (*init_func)(int place, int cost) 12 | ctypedef int (*max_func)(int a, int b, int c) 13 | 14 | 15 | cdef inline int trans_func(int c1, int c2): 16 | return 1 17 | 18 | 19 | cdef inline int sub_func1(int c1, int c2) nogil: 20 | if c1 == c2: 21 | return 0 22 | return 1 23 | 24 | 25 | cdef inline int sub_func2(int c1, int c2) nogil: 26 | if c1 == c2: 27 | return 0 28 | return 2 29 | 30 | cdef inline int del_func(int c) nogil: 31 | return 1 32 | 33 | 34 | cdef inline int ins_func(int c) nogil: 35 | return 1 36 | 37 | cdef inline int nw_init_func(int place, int cost): 38 | return -(place * cost) 39 | 40 | cdef inline int sw_init_func(int place, int cost): 41 | return 0 42 | 43 | cdef inline int nw_max_func(int a, int b, int c): 44 | return max(a, b, c) 45 | 46 | cdef inline int sw_max_func(int a, int b, int c): 47 | return max(0, a, b, c) 48 | 49 | cdef inline int sim_func(int c1, int c2): 50 | if c1 == c2: 51 | return 1 52 | return 0 53 | 54 | # cdef inline int heavy_vowels(int c): 55 | # cdef int i 56 | # for i in range(VOWEL_SIZE): 57 | # if c == VOWELS[i]: 58 | # return 5 59 | # return 1 60 | 61 | # cdef inline int heavy_vowel_sub(int c1, int c2): 62 | # cdef int i 63 | # if c1 == c2: 64 | # return 0 65 | # for i in range(VOWEL_SIZE): 66 | # if c2 == VOWELS[i]: 67 | # return 5 68 | # return 2 69 | -------------------------------------------------------------------------------- /string_distance/edit_distance.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | # cython: boundscheck=False 3 | # cython: wraparound=False 4 | # cython: cdivision=True 5 | 6 | from libc.stdlib cimport malloc, free 7 | from string_distance.minimum_edit_distance cimport distance, transpose_distance 8 | from string_distance.maximum_edit_distance cimport distance as dist 9 | from string_distance.cost cimport ( 10 | ins_func, del_func, 11 | sub_func1, sub_func2, 12 | trans_func, 13 | nw_init_func, sw_init_func, 14 | nw_max_func, sw_max_func, 15 | sim_func 16 | ) 17 | 18 | 19 | cpdef int levenshtein(unicode source, unicode target): 20 | return distance( 21 | source, target, 22 | insert_cost=ins_func, 23 | delete_cost=del_func, 24 | substitution_cost=sub_func1 25 | ) 26 | 27 | 28 | cpdef int levenshtein_no_sub(unicode source, unicode target): 29 | return distance( 30 | source, target, 31 | insert_cost=ins_func, 32 | delete_cost=del_func, 33 | substitution_cost=sub_func2 34 | ) 35 | 36 | 37 | cpdef int damerau_levenshtein(unicode source, unicode target): 38 | return transpose_distance( 39 | source, target, 40 | insert_cost=ins_func, 41 | delete_cost=del_func, 42 | substitution_cost=sub_func1, 43 | transpose_cost=trans_func 44 | ) 45 | 46 | 47 | cpdef int needleman_wunsch(unicode source, unicode target, int gap_cost=1): 48 | return dist( 49 | source, target, 50 | gap_cost=gap_cost, 51 | init=nw_init_func, 52 | sim_func=sim_func, 53 | max_fn=nw_max_func 54 | ) 55 | 56 | 57 | cpdef int smith_waterman(unicode source, unicode target, int gap_cost=1): 58 | return dist( 59 | source, target, 60 | gap_cost=gap_cost, 61 | init=sw_init_func, 62 | sim_func=sim_func, 63 | max_fn=sw_max_func 64 | ) 65 | 66 | 67 | cpdef int hamming(unicode source, unicode target) except -1: 68 | cdef int i 69 | cdef int source_length = len(source) 70 | cdef int target_length = len(target) 71 | cdef int distance = 0 72 | if source_length != target_length: 73 | raise ValueError("Strings must be the same length.") 74 | for i in range(source_length): 75 | if source[i] != target[i]: 76 | distance += 1 77 | return distance 78 | 79 | 80 | cpdef float jaro(unicode source, unicode target) except -1: 81 | cdef int s_l = len(source) 82 | cdef int t_l = len(target) 83 | cdef int i = 0, j = 0, k = 0 84 | cdef int m = 0, t = 0 85 | cdef float w 86 | cdef int window = (max(s_l, t_l) / 2) - 1 87 | 88 | cdef int* s_ints = malloc(s_l * sizeof(int)) 89 | if not s_ints: 90 | raise MemoryError() 91 | cdef int* t_ints = malloc(t_l * sizeof(int)) 92 | if not t_ints: 93 | raise MemoryError() 94 | cdef int* s_matched = malloc(s_l * sizeof(int)) 95 | if not s_matched: 96 | raise MemoryError() 97 | cdef int* t_matched = malloc(t_l * sizeof(int)) 98 | if not t_matched: 99 | raise MemoryError() 100 | 101 | try: 102 | for i in range(s_l): 103 | s_ints[i] = source[i] 104 | s_matched[i] = 0 105 | for i in range(t_l): 106 | t_ints[i] = target[j] 107 | t_matched[i] = 0 108 | 109 | # Search forward within the window for a match. 110 | for i in range(s_l): 111 | low = i - window if i > window else 0 112 | high = i + window if i + window < t_l else t_l - 1 113 | for j in range(low, high + 1): 114 | # If you get a match break so you don't double count. 115 | if target[j] == source[i]: 116 | s_matched[i] = 1 117 | t_matched[j] = 1 118 | m += 1 119 | break 120 | 121 | # If there is no matches at all quit. 122 | if m == 0: 123 | return 0.0 124 | 125 | # Find transpositions. 126 | for i in range(s_l): 127 | # If this character matches something in the other string. 128 | if s_matched[i] == 1: 129 | # Loop through other string 130 | for j in range(k, t_l): 131 | # Once we hit a matching char in the other string stop 132 | if t_matched[j]: 133 | # Move start of search 134 | k = j + 1 135 | break 136 | # If the matches we are looking at aren't the same add a transposition 137 | if s_ints[i] != t_ints[j]: 138 | t += 1 139 | t = t / 2 140 | 141 | w = (m / s_l + m / t_l + (m - t) / m) / 3. 142 | 143 | finally: 144 | free(s_ints) 145 | free(t_ints) 146 | free(s_matched) 147 | free(t_matched) 148 | 149 | return w 150 | 151 | 152 | cpdef float jaro_winkler(unicode source, unicode target, float p=0.1) except -1: 153 | cdef int l = 0 154 | cdef int end = min(len(source), len(target), 4) 155 | cdef float jaro_sim 156 | if p > 0.25: 157 | raise ValueError("p should not exceed 0.25, got {}".format("p")) 158 | for l in range(end): 159 | if source[l] != target[l]: 160 | break 161 | jaro_sim = jaro(source, target) 162 | return jaro_sim + (l * p * (1 - jaro_sim)) 163 | 164 | 165 | # Functions that do a one vs many comparison 166 | cpdef list levenshteins(unicode source, list targets): 167 | cdef unicode target 168 | cdef list results = [] 169 | for target in targets: 170 | results.append( 171 | levenshtein(source, target) 172 | ) 173 | return results 174 | 175 | 176 | cpdef list levenshtein_no_subs(unicode source, list targets): 177 | cdef unicode target 178 | cdef list results = [] 179 | for target in targets: 180 | results.append( 181 | levenshtein_no_sub(source, target) 182 | ) 183 | return results 184 | 185 | 186 | cpdef list damerau_levenshteins(unicode source, list targets): 187 | cdef unicode target 188 | cdef list results = [] 189 | for target in targets: 190 | results.append( 191 | damerau_levenshtein(source, target) 192 | ) 193 | return results 194 | 195 | 196 | cpdef list needleman_wunschs(unicode source, list targets, int gap_cost=1): 197 | cdef unicode target 198 | cdef list results = [] 199 | for target in targets: 200 | results.append( 201 | needleman_wunschs(source, target, gap_cost=gap_cost) 202 | ) 203 | return results 204 | 205 | 206 | cpdef list smith_watermans(unicode source, list targets, int gap_cost=1): 207 | cdef unicode target 208 | cdef list results = [] 209 | for target in targets: 210 | results.append( 211 | smith_waterman(source, target, gap_cost=gap_cost) 212 | ) 213 | return results 214 | 215 | 216 | cpdef list hammings(unicode source, list targets): 217 | cdef unicode target 218 | cdef list results = [] 219 | for target in targets: 220 | results.append( 221 | hamming(source, target) 222 | ) 223 | return results 224 | 225 | 226 | cpdef list jaros(unicode source, list targets): 227 | cdef unicode target 228 | cdef list results = [] 229 | for target in targets: 230 | results.append( 231 | jaro(source, target) 232 | ) 233 | return results 234 | 235 | 236 | cpdef list jaro_winklers(unicode source, list targets, float p=0.1): 237 | cdef unicode target 238 | cdef list results = [] 239 | for target in targets: 240 | results.append( 241 | jaro_winkler(source, target, p) 242 | ) 243 | return results 244 | 245 | 246 | # cpdef int levenshtein_heavy_vowels(unicode source, unicode target): 247 | # return distance( 248 | # source, target, 249 | # insert_cost=heavy_vowels, 250 | # delete_cost=del_func, 251 | # substitution_cost=heavy_vowel_sub 252 | # ) 253 | -------------------------------------------------------------------------------- /string_distance/float_cost.pxd: -------------------------------------------------------------------------------- 1 | ctypedef float (*cmp_func)(int c1, int c2) 2 | ctypedef float (*char_func)(int c) 3 | 4 | 5 | cdef inline float sub_func(int c1, int c2): 6 | if c1 == c2: 7 | return 0 8 | return 1.0 9 | 10 | 11 | cdef inline float ins_func(int c1): 12 | return 0.1 13 | 14 | 15 | cdef inline float del_func(int c1): 16 | return 15 17 | -------------------------------------------------------------------------------- /string_distance/float_edit_distance.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | # cython: boundscheck=False 3 | # cython: wraparound=False 4 | 5 | from string_distance.float_minimum_edit_distance cimport distance 6 | from string_distance.float_cost cimport ins_func, del_func, sub_func 7 | 8 | 9 | cpdef float brew(unicode source, unicode target): 10 | return distance( 11 | source, target, 12 | insert_cost=ins_func, 13 | delete_cost=del_func, 14 | substitution_cost=sub_func 15 | ) 16 | 17 | 18 | cpdef list brews(unicode source, list targets): 19 | cdef unicode target 20 | cdef list results = [] 21 | for target in targets: 22 | results.append(brew(source, target)) 23 | return results 24 | -------------------------------------------------------------------------------- /string_distance/float_minimum_edit_distance.pxd: -------------------------------------------------------------------------------- 1 | from string_distance.float_cost cimport char_func, cmp_func 2 | 3 | cdef float distance( 4 | unicode source, unicode target, 5 | char_func insert_cost, 6 | char_func delete_cost, 7 | cmp_func substitution_cost 8 | ) except -1 9 | -------------------------------------------------------------------------------- /string_distance/float_minimum_edit_distance.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | # cython: boundscheck=False 3 | # cython: wraparound=False 4 | 5 | from cpython cimport PyMem_Malloc, PyMem_Free 6 | from cython.view cimport array as cvarray 7 | from string_distance.float_cost cimport char_func, cmp_func 8 | 9 | cdef float distance( 10 | unicode source, unicode target, 11 | char_func insert_cost, 12 | char_func delete_cost, 13 | cmp_func substitution_cost 14 | ) except -1: 15 | cdef int n = len(source) 16 | cdef int m = len(target) 17 | cdef int i, j, index, offset 18 | cdef float[:, :] table = cvarray(shape=(2, m + 1), itemsize=sizeof(float), format="f") 19 | cdef int* soruce_ints 20 | cdef int* target_ints 21 | 22 | source_ints = PyMem_Malloc(n * sizeof(int)) 23 | if not source_ints: 24 | raise MemoryError() 25 | target_ints = PyMem_Malloc(m * sizeof(int)) 26 | if not target_ints: 27 | raise MemoryError() 28 | 29 | try: 30 | table[0, 0] = 0 31 | index = 0 32 | for j in range(1, m + 1): 33 | target_ints[j - 1] = target[j - 1] 34 | table[0, j] = table[0, j - 1] + insert_cost(target_ints[j - 1]) 35 | 36 | for i in range(1, n + 1): 37 | index = i % 2 38 | if index == 1: 39 | offset = -1 40 | else: 41 | offset = 1 42 | source_ints[i - 1] = source[i - 1] 43 | table[index, 0] = table[index + offset, 0] + delete_cost(source_ints[i - 1]) 44 | 45 | for j in range(1, m + 1): 46 | table[index, j] = min( 47 | table[index + offset, j] + delete_cost(source_ints[i - 1]), 48 | table[index, j - 1] + insert_cost(target_ints[j - 1]), 49 | table[index + offset, j - 1] + substitution_cost(source_ints[i - 1], target_ints[j - 1]) 50 | ) 51 | finally: 52 | PyMem_Free(source_ints) 53 | PyMem_Free(target_ints) 54 | return table[index, m] 55 | -------------------------------------------------------------------------------- /string_distance/maximum_edit_distance.pxd: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | # cython: boundscheck=False 3 | # cython: wraparound=False 4 | 5 | from string_distance.cost cimport init_func, cmp_func, max_func 6 | 7 | 8 | cdef int distance( 9 | unicode source, unicode target, 10 | int gap_cost, 11 | init_func init, 12 | cmp_func sim_func, 13 | max_func max_fn 14 | ) except? -1 15 | -------------------------------------------------------------------------------- /string_distance/maximum_edit_distance.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | # cython: boundscheck=False 3 | # cython: wraparound=False 4 | 5 | from cpython cimport PyMem_Malloc, PyMem_Free 6 | from cython.view cimport array as cvarray 7 | from string_distance.cost cimport init_func, cmp_func, max_func 8 | 9 | 10 | cdef int distance( 11 | unicode source, unicode target, 12 | int gap_cost, 13 | init_func init, 14 | cmp_func sim_func, 15 | max_func max_fn 16 | ) except? -1: 17 | cdef int n = len(source) 18 | cdef int m = len(target) 19 | cdef int i = 0 20 | cdef int j = 0 21 | cdef int index = 0 22 | cdef int offset 23 | cdef int[:, :] table = cvarray(shape=(2, m + 1), itemsize=sizeof(int), format="i") 24 | cdef int* source_ints 25 | cdef int* target_ints 26 | 27 | source_ints = PyMem_Malloc(n * sizeof(int)) 28 | if not source_ints: 29 | raise MemoryError() 30 | target_ints = PyMem_Malloc(m * sizeof(int)) 31 | if not target_ints: 32 | raise MemoryError() 33 | 34 | try: 35 | table[0, 0] = 0 36 | for j in range(1, m + 1): 37 | target_ints[j - 1] = target[j - 1] 38 | table[0, j] = init(j, gap_cost) 39 | 40 | for i in range(1, n + 1): 41 | index = i % 2 42 | if index == 1: 43 | offset = -1 44 | else: 45 | offset = 1 46 | source_ints[i - 1] = source[i - 1] 47 | table[index, 0] = init(i, gap_cost) 48 | 49 | for j in range(1, m + 1): 50 | table[index, j] = max_fn( 51 | table[index + offset, j - 1] + sim_func(source_ints[i - 1], target_ints[j - 1]), 52 | table[index + offset, j] - gap_cost, 53 | table[index, j - 1] - gap_cost 54 | ) 55 | 56 | finally: 57 | PyMem_Free(source_ints) 58 | PyMem_Free(target_ints) 59 | return table[index, m] 60 | -------------------------------------------------------------------------------- /string_distance/minimum_edit_distance.pxd: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | # cython: boundscheck=False 3 | # cython: wraparound=False 4 | 5 | from string_distance.cost cimport char_func, cmp_func 6 | 7 | 8 | cdef int distance( 9 | unicode source, unicode target, 10 | char_func insert_cost, 11 | char_func delete_cost, 12 | cmp_func substitution_cost 13 | ) except -1 14 | 15 | 16 | cdef int transpose_distance( 17 | unicode source, unicode target, 18 | char_func insert_cost, 19 | char_func delete_cost, 20 | cmp_func substitution_cost, 21 | cmp_func transpose_cost 22 | ) except -1 23 | -------------------------------------------------------------------------------- /string_distance/minimum_edit_distance.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | # cython: boundscheck=False 3 | # cython: wraparound=False 4 | 5 | from cpython cimport PyMem_Malloc, PyMem_Free 6 | from cython.view cimport array as cvarray 7 | from string_distance.cost cimport char_func, cmp_func 8 | 9 | cdef int distance( 10 | unicode source, unicode target, 11 | char_func insert_cost, 12 | char_func delete_cost, 13 | cmp_func substitution_cost 14 | ) except -1: 15 | cdef int n = len(source) 16 | cdef int m = len(target) 17 | cdef int i, j, index, offset 18 | cdef int[:, :] table = cvarray(shape=(2, m + 1), itemsize=sizeof(int), format="i") 19 | cdef int* source_ints 20 | cdef int* target_ints 21 | 22 | source_ints = PyMem_Malloc(n * sizeof(int)) 23 | if not source_ints: 24 | raise MemoryError() 25 | target_ints = PyMem_Malloc(m * sizeof(int)) 26 | if not target_ints: 27 | raise MemoryError() 28 | 29 | try: 30 | table[0, 0] = 0 31 | index = 0 32 | for j in range(1, m + 1): 33 | target_ints[j - 1] = target[j - 1] 34 | table[0, j] = table[0, j - 1] + insert_cost(target_ints[j - 1]) 35 | 36 | for i in range(1, n + 1): 37 | index = i % 2 38 | if index == 1: 39 | offset = -1 40 | else: 41 | offset = 1 42 | # This could be moved out of the loop 43 | source_ints[i - 1] = source[i - 1] 44 | table[index, 0] = table[index + offset, 0] + delete_cost(source_ints[i - 1]) 45 | 46 | for j in range(1, m + 1): 47 | table[index, j] = min( 48 | table[index + offset, j] + delete_cost(source_ints[i - 1]), 49 | table[index, j - 1] + insert_cost(target_ints[j - 1]), 50 | table[index + offset, j - 1] + substitution_cost(source_ints[i - 1], target_ints[j - 1]) 51 | ) 52 | finally: 53 | PyMem_Free(source_ints) 54 | PyMem_Free(target_ints) 55 | return table[index, m] 56 | 57 | 58 | cdef int transpose_distance( 59 | unicode source, unicode target, 60 | char_func insert_cost, 61 | char_func delete_cost, 62 | cmp_func substitution_cost, 63 | cmp_func transpose_cost 64 | ) except -1: 65 | cdef int n = len(source) 66 | cdef int m = len(target) 67 | cdef int i, j, index, offset, t_offset 68 | cdef int[:, :] table = cvarray(shape=(3, m + 1), itemsize=sizeof(int), format="i") 69 | cdef int* soruce_ints 70 | cdef int* target_ints 71 | 72 | source_ints = PyMem_Malloc(n * sizeof(int)) 73 | if not source_ints: 74 | raise MemoryError() 75 | target_ints = PyMem_Malloc(m * sizeof(int)) 76 | if not target_ints: 77 | raise MemoryError() 78 | 79 | try: 80 | table[0, 0] = 0 81 | index = 0 82 | for j in range(1, m + 1): 83 | target_ints[j - 1] = target[j - 1] 84 | table[0, j] = table[0, j - 1] + insert_cost(target_ints[j - 1]) 85 | 86 | for i in range(1, n + 1): 87 | index = i % 3 88 | if index == 0: 89 | offset = 2 90 | t_offset = 1 91 | elif index == 1: 92 | offset = -1 93 | t_offset = 1 94 | else: 95 | offset = -1 96 | t_offset = -2 97 | source_ints[i - 1] = source[i - 1] 98 | table[index, 0] = table[index + offset, 0] + delete_cost(source_ints[i - 1]) 99 | 100 | for j in range(1, m + 1): 101 | table[index, j] = min( 102 | table[index + offset, j] + delete_cost(source_ints[i - 1]), 103 | table[index, j - 1] + insert_cost(target_ints[j - 1]), 104 | table[index + offset, j - 1] + substitution_cost(source_ints[i - 1], target_ints[j - 1]) 105 | ) 106 | # If I can do a transpose 107 | if i > 1 and j > 1: 108 | if source_ints[i - 1] == target_ints[j - 2] and source_ints[i - 2] == target_ints[j - 1]: 109 | table[index, j] = min( 110 | table[index, j], 111 | table[index + t_offset, j - 2] + transpose_cost(source_ints[i - 1], target_ints[i - 2]) 112 | ) 113 | finally: 114 | PyMem_Free(source_ints) 115 | PyMem_Free(target_ints) 116 | return table[index, m] 117 | -------------------------------------------------------------------------------- /string_distance/sequence_distance.pxd: -------------------------------------------------------------------------------- 1 | cdef struct Match: 2 | int length 3 | int source_start 4 | int source_end 5 | int target_start 6 | int target_end 7 | -------------------------------------------------------------------------------- /string_distance/sequence_distance.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | # cython: boundscheck=False 3 | # cython: wraparound=False 4 | # cython: cdivision=True 5 | 6 | from cpython cimport PyMem_Malloc, PyMem_Free 7 | from cython.view cimport array as cvarray 8 | 9 | 10 | cdef Match* longest_common_substring_base(unicode source, unicode target, Match* match) except NULL: 11 | cdef int n = len(source) 12 | cdef int m = len(target) 13 | cdef int i, j 14 | cdef int r_i = 0 15 | cdef int r_j = 0 16 | cdef int result = 0 17 | cdef int[:, :] table = cvarray(shape=(n + 1, m + 1), itemsize=sizeof(int), format="i") 18 | cdef int* source_ints 19 | cdef int* target_ints 20 | 21 | source_ints = PyMem_Malloc(n * sizeof(int)) 22 | if not source_ints: 23 | raise MemoryError() 24 | target_ints = PyMem_Malloc(m * sizeof(int)) 25 | if not target_ints: 26 | raise MemoryError() 27 | 28 | try: 29 | for i in range(1, n + 1): 30 | source_ints[i - 1] = source[i - 1] 31 | table[i, 0] = 0 32 | for j in range(1, m + 1): 33 | target_ints[j - 1] = target[j - 1] 34 | table[0, j] = 0 35 | 36 | table[0, 0] = 0 37 | for i in range(1, n + 1): 38 | for j in range(1, m + 1): 39 | table[i, j] = 0 40 | if source_ints[i - 1] == target_ints[j - 1]: 41 | table[i, j] = table[i - 1, j - 1] + 1 42 | if table[i, j] > result: 43 | result = table[i, j] 44 | r_i = i 45 | r_j = j 46 | 47 | finally: 48 | PyMem_Free(source_ints) 49 | PyMem_Free(target_ints) 50 | 51 | match.length = result 52 | match.source_start = r_i - result 53 | match.source_end = r_i 54 | match.target_start = r_j - result 55 | match.target_end = r_j 56 | 57 | return match 58 | 59 | 60 | cpdef int longest_common_substring(unicode source, unicode target) except -1: 61 | cdef int length 62 | cdef Match* match = PyMem_Malloc(sizeof(Match)) 63 | if not match: 64 | raise MemoryError() 65 | try: 66 | match = longest_common_substring_base(source, target, match) 67 | length = match.length 68 | finally: 69 | PyMem_Free(match) 70 | return length 71 | 72 | 73 | cpdef list longest_common_substrings(unicode source, list targets): 74 | cdef unicode target 75 | cdef list result = [] 76 | for target in targets: 77 | result.append( 78 | longest_common_substring(source, target) 79 | ) 80 | return result 81 | 82 | 83 | cpdef unicode longest_common_substring_string(unicode source, unicode target): 84 | cdef unicode substring 85 | cdef Match* match = PyMem_Malloc(sizeof(Match)) 86 | if not match: 87 | raise MemoryError() 88 | try: 89 | match = longest_common_substring_base(source, target, match) 90 | substring = source[match.source_start:match.source_end] 91 | finally: 92 | PyMem_Free(match) 93 | return substring 94 | 95 | 96 | cpdef list longest_common_substring_strings(unicode source, list targets): 97 | cdef unicode target 98 | cdef list result = [] 99 | for target in targets: 100 | result.append( 101 | longest_common_substring_string(source, target) 102 | ) 103 | return result 104 | 105 | 106 | cdef int rco_recursive(unicode source, unicode target) except -1: 107 | cdef length = 0 108 | cdef unicode left_source, right_source, left_target, right_target 109 | cdef Match* match = PyMem_Malloc(sizeof(Match)) 110 | if not match: 111 | raise MemoryError() 112 | try: 113 | match = longest_common_substring_base(source, target, match) 114 | length = match.length 115 | if length == 0: 116 | return 0 117 | left_source = source[:match.source_start] 118 | right_source = source[match.source_end:] 119 | left_target = target[:match.target_start] 120 | right_target = target[match.target_end:] 121 | finally: 122 | PyMem_Free(match) 123 | 124 | return ( 125 | rco_recursive(left_source, left_target) + 126 | length + 127 | rco_recursive(right_source, right_target) 128 | ) 129 | 130 | 131 | cpdef float ratcliff_obershelp(unicode source, unicode target) except -1: 132 | cdef int element_count = len(source) + len(target) 133 | if element_count == 0: 134 | return 0 135 | return ((2 * rco_recursive(source, target))) / element_count 136 | 137 | 138 | cpdef list ratcliff_obershelps(unicode source, list targets): 139 | cdef unicode target 140 | cdef list result = [] 141 | for target in targets: 142 | result.append( 143 | ratcliff_obershelps(source, target) 144 | ) 145 | return result 146 | 147 | 148 | cpdef int longest_common_subsequence(unicode source, unicode target) except -1: 149 | cdef int n = len(source) 150 | cdef int m = len(target) 151 | cdef int i, j 152 | cdef int[:, :] table = cvarray(shape=(n + 1, m + 1), itemsize=sizeof(int), format="i") 153 | cdef int* source_ints 154 | cdef int* target_ints 155 | 156 | source_ints = PyMem_Malloc(n * sizeof(int)) 157 | if not source_ints: 158 | raise MemoryError() 159 | target_ints = PyMem_Malloc(m * sizeof(int)) 160 | if not target_ints: 161 | raise MemoryError() 162 | 163 | try: 164 | for i in range(1, (n + 1)): 165 | source_ints[i - 1] = source[i - 1] 166 | table[i, 0] = 0 167 | for j in range(1, (m + 1)): 168 | target_ints[j - 1] = target[j - 1] 169 | table[0, j] = 0 170 | 171 | table[0, 0] = 0 172 | for i in range(1, (n + 1)): 173 | for j in range(1, (m + 1)): 174 | if source_ints[i - 1] == target_ints[j - 1]: 175 | table[i, j] = table[i - 1, j - 1] + 1 176 | else: 177 | table[i, j] = max(table[i - 1, j], table[i, j - 1]) 178 | 179 | finally: 180 | PyMem_Free(source_ints) 181 | PyMem_Free(target_ints) 182 | 183 | return table[n, m] 184 | 185 | 186 | cpdef list longest_common_subsequences(unicode source, list targets): 187 | cdef unicode target 188 | cdef list result = [] 189 | for target in targets: 190 | result.append( 191 | longest_common_subsequence(source, target) 192 | ) 193 | return result 194 | -------------------------------------------------------------------------------- /string_distance/spike/edit_distance_parallel.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | # cython: boundscheck=False 3 | # cython: wraparound=False 4 | 5 | from cpython cimport PyMem_Malloc, PyMem_Free 6 | from cython.view cimport array as cvarray 7 | from edit_distance cimport ins_func, del_func, sub_func1 8 | from cython.parallel cimport prange 9 | 10 | ctypedef int (*cmp_func_no_gil)(int c1, int c2) nogil 11 | ctypedef int (*char_func_no_gil)(int c) nogil 12 | 13 | cdef int distance_parallel( 14 | unicode source, unicode target, 15 | char_func_no_gil insert_cost, 16 | char_func_no_gil delete_cost, 17 | cmp_func_no_gil substitution_cost 18 | ) except -1: 19 | cdef int n = len(source) 20 | cdef int m = len(target) 21 | cdef int i, j, k, diags, start_col, diag_size, diag, rows, cols 22 | cdef int[:, :] table = cvarray(shape=(n + 1, m + 1), item=sizeof(int), format="i") 23 | cdef int* soruce_ints 24 | cdef int* target_ints 25 | 26 | source_ints = PyMem_Malloc(n * sizeof(int)) 27 | if not source_ints: 28 | raise MemoryError() 29 | target_ints = PyMem_Malloc(m * sizeof(int)) 30 | if not target_ints: 31 | raise MemoryError() 32 | 33 | try: 34 | table[0, 0] = 0 35 | for i in range(1, n + 1): 36 | source_ints[i - 1] = source[i - 1] 37 | for j in range(1, m + 1): 38 | target_ints[j - 1] = target[j - 1] 39 | 40 | rows = n + 1 41 | cols = m + 1 42 | diags = rows + cols 43 | for diag in range(1, diags): 44 | start_col = max(0, diag - rows) 45 | diag_size = min(diag, (cols - start_col), rows) 46 | for k in prange(diag_size, nogil=True): 47 | i = min(rows, diag) - k - 1 48 | j = start_col + k 49 | if i == 0 and j == 0: 50 | continue 51 | elif i == 0: 52 | table[i, j] = table[i, j - 1] + delete_cost(source_ints[j - 1]) 53 | elif j == 0: 54 | table[i, j] = table[i - 1, j] + insert_cost(target_ints[i - 1]) 55 | else: 56 | table[i, j] = min( 57 | table[i - 1, j] + delete_cost(source_ints[i - 1]), 58 | table[i, j - 1] + insert_cost(target_ints[j - 1]), 59 | table[i - 1, j - 1] + substitution_cost(source_ints[i - 1], target_ints[j - 1]) 60 | ) 61 | finally: 62 | PyMem_Free(source_ints) 63 | PyMem_Free(target_ints) 64 | return table[n, m] 65 | 66 | 67 | cpdef int levenshtein_parallel(unicode source, unicode target): 68 | return distance_parallel( 69 | source, target, 70 | insert_cost=ins_func, 71 | delete_cost=del_func, 72 | substitution_cost=sub_func1 73 | ) 74 | -------------------------------------------------------------------------------- /string_distance/token_distance.pxd: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | # cython: boundscheck=False 3 | # cython: wraparound=False 4 | 5 | cdef list n_grams(unicode string, int n) 6 | cdef dict shingle(list ngrams) 7 | cdef dict binary_shingle(list ngrams) 8 | cdef float norm(dict vector) 9 | cdef float binary_cosine(dict source, dict target) 10 | cdef float cosine(dict source, dict target) 11 | cdef float jaccard(dict source, dict target) 12 | -------------------------------------------------------------------------------- /string_distance/token_distance.pyx: -------------------------------------------------------------------------------- 1 | # cython: language_level=3 2 | # cython: boundscheck=False 3 | # cython: wraparound=False 4 | # cython: cdivision=True 5 | 6 | from cpython cimport PyMem_Malloc, PyMem_Free 7 | from libc.math cimport sqrt 8 | 9 | 10 | ctypedef float (*dist_func)(dict source, dict target) 11 | ctypedef dict (*shingle_func)(list ngram) 12 | 13 | 14 | cdef list n_grams(unicode string, int n): 15 | cdef int i 16 | cdef list ngrams = [] 17 | for i in range(len(string) - n + 1): 18 | ngrams.append(string[i:i+n]) 19 | return ngrams 20 | 21 | 22 | cdef dict binary_shingle(list ngrams): 23 | cdef dict shingled = {} 24 | cdef unicode gram 25 | for gram in ngrams: 26 | shingled[gram] = 1 27 | return shingled 28 | 29 | 30 | cdef dict shingle(list ngrams): 31 | cdef dict shingled = {} 32 | cdef unicode gram 33 | for gram in ngrams: 34 | if gram in shingled: 35 | shingled[gram] += 1 36 | else: 37 | shingled[gram] = 1 38 | return shingled 39 | 40 | 41 | cdef float norm(dict vector): 42 | cdef float norm_val = 0 43 | for v in vector.values(): 44 | norm_val += v * v 45 | return sqrt(norm_val) 46 | 47 | 48 | cdef float binary_cosine(dict source, dict target): 49 | """intersection could be computed with `len(set(source) & set(target))` 50 | that is fastest for normal python but with cython explicit loops are faster. 51 | 52 | With`len(source) = 15720` and `len(target) = 46750` 53 | python way = 0.004878520965576172 54 | cython way = 0.002151012420654297 55 | """ 56 | cdef unicode key 57 | cdef int intersection = 0 58 | # Because this is binary the norm of a vector is just the sum of the vector 59 | cdef float norm_ = sqrt(len(source)) * sqrt(len(target)) 60 | for key in source: 61 | if key in target: 62 | intersection += 1 63 | return (intersection) / norm_ 64 | 65 | 66 | cdef float cosine(dict source, dict target): 67 | cdef float source_norm = norm(source) 68 | cdef float target_norm = norm(target) 69 | cdef float norm_ = source_norm * target_norm 70 | cdef float intersection = 0 71 | cdef unicode k 72 | cdef float v 73 | for k, v in source.items(): 74 | if k in target: 75 | intersection += v * target[k] 76 | return (intersection) / norm_ 77 | 78 | 79 | cdef float jaccard(dict source, dict target): 80 | cdef unicode key 81 | cdef set union_ = set() 82 | cdef int intersection = 0 83 | for key in source: 84 | union_.add(key) 85 | if key in target: 86 | intersection += 1 87 | for key in target: 88 | union_.add(key) 89 | return (intersection) / len(union_) 90 | 91 | 92 | cdef float distance( 93 | unicode source, unicode target, 94 | dist_func metric, 95 | shingle_func transform, 96 | int n=2 97 | ): 98 | cdef list source_ngrams, target_ngrams 99 | cdef dict source_vec, target_vec 100 | source_ngrams = n_grams(source, n) 101 | target_ngrams = n_grams(target, n) 102 | source_vec = transform(source_ngrams) 103 | target_vec = transform(target_ngrams) 104 | return 1 - metric(source_vec, target_vec) 105 | 106 | 107 | cpdef float binary_cosine_distance(unicode source, unicode target, int n=2): 108 | return distance(source, target, binary_cosine, binary_shingle, n) 109 | 110 | 111 | cpdef float cosine_distance(unicode source, unicode target, int n=2): 112 | return distance(source, target, cosine, shingle, n) 113 | 114 | 115 | cpdef float jaccard_distance(unicode source, unicode target, int n=2): 116 | return distance(source, target, jaccard, binary_shingle, n) 117 | -------------------------------------------------------------------------------- /tests/bm25_test.py: -------------------------------------------------------------------------------- 1 | import math 2 | from string_distance import BM25, bm25_scores 3 | 4 | corpus = [ 5 | ["black", "cat", "white", "cat"], 6 | ["cat", "outer", "space"], 7 | ["wag", "dog"], 8 | ] 9 | 10 | gold = [ 11 | [1.1237959024144617, 0.1824377227735681, 0], 12 | [0.11770175662810844, 1.1128701089187656, 0], 13 | [0, 0, 1.201942644155272] 14 | ] 15 | 16 | def test_bm_weights(): 17 | scores = bm25_scores(corpus) 18 | for row, gold_row in zip(scores, gold): 19 | for weight, gold_weight in zip(row, gold_row): 20 | assert math.isclose(weight, gold_weight) 21 | 22 | def test_closure(): 23 | gold = 1.0216512475319814 24 | bm25 = BM25(corpus) 25 | weight = bm25(["rocket", "in", "outer", "space"], 1) 26 | assert math.isclose(weight, gold) 27 | -------------------------------------------------------------------------------- /tests/edit_distance_test.py: -------------------------------------------------------------------------------- 1 | import math 2 | import string 3 | import random 4 | import pytest 5 | from string_distance import ( 6 | hamming, 7 | levenshtein, 8 | levenshtein_no_sub, 9 | brew, 10 | damerau_levenshtein, 11 | longest_common_subsequence, 12 | jaro, 13 | jaro_winkler 14 | ) 15 | 16 | 17 | # Brew Tests 18 | def test_brew(): 19 | source = "Hosp" 20 | target = "Hospital" 21 | gold = 0.4 22 | assert math.isclose(brew(source, target), gold, rel_tol=1e-7) 23 | 24 | 25 | def test_brew_longer_source(): 26 | source = "Hospital" 27 | target = "Hosp" 28 | brew(source, target) 29 | 30 | 31 | def test_brew_different_when_swapped(): 32 | source = "asdfbne" 33 | target = "asejrb" 34 | assert brew(source, target) != brew(target, source) 35 | 36 | 37 | def test_brew_empty_source(): 38 | source = "" 39 | target = "asdfb" 40 | # From brew insert cost 41 | gold = len(target) * 0.1 42 | assert brew(source, target) == gold 43 | 44 | 45 | def test_brew_empty_target(): 46 | source = "asdfb" 47 | target = "" 48 | # From brew delete cost 49 | gold = len(source) * 15 50 | assert brew(source, target) == gold 51 | 52 | 53 | def test_brew_both_empty(): 54 | source = "" 55 | target = "" 56 | gold = 0 57 | assert brew(source, target) == gold 58 | 59 | 60 | # Levenshtein Tests 61 | def test_levenshtein(): 62 | source = "intention" 63 | target = "execution" 64 | gold = 5 65 | assert levenshtein(source, target) == gold 66 | 67 | def test_levenshtein_longer_source(): 68 | source = "aaaaaaaa" 69 | target = "aaaaa" 70 | gold = abs(len(source) - len(target)) * 1 71 | assert levenshtein(source, target) == gold 72 | 73 | def test_levenshtein_longer_target(): 74 | source = "aaaaa" 75 | target = "aaaaaaaa" 76 | gold = abs(len(source) - len(target)) * 1 77 | assert levenshtein(source, target) == gold 78 | 79 | def test_levenshtein_empty_source(): 80 | source = "" 81 | target = "aaaaa" 82 | gold = len(target) * 1 83 | assert levenshtein(source, target) == gold 84 | 85 | def test_levenshtein_empty_target(): 86 | source = "aaaaa" 87 | target = "" 88 | gold = len(source) * 1 89 | assert levenshtein(source, target) == gold 90 | 91 | def test_levenshtein_both_empty(): 92 | source = "" 93 | target = "" 94 | gold = 0 95 | assert levenshtein(source, target) == gold 96 | 97 | 98 | # Levenshtein_no_sub Tests 99 | def test_no_sub_is_different(): 100 | source = "aabbc" 101 | target = "aabbz" 102 | assert levenshtein(source, target) != levenshtein_no_sub(source, target) 103 | 104 | 105 | def test_levenshtein_no_sub(): 106 | source = "intention" 107 | target = "execution" 108 | gold = 8 109 | assert levenshtein_no_sub(source, target) == gold 110 | 111 | def test_levenshtein_no_sub_longer_source(): 112 | source = "aaaaaaaa" 113 | target = "aaaaa" 114 | gold = abs(len(source) - len(target)) * 1 115 | assert levenshtein_no_sub(source, target) == gold 116 | 117 | def test_levenshtein_no_sub_longer_target(): 118 | source = "aaaaa" 119 | target = "aaaaaaaa" 120 | gold = abs(len(source) - len(target)) * 1 121 | assert levenshtein_no_sub(source, target) == gold 122 | 123 | def test_levenshtein_no_sub_empty_source(): 124 | source = "" 125 | target = "aaaaa" 126 | gold = len(target) * 1 127 | assert levenshtein_no_sub(source, target) == gold 128 | 129 | def test_levenshtein_no_sub_empty_target(): 130 | source = "aaaaa" 131 | target = "" 132 | gold = len(source) * 1 133 | assert levenshtein_no_sub(source, target) == gold 134 | 135 | def test_levenshtein_no_subboth_empty(): 136 | source = "" 137 | target = "" 138 | gold = 0 139 | assert levenshtein_no_sub(source, target) == gold 140 | 141 | 142 | # Damerau Levenshtein 143 | def test_dl_is_different(): 144 | source = "agdaabbvnb" 145 | target = "aedababicb" 146 | assert damerau_levenshtein(source, target) != levenshtein(source, target) 147 | assert damerau_levenshtein(source, target) == levenshtein(source, target) - 1 148 | 149 | def test_dl(): 150 | source = "from" 151 | target = "form" 152 | gold = 1 153 | assert damerau_levenshtein(source, target) == gold 154 | 155 | def test_dl_2(): 156 | source = "jurisdiction" 157 | target = "Copley" 158 | gold = 12 159 | assert damerau_levenshtein(source, target) == gold 160 | 161 | # Hamming Tests 162 | def test_hamming_size_mismatch(): 163 | source = "ABCD" 164 | target = "AB" 165 | with pytest.raises(ValueError): 166 | hamming(source, target) 167 | 168 | 169 | def test_hamming_empty(): 170 | source = "" 171 | target = "" 172 | gold = 0 173 | assert hamming(source, target) == gold 174 | 175 | 176 | def test_hamming(): 177 | source = "100110110" 178 | target = "101110011" 179 | gold = 3 180 | assert hamming(source, target) == gold 181 | 182 | 183 | def test_edit_distance_from_LCS(): 184 | """Edit distance (with only insert and delete) should be equ with 185 | len(a) + len(b) + 2 * lcs(a, b) 186 | from: Mining of Massive Datasets second ed. ch 3 page 96 187 | """ 188 | def real_test(source, target): 189 | lcs = longest_common_subsequence(source, target) 190 | gold = levenshtein_no_sub(source, target) 191 | assert (len(source) + len(target) - (2 * lcs)) == gold 192 | 193 | trials = 100 194 | for _ in range(trials): 195 | source = ''.join([random.choice(("G", "A", "T", "C")) for _ in range(random.randint(5, 41))]) 196 | target = ''.join([random.choice(("G", "A", "T", "C")) for _ in range(random.randint(5, 41))]) 197 | real_test(source, target) 198 | 199 | # Jaro(_Winker)? tests 200 | def test_jaro_vs_winkler_prefix_match(): 201 | prefix = ''.join([random.choice(string.ascii_lowercase) for _ in range(4)]) 202 | source = ''.join([random.choice(string.ascii_lowercase) for _ in range(10)]) 203 | target = ''.join([random.choice(string.ascii_lowercase) for _ in range(10)]) 204 | source = prefix + source 205 | target = prefix + target 206 | j = jaro(source, target) 207 | jw = jaro_winkler(source, target) 208 | assert not math.isclose(j, jw) 209 | 210 | def test_jaro_vs_winkler_prefix_mismatch(): 211 | prefix = ''.join([random.choice(string.ascii_lowercase) for _ in range(4)]) 212 | prefix2 = ''.join([chr(ord(c) + 1) for c in prefix]) 213 | source = ''.join([random.choice(string.ascii_lowercase) for _ in range(10)]) 214 | target = ''.join([random.choice(string.ascii_lowercase) for _ in range(10)]) 215 | source = prefix + source 216 | target = prefix2 + target 217 | assert prefix != prefix2 218 | j = jaro(source, target) 219 | jw = jaro_winkler(source, target) 220 | assert math.isclose(j, jw) 221 | 222 | def test_jaro(): 223 | source = "This is a test" 224 | target = "There is goes" 225 | gold = 0.7257742285728455 226 | assert math.isclose(jaro(source, target), gold) 227 | 228 | def test_jaro_winkler(): 229 | source = "This is a test" 230 | target = "There is goes" 231 | gold = 0.7806193828582764 232 | assert math.isclose(jaro_winkler(source, target), gold) 233 | -------------------------------------------------------------------------------- /tests/sequence_distance_test.py: -------------------------------------------------------------------------------- 1 | import math 2 | from string_distance import ( 3 | longest_common_substring, 4 | longest_common_substring_string, 5 | longest_common_subsequence, 6 | ratcliff_obershelp 7 | ) 8 | 9 | 10 | # Subsequence Tests 11 | def test_longest_common_subsequence_longer_source(): 12 | source = "bhgABCkdkrnDEFjasfdGkkuHIJ" 13 | target = "aABCDEFGHIJfd" 14 | gold = len("ABCDEFGHIJ") 15 | assert longest_common_subsequence(source, target) == gold 16 | 17 | 18 | def test_longest_common_subsequence_longer_target(): 19 | source = "aABCDEFGHIJfd" 20 | target = "bhgABCkdkrnDEFjasfdGkkuHIJ" 21 | gold = len("ABCDEFGHIJ") 22 | assert longest_common_subsequence(source, target) == gold 23 | 24 | 25 | def test_longest_common_subsequence_empty_source(): 26 | source = "" 27 | target = "ABCD" 28 | gold = 0 29 | assert longest_common_subsequence(source, target) == gold 30 | 31 | 32 | def test_longest_common_subsequence_empty_target(): 33 | source = "ABCD" 34 | target = "" 35 | gold = 0 36 | assert longest_common_subsequence(source, target) == gold 37 | 38 | 39 | def test_longest_common_subsequence_both_empty(): 40 | source = "" 41 | target = "" 42 | gold = 0 43 | assert longest_common_subsequence(source, target) == gold 44 | 45 | 46 | # Substring Tests 47 | def test_longest_common_substring_longer_source(): 48 | source = "asdnerhABCDEFfjekn" 49 | target = "fndbaABCDjEFjj" 50 | gold = len("ABCD") 51 | assert longest_common_substring(source, target) == gold 52 | 53 | 54 | def test_longest_common_substring_longer_target(): 55 | source = "fndbaABCDjEFjj" 56 | target = "asdnerhABCDEFfjekn" 57 | gold = len("ABCD") 58 | assert longest_common_substring(source, target) == gold 59 | 60 | 61 | def test_longest_common_substring_empty_source(): 62 | source = "" 63 | target = "asdnerhABCDEFfjekn" 64 | gold = 0 65 | assert longest_common_substring(source, target) == gold 66 | 67 | 68 | def test_longest_common_substring_empty_target(): 69 | source = "asdnerhABCDEFfjekn" 70 | target = "" 71 | gold = 0 72 | assert longest_common_substring(source, target) == gold 73 | 74 | 75 | def test_longest_common_substring_empty_target(): 76 | source = "" 77 | target = "" 78 | gold = 0 79 | assert longest_common_substring(source, target) == gold 80 | 81 | 82 | def test_length_and_string_match(): 83 | source = "cvABCDghHJJL" 84 | target = "ABCDjghjJJL" 85 | length = longest_common_substring(source, target) 86 | str_length = len(longest_common_substring_string(source, target)) 87 | assert length == str_length 88 | 89 | # Ratcliff Obershelp Tests 90 | def test_rco(): 91 | source = "penVAINia" 92 | target = "pinVAINeya" 93 | gold = (len("pnVAINa") * 2) / (len(source) + len(target)) 94 | assert math.isclose(ratcliff_obershelp(source, target), gold, rel_tol=1e-5) 95 | 96 | 97 | def test_rco_longer_source(): 98 | source = "pinVAINeya" 99 | target = "penVAINia" 100 | gold = (len("pnVAINa") * 2) / (len(source) + len(target)) 101 | assert math.isclose(ratcliff_obershelp(source, target), gold, rel_tol=1e-5) 102 | 103 | 104 | def test_rco_empty_source(): 105 | source = "" 106 | target = "penVAINia" 107 | gold = 0 108 | assert math.isclose(ratcliff_obershelp(source, target), gold, rel_tol=1e-5) 109 | 110 | 111 | def test_rco_empty_target(): 112 | source = "pinVAINeya" 113 | target = "" 114 | gold = 0 115 | assert math.isclose(ratcliff_obershelp(source, target), gold, rel_tol=1e-5) 116 | 117 | 118 | def test_rco_both_empty(): 119 | source = "" 120 | target = "" 121 | gold = 0 122 | assert math.isclose(ratcliff_obershelp(source, target), gold, rel_tol=1e-5) 123 | -------------------------------------------------------------------------------- /tests/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from Cython.Build import cythonize 3 | 4 | setup( 5 | name = 'Tests', 6 | ext_modules = cythonize(["*.pyx"]), 7 | ) 8 | -------------------------------------------------------------------------------- /tests/token_distance_test.py: -------------------------------------------------------------------------------- 1 | import math 2 | from collections import Counter 3 | from string_distance import cosine_distance, jaccard_distance, binary_cosine_distance 4 | # import pyximport; pyximport.install(build_in_temp=True, inplace=True) 5 | from wrapper import * 6 | 7 | def test_cosine_and_binary_different(): 8 | source = "aabbaabcc" 9 | target = "abbacc" 10 | assert cosine_distance(source, target) != binary_cosine_distance(source, target) 11 | 12 | def test_cosine_equality(): 13 | for i in range(2, 7): 14 | word = ''.join(str(n) for n in range(i)) 15 | assert math.isclose(cosine_distance(word, word), 0, abs_tol=1e-6) 16 | 17 | def test_cosine_distance(): 18 | source = "aabbbabaccb" 19 | target = "abbbaccdf" 20 | gold = 0.20943057537078857 21 | assert cosine_distance(source, target) == gold 22 | 23 | def test_binary_cosine_distance(): 24 | source = "aabbbabaccb" 25 | target = "abbbaccdf" 26 | gold = 0.2857142686843872 27 | assert binary_cosine_distance(source, target) == gold 28 | 29 | def test_jaccard_distance(): 30 | source = "abcd" 31 | source_shingled = ["ab", "bc", "cd"] 32 | target = "abddef" 33 | target_shingled = ["ab", "bd", "dd", "de", "ef"] 34 | n = 2 35 | intersection = len(set(source_shingled) & set(target_shingled)) 36 | union = len(set(source_shingled) | set(target_shingled)) 37 | gold = 1 - (intersection / union) 38 | assert math.isclose(jaccard_distance(source, target, n=n), gold, rel_tol=1e-5) 39 | 40 | def test_n_grams_2(): 41 | n = 2 42 | source = "abcdef" 43 | gold = ["ab", "bc", "cd", "de", "ef"] 44 | assert n_grams(source, n) == gold 45 | 46 | def test_n_grams_3(): 47 | n = 3 48 | source = "abcdef" 49 | gold = ["abc", "bcd", "cde", "def"] 50 | assert n_grams(source, n) == gold 51 | 52 | def test_shingle(): 53 | source = ["aa", "aa", "ab", "bc", "cc", "bc"] 54 | gold = Counter(source) 55 | res = shingle(source) 56 | for k in res: 57 | assert k in gold 58 | assert res[k] == gold[k] 59 | for k in gold: 60 | assert k in res 61 | 62 | def test_binary_shingle(): 63 | source = ["aa", "aa", "ab", "bc", "cc", "bc"] 64 | gold = Counter(source) 65 | res = binary_shingle(source) 66 | for k in res: 67 | assert k in gold 68 | assert res[k] == 1 69 | for k in gold: 70 | assert k in res 71 | assert res["aa"] != gold["aa"] 72 | assert res["bc"] != gold["bc"] 73 | 74 | def test_norm(): 75 | input_ = {"aa": 12, "bb": 1, "c": 9} 76 | gold = math.sqrt(12 * 12 + 1 * 1 + 9 * 9) 77 | assert math.isclose(norm(input_), gold, rel_tol=1e-6) 78 | 79 | def test_cosine(): 80 | source = dict(Counter(["1", "1", "2", "3", "4", "7", "0"])) 81 | target = dict(Counter(["1", "2", "2", "5", "5", "6", "9"])) 82 | gold = 4 / (norm(source) * norm(target)) 83 | assert math.isclose(cosine(source, target), gold, rel_tol=1e-6) 84 | 85 | 86 | def test_binary_cosine(): 87 | source = dict.fromkeys(["1", "1", "2", "3", "4", "7", "0"], 1) 88 | target = dict.fromkeys(["1", "2", "2", "5", "5", "6", "9"], 1) 89 | gold = 2 / (norm(source) * norm(target)) 90 | assert math.isclose(binary_cosine(source, target), gold, rel_tol=1e-6) 91 | 92 | def test_jaccard(): 93 | source = dict.fromkeys(["ab", "bc", "cd"]) 94 | target = dict.fromkeys(["ab", "bd", "dd", "de", "ef"]) 95 | intersection = len(set(source) & set(target)) 96 | union = len(set(source) | set(target)) 97 | gold = intersection / union 98 | assert math.isclose(jaccard(source, target), gold, rel_tol=1e-6) 99 | -------------------------------------------------------------------------------- /tests/wrapper.pyx: -------------------------------------------------------------------------------- 1 | from string_distance.token_distance cimport ( 2 | n_grams as cn_grams, 3 | shingle as cshingle, 4 | norm as cnorm, 5 | binary_shingle as cbinary_shingle, 6 | binary_cosine as cbin_cosine, 7 | cosine as ccosine, 8 | jaccard as cjaccard, 9 | ) 10 | 11 | def n_grams(string, n): 12 | return cn_grams(string, n) 13 | 14 | def shingle(n_g): 15 | return cshingle(n_g) 16 | 17 | def binary_shingle(n_g): 18 | return cbinary_shingle(n_g) 19 | 20 | def norm(d): 21 | return cnorm(d) 22 | 23 | def binary_cosine(s, t): 24 | return cbin_cosine(s, t) 25 | 26 | def cosine(s, t): 27 | return ccosine(s, t) 28 | 29 | def jaccard(s, t): 30 | return cjaccard(s, t) 31 | --------------------------------------------------------------------------------