├── .gitignore ├── .travis.yml ├── CHANGES.txt ├── README.md ├── appveyor.yml ├── asv.conf.json ├── benchmarks ├── __init__.py └── benchamarks.py ├── continuous-integration └── appveyor │ ├── install.ps1 │ ├── rm_rf.py │ └── run_with_env.cmd ├── docs ├── API.rst ├── Installation.rst ├── Makefile ├── Simfunctions.rst ├── Tokenizers.rst ├── Tutorial.rst ├── conf.py ├── index.rst └── make.bat ├── py_stringmatching ├── __init__.py ├── compat.py ├── simfunctions.py ├── tests │ ├── test_simfunctions.py │ └── test_tokenizers.py ├── tokenizers.py └── utils.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | ### VirtualEnv template 61 | # Virtualenv 62 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 63 | [Bb]in 64 | [Ii]nclude 65 | [Ll]ib 66 | [Ss]cripts 67 | pyvenv.cfg 68 | pip-selfcheck.json 69 | ### IPythonNotebook template 70 | # Temporary data 71 | .ipynb_checkpoints/ 72 | 73 | # idea 74 | .idea 75 | 76 | #scratch 77 | scratch 78 | # Created by .ignore support plugin (hsz.mobi) 79 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.3" 5 | - "3.4" 6 | - "3.5" 7 | install: 8 | - pip install -r requirements.txt 9 | - pip install codecov 10 | 11 | script: 12 | - nosetests 13 | # - coverage run py_stringmatching/tests/test_simfunctions.py 14 | # - coverage run py_stringmatching/tests/test_tokenizers.py 15 | 16 | after_success: 17 | - codecov 18 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvpradap/py_stringmatching/abc3df5d4db5ebfef648c9cc069d95e4468f6f19/CHANGES.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Travis CI Status [![Build Status](https://travis-ci.org/kvpradap/py_stringmatching.svg?branch=master)](https://travis-ci.org/kvpradap/py_stringmatching) 2 | 3 | # py-stringmatching 4 | Python library for string matching! 
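A minimal usage sketch (mirroring the example in docs/Tutorial.rst; assumes the package is already installed):

```python
# Tokenize two strings on whitespace, then compare the resulting
# token lists with cosine similarity (both helpers ship with the package).
from py_stringmatching import simfunctions, tokenizers

x = 'this is a string matching package for data science class'
y = 'this string matching package can be used to generate features'
print(simfunctions.cosine(tokenizers.whitespace(x), tokenizers.whitespace(y)))
```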
5 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | 2 | environment: 3 | 4 | matrix: 5 | - PYTHON: "C:\\Python27_32" 6 | PYTHON_VERSION: "2.7" 7 | PYTHON_ARCH: "32" 8 | CONDA_PY: "27" 9 | 10 | - PYTHON: "C:\\Python27_64" 11 | PYTHON_VERSION: "2.7" 12 | PYTHON_ARCH: "64" 13 | CONDA_PY: "27" 14 | 15 | - PYTHON: "C:\\Python34_32" 16 | PYTHON_VERSION: "3.4" 17 | PYTHON_ARCH: "32" 18 | CONDA_PY: "34" 19 | 20 | - PYTHON: "C:\\Python34_64" 21 | PYTHON_VERSION: "3.4" 22 | PYTHON_ARCH: "64" 23 | CONDA_PY: "34" 24 | 25 | 26 | install: 27 | # this installs the appropriate Miniconda (Py2/Py3, 32/64 bit) 28 | - powershell .\\continuous-integration\\appveyor\\install.ps1 29 | - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" 30 | 31 | # Don't install from requirements-pip.txt; python-coveralls seems to have broken dependencies on Windows. 32 | - conda install --yes setuptools nose numpy pip coverage 33 | - pip install six python-Levenshtein rednose 34 | - python setup.py install 35 | 36 | build: false 37 | 38 | test_script: 39 | # Nosetests take care of unit tests 40 | # Behave runs the example scripts and tries to verify that they produce the right output 41 | - nosetests 42 | #- behave --tags ~@skip # Everything without the tag @skip 43 | 44 | on_success: 45 | # Could run coveralls here but will leave that to travis tests 46 | - echo Build successful! 47 | #- coverage report 48 | # coveralls 49 | -------------------------------------------------------------------------------- /asv.conf.json: -------------------------------------------------------------------------------- 1 | { 2 | // The version of the config file format. Do not change, unless 3 | // you know what you are doing. 4 | "version": 1, 5 | // The name of the project being benchmarked 6 | "project": ".", 7 | // The project's homepage 8 | "project_url": "https://github.com/kvpradap/py_stringmatching/", 9 | // The URL or local path of the source code repository for the 10 | // project being benchmarked 11 | "repo": "https://github.com/kvpradap/py_stringmatching.git", 12 | //"repo": "/Users/pradap/Documents/Research/Python-Package/py_stringmatching", 13 | 14 | // List of branches to benchmark. If not provided, defaults to "master" 15 | // (for git) or "tip" (for mercurial). 16 | "branches": [ 17 | "master" 18 | ], 19 | // for git 20 | // "branches": ["tip"], // for mercurial 21 | 22 | // The DVCS being used. If not set, it will be automatically 23 | // determined from "repo" by looking at the protocol in the URL 24 | // (if remote), or by looking for special directories, such as 25 | // ".git" (if local). 26 | // "dvcs": "git", 27 | 28 | // The tool to use to create environments. May be "conda", 29 | // "virtualenv" or other value depending on the plugins in use. 30 | // If missing or the empty string, the tool will be automatically 31 | // determined by looking for tools on the PATH environment 32 | // variable. 33 | "environment_type": "conda", 34 | // The base URL to show a commit for the project. 35 | "show_commit_url": "https://github.com/kvpradap/py_stringmatching/commit/", 36 | // The Pythons you'd like to test against. If not provided, defaults 37 | // to the current version of Python used to run `asv`. 38 | "pythons": [ 39 | "2.7", 40 | "3.3", 41 | "3.4" 42 | ], 43 | //"pythons": ["3.3"], 44 | 45 | // The matrix of dependencies to test.
Each key is the name of a 46 | // package (in PyPI) and the values are version numbers. An empty 47 | // list indicates to just test against the default (latest) 48 | // version. 49 | // "matrix": { 50 | // "numpy": ["1.6", "1.7"] 51 | // }, 52 | 53 | // The directory (relative to the current directory) that benchmarks are 54 | // stored in. If not provided, defaults to "benchmarks" 55 | "benchmark_dir": "benchmarks" 56 | 57 | // The directory (relative to the current directory) to cache the Python 58 | // environments in. If not provided, defaults to "env" 59 | // "env_dir": "env", 60 | 61 | 62 | // The directory (relative to the current directory) that raw benchmark 63 | // results are stored in. If not provided, defaults to "results". 64 | // "results_dir": "results", 65 | 66 | // The directory (relative to the current directory) that the html tree 67 | // should be written to. If not provided, defaults to "html". 68 | // "html_dir": "html", 69 | 70 | // The number of characters to retain in the commit hashes. 71 | // "hash_length": 8, 72 | 73 | // `asv` will cache wheels of the recent builds in each 74 | // environment, making them faster to install next time. This is 75 | // number of builds to keep, per environment. 76 | // "wheel_cache_size": 0 77 | } 78 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | _short_string_1 = 'badgerdi' 2 | _short_string_2 = 'diproject' 3 | 4 | _medium_string_1 = 'data integration' 5 | _medium_string_2 = 'entity matching' 6 | 7 | _long_string_1 = 'Data integration involves combining data residing in different sources and ' \ 8 | 'providing users with a unified view of these data' # > 12 9 | _long_string_2 = 'Record linkage (RL) refers to the task of finding records in a data set that ' \ 10 | 'refer to the same entity across different data sources ' 11 | _long_hamm_string1 = 'Data integration involves combining data residing in different sources and ' \ 12 | 'providing users with a unified view of these data' 13 | _long_hamm_string2 = 'Data integration involves combining data residing in different sources and ' \ 14 | 'providing users with a unified vieu of these data' 15 | 16 | _small_num_tokens_wo_rep = ['data', 'integration'] 17 | _small_num_tokens_wi_rep = ['data', 'integration'] 18 | _med_num_tokens_wo_rep = ['data', 'integration', 'involves', 'data', 19 | 'residing', 'in', 'different', 'sources'] 20 | _med_num_tokens_wi_rep = ['data', 'integration', 'involves', 'data', 21 | 'integration', 'in', 'different', 'data'] 22 | 23 | _large_num_tokens_wo_rep = ['Data', 'integration', 'involves', 'combining', 'data', 'residing', 'in', 24 | 'different', 'sources', 'and', 'providing', 'users', 'with', 'a', 'unified', 25 | 'view', 'of', 'these', 'data.', 'This', 'process', 'becomes', 'significant', 'in', 26 | 'a', 'variety', 'of', 'situations.'] 27 | 28 | _large_num_tokens_wi_rep = ['Data', 'integration', 'involves', 'combining', 'data', 'data', 'in', 29 | 'different', 'sources', 'and', 'different', 'users', 'with', 'a', 'unified', 30 | 'view', 'of', 'these', 'data.', 'This', 'data', 'becomes', 'significant', 'in', 31 | 'a', 'different', 'of', 'data.'] 32 | -------------------------------------------------------------------------------- /benchmarks/benchamarks.py: -------------------------------------------------------------------------------- 1 | # Write the benchmarking functions here. 
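# (A note on structure, stated here as an aside: asv collects classes like the ones
# below and treats every method whose name starts with `time_` as a timing
# benchmark, so no explicit registration of the benchmark cases is needed.)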
2 | # See "Writing benchmarks" in the asv docs for more information. 3 | 4 | from py_stringmatching import simfunctions 5 | from . import _short_string_1, _long_string_1, _medium_string_1, _short_string_2, _long_string_2, _medium_string_2 6 | from . import _small_num_tokens_wi_rep, _small_num_tokens_wo_rep, _med_num_tokens_wi_rep, _med_num_tokens_wo_rep, \ 7 | _large_num_tokens_wi_rep, _large_num_tokens_wo_rep, _long_hamm_string1, _long_hamm_string2 8 | 9 | 10 | class TimeAffine: 11 | def time_short_short(self): 12 | simfunctions.affine(_short_string_1, _short_string_2) 13 | 14 | def time_medium_medium(self): 15 | simfunctions.affine(_medium_string_1, _medium_string_2) 16 | 17 | def time_long_long(self): 18 | simfunctions.affine(_long_string_1, _long_string_2) 19 | 20 | def time_short_medium(self): 21 | simfunctions.affine(_short_string_1, _medium_string_1) 22 | 23 | def time_short_long(self): 24 | simfunctions.affine(_short_string_1, _long_string_1) 25 | 26 | def time_medium_long(self): 27 | simfunctions.affine(_medium_string_1, _long_string_1) 28 | 29 | 30 | class TimeJaro: 31 | def time_short_short(self): 32 | simfunctions.jaro(_short_string_1, _short_string_2) 33 | 34 | def time_medium_medium(self): 35 | simfunctions.jaro(_medium_string_1, _medium_string_2) 36 | 37 | def time_long_long(self): 38 | simfunctions.jaro(_long_string_1, _long_string_2) 39 | 40 | def time_short_medium(self): 41 | simfunctions.jaro(_short_string_1, _medium_string_1) 42 | 43 | def time_short_long(self): 44 | simfunctions.jaro(_short_string_1, _long_string_1) 45 | 46 | def time_medium_long(self): 47 | simfunctions.jaro(_medium_string_1, _long_string_1) 48 | 49 | 50 | class TimeJaroWinkler: 51 | def time_short_short(self): 52 | simfunctions.jaro_winkler(_short_string_1, _short_string_2) 53 | 54 | def time_medium_medium(self): 55 | simfunctions.jaro_winkler(_medium_string_1, _medium_string_2) 56 | 57 | def time_long_long(self): 58 | simfunctions.jaro_winkler(_long_string_1, _long_string_2) 59 | 60 | def time_short_medium(self): 61 | simfunctions.jaro_winkler(_short_string_1, _medium_string_1) 62 | 63 | def time_short_long(self): 64 | simfunctions.jaro_winkler(_short_string_1, _long_string_1) 65 | 66 | def time_medium_long(self): 67 | simfunctions.jaro_winkler(_medium_string_1, _long_string_1) 68 | 69 | 70 | class TimeHammingDistance: 71 | def time_short_short(self): 72 | simfunctions.hamming_distance(_short_string_1, _short_string_1) 73 | 74 | def time_medium_medium(self): 75 | simfunctions.hamming_distance(_medium_string_1, _medium_string_1) 76 | 77 | def time_long_long(self): 78 | simfunctions.hamming_distance(_long_hamm_string1, _long_hamm_string2) 79 | 80 | # def time_short_medium(self): 81 | # simfunctions.hamming_distance(_short_string_1, _medium_string_1) 82 | # 83 | # def time_short_long(self): 84 | # simfunctions.hamming_distance(_short_string_1, _long_string_1) 85 | # 86 | # def time_medium_long(self): 87 | # simfunctions.hamming_distance(_medium_string_1, _long_string_1) 88 | 89 | 90 | # 91 | # class TimeJaro1: 92 | # def time_short_short(self): 93 | # Levenshtein.jaro(_short_string_1, _short_string_2) 94 | # 95 | # def time_medium_medium(self): 96 | # Levenshtein.jaro(_medium_string_1, _medium_string_2) 97 | # 98 | # def time_long_long(self): 99 | # Levenshtein.jaro(_long_string_1, _long_string_2) 100 | # 101 | # def time_short_medium(self): 102 | # Levenshtein.jaro(_short_string_1, _medium_string_1) 103 | # 104 | # def time_short_long(self): 105 | # Levenshtein.jaro(_short_string_1, _long_string_1) 
106 | # 107 | # def time_medium_long(self): 108 | # Levenshtein.jaro(_medium_string_1, _long_string_1) 109 | # 110 | # 111 | class TimeLevenshtein: 112 | def time_short_short(self): 113 | simfunctions.levenshtein(_short_string_1, _short_string_2) 114 | 115 | def time_medium_medium(self): 116 | simfunctions.levenshtein(_medium_string_1, _medium_string_2) 117 | 118 | def time_long_long(self): 119 | simfunctions.levenshtein(_long_string_1, _long_string_2) 120 | 121 | def time_short_medium(self): 122 | simfunctions.levenshtein(_short_string_1, _medium_string_1) 123 | 124 | def time_short_long(self): 125 | simfunctions.levenshtein(_short_string_1, _long_string_1) 126 | 127 | def time_medium_long(self): 128 | simfunctions.levenshtein(_medium_string_1, _long_string_1) 129 | 130 | 131 | class TimeNeedlemanWunsch: 132 | def time_short_short(self): 133 | simfunctions.needleman_wunsch(_short_string_1, _short_string_2) 134 | 135 | def time_medium_medium(self): 136 | simfunctions.needleman_wunsch(_medium_string_1, _medium_string_2) 137 | 138 | def time_long_long(self): 139 | simfunctions.needleman_wunsch(_long_string_1, _long_string_2) 140 | 141 | def time_short_medium(self): 142 | simfunctions.needleman_wunsch(_short_string_1, _medium_string_1) 143 | 144 | def time_short_long(self): 145 | simfunctions.needleman_wunsch(_short_string_1, _long_string_1) 146 | 147 | def time_medium_long(self): 148 | simfunctions.needleman_wunsch(_medium_string_1, _long_string_1) 149 | 150 | 151 | class TimeSmithWaterman: 152 | def time_short_short(self): 153 | simfunctions.smith_waterman(_short_string_1, _short_string_2) 154 | 155 | def time_medium_medium(self): 156 | simfunctions.smith_waterman(_medium_string_1, _medium_string_2) 157 | 158 | def time_long_long(self): 159 | simfunctions.smith_waterman(_long_string_1, _long_string_2) 160 | 161 | def time_short_medium(self): 162 | simfunctions.smith_waterman(_short_string_1, _medium_string_1) 163 | 164 | def time_short_long(self): 165 | simfunctions.smith_waterman(_short_string_1, _long_string_1) 166 | 167 | def time_medium_long(self): 168 | simfunctions.smith_waterman(_medium_string_1, _long_string_1) 169 | 170 | 171 | class TimeCosine: 172 | def time_small_small_wo_rep(self): 173 | simfunctions.cosine(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep) 174 | 175 | def time_small_small_wi_rep(self): 176 | simfunctions.cosine(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep) 177 | 178 | def time_medium_medium_wo_rep(self): 179 | simfunctions.cosine(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep) 180 | 181 | def time_medium_medium_wi_rep(self): 182 | simfunctions.cosine(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep) 183 | 184 | def time_large_large_wo_rep(self): 185 | simfunctions.cosine(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep) 186 | 187 | def time_large_large_wi_rep(self): 188 | simfunctions.cosine(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep) 189 | 190 | def time_small_medium_wo_rep(self): 191 | simfunctions.cosine(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep) 192 | 193 | def time_small_medium_wi_rep(self): 194 | simfunctions.cosine(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep) 195 | 196 | def time_small_large_wo_rep(self): 197 | simfunctions.cosine(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep) 198 | 199 | def time_small_large_wi_rep(self): 200 | simfunctions.cosine(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep) 201 | 202 | def time_medium_large_wo_rep(self): 203 | simfunctions.cosine(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep) 
204 | 205 | def time_medium_large_wi_rep(self): 206 | simfunctions.cosine(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep) 207 | 208 | 209 | class TimeJaccard: 210 | def time_small_small_wo_rep(self): 211 | simfunctions.jaccard(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep) 212 | 213 | def time_small_small_wi_rep(self): 214 | simfunctions.jaccard(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep) 215 | 216 | def time_medium_medium_wo_rep(self): 217 | simfunctions.jaccard(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep) 218 | 219 | def time_medium_medium_wi_rep(self): 220 | simfunctions.jaccard(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep) 221 | 222 | def time_large_large_wo_rep(self): 223 | simfunctions.jaccard(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep) 224 | 225 | def time_large_large_wi_rep(self): 226 | simfunctions.jaccard(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep) 227 | 228 | def time_small_medium_wo_rep(self): 229 | simfunctions.jaccard(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep) 230 | 231 | def time_small_medium_wi_rep(self): 232 | simfunctions.jaccard(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep) 233 | 234 | def time_small_large_wo_rep(self): 235 | simfunctions.jaccard(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep) 236 | 237 | def time_small_large_wi_rep(self): 238 | simfunctions.jaccard(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep) 239 | 240 | def time_medium_large_wo_rep(self): 241 | simfunctions.jaccard(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep) 242 | 243 | def time_medium_large_wi_rep(self): 244 | simfunctions.jaccard(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep) 245 | 246 | 247 | class TimeOverlap: 248 | def time_small_small_wo_rep(self): 249 | simfunctions.overlap_coefficient(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep) 250 | 251 | def time_small_small_wi_rep(self): 252 | simfunctions.overlap_coefficient(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep) 253 | 254 | def time_medium_medium_wo_rep(self): 255 | simfunctions.overlap_coefficient(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep) 256 | 257 | def time_medium_medium_wi_rep(self): 258 | simfunctions.overlap_coefficient(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep) 259 | 260 | def time_large_large_wo_rep(self): 261 | simfunctions.overlap_coefficient(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep) 262 | 263 | def time_large_large_wi_rep(self): 264 | simfunctions.overlap_coefficient(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep) 265 | 266 | def time_small_medium_wo_rep(self): 267 | simfunctions.overlap_coefficient(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep) 268 | 269 | def time_small_medium_wi_rep(self): 270 | simfunctions.overlap_coefficient(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep) 271 | 272 | def time_small_large_wo_rep(self): 273 | simfunctions.overlap_coefficient(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep) 274 | 275 | def time_small_large_wi_rep(self): 276 | simfunctions.overlap_coefficient(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep) 277 | 278 | def time_medium_large_wo_rep(self): 279 | simfunctions.overlap_coefficient(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep) 280 | 281 | def time_medium_large_wi_rep(self): 282 | simfunctions.overlap_coefficient(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep) 283 | 284 | 285 | class TimeMongeElkan: 286 | def time_small_small_wo_rep(self): 287 | simfunctions.monge_elkan(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep) 288 | 289 | def time_small_small_wi_rep(self): 290 |
simfunctions.monge_elkan(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep) 291 | 292 | def time_medium_medium_wo_rep(self): 293 | simfunctions.monge_elkan(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep) 294 | 295 | def time_medium_medium_wi_rep(self): 296 | simfunctions.monge_elkan(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep) 297 | 298 | def time_large_large_wo_rep(self): 299 | simfunctions.monge_elkan(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep) 300 | 301 | def time_large_large_wi_rep(self): 302 | simfunctions.monge_elkan(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep) 303 | 304 | def time_small_medium_wo_rep(self): 305 | simfunctions.monge_elkan(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep) 306 | 307 | def time_small_medium_wi_rep(self): 308 | simfunctions.monge_elkan(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep) 309 | 310 | 311 | class TimeTfIdf: 312 | corpus_list = [_small_num_tokens_wo_rep, _small_num_tokens_wi_rep, _med_num_tokens_wi_rep, _med_num_tokens_wo_rep, 313 | _large_num_tokens_wo_rep, _large_num_tokens_wi_rep] 314 | 315 | def time_small_small_wo_rep_no_corpus_no_dampen(self): 316 | simfunctions.tfidf(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep) 317 | 318 | def time_small_small_wi_rep_no_corpus_no_dampen(self): 319 | simfunctions.tfidf(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep) 320 | 321 | def time_medium_medium_wo_rep_no_corpus_no_dampen(self): 322 | simfunctions.tfidf(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep) 323 | 324 | def time_medium_medium_wi_rep_no_corpus_no_dampen(self): 325 | simfunctions.tfidf(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep) 326 | 327 | def time_large_large_wo_rep_no_corpus_no_dampen(self): 328 | simfunctions.tfidf(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep) 329 | 330 | def time_large_large_wi_rep_no_corpus_no_dampen(self): 331 | simfunctions.tfidf(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep) 332 | 333 | def time_small_medium_wo_rep_no_corpus_no_dampen(self): 334 | simfunctions.tfidf(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep) 335 | 336 | def time_small_medium_wi_rep_no_corpus_no_dampen(self): 337 | simfunctions.tfidf(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep) 338 | 339 | def time_small_large_wo_rep_no_corpus_no_dampen(self): 340 | simfunctions.tfidf(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep) 341 | 342 | def time_small_large_wi_rep_no_corpus_no_dampen(self): 343 | simfunctions.tfidf(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep) 344 | 345 | def time_medium_large_wo_rep_no_corpus_no_dampen(self): 346 | simfunctions.tfidf(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep) 347 | 348 | def time_medium_large_wi_rep_no_corpus_no_dampen(self): 349 | simfunctions.tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep) 350 | 351 | # dampen - true 352 | def time_small_small_wo_rep_no_corpus(self): 353 | simfunctions.tfidf(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep, dampen=True) 354 | 355 | def time_small_small_wi_rep_no_corpus(self): 356 | simfunctions.tfidf(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep, dampen=True) 357 | 358 | def time_medium_medium_wo_rep_no_corpus(self): 359 | simfunctions.tfidf(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep, dampen=True) 360 | 361 | def time_medium_medium_wi_rep_no_corpus(self): 362 | simfunctions.tfidf(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep, dampen=True) 363 | 364 | def time_large_large_wo_rep_no_corpus(self): 365 | simfunctions.tfidf(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep, dampen=True) 366 | 367 | def
time_large_large_wi_rep_no_corpus(self): 368 | simfunctions.tfidf(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep, dampen=True) 369 | 370 | def time_small_medium_wo_rep_no_corpus(self): 371 | simfunctions.tfidf(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep, dampen=True) 372 | 373 | def time_small_medium_wi_rep_no_corpus(self): 374 | simfunctions.tfidf(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep, dampen=True) 375 | 376 | def time_small_large_wo_rep_no_corpus(self): 377 | simfunctions.tfidf(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep, dampen=True) 378 | 379 | def time_small_large_wi_rep_no_corpus(self): 380 | simfunctions.tfidf(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep, dampen=True) 381 | 382 | def time_medium_large_wo_rep_no_corpus(self): 383 | simfunctions.tfidf(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep, dampen=True) 384 | 385 | def time_medium_large_wi_rep_no_corpus(self): 386 | simfunctions.tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep, dampen=True) 387 | 388 | # corpus list - true 389 | def time_small_small_wo_rep_no_dampen(self): 390 | simfunctions.tfidf(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep, corpus_list=self.corpus_list) 391 | 392 | def time_small_small_wi_rep_no_dampen(self): 393 | simfunctions.tfidf(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep, corpus_list=self.corpus_list) 394 | 395 | def time_medium_medium_wo_rep_no_dampen(self): 396 | simfunctions.tfidf(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep, corpus_list=self.corpus_list) 397 | 398 | def time_medium_medium_wi_rep_no_dampen(self): 399 | simfunctions.tfidf(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep, corpus_list=self.corpus_list) 400 | 401 | def time_large_large_wo_rep_no_dampen(self): 402 | simfunctions.tfidf(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list) 403 | 404 | def time_large_large_wi_rep_no_dampen(self): 405 | simfunctions.tfidf(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list) 406 | 407 | def time_small_medium_wo_rep_no_dampen(self): 408 | simfunctions.tfidf(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep, corpus_list=self.corpus_list) 409 | 410 | def time_small_medium_wi_rep_no_dampen(self): 411 | simfunctions.tfidf(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep, corpus_list=self.corpus_list) 412 | 413 | def time_small_large_wo_rep_no_dampen(self): 414 | simfunctions.tfidf(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list) 415 | 416 | def time_small_large_wi_rep_no_dampen(self): 417 | simfunctions.tfidf(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list) 418 | 419 | def time_medium_large_wo_rep_no_dampen(self): 420 | simfunctions.tfidf(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list) 421 | 422 | def time_medium_large_wi_rep_no_dampen(self): 423 | simfunctions.tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list) 424 | 425 | # corpus list - true, dampen - true 426 | def time_small_small_wo_rep(self): 427 | simfunctions.tfidf(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep, corpus_list=self.corpus_list, 428 | dampen=True) 429 | 430 | def time_small_small_wi_rep(self): 431 | simfunctions.tfidf(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep, corpus_list=self.corpus_list, 432 | dampen=True) 433 | 434 | def time_medium_medium_wo_rep(self): 435 | simfunctions.tfidf(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep, corpus_list=self.corpus_list, 436 | dampen=True)
437 | 438 | def time_medium_medium_wi_rep(self): 439 | simfunctions.tfidf(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep, corpus_list=self.corpus_list, 440 | dampen=True) 441 | 442 | def time_large_large_wo_rep(self): 443 | simfunctions.tfidf(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list, 444 | dampen=True) 445 | 446 | def time_large_large_wi_rep(self): 447 | simfunctions.tfidf(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list, 448 | dampen=True) 449 | 450 | def time_small_medium_wo_rep(self): 451 | simfunctions.tfidf(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep, corpus_list=self.corpus_list, dampen=True) 452 | 453 | def time_small_medium_wi_rep(self): 454 | simfunctions.tfidf(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep, corpus_list=self.corpus_list, dampen=True) 455 | 456 | def time_small_large_wo_rep(self): 457 | simfunctions.tfidf(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list, 458 | dampen=True) 459 | 460 | def time_small_large_wi_rep(self): 461 | simfunctions.tfidf(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list, 462 | dampen=True) 463 | 464 | def time_medium_large_wo_rep(self): 465 | simfunctions.tfidf(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list, dampen=True) 466 | 467 | def time_medium_large_wi_rep(self): 468 | simfunctions.tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list, dampen=True) 469 | 470 | 471 | class TimeSoftTfIdf: 472 | corpus_list = [_small_num_tokens_wo_rep, _small_num_tokens_wi_rep, _med_num_tokens_wi_rep, _med_num_tokens_wo_rep, 473 | _large_num_tokens_wo_rep, _large_num_tokens_wi_rep] 474 | 475 | # no corpus list 476 | def time_small_small_wo_rep_no_corpus(self): 477 | simfunctions.soft_tfidf(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep) 478 | 479 | def time_small_small_wi_rep_no_corpus(self): 480 | simfunctions.soft_tfidf(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep) 481 | 482 | def time_medium_medium_wo_rep_no_corpus(self): 483 | simfunctions.soft_tfidf(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep) 484 | 485 | def time_medium_medium_wi_rep_no_corpus(self): 486 | simfunctions.soft_tfidf(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep) 487 | 488 | def time_large_large_wo_rep_no_corpus(self): 489 | simfunctions.soft_tfidf(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep) 490 | 491 | def time_large_large_wi_rep_no_corpus(self): 492 | simfunctions.soft_tfidf(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep) 493 | 494 | def time_small_medium_wo_rep_no_corpus(self): 495 | simfunctions.soft_tfidf(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep) 496 | 497 | def time_small_medium_wi_rep_no_corpus(self): 498 | simfunctions.soft_tfidf(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep) 499 | 500 | def time_small_large_wo_rep_no_corpus(self): 501 | simfunctions.soft_tfidf(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep) 502 | 503 | def time_small_large_wi_rep_no_corpus(self): 504 | simfunctions.soft_tfidf(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep) 505 | 506 | def time_medium_large_wo_rep_no_corpus(self): 507 | simfunctions.soft_tfidf(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep) 508 | 509 | def time_medium_large_wi_rep_no_corpus(self): 510 | simfunctions.soft_tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep) 511 | 512 | # with corpus list 513 | def time_small_small_wo_rep(self): 514 |
simfunctions.soft_tfidf(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep, corpus_list=self.corpus_list) 515 | 516 | def time_small_small_wi_rep(self): 517 | simfunctions.soft_tfidf(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep, corpus_list=self.corpus_list) 518 | 519 | def time_medium_medium_wo_rep(self): 520 | simfunctions.soft_tfidf(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep, corpus_list=self.corpus_list) 521 | 522 | def time_medium_medium_wi_rep(self): 523 | simfunctions.soft_tfidf(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep, corpus_list=self.corpus_list) 524 | 525 | def time_large_large_wo_rep(self): 526 | simfunctions.soft_tfidf(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list) 527 | 528 | def time_large_large_wi_rep(self): 529 | simfunctions.soft_tfidf(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list) 530 | 531 | def time_small_medium_wo_rep(self): 532 | simfunctions.soft_tfidf(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep, corpus_list=self.corpus_list) 533 | 534 | def time_small_medium_wi_rep(self): 535 | simfunctions.soft_tfidf(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep, corpus_list=self.corpus_list) 536 | 537 | def time_small_large_wo_rep(self): 538 | simfunctions.soft_tfidf(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list) 539 | 540 | def time_small_large_wi_rep(self): 541 | simfunctions.soft_tfidf(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list) 542 | 543 | def time_medium_large_wo_rep(self): 544 | simfunctions.soft_tfidf(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list) 545 | 546 | def time_medium_large_wi_rep(self): 547 | simfunctions.soft_tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list) 548 | -------------------------------------------------------------------------------- /continuous-integration/appveyor/install.ps1: -------------------------------------------------------------------------------- 1 | # Sample script to install Miniconda under Windows 2 | # Authors: Olivier Grisel, Jonathan Helmus and Kyle Kastner, Robert McGibbon 3 | # License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 4 | 5 | $MINICONDA_URL = "http://repo.continuum.io/miniconda/" 6 | 7 | 8 | function DownloadMiniconda ($python_version, $platform_suffix) { 9 | $webclient = New-Object System.Net.WebClient 10 | if ($python_version -match "3.4") { 11 | $filename = "Miniconda3-latest-Windows-" + $platform_suffix + ".exe" 12 | } else { 13 | $filename = "Miniconda-latest-Windows-" + $platform_suffix + ".exe" 14 | } 15 | $url = $MINICONDA_URL + $filename 16 | 17 | $basedir = $pwd.Path + "\" 18 | $filepath = $basedir + $filename 19 | if (Test-Path $filename) { 20 | Write-Host "Reusing" $filepath 21 | return $filepath 22 | } 23 | 24 | # Download and retry up to 3 times in case of network transient errors.
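# (Design note: transient failures are swallowed inside the loop below, with a
# one-second pause between attempts; only if the file still does not exist
# afterwards is the download re-issued outside try/catch so the real error propagates.)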
25 | Write-Host "Downloading" $filename "from" $url 26 | $retry_attempts = 2 27 | for($i=0; $i -lt $retry_attempts; $i++){ 28 | try { 29 | $webclient.DownloadFile($url, $filepath) 30 | break 31 | } 32 | Catch [Exception]{ 33 | Start-Sleep 1 34 | } 35 | } 36 | if (Test-Path $filepath) { 37 | Write-Host "File saved at" $filepath 38 | } else { 39 | # Retry once to get the error message if any at the last try 40 | $webclient.DownloadFile($url, $filepath) 41 | } 42 | return $filepath 43 | } 44 | 45 | 46 | function InstallMiniconda ($python_version, $architecture, $python_home) { 47 | Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home 48 | if (Test-Path $python_home) { 49 | Write-Host $python_home "already exists, skipping." 50 | return $false 51 | } 52 | if ($architecture -match "32") { 53 | $platform_suffix = "x86" 54 | } else { 55 | $platform_suffix = "x86_64" 56 | } 57 | 58 | $filepath = DownloadMiniconda $python_version $platform_suffix 59 | Write-Host "Installing" $filepath "to" $python_home 60 | $install_log = $python_home + ".log" 61 | $args = "/S /D=$python_home" 62 | Write-Host $filepath $args 63 | Start-Process -FilePath $filepath -ArgumentList $args -Wait -Passthru 64 | if (Test-Path $python_home) { 65 | Write-Host "Python $python_version ($architecture) installation complete" 66 | } else { 67 | Write-Host "Failed to install Python in $python_home" 68 | Get-Content -Path $install_log 69 | Exit 1 70 | } 71 | } 72 | 73 | 74 | function InstallCondaPackages ($python_home, $spec) { 75 | $conda_path = $python_home + "\Scripts\conda.exe" 76 | $args = "install --yes " + $spec 77 | Write-Host ("conda " + $args) 78 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 79 | } 80 | 81 | function UpdateConda ($python_home) { 82 | $conda_path = $python_home + "\Scripts\conda.exe" 83 | Write-Host "Updating conda..." 
84 | $args = "update --yes conda" 85 | Write-Host $conda_path $args 86 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 87 | } 88 | 89 | 90 | function main () { 91 | InstallMiniconda $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON 92 | UpdateConda $env:PYTHON 93 | InstallCondaPackages $env:PYTHON "conda-build jinja2 anaconda-client" 94 | } 95 | 96 | main 97 | 98 | -------------------------------------------------------------------------------- /continuous-integration/appveyor/rm_rf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import sys 4 | import stat 5 | import shutil 6 | 7 | def remove_readonly(func, path, excinfo): 8 | os.chmod(path, stat.S_IWRITE) 9 | func(path) 10 | 11 | def main(): 12 | print(sys.executable) 13 | try: 14 | shutil.rmtree(sys.argv[1], onerror=remove_readonly) 15 | except Exception as e: 16 | print("Error") 17 | print(e) 18 | 19 | if __name__ == '__main__': 20 | main() 21 | 22 | -------------------------------------------------------------------------------- /continuous-integration/appveyor/run_with_env.cmd: -------------------------------------------------------------------------------- 1 | :: To build extensions for 64 bit Python 3, we need to configure environment 2 | :: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: 3 | :: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1) 4 | :: 5 | :: To build extensions for 64 bit Python 2, we need to configure environment 6 | :: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of: 7 | :: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0) 8 | :: 9 | :: 32 bit builds do not require specific environment configurations. 10 | :: 11 | :: Note: this script needs to be run with the /E:ON and /V:ON flags for the 12 | :: cmd interpreter, at least for (SDK v7.0) 13 | :: 14 | :: More details at: 15 | :: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows 16 | :: http://stackoverflow.com/a/13751649/163740 17 | :: 18 | :: Author: Olivier Grisel 19 | :: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 20 | @ECHO OFF 21 | 22 | SET COMMAND_TO_RUN=%* 23 | SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows 24 | 25 | SET MAJOR_PYTHON_VERSION="%PYTHON_VERSION:~0,1%" 26 | IF %MAJOR_PYTHON_VERSION% == "2" ( 27 | SET WINDOWS_SDK_VERSION="v7.0" 28 | ) ELSE IF %MAJOR_PYTHON_VERSION% == "3" ( 29 | SET WINDOWS_SDK_VERSION="v7.1" 30 | ) ELSE ( 31 | ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" 32 | EXIT 1 33 | ) 34 | 35 | IF "%PYTHON_ARCH%"=="64" ( 36 | ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture 37 | SET DISTUTILS_USE_SDK=1 38 | SET MSSdk=1 39 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% 40 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release 41 | ECHO Executing: %COMMAND_TO_RUN% 42 | call %COMMAND_TO_RUN% || EXIT 1 43 | ) ELSE ( 44 | ECHO Using default MSVC build environment for 32 bit architecture 45 | ECHO Executing: %COMMAND_TO_RUN% 46 | call %COMMAND_TO_RUN% || EXIT 1 47 | ) 48 | 49 | -------------------------------------------------------------------------------- /docs/API.rst: -------------------------------------------------------------------------------- 1 | === 2 | API 3 | === 4 | .. 
toctree:: 5 | :maxdepth: 2 6 | 7 | Tokenizers 8 | Simfunctions 9 | -------------------------------------------------------------------------------- /docs/Installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | This page describes the requirements and dependencies, and gives step-by-step instructions 5 | to install the py_stringmatching package. 6 | 7 | Requirements 8 | ------------ 9 | * Python 2.7 or Python 3.3+ 10 | 11 | Dependencies 12 | ------------ 13 | * numpy>=1.7.0 14 | * six 15 | * python-Levenshtein >= 0.12.0 16 | 17 | 18 | .. note:: 19 | The user need not install these dependency packages before installing the py_stringmatching package. 20 | The py_stringmatching installer will automatically install the required packages. 21 | 22 | 23 | Step-by-Step Installation Instructions 24 | --------------------------------------- 25 | Step 1: Download the py_stringmatching package from `here 26 | `_ 27 | into your home directory. 28 | 29 | You can download it into any directory within your home directory. For now, we assume that you use a 30 | Linux operating system and will download it into "HOME/", the top level. 31 | 32 | Also, we assume that you have sufficient privileges to install a Python package. 33 | 34 | Step 2: Unzip the package by executing the following command:: 35 | 36 | tar -xzvf py_stringmatching.tar.gz 37 | 38 | py_stringmatching will be unpacked into the directory "HOME/py_stringmatching-0.1". 39 | 40 | 41 | Step 3: At the command prompt execute the following commands:: 42 | 43 | cd HOME/py_stringmatching-0.1 44 | python setup.py install 45 | 46 | This will install the py_stringmatching package. 47 | 48 | .. note:: 49 | 50 | If the package installation requires root permission, you can install it in 51 | your home directory like this:: 52 | 53 | python setup.py install --user 54 | 55 | For more information, see this Stack Overflow `link 56 | `_. 57 | 58 | Supported Platforms 59 | ------------------- 60 | The package is tested primarily on OS X and Linux, but given its minimal dependencies it should also work on Windows. 61 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
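# Typical usage: `make html` builds the HTML docs into $(BUILDDIR)/html (see the help target below).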
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/py_stringmatching.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/py_stringmatching.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 
101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/py_stringmatching" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/py_stringmatching" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. 
The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 193 | -------------------------------------------------------------------------------- /docs/Simfunctions.rst: -------------------------------------------------------------------------------- 1 | Similarity Functions 2 | ==================== 3 | 4 | .. automodule:: py_stringmatching.simfunctions 5 | 6 | 7 | .. autofunction:: levenshtein(string1, string2) 8 | .. autofunction:: hamming_distance(string1, string2) 9 | .. autofunction:: jaro(string1, string2) 10 | .. autofunction:: jaro_winkler(string1, string2, prefix_weight=0.1) 11 | .. autofunction:: needleman_wunsch(string1, string2, gap_cost=1, sim_score=sim_ident) 12 | .. autofunction:: smith_waterman(string1, string2, gap_cost=1, sim_score=sim_ident) 13 | .. autofunction:: affine(string1, string2, gap_start=1, gap_continuation=0.5, sim_score=sim_ident) 14 | .. autofunction:: jaccard(set1, set2) 15 | .. autofunction:: overlap_coefficient(set1, set2) 16 | .. autofunction:: cosine(set1, set2) 17 | .. autofunction:: monge_elkan(bag1, bag2, sim_func=levenshtein) 18 | .. autofunction:: tfidf(bag1, bag2, corpus_list=None, dampen=False) 19 | .. autofunction:: soft_tfidf(bag1, bag2, corpus_list=None, sim_func=jaro, threshold=0.5) -------------------------------------------------------------------------------- /docs/Tokenizers.rst: -------------------------------------------------------------------------------- 1 | Tokenizers 2 | ========== 3 | 4 | .. automodule:: py_stringmatching.tokenizers 5 | :show-inheritance: 6 | 7 | .. autofunction:: delimiter(input_string, delim_str=' ') 8 | .. autofunction:: whitespace(input_string) 9 | .. autofunction:: qgram(input_string, qval=2) 10 | 11 | 12 | -------------------------------------------------------------------------------- /docs/Tutorial.rst: -------------------------------------------------------------------------------- 1 | Tutorial 2 | ======== 3 | Once the package is installed, the user can import the similarity functions and tokenizers 4 | like this:: 5 | 6 | from py_stringmatching import simfunctions, tokenizers 7 | 8 | The tokenizers and similarity functions can then be used together like this:: 9 | 10 | x = 'this is a string matching package for data science class' 11 | y = 'this string matching package can be used to generate features' 12 | f = simfunctions.cosine(tokenizers.whitespace(x), tokenizers.whitespace(y)) 13 | 14 | 15 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # py_stringmatching documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Feb 1 13:42:26 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | 17 | # add path - warning: needs to be updated based on package path 18 | # sys.path.append('/scratch/pradap/python-work/py_stringmatching') 19 | sys.path.append('/Users/pradap/Documents/Research/Python-Package/py_stringmatching') 20 | 21 | # If extensions (or modules to document with autodoc) are in another directory, 22 | # add these directories to sys.path here.
If the directory is relative to the 23 | # documentation root, use os.path.abspath to make it absolute, like shown here. 24 | # sys.path.insert(0, os.path.abspath('.')) 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon', 35 | 'sphinx.ext.doctest', 36 | 'sphinx.ext.intersphinx', 37 | 'sphinx.ext.mathjax', 38 | 'sphinx.ext.ifconfig', 39 | 'sphinx.ext.viewcode', 40 | ] 41 | 42 | # Napoleon settings 43 | napoleon_google_docstring = True 44 | napoleon_numpy_docstring = True 45 | napoleon_include_private_with_doc = False 46 | napoleon_include_special_with_doc = True 47 | napoleon_use_admonition_for_examples = False 48 | napoleon_use_admonition_for_notes = False 49 | napoleon_use_admonition_for_references = False 50 | napoleon_use_ivar = False 51 | napoleon_use_param = True 52 | napoleon_use_rtype = True 53 | 54 | # Add any paths that contain templates here, relative to this directory. 55 | templates_path = ['_templates'] 56 | 57 | # The suffix(es) of source filenames. 58 | # You can specify multiple suffix as a list of string: 59 | # source_suffix = ['.rst', '.md'] 60 | source_suffix = '.rst' 61 | 62 | # The encoding of source files. 63 | # source_encoding = 'utf-8-sig' 64 | 65 | # The master toctree document. 66 | master_doc = 'index' 67 | 68 | # General information about the project. 69 | project = u'py_stringmatching' 70 | copyright = u'2016, Magellan Team' 71 | author = u'Magellan Team' 72 | 73 | # The version info for the project you're documenting, acts as replacement for 74 | # |version| and |release|, also used in various other places throughout the 75 | # built documents. 76 | # 77 | # The short X.Y version. 78 | version = '0.1' 79 | # The full version, including alpha/beta/rc tags. 80 | release = '0.1' 81 | 82 | # The language for content autogenerated by Sphinx. Refer to documentation 83 | # for a list of supported languages. 84 | # 85 | # This is also used if you do content translation via gettext catalogs. 86 | # Usually you set "language" from the command line for these cases. 87 | language = None 88 | 89 | # There are two options for replacing |today|: either, you set today to some 90 | # non-false value, then it is used: 91 | # today = '' 92 | # Else, today_fmt is used as the format for a strftime call. 93 | # today_fmt = '%B %d, %Y' 94 | 95 | # List of patterns, relative to source directory, that match files and 96 | # directories to ignore when looking for source files. 97 | exclude_patterns = ['_build'] 98 | 99 | # The reST default role (used for this markup: `text`) to use for all 100 | # documents. 101 | # default_role = None 102 | 103 | # If true, '()' will be appended to :func: etc. cross-reference text. 104 | # add_function_parentheses = True 105 | 106 | # If true, the current module name will be prepended to all description 107 | # unit titles (such as .. function::). 108 | add_module_names = True 109 | 110 | # If true, sectionauthor and moduleauthor directives will be shown in the 111 | # output. They are ignored by default. 112 | # show_authors = False 113 | 114 | # The name of the Pygments (syntax highlighting) style to use. 
115 | pygments_style = 'sphinx' 116 | 117 | # A list of ignored prefixes for module index sorting. 118 | # modindex_common_prefix = [] 119 | 120 | # If true, keep warnings as "system message" paragraphs in the built documents. 121 | # keep_warnings = False 122 | 123 | # If true, `todo` and `todoList` produce output, else they produce nothing. 124 | todo_include_todos = False 125 | 126 | # -- Options for HTML output ---------------------------------------------- 127 | 128 | # The theme to use for HTML and HTML Help pages. See the documentation for 129 | # a list of builtin themes. 130 | html_theme = 'sphinx_rtd_theme' 131 | 132 | # Theme options are theme-specific and customize the look and feel of a theme 133 | # further. For a list of options available for each theme, see the 134 | # documentation. 135 | # html_theme_options = {} 136 | 137 | # Add any paths that contain custom themes here, relative to this directory. 138 | # html_theme_path = sphinx_bootstrap_theme.get_html_theme_path() 139 | 140 | # The name for this set of Sphinx documents. If None, it defaults to 141 | # " v documentation". 142 | # html_title = None 143 | 144 | # A shorter title for the navigation bar. Default is the same as html_title. 145 | # html_short_title = None 146 | 147 | # The name of an image file (relative to this directory) to place at the top 148 | # of the sidebar. 149 | # html_logo = None 150 | 151 | # The name of an image file (within the static path) to use as favicon of the 152 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 153 | # pixels large. 154 | # html_favicon = None 155 | 156 | # Add any paths that contain custom static files (such as style sheets) here, 157 | # relative to this directory. They are copied after the builtin static files, 158 | # so a file named "default.css" will overwrite the builtin "default.css". 159 | html_static_path = ['_static'] 160 | 161 | # Add any extra paths that contain custom files (such as robots.txt or 162 | # .htaccess) here, relative to this directory. These files are copied 163 | # directly to the root of the documentation. 164 | # html_extra_path = [] 165 | 166 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 167 | # using the given strftime format. 168 | # html_last_updated_fmt = '%b %d, %Y' 169 | 170 | # If true, SmartyPants will be used to convert quotes and dashes to 171 | # typographically correct entities. 172 | # html_use_smartypants = True 173 | 174 | # Custom sidebar templates, maps document names to template names. 175 | # html_sidebars = {} 176 | 177 | # Additional templates that should be rendered to pages, maps page names to 178 | # template names. 179 | # html_additional_pages = {} 180 | 181 | # If false, no module index is generated. 182 | # html_domain_indices = True 183 | 184 | # If false, no index is generated. 185 | # html_use_index = True 186 | 187 | # If true, the index is split into individual pages for each letter. 188 | # html_split_index = False 189 | 190 | # If true, links to the reST sources are added to the pages. 191 | # html_show_sourcelink = True 192 | 193 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 194 | # html_show_sphinx = True 195 | 196 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 197 | # html_show_copyright = True 198 | 199 | # If true, an OpenSearch description file will be output, and all pages will 200 | # contain a tag referring to it. 
The value of this option must be the 201 | # base URL from which the finished HTML is served. 202 | # html_use_opensearch = '' 203 | 204 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 205 | # html_file_suffix = None 206 | 207 | # Language to be used for generating the HTML full-text search index. 208 | # Sphinx supports the following languages: 209 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 210 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 211 | # html_search_language = 'en' 212 | 213 | # A dictionary with options for the search language support, empty by default. 214 | # Now only 'ja' uses this config value 215 | # html_search_options = {'type': 'default'} 216 | 217 | # The name of a javascript file (relative to the configuration directory) that 218 | # implements a search results scorer. If empty, the default will be used. 219 | # html_search_scorer = 'scorer.js' 220 | 221 | # Output file base name for HTML help builder. 222 | htmlhelp_basename = 'py_stringmatchingdoc' 223 | 224 | # -- Options for LaTeX output --------------------------------------------- 225 | 226 | latex_elements = { 227 | # The paper size ('letterpaper' or 'a4paper'). 228 | # 'papersize': 'letterpaper', 229 | 230 | # The font size ('10pt', '11pt' or '12pt'). 231 | # 'pointsize': '10pt', 232 | 233 | # Additional stuff for the LaTeX preamble. 234 | # 'preamble': '', 235 | 236 | # Latex figure (float) alignment 237 | # 'figure_align': 'htbp', 238 | } 239 | 240 | # Grouping the document tree into LaTeX files. List of tuples 241 | # (source start file, target name, title, 242 | # author, documentclass [howto, manual, or own class]). 243 | latex_documents = [ 244 | (master_doc, 'py_stringmatching.tex', u'py\\_stringmatching Documentation', 245 | u'Magellan Team', 'manual'), 246 | ] 247 | 248 | # The name of an image file (relative to this directory) to place at the top of 249 | # the title page. 250 | # latex_logo = None 251 | 252 | # For "manual" documents, if this is true, then toplevel headings are parts, 253 | # not chapters. 254 | # latex_use_parts = False 255 | 256 | # If true, show page references after internal links. 257 | # latex_show_pagerefs = False 258 | 259 | # If true, show URL addresses after external links. 260 | # latex_show_urls = False 261 | 262 | # Documents to append as an appendix to all manuals. 263 | # latex_appendices = [] 264 | 265 | # If false, no module index is generated. 266 | # latex_domain_indices = True 267 | 268 | 269 | # -- Options for manual page output --------------------------------------- 270 | 271 | # One entry per manual page. List of tuples 272 | # (source start file, name, description, authors, manual section). 273 | man_pages = [ 274 | (master_doc, 'py_stringmatching', u'py_stringmatching Documentation', 275 | [author], 1) 276 | ] 277 | 278 | # If true, show URL addresses after external links. 279 | # man_show_urls = False 280 | 281 | 282 | # -- Options for Texinfo output ------------------------------------------- 283 | 284 | # Grouping the document tree into Texinfo files. List of tuples 285 | # (source start file, target name, title, author, 286 | # dir menu entry, description, category) 287 | texinfo_documents = [ 288 | (master_doc, 'py_stringmatching', u'py_stringmatching Documentation', 289 | author, 'py_stringmatching', 'One line description of project.', 290 | 'Miscellaneous'), 291 | ] 292 | 293 | # Documents to append as an appendix to all manuals. 294 | # texinfo_appendices = [] 295 | 296 | # If false, no module index is generated. 
297 | # texinfo_domain_indices = True 298 | 299 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 300 | # texinfo_show_urls = 'footnote' 301 | 302 | # If true, do not generate a @detailmenu in the "Top" node's menu. 303 | # texinfo_no_detailmenu = False 304 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to py_stringmatching's documentation! 2 | ============================================= 3 | 4 | Contents: 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | Installation 10 | Tutorial 11 | API 12 | 13 | Indices and tables 14 | ================== 15 | 16 | * :ref:`genindex` 17 | * :ref:`modindex` 18 | * :ref:`search` 19 | 20 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 2> nul 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 
67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\py_stringmatching.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\py_stringmatching.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 
185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 
260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /py_stringmatching/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = 0.1 2 | -------------------------------------------------------------------------------- /py_stringmatching/compat.py: -------------------------------------------------------------------------------- 1 | """py-stringmatching.compat.py 2 | The compat module defines some variables to enable Python 2 and Python 3 3 | compatibility within a single codebase 4 | The following are defined: 5 | - _range -- use in place of xrange/range 6 | - _unicode -- use in place of unicode/str 7 | - _unichr -- use in place of unichr/chr 8 | - _long -- use in place of long/int 9 | And: 10 | - numeric_type -- defines the set of numeric types 11 | """ 12 | 13 | import sys 14 | 15 | # pylint: disable=invalid-name 16 | if sys.version_info[0] == 3: # pragma: no cover 17 | _range = range 18 | _unicode = str 19 | _unichr = chr 20 | _long = int 21 | numeric_type = (int, float, complex) 22 | else: # pragma: no cover 23 | _range = xrange 24 | _unicode = unicode 25 | _unichr = unichr 26 | _long = long 27 | numeric_type = (int, long, float, complex) 28 | -------------------------------------------------------------------------------- /py_stringmatching/simfunctions.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | 5 | import collections 6 | import math 7 | 8 | import Levenshtein 9 | import numpy as np 10 | 11 | from py_stringmatching import utils 12 | # noinspection PyProtectedMember,PyProtectedMember 13 | from .compat import _range 14 | 15 | 16 | def sim_ident(s1, s2): 17 | return int(s1 == s2) 18 | 19 | 20 | # ---------------------- sequence based similarity measures ---------------------- 21 | 22 | 23 | def affine(string1, string2, gap_start=1, gap_continuation=0.5, sim_score=sim_ident): 24 | """ 25 | Computes the Affine gap score between two strings. 26 | 27 | The Affine gap measure is an extension of the Needleman-Wunsch measure that handles the longer gaps more 28 | gracefully. 29 | 30 | For more information refer to string matching chapter in the DI book. 31 | 32 | Args: 33 | string1,string2 (str) : Input strings 34 | 35 | gap_start (float): Cost for the gap at the start (defaults to 1) 36 | 37 | gap_continuation (float) : Cost for the gap continuation (defaults to 0.5) 38 | 39 | sim_score (function) : Function computing similarity score between two chars, represented as strings 40 | (defaults to identity). 41 | 42 | Returns: 43 | Affine gap score (float) 44 | 45 | Raises: 46 | TypeError : If the inputs are not strings or if one of the inputs is None. 
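Note:
    In outline, the implementation below uses the standard three-matrix affine-gap recurrence, where M scores alignments ending in a match or mismatch and X, Y score alignments ending in a gap:

    :math:`M[i][j] = sim\\_score(x_i, y_j) + \\max(M[i-1][j-1], X[i-1][j-1], Y[i-1][j-1])`

    :math:`X[i][j] = \\max(M[i-1][j] - gap\\_start, X[i-1][j] - gap\\_continuation)`

    :math:`Y[i][j] = \\max(M[i][j-1] - gap\\_start, Y[i][j-1] - gap\\_continuation)`

    The returned score is the maximum of the three matrices at position (len(string1), len(string2)).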
47 | 48 | Examples: 49 | >>> affine('dva', 'deeva') 50 | 1.5 51 | >>> affine('dva', 'deeve', gap_start=2, gap_continuation=0.5) 52 | -0.5 53 | >>> affine('AAAGAATTCA', 'AAATCA', gap_continuation=0.2, sim_score=lambda s1, s2: (int(1 if s1 == s2 else 0))) 54 | 4.4 55 | """ 56 | # input validations 57 | utils.sim_check_for_none(string1, string2) 58 | utils.tok_check_for_string_input(string1, string2) 59 | # if one of the strings is empty return 0 60 | if utils.sim_check_for_empty(string1, string2): 61 | return 0 62 | 63 | gap_start = -gap_start 64 | gap_continuation = -gap_continuation 65 | m = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) 66 | x = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) 67 | y = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) 68 | # DP initialization 69 | for i in _range(1, len(string1) + 1): 70 | m[i][0] = -float("inf") 71 | x[i][0] = gap_start + (i - 1) * gap_continuation 72 | y[i][0] = -float("inf") 73 | # DP initialization 74 | for j in _range(1, len(string2) + 1): 75 | m[0][j] = -float("inf") 76 | x[0][j] = -float("inf") 77 | y[0][j] = gap_start + (j - 1) * gap_continuation 78 | # affine gap calculation using DP 79 | for i in _range(1, len(string1) + 1): 80 | for j in _range(1, len(string2) + 1): 81 | # best score between x_1....x_i and y_1....y_j given that x_i is aligned to y_j 82 | m[i][j] = sim_score(string1[i - 1], string2[j - 1]) + max(m[i - 1][j - 1], x[i - 1][j - 1], y[i - 1][j - 1]) 83 | # the best score given that x_i is aligned to a gap 84 | x[i][j] = max(gap_start + m[i - 1][j], gap_continuation + x[i - 1][j]) 85 | # the best score given that y_j is aligned to a gap 86 | y[i][j] = max(gap_start + m[i][j - 1], gap_continuation + y[i][j - 1]) 87 | return max(m[len(string1)][len(string2)], x[len(string1)][len(string2)], y[len(string1)][len(string2)]) 88 | 89 | 90 | # jaro 91 | # noinspection PyUnboundLocalVariable,PyUnboundLocalVariable,PyUnboundLocalVariable,PyUnboundLocalVariable 92 | def jaro(string1, string2): 93 | """ 94 | Computes the Jaro measure between two strings. 95 | 96 | The Jaro measure is a type of edit distance. It was developed mainly to compare short strings, 97 | such as first and last names. 98 | 99 | 100 | Args: 101 | string1,string2 (str): Input strings 102 | 103 | Returns: 104 | Jaro measure (float) 105 | 106 | 107 | Raises: 108 | TypeError : If the inputs are not strings or if one of the inputs is None.
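Note:
    In outline, with :math:`m` matching characters (matched within a window of :math:`\\lfloor \\max(|s_1|, |s_2|)/2 \\rfloor - 1` positions) and :math:`t` transpositions (half the number of matched characters that appear in a different order), the implementation below computes

    :math:`jaro(s_1, s_2) = \\frac{1}{3} \\left( \\frac{m}{|s_1|} + \\frac{m}{|s_2|} + \\frac{m - t}{m} \\right)`

    and returns 0 when there are no matching characters.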
109 | 110 | 111 | Examples: 112 | >>> jaro('MARTHA', 'MARHTA') 113 | 0.9444444444444445 114 | >>> jaro('DWAYNE', 'DUANE') 115 | 0.8222222222222223 116 | >>> jaro('DIXON', 'DICKSONX') 117 | 0.7666666666666666 118 | 119 | 120 | """ 121 | # input validations 122 | utils.sim_check_for_none(string1, string2) 123 | utils.tok_check_for_string_input(string1, string2) 124 | # if one of the strings is empty return 0 125 | if utils.sim_check_for_empty(string1, string2): 126 | return 0 127 | 128 | len_s1 = len(string1) 129 | len_s2 = len(string2) 130 | 131 | max_len = max(len_s1, len_s2) 132 | search_range = (max_len // 2) - 1 133 | if search_range < 0: 134 | search_range = 0 135 | 136 | flags_s1 = [False] * len_s1 137 | flags_s2 = [False] * len_s2 138 | 139 | common_chars = 0 140 | for i, ch_s1 in enumerate(string1): 141 | low = i - search_range if i > search_range else 0 142 | hi = i + search_range if i + search_range < len_s2 else len_s2 - 1 143 | for j in _range(low, hi + 1): 144 | if not flags_s2[j] and string2[j] == ch_s1: 145 | flags_s1[i] = flags_s2[j] = True 146 | common_chars += 1 147 | break 148 | if not common_chars: 149 | return 0 150 | k = trans_count = 0 151 | for i, f_s1 in enumerate(flags_s1): 152 | if f_s1: 153 | for j in _range(k, len_s2): 154 | if flags_s2[j]: 155 | k = j + 1 156 | break 157 | if string1[i] != string2[j]: 158 | trans_count += 1 159 | trans_count /= 2 160 | common_chars = float(common_chars) 161 | weight = ((common_chars / len_s1 + common_chars / len_s2 + 162 | (common_chars - trans_count) / common_chars)) / 3 163 | return weight 164 | 165 | 166 | # jaro-winkler 167 | def jaro_winkler(string1, string2, prefix_weight=0.1): 168 | """ 169 | Computes the Jaro-Winkler measure between two strings. 170 | 171 | The Jaro-Winkler measure is designed to capture cases where two strings have a low Jaro score, but share a prefix 172 | and thus are likely to match. 173 | 174 | 175 | Args: 176 | string1,string2 (str): Input strings 177 | 178 | prefix_weight (float): Weight to give the prefix (defaults to 0.1) 179 | 180 | Returns: 181 | Jaro-Winkler measure (float) 182 | 183 | Raises: 184 | TypeError : If the inputs are not strings or if one of the inputs is None. 185 | 186 | 187 | Examples: 188 | >>> jaro_winkler('MARTHA', 'MARHTA') 189 | 0.9611111111111111 190 | >>> jaro_winkler('DWAYNE', 'DUANE') 191 | 0.84 192 | >>> jaro_winkler('DIXON', 'DICKSONX') 193 | 0.8133333333333332 194 | 195 | """ 196 | # input validations 197 | utils.sim_check_for_none(string1, string2) 198 | utils.tok_check_for_string_input(string1, string2) 199 | # if one of the strings is empty return 0 200 | if utils.sim_check_for_empty(string1, string2): 201 | return 0 202 | 203 | jw_score = jaro(string1, string2) 204 | min_len = min(len(string1), len(string2)) 205 | # prefix length can be at max 4 206 | j = min(min_len, 4) 207 | i = 0 208 | while i < j and string1[i] == string2[i] and string1[i]: 209 | i += 1 210 | if i: 211 | jw_score += i * prefix_weight * (1 - jw_score) 212 | return jw_score 213 | 214 | 215 | def hamming_distance(string1, string2): 216 | """ 217 | Computes the Hamming distance between two strings. 218 | 219 | The Hamming distance between two strings of equal length is the number of positions at which the corresponding 220 | symbols are different. In another way, it measures the minimum number of substitutions required to change 221 | one string into the other, or the minimum number of errors that could have transformed one string into the other. 
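For example, 'karolin' and 'kathrin' differ in the third, fourth and fifth positions, so their Hamming distance is 3.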
222 | 223 | 224 | Args: 225 | string1,string2 (str): Input strings 226 | 227 | Returns: 228 | Hamming distance (int) 229 | 230 | Raises: 231 | TypeError : If the inputs are not strings or if one of the inputs is None. 232 | ValueError : If the input strings are not of the same length 233 | 234 | 235 | Examples: 236 | >>> hamming_distance('', '') 237 | 0 238 | >>> hamming_distance('alex', 'john') 239 | 4 240 | >>> hamming_distance(' ', 'a') 241 | 1 242 | >>> hamming_distance('JOHN', 'john') 243 | 4 244 | """ 245 | # input validations 246 | utils.sim_check_for_none(string1, string2) 247 | utils.tok_check_for_string_input(string1, string2) 248 | # for Hamming Distance string length should be same 249 | utils.sim_check_for_same_len(string1, string2) 250 | # sum up the mismatched characters at the corresponding indexes of 251 | # the input strings 252 | return sum(bool(ord(c1) - ord(c2)) for c1, c2 in zip(string1, string2)) 253 | 254 | 255 | def levenshtein(string1, string2): 256 | """ 257 | Computes the Levenshtein distance between two strings. 258 | 259 | Levenshtein distance computes the minimum cost of transforming one string into the other. Transforming a string 260 | is carried out using a sequence of the following operators: delete a character, insert a character, and 261 | substitute one character for another. 262 | 263 | Args: 264 | string1,string2 (str): Input strings 265 | 266 | Returns: 267 | Levenshtein distance (int) 268 | 269 | Raises: 270 | TypeError : If the inputs are not strings or if one of the inputs is None. 271 | 272 | Examples: 273 | >>> levenshtein('a', '') 274 | 1 275 | >>> levenshtein('example', 'samples') 276 | 3 277 | >>> levenshtein('levenshtein', 'frankenstein') 278 | 6 279 | 280 | 281 | Note: 282 | This implementation internally uses the python-Levenshtein package to compute the Levenshtein distance 283 | 284 | """ 285 | # input validations 286 | utils.sim_check_for_none(string1, string2) 287 | utils.sim_check_for_string_inputs(string1, string2) 288 | # using Levenshtein library 289 | return Levenshtein.distance(string1, string2) 290 | 291 | 292 | def needleman_wunsch(string1, string2, gap_cost=1.0, sim_score=sim_ident): 293 | """ 294 | Computes the Needleman-Wunsch measure between two strings. 295 | 296 | The Needleman-Wunsch measure generalizes the Levenshtein distance and considers global alignment between two strings. 297 | Specifically, it is computed by assigning a score to each alignment between the two input strings and choosing the 298 | score of the best alignment, that is, the maximal score. 299 | 300 | An alignment between two strings is a set of correspondences between their characters, allowing for 301 | gaps. 302 | 303 | Args: 304 | string1,string2 (str) : Input strings 305 | 306 | gap_cost (float) : Cost of gap (defaults to 1.0) 307 | 308 | sim_score (function) : Similarity function to give a score for the correspondence between characters. Defaults 309 | to an identity function, where if two characters are the same it returns 1.0 else returns 0. 310 | 311 | 312 | Returns: 313 | Needleman-Wunsch measure (float) 314 | 315 | 316 | Raises: 317 | TypeError : If the inputs are not strings or if one of the inputs is None.
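Note:
    In outline, the implementation below fills a dynamic-programming matrix with :math:`D[i][0] = -i \\cdot gap\\_cost` and :math:`D[0][j] = -j \\cdot gap\\_cost`, applies the recurrence

    :math:`D[i][j] = \\max(D[i-1][j-1] + sim\\_score(x_i, y_j), D[i-1][j] - gap\\_cost, D[i][j-1] - gap\\_cost)`

    and returns the bottom-right entry :math:`D[|string1|][|string2|]`.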
318 | 319 | Examples: 320 | >>> needleman_wunsch('dva', 'deeva') 321 | 1.0 322 | >>> needleman_wunsch('dva', 'deeve', 0.0) 323 | 2.0 324 | >>> needleman_wunsch('dva', 'deeve', 1.0, sim_score=lambda s1, s2 : (2.0 if s1 == s2 else -1.0)) 325 | 1.0 326 | >>> needleman_wunsch('GCATGCUA', 'GATTACA', gap_cost=0.5, sim_score=lambda s1, s2 : (1.0 if s1 == s2 else -1.0)) 327 | 2.5 328 | """ 329 | # input validations 330 | utils.sim_check_for_none(string1, string2) 331 | utils.sim_check_for_string_inputs(string1, string2) 332 | 333 | dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) 334 | # DP initialization 335 | for i in _range(len(string1) + 1): 336 | dist_mat[i, 0] = -(i * gap_cost) 337 | # DP initialization 338 | for j in _range(len(string2) + 1): 339 | dist_mat[0, j] = -(j * gap_cost) 340 | # Needleman-Wunsch DP calculation 341 | for i in _range(1, len(string1) + 1): 342 | for j in _range(1, len(string2) + 1): 343 | match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1], string2[j - 1]) 344 | delete = dist_mat[i - 1, j] - gap_cost 345 | insert = dist_mat[i, j - 1] - gap_cost 346 | dist_mat[i, j] = max(match, delete, insert) 347 | return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1] 348 | 349 | 350 | def smith_waterman(string1, string2, gap_cost=1.0, sim_score=sim_ident): 351 | """ 352 | Computes the Smith-Waterman measure between two strings. 353 | 354 | The Smith–Waterman algorithm performs local sequence alignment; that is, for determining similar regions 355 | between two strings. Instead of looking at the total sequence, the Smith–Waterman algorithm compares segments of 356 | all possible lengths and optimizes the similarity measure. 357 | 358 | 359 | Args: 360 | string1,string2 (str) : Input strings 361 | 362 | gap_cost (float) : Cost of gap (defaults to 1.0) 363 | 364 | sim_score (function) : Similarity function to give a score for the correspondence between characters. Defaults 365 | to an identity function, where if two characters are same it returns 1 else returns 0. 366 | 367 | Returns: 368 | Smith-Waterman measure (float) 369 | 370 | Raises: 371 | TypeError : If the inputs are not strings or if one of the inputs is None. 372 | 373 | Examples: 374 | >>> smith_waterman('cat', 'hat') 375 | 2.0 376 | >>> smith_waterman('dva', 'deeve', 2.2) 377 | 1.0 378 | >>> smith_waterman('dva', 'deeve', 1, sim_score=lambda s1, s2 : (2 if s1 == s2 else -1)) 379 | 2.0 380 | >>> smith_waterman('GCATAGCU', 'GATTACA', gap_cost=1.4, sim_score=lambda s1, s2 : (1.5 if s1 == s2 else 0.5)) 381 | 6.5 382 | """ 383 | # input validations 384 | utils.sim_check_for_none(string1, string2) 385 | utils.sim_check_for_string_inputs(string1, string2) 386 | 387 | dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) 388 | max_value = 0 389 | # Smith Waterman DP calculations 390 | for i in _range(1, len(string1) + 1): 391 | for j in _range(1, len(string2) + 1): 392 | match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1], string2[j - 1]) 393 | delete = dist_mat[i - 1, j] - gap_cost 394 | insert = dist_mat[i, j - 1] - gap_cost 395 | dist_mat[i, j] = max(0, match, delete, insert) 396 | max_value = max(max_value, dist_mat[i, j]) 397 | return max_value 398 | 399 | 400 | # ---------------------- token based similarity measures ---------------------- 401 | 402 | # ---------------------- set based similarity measures ---------------------- 403 | def cosine(set1, set2): 404 | """ 405 | Computes the cosine similarity between two sets. 
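Note that the inputs are converted to sets before the computation, so duplicate tokens in a list do not affect the score.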
406 | 407 | For two sets X and Y, the cosine similarity is: 408 | 409 | :math:`cosine(X, Y) = \\frac{|X \\cap Y|}{\\sqrt{|X| \\cdot |Y|}}` 410 | 411 | 412 | Args: 413 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 414 | 415 | Returns: 416 | Cosine similarity (float) 417 | 418 | Raises: 419 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 420 | 421 | Examples: 422 | >>> cosine(['data', 'science'], ['data']) 423 | 0.7071067811865475 424 | >>> cosine(['data', 'data', 'science'], ['data', 'management']) 425 | 0.4999999999999999 426 | >>> cosine([], ['data']) 427 | 0.0 428 | 429 | References: 430 | * String similarity joins: An Experimental Evaluation (VLDB 2014) 431 | * Project flamingo : Mike Carey, Vernica 432 | """ 433 | # input validations 434 | utils.sim_check_for_none(set1, set2) 435 | utils.sim_check_for_list_or_set_inputs(set1, set2) 436 | # if exact match return 1.0 437 | if utils.sim_check_for_exact_match(set1, set2): 438 | return 1.0 439 | # if one of the strings is empty return 0 440 | if utils.sim_check_for_empty(set1, set2): 441 | return 0 442 | if not isinstance(set1, set): 443 | set1 = set(set1) 444 | if not isinstance(set2, set): 445 | set2 = set(set2) 446 | return float(len(set1 & set2)) / (math.sqrt(float(len(set1))) * math.sqrt(float(len(set2)))) 447 | 448 | 449 | def jaccard(set1, set2): 450 | """ 451 | Computes the Jaccard measure between two sets. 452 | 453 | The Jaccard measure, also known as the Jaccard similarity coefficient, is a statistic used for comparing 454 | the similarity and diversity of sample sets. The Jaccard coefficient measures similarity between finite sample 455 | sets, and is defined as the size of the intersection divided by the size of the union of the sample sets. 456 | 457 | 458 | For two sets X and Y, the Jaccard measure is: 459 | 460 | :math:`jaccard(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}` 461 | 462 | 463 | Args: 464 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 465 | 466 | Returns: 467 | Jaccard similarity (float) 468 | 469 | Raises: 470 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 471 | 472 | Examples: 473 | >>> jaccard(['data', 'science'], ['data']) 474 | 0.5 475 | >>> jaccard({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}) 476 | 0.375 477 | >>> jaccard(['data', 'management'], ['data', 'data', 'science']) 478 | 0.3333333333333333 479 | """ 480 | # input validations 481 | utils.sim_check_for_none(set1, set2) 482 | utils.sim_check_for_list_or_set_inputs(set1, set2) 483 | # if exact match return 1.0 484 | if utils.sim_check_for_exact_match(set1, set2): 485 | return 1.0 486 | # if one of the strings is empty return 0 487 | if utils.sim_check_for_empty(set1, set2): 488 | return 0 489 | if not isinstance(set1, set): 490 | set1 = set(set1) 491 | if not isinstance(set2, set): 492 | set2 = set(set2) 493 | return float(len(set1 & set2)) / float(len(set1 | set2)) 494 | 495 | 496 | def overlap_coefficient(set1, set2): 497 | """ 498 | Computes the overlap coefficient between two sets. 499 | 500 | The overlap coefficient is a similarity measure related to the Jaccard 501 | measure that measures the overlap between two sets, and is defined as the size of the intersection divided by 502 | the size of the smaller of the two sets.
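In particular, it equals 1.0 whenever one non-empty set is a subset of the other.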
503 | 504 | For two sets X and Y, the overlap coefficient is: 505 | 506 | :math:`overlap\\_coefficient(X, Y) = \\frac{|X \\cap Y|}{\\min(|X|, |Y|)}` 507 | 508 | Args: 509 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 510 | 511 | Returns: 512 | Overlap coefficient (float) 513 | 514 | Raises: 515 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 516 | 517 | Examples: 518 | >>> overlap_coefficient([], []) 519 | 1.0 520 | >>> overlap_coefficient([], ['data']) 521 | 0 522 | >>> overlap_coefficient(['data', 'science'], ['data']) 523 | 1.0 524 | 525 | References: 526 | * Wikipedia article : https://en.wikipedia.org/wiki/Overlap_coefficient 527 | * Simmetrics library 528 | 529 | """ 530 | # input validations 531 | utils.sim_check_for_none(set1, set2) 532 | utils.sim_check_for_list_or_set_inputs(set1, set2) 533 | # if exact match return 1.0 534 | if utils.sim_check_for_exact_match(set1, set2): 535 | return 1.0 536 | # if one of the strings is empty return 0 537 | if utils.sim_check_for_empty(set1, set2): 538 | return 0 539 | if not isinstance(set1, set): 540 | set1 = set(set1) 541 | if not isinstance(set2, set): 542 | set2 = set(set2) 543 | 544 | return float(len(set1 & set2)) / min(len(set1), len(set2)) 545 | 546 | 547 | # ---------------------- bag based similarity measures ---------------------- 548 | # noinspection PyArgumentList,PyArgumentList 549 | def tfidf(bag1, bag2, corpus_list=None, dampen=False): 550 | """ 551 | Computes the TF-IDF measure between two lists given the corpus information. 552 | This measure employs the notion of TF/IDF score commonly used in information retrieval (IR) to find documents that 553 | are relevant to keyword queries. 554 | The intuition underlying the TF/IDF measure is that two strings are similar if they share distinguishing terms. 555 | 556 | Args: 557 | bag1,bag2 (list): Input lists 558 | 559 | corpus_list (list of lists): Corpus of documents, each a list of strings (defaults to None). If set to None, 560 | the input lists are considered the only corpus. 561 | 562 | dampen (boolean): Flag to indicate whether 'log' should be applied to the tf and idf measures (defaults to False).
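When dampen is True, each token's weight is computed as :math:`\\log(idf) \\cdot \\log(tf + 1)` instead of :math:`idf \\cdot tf`, where :math:`idf` is the corpus size divided by the number of corpus documents containing the token; the final score is the cosine similarity between the two resulting weight vectors.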
563 | 564 | Returns: 565 | TF-IDF measure between the input lists (float) 566 | 567 | Raises: 568 | TypeError : If the inputs are not lists or if one of the inputs is None 569 | 570 | 571 | Examples: 572 | >>> tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a']]) 573 | 0.17541160386140586 574 | >>> tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], True) 575 | 0.11166746710505392 576 | >>> tfidf(['a', 'b', 'a'], ['a'], [['a', 'b', 'a'], ['a', 'c'], ['a']]) 577 | 0.5547001962252291 578 | >>> tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']]) 579 | 0.0 580 | >>> tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']], True) 581 | 0.0 582 | >>> tfidf(['a', 'b', 'a'], ['a']) 583 | 0.7071067811865475 584 | """ 585 | # input validations 586 | utils.sim_check_for_none(bag1, bag2) 587 | utils.sim_check_for_list_or_set_inputs(bag1, bag2) 588 | # if the strings match exactly return 1.0 589 | if utils.sim_check_for_exact_match(bag1, bag2): 590 | return 1.0 591 | # if one of the strings is empty return 0 592 | if utils.sim_check_for_empty(bag1, bag2): 593 | return 0 594 | # if corpus is not provided treat input string as corpus 595 | if corpus_list is None: 596 | corpus_list = [bag1, bag2] 597 | corpus_size = len(corpus_list) 598 | # term frequency for input strings 599 | tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2) 600 | # number of documents an element appeared 601 | element_freq = {} 602 | # set of unique element 603 | total_unique_elements = set() 604 | for document in corpus_list: 605 | temp_set = set() 606 | for element in document: 607 | # adding element only if it is present in one of two input string 608 | if element in bag1 or element in bag2: 609 | temp_set.add(element) 610 | total_unique_elements.add(element) 611 | # update element document frequency for this document 612 | for element in temp_set: 613 | element_freq[element] = element_freq[element] + 1 if element in element_freq else 1 614 | idf_element, v_x, v_y, v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 615 | # tfidf calculation 616 | for element in total_unique_elements: 617 | idf_element = corpus_size * 1.0 / element_freq[element] 618 | v_x = 0 if element not in tf_x else (math.log(idf_element) * math.log(tf_x[element] + 1)) if dampen else ( 619 | idf_element * tf_x[element]) 620 | v_y = 0 if element not in tf_y else (math.log(idf_element) * math.log(tf_y[element] + 1)) if dampen else ( 621 | idf_element * tf_y[element]) 622 | v_x_y += v_x * v_y 623 | v_x_2 += v_x * v_x 624 | v_y_2 += v_y * v_y 625 | return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2)) 626 | 627 | 628 | # hybrid similarity measures 629 | def monge_elkan(bag1, bag2, sim_func=jaro_winkler): 630 | """ 631 | Computes the Monge-Elkan similarity measure between two bags (lists). 632 | 633 | The Monge-Elkan similarity measure is a type of hybrid similarity measure that combines the benefits of 634 | sequence-based and set-based methods. This can be effective for domains in which more control is needed 635 | over the similarity measure. It uses a secondary similarity measure, such as Jaro-Winkler or Levenshtein, to compute 636 | the overall similarity score. 637 | 638 | Args: 639 | bag1,bag2 (list): Input lists 640 | 641 | sim_func (function): Secondary similarity function.
This is expected to be a sequence-based 642 | similarity measure (defaults to jaro_winkler) 643 | 644 | Returns: 645 | Monge-Elkan similarity score (float) 646 | 647 | Raises: 648 | TypeError : If the inputs are not lists or if one of the inputs is None 649 | 650 | 651 | Examples: 652 | >>> monge_elkan(['Niall'], ['Neal']) 653 | 0.8049999999999999 654 | >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']) 655 | 0.8677218614718616 656 | >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=needleman_wunsch) 657 | 2.0 658 | >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=affine) 659 | 2.25 660 | >>> monge_elkan([''], ['a']) 661 | 0.0 662 | >>> monge_elkan(['Niall'], ['Nigel']) 663 | 0.7866666666666667 664 | 665 | References: 666 | * Principles of Data Integration book 667 | """ 668 | # input validations 669 | utils.sim_check_for_none(bag1, bag2) 670 | utils.sim_check_for_list_or_set_inputs(bag1, bag2) 671 | # if exact match return 1.0 672 | if utils.sim_check_for_exact_match(bag1, bag2): 673 | return 1.0 674 | # if one of the strings is empty return 0 675 | if utils.sim_check_for_empty(bag1, bag2): 676 | return 0 677 | # aggregated sum of all the max sim score of all the elements in bag1 678 | # with elements in bag2 679 | sum_of_maxes = 0 680 | for t1 in bag1: 681 | max_sim = float('-inf') 682 | for t2 in bag2: 683 | max_sim = max(max_sim, sim_func(t1, t2)) 684 | sum_of_maxes += max_sim 685 | sim = float(sum_of_maxes) / float(len(bag1)) 686 | return sim 687 | 688 | 689 | # noinspection PyArgumentList,PyArgumentList 690 | def soft_tfidf(bag1, bag2, corpus_list=None, sim_func=jaro, threshold=0.5): 691 | """ 692 | Computes the Soft TF-IDF measure between two lists given the corpus information. 693 | 694 | Args: 695 | bag1,bag2 (list): Input lists 696 | 697 | corpus_list (list of lists): Corpus of documents, each a list of strings (defaults to None). If set to None, 698 | the input lists are considered the only corpus 699 | 700 | sim_func (function): Secondary similarity function that returns a similarity score between two strings 701 | (defaults to the jaro similarity measure) 702 | 703 | threshold (float): Threshold value for the secondary similarity function (defaults to 0.5). If the similarity 704 | of a token pair exceeds the threshold, then the token pair is considered a match. 705 | 706 | Returns: 707 | Soft TF-IDF measure between the input lists 708 | 709 | Raises: 710 | TypeError : If the inputs are not lists or if one of the inputs is None.
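Note:
    In outline, each token in bag1 is paired with its most similar token in bag2 under sim_func; only pairs whose similarity exceeds the threshold contribute, each adding the product of the two tokens' tf-idf weights scaled by the pair's similarity score, and the total is normalized by the product of the norms of the two tf-idf weight vectors, as in the cosine measure.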
711 | 712 | Examples: 713 | >>> soft_tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=jaro, threshold=0.8) 714 | 0.17541160386140586 715 | >>> soft_tfidf(['a', 'b', 'a'], ['a'], [['a', 'b', 'a'], ['a', 'c'], ['a']], threshold=0.9) 716 | 0.5547001962252291 717 | >>> soft_tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']]) 718 | 0.0 719 | >>> soft_tfidf(['aa', 'bb', 'a'], ['ab', 'ba'], sim_func=affine, threshold=0.6) 720 | 0.81649658092772592 721 | 722 | References: 723 | * Principles of Data Integration book 724 | """ 725 | # input validations 726 | utils.sim_check_for_none(bag1, bag2) 727 | utils.sim_check_for_list_or_set_inputs(bag1, bag2) 728 | # if the strings match exactly return 1.0 729 | if utils.sim_check_for_exact_match(bag1, bag2): 730 | return 1.0 731 | # if one of the strings is empty return 0 732 | if utils.sim_check_for_empty(bag1, bag2): 733 | return 0 734 | # if corpus is not provided treat input string as corpus 735 | if corpus_list is None: 736 | corpus_list = [bag1, bag2] 737 | corpus_size = len(corpus_list) * 1.0 738 | # term frequency for input strings 739 | tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2) 740 | # number of documents an element appeared 741 | element_freq = {} 742 | # set of unique element 743 | total_unique_elements = set() 744 | for document in corpus_list: 745 | temp_set = set() 746 | for element in document: 747 | # adding element only if it is present in one of two input string 748 | if element in bag1 or element in bag2: 749 | temp_set.add(element) 750 | total_unique_elements.add(element) 751 | # update element document frequency for this document 752 | for element in temp_set: 753 | element_freq[element] = element_freq[element] + 1 if element in element_freq else 1 754 | similarity_map = {} 755 | # calculating the term sim score against the input string 2, construct similarity map 756 | for x in bag1: 757 | if x not in similarity_map: 758 | max_score = 0.0 759 | for y in bag2: 760 | score = sim_func(x, y) 761 | # adding sim only if it is above threshold and highest for this element 762 | if score > threshold and score > max_score: 763 | similarity_map[x] = utils.Similarity(x, y, score) 764 | max_score = score 765 | result, v_x_2, v_y_2 = 0.0, 0.0, 0.0 766 | # soft-tfidf calculation 767 | for element in total_unique_elements: 768 | # numerator 769 | if element in similarity_map: 770 | sim = similarity_map[element] 771 | idf_first = corpus_size if sim.first_string not in element_freq else corpus_size / \ 772 | element_freq[sim.first_string] 773 | idf_second = corpus_size if sim.second_string not in element_freq else corpus_size / \ 774 | element_freq[sim.second_string] 775 | v_x = 0 if sim.first_string not in tf_x else idf_first * tf_x[sim.first_string] 776 | v_y = 0 if sim.second_string not in tf_y else idf_second * tf_y[sim.second_string] 777 | result += v_x * v_y * sim.similarity_score 778 | # denominator 779 | idf = corpus_size if element not in element_freq else corpus_size / element_freq[element] 780 | v_x = 0 if element not in tf_x else idf * tf_x[element] 781 | v_x_2 += v_x * v_x 782 | v_y = 0 if element not in tf_y else idf * tf_y[element] 783 | v_y_2 += v_y * v_y 784 | return result if v_x_2 == 0 else result / (math.sqrt(v_x_2) * math.sqrt(v_y_2)) 785 | -------------------------------------------------------------------------------- /py_stringmatching/tests/test_simfunctions.py: -------------------------------------------------------------------------------- 1 | from __future__ 
import unicode_literals 2 | 3 | import math 4 | import unittest 5 | 6 | from nose.tools import * 7 | 8 | 9 | # sequence based similarity measures 10 | from py_stringmatching.simfunctions import levenshtein, jaro, jaro_winkler, hamming_distance, needleman_wunsch, \ 11 | smith_waterman, affine 12 | # token based similarity measures 13 | from py_stringmatching.simfunctions import overlap_coefficient, jaccard, cosine, tfidf, soft_tfidf 14 | # hybrid similarity measures 15 | from py_stringmatching.simfunctions import monge_elkan 16 | 17 | 18 | # ---------------------- sequence based similarity measures ---------------------- 19 | 20 | 21 | class AffineTestCases(unittest.TestCase): 22 | def test_valid_input(self): 23 | self.assertAlmostEqual(affine('dva', 'deeva'), 1.5) 24 | self.assertAlmostEqual(affine('dva', 'deeve', gap_start=2, gap_continuation=0.5), -0.5) 25 | self.assertAlmostEqual( 26 | affine('AAAGAATTCA', 'AAATCA', gap_continuation=0.2, sim_score=lambda s1, s2: (int(1 if s1 == s2 else 0))), 27 | 4.4) 28 | self.assertAlmostEqual( 29 | affine(' ', ' ', gap_continuation=0.2, sim_score=lambda s1, s2: (int(1 if s1 == s2 else 0))), 1) 30 | 31 | @raises(TypeError) 32 | def test_invalid_input1(self): 33 | affine(None, 'MARHTA') 34 | 35 | @raises(TypeError) 36 | def test_invalid_input2(self): 37 | affine('MARHTA', None) 38 | 39 | @raises(TypeError) 40 | def test_invalid_input3(self): 41 | affine('MARHTA', 12.90) 42 | 43 | @raises(TypeError) 44 | def test_invalid_input4(self): 45 | affine(12.90, 'MARTHA') 46 | 47 | @raises(TypeError) 48 | def test_invalid_input5(self): 49 | affine(None, None) 50 | 51 | 52 | class JaroTestCases(unittest.TestCase): 53 | def test_valid_input(self): 54 | # https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance 55 | self.assertAlmostEqual(jaro('MARTHA', 'MARHTA'), 0.9444444444444445) 56 | self.assertAlmostEqual(jaro('DWAYNE', 'DUANE'), 0.8222222222222223) 57 | self.assertAlmostEqual(jaro('DIXON', 'DICKSONX'), 0.7666666666666666) 58 | 59 | @raises(TypeError) 60 | def test_invalid_input1(self): 61 | jaro(None, 'MARHTA') 62 | 63 | @raises(TypeError) 64 | def test_invalid_input2(self): 65 | jaro('MARHTA', None) 66 | 67 | @raises(TypeError) 68 | def test_invalid_input3(self): 69 | jaro(None, None) 70 | 71 | 72 | class JaroWinklerTestCases(unittest.TestCase): 73 | def test_valid_input(self): 74 | # https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance 75 | self.assertAlmostEqual(jaro_winkler('MARTHA', 'MARHTA'), 0.9611111111111111) 76 | self.assertAlmostEqual(jaro_winkler('DWAYNE', 'DUANE'), 0.84) 77 | self.assertAlmostEqual(jaro_winkler('DIXON', 'DICKSONX'), 0.8133333333333332) 78 | 79 | @raises(TypeError) 80 | def test_invalid_input1(self): 81 | jaro_winkler(None, 'MARHTA') 82 | 83 | @raises(TypeError) 84 | def test_invalid_input2(self): 85 | jaro_winkler('MARHTA', None) 86 | 87 | @raises(TypeError) 88 | def test_invalid_input3(self): 89 | jaro_winkler(None, None) 90 | 91 | 92 | class LevenshteinTestCases(unittest.TestCase): 93 | def test_valid_input(self): 94 | # http://oldfashionedsoftware.com/tag/levenshtein-distance/ 95 | self.assertEqual(levenshtein('a', ''), 1) 96 | self.assertEqual(levenshtein('', 'a'), 1) 97 | self.assertEqual(levenshtein('abc', ''), 3) 98 | self.assertEqual(levenshtein('', 'abc'), 3) 99 | self.assertEqual(levenshtein('', ''), 0) 100 | self.assertEqual(levenshtein('a', 'a'), 0) 101 | self.assertEqual(levenshtein('abc', 'abc'), 0) 102 | self.assertEqual(levenshtein('', 'a'), 1) 103 | self.assertEqual(levenshtein('a', 'ab'), 1) 104 | 
self.assertEqual(levenshtein('b', 'ab'), 1) 105 | self.assertEqual(levenshtein('ac', 'abc'), 1) 106 | self.assertEqual(levenshtein('abcdefg', 'xabxcdxxefxgx'), 6) 107 | self.assertEqual(levenshtein('a', ''), 1) 108 | self.assertEqual(levenshtein('ab', 'a'), 1) 109 | self.assertEqual(levenshtein('ab', 'b'), 1) 110 | self.assertEqual(levenshtein('abc', 'ac'), 1) 111 | self.assertEqual(levenshtein('xabxcdxxefxgx', 'abcdefg'), 6) 112 | self.assertEqual(levenshtein('a', 'b'), 1) 113 | self.assertEqual(levenshtein('ab', 'ac'), 1) 114 | self.assertEqual(levenshtein('ac', 'bc'), 1) 115 | self.assertEqual(levenshtein('abc', 'axc'), 1) 116 | self.assertEqual(levenshtein('xabxcdxxefxgx', '1ab2cd34ef5g6'), 6) 117 | self.assertEqual(levenshtein('example', 'samples'), 3) 118 | self.assertEqual(levenshtein('sturgeon', 'urgently'), 6) 119 | self.assertEqual(levenshtein('levenshtein', 'frankenstein'), 6) 120 | self.assertEqual(levenshtein('distance', 'difference'), 5) 121 | self.assertEqual(levenshtein('java was neat', 'scala is great'), 7) 122 | 123 | @raises(TypeError) 124 | def test_invalid_input1(self): 125 | levenshtein('a', None) 126 | 127 | @raises(TypeError) 128 | def test_invalid_input2(self): 129 | levenshtein(None, 'b') 130 | 131 | @raises(TypeError) 132 | def test_invalid_input3(self): 133 | levenshtein(None, None) 134 | 135 | 136 | class HammingDistanceTestCases(unittest.TestCase): 137 | def test_valid_input(self): 138 | self.assertEqual(hamming_distance('-789', 'john'), 4) 139 | self.assertEqual(hamming_distance('a', '*'), 1) 140 | self.assertEqual(hamming_distance('b', 'a'), 1) 141 | self.assertEqual(hamming_distance('abc', 'p q'), 3) 142 | self.assertEqual(hamming_distance('karolin', 'kathrin'), 3) 143 | self.assertEqual(hamming_distance('KARI', 'kari'), 4) 144 | 145 | def test_valid_input_compatibility(self): 146 | self.assertEqual(hamming_distance(u'karolin', u'kathrin'), 3) 147 | self.assertEqual(hamming_distance(u'', u''), 0) 148 | # str_1 = u'foo'.encode(encoding='UTF-8', errors='strict') 149 | # str_2 = u'bar'.encode(encoding='UTF-8', errors='strict') 150 | # self.assertEqual(hamming_distance(str_1, str_2), 3) # check with Ali - python 3 returns type error 151 | # self.assertEqual(hamming_distance(str_1, str_1), 0) # check with Ali - python 3 returns type error 152 | 153 | @raises(TypeError) 154 | def test_invalid_input1(self): 155 | hamming_distance('a', None) 156 | 157 | @raises(TypeError) 158 | def test_invalid_input2(self): 159 | hamming_distance(None, 'b') 160 | 161 | @raises(TypeError) 162 | def test_invalid_input3(self): 163 | hamming_distance(None, None) 164 | 165 | @raises(ValueError) 166 | def test_invalid_input4(self): 167 | hamming_distance('a', '') 168 | 169 | @raises(ValueError) 170 | def test_invalid_input5(self): 171 | hamming_distance('', 'This is a long string') 172 | 173 | @raises(ValueError) 174 | def test_invalid_input6(self): 175 | hamming_distance('ali', 'alex') 176 | 177 | 178 | class NeedlemanWunschTestCases(unittest.TestCase): 179 | def test_valid_input(self): 180 | self.assertEqual(needleman_wunsch('dva', 'deeva'), 1.0) 181 | self.assertEqual(needleman_wunsch('dva', 'deeve', 0.0), 2.0) 182 | self.assertEqual(needleman_wunsch('dva', 'deeve', 1.0, sim_score=lambda s1, s2: (2 if s1 == s2 else -1)), 1.0) 183 | self.assertEqual( 184 | needleman_wunsch('GCATGCUA', 'GATTACA', gap_cost=0.5, 185 | sim_score=lambda s1, s2: (1 if s1 == s2 else -1)), 186 | 2.5) 187 | 188 | @raises(TypeError) 189 | def test_invalid_input1(self): 190 | needleman_wunsch('a', None) 191 | 
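    # A hypothetical extra boundary check (a sketch, not part of the original suite):
    # with the default sim_ident, empty inputs score 0.0 and identical single
    # characters score 1.0.
    # def test_valid_input_boundary(self):
    #     self.assertEqual(needleman_wunsch('', ''), 0.0)
    #     self.assertEqual(needleman_wunsch('a', 'a'), 1.0)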
192 | @raises(TypeError) 193 | def test_invalid_input2(self): 194 | needleman_wunsch(None, 'b') 195 | 196 | @raises(TypeError) 197 | def test_invalid_input3(self): 198 | needleman_wunsch(None, None) 199 | 200 | 201 | class SmithWatermanTestCases(unittest.TestCase): 202 | def test_valid_input(self): 203 | self.assertEqual(smith_waterman('cat', 'hat'), 2.0) 204 | self.assertEqual(smith_waterman('dva', 'deeve', 2.2), 1.0) 205 | self.assertEqual(smith_waterman('dva', 'deeve', 1, sim_score=lambda s1, s2: (2 if s1 == s2 else -1)), 2.0) 206 | self.assertEqual( 207 | smith_waterman('GCATGCU', 'GATTACA', gap_cost=1, sim_score=lambda s1, s2: (int(1 if s1 == s2 else -1))), 208 | 2.0) 209 | self.assertEqual( 210 | smith_waterman('GCATAGCU', 'GATTACA', gap_cost=1.4, sim_score=lambda s1, s2: (1.5 if s1 == s2 else 0.5)), 211 | 6.5) 212 | 213 | @raises(TypeError) 214 | def test_invalid_input1(self): 215 | smith_waterman('a', None) 216 | 217 | @raises(TypeError) 218 | def test_invalid_input2(self): 219 | smith_waterman(None, 'b') 220 | 221 | @raises(TypeError) 222 | def test_invalid_input3(self): 223 | smith_waterman(None, None) 224 | 225 | 226 | # ---------------------- token based similarity measures ---------------------- 227 | 228 | # ---------------------- set based similarity measures ---------------------- 229 | class OverlapCoefficientTestCases(unittest.TestCase): 230 | def test_valid_input(self): 231 | self.assertEqual(overlap_coefficient([], []), 1.0) 232 | self.assertEqual(overlap_coefficient(['data', 'science'], ['data']), 1.0 / min(2.0, 1.0)) 233 | self.assertEqual(overlap_coefficient(['data', 'science'], ['science', 'good']), 1.0 / min(2.0, 3.0)) 234 | self.assertEqual(overlap_coefficient([], ['data']), 0) 235 | self.assertEqual(overlap_coefficient(['data', 'data', 'science'], ['data', 'management']), 1.0 / min(3.0, 2.0)) 236 | 237 | @raises(TypeError) 238 | def test_invalid_input1(self): 239 | overlap_coefficient(['a'], None) 240 | 241 | @raises(TypeError) 242 | def test_invalid_input2(self): 243 | overlap_coefficient(None, ['b']) 244 | 245 | @raises(TypeError) 246 | def test_invalid_input3(self): 247 | overlap_coefficient(None, None) 248 | 249 | 250 | class JaccardTestCases(unittest.TestCase): 251 | def test_valid_input(self): 252 | self.assertEqual(jaccard(['data', 'science'], ['data']), 1.0 / 2.0) 253 | self.assertEqual(jaccard(['data', 'science'], ['science', 'good']), 1.0 / 3.0) 254 | self.assertEqual(jaccard([], ['data']), 0) 255 | self.assertEqual(jaccard(['data', 'data', 'science'], ['data', 'management']), 1.0 / 3.0) 256 | self.assertEqual(jaccard(['data', 'management'], ['data', 'data', 'science']), 1.0 / 3.0) 257 | self.assertEqual(jaccard([], []), 1.0) 258 | self.assertEqual(jaccard(set([]), set([])), 1.0) 259 | self.assertEqual(jaccard({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}), 3.0 / 8.0) 260 | 261 | @raises(TypeError) 262 | def test_invalid_input1(self): 263 | jaccard(1, 1) 264 | 265 | @raises(TypeError) 266 | def test_invalid_input4(self): 267 | jaccard(['a'], None) 268 | 269 | @raises(TypeError) 270 | def test_invalid_input2(self): 271 | jaccard(None, ['b']) 272 | 273 | @raises(TypeError) 274 | def test_invalid_input3(self): 275 | jaccard(None, None) 276 | 277 | 278 | class CosineTestCases(unittest.TestCase): 279 | def test_valid_input(self): 280 | self.assertEqual(cosine(['data', 'science'], ['data']), 1.0 / (math.sqrt(2) * math.sqrt(1))) 281 | self.assertEqual(cosine(['data', 'science'], ['science', 'good']), 282 | 1.0 / (math.sqrt(2) * math.sqrt(2))) 283 | 
284 |         self.assertEqual(cosine(['data', 'data', 'science'], ['data', 'management']),
285 |                          1.0 / (math.sqrt(2) * math.sqrt(2)))
286 |         self.assertEqual(cosine(['data', 'management'], ['data', 'data', 'science']),
287 |                          1.0 / (math.sqrt(2) * math.sqrt(2)))
288 |         self.assertEqual(cosine([], []), 1.0)
289 |         self.assertEqual(cosine(set([]), set([])), 1.0)
290 |         self.assertEqual(cosine({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}),
291 |                          3.0 / (math.sqrt(4) * math.sqrt(7)))
292 | 
293 |     @raises(TypeError)
294 |     def test_invalid_input1(self):
295 |         cosine(1, 1)
296 | 
297 |     @raises(TypeError)
298 |     def test_invalid_input4(self):
299 |         cosine(['a'], None)
300 | 
301 |     @raises(TypeError)
302 |     def test_invalid_input2(self):
303 |         cosine(None, ['b'])
304 | 
305 |     @raises(TypeError)
306 |     def test_invalid_input3(self):
307 |         cosine(None, None)
308 | 
309 | 
310 | class TfidfTestCases(unittest.TestCase):
311 |     def test_valid_input(self):
312 |         self.assertEqual(tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], True),
313 |                          0.11166746710505392)
314 |         self.assertEqual(tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a']]), 0.17541160386140586)
315 |         self.assertEqual(tfidf(['a', 'b', 'a'], ['a'], [['a', 'b', 'a'], ['a', 'c'], ['a']]), 0.5547001962252291)
316 |         self.assertEqual(tfidf(['a', 'b', 'a'], ['a']), 0.7071067811865475)
317 |         self.assertEqual(tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']]), 0.0)
318 | 
319 | 
320 |     @raises(TypeError)
321 |     def test_invalid_input1(self):
322 |         tfidf(1, 1)
323 | 
324 |     @raises(TypeError)
325 |     def test_invalid_input4(self):
326 |         tfidf(['a'], None)
327 | 
328 |     @raises(TypeError)
329 |     def test_invalid_input2(self):
330 |         tfidf(None, ['b'])
331 | 
332 |     @raises(TypeError)
333 |     def test_invalid_input3(self):
334 |         tfidf(None, None)
335 | 
336 | 
337 | # ---------------------- bag based similarity measures ----------------------
338 | # class CosineTestCases(unittest.TestCase):
339 | #     def test_valid_input(self):
340 | #         NONQ_FROM = 'The quick brown fox jumped over the lazy dog.'
341 | #         NONQ_TO = 'That brown dog jumped over the fox.'
342 | #         self.assertEqual(cosine([], []), 1)  # both simmetrics and abydos return 1 here
343 | #         self.assertEqual(cosine(['the', 'quick'], []), 0)
344 | #         self.assertEqual(cosine([], ['the', 'quick']), 0)
345 | #         self.assertAlmostEqual(cosine(whitespace(NONQ_TO), whitespace(NONQ_FROM)),
346 | #                                4 / math.sqrt(9 * 7))
347 | #
348 | #     @raises(TypeError)
349 | #     def test_invalid_input1(self):
350 | #         cosine(['a'], None)
351 | #     @raises(TypeError)
352 | #     def test_invalid_input2(self):
353 | #         cosine(None, ['b'])
354 | #     @raises(TypeError)
355 | #     def test_invalid_input3(self):
356 | #         cosine(None, None)
357 | 
358 | 
359 | # ---------------------- hybrid similarity measure ----------------------
360 | 
361 | class SoftTfidfTestCases(unittest.TestCase):
362 |     def test_valid_input(self):
363 |         self.assertEqual(soft_tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=jaro,
364 |                                     threshold=0.8), 0.17541160386140586)
365 |         self.assertEqual(soft_tfidf(['a', 'b', 'a'], ['a'], [['a', 'b', 'a'], ['a', 'c'], ['a']],
366 |                                     threshold=0.9), 0.5547001962252291)
367 |         self.assertEqual(soft_tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']]), 0.0)
368 |         self.assertEqual(soft_tfidf(['aa', 'bb', 'a'], ['ab', 'ba'], sim_func=affine, threshold=0.6),
369 |                          0.81649658092772592)
370 | 
371 |     @raises(TypeError)
372 |     def test_invalid_input1(self):
373 |         soft_tfidf(1, 1)
374 | 
375 |     @raises(TypeError)
376 |     def test_invalid_input4(self):
377 |         soft_tfidf(['a'], None)
378 | 
379 |     @raises(TypeError)
380 |     def test_invalid_input2(self):
381 |         soft_tfidf(None, ['b'])
382 | 
383 |     @raises(TypeError)
384 |     def test_invalid_input3(self):
385 |         soft_tfidf(None, None)
386 | 
387 | 
388 | class MongeElkanTestCases(unittest.TestCase):
389 |     def test_valid_input(self):
390 |         self.assertEqual(monge_elkan([''], ['']), 1.0)  # boundary case: identical inputs score 1.0
391 | 
392 |         self.assertEqual(monge_elkan([''], ['a']), 0.0)
393 |         self.assertEqual(monge_elkan(['a'], ['a']), 1.0)
394 | 
395 |         self.assertEqual(monge_elkan(['Niall'], ['Neal']), 0.8049999999999999)
396 |         self.assertEqual(monge_elkan(['Niall'], ['Njall']), 0.88)
397 |         self.assertEqual(monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'],
398 |                                      ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), 0.8364448051948052)
399 |         self.assertEqual(
400 |             monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'],
401 |                         ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'],
402 |                         sim_func=needleman_wunsch), 2.0)
403 |         self.assertEqual(
404 |             monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'],
405 |                         ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'],
406 |                         sim_func=affine), 2.25)
407 |         self.assertEqual(monge_elkan(['Niall'], ['Niel']), 0.8266666666666667)
408 |         self.assertEqual(monge_elkan(['Niall'], ['Nigel']), 0.7866666666666667)
409 | 
410 |     @raises(TypeError)
411 |     def test_invalid_input1(self):
412 |         monge_elkan(1, 1)
413 | 
414 |     @raises(TypeError)
415 |     def test_invalid_input4(self):
416 |         monge_elkan(['a'], None)
417 | 
418 |     @raises(TypeError)
419 |     def test_invalid_input2(self):
420 |         monge_elkan(None, ['b'])
421 | 
422 |     @raises(TypeError)
423 |     def test_invalid_input3(self):
424 |         monge_elkan(None, None)
425 | 
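426 | 
427 | # Standard unittest entry point: nose discovers these test cases on its own,
428 | # but this also lets the module be run directly with `python test_simfunctions.py`.
429 | if __name__ == '__main__':
430 |     unittest.main()
431 | 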
--------------------------------------------------------------------------------
/py_stringmatching/tests/test_tokenizers.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 | 
3 | import unittest
4 | from nose.tools import raises
5 | 
6 | from py_stringmatching.tokenizers import qgram, delimiter, whitespace
7 | 
8 | 
9 | class QgramTestCases(unittest.TestCase):
10 |     def test_qgrams_valid(self):
11 |         self.assertEqual(qgram(''), [])
12 |         self.assertEqual(qgram('a'), [])
13 |         self.assertEqual(qgram('aa'), ['aa'])
14 |         self.assertEqual(qgram('database'), ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se'])
15 |         self.assertEqual(qgram('d', 1), ['d'])
16 |         self.assertEqual(qgram('database', 3), ['dat', 'ata', 'tab', 'aba', 'bas', 'ase'])
17 | 
18 |     @raises(TypeError)
19 |     def test_qgrams_none(self):
20 |         qgram(None)
21 | 
22 | 
23 | class DelimiterTestCases(unittest.TestCase):
24 |     def test_delimiter_valid(self):
25 |         self.assertEqual(delimiter('data science'), ['data', 'science'])
26 |         self.assertEqual(delimiter('data,science', ','), ['data', 'science'])
27 |         self.assertEqual(delimiter('data science', ','), ['data science'])
28 |         self.assertEqual(delimiter('data$#$science', '$#$'), ['data', 'science'])
29 | 
30 |     def test_delimiter_invalid1(self):
31 |         self.assertEqual(delimiter('data science', None), ['data', 'science'])
32 | 
33 |     @raises(TypeError)
34 |     def test_delimiter_invalid2(self):
35 |         delimiter('data science', 10)
36 | 
37 |     @raises(TypeError)
38 |     def test_delimiter_invalid3(self):
39 |         delimiter(None)
40 | 
41 | 
42 | class WhiteSpaceTestCases(unittest.TestCase):
43 |     def test_whitespace_valid(self):
44 |         self.assertEqual(whitespace('data science'), ['data', 'science'])
45 |         self.assertEqual(whitespace('data  science'), ['data', 'science'])  # two spaces
46 |         self.assertEqual(whitespace('data   science'), ['data', 'science'])  # three spaces
47 |         self.assertEqual(whitespace('data\tscience'), ['data', 'science'])
48 |         self.assertEqual(whitespace('data\nscience'), ['data', 'science'])
49 | 
50 |     @raises(TypeError)
51 |     def test_whitespace_invalid(self):
52 |         whitespace(None)
53 | 
--------------------------------------------------------------------------------
/py_stringmatching/tokenizers.py:
--------------------------------------------------------------------------------
1 | from py_stringmatching import utils
2 | from .compat import _range
3 | 
4 | 
5 | # Tokenizers for py_stringmatching: qgram, delimiter and whitespace.
6 | 
7 | def qgram(input_string, qval=2):
8 |     """
9 |     Tokenizes input string into q-grams.
10 | 
11 |     A q-gram is a contiguous sequence of q characters from the input string. Q-grams are also
12 |     known as n-grams and k-grams.
13 | 
14 |     Args:
15 |         input_string (str): Input string
16 | 
17 |         qval (int): Q-gram length (defaults to 2)
18 | 
19 |     Returns:
20 |         Token list (list)
21 | 
22 |     Raises:
23 |         TypeError: If the input is not a string
24 | 
25 |     Examples:
26 |         >>> qgram('database')
27 |         ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
28 |         >>> qgram('a')
29 |         []
30 |         >>> qgram('database', 3)
31 |         ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']
32 | 
33 | 
34 |     """
35 |     utils.tok_check_for_none(input_string)
36 |     utils.tok_check_for_string_input(input_string)
37 | 
38 |     qgram_list = []
39 | 
40 |     if len(input_string) < qval or qval < 1:
41 |         return qgram_list
42 | 
43 |     qgram_list = [input_string[i:i + qval] for i in _range(len(input_string) - (qval - 1))]
44 |     return qgram_list
45 | 
46 | 
47 | def delimiter(input_string, delim_str=' '):
48 |     """
49 |     Tokenizes input string based on the given delimiter.
50 | 
51 |     Args:
52 |         input_string (str): Input string
53 | 
54 |         delim_str (str): Delimiter string (defaults to a single space)
55 | 
56 | 
57 |     Returns:
58 |         Token list (list)
59 | 
60 |     Raises:
61 |         TypeError: If the input is not a string
62 | 
63 |     Examples:
64 |         >>> delimiter('data science')
65 |         ['data', 'science']
66 |         >>> delimiter('data$#$science', '$#$')
67 |         ['data', 'science']
68 |         >>> delimiter('data science', ',')
69 |         ['data science']
70 | 
71 |     """
72 |     utils.tok_check_for_none(input_string)
73 |     utils.tok_check_for_string_input(input_string)
74 | 
75 |     return input_string.split(delim_str)
76 | 
77 | 
78 | def whitespace(input_string):
79 |     """
80 |     Tokenizes input string based on white space.
81 | 
82 |     Args:
83 |         input_string (str): Input string
84 | 
85 |     Returns:
86 |         Token list (list)
87 | 
88 |     Raises:
89 |         TypeError: If the input is not a string
90 | 
91 |     Examples:
92 |         >>> whitespace('data science')
93 |         ['data', 'science']
94 |         >>> whitespace('data  science')
95 |         ['data', 'science']
96 |         >>> whitespace('data\tscience')
97 |         ['data', 'science']
98 | 
99 |     """
100 |     utils.tok_check_for_none(input_string)
101 |     utils.tok_check_for_string_input(input_string)
102 | 
103 |     return input_string.split()
104 | 
--------------------------------------------------------------------------------
/py_stringmatching/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Input-validation helpers for the similarity functions and tokenizers. These
3 | checks live in their own module because their implementation may change
4 | later, depending on how we decide to handle missing values.
5 | """
6 | 
7 | import functools
8 | 
9 | import six
10 | 
11 | 
12 | def _sim_check_for_list_or_set_inputs(func):
13 |     @functools.wraps(func)
14 |     def decorator(*args, **kwargs):
15 |         if not isinstance(args[0], list):
16 |             if not isinstance(args[0], set):
17 |                 raise TypeError('First argument is expected to be a python list or set')
18 |         if not isinstance(args[1], list):
19 |             if not isinstance(args[1], set):
20 |                 raise TypeError('Second argument is expected to be a python list or set')
21 |         return func(*args, **kwargs)
22 | 
23 |     return decorator
24 | 
25 | 
26 | def _sim_check_for_string_inputs(func):
27 |     @functools.wraps(func)
28 |     def decorator(*args, **kwargs):
29 |         if not isinstance(args[0], six.string_types):
30 |             raise TypeError('First argument is expected to be a string')
31 |         if not isinstance(args[1], six.string_types):
32 |             raise TypeError('Second argument is expected to be a string')
33 |         return func(*args, **kwargs)
34 | 
35 |     return decorator
36 | 
37 | 
38 | def _sim_check_for_same_len(func):
39 |     @functools.wraps(func)
40 |     def decorator(*args, **kwargs):
41 |         if args[0] is None:
42 |             raise TypeError("string1 is None")
43 |         if args[1] is None:
44 |             raise TypeError("string2 is None")
45 |         if len(args[0]) != len(args[1]):
46 |             raise ValueError("Undefined for sequences of unequal length")
47 |         return func(*args, **kwargs)
48 | 
49 |     return decorator
50 | 
51 | 
52 | def _sim_check_for_exact_match(func):
53 |     @functools.wraps(func)
54 |     def decorator(*args, **kwargs):
55 |         if args[0] == args[1]:
56 |             return 1.0
57 |         return func(*args, **kwargs)
58 | 
59 |     return decorator
60 | 
61 | 
62 | def _sim_check_for_empty(func):
63 |     @functools.wraps(func)
64 |     def decorator(*args, **kwargs):
65 |         if len(args[0]) == 0 or len(args[1]) == 0:
66 |             return 0
67 |         return func(*args, **kwargs)
68 | 
69 |     return decorator
70 | 
71 | 
72 | def _sim_check_for_none(func):
73 |     @functools.wraps(func)
74 |     def decorator(*args, **kwargs):
75 |         if args[0] is None:
76 |             raise TypeError("string1 is None")
77 |         if args[1] is None:
78 |             raise TypeError("string2 is None")
79 |         return func(*args, **kwargs)
80 | 
81 |     return decorator
82 | 
83 | 
84 | def _tok_check_for_none(func):
85 |     @functools.wraps(func)
86 |     def decorator(*args, **kwargs):
87 |         empty_list = []
88 |         if args[0] is None:
89 |             return empty_list
90 |         return func(*args, **kwargs)
91 | 
92 |     return decorator
93 | 
94 | 
95 | def _tok_check_for_string_input(func):
96 |     @functools.wraps(func)
97 |     def decorator(*args, **kwargs):
98 |         if not isinstance(args[0], six.string_types):
99 |             raise TypeError('Input is expected to be a string')
100 |         return func(*args, **kwargs)
101 | 
102 |     return decorator
103 | 
104 | 
105 | def sim_check_for_none(*args):
106 |     if len(args) > 0 and args[0] is None:
107 |         raise TypeError("First argument cannot be None")
108 |     if len(args) > 1 and args[1] is None:
109 |         raise TypeError("Second argument cannot be None")
110 | 
111 | 
112 | def sim_check_for_empty(*args):
113 |     if len(args[0]) == 0 or len(args[1]) == 0:
114 |         return True
115 | 
116 | 
117 | def sim_check_for_same_len(*args):
118 |     if len(args[0]) != len(args[1]):
119 |         raise ValueError("Undefined for sequences of unequal length")
120 | 
121 | 
122 | def sim_check_for_string_inputs(*args):
123 |     if not isinstance(args[0], six.string_types):
124 |         raise TypeError('First argument is expected to be a string')
125 |     if not isinstance(args[1], six.string_types):
126 |         raise TypeError('Second argument is expected to be a string')
127 | 
128 | 
129 | def sim_check_for_list_or_set_inputs(*args):
130 |     if not isinstance(args[0], list):
131 |         if not isinstance(args[0], set):
132 |             raise TypeError('First argument is expected to be a python list or set')
133 |     if not isinstance(args[1], list):
134 |         if not isinstance(args[1], set):
135 |             raise TypeError('Second argument is expected to be a python list or set')
136 | 
137 | 
138 | def sim_check_for_exact_match(*args):
139 |     if args[0] == args[1]:
140 |         return True
141 | 
142 | 
143 | def tok_check_for_string_input(*args):
144 |     for i in range(len(args)):
145 |         if not isinstance(args[i], six.string_types):
146 |             raise TypeError('Input is expected to be a string')
147 | 
148 | 
149 | def tok_check_for_none(*args):
150 |     if args[0] is None:
151 |         raise TypeError("First argument cannot be None")
152 | 
153 | 
154 | class Similarity(object):
155 |     def __init__(self, string1, string2, score):
156 |         self.first_string = string1
157 |         self.second_string = string2
158 |         self.similarity_score = score
159 | 
160 | # # check for NaNs
161 | # def check_strings_for_nulls(func):
162 | #     @functools.wraps(func)
163 | #     def decorator(*args, **kwargs):
164 | #         if np.isnan(args[0]):
165 | #             return np.NaN
166 | #         if np.isnan(args[1]):
167 | #             return np.NaN
168 | #         return func(*args, **kwargs)
169 | #     return decorator
170 | #
171 | # # check for nulls in tokens
172 | # def check_tokens_for_nulls(func):
173 | #     @functools.wraps(func)
174 | #     def decorator(*args, **kwargs):
175 | #         tmp_args0 = args[0]
176 | #         if not isinstance(tmp_args0, list):
177 | #             tmp_args0 = [tmp_args0]
178 | #         if any(np.isnan(tmp_args0)):
179 | #             return np.NaN
180 | #         tmp_args1 = args[1]
181 | #         if not isinstance(tmp_args1, list):
182 | #             tmp_args1 = [tmp_args1]
183 | #         if any(np.isnan(tmp_args1)):
184 | #             return np.NaN
185 | #         return func(*args, **kwargs)
186 | #     return decorator
187 | 
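188 | 
189 | # A minimal sketch (illustrative only, not used by the package): how a
190 | # set-based similarity function such as overlap coefficient is expected to
191 | # compose the plain checks above. The name `_example_overlap` is hypothetical.
192 | # def _example_overlap(set1, set2):
193 | #     sim_check_for_none(set1, set2)
194 | #     sim_check_for_list_or_set_inputs(set1, set2)
195 | #     if sim_check_for_exact_match(set1, set2):
196 | #         return 1.0
197 | #     if sim_check_for_empty(set1, set2):
198 | #         return 0.0
199 | #     set1, set2 = set(set1), set(set2)
200 | #     return float(len(set1 & set2)) / min(len(set1), len(set2))
201 | 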
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.7.0
2 | six
3 | python-Levenshtein>=0.12.0
4 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | setup(
4 |     name='py_stringmatching',
5 |     version='0.1',
6 |     description='Python library for string matching.',
7 |     long_description="""
8 | String matching is an important problem in many settings, such as data integration and
9 | natural language processing. This package aims to implement the most commonly used
10 | string matching measures.
11 | """,
12 |     url='http://github.com/kvpradap/py_stringmatching',
13 |     author='Pradap Konda',
14 |     author_email='pradap@cs.wisc.edu',
15 |     license='MIT',
16 |     packages=['py_stringmatching'],
17 |     install_requires=[
18 |         'numpy >= 1.7.0',
19 |         'six',
20 |         'python-Levenshtein >= 0.12.0'
21 |     ],
22 |     include_package_data=True,
23 |     zip_safe=False
24 | )
25 | 
--------------------------------------------------------------------------------
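A minimal usage sketch (illustrative; the `simfunctions` import path is assumed
from the test modules above, and the expected values come from the tests):

    >>> from py_stringmatching.simfunctions import levenshtein
    >>> levenshtein('example', 'samples')
    3
    >>> from py_stringmatching.tokenizers import whitespace
    >>> whitespace('data science')
    ['data', 'science']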