├── .gitignore ├── .travis.yml ├── CHANGES.txt ├── README.md ├── appveyor.yml ├── asv.conf.json ├── benchmarks ├── __init__.py └── benchamarks.py ├── continuous-integration └── appveyor │ ├── install.ps1 │ ├── rm_rf.py │ └── run_with_env.cmd ├── docs ├── API.rst ├── Installation.rst ├── Makefile ├── Simfunctions.rst ├── Tokenizers.rst ├── Tutorial.rst ├── conf.py ├── index.rst └── make.bat ├── py_stringmatching ├── __init__.py ├── compat.py ├── simfunctions.py ├── tests │ ├── test_simfunctions.py │ └── test_tokenizers.py ├── tokenizers.py └── utils.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.so 9 | 10 | # Distribution / packaging 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | ### VirtualEnv template 61 | # Virtualenv 62 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 63 | [Bb]in 64 | [Ii]nclude 65 | [Ll]ib 66 | [Ss]cripts 67 | pyvenv.cfg 68 | pip-selfcheck.json 69 | ### IPythonNotebook template 70 | # Temporary data 71 | .ipynb_checkpoints/ 72 | 73 | # idea 74 | .idea 75 | 76 | #scratch 77 | scratch 78 | # Created by .ignore support plugin (hsz.mobi) 79 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | - "3.3" 5 | - "3.4" 6 | - "3.5" 7 | install: 8 | - pip install -r requirements.txt 9 | - pip install codecov 10 | 11 | script: 12 | - nosetests 13 | # - coverage run py_stringmatching/tests/test_simfunctions.py 14 | # - coverage run py_stringmatching/tests/test_tokenizers.py 15 | 16 | after_success: 17 | - codecov 18 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kvpradap/py_stringmatching/abc3df5d4db5ebfef648c9cc069d95e4468f6f19/CHANGES.txt -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Travis CI Status [![Build Status](https://travis-ci.org/kvpradap/py_stringmatching.svg?branch=master)](https://travis-ci.org/kvpradap/py_stringmatching) 2 | 3 | # py-stringmatching 4 | Python library for string matching! 
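A minimal usage sketch (mirroring the example in docs/Tutorial.rst; assumes the package is already installed):

```python
# Tokenize two strings on whitespace, then compare the resulting
# token lists with cosine similarity (both helpers ship with the package).
from py_stringmatching import simfunctions, tokenizers

x = 'this is a string matching package for data science class'
y = 'this string matching package can be used to generate features'
print(simfunctions.cosine(tokenizers.whitespace(x), tokenizers.whitespace(y)))
```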
5 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | 2 | environment: 3 | 4 | matrix: 5 | - PYTHON: "C:\\Python27_32" 6 | PYTHON_VERSION: "2.7" 7 | PYTHON_ARCH: "32" 8 | CONDA_PY: "27" 9 | 10 | - PYTHON: "C:\\Python27_64" 11 | PYTHON_VERSION: "2.7" 12 | PYTHON_ARCH: "64" 13 | CONDA_PY: "27" 14 | 15 | - PYTHON: "C:\\Python34_32" 16 | PYTHON_VERSION: "3.4" 17 | PYTHON_ARCH: "32" 18 | CONDA_PY: "34" 19 | 20 | - PYTHON: "C:\\Python34_64" 21 | PYTHON_VERSION: "3.4" 22 | PYTHON_ARCH: "64" 23 | CONDA_PY: "34" 24 | 25 | 26 | install: 27 | # this installs the appropriate Miniconda (Py2/Py3, 32/64 bit) 28 | - powershell .\\continuous-integration\\appveyor\\install.ps1 29 | - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%" 30 | 31 | # Don't install from requirements-pip.txt; python-coveralls seems to have broken dependencies on Windows. 32 | - conda install --yes setuptools nose numpy pip coverage 33 | - pip install six python-Levenshtein rednose 34 | - python setup.py install 35 | 36 | build: false 37 | 38 | test_script: 39 | # Nosetests take care of unit tests 40 | # Behave runs the example scripts and tries to verify that they produce the right output 41 | - nosetests 42 | #- behave --tags ~@skip # Everything without the tag @skip 43 | 44 | on_success: 45 | # Could run coveralls here but will leave that to travis tests 46 | - echo Build successful! 47 | #- coverage report 48 | # coveralls 49 | -------------------------------------------------------------------------------- /asv.conf.json: -------------------------------------------------------------------------------- 1 | { 2 | // The version of the config file format. Do not change, unless 3 | // you know what you are doing. 4 | "version": 1, 5 | // The name of the project being benchmarked 6 | "project": ".", 7 | // The project's homepage 8 | "project_url": "https://github.com/kvpradap/py_stringmatching/", 9 | // The URL or local path of the source code repository for the 10 | // project being benchmarked 11 | "repo": "https://github.com/kvpradap/py_stringmatching.git", 12 | //"repo": "/Users/pradap/Documents/Research/Python-Package/py_stringmatching", 13 | 14 | // List of branches to benchmark. If not provided, defaults to "master" 15 | // (for git) or "tip" (for mercurial). 16 | "branches": [ 17 | "master" 18 | ], 19 | // for git 20 | // "branches": ["tip"], // for mercurial 21 | 22 | // The DVCS being used. If not set, it will be automatically 23 | // determined from "repo" by looking at the protocol in the URL 24 | // (if remote), or by looking for special directories, such as 25 | // ".git" (if local). 26 | // "dvcs": "git", 27 | 28 | // The tool to use to create environments. May be "conda", 29 | // "virtualenv" or other value depending on the plugins in use. 30 | // If missing or the empty string, the tool will be automatically 31 | // determined by looking for tools on the PATH environment 32 | // variable. 33 | "environment_type": "conda", 34 | // The base URL to show a commit for the project. 35 | "show_commit_url": "https://github.com/kvpradap/py_stringmatching/commit/", 36 | // The Pythons you'd like to test against. If not provided, defaults 37 | // to the current version of Python used to run `asv`. 38 | "pythons": [ 39 | "2.7", 40 | "3.3", 41 | "3.4" 42 | ], 43 | //"pythons": ["3.3"], 44 | 45 | // The matrix of dependencies to test.
Each key is the name of a 46 | // package (in PyPI) and the values are version numbers. An empty 47 | // list indicates to just test against the default (latest) 48 | // version. 49 | // "matrix": { 50 | // "numpy": ["1.6", "1.7"] 51 | // }, 52 | 53 | // The directory (relative to the current directory) that benchmarks are 54 | // stored in. If not provided, defaults to "benchmarks" 55 | "benchmark_dir": "benchmarks" 56 | 57 | // The directory (relative to the current directory) to cache the Python 58 | // environments in. If not provided, defaults to "env" 59 | // "env_dir": "env", 60 | 61 | 62 | // The directory (relative to the current directory) that raw benchmark 63 | // results are stored in. If not provided, defaults to "results". 64 | // "results_dir": "results", 65 | 66 | // The directory (relative to the current directory) that the html tree 67 | // should be written to. If not provided, defaults to "html". 68 | // "html_dir": "html", 69 | 70 | // The number of characters to retain in the commit hashes. 71 | // "hash_length": 8, 72 | 73 | // `asv` will cache wheels of the recent builds in each 74 | // environment, making them faster to install next time. This is 75 | // number of builds to keep, per environment. 76 | // "wheel_cache_size": 0 77 | } 78 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | _short_string_1 = 'badgerdi' 2 | _short_string_2 = 'diproject' 3 | 4 | _medium_string_1 = 'data integration' 5 | _medium_string_2 = 'entity matching' 6 | 7 | _long_string_1 = 'Data integration involves combining data residing in different sources and ' \ 8 | 'providing users with a unified view of these data' # > 12 9 | _long_string_2 = 'Record linkage (RL) refers to the task of finding records in a data set that ' \ 10 | 'refer to the same entity across different data sources ' 11 | _long_hamm_string1 = 'Data integration involves combining data residing in different sources and ' \ 12 | 'providing users with a unified view of these data' 13 | _long_hamm_string2 = 'Data integration involves combining data residing in different sources and ' \ 14 | 'providing users with a unified vieu of these data' 15 | 16 | _small_num_tokens_wo_rep = ['data', 'integration'] 17 | _small_num_tokens_wi_rep = ['data', 'integration'] 18 | _med_num_tokens_wo_rep = ['data', 'integration', 'involves', 'data', 19 | 'residing', 'in', 'different', 'sources'] 20 | _med_num_tokens_wi_rep = ['data', 'integration', 'involves', 'data', 21 | 'integration', 'in', 'different', 'data'] 22 | 23 | _large_num_tokens_wo_rep = ['Data', 'integration', 'involves', 'combining', 'data', 'residing', 'in', 24 | 'different', 'sources', 'and', 'providing', 'users', 'with', 'a', 'unified', 25 | 'view', 'of', 'these', 'data.', 'This', 'process', 'becomes', 'significant', 'in', 26 | 'a', 'variety', 'of', 'situations.'] 27 | 28 | _large_num_tokens_wi_rep = ['Data', 'integration', 'involves', 'combining', 'data', 'data', 'in', 29 | 'different', 'sources', 'and', 'different', 'users', 'with', 'a', 'unified', 30 | 'view', 'of', 'these', 'data.', 'This', 'data', 'becomes', 'significant', 'in', 31 | 'a', 'different', 'of', 'data.'] 32 | -------------------------------------------------------------------------------- /benchmarks/benchamarks.py: -------------------------------------------------------------------------------- 1 | # Write the benchmarking functions here. 
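# (A note on structure, stated here as an aside: asv collects classes like the ones
# below and treats every method whose name starts with `time_` as a timing
# benchmark, so no explicit registration of the benchmark cases is needed.)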
2 | # See "Writing benchmarks" in the asv docs for more information. 3 | 4 | from py_stringmatching import simfunctions 5 | from . import _short_string_1, _long_string_1, _medium_string_1, _short_string_2, _long_string_2, _medium_string_2 6 | from . import _small_num_tokens_wi_rep, _small_num_tokens_wo_rep, _med_num_tokens_wi_rep, _med_num_tokens_wo_rep, \ 7 | _large_num_tokens_wi_rep, _large_num_tokens_wo_rep, _long_hamm_string1, _long_hamm_string2 8 | 9 | 10 | class TimeAffine: 11 | def time_short_short(self): 12 | simfunctions.affine(_short_string_1, _short_string_2) 13 | 14 | def time_medium_medium(self): 15 | simfunctions.affine(_medium_string_1, _medium_string_2) 16 | 17 | def time_long_long(self): 18 | simfunctions.affine(_long_string_1, _long_string_2) 19 | 20 | def time_short_medium(self): 21 | simfunctions.affine(_short_string_1, _medium_string_1) 22 | 23 | def time_short_long(self): 24 | simfunctions.affine(_short_string_1, _long_string_1) 25 | 26 | def time_medium_long(self): 27 | simfunctions.affine(_medium_string_1, _long_string_1) 28 | 29 | 30 | class TimeJaro: 31 | def time_short_short(self): 32 | simfunctions.jaro(_short_string_1, _short_string_2) 33 | 34 | def time_medium_medium(self): 35 | simfunctions.jaro(_medium_string_1, _medium_string_2) 36 | 37 | def time_long_long(self): 38 | simfunctions.jaro(_long_string_1, _long_string_2) 39 | 40 | def time_short_medium(self): 41 | simfunctions.jaro(_short_string_1, _medium_string_1) 42 | 43 | def time_short_long(self): 44 | simfunctions.jaro(_short_string_1, _long_string_1) 45 | 46 | def time_medium_long(self): 47 | simfunctions.jaro(_medium_string_1, _long_string_1) 48 | 49 | 50 | class TimeJaroWinkler: 51 | def time_short_short(self): 52 | simfunctions.jaro_winkler(_short_string_1, _short_string_2) 53 | 54 | def time_medium_medium(self): 55 | simfunctions.jaro_winkler(_medium_string_1, _medium_string_2) 56 | 57 | def time_long_long(self): 58 | simfunctions.jaro_winkler(_long_string_1, _long_string_2) 59 | 60 | def time_short_medium(self): 61 | simfunctions.jaro_winkler(_short_string_1, _medium_string_1) 62 | 63 | def time_short_long(self): 64 | simfunctions.jaro_winkler(_short_string_1, _long_string_1) 65 | 66 | def time_medium_long(self): 67 | simfunctions.jaro_winkler(_medium_string_1, _long_string_1) 68 | 69 | 70 | class TimeHammingDistance: 71 | def time_short_short(self): 72 | simfunctions.hamming_distance(_short_string_1, _short_string_1) 73 | 74 | def time_medium_medium(self): 75 | simfunctions.hamming_distance(_medium_string_1, _medium_string_1) 76 | 77 | def time_long_long(self): 78 | simfunctions.hamming_distance(_long_hamm_string1, _long_hamm_string2) 79 | 80 | # def time_short_medium(self): 81 | # simfunctions.hamming_distance(_short_string_1, _medium_string_1) 82 | # 83 | # def time_short_long(self): 84 | # simfunctions.hamming_distance(_short_string_1, _long_string_1) 85 | # 86 | # def time_medium_long(self): 87 | # simfunctions.hamming_distance(_medium_string_1, _long_string_1) 88 | 89 | 90 | # 91 | # class TimeJaro1: 92 | # def time_short_short(self): 93 | # Levenshtein.jaro(_short_string_1, _short_string_2) 94 | # 95 | # def time_medium_medium(self): 96 | # Levenshtein.jaro(_medium_string_1, _medium_string_2) 97 | # 98 | # def time_long_long(self): 99 | # Levenshtein.jaro(_long_string_1, _long_string_2) 100 | # 101 | # def time_short_medium(self): 102 | # Levenshtein.jaro(_short_string_1, _medium_string_1) 103 | # 104 | # def time_short_long(self): 105 | # Levenshtein.jaro(_short_string_1, _long_string_1) 
106 | # 107 | # def time_medium_long(self): 108 | # Levenshtein.jaro(_medium_string_1, _long_string_1) 109 | # 110 | # 111 | class TimeLevenshtein: 112 | def time_short_short(self): 113 | simfunctions.levenshtein(_short_string_1, _short_string_2) 114 | 115 | def time_medium_medium(self): 116 | simfunctions.levenshtein(_medium_string_1, _medium_string_2) 117 | 118 | def time_long_long(self): 119 | simfunctions.levenshtein(_long_string_1, _long_string_2) 120 | 121 | def time_short_medium(self): 122 | simfunctions.levenshtein(_short_string_1, _medium_string_1) 123 | 124 | def time_short_long(self): 125 | simfunctions.levenshtein(_short_string_1, _long_string_1) 126 | 127 | def time_medium_long(self): 128 | simfunctions.levenshtein(_medium_string_1, _long_string_1) 129 | 130 | 131 | class TimeNeedlemanWunsch: 132 | def time_short_short(self): 133 | simfunctions.needleman_wunsch(_short_string_1, _short_string_2) 134 | 135 | def time_medium_medium(self): 136 | simfunctions.needleman_wunsch(_medium_string_1, _medium_string_2) 137 | 138 | def time_long_long(self): 139 | simfunctions.needleman_wunsch(_long_string_1, _long_string_2) 140 | 141 | def time_short_medium(self): 142 | simfunctions.needleman_wunsch(_short_string_1, _medium_string_1) 143 | 144 | def time_short_long(self): 145 | simfunctions.needleman_wunsch(_short_string_1, _long_string_1) 146 | 147 | def time_medium_long(self): 148 | simfunctions.needleman_wunsch(_medium_string_1, _long_string_1) 149 | 150 | 151 | class TimeSmithWaterman: 152 | def time_short_short(self): 153 | simfunctions.smith_waterman(_short_string_1, _short_string_2) 154 | 155 | def time_medium_medium(self): 156 | simfunctions.smith_waterman(_medium_string_1, _medium_string_2) 157 | 158 | def time_long_long(self): 159 | simfunctions.smith_waterman(_long_string_1, _long_string_2) 160 | 161 | def time_short_medium(self): 162 | simfunctions.smith_waterman(_short_string_1, _medium_string_1) 163 | 164 | def time_short_long(self): 165 | simfunctions.smith_waterman(_short_string_1, _long_string_1) 166 | 167 | def time_medium_long(self): 168 | simfunctions.smith_waterman(_medium_string_1, _long_string_1) 169 | 170 | 171 | class TimeCosine: 172 | def time_small_small_wo_rep(self): 173 | simfunctions.cosine(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep) 174 | 175 | def time_small_small_wi_rep(self): 176 | simfunctions.cosine(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep) 177 | 178 | def time_medium_medium_wo_rep(self): 179 | simfunctions.cosine(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep) 180 | 181 | def time_medium_medium_wi_rep(self): 182 | simfunctions.cosine(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep) 183 | 184 | def time_large_large_wo_rep(self): 185 | simfunctions.cosine(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep) 186 | 187 | def time_large_large_wi_rep(self): 188 | simfunctions.cosine(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep) 189 | 190 | def time_small_medium_wo_rep(self): 191 | simfunctions.cosine(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep) 192 | 193 | def time_small_medium_wi_rep(self): 194 | simfunctions.cosine(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep) 195 | 196 | def time_small_large_wo_rep(self): 197 | simfunctions.cosine(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep) 198 | 199 | def time_small_large_wi_rep(self): 200 | simfunctions.cosine(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep) 201 | 202 | def time_medium_large_wo_rep(self): 203 | simfunctions.cosine(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep) 
204 | 205 | def time_medium_large_wi_rep(self): 206 | simfunctions.cosine(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep) 207 | 208 | 209 | class TimeJaccard: 210 | def time_small_small_wo_rep(self): 211 | simfunctions.jaccard(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep) 212 | 213 | def time_small_small_wi_rep(self): 214 | simfunctions.jaccard(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep) 215 | 216 | def time_medium_medium_wo_rep(self): 217 | simfunctions.jaccard(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep) 218 | 219 | def time_medium_medium_wi_rep(self): 220 | simfunctions.jaccard(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep) 221 | 222 | def time_large_large_wo_rep(self): 223 | simfunctions.jaccard(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep) 224 | 225 | def time_large_large_wi_rep(self): 226 | simfunctions.jaccard(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep) 227 | 228 | def time_small_medium_wo_rep(self): 229 | simfunctions.jaccard(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep) 230 | 231 | def time_small_medium_wi_rep(self): 232 | simfunctions.jaccard(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep) 233 | 234 | def time_small_large_wo_rep(self): 235 | simfunctions.jaccard(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep) 236 | 237 | def time_small_large_wi_rep(self): 238 | simfunctions.jaccard(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep) 239 | 240 | def time_medium_large_wo_rep(self): 241 | simfunctions.jaccard(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep) 242 | 243 | def time_medium_large_wi_rep(self): 244 | simfunctions.jaccard(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep) 245 | 246 | 247 | class TimeOverlap: 248 | def time_small_small_wo_rep(self): 249 | simfunctions.overlap_coefficient(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep) 250 | 251 | def time_small_small_wi_rep(self): 252 | simfunctions.overlap_coefficient(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep) 253 | 254 | def time_medium_medium_wo_rep(self): 255 | simfunctions.overlap_coefficient(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep) 256 | 257 | def time_medium_medium_wi_rep(self): 258 | simfunctions.overlap_coefficient(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep) 259 | 260 | def time_large_large_wo_rep(self): 261 | simfunctions.overlap_coefficient(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep) 262 | 263 | def time_large_large_wi_rep(self): 264 | simfunctions.overlap_coefficient(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep) 265 | 266 | def time_small_medium_wo_rep(self): 267 | simfunctions.overlap_coefficient(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep) 268 | 269 | def time_small_medium_wi_rep(self): 270 | simfunctions.overlap_coefficient(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep) 271 | 272 | def time_small_large_wo_rep(self): 273 | simfunctions.overlap_coefficient(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep) 274 | 275 | def time_small_large_wi_rep(self): 276 | simfunctions.overlap_coefficient(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep) 277 | 278 | def time_medium_large_wo_rep(self): 279 | simfunctions.overlap_coefficient(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep) 280 | 281 | def time_medium_large_wi_rep(self): 282 | simfunctions.overlap_coefficient(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep) 283 | 284 | 285 | class TimeMongeElkan: 286 | def time_small_small_wo_rep(self): 287 | simfunctions.monge_elkan(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep) 288 | 289 | def time_small_small_wi_rep(self): 290 |
simfunctions.monge_elkan(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep) 291 | 292 | def time_medium_medium_wo_rep(self): 293 | simfunctions.monge_elkan(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep) 294 | 295 | def time_medium_medium_wi_rep(self): 296 | simfunctions.monge_elkan(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep) 297 | 298 | def time_large_large_wo_rep(self): 299 | simfunctions.monge_elkan(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep) 300 | 301 | def time_large_large_wi_rep(self): 302 | simfunctions.monge_elkan(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep) 303 | 304 | def time_small_medium_wo_rep(self): 305 | simfunctions.monge_elkan(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep) 306 | 307 | def time_small_medium_wi_rep(self): 308 | simfunctions.monge_elkan(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep) 309 | 310 | 311 | class TimeTfIdf: 312 | corpus_list = [_small_num_tokens_wo_rep, _small_num_tokens_wi_rep, _med_num_tokens_wi_rep, _med_num_tokens_wo_rep, 313 | _large_num_tokens_wo_rep, _large_num_tokens_wi_rep] 314 | 315 | def time_small_small_wo_rep_no_corpus_no_dampen(self): 316 | simfunctions.tfidf(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep) 317 | 318 | def time_small_small_wi_rep_no_corpus_no_dampen(self): 319 | simfunctions.tfidf(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep) 320 | 321 | def time_medium_medium_wo_rep_no_corpus_no_dampen(self): 322 | simfunctions.tfidf(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep) 323 | 324 | def time_medium_medium_wi_rep_no_corpus_no_dampen(self): 325 | simfunctions.tfidf(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep) 326 | 327 | def time_large_large_wo_rep_no_corpus_no_dampen(self): 328 | simfunctions.tfidf(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep) 329 | 330 | def time_large_large_wi_rep_no_corpus_no_dampen(self): 331 | simfunctions.tfidf(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep) 332 | 333 | def time_small_medium_wo_rep_no_corpus_no_dampen(self): 334 | simfunctions.tfidf(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep) 335 | 336 | def time_small_medium_wi_rep_no_corpus_no_dampen(self): 337 | simfunctions.tfidf(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep) 338 | 339 | def time_small_large_wo_rep_no_corpus_no_dampen(self): 340 | simfunctions.tfidf(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep) 341 | 342 | def time_small_large_wi_rep_no_corpus_no_dampen(self): 343 | simfunctions.tfidf(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep) 344 | 345 | def time_medium_large_wo_rep_no_corpus_no_dampen(self): 346 | simfunctions.tfidf(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep) 347 | 348 | def time_medium_large_wi_rep_no_corpus_no_dampen(self): 349 | simfunctions.tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep) 350 | 351 | # dampen - true 352 | def time_small_small_wo_rep_no_corpus(self): 353 | simfunctions.tfidf(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep, dampen=True) 354 | 355 | def time_small_small_wi_rep_no_corpus(self): 356 | simfunctions.tfidf(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep, dampen=True) 357 | 358 | def time_medium_medium_wo_rep_no_corpus(self): 359 | simfunctions.tfidf(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep, dampen=True) 360 | 361 | def time_medium_medium_wi_rep_no_corpus(self): 362 | simfunctions.tfidf(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep, dampen=True) 363 | 364 | def time_large_large_wo_rep_no_corpus(self): 365 | simfunctions.tfidf(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep, dampen=True) 366 | 367 | def
time_large_large_wi_rep_no_corpus(self): 368 | simfunctions.tfidf(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep, dampen=True) 369 | 370 | def time_small_medium_wo_rep_no_corpus(self): 371 | simfunctions.tfidf(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep, dampen=True) 372 | 373 | def time_small_medium_wi_rep_no_corpus(self): 374 | simfunctions.tfidf(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep, dampen=True) 375 | 376 | def time_small_large_wo_rep_no_corpus(self): 377 | simfunctions.tfidf(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep, dampen=True) 378 | 379 | def time_small_large_wi_rep_no_corpus(self): 380 | simfunctions.tfidf(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep, dampen=True) 381 | 382 | def time_medium_large_wo_rep_no_corpus(self): 383 | simfunctions.tfidf(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep, dampen=True) 384 | 385 | def time_medium_large_wi_rep_no_corpus(self): 386 | simfunctions.tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep, dampen=True) 387 | 388 | # corpus list - true 389 | def time_small_small_wo_rep_no_dampen(self): 390 | simfunctions.tfidf(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep, corpus_list=self.corpus_list) 391 | 392 | def time_small_small_wi_rep_no_dampen(self): 393 | simfunctions.tfidf(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep, corpus_list=self.corpus_list) 394 | 395 | def time_medium_medium_wo_rep_no_dampen(self): 396 | simfunctions.tfidf(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep, corpus_list=self.corpus_list) 397 | 398 | def time_medium_medium_wi_rep_no_dampen(self): 399 | simfunctions.tfidf(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep, corpus_list=self.corpus_list) 400 | 401 | def time_large_large_wo_rep_no_dampen(self): 402 | simfunctions.tfidf(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list) 403 | 404 | def time_large_large_wi_rep_no_dampen(self): 405 | simfunctions.tfidf(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list) 406 | 407 | def time_small_medium_wo_rep_no_dampen(self): 408 | simfunctions.tfidf(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep, corpus_list=self.corpus_list) 409 | 410 | def time_small_medium_wi_rep_no_dampen(self): 411 | simfunctions.tfidf(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep, corpus_list=self.corpus_list) 412 | 413 | def time_small_large_wo_rep_no_dampen(self): 414 | simfunctions.tfidf(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list) 415 | 416 | def time_small_large_wi_rep_no_dampen(self): 417 | simfunctions.tfidf(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list) 418 | 419 | def time_medium_large_wo_rep_no_dampen(self): 420 | simfunctions.tfidf(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list) 421 | 422 | def time_medium_large_wi_rep_no_dampen(self): 423 | simfunctions.tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list) 424 | 425 | # corpus list - true, dampen - true 426 | def time_small_small_wo_rep(self): 427 | simfunctions.tfidf(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep, corpus_list=self.corpus_list, 428 | dampen=True) 429 | 430 | def time_small_small_wi_rep(self): 431 | simfunctions.tfidf(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep, corpus_list=self.corpus_list, 432 | dampen=True) 433 | 434 | def time_medium_medium_wo_rep(self): 435 | simfunctions.tfidf(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep, corpus_list=self.corpus_list, 436 | dampen=True)
437 | 438 | def time_medium_medium_wi_rep(self): 439 | simfunctions.tfidf(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep, corpus_list=self.corpus_list, 440 | dampen=True) 441 | 442 | def time_large_large_wo_rep(self): 443 | simfunctions.tfidf(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list, 444 | dampen=True) 445 | 446 | def time_large_large_wi_rep(self): 447 | simfunctions.tfidf(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list, 448 | dampen=True) 449 | 450 | def time_small_medium_wo_rep(self): 451 | simfunctions.tfidf(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep, corpus_list=self.corpus_list, dampen=True) 452 | 453 | def time_small_medium_wi_rep(self): 454 | simfunctions.tfidf(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep, corpus_list=self.corpus_list, dampen=True) 455 | 456 | def time_small_large_wo_rep(self): 457 | simfunctions.tfidf(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list, 458 | dampen=True) 459 | 460 | def time_small_large_wi_rep(self): 461 | simfunctions.tfidf(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list, 462 | dampen=True) 463 | 464 | def time_medium_large_wo_rep(self): 465 | simfunctions.tfidf(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list, dampen=True) 466 | 467 | def time_medium_large_wi_rep(self): 468 | simfunctions.tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list, dampen=True) 469 | 470 | 471 | class TimeSoftTfIdf: 472 | corpus_list = [_small_num_tokens_wo_rep, _small_num_tokens_wi_rep, _med_num_tokens_wi_rep, _med_num_tokens_wo_rep, 473 | _large_num_tokens_wo_rep, _large_num_tokens_wi_rep] 474 | 475 | # no corpus list 476 | def time_small_small_wo_rep_no_corpus(self): 477 | simfunctions.soft_tfidf(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep) 478 | 479 | def time_small_small_wi_rep_no_corpus(self): 480 | simfunctions.soft_tfidf(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep) 481 | 482 | def time_medium_medium_wo_rep_no_corpus(self): 483 | simfunctions.soft_tfidf(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep) 484 | 485 | def time_medium_medium_wi_rep_no_corpus(self): 486 | simfunctions.soft_tfidf(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep) 487 | 488 | def time_large_large_wo_rep_no_corpus(self): 489 | simfunctions.soft_tfidf(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep) 490 | 491 | def time_large_large_wi_rep_no_corpus(self): 492 | simfunctions.soft_tfidf(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep) 493 | 494 | def time_small_medium_wo_rep_no_corpus(self): 495 | simfunctions.soft_tfidf(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep) 496 | 497 | def time_small_medium_wi_rep_no_corpus(self): 498 | simfunctions.soft_tfidf(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep) 499 | 500 | def time_small_large_wo_rep_no_corpus(self): 501 | simfunctions.soft_tfidf(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep) 502 | 503 | def time_small_large_wi_rep_no_corpus(self): 504 | simfunctions.soft_tfidf(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep) 505 | 506 | def time_medium_large_wo_rep_no_corpus(self): 507 | simfunctions.soft_tfidf(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep) 508 | 509 | def time_medium_large_wi_rep_no_corpus(self): 510 | simfunctions.soft_tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep) 511 | 512 | # with corpus list 513 | def time_small_small_wo_rep(self): 514 |
simfunctions.soft_tfidf(_small_num_tokens_wo_rep, _small_num_tokens_wo_rep, corpus_list=self.corpus_list) 515 | 516 | def time_small_small_wi_rep(self): 517 | simfunctions.soft_tfidf(_small_num_tokens_wi_rep, _small_num_tokens_wi_rep, corpus_list=self.corpus_list) 518 | 519 | def time_medium_medium_wo_rep(self): 520 | simfunctions.soft_tfidf(_med_num_tokens_wo_rep, _med_num_tokens_wo_rep, corpus_list=self.corpus_list) 521 | 522 | def time_medium_medium_wi_rep(self): 523 | simfunctions.soft_tfidf(_med_num_tokens_wi_rep, _med_num_tokens_wi_rep, corpus_list=self.corpus_list) 524 | 525 | def time_large_large_wo_rep(self): 526 | simfunctions.soft_tfidf(_large_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list) 527 | 528 | def time_large_large_wi_rep(self): 529 | simfunctions.soft_tfidf(_large_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list) 530 | 531 | def time_small_medium_wo_rep(self): 532 | simfunctions.soft_tfidf(_small_num_tokens_wo_rep, _med_num_tokens_wo_rep, corpus_list=self.corpus_list) 533 | 534 | def time_small_medium_wi_rep(self): 535 | simfunctions.soft_tfidf(_small_num_tokens_wi_rep, _med_num_tokens_wi_rep, corpus_list=self.corpus_list) 536 | 537 | def time_small_large_wo_rep(self): 538 | simfunctions.soft_tfidf(_small_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list) 539 | 540 | def time_small_large_wi_rep(self): 541 | simfunctions.soft_tfidf(_small_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list) 542 | 543 | def time_medium_large_wo_rep(self): 544 | simfunctions.soft_tfidf(_med_num_tokens_wo_rep, _large_num_tokens_wo_rep, corpus_list=self.corpus_list) 545 | 546 | def time_medium_large_wi_rep(self): 547 | simfunctions.soft_tfidf(_med_num_tokens_wi_rep, _large_num_tokens_wi_rep, corpus_list=self.corpus_list) 548 | -------------------------------------------------------------------------------- /continuous-integration/appveyor/install.ps1: -------------------------------------------------------------------------------- 1 | # Sample script to install Miniconda under Windows 2 | # Authors: Olivier Grisel, Jonathan Helmus and Kyle Kastner, Robert McGibbon 3 | # License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 4 | 5 | $MINICONDA_URL = "http://repo.continuum.io/miniconda/" 6 | 7 | 8 | function DownloadMiniconda ($python_version, $platform_suffix) { 9 | $webclient = New-Object System.Net.WebClient 10 | if ($python_version -match "3.4") { 11 | $filename = "Miniconda3-latest-Windows-" + $platform_suffix + ".exe" 12 | } else { 13 | $filename = "Miniconda-latest-Windows-" + $platform_suffix + ".exe" 14 | } 15 | $url = $MINICONDA_URL + $filename 16 | 17 | $basedir = $pwd.Path + "\" 18 | $filepath = $basedir + $filename 19 | if (Test-Path $filename) { 20 | Write-Host "Reusing" $filepath 21 | return $filepath 22 | } 23 | 24 | # Download and retry up to 3 times in case of network transient errors.
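# (Design note: transient failures are swallowed inside the loop below, with a
# one-second pause between attempts; only if the file still does not exist
# afterwards is the download re-issued outside try/catch so the real error propagates.)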
25 | Write-Host "Downloading" $filename "from" $url 26 | $retry_attempts = 2 27 | for($i=0; $i -lt $retry_attempts; $i++){ 28 | try { 29 | $webclient.DownloadFile($url, $filepath) 30 | break 31 | } 32 | Catch [Exception]{ 33 | Start-Sleep 1 34 | } 35 | } 36 | if (Test-Path $filepath) { 37 | Write-Host "File saved at" $filepath 38 | } else { 39 | # Retry once to get the error message if any at the last try 40 | $webclient.DownloadFile($url, $filepath) 41 | } 42 | return $filepath 43 | } 44 | 45 | 46 | function InstallMiniconda ($python_version, $architecture, $python_home) { 47 | Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home 48 | if (Test-Path $python_home) { 49 | Write-Host $python_home "already exists, skipping." 50 | return $false 51 | } 52 | if ($architecture -match "32") { 53 | $platform_suffix = "x86" 54 | } else { 55 | $platform_suffix = "x86_64" 56 | } 57 | 58 | $filepath = DownloadMiniconda $python_version $platform_suffix 59 | Write-Host "Installing" $filepath "to" $python_home 60 | $install_log = $python_home + ".log" 61 | $args = "/S /D=$python_home" 62 | Write-Host $filepath $args 63 | Start-Process -FilePath $filepath -ArgumentList $args -Wait -Passthru 64 | if (Test-Path $python_home) { 65 | Write-Host "Python $python_version ($architecture) installation complete" 66 | } else { 67 | Write-Host "Failed to install Python in $python_home" 68 | Get-Content -Path $install_log 69 | Exit 1 70 | } 71 | } 72 | 73 | 74 | function InstallCondaPackages ($python_home, $spec) { 75 | $conda_path = $python_home + "\Scripts\conda.exe" 76 | $args = "install --yes " + $spec 77 | Write-Host ("conda " + $args) 78 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 79 | } 80 | 81 | function UpdateConda ($python_home) { 82 | $conda_path = $python_home + "\Scripts\conda.exe" 83 | Write-Host "Updating conda..." 
84 | $args = "update --yes conda" 85 | Write-Host $conda_path $args 86 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 87 | } 88 | 89 | 90 | function main () { 91 | InstallMiniconda $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON 92 | UpdateConda $env:PYTHON 93 | InstallCondaPackages $env:PYTHON "conda-build jinja2 anaconda-client" 94 | } 95 | 96 | main 97 | 98 | -------------------------------------------------------------------------------- /continuous-integration/appveyor/rm_rf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import sys 4 | import stat 5 | import shutil 6 | 7 | def remove_readonly(func, path, excinfo): 8 | os.chmod(path, stat.S_IWRITE) 9 | func(path) 10 | 11 | def main(): 12 | print(sys.executable) 13 | try: 14 | shutil.rmtree(sys.argv[1], onerror=remove_readonly) 15 | except Exception as e: 16 | print("Error") 17 | print(e) 18 | 19 | if __name__ == '__main__': 20 | main() 21 | 22 | -------------------------------------------------------------------------------- /continuous-integration/appveyor/run_with_env.cmd: -------------------------------------------------------------------------------- 1 | :: To build extensions for 64 bit Python 3, we need to configure environment 2 | :: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: 3 | :: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1) 4 | :: 5 | :: To build extensions for 64 bit Python 2, we need to configure environment 6 | :: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of: 7 | :: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0) 8 | :: 9 | :: 32 bit builds do not require specific environment configurations. 10 | :: 11 | :: Note: this script needs to be run with the /E:ON and /V:ON flags for the 12 | :: cmd interpreter, at least for (SDK v7.0) 13 | :: 14 | :: More details at: 15 | :: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows 16 | :: http://stackoverflow.com/a/13751649/163740 17 | :: 18 | :: Author: Olivier Grisel 19 | :: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 20 | @ECHO OFF 21 | 22 | SET COMMAND_TO_RUN=%* 23 | SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows 24 | 25 | SET MAJOR_PYTHON_VERSION="%PYTHON_VERSION:~0,1%" 26 | IF %MAJOR_PYTHON_VERSION% == "2" ( 27 | SET WINDOWS_SDK_VERSION="v7.0" 28 | ) ELSE IF %MAJOR_PYTHON_VERSION% == "3" ( 29 | SET WINDOWS_SDK_VERSION="v7.1" 30 | ) ELSE ( 31 | ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" 32 | EXIT 1 33 | ) 34 | 35 | IF "%PYTHON_ARCH%"=="64" ( 36 | ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture 37 | SET DISTUTILS_USE_SDK=1 38 | SET MSSdk=1 39 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% 40 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release 41 | ECHO Executing: %COMMAND_TO_RUN% 42 | call %COMMAND_TO_RUN% || EXIT 1 43 | ) ELSE ( 44 | ECHO Using default MSVC build environment for 32 bit architecture 45 | ECHO Executing: %COMMAND_TO_RUN% 46 | call %COMMAND_TO_RUN% || EXIT 1 47 | ) 48 | 49 | -------------------------------------------------------------------------------- /docs/API.rst: -------------------------------------------------------------------------------- 1 | === 2 | API 3 | === 4 | .. 
toctree:: 5 | :maxdepth: 2 6 | 7 | Tokenizers 8 | Simfunctions 9 | -------------------------------------------------------------------------------- /docs/Installation.rst: -------------------------------------------------------------------------------- 1 | ============ 2 | Installation 3 | ============ 4 | This page describes the requirements and dependencies, and gives step-by-step instructions 5 | to install the py_stringmatching package. 6 | 7 | Requirements 8 | ------------ 9 | * Python 2.7 or Python 3.3+ 10 | 11 | Dependencies 12 | ------------ 13 | * numpy>=1.7.0 14 | * six 15 | * python-Levenshtein >= 0.12.0 16 | 17 | 18 | .. note:: 19 | The user need not install these dependency packages before installing the py_stringmatching package. 20 | The py_stringmatching installer will automatically install the required packages. 21 | 22 | 23 | Step-by-Step Installation Instructions 24 | --------------------------------------- 25 | Step 1: Download the py_stringmatching package from `here 26 | `_ 27 | into your home directory. 28 | 29 | You can download it into any directory within your home directory. For now, we assume that you use a 30 | Linux operating system and will download it into "HOME/", the top level. 31 | 32 | Also, we assume that you have sufficient privileges to install a Python package. 33 | 34 | Step 2: Unzip the package by executing the following command:: 35 | 36 | tar -xzvf py_stringmatching.tar.gz 37 | 38 | py_stringmatching will be unpacked into the directory "HOME/py_stringmatching-0.1". 39 | 40 | 41 | Step 3: At the command prompt execute the following commands:: 42 | 43 | cd HOME/py_stringmatching-0.1 44 | python setup.py install 45 | 46 | This will install the py_stringmatching package. 47 | 48 | .. note:: 49 | 50 | If the package installation requires root permission, you can install it in 51 | your home directory like this:: 52 | 53 | python setup.py install --user 54 | 55 | For more information, see this Stack Overflow `link 56 | `_. 57 | 58 | Supported Platforms 59 | ------------------- 60 | The package is tested primarily on OS X and Linux, but given its minimal dependencies it should also work on Windows. 61 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
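# Typical usage: `make html` builds the HTML docs into $(BUILDDIR)/html (see the help target below).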
21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/py_stringmatching.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/py_stringmatching.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 
101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/py_stringmatching" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/py_stringmatching" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. 
The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 193 | -------------------------------------------------------------------------------- /docs/Simfunctions.rst: -------------------------------------------------------------------------------- 1 | Similarity Functions 2 | ==================== 3 | 4 | .. automodule:: py_stringmatching.simfunctions 5 | 6 | 7 | .. autofunction:: levenshtein(string1, string2) 8 | .. autofunction:: hamming_distance(string1, string2) 9 | .. autofunction:: jaro(string1, string2) 10 | .. autofunction:: jaro_winkler(string1, string2, prefix_weight=0.1) 11 | .. autofunction:: needleman_wunsch(string1, string2, gap_cost=1, sim_score=sim_ident) 12 | .. autofunction:: smith_waterman(string1, string2, gap_cost=1, sim_score=sim_ident) 13 | .. autofunction:: affine(string1, string2, gap_start=1, gap_continuation=0.5, sim_score=sim_ident) 14 | .. autofunction:: jaccard(set1, set2) 15 | .. autofunction:: overlap_coefficient(set1, set2) 16 | .. autofunction:: cosine(set1, set2) 17 | .. autofunction:: monge_elkan(bag1, bag2, sim_func=levenshtein) 18 | .. autofunction:: tfidf(bag1, bag2, corpus_list=None, dampen=False) 19 | .. autofunction:: soft_tfidf(bag1, bag2, corpus_list=None, sim_func=jaro, threshold=0.5) -------------------------------------------------------------------------------- /docs/Tokenizers.rst: -------------------------------------------------------------------------------- 1 | Tokenizers 2 | ========== 3 | 4 | .. automodule:: py_stringmatching.tokenizers 5 | :show-inheritance: 6 | 7 | .. autofunction:: delimiter(input_string, delim_str=' ') 8 | .. autofunction:: whitespace(input_string) 9 | .. autofunction:: qgram(input_string, qval=2) 10 | 11 | 12 | -------------------------------------------------------------------------------- /docs/Tutorial.rst: -------------------------------------------------------------------------------- 1 | Tutorial 2 | ======== 3 | Once the package is installed, the user can import the similarity functions and tokenizers 4 | like this:: 5 | 6 | from py_stringmatching import simfunctions, tokenizers 7 | 8 | The tokenizers and similarity functions can then be used together like this:: 9 | 10 | x = 'this is a string matching package for data science class' 11 | y = 'this string matching package can be used to generate features' 12 | f = simfunctions.cosine(tokenizers.whitespace(x), tokenizers.whitespace(y)) 13 | 14 | 15 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # py_stringmatching documentation build configuration file, created by 4 | # sphinx-quickstart on Mon Feb 1 13:42:26 2016. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | import sys 16 | 17 | # add path - warning: needs to be updated based on package path 18 | # sys.path.append('/scratch/pradap/python-work/py_stringmatching') 19 | sys.path.append('/Users/pradap/Documents/Research/Python-Package/py_stringmatching') 20 | 21 | # If extensions (or modules to document with autodoc) are in another directory, 22 | # add these directories to sys.path here.
If the directory is relative to the 23 | # documentation root, use os.path.abspath to make it absolute, like shown here. 24 | # sys.path.insert(0, os.path.abspath('.')) 25 | 26 | # -- General configuration ------------------------------------------------ 27 | 28 | # If your documentation needs a minimal Sphinx version, state it here. 29 | # needs_sphinx = '1.0' 30 | 31 | # Add any Sphinx extension module names here, as strings. They can be 32 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 33 | # ones. 34 | extensions = ['sphinx.ext.autodoc', 'sphinx.ext.napoleon', 35 | 'sphinx.ext.doctest', 36 | 'sphinx.ext.intersphinx', 37 | 'sphinx.ext.mathjax', 38 | 'sphinx.ext.ifconfig', 39 | 'sphinx.ext.viewcode', 40 | ] 41 | 42 | # Napoleon settings 43 | napoleon_google_docstring = True 44 | napoleon_numpy_docstring = True 45 | napoleon_include_private_with_doc = False 46 | napoleon_include_special_with_doc = True 47 | napoleon_use_admonition_for_examples = False 48 | napoleon_use_admonition_for_notes = False 49 | napoleon_use_admonition_for_references = False 50 | napoleon_use_ivar = False 51 | napoleon_use_param = True 52 | napoleon_use_rtype = True 53 | 54 | # Add any paths that contain templates here, relative to this directory. 55 | templates_path = ['_templates'] 56 | 57 | # The suffix(es) of source filenames. 58 | # You can specify multiple suffix as a list of string: 59 | # source_suffix = ['.rst', '.md'] 60 | source_suffix = '.rst' 61 | 62 | # The encoding of source files. 63 | # source_encoding = 'utf-8-sig' 64 | 65 | # The master toctree document. 66 | master_doc = 'index' 67 | 68 | # General information about the project. 69 | project = u'py_stringmatching' 70 | copyright = u'2016, Magellan Team' 71 | author = u'Magellan Team' 72 | 73 | # The version info for the project you're documenting, acts as replacement for 74 | # |version| and |release|, also used in various other places throughout the 75 | # built documents. 76 | # 77 | # The short X.Y version. 78 | version = '0.1' 79 | # The full version, including alpha/beta/rc tags. 80 | release = '0.1' 81 | 82 | # The language for content autogenerated by Sphinx. Refer to documentation 83 | # for a list of supported languages. 84 | # 85 | # This is also used if you do content translation via gettext catalogs. 86 | # Usually you set "language" from the command line for these cases. 87 | language = None 88 | 89 | # There are two options for replacing |today|: either, you set today to some 90 | # non-false value, then it is used: 91 | # today = '' 92 | # Else, today_fmt is used as the format for a strftime call. 93 | # today_fmt = '%B %d, %Y' 94 | 95 | # List of patterns, relative to source directory, that match files and 96 | # directories to ignore when looking for source files. 97 | exclude_patterns = ['_build'] 98 | 99 | # The reST default role (used for this markup: `text`) to use for all 100 | # documents. 101 | # default_role = None 102 | 103 | # If true, '()' will be appended to :func: etc. cross-reference text. 104 | # add_function_parentheses = True 105 | 106 | # If true, the current module name will be prepended to all description 107 | # unit titles (such as .. function::). 108 | add_module_names = True 109 | 110 | # If true, sectionauthor and moduleauthor directives will be shown in the 111 | # output. They are ignored by default. 112 | # show_authors = False 113 | 114 | # The name of the Pygments (syntax highlighting) style to use. 
115 | pygments_style = 'sphinx' 116 | 117 | # A list of ignored prefixes for module index sorting. 118 | # modindex_common_prefix = [] 119 | 120 | # If true, keep warnings as "system message" paragraphs in the built documents. 121 | # keep_warnings = False 122 | 123 | # If true, `todo` and `todoList` produce output, else they produce nothing. 124 | todo_include_todos = False 125 | 126 | # -- Options for HTML output ---------------------------------------------- 127 | 128 | # The theme to use for HTML and HTML Help pages. See the documentation for 129 | # a list of builtin themes. 130 | html_theme = 'sphinx_rtd_theme' 131 | 132 | # Theme options are theme-specific and customize the look and feel of a theme 133 | # further. For a list of options available for each theme, see the 134 | # documentation. 135 | # html_theme_options = {} 136 | 137 | # Add any paths that contain custom themes here, relative to this directory. 138 | # html_theme_path = sphinx_bootstrap_theme.get_html_theme_path() 139 | 140 | # The name for this set of Sphinx documents. If None, it defaults to 141 | # " v documentation". 142 | # html_title = None 143 | 144 | # A shorter title for the navigation bar. Default is the same as html_title. 145 | # html_short_title = None 146 | 147 | # The name of an image file (relative to this directory) to place at the top 148 | # of the sidebar. 149 | # html_logo = None 150 | 151 | # The name of an image file (within the static path) to use as favicon of the 152 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 153 | # pixels large. 154 | # html_favicon = None 155 | 156 | # Add any paths that contain custom static files (such as style sheets) here, 157 | # relative to this directory. They are copied after the builtin static files, 158 | # so a file named "default.css" will overwrite the builtin "default.css". 159 | html_static_path = ['_static'] 160 | 161 | # Add any extra paths that contain custom files (such as robots.txt or 162 | # .htaccess) here, relative to this directory. These files are copied 163 | # directly to the root of the documentation. 164 | # html_extra_path = [] 165 | 166 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, 167 | # using the given strftime format. 168 | # html_last_updated_fmt = '%b %d, %Y' 169 | 170 | # If true, SmartyPants will be used to convert quotes and dashes to 171 | # typographically correct entities. 172 | # html_use_smartypants = True 173 | 174 | # Custom sidebar templates, maps document names to template names. 175 | # html_sidebars = {} 176 | 177 | # Additional templates that should be rendered to pages, maps page names to 178 | # template names. 179 | # html_additional_pages = {} 180 | 181 | # If false, no module index is generated. 182 | # html_domain_indices = True 183 | 184 | # If false, no index is generated. 185 | # html_use_index = True 186 | 187 | # If true, the index is split into individual pages for each letter. 188 | # html_split_index = False 189 | 190 | # If true, links to the reST sources are added to the pages. 191 | # html_show_sourcelink = True 192 | 193 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 194 | # html_show_sphinx = True 195 | 196 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. 197 | # html_show_copyright = True 198 | 199 | # If true, an OpenSearch description file will be output, and all pages will 200 | # contain a tag referring to it. 
The value of this option must be the 201 | # base URL from which the finished HTML is served. 202 | # html_use_opensearch = '' 203 | 204 | # This is the file name suffix for HTML files (e.g. ".xhtml"). 205 | # html_file_suffix = None 206 | 207 | # Language to be used for generating the HTML full-text search index. 208 | # Sphinx supports the following languages: 209 | # 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' 210 | # 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' 211 | # html_search_language = 'en' 212 | 213 | # A dictionary with options for the search language support, empty by default. 214 | # Now only 'ja' uses this config value 215 | # html_search_options = {'type': 'default'} 216 | 217 | # The name of a javascript file (relative to the configuration directory) that 218 | # implements a search results scorer. If empty, the default will be used. 219 | # html_search_scorer = 'scorer.js' 220 | 221 | # Output file base name for HTML help builder. 222 | htmlhelp_basename = 'py_stringmatchingdoc' 223 | 224 | # -- Options for LaTeX output --------------------------------------------- 225 | 226 | latex_elements = { 227 | # The paper size ('letterpaper' or 'a4paper'). 228 | # 'papersize': 'letterpaper', 229 | 230 | # The font size ('10pt', '11pt' or '12pt'). 231 | # 'pointsize': '10pt', 232 | 233 | # Additional stuff for the LaTeX preamble. 234 | # 'preamble': '', 235 | 236 | # Latex figure (float) alignment 237 | # 'figure_align': 'htbp', 238 | } 239 | 240 | # Grouping the document tree into LaTeX files. List of tuples 241 | # (source start file, target name, title, 242 | # author, documentclass [howto, manual, or own class]). 243 | latex_documents = [ 244 | (master_doc, 'py_stringmatching.tex', u'py\\_stringmatching Documentation', 245 | u'Magellan Team', 'manual'), 246 | ] 247 | 248 | # The name of an image file (relative to this directory) to place at the top of 249 | # the title page. 250 | # latex_logo = None 251 | 252 | # For "manual" documents, if this is true, then toplevel headings are parts, 253 | # not chapters. 254 | # latex_use_parts = False 255 | 256 | # If true, show page references after internal links. 257 | # latex_show_pagerefs = False 258 | 259 | # If true, show URL addresses after external links. 260 | # latex_show_urls = False 261 | 262 | # Documents to append as an appendix to all manuals. 263 | # latex_appendices = [] 264 | 265 | # If false, no module index is generated. 266 | # latex_domain_indices = True 267 | 268 | 269 | # -- Options for manual page output --------------------------------------- 270 | 271 | # One entry per manual page. List of tuples 272 | # (source start file, name, description, authors, manual section). 273 | man_pages = [ 274 | (master_doc, 'py_stringmatching', u'py_stringmatching Documentation', 275 | [author], 1) 276 | ] 277 | 278 | # If true, show URL addresses after external links. 279 | # man_show_urls = False 280 | 281 | 282 | # -- Options for Texinfo output ------------------------------------------- 283 | 284 | # Grouping the document tree into Texinfo files. List of tuples 285 | # (source start file, target name, title, author, 286 | # dir menu entry, description, category) 287 | texinfo_documents = [ 288 | (master_doc, 'py_stringmatching', u'py_stringmatching Documentation', 289 | author, 'py_stringmatching', 'One line description of project.', 290 | 'Miscellaneous'), 291 | ] 292 | 293 | # Documents to append as an appendix to all manuals. 294 | # texinfo_appendices = [] 295 | 296 | # If false, no module index is generated. 
297 | # texinfo_domain_indices = True 298 | 299 | # How to display URL addresses: 'footnote', 'no', or 'inline'. 300 | # texinfo_show_urls = 'footnote' 301 | 302 | # If true, do not generate a @detailmenu in the "Top" node's menu. 303 | # texinfo_no_detailmenu = False 304 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to py_stringmatching's documentation! 2 | ============================================= 3 | 4 | Contents: 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | Installation 10 | Tutorial 11 | API 12 | 13 | Indices and tables 14 | ================== 15 | 16 | * :ref:`genindex` 17 | * :ref:`modindex` 18 | * :ref:`search` 19 | 20 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=_build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% . 10 | set I18NSPHINXOPTS=%SPHINXOPTS% . 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 2> nul 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 
67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\py_stringmatching.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\py_stringmatching.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 
185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 
260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /py_stringmatching/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = 0.1 2 | -------------------------------------------------------------------------------- /py_stringmatching/compat.py: -------------------------------------------------------------------------------- 1 | """py-stringmatching.compat.py 2 | The compat module defines some variables to enable Python 2 and Python 3 3 | compatibility within a single codebase 4 | The following are defined: 5 | - _range -- use in place of xrange/range 6 | - _unicode -- use in place of unicode/str 7 | - _unichr -- use in place of unichr/chr 8 | - _long -- use in place of long/int 9 | And: 10 | - numeric_type -- defines the set of numeric types 11 | """ 12 | 13 | import sys 14 | 15 | # pylint: disable=invalid-name 16 | if sys.version_info[0] == 3: # pragma: no cover 17 | _range = range 18 | _unicode = str 19 | _unichr = chr 20 | _long = int 21 | numeric_type = (int, float, complex) 22 | else: # pragma: no cover 23 | _range = xrange 24 | _unicode = unicode 25 | _unichr = unichr 26 | _long = long 27 | numeric_type = (int, long, float, complex) 28 | -------------------------------------------------------------------------------- /py_stringmatching/simfunctions.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | from __future__ import division 3 | from __future__ import unicode_literals 4 | 5 | import collections 6 | import math 7 | 8 | import Levenshtein 9 | import numpy as np 10 | 11 | from py_stringmatching import utils 12 | # noinspection PyProtectedMember,PyProtectedMember 13 | from .compat import _range 14 | 15 | 16 | def sim_ident(s1, s2): 17 | return int(s1 == s2) 18 | 19 | 20 | # ---------------------- sequence based similarity measures ---------------------- 21 | 22 | 23 | def affine(string1, string2, gap_start=1, gap_continuation=0.5, sim_score=sim_ident): 24 | """ 25 | Computes the Affine gap score between two strings. 26 | 27 | The Affine gap measure is an extension of the Needleman-Wunsch measure that handles the longer gaps more 28 | gracefully. 29 | 30 | For more information refer to string matching chapter in the DI book. 31 | 32 | Args: 33 | string1,string2 (str) : Input strings 34 | 35 | gap_start (float): Cost for the gap at the start (defaults to 1) 36 | 37 | gap_continuation (float) : Cost for the gap continuation (defaults to 0.5) 38 | 39 | sim_score (function) : Function computing similarity score between two chars, represented as strings 40 | (defaults to identity). 41 | 42 | Returns: 43 | Affine gap score (float) 44 | 45 | Raises: 46 | TypeError : If the inputs are not strings or if one of the inputs is None. 
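Note:
    In outline, the implementation below uses the standard three-matrix affine-gap recurrence, where M scores alignments ending in a match or mismatch and X, Y score alignments ending in a gap:

    :math:`M[i][j] = sim\\_score(x_i, y_j) + \\max(M[i-1][j-1], X[i-1][j-1], Y[i-1][j-1])`

    :math:`X[i][j] = \\max(M[i-1][j] - gap\\_start, X[i-1][j] - gap\\_continuation)`

    :math:`Y[i][j] = \\max(M[i][j-1] - gap\\_start, Y[i][j-1] - gap\\_continuation)`

    The returned score is the maximum of the three matrices at position (len(string1), len(string2)).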
47 | 48 | Examples: 49 | >>> affine('dva', 'deeva') 50 | 1.5 51 | >>> affine('dva', 'deeve', gap_start=2, gap_continuation=0.5) 52 | -0.5 53 | >>> affine('AAAGAATTCA', 'AAATCA', gap_continuation=0.2, sim_score=lambda s1, s2: (int(1 if s1 == s2 else 0))) 54 | 4.4 55 | """ 56 | # input validations 57 | utils.sim_check_for_none(string1, string2) 58 | utils.tok_check_for_string_input(string1, string2) 59 | # if one of the strings is empty return 0 60 | if utils.sim_check_for_empty(string1, string2): 61 | return 0 62 | 63 | gap_start = -gap_start 64 | gap_continuation = -gap_continuation 65 | m = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) 66 | x = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) 67 | y = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) 68 | # DP initialization 69 | for i in _range(1, len(string1) + 1): 70 | m[i][0] = -float("inf") 71 | x[i][0] = gap_start + (i - 1) * gap_continuation 72 | y[i][0] = -float("inf") 73 | # DP initialization 74 | for j in _range(1, len(string2) + 1): 75 | m[0][j] = -float("inf") 76 | x[0][j] = -float("inf") 77 | y[0][j] = gap_start + (j - 1) * gap_continuation 78 | # affine gap calculation using DP 79 | for i in _range(1, len(string1) + 1): 80 | for j in _range(1, len(string2) + 1): 81 | # best score between x_1....x_i and y_1....y_j given that x_i is aligned to y_j 82 | m[i][j] = sim_score(string1[i - 1], string2[j - 1]) + max(m[i - 1][j - 1], x[i - 1][j - 1], y[i - 1][j - 1]) 83 | # the best score given that x_i is aligned to a gap 84 | x[i][j] = max(gap_start + m[i - 1][j], gap_continuation + x[i - 1][j]) 85 | # the best score given that y_j is aligned to a gap 86 | y[i][j] = max(gap_start + m[i][j - 1], gap_continuation + y[i][j - 1]) 87 | return max(m[len(string1)][len(string2)], x[len(string1)][len(string2)], y[len(string1)][len(string2)]) 88 | 89 | 90 | # jaro 91 | # noinspection PyUnboundLocalVariable,PyUnboundLocalVariable,PyUnboundLocalVariable,PyUnboundLocalVariable 92 | def jaro(string1, string2): 93 | """ 94 | Computes the Jaro measure between two strings. 95 | 96 | The Jaro measure is a type of edit distance. It was developed mainly to compare short strings, 97 | such as first and last names. 98 | 99 | 100 | Args: 101 | string1,string2 (str): Input strings 102 | 103 | Returns: 104 | Jaro measure (float) 105 | 106 | 107 | Raises: 108 | TypeError : If the inputs are not strings or if one of the inputs is None.
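Note:
    In outline, with :math:`m` matching characters (matched within a window of :math:`\\lfloor \\max(|s_1|, |s_2|)/2 \\rfloor - 1` positions) and :math:`t` transpositions (half the number of matched characters that appear in a different order), the implementation below computes

    :math:`jaro(s_1, s_2) = \\frac{1}{3} \\left( \\frac{m}{|s_1|} + \\frac{m}{|s_2|} + \\frac{m - t}{m} \\right)`

    and returns 0 when there are no matching characters.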
109 | 110 | 111 | Examples: 112 | >>> jaro('MARTHA', 'MARHTA') 113 | 0.9444444444444445 114 | >>> jaro('DWAYNE', 'DUANE') 115 | 0.8222222222222223 116 | >>> jaro('DIXON', 'DICKSONX') 117 | 0.7666666666666666 118 | 119 | 120 | """ 121 | # input validations 122 | utils.sim_check_for_none(string1, string2) 123 | utils.tok_check_for_string_input(string1, string2) 124 | # if one of the strings is empty return 0 125 | if utils.sim_check_for_empty(string1, string2): 126 | return 0 127 | 128 | len_s1 = len(string1) 129 | len_s2 = len(string2) 130 | 131 | max_len = max(len_s1, len_s2) 132 | search_range = (max_len // 2) - 1 133 | if search_range < 0: 134 | search_range = 0 135 | 136 | flags_s1 = [False] * len_s1 137 | flags_s2 = [False] * len_s2 138 | 139 | common_chars = 0 140 | for i, ch_s1 in enumerate(string1): 141 | low = i - search_range if i > search_range else 0 142 | hi = i + search_range if i + search_range < len_s2 else len_s2 - 1 143 | for j in _range(low, hi + 1): 144 | if not flags_s2[j] and string2[j] == ch_s1: 145 | flags_s1[i] = flags_s2[j] = True 146 | common_chars += 1 147 | break 148 | if not common_chars: 149 | return 0 150 | k = trans_count = 0 151 | for i, f_s1 in enumerate(flags_s1): 152 | if f_s1: 153 | for j in _range(k, len_s2): 154 | if flags_s2[j]: 155 | k = j + 1 156 | break 157 | if string1[i] != string2[j]: 158 | trans_count += 1 159 | trans_count /= 2 160 | common_chars = float(common_chars) 161 | weight = ((common_chars / len_s1 + common_chars / len_s2 + 162 | (common_chars - trans_count) / common_chars)) / 3 163 | return weight 164 | 165 | 166 | # jaro-winkler 167 | def jaro_winkler(string1, string2, prefix_weight=0.1): 168 | """ 169 | Computes the Jaro-Winkler measure between two strings. 170 | 171 | The Jaro-Winkler measure is designed to capture cases where two strings have a low Jaro score, but share a prefix 172 | and thus are likely to match. 173 | 174 | 175 | Args: 176 | string1,string2 (str): Input strings 177 | 178 | prefix_weight (float): Weight to give the prefix (defaults to 0.1) 179 | 180 | Returns: 181 | Jaro-Winkler measure (float) 182 | 183 | Raises: 184 | TypeError : If the inputs are not strings or if one of the inputs is None. 185 | 186 | 187 | Examples: 188 | >>> jaro_winkler('MARTHA', 'MARHTA') 189 | 0.9611111111111111 190 | >>> jaro_winkler('DWAYNE', 'DUANE') 191 | 0.84 192 | >>> jaro_winkler('DIXON', 'DICKSONX') 193 | 0.8133333333333332 194 | 195 | """ 196 | # input validations 197 | utils.sim_check_for_none(string1, string2) 198 | utils.tok_check_for_string_input(string1, string2) 199 | # if one of the strings is empty return 0 200 | if utils.sim_check_for_empty(string1, string2): 201 | return 0 202 | 203 | jw_score = jaro(string1, string2) 204 | min_len = min(len(string1), len(string2)) 205 | # prefix length can be at max 4 206 | j = min(min_len, 4) 207 | i = 0 208 | while i < j and string1[i] == string2[i] and string1[i]: 209 | i += 1 210 | if i: 211 | jw_score += i * prefix_weight * (1 - jw_score) 212 | return jw_score 213 | 214 | 215 | def hamming_distance(string1, string2): 216 | """ 217 | Computes the Hamming distance between two strings. 218 | 219 | The Hamming distance between two strings of equal length is the number of positions at which the corresponding 220 | symbols are different. In another way, it measures the minimum number of substitutions required to change 221 | one string into the other, or the minimum number of errors that could have transformed one string into the other. 
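For example, 'karolin' and 'kathrin' differ in the third, fourth and fifth positions, so their Hamming distance is 3.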
222 | 223 | 224 | Args: 225 | string1,string2 (str): Input strings 226 | 227 | Returns: 228 | Hamming distance (int) 229 | 230 | Raises: 231 | TypeError : If the inputs are not strings or if one of the inputs is None. 232 | ValueError : If the input strings are not of the same length 233 | 234 | 235 | Examples: 236 | >>> hamming_distance('', '') 237 | 0 238 | >>> hamming_distance('alex', 'john') 239 | 4 240 | >>> hamming_distance(' ', 'a') 241 | 1 242 | >>> hamming_distance('JOHN', 'john') 243 | 4 244 | """ 245 | # input validations 246 | utils.sim_check_for_none(string1, string2) 247 | utils.tok_check_for_string_input(string1, string2) 248 | # for Hamming Distance string length should be same 249 | utils.sim_check_for_same_len(string1, string2) 250 | # sum up the mismatched characters at the corresponding indexes of 251 | # the input strings 252 | return sum(bool(ord(c1) - ord(c2)) for c1, c2 in zip(string1, string2)) 253 | 254 | 255 | def levenshtein(string1, string2): 256 | """ 257 | Computes the Levenshtein distance between two strings. 258 | 259 | Levenshtein distance computes the minimum cost of transforming one string into the other. Transforming a string 260 | is carried out using a sequence of the following operators: delete a character, insert a character, and 261 | substitute one character for another. 262 | 263 | Args: 264 | string1,string2 (str): Input strings 265 | 266 | Returns: 267 | Levenshtein distance (int) 268 | 269 | Raises: 270 | TypeError : If the inputs are not strings or if one of the inputs is None. 271 | 272 | Examples: 273 | >>> levenshtein('a', '') 274 | 1 275 | >>> levenshtein('example', 'samples') 276 | 3 277 | >>> levenshtein('levenshtein', 'frankenstein') 278 | 6 279 | 280 | 281 | Note: 282 | This implementation internally uses the python-Levenshtein package to compute the Levenshtein distance 283 | 284 | """ 285 | # input validations 286 | utils.sim_check_for_none(string1, string2) 287 | utils.sim_check_for_string_inputs(string1, string2) 288 | # using Levenshtein library 289 | return Levenshtein.distance(string1, string2) 290 | 291 | 292 | def needleman_wunsch(string1, string2, gap_cost=1.0, sim_score=sim_ident): 293 | """ 294 | Computes the Needleman-Wunsch measure between two strings. 295 | 296 | The Needleman-Wunsch measure generalizes the Levenshtein distance and considers global alignment between two strings. 297 | Specifically, it is computed by assigning a score to each alignment between the two input strings and choosing the 298 | score of the best alignment, that is, the maximal score. 299 | 300 | An alignment between two strings is a set of correspondences between their characters, allowing for 301 | gaps. 302 | 303 | Args: 304 | string1,string2 (str) : Input strings 305 | 306 | gap_cost (float) : Cost of gap (defaults to 1.0) 307 | 308 | sim_score (function) : Similarity function to give a score for the correspondence between characters. Defaults 309 | to an identity function, where if two characters are the same it returns 1.0 else returns 0. 310 | 311 | 312 | Returns: 313 | Needleman-Wunsch measure (float) 314 | 315 | 316 | Raises: 317 | TypeError : If the inputs are not strings or if one of the inputs is None.
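Note:
    In outline, the implementation below fills a dynamic-programming matrix with :math:`D[i][0] = -i \\cdot gap\\_cost` and :math:`D[0][j] = -j \\cdot gap\\_cost`, applies the recurrence

    :math:`D[i][j] = \\max(D[i-1][j-1] + sim\\_score(x_i, y_j), D[i-1][j] - gap\\_cost, D[i][j-1] - gap\\_cost)`

    and returns the bottom-right entry :math:`D[|string1|][|string2|]`.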
318 | 319 | Examples: 320 | >>> needleman_wunsch('dva', 'deeva') 321 | 1.0 322 | >>> needleman_wunsch('dva', 'deeve', 0.0) 323 | 2.0 324 | >>> needleman_wunsch('dva', 'deeve', 1.0, sim_score=lambda s1, s2 : (2.0 if s1 == s2 else -1.0)) 325 | 1.0 326 | >>> needleman_wunsch('GCATGCUA', 'GATTACA', gap_cost=0.5, sim_score=lambda s1, s2 : (1.0 if s1 == s2 else -1.0)) 327 | 2.5 328 | """ 329 | # input validations 330 | utils.sim_check_for_none(string1, string2) 331 | utils.sim_check_for_string_inputs(string1, string2) 332 | 333 | dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) 334 | # DP initialization 335 | for i in _range(len(string1) + 1): 336 | dist_mat[i, 0] = -(i * gap_cost) 337 | # DP initialization 338 | for j in _range(len(string2) + 1): 339 | dist_mat[0, j] = -(j * gap_cost) 340 | # Needleman-Wunsch DP calculation 341 | for i in _range(1, len(string1) + 1): 342 | for j in _range(1, len(string2) + 1): 343 | match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1], string2[j - 1]) 344 | delete = dist_mat[i - 1, j] - gap_cost 345 | insert = dist_mat[i, j - 1] - gap_cost 346 | dist_mat[i, j] = max(match, delete, insert) 347 | return dist_mat[dist_mat.shape[0] - 1, dist_mat.shape[1] - 1] 348 | 349 | 350 | def smith_waterman(string1, string2, gap_cost=1.0, sim_score=sim_ident): 351 | """ 352 | Computes the Smith-Waterman measure between two strings. 353 | 354 | The Smith–Waterman algorithm performs local sequence alignment; that is, for determining similar regions 355 | between two strings. Instead of looking at the total sequence, the Smith–Waterman algorithm compares segments of 356 | all possible lengths and optimizes the similarity measure. 357 | 358 | 359 | Args: 360 | string1,string2 (str) : Input strings 361 | 362 | gap_cost (float) : Cost of gap (defaults to 1.0) 363 | 364 | sim_score (function) : Similarity function to give a score for the correspondence between characters. Defaults 365 | to an identity function, where if two characters are same it returns 1 else returns 0. 366 | 367 | Returns: 368 | Smith-Waterman measure (float) 369 | 370 | Raises: 371 | TypeError : If the inputs are not strings or if one of the inputs is None. 372 | 373 | Examples: 374 | >>> smith_waterman('cat', 'hat') 375 | 2.0 376 | >>> smith_waterman('dva', 'deeve', 2.2) 377 | 1.0 378 | >>> smith_waterman('dva', 'deeve', 1, sim_score=lambda s1, s2 : (2 if s1 == s2 else -1)) 379 | 2.0 380 | >>> smith_waterman('GCATAGCU', 'GATTACA', gap_cost=1.4, sim_score=lambda s1, s2 : (1.5 if s1 == s2 else 0.5)) 381 | 6.5 382 | """ 383 | # input validations 384 | utils.sim_check_for_none(string1, string2) 385 | utils.sim_check_for_string_inputs(string1, string2) 386 | 387 | dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=np.float) 388 | max_value = 0 389 | # Smith Waterman DP calculations 390 | for i in _range(1, len(string1) + 1): 391 | for j in _range(1, len(string2) + 1): 392 | match = dist_mat[i - 1, j - 1] + sim_score(string1[i - 1], string2[j - 1]) 393 | delete = dist_mat[i - 1, j] - gap_cost 394 | insert = dist_mat[i, j - 1] - gap_cost 395 | dist_mat[i, j] = max(0, match, delete, insert) 396 | max_value = max(max_value, dist_mat[i, j]) 397 | return max_value 398 | 399 | 400 | # ---------------------- token based similarity measures ---------------------- 401 | 402 | # ---------------------- set based similarity measures ---------------------- 403 | def cosine(set1, set2): 404 | """ 405 | Computes the cosine similarity between two sets. 
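Note that the inputs are converted to sets before the computation, so duplicate tokens in a list do not affect the score.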
406 | 407 | For two sets X and Y, the cosine similarity is: 408 | 409 | :math:`cosine(X, Y) = \\frac{|X \\cap Y|}{\\sqrt{|X| \\cdot |Y|}}` 410 | 411 | 412 | Args: 413 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 414 | 415 | Returns: 416 | Cosine similarity (float) 417 | 418 | Raises: 419 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 420 | 421 | Examples: 422 | >>> cosine(['data', 'science'], ['data']) 423 | 0.7071067811865475 424 | >>> cosine(['data', 'data', 'science'], ['data', 'management']) 425 | 0.4999999999999999 426 | >>> cosine([], ['data']) 427 | 0.0 428 | 429 | References: 430 | * String similarity joins: An Experimental Evaluation (VLDB 2014) 431 | * Project flamingo : Mike Carey, Vernica 432 | """ 433 | # input validations 434 | utils.sim_check_for_none(set1, set2) 435 | utils.sim_check_for_list_or_set_inputs(set1, set2) 436 | # if exact match return 1.0 437 | if utils.sim_check_for_exact_match(set1, set2): 438 | return 1.0 439 | # if one of the strings is empty return 0 440 | if utils.sim_check_for_empty(set1, set2): 441 | return 0 442 | if not isinstance(set1, set): 443 | set1 = set(set1) 444 | if not isinstance(set2, set): 445 | set2 = set(set2) 446 | return float(len(set1 & set2)) / (math.sqrt(float(len(set1))) * math.sqrt(float(len(set2)))) 447 | 448 | 449 | def jaccard(set1, set2): 450 | """ 451 | Computes the Jaccard measure between two sets. 452 | 453 | The Jaccard measure, also known as the Jaccard similarity coefficient, is a statistic used for comparing 454 | the similarity and diversity of sample sets. The Jaccard coefficient measures similarity between finite sample 455 | sets, and is defined as the size of the intersection divided by the size of the union of the sample sets. 456 | 457 | 458 | For two sets X and Y, the Jaccard measure is: 459 | 460 | :math:`jaccard(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}` 461 | 462 | 463 | Args: 464 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 465 | 466 | Returns: 467 | Jaccard similarity (float) 468 | 469 | Raises: 470 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 471 | 472 | Examples: 473 | >>> jaccard(['data', 'science'], ['data']) 474 | 0.5 475 | >>> jaccard({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}) 476 | 0.375 477 | >>> jaccard(['data', 'management'], ['data', 'data', 'science']) 478 | 0.3333333333333333 479 | """ 480 | # input validations 481 | utils.sim_check_for_none(set1, set2) 482 | utils.sim_check_for_list_or_set_inputs(set1, set2) 483 | # if exact match return 1.0 484 | if utils.sim_check_for_exact_match(set1, set2): 485 | return 1.0 486 | # if one of the strings is empty return 0 487 | if utils.sim_check_for_empty(set1, set2): 488 | return 0 489 | if not isinstance(set1, set): 490 | set1 = set(set1) 491 | if not isinstance(set2, set): 492 | set2 = set(set2) 493 | return float(len(set1 & set2)) / float(len(set1 | set2)) 494 | 495 | 496 | def overlap_coefficient(set1, set2): 497 | """ 498 | Computes the overlap coefficient between two sets. 499 | 500 | The overlap coefficient is a similarity measure related to the Jaccard 501 | measure that measures the overlap between two sets, and is defined as the size of the intersection divided by 502 | the size of the smaller of the two sets.
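In particular, it equals 1.0 whenever one non-empty set is a subset of the other.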
503 | 504 | For two sets X and Y, the overlap coefficient is: 505 | 506 | :math:`overlap\\_coefficient(X, Y) = \\frac{|X \\cap Y|}{\\min(|X|, |Y|)}` 507 | 508 | Args: 509 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 510 | 511 | Returns: 512 | Overlap coefficient (float) 513 | 514 | Raises: 515 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 516 | 517 | Examples: 518 | >>> overlap_coefficient([], []) 519 | 1.0 520 | >>> overlap_coefficient([], ['data']) 521 | 0 522 | >>> overlap_coefficient(['data', 'science'], ['data']) 523 | 1.0 524 | 525 | References: 526 | * Wikipedia article : https://en.wikipedia.org/wiki/Overlap_coefficient 527 | * Simmetrics library 528 | 529 | """ 530 | # input validations 531 | utils.sim_check_for_none(set1, set2) 532 | utils.sim_check_for_list_or_set_inputs(set1, set2) 533 | # if exact match return 1.0 534 | if utils.sim_check_for_exact_match(set1, set2): 535 | return 1.0 536 | # if one of the strings is empty return 0 537 | if utils.sim_check_for_empty(set1, set2): 538 | return 0 539 | if not isinstance(set1, set): 540 | set1 = set(set1) 541 | if not isinstance(set2, set): 542 | set2 = set(set2) 543 | 544 | return float(len(set1 & set2)) / min(len(set1), len(set2)) 545 | 546 | 547 | # ---------------------- bag based similarity measures ---------------------- 548 | # noinspection PyArgumentList,PyArgumentList 549 | def tfidf(bag1, bag2, corpus_list=None, dampen=False): 550 | """ 551 | Computes the TF-IDF measure between two lists given the corpus information. 552 | This measure employs the notion of TF/IDF score commonly used in information retrieval (IR) to find documents that 553 | are relevant to keyword queries. 554 | The intuition underlying the TF/IDF measure is that two strings are similar if they share distinguishing terms. 555 | 556 | Args: 557 | bag1,bag2 (list): Input lists 558 | 559 | corpus_list (list of lists): Corpus of documents, each a list of strings (defaults to None). If set to None, 560 | the input lists are considered the only corpus. 561 | 562 | dampen (boolean): Flag to indicate whether 'log' should be applied to the tf and idf measures (defaults to False).
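When dampen is True, each token's weight is computed as :math:`\\log(idf) \\cdot \\log(tf + 1)` instead of :math:`idf \\cdot tf`, where :math:`idf` is the corpus size divided by the number of corpus documents containing the token; the final score is the cosine similarity between the two resulting weight vectors.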
563 | 564 | Returns: 565 | TF-IDF measure between the input lists (float) 566 | 567 | Raises: 568 | TypeError : If the inputs are not lists or if one of the inputs is None 569 | 570 | 571 | Examples: 572 | >>> tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a']]) 573 | 0.17541160386140586 574 | >>> tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], True) 575 | 0.11166746710505392 576 | >>> tfidf(['a', 'b', 'a'], ['a'], [['a', 'b', 'a'], ['a', 'c'], ['a']]) 577 | 0.5547001962252291 578 | >>> tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']]) 579 | 0.0 580 | >>> tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']], True) 581 | 0.0 582 | >>> tfidf(['a', 'b', 'a'], ['a']) 583 | 0.7071067811865475 584 | """ 585 | # input validations 586 | utils.sim_check_for_none(bag1, bag2) 587 | utils.sim_check_for_list_or_set_inputs(bag1, bag2) 588 | # if the strings match exactly return 1.0 589 | if utils.sim_check_for_exact_match(bag1, bag2): 590 | return 1.0 591 | # if one of the strings is empty return 0 592 | if utils.sim_check_for_empty(bag1, bag2): 593 | return 0 594 | # if corpus is not provided treat input string as corpus 595 | if corpus_list is None: 596 | corpus_list = [bag1, bag2] 597 | corpus_size = len(corpus_list) 598 | # term frequency for input strings 599 | tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2) 600 | # number of documents an element appeared 601 | element_freq = {} 602 | # set of unique element 603 | total_unique_elements = set() 604 | for document in corpus_list: 605 | temp_set = set() 606 | for element in document: 607 | # adding element only if it is present in one of two input string 608 | if element in bag1 or element in bag2: 609 | temp_set.add(element) 610 | total_unique_elements.add(element) 611 | # update element document frequency for this document 612 | for element in temp_set: 613 | element_freq[element] = element_freq[element] + 1 if element in element_freq else 1 614 | idf_element, v_x, v_y, v_x_y, v_x_2, v_y_2 = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 615 | # tfidf calculation 616 | for element in total_unique_elements: 617 | idf_element = corpus_size * 1.0 / element_freq[element] 618 | v_x = 0 if element not in tf_x else (math.log(idf_element) * math.log(tf_x[element] + 1)) if dampen else ( 619 | idf_element * tf_x[element]) 620 | v_y = 0 if element not in tf_y else (math.log(idf_element) * math.log(tf_y[element] + 1)) if dampen else ( 621 | idf_element * tf_y[element]) 622 | v_x_y += v_x * v_y 623 | v_x_2 += v_x * v_x 624 | v_y_2 += v_y * v_y 625 | return 0.0 if v_x_y == 0 else v_x_y / (math.sqrt(v_x_2) * math.sqrt(v_y_2)) 626 | 627 | 628 | # hybrid similarity measures 629 | def monge_elkan(bag1, bag2, sim_func=jaro_winkler): 630 | """ 631 | Computes the Monge-Elkan similarity measure between two bags (lists). 632 | 633 | The Monge-Elkan similarity measure is a type of hybrid similarity measure that combines the benefits of 634 | sequence-based and set-based methods. This can be effective for domains in which more control is needed 635 | over the similarity measure. It uses a secondary similarity measure, such as Jaro-Winkler or Levenshtein, to compute 636 | the overall similarity score. 637 | 638 | Args: 639 | bag1,bag2 (list): Input lists 640 | 641 | sim_func (function): Secondary similarity function.
This is expected to be a sequence-based 642 | similarity measure (defaults to jaro_winkler) 643 | 644 | Returns: 645 | Monge-Elkan similarity score (float) 646 | 647 | Raises: 648 | TypeError : If the inputs are not lists or if one of the inputs is None 649 | 650 | 651 | Examples: 652 | >>> monge_elkan(['Niall'], ['Neal']) 653 | 0.8049999999999999 654 | >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']) 655 | 0.8677218614718616 656 | >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=needleman_wunsch) 657 | 2.0 658 | >>> monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'], sim_func=affine) 659 | 2.25 660 | >>> monge_elkan([''], ['a']) 661 | 0.0 662 | >>> monge_elkan(['Niall'], ['Nigel']) 663 | 0.7866666666666667 664 | 665 | References: 666 | * Principles of Data Integration book 667 | """ 668 | # input validations 669 | utils.sim_check_for_none(bag1, bag2) 670 | utils.sim_check_for_list_or_set_inputs(bag1, bag2) 671 | # if exact match return 1.0 672 | if utils.sim_check_for_exact_match(bag1, bag2): 673 | return 1.0 674 | # if one of the strings is empty return 0 675 | if utils.sim_check_for_empty(bag1, bag2): 676 | return 0 677 | # aggregated sum of all the max sim score of all the elements in bag1 678 | # with elements in bag2 679 | sum_of_maxes = 0 680 | for t1 in bag1: 681 | max_sim = float('-inf') 682 | for t2 in bag2: 683 | max_sim = max(max_sim, sim_func(t1, t2)) 684 | sum_of_maxes += max_sim 685 | sim = float(sum_of_maxes) / float(len(bag1)) 686 | return sim 687 | 688 | 689 | # noinspection PyArgumentList,PyArgumentList 690 | def soft_tfidf(bag1, bag2, corpus_list=None, sim_func=jaro, threshold=0.5): 691 | """ 692 | Computes the Soft TF-IDF measure between two lists given the corpus information. 693 | 694 | Args: 695 | bag1,bag2 (list): Input lists 696 | 697 | corpus_list (list of lists): Corpus of documents, each a list of strings (defaults to None). If set to None, 698 | the input lists are considered the only corpus 699 | 700 | sim_func (function): Secondary similarity function that returns a similarity score between two strings 701 | (defaults to the jaro similarity measure) 702 | 703 | threshold (float): Threshold value for the secondary similarity function (defaults to 0.5). If the similarity 704 | of a token pair exceeds the threshold, then the token pair is considered a match. 705 | 706 | Returns: 707 | Soft TF-IDF measure between the input lists 708 | 709 | Raises: 710 | TypeError : If the inputs are not lists or if one of the inputs is None.
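Note:
    In outline, each token in bag1 is paired with its most similar token in bag2 under sim_func; only pairs whose similarity exceeds the threshold contribute, each adding the product of the two tokens' tf-idf weights scaled by the pair's similarity score, and the total is normalized by the product of the norms of the two tf-idf weight vectors, as in the cosine measure.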
711 | 712 | Examples: 713 | >>> soft_tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=jaro, threshold=0.8) 714 | 0.17541160386140586 715 | >>> soft_tfidf(['a', 'b', 'a'], ['a'], [['a', 'b', 'a'], ['a', 'c'], ['a']], threshold=0.9) 716 | 0.5547001962252291 717 | >>> soft_tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']]) 718 | 0.0 719 | >>> soft_tfidf(['aa', 'bb', 'a'], ['ab', 'ba'], sim_func=affine, threshold=0.6) 720 | 0.81649658092772592 721 | 722 | References: 723 | * Principles of Data Integration book 724 | """ 725 | # input validations 726 | utils.sim_check_for_none(bag1, bag2) 727 | utils.sim_check_for_list_or_set_inputs(bag1, bag2) 728 | # if the strings match exactly return 1.0 729 | if utils.sim_check_for_exact_match(bag1, bag2): 730 | return 1.0 731 | # if one of the strings is empty return 0 732 | if utils.sim_check_for_empty(bag1, bag2): 733 | return 0 734 | # if corpus is not provided treat input string as corpus 735 | if corpus_list is None: 736 | corpus_list = [bag1, bag2] 737 | corpus_size = len(corpus_list) * 1.0 738 | # term frequency for input strings 739 | tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2) 740 | # number of documents an element appeared 741 | element_freq = {} 742 | # set of unique element 743 | total_unique_elements = set() 744 | for document in corpus_list: 745 | temp_set = set() 746 | for element in document: 747 | # adding element only if it is present in one of two input string 748 | if element in bag1 or element in bag2: 749 | temp_set.add(element) 750 | total_unique_elements.add(element) 751 | # update element document frequency for this document 752 | for element in temp_set: 753 | element_freq[element] = element_freq[element] + 1 if element in element_freq else 1 754 | similarity_map = {} 755 | # calculating the term sim score against the input string 2, construct similarity map 756 | for x in bag1: 757 | if x not in similarity_map: 758 | max_score = 0.0 759 | for y in bag2: 760 | score = sim_func(x, y) 761 | # adding sim only if it is above threshold and highest for this element 762 | if score > threshold and score > max_score: 763 | similarity_map[x] = utils.Similarity(x, y, score) 764 | max_score = score 765 | result, v_x_2, v_y_2 = 0.0, 0.0, 0.0 766 | # soft-tfidf calculation 767 | for element in total_unique_elements: 768 | # numerator 769 | if element in similarity_map: 770 | sim = similarity_map[element] 771 | idf_first = corpus_size if sim.first_string not in element_freq else corpus_size / \ 772 | element_freq[sim.first_string] 773 | idf_second = corpus_size if sim.second_string not in element_freq else corpus_size / \ 774 | element_freq[sim.second_string] 775 | v_x = 0 if sim.first_string not in tf_x else idf_first * tf_x[sim.first_string] 776 | v_y = 0 if sim.second_string not in tf_y else idf_second * tf_y[sim.second_string] 777 | result += v_x * v_y * sim.similarity_score 778 | # denominator 779 | idf = corpus_size if element not in element_freq else corpus_size / element_freq[element] 780 | v_x = 0 if element not in tf_x else idf * tf_x[element] 781 | v_x_2 += v_x * v_x 782 | v_y = 0 if element not in tf_y else idf * tf_y[element] 783 | v_y_2 += v_y * v_y 784 | return result if v_x_2 == 0 else result / (math.sqrt(v_x_2) * math.sqrt(v_y_2)) 785 | -------------------------------------------------------------------------------- /py_stringmatching/tests/test_simfunctions.py: -------------------------------------------------------------------------------- 1 | from __future__ 
import unicode_literals 2 | 3 | import math 4 | import unittest 5 | 6 | from nose.tools import * 7 | 8 | 9 | # sequence based similarity measures 10 | from py_stringmatching.simfunctions import levenshtein, jaro, jaro_winkler, hamming_distance, needleman_wunsch, \ 11 | smith_waterman, affine 12 | # token based similarity measures 13 | from py_stringmatching.simfunctions import overlap_coefficient, jaccard, cosine, tfidf, soft_tfidf 14 | # hybrid similarity measures 15 | from py_stringmatching.simfunctions import monge_elkan 16 | 17 | 18 | # ---------------------- sequence based similarity measures ---------------------- 19 | 20 | 21 | class AffineTestCases(unittest.TestCase): 22 | def test_valid_input(self): 23 | self.assertAlmostEqual(affine('dva', 'deeva'), 1.5) 24 | self.assertAlmostEqual(affine('dva', 'deeve', gap_start=2, gap_continuation=0.5), -0.5) 25 | self.assertAlmostEqual( 26 | affine('AAAGAATTCA', 'AAATCA', gap_continuation=0.2, sim_score=lambda s1, s2: (int(1 if s1 == s2 else 0))), 27 | 4.4) 28 | self.assertAlmostEqual( 29 | affine(' ', ' ', gap_continuation=0.2, sim_score=lambda s1, s2: (int(1 if s1 == s2 else 0))), 1) 30 | 31 | @raises(TypeError) 32 | def test_invalid_input1(self): 33 | affine(None, 'MARHTA') 34 | 35 | @raises(TypeError) 36 | def test_invalid_input2(self): 37 | affine('MARHTA', None) 38 | 39 | @raises(TypeError) 40 | def test_invalid_input3(self): 41 | affine('MARHTA', 12.90) 42 | 43 | @raises(TypeError) 44 | def test_invalid_input4(self): 45 | affine(12.90, 'MARTHA') 46 | 47 | @raises(TypeError) 48 | def test_invalid_input5(self): 49 | affine(None, None) 50 | 51 | 52 | class JaroTestCases(unittest.TestCase): 53 | def test_valid_input(self): 54 | # https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance 55 | self.assertAlmostEqual(jaro('MARTHA', 'MARHTA'), 0.9444444444444445) 56 | self.assertAlmostEqual(jaro('DWAYNE', 'DUANE'), 0.8222222222222223) 57 | self.assertAlmostEqual(jaro('DIXON', 'DICKSONX'), 0.7666666666666666) 58 | 59 | @raises(TypeError) 60 | def test_invalid_input1(self): 61 | jaro(None, 'MARHTA') 62 | 63 | @raises(TypeError) 64 | def test_invalid_input2(self): 65 | jaro('MARHTA', None) 66 | 67 | @raises(TypeError) 68 | def test_invalid_input3(self): 69 | jaro(None, None) 70 | 71 | 72 | class JaroWinklerTestCases(unittest.TestCase): 73 | def test_valid_input(self): 74 | # https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance 75 | self.assertAlmostEqual(jaro_winkler('MARTHA', 'MARHTA'), 0.9611111111111111) 76 | self.assertAlmostEqual(jaro_winkler('DWAYNE', 'DUANE'), 0.84) 77 | self.assertAlmostEqual(jaro_winkler('DIXON', 'DICKSONX'), 0.8133333333333332) 78 | 79 | @raises(TypeError) 80 | def test_invalid_input1(self): 81 | jaro_winkler(None, 'MARHTA') 82 | 83 | @raises(TypeError) 84 | def test_invalid_input2(self): 85 | jaro_winkler('MARHTA', None) 86 | 87 | @raises(TypeError) 88 | def test_invalid_input3(self): 89 | jaro_winkler(None, None) 90 | 91 | 92 | class LevenshteinTestCases(unittest.TestCase): 93 | def test_valid_input(self): 94 | # http://oldfashionedsoftware.com/tag/levenshtein-distance/ 95 | self.assertEqual(levenshtein('a', ''), 1) 96 | self.assertEqual(levenshtein('', 'a'), 1) 97 | self.assertEqual(levenshtein('abc', ''), 3) 98 | self.assertEqual(levenshtein('', 'abc'), 3) 99 | self.assertEqual(levenshtein('', ''), 0) 100 | self.assertEqual(levenshtein('a', 'a'), 0) 101 | self.assertEqual(levenshtein('abc', 'abc'), 0) 102 | self.assertEqual(levenshtein('', 'a'), 1) 103 | self.assertEqual(levenshtein('a', 'ab'), 1) 104 | 
self.assertEqual(levenshtein('b', 'ab'), 1) 105 | self.assertEqual(levenshtein('ac', 'abc'), 1) 106 | self.assertEqual(levenshtein('abcdefg', 'xabxcdxxefxgx'), 6) 107 | self.assertEqual(levenshtein('a', ''), 1) 108 | self.assertEqual(levenshtein('ab', 'a'), 1) 109 | self.assertEqual(levenshtein('ab', 'b'), 1) 110 | self.assertEqual(levenshtein('abc', 'ac'), 1) 111 | self.assertEqual(levenshtein('xabxcdxxefxgx', 'abcdefg'), 6) 112 | self.assertEqual(levenshtein('a', 'b'), 1) 113 | self.assertEqual(levenshtein('ab', 'ac'), 1) 114 | self.assertEqual(levenshtein('ac', 'bc'), 1) 115 | self.assertEqual(levenshtein('abc', 'axc'), 1) 116 | self.assertEqual(levenshtein('xabxcdxxefxgx', '1ab2cd34ef5g6'), 6) 117 | self.assertEqual(levenshtein('example', 'samples'), 3) 118 | self.assertEqual(levenshtein('sturgeon', 'urgently'), 6) 119 | self.assertEqual(levenshtein('levenshtein', 'frankenstein'), 6) 120 | self.assertEqual(levenshtein('distance', 'difference'), 5) 121 | self.assertEqual(levenshtein('java was neat', 'scala is great'), 7) 122 | 123 | @raises(TypeError) 124 | def test_invalid_input1(self): 125 | levenshtein('a', None) 126 | 127 | @raises(TypeError) 128 | def test_invalid_input2(self): 129 | levenshtein(None, 'b') 130 | 131 | @raises(TypeError) 132 | def test_invalid_input3(self): 133 | levenshtein(None, None) 134 | 135 | 136 | class HammingDistanceTestCases(unittest.TestCase): 137 | def test_valid_input(self): 138 | self.assertEqual(hamming_distance('-789', 'john'), 4) 139 | self.assertEqual(hamming_distance('a', '*'), 1) 140 | self.assertEqual(hamming_distance('b', 'a'), 1) 141 | self.assertEqual(hamming_distance('abc', 'p q'), 3) 142 | self.assertEqual(hamming_distance('karolin', 'kathrin'), 3) 143 | self.assertEqual(hamming_distance('KARI', 'kari'), 4) 144 | 145 | def test_valid_input_compatibility(self): 146 | self.assertEqual(hamming_distance(u'karolin', u'kathrin'), 3) 147 | self.assertEqual(hamming_distance(u'', u''), 0) 148 | # str_1 = u'foo'.encode(encoding='UTF-8', errors='strict') 149 | # str_2 = u'bar'.encode(encoding='UTF-8', errors='strict') 150 | # self.assertEqual(hamming_distance(str_1, str_2), 3) # check with Ali - python 3 returns type error 151 | # self.assertEqual(hamming_distance(str_1, str_1), 0) # check with Ali - python 3 returns type error 152 | 153 | @raises(TypeError) 154 | def test_invalid_input1(self): 155 | hamming_distance('a', None) 156 | 157 | @raises(TypeError) 158 | def test_invalid_input2(self): 159 | hamming_distance(None, 'b') 160 | 161 | @raises(TypeError) 162 | def test_invalid_input3(self): 163 | hamming_distance(None, None) 164 | 165 | @raises(ValueError) 166 | def test_invalid_input4(self): 167 | hamming_distance('a', '') 168 | 169 | @raises(ValueError) 170 | def test_invalid_input5(self): 171 | hamming_distance('', 'This is a long string') 172 | 173 | @raises(ValueError) 174 | def test_invalid_input6(self): 175 | hamming_distance('ali', 'alex') 176 | 177 | 178 | class NeedlemanWunschTestCases(unittest.TestCase): 179 | def test_valid_input(self): 180 | self.assertEqual(needleman_wunsch('dva', 'deeva'), 1.0) 181 | self.assertEqual(needleman_wunsch('dva', 'deeve', 0.0), 2.0) 182 | self.assertEqual(needleman_wunsch('dva', 'deeve', 1.0, sim_score=lambda s1, s2: (2 if s1 == s2 else -1)), 1.0) 183 | self.assertEqual( 184 | needleman_wunsch('GCATGCUA', 'GATTACA', gap_cost=0.5, 185 | sim_score=lambda s1, s2: (1 if s1 == s2 else -1)), 186 | 2.5) 187 | 188 | @raises(TypeError) 189 | def test_invalid_input1(self): 190 | needleman_wunsch('a', None) 191 | 
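    # A hypothetical extra boundary check (a sketch, not part of the original suite):
    # with the default sim_ident, empty inputs score 0.0 and identical single
    # characters score 1.0.
    # def test_valid_input_boundary(self):
    #     self.assertEqual(needleman_wunsch('', ''), 0.0)
    #     self.assertEqual(needleman_wunsch('a', 'a'), 1.0)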
192 | @raises(TypeError) 193 | def test_invalid_input2(self): 194 | needleman_wunsch(None, 'b') 195 | 196 | @raises(TypeError) 197 | def test_invalid_input3(self): 198 | needleman_wunsch(None, None) 199 | 200 | 201 | class SmithWatermanTestCases(unittest.TestCase): 202 | def test_valid_input(self): 203 | self.assertEqual(smith_waterman('cat', 'hat'), 2.0) 204 | self.assertEqual(smith_waterman('dva', 'deeve', 2.2), 1.0) 205 | self.assertEqual(smith_waterman('dva', 'deeve', 1, sim_score=lambda s1, s2: (2 if s1 == s2 else -1)), 2.0) 206 | self.assertEqual( 207 | smith_waterman('GCATGCU', 'GATTACA', gap_cost=1, sim_score=lambda s1, s2: (int(1 if s1 == s2 else -1))), 208 | 2.0) 209 | self.assertEqual( 210 | smith_waterman('GCATAGCU', 'GATTACA', gap_cost=1.4, sim_score=lambda s1, s2: (1.5 if s1 == s2 else 0.5)), 211 | 6.5) 212 | 213 | @raises(TypeError) 214 | def test_invalid_input1(self): 215 | smith_waterman('a', None) 216 | 217 | @raises(TypeError) 218 | def test_invalid_input2(self): 219 | smith_waterman(None, 'b') 220 | 221 | @raises(TypeError) 222 | def test_invalid_input3(self): 223 | smith_waterman(None, None) 224 | 225 | 226 | # ---------------------- token based similarity measures ---------------------- 227 | 228 | # ---------------------- set based similarity measures ---------------------- 229 | class OverlapCoefficientTestCases(unittest.TestCase): 230 | def test_valid_input(self): 231 | self.assertEqual(overlap_coefficient([], []), 1.0) 232 | self.assertEqual(overlap_coefficient(['data', 'science'], ['data']), 1.0 / min(2.0, 1.0)) 233 | self.assertEqual(overlap_coefficient(['data', 'science'], ['science', 'good']), 1.0 / min(2.0, 3.0)) 234 | self.assertEqual(overlap_coefficient([], ['data']), 0) 235 | self.assertEqual(overlap_coefficient(['data', 'data', 'science'], ['data', 'management']), 1.0 / min(3.0, 2.0)) 236 | 237 | @raises(TypeError) 238 | def test_invalid_input1(self): 239 | overlap_coefficient(['a'], None) 240 | 241 | @raises(TypeError) 242 | def test_invalid_input2(self): 243 | overlap_coefficient(None, ['b']) 244 | 245 | @raises(TypeError) 246 | def test_invalid_input3(self): 247 | overlap_coefficient(None, None) 248 | 249 | 250 | class JaccardTestCases(unittest.TestCase): 251 | def test_valid_input(self): 252 | self.assertEqual(jaccard(['data', 'science'], ['data']), 1.0 / 2.0) 253 | self.assertEqual(jaccard(['data', 'science'], ['science', 'good']), 1.0 / 3.0) 254 | self.assertEqual(jaccard([], ['data']), 0) 255 | self.assertEqual(jaccard(['data', 'data', 'science'], ['data', 'management']), 1.0 / 3.0) 256 | self.assertEqual(jaccard(['data', 'management'], ['data', 'data', 'science']), 1.0 / 3.0) 257 | self.assertEqual(jaccard([], []), 1.0) 258 | self.assertEqual(jaccard(set([]), set([])), 1.0) 259 | self.assertEqual(jaccard({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}), 3.0 / 8.0) 260 | 261 | @raises(TypeError) 262 | def test_invalid_input1(self): 263 | jaccard(1, 1) 264 | 265 | @raises(TypeError) 266 | def test_invalid_input4(self): 267 | jaccard(['a'], None) 268 | 269 | @raises(TypeError) 270 | def test_invalid_input2(self): 271 | jaccard(None, ['b']) 272 | 273 | @raises(TypeError) 274 | def test_invalid_input3(self): 275 | jaccard(None, None) 276 | 277 | 278 | class CosineTestCases(unittest.TestCase): 279 | def test_valid_input(self): 280 | self.assertEqual(cosine(['data', 'science'], ['data']), 1.0 / (math.sqrt(2) * math.sqrt(1))) 281 | self.assertEqual(cosine(['data', 'science'], ['science', 'good']), 282 | 1.0 / (math.sqrt(2) * math.sqrt(2))) 283 | 
284 |         self.assertEqual(cosine(['data', 'data', 'science'], ['data', 'management']),
285 |                          1.0 / (math.sqrt(2) * math.sqrt(2)))
286 |         self.assertEqual(cosine(['data', 'management'], ['data', 'data', 'science']),
287 |                          1.0 / (math.sqrt(2) * math.sqrt(2)))
288 |         self.assertEqual(cosine([], []), 1.0)
289 |         self.assertEqual(cosine(set([]), set([])), 1.0)
290 |         self.assertEqual(cosine({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}),
291 |                          3.0 / (math.sqrt(4) * math.sqrt(7)))
292 | 
293 |     @raises(TypeError)
294 |     def test_invalid_input1(self):
295 |         cosine(1, 1)
296 | 
297 |     @raises(TypeError)
298 |     def test_invalid_input4(self):
299 |         cosine(['a'], None)
300 | 
301 |     @raises(TypeError)
302 |     def test_invalid_input2(self):
303 |         cosine(None, ['b'])
304 | 
305 |     @raises(TypeError)
306 |     def test_invalid_input3(self):
307 |         cosine(None, None)
308 | 
309 | 
310 | class TfidfTestCases(unittest.TestCase):
311 |     def test_valid_input(self):
312 |         self.assertEqual(tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], True),
313 |                          0.11166746710505392)
314 |         self.assertEqual(tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a']]), 0.17541160386140586)
315 |         self.assertEqual(tfidf(['a', 'b', 'a'], ['a'], [['a', 'b', 'a'], ['a', 'c'], ['a']]), 0.5547001962252291)
316 |         self.assertEqual(tfidf(['a', 'b', 'a'], ['a']), 0.7071067811865475)
317 |         self.assertEqual(tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']]), 0.0)
318 | 
319 | 
320 |     @raises(TypeError)
321 |     def test_invalid_input1(self):
322 |         tfidf(1, 1)
323 | 
324 |     @raises(TypeError)
325 |     def test_invalid_input4(self):
326 |         tfidf(['a'], None)
327 | 
328 |     @raises(TypeError)
329 |     def test_invalid_input2(self):
330 |         tfidf(None, ['b'])
331 | 
332 |     @raises(TypeError)
333 |     def test_invalid_input3(self):
334 |         tfidf(None, None)
335 | 
336 | 
337 | # ---------------------- bag based similarity measures ----------------------
338 | # class CosineTestCases(unittest.TestCase):
339 | #     def test_valid_input(self):
340 | #         NONQ_FROM = 'The quick brown fox jumped over the lazy dog.'
341 | #         NONQ_TO = 'That brown dog jumped over the fox.'
342 | #         self.assertEqual(cosine([], []), 1)  # both simmetrics and abydos return 1 here
343 | #         self.assertEqual(cosine(['the', 'quick'], []), 0)
344 | #         self.assertEqual(cosine([], ['the', 'quick']), 0)
345 | #         self.assertAlmostEqual(cosine(whitespace(NONQ_TO), whitespace(NONQ_FROM)),
346 | #                                4 / math.sqrt(9 * 7))
347 | #
348 | #     @raises(TypeError)
349 | #     def test_invalid_input1(self):
350 | #         cosine(['a'], None)
351 | #     @raises(TypeError)
352 | #     def test_invalid_input2(self):
353 | #         cosine(None, ['b'])
354 | #     @raises(TypeError)
355 | #     def test_invalid_input3(self):
356 | #         cosine(None, None)
357 | 
358 | 
359 | # ---------------------- hybrid similarity measure ----------------------
360 | 
361 | class SoftTfidfTestCases(unittest.TestCase):
362 |     def test_valid_input(self):
363 |         self.assertEqual(soft_tfidf(['a', 'b', 'a'], ['a', 'c'], [['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=jaro,
364 |                                     threshold=0.8), 0.17541160386140586)
365 |         self.assertEqual(soft_tfidf(['a', 'b', 'a'], ['a'], [['a', 'b', 'a'], ['a', 'c'], ['a']],
366 |                                     threshold=0.9), 0.5547001962252291)
367 |         self.assertEqual(soft_tfidf(['a', 'b', 'a'], ['a'], [['x', 'y'], ['w'], ['q']]), 0.0)
368 |         self.assertEqual(soft_tfidf(['aa', 'bb', 'a'], ['ab', 'ba'], sim_func=affine, threshold=0.6),
369 |                          0.81649658092772592)
370 | 
371 |     @raises(TypeError)
372 |     def test_invalid_input1(self):
373 |         soft_tfidf(1, 1)
374 | 
375 |     @raises(TypeError)
376 |     def test_invalid_input4(self):
377 |         soft_tfidf(['a'], None)
378 | 
379 |     @raises(TypeError)
380 |     def test_invalid_input2(self):
381 |         soft_tfidf(None, ['b'])
382 | 
383 |     @raises(TypeError)
384 |     def test_invalid_input3(self):
385 |         soft_tfidf(None, None)
386 | 
387 | 
388 | class MongeElkanTestCases(unittest.TestCase):
389 |     def test_valid_input(self):
390 |         self.assertEqual(monge_elkan([''], ['']), 1.0)  # boundary case: identical inputs score 1.0
391 | 
392 |         self.assertEqual(monge_elkan([''], ['a']), 0.0)
393 |         self.assertEqual(monge_elkan(['a'], ['a']), 1.0)
394 | 
395 |         self.assertEqual(monge_elkan(['Niall'], ['Neal']), 0.8049999999999999)
396 |         self.assertEqual(monge_elkan(['Niall'], ['Njall']), 0.88)
397 |         self.assertEqual(monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'],
398 |                                      ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']), 0.8364448051948052)
399 |         self.assertEqual(
400 |             monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'],
401 |                         ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'],
402 |                         sim_func=needleman_wunsch), 2.0)
403 |         self.assertEqual(
404 |             monge_elkan(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'],
405 |                         ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'],
406 |                         sim_func=affine), 2.25)
407 |         self.assertEqual(monge_elkan(['Niall'], ['Niel']), 0.8266666666666667)
408 |         self.assertEqual(monge_elkan(['Niall'], ['Nigel']), 0.7866666666666667)
409 | 
410 |     @raises(TypeError)
411 |     def test_invalid_input1(self):
412 |         monge_elkan(1, 1)
413 | 
414 |     @raises(TypeError)
415 |     def test_invalid_input4(self):
416 |         monge_elkan(['a'], None)
417 | 
418 |     @raises(TypeError)
419 |     def test_invalid_input2(self):
420 |         monge_elkan(None, ['b'])
421 | 
422 |     @raises(TypeError)
423 |     def test_invalid_input3(self):
424 |         monge_elkan(None, None)
425 | 
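426 | 
427 | # Standard unittest entry point: nose discovers these test cases on its own,
428 | # but this also lets the module be run directly with `python test_simfunctions.py`.
429 | if __name__ == '__main__':
430 |     unittest.main()
431 | 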
--------------------------------------------------------------------------------
/py_stringmatching/tests/test_tokenizers.py:
--------------------------------------------------------------------------------
1 | from __future__ import unicode_literals
2 | 
3 | import unittest
4 | from nose.tools import raises
5 | 
6 | from py_stringmatching.tokenizers import qgram, delimiter, whitespace
7 | 
8 | 
9 | class QgramTestCases(unittest.TestCase):
10 |     def test_qgrams_valid(self):
11 |         self.assertEqual(qgram(''), [])
12 |         self.assertEqual(qgram('a'), [])
13 |         self.assertEqual(qgram('aa'), ['aa'])
14 |         self.assertEqual(qgram('database'), ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se'])
15 |         self.assertEqual(qgram('d', 1), ['d'])
16 |         self.assertEqual(qgram('database', 3), ['dat', 'ata', 'tab', 'aba', 'bas', 'ase'])
17 | 
18 |     @raises(TypeError)
19 |     def test_qgrams_none(self):
20 |         qgram(None)
21 | 
22 | 
23 | class DelimiterTestCases(unittest.TestCase):
24 |     def test_delimiter_valid(self):
25 |         self.assertEqual(delimiter('data science'), ['data', 'science'])
26 |         self.assertEqual(delimiter('data,science', ','), ['data', 'science'])
27 |         self.assertEqual(delimiter('data science', ','), ['data science'])
28 |         self.assertEqual(delimiter('data$#$science', '$#$'), ['data', 'science'])
29 | 
30 |     def test_delimiter_invalid1(self):
31 |         self.assertEqual(delimiter('data science', None), ['data', 'science'])
32 | 
33 |     @raises(TypeError)
34 |     def test_delimiter_invalid2(self):
35 |         delimiter('data science', 10)
36 | 
37 |     @raises(TypeError)
38 |     def test_delimiter_invalid3(self):
39 |         delimiter(None)
40 | 
41 | 
42 | class WhiteSpaceTestCases(unittest.TestCase):
43 |     def test_whitespace_valid(self):
44 |         self.assertEqual(whitespace('data science'), ['data', 'science'])
45 |         self.assertEqual(whitespace('data  science'), ['data', 'science'])  # two spaces
46 |         self.assertEqual(whitespace('data   science'), ['data', 'science'])  # three spaces
47 |         self.assertEqual(whitespace('data\tscience'), ['data', 'science'])
48 |         self.assertEqual(whitespace('data\nscience'), ['data', 'science'])
49 | 
50 |     @raises(TypeError)
51 |     def test_whitespace_invalid(self):
52 |         whitespace(None)
53 | 
--------------------------------------------------------------------------------
/py_stringmatching/tokenizers.py:
--------------------------------------------------------------------------------
1 | from py_stringmatching import utils
2 | from .compat import _range
3 | 
4 | 
5 | # Tokenizers for py_stringmatching: qgram, delimiter and whitespace.
6 | 
7 | def qgram(input_string, qval=2):
8 |     """
9 |     Tokenizes input string into q-grams.
10 | 
11 |     A q-gram is a contiguous sequence of q characters from the input string. Q-grams are also
12 |     known as n-grams and k-grams.
13 | 
14 |     Args:
15 |         input_string (str): Input string
16 | 
17 |         qval (int): Q-gram length (defaults to 2)
18 | 
19 |     Returns:
20 |         Token list (list)
21 | 
22 |     Raises:
23 |         TypeError: If the input is not a string
24 | 
25 |     Examples:
26 |         >>> qgram('database')
27 |         ['da', 'at', 'ta', 'ab', 'ba', 'as', 'se']
28 |         >>> qgram('a')
29 |         []
30 |         >>> qgram('database', 3)
31 |         ['dat', 'ata', 'tab', 'aba', 'bas', 'ase']
32 | 
33 | 
34 |     """
35 |     utils.tok_check_for_none(input_string)
36 |     utils.tok_check_for_string_input(input_string)
37 | 
38 |     qgram_list = []
39 | 
40 |     if len(input_string) < qval or qval < 1:
41 |         return qgram_list
42 | 
43 |     qgram_list = [input_string[i:i + qval] for i in _range(len(input_string) - (qval - 1))]
44 |     return qgram_list
45 | 
46 | 
47 | def delimiter(input_string, delim_str=' '):
48 |     """
49 |     Tokenizes input string based on the given delimiter.
50 | 
51 |     Args:
52 |         input_string (str): Input string
53 | 
54 |         delim_str (str): Delimiter string (defaults to a single space)
55 | 
56 | 
57 |     Returns:
58 |         Token list (list)
59 | 
60 |     Raises:
61 |         TypeError: If the input is not a string
62 | 
63 |     Examples:
64 |         >>> delimiter('data science')
65 |         ['data', 'science']
66 |         >>> delimiter('data$#$science', '$#$')
67 |         ['data', 'science']
68 |         >>> delimiter('data science', ',')
69 |         ['data science']
70 | 
71 |     """
72 |     utils.tok_check_for_none(input_string)
73 |     utils.tok_check_for_string_input(input_string)
74 | 
75 |     return input_string.split(delim_str)
76 | 
77 | 
78 | def whitespace(input_string):
79 |     """
80 |     Tokenizes input string based on white space.
81 | 
82 |     Args:
83 |         input_string (str): Input string
84 | 
85 |     Returns:
86 |         Token list (list)
87 | 
88 |     Raises:
89 |         TypeError: If the input is not a string
90 | 
91 |     Examples:
92 |         >>> whitespace('data science')
93 |         ['data', 'science']
94 |         >>> whitespace('data  science')
95 |         ['data', 'science']
96 |         >>> whitespace('data\tscience')
97 |         ['data', 'science']
98 | 
99 |     """
100 |     utils.tok_check_for_none(input_string)
101 |     utils.tok_check_for_string_input(input_string)
102 | 
103 |     return input_string.split()
104 | 
--------------------------------------------------------------------------------
/py_stringmatching/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Input-validation helpers for the similarity functions and tokenizers. These
3 | checks live in their own module because their implementation may change
4 | later, depending on how we decide to handle missing values.
5 | """
6 | 
7 | import functools
8 | 
9 | import six
10 | 
11 | 
12 | def _sim_check_for_list_or_set_inputs(func):
13 |     @functools.wraps(func)
14 |     def decorator(*args, **kwargs):
15 |         if not isinstance(args[0], list):
16 |             if not isinstance(args[0], set):
17 |                 raise TypeError('First argument is expected to be a python list or set')
18 |         if not isinstance(args[1], list):
19 |             if not isinstance(args[1], set):
20 |                 raise TypeError('Second argument is expected to be a python list or set')
21 |         return func(*args, **kwargs)
22 | 
23 |     return decorator
24 | 
25 | 
26 | def _sim_check_for_string_inputs(func):
27 |     @functools.wraps(func)
28 |     def decorator(*args, **kwargs):
29 |         if not isinstance(args[0], six.string_types):
30 |             raise TypeError('First argument is expected to be a string')
31 |         if not isinstance(args[1], six.string_types):
32 |             raise TypeError('Second argument is expected to be a string')
33 |         return func(*args, **kwargs)
34 | 
35 |     return decorator
36 | 
37 | 
38 | def _sim_check_for_same_len(func):
39 |     @functools.wraps(func)
40 |     def decorator(*args, **kwargs):
41 |         if args[0] is None:
42 |             raise TypeError("string1 is None")
43 |         if args[1] is None:
44 |             raise TypeError("string2 is None")
45 |         if len(args[0]) != len(args[1]):
46 |             raise ValueError("Undefined for sequences of unequal length")
47 |         return func(*args, **kwargs)
48 | 
49 |     return decorator
50 | 
51 | 
52 | def _sim_check_for_exact_match(func):
53 |     @functools.wraps(func)
54 |     def decorator(*args, **kwargs):
55 |         if args[0] == args[1]:
56 |             return 1.0
57 |         return func(*args, **kwargs)
58 | 
59 |     return decorator
60 | 
61 | 
62 | def _sim_check_for_empty(func):
63 |     @functools.wraps(func)
64 |     def decorator(*args, **kwargs):
65 |         if len(args[0]) == 0 or len(args[1]) == 0:
66 |             return 0
67 |         return func(*args, **kwargs)
68 | 
69 |     return decorator
70 | 
71 | 
72 | def _sim_check_for_none(func):
73 |     @functools.wraps(func)
74 |     def decorator(*args, **kwargs):
75 |         if args[0] is None:
76 |             raise TypeError("string1 is None")
77 |         if args[1] is None:
78 |             raise TypeError("string2 is None")
79 |         return func(*args, **kwargs)
80 | 
81 |     return decorator
82 | 
83 | 
84 | def _tok_check_for_none(func):
85 |     @functools.wraps(func)
86 |     def decorator(*args, **kwargs):
87 |         empty_list = []
88 |         if args[0] is None:
89 |             return empty_list
90 |         return func(*args, **kwargs)
91 | 
92 |     return decorator
93 | 
94 | 
95 | def _tok_check_for_string_input(func):
96 |     @functools.wraps(func)
97 |     def decorator(*args, **kwargs):
98 |         if not isinstance(args[0], six.string_types):
99 |             raise TypeError('Input is expected to be a string')
100 |         return func(*args, **kwargs)
101 | 
102 |     return decorator
103 | 
104 | 
105 | def sim_check_for_none(*args):
106 |     if len(args) > 0 and args[0] is None:
107 |         raise TypeError("First argument cannot be None")
108 |     if len(args) > 1 and args[1] is None:
109 |         raise TypeError("Second argument cannot be None")
110 | 
111 | 
112 | def sim_check_for_empty(*args):
113 |     if len(args[0]) == 0 or len(args[1]) == 0:
114 |         return True
115 | 
116 | 
117 | def sim_check_for_same_len(*args):
118 |     if len(args[0]) != len(args[1]):
119 |         raise ValueError("Undefined for sequences of unequal length")
120 | 
121 | 
122 | def sim_check_for_string_inputs(*args):
123 |     if not isinstance(args[0], six.string_types):
124 |         raise TypeError('First argument is expected to be a string')
125 |     if not isinstance(args[1], six.string_types):
126 |         raise TypeError('Second argument is expected to be a string')
127 | 
128 | 
129 | def sim_check_for_list_or_set_inputs(*args):
130 |     if not isinstance(args[0], list):
131 |         if not isinstance(args[0], set):
132 |             raise TypeError('First argument is expected to be a python list or set')
133 |     if not isinstance(args[1], list):
134 |         if not isinstance(args[1], set):
135 |             raise TypeError('Second argument is expected to be a python list or set')
136 | 
137 | 
138 | def sim_check_for_exact_match(*args):
139 |     if args[0] == args[1]:
140 |         return True
141 | 
142 | 
143 | def tok_check_for_string_input(*args):
144 |     for i in range(len(args)):
145 |         if not isinstance(args[i], six.string_types):
146 |             raise TypeError('Input is expected to be a string')
147 | 
148 | 
149 | def tok_check_for_none(*args):
150 |     if args[0] is None:
151 |         raise TypeError("First argument cannot be None")
152 | 
153 | 
154 | class Similarity(object):
155 |     def __init__(self, string1, string2, score):
156 |         self.first_string = string1
157 |         self.second_string = string2
158 |         self.similarity_score = score
159 | 
160 | # # check for NaNs
161 | # def check_strings_for_nulls(func):
162 | #     @functools.wraps(func)
163 | #     def decorator(*args, **kwargs):
164 | #         if np.isnan(args[0]):
165 | #             return np.NaN
166 | #         if np.isnan(args[1]):
167 | #             return np.NaN
168 | #         return func(*args, **kwargs)
169 | #     return decorator
170 | #
171 | # # check for nulls in tokens
172 | # def check_tokens_for_nulls(func):
173 | #     @functools.wraps(func)
174 | #     def decorator(*args, **kwargs):
175 | #         tmp_args0 = args[0]
176 | #         if not isinstance(tmp_args0, list):
177 | #             tmp_args0 = [tmp_args0]
178 | #         if any(np.isnan(tmp_args0)):
179 | #             return np.NaN
180 | #         tmp_args1 = args[1]
181 | #         if not isinstance(tmp_args1, list):
182 | #             tmp_args1 = [tmp_args1]
183 | #         if any(np.isnan(tmp_args1)):
184 | #             return np.NaN
185 | #         return func(*args, **kwargs)
186 | #     return decorator
187 | 
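188 | 
189 | # A minimal sketch (illustrative only, not used by the package): how a
190 | # set-based similarity function such as overlap coefficient is expected to
191 | # compose the plain checks above. The name `_example_overlap` is hypothetical.
192 | # def _example_overlap(set1, set2):
193 | #     sim_check_for_none(set1, set2)
194 | #     sim_check_for_list_or_set_inputs(set1, set2)
195 | #     if sim_check_for_exact_match(set1, set2):
196 | #         return 1.0
197 | #     if sim_check_for_empty(set1, set2):
198 | #         return 0.0
199 | #     set1, set2 = set(set1), set(set2)
200 | #     return float(len(set1 & set2)) / min(len(set1), len(set2))
201 | 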
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.7.0
2 | six
3 | python-Levenshtein>=0.12.0
4 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup
2 | 
3 | setup(
4 |     name='py_stringmatching',
5 |     version='0.1',
6 |     description='Python library for string matching.',
7 |     long_description="""
8 | String matching is an important problem in many settings, such as data integration and
9 | natural language processing. This package aims to implement the most commonly used
10 | string matching measures.
11 | """,
12 |     url='http://github.com/kvpradap/py_stringmatching',
13 |     author='Pradap Konda',
14 |     author_email='pradap@cs.wisc.edu',
15 |     license='MIT',
16 |     packages=['py_stringmatching'],
17 |     install_requires=[
18 |         'numpy >= 1.7.0',
19 |         'six',
20 |         'python-Levenshtein >= 0.12.0'
21 |     ],
22 |     include_package_data=True,
23 |     zip_safe=False
24 | )
25 | 
--------------------------------------------------------------------------------
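A minimal usage sketch (illustrative; the `simfunctions` import path is assumed
from the test modules above, and the expected values come from the tests):

    >>> from py_stringmatching.simfunctions import levenshtein
    >>> levenshtein('example', 'samples')
    3
    >>> from py_stringmatching.tokenizers import whitespace
    >>> whitespace('data science')
    ['data', 'science']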