├── .coveragerc ├── .github └── workflows │ ├── pip-test.yml │ └── testing.yml ├── .gitignore ├── CHANGES.txt ├── LICENSE ├── LICENSES ├── NUMPY_LICENSE └── SIX_LICENSE ├── MANIFEST.in ├── README.rst ├── benchmarks ├── __init__.py ├── datasets │ ├── long_strings.csv │ ├── medium_strings.csv │ └── short_strings.csv └── run_benchmark.py ├── build_tools ├── appveyor │ ├── install.ps1 │ ├── rm_rf.py │ └── run_with_env.cmd ├── cythonize.py ├── move-conda-package.py └── requirements_dev.txt ├── docs ├── Affine.rst ├── AlphabeticTokenizer.rst ├── AlphanumericTokenizer.rst ├── BagDistance.rst ├── Benchmark.rst ├── Contributing.rst ├── Cosine.rst ├── DelimiterTokenizer.rst ├── Dice.rst ├── Editex.rst ├── GeneralizedJaccard.rst ├── HammingDistance.rst ├── Installation.rst ├── Jaccard.rst ├── Jaro.rst ├── JaroWinkler.rst ├── Levenshtein.rst ├── Makefile ├── MongeElkan.rst ├── NeedlemanWunsch.rst ├── OverlapCoefficient.rst ├── PartialRatio.rst ├── PartialTokenSort.rst ├── QgramTokenizer.rst ├── Ratio.rst ├── SimilarityMeasure.rst ├── SmithWaterman.rst ├── SoftTfIdf.rst ├── Soundex.rst ├── TfIdf.rst ├── TokenSort.rst ├── Tokenizer.rst ├── Tutorial.rst ├── TverskyIndex.rst ├── WhatIsNew.rst ├── WhitespaceTokenizer.rst ├── conf.py ├── index.rst └── make.bat ├── py_stringmatching ├── __init__.py ├── similarity_measure │ ├── __init__.py │ ├── affine.py │ ├── bag_distance.py │ ├── cosine.py │ ├── cython │ │ ├── __init__.py │ │ ├── cython_affine.pyx │ │ ├── cython_jaro.pyx │ │ ├── cython_jaro_winkler.pyx │ │ ├── cython_levenshtein.pyx │ │ ├── cython_needleman_wunsch.pyx │ │ ├── cython_smith_waterman.pyx │ │ └── cython_utils.pyx │ ├── dice.py │ ├── editex.py │ ├── generalized_jaccard.py │ ├── hamming_distance.py │ ├── hybrid_similarity_measure.py │ ├── jaccard.py │ ├── jaro.py │ ├── jaro_winkler.py │ ├── levenshtein.py │ ├── monge_elkan.py │ ├── needleman_wunsch.py │ ├── overlap_coefficient.py │ ├── partial_ratio.py │ ├── partial_token_sort.py │ ├── phonetic_similarity_measure.py │ ├── ratio.py │ ├── sequence_similarity_measure.py │ ├── similarity_measure.py │ ├── smith_waterman.py │ ├── soft_tfidf.py │ ├── soundex.py │ ├── tfidf.py │ ├── token_similarity_measure.py │ ├── token_sort.py │ └── tversky_index.py ├── tests │ ├── __init__.py │ ├── test_sim_Soundex.py │ ├── test_simfunctions.py │ ├── test_tokenizers.py │ └── utils.py ├── tokenizer │ ├── __init__.py │ ├── alphabetic_tokenizer.py │ ├── alphanumeric_tokenizer.py │ ├── definition_tokenizer.py │ ├── delimiter_tokenizer.py │ ├── qgram_tokenizer.py │ ├── tokenizer.py │ └── whitespace_tokenizer.py └── utils.py └── setup.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = py_stringmatching 4 | include = */py_stringmatching/* 5 | omit = 6 | */tests/* 7 | */benchmarks/* 8 | *__init__* 9 | */python?.?/* 10 | -------------------------------------------------------------------------------- /.github/workflows/pip-test.yml: -------------------------------------------------------------------------------- 1 | # Testing on linux, windows, macos, for python versions 3.7, 3.8, 3.9, 3.10, 3.11, 3.12 2 | 3 | name: Test pip install 4 | 5 | on: 6 | - push 7 | - pull_request 8 | 9 | jobs: 10 | build: 11 | 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] 16 | os: ["ubuntu-latest", "windows-latest", "macos-latest"] 17 | runs-on: ${{ matrix.os }} 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | 21 | steps: 22 | - 
uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Version check 28 | run: python --version 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install "Cython>=0.29.23" "coveralls" 33 | - name: Install package 34 | run: | 35 | python setup.py sdist 36 | pip install dist/py-stringmatching-0.4.6.tar.gz 37 | -------------------------------------------------------------------------------- /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | # Testing on linux, windows, macos, for python versions 3.7, 3.8, 3.9, 3.10, 3.11, 3.12 2 | 3 | name: Unit testing 4 | 5 | on: 6 | - push 7 | - pull_request 8 | 9 | jobs: 10 | build: 11 | 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] 16 | os: ["ubuntu-latest", "windows-latest", "macos-latest"] 17 | runs-on: ${{ matrix.os }} 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Version check 28 | run: python --version 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install "numpy<2.0" "Cython>=0.29.23" "coveralls" 33 | - name: Install package 34 | run: python setup.py build_ext --inplace 35 | - name: Run tests 36 | run: | 37 | python -m unittest -v 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.c 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | coverage.xml 46 | *,cover 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | # temp dir 62 | scratch/ 63 | 64 | # idea files 65 | .idea/ 66 | # Created by .ignore support plugin (hsz.mobi) 67 | 68 | # Performance Testing # 69 | # ####################### 70 | html/ 71 | results/ 72 | 73 | # Project specific 74 | cythonize.dat 75 | 76 | garage/ 77 | cover/ 78 | 79 | # macOS 80 | .DS_Store 81 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | v0.4.6 - 7/5/2024 2 | * Limited Numpy to <2.0 in setup.py, due to compatibility issues 3 | * Added preliminary testing of pip install to Github Actions workflow 4 | 5 | v0.4.5 - 1/26/2024 6 | * Discontinued usage of cythonize.py during setup due to Python 3.12 compatibility issues 7 | 8 | v0.4.4 - 1/26/2024 9 | * Dropped support for Python 2 10 | * Added support for Python 3.12 11 | * Adjusted setuptools.setup project name to match name on PyPI 12 | 13 | v0.4.3 - 2/8/2023 14 | * Dropped support for Python 3.6. 15 | * Added support for Python 3.10 and 3.11. 16 | * Replaced aliases removed from Numpy 1.24. 17 | * Switched from Nose to vanilla Unittest. 18 | * Replaced Travis and Appveyor CI testing with Github Actions. 19 | 20 | v0.4.2 - 10/17/2020 21 | * Bug fix: Made PartialRatio importable from py_stringmatching. 22 | * Dropped support for Python 3.4. 23 | * This is the last version of py_stringmatching that will support Python 2 and Python 3.5. 24 | 25 | v0.4.1 - 02/22/2019 26 | * Cython version was updated. The package is now built with updated Cython version >= 0.27.3. 27 | * Added support for Python 3.7 version and dropped Testing support for Python 3.3 version. 28 | 29 | v0.4.0 - 07/18/2017 30 | * Rewritten five similarity measures in Cython: Affine, Jaro, Jaro Winkler, Needleman Wunsch, and Smith Waterman. 31 | * Added benchmark scripts to measure the performance of similarity measures. 32 | 33 | v0.3.0 - 05/29/2017 34 | * Added nine new string similarity measures - Bag Distance, Editex, 35 | Generalized Jaccard, Partial Ratio, Partial Token Sort, Ratio, 36 | Soundex, Token Sort, and Tversky Index. 37 | 38 | v0.2.1 - 07/14/2016 39 | * Remove explicit installation of numpy using pip in setup. Add numpy in setup_requires and compile extensions by including numpy install path. 40 | 41 | v0.2.0 - 07/06/2016 42 | * Qgram tokenizers have been modified to take a flag called "padding". If this flag is True (the default), then a prefix and a suffix will be added to the input string before tokenizing (see the Tutorial for a reason for this). 43 | * Version 0.1.0 does not handle strings in unicode correctly. Specifically, if an input string contains non-ascii characters, a string similarity measure may interpret the string incorrectly and thus compute an incorrect similarity score. In this version we have fixed the string similarity measures. Specifically, we convert the input strings into unicode before computing similarity measures. NOTE: the tokenizers are still not yet unicode-aware. 
44 | * In Version 0.1.0, the flag "dampen" for TF/IDF similarity measure has the default value of False. In this version we have modified it to have the default value of True, which is the more common value for this flag in practice. 45 | 46 | v0.1.0 - 06/14/2016 47 | * Initial release. 48 | * Contains 5 tokenizers - Alphabetic tokenizer, Alphanumeric tokenizer, Delimiter tokenizer, Qgram tokenizer and 49 | Whitespace tokenizer. 50 | * Contains 14 similarity measures - Affine, Cosine, Dice, Hamming distance, Jaccard, Jaro, Jaro-Winkler, 51 | Levenshtein, Monge-Elkan, Needleman-Wunsch, Overlap coefficient, Smith-Waterman, Soft TF-IDF, and TF-IDF. 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, anhaidgroup 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of py_stringmatching nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /LICENSES/NUMPY_LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2005-2016, NumPy Developers. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of the NumPy Developers nor the names of any 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /LICENSES/SIX_LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010-2016 Benjamin Peterson 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so, 8 | subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include CHANGES.txt 3 | include requirements.txt 4 | include LICENSE 5 | recursive-include LICENSES * 6 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | py_stringmatching 2 | ================= 3 | 4 | This project seeks to build a Python software package that consists of a comprehensive and scalable set of string tokenizers (such as alphabetical tokenizers, whitespace tokenizers) and string similarity measures (such as edit distance, Jaccard, TF/IDF). The package is free, open-source, and BSD-licensed. 
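For example, here is a minimal sketch of computing a Jaccard score over whitespace tokens (the classes and calls below follow the package's documented API; see the Tutorial linked under "Important links" for a full walkthrough)::

    >>> import py_stringmatching as sm
    >>> jac = sm.Jaccard()
    >>> ws = sm.WhitespaceTokenizer(return_set=True)
    >>> jac.get_sim_score(ws.tokenize('data science'), ws.tokenize('data integration'))
    0.3333333333333333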
5 | 6 | Important links 7 | =============== 8 | 9 | * Project Homepage: https://sites.google.com/site/anhaidgroup/projects/magellan/py_stringmatching 10 | * Code repository: https://github.com/anhaidgroup/py_stringmatching 11 | * User Manual: https://anhaidgroup.github.io/py_stringmatching/v0.4.2/index.html 12 | * Tutorial: https://anhaidgroup.github.io/py_stringmatching/v0.4.2/Tutorial.html 13 | * How to Contribute: https://anhaidgroup.github.io/py_stringmatching/v0.4.2/Contributing.html 14 | * Developer Manual: http://pages.cs.wisc.edu/~anhai/py_stringmatching/v0.2.0/dev-manual-v0.2.0.pdf 15 | * Issue Tracker: https://github.com/anhaidgroup/py_stringmatching/issues 16 | * Mailing List: https://groups.google.com/forum/#!forum/py_stringmatching 17 | 18 | Dependencies 19 | ============ 20 | 21 | py_stringmatching has been tested on each Python version between 3.7 and 3.12, inclusive. 22 | 23 | The required dependencies to build the package are NumPy 1.7.0 or higher, but lower than 2.0, 24 | and a C or C++ compiler. For the development version, you will also need Cython. 25 | 26 | Platforms 27 | ========= 28 | 29 | py_stringmatching has been tested on Linux, OS X and Windows. At this time we have only tested on x86 architecture. 30 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_stringmatching/658860f0cad65e1e3da07e7039fe9764d4822272/benchmarks/__init__.py -------------------------------------------------------------------------------- /benchmarks/datasets/long_strings.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_stringmatching/658860f0cad65e1e3da07e7039fe9764d4822272/benchmarks/datasets/long_strings.csv -------------------------------------------------------------------------------- /benchmarks/datasets/medium_strings.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_stringmatching/658860f0cad65e1e3da07e7039fe9764d4822272/benchmarks/datasets/medium_strings.csv -------------------------------------------------------------------------------- /benchmarks/datasets/short_strings.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_stringmatching/658860f0cad65e1e3da07e7039fe9764d4822272/benchmarks/datasets/short_strings.csv -------------------------------------------------------------------------------- /benchmarks/run_benchmark.py: -------------------------------------------------------------------------------- 1 | 2 | from math import ceil, sqrt 3 | import time 4 | 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def run_benchmark(short_dataset_path, medium_dataset_path, long_dataset_path, 10 | data_size, sim_measure, tokenizer = None, num_repeat = 1, 11 | random_seed = 0, output_file = None, encoding = 'latin-1'): 12 | """Run benchmark for 9 configurations (short-short, short-medium, 13 | short-long, medium-short, medium-medium, medium-long, long-short, 14 | long-medium, long-long) for the provided similarity measure. 15 | 16 | Specifically, this method will take in 3 files as input each containing 17 | one column of strings. 
Next, it will sample the input files based on the 18 | provided data_size and then runs benchmark for different configurations for 19 | the provided similarity measure. Finally, it returns a dataframe containing 20 | the benchmark results. 21 | 22 | Args: 23 | short_dataset_path (string): Path to the dataset containing short strings. 24 | medium_dataset_path (string): Path to the dataset containing medium strings. 25 | long_dataset_path (string): Path to the dataset containing long strings. 26 | data_size (int): Number of string pairs to be benchmarked. 27 | sim_measure (function): Similarity function to be benchmarked. 28 | tokenizer (function): Tokenizer to be used (in case of token-based similarity measures). Defaults to None. 29 | num_repeat (int): Number of times to run each configuration. Defaults to 1. 30 | random_seed (int): Random seed to be used for sampling. Defaults to 0. 31 | output_file (string): Output path to save the benchmark results. Defaults to None. 32 | encoding (string): Encoding of the input datasets. Defaults to latin-1. 33 | 34 | Returns: 35 | Benchmark results (Dataframe). 36 | 37 | Examples: 38 | >>> jac = Jaccard() 39 | >>> ws = WhitespaceTokenizer(return_set=True) 40 | >>> results = run_benchmark('datasets/short_strings.csv', 'datasets/medium_strings.csv', 'datasets/long_strings.csv', 100000, 41 | jac.get_sim_score, ws.tokenize, output_file = 'result.csv') # Benchmark results will be saved in result.csv 42 | >>> ed = Levenshtein() 43 | >>> results = run_benchmark('datasets/short_strings.csv', 'datasets/medium_strings.csv', 'datasets/long_strings.csv', 100000, 44 | ed.get_sim_score) 45 | """ 46 | 47 | # read data 48 | short_strings = pd.read_csv(short_dataset_path, encoding = encoding) 49 | medium_strings = pd.read_csv(medium_dataset_path, encoding = encoding) 50 | long_strings = pd.read_csv(long_dataset_path, encoding = encoding) 51 | 52 | short_len = len(short_strings) 53 | medium_len = len(medium_strings) 54 | long_len = len(long_strings) 55 | 56 | # compute individual table size 57 | table_size = ceil(sqrt(data_size)) 58 | 59 | # sample strings 60 | short_table = list(short_strings.sample(table_size, replace = True, 61 | random_state = random_seed).values) 62 | medium_table = list(medium_strings.sample(table_size, replace = True, 63 | random_state = random_seed).values) 64 | long_table = list(long_strings.sample(table_size, replace = True, 65 | random_state = random_seed).values) 66 | 67 | tables = [('short', short_table), ('medium', medium_table), 68 | ('long', long_table)] 69 | 70 | # run benchmark for each configuration 71 | bench_output = [] 72 | for i in range(len(tables)): 73 | for j in range(len(tables)): 74 | runtimes = profile_runtime(tables[i][1], tables[j][1], tokenizer, 75 | sim_measure, num_repeat) 76 | runtimes.append(sum(runtimes)/float(num_repeat)) 77 | runtimes.insert(0, '_'.join([tables[i][0], tables[j][0]])) 78 | bench_output.append(runtimes) 79 | 80 | header = ['run_'+str(i+1)+' (in secs)' for i in range(num_repeat)] 81 | header.append('average (in secs)') 82 | header.insert(0, 'configuration') 83 | output_table = pd.DataFrame(bench_output, columns = header) 84 | 85 | if output_file: 86 | output_table.to_csv(output_file, index = False) 87 | 88 | return output_table 89 | 90 | 91 | def profile_runtime(table_A, table_B, tokenizer, sim_measure, num_repeat): 92 | # run benchmark for one configuration 93 | runtimes = [] 94 | for i in range(num_repeat): 95 | start_time = time.time() 96 | for string1 in table_A: 97 | for string2 in table_B: 98 | 
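# token-based measures take tokenized input, so tokenize each
# string pair on the fly; sequence-based measures take the raw strings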
if tokenizer: 99 | score = sim_measure(tokenizer(string1[0]), tokenizer(string2[0])) 100 | else: 101 | score = sim_measure(string1[0], string2[0]) 102 | end_time = time.time() 103 | runtimes.append(end_time-start_time) 104 | return runtimes 105 | 106 | 107 | def plot_benchmark(bench_output, output_file, 108 | conf_attr = 'configuration', time_attr = 'average (in secs)'): 109 | # Generate plot from benchmark output 110 | x_range = list(range(len(bench_output))) 111 | plt.xticks(x_range, list(bench_output[conf_attr])) 112 | plt.plot(x_range, bench_output[time_attr], marker='o') 113 | plt.xlabel('Configuration') 114 | plt.ylabel('Average time (in secs)') 115 | plt.title('Benchmark plot') 116 | plt.savefig(output_file) 117 | print('Plot generated successfully.') 118 | 119 | -------------------------------------------------------------------------------- /build_tools/appveyor/install.ps1: -------------------------------------------------------------------------------- 1 | # Sample script to install Miniconda under Windows 2 | # Authors: Olivier Grisel, Jonathan Helmus and Kyle Kastner, Robert McGibbon 3 | # License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 4 | 5 | $MINICONDA_URL = "http://repo.continuum.io/miniconda/" 6 | 7 | 8 | function DownloadMiniconda ($python_version, $platform_suffix) { 9 | $webclient = New-Object System.Net.WebClient 10 | $filename = "Miniconda3-latest-Windows-" + $platform_suffix + ".exe" 11 | # $filename = "Miniconda3-3.8.3-Windows-" + $platform_suffix + ".exe" 12 | $url = $MINICONDA_URL + $filename 13 | 14 | $basedir = $pwd.Path + "\" 15 | $filepath = $basedir + $filename 16 | if (Test-Path $filename) { 17 | Write-Host "Reusing" $filepath 18 | return $filepath 19 | } 20 | 21 | # Download and retry up to 3 times in case of network transient errors. 22 | Write-Host "Downloading" $filename "from" $url 23 | $retry_attempts = 2 24 | for($i=0; $i -lt $retry_attempts; $i++){ 25 | try { 26 | $webclient.DownloadFile($url, $filepath) 27 | break 28 | } 29 | Catch [Exception]{ 30 | Start-Sleep 1 31 | } 32 | } 33 | if (Test-Path $filepath) { 34 | Write-Host "File saved at" $filepath 35 | } else { 36 | # Retry once to get the error message if any at the last try 37 | $webclient.DownloadFile($url, $filepath) 38 | } 39 | return $filepath 40 | } 41 | 42 | 43 | function InstallMiniconda ($python_version, $architecture, $python_home) { 44 | Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home 45 | if (Test-Path $python_home) { 46 | Write-Host $python_home "already exists, skipping." 
47 | return $false 48 | } 49 | if ($architecture -match "32") { 50 | $platform_suffix = "x86" 51 | } else { 52 | $platform_suffix = "x86_64" 53 | } 54 | 55 | $filepath = DownloadMiniconda $python_version $platform_suffix 56 | Write-Host "Installing" $filepath "to" $python_home 57 | $install_log = $python_home + ".log" 58 | $args = "/S /D=$python_home" 59 | Write-Host $filepath $args 60 | Start-Process -FilePath $filepath -ArgumentList $args -Wait -Passthru 61 | if (Test-Path $python_home) { 62 | Write-Host "Python $python_version ($architecture) installation complete" 63 | } else { 64 | Write-Host "Failed to install Python in $python_home" 65 | Get-Content -Path $install_log 66 | Exit 1 67 | } 68 | } 69 | 70 | 71 | function InstallCondaPackages ($python_home, $spec) { 72 | $conda_path = $python_home + "\Scripts\conda.exe" 73 | $args = "install --yes " + $spec 74 | Write-Host ("conda " + $args) 75 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 76 | } 77 | 78 | function UpdateConda ($python_home) { 79 | $conda_path = $python_home + "\Scripts\conda.exe" 80 | Write-Host "Updating conda..." 81 | $args = "update --yes conda" 82 | Write-Host $conda_path $args 83 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 84 | } 85 | 86 | 87 | function main () { 88 | InstallMiniconda $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON 89 | UpdateConda $env:PYTHON 90 | InstallCondaPackages $env:PYTHON "conda-build jinja2 anaconda-client" 91 | } 92 | 93 | main 94 | -------------------------------------------------------------------------------- /build_tools/appveyor/rm_rf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import sys 4 | import stat 5 | import shutil 6 | 7 | def remove_readonly(func, path, excinfo): 8 | os.chmod(path, stat.S_IWRITE) 9 | func(path) 10 | 11 | def main(): 12 | print(sys.executable) 13 | try: 14 | shutil.rmtree(sys.argv[1], onerror=remove_readonly) 15 | except Exception as e: 16 | print("Error") 17 | print(e) 18 | 19 | if __name__ == '__main__': 20 | main() 21 | 22 | -------------------------------------------------------------------------------- /build_tools/appveyor/run_with_env.cmd: -------------------------------------------------------------------------------- 1 | :: EXPECTED ENV VARS: PYTHON_ARCH (either x86 or x64) 2 | :: CONDA_PY (either 27, 33, 35 etc. - only major version is extracted) 3 | :: 4 | :: 5 | :: To build extensions for 64 bit Python 3, we need to configure environment 6 | :: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: 7 | :: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1) 8 | :: 9 | :: To build extensions for 64 bit Python 2, we need to configure environment 10 | :: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of: 11 | :: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0) 12 | :: 13 | :: 32 bit builds, and 64-bit builds for 3.5 and beyond, do not require specific 14 | :: environment configurations. 
15 | :: 16 | :: Note: this script needs to be run with the /E:ON and /V:ON flags for the 17 | :: cmd interpreter, at least for (SDK v7.0) 18 | :: 19 | :: More details at: 20 | :: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows 21 | :: http://stackoverflow.com/a/13751649/163740 22 | :: 23 | :: Author: Phil Elson 24 | :: Original Author: Olivier Grisel (https://github.com/ogrisel/python-appveyor-demo) 25 | :: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 26 | :: 27 | :: Notes about batch files for Python people: 28 | :: 29 | :: Quotes in values are literally part of the values: 30 | :: SET FOO="bar" 31 | :: FOO is now five characters long: " b a r " 32 | :: If you don't want quotes, don't include them on the right-hand side. 33 | :: 34 | :: The CALL lines at the end of this file look redundant, but if you move them 35 | :: outside of the IF clauses, they do not run properly in the SET_SDK_64==Y 36 | :: case, I don't know why. 37 | :: originally from https://github.com/pelson/Obvious-CI/blob/master/scripts/obvci_appveyor_python_build_env.cmd 38 | @ECHO OFF 39 | 40 | SET COMMAND_TO_RUN=%* 41 | SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows 42 | 43 | :: Extract the major and minor versions, and allow for the minor version to be 44 | :: more than 9. This requires the version number to have two dots in it. 45 | SET MAJOR_PYTHON_VERSION=%CONDA_PY:~0,1% 46 | 47 | IF "%CONDA_PY:~2,1%" == "" ( 48 | :: CONDA_PY style, such as 27, 34 etc. 49 | SET MINOR_PYTHON_VERSION=%CONDA_PY:~1,1% 50 | ) ELSE ( 51 | IF "%CONDA_PY:~3,1%" == "." ( 52 | SET MINOR_PYTHON_VERSION=%CONDA_PY:~2,1% 53 | ) ELSE ( 54 | SET MINOR_PYTHON_VERSION=%CONDA_PY:~2,2% 55 | ) 56 | ) 57 | 58 | :: Based on the Python version, determine what SDK version to use, and whether 59 | :: to set the SDK for 64-bit. 60 | IF %MAJOR_PYTHON_VERSION% == 2 ( 61 | SET WINDOWS_SDK_VERSION="v7.0" 62 | SET SET_SDK_64=Y 63 | ) ELSE ( 64 | IF %MAJOR_PYTHON_VERSION% == 3 ( 65 | SET WINDOWS_SDK_VERSION="v7.1" 66 | IF %MINOR_PYTHON_VERSION% LEQ 4 ( 67 | SET SET_SDK_64=Y 68 | ) ELSE ( 69 | SET SET_SDK_64=N 70 | ) 71 | ) ELSE ( 72 | ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" 73 | EXIT /B 1 74 | ) 75 | ) 76 | 77 | IF "%PYTHON_ARCH%"=="64" ( 78 | IF %SET_SDK_64% == Y ( 79 | ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture 80 | SET DISTUTILS_USE_SDK=1 81 | SET MSSdk=1 82 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% 83 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release 84 | ECHO Executing: %COMMAND_TO_RUN% 85 | call %COMMAND_TO_RUN% || EXIT /B 1 86 | ) ELSE ( 87 | ECHO Using default MSVC build environment for 64 bit architecture 88 | ECHO Executing: %COMMAND_TO_RUN% 89 | call %COMMAND_TO_RUN% || EXIT /B 1 90 | ) 91 | ) ELSE ( 92 | ECHO Using default MSVC build environment for 32 bit architecture 93 | ECHO Executing: %COMMAND_TO_RUN% 94 | call %COMMAND_TO_RUN% || EXIT /B 1 95 | ) 96 | -------------------------------------------------------------------------------- /build_tools/cythonize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ cythonize 3 | 4 | Cythonize pyx files into C files as needed. 5 | 6 | Usage: cythonize [root_dir] 7 | 8 | Default [root_dir] is 'py_stringmatching'. 9 | 10 | Checks pyx files to see if they have been changed relative to their 11 | corresponding C files. 
If they have, then runs cython on these files to 12 | recreate the C files. 13 | 14 | The script detects changes in the pyx/pxd files using checksums 15 | [or hashes] stored in a database file 16 | 17 | Simple script to invoke Cython on all .pyx 18 | files; while waiting for a proper build system. Uses file hashes to 19 | figure out if rebuild is needed. 20 | 21 | It is called by ./setup.py sdist so that sdist package can be installed without 22 | cython 23 | 24 | Originally written by Dag Sverre Seljebotn, and adapted from scikit-learn 25 | (BSD 3-clause) 26 | 27 | We copied it for py_stringmatching. 28 | 29 | Note: this script does not check any of the dependent C libraries; it only 30 | operates on the Cython .pyx files or their corresponding Cython header (.pxd) 31 | files. 32 | """ 33 | 34 | from __future__ import division, print_function, absolute_import 35 | 36 | import os 37 | import re 38 | import sys 39 | import hashlib 40 | import subprocess 41 | 42 | HASH_FILE = 'cythonize.dat' 43 | DEFAULT_ROOT = 'py_stringmatching' 44 | 45 | # WindowsError is not defined on unix systems 46 | try: 47 | WindowsError 48 | except NameError: 49 | WindowsError = None 50 | 51 | 52 | def cythonize(cython_file, gen_file): 53 | try: 54 | from Cython.Compiler.Version import version as cython_version 55 | from distutils.version import LooseVersion 56 | if LooseVersion(cython_version) < LooseVersion('0.21'): 57 | raise Exception('Building py_stringmatching requires Cython >= 0.21') 58 | 59 | except ImportError: 60 | pass 61 | 62 | flags = ['--fast-fail'] 63 | if gen_file.endswith('.cpp'): 64 | flags += ['--cplus'] 65 | 66 | try: 67 | try: 68 | rc = subprocess.call(['cython'] + 69 | flags + ["-o", gen_file, cython_file]) 70 | if rc != 0: 71 | raise Exception('Cythonizing %s failed' % cython_file) 72 | except OSError: 73 | # There are ways of installing Cython that don't result in a cython 74 | # executable on the path, see scipy issue gh-2397. 
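# In that case, fall back to invoking Cython as a module through the
# current Python interpreter instead of a standalone executable.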
75 | rc = subprocess.call([sys.executable, '-c', 76 | 'import sys; from Cython.Compiler.Main ' 77 | 'import setuptools_main as main;' 78 | ' sys.exit(main())'] + flags + 79 | ["-o", gen_file, cython_file]) 80 | if rc != 0: 81 | raise Exception('Cythonizing %s failed' % cython_file) 82 | except OSError: 83 | raise OSError('Cython needs to be installed') 84 | 85 | 86 | def load_hashes(filename): 87 | """Load the hashes dict from the hashfile""" 88 | # { filename : (sha1 of header if available or 'NA', 89 | # sha1 of input, 90 | # sha1 of output) } 91 | 92 | hashes = {} 93 | try: 94 | with open(filename, 'r') as cython_hash_file: 95 | for hash_record in cython_hash_file: 96 | (filename, header_hash, 97 | cython_hash, gen_file_hash) = hash_record.split() 98 | hashes[filename] = (header_hash, cython_hash, gen_file_hash) 99 | except (KeyError, ValueError, AttributeError, IOError): 100 | hashes = {} 101 | return hashes 102 | 103 | 104 | def save_hashes(hashes, filename): 105 | """Save the hashes dict to the hashfile""" 106 | with open(filename, 'w') as cython_hash_file: 107 | for key, value in hashes.items(): 108 | cython_hash_file.write("%s %s %s %s\n" 109 | % (key, value[0], value[1], value[2])) 110 | 111 | 112 | def sha1_of_file(filename): 113 | h = hashlib.sha1() 114 | with open(filename, "rb") as f: 115 | h.update(f.read()) 116 | return h.hexdigest() 117 | 118 | 119 | def clean_path(path): 120 | """Clean the path""" 121 | path = path.replace(os.sep, '/') 122 | if path.startswith('./'): 123 | path = path[2:] 124 | return path 125 | 126 | 127 | def get_hash_tuple(header_path, cython_path, gen_file_path): 128 | """Get the hashes from the given files""" 129 | 130 | header_hash = (sha1_of_file(header_path) 131 | if os.path.exists(header_path) else 'NA') 132 | from_hash = sha1_of_file(cython_path) 133 | to_hash = (sha1_of_file(gen_file_path) 134 | if os.path.exists(gen_file_path) else 'NA') 135 | 136 | return header_hash, from_hash, to_hash 137 | 138 | 139 | def cythonize_if_unchanged(path, cython_file, gen_file, hashes): 140 | full_cython_path = os.path.join(path, cython_file) 141 | full_header_path = full_cython_path.replace('.pyx', '.pxd') 142 | full_gen_file_path = os.path.join(path, gen_file) 143 | 144 | current_hash = get_hash_tuple(full_header_path, full_cython_path, 145 | full_gen_file_path) 146 | 147 | if current_hash == hashes.get(clean_path(full_cython_path)): 148 | print('%s has not changed' % full_cython_path) 149 | return 150 | 151 | print('Processing %s' % full_cython_path) 152 | cythonize(full_cython_path, full_gen_file_path) 153 | 154 | # changed target file, recompute hash 155 | current_hash = get_hash_tuple(full_header_path, full_cython_path, 156 | full_gen_file_path) 157 | 158 | # Update the hashes dict with the new hash 159 | hashes[clean_path(full_cython_path)] = current_hash 160 | 161 | 162 | def check_and_cythonize(root_dir): 163 | print(root_dir) 164 | hashes = load_hashes(HASH_FILE) 165 | 166 | for cur_dir, dirs, files in os.walk(root_dir): 167 | for filename in files: 168 | if filename.endswith('.pyx'): 169 | gen_file_ext = '.c' 170 | # Cython files with libcpp imports should be compiled to cpp 171 | with open(os.path.join(cur_dir, filename), 'rb') as f: 172 | data = f.read() 173 | m = re.search(b"libcpp", data, re.I | re.M) 174 | if m: 175 | gen_file_ext = ".cpp" 176 | cython_file = filename 177 | gen_file = filename.replace('.pyx', gen_file_ext) 178 | cythonize_if_unchanged(cur_dir, cython_file, gen_file, hashes) 179 | 180 | # Save hashes once per module. 
This prevents re-cythonizing previously
181 | # processed files when debugging broken code in a single file
182 | save_hashes(hashes, HASH_FILE)
183 | 
184 | 
185 | def main(root_dir=DEFAULT_ROOT):
186 | check_and_cythonize(root_dir)
187 | 
188 | 
189 | if __name__ == '__main__':
190 | try:
191 | root_dir_arg = sys.argv[1]
192 | except IndexError:
193 | root_dir_arg = DEFAULT_ROOT
194 | main(root_dir_arg)
195 | 
--------------------------------------------------------------------------------
/build_tools/move-conda-package.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import yaml
4 | import glob
5 | import shutil
6 | 
7 | from conda_build.config import Config  # 03/03/2017: Updated based on the changes to conda_build.config
8 | config = Config()
9 | 
10 | with open(os.path.join(sys.argv[1], 'meta.yaml')) as f:
11 |     # use safe_load: calling yaml.load without an explicit Loader is deprecated and unsafe
12 |     name = yaml.safe_load(f)['package']['name']
13 | 
14 | binary_package_glob = os.path.join(config.bldpkgs_dir, '{0}*.tar.bz2'.format(name))
15 | binary_package = glob.glob(binary_package_glob)[0]
16 | 
17 | shutil.move(binary_package, '.')
18 | 
--------------------------------------------------------------------------------
/build_tools/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.7.0
2 | Cython
3 | 
--------------------------------------------------------------------------------
/docs/Affine.rst:
--------------------------------------------------------------------------------
1 | Affine Gap
2 | --------------------------------------------------
3 | 
4 | .. autoclass:: py_stringmatching.similarity_measure.affine.Affine(gap_start=1, gap_continuation=0.5, sim_func=identity_function)
5 | :members:
6 | 
7 | 
--------------------------------------------------------------------------------
/docs/AlphabeticTokenizer.rst:
--------------------------------------------------------------------------------
1 | Alphabetic Tokenizer
2 | -------------------------------------------------------
3 | 
4 | .. automodule:: py_stringmatching.tokenizer.alphabetic_tokenizer
5 | :members:
6 | :inherited-members:
7 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__
8 | 
9 | 
--------------------------------------------------------------------------------
/docs/AlphanumericTokenizer.rst:
--------------------------------------------------------------------------------
1 | Alphanumeric Tokenizer
2 | ---------------------------------------------------------
3 | 
4 | .. automodule:: py_stringmatching.tokenizer.alphanumeric_tokenizer
5 | :members:
6 | :inherited-members:
7 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__
8 | 
9 | 
--------------------------------------------------------------------------------
/docs/BagDistance.rst:
--------------------------------------------------------------------------------
1 | Bag Distance
2 | ------------------------------------------------------------
3 | 
4 | .. 
automodule:: py_stringmatching.similarity_measure.bag_distance
5 | :members:
6 | 
7 | 
--------------------------------------------------------------------------------
/docs/Benchmark.rst:
--------------------------------------------------------------------------------
1 | Runtime Benchmark
2 | =================
3 | 
4 | This package includes a runtime benchmark (consisting of a script and several datasets) to measure the runtime performance of the similarity measures. Users can use the benchmark to judge whether a similarity measure is fast enough for their purposes, and developers can use it to guide speed-ups of the measures.
5 | 
6 | Running the Benchmark
7 | ---------------------
8 | 
9 | The user can run the benchmark as follows:
10 | 
11 | Step 1: Clone the py_stringmatching package from GitHub using the following command::
12 | 
13 | git clone https://github.com/anhaidgroup/py_stringmatching.git
14 | 
15 | Step 2: Change the working directory to py_stringmatching/benchmarks
16 | 
17 | Step 3: Run the benchmark using the following sequence of commands:
18 | 
19 | >>> import py_stringmatching as sm
20 | >>> from run_benchmark import *
21 | # create an object for the similarity measure you need to benchmark
22 | >>> jaccard = sm.Jaccard()
23 | # create a tokenizer object (in case of token-based measures)
24 | >>> ws = sm.WhitespaceTokenizer(return_set = True)
25 | # set the dataset paths
26 | >>> short_strings_path = 'datasets/short_strings.csv'
27 | >>> medium_strings_path = 'datasets/medium_strings.csv'
28 | >>> long_strings_path = 'datasets/long_strings.csv'
29 | # data size (number of string pairs) over which the benchmark should be run
30 | >>> data_size = 10000
31 | # number of times to repeat each configuration
32 | >>> num_repeat = 3
33 | # output file where the benchmark results should be written
34 | >>> output_file = 'benchmark_results.csv'
35 | # run the benchmark
36 | >>> run_benchmark(short_strings_path, medium_strings_path, long_strings_path, data_size, jaccard.get_sim_score, ws.tokenize, num_repeat = num_repeat, output_file = output_file)
37 | 
38 | The benchmark contains three datasets in the `datasets` directory: (1) short_strings.csv, (2) medium_strings.csv, and (3) long_strings.csv. Each dataset contains 5000 strings. Specifically, short_strings.csv contains strings with lengths in the range of 2-15 (avg. 10), medium_strings.csv contains strings with lengths in the range of 18-39 (avg. 25), and
39 | long_strings.csv contains strings with lengths in the range of 60-1726 (avg. 127).
40 | 
41 | The above command will run the benchmark for 9 different configurations
42 | (short-short, short-medium, short-long, medium-short, medium-medium, medium-long,
43 | long-short, long-medium, long-long) for the provided similarity measure, and
44 | write the results to the provided output file. See below for additional details.
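In addition, run_benchmark.py provides a ``plot_benchmark`` helper that charts the average runtime of each configuration from the returned dataframe (it renders the chart with matplotlib, which must therefore be installed). A minimal sketch, reusing the objects created above; the output filename here is illustrative:

>>> results = run_benchmark(short_strings_path, medium_strings_path, long_strings_path, data_size, jaccard.get_sim_score, ws.tokenize, num_repeat = num_repeat, output_file = output_file)
>>> plot_benchmark(results, 'benchmark_plot.png')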
45 | 46 | Interpreting the Results 47 | -------------------------- 48 | 49 | The benchmark results will be a CSV file containing the following information: 50 | 51 | * Configuration 52 | * Runtime (in secs) for each run of a configuration (note that each configuration is run for `num_repeat` times) 53 | * Average runtime (in secs) for each configuration 54 | 55 | An example output file will look like this:: 56 | 57 | configuration,run_1 (in secs),run_2 (in secs),run_3 (in secs),average (in secs) 58 | short_short,0.112642049789,0.112892866135,0.112852096558,0.112795670827 59 | short_medium,0.115404129028,0.115512132645,0.115454912186,0.115457057953 60 | short_long,0.194123983383,0.193922996521,0.193790912628,0.193945964177 61 | medium_short,0.11647105217,0.116579055786,0.116438865662,0.116496324539 62 | medium_medium,0.118470907211,0.118409156799,0.118496894836,0.118458986282 63 | medium_long,0.206312894821,0.206974983215,0.206708908081,0.206665595373 64 | long_short,0.205050945282,0.205410957336,0.205253124237,0.205238342285 65 | long_medium,0.217441797256,0.21806883812,0.218235015869,0.217915217082 66 | long_long,0.770321846008,0.76869893074,0.768806934357,0.769275903702 67 | -------------------------------------------------------------------------------- /docs/Cosine.rst: -------------------------------------------------------------------------------- 1 | Cosine 2 | -------------------------------------------------- 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.cosine 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/DelimiterTokenizer.rst: -------------------------------------------------------------------------------- 1 | Delimiter Tokenizer 2 | ------------------------------------------------------ 3 | 4 | .. automodule:: py_stringmatching.tokenizer.delimiter_tokenizer 5 | :members: 6 | :inherited-members: 7 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 8 | 9 | -------------------------------------------------------------------------------- /docs/Dice.rst: -------------------------------------------------------------------------------- 1 | Dice 2 | ------------------------------------------------ 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.dice 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/Editex.rst: -------------------------------------------------------------------------------- 1 | Editex 2 | ------------------------------------------------ 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.editex 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/GeneralizedJaccard.rst: -------------------------------------------------------------------------------- 1 | Generalized Jaccard 2 | --------------------------------------------------- 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.generalized_jaccard 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/HammingDistance.rst: -------------------------------------------------------------------------------- 1 | Hamming Distance 2 | ------------------------------------------------------------ 3 | 4 | .. 
automodule:: py_stringmatching.similarity_measure.hamming_distance
5 | :members:
6 | 
7 | 
--------------------------------------------------------------------------------
/docs/Installation.rst:
--------------------------------------------------------------------------------
1 | ============
2 | Installation
3 | ============
4 | 
5 | Requirements
6 | ------------
7 | * Python 3.7-3.12
8 | * C or C++ compiler (parts of the package are written in Cython for efficiency reasons, and you need a C or C++ compiler to compile these parts)
9 | 
10 | Platforms
11 | ------------
12 | py_stringmatching has been tested on Linux (Ubuntu 22.04), OS X (Monterey 12), and Windows 10.
13 | 
14 | Dependencies
15 | ------------
16 | * numpy 1.7.0 or higher
17 | 
18 | .. note::
19 | 
20 | The py_stringmatching installer will automatically install the above required packages.
21 | 
22 | C Compiler Required
23 | -------------------
24 | Before installing this package, you need to make sure that you have a C compiler installed. This is necessary because this package contains Cython files. Go `here `_ for more information about how to check whether you already have a C compiler and how to install a C compiler.
25 | 
26 | After you have confirmed that you have a C compiler installed, you are ready to install the package. There are two ways to install the py_stringmatching package: using pip or from a source distribution.
27 | 
28 | Installing Using pip
29 | --------------------
30 | The easiest way to install the package is to use pip, which will retrieve py_stringmatching from PyPI and then install it::
31 | 
32 | pip install py_stringmatching
33 | 
34 | Installing from Source Distribution
35 | -------------------------------------
36 | Step 1: Download the py_stringmatching package from `here
37 | `_.
38 | 
39 | Step 2: Unzip the package and execute the following command from the package root::
40 | 
41 | python setup.py install
42 | 
43 | .. note::
44 | 
45 | The above command will try to install py_stringmatching into the default Python directory on your machine. If you do not have installation permission for that directory, then you can install the package in your home directory as follows::
46 | 
47 | python setup.py install --user
48 | 
49 | For more information, see the StackOverflow `link
50 | `_.
51 | 
52 | .. note::
53 | 
54 | Building C files from source requires Cython version 0.29.23 or higher::
55 | 
56 | pip install Cython>=0.29.23
57 | 
58 | 
--------------------------------------------------------------------------------
/docs/Jaccard.rst:
--------------------------------------------------------------------------------
1 | Jaccard
2 | ---------------------------------------------------
3 | 
4 | .. automodule:: py_stringmatching.similarity_measure.jaccard
5 | :members:
6 | 
7 | 
--------------------------------------------------------------------------------
/docs/Jaro.rst:
--------------------------------------------------------------------------------
1 | Jaro
2 | ------------------------------------------------
3 | 
4 | .. automodule:: py_stringmatching.similarity_measure.jaro
5 | :members:
6 | 
7 | 
--------------------------------------------------------------------------------
/docs/JaroWinkler.rst:
--------------------------------------------------------------------------------
1 | Jaro Winkler
2 | --------------------------------------------------------
3 | 
4 | .. 
automodule:: py_stringmatching.similarity_measure.jaro_winkler 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/Levenshtein.rst: -------------------------------------------------------------------------------- 1 | Levenshtein 2 | ------------------------------------------------------- 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.levenshtein 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/py_stringmatching.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/py_stringmatching.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/py_stringmatching" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/py_stringmatching" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 
149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 193 | -------------------------------------------------------------------------------- /docs/MongeElkan.rst: -------------------------------------------------------------------------------- 1 | Monge Elkan 2 | ------------------------------------------------------- 3 | 4 | .. autoclass:: py_stringmatching.similarity_measure.monge_elkan.MongeElkan(sim_func=jaro_winkler_function) 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/NeedlemanWunsch.rst: -------------------------------------------------------------------------------- 1 | Needleman Wunsch 2 | ------------------------------------------------------------ 3 | 4 | .. autoclass:: py_stringmatching.similarity_measure.needleman_wunsch.NeedlemanWunsch(gap_cost=1.0, sim_func=identity_function) 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/OverlapCoefficient.rst: -------------------------------------------------------------------------------- 1 | Overlap Coefficient 2 | --------------------------------------------------------------- 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.overlap_coefficient 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/PartialRatio.rst: -------------------------------------------------------------------------------- 1 | Partial Ratio 2 | ------------------------------------------------------------ 3 | 4 | .. 
automodule:: py_stringmatching.similarity_measure.partial_ratio 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/PartialTokenSort.rst: -------------------------------------------------------------------------------- 1 | Partial Token Sort 2 | ------------------------------------------------------------ 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.partial_token_sort 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/QgramTokenizer.rst: -------------------------------------------------------------------------------- 1 | Qgram Tokenizer 2 | -------------------------------------------------- 3 | 4 | .. automodule:: py_stringmatching.tokenizer.qgram_tokenizer 5 | :members: 6 | :inherited-members: 7 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 8 | 9 | -------------------------------------------------------------------------------- /docs/Ratio.rst: -------------------------------------------------------------------------------- 1 | Ratio 2 | ------------------------------------------------------------ 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.ratio 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/SimilarityMeasure.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | Similarity Measures 3 | =================== 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | Affine 9 | BagDistance 10 | Cosine 11 | Dice 12 | Editex 13 | GeneralizedJaccard 14 | HammingDistance 15 | Jaccard 16 | Jaro 17 | JaroWinkler 18 | Levenshtein 19 | MongeElkan 20 | NeedlemanWunsch 21 | OverlapCoefficient 22 | PartialRatio 23 | PartialTokenSort 24 | Ratio 25 | SmithWaterman 26 | SoftTfIdf 27 | Soundex 28 | TfIdf 29 | TokenSort 30 | TverskyIndex 31 | -------------------------------------------------------------------------------- /docs/SmithWaterman.rst: -------------------------------------------------------------------------------- 1 | Smith Waterman 2 | ---------------------------------------------------------- 3 | 4 | .. autoclass:: py_stringmatching.similarity_measure.smith_waterman.SmithWaterman(gap_cost=1.0, sim_func=identity_function) 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/SoftTfIdf.rst: -------------------------------------------------------------------------------- 1 | Soft TF/IDF 2 | ------------------------------------------------------ 3 | 4 | .. autoclass:: py_stringmatching.similarity_measure.soft_tfidf.SoftTfIdf(corpus_list=None, sim_func=jaro_function, threshold=0.5) 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/Soundex.rst: -------------------------------------------------------------------------------- 1 | Soundex 2 | ------------------------------------------------------------ 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.soundex 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/TfIdf.rst: -------------------------------------------------------------------------------- 1 | TF/IDF 2 | ------------------------------------------------- 3 | 4 | .. 
automodule:: py_stringmatching.similarity_measure.tfidf
5 |     :members:
6 | 
--------------------------------------------------------------------------------
/docs/TokenSort.rst:
--------------------------------------------------------------------------------
1 | Token Sort
2 | ------------------------------------------------------------
3 | 
4 | .. automodule:: py_stringmatching.similarity_measure.token_sort
5 |     :members:
6 | 
--------------------------------------------------------------------------------
/docs/Tokenizer.rst:
--------------------------------------------------------------------------------
1 | ===================
2 | Tokenizers
3 | ===================
4 | 
5 | .. toctree::
6 |     :maxdepth: 2
7 | 
8 |     AlphabeticTokenizer
9 |     AlphanumericTokenizer
10 |     DelimiterTokenizer
11 |     QgramTokenizer
12 |     WhitespaceTokenizer
13 | 
14 | 
--------------------------------------------------------------------------------
/docs/TverskyIndex.rst:
--------------------------------------------------------------------------------
1 | Tversky Index
2 | ------------------------------------------------------------
3 | 
4 | .. automodule:: py_stringmatching.similarity_measure.tversky_index
5 |     :members:
6 | 
--------------------------------------------------------------------------------
/docs/WhatIsNew.rst:
--------------------------------------------------------------------------------
1 | What is New?
2 | ============
3 | 
4 | Compared to Version 0.4.2, the following items are new:
5 | 
6 | * Dropped support for Python 3.5, added support for Python 3.9.
--------------------------------------------------------------------------------
/docs/WhitespaceTokenizer.rst:
--------------------------------------------------------------------------------
1 | Whitespace Tokenizer
2 | -------------------------------------------------------
3 | 
4 | .. automodule:: py_stringmatching.tokenizer.whitespace_tokenizer
5 |     :members:
6 |     :inherited-members:
7 |     :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__
8 | 
9 | 
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | User Manual for py_stringmatching
2 | =================================
3 | 
4 | This document explains how to install, use, and contribute to the package.
5 | 
6 | Contents
7 | ========
8 | 
9 | .. toctree::
10 |     :maxdepth: 2
11 | 
12 |     WhatIsNew
13 |     Installation
14 |     Tutorial
15 |     Tokenizer
16 |     SimilarityMeasure
17 |     Benchmark
18 |     Contributing
19 | 
20 | Indices and tables
21 | ==================
22 | 
23 | * :ref:`genindex`
24 | * :ref:`modindex`
25 | * :ref:`search`
26 | 
27 | 
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | REM Command file for Sphinx documentation
4 | 
5 | if "%SPHINXBUILD%" == "" (
6 | 	set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=_build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" (
12 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
13 | 	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
14 | )
15 | 
16 | if "%1" == "" goto help
17 | 
18 | if "%1" == "help" (
19 | 	:help
20 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
21 | 	echo.
html       to make standalone HTML files
22 | 	echo.  dirhtml    to make HTML files named index.html in directories
23 | 	echo.  singlehtml to make a single large HTML file
24 | 	echo.  pickle     to make pickle files
25 | 	echo.  json       to make JSON files
26 | 	echo.  htmlhelp   to make HTML files and an HTML help project
27 | 	echo.  qthelp     to make HTML files and a qthelp project
28 | 	echo.  devhelp    to make HTML files and a Devhelp project
29 | 	echo.  epub       to make an epub
30 | 	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
31 | 	echo.  text       to make text files
32 | 	echo.  man        to make manual pages
33 | 	echo.  texinfo    to make Texinfo files
34 | 	echo.  gettext    to make PO message catalogs
35 | 	echo.  changes    to make an overview of all changed/added/deprecated items
36 | 	echo.  xml        to make Docutils-native XML files
37 | 	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
38 | 	echo.  linkcheck  to check all external links for integrity
39 | 	echo.  doctest    to run all doctests embedded in the documentation if enabled
40 | 	echo.  coverage   to run coverage check of the documentation if enabled
41 | 	goto end
42 | )
43 | 
44 | if "%1" == "clean" (
45 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
46 | 	del /q /s %BUILDDIR%\*
47 | 	goto end
48 | )
49 | 
50 | 
51 | REM Check if sphinx-build is available and fallback to Python version if any
52 | %SPHINXBUILD% 2> nul
53 | if errorlevel 9009 goto sphinx_python
54 | goto sphinx_ok
55 | 
56 | :sphinx_python
57 | 
58 | set SPHINXBUILD=python -m sphinx.__init__
59 | %SPHINXBUILD% 2> nul
60 | if errorlevel 9009 (
61 | 	echo.
62 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
63 | 	echo.installed, then set the SPHINXBUILD environment variable to point
64 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
65 | 	echo.may add the Sphinx directory to PATH.
66 | 	echo.
67 | 	echo.If you don't have Sphinx installed, grab it from
68 | 	echo.http://sphinx-doc.org/
69 | 	exit /b 1
70 | )
71 | 
72 | :sphinx_ok
73 | 
74 | 
75 | if "%1" == "html" (
76 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
77 | 	if errorlevel 1 exit /b 1
78 | 	echo.
79 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
80 | 	goto end
81 | )
82 | 
83 | if "%1" == "dirhtml" (
84 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
85 | 	if errorlevel 1 exit /b 1
86 | 	echo.
87 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
88 | 	goto end
89 | )
90 | 
91 | if "%1" == "singlehtml" (
92 | 	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
93 | 	if errorlevel 1 exit /b 1
94 | 	echo.
95 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
96 | 	goto end
97 | )
98 | 
99 | if "%1" == "pickle" (
100 | 	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
101 | 	if errorlevel 1 exit /b 1
102 | 	echo.
103 | 	echo.Build finished; now you can process the pickle files.
104 | 	goto end
105 | )
106 | 
107 | if "%1" == "json" (
108 | 	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
109 | 	if errorlevel 1 exit /b 1
110 | 	echo.
111 | 	echo.Build finished; now you can process the JSON files.
112 | 	goto end
113 | )
114 | 
115 | if "%1" == "htmlhelp" (
116 | 	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
117 | 	if errorlevel 1 exit /b 1
118 | 	echo.
119 | 	echo.Build finished; now you can run HTML Help Workshop with the ^
120 | .hhp project file in %BUILDDIR%/htmlhelp.
121 | 	goto end
122 | )
123 | 
124 | if "%1" == "qthelp" (
125 | 	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
126 | 	if errorlevel 1 exit /b 1
127 | 	echo.
128 | 	echo.Build finished; now you can run "qcollectiongenerator" with the ^
129 | .qhcp project file in %BUILDDIR%/qthelp, like this:
130 | 	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\py_stringmatching.qhcp
131 | 	echo.To view the help file:
132 | 	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\py_stringmatching.qhc
133 | 	goto end
134 | )
135 | 
136 | if "%1" == "devhelp" (
137 | 	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
138 | 	if errorlevel 1 exit /b 1
139 | 	echo.
140 | 	echo.Build finished.
141 | 	goto end
142 | )
143 | 
144 | if "%1" == "epub" (
145 | 	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
146 | 	if errorlevel 1 exit /b 1
147 | 	echo.
148 | 	echo.Build finished. The epub file is in %BUILDDIR%/epub.
149 | 	goto end
150 | )
151 | 
152 | if "%1" == "latex" (
153 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
154 | 	if errorlevel 1 exit /b 1
155 | 	echo.
156 | 	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
157 | 	goto end
158 | )
159 | 
160 | if "%1" == "latexpdf" (
161 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
162 | 	cd %BUILDDIR%/latex
163 | 	make all-pdf
164 | 	cd %~dp0
165 | 	echo.
166 | 	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
167 | 	goto end
168 | )
169 | 
170 | if "%1" == "latexpdfja" (
171 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
172 | 	cd %BUILDDIR%/latex
173 | 	make all-pdf-ja
174 | 	cd %~dp0
175 | 	echo.
176 | 	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
177 | 	goto end
178 | )
179 | 
180 | if "%1" == "text" (
181 | 	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
182 | 	if errorlevel 1 exit /b 1
183 | 	echo.
184 | 	echo.Build finished. The text files are in %BUILDDIR%/text.
185 | 	goto end
186 | )
187 | 
188 | if "%1" == "man" (
189 | 	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
190 | 	if errorlevel 1 exit /b 1
191 | 	echo.
192 | 	echo.Build finished. The manual pages are in %BUILDDIR%/man.
193 | 	goto end
194 | )
195 | 
196 | if "%1" == "texinfo" (
197 | 	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
198 | 	if errorlevel 1 exit /b 1
199 | 	echo.
200 | 	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
201 | 	goto end
202 | )
203 | 
204 | if "%1" == "gettext" (
205 | 	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
206 | 	if errorlevel 1 exit /b 1
207 | 	echo.
208 | 	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
209 | 	goto end
210 | )
211 | 
212 | if "%1" == "changes" (
213 | 	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
214 | 	if errorlevel 1 exit /b 1
215 | 	echo.
216 | 	echo.The overview file is in %BUILDDIR%/changes.
217 | 	goto end
218 | )
219 | 
220 | if "%1" == "linkcheck" (
221 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
222 | 	if errorlevel 1 exit /b 1
223 | 	echo.
224 | 	echo.Link check complete; look for any errors in the above output ^
225 | or in %BUILDDIR%/linkcheck/output.txt.
226 | 	goto end
227 | )
228 | 
229 | if "%1" == "doctest" (
230 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
231 | 	if errorlevel 1 exit /b 1
232 | 	echo.
233 | 	echo.Testing of doctests in the sources finished, look at the ^
234 | results in %BUILDDIR%/doctest/output.txt.
235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /py_stringmatching/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.4.6" 2 | 3 | # Import tokenizers 4 | from py_stringmatching.tokenizer.alphabetic_tokenizer import AlphabeticTokenizer 5 | from py_stringmatching.tokenizer.alphanumeric_tokenizer import AlphanumericTokenizer 6 | from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer 7 | from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer 8 | from py_stringmatching.tokenizer.whitespace_tokenizer import WhitespaceTokenizer 9 | 10 | # Import similarity measures 11 | from py_stringmatching.similarity_measure.affine import Affine 12 | from py_stringmatching.similarity_measure.bag_distance import BagDistance 13 | from py_stringmatching.similarity_measure.cosine import Cosine 14 | from py_stringmatching.similarity_measure.dice import Dice 15 | from py_stringmatching.similarity_measure.editex import Editex 16 | from py_stringmatching.similarity_measure.generalized_jaccard import GeneralizedJaccard 17 | from py_stringmatching.similarity_measure.hamming_distance import HammingDistance 18 | from py_stringmatching.similarity_measure.jaccard import Jaccard 19 | from py_stringmatching.similarity_measure.jaro import Jaro 20 | from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler 21 | from py_stringmatching.similarity_measure.levenshtein import Levenshtein 22 | from py_stringmatching.similarity_measure.monge_elkan import MongeElkan 23 | from py_stringmatching.similarity_measure.needleman_wunsch import NeedlemanWunsch 24 | from py_stringmatching.similarity_measure.overlap_coefficient import OverlapCoefficient 25 | from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman 26 | from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf 27 | from py_stringmatching.similarity_measure.soundex import Soundex 28 | from py_stringmatching.similarity_measure.tfidf import TfIdf 29 | from py_stringmatching.similarity_measure.tversky_index import TverskyIndex 30 | from py_stringmatching.similarity_measure.partial_ratio import PartialRatio 31 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_stringmatching/658860f0cad65e1e3da07e7039fe9764d4822272/py_stringmatching/similarity_measure/__init__.py -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/affine.py: 
--------------------------------------------------------------------------------
1 | 
2 | from py_stringmatching import utils
3 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \
4 |     SequenceSimilarityMeasure
5 | from py_stringmatching.similarity_measure.cython.cython_affine import affine
6 | from py_stringmatching.similarity_measure.cython.cython_utils import cython_sim_ident
7 | 
8 | class Affine(SequenceSimilarityMeasure):
9 |     """Returns the affine gap score between two strings.
10 | 
11 |     The affine gap measure is an extension of the Needleman-Wunsch measure that handles longer gaps more
12 |     gracefully. For more information refer to the string matching chapter in the DI book ("Principles of Data Integration").
13 | 
14 |     Args:
15 |         gap_start (float): Cost for the gap at the start (defaults to 1).
16 |         gap_continuation (float): Cost for the gap continuation (defaults to 0.5).
17 |         sim_func (function): Function computing similarity score between two characters, which are represented as strings (defaults
18 |                              to an identity function, which returns 1 if the two characters are the same and returns 0 otherwise).
19 | 
20 |     Attributes:
21 |         gap_start (float): An attribute to store the gap cost at the start.
22 |         gap_continuation (float): An attribute to store the gap continuation cost.
23 |         sim_func (function): An attribute to store the similarity function.
24 |     """
25 | 
26 |     def __init__(self, gap_start=1, gap_continuation=0.5, sim_func=cython_sim_ident):
27 |         self.gap_start = gap_start
28 |         self.gap_continuation = gap_continuation
29 |         self.sim_func = sim_func
30 |         super(Affine, self).__init__()
31 | 
32 |     def get_raw_score(self, string1, string2):
33 |         """Computes the affine gap score between two strings. This score can be outside the range [0,1].
34 | 
35 |         Args:
36 |             string1,string2 (str) : Input strings.
37 | 
38 |         Returns:
39 |             Affine gap score between the two input strings (float).
40 | 
41 |         Raises:
42 |             TypeError : If the inputs are not strings or if one of the inputs is None.
43 | 
44 |         Examples:
45 |             >>> aff = Affine()
46 |             >>> aff.get_raw_score('dva', 'deeva')
47 |             1.5
48 |             >>> aff = Affine(gap_start=2, gap_continuation=0.5)
49 |             >>> aff.get_raw_score('dva', 'deeve')
50 |             -0.5
51 |             >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
52 |             >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
53 |             4.4
54 |         """
55 |         # input validations
56 |         utils.sim_check_for_none(string1, string2)
57 | 
58 |         # convert input to unicode.
59 |         string1 = utils.convert_to_unicode(string1)
60 |         string2 = utils.convert_to_unicode(string2)
61 | 
62 |         utils.tok_check_for_string_input(string1, string2)
63 | 
64 |         # if one of the strings is empty return 0
65 |         if utils.sim_check_for_empty(string1, string2):
66 |             return 0
67 | 
68 |         return affine(string1, string2, self.gap_start, self.gap_continuation, self.sim_func)
69 | 
70 |     def get_gap_start(self):
71 |         """Get gap start cost.
72 | 
73 |         Returns:
74 |             gap start cost (float).
75 |         """
76 |         return self.gap_start
77 | 
78 |     def get_gap_continuation(self):
79 |         """Get gap continuation cost.
80 | 
81 |         Returns:
82 |             gap continuation cost (float).
83 |         """
84 |         return self.gap_continuation
85 | 
86 |     def get_sim_func(self):
87 |         """Get similarity function.
88 | 
89 |         Returns:
90 |             similarity function (function).
91 |         """
92 |         return self.sim_func
93 | 
94 |     def set_gap_start(self, gap_start):
95 |         """Set gap start cost.
96 | 
97 |         Args:
98 |             gap_start (float): Cost for the gap at the start.
99 |         """
100 |         self.gap_start = gap_start
101 |         return True
102 | 
103 |     def set_gap_continuation(self, gap_continuation):
104 |         """Set gap continuation cost.
105 | 
106 |         Args:
107 |             gap_continuation (float): Cost for the gap continuation.
108 |         """
109 |         self.gap_continuation = gap_continuation
110 |         return True
111 | 
112 |     def set_sim_func(self, sim_func):
113 |         """Set similarity function.
114 | 
115 |         Args:
116 |             sim_func (function): Function computing similarity score between two characters, represented as strings.
117 |         """
118 |         self.sim_func = sim_func
119 |         return True
120 | 
--------------------------------------------------------------------------------
/py_stringmatching/similarity_measure/bag_distance.py:
--------------------------------------------------------------------------------
1 | """Bag distance measure"""
2 | 
3 | import collections
4 | 
5 | from py_stringmatching import utils
6 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \
7 |     SequenceSimilarityMeasure
8 | 
9 | 
10 | class BagDistance(SequenceSimilarityMeasure):
11 |     """Bag distance measure class.
12 |     """
13 |     def __init__(self):
14 |         super(BagDistance, self).__init__()
15 | 
16 |     def get_raw_score(self, string1, string2):
17 |         """
18 |         Computes the bag distance between two strings.
19 | 
20 |         For two strings X and Y, the bag distance is:
21 |         :math:`max( |bag(X)-bag(Y)|, |bag(Y)-bag(X)| )`
22 | 
23 |         Args:
24 |             string1,string2 (str): Input strings
25 | 
26 |         Returns:
27 |             Bag distance (int)
28 | 
29 |         Raises:
30 |             TypeError : If the inputs are not strings
31 | 
32 |         Examples:
33 |             >>> bd = BagDistance()
34 |             >>> bd.get_raw_score('cat', 'hat')
35 |             1
36 |             >>> bd.get_raw_score('Niall', 'Neil')
37 |             2
38 |             >>> bd.get_raw_score('aluminum', 'Catalan')
39 |             5
40 |             >>> bd.get_raw_score('ATCG', 'TAGC')
41 |             0
42 |             >>> bd.get_raw_score('abcde', 'xyz')
43 |             5
44 | 
45 |         References:
46 |             * String Matching with Metric Trees Using an Approximate Distance: http://www-db.disi.unibo.it/research/papers/SPIRE02.pdf
47 |         """
48 |         # input validations
49 |         utils.sim_check_for_none(string1, string2)
50 |         utils.sim_check_for_string_inputs(string1, string2)
51 |         if utils.sim_check_for_exact_match(string1, string2):
52 |             return 0
53 | 
54 |         len_str1 = len(string1)
55 |         len_str2 = len(string2)
56 | 
57 |         if len_str1 == 0:
58 |             return len_str2
59 | 
60 |         if len_str2 == 0:
61 |             return len_str1
62 | 
63 |         bag1 = collections.Counter(string1)
64 |         bag2 = collections.Counter(string2)
65 | 
66 |         size1 = sum((bag1 - bag2).values())
67 |         size2 = sum((bag2 - bag1).values())
68 | 
69 |         # return the max of the two bag (multiset) differences
70 |         return max(size1, size2)
71 | 
72 |     def get_sim_score(self, string1, string2):
73 |         """
74 |         Computes the normalized bag similarity between two strings.
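        For two strings X and Y this is 1 - bag_distance(X, Y) / max(|X|, |Y|), so the score
        always lies in [0, 1]; two empty strings are defined to score 1.0.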
75 | 76 | Args: 77 | string1,string2 (str): Input strings 78 | 79 | Returns: 80 | Normalized bag similarity (float) 81 | 82 | Raises: 83 | TypeError : If the inputs are not strings 84 | 85 | Examples: 86 | >>> bd = BagDistance() 87 | >>> bd.get_sim_score('cat', 'hat') 88 | 0.6666666666666667 89 | >>> bd.get_sim_score('Niall', 'Neil') 90 | 0.6 91 | >>> bd.get_sim_score('aluminum', 'Catalan') 92 | 0.375 93 | >>> bd.get_sim_score('ATCG', 'TAGC') 94 | 1.0 95 | >>> bd.get_sim_score('abcde', 'xyz') 96 | 0.0 97 | 98 | References: 99 | * String Matching with Metric Trees Using an Approximate Distance: http://www-db.disi.unibo.it/research/papers/SPIRE02.pdf 100 | """ 101 | raw_score = self.get_raw_score(string1, string2) 102 | string1_len = len(string1) 103 | string2_len = len(string2) 104 | if string1_len == 0 and string2_len == 0: 105 | return 1.0 106 | return 1 - (raw_score / max(string1_len, string2_len)) 107 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/cosine.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from py_stringmatching import utils 4 | from py_stringmatching.similarity_measure.token_similarity_measure import \ 5 | TokenSimilarityMeasure 6 | 7 | 8 | class Cosine(TokenSimilarityMeasure): 9 | """Computes a variant of cosine measure known as Ochiai coefficient. 10 | 11 | This is not the cosine measure that computes the cosine of the angle between two given vectors. Rather, it computes a variant of cosine measure known as Ochiai coefficient (see the Wikipedia page "Cosine Similarity"). Specifically, for two sets X and Y, this measure computes: 12 | 13 | :math:`cosine(X, Y) = \\frac{|X \\cap Y|}{\\sqrt{|X| \\cdot |Y|}}` 14 | 15 | Note: 16 | * In the case where one of X and Y is an empty set and the other is a non-empty set, we define their cosine score to be 0. 17 | * In the case where both X and Y are empty sets, we define their cosine score to be 1. 18 | """ 19 | 20 | def __init__(self): 21 | super(Cosine, self).__init__() 22 | 23 | def get_raw_score(self, set1, set2): 24 | """Computes the raw cosine score between two sets. 25 | 26 | Args: 27 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 28 | 29 | Returns: 30 | Cosine similarity (float) 31 | 32 | Raises: 33 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 34 | 35 | Examples: 36 | >>> cos = Cosine() 37 | >>> cos.get_raw_score(['data', 'science'], ['data']) 38 | 0.7071067811865475 39 | >>> cos.get_raw_score(['data', 'data', 'science'], ['data', 'management']) 40 | 0.4999999999999999 41 | >>> cos.get_raw_score([], ['data']) 42 | 0.0 43 | 44 | References: 45 | * String similarity joins: An Experimental Evaluation (a paper appearing in the VLDB 2014 Conference). 46 | * Project Flamingo at http://flamingo.ics.uci.edu. 
47 | """ 48 | # input validations 49 | utils.sim_check_for_none(set1, set2) 50 | utils.sim_check_for_list_or_set_inputs(set1, set2) 51 | 52 | # if exact match return 1.0 53 | if utils.sim_check_for_exact_match(set1, set2): 54 | return 1.0 55 | 56 | # if one of the strings is empty return 0 57 | if utils.sim_check_for_empty(set1, set2): 58 | return 0 59 | 60 | if not isinstance(set1, set): 61 | set1 = set(set1) 62 | if not isinstance(set2, set): 63 | set2 = set(set2) 64 | 65 | return float(len(set1 & set2)) / (math.sqrt(float(len(set1))) * 66 | math.sqrt(float(len(set2)))) 67 | 68 | def get_sim_score(self, set1, set2): 69 | """Computes the normalized cosine similarity between two sets. 70 | 71 | Args: 72 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 73 | 74 | Returns: 75 | Normalized cosine similarity (float) 76 | 77 | Raises: 78 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 79 | 80 | Examples: 81 | >>> cos = Cosine() 82 | >>> cos.get_sim_score(['data', 'science'], ['data']) 83 | 0.7071067811865475 84 | >>> cos.get_sim_score(['data', 'data', 'science'], ['data', 'management']) 85 | 0.4999999999999999 86 | >>> cos.get_sim_score([], ['data']) 87 | 0.0 88 | 89 | """ 90 | return self.get_raw_score(set1, set2) 91 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/cython/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_stringmatching/658860f0cad65e1e3da07e7039fe9764d4822272/py_stringmatching/similarity_measure/cython/__init__.py -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/cython/cython_affine.pyx: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from py_stringmatching.similarity_measure.cython.cython_utils import float_max_two 4 | from py_stringmatching.similarity_measure.cython.cython_utils import float_max_three 5 | 6 | 7 | 8 | def affine(unicode string1, unicode string2, float main_gap_start, float main_gap_continuation, sim_func ): 9 | 10 | cdef float gap_start = - main_gap_start 11 | cdef float gap_continuation = - main_gap_continuation 12 | cdef int len_str1 = len(string1) 13 | cdef int len_str2 = len(string2) 14 | cdef int i=0, j=0 15 | cdef double[:, :] m = np.zeros((len_str1 + 1, len_str2 + 1), dtype=np.double) 16 | cdef double[:, :] x = np.zeros((len_str1 + 1, len_str2 + 1), dtype=np.double) 17 | cdef double[:, :] y = np.zeros((len_str1 + 1, len_str2 + 1), dtype=np.double) 18 | 19 | # DP initialization 20 | for i from 1 <= i < (len_str1+1): 21 | m[i, 0] = -float(np.inf) 22 | x[i, 0] = gap_start + (i-1) * gap_continuation 23 | y[i, 0] = -float(np.inf) 24 | # 25 | # # DP initialization 26 | for j from 1 <= j < (len_str2+1): 27 | m[0, j] = -float(np.inf) 28 | x[0, j] = -float(np.inf) 29 | y[0, j] = gap_start + (j-1) * gap_continuation 30 | 31 | 32 | # affine gap calculation using DP 33 | for i from 1 <= i < (len_str1 + 1): 34 | for j from 1 <= j < (len_str2 + 1): 35 | # best score between x_1....x_i and y_1....y_j 36 | # given that x_i is aligned to y_j 37 | m[i, j] = (sim_func(string1[i-1], string2[j-1]) + float_max_three(m[i-1][j-1], 38 | x[i-1][j-1], y[i-1][j-1])) 39 | # the best score given that x_i is aligned to a gap 40 | x[i, j] = float_max_two((gap_start + m[i-1, j]), (gap_continuation+ x[i-1, j])) 41 | # the 
best score given that y_j is aligned to a gap 42 | y[i, j] = float_max_two((gap_start+ m[i, j-1]), (gap_continuation + y[i, j-1])) 43 | 44 | return float_max_three(m[len_str1, len_str2], x[len_str1, len_str2], y[len_str1, len_str2]) 45 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/cython/cython_jaro.pyx: -------------------------------------------------------------------------------- 1 | 2 | from py_stringmatching.similarity_measure.cython.cython_utils import int_max_two 3 | import numpy as np 4 | cimport numpy as np 5 | 6 | 7 | #Cython functions to compute the Jaro score 8 | def jaro(unicode string1, unicode string2): 9 | """Computes the Jaro score between two strings. 10 | Args: 11 | string1,string2 (str): Input strings. 12 | Returns: 13 | Jaro distance score (float). 14 | """ 15 | 16 | 17 | cdef int len_str1 = len(string1), len_str2 = len(string2) 18 | cdef int max_len = int_max_two(len_str1, len_str2) 19 | cdef int search_range = (max_len // 2) - 1 20 | 21 | if search_range < 0: 22 | search_range = 0 23 | 24 | # populating numpy arrays of length as each string with zeros 25 | cdef int[:] flags_s1 = np.zeros(len_str1, dtype=np.int32) 26 | cdef int[:] flags_s2 = np.zeros(len_str2, dtype=np.int32) 27 | 28 | cdef int common_chars = 0, low = 0, high = 0, i = 0, j = 0 29 | 30 | # Finding the number of common characters in two strings 31 | for i from 0 <= i < len_str1: 32 | low = i - search_range if i > search_range else 0 33 | high = i + search_range if i + search_range < len_str2 else len_str2 - 1 34 | for j from low <= j < (high + 1): 35 | if flags_s2[j] == 0 and string2[j] == string1[i]: 36 | flags_s1[i] = flags_s2[j] = 1 37 | common_chars += 1 38 | break 39 | 40 | if common_chars == 0: 41 | return 0 42 | 43 | cdef int trans_count = 0, k = 0 44 | 45 | # Finding the number of transpositions and Jaro distance 46 | for i from 0 <= i < len_str1: 47 | if flags_s1[i] == 1: 48 | for j from k <= j < len_str2: 49 | if flags_s2[j] == 1: 50 | k = j + 1 51 | break 52 | if string1[i] != string2[j]: 53 | trans_count += 1 54 | trans_count /= 2 55 | cdef float score = (float(common_chars) / len_str1 + float(common_chars) / len_str2 + 56 | (float(common_chars) - trans_count) / float(common_chars)) / 3 57 | return score 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/cython/cython_jaro_winkler.pyx: -------------------------------------------------------------------------------- 1 | 2 | from py_stringmatching.similarity_measure.cython.cython_utils import int_min_two 3 | from py_stringmatching.similarity_measure.cython.cython_jaro import jaro 4 | 5 | 6 | def jaro_winkler(unicode string1, unicode string2, float prefix_weight): 7 | """Function to find the Jaro Winkler distance between two strings. 8 | Args: 9 | string1,string2 (unicode), prefix_weight (float): Input strings and prefix weight. 
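    The returned score is jaro(string1, string2) + l * prefix_weight * (1 - jaro(string1, string2)),
    where l is the length of the common prefix of the two strings, capped at 4.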
10 | Returns: 11 | Jaro Winkler distance score (float) 12 | """ 13 | cdef int i = 0 14 | cdef float jw_score = jaro(string1, string2) 15 | cdef int min_len = int_min_two(len(string1), len(string2)) 16 | cdef int j = int_min_two(min_len, 4) 17 | 18 | #Finding the Jaro Winkler distance between two strings 19 | while i < j and string1[i] == string2[i]: 20 | i += 1 21 | if i != 0: 22 | jw_score += i * prefix_weight * (1 - jw_score) 23 | 24 | return jw_score 25 | 26 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/cython/cython_levenshtein.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False 2 | 3 | import cython 4 | import numpy as np 5 | cimport numpy as cnp 6 | from py_stringmatching.similarity_measure.cython.cython_utils import int_min_three 7 | from numpy import int32 8 | from numpy cimport int32_t 9 | 10 | DTYPE = int 11 | ctypedef cnp.int_t DTYPE_t 12 | 13 | @cython.boundscheck(False) 14 | @cython.wraparound(False) 15 | 16 | 17 | 18 | def levenshtein(unicode string1, unicode string2): 19 | 20 | cdef int len_str1 = len(string1) 21 | cdef int len_str2 = len(string2) 22 | 23 | cdef int ins_cost = 1 24 | cdef int del_cost = 1 25 | cdef int sub_cost = 1 26 | cdef int trans_cost = 1 27 | 28 | cdef int i = 0 29 | cdef int j = 0 30 | 31 | if len_str1 == 0: 32 | return len_str2 * ins_cost 33 | 34 | if len_str2 == 0: 35 | return len_str1 * del_cost 36 | 37 | cdef int[:,:] d_mat = np.zeros((len_str1 + 1, len_str2 + 1), dtype=np.int32) 38 | 39 | for i from 0 <= i < (len_str1 + 1): 40 | d_mat[i, 0] = i * del_cost 41 | 42 | for j from 0 <= j < (len_str2 + 1): 43 | d_mat[0, j] = j * ins_cost 44 | 45 | cdef unsigned char lchar = 0 46 | cdef unsigned char rchar = 0 47 | 48 | for i from 0 <= i < (len_str1): 49 | lchar = string1[i] 50 | for j from 0 <= j < (len_str2): 51 | rchar = string2[j] 52 | 53 | d_mat[i+1,j+1] = int_min_three(d_mat[i + 1, j] + ins_cost, d_mat[i, j + 1] + del_cost, d_mat[i, j] 54 | + (sub_cost if lchar != rchar else 0)) 55 | return d_mat[len_str1, len_str2] 56 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/cython/cython_needleman_wunsch.pyx: -------------------------------------------------------------------------------- 1 | import cython 2 | import numpy as np 3 | cimport numpy as np 4 | 5 | @cython.boundscheck(False) 6 | @cython.wraparound(False) 7 | 8 | def needleman_wunsch(unicode string1, unicode string2, float gap_cost, 9 | sim_score): 10 | """ Computes Needleman-Wunsch measure raw score. 
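    The score is computed by dynamic programming over a (len(string1) + 1) x (len(string2) + 1) matrix, where
    dist_mat[i, j] = max(dist_mat[i-1, j-1] + sim_score(string1[i-1], string2[j-1]),
                         dist_mat[i-1, j] - gap_cost,
                         dist_mat[i, j-1] - gap_cost)
    and the first row and column are initialized to -(j * gap_cost) and -(i * gap_cost) respectively.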
11 |     Args:
12 |         string1, string2 (unicode): Input unicode strings
13 |         gap_cost (float): Cost of gap
14 |         sim_score (sim function): Similarity function given by the user; if none is given, the default identity function is used
15 |     Returns:
16 |         Needleman-Wunsch similarity score (float)
17 |     """
18 | 
19 |     cdef int i = 0, j = 0
20 |     cdef double match = 0.0, delete = 0.0, insert = 0.0
21 |     cdef double sim_func_score = 0.0
22 |     cdef int len_s1 = len(string1), len_s2 = len(string2)
23 |     cdef double[:,:] dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=float)
24 | 
25 |     # DP initialization of the first column
26 |     for i from 0 <= i < (len_s1 + 1):
27 |         dist_mat[i, 0] = -(i * gap_cost)
28 | 
29 |     # DP initialization of the first row
30 |     for j from 0 <= j < (len_s2 + 1):
31 |         dist_mat[0, j] = -(j * gap_cost)
32 | 
33 | 
34 |     # Needleman-Wunsch DP calculation
35 |     for i from 1 <= i < (len_s1 + 1):
36 |         for j from 1 <= j < (len_s2 + 1):
37 |             sim_func_score = sim_score(string1[i - 1], string2[j - 1])
38 |             match = dist_mat[i - 1, j - 1] + sim_func_score
39 |             delete = dist_mat[i - 1, j] - gap_cost
40 |             insert = dist_mat[i, j - 1] - gap_cost
41 |             dist_mat[i, j] = max(match, delete, insert)
42 | 
43 |     return dist_mat[len_s1, len_s2]
--------------------------------------------------------------------------------
/py_stringmatching/similarity_measure/cython/cython_smith_waterman.pyx:
--------------------------------------------------------------------------------
1 | import cython
2 | import numpy as np
3 | cimport numpy as np
4 | 
5 | @cython.boundscheck(False)
6 | @cython.wraparound(False)
7 | 
8 | 
9 | def smith_waterman(unicode string1, unicode string2, float gap_cost, \
10 |                    sim_func):
11 | 
12 |     cdef int i = 0, j = 0
13 |     cdef double match = 0.0, delete = 0.0, insert = 0.0
14 |     cdef double sim_score = 0.0, max_value = 0.0
15 |     cdef int len_s1 = len(string1), len_s2 = len(string2)
16 |     cdef double[:,:] dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=float)
17 | 
18 | 
19 |     # Smith Waterman DP calculations
20 |     for i from 1 <= i < (len_s1 + 1):
21 |         for j from 1 <= j < (len_s2 + 1):
22 | 
23 |             sim_func_score = sim_func(string1[i - 1], string2[j - 1])
24 |             match = dist_mat[i - 1, j - 1] + sim_func_score
25 |             delete = dist_mat[i - 1, j] - gap_cost
26 |             insert = dist_mat[i, j - 1] - gap_cost
27 |             dist_mat[i, j] = max(0, match, delete, insert)
28 |             max_value = max(max_value, dist_mat[i, j])
29 | 
30 |     return max_value
--------------------------------------------------------------------------------
/py_stringmatching/similarity_measure/cython/cython_utils.pyx:
--------------------------------------------------------------------------------
1 | import cython
2 | 
3 | 
4 | def cython_sim_ident(unicode char1, unicode char2):
5 |     return 1 if char1 == char2 else 0
6 | 
7 | 
8 | def int_max_two(int a, int b):
9 |     """Finds the maximum integer of the given two integers.
10 |     Args:
11 |         integer1, integer2 (int): Input integers.
12 |     Returns:
13 |         Maximum integer (int).
14 |     """
15 |     if a > b : return a
16 |     else: return b
17 | 
18 | 
19 | def int_max_three(int a, int b, int c):
20 |     """Finds the maximum integer of the given three integers.
21 |     Args:
22 |         integer1, integer2, integer3 (int): Input integers.
23 |     Returns:
24 |         Maximum integer (int).
25 |     """
26 |     cdef int max_int = a
27 |     if b > max_int:
28 |         max_int = b
29 |     if c > max_int:
30 |         max_int = c
31 |     return max_int
32 | 
33 | 
34 | def float_max_two(float a, float b):
35 |     """Finds the maximum float of the given two floats.
36 |     Args:
37 |         float1, float2 (float): Input floats.
38 |     Returns:
39 |         Maximum float (float).
40 |     """
41 |     if a > b : return a
42 |     else: return b
43 | 
44 | 
45 | def float_max_three(float a, float b, float c):
46 |     """Finds the maximum float of the given three floats.
47 |     Args:
48 |         float1, float2, float3 (float): Input floats.
49 |     Returns:
50 |         Maximum float (float).
51 |     """
52 |     cdef float max_float = a
53 |     if b > max_float:
54 |         max_float = b
55 |     if c > max_float:
56 |         max_float = c
57 |     return max_float
58 | 
59 | 
60 | def int_min_two(int a, int b):
61 |     """Finds the minimum integer of the given two integers.
62 |     Args:
63 |         integer a, integer b (int): Input integers.
64 |     Returns:
65 |         Minimum integer (int).
66 |     """
67 |     if a > b : return b
68 |     else: return a
69 | 
70 | 
71 | def int_min_three(int a, int b, int c):
72 |     """Finds the minimum integer of the given three integers.
73 |     Args:
74 |         integer a, integer b, integer c (int): Input integers.
75 |     Returns:
76 |         Minimum integer (int).
77 |     """
78 |     cdef int min_int = a
79 |     if b < min_int:
80 |         min_int = b
81 |     if c < min_int:
82 |         min_int = c
83 |     return min_int
84 | 
--------------------------------------------------------------------------------
/py_stringmatching/similarity_measure/dice.py:
--------------------------------------------------------------------------------
1 | from py_stringmatching import utils
2 | from py_stringmatching.similarity_measure.token_similarity_measure import \
3 |     TokenSimilarityMeasure
4 | 
5 | 
6 | class Dice(TokenSimilarityMeasure):
7 |     """Returns the Dice score between two strings.
8 | 
9 |     The Dice similarity score is defined as twice the shared information (intersection) divided by the sum of cardinalities.
10 |     For two sets X and Y, the Dice similarity score is:
11 | 
12 |     :math:`dice(X, Y) = \\frac{2 * |X \\cap Y|}{|X| + |Y|}`
13 | 
14 |     Note:
15 |         In the case where both X and Y are empty sets, we define their Dice score to be 1.
16 |     """
17 | 
18 |     def __init__(self):
19 |         super(Dice, self).__init__()
20 | 
21 |     def get_raw_score(self, set1, set2):
22 |         """Computes the raw Dice score between two sets. This score is already in [0,1].
23 | 
24 |         Args:
25 |             set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.
26 | 
27 |         Returns:
28 |             Dice similarity score (float).
29 | 
30 |         Raises:
31 |             TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
32 | 
33 |         Examples:
34 |             >>> dice = Dice()
35 |             >>> dice.get_raw_score(['data', 'science'], ['data'])
36 |             0.6666666666666666
37 |             >>> dice.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
38 |             0.5454545454545454
39 |             >>> dice.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
40 |             0.5
41 | 
42 |         References:
43 |             * Wikipedia article : https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient
44 |             * SimMetrics library.
45 |         """
46 | 
47 |         # input validations
48 |         utils.sim_check_for_none(set1, set2)
49 |         utils.sim_check_for_list_or_set_inputs(set1, set2)
50 | 
51 |         # if exact match return 1.0
52 |         if utils.sim_check_for_exact_match(set1, set2):
53 |             return 1.0
54 | 
55 |         # if one of the strings is empty return 0
56 |         if utils.sim_check_for_empty(set1, set2):
57 |             return 0
58 | 
59 |         if not isinstance(set1, set):
60 |             set1 = set(set1)
61 |         if not isinstance(set2, set):
62 |             set2 = set(set2)
63 | 
64 |         return 2.0 * float(len(set1 & set2)) / float(len(set1) + len(set2))
65 | 
66 |     def get_sim_score(self, set1, set2):
67 |         """Computes the normalized dice similarity score between two sets. Simply call get_raw_score.
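        The raw Dice score already lies in [0, 1], so it is returned unchanged.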
68 | 
69 |         Args:
70 |             set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.
71 | 
72 |         Returns:
73 |             Normalized dice similarity (float).
74 | 
75 |         Raises:
76 |             TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
77 | 
78 |         Examples:
79 |             >>> dice = Dice()
80 |             >>> dice.get_sim_score(['data', 'science'], ['data'])
81 |             0.6666666666666666
82 |             >>> dice.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
83 |             0.5454545454545454
84 |             >>> dice.get_sim_score(['data', 'management'], ['data', 'data', 'science'])
85 |             0.5
86 | 
87 |         """
88 |         return self.get_raw_score(set1, set2)
89 | 
--------------------------------------------------------------------------------
/py_stringmatching/similarity_measure/generalized_jaccard.py:
--------------------------------------------------------------------------------
1 | """Generalized jaccard similarity measure"""
2 | 
3 | from py_stringmatching import utils
4 | from py_stringmatching.similarity_measure.jaro import Jaro
5 | from py_stringmatching.similarity_measure.hybrid_similarity_measure import \
6 |     HybridSimilarityMeasure
7 | 
8 | 
9 | class GeneralizedJaccard(HybridSimilarityMeasure):
10 |     """Generalized jaccard similarity measure class.
11 | 
12 |     Parameters:
13 |         sim_func (function): similarity function. This should return a similarity score between two strings (optional),
14 |                              default is the Jaro similarity measure
15 |         threshold (float): Threshold value (defaults to 0.5). If the similarity of a token pair exceeds the threshold,
16 |                            then the token pair is considered a match.
17 |     """
18 |     def __init__(self, sim_func=Jaro().get_raw_score, threshold=0.5):
19 |         self.sim_func = sim_func
20 |         self.threshold = threshold
21 |         super(GeneralizedJaccard, self).__init__()
22 | 
23 |     def get_raw_score(self, set1, set2):
24 |         """
25 |         Computes the Generalized Jaccard measure between two sets.
26 | 
27 |         This similarity measure is a softened version of the Jaccard measure. The Jaccard measure is
28 |         a promising candidate for tokens which exactly match across the sets. However, in practice tokens
29 |         are often misspelled, such as energy vs. eneryg. The generalized Jaccard measure will enable
30 |         matching in such cases.
31 | 
32 |         Args:
33 |             set1,set2 (set or list): Input sets (or lists) of strings. Input lists are converted to sets.
34 | 
35 |         Returns:
36 |             Generalized Jaccard similarity (float)
37 | 
38 |         Raises:
39 |             TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
40 |             ValueError : If the similarity measure doesn't return values in the range [0,1]
41 | 
42 |         Examples:
43 |             >>> gj = GeneralizedJaccard()
44 |             >>> gj.get_raw_score(['data', 'science'], ['data'])
45 |             0.5
46 |             >>> gj.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
47 |             0.3333333333333333
48 |             >>> gj.get_raw_score(['Niall'], ['Neal', 'Njall'])
49 |             0.43333333333333335
50 |             >>> gj = GeneralizedJaccard(sim_func=JaroWinkler().get_raw_score, threshold=0.8)
51 |             >>> gj.get_raw_score(['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'],
52 |                                  ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
53 |             0.45810185185185187
54 |         """
55 |         # input validations
56 |         utils.sim_check_for_none(set1, set2)
57 |         utils.sim_check_for_list_or_set_inputs(set1, set2)
58 | 
59 |         # if exact match return 1.0
60 |         if utils.sim_check_for_exact_match(set1, set2):
61 |             return 1.0
62 | 
63 |         # if one of the strings is empty return 0
64 |         if utils.sim_check_for_empty(set1, set2):
65 |             return 0
66 | 
67 |         if not isinstance(set1, set):
68 |             set1 = set(set1)
69 |         if not isinstance(set2, set):
70 |             set2 = set(set2)
71 | 
72 |         set1_x = set()
73 |         set2_y = set()
74 |         match_score = 0.0
75 |         match_count = 0
76 |         list_matches = []
77 |         for element in set1:
78 |             for item in set2:
79 |                 score = self.sim_func(element, item)
80 |                 if score > 1 or score < 0:
81 |                     raise ValueError('Similarity measure should' + \
82 |                                      ' return value in the range [0,1]')
83 |                 if score > self.threshold:
84 |                     list_matches.append((element, item, score))
85 | 
86 |         # position of first string, second string and sim score in tuple
87 |         first_string_pos = 0
88 |         second_string_pos = 1
89 |         sim_score_pos = 2
90 | 
91 |         # sort all the matched pairs by their similarity score
92 |         list_matches.sort(key=lambda x: x[sim_score_pos], reverse=True)
93 | 
94 |         # select pairs in decreasing order of their similarity score,
95 |         # do not reselect the same element from either set.
96 |         for element in list_matches:
97 |             if (element[first_string_pos] not in set1_x and
98 |                 element[second_string_pos] not in set2_y):
99 |                 set1_x.add(element[first_string_pos])
100 |                 set2_y.add(element[second_string_pos])
101 |                 match_score += element[sim_score_pos]
102 |                 match_count += 1
103 | 
104 |         return float(match_score) / float(len(set1) + len(set2) - match_count)
105 | 
106 |     def get_sim_score(self, set1, set2):
107 |         """
108 |         Computes the normalized Generalized Jaccard similarity between two sets.
109 | 
110 |         Args:
111 |             set1,set2 (set or list): Input sets (or lists) of strings. Input lists are converted to sets.
112 | 
113 |         Returns:
114 |             Normalized Generalized Jaccard similarity (float)
115 | 
116 |         Raises:
117 |             TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
118 | ValueError : If the similarity measure doesn't return values in the range [0,1] 119 | 120 | Examples: 121 | >>> gj = GeneralizedJaccard() 122 | >>> gj.get_sim_score(['data', 'science'], ['data']) 123 | 0.5 124 | >>> gj.get_sim_score(['data', 'management'], ['data', 'data', 'science']) 125 | 0.3333333333333333 126 | >>> gj.get_sim_score(['Niall'], ['Neal', 'Njall']) 127 | 0.43333333333333335 128 | >>> gj = GeneralizedJaccard(sim_func=JaroWinkler().get_raw_score, threshold=0.8) 129 | >>> gj.get_sim_score(['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'], 130 | ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']) 131 | 0.45810185185185187 132 | 133 | """ 134 | return self.get_raw_score(set1, set2) 135 | 136 | def get_sim_func(self): 137 | """ 138 | Get similarity function 139 | 140 | Returns: 141 | similarity function (function) 142 | """ 143 | return self.sim_func 144 | 145 | def get_threshold(self): 146 | """ 147 | Get threshold used for the similarity function 148 | 149 | Returns: 150 | threshold (float) 151 | """ 152 | return self.threshold 153 | 154 | def set_sim_func(self, sim_func): 155 | """ 156 | Set similarity function 157 | 158 | Args: 159 | sim_func (function): similarity function 160 | """ 161 | self.sim_func = sim_func 162 | return True 163 | 164 | def set_threshold(self, threshold): 165 | """ 166 | Set threshold value for the similarity function 167 | 168 | Args: 169 | threshold (float): threshold value 170 | """ 171 | self.threshold = threshold 172 | return True 173 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/hamming_distance.py: -------------------------------------------------------------------------------- 1 | from py_stringmatching import utils 2 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \ 3 | SequenceSimilarityMeasure 4 | 5 | 6 | class HammingDistance(SequenceSimilarityMeasure): 7 | """Computes Hamming distance. 8 | 9 | The Hamming distance between two strings of equal length is the number of positions at which the corresponding 10 | symbols are different. Thus, it measures the minimum number of substitutions required to change 11 | one string into the other, or the minimum number of errors that could have transformed one string into the other. 12 | """ 13 | 14 | def __init__(self): 15 | super(HammingDistance, self).__init__() 16 | 17 | def get_raw_score(self, string1, string2): 18 | """Computes the raw hamming distance between two strings. 19 | 20 | Args: 21 | string1,string2 (str): Input strings. 22 | 23 | Returns: 24 | Hamming distance (int). 25 | 26 | Raises: 27 | TypeError : If the inputs are not strings or if one of the inputs is None. 28 | ValueError : If the input strings are not of same length. 29 | 30 | Examples: 31 | >>> hd = HammingDistance() 32 | >>> hd.get_raw_score('', '') 33 | 0 34 | >>> hd.get_raw_score('alex', 'john') 35 | 4 36 | >>> hd.get_raw_score(' ', 'a') 37 | 1 38 | >>> hd.get_raw_score('JOHN', 'john') 39 | 4 40 | """ 41 | 42 | # input validations 43 | utils.sim_check_for_none(string1, string2) 44 | 45 | # convert input to unicode. 
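        # (decoding first ensures the ord() comparison below operates on unicode code points)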
46 | string1 = utils.convert_to_unicode(string1) 47 | string2 = utils.convert_to_unicode(string2) 48 | 49 | utils.tok_check_for_string_input(string1, string2) 50 | 51 | # for Hamming Distance string length should be same 52 | utils.sim_check_for_same_len(string1, string2) 53 | 54 | # sum all the mismatch characters at the corresponding index of 55 | # input strings 56 | return sum(bool(ord(c1) - ord(c2)) for c1, c2 in zip(string1, string2)) 57 | 58 | def get_sim_score(self, string1, string2): 59 | """Computes the normalized Hamming similarity score between two strings. 60 | 61 | Args: 62 | string1,string2 (str): Input strings. 63 | 64 | Returns: 65 | Normalized Hamming similarity score (float). 66 | 67 | Raises: 68 | TypeError : If the inputs are not strings or if one of the inputs is None. 69 | ValueError : If the input strings are not of same length. 70 | 71 | Examples: 72 | >>> hd = HammingDistance() 73 | >>> hd.get_sim_score('', '') 74 | 1.0 75 | >>> hd.get_sim_score('alex', 'john') 76 | 0.0 77 | >>> hd.get_sim_score(' ', 'a') 78 | 0.0 79 | >>> hd.get_sim_score('JOHN', 'john') 80 | 0.0 81 | """ 82 | 83 | # convert input to unicode. 84 | string1 = utils.convert_to_unicode(string1) 85 | string2 = utils.convert_to_unicode(string2) 86 | 87 | raw_score = self.get_raw_score(string1, string2) 88 | 89 | common_len = len(string1) 90 | if common_len == 0: 91 | return 1.0 92 | return 1 - (raw_score / common_len) 93 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/hybrid_similarity_measure.py: -------------------------------------------------------------------------------- 1 | """Hybrid similarity measure""" 2 | 3 | from py_stringmatching.similarity_measure.similarity_measure import \ 4 | SimilarityMeasure 5 | 6 | class HybridSimilarityMeasure(SimilarityMeasure): 7 | pass 8 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/jaccard.py: -------------------------------------------------------------------------------- 1 | from py_stringmatching import utils 2 | from py_stringmatching.similarity_measure.token_similarity_measure import \ 3 | TokenSimilarityMeasure 4 | 5 | 6 | class Jaccard(TokenSimilarityMeasure): 7 | """Computes Jaccard measure. 8 | 9 | For two sets X and Y, the Jaccard similarity score is: 10 | 11 | :math:`jaccard(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}` 12 | 13 | Note: 14 | In the case where both X and Y are empty sets, we define their Jaccard score to be 1. 15 | """ 16 | 17 | def __init__(self): 18 | super(Jaccard, self).__init__() 19 | 20 | def get_raw_score(self, set1, set2): 21 | """Computes the raw Jaccard score between two sets. 22 | 23 | Args: 24 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 25 | 26 | Returns: 27 | Jaccard similarity score (float). 28 | 29 | Raises: 30 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 
31 | 32 | Examples: 33 | >>> jac = Jaccard() 34 | >>> jac.get_raw_score(['data', 'science'], ['data']) 35 | 0.5 36 | >>> jac.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}) 37 | 0.375 38 | >>> jac.get_raw_score(['data', 'management'], ['data', 'data', 'science']) 39 | 0.3333333333333333 40 | """ 41 | 42 | # input validations 43 | utils.sim_check_for_none(set1, set2) 44 | utils.sim_check_for_list_or_set_inputs(set1, set2) 45 | 46 | # if exact match return 1.0 47 | if utils.sim_check_for_exact_match(set1, set2): 48 | return 1.0 49 | 50 | # if one of the strings is empty return 0 51 | if utils.sim_check_for_empty(set1, set2): 52 | return 0 53 | 54 | if not isinstance(set1, set): 55 | set1 = set(set1) 56 | if not isinstance(set2, set): 57 | set2 = set(set2) 58 | 59 | return float(len(set1 & set2)) / float(len(set1 | set2)) 60 | 61 | def get_sim_score(self, set1, set2): 62 | """Computes the normalized Jaccard similarity between two sets. Simply call get_raw_score. 63 | 64 | Args: 65 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 66 | 67 | Returns: 68 | Normalized Jaccard similarity (float). 69 | 70 | Raises: 71 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 72 | 73 | Examples: 74 | >>> jac = Jaccard() 75 | >>> jac.get_sim_score(['data', 'science'], ['data']) 76 | 0.5 77 | >>> jac.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}) 78 | 0.375 79 | >>> jac.get_sim_score(['data', 'management'], ['data', 'data', 'science']) 80 | 0.3333333333333333 81 | """ 82 | return self.get_raw_score(set1, set2) 83 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/jaro.py: -------------------------------------------------------------------------------- 1 | from py_stringmatching import utils 2 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \ 3 | SequenceSimilarityMeasure 4 | from py_stringmatching.similarity_measure.cython.cython_jaro import jaro 5 | 6 | 7 | class Jaro(SequenceSimilarityMeasure): 8 | """Computes Jaro measure. 9 | 10 | The Jaro measure is a type of edit distance, developed mainly to compare short strings, 11 | such as first and last names. 12 | """ 13 | 14 | def __init__(self): 15 | super(Jaro, self).__init__() 16 | 17 | def get_raw_score(self, string1, string2): 18 | """Computes the raw Jaro score between two strings. 19 | 20 | Args: 21 | string1,string2 (str): Input strings. 22 | 23 | Returns: 24 | Jaro similarity score (float). 25 | 26 | Raises: 27 | TypeError : If the inputs are not strings or if one of the inputs is None. 28 | 29 | Examples: 30 | >>> jaro = Jaro() 31 | >>> jaro.get_raw_score('MARTHA', 'MARHTA') 32 | 0.9444444444444445 33 | >>> jaro.get_raw_score('DWAYNE', 'DUANE') 34 | 0.8222222222222223 35 | >>> jaro.get_raw_score('DIXON', 'DICKSONX') 36 | 0.7666666666666666 37 | 38 | """ 39 | 40 | # input validations 41 | utils.sim_check_for_none(string1, string2) 42 | 43 | # convert input to unicode. 44 | string1 = utils.convert_to_unicode(string1) 45 | string2 = utils.convert_to_unicode(string2) 46 | 47 | utils.tok_check_for_string_input(string1, string2) 48 | 49 | # if one of the strings is empty return 0 50 | if utils.sim_check_for_empty(string1, string2): 51 | return 0 52 | 53 | return jaro(string1, string2) 54 | 55 | def get_sim_score(self, string1, string2): 56 | """Computes the normalized Jaro similarity score between two strings. Simply call get_raw_score. 
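        For reference, the Cython routine behind get_raw_score computes the
        standard Jaro formula: with m matching characters (found within a window
        of :math:`\\lfloor \\max(|s_1|, |s_2|)/2 \\rfloor - 1` positions) and t
        transpositions (half the number of matched characters that appear in a
        different order),

        :math:`jaro(s_1, s_2) = \\frac{1}{3} \\left( \\frac{m}{|s_1|} + \\frac{m}{|s_2|} + \\frac{m - t}{m} \\right)`

        For 'MARTHA' and 'MARHTA', m = 6 and t = 1, giving the 0.944... seen in
        the examples below.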
57 | 58 | Args: 59 | string1,string2 (str): Input strings. 60 | 61 | Returns: 62 | Normalized Jaro similarity score (float). 63 | 64 | Raises: 65 | TypeError : If the inputs are not strings or if one of the inputs is None. 66 | 67 | Examples: 68 | >>> jaro = Jaro() 69 | >>> jaro.get_sim_score('MARTHA', 'MARHTA') 70 | 0.9444444444444445 71 | >>> jaro.get_sim_score('DWAYNE', 'DUANE') 72 | 0.8222222222222223 73 | >>> jaro.get_sim_score('DIXON', 'DICKSONX') 74 | 0.7666666666666666 75 | 76 | """ 77 | return self.get_raw_score(string1, string2) 78 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/jaro_winkler.py: -------------------------------------------------------------------------------- 1 | from py_stringmatching import utils 2 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \ 3 | SequenceSimilarityMeasure 4 | from py_stringmatching.similarity_measure.cython.cython_jaro_winkler import jaro_winkler 5 | 6 | 7 | class JaroWinkler(SequenceSimilarityMeasure): 8 | """Computes Jaro-Winkler measure. 9 | 10 | The Jaro-Winkler measure is designed to capture cases where two strings have a low Jaro score, but share a prefix and thus are likely to match. 11 | 12 | Args: 13 | prefix_weight (float): Weight to give to the prefix (defaults to 0.1). 14 | 15 | Attributes: 16 | prefix_weight (float): An attribute to store the prefix weight. 17 | """ 18 | 19 | def __init__(self, prefix_weight=0.1): 20 | self.prefix_weight = prefix_weight 21 | super(JaroWinkler, self).__init__() 22 | 23 | def get_raw_score(self, string1, string2): 24 | """Computes the raw Jaro-Winkler score between two strings. 25 | 26 | Args: 27 | string1,string2 (str): Input strings. 28 | 29 | Returns: 30 | Jaro-Winkler similarity score (float). 31 | 32 | Raises: 33 | TypeError : If the inputs are not strings or if one of the inputs is None. 34 | 35 | Examples: 36 | >>> jw = JaroWinkler() 37 | >>> jw.get_raw_score('MARTHA', 'MARHTA') 38 | 0.9611111111111111 39 | >>> jw.get_raw_score('DWAYNE', 'DUANE') 40 | 0.84 41 | >>> jw.get_raw_score('DIXON', 'DICKSONX') 42 | 0.8133333333333332 43 | 44 | """ 45 | 46 | # input validations 47 | utils.sim_check_for_none(string1, string2) 48 | 49 | # convert input to unicode. 50 | string1 = utils.convert_to_unicode(string1) 51 | string2 = utils.convert_to_unicode(string2) 52 | 53 | utils.tok_check_for_string_input(string1, string2) 54 | 55 | # if one of the strings is empty return 0 56 | if utils.sim_check_for_empty(string1, string2): 57 | return 0 58 | 59 | return jaro_winkler(string1, string2, self.prefix_weight) 60 | 61 | def get_sim_score(self, string1, string2): 62 | """Computes the normalized Jaro-Winkler similarity score between two strings. Simply call get_raw_score. 63 | 64 | Args: 65 | string1,string2 (str): Input strings. 66 | 67 | Returns: 68 | Normalized Jaro-Winkler similarity (float). 69 | 70 | Raises: 71 | TypeError : If the inputs are not strings or if one of the inputs is None. 72 | 73 | Examples: 74 | >>> jw = JaroWinkler() 75 | >>> jw.get_sim_score('MARTHA', 'MARHTA') 76 | 0.9611111111111111 77 | >>> jw.get_sim_score('DWAYNE', 'DUANE') 78 | 0.84 79 | >>> jw.get_sim_score('DIXON', 'DICKSONX') 80 | 0.8133333333333332 81 | """ 82 | return self.get_raw_score(string1, string2) 83 | 84 | def get_prefix_weight(self): 85 | """Get prefix weight. 86 | 87 | Returns: 88 | prefix weight (float). 
89 | """ 90 | return self.prefix_weight 91 | 92 | def set_prefix_weight(self, prefix_weight): 93 | """Set prefix weight. 94 | 95 | Args: 96 | prefix_weight (float): Weight to give to the prefix. 97 | """ 98 | self.prefix_weight = prefix_weight 99 | return True 100 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/levenshtein.py: -------------------------------------------------------------------------------- 1 | from py_stringmatching import utils 2 | from py_stringmatching.similarity_measure.cython.cython_levenshtein import levenshtein 3 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \ 4 | SequenceSimilarityMeasure 5 | 6 | 7 | class Levenshtein(SequenceSimilarityMeasure): 8 | """Computes Levenshtein measure (also known as edit distance). 9 | 10 | Levenshtein distance computes the minimum cost of transforming one string into the other. Transforming a string 11 | is carried out using a sequence of the following operators: delete a character, insert a character, and 12 | substitute one character for another. 13 | """ 14 | 15 | def __init__(self): 16 | super(Levenshtein, self).__init__() 17 | 18 | def get_raw_score(self, string1, string2): 19 | """Computes the raw Levenshtein distance between two strings. 20 | 21 | Args: 22 | string1,string2 (str): Input strings. 23 | 24 | Returns: 25 | Levenshtein distance (int). 26 | 27 | Raises: 28 | TypeError : If the inputs are not strings. 29 | 30 | Examples: 31 | >>> lev = Levenshtein() 32 | >>> lev.get_raw_score('a', '') 33 | 1 34 | >>> lev.get_raw_score('example', 'samples') 35 | 3 36 | >>> lev.get_raw_score('levenshtein', 'frankenstein') 37 | 6 38 | """ 39 | 40 | # input validations 41 | utils.sim_check_for_none(string1, string2) 42 | 43 | # convert input to unicode. 44 | string1 = utils.convert_to_unicode(string1) 45 | string2 = utils.convert_to_unicode(string2) 46 | 47 | utils.tok_check_for_string_input(string1, string2) 48 | 49 | if utils.sim_check_for_exact_match(string1, string2): 50 | return 0.0 51 | 52 | return levenshtein(string1, string2) 53 | 54 | def get_sim_score(self, string1, string2): 55 | """Computes the normalized Levenshtein similarity score between two strings. 56 | 57 | Args: 58 | string1,string2 (str): Input strings. 59 | 60 | Returns: 61 | Normalized Levenshtein similarity (float). 62 | 63 | Raises: 64 | TypeError : If the inputs are not strings. 65 | 66 | Examples: 67 | >>> lev = Levenshtein() 68 | >>> lev.get_sim_score('a', '') 69 | 0.0 70 | >>> lev.get_sim_score('example', 'samples') 71 | 0.5714285714285714 72 | >>> lev.get_sim_score('levenshtein', 'frankenstein') 73 | 0.5 74 | 75 | """ 76 | 77 | # convert input strings to unicode. 
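        # The raw distance used below comes from the Cython routine imported at
        # the top of this module; a rough pure-Python equivalent of the classic
        # dynamic program, for illustration only (levenshtein_sketch is a
        # hypothetical helper):
        #
        #     def levenshtein_sketch(s1, s2):
        #         prev = list(range(len(s2) + 1))
        #         for i, c1 in enumerate(s1, 1):
        #             curr = [i]
        #             for j, c2 in enumerate(s2, 1):
        #                 # deletion, insertion, substitution respectively
        #                 curr.append(min(prev[j] + 1, curr[j - 1] + 1,
        #                                 prev[j - 1] + (c1 != c2)))
        #             prev = curr
        #         return prev[-1]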
78 | string1 = utils.convert_to_unicode(string1)
79 | string2 = utils.convert_to_unicode(string2)
80 |
81 | raw_score = self.get_raw_score(string1, string2)
82 | max_len = max(len(string1), len(string2))
83 | if max_len == 0:
84 | return 1.0
85 | return 1 - (raw_score / max_len)
86 |
-------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/monge_elkan.py: --------------------------------------------------------------------------------
1 | from py_stringmatching import utils
2 | from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler
3 | from py_stringmatching.similarity_measure.hybrid_similarity_measure import \
4 | HybridSimilarityMeasure
5 |
6 |
7 | class MongeElkan(HybridSimilarityMeasure):
8 | """Computes Monge-Elkan measure.
9 |
10 | The Monge-Elkan similarity measure is a type of hybrid similarity measure that combines the benefits of
11 | sequence-based and set-based methods. This can be effective for domains in which more control is needed
12 | over the similarity measure. It implicitly uses a secondary similarity measure, such as Levenshtein, to compute
13 | the overall similarity score. See the string matching chapter in the DI book (Principles of Data Integration).
14 |
15 | Args:
16 | sim_func (function): Secondary similarity function. This is expected to be a sequence-based
17 | similarity measure (defaults to the Jaro-Winkler similarity measure).
18 |
19 | Attributes:
20 | sim_func (function): An attribute to store the secondary similarity function.
21 | """
22 |
23 | def __init__(self, sim_func=JaroWinkler().get_raw_score):
24 | self.sim_func = sim_func
25 | super(MongeElkan, self).__init__()
26 |
27 | def get_raw_score(self, bag1, bag2):
28 | """Computes the raw Monge-Elkan score between two bags (lists).
29 |
30 | Args:
31 | bag1,bag2 (list): Input lists.
32 |
33 | Returns:
34 | Monge-Elkan similarity score (float).
35 |
36 | Raises:
37 | TypeError : If the inputs are not lists or if one of the inputs is None.
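        For reference, the loop below implements the usual Monge-Elkan
        definition over bags X and Y:

        :math:`monge\\_elkan(X, Y) = \\frac{1}{|X|} \\sum_{x \\in X} \\max_{y \\in Y} sim(x, y)`

        Each element of the first bag is paired with its best-matching element
        of the second bag and the maxima are averaged, so the measure is
        asymmetric in its arguments.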
38 | 39 | Examples: 40 | >>> me = MongeElkan() 41 | >>> me.get_raw_score(['Niall'], ['Neal']) 42 | 0.8049999999999999 43 | >>> me.get_raw_score(['Niall'], ['Nigel']) 44 | 0.7866666666666667 45 | >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']) 46 | 0.8364448130130768 47 | >>> me.get_raw_score([''], ['a']) 48 | 0.0 49 | >>> me = MongeElkan(sim_func=NeedlemanWunsch().get_raw_score) 50 | >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']) 51 | 2.0 52 | >>> me = MongeElkan(sim_func=Affine().get_raw_score) 53 | >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']) 54 | 2.25 55 | 56 | References: 57 | * Principles of Data Integration book 58 | """ 59 | 60 | # input validations 61 | utils.sim_check_for_none(bag1, bag2) 62 | utils.sim_check_for_list_or_set_inputs(bag1, bag2) 63 | 64 | # if exact match return 1.0 65 | if utils.sim_check_for_exact_match(bag1, bag2): 66 | return 1.0 67 | 68 | # if one of the strings is empty return 0 69 | if utils.sim_check_for_empty(bag1, bag2): 70 | return 0 71 | 72 | # aggregated sum of all the max sim score of all the elements in bag1 73 | # with elements in bag2 74 | sum_of_maxes = 0 75 | for el1 in bag1: 76 | max_sim = float('-inf') 77 | for el2 in bag2: 78 | max_sim = max(max_sim, self.sim_func(el1, el2)) 79 | sum_of_maxes += max_sim 80 | 81 | sim = float(sum_of_maxes) / float(len(bag1)) 82 | 83 | return sim 84 | 85 | def get_sim_func(self): 86 | """Get the secondary similarity function. 87 | 88 | Returns: 89 | secondary similarity function (function). 90 | """ 91 | return self.sim_func 92 | 93 | def set_sim_func(self, sim_func): 94 | """Set the secondary similarity function. 95 | 96 | Args: 97 | sim_func (function): Secondary similarity function. 98 | """ 99 | self.sim_func = sim_func 100 | return True 101 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/needleman_wunsch.py: -------------------------------------------------------------------------------- 1 | from py_stringmatching import utils 2 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \ 3 | SequenceSimilarityMeasure 4 | from py_stringmatching.similarity_measure.cython.cython_needleman_wunsch import needleman_wunsch 5 | from py_stringmatching.similarity_measure.cython.cython_utils import cython_sim_ident 6 | 7 | 8 | class NeedlemanWunsch(SequenceSimilarityMeasure): 9 | """Computes Needleman-Wunsch measure. 10 | 11 | The Needleman-Wunsch distance generalizes the Levenshtein distance and considers global alignment between two strings. 12 | Specifically, it is computed by assigning a score to each alignment between the two input strings and choosing the 13 | score of the best alignment, that is, the maximal score. An alignment between two strings is a set of correspondences 14 | between their characters, allowing for gaps. 15 | 16 | Args: 17 | gap_cost (float): Cost of gap (defaults to 1.0). 
18 | sim_func (function): Similarity function to give a score for each correspondence between the characters (defaults
19 | to an identity function, which returns 1 if the two characters are the same and 0 otherwise).
20 |
21 | Attributes:
22 | gap_cost (float): An attribute to store the gap cost.
23 | sim_func (function): An attribute to store the similarity function.
24 | """
25 |
26 | def __init__(self, gap_cost=1.0, sim_func=cython_sim_ident):
27 | self.gap_cost = gap_cost
28 | self.sim_func = sim_func
29 | super(NeedlemanWunsch, self).__init__()
30 |
31 | def get_raw_score(self, string1, string2):
32 | """Computes the raw Needleman-Wunsch score between two strings.
33 |
34 | Args:
35 | string1,string2 (str) : Input strings.
36 |
37 | Returns:
38 | Needleman-Wunsch similarity score (float).
39 |
40 | Raises:
41 | TypeError : If the inputs are not strings or if one of the inputs is None.
42 |
43 | Examples:
44 | >>> nw = NeedlemanWunsch()
45 | >>> nw.get_raw_score('dva', 'deeva')
46 | 1.0
47 | >>> nw = NeedlemanWunsch(gap_cost=0.0)
48 | >>> nw.get_raw_score('dva', 'deeve')
49 | 2.0
50 | >>> nw = NeedlemanWunsch(gap_cost=1.0, sim_func=lambda s1, s2 : (2.0 if s1 == s2 else -1.0))
51 | >>> nw.get_raw_score('dva', 'deeve')
52 | 1.0
53 | >>> nw = NeedlemanWunsch(gap_cost=0.5, sim_func=lambda s1, s2 : (1.0 if s1 == s2 else -1.0))
54 | >>> nw.get_raw_score('GCATGCUA', 'GATTACA')
55 | 2.5
56 | """
57 |
58 | # input validations
59 | utils.sim_check_for_none(string1, string2)
60 |
61 | # convert input to unicode.
62 | string1 = utils.convert_to_unicode(string1)
63 | string2 = utils.convert_to_unicode(string2)
64 |
65 | utils.tok_check_for_string_input(string1, string2)
66 |
67 | # returns the similarity score from the cython function
68 | return needleman_wunsch(string1, string2, self.gap_cost, self.sim_func)
69 |
70 | def get_gap_cost(self):
71 | """Get gap cost.
72 |
73 | Returns:
74 | Gap cost (float).
75 | """
76 | return self.gap_cost
77 |
78 | def get_sim_func(self):
79 | """Get the similarity function.
80 |
81 | Returns:
82 | similarity function (function).
83 | """
84 | return self.sim_func
85 |
86 | def set_gap_cost(self, gap_cost):
87 | """Set gap cost.
88 |
89 | Args:
90 | gap_cost (float): Cost of gap.
91 | """
92 | self.gap_cost = gap_cost
93 | return True
94 |
95 | def set_sim_func(self, sim_func):
96 | """Set similarity function.
97 |
98 | Args:
99 | sim_func (function): Similarity function to give a score for the correspondence between characters.
100 | """
101 | self.sim_func = sim_func
102 | return True
103 |
-------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/overlap_coefficient.py: --------------------------------------------------------------------------------
1 | from py_stringmatching import utils
2 | from py_stringmatching.similarity_measure.token_similarity_measure import \
3 | TokenSimilarityMeasure
4 |
5 |
6 | class OverlapCoefficient(TokenSimilarityMeasure):
7 | """Computes overlap coefficient measure.
8 |
9 | The overlap coefficient is a similarity measure related to the Jaccard
10 | measure that computes the overlap between two sets, and is defined as the size of the intersection divided by
11 | the size of the smaller of the two sets.
For two sets X and Y, the overlap coefficient is: 12 | 13 | :math:`overlap\\_coefficient(X, Y) = \\frac{|X \\cap Y|}{\\min(|X|, |Y|)}` 14 | 15 | Note: 16 | * In the case where one of X and Y is an empty set and the other is a non-empty set, we define their overlap coefficient to be 0. 17 | * In the case where both X and Y are empty sets, we define their overlap coefficient to be 1. 18 | """ 19 | 20 | def __init__(self): 21 | super(OverlapCoefficient, self).__init__() 22 | 23 | def get_raw_score(self, set1, set2): 24 | """Computes the raw overlap coefficient score between two sets. 25 | 26 | Args: 27 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 28 | 29 | Returns: 30 | Overlap coefficient (float). 31 | 32 | Raises: 33 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 34 | 35 | Examples: 36 | >>> oc = OverlapCoefficient() 37 | >>> oc.get_raw_score(['data', 'science'], ['data']) 38 | 1.0 39 | >>> oc.get_raw_score([], []) 40 | 1.0 41 | >>> oc.get_raw_score([], ['data']) 42 | 0 43 | 44 | References: 45 | * Wikipedia article : https://en.wikipedia.org/wiki/Overlap_coefficient 46 | * SimMetrics library 47 | """ 48 | 49 | # input validations 50 | utils.sim_check_for_none(set1, set2) 51 | utils.sim_check_for_list_or_set_inputs(set1, set2) 52 | 53 | # if exact match return 1.0 54 | if utils.sim_check_for_exact_match(set1, set2): 55 | return 1.0 56 | 57 | # if one of the strings is empty return 0 58 | if utils.sim_check_for_empty(set1, set2): 59 | return 0 60 | 61 | if not isinstance(set1, set): 62 | set1 = set(set1) 63 | if not isinstance(set2, set): 64 | set2 = set(set2) 65 | 66 | return float(len(set1 & set2)) / min(len(set1), len(set2)) 67 | 68 | def get_sim_score(self, set1, set2): 69 | """Computes the normalized overlap coefficient between two sets. Simply call get_raw_score. 70 | 71 | Args: 72 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 73 | 74 | Returns: 75 | Normalized overlap coefficient (float). 76 | 77 | Raises: 78 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 79 | 80 | Examples: 81 | >>> oc = OverlapCoefficient() 82 | >>> oc.get_sim_score(['data', 'science'], ['data']) 83 | 1.0 84 | >>> oc.get_sim_score([], []) 85 | 1.0 86 | >>> oc.get_sim_score([], ['data']) 87 | 0 88 | 89 | """ 90 | return self.get_raw_score(set1, set2) 91 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/partial_ratio.py: -------------------------------------------------------------------------------- 1 | """Fuzzy Wuzzy Partial Ratio Similarity Measure""" 2 | 3 | from difflib import SequenceMatcher 4 | from py_stringmatching import utils 5 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \ 6 | SequenceSimilarityMeasure 7 | 8 | 9 | class PartialRatio(SequenceSimilarityMeasure): 10 | """Computes the Fuzzy Wuzzy partial ratio similarity between two strings. 11 | 12 | Fuzzy Wuzzy partial ratio raw score is a measure of the strings similarity as an int in the 13 | range [0, 100]. Given two strings X and Y, let the shorter string (X) be of length m. 14 | It finds the fuzzy wuzzy ratio similarity measure between the shorter string and every 15 | substring of length m of the longer string, and returns the maximum of 16 | those similarity measures. Fuzzy Wuzzy partial ratio sim score is a float in the range [0, 1] 17 | and is obtained by dividing the raw score by 100. 
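    A rough pure-Python sketch of this windowing idea, assuming the shorter
    string comes first (get_raw_score below swaps its arguments as needed and
    adds validation; partial_ratio_sketch is a hypothetical helper)::

        from difflib import SequenceMatcher

        def partial_ratio_sketch(shorter, longer):
            blocks = SequenceMatcher(None, shorter, longer).get_matching_blocks()
            best = 0.0
            for a, b, _ in blocks:
                # align a window of the shorter string's length with each block
                start = max(b - a, 0)
                window = longer[start:start + len(shorter)]
                best = max(best, SequenceMatcher(None, shorter, window).ratio())
            return int(round(100 * best))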
18 | 19 | Note: 20 | In the case where either of strings X or Y are empty, we define the Fuzzy Wuzzy ratio similarity 21 | score to be 0. 22 | """ 23 | def __init__(self): 24 | pass 25 | 26 | def get_raw_score(self, string1, string2): 27 | """ 28 | Computes the Fuzzy Wuzzy partial ratio measure raw score between two strings. 29 | This score is in the range [0,100]. 30 | 31 | Args: 32 | string1,string2 (str): Input strings 33 | 34 | Returns: 35 | Partial Ratio measure raw score (int) is returned 36 | 37 | Raises: 38 | TypeError: If the inputs are not strings 39 | 40 | Examples: 41 | >>> s = PartialRatio() 42 | >>> s.get_raw_score('Robert Rupert', 'Rupert') 43 | 100 44 | >>> s.get_raw_score('Sue', 'sue') 45 | 67 46 | >>> s.get_raw_score('example', 'samples') 47 | 86 48 | 49 | References: 50 | * https://pypi.python.org/pypi/fuzzywuzzy 51 | """ 52 | # input validations 53 | utils.sim_check_for_none(string1, string2) 54 | utils.sim_check_for_string_inputs(string1, string2) 55 | 56 | # if one of the strings is empty return 0 57 | if utils.sim_check_for_empty(string1, string2): 58 | return 0 59 | 60 | string1 = utils.convert_to_unicode(string1) 61 | string2 = utils.convert_to_unicode(string2) 62 | 63 | # string1 should be smaller in length than string2. If this is not the case 64 | # then swap string1 and string2 65 | if len(string1) > len(string2): 66 | temp = string1 67 | string1 = string2 68 | string2 = temp 69 | 70 | sm = SequenceMatcher(None, string1, string2) 71 | matching_blocks = sm.get_matching_blocks() 72 | 73 | scores = [] 74 | for block in matching_blocks: 75 | string2_starting_index = 0 76 | if (block[1] - block[0] > 0): 77 | string2_starting_index = block[1] - block[0] 78 | string2_ending_index = string2_starting_index + len(string1) 79 | string2_substr = string2[string2_starting_index:string2_ending_index] 80 | 81 | sm2 = SequenceMatcher(None, string1, string2_substr) 82 | similarity_ratio = sm2.ratio() 83 | if similarity_ratio > .995: 84 | return 100 85 | else: 86 | scores.append(similarity_ratio) 87 | 88 | return int(round(100 * max(scores))) 89 | 90 | def get_sim_score(self, string1, string2): 91 | """ 92 | Computes the Fuzzy Wuzzy partial ratio similarity score between two strings. 93 | This score is in the range [0,1]. 
94 |
95 | Args:
96 | string1,string2 (str): Input strings
97 |
98 | Returns:
99 | Partial Ratio measure similarity score (float) is returned
100 |
101 | Raises:
102 | TypeError: If the inputs are not strings
103 |
104 | Examples:
105 | >>> s = PartialRatio()
106 | >>> s.get_sim_score('Robert Rupert', 'Rupert')
107 | 1.0
108 | >>> s.get_sim_score('Sue', 'sue')
109 | 0.67
110 | >>> s.get_sim_score('example', 'samples')
111 | 0.86
112 |
113 | References:
114 | * https://pypi.python.org/pypi/fuzzywuzzy
115 | """
116 | # input validations
117 | utils.sim_check_for_none(string1, string2)
118 | utils.sim_check_for_string_inputs(string1, string2)
119 |
120 | # if one of the strings is empty return 0
121 | if utils.sim_check_for_empty(string1, string2):
122 | return 0
123 |
124 | raw_score = 1.0 * self.get_raw_score(string1, string2)
125 | sim_score = raw_score / 100
126 | return sim_score
127 |
-------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/partial_token_sort.py: --------------------------------------------------------------------------------
1 | """Fuzzy Wuzzy Partial Token Sort Similarity Measure"""
2 |
3 | from py_stringmatching import utils
4 |
5 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \
6 | SequenceSimilarityMeasure
7 | from py_stringmatching.similarity_measure.partial_ratio import PartialRatio
8 |
9 |
10 | class PartialTokenSort(SequenceSimilarityMeasure):
11 | """Computes Fuzzy Wuzzy partial token sort similarity measure.
12 |
13 | Fuzzy Wuzzy partial token sort ratio raw score is a measure of the strings' similarity as an
14 | int in the range [0, 100]. For two strings X and Y, the score is obtained by
15 | splitting the two strings into tokens and then sorting the tokens. The score is
16 | then the fuzzy wuzzy partial ratio raw score of the transformed strings. Fuzzy Wuzzy partial token
17 | sort sim score is a float in the range [0, 1] and is obtained by dividing the raw score
18 | by 100.
19 |
20 | Note:
21 | In the case where either of strings X or Y is empty, we define the
22 | Fuzzy Wuzzy partial token sort similarity score to be 0.
23 | """
24 | def __init__(self):
25 | pass
26 |
27 | def _process_string_and_sort(self, s, force_ascii, full_process=True):
28 | """Returns a string with tokens sorted. Processes the string if the
29 | full_process flag is enabled. If the force_ascii flag is enabled, then
30 | processing removes non-ascii characters from the string."""
31 | # pull tokens
32 | ts = utils.process_string(s, force_ascii=force_ascii) if full_process else s
33 | tokens = ts.split()
34 |
35 | # sort tokens and join
36 | sorted_string = u" ".join(sorted(tokens))
37 | return sorted_string.strip()
38 |
39 | def get_raw_score(self, string1, string2, force_ascii=True, full_process=True):
40 | """
41 | Computes the Fuzzy Wuzzy partial token sort measure raw score between two strings.
42 | This score is in the range [0,100].
43 |
44 | Args:
45 | string1,string2 (str) : Input strings
46 | force_ascii (boolean) : Flag to remove non-ascii characters or not
47 | full_process (boolean) : Flag to process the string or not. Processing includes
48 | removing non-alphanumeric characters, converting the string to lower case and
49 | removing leading and trailing whitespaces.
50 | 51 | Returns: 52 | Partial Token Sort measure raw score (int) is returned 53 | 54 | Raises: 55 | TypeError: If the inputs are not strings 56 | 57 | Examples: 58 | >>> s = PartialTokenSort() 59 | >>> s.get_raw_score('great is scala', 'java is great') 60 | 81 61 | >>> s.get_raw_score('Sue', 'sue') 62 | 100 63 | >>> s.get_raw_score('C++ and Java', 'Java and Python') 64 | 64 65 | 66 | References: 67 | * https://pypi.python.org/pypi/fuzzywuzzy 68 | """ 69 | # input validations 70 | utils.sim_check_for_none(string1, string2) 71 | utils.sim_check_for_string_inputs(string1, string2) 72 | 73 | # if one of the strings is empty return 0 74 | if utils.sim_check_for_empty(string1, string2): 75 | return 0 76 | 77 | sorted1 = self._process_string_and_sort(string1, force_ascii, full_process=full_process) 78 | sorted2 = self._process_string_and_sort(string2, force_ascii, full_process=full_process) 79 | partialRatio = PartialRatio() 80 | return partialRatio.get_raw_score(sorted1, sorted2) 81 | 82 | def get_sim_score(self, string1, string2, force_ascii=True, full_process=True): 83 | """ 84 | Computes the Fuzzy Wuzzy partial token sort similarity score between two strings. 85 | This score is in the range [0,1]. 86 | 87 | Args: 88 | string1,string2 (str), : Input strings 89 | force_ascii (boolean) : Flag to remove non-ascii characters or not 90 | full_process (boolean) : Flag to process the string or not. Processing includes 91 | removing non alphanumeric characters, converting string to lower case and 92 | removing leading and trailing whitespaces. 93 | 94 | Returns: 95 | Partial Token Sort measure similarity score (float) is returned 96 | 97 | Raises: 98 | TypeError: If the inputs are not strings 99 | 100 | Examples: 101 | >>> s = PartialTokenSort() 102 | >>> s.get_sim_score('great is scala', 'java is great') 103 | 0.81 104 | >>> s.get_sim_score('Sue', 'sue') 105 | 1.0 106 | >>> s.get_sim_score('C++ and Java', 'Java and Python') 107 | 0.64 108 | 109 | References: 110 | * https://pypi.python.org/pypi/fuzzywuzzy 111 | """ 112 | raw_score = 1.0 * self.get_raw_score(string1, string2, force_ascii, full_process) 113 | sim_score = raw_score / 100 114 | return sim_score -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/phonetic_similarity_measure.py: -------------------------------------------------------------------------------- 1 | """Phonetics based similarity measure""" 2 | 3 | from py_stringmatching.similarity_measure.similarity_measure import \ 4 | SimilarityMeasure 5 | 6 | class PhoneticSimilarityMeasure(SimilarityMeasure): 7 | pass 8 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/ratio.py: -------------------------------------------------------------------------------- 1 | """Fuzzy Wuzzy Ratio Similarity Measure""" 2 | 3 | from difflib import SequenceMatcher 4 | from py_stringmatching import utils 5 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \ 6 | SequenceSimilarityMeasure 7 | 8 | 9 | class Ratio(SequenceSimilarityMeasure): 10 | """Computes Fuzzy Wuzzy ratio similarity measure. 11 | 12 | Fuzzy Wuzzy ratio raw score is a measure of the strings similarity as an int in the 13 | range [0, 100]. For two strings X and Y, the score is defined by 14 | int(round((2.0 * M / T) * 100)) where T is the total number of characters in 15 | both strings, and M is the number of matches in the two strings. 
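    Since SequenceMatcher.ratio() in difflib is exactly 2.0 * M / T, a minimal
    sketch of the raw score (ratio_sketch is a hypothetical helper mirroring
    get_raw_score below)::

        from difflib import SequenceMatcher

        def ratio_sketch(s1, s2):
            return int(round(100 * SequenceMatcher(None, s1, s2).ratio()))

        ratio_sketch('Robert', 'Rupert')   # 67, as in the examples below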
Fuzzy Wuzzy ratio 16 | sim score is a float in the range [0, 1] and is obtained by dividing the raw score 17 | by 100. 18 | 19 | Note: 20 | In the case where either of strings X or Y are empty, we define the 21 | Fuzzy Wuzzy ratio similarity score to be 0. 22 | """ 23 | def __init__(self): 24 | pass 25 | 26 | def get_raw_score(self, string1, string2): 27 | """ 28 | Computes the Fuzzy Wuzzy ratio measure raw score between two strings. 29 | This score is in the range [0,100]. 30 | 31 | Args: 32 | string1,string2 (str): Input strings 33 | 34 | Returns: 35 | Ratio measure raw score (int) is returned 36 | 37 | Raises: 38 | TypeError: If the inputs are not strings 39 | 40 | Examples: 41 | >>> s = Ratio() 42 | >>> s.get_raw_score('Robert', 'Rupert') 43 | 67 44 | >>> s.get_raw_score('Sue', 'sue') 45 | 67 46 | >>> s.get_raw_score('example', 'samples') 47 | 71 48 | 49 | References: 50 | * https://pypi.python.org/pypi/fuzzywuzzy 51 | """ 52 | # input validations 53 | utils.sim_check_for_none(string1, string2) 54 | utils.sim_check_for_string_inputs(string1, string2) 55 | 56 | # if one of the strings is empty return 0 57 | if utils.sim_check_for_empty(string1, string2): 58 | return 0 59 | 60 | string1 = utils.convert_to_unicode(string1) 61 | string2 = utils.convert_to_unicode(string2) 62 | 63 | sm = SequenceMatcher(None, string1, string2) 64 | return int(round(100 * sm.ratio())) 65 | 66 | def get_sim_score(self, string1, string2): 67 | """ 68 | Computes the Fuzzy Wuzzy ratio similarity score between two strings. 69 | This score is in the range [0,1]. 70 | 71 | Args: 72 | string1,string2 (str): Input strings 73 | 74 | Returns: 75 | Ratio measure similarity score (float) is returned 76 | 77 | Raises: 78 | TypeError: If the inputs are not strings 79 | 80 | Examples: 81 | >>> s = Ratio() 82 | >>> s.get_sim_score('Robert', 'Rupert') 83 | 0.67 84 | >>> s.get_sim_score('Sue', 'sue') 85 | 0.67 86 | >>> s.get_sim_score('example', 'samples') 87 | 0.71 88 | 89 | References: 90 | * https://pypi.python.org/pypi/fuzzywuzzy 91 | """ 92 | # input validations 93 | utils.sim_check_for_none(string1, string2) 94 | utils.sim_check_for_string_inputs(string1, string2) 95 | 96 | # if one of the strings is empty return 0 97 | if utils.sim_check_for_empty(string1, string2): 98 | return 0 99 | 100 | raw_score = 1.0 * self.get_raw_score(string1, string2) 101 | sim_score = raw_score / 100 102 | return sim_score 103 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/sequence_similarity_measure.py: -------------------------------------------------------------------------------- 1 | """Sequence based similarity measure""" 2 | 3 | from py_stringmatching.similarity_measure.similarity_measure import \ 4 | SimilarityMeasure 5 | 6 | class SequenceSimilarityMeasure(SimilarityMeasure): 7 | pass 8 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/similarity_measure.py: -------------------------------------------------------------------------------- 1 | """Similarity measure""" 2 | 3 | class SimilarityMeasure(object): 4 | pass 5 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/smith_waterman.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from py_stringmatching.similarity_measure.cython.cython_utils import cython_sim_ident 4 | from py_stringmatching import utils 5 | from 
py_stringmatching.similarity_measure.sequence_similarity_measure import \ 6 | SequenceSimilarityMeasure 7 | from py_stringmatching.similarity_measure.cython.cython_smith_waterman import smith_waterman 8 | 9 | 10 | class SmithWaterman(SequenceSimilarityMeasure): 11 | """Computes Smith-Waterman measure. 12 | 13 | The Smith-Waterman algorithm performs local sequence alignment; that is, for determining similar regions 14 | between two strings. Instead of looking at the total sequence, the Smith–Waterman algorithm compares segments of 15 | all possible lengths and optimizes the similarity measure. See the string matching chapter in the DI book (Principles of Data Integration). 16 | 17 | Args: 18 | gap_cost (float): Cost of gap (defaults to 1.0). 19 | sim_func (function): Similarity function to give a score for the correspondence between the characters (defaults 20 | to an identity function, which returns 1 if the two characters are the same and 0 otherwise). 21 | 22 | Attributes: 23 | gap_cost (float): An attribute to store the gap cost. 24 | sim_func (function): An attribute to store the similarity function. 25 | """ 26 | 27 | def __init__(self, gap_cost=1.0, sim_func=cython_sim_ident): 28 | self.gap_cost = gap_cost 29 | self.sim_func = sim_func 30 | super(SmithWaterman, self).__init__() 31 | 32 | def get_raw_score(self, string1, string2): 33 | """Computes the raw Smith-Waterman score between two strings. 34 | 35 | Args: 36 | string1,string2 (str) : Input strings. 37 | 38 | Returns: 39 | Smith-Waterman similarity score (float). 40 | 41 | Raises: 42 | TypeError : If the inputs are not strings or if one of the inputs is None. 43 | 44 | Examples: 45 | >>> sw = SmithWaterman() 46 | >>> sw.get_raw_score('cat', 'hat') 47 | 2.0 48 | >>> sw = SmithWaterman(gap_cost=2.2) 49 | >>> sw.get_raw_score('dva', 'deeve') 50 | 1.0 51 | >>> sw = SmithWaterman(gap_cost=1, sim_func=lambda s1, s2 : (2 if s1 == s2 else -1)) 52 | >>> sw.get_raw_score('dva', 'deeve') 53 | 2.0 54 | >>> sw = SmithWaterman(gap_cost=1.4, sim_func=lambda s1, s2 : (1.5 if s1 == s2 else 0.5)) 55 | >>> sw.get_raw_score('GCATAGCU', 'GATTACA') 56 | 6.5 57 | """ 58 | 59 | # input validations 60 | utils.sim_check_for_none(string1, string2) 61 | 62 | # convert input to unicode. 63 | string1 = utils.convert_to_unicode(string1) 64 | string2 = utils.convert_to_unicode(string2) 65 | 66 | utils.tok_check_for_string_input(string1, string2) 67 | 68 | # Returns smith waterman similarity score from cython function 69 | return smith_waterman(string1,string2,self.gap_cost,self.sim_func) 70 | 71 | def get_gap_cost(self): 72 | """Get gap cost. 73 | 74 | Returns: 75 | Gap cost (float). 76 | """ 77 | return self.gap_cost 78 | 79 | def get_sim_func(self): 80 | """Get similarity function. 81 | 82 | Returns: 83 | Similarity function (function). 84 | """ 85 | return self.sim_func 86 | 87 | def set_gap_cost(self, gap_cost): 88 | """Set gap cost. 89 | 90 | Args: 91 | gap_cost (float): Cost of gap. 92 | """ 93 | self.gap_cost = gap_cost 94 | return True 95 | 96 | def set_sim_func(self, sim_func): 97 | """Set similarity function. 98 | 99 | Args: 100 | sim_func (function): Similarity function to give a score for the correspondence between the characters. 
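        For reference, the Cython routine used by get_raw_score above fills the
        standard Smith-Waterman local-alignment matrix

        :math:`H[i][j] = \\max(0, H[i-1][j-1] + sim\\_func(s_1[i], s_2[j]), H[i-1][j] - gap\\_cost, H[i][j-1] - gap\\_cost)`

        and returns the largest entry of H, that is, the score of the best
        local alignment.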
101 | """ 102 | self.sim_func = sim_func 103 | return True 104 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/soft_tfidf.py: -------------------------------------------------------------------------------- 1 | from math import sqrt 2 | import collections 3 | 4 | from py_stringmatching import utils 5 | from py_stringmatching.similarity_measure.jaro import Jaro 6 | from py_stringmatching.similarity_measure.hybrid_similarity_measure import \ 7 | HybridSimilarityMeasure 8 | 9 | 10 | class SoftTfIdf(HybridSimilarityMeasure): 11 | """Computes soft TF/IDF measure. 12 | 13 | Note: 14 | Currently, this measure is implemented without dampening. This is similar to setting dampen flag to be False in TF-IDF. 15 | We plan to add the dampen flag in the next release. 16 | 17 | Args: 18 | corpus_list (list of lists): Corpus list (default is set to None) of strings. If set to None, 19 | the input list are considered the only corpus. 20 | sim_func (function): Secondary similarity function. This should return a similarity score between two strings (optional), 21 | default is the Jaro similarity measure. 22 | threshold (float): Threshold value for the secondary similarity function (defaults to 0.5). If the similarity 23 | of a token pair exceeds the threshold, then the token pair is considered a match. 24 | 25 | Attributes: 26 | sim_func (function): An attribute to store the secondary similarity function. 27 | threshold (float): An attribute to store the threshold value for the secondary similarity function. 28 | """ 29 | 30 | def __init__(self, corpus_list=None, sim_func=Jaro().get_raw_score, 31 | threshold=0.5): 32 | self.__corpus_list = corpus_list 33 | self.__document_frequency = {} 34 | self.__compute_document_frequency() 35 | self.__corpus_size = 0 if self.__corpus_list is None else ( 36 | len(self.__corpus_list)) 37 | self.sim_func = sim_func 38 | self.threshold = threshold 39 | super(SoftTfIdf, self).__init__() 40 | 41 | def get_raw_score(self, bag1, bag2): 42 | """Computes the raw soft TF/IDF score between two lists given the corpus information. 43 | 44 | Args: 45 | bag1,bag2 (list): Input lists 46 | 47 | Returns: 48 | Soft TF/IDF score between the input lists (float). 49 | 50 | Raises: 51 | TypeError : If the inputs are not lists or if one of the inputs is None. 52 | 53 | Examples: 54 | >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=Jaro().get_raw_score, threshold=0.8) 55 | >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c']) 56 | 0.17541160386140586 57 | >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], threshold=0.9) 58 | >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a']) 59 | 0.5547001962252291 60 | >>> soft_tfidf = SoftTfIdf([['x', 'y'], ['w'], ['q']]) 61 | >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a']) 62 | 0.0 63 | >>> soft_tfidf = SoftTfIdf(sim_func=Affine().get_raw_score, threshold=0.6) 64 | >>> soft_tfidf.get_raw_score(['aa', 'bb', 'a'], ['ab', 'ba']) 65 | 0.81649658092772592 66 | 67 | References: 68 | * the string matching chapter of the "Principles of Data Integration" book. 
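        For reference, the code below computes, without dampening,

        :math:`soft\\_tfidf(X, Y) = \\frac{\\sum_{(x, y) \\in close(X, Y)} v(x) \\cdot v(y) \\cdot sim(x, y)}{\\|V_X\\| \\cdot \\|V_Y\\|}`

        where close(X, Y) pairs each token x of X with its best-matching token y
        of Y whose secondary similarity exceeds the threshold, v(.) is the
        tf * idf weight of a token, and the denominator holds the norms of the
        two weighted vectors.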
69 | """ 70 | 71 | # input validations 72 | utils.sim_check_for_none(bag1, bag2) 73 | utils.sim_check_for_list_or_set_inputs(bag1, bag2) 74 | 75 | # if the strings match exactly return 1.0 76 | if utils.sim_check_for_exact_match(bag1, bag2): 77 | return 1.0 78 | 79 | # if one of the strings is empty return 0 80 | if utils.sim_check_for_empty(bag1, bag2): 81 | return 0 82 | 83 | # term frequency for input strings 84 | tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2) 85 | 86 | # find unique elements in the input lists and their document frequency 87 | local_df = {} 88 | for element in tf_x: 89 | local_df[element] = local_df.get(element, 0) + 1 90 | for element in tf_y: 91 | local_df[element] = local_df.get(element, 0) + 1 92 | 93 | # if corpus is not provided treat input string as corpus 94 | curr_df, corpus_size = (local_df, 2) if self.__corpus_list is None else ( 95 | (self.__document_frequency, self.__corpus_size)) 96 | 97 | # calculating the term sim score against the input string 2, 98 | # construct similarity map 99 | similarity_map = {} 100 | for term_x in tf_x: 101 | max_score = 0.0 102 | for term_y in tf_y: 103 | score = self.sim_func(term_x, term_y) 104 | # adding sim only if it is above threshold and 105 | # highest for this element 106 | if score > self.threshold and score > max_score: 107 | similarity_map[term_x] = (term_x, term_y, score) 108 | max_score = score 109 | 110 | # position of first string, second string and sim score 111 | # in the tuple 112 | first_string_pos = 0 113 | second_string_pos = 1 114 | sim_score_pos = 2 115 | 116 | result, v_x_2, v_y_2 = 0.0, 0.0, 0.0 117 | # soft-tfidf calculation 118 | for element in local_df.keys(): 119 | if curr_df.get(element) is None: 120 | continue 121 | # numerator 122 | if element in similarity_map: 123 | sim = similarity_map[element] 124 | idf_first = corpus_size / curr_df.get(sim[first_string_pos], 1) 125 | idf_second = corpus_size / curr_df.get(sim[second_string_pos], 1) 126 | v_x = idf_first * tf_x.get(sim[first_string_pos], 0) 127 | v_y = idf_second * tf_y.get(sim[second_string_pos], 0) 128 | result += v_x * v_y * sim[sim_score_pos] 129 | # denominator 130 | idf = corpus_size / curr_df[element] 131 | v_x = idf * tf_x.get(element, 0) 132 | v_x_2 += v_x * v_x 133 | v_y = idf * tf_y.get(element, 0) 134 | v_y_2 += v_y * v_y 135 | return result if v_x_2 == 0 else result / (sqrt(v_x_2) * sqrt(v_y_2)) 136 | 137 | def get_corpus_list(self): 138 | """Get corpus list. 139 | 140 | Returns: 141 | corpus list (list of lists). 142 | """ 143 | return self.__corpus_list 144 | 145 | def get_sim_func(self): 146 | """Get secondary similarity function. 147 | 148 | Returns: 149 | secondary similarity function (function). 150 | """ 151 | return self.sim_func 152 | 153 | def get_threshold(self): 154 | """Get threshold used for the secondary similarity function. 155 | 156 | Returns: 157 | threshold (float). 158 | """ 159 | return self.threshold 160 | 161 | def set_threshold(self, threshold): 162 | """Set threshold value for the secondary similarity function. 163 | 164 | Args: 165 | threshold (float): threshold value. 166 | """ 167 | self.threshold = threshold 168 | return True 169 | 170 | def set_sim_func(self, sim_func): 171 | """Set secondary similarity function. 172 | 173 | Args: 174 | sim_func (function): Secondary similarity function. 175 | """ 176 | self.sim_func = sim_func 177 | return True 178 | 179 | def set_corpus_list(self, corpus_list): 180 | """Set corpus list. 
181 |
182 | Args:
183 | corpus_list (list of lists): Corpus list.
184 | """
185 | self.__corpus_list = corpus_list
186 | self.__document_frequency = {}
187 | self.__compute_document_frequency()
188 | self.__corpus_size = 0 if self.__corpus_list is None else (
189 | len(self.__corpus_list))
190 | return True
191 |
192 | def __compute_document_frequency(self):
193 | if self.__corpus_list is not None:
194 | for document in self.__corpus_list:
195 | for element in set(document):
196 | self.__document_frequency[element] = (
197 | self.__document_frequency.get(element, 0) + 1)
198 |
-------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/soundex.py: --------------------------------------------------------------------------------
1 | """Soundex phonetic similarity measure"""
2 |
3 | import re
4 |
5 | from py_stringmatching import utils
6 | from py_stringmatching.similarity_measure.phonetic_similarity_measure import \
7 | PhoneticSimilarityMeasure
8 |
9 |
10 | class Soundex(PhoneticSimilarityMeasure):
11 | """Soundex phonetic similarity measure class.
12 | """
13 | def __init__(self):
14 | super(Soundex, self).__init__()
15 |
16 | def get_raw_score(self, string1, string2):
17 | """
18 | Computes the Soundex phonetic similarity between two strings.
19 |
20 | Phonetic measures such as Soundex match strings based on their sound. These
21 | measures have been especially effective in matching names, since names are
22 | often spelled in different ways that sound the same. For example, Meyer, Meier,
23 | and Mire sound the same, as do Smith, Smithe, and Smythe.
24 |
25 | Soundex is used primarily to match surnames. It does not work as well for names
26 | of East Asian origin, because much of the discriminating power of these names
27 | resides in the vowel sounds, which the code ignores.
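        The method below compares 4-character Soundex codes: keep the first
        letter, drop vowels and H/W/Y from the remainder, map the remaining
        consonants to digit classes (B,F,P,V -> 1; C,G,J,K,Q,S,X,Z -> 2;
        D,T -> 3; L -> 4; M,N -> 5; R -> 6), and truncate. For example, both
        'Robert' and 'Rupert' encode to R163, hence the score of 1 in the
        examples below.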
28 | 29 | Args: 30 | string1,string2 (str): Input strings 31 | 32 | Returns: 33 | Soundex similarity score (int) is returned 34 | 35 | Raises: 36 | TypeError : If the inputs are not strings 37 | 38 | Examples: 39 | >>> s = Soundex() 40 | >>> s.get_raw_score('Robert', 'Rupert') 41 | 1 42 | >>> s.get_raw_score('Sue', 's') 43 | 1 44 | >>> s.get_raw_score('Gough', 'Goff') 45 | 0 46 | >>> s.get_raw_score('a,,li', 'ali') 47 | 1 48 | 49 | """ 50 | # input validations 51 | utils.sim_check_for_none(string1, string2) 52 | utils.sim_check_for_string_inputs(string1, string2) 53 | 54 | # remove all chars but alphanumeric characters 55 | string1 = re.sub("[^a-zA-Z0-9]", "", string1) 56 | string2 = re.sub("[^a-zA-Z0-9]", "", string2) 57 | 58 | utils.sim_check_for_zero_len(string1, string2) 59 | 60 | if utils.sim_check_for_exact_match(string1, string2): 61 | return 1 62 | 63 | string1, string2 = string1.upper(), string2.upper() 64 | first_letter1, first_letter2 = string1[0], string2[0] 65 | string1, string2 = string1[1:], string2[1:] 66 | 67 | # remove occurrences of vowels, 'y', 'w' and 'h' 68 | string1 = re.sub('[AEIOUYWH]', '', string1) 69 | string2 = re.sub('[AEIOUYWH]', '', string2) 70 | 71 | # replace (B,F,P,V)->1 (C,G,J,K,Q,S,X,Z)->2 (D,T)->3 (L)->4 72 | # (M,N)->5 (R)->6 73 | string1 = re.sub('[BFPV]', '1', string1) 74 | string1 = re.sub('[CGJKQSXZ]', '2', string1) 75 | string1 = re.sub('[DT]', '3', string1) 76 | string1 = re.sub('[L]', '4', string1) 77 | string1 = re.sub('[MN]', '5', string1) 78 | string1 = re.sub('[R]', '6', string1) 79 | 80 | string2 = re.sub('[BFPV]', '1', string2) 81 | string2 = re.sub('[CGJKQSXZ]', '2', string2) 82 | string2 = re.sub('[DT]', '3', string2) 83 | string2 = re.sub('[L]', '4', string2) 84 | string2 = re.sub('[MN]', '5', string2) 85 | string2 = re.sub('[R]', '6', string2) 86 | 87 | string1 = first_letter1 + string1[:3] 88 | string2 = first_letter2 + string2[:3] 89 | 90 | return 1 if string1 == string2 else 0 91 | 92 | def get_sim_score(self, string1, string2): 93 | """ 94 | Computes the normalized soundex similarity between two strings. 95 | 96 | Args: 97 | string1,string2 (str): Input strings 98 | 99 | Returns: 100 | Normalized soundex similarity (int) 101 | 102 | Raises: 103 | TypeError : If the inputs are not strings or if one of the inputs is None. 104 | 105 | Examples: 106 | >>> s = Soundex() 107 | >>> s.get_sim_score('Robert', 'Rupert') 108 | 1 109 | >>> s.get_sim_score('Sue', 's') 110 | 1 111 | >>> s.get_sim_score('Gough', 'Goff') 112 | 0 113 | >>> s.get_sim_score('a,,li', 'ali') 114 | 1 115 | 116 | """ 117 | return self.get_raw_score(string1, string2) 118 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/tfidf.py: -------------------------------------------------------------------------------- 1 | from math import log, sqrt 2 | import collections 3 | 4 | from py_stringmatching import utils 5 | from py_stringmatching.similarity_measure.token_similarity_measure import \ 6 | TokenSimilarityMeasure 7 | 8 | 9 | class TfIdf(TokenSimilarityMeasure): 10 | """Computes TF/IDF measure. 11 | 12 | This measure employs the notion of TF/IDF score commonly used in information retrieval (IR) to 13 | find documents that are relevant to keyword queries. The intuition underlying the TF/IDF measure 14 | is that two strings are similar if they share distinguishing terms. 
See the string matching chapter in the book "Principles of Data Integration" 15 | 16 | Args: 17 | corpus_list (list of lists): The corpus that will be used to compute TF and IDF values. This corpus is a list of strings, where each string has been tokenized into a list of tokens (that is, a bag of tokens). The default is set to None. In this case, when we call this TF/IDF measure on two input strings (using get_raw_score or get_sim_score), the corpus is taken to be the list of those two strings. 18 | dampen (boolean): Flag to indicate whether 'log' should be used in TF and IDF formulas (defaults to True). 19 | 20 | Attributes: 21 | dampen (boolean): An attribute to store the dampen flag. 22 | """ 23 | 24 | def __init__(self, corpus_list=None, dampen=True): 25 | self.__corpus_list = corpus_list 26 | self.__document_frequency = {} 27 | self.__compute_document_frequency() 28 | self.__corpus_size = 0 if self.__corpus_list is None else ( 29 | len(self.__corpus_list)) 30 | self.dampen = dampen 31 | super(TfIdf, self).__init__() 32 | 33 | def get_raw_score(self, bag1, bag2): 34 | """Computes the raw TF/IDF score between two lists. 35 | 36 | Args: 37 | bag1,bag2 (list): Input lists. 38 | 39 | Returns: 40 | TF/IDF score between the input lists (float). 41 | 42 | Raises: 43 | TypeError : If the inputs are not lists or if one of the inputs is None. 44 | 45 | Examples: 46 | 47 | >>> # here the corpus is a list of three strings that 48 | >>> # have been tokenized into three lists of tokens 49 | >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']]) 50 | >>> tfidf.get_raw_score(['a', 'b', 'a'], ['b', 'c']) 51 | 0.7071067811865475 52 | >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a']) 53 | 0.0 54 | >>> tfidf = TfIdf([['x', 'y'], ['w'], ['q']]) 55 | >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a']) 56 | 0.0 57 | >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], False) 58 | >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c']) 59 | 0.25298221281347033 60 | >>> tfidf = TfIdf(dampen=False) 61 | >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a']) 62 | 0.7071067811865475 63 | >>> tfidf = TfIdf() 64 | >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a']) 65 | 0.0 66 | """ 67 | # input validations 68 | utils.sim_check_for_none(bag1, bag2) 69 | utils.sim_check_for_list_or_set_inputs(bag1, bag2) 70 | 71 | # if the strings match exactly return 1.0 72 | if utils.sim_check_for_exact_match(bag1, bag2): 73 | return 1.0 74 | 75 | # if one of the strings is empty return 0 76 | if utils.sim_check_for_empty(bag1, bag2): 77 | return 0 78 | 79 | # term frequency for input strings 80 | tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2) 81 | 82 | # find unique elements in the input lists and their document frequency 83 | local_df = {} 84 | for element in tf_x: 85 | local_df[element] = local_df.get(element, 0) + 1 86 | for element in tf_y: 87 | local_df[element] = local_df.get(element, 0) + 1 88 | 89 | # if corpus is not provided treat input string as corpus 90 | curr_df, corpus_size = (local_df, 2) if self.__corpus_list is None else ( 91 | (self.__document_frequency, self.__corpus_size)) 92 | 93 | idf_element, v_x, v_y, v_x_y, v_x_2, v_y_2 = (0.0, 0.0, 0.0, 94 | 0.0, 0.0, 0.0) 95 | 96 | # tfidf calculation 97 | for element in local_df.keys(): 98 | df_element = curr_df.get(element) 99 | if df_element is None: 100 | continue 101 | idf_element = corpus_size * 1.0 / df_element 102 | v_x = 0 if element not in tf_x else (log(idf_element) * log(tf_x[element] + 1)) if self.dampen else ( 103 | idf_element * tf_x[element]) 
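            # with dampen=True each weight is log(idf) * log(tf + 1); with
            # dampen=False it is the raw idf * tf product. v_y below mirrors
            # v_x for the second bag, and the final score is the cosine of the
            # two weighted vectors.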
104 | v_y = 0 if element not in tf_y else (log(idf_element) * log(tf_y[element] + 1)) if self.dampen else ( 105 | idf_element * tf_y[element]) 106 | v_x_y += v_x * v_y 107 | v_x_2 += v_x * v_x 108 | v_y_2 += v_y * v_y 109 | 110 | return 0.0 if v_x_y == 0 else v_x_y / (sqrt(v_x_2) * sqrt(v_y_2)) 111 | 112 | def get_sim_score(self, bag1, bag2): 113 | """Computes the normalized TF/IDF similarity score between two lists. Simply call get_raw_score. 114 | 115 | Args: 116 | bag1,bag2 (list): Input lists. 117 | 118 | Returns: 119 | Normalized TF/IDF similarity score between the input lists (float). 120 | 121 | Raises: 122 | TypeError : If the inputs are not lists or if one of the inputs is None. 123 | 124 | Examples: 125 | 126 | >>> # here the corpus is a list of three strings that 127 | >>> # have been tokenized into three lists of tokens 128 | >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']]) 129 | >>> tfidf.get_sim_score(['a', 'b', 'a'], ['b', 'c']) 130 | 0.7071067811865475 131 | >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a']) 132 | 0.0 133 | >>> tfidf = TfIdf([['x', 'y'], ['w'], ['q']]) 134 | >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a']) 135 | 0.0 136 | >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], False) 137 | >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a', 'c']) 138 | 0.25298221281347033 139 | >>> tfidf = TfIdf(dampen=False) 140 | >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a']) 141 | 0.7071067811865475 142 | >>> tfidf = TfIdf() 143 | >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a']) 144 | 0.0 145 | """ 146 | return self.get_raw_score(bag1, bag2) 147 | 148 | def get_dampen(self): 149 | """Get dampen flag. 150 | 151 | Returns: 152 | dampen flag (boolean). 153 | """ 154 | return self.dampen 155 | 156 | def get_corpus_list(self): 157 | """Get corpus list. 158 | 159 | Returns: 160 | corpus list (list of lists). 161 | """ 162 | return self.__corpus_list 163 | 164 | def set_dampen(self, dampen): 165 | """Set dampen flag. 166 | 167 | Args: 168 | dampen (boolean): Flag to indicate whether 'log' should be applied to TF and IDF formulas. 169 | """ 170 | self.dampen = dampen 171 | return True 172 | 173 | def set_corpus_list(self, corpus_list): 174 | """Set corpus list. 175 | 176 | Args: 177 | corpus_list (list of lists): Corpus list. 
178 | """
179 | self.__corpus_list = corpus_list
180 | self.__document_frequency = {}
181 | self.__compute_document_frequency()
182 | self.__corpus_size = 0 if self.__corpus_list is None else (
183 | len(self.__corpus_list))
184 | return True
185 |
186 | def __compute_document_frequency(self):
187 | if self.__corpus_list is not None:
188 | for document in self.__corpus_list:
189 | for element in set(document):
190 | self.__document_frequency[element] = (
191 | self.__document_frequency.get(element, 0) + 1)
192 |
-------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/token_similarity_measure.py: --------------------------------------------------------------------------------
1 | """Token based similarity measure"""
2 |
3 | from py_stringmatching.similarity_measure.similarity_measure import \
4 | SimilarityMeasure
5 |
6 | class TokenSimilarityMeasure(SimilarityMeasure):
7 | pass
8 |
-------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/token_sort.py: --------------------------------------------------------------------------------
1 | """Fuzzy Wuzzy Token Sort Similarity Measure"""
2 |
3 | from py_stringmatching import utils
4 |
5 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \
6 | SequenceSimilarityMeasure
7 | from py_stringmatching.similarity_measure.ratio import Ratio
8 |
9 |
10 | class TokenSort(SequenceSimilarityMeasure):
11 | """Computes Fuzzy Wuzzy token sort similarity measure.
12 |
13 | Fuzzy Wuzzy token sort ratio raw score is a measure of the strings' similarity as an
14 | int in the range [0, 100]. For two strings X and Y, the score is obtained by
15 | splitting the two strings into tokens and then sorting the tokens. The score is
16 | then the fuzzy wuzzy ratio raw score of the transformed strings. Fuzzy Wuzzy token
17 | sort sim score is a float in the range [0, 1] and is obtained by dividing the raw score
18 | by 100.
19 |
20 | Note:
21 | In the case where either of strings X or Y is empty, we define the
22 | Fuzzy Wuzzy token sort similarity score to be 0.
23 | """
24 | def __init__(self):
25 | pass
26 |
27 | def _process_string_and_sort(self, s, force_ascii, full_process=True):
28 | """Returns a string with tokens sorted. Processes the string if the
29 | full_process flag is enabled. If the force_ascii flag is enabled, then
30 | processing removes non-ascii characters from the string."""
31 | # pull tokens
32 | ts = utils.process_string(s, force_ascii=force_ascii) if full_process else s
33 | tokens = ts.split()
34 |
35 | # sort tokens and join
36 | sorted_string = u" ".join(sorted(tokens))
37 | return sorted_string.strip()
38 |
39 | def get_raw_score(self, string1, string2, force_ascii=True, full_process=True):
40 | """
41 | Computes the Fuzzy Wuzzy token sort measure raw score between two strings.
42 | This score is in the range [0,100].
43 |
44 | Args:
45 | string1,string2 (str) : Input strings
46 | force_ascii (boolean) : Flag to remove non-ascii characters or not
47 | full_process (boolean) : Flag to process the string or not. Processing includes
48 | removing non-alphanumeric characters, converting the string to lower case and
49 | removing leading and trailing whitespaces.
--------------------------------------------------------------------------------
/py_stringmatching/similarity_measure/token_similarity_measure.py:
--------------------------------------------------------------------------------
1 | """Token based similarity measure"""
2 | 
3 | from py_stringmatching.similarity_measure.similarity_measure import \
4 |     SimilarityMeasure
5 | 
6 | class TokenSimilarityMeasure(SimilarityMeasure):
7 |     pass
8 | 
--------------------------------------------------------------------------------
/py_stringmatching/similarity_measure/token_sort.py:
--------------------------------------------------------------------------------
1 | """Fuzzy Wuzzy Token Sort Similarity Measure"""
2 | 
3 | from py_stringmatching import utils
4 | 
5 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \
6 |     SequenceSimilarityMeasure
7 | from py_stringmatching.similarity_measure.ratio import Ratio
8 | 
9 | 
10 | class TokenSort(SequenceSimilarityMeasure):
11 |     """Computes Fuzzy Wuzzy token sort similarity measure.
12 | 
13 |     The Fuzzy Wuzzy token sort ratio raw score is a measure of the strings' similarity as an
14 |     int in the range [0, 100]. For two strings X and Y, the score is obtained by
15 |     splitting the two strings into tokens and then sorting the tokens. The score is
16 |     then the Fuzzy Wuzzy ratio raw score of the transformed strings. The Fuzzy Wuzzy token
17 |     sort similarity score is a float in the range [0, 1], obtained by dividing the raw score
18 |     by 100.
19 | 
20 |     Note:
21 |         In the case where either of strings X or Y is empty, we define the
22 |         Fuzzy Wuzzy token sort similarity score to be 0.
23 |     """
24 |     def __init__(self):
25 |         pass
26 | 
27 |     def _process_string_and_sort(self, s, force_ascii, full_process=True):
28 |         """Returns a string with tokens sorted. Processes the string if the
29 |         full_process flag is enabled. If the force_ascii flag is enabled, then
30 |         processing removes non-ascii characters from the string."""
31 |         # pull tokens
32 |         ts = utils.process_string(s, force_ascii=force_ascii) if full_process else s
33 |         tokens = ts.split()
34 | 
35 |         # sort tokens and join
36 |         sorted_string = u" ".join(sorted(tokens))
37 |         return sorted_string.strip()
38 | 
39 |     def get_raw_score(self, string1, string2, force_ascii=True, full_process=True):
40 |         """
41 |         Computes the Fuzzy Wuzzy token sort measure raw score between two strings.
42 |         This score is in the range [0,100].
43 | 
44 |         Args:
45 |             string1,string2 (str): Input strings
46 |             force_ascii (boolean): Flag to remove non-ascii characters or not
47 |             full_process (boolean): Flag to process the string or not. Processing includes
48 |                 removing non-alphanumeric characters, converting string to lower case and
49 |                 removing leading and trailing whitespaces.
50 | 
51 |         Returns:
52 |             Token Sort measure raw score (int) is returned
53 | 
54 |         Raises:
55 |             TypeError: If the inputs are not strings
56 | 
57 |         Examples:
58 |             >>> s = TokenSort()
59 |             >>> s.get_raw_score('great is scala', 'java is great')
60 |             81
61 |             >>> s.get_raw_score('Sue', 'sue')
62 |             100
63 |             >>> s.get_raw_score('C++ and Java', 'Java and Python')
64 |             64
65 | 
66 |         References:
67 |             * https://pypi.python.org/pypi/fuzzywuzzy
68 |         """
69 |         # input validations
70 |         utils.sim_check_for_none(string1, string2)
71 |         utils.sim_check_for_string_inputs(string1, string2)
72 | 
73 |         # if one of the strings is empty return 0
74 |         if utils.sim_check_for_empty(string1, string2):
75 |             return 0
76 | 
77 |         sorted1 = self._process_string_and_sort(string1, force_ascii, full_process=full_process)
78 |         sorted2 = self._process_string_and_sort(string2, force_ascii, full_process=full_process)
79 |         ratio = Ratio()
80 |         return ratio.get_raw_score(sorted1, sorted2)
81 | 
82 |     def get_sim_score(self, string1, string2, force_ascii=True, full_process=True):
83 |         """
84 |         Computes the Fuzzy Wuzzy token sort similarity score between two strings.
85 |         This score is in the range [0,1].
86 | 
87 |         Args:
88 |             string1,string2 (str): Input strings
89 |             force_ascii (boolean): Flag to remove non-ascii characters or not
90 |             full_process (boolean): Flag to process the string or not. Processing includes
91 |                 removing non-alphanumeric characters, converting string to lower case and
92 |                 removing leading and trailing whitespaces.
93 | 
94 |         Returns:
95 |             Token Sort measure similarity score (float) is returned
96 | 
97 |         Raises:
98 |             TypeError: If the inputs are not strings
99 | 
100 |         Examples:
101 |             >>> s = TokenSort()
102 |             >>> s.get_sim_score('great is scala', 'java is great')
103 |             0.81
104 |             >>> s.get_sim_score('Sue', 'sue')
105 |             1.0
106 |             >>> s.get_sim_score('C++ and Java', 'Java and Python')
107 |             0.64
108 | 
109 |         References:
110 |             * https://pypi.python.org/pypi/fuzzywuzzy
111 |         """
112 |         raw_score = 1.0 * self.get_raw_score(string1, string2, force_ascii, full_process)
113 |         sim_score = raw_score / 100
114 |         return sim_score
115 | 
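As the docstring above describes, TokenSort normalizes, tokenizes, and alphabetically re-joins both strings, then scores them with the plain Ratio measure. A minimal sketch, not part of the repository, checking that equivalence on the docstring example:

from py_stringmatching.similarity_measure.ratio import Ratio
from py_stringmatching.similarity_measure.token_sort import TokenSort

s1, s2 = 'great is scala', 'java is great'
sorted1 = ' '.join(sorted(s1.split()))  # 'great is scala'
sorted2 = ' '.join(sorted(s2.split()))  # 'great is java'

# Both sides evaluate to 81, the value shown in the docstring example.
assert TokenSort().get_raw_score(s1, s2) == Ratio().get_raw_score(sorted1, sorted2)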
--------------------------------------------------------------------------------
/py_stringmatching/similarity_measure/tversky_index.py:
--------------------------------------------------------------------------------
1 | """Tversky index similarity measure"""
2 | 
3 | from py_stringmatching import utils
4 | from py_stringmatching.similarity_measure.token_similarity_measure import \
5 |     TokenSimilarityMeasure
6 | 
7 | 
8 | class TverskyIndex(TokenSimilarityMeasure):
9 |     """Tversky index similarity measure class.
10 | 
11 |     Parameters:
12 |         alpha, beta (float): Tversky index parameters (defaults to 0.5).
13 |     """
14 |     def __init__(self, alpha=0.5, beta=0.5):
15 |         # validate alpha and beta
16 |         utils.sim_check_tversky_parameters(alpha, beta)
17 | 
18 |         self.alpha = alpha
19 |         self.beta = beta
20 |         super(TverskyIndex, self).__init__()
21 | 
22 |     def get_raw_score(self, set1, set2):
23 |         """
24 |         Computes the Tversky index similarity between two sets.
25 | 
26 |         The Tversky index is an asymmetric similarity measure on sets that compares a variant to a prototype. The
27 |         Tversky index can be seen as a generalization of Dice's coefficient and the Tanimoto coefficient.
28 | 
29 |         For sets X and Y, the Tversky index is a number between 0 and 1 given by
30 |         :math:`tversky\\_index(X, Y) = \\frac{|X \\cap Y|}{|X \\cap Y| + \\alpha |X-Y| + \\beta |Y-X|}`
31 |         where :math:`\\alpha, \\beta \\geq 0`.
32 | 
33 |         Args:
34 |             set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.
35 | 
36 |         Returns:
37 |             Tversky index similarity (float)
38 | 
39 |         Raises:
40 |             TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
41 | 
42 |         Examples:
43 |             >>> tvi = TverskyIndex()
44 |             >>> tvi.get_raw_score(['data', 'science'], ['data'])
45 |             0.6666666666666666
46 |             >>> tvi.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
47 |             0.5
48 |             >>> tvi.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
49 |             0.5454545454545454
50 |             >>> tvi = TverskyIndex(0.5, 0.5)
51 |             >>> tvi.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
52 |             0.5454545454545454
53 |             >>> tvi = TverskyIndex(beta=0.5)
54 |             >>> tvi.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
55 |             0.5
56 |         """
57 |         # input validations
58 |         utils.sim_check_for_none(set1, set2)
59 |         utils.sim_check_for_list_or_set_inputs(set1, set2)
60 | 
61 |         # if exact match return 1.0
62 |         if utils.sim_check_for_exact_match(set1, set2):
63 |             return 1.0
64 | 
65 |         # if one of the inputs is empty return 0
66 |         if utils.sim_check_for_empty(set1, set2):
67 |             return 0
68 | 
69 |         if not isinstance(set1, set):
70 |             set1 = set(set1)
71 |         if not isinstance(set2, set):
72 |             set2 = set(set2)
73 |         intersection = float(len(set1 & set2))
74 | 
75 |         return 1.0 * intersection / (intersection +
76 |             (self.alpha * len(set1 - set2)) + (self.beta * len(set2 - set1)))
77 | 
78 |     def get_sim_score(self, set1, set2):
79 |         """
80 |         Computes the normalized Tversky index similarity between two sets.
81 | 
82 |         Args:
83 |             set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.
84 | 
85 |         Returns:
86 |             Normalized Tversky index similarity (float)
87 | 
88 |         Raises:
89 |             TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
90 | 
91 |         Examples:
92 |             >>> tvi = TverskyIndex()
93 |             >>> tvi.get_sim_score(['data', 'science'], ['data'])
94 |             0.6666666666666666
95 |             >>> tvi.get_sim_score(['data', 'management'], ['data', 'data', 'science'])
96 |             0.5
97 |             >>> tvi.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
98 |             0.5454545454545454
99 |             >>> tvi = TverskyIndex(0.5, 0.5)
100 |             >>> tvi.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
101 |             0.5454545454545454
102 |             >>> tvi = TverskyIndex(beta=0.5)
103 |             >>> tvi.get_sim_score(['data', 'management'], ['data', 'data', 'science'])
104 |             0.5
105 | 
106 |         """
107 |         return self.get_raw_score(set1, set2)
108 | 
109 |     def get_alpha(self):
110 |         """
111 |         Get alpha
112 | 
113 |         Returns:
114 |             alpha (float)
115 |         """
116 |         return self.alpha
117 | 
118 |     def get_beta(self):
119 |         """
120 |         Get beta
121 | 
122 |         Returns:
123 |             beta (float)
124 |         """
125 |         return self.beta
126 | 
127 |     def set_alpha(self, alpha):
128 |         """
129 |         Set alpha
130 | 
131 |         Args:
132 |             alpha (float): Tversky index parameter
133 |         """
134 |         self.alpha = alpha
135 |         return True
136 | 
137 |     def set_beta(self, beta):
138 |         """
139 |         Set beta
140 | 
141 |         Args:
142 |             beta (float): Tversky index parameter
143 |         """
144 |         self.beta = beta
145 |         return True
146 | 
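As the formula in get_raw_score implies, the Tversky index reduces to the Dice coefficient for alpha = beta = 0.5 and to the Jaccard coefficient for alpha = beta = 1. A quick check, not part of the repository:

from py_stringmatching.similarity_measure.tversky_index import TverskyIndex

X, Y = {'data', 'science'}, {'data'}
print(TverskyIndex().get_raw_score(X, Y))          # 0.666... = Dice: 2|X & Y| / (|X| + |Y|)
print(TverskyIndex(1.0, 1.0).get_raw_score(X, Y))  # 0.5      = Jaccard: |X & Y| / |X | Y|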
--------------------------------------------------------------------------------
/py_stringmatching/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhaidgroup/py_stringmatching/658860f0cad65e1e3da07e7039fe9764d4822272/py_stringmatching/tests/__init__.py
--------------------------------------------------------------------------------
/py_stringmatching/tests/test_sim_Soundex.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
3 | import unittest
4 | 
5 | from py_stringmatching.similarity_measure.soundex import Soundex
6 | 
7 | from .utils import raises
8 | 
9 | 
10 | class SoundexTestCases(unittest.TestCase):
11 |     def setUp(self):
12 |         self.sdx = Soundex()
13 | 
14 |     def test_valid_input_raw_score(self):
15 |         self.assertEqual(self.sdx.get_raw_score('Robert', 'Rupert'), 1)
16 |         self.assertEqual(self.sdx.get_raw_score('Sue', 'S'), 1)
17 |         self.assertEqual(self.sdx.get_raw_score('robert', 'rupert'), 1)
18 |         self.assertEqual(self.sdx.get_raw_score('Gough', 'goff'), 0)
19 |         self.assertEqual(self.sdx.get_raw_score('gough', 'Goff'), 0)
20 |         self.assertEqual(self.sdx.get_raw_score('ali', 'a,,,li'), 1)
21 |         self.assertEqual(self.sdx.get_raw_score('Jawornicki', 'Yavornitzky'), 0)
22 |         self.assertEqual(self.sdx.get_raw_score('Robert', 'Robert'), 1)
23 |         self.assertEqual(self.sdx.get_raw_score('Ris..h.ab', 'Ris;hab.'), 1)
24 |         self.assertEqual(self.sdx.get_raw_score('gough', 'G2'), 1)
25 |         self.assertEqual(self.sdx.get_raw_score('robert', 'R1:6:3'), 1)
26 | 
27 |     def test_valid_input_sim_score(self):
28 |         self.assertEqual(self.sdx.get_sim_score('Robert', 'Rupert'), 1)
29 |         self.assertEqual(self.sdx.get_sim_score('Sue', 'S'), 1)
30 |         self.assertEqual(self.sdx.get_sim_score('robert', 'rupert'), 1)
31 |         self.assertEqual(self.sdx.get_sim_score('Gough', 'goff'), 0)
32 |         self.assertEqual(self.sdx.get_sim_score('gough', 'Goff'), 0)
33 |         self.assertEqual(self.sdx.get_sim_score('ali', 'a,,,li'), 1)
34 |         self.assertEqual(self.sdx.get_sim_score('Jawornicki', 'Yavornitzky'), 0)
35 |         self.assertEqual(self.sdx.get_sim_score('Robert', 'Robert'), 1)
36 |         self.assertEqual(self.sdx.get_sim_score('Ris..h.ab', 'Ris;hab.'), 1)
37 |         self.assertEqual(self.sdx.get_sim_score('Gough', 'G2'), 1)
38 |         self.assertEqual(self.sdx.get_sim_score('gough', 'G2'), 1)
39 |         self.assertEqual(self.sdx.get_sim_score('robert', 'R1:6:3'), 1)
40 | 
41 |     @raises(TypeError)
42 |     def test_invalid_input1_raw_score(self):
43 |         self.sdx.get_raw_score('a', None)
44 | 
45 |     @raises(TypeError)
46 |     def test_invalid_input2_raw_score(self):
47 |         self.sdx.get_raw_score(None, 'b')
48 | 
49 |     @raises(TypeError)
50 |     def test_invalid_input3_raw_score(self):
51 |         self.sdx.get_raw_score(None, None)
52 | 
53 |     @raises(ValueError)
54 |     def test_invalid_input4_raw_score(self):
55 |         self.sdx.get_raw_score('a', '')
56 | 
57 |     @raises(ValueError)
58 |     def test_invalid_input5_raw_score(self):
59 |         self.sdx.get_raw_score('', 'This is a long string')
60 | 
61 |     @raises(TypeError)
62 |     def test_invalid_input7_raw_score(self):
63 |         self.sdx.get_raw_score('xyz', [''])
64 | 
65 |     @raises(TypeError)
66 |     def test_invalid_input1_sim_score(self):
67 |         self.sdx.get_sim_score('a', None)
68 | 
69 |     @raises(TypeError)
70 |     def test_invalid_input2_sim_score(self):
71 |         self.sdx.get_sim_score(None, 'b')
72 | 
73 |     @raises(TypeError)
74 |     def test_invalid_input3_sim_score(self):
75 |         self.sdx.get_sim_score(None, None)
76 | 
77 |     @raises(ValueError)
78 |     def test_invalid_input4_sim_score(self):
79 |         self.sdx.get_sim_score('a', '')
80 | 
81 |     @raises(ValueError)
82 |     def test_invalid_input5_sim_score(self):
83 |         self.sdx.get_sim_score('', 'This is a long string')
84 | 
85 |     @raises(TypeError)
86 |     def test_invalid_input7_sim_score(self):
87 |         self.sdx.get_sim_score('xyz', [''])
88 | 
89 |     @raises(ValueError)
90 |     def test_invalid_input8_sim_score(self):
91 |         self.sdx.get_sim_score('..,', '..abc.')
92 | 
93 |     @raises(ValueError)
94 |     def test_invalid_input9_sim_score(self):
95 |         self.sdx.get_sim_score('..', '')
96 | 
97 |     @raises(ValueError)
98 |     def test_invalid_input10_sim_score(self):
99 |         self.sdx.get_sim_score('.', '..abc,,')
100 | 
101 |     @raises(TypeError)
102 |     def test_invalid_input11_sim_score(self):
103 |         self.sdx.get_sim_score('abc', 123)
104 | 
--------------------------------------------------------------------------------
/py_stringmatching/tests/utils.py:
--------------------------------------------------------------------------------
1 | # Simplified knockoff of nose.tools.raises
2 | def raises(exc_type):
3 |     def deco(f):
4 |         def raises_wrapper(self):
5 |             with self.assertRaises(exc_type):
6 |                 return f(self)
7 |         return raises_wrapper
8 |     return deco
9 | 
--------------------------------------------------------------------------------
/py_stringmatching/tokenizer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhaidgroup/py_stringmatching/658860f0cad65e1e3da07e7039fe9764d4822272/py_stringmatching/tokenizer/__init__.py
--------------------------------------------------------------------------------
/py_stringmatching/tokenizer/alphabetic_tokenizer.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | from py_stringmatching import utils
4 | from py_stringmatching.tokenizer.definition_tokenizer import DefinitionTokenizer
5 | 
6 | 
7 | class AlphabeticTokenizer(DefinitionTokenizer):
8 |     """Returns tokens that are maximal sequences of consecutive alphabetical characters.
9 | 
10 |     Args:
11 |         return_set (boolean): A flag to indicate whether to return a set of tokens instead of a bag of tokens (defaults to False).
12 | 
13 |     Attributes:
14 |         return_set (boolean): An attribute that stores the value for the flag return_set.
15 |     """
16 | 
17 |     def __init__(self, return_set=False):
18 |         self.__al_regex = re.compile('[a-zA-Z]+')
19 |         super(AlphabeticTokenizer, self).__init__(return_set)
20 | 
21 |     def tokenize(self, input_string):
22 |         """Tokenizes input string into alphabetical tokens.
23 | 
24 |         Args:
25 |             input_string (str): The string to be tokenized.
26 | 
27 |         Returns:
28 |             A Python list, which represents a set of tokens if the flag return_set is True, and a bag of tokens otherwise.
29 | 
30 |         Raises:
31 |             TypeError : If the input is not a string.
32 | 
33 |         Examples:
34 |             >>> al_tok = AlphabeticTokenizer()
35 |             >>> al_tok.tokenize('data99science, data#integration.')
36 |             ['data', 'science', 'data', 'integration']
37 |             >>> al_tok.tokenize('99')
38 |             []
39 |             >>> al_tok = AlphabeticTokenizer(return_set=True)
40 |             >>> al_tok.tokenize('data99science, data#integration.')
41 |             ['data', 'science', 'integration']
42 |         """
43 |         utils.tok_check_for_none(input_string)
44 |         utils.tok_check_for_string_input(input_string)
45 | 
46 |         token_list = list(filter(None, self.__al_regex.findall(input_string)))
47 | 
48 |         if self.return_set:
49 |             return utils.convert_bag_to_set(token_list)
50 | 
51 |         return token_list
52 | 
--------------------------------------------------------------------------------
/py_stringmatching/tokenizer/alphanumeric_tokenizer.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | from py_stringmatching import utils
4 | from py_stringmatching.tokenizer.definition_tokenizer import DefinitionTokenizer
5 | 
6 | 
7 | class AlphanumericTokenizer(DefinitionTokenizer):
8 |     """Returns tokens that are maximal sequences of consecutive alphanumeric characters.
9 | 
10 |     Args:
11 |         return_set (boolean): A flag to indicate whether to return a set of
12 |             tokens instead of a bag of tokens (defaults to False).
13 | 
14 |     Attributes:
15 |         return_set (boolean): An attribute to store the value of the flag return_set.
16 |     """
17 | 
18 |     def __init__(self, return_set=False):
19 |         self.__alnum_regex = re.compile('[a-zA-Z0-9]+')
20 |         super(AlphanumericTokenizer, self).__init__(return_set)
21 | 
22 |     def tokenize(self, input_string):
23 |         """Tokenizes input string into alphanumeric tokens.
24 | 
25 |         Args:
26 |             input_string (str): The string to be tokenized.
27 | 
28 |         Returns:
29 |             A Python list, which represents a set of tokens if the flag return_set is True, and a bag of tokens otherwise.
30 | 
31 |         Raises:
32 |             TypeError : If the input is not a string.
33 | 
34 |         Examples:
35 |             >>> alnum_tok = AlphanumericTokenizer()
36 |             >>> alnum_tok.tokenize('data9,(science), data9#.(integration).88')
37 |             ['data9', 'science', 'data9', 'integration', '88']
38 |             >>> alnum_tok.tokenize('#.&')
39 |             []
40 |             >>> alnum_tok = AlphanumericTokenizer(return_set=True)
41 |             >>> alnum_tok.tokenize('data9,(science), data9#.(integration).88')
42 |             ['data9', 'science', 'integration', '88']
43 | 
44 |         """
45 |         utils.tok_check_for_none(input_string)
46 |         utils.tok_check_for_string_input(input_string)
47 | 
48 |         token_list = list(filter(None,
49 |                                  self.__alnum_regex.findall(input_string)))
50 | 
51 |         if self.return_set:
52 |             return utils.convert_bag_to_set(token_list)
53 | 
54 |         return token_list
55 | 
--------------------------------------------------------------------------------
/py_stringmatching/tokenizer/definition_tokenizer.py:
--------------------------------------------------------------------------------
1 | from py_stringmatching.tokenizer.tokenizer import Tokenizer
2 | 
3 | 
4 | class DefinitionTokenizer(Tokenizer):
5 |     """A class of tokenizers that uses a definition to find tokens, as opposed to using delimiters.
6 | 
7 |     Examples of definitions include alphabetical tokens and qgram tokens. Examples of delimiters include white space and punctuation.
8 | 
9 |     Args:
10 |         return_set (boolean): A flag to indicate whether to return a set of
11 |             tokens instead of a bag of tokens (defaults to False).
12 | 
13 |     Attributes:
14 |         return_set (boolean): An attribute to store the flag return_set.
15 |     """
16 | 
17 |     def __init__(self, return_set=False):
18 |         super(DefinitionTokenizer, self).__init__(return_set)
19 | 
--------------------------------------------------------------------------------
/py_stringmatching/tokenizer/delimiter_tokenizer.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | from py_stringmatching import utils
4 | from py_stringmatching.tokenizer.tokenizer import Tokenizer
5 | 
6 | 
7 | class DelimiterTokenizer(Tokenizer):
8 |     """Uses delimiters to find tokens, as opposed to using definitions.
9 | 
10 |     Examples of delimiters include white space and punctuation. Examples of definitions include alphabetical and qgram tokens.
11 | 
12 |     Args:
13 |         delim_set (set): A set of delimiter strings (defaults to space delimiter).
14 |         return_set (boolean): A flag to indicate whether to return a set of
15 |             tokens instead of a bag of tokens (defaults to False).
16 | 
17 |     Attributes:
18 |         return_set (boolean): An attribute to store the value of the flag return_set.
19 |     """
20 | 
21 |     def __init__(self,
22 |                  delim_set=set([' ']), return_set=False):
23 |         self.__delim_set = None
24 |         self.__use_split = None
25 |         self.__delim_str = None
26 |         self.__delim_regex = None
27 |         self._update_delim_set(delim_set)
28 |         super(DelimiterTokenizer, self).__init__(return_set)
29 | 
30 |     def tokenize(self, input_string):
31 |         """Tokenizes input string based on the set of delimiters.
32 | 
33 |         Args:
34 |             input_string (str): The string to be tokenized.
35 | 
36 |         Returns:
37 |             A Python list which is a set or a bag of tokens, depending on whether the return_set flag is set to True or False.
38 | 
39 |         Raises:
40 |             TypeError : If the input is not a string.
41 | 
42 |         Examples:
43 |             >>> delim_tok = DelimiterTokenizer()
44 |             >>> delim_tok.tokenize('data science')
45 |             ['data', 'science']
46 |             >>> delim_tok = DelimiterTokenizer(['$#$'])
47 |             >>> delim_tok.tokenize('data$#$science')
48 |             ['data', 'science']
49 |             >>> delim_tok = DelimiterTokenizer([',', '.'])
50 |             >>> delim_tok.tokenize('data,science.data,integration.')
51 |             ['data', 'science', 'data', 'integration']
52 |             >>> delim_tok = DelimiterTokenizer([',', '.'], return_set=True)
53 |             >>> delim_tok.tokenize('data,science.data,integration.')
54 |             ['data', 'science', 'integration']
55 | 
56 |         """
57 |         utils.tok_check_for_none(input_string)
58 |         utils.tok_check_for_string_input(input_string)
59 | 
60 |         if self.__use_split:
61 |             token_list = list(filter(None,
62 |                                      input_string.split(self.__delim_str)))
63 |         else:
64 |             token_list = list(filter(None,
65 |                                      self.__delim_regex.split(input_string)))
66 | 
67 |         if self.return_set:
68 |             return utils.convert_bag_to_set(token_list)
69 | 
70 |         return token_list
71 | 
72 |     def get_delim_set(self):
73 |         """Gets the current set of delimiters.
74 | 
75 |         Returns:
76 |             A Python set which is the current set of delimiters.
77 |         """
78 |         return self.__delim_set
79 | 
80 |     def set_delim_set(self, delim_set):
81 |         """Sets the current set of delimiters.
82 | 
83 |         Args:
84 |             delim_set (set): A set of delimiter strings.
85 |         """
86 |         return self._update_delim_set(delim_set)
87 | 
88 |     def _update_delim_set(self, delim_set):
89 |         if not isinstance(delim_set, set):
90 |             delim_set = set(delim_set)
91 |         self.__delim_set = delim_set
92 |         # if there is only one delimiter string, use split instead of regex
93 |         self.__use_split = False
94 |         if len(self.__delim_set) == 1:
95 |             self.__delim_str = list(self.__delim_set)[0]
96 |             self.__use_split = True
97 |         else:
98 |             self.__delim_regex = re.compile('|'.join(
99 |                 map(re.escape, self.__delim_set)))
100 |         return True
101 | 
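The dispatch in _update_delim_set above is worth seeing in isolation: a single delimiter is handled with str.split, while several delimiters are escaped and OR-ed into one regular expression. A minimal sketch, not part of the repository:

import re

delim_set = {',', '.'}
delim_regex = re.compile('|'.join(map(re.escape, delim_set)))
print(list(filter(None, delim_regex.split('data,science.data,integration.'))))
# ['data', 'science', 'data', 'integration']

# The single-delimiter fast path is just str.split.
print(list(filter(None, 'data$#$science'.split('$#$'))))  # ['data', 'science']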
--------------------------------------------------------------------------------
/py_stringmatching/tokenizer/tokenizer.py:
--------------------------------------------------------------------------------
1 | class Tokenizer(object):
2 |     """The root class for tokenizers.
3 | 
4 |     Args:
5 |         return_set (boolean): A flag to indicate whether to return a set of
6 |             tokens instead of a bag of tokens (defaults to False).
7 | 
8 |     Attributes:
9 |         return_set (boolean): An attribute to store the flag return_set.
10 |     """
11 | 
12 |     def __init__(self, return_set=False):
13 |         self.return_set = return_set
14 | 
15 |     def get_return_set(self):
16 |         """Gets the value of the return_set flag.
17 | 
18 |         Returns:
19 |             The boolean value of the return_set flag.
20 |         """
21 |         return self.return_set
22 | 
23 |     def set_return_set(self, return_set):
24 |         """Sets the value of the return_set flag.
25 | 
26 |         Args:
27 |             return_set (boolean): a flag to indicate whether to return a set of tokens instead of a bag of tokens.
28 |         """
29 |         self.return_set = return_set
30 |         return True
31 | 
--------------------------------------------------------------------------------
/py_stringmatching/tokenizer/whitespace_tokenizer.py:
--------------------------------------------------------------------------------
1 | from py_stringmatching import utils
2 | from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer
3 | 
4 | 
5 | class WhitespaceTokenizer(DelimiterTokenizer):
6 |     """Segments the input string using whitespace, then returns the segments as tokens.
7 | 
8 |     This tokenizer currently uses Python's split function, so 'whitespace'
9 |     refers to the space character as well as the tab and newline characters.
10 | 
11 |     Args:
12 |         return_set (boolean): A flag to indicate whether to return a set of
13 |             tokens instead of a bag of tokens (defaults to False).
14 | 
15 |     Attributes:
16 |         return_set (boolean): An attribute to store the flag return_set.
17 |     """
18 | 
19 |     def __init__(self, return_set=False):
20 |         super(WhitespaceTokenizer, self).__init__([' ', '\t', '\n'], return_set)
21 | 
22 |     def tokenize(self, input_string):
23 |         """Tokenizes input string based on white space.
24 | 
25 |         Args:
26 |             input_string (str): The string to be tokenized.
27 | 
28 |         Returns:
29 |             A Python list, which is a set or a bag of tokens, depending on whether return_set is True or False.
30 | 
31 |         Raises:
32 |             TypeError : If the input is not a string.
33 | 
34 |         Examples:
35 |             >>> ws_tok = WhitespaceTokenizer()
36 |             >>> ws_tok.tokenize('data science')
37 |             ['data', 'science']
38 |             >>> ws_tok.tokenize('data        science')
39 |             ['data', 'science']
40 |             >>> ws_tok.tokenize('data\tscience')
41 |             ['data', 'science']
42 |             >>> ws_tok = WhitespaceTokenizer(return_set=True)
43 |             >>> ws_tok.tokenize('data science data integration')
44 |             ['data', 'science', 'integration']
45 |         """
46 |         utils.tok_check_for_none(input_string)
47 |         utils.tok_check_for_string_input(input_string)
48 | 
49 |         token_list = list(filter(None, input_string.split()))
50 | 
51 |         if self.return_set:
52 |             return utils.convert_bag_to_set(token_list)
53 | 
54 |         return token_list
55 | 
56 |     def set_delim_set(self, delim_set):
57 |         raise AttributeError('Delimiters cannot be set for WhitespaceTokenizer')
58 | 
--------------------------------------------------------------------------------
/py_stringmatching/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | """
4 | This module defines a list of utility and validation functions.
5 | """ 6 | 7 | 8 | def sim_check_for_none(*args): 9 | if len(args) > 0 and args[0] is None: 10 | raise TypeError("First argument cannot be None") 11 | if len(args) > 1 and args[1] is None: 12 | raise TypeError("Second argument cannot be None") 13 | 14 | 15 | def sim_check_for_empty(*args): 16 | if len(args[0]) == 0 or len(args[1]) == 0: 17 | return True 18 | 19 | 20 | def sim_check_for_same_len(*args): 21 | if len(args[0]) != len(args[1]): 22 | raise ValueError("Undefined for sequences of unequal length") 23 | 24 | 25 | def sim_check_for_string_inputs(*args): 26 | if not isinstance(args[0], str): 27 | raise TypeError('First argument is expected to be a string') 28 | if not isinstance(args[1], str): 29 | raise TypeError('Second argument is expected to be a string') 30 | 31 | 32 | def sim_check_for_list_or_set_inputs(*args): 33 | if not isinstance(args[0], list): 34 | if not isinstance(args[0], set): 35 | raise TypeError('First argument is expected to be a python list or set') 36 | if not isinstance(args[1], list): 37 | if not isinstance(args[1], set): 38 | raise TypeError('Second argument is expected to be a python list or set') 39 | 40 | 41 | def sim_check_tversky_parameters(alpha, beta): 42 | if alpha < 0 or beta < 0: 43 | raise ValueError('Tversky parameters should be greater than or equal to zero') 44 | 45 | 46 | def sim_check_for_exact_match(*args): 47 | if args[0] == args[1]: 48 | return True 49 | 50 | 51 | def sim_check_for_zero_len(*args): 52 | if len(args[0].strip()) == 0 or len(args[1].strip()) == 0: 53 | raise ValueError("Undefined for string of zero length") 54 | 55 | 56 | def tok_check_for_string_input(*args): 57 | for i in range(len(args)): 58 | if not isinstance(args[i], str): 59 | raise TypeError('Input is expected to be a string') 60 | 61 | 62 | def tok_check_for_none(*args): 63 | if args[0] is None: 64 | raise TypeError("First argument cannot be None") 65 | 66 | 67 | def convert_bag_to_set(input_list): 68 | seen_tokens = {} 69 | output_set =[] 70 | for token in input_list: 71 | if seen_tokens.get(token) == None: 72 | output_set.append(token) 73 | seen_tokens[token] = True 74 | return output_set 75 | 76 | 77 | def convert_to_unicode(input_string): 78 | """Convert input string to unicode.""" 79 | if isinstance(input_string, bytes): 80 | return input_string.decode('utf-8') 81 | return input_string 82 | 83 | 84 | def remove_non_ascii_chars(input_string): 85 | remove_chars = str("").join([chr(i) for i in range(128, 256)]) 86 | translation_table = dict((ord(c), None) for c in remove_chars) 87 | return input_string.translate(translation_table) 88 | 89 | 90 | def process_string(input_string, force_ascii=False): 91 | """Process string by 92 | -- removing all but letters and numbers 93 | -- trim whitespace 94 | -- converting string to lower case 95 | if force_ascii == True, force convert to ascii""" 96 | 97 | if force_ascii: 98 | input_string = remove_non_ascii_chars(input_string) 99 | 100 | regex = re.compile(r"(?ui)\W") 101 | 102 | # Keep only Letters and Numbers. 103 | out_string = regex.sub(" ", input_string) 104 | 105 | # Convert String to lowercase. 106 | out_string = out_string.lower() 107 | 108 | # Remove leading and trailing whitespaces. 109 | out_string = out_string.strip() 110 | return out_string 111 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import os 4 | 5 | # check if pip is installed. 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import sys
3 | import os
4 | 
5 | # check if pip is installed. If not, install_and_import will raise an ImportError.
6 | PIP_INSTALLED = True
7 | 
8 | try:
9 |     import pip
10 | except ImportError:
11 |     PIP_INSTALLED = False
12 | 
13 | def install_and_import(package):
14 |     import importlib
15 |     try:
16 |         importlib.import_module(package)
17 |     except ImportError:
18 |         if not PIP_INSTALLED:
19 |             raise ImportError('pip is not installed.')
20 |         pip.main(['install', package])
21 |     finally:
22 |         globals()[package] = importlib.import_module(package)
23 | 
24 | # check if setuptools is installed. If not, install setuptools
25 | # automatically using pip.
26 | install_and_import('setuptools')
27 | 
28 | from setuptools.command.build_ext import build_ext as _build_ext
29 | 
30 | class build_ext(_build_ext):
31 |     def build_extensions(self):
32 |         import pkg_resources
33 |         numpy_incl = pkg_resources.resource_filename('numpy', 'core/include')
34 | 
35 |         for ext in self.extensions:
36 |             if (hasattr(ext, 'include_dirs') and
37 |                     numpy_incl not in ext.include_dirs):
38 |                 ext.include_dirs.append(numpy_incl)
39 |         _build_ext.build_extensions(self)
40 | 
41 | def generate_cython():
42 | 
43 |     from Cython.Build import cythonize
44 | 
45 |     module_list = ["py_stringmatching/similarity_measure/cython/cython_affine.pyx",
46 |                    "py_stringmatching/similarity_measure/cython/cython_jaro.pyx",
47 |                    "py_stringmatching/similarity_measure/cython/cython_jaro_winkler.pyx",
48 |                    "py_stringmatching/similarity_measure/cython/cython_levenshtein.pyx",
49 |                    "py_stringmatching/similarity_measure/cython/cython_needleman_wunsch.pyx",
50 |                    "py_stringmatching/similarity_measure/cython/cython_smith_waterman.pyx",
51 |                    "py_stringmatching/similarity_measure/cython/cython_utils.pyx"
52 |                    ]
53 |     p = cythonize(module_list)
54 | 
55 |     if not p:
56 |         raise RuntimeError("Running cythonize failed!")
57 | 
58 | 
59 | cmdclass = {"build_ext": build_ext}
60 | 
61 | 
62 | if __name__ == "__main__":
63 | 
64 |     no_frills = (len(sys.argv) >= 2 and ('--help' in sys.argv[1:] or
65 |                                          sys.argv[1] in ('--help-commands',
66 |                                                          'egg_info', '--version',
67 |                                                          'clean')))
68 | 
69 |     cwd = os.path.abspath(os.path.dirname(__file__))
70 |     if not os.path.exists(os.path.join(cwd, 'PKG-INFO')) and not no_frills:
71 |         # Generate Cython sources, unless building from source release
72 |         generate_cython()
73 | 
74 |     # specify extensions that need to be compiled
75 |     extensions = [setuptools.Extension("py_stringmatching.similarity_measure.cython.cython_levenshtein",
76 |                                        ["py_stringmatching/similarity_measure/cython/cython_levenshtein.c"],
77 |                                        include_dirs=[]),
78 |                   setuptools.Extension("py_stringmatching.similarity_measure.cython.cython_jaro",
79 |                                        ["py_stringmatching/similarity_measure/cython/cython_jaro.c"],
80 |                                        include_dirs=[]),
81 |                   setuptools.Extension("py_stringmatching.similarity_measure.cython.cython_jaro_winkler",
82 |                                        ["py_stringmatching/similarity_measure/cython/cython_jaro_winkler.c"],
83 |                                        include_dirs=[]),
84 |                   setuptools.Extension("py_stringmatching.similarity_measure.cython.cython_utils",
85 |                                        ["py_stringmatching/similarity_measure/cython/cython_utils.c"],
86 |                                        include_dirs=[]),
87 |                   setuptools.Extension("py_stringmatching.similarity_measure.cython.cython_needleman_wunsch",
88 |                                        ["py_stringmatching/similarity_measure/cython/cython_needleman_wunsch.c"],
89 |                                        include_dirs=[]),
90 |                   setuptools.Extension("py_stringmatching.similarity_measure.cython.cython_smith_waterman",
91 |                                        ["py_stringmatching/similarity_measure/cython/cython_smith_waterman.c"],
92 |                                        include_dirs=[]),
93 |                   setuptools.Extension("py_stringmatching.similarity_measure.cython.cython_affine",
["py_stringmatching/similarity_measure/cython/cython_affine.c"], 95 | include_dirs=[]) 96 | 97 | ] 98 | 99 | # find packages to be included. exclude benchmarks. 100 | packages = setuptools.find_packages(exclude=["benchmarks", "benchmarks.custom_benchmarks"]) 101 | 102 | with open('README.rst') as f: 103 | LONG_DESCRIPTION = f.read() 104 | 105 | setuptools.setup( 106 | name='py-stringmatching', 107 | version='0.4.6', 108 | description='Python library for string matching.', 109 | long_description=LONG_DESCRIPTION, 110 | url='https://sites.google.com/site/anhaidgroup/projects/magellan/py_stringmatching', 111 | author='UW Magellan Team', 112 | author_email='uwmagellan@gmail.com', 113 | license='BSD', 114 | classifiers=[ 115 | 'Development Status :: 4 - Beta', 116 | 'Environment :: Console', 117 | 'Intended Audience :: Developers', 118 | 'Intended Audience :: Science/Research', 119 | 'Intended Audience :: Education', 120 | 'License :: OSI Approved :: BSD License', 121 | 'Operating System :: POSIX', 122 | 'Operating System :: Unix', 123 | 'Operating System :: MacOS', 124 | 'Operating System :: Microsoft :: Windows', 125 | 'Programming Language :: Python', 126 | 'Programming Language :: Python :: 3', 127 | 'Programming Language :: Python :: 3.7', 128 | 'Programming Language :: Python :: 3.8', 129 | 'Programming Language :: Python :: 3.9', 130 | 'Programming Language :: Python :: 3.10', 131 | 'Programming Language :: Python :: 3.11', 132 | 'Programming Language :: Python :: 3.12', 133 | 'Topic :: Scientific/Engineering', 134 | 'Topic :: Utilities', 135 | 'Topic :: Software Development :: Libraries', 136 | ], 137 | packages=packages, 138 | install_requires=[ 139 | 'numpy >= 1.7.0,<2.0', 140 | ], 141 | setup_requires=[ 142 | 'numpy >= 1.7.0,<2.0' 143 | ], 144 | ext_modules=extensions, 145 | cmdclass=cmdclass, 146 | include_package_data=True, 147 | zip_safe=False 148 | ) 149 | --------------------------------------------------------------------------------