├── .coveragerc ├── .github └── workflows │ ├── pip-test.yml │ └── testing.yml ├── .gitignore ├── CHANGES.txt ├── LICENSE ├── LICENSES ├── NUMPY_LICENSE └── SIX_LICENSE ├── MANIFEST.in ├── README.rst ├── benchmarks ├── __init__.py ├── datasets │ ├── long_strings.csv │ ├── medium_strings.csv │ └── short_strings.csv └── run_benchmark.py ├── build_tools ├── appveyor │ ├── install.ps1 │ ├── rm_rf.py │ └── run_with_env.cmd ├── cythonize.py ├── move-conda-package.py └── requirements_dev.txt ├── docs ├── Affine.rst ├── AlphabeticTokenizer.rst ├── AlphanumericTokenizer.rst ├── BagDistance.rst ├── Benchmark.rst ├── Contributing.rst ├── Cosine.rst ├── DelimiterTokenizer.rst ├── Dice.rst ├── Editex.rst ├── GeneralizedJaccard.rst ├── HammingDistance.rst ├── Installation.rst ├── Jaccard.rst ├── Jaro.rst ├── JaroWinkler.rst ├── Levenshtein.rst ├── Makefile ├── MongeElkan.rst ├── NeedlemanWunsch.rst ├── OverlapCoefficient.rst ├── PartialRatio.rst ├── PartialTokenSort.rst ├── QgramTokenizer.rst ├── Ratio.rst ├── SimilarityMeasure.rst ├── SmithWaterman.rst ├── SoftTfIdf.rst ├── Soundex.rst ├── TfIdf.rst ├── TokenSort.rst ├── Tokenizer.rst ├── Tutorial.rst ├── TverskyIndex.rst ├── WhatIsNew.rst ├── WhitespaceTokenizer.rst ├── conf.py ├── index.rst └── make.bat ├── py_stringmatching ├── __init__.py ├── similarity_measure │ ├── __init__.py │ ├── affine.py │ ├── bag_distance.py │ ├── cosine.py │ ├── cython │ │ ├── __init__.py │ │ ├── cython_affine.pyx │ │ ├── cython_jaro.pyx │ │ ├── cython_jaro_winkler.pyx │ │ ├── cython_levenshtein.pyx │ │ ├── cython_needleman_wunsch.pyx │ │ ├── cython_smith_waterman.pyx │ │ └── cython_utils.pyx │ ├── dice.py │ ├── editex.py │ ├── generalized_jaccard.py │ ├── hamming_distance.py │ ├── hybrid_similarity_measure.py │ ├── jaccard.py │ ├── jaro.py │ ├── jaro_winkler.py │ ├── levenshtein.py │ ├── monge_elkan.py │ ├── needleman_wunsch.py │ ├── overlap_coefficient.py │ ├── partial_ratio.py │ ├── partial_token_sort.py │ ├── phonetic_similarity_measure.py │ ├── ratio.py │ ├── sequence_similarity_measure.py │ ├── similarity_measure.py │ ├── smith_waterman.py │ ├── soft_tfidf.py │ ├── soundex.py │ ├── tfidf.py │ ├── token_similarity_measure.py │ ├── token_sort.py │ └── tversky_index.py ├── tests │ ├── __init__.py │ ├── test_sim_Soundex.py │ ├── test_simfunctions.py │ ├── test_tokenizers.py │ └── utils.py ├── tokenizer │ ├── __init__.py │ ├── alphabetic_tokenizer.py │ ├── alphanumeric_tokenizer.py │ ├── definition_tokenizer.py │ ├── delimiter_tokenizer.py │ ├── qgram_tokenizer.py │ ├── tokenizer.py │ └── whitespace_tokenizer.py └── utils.py └── setup.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | branch = True 3 | source = py_stringmatching 4 | include = */py_stringmatching/* 5 | omit = 6 | */tests/* 7 | */benchmarks/* 8 | *__init__* 9 | */python?.?/* 10 | -------------------------------------------------------------------------------- /.github/workflows/pip-test.yml: -------------------------------------------------------------------------------- 1 | # Testing on linux, windows, macos, for python versions 3.7, 3.8, 3.9, 3.10, 3.11, 3.12 2 | 3 | name: Test pip install 4 | 5 | on: 6 | - push 7 | - pull_request 8 | 9 | jobs: 10 | build: 11 | 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] 16 | os: ["ubuntu-latest", "windows-latest", "macos-latest"] 17 | runs-on: ${{ matrix.os }} 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | 21 | steps: 22 | - 
uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Version check 28 | run: python --version 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install "Cython>=0.29.23" "coveralls" 33 | - name: Install package 34 | run: | 35 | python setup.py sdist 36 | pip install dist/py-stringmatching-0.4.6.tar.gz 37 | -------------------------------------------------------------------------------- /.github/workflows/testing.yml: -------------------------------------------------------------------------------- 1 | # Testing on linux, windows, macos, for python versions 3.7, 3.8, 3.9, 3.10, 3.11, 3.12 2 | 3 | name: Unit testing 4 | 5 | on: 6 | - push 7 | - pull_request 8 | 9 | jobs: 10 | build: 11 | 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] 16 | os: ["ubuntu-latest", "windows-latest", "macos-latest"] 17 | runs-on: ${{ matrix.os }} 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v3 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Version check 28 | run: python --version 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install "numpy<2.0" "Cython>=0.29.23" "coveralls" 33 | - name: Install package 34 | run: python setup.py build_ext --inplace 35 | - name: Run tests 36 | run: | 37 | python -m unittest -v 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | # Byte-compiled / optimized / DLL files 3 | __pycache__/ 4 | *.py[cod] 5 | *$py.class 6 | 7 | # C extensions 8 | *.c 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | env/ 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .coverage 43 | .coverage.* 44 | .cache 45 | coverage.xml 46 | *,cover 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | # temp dir 62 | scratch/ 63 | 64 | # idea files 65 | .idea/ 66 | # Created by .ignore support plugin (hsz.mobi) 67 | 68 | # Performance Testing # 69 | # ####################### 70 | html/ 71 | results/ 72 | 73 | # Project specific 74 | cythonize.dat 75 | 76 | garage/ 77 | cover/ 78 | 79 | # macOS 80 | .DS_Store 81 | -------------------------------------------------------------------------------- /CHANGES.txt: -------------------------------------------------------------------------------- 1 | v0.4.6 - 7/5/2024 2 | * Limited Numpy to <2.0 in setup.py, due to compatibility issues 3 | * Added preliminary testing of pip install to Github Actions workflow 4 | 5 | v0.4.5 - 1/26/2024 6 | * Discontinued usage of cythonize.py during setup due to Python 3.12 compatibility issues 7 | 8 | v0.4.4 - 1/26/2024 9 | * Dropped support for Python 2 10 | * Added support for Python 3.12 11 | * Adjusted setuptools.setup project name to match name on PyPI 12 | 13 | v0.4.3 - 2/8/2023 14 | * Dropped support for Python 3.6. 15 | * Added support for Python 3.10 and 3.11. 16 | * Replaced aliases removed from Numpy 1.24. 17 | * Switched from Nose to vanilla Unittest. 18 | * Replaced Travis and Appveyor CI testing with Github Actions. 19 | 20 | v0.4.2 - 10/17/2020 21 | * Bug fix: Made PartialRatio importable from py_stringmatching. 22 | * Dropped support for Python 3.4. 23 | * This is the last version of py_stringmatching that will support Python 2 and Python 3.5. 24 | 25 | v0.4.1 - 02/22/2019 26 | * Cython version was updated. The package is now built with updated Cython version >= 0.27.3. 27 | * Added support for Python 3.7 version and dropped Testing support for Python 3.3 version. 28 | 29 | v0.4.0 - 07/18/2017 30 | * Rewritten five similarity measures in Cython: Affine, Jaro, Jaro Winkler, Needleman Wunsch, and Smith Waterman. 31 | * Added benchmark scripts to measure the performance of similarity measures. 32 | 33 | v0.3.0 - 05/29/2017 34 | * Added nine new string similarity measures - Bag Distance, Editex, 35 | Generalized Jaccard, Partial Ratio, Partial Token Sort, Ratio, 36 | Soundex, Token Sort, and Tversky Index. 37 | 38 | v0.2.1 - 07/14/2016 39 | * Remove explicit installation of numpy using pip in setup. Add numpy in setup_requires and compile extensions by including numpy install path. 40 | 41 | v0.2.0 - 07/06/2016 42 | * Qgram tokenizers have been modified to take a flag called "padding". If this flag is True (the default), then a prefix and a suffix will be added to the input string before tokenizing (see the Tutorial for a reason for this). 43 | * Version 0.1.0 does not handle strings in unicode correctly. Specifically, if an input string contains non-ascii characters, a string similarity measure may interpret the string incorrectly and thus compute an incorrect similarity score. In this version we have fixed the string similarity measures. Specifically, we convert the input strings into unicode before computing similarity measures. NOTE: the tokenizers are still not yet unicode-aware. 
44 | * In Version 0.1.0, the flag "dampen" for TF/IDF similarity measure has the default value of False. In this version we have modified it to have the default value of True, which is the more common value for this flag in practice. 45 | 46 | v0.1.0 - 06/14/2016 47 | * Initial release. 48 | * Contains 5 tokenizers - Alphabetic tokenizer, Alphanumeric tokenizer, Delimiter tokenizer, Qgram tokenizer and 49 | Whitespace tokenizer. 50 | * Contains 14 similarity measures - Affine, Cosine, Dice, Hamming distance, Jaccard, Jaro, Jaro-Winkler, 51 | Levenshtein, Monge-Elkan, Needleman-Wunsch, Overlap coefficient, Smith-Waterman, Soft TF-IDF, and TF-IDF. 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016, anhaidgroup 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of py_stringmatching nor the names of its 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 24 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /LICENSES/NUMPY_LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2005-2016, NumPy Developers. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are 6 | met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of the NumPy Developers nor the names of any 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 
19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /LICENSES/SIX_LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010-2016 Benjamin Peterson 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so, 8 | subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 15 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 16 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 17 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 18 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 19 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include CHANGES.txt 3 | include requirements.txt 4 | include LICENSE 5 | recursive-include LICENSES * 6 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | py_stringmatching 2 | ================= 3 | 4 | This project seeks to build a Python software package that consists of a comprehensive and scalable set of string tokenizers (such as alphabetical tokenizers, whitespace tokenizers) and string similarity measures (such as edit distance, Jaccard, TF/IDF). The package is free, open-source, and BSD-licensed. 
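For example, here is a minimal sketch of computing a Jaccard score over whitespace tokens (the classes and calls below follow the package's documented API; see the Tutorial linked under "Important links" for a full walkthrough)::

    >>> import py_stringmatching as sm
    >>> jac = sm.Jaccard()
    >>> ws = sm.WhitespaceTokenizer(return_set=True)
    >>> jac.get_sim_score(ws.tokenize('data science'), ws.tokenize('data integration'))
    0.3333333333333333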
5 | 6 | Important links 7 | =============== 8 | 9 | * Project Homepage: https://sites.google.com/site/anhaidgroup/projects/magellan/py_stringmatching 10 | * Code repository: https://github.com/anhaidgroup/py_stringmatching 11 | * User Manual: https://anhaidgroup.github.io/py_stringmatching/v0.4.2/index.html 12 | * Tutorial: https://anhaidgroup.github.io/py_stringmatching/v0.4.2/Tutorial.html 13 | * How to Contribute: https://anhaidgroup.github.io/py_stringmatching/v0.4.2/Contributing.html 14 | * Developer Manual: http://pages.cs.wisc.edu/~anhai/py_stringmatching/v0.2.0/dev-manual-v0.2.0.pdf 15 | * Issue Tracker: https://github.com/anhaidgroup/py_stringmatching/issues 16 | * Mailing List: https://groups.google.com/forum/#!forum/py_stringmatching 17 | 18 | Dependencies 19 | ============ 20 | 21 | py_stringmatching has been tested on each Python version between 3.7 and 3.12, inclusive. 22 | 23 | The required dependencies to build the package are NumPy 1.7.0 or higher, but lower than 2.0, 24 | and a C or C++ compiler. For the development version, you will also need Cython. 25 | 26 | Platforms 27 | ========= 28 | 29 | py_stringmatching has been tested on Linux, OS X and Windows. At this time we have only tested on x86 architecture. 30 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_stringmatching/658860f0cad65e1e3da07e7039fe9764d4822272/benchmarks/__init__.py -------------------------------------------------------------------------------- /benchmarks/datasets/long_strings.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_stringmatching/658860f0cad65e1e3da07e7039fe9764d4822272/benchmarks/datasets/long_strings.csv -------------------------------------------------------------------------------- /benchmarks/datasets/medium_strings.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_stringmatching/658860f0cad65e1e3da07e7039fe9764d4822272/benchmarks/datasets/medium_strings.csv -------------------------------------------------------------------------------- /benchmarks/datasets/short_strings.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_stringmatching/658860f0cad65e1e3da07e7039fe9764d4822272/benchmarks/datasets/short_strings.csv -------------------------------------------------------------------------------- /benchmarks/run_benchmark.py: -------------------------------------------------------------------------------- 1 | 2 | from math import ceil, sqrt 3 | import time 4 | 5 | import pandas as pd 6 | import matplotlib.pyplot as plt 7 | 8 | 9 | def run_benchmark(short_dataset_path, medium_dataset_path, long_dataset_path, 10 | data_size, sim_measure, tokenizer = None, num_repeat = 1, 11 | random_seed = 0, output_file = None, encoding = 'latin-1'): 12 | """Run benchmark for 9 configurations (short-short, short-medium, 13 | short-long, medium-short, medium-medium, medium-long, long-short, 14 | long-medium, long-long) for the provided similarity measure. 15 | 16 | Specifically, this method will take in 3 files as input each containing 17 | one column of strings. 
Next, it will sample the input files based on the 18 | provided data_size and then runs benchmark for different configurations for 19 | the provided similarity measure. Finally, it returns a dataframe containing 20 | the benchmark results. 21 | 22 | Args: 23 | short_dataset_path (string): Path to the dataset containing short strings. 24 | medium_dataset_path (string): Path to the dataset containing medium strings. 25 | long_dataset_path (string): Path to the dataset containing long strings. 26 | data_size (int): Number of string pairs to be benchmarked. 27 | sim_measure (function): Similarity function to be benchmarked. 28 | tokenizer (function): Tokenizer to be used (in case of token-based similarity measures). Defaults to None. 29 | num_repeat (int): Number of times to run each configuration. Defaults to 1. 30 | random_seed (int): Random seed to be used for sampling. Defaults to 0. 31 | output_file (string): Output path to save the benchmark results. Defaults to None. 32 | encoding (string): Encoding of the input datasets. Defaults to latin-1. 33 | 34 | Returns: 35 | Benchmark results (Dataframe). 36 | 37 | Examples: 38 | >>> jac = Jaccard() 39 | >>> ws = WhitespaceTokenizer(return_set=True) 40 | >>> results = run_benchmark('datasets/short_strings.csv', 'datasets/medium_strings.csv', 'datasets/long_strings.csv', 100000, 41 | jac.get_sim_score, ws.tokenize, output_file = 'result.csv') # Benchmark results will be saved in result.csv 42 | >>> ed = Levenshtein() 43 | >>> results = run_benchmark('datasets/short_strings.csv', 'datasets/medium_strings.csv', 'datasets/long_strings.csv', 100000, 44 | ed.get_sim_score) 45 | """ 46 | 47 | # read data 48 | short_strings = pd.read_csv(short_dataset_path, encoding = encoding) 49 | medium_strings = pd.read_csv(medium_dataset_path, encoding = encoding) 50 | long_strings = pd.read_csv(long_dataset_path, encoding = encoding) 51 | 52 | short_len = len(short_strings) 53 | medium_len = len(medium_strings) 54 | long_len = len(long_strings) 55 | 56 | # compute individual table size 57 | table_size = ceil(sqrt(data_size)) 58 | 59 | # sample strings 60 | short_table = list(short_strings.sample(table_size, replace = True, 61 | random_state = random_seed).values) 62 | medium_table = list(medium_strings.sample(table_size, replace = True, 63 | random_state = random_seed).values) 64 | long_table = list(long_strings.sample(table_size, replace = True, 65 | random_state = random_seed).values) 66 | 67 | tables = [('short', short_table), ('medium', medium_table), 68 | ('long', long_table)] 69 | 70 | # run benchmark for each configuration 71 | bench_output = [] 72 | for i in range(len(tables)): 73 | for j in range(len(tables)): 74 | runtimes = profile_runtime(tables[i][1], tables[j][1], tokenizer, 75 | sim_measure, num_repeat) 76 | runtimes.append(sum(runtimes)/float(num_repeat)) 77 | runtimes.insert(0, '_'.join([tables[i][0], tables[j][0]])) 78 | bench_output.append(runtimes) 79 | 80 | header = ['run_'+str(i+1)+' (in secs)' for i in range(num_repeat)] 81 | header.append('average (in secs)') 82 | header.insert(0, 'configuration') 83 | output_table = pd.DataFrame(bench_output, columns = header) 84 | 85 | if output_file: 86 | output_table.to_csv(output_file, index = False) 87 | 88 | return output_table 89 | 90 | 91 | def profile_runtime(table_A, table_B, tokenizer, sim_measure, num_repeat): 92 | # run benchmark for one configuration 93 | runtimes = [] 94 | for i in range(num_repeat): 95 | start_time = time.time() 96 | for string1 in table_A: 97 | for string2 in table_B: 98 | 
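# token-based measures take tokenized input, so tokenize each
# string pair on the fly; sequence-based measures take the raw strings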
if tokenizer: 99 | score = sim_measure(tokenizer(string1[0]), tokenizer(string2[0])) 100 | else: 101 | score = sim_measure(string1[0], string2[0]) 102 | end_time = time.time() 103 | runtimes.append(end_time-start_time) 104 | return runtimes 105 | 106 | 107 | def plot_benchmark(bench_output, output_file, 108 | conf_attr = 'configuration', time_attr = 'average (in secs)'): 109 | # Generate plot from benchmark output 110 | x_range = list(range(len(bench_output))) 111 | plt.xticks(x_range, list(bench_output[conf_attr])) 112 | plt.plot(x_range, bench_output[time_attr], marker='o') 113 | plt.xlabel('Configuration') 114 | plt.ylabel('Average time (in secs)') 115 | plt.title('Benchmark plot') 116 | plt.savefig(output_file) 117 | print('Plot generated successfully.') 118 | 119 | -------------------------------------------------------------------------------- /build_tools/appveyor/install.ps1: -------------------------------------------------------------------------------- 1 | # Sample script to install Miniconda under Windows 2 | # Authors: Olivier Grisel, Jonathan Helmus and Kyle Kastner, Robert McGibbon 3 | # License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 4 | 5 | $MINICONDA_URL = "http://repo.continuum.io/miniconda/" 6 | 7 | 8 | function DownloadMiniconda ($python_version, $platform_suffix) { 9 | $webclient = New-Object System.Net.WebClient 10 | $filename = "Miniconda3-latest-Windows-" + $platform_suffix + ".exe" 11 | # $filename = "Miniconda3-3.8.3-Windows-" + $platform_suffix + ".exe" 12 | $url = $MINICONDA_URL + $filename 13 | 14 | $basedir = $pwd.Path + "\" 15 | $filepath = $basedir + $filename 16 | if (Test-Path $filename) { 17 | Write-Host "Reusing" $filepath 18 | return $filepath 19 | } 20 | 21 | # Download and retry up to 3 times in case of network transient errors. 22 | Write-Host "Downloading" $filename "from" $url 23 | $retry_attempts = 2 24 | for($i=0; $i -lt $retry_attempts; $i++){ 25 | try { 26 | $webclient.DownloadFile($url, $filepath) 27 | break 28 | } 29 | Catch [Exception]{ 30 | Start-Sleep 1 31 | } 32 | } 33 | if (Test-Path $filepath) { 34 | Write-Host "File saved at" $filepath 35 | } else { 36 | # Retry once to get the error message if any at the last try 37 | $webclient.DownloadFile($url, $filepath) 38 | } 39 | return $filepath 40 | } 41 | 42 | 43 | function InstallMiniconda ($python_version, $architecture, $python_home) { 44 | Write-Host "Installing Python" $python_version "for" $architecture "bit architecture to" $python_home 45 | if (Test-Path $python_home) { 46 | Write-Host $python_home "already exists, skipping." 
47 | return $false 48 | } 49 | if ($architecture -match "32") { 50 | $platform_suffix = "x86" 51 | } else { 52 | $platform_suffix = "x86_64" 53 | } 54 | 55 | $filepath = DownloadMiniconda $python_version $platform_suffix 56 | Write-Host "Installing" $filepath "to" $python_home 57 | $install_log = $python_home + ".log" 58 | $args = "/S /D=$python_home" 59 | Write-Host $filepath $args 60 | Start-Process -FilePath $filepath -ArgumentList $args -Wait -Passthru 61 | if (Test-Path $python_home) { 62 | Write-Host "Python $python_version ($architecture) installation complete" 63 | } else { 64 | Write-Host "Failed to install Python in $python_home" 65 | Get-Content -Path $install_log 66 | Exit 1 67 | } 68 | } 69 | 70 | 71 | function InstallCondaPackages ($python_home, $spec) { 72 | $conda_path = $python_home + "\Scripts\conda.exe" 73 | $args = "install --yes " + $spec 74 | Write-Host ("conda " + $args) 75 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 76 | } 77 | 78 | function UpdateConda ($python_home) { 79 | $conda_path = $python_home + "\Scripts\conda.exe" 80 | Write-Host "Updating conda..." 81 | $args = "update --yes conda" 82 | Write-Host $conda_path $args 83 | Start-Process -FilePath "$conda_path" -ArgumentList $args -Wait -Passthru 84 | } 85 | 86 | 87 | function main () { 88 | InstallMiniconda $env:PYTHON_VERSION $env:PYTHON_ARCH $env:PYTHON 89 | UpdateConda $env:PYTHON 90 | InstallCondaPackages $env:PYTHON "conda-build jinja2 anaconda-client" 91 | } 92 | 93 | main 94 | -------------------------------------------------------------------------------- /build_tools/appveyor/rm_rf.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import os 3 | import sys 4 | import stat 5 | import shutil 6 | 7 | def remove_readonly(func, path, excinfo): 8 | os.chmod(path, stat.S_IWRITE) 9 | func(path) 10 | 11 | def main(): 12 | print(sys.executable) 13 | try: 14 | shutil.rmtree(sys.argv[1], onerror=remove_readonly) 15 | except Exception as e: 16 | print("Error") 17 | print(e) 18 | 19 | if __name__ == '__main__': 20 | main() 21 | 22 | -------------------------------------------------------------------------------- /build_tools/appveyor/run_with_env.cmd: -------------------------------------------------------------------------------- 1 | :: EXPECTED ENV VARS: PYTHON_ARCH (either x86 or x64) 2 | :: CONDA_PY (either 27, 33, 35 etc. - only major version is extracted) 3 | :: 4 | :: 5 | :: To build extensions for 64 bit Python 3, we need to configure environment 6 | :: variables to use the MSVC 2010 C++ compilers from GRMSDKX_EN_DVD.iso of: 7 | :: MS Windows SDK for Windows 7 and .NET Framework 4 (SDK v7.1) 8 | :: 9 | :: To build extensions for 64 bit Python 2, we need to configure environment 10 | :: variables to use the MSVC 2008 C++ compilers from GRMSDKX_EN_DVD.iso of: 11 | :: MS Windows SDK for Windows 7 and .NET Framework 3.5 (SDK v7.0) 12 | :: 13 | :: 32 bit builds, and 64-bit builds for 3.5 and beyond, do not require specific 14 | :: environment configurations. 
15 | :: 16 | :: Note: this script needs to be run with the /E:ON and /V:ON flags for the 17 | :: cmd interpreter, at least for (SDK v7.0) 18 | :: 19 | :: More details at: 20 | :: https://github.com/cython/cython/wiki/64BitCythonExtensionsOnWindows 21 | :: http://stackoverflow.com/a/13751649/163740 22 | :: 23 | :: Author: Phil Elson 24 | :: Original Author: Olivier Grisel (https://github.com/ogrisel/python-appveyor-demo) 25 | :: License: CC0 1.0 Universal: http://creativecommons.org/publicdomain/zero/1.0/ 26 | :: 27 | :: Notes about batch files for Python people: 28 | :: 29 | :: Quotes in values are literally part of the values: 30 | :: SET FOO="bar" 31 | :: FOO is now five characters long: " b a r " 32 | :: If you don't want quotes, don't include them on the right-hand side. 33 | :: 34 | :: The CALL lines at the end of this file look redundant, but if you move them 35 | :: outside of the IF clauses, they do not run properly in the SET_SDK_64==Y 36 | :: case, I don't know why. 37 | :: originally from https://github.com/pelson/Obvious-CI/blob/master/scripts/obvci_appveyor_python_build_env.cmd 38 | @ECHO OFF 39 | 40 | SET COMMAND_TO_RUN=%* 41 | SET WIN_SDK_ROOT=C:\Program Files\Microsoft SDKs\Windows 42 | 43 | :: Extract the major and minor versions, and allow for the minor version to be 44 | :: more than 9. This requires the version number to have two dots in it. 45 | SET MAJOR_PYTHON_VERSION=%CONDA_PY:~0,1% 46 | 47 | IF "%CONDA_PY:~2,1%" == "" ( 48 | :: CONDA_PY style, such as 27, 34 etc. 49 | SET MINOR_PYTHON_VERSION=%CONDA_PY:~1,1% 50 | ) ELSE ( 51 | IF "%CONDA_PY:~3,1%" == "." ( 52 | SET MINOR_PYTHON_VERSION=%CONDA_PY:~2,1% 53 | ) ELSE ( 54 | SET MINOR_PYTHON_VERSION=%CONDA_PY:~2,2% 55 | ) 56 | ) 57 | 58 | :: Based on the Python version, determine what SDK version to use, and whether 59 | :: to set the SDK for 64-bit. 60 | IF %MAJOR_PYTHON_VERSION% == 2 ( 61 | SET WINDOWS_SDK_VERSION="v7.0" 62 | SET SET_SDK_64=Y 63 | ) ELSE ( 64 | IF %MAJOR_PYTHON_VERSION% == 3 ( 65 | SET WINDOWS_SDK_VERSION="v7.1" 66 | IF %MINOR_PYTHON_VERSION% LEQ 4 ( 67 | SET SET_SDK_64=Y 68 | ) ELSE ( 69 | SET SET_SDK_64=N 70 | ) 71 | ) ELSE ( 72 | ECHO Unsupported Python version: "%MAJOR_PYTHON_VERSION%" 73 | EXIT /B 1 74 | ) 75 | ) 76 | 77 | IF "%PYTHON_ARCH%"=="64" ( 78 | IF %SET_SDK_64% == Y ( 79 | ECHO Configuring Windows SDK %WINDOWS_SDK_VERSION% for Python %MAJOR_PYTHON_VERSION% on a 64 bit architecture 80 | SET DISTUTILS_USE_SDK=1 81 | SET MSSdk=1 82 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Setup\WindowsSdkVer.exe" -q -version:%WINDOWS_SDK_VERSION% 83 | "%WIN_SDK_ROOT%\%WINDOWS_SDK_VERSION%\Bin\SetEnv.cmd" /x64 /release 84 | ECHO Executing: %COMMAND_TO_RUN% 85 | call %COMMAND_TO_RUN% || EXIT /B 1 86 | ) ELSE ( 87 | ECHO Using default MSVC build environment for 64 bit architecture 88 | ECHO Executing: %COMMAND_TO_RUN% 89 | call %COMMAND_TO_RUN% || EXIT /B 1 90 | ) 91 | ) ELSE ( 92 | ECHO Using default MSVC build environment for 32 bit architecture 93 | ECHO Executing: %COMMAND_TO_RUN% 94 | call %COMMAND_TO_RUN% || EXIT /B 1 95 | ) 96 | -------------------------------------------------------------------------------- /build_tools/cythonize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ cythonize 3 | 4 | Cythonize pyx files into C files as needed. 5 | 6 | Usage: cythonize [root_dir] 7 | 8 | Default [root_dir] is 'py_stringmatching'. 9 | 10 | Checks pyx files to see if they have been changed relative to their 11 | corresponding C files. 
If they have, then runs cython on these files to 12 | recreate the C files. 13 | 14 | The script detects changes in the pyx/pxd files using checksums 15 | [or hashes] stored in a database file 16 | 17 | Simple script to invoke Cython on all .pyx 18 | files; while waiting for a proper build system. Uses file hashes to 19 | figure out if rebuild is needed. 20 | 21 | It is called by ./setup.py sdist so that sdist package can be installed without 22 | cython 23 | 24 | Originally written by Dag Sverre Seljebotn, and adapted from scikit-learn 25 | (BSD 3-clause) 26 | 27 | We copied it for py_stringmatching. 28 | 29 | Note: this script does not check any of the dependent C libraries; it only 30 | operates on the Cython .pyx files or their corresponding Cython header (.pxd) 31 | files. 32 | """ 33 | 34 | from __future__ import division, print_function, absolute_import 35 | 36 | import os 37 | import re 38 | import sys 39 | import hashlib 40 | import subprocess 41 | 42 | HASH_FILE = 'cythonize.dat' 43 | DEFAULT_ROOT = 'py_stringmatching' 44 | 45 | # WindowsError is not defined on unix systems 46 | try: 47 | WindowsError 48 | except NameError: 49 | WindowsError = None 50 | 51 | 52 | def cythonize(cython_file, gen_file): 53 | try: 54 | from Cython.Compiler.Version import version as cython_version 55 | from distutils.version import LooseVersion 56 | if LooseVersion(cython_version) < LooseVersion('0.21'): 57 | raise Exception('Building py_stringmatching requires Cython >= 0.21') 58 | 59 | except ImportError: 60 | pass 61 | 62 | flags = ['--fast-fail'] 63 | if gen_file.endswith('.cpp'): 64 | flags += ['--cplus'] 65 | 66 | try: 67 | try: 68 | rc = subprocess.call(['cython'] + 69 | flags + ["-o", gen_file, cython_file]) 70 | if rc != 0: 71 | raise Exception('Cythonizing %s failed' % cython_file) 72 | except OSError: 73 | # There are ways of installing Cython that don't result in a cython 74 | # executable on the path, see scipy issue gh-2397. 
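# In that case, fall back to invoking Cython as a module through the
# current Python interpreter instead of a standalone executable.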
75 | rc = subprocess.call([sys.executable, '-c', 76 | 'import sys; from Cython.Compiler.Main ' 77 | 'import setuptools_main as main;' 78 | ' sys.exit(main())'] + flags + 79 | ["-o", gen_file, cython_file]) 80 | if rc != 0: 81 | raise Exception('Cythonizing %s failed' % cython_file) 82 | except OSError: 83 | raise OSError('Cython needs to be installed') 84 | 85 | 86 | def load_hashes(filename): 87 | """Load the hashes dict from the hashfile""" 88 | # { filename : (sha1 of header if available or 'NA', 89 | # sha1 of input, 90 | # sha1 of output) } 91 | 92 | hashes = {} 93 | try: 94 | with open(filename, 'r') as cython_hash_file: 95 | for hash_record in cython_hash_file: 96 | (filename, header_hash, 97 | cython_hash, gen_file_hash) = hash_record.split() 98 | hashes[filename] = (header_hash, cython_hash, gen_file_hash) 99 | except (KeyError, ValueError, AttributeError, IOError): 100 | hashes = {} 101 | return hashes 102 | 103 | 104 | def save_hashes(hashes, filename): 105 | """Save the hashes dict to the hashfile""" 106 | with open(filename, 'w') as cython_hash_file: 107 | for key, value in hashes.items(): 108 | cython_hash_file.write("%s %s %s %s\n" 109 | % (key, value[0], value[1], value[2])) 110 | 111 | 112 | def sha1_of_file(filename): 113 | h = hashlib.sha1() 114 | with open(filename, "rb") as f: 115 | h.update(f.read()) 116 | return h.hexdigest() 117 | 118 | 119 | def clean_path(path): 120 | """Clean the path""" 121 | path = path.replace(os.sep, '/') 122 | if path.startswith('./'): 123 | path = path[2:] 124 | return path 125 | 126 | 127 | def get_hash_tuple(header_path, cython_path, gen_file_path): 128 | """Get the hashes from the given files""" 129 | 130 | header_hash = (sha1_of_file(header_path) 131 | if os.path.exists(header_path) else 'NA') 132 | from_hash = sha1_of_file(cython_path) 133 | to_hash = (sha1_of_file(gen_file_path) 134 | if os.path.exists(gen_file_path) else 'NA') 135 | 136 | return header_hash, from_hash, to_hash 137 | 138 | 139 | def cythonize_if_unchanged(path, cython_file, gen_file, hashes): 140 | full_cython_path = os.path.join(path, cython_file) 141 | full_header_path = full_cython_path.replace('.pyx', '.pxd') 142 | full_gen_file_path = os.path.join(path, gen_file) 143 | 144 | current_hash = get_hash_tuple(full_header_path, full_cython_path, 145 | full_gen_file_path) 146 | 147 | if current_hash == hashes.get(clean_path(full_cython_path)): 148 | print('%s has not changed' % full_cython_path) 149 | return 150 | 151 | print('Processing %s' % full_cython_path) 152 | cythonize(full_cython_path, full_gen_file_path) 153 | 154 | # changed target file, recompute hash 155 | current_hash = get_hash_tuple(full_header_path, full_cython_path, 156 | full_gen_file_path) 157 | 158 | # Update the hashes dict with the new hash 159 | hashes[clean_path(full_cython_path)] = current_hash 160 | 161 | 162 | def check_and_cythonize(root_dir): 163 | print(root_dir) 164 | hashes = load_hashes(HASH_FILE) 165 | 166 | for cur_dir, dirs, files in os.walk(root_dir): 167 | for filename in files: 168 | if filename.endswith('.pyx'): 169 | gen_file_ext = '.c' 170 | # Cython files with libcpp imports should be compiled to cpp 171 | with open(os.path.join(cur_dir, filename), 'rb') as f: 172 | data = f.read() 173 | m = re.search(b"libcpp", data, re.I | re.M) 174 | if m: 175 | gen_file_ext = ".cpp" 176 | cython_file = filename 177 | gen_file = filename.replace('.pyx', gen_file_ext) 178 | cythonize_if_unchanged(cur_dir, cython_file, gen_file, hashes) 179 | 180 | # Save hashes once per module. 
This prevents re-cythonizing previously
181 | # processed files when debugging broken code in a single file
182 | save_hashes(hashes, HASH_FILE)
183 | 
184 | 
185 | def main(root_dir=DEFAULT_ROOT):
186 | check_and_cythonize(root_dir)
187 | 
188 | 
189 | if __name__ == '__main__':
190 | try:
191 | root_dir_arg = sys.argv[1]
192 | except IndexError:
193 | root_dir_arg = DEFAULT_ROOT
194 | main(root_dir_arg)
195 | 
--------------------------------------------------------------------------------
/build_tools/move-conda-package.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import yaml
4 | import glob
5 | import shutil
6 | 
7 | from conda_build.config import Config  # 03/03/2017: Updated based on the changes to conda_build.config
8 | config = Config()
9 | 
10 | with open(os.path.join(sys.argv[1], 'meta.yaml')) as f:
11 |     # use safe_load: calling yaml.load without an explicit Loader is deprecated and unsafe
12 |     name = yaml.safe_load(f)['package']['name']
13 | 
14 | binary_package_glob = os.path.join(config.bldpkgs_dir, '{0}*.tar.bz2'.format(name))
15 | binary_package = glob.glob(binary_package_glob)[0]
16 | 
17 | shutil.move(binary_package, '.')
18 | 
--------------------------------------------------------------------------------
/build_tools/requirements_dev.txt:
--------------------------------------------------------------------------------
1 | numpy>=1.7.0
2 | Cython
3 | 
--------------------------------------------------------------------------------
/docs/Affine.rst:
--------------------------------------------------------------------------------
1 | Affine Gap
2 | --------------------------------------------------
3 | 
4 | .. autoclass:: py_stringmatching.similarity_measure.affine.Affine(gap_start=1, gap_continuation=0.5, sim_func=identity_function)
5 | :members:
6 | 
7 | 
--------------------------------------------------------------------------------
/docs/AlphabeticTokenizer.rst:
--------------------------------------------------------------------------------
1 | Alphabetic Tokenizer
2 | -------------------------------------------------------
3 | 
4 | .. automodule:: py_stringmatching.tokenizer.alphabetic_tokenizer
5 | :members:
6 | :inherited-members:
7 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__
8 | 
9 | 
--------------------------------------------------------------------------------
/docs/AlphanumericTokenizer.rst:
--------------------------------------------------------------------------------
1 | Alphanumeric Tokenizer
2 | ---------------------------------------------------------
3 | 
4 | .. automodule:: py_stringmatching.tokenizer.alphanumeric_tokenizer
5 | :members:
6 | :inherited-members:
7 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__
8 | 
9 | 
--------------------------------------------------------------------------------
/docs/BagDistance.rst:
--------------------------------------------------------------------------------
1 | Bag Distance
2 | ------------------------------------------------------------
3 | 
4 | .. 
automodule:: py_stringmatching.similarity_measure.bag_distance
5 | :members:
6 | 
7 | 
--------------------------------------------------------------------------------
/docs/Benchmark.rst:
--------------------------------------------------------------------------------
1 | Runtime Benchmark
2 | =================
3 | 
4 | This package includes a runtime benchmark (consisting of a script and several datasets) to measure the runtime performance of the similarity measures. Users can use the benchmark to judge whether a similarity measure is fast enough for their purposes, and developers can use it to guide speed-ups of the measures.
5 | 
6 | Running the Benchmark
7 | ---------------------
8 | 
9 | The user can run the benchmark as follows:
10 | 
11 | Step 1: Clone the py_stringmatching package from GitHub using the following command::
12 | 
13 | git clone https://github.com/anhaidgroup/py_stringmatching.git
14 | 
15 | Step 2: Change the working directory to py_stringmatching/benchmarks
16 | 
17 | Step 3: Run the benchmark using the following sequence of commands:
18 | 
19 | >>> import py_stringmatching as sm
20 | >>> from run_benchmark import *
21 | # create an object for the similarity measure you need to benchmark
22 | >>> jaccard = sm.Jaccard()
23 | # create a tokenizer object (in case of token-based measures)
24 | >>> ws = sm.WhitespaceTokenizer(return_set = True)
25 | # set the dataset paths
26 | >>> short_strings_path = 'datasets/short_strings.csv'
27 | >>> medium_strings_path = 'datasets/medium_strings.csv'
28 | >>> long_strings_path = 'datasets/long_strings.csv'
29 | # data size (number of string pairs) over which the benchmark should be run
30 | >>> data_size = 10000
31 | # number of times to repeat each configuration
32 | >>> num_repeat = 3
33 | # output file where the benchmark results should be written
34 | >>> output_file = 'benchmark_results.csv'
35 | # run the benchmark
36 | >>> run_benchmark(short_strings_path, medium_strings_path, long_strings_path, data_size, jaccard.get_sim_score, ws.tokenize, num_repeat = num_repeat, output_file = output_file)
37 | 
38 | The benchmark contains three datasets in the `datasets` directory: (1) short_strings.csv, (2) medium_strings.csv, and (3) long_strings.csv. Each dataset contains 5000 strings. Specifically, short_strings.csv contains strings with lengths in the range of 2-15 (avg. 10), medium_strings.csv contains strings with lengths in the range of 18-39 (avg. 25), and
39 | long_strings.csv contains strings with lengths in the range of 60-1726 (avg. 127).
40 | 
41 | The above command will run the benchmark for 9 different configurations
42 | (short-short, short-medium, short-long, medium-short, medium-medium, medium-long,
43 | long-short, long-medium, long-long) for the provided similarity measure, and
44 | write the results to the provided output file. See below for additional details.
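In addition, run_benchmark.py provides a ``plot_benchmark`` helper that charts the average runtime of each configuration from the returned dataframe (it renders the chart with matplotlib, which must therefore be installed). A minimal sketch, reusing the objects created above; the output filename here is illustrative:

>>> results = run_benchmark(short_strings_path, medium_strings_path, long_strings_path, data_size, jaccard.get_sim_score, ws.tokenize, num_repeat = num_repeat, output_file = output_file)
>>> plot_benchmark(results, 'benchmark_plot.png')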
45 | 46 | Interpreting the Results 47 | -------------------------- 48 | 49 | The benchmark results will be a CSV file containing the following information: 50 | 51 | * Configuration 52 | * Runtime (in secs) for each run of a configuration (note that each configuration is run for `num_repeat` times) 53 | * Average runtime (in secs) for each configuration 54 | 55 | An example output file will look like this:: 56 | 57 | configuration,run_1 (in secs),run_2 (in secs),run_3 (in secs),average (in secs) 58 | short_short,0.112642049789,0.112892866135,0.112852096558,0.112795670827 59 | short_medium,0.115404129028,0.115512132645,0.115454912186,0.115457057953 60 | short_long,0.194123983383,0.193922996521,0.193790912628,0.193945964177 61 | medium_short,0.11647105217,0.116579055786,0.116438865662,0.116496324539 62 | medium_medium,0.118470907211,0.118409156799,0.118496894836,0.118458986282 63 | medium_long,0.206312894821,0.206974983215,0.206708908081,0.206665595373 64 | long_short,0.205050945282,0.205410957336,0.205253124237,0.205238342285 65 | long_medium,0.217441797256,0.21806883812,0.218235015869,0.217915217082 66 | long_long,0.770321846008,0.76869893074,0.768806934357,0.769275903702 67 | -------------------------------------------------------------------------------- /docs/Cosine.rst: -------------------------------------------------------------------------------- 1 | Cosine 2 | -------------------------------------------------- 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.cosine 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/DelimiterTokenizer.rst: -------------------------------------------------------------------------------- 1 | Delimiter Tokenizer 2 | ------------------------------------------------------ 3 | 4 | .. automodule:: py_stringmatching.tokenizer.delimiter_tokenizer 5 | :members: 6 | :inherited-members: 7 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 8 | 9 | -------------------------------------------------------------------------------- /docs/Dice.rst: -------------------------------------------------------------------------------- 1 | Dice 2 | ------------------------------------------------ 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.dice 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/Editex.rst: -------------------------------------------------------------------------------- 1 | Editex 2 | ------------------------------------------------ 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.editex 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/GeneralizedJaccard.rst: -------------------------------------------------------------------------------- 1 | Generalized Jaccard 2 | --------------------------------------------------- 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.generalized_jaccard 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/HammingDistance.rst: -------------------------------------------------------------------------------- 1 | Hamming Distance 2 | ------------------------------------------------------------ 3 | 4 | .. 
automodule:: py_stringmatching.similarity_measure.hamming_distance
5 | :members:
6 | 
7 | 
--------------------------------------------------------------------------------
/docs/Installation.rst:
--------------------------------------------------------------------------------
1 | ============
2 | Installation
3 | ============
4 | 
5 | Requirements
6 | ------------
7 | * Python 3.7-3.12
8 | * C or C++ compiler (parts of the package are written in Cython for efficiency reasons, and you need a C or C++ compiler to compile these parts)
9 | 
10 | Platforms
11 | ------------
12 | py_stringmatching has been tested on Linux (Ubuntu 22.04), OS X (Monterey 12), and Windows 10.
13 | 
14 | Dependencies
15 | ------------
16 | * numpy 1.7.0 or higher
17 | 
18 | .. note::
19 | 
20 | The py_stringmatching installer will automatically install the above required packages.
21 | 
22 | C Compiler Required
23 | -------------------
24 | Before installing this package, you need to make sure that you have a C compiler installed. This is necessary because this package contains Cython files. Go `here `_ for more information about how to check whether you already have a C compiler and how to install a C compiler.
25 | 
26 | After you have confirmed that you have a C compiler installed, you are ready to install the package. There are two ways to install the py_stringmatching package: using pip or from a source distribution.
27 | 
28 | Installing Using pip
29 | --------------------
30 | The easiest way to install the package is to use pip, which will retrieve py_stringmatching from PyPI and then install it::
31 | 
32 | pip install py_stringmatching
33 | 
34 | Installing from Source Distribution
35 | -------------------------------------
36 | Step 1: Download the py_stringmatching package from `here
37 | `_.
38 | 
39 | Step 2: Unzip the package and execute the following command from the package root::
40 | 
41 | python setup.py install
42 | 
43 | .. note::
44 | 
45 | The above command will try to install py_stringmatching into the default Python directory on your machine. If you do not have installation permission for that directory, then you can install the package in your home directory as follows::
46 | 
47 | python setup.py install --user
48 | 
49 | For more information, see the StackOverflow `link
50 | `_.
51 | 
52 | .. note::
53 | 
54 | Building C files from source requires Cython version 0.29.23 or higher::
55 | 
56 | pip install Cython>=0.29.23
57 | 
58 | 
--------------------------------------------------------------------------------
/docs/Jaccard.rst:
--------------------------------------------------------------------------------
1 | Jaccard
2 | ---------------------------------------------------
3 | 
4 | .. automodule:: py_stringmatching.similarity_measure.jaccard
5 | :members:
6 | 
7 | 
--------------------------------------------------------------------------------
/docs/Jaro.rst:
--------------------------------------------------------------------------------
1 | Jaro
2 | ------------------------------------------------
3 | 
4 | .. automodule:: py_stringmatching.similarity_measure.jaro
5 | :members:
6 | 
7 | 
--------------------------------------------------------------------------------
/docs/JaroWinkler.rst:
--------------------------------------------------------------------------------
1 | Jaro Winkler
2 | --------------------------------------------------------
3 | 
4 | .. 
automodule:: py_stringmatching.similarity_measure.jaro_winkler 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/Levenshtein.rst: -------------------------------------------------------------------------------- 1 | Levenshtein 2 | ------------------------------------------------------- 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.levenshtein 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | PAPER = 8 | BUILDDIR = _build 9 | 10 | # User-friendly check for sphinx-build 11 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 12 | $(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) 13 | endif 14 | 15 | # Internal variables. 16 | PAPEROPT_a4 = -D latex_paper_size=a4 17 | PAPEROPT_letter = -D latex_paper_size=letter 18 | ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 19 | # the i18n builder cannot share the environment and doctrees with the others 20 | I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 21 | 22 | .PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext 23 | 24 | help: 25 | @echo "Please use \`make ' where is one of" 26 | @echo " html to make standalone HTML files" 27 | @echo " dirhtml to make HTML files named index.html in directories" 28 | @echo " singlehtml to make a single large HTML file" 29 | @echo " pickle to make pickle files" 30 | @echo " json to make JSON files" 31 | @echo " htmlhelp to make HTML files and a HTML help project" 32 | @echo " qthelp to make HTML files and a qthelp project" 33 | @echo " applehelp to make an Apple Help Book" 34 | @echo " devhelp to make HTML files and a Devhelp project" 35 | @echo " epub to make an epub" 36 | @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" 37 | @echo " latexpdf to make LaTeX files and run them through pdflatex" 38 | @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" 39 | @echo " text to make text files" 40 | @echo " man to make manual pages" 41 | @echo " texinfo to make Texinfo files" 42 | @echo " info to make Texinfo files and run them through makeinfo" 43 | @echo " gettext to make PO message catalogs" 44 | @echo " changes to make an overview of all changed/added/deprecated items" 45 | @echo " xml to make Docutils-native XML files" 46 | @echo " pseudoxml to make pseudoxml-XML files for display purposes" 47 | @echo " linkcheck to check all external links for integrity" 48 | @echo " doctest to run all doctests embedded in the documentation (if enabled)" 49 | @echo " coverage to run coverage check of the documentation (if enabled)" 50 | 51 | clean: 52 | rm -rf $(BUILDDIR)/* 53 | 54 | html: 55 | $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html 56 | @echo 57 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." 
58 | 59 | dirhtml: 60 | $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml 61 | @echo 62 | @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." 63 | 64 | singlehtml: 65 | $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml 66 | @echo 67 | @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." 68 | 69 | pickle: 70 | $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle 71 | @echo 72 | @echo "Build finished; now you can process the pickle files." 73 | 74 | json: 75 | $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json 76 | @echo 77 | @echo "Build finished; now you can process the JSON files." 78 | 79 | htmlhelp: 80 | $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp 81 | @echo 82 | @echo "Build finished; now you can run HTML Help Workshop with the" \ 83 | ".hhp project file in $(BUILDDIR)/htmlhelp." 84 | 85 | qthelp: 86 | $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp 87 | @echo 88 | @echo "Build finished; now you can run "qcollectiongenerator" with the" \ 89 | ".qhcp project file in $(BUILDDIR)/qthelp, like this:" 90 | @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/py_stringmatching.qhcp" 91 | @echo "To view the help file:" 92 | @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/py_stringmatching.qhc" 93 | 94 | applehelp: 95 | $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp 96 | @echo 97 | @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." 98 | @echo "N.B. You won't be able to view it unless you put it in" \ 99 | "~/Library/Documentation/Help or install it in your application" \ 100 | "bundle." 101 | 102 | devhelp: 103 | $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp 104 | @echo 105 | @echo "Build finished." 106 | @echo "To view the help file:" 107 | @echo "# mkdir -p $$HOME/.local/share/devhelp/py_stringmatching" 108 | @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/py_stringmatching" 109 | @echo "# devhelp" 110 | 111 | epub: 112 | $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub 113 | @echo 114 | @echo "Build finished. The epub file is in $(BUILDDIR)/epub." 115 | 116 | latex: 117 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 118 | @echo 119 | @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." 120 | @echo "Run \`make' in that directory to run these through (pdf)latex" \ 121 | "(use \`make latexpdf' here to do that automatically)." 122 | 123 | latexpdf: 124 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 125 | @echo "Running LaTeX files through pdflatex..." 126 | $(MAKE) -C $(BUILDDIR)/latex all-pdf 127 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 128 | 129 | latexpdfja: 130 | $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex 131 | @echo "Running LaTeX files through platex and dvipdfmx..." 132 | $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja 133 | @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." 134 | 135 | text: 136 | $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text 137 | @echo 138 | @echo "Build finished. The text files are in $(BUILDDIR)/text." 139 | 140 | man: 141 | $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man 142 | @echo 143 | @echo "Build finished. The manual pages are in $(BUILDDIR)/man." 144 | 145 | texinfo: 146 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 147 | @echo 148 | @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." 
149 | @echo "Run \`make' in that directory to run these through makeinfo" \ 150 | "(use \`make info' here to do that automatically)." 151 | 152 | info: 153 | $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo 154 | @echo "Running Texinfo files through makeinfo..." 155 | make -C $(BUILDDIR)/texinfo info 156 | @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." 157 | 158 | gettext: 159 | $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale 160 | @echo 161 | @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." 162 | 163 | changes: 164 | $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes 165 | @echo 166 | @echo "The overview file is in $(BUILDDIR)/changes." 167 | 168 | linkcheck: 169 | $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck 170 | @echo 171 | @echo "Link check complete; look for any errors in the above output " \ 172 | "or in $(BUILDDIR)/linkcheck/output.txt." 173 | 174 | doctest: 175 | $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest 176 | @echo "Testing of doctests in the sources finished, look at the " \ 177 | "results in $(BUILDDIR)/doctest/output.txt." 178 | 179 | coverage: 180 | $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage 181 | @echo "Testing of coverage in the sources finished, look at the " \ 182 | "results in $(BUILDDIR)/coverage/python.txt." 183 | 184 | xml: 185 | $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml 186 | @echo 187 | @echo "Build finished. The XML files are in $(BUILDDIR)/xml." 188 | 189 | pseudoxml: 190 | $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml 191 | @echo 192 | @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." 193 | -------------------------------------------------------------------------------- /docs/MongeElkan.rst: -------------------------------------------------------------------------------- 1 | Monge Elkan 2 | ------------------------------------------------------- 3 | 4 | .. autoclass:: py_stringmatching.similarity_measure.monge_elkan.MongeElkan(sim_func=jaro_winkler_function) 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/NeedlemanWunsch.rst: -------------------------------------------------------------------------------- 1 | Needleman Wunsch 2 | ------------------------------------------------------------ 3 | 4 | .. autoclass:: py_stringmatching.similarity_measure.needleman_wunsch.NeedlemanWunsch(gap_cost=1.0, sim_func=identity_function) 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/OverlapCoefficient.rst: -------------------------------------------------------------------------------- 1 | Overlap Coefficient 2 | --------------------------------------------------------------- 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.overlap_coefficient 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/PartialRatio.rst: -------------------------------------------------------------------------------- 1 | Partial Ratio 2 | ------------------------------------------------------------ 3 | 4 | .. 
automodule:: py_stringmatching.similarity_measure.partial_ratio 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/PartialTokenSort.rst: -------------------------------------------------------------------------------- 1 | Partial Token Sort 2 | ------------------------------------------------------------ 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.partial_token_sort 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/QgramTokenizer.rst: -------------------------------------------------------------------------------- 1 | Qgram Tokenizer 2 | -------------------------------------------------- 3 | 4 | .. automodule:: py_stringmatching.tokenizer.qgram_tokenizer 5 | :members: 6 | :inherited-members: 7 | :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__ 8 | 9 | -------------------------------------------------------------------------------- /docs/Ratio.rst: -------------------------------------------------------------------------------- 1 | Ratio 2 | ------------------------------------------------------------ 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.ratio 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/SimilarityMeasure.rst: -------------------------------------------------------------------------------- 1 | =================== 2 | Similarity Measures 3 | =================== 4 | 5 | .. toctree:: 6 | :maxdepth: 2 7 | 8 | Affine 9 | BagDistance 10 | Cosine 11 | Dice 12 | Editex 13 | GeneralizedJaccard 14 | HammingDistance 15 | Jaccard 16 | Jaro 17 | JaroWinkler 18 | Levenshtein 19 | MongeElkan 20 | NeedlemanWunsch 21 | OverlapCoefficient 22 | PartialRatio 23 | PartialTokenSort 24 | Ratio 25 | SmithWaterman 26 | SoftTfIdf 27 | Soundex 28 | TfIdf 29 | TokenSort 30 | TverskyIndex 31 | -------------------------------------------------------------------------------- /docs/SmithWaterman.rst: -------------------------------------------------------------------------------- 1 | Smith Waterman 2 | ---------------------------------------------------------- 3 | 4 | .. autoclass:: py_stringmatching.similarity_measure.smith_waterman.SmithWaterman(gap_cost=1.0, sim_func=identity_function) 5 | :members: 6 | 7 | -------------------------------------------------------------------------------- /docs/SoftTfIdf.rst: -------------------------------------------------------------------------------- 1 | Soft TF/IDF 2 | ------------------------------------------------------ 3 | 4 | .. autoclass:: py_stringmatching.similarity_measure.soft_tfidf.SoftTfIdf(corpus_list=None, sim_func=jaro_function, threshold=0.5) 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/Soundex.rst: -------------------------------------------------------------------------------- 1 | Soundex 2 | ------------------------------------------------------------ 3 | 4 | .. automodule:: py_stringmatching.similarity_measure.soundex 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/TfIdf.rst: -------------------------------------------------------------------------------- 1 | TF/IDF 2 | ------------------------------------------------- 3 | 4 | .. 
automodule:: py_stringmatching.similarity_measure.tfidf
5 |     :members:
6 | 
--------------------------------------------------------------------------------
/docs/TokenSort.rst:
--------------------------------------------------------------------------------
1 | Token Sort
2 | ------------------------------------------------------------
3 | 
4 | .. automodule:: py_stringmatching.similarity_measure.token_sort
5 |     :members:
6 | 
--------------------------------------------------------------------------------
/docs/Tokenizer.rst:
--------------------------------------------------------------------------------
1 | ===================
2 | Tokenizers
3 | ===================
4 | 
5 | .. toctree::
6 |     :maxdepth: 2
7 | 
8 |     AlphabeticTokenizer
9 |     AlphanumericTokenizer
10 |     DelimiterTokenizer
11 |     QgramTokenizer
12 |     WhitespaceTokenizer
13 | 
14 | 
--------------------------------------------------------------------------------
/docs/TverskyIndex.rst:
--------------------------------------------------------------------------------
1 | Tversky Index
2 | ------------------------------------------------------------
3 | 
4 | .. automodule:: py_stringmatching.similarity_measure.tversky_index
5 |     :members:
6 | 
--------------------------------------------------------------------------------
/docs/WhatIsNew.rst:
--------------------------------------------------------------------------------
1 | What is New?
2 | ============
3 | 
4 | Compared to Version 0.4.2, the following items are new:
5 | 
6 | * Dropped support for Python 3.5, added support for Python 3.9.
--------------------------------------------------------------------------------
/docs/WhitespaceTokenizer.rst:
--------------------------------------------------------------------------------
1 | Whitespace Tokenizer
2 | -------------------------------------------------------
3 | 
4 | .. automodule:: py_stringmatching.tokenizer.whitespace_tokenizer
5 |     :members:
6 |     :inherited-members:
7 |     :exclude-members: __delattr__, __format__, __getattribute__, __hash__, __reduce__, __reduce_ex__, __repr__, __setattr__, __sizeof__, __str__
8 | 
9 | 
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | User Manual for py_stringmatching
2 | =================================
3 | 
4 | This document explains how to install, use, and contribute to the package.
5 | 
6 | Contents
7 | ========
8 | 
9 | .. toctree::
10 |     :maxdepth: 2
11 | 
12 |     WhatIsNew
13 |     Installation
14 |     Tutorial
15 |     Tokenizer
16 |     SimilarityMeasure
17 |     Benchmark
18 |     Contributing
19 | 
20 | Indices and tables
21 | ==================
22 | 
23 | * :ref:`genindex`
24 | * :ref:`modindex`
25 | * :ref:`search`
26 | 
27 | 
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 | 
3 | REM Command file for Sphinx documentation
4 | 
5 | if "%SPHINXBUILD%" == "" (
6 | 	set SPHINXBUILD=sphinx-build
7 | )
8 | set BUILDDIR=_build
9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% .
10 | set I18NSPHINXOPTS=%SPHINXOPTS% .
11 | if NOT "%PAPER%" == "" (
12 | 	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
13 | 	set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS%
14 | )
15 | 
16 | if "%1" == "" goto help
17 | 
18 | if "%1" == "help" (
19 | 	:help
20 | 	echo.Please use `make ^<target^>` where ^<target^> is one of
21 | 	echo.
html       to make standalone HTML files
22 | 	echo.  dirhtml    to make HTML files named index.html in directories
23 | 	echo.  singlehtml to make a single large HTML file
24 | 	echo.  pickle     to make pickle files
25 | 	echo.  json       to make JSON files
26 | 	echo.  htmlhelp   to make HTML files and an HTML help project
27 | 	echo.  qthelp     to make HTML files and a qthelp project
28 | 	echo.  devhelp    to make HTML files and a Devhelp project
29 | 	echo.  epub       to make an epub
30 | 	echo.  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter
31 | 	echo.  text       to make text files
32 | 	echo.  man        to make manual pages
33 | 	echo.  texinfo    to make Texinfo files
34 | 	echo.  gettext    to make PO message catalogs
35 | 	echo.  changes    to make an overview of all changed/added/deprecated items
36 | 	echo.  xml        to make Docutils-native XML files
37 | 	echo.  pseudoxml  to make pseudoxml-XML files for display purposes
38 | 	echo.  linkcheck  to check all external links for integrity
39 | 	echo.  doctest    to run all doctests embedded in the documentation if enabled
40 | 	echo.  coverage   to run coverage check of the documentation if enabled
41 | 	goto end
42 | )
43 | 
44 | if "%1" == "clean" (
45 | 	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
46 | 	del /q /s %BUILDDIR%\*
47 | 	goto end
48 | )
49 | 
50 | 
51 | REM Check if sphinx-build is available and fallback to Python version if any
52 | %SPHINXBUILD% 2> nul
53 | if errorlevel 9009 goto sphinx_python
54 | goto sphinx_ok
55 | 
56 | :sphinx_python
57 | 
58 | set SPHINXBUILD=python -m sphinx.__init__
59 | %SPHINXBUILD% 2> nul
60 | if errorlevel 9009 (
61 | 	echo.
62 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
63 | 	echo.installed, then set the SPHINXBUILD environment variable to point
64 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
65 | 	echo.may add the Sphinx directory to PATH.
66 | 	echo.
67 | 	echo.If you don't have Sphinx installed, grab it from
68 | 	echo.http://sphinx-doc.org/
69 | 	exit /b 1
70 | )
71 | 
72 | :sphinx_ok
73 | 
74 | 
75 | if "%1" == "html" (
76 | 	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
77 | 	if errorlevel 1 exit /b 1
78 | 	echo.
79 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
80 | 	goto end
81 | )
82 | 
83 | if "%1" == "dirhtml" (
84 | 	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
85 | 	if errorlevel 1 exit /b 1
86 | 	echo.
87 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
88 | 	goto end
89 | )
90 | 
91 | if "%1" == "singlehtml" (
92 | 	%SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml
93 | 	if errorlevel 1 exit /b 1
94 | 	echo.
95 | 	echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml.
96 | 	goto end
97 | )
98 | 
99 | if "%1" == "pickle" (
100 | 	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
101 | 	if errorlevel 1 exit /b 1
102 | 	echo.
103 | 	echo.Build finished; now you can process the pickle files.
104 | 	goto end
105 | )
106 | 
107 | if "%1" == "json" (
108 | 	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
109 | 	if errorlevel 1 exit /b 1
110 | 	echo.
111 | 	echo.Build finished; now you can process the JSON files.
112 | 	goto end
113 | )
114 | 
115 | if "%1" == "htmlhelp" (
116 | 	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
117 | 	if errorlevel 1 exit /b 1
118 | 	echo.
119 | 	echo.Build finished; now you can run HTML Help Workshop with the ^
120 | .hhp project file in %BUILDDIR%/htmlhelp.
121 | 	goto end
122 | )
123 | 
124 | if "%1" == "qthelp" (
125 | 	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
126 | 	if errorlevel 1 exit /b 1
127 | 	echo.
128 | 	echo.Build finished; now you can run "qcollectiongenerator" with the ^
129 | .qhcp project file in %BUILDDIR%/qthelp, like this:
130 | 	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\py_stringmatching.qhcp
131 | 	echo.To view the help file:
132 | 	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\py_stringmatching.qhc
133 | 	goto end
134 | )
135 | 
136 | if "%1" == "devhelp" (
137 | 	%SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp
138 | 	if errorlevel 1 exit /b 1
139 | 	echo.
140 | 	echo.Build finished.
141 | 	goto end
142 | )
143 | 
144 | if "%1" == "epub" (
145 | 	%SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub
146 | 	if errorlevel 1 exit /b 1
147 | 	echo.
148 | 	echo.Build finished. The epub file is in %BUILDDIR%/epub.
149 | 	goto end
150 | )
151 | 
152 | if "%1" == "latex" (
153 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
154 | 	if errorlevel 1 exit /b 1
155 | 	echo.
156 | 	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
157 | 	goto end
158 | )
159 | 
160 | if "%1" == "latexpdf" (
161 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
162 | 	cd %BUILDDIR%/latex
163 | 	make all-pdf
164 | 	cd %~dp0
165 | 	echo.
166 | 	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
167 | 	goto end
168 | )
169 | 
170 | if "%1" == "latexpdfja" (
171 | 	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
172 | 	cd %BUILDDIR%/latex
173 | 	make all-pdf-ja
174 | 	cd %~dp0
175 | 	echo.
176 | 	echo.Build finished; the PDF files are in %BUILDDIR%/latex.
177 | 	goto end
178 | )
179 | 
180 | if "%1" == "text" (
181 | 	%SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text
182 | 	if errorlevel 1 exit /b 1
183 | 	echo.
184 | 	echo.Build finished. The text files are in %BUILDDIR%/text.
185 | 	goto end
186 | )
187 | 
188 | if "%1" == "man" (
189 | 	%SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man
190 | 	if errorlevel 1 exit /b 1
191 | 	echo.
192 | 	echo.Build finished. The manual pages are in %BUILDDIR%/man.
193 | 	goto end
194 | )
195 | 
196 | if "%1" == "texinfo" (
197 | 	%SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo
198 | 	if errorlevel 1 exit /b 1
199 | 	echo.
200 | 	echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo.
201 | 	goto end
202 | )
203 | 
204 | if "%1" == "gettext" (
205 | 	%SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale
206 | 	if errorlevel 1 exit /b 1
207 | 	echo.
208 | 	echo.Build finished. The message catalogs are in %BUILDDIR%/locale.
209 | 	goto end
210 | )
211 | 
212 | if "%1" == "changes" (
213 | 	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
214 | 	if errorlevel 1 exit /b 1
215 | 	echo.
216 | 	echo.The overview file is in %BUILDDIR%/changes.
217 | 	goto end
218 | )
219 | 
220 | if "%1" == "linkcheck" (
221 | 	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
222 | 	if errorlevel 1 exit /b 1
223 | 	echo.
224 | 	echo.Link check complete; look for any errors in the above output ^
225 | or in %BUILDDIR%/linkcheck/output.txt.
226 | 	goto end
227 | )
228 | 
229 | if "%1" == "doctest" (
230 | 	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
231 | 	if errorlevel 1 exit /b 1
232 | 	echo.
233 | 	echo.Testing of doctests in the sources finished, look at the ^
234 | results in %BUILDDIR%/doctest/output.txt.
235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | -------------------------------------------------------------------------------- /py_stringmatching/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.4.6" 2 | 3 | # Import tokenizers 4 | from py_stringmatching.tokenizer.alphabetic_tokenizer import AlphabeticTokenizer 5 | from py_stringmatching.tokenizer.alphanumeric_tokenizer import AlphanumericTokenizer 6 | from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer 7 | from py_stringmatching.tokenizer.qgram_tokenizer import QgramTokenizer 8 | from py_stringmatching.tokenizer.whitespace_tokenizer import WhitespaceTokenizer 9 | 10 | # Import similarity measures 11 | from py_stringmatching.similarity_measure.affine import Affine 12 | from py_stringmatching.similarity_measure.bag_distance import BagDistance 13 | from py_stringmatching.similarity_measure.cosine import Cosine 14 | from py_stringmatching.similarity_measure.dice import Dice 15 | from py_stringmatching.similarity_measure.editex import Editex 16 | from py_stringmatching.similarity_measure.generalized_jaccard import GeneralizedJaccard 17 | from py_stringmatching.similarity_measure.hamming_distance import HammingDistance 18 | from py_stringmatching.similarity_measure.jaccard import Jaccard 19 | from py_stringmatching.similarity_measure.jaro import Jaro 20 | from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler 21 | from py_stringmatching.similarity_measure.levenshtein import Levenshtein 22 | from py_stringmatching.similarity_measure.monge_elkan import MongeElkan 23 | from py_stringmatching.similarity_measure.needleman_wunsch import NeedlemanWunsch 24 | from py_stringmatching.similarity_measure.overlap_coefficient import OverlapCoefficient 25 | from py_stringmatching.similarity_measure.smith_waterman import SmithWaterman 26 | from py_stringmatching.similarity_measure.soft_tfidf import SoftTfIdf 27 | from py_stringmatching.similarity_measure.soundex import Soundex 28 | from py_stringmatching.similarity_measure.tfidf import TfIdf 29 | from py_stringmatching.similarity_measure.tversky_index import TverskyIndex 30 | from py_stringmatching.similarity_measure.partial_ratio import PartialRatio 31 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_stringmatching/658860f0cad65e1e3da07e7039fe9764d4822272/py_stringmatching/similarity_measure/__init__.py -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/affine.py: 
--------------------------------------------------------------------------------
1 | 
2 | from py_stringmatching import utils
3 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \
4 |     SequenceSimilarityMeasure
5 | from py_stringmatching.similarity_measure.cython.cython_affine import affine
6 | from py_stringmatching.similarity_measure.cython.cython_utils import cython_sim_ident
7 | 
8 | class Affine(SequenceSimilarityMeasure):
9 |     """Returns the affine gap score between two strings.
10 | 
11 |     The affine gap measure is an extension of the Needleman-Wunsch measure that handles longer gaps more
12 |     gracefully. For more information refer to the string matching chapter in the DI book ("Principles of Data Integration").
13 | 
14 |     Args:
15 |         gap_start (float): Cost for the gap at the start (defaults to 1).
16 |         gap_continuation (float): Cost for the gap continuation (defaults to 0.5).
17 |         sim_func (function): Function computing similarity score between two characters, which are represented as strings (defaults
18 |                              to an identity function, which returns 1 if the two characters are the same and returns 0 otherwise).
19 | 
20 |     Attributes:
21 |         gap_start (float): An attribute to store the gap cost at the start.
22 |         gap_continuation (float): An attribute to store the gap continuation cost.
23 |         sim_func (function): An attribute to store the similarity function.
24 |     """
25 | 
26 |     def __init__(self, gap_start=1, gap_continuation=0.5, sim_func=cython_sim_ident):
27 |         self.gap_start = gap_start
28 |         self.gap_continuation = gap_continuation
29 |         self.sim_func = sim_func
30 |         super(Affine, self).__init__()
31 | 
32 |     def get_raw_score(self, string1, string2):
33 |         """Computes the affine gap score between two strings. This score can be outside the range [0,1].
34 | 
35 |         Args:
36 |             string1,string2 (str) : Input strings.
37 | 
38 |         Returns:
39 |             Affine gap score between the two input strings (float).
40 | 
41 |         Raises:
42 |             TypeError : If the inputs are not strings or if one of the inputs is None.
43 | 
44 |         Examples:
45 |             >>> aff = Affine()
46 |             >>> aff.get_raw_score('dva', 'deeva')
47 |             1.5
48 |             >>> aff = Affine(gap_start=2, gap_continuation=0.5)
49 |             >>> aff.get_raw_score('dva', 'deeve')
50 |             -0.5
51 |             >>> aff = Affine(gap_continuation=0.2, sim_func=lambda s1, s2: (int(1 if s1 == s2 else 0)))
52 |             >>> aff.get_raw_score('AAAGAATTCA', 'AAATCA')
53 |             4.4
54 |         """
55 |         # input validations
56 |         utils.sim_check_for_none(string1, string2)
57 | 
58 |         # convert input to unicode.
59 |         string1 = utils.convert_to_unicode(string1)
60 |         string2 = utils.convert_to_unicode(string2)
61 | 
62 |         utils.tok_check_for_string_input(string1, string2)
63 | 
64 |         # if one of the strings is empty return 0
65 |         if utils.sim_check_for_empty(string1, string2):
66 |             return 0
67 | 
68 |         return affine(string1, string2, self.gap_start, self.gap_continuation, self.sim_func)
69 | 
70 |     def get_gap_start(self):
71 |         """Get gap start cost.
72 | 
73 |         Returns:
74 |             gap start cost (float).
75 |         """
76 |         return self.gap_start
77 | 
78 |     def get_gap_continuation(self):
79 |         """Get gap continuation cost.
80 | 
81 |         Returns:
82 |             gap continuation cost (float).
83 |         """
84 |         return self.gap_continuation
85 | 
86 |     def get_sim_func(self):
87 |         """Get similarity function.
88 | 
89 |         Returns:
90 |             similarity function (function).
91 |         """
92 |         return self.sim_func
93 | 
94 |     def set_gap_start(self, gap_start):
95 |         """Set gap start cost.
96 | 
97 |         Args:
98 |             gap_start (float): Cost for the gap at the start.
99 |         """
100 |         self.gap_start = gap_start
101 |         return True
102 | 
103 |     def set_gap_continuation(self, gap_continuation):
104 |         """Set gap continuation cost.
105 | 
106 |         Args:
107 |             gap_continuation (float): Cost for the gap continuation.
108 |         """
109 |         self.gap_continuation = gap_continuation
110 |         return True
111 | 
112 |     def set_sim_func(self, sim_func):
113 |         """Set similarity function.
114 | 
115 |         Args:
116 |             sim_func (function): Function computing similarity score between two characters, represented as strings.
117 |         """
118 |         self.sim_func = sim_func
119 |         return True
120 | 
--------------------------------------------------------------------------------
/py_stringmatching/similarity_measure/bag_distance.py:
--------------------------------------------------------------------------------
1 | """Bag distance measure"""
2 | 
3 | import collections
4 | 
5 | from py_stringmatching import utils
6 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \
7 |     SequenceSimilarityMeasure
8 | 
9 | 
10 | class BagDistance(SequenceSimilarityMeasure):
11 |     """Bag distance measure class.
12 |     """
13 |     def __init__(self):
14 |         super(BagDistance, self).__init__()
15 | 
16 |     def get_raw_score(self, string1, string2):
17 |         """
18 |         Computes the bag distance between two strings.
19 | 
20 |         For two strings X and Y, the bag distance is:
21 |         :math:`max( |bag(X)-bag(Y)|, |bag(Y)-bag(X)| )`
22 | 
23 |         Args:
24 |             string1,string2 (str): Input strings
25 | 
26 |         Returns:
27 |             Bag distance (int)
28 | 
29 |         Raises:
30 |             TypeError : If the inputs are not strings
31 | 
32 |         Examples:
33 |             >>> bd = BagDistance()
34 |             >>> bd.get_raw_score('cat', 'hat')
35 |             1
36 |             >>> bd.get_raw_score('Niall', 'Neil')
37 |             2
38 |             >>> bd.get_raw_score('aluminum', 'Catalan')
39 |             5
40 |             >>> bd.get_raw_score('ATCG', 'TAGC')
41 |             0
42 |             >>> bd.get_raw_score('abcde', 'xyz')
43 |             5
44 | 
45 |         References:
46 |             * String Matching with Metric Trees Using an Approximate Distance: http://www-db.disi.unibo.it/research/papers/SPIRE02.pdf
47 |         """
48 |         # input validations
49 |         utils.sim_check_for_none(string1, string2)
50 |         utils.sim_check_for_string_inputs(string1, string2)
51 |         if utils.sim_check_for_exact_match(string1, string2):
52 |             return 0
53 | 
54 |         len_str1 = len(string1)
55 |         len_str2 = len(string2)
56 | 
57 |         if len_str1 == 0:
58 |             return len_str2
59 | 
60 |         if len_str2 == 0:
61 |             return len_str1
62 | 
63 |         bag1 = collections.Counter(string1)
64 |         bag2 = collections.Counter(string2)
65 | 
66 |         size1 = sum((bag1 - bag2).values())
67 |         size2 = sum((bag2 - bag1).values())
68 | 
69 |         # return the max of the two bag (multiset) differences
70 |         return max(size1, size2)
71 | 
72 |     def get_sim_score(self, string1, string2):
73 |         """
74 |         Computes the normalized bag similarity between two strings.
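        For two strings X and Y this is 1 - bag_distance(X, Y) / max(|X|, |Y|), so the score
        always lies in [0, 1]; two empty strings are defined to score 1.0.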
75 | 76 | Args: 77 | string1,string2 (str): Input strings 78 | 79 | Returns: 80 | Normalized bag similarity (float) 81 | 82 | Raises: 83 | TypeError : If the inputs are not strings 84 | 85 | Examples: 86 | >>> bd = BagDistance() 87 | >>> bd.get_sim_score('cat', 'hat') 88 | 0.6666666666666667 89 | >>> bd.get_sim_score('Niall', 'Neil') 90 | 0.6 91 | >>> bd.get_sim_score('aluminum', 'Catalan') 92 | 0.375 93 | >>> bd.get_sim_score('ATCG', 'TAGC') 94 | 1.0 95 | >>> bd.get_sim_score('abcde', 'xyz') 96 | 0.0 97 | 98 | References: 99 | * String Matching with Metric Trees Using an Approximate Distance: http://www-db.disi.unibo.it/research/papers/SPIRE02.pdf 100 | """ 101 | raw_score = self.get_raw_score(string1, string2) 102 | string1_len = len(string1) 103 | string2_len = len(string2) 104 | if string1_len == 0 and string2_len == 0: 105 | return 1.0 106 | return 1 - (raw_score / max(string1_len, string2_len)) 107 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/cosine.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from py_stringmatching import utils 4 | from py_stringmatching.similarity_measure.token_similarity_measure import \ 5 | TokenSimilarityMeasure 6 | 7 | 8 | class Cosine(TokenSimilarityMeasure): 9 | """Computes a variant of cosine measure known as Ochiai coefficient. 10 | 11 | This is not the cosine measure that computes the cosine of the angle between two given vectors. Rather, it computes a variant of cosine measure known as Ochiai coefficient (see the Wikipedia page "Cosine Similarity"). Specifically, for two sets X and Y, this measure computes: 12 | 13 | :math:`cosine(X, Y) = \\frac{|X \\cap Y|}{\\sqrt{|X| \\cdot |Y|}}` 14 | 15 | Note: 16 | * In the case where one of X and Y is an empty set and the other is a non-empty set, we define their cosine score to be 0. 17 | * In the case where both X and Y are empty sets, we define their cosine score to be 1. 18 | """ 19 | 20 | def __init__(self): 21 | super(Cosine, self).__init__() 22 | 23 | def get_raw_score(self, set1, set2): 24 | """Computes the raw cosine score between two sets. 25 | 26 | Args: 27 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 28 | 29 | Returns: 30 | Cosine similarity (float) 31 | 32 | Raises: 33 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 34 | 35 | Examples: 36 | >>> cos = Cosine() 37 | >>> cos.get_raw_score(['data', 'science'], ['data']) 38 | 0.7071067811865475 39 | >>> cos.get_raw_score(['data', 'data', 'science'], ['data', 'management']) 40 | 0.4999999999999999 41 | >>> cos.get_raw_score([], ['data']) 42 | 0.0 43 | 44 | References: 45 | * String similarity joins: An Experimental Evaluation (a paper appearing in the VLDB 2014 Conference). 46 | * Project Flamingo at http://flamingo.ics.uci.edu. 
47 | """ 48 | # input validations 49 | utils.sim_check_for_none(set1, set2) 50 | utils.sim_check_for_list_or_set_inputs(set1, set2) 51 | 52 | # if exact match return 1.0 53 | if utils.sim_check_for_exact_match(set1, set2): 54 | return 1.0 55 | 56 | # if one of the strings is empty return 0 57 | if utils.sim_check_for_empty(set1, set2): 58 | return 0 59 | 60 | if not isinstance(set1, set): 61 | set1 = set(set1) 62 | if not isinstance(set2, set): 63 | set2 = set(set2) 64 | 65 | return float(len(set1 & set2)) / (math.sqrt(float(len(set1))) * 66 | math.sqrt(float(len(set2)))) 67 | 68 | def get_sim_score(self, set1, set2): 69 | """Computes the normalized cosine similarity between two sets. 70 | 71 | Args: 72 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 73 | 74 | Returns: 75 | Normalized cosine similarity (float) 76 | 77 | Raises: 78 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 79 | 80 | Examples: 81 | >>> cos = Cosine() 82 | >>> cos.get_sim_score(['data', 'science'], ['data']) 83 | 0.7071067811865475 84 | >>> cos.get_sim_score(['data', 'data', 'science'], ['data', 'management']) 85 | 0.4999999999999999 86 | >>> cos.get_sim_score([], ['data']) 87 | 0.0 88 | 89 | """ 90 | return self.get_raw_score(set1, set2) 91 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/cython/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anhaidgroup/py_stringmatching/658860f0cad65e1e3da07e7039fe9764d4822272/py_stringmatching/similarity_measure/cython/__init__.py -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/cython/cython_affine.pyx: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | from py_stringmatching.similarity_measure.cython.cython_utils import float_max_two 4 | from py_stringmatching.similarity_measure.cython.cython_utils import float_max_three 5 | 6 | 7 | 8 | def affine(unicode string1, unicode string2, float main_gap_start, float main_gap_continuation, sim_func ): 9 | 10 | cdef float gap_start = - main_gap_start 11 | cdef float gap_continuation = - main_gap_continuation 12 | cdef int len_str1 = len(string1) 13 | cdef int len_str2 = len(string2) 14 | cdef int i=0, j=0 15 | cdef double[:, :] m = np.zeros((len_str1 + 1, len_str2 + 1), dtype=np.double) 16 | cdef double[:, :] x = np.zeros((len_str1 + 1, len_str2 + 1), dtype=np.double) 17 | cdef double[:, :] y = np.zeros((len_str1 + 1, len_str2 + 1), dtype=np.double) 18 | 19 | # DP initialization 20 | for i from 1 <= i < (len_str1+1): 21 | m[i, 0] = -float(np.inf) 22 | x[i, 0] = gap_start + (i-1) * gap_continuation 23 | y[i, 0] = -float(np.inf) 24 | # 25 | # # DP initialization 26 | for j from 1 <= j < (len_str2+1): 27 | m[0, j] = -float(np.inf) 28 | x[0, j] = -float(np.inf) 29 | y[0, j] = gap_start + (j-1) * gap_continuation 30 | 31 | 32 | # affine gap calculation using DP 33 | for i from 1 <= i < (len_str1 + 1): 34 | for j from 1 <= j < (len_str2 + 1): 35 | # best score between x_1....x_i and y_1....y_j 36 | # given that x_i is aligned to y_j 37 | m[i, j] = (sim_func(string1[i-1], string2[j-1]) + float_max_three(m[i-1][j-1], 38 | x[i-1][j-1], y[i-1][j-1])) 39 | # the best score given that x_i is aligned to a gap 40 | x[i, j] = float_max_two((gap_start + m[i-1, j]), (gap_continuation+ x[i-1, j])) 41 | # the 
best score given that y_j is aligned to a gap 42 | y[i, j] = float_max_two((gap_start+ m[i, j-1]), (gap_continuation + y[i, j-1])) 43 | 44 | return float_max_three(m[len_str1, len_str2], x[len_str1, len_str2], y[len_str1, len_str2]) 45 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/cython/cython_jaro.pyx: -------------------------------------------------------------------------------- 1 | 2 | from py_stringmatching.similarity_measure.cython.cython_utils import int_max_two 3 | import numpy as np 4 | cimport numpy as np 5 | 6 | 7 | #Cython functions to compute the Jaro score 8 | def jaro(unicode string1, unicode string2): 9 | """Computes the Jaro score between two strings. 10 | Args: 11 | string1,string2 (str): Input strings. 12 | Returns: 13 | Jaro distance score (float). 14 | """ 15 | 16 | 17 | cdef int len_str1 = len(string1), len_str2 = len(string2) 18 | cdef int max_len = int_max_two(len_str1, len_str2) 19 | cdef int search_range = (max_len // 2) - 1 20 | 21 | if search_range < 0: 22 | search_range = 0 23 | 24 | # populating numpy arrays of length as each string with zeros 25 | cdef int[:] flags_s1 = np.zeros(len_str1, dtype=np.int32) 26 | cdef int[:] flags_s2 = np.zeros(len_str2, dtype=np.int32) 27 | 28 | cdef int common_chars = 0, low = 0, high = 0, i = 0, j = 0 29 | 30 | # Finding the number of common characters in two strings 31 | for i from 0 <= i < len_str1: 32 | low = i - search_range if i > search_range else 0 33 | high = i + search_range if i + search_range < len_str2 else len_str2 - 1 34 | for j from low <= j < (high + 1): 35 | if flags_s2[j] == 0 and string2[j] == string1[i]: 36 | flags_s1[i] = flags_s2[j] = 1 37 | common_chars += 1 38 | break 39 | 40 | if common_chars == 0: 41 | return 0 42 | 43 | cdef int trans_count = 0, k = 0 44 | 45 | # Finding the number of transpositions and Jaro distance 46 | for i from 0 <= i < len_str1: 47 | if flags_s1[i] == 1: 48 | for j from k <= j < len_str2: 49 | if flags_s2[j] == 1: 50 | k = j + 1 51 | break 52 | if string1[i] != string2[j]: 53 | trans_count += 1 54 | trans_count /= 2 55 | cdef float score = (float(common_chars) / len_str1 + float(common_chars) / len_str2 + 56 | (float(common_chars) - trans_count) / float(common_chars)) / 3 57 | return score 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/cython/cython_jaro_winkler.pyx: -------------------------------------------------------------------------------- 1 | 2 | from py_stringmatching.similarity_measure.cython.cython_utils import int_min_two 3 | from py_stringmatching.similarity_measure.cython.cython_jaro import jaro 4 | 5 | 6 | def jaro_winkler(unicode string1, unicode string2, float prefix_weight): 7 | """Function to find the Jaro Winkler distance between two strings. 8 | Args: 9 | string1,string2 (unicode), prefix_weight (float): Input strings and prefix weight. 
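    The returned score is jaro(string1, string2) + l * prefix_weight * (1 - jaro(string1, string2)),
    where l is the length of the common prefix of the two strings, capped at 4.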
10 | Returns: 11 | Jaro Winkler distance score (float) 12 | """ 13 | cdef int i = 0 14 | cdef float jw_score = jaro(string1, string2) 15 | cdef int min_len = int_min_two(len(string1), len(string2)) 16 | cdef int j = int_min_two(min_len, 4) 17 | 18 | #Finding the Jaro Winkler distance between two strings 19 | while i < j and string1[i] == string2[i]: 20 | i += 1 21 | if i != 0: 22 | jw_score += i * prefix_weight * (1 - jw_score) 23 | 24 | return jw_score 25 | 26 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/cython/cython_levenshtein.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False 2 | 3 | import cython 4 | import numpy as np 5 | cimport numpy as cnp 6 | from py_stringmatching.similarity_measure.cython.cython_utils import int_min_three 7 | from numpy import int32 8 | from numpy cimport int32_t 9 | 10 | DTYPE = int 11 | ctypedef cnp.int_t DTYPE_t 12 | 13 | @cython.boundscheck(False) 14 | @cython.wraparound(False) 15 | 16 | 17 | 18 | def levenshtein(unicode string1, unicode string2): 19 | 20 | cdef int len_str1 = len(string1) 21 | cdef int len_str2 = len(string2) 22 | 23 | cdef int ins_cost = 1 24 | cdef int del_cost = 1 25 | cdef int sub_cost = 1 26 | cdef int trans_cost = 1 27 | 28 | cdef int i = 0 29 | cdef int j = 0 30 | 31 | if len_str1 == 0: 32 | return len_str2 * ins_cost 33 | 34 | if len_str2 == 0: 35 | return len_str1 * del_cost 36 | 37 | cdef int[:,:] d_mat = np.zeros((len_str1 + 1, len_str2 + 1), dtype=np.int32) 38 | 39 | for i from 0 <= i < (len_str1 + 1): 40 | d_mat[i, 0] = i * del_cost 41 | 42 | for j from 0 <= j < (len_str2 + 1): 43 | d_mat[0, j] = j * ins_cost 44 | 45 | cdef unsigned char lchar = 0 46 | cdef unsigned char rchar = 0 47 | 48 | for i from 0 <= i < (len_str1): 49 | lchar = string1[i] 50 | for j from 0 <= j < (len_str2): 51 | rchar = string2[j] 52 | 53 | d_mat[i+1,j+1] = int_min_three(d_mat[i + 1, j] + ins_cost, d_mat[i, j + 1] + del_cost, d_mat[i, j] 54 | + (sub_cost if lchar != rchar else 0)) 55 | return d_mat[len_str1, len_str2] 56 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/cython/cython_needleman_wunsch.pyx: -------------------------------------------------------------------------------- 1 | import cython 2 | import numpy as np 3 | cimport numpy as np 4 | 5 | @cython.boundscheck(False) 6 | @cython.wraparound(False) 7 | 8 | def needleman_wunsch(unicode string1, unicode string2, float gap_cost, 9 | sim_score): 10 | """ Computes Needleman-Wunsch measure raw score. 
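    The score is computed by dynamic programming over a (len(string1) + 1) x (len(string2) + 1) matrix, where
    dist_mat[i, j] = max(dist_mat[i-1, j-1] + sim_score(string1[i-1], string2[j-1]),
                         dist_mat[i-1, j] - gap_cost,
                         dist_mat[i, j-1] - gap_cost)
    and the first row and column are initialized to -(j * gap_cost) and -(i * gap_cost) respectively.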
11 |     Args:
12 |         string1, string2 (unicode): Input unicode strings
13 |         gap_cost (float): Cost of gap
14 |         sim_score (sim function): Similarity function given by the user; if none is given, the default identity function is used
15 |     Returns:
16 |         Needleman-Wunsch similarity score (float)
17 |     """
18 | 
19 |     cdef int i = 0, j = 0
20 |     cdef double match = 0.0, delete = 0.0, insert = 0.0
21 |     cdef double sim_func_score = 0.0
22 |     cdef int len_s1 = len(string1), len_s2 = len(string2)
23 |     cdef double[:,:] dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=float)
24 | 
25 |     # DP initialization of the first column
26 |     for i from 0 <= i < (len_s1 + 1):
27 |         dist_mat[i, 0] = -(i * gap_cost)
28 | 
29 |     # DP initialization of the first row
30 |     for j from 0 <= j < (len_s2 + 1):
31 |         dist_mat[0, j] = -(j * gap_cost)
32 | 
33 | 
34 |     # Needleman-Wunsch DP calculation
35 |     for i from 1 <= i < (len_s1 + 1):
36 |         for j from 1 <= j < (len_s2 + 1):
37 |             sim_func_score = sim_score(string1[i - 1], string2[j - 1])
38 |             match = dist_mat[i - 1, j - 1] + sim_func_score
39 |             delete = dist_mat[i - 1, j] - gap_cost
40 |             insert = dist_mat[i, j - 1] - gap_cost
41 |             dist_mat[i, j] = max(match, delete, insert)
42 | 
43 |     return dist_mat[len_s1, len_s2]
--------------------------------------------------------------------------------
/py_stringmatching/similarity_measure/cython/cython_smith_waterman.pyx:
--------------------------------------------------------------------------------
1 | import cython
2 | import numpy as np
3 | cimport numpy as np
4 | 
5 | @cython.boundscheck(False)
6 | @cython.wraparound(False)
7 | 
8 | 
9 | def smith_waterman(unicode string1, unicode string2, float gap_cost, \
10 |                    sim_func):
11 | 
12 |     cdef int i = 0, j = 0
13 |     cdef double match = 0.0, delete = 0.0, insert = 0.0
14 |     cdef double sim_score = 0.0, max_value = 0.0
15 |     cdef int len_s1 = len(string1), len_s2 = len(string2)
16 |     cdef double[:,:] dist_mat = np.zeros((len(string1) + 1, len(string2) + 1), dtype=float)
17 | 
18 | 
19 |     # Smith Waterman DP calculations
20 |     for i from 1 <= i < (len_s1 + 1):
21 |         for j from 1 <= j < (len_s2 + 1):
22 | 
23 |             sim_func_score = sim_func(string1[i - 1], string2[j - 1])
24 |             match = dist_mat[i - 1, j - 1] + sim_func_score
25 |             delete = dist_mat[i - 1, j] - gap_cost
26 |             insert = dist_mat[i, j - 1] - gap_cost
27 |             dist_mat[i, j] = max(0, match, delete, insert)
28 |             max_value = max(max_value, dist_mat[i, j])
29 | 
30 |     return max_value
--------------------------------------------------------------------------------
/py_stringmatching/similarity_measure/cython/cython_utils.pyx:
--------------------------------------------------------------------------------
1 | import cython
2 | 
3 | 
4 | def cython_sim_ident(unicode char1, unicode char2):
5 |     return 1 if char1 == char2 else 0
6 | 
7 | 
8 | def int_max_two(int a, int b):
9 |     """Finds the maximum integer of the given two integers.
10 |     Args:
11 |         integer1, integer2 (int): Input integers.
12 |     Returns:
13 |         Maximum integer (int).
14 |     """
15 |     if a > b : return a
16 |     else: return b
17 | 
18 | 
19 | def int_max_three(int a, int b, int c):
20 |     """Finds the maximum integer of the given three integers.
21 |     Args:
22 |         integer1, integer2, integer3 (int): Input integers.
23 |     Returns:
24 |         Maximum integer (int).
25 |     """
26 |     cdef int max_int = a
27 |     if b > max_int:
28 |         max_int = b
29 |     if c > max_int:
30 |         max_int = c
31 |     return max_int
32 | 
33 | 
34 | def float_max_two(float a, float b):
35 |     """Finds the maximum float of the given two floats.
36 |     Args:
37 |         float1, float2 (float): Input floats.
38 |     Returns:
39 |         Maximum float (float).
40 |     """
41 |     if a > b : return a
42 |     else: return b
43 | 
44 | 
45 | def float_max_three(float a, float b, float c):
46 |     """Finds the maximum float of the given three floats.
47 |     Args:
48 |         float1, float2, float3 (float): Input floats.
49 |     Returns:
50 |         Maximum float (float).
51 |     """
52 |     cdef float max_float = a
53 |     if b > max_float:
54 |         max_float = b
55 |     if c > max_float:
56 |         max_float = c
57 |     return max_float
58 | 
59 | 
60 | def int_min_two(int a, int b):
61 |     """Finds the minimum integer of the given two integers.
62 |     Args:
63 |         integer a, integer b (int): Input integers.
64 |     Returns:
65 |         Minimum integer (int).
66 |     """
67 |     if a > b : return b
68 |     else: return a
69 | 
70 | 
71 | def int_min_three(int a, int b, int c):
72 |     """Finds the minimum integer of the given three integers.
73 |     Args:
74 |         integer a, integer b, integer c (int): Input integers.
75 |     Returns:
76 |         Minimum integer (int).
77 |     """
78 |     cdef int min_int = a
79 |     if b < min_int:
80 |         min_int = b
81 |     if c < min_int:
82 |         min_int = c
83 |     return min_int
84 | 
--------------------------------------------------------------------------------
/py_stringmatching/similarity_measure/dice.py:
--------------------------------------------------------------------------------
1 | from py_stringmatching import utils
2 | from py_stringmatching.similarity_measure.token_similarity_measure import \
3 |     TokenSimilarityMeasure
4 | 
5 | 
6 | class Dice(TokenSimilarityMeasure):
7 |     """Returns the Dice score between two strings.
8 | 
9 |     The Dice similarity score is defined as twice the shared information (intersection) divided by the sum of cardinalities.
10 |     For two sets X and Y, the Dice similarity score is:
11 | 
12 |     :math:`dice(X, Y) = \\frac{2 * |X \\cap Y|}{|X| + |Y|}`
13 | 
14 |     Note:
15 |         In the case where both X and Y are empty sets, we define their Dice score to be 1.
16 |     """
17 | 
18 |     def __init__(self):
19 |         super(Dice, self).__init__()
20 | 
21 |     def get_raw_score(self, set1, set2):
22 |         """Computes the raw Dice score between two sets. This score is already in [0,1].
23 | 
24 |         Args:
25 |             set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.
26 | 
27 |         Returns:
28 |             Dice similarity score (float).
29 | 
30 |         Raises:
31 |             TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
32 | 
33 |         Examples:
34 |             >>> dice = Dice()
35 |             >>> dice.get_raw_score(['data', 'science'], ['data'])
36 |             0.6666666666666666
37 |             >>> dice.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
38 |             0.5454545454545454
39 |             >>> dice.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
40 |             0.5
41 | 
42 |         References:
43 |             * Wikipedia article : https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Dice%27s_coefficient
44 |             * SimMetrics library.
45 |         """
46 | 
47 |         # input validations
48 |         utils.sim_check_for_none(set1, set2)
49 |         utils.sim_check_for_list_or_set_inputs(set1, set2)
50 | 
51 |         # if exact match return 1.0
52 |         if utils.sim_check_for_exact_match(set1, set2):
53 |             return 1.0
54 | 
55 |         # if one of the strings is empty return 0
56 |         if utils.sim_check_for_empty(set1, set2):
57 |             return 0
58 | 
59 |         if not isinstance(set1, set):
60 |             set1 = set(set1)
61 |         if not isinstance(set2, set):
62 |             set2 = set(set2)
63 | 
64 |         return 2.0 * float(len(set1 & set2)) / float(len(set1) + len(set2))
65 | 
66 |     def get_sim_score(self, set1, set2):
67 |         """Computes the normalized dice similarity score between two sets. Simply call get_raw_score.
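        The raw Dice score already lies in [0, 1], so it is returned unchanged.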
68 | 
69 |         Args:
70 |             set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.
71 | 
72 |         Returns:
73 |             Normalized dice similarity (float).
74 | 
75 |         Raises:
76 |             TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
77 | 
78 |         Examples:
79 |             >>> dice = Dice()
80 |             >>> dice.get_sim_score(['data', 'science'], ['data'])
81 |             0.6666666666666666
82 |             >>> dice.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
83 |             0.5454545454545454
84 |             >>> dice.get_sim_score(['data', 'management'], ['data', 'data', 'science'])
85 |             0.5
86 | 
87 |         """
88 |         return self.get_raw_score(set1, set2)
89 | 
--------------------------------------------------------------------------------
/py_stringmatching/similarity_measure/generalized_jaccard.py:
--------------------------------------------------------------------------------
1 | """Generalized jaccard similarity measure"""
2 | 
3 | from py_stringmatching import utils
4 | from py_stringmatching.similarity_measure.jaro import Jaro
5 | from py_stringmatching.similarity_measure.hybrid_similarity_measure import \
6 |     HybridSimilarityMeasure
7 | 
8 | 
9 | class GeneralizedJaccard(HybridSimilarityMeasure):
10 |     """Generalized jaccard similarity measure class.
11 | 
12 |     Parameters:
13 |         sim_func (function): similarity function. This should return a similarity score between two strings (optional),
14 |                              default is the Jaro similarity measure
15 |         threshold (float): Threshold value (defaults to 0.5). If the similarity of a token pair exceeds the threshold,
16 |                            then the token pair is considered a match.
17 |     """
18 |     def __init__(self, sim_func=Jaro().get_raw_score, threshold=0.5):
19 |         self.sim_func = sim_func
20 |         self.threshold = threshold
21 |         super(GeneralizedJaccard, self).__init__()
22 | 
23 |     def get_raw_score(self, set1, set2):
24 |         """
25 |         Computes the Generalized Jaccard measure between two sets.
26 | 
27 |         This similarity measure is a softened version of the Jaccard measure. The Jaccard measure is
28 |         a promising candidate for tokens which exactly match across the sets. However, in practice tokens
29 |         are often misspelled, such as energy vs. eneryg. The generalized Jaccard measure will enable
30 |         matching in such cases.
31 | 
32 |         Args:
33 |             set1,set2 (set or list): Input sets (or lists) of strings. Input lists are converted to sets.
34 | 
35 |         Returns:
36 |             Generalized Jaccard similarity (float)
37 | 
38 |         Raises:
39 |             TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
40 |             ValueError : If the similarity measure doesn't return values in the range [0,1]
41 | 
42 |         Examples:
43 |             >>> gj = GeneralizedJaccard()
44 |             >>> gj.get_raw_score(['data', 'science'], ['data'])
45 |             0.5
46 |             >>> gj.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
47 |             0.3333333333333333
48 |             >>> gj.get_raw_score(['Niall'], ['Neal', 'Njall'])
49 |             0.43333333333333335
50 |             >>> gj = GeneralizedJaccard(sim_func=JaroWinkler().get_raw_score, threshold=0.8)
51 |             >>> gj.get_raw_score(['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'],
52 |                                  ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego'])
53 |             0.45810185185185187
54 |         """
55 |         # input validations
56 |         utils.sim_check_for_none(set1, set2)
57 |         utils.sim_check_for_list_or_set_inputs(set1, set2)
58 | 
59 |         # if exact match return 1.0
60 |         if utils.sim_check_for_exact_match(set1, set2):
61 |             return 1.0
62 | 
63 |         # if one of the strings is empty return 0
64 |         if utils.sim_check_for_empty(set1, set2):
65 |             return 0
66 | 
67 |         if not isinstance(set1, set):
68 |             set1 = set(set1)
69 |         if not isinstance(set2, set):
70 |             set2 = set(set2)
71 | 
72 |         set1_x = set()
73 |         set2_y = set()
74 |         match_score = 0.0
75 |         match_count = 0
76 |         list_matches = []
77 |         for element in set1:
78 |             for item in set2:
79 |                 score = self.sim_func(element, item)
80 |                 if score > 1 or score < 0:
81 |                     raise ValueError('Similarity measure should' + \
82 |                                      ' return value in the range [0,1]')
83 |                 if score > self.threshold:
84 |                     list_matches.append((element, item, score))
85 | 
86 |         # position of first string, second string and sim score in tuple
87 |         first_string_pos = 0
88 |         second_string_pos = 1
89 |         sim_score_pos = 2
90 | 
91 |         # sort all the matched pairs by their similarity score
92 |         list_matches.sort(key=lambda x: x[sim_score_pos], reverse=True)
93 | 
94 |         # select pairs in decreasing order of their similarity score,
95 |         # do not reselect the same element from either set.
96 |         for element in list_matches:
97 |             if (element[first_string_pos] not in set1_x and
98 |                 element[second_string_pos] not in set2_y):
99 |                 set1_x.add(element[first_string_pos])
100 |                 set2_y.add(element[second_string_pos])
101 |                 match_score += element[sim_score_pos]
102 |                 match_count += 1
103 | 
104 |         return float(match_score) / float(len(set1) + len(set2) - match_count)
105 | 
106 |     def get_sim_score(self, set1, set2):
107 |         """
108 |         Computes the normalized Generalized Jaccard similarity between two sets.
109 | 
110 |         Args:
111 |             set1,set2 (set or list): Input sets (or lists) of strings. Input lists are converted to sets.
112 | 
113 |         Returns:
114 |             Normalized Generalized Jaccard similarity (float)
115 | 
116 |         Raises:
117 |             TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
118 | ValueError : If the similarity measure doesn't return values in the range [0,1] 119 | 120 | Examples: 121 | >>> gj = GeneralizedJaccard() 122 | >>> gj.get_sim_score(['data', 'science'], ['data']) 123 | 0.5 124 | >>> gj.get_sim_score(['data', 'management'], ['data', 'data', 'science']) 125 | 0.3333333333333333 126 | >>> gj.get_sim_score(['Niall'], ['Neal', 'Njall']) 127 | 0.43333333333333335 128 | >>> gj = GeneralizedJaccard(sim_func=JaroWinkler().get_raw_score, threshold=0.8) 129 | >>> gj.get_sim_score(['Comp', 'Sci.', 'and', 'Engr', 'Dept.,', 'Universty', 'of', 'Cal,', 'San', 'Deigo'], 130 | ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']) 131 | 0.45810185185185187 132 | 133 | """ 134 | return self.get_raw_score(set1, set2) 135 | 136 | def get_sim_func(self): 137 | """ 138 | Get similarity function 139 | 140 | Returns: 141 | similarity function (function) 142 | """ 143 | return self.sim_func 144 | 145 | def get_threshold(self): 146 | """ 147 | Get threshold used for the similarity function 148 | 149 | Returns: 150 | threshold (float) 151 | """ 152 | return self.threshold 153 | 154 | def set_sim_func(self, sim_func): 155 | """ 156 | Set similarity function 157 | 158 | Args: 159 | sim_func (function): similarity function 160 | """ 161 | self.sim_func = sim_func 162 | return True 163 | 164 | def set_threshold(self, threshold): 165 | """ 166 | Set threshold value for the similarity function 167 | 168 | Args: 169 | threshold (float): threshold value 170 | """ 171 | self.threshold = threshold 172 | return True 173 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/hamming_distance.py: -------------------------------------------------------------------------------- 1 | from py_stringmatching import utils 2 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \ 3 | SequenceSimilarityMeasure 4 | 5 | 6 | class HammingDistance(SequenceSimilarityMeasure): 7 | """Computes Hamming distance. 8 | 9 | The Hamming distance between two strings of equal length is the number of positions at which the corresponding 10 | symbols are different. Thus, it measures the minimum number of substitutions required to change 11 | one string into the other, or the minimum number of errors that could have transformed one string into the other. 12 | """ 13 | 14 | def __init__(self): 15 | super(HammingDistance, self).__init__() 16 | 17 | def get_raw_score(self, string1, string2): 18 | """Computes the raw hamming distance between two strings. 19 | 20 | Args: 21 | string1,string2 (str): Input strings. 22 | 23 | Returns: 24 | Hamming distance (int). 25 | 26 | Raises: 27 | TypeError : If the inputs are not strings or if one of the inputs is None. 28 | ValueError : If the input strings are not of same length. 29 | 30 | Examples: 31 | >>> hd = HammingDistance() 32 | >>> hd.get_raw_score('', '') 33 | 0 34 | >>> hd.get_raw_score('alex', 'john') 35 | 4 36 | >>> hd.get_raw_score(' ', 'a') 37 | 1 38 | >>> hd.get_raw_score('JOHN', 'john') 39 | 4 40 | """ 41 | 42 | # input validations 43 | utils.sim_check_for_none(string1, string2) 44 | 45 | # convert input to unicode. 
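        # (decoding first ensures the ord() comparison below operates on unicode code points)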
46 | string1 = utils.convert_to_unicode(string1) 47 | string2 = utils.convert_to_unicode(string2) 48 | 49 | utils.tok_check_for_string_input(string1, string2) 50 | 51 | # for Hamming Distance string length should be same 52 | utils.sim_check_for_same_len(string1, string2) 53 | 54 | # sum all the mismatch characters at the corresponding index of 55 | # input strings 56 | return sum(bool(ord(c1) - ord(c2)) for c1, c2 in zip(string1, string2)) 57 | 58 | def get_sim_score(self, string1, string2): 59 | """Computes the normalized Hamming similarity score between two strings. 60 | 61 | Args: 62 | string1,string2 (str): Input strings. 63 | 64 | Returns: 65 | Normalized Hamming similarity score (float). 66 | 67 | Raises: 68 | TypeError : If the inputs are not strings or if one of the inputs is None. 69 | ValueError : If the input strings are not of same length. 70 | 71 | Examples: 72 | >>> hd = HammingDistance() 73 | >>> hd.get_sim_score('', '') 74 | 1.0 75 | >>> hd.get_sim_score('alex', 'john') 76 | 0.0 77 | >>> hd.get_sim_score(' ', 'a') 78 | 0.0 79 | >>> hd.get_sim_score('JOHN', 'john') 80 | 0.0 81 | """ 82 | 83 | # convert input to unicode. 84 | string1 = utils.convert_to_unicode(string1) 85 | string2 = utils.convert_to_unicode(string2) 86 | 87 | raw_score = self.get_raw_score(string1, string2) 88 | 89 | common_len = len(string1) 90 | if common_len == 0: 91 | return 1.0 92 | return 1 - (raw_score / common_len) 93 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/hybrid_similarity_measure.py: -------------------------------------------------------------------------------- 1 | """Hybrid similarity measure""" 2 | 3 | from py_stringmatching.similarity_measure.similarity_measure import \ 4 | SimilarityMeasure 5 | 6 | class HybridSimilarityMeasure(SimilarityMeasure): 7 | pass 8 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/jaccard.py: -------------------------------------------------------------------------------- 1 | from py_stringmatching import utils 2 | from py_stringmatching.similarity_measure.token_similarity_measure import \ 3 | TokenSimilarityMeasure 4 | 5 | 6 | class Jaccard(TokenSimilarityMeasure): 7 | """Computes Jaccard measure. 8 | 9 | For two sets X and Y, the Jaccard similarity score is: 10 | 11 | :math:`jaccard(X, Y) = \\frac{|X \\cap Y|}{|X \\cup Y|}` 12 | 13 | Note: 14 | In the case where both X and Y are empty sets, we define their Jaccard score to be 1. 15 | """ 16 | 17 | def __init__(self): 18 | super(Jaccard, self).__init__() 19 | 20 | def get_raw_score(self, set1, set2): 21 | """Computes the raw Jaccard score between two sets. 22 | 23 | Args: 24 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 25 | 26 | Returns: 27 | Jaccard similarity score (float). 28 | 29 | Raises: 30 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 
31 | 32 | Examples: 33 | >>> jac = Jaccard() 34 | >>> jac.get_raw_score(['data', 'science'], ['data']) 35 | 0.5 36 | >>> jac.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}) 37 | 0.375 38 | >>> jac.get_raw_score(['data', 'management'], ['data', 'data', 'science']) 39 | 0.3333333333333333 40 | """ 41 | 42 | # input validations 43 | utils.sim_check_for_none(set1, set2) 44 | utils.sim_check_for_list_or_set_inputs(set1, set2) 45 | 46 | # if exact match return 1.0 47 | if utils.sim_check_for_exact_match(set1, set2): 48 | return 1.0 49 | 50 | # if one of the strings is empty return 0 51 | if utils.sim_check_for_empty(set1, set2): 52 | return 0 53 | 54 | if not isinstance(set1, set): 55 | set1 = set(set1) 56 | if not isinstance(set2, set): 57 | set2 = set(set2) 58 | 59 | return float(len(set1 & set2)) / float(len(set1 | set2)) 60 | 61 | def get_sim_score(self, set1, set2): 62 | """Computes the normalized Jaccard similarity between two sets. Simply call get_raw_score. 63 | 64 | Args: 65 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 66 | 67 | Returns: 68 | Normalized Jaccard similarity (float). 69 | 70 | Raises: 71 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 72 | 73 | Examples: 74 | >>> jac = Jaccard() 75 | >>> jac.get_sim_score(['data', 'science'], ['data']) 76 | 0.5 77 | >>> jac.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8}) 78 | 0.375 79 | >>> jac.get_sim_score(['data', 'management'], ['data', 'data', 'science']) 80 | 0.3333333333333333 81 | """ 82 | return self.get_raw_score(set1, set2) 83 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/jaro.py: -------------------------------------------------------------------------------- 1 | from py_stringmatching import utils 2 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \ 3 | SequenceSimilarityMeasure 4 | from py_stringmatching.similarity_measure.cython.cython_jaro import jaro 5 | 6 | 7 | class Jaro(SequenceSimilarityMeasure): 8 | """Computes Jaro measure. 9 | 10 | The Jaro measure is a type of edit distance, developed mainly to compare short strings, 11 | such as first and last names. 12 | """ 13 | 14 | def __init__(self): 15 | super(Jaro, self).__init__() 16 | 17 | def get_raw_score(self, string1, string2): 18 | """Computes the raw Jaro score between two strings. 19 | 20 | Args: 21 | string1,string2 (str): Input strings. 22 | 23 | Returns: 24 | Jaro similarity score (float). 25 | 26 | Raises: 27 | TypeError : If the inputs are not strings or if one of the inputs is None. 28 | 29 | Examples: 30 | >>> jaro = Jaro() 31 | >>> jaro.get_raw_score('MARTHA', 'MARHTA') 32 | 0.9444444444444445 33 | >>> jaro.get_raw_score('DWAYNE', 'DUANE') 34 | 0.8222222222222223 35 | >>> jaro.get_raw_score('DIXON', 'DICKSONX') 36 | 0.7666666666666666 37 | 38 | """ 39 | 40 | # input validations 41 | utils.sim_check_for_none(string1, string2) 42 | 43 | # convert input to unicode. 44 | string1 = utils.convert_to_unicode(string1) 45 | string2 = utils.convert_to_unicode(string2) 46 | 47 | utils.tok_check_for_string_input(string1, string2) 48 | 49 | # if one of the strings is empty return 0 50 | if utils.sim_check_for_empty(string1, string2): 51 | return 0 52 | 53 | return jaro(string1, string2) 54 | 55 | def get_sim_score(self, string1, string2): 56 | """Computes the normalized Jaro similarity score between two strings. Simply call get_raw_score. 
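        For reference, the Cython routine behind get_raw_score computes the
        standard Jaro formula: with m matching characters (found within a window
        of :math:`\\lfloor \\max(|s_1|, |s_2|)/2 \\rfloor - 1` positions) and t
        transpositions (half the number of matched characters that appear in a
        different order),

        :math:`jaro(s_1, s_2) = \\frac{1}{3} \\left( \\frac{m}{|s_1|} + \\frac{m}{|s_2|} + \\frac{m - t}{m} \\right)`

        For 'MARTHA' and 'MARHTA', m = 6 and t = 1, giving the 0.944... seen in
        the examples below.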
57 | 58 | Args: 59 | string1,string2 (str): Input strings. 60 | 61 | Returns: 62 | Normalized Jaro similarity score (float). 63 | 64 | Raises: 65 | TypeError : If the inputs are not strings or if one of the inputs is None. 66 | 67 | Examples: 68 | >>> jaro = Jaro() 69 | >>> jaro.get_sim_score('MARTHA', 'MARHTA') 70 | 0.9444444444444445 71 | >>> jaro.get_sim_score('DWAYNE', 'DUANE') 72 | 0.8222222222222223 73 | >>> jaro.get_sim_score('DIXON', 'DICKSONX') 74 | 0.7666666666666666 75 | 76 | """ 77 | return self.get_raw_score(string1, string2) 78 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/jaro_winkler.py: -------------------------------------------------------------------------------- 1 | from py_stringmatching import utils 2 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \ 3 | SequenceSimilarityMeasure 4 | from py_stringmatching.similarity_measure.cython.cython_jaro_winkler import jaro_winkler 5 | 6 | 7 | class JaroWinkler(SequenceSimilarityMeasure): 8 | """Computes Jaro-Winkler measure. 9 | 10 | The Jaro-Winkler measure is designed to capture cases where two strings have a low Jaro score, but share a prefix and thus are likely to match. 11 | 12 | Args: 13 | prefix_weight (float): Weight to give to the prefix (defaults to 0.1). 14 | 15 | Attributes: 16 | prefix_weight (float): An attribute to store the prefix weight. 17 | """ 18 | 19 | def __init__(self, prefix_weight=0.1): 20 | self.prefix_weight = prefix_weight 21 | super(JaroWinkler, self).__init__() 22 | 23 | def get_raw_score(self, string1, string2): 24 | """Computes the raw Jaro-Winkler score between two strings. 25 | 26 | Args: 27 | string1,string2 (str): Input strings. 28 | 29 | Returns: 30 | Jaro-Winkler similarity score (float). 31 | 32 | Raises: 33 | TypeError : If the inputs are not strings or if one of the inputs is None. 34 | 35 | Examples: 36 | >>> jw = JaroWinkler() 37 | >>> jw.get_raw_score('MARTHA', 'MARHTA') 38 | 0.9611111111111111 39 | >>> jw.get_raw_score('DWAYNE', 'DUANE') 40 | 0.84 41 | >>> jw.get_raw_score('DIXON', 'DICKSONX') 42 | 0.8133333333333332 43 | 44 | """ 45 | 46 | # input validations 47 | utils.sim_check_for_none(string1, string2) 48 | 49 | # convert input to unicode. 50 | string1 = utils.convert_to_unicode(string1) 51 | string2 = utils.convert_to_unicode(string2) 52 | 53 | utils.tok_check_for_string_input(string1, string2) 54 | 55 | # if one of the strings is empty return 0 56 | if utils.sim_check_for_empty(string1, string2): 57 | return 0 58 | 59 | return jaro_winkler(string1, string2, self.prefix_weight) 60 | 61 | def get_sim_score(self, string1, string2): 62 | """Computes the normalized Jaro-Winkler similarity score between two strings. Simply call get_raw_score. 63 | 64 | Args: 65 | string1,string2 (str): Input strings. 66 | 67 | Returns: 68 | Normalized Jaro-Winkler similarity (float). 69 | 70 | Raises: 71 | TypeError : If the inputs are not strings or if one of the inputs is None. 72 | 73 | Examples: 74 | >>> jw = JaroWinkler() 75 | >>> jw.get_sim_score('MARTHA', 'MARHTA') 76 | 0.9611111111111111 77 | >>> jw.get_sim_score('DWAYNE', 'DUANE') 78 | 0.84 79 | >>> jw.get_sim_score('DIXON', 'DICKSONX') 80 | 0.8133333333333332 81 | """ 82 | return self.get_raw_score(string1, string2) 83 | 84 | def get_prefix_weight(self): 85 | """Get prefix weight. 86 | 87 | Returns: 88 | prefix weight (float). 
89 | """ 90 | return self.prefix_weight 91 | 92 | def set_prefix_weight(self, prefix_weight): 93 | """Set prefix weight. 94 | 95 | Args: 96 | prefix_weight (float): Weight to give to the prefix. 97 | """ 98 | self.prefix_weight = prefix_weight 99 | return True 100 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/levenshtein.py: -------------------------------------------------------------------------------- 1 | from py_stringmatching import utils 2 | from py_stringmatching.similarity_measure.cython.cython_levenshtein import levenshtein 3 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \ 4 | SequenceSimilarityMeasure 5 | 6 | 7 | class Levenshtein(SequenceSimilarityMeasure): 8 | """Computes Levenshtein measure (also known as edit distance). 9 | 10 | Levenshtein distance computes the minimum cost of transforming one string into the other. Transforming a string 11 | is carried out using a sequence of the following operators: delete a character, insert a character, and 12 | substitute one character for another. 13 | """ 14 | 15 | def __init__(self): 16 | super(Levenshtein, self).__init__() 17 | 18 | def get_raw_score(self, string1, string2): 19 | """Computes the raw Levenshtein distance between two strings. 20 | 21 | Args: 22 | string1,string2 (str): Input strings. 23 | 24 | Returns: 25 | Levenshtein distance (int). 26 | 27 | Raises: 28 | TypeError : If the inputs are not strings. 29 | 30 | Examples: 31 | >>> lev = Levenshtein() 32 | >>> lev.get_raw_score('a', '') 33 | 1 34 | >>> lev.get_raw_score('example', 'samples') 35 | 3 36 | >>> lev.get_raw_score('levenshtein', 'frankenstein') 37 | 6 38 | """ 39 | 40 | # input validations 41 | utils.sim_check_for_none(string1, string2) 42 | 43 | # convert input to unicode. 44 | string1 = utils.convert_to_unicode(string1) 45 | string2 = utils.convert_to_unicode(string2) 46 | 47 | utils.tok_check_for_string_input(string1, string2) 48 | 49 | if utils.sim_check_for_exact_match(string1, string2): 50 | return 0.0 51 | 52 | return levenshtein(string1, string2) 53 | 54 | def get_sim_score(self, string1, string2): 55 | """Computes the normalized Levenshtein similarity score between two strings. 56 | 57 | Args: 58 | string1,string2 (str): Input strings. 59 | 60 | Returns: 61 | Normalized Levenshtein similarity (float). 62 | 63 | Raises: 64 | TypeError : If the inputs are not strings. 65 | 66 | Examples: 67 | >>> lev = Levenshtein() 68 | >>> lev.get_sim_score('a', '') 69 | 0.0 70 | >>> lev.get_sim_score('example', 'samples') 71 | 0.5714285714285714 72 | >>> lev.get_sim_score('levenshtein', 'frankenstein') 73 | 0.5 74 | 75 | """ 76 | 77 | # convert input strings to unicode. 
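        # The raw distance used below comes from the Cython routine imported at
        # the top of this module; a rough pure-Python equivalent of the classic
        # dynamic program, for illustration only (levenshtein_sketch is a
        # hypothetical helper):
        #
        #     def levenshtein_sketch(s1, s2):
        #         prev = list(range(len(s2) + 1))
        #         for i, c1 in enumerate(s1, 1):
        #             curr = [i]
        #             for j, c2 in enumerate(s2, 1):
        #                 # deletion, insertion, substitution respectively
        #                 curr.append(min(prev[j] + 1, curr[j - 1] + 1,
        #                                 prev[j - 1] + (c1 != c2)))
        #             prev = curr
        #         return prev[-1]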
78 | string1 = utils.convert_to_unicode(string1)
79 | string2 = utils.convert_to_unicode(string2)
80 |
81 | raw_score = self.get_raw_score(string1, string2)
82 | max_len = max(len(string1), len(string2))
83 | if max_len == 0:
84 | return 1.0
85 | return 1 - (raw_score / max_len)
86 |
-------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/monge_elkan.py: --------------------------------------------------------------------------------
1 | from py_stringmatching import utils
2 | from py_stringmatching.similarity_measure.jaro_winkler import JaroWinkler
3 | from py_stringmatching.similarity_measure.hybrid_similarity_measure import \
4 | HybridSimilarityMeasure
5 |
6 |
7 | class MongeElkan(HybridSimilarityMeasure):
8 | """Computes Monge-Elkan measure.
9 |
10 | The Monge-Elkan similarity measure is a type of hybrid similarity measure that combines the benefits of
11 | sequence-based and set-based methods. This can be effective for domains in which more control is needed
12 | over the similarity measure. It implicitly uses a secondary similarity measure, such as Levenshtein, to compute
13 | the overall similarity score. See the string matching chapter in the DI book (Principles of Data Integration).
14 |
15 | Args:
16 | sim_func (function): Secondary similarity function. This is expected to be a sequence-based
17 | similarity measure (defaults to the Jaro-Winkler similarity measure).
18 |
19 | Attributes:
20 | sim_func (function): An attribute to store the secondary similarity function.
21 | """
22 |
23 | def __init__(self, sim_func=JaroWinkler().get_raw_score):
24 | self.sim_func = sim_func
25 | super(MongeElkan, self).__init__()
26 |
27 | def get_raw_score(self, bag1, bag2):
28 | """Computes the raw Monge-Elkan score between two bags (lists).
29 |
30 | Args:
31 | bag1,bag2 (list): Input lists.
32 |
33 | Returns:
34 | Monge-Elkan similarity score (float).
35 |
36 | Raises:
37 | TypeError : If the inputs are not lists or if one of the inputs is None.
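        For reference, the loop below implements the usual Monge-Elkan
        definition over bags X and Y:

        :math:`monge\\_elkan(X, Y) = \\frac{1}{|X|} \\sum_{x \\in X} \\max_{y \\in Y} sim(x, y)`

        Each element of the first bag is paired with its best-matching element
        of the second bag and the maxima are averaged, so the measure is
        asymmetric in its arguments.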
38 | 39 | Examples: 40 | >>> me = MongeElkan() 41 | >>> me.get_raw_score(['Niall'], ['Neal']) 42 | 0.8049999999999999 43 | >>> me.get_raw_score(['Niall'], ['Nigel']) 44 | 0.7866666666666667 45 | >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']) 46 | 0.8364448130130768 47 | >>> me.get_raw_score([''], ['a']) 48 | 0.0 49 | >>> me = MongeElkan(sim_func=NeedlemanWunsch().get_raw_score) 50 | >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']) 51 | 2.0 52 | >>> me = MongeElkan(sim_func=Affine().get_raw_score) 53 | >>> me.get_raw_score(['Comput.', 'Sci.', 'and', 'Eng.', 'Dept.,', 'University', 'of', 'California,', 'San', 'Diego'], ['Department', 'of', 'Computer', 'Science,', 'Univ.', 'Calif.,', 'San', 'Diego']) 54 | 2.25 55 | 56 | References: 57 | * Principles of Data Integration book 58 | """ 59 | 60 | # input validations 61 | utils.sim_check_for_none(bag1, bag2) 62 | utils.sim_check_for_list_or_set_inputs(bag1, bag2) 63 | 64 | # if exact match return 1.0 65 | if utils.sim_check_for_exact_match(bag1, bag2): 66 | return 1.0 67 | 68 | # if one of the strings is empty return 0 69 | if utils.sim_check_for_empty(bag1, bag2): 70 | return 0 71 | 72 | # aggregated sum of all the max sim score of all the elements in bag1 73 | # with elements in bag2 74 | sum_of_maxes = 0 75 | for el1 in bag1: 76 | max_sim = float('-inf') 77 | for el2 in bag2: 78 | max_sim = max(max_sim, self.sim_func(el1, el2)) 79 | sum_of_maxes += max_sim 80 | 81 | sim = float(sum_of_maxes) / float(len(bag1)) 82 | 83 | return sim 84 | 85 | def get_sim_func(self): 86 | """Get the secondary similarity function. 87 | 88 | Returns: 89 | secondary similarity function (function). 90 | """ 91 | return self.sim_func 92 | 93 | def set_sim_func(self, sim_func): 94 | """Set the secondary similarity function. 95 | 96 | Args: 97 | sim_func (function): Secondary similarity function. 98 | """ 99 | self.sim_func = sim_func 100 | return True 101 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/needleman_wunsch.py: -------------------------------------------------------------------------------- 1 | from py_stringmatching import utils 2 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \ 3 | SequenceSimilarityMeasure 4 | from py_stringmatching.similarity_measure.cython.cython_needleman_wunsch import needleman_wunsch 5 | from py_stringmatching.similarity_measure.cython.cython_utils import cython_sim_ident 6 | 7 | 8 | class NeedlemanWunsch(SequenceSimilarityMeasure): 9 | """Computes Needleman-Wunsch measure. 10 | 11 | The Needleman-Wunsch distance generalizes the Levenshtein distance and considers global alignment between two strings. 12 | Specifically, it is computed by assigning a score to each alignment between the two input strings and choosing the 13 | score of the best alignment, that is, the maximal score. An alignment between two strings is a set of correspondences 14 | between their characters, allowing for gaps. 15 | 16 | Args: 17 | gap_cost (float): Cost of gap (defaults to 1.0). 
18 | sim_func (function): Similarity function to give a score for each correspondence between the characters (defaults
19 | to an identity function, which returns 1 if the two characters are the same and 0 otherwise).
20 |
21 | Attributes:
22 | gap_cost (float): An attribute to store the gap cost.
23 | sim_func (function): An attribute to store the similarity function.
24 | """
25 |
26 | def __init__(self, gap_cost=1.0, sim_func=cython_sim_ident):
27 | self.gap_cost = gap_cost
28 | self.sim_func = sim_func
29 | super(NeedlemanWunsch, self).__init__()
30 |
31 | def get_raw_score(self, string1, string2):
32 | """Computes the raw Needleman-Wunsch score between two strings.
33 |
34 | Args:
35 | string1,string2 (str) : Input strings.
36 |
37 | Returns:
38 | Needleman-Wunsch similarity score (float).
39 |
40 | Raises:
41 | TypeError : If the inputs are not strings or if one of the inputs is None.
42 |
43 | Examples:
44 | >>> nw = NeedlemanWunsch()
45 | >>> nw.get_raw_score('dva', 'deeva')
46 | 1.0
47 | >>> nw = NeedlemanWunsch(gap_cost=0.0)
48 | >>> nw.get_raw_score('dva', 'deeve')
49 | 2.0
50 | >>> nw = NeedlemanWunsch(gap_cost=1.0, sim_func=lambda s1, s2 : (2.0 if s1 == s2 else -1.0))
51 | >>> nw.get_raw_score('dva', 'deeve')
52 | 1.0
53 | >>> nw = NeedlemanWunsch(gap_cost=0.5, sim_func=lambda s1, s2 : (1.0 if s1 == s2 else -1.0))
54 | >>> nw.get_raw_score('GCATGCUA', 'GATTACA')
55 | 2.5
56 | """
57 |
58 | # input validations
59 | utils.sim_check_for_none(string1, string2)
60 |
61 | # convert input to unicode.
62 | string1 = utils.convert_to_unicode(string1)
63 | string2 = utils.convert_to_unicode(string2)
64 |
65 | utils.tok_check_for_string_input(string1, string2)
66 |
67 | # returns the similarity score from the cython function
68 | return needleman_wunsch(string1, string2, self.gap_cost, self.sim_func)
69 |
70 | def get_gap_cost(self):
71 | """Get gap cost.
72 |
73 | Returns:
74 | Gap cost (float).
75 | """
76 | return self.gap_cost
77 |
78 | def get_sim_func(self):
79 | """Get the similarity function.
80 |
81 | Returns:
82 | similarity function (function).
83 | """
84 | return self.sim_func
85 |
86 | def set_gap_cost(self, gap_cost):
87 | """Set gap cost.
88 |
89 | Args:
90 | gap_cost (float): Cost of gap.
91 | """
92 | self.gap_cost = gap_cost
93 | return True
94 |
95 | def set_sim_func(self, sim_func):
96 | """Set similarity function.
97 |
98 | Args:
99 | sim_func (function): Similarity function to give a score for the correspondence between characters.
100 | """
101 | self.sim_func = sim_func
102 | return True
103 |
-------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/overlap_coefficient.py: --------------------------------------------------------------------------------
1 | from py_stringmatching import utils
2 | from py_stringmatching.similarity_measure.token_similarity_measure import \
3 | TokenSimilarityMeasure
4 |
5 |
6 | class OverlapCoefficient(TokenSimilarityMeasure):
7 | """Computes overlap coefficient measure.
8 |
9 | The overlap coefficient is a similarity measure related to the Jaccard
10 | measure that computes the overlap between two sets, and is defined as the size of the intersection divided by
11 | the size of the smaller of the two sets.
For two sets X and Y, the overlap coefficient is: 12 | 13 | :math:`overlap\\_coefficient(X, Y) = \\frac{|X \\cap Y|}{\\min(|X|, |Y|)}` 14 | 15 | Note: 16 | * In the case where one of X and Y is an empty set and the other is a non-empty set, we define their overlap coefficient to be 0. 17 | * In the case where both X and Y are empty sets, we define their overlap coefficient to be 1. 18 | """ 19 | 20 | def __init__(self): 21 | super(OverlapCoefficient, self).__init__() 22 | 23 | def get_raw_score(self, set1, set2): 24 | """Computes the raw overlap coefficient score between two sets. 25 | 26 | Args: 27 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 28 | 29 | Returns: 30 | Overlap coefficient (float). 31 | 32 | Raises: 33 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 34 | 35 | Examples: 36 | >>> oc = OverlapCoefficient() 37 | >>> oc.get_raw_score(['data', 'science'], ['data']) 38 | 1.0 39 | >>> oc.get_raw_score([], []) 40 | 1.0 41 | >>> oc.get_raw_score([], ['data']) 42 | 0 43 | 44 | References: 45 | * Wikipedia article : https://en.wikipedia.org/wiki/Overlap_coefficient 46 | * SimMetrics library 47 | """ 48 | 49 | # input validations 50 | utils.sim_check_for_none(set1, set2) 51 | utils.sim_check_for_list_or_set_inputs(set1, set2) 52 | 53 | # if exact match return 1.0 54 | if utils.sim_check_for_exact_match(set1, set2): 55 | return 1.0 56 | 57 | # if one of the strings is empty return 0 58 | if utils.sim_check_for_empty(set1, set2): 59 | return 0 60 | 61 | if not isinstance(set1, set): 62 | set1 = set(set1) 63 | if not isinstance(set2, set): 64 | set2 = set(set2) 65 | 66 | return float(len(set1 & set2)) / min(len(set1), len(set2)) 67 | 68 | def get_sim_score(self, set1, set2): 69 | """Computes the normalized overlap coefficient between two sets. Simply call get_raw_score. 70 | 71 | Args: 72 | set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets. 73 | 74 | Returns: 75 | Normalized overlap coefficient (float). 76 | 77 | Raises: 78 | TypeError : If the inputs are not sets (or lists) or if one of the inputs is None. 79 | 80 | Examples: 81 | >>> oc = OverlapCoefficient() 82 | >>> oc.get_sim_score(['data', 'science'], ['data']) 83 | 1.0 84 | >>> oc.get_sim_score([], []) 85 | 1.0 86 | >>> oc.get_sim_score([], ['data']) 87 | 0 88 | 89 | """ 90 | return self.get_raw_score(set1, set2) 91 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/partial_ratio.py: -------------------------------------------------------------------------------- 1 | """Fuzzy Wuzzy Partial Ratio Similarity Measure""" 2 | 3 | from difflib import SequenceMatcher 4 | from py_stringmatching import utils 5 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \ 6 | SequenceSimilarityMeasure 7 | 8 | 9 | class PartialRatio(SequenceSimilarityMeasure): 10 | """Computes the Fuzzy Wuzzy partial ratio similarity between two strings. 11 | 12 | Fuzzy Wuzzy partial ratio raw score is a measure of the strings similarity as an int in the 13 | range [0, 100]. Given two strings X and Y, let the shorter string (X) be of length m. 14 | It finds the fuzzy wuzzy ratio similarity measure between the shorter string and every 15 | substring of length m of the longer string, and returns the maximum of 16 | those similarity measures. Fuzzy Wuzzy partial ratio sim score is a float in the range [0, 1] 17 | and is obtained by dividing the raw score by 100. 
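    A rough pure-Python sketch of this windowing idea, assuming the shorter
    string comes first (get_raw_score below swaps its arguments as needed and
    adds validation; partial_ratio_sketch is a hypothetical helper)::

        from difflib import SequenceMatcher

        def partial_ratio_sketch(shorter, longer):
            blocks = SequenceMatcher(None, shorter, longer).get_matching_blocks()
            best = 0.0
            for a, b, _ in blocks:
                # align a window of the shorter string's length with each block
                start = max(b - a, 0)
                window = longer[start:start + len(shorter)]
                best = max(best, SequenceMatcher(None, shorter, window).ratio())
            return int(round(100 * best))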
18 | 19 | Note: 20 | In the case where either of strings X or Y are empty, we define the Fuzzy Wuzzy ratio similarity 21 | score to be 0. 22 | """ 23 | def __init__(self): 24 | pass 25 | 26 | def get_raw_score(self, string1, string2): 27 | """ 28 | Computes the Fuzzy Wuzzy partial ratio measure raw score between two strings. 29 | This score is in the range [0,100]. 30 | 31 | Args: 32 | string1,string2 (str): Input strings 33 | 34 | Returns: 35 | Partial Ratio measure raw score (int) is returned 36 | 37 | Raises: 38 | TypeError: If the inputs are not strings 39 | 40 | Examples: 41 | >>> s = PartialRatio() 42 | >>> s.get_raw_score('Robert Rupert', 'Rupert') 43 | 100 44 | >>> s.get_raw_score('Sue', 'sue') 45 | 67 46 | >>> s.get_raw_score('example', 'samples') 47 | 86 48 | 49 | References: 50 | * https://pypi.python.org/pypi/fuzzywuzzy 51 | """ 52 | # input validations 53 | utils.sim_check_for_none(string1, string2) 54 | utils.sim_check_for_string_inputs(string1, string2) 55 | 56 | # if one of the strings is empty return 0 57 | if utils.sim_check_for_empty(string1, string2): 58 | return 0 59 | 60 | string1 = utils.convert_to_unicode(string1) 61 | string2 = utils.convert_to_unicode(string2) 62 | 63 | # string1 should be smaller in length than string2. If this is not the case 64 | # then swap string1 and string2 65 | if len(string1) > len(string2): 66 | temp = string1 67 | string1 = string2 68 | string2 = temp 69 | 70 | sm = SequenceMatcher(None, string1, string2) 71 | matching_blocks = sm.get_matching_blocks() 72 | 73 | scores = [] 74 | for block in matching_blocks: 75 | string2_starting_index = 0 76 | if (block[1] - block[0] > 0): 77 | string2_starting_index = block[1] - block[0] 78 | string2_ending_index = string2_starting_index + len(string1) 79 | string2_substr = string2[string2_starting_index:string2_ending_index] 80 | 81 | sm2 = SequenceMatcher(None, string1, string2_substr) 82 | similarity_ratio = sm2.ratio() 83 | if similarity_ratio > .995: 84 | return 100 85 | else: 86 | scores.append(similarity_ratio) 87 | 88 | return int(round(100 * max(scores))) 89 | 90 | def get_sim_score(self, string1, string2): 91 | """ 92 | Computes the Fuzzy Wuzzy partial ratio similarity score between two strings. 93 | This score is in the range [0,1]. 
94 |
95 | Args:
96 | string1,string2 (str): Input strings
97 |
98 | Returns:
99 | Partial Ratio measure similarity score (float) is returned
100 |
101 | Raises:
102 | TypeError: If the inputs are not strings
103 |
104 | Examples:
105 | >>> s = PartialRatio()
106 | >>> s.get_sim_score('Robert Rupert', 'Rupert')
107 | 1.0
108 | >>> s.get_sim_score('Sue', 'sue')
109 | 0.67
110 | >>> s.get_sim_score('example', 'samples')
111 | 0.86
112 |
113 | References:
114 | * https://pypi.python.org/pypi/fuzzywuzzy
115 | """
116 | # input validations
117 | utils.sim_check_for_none(string1, string2)
118 | utils.sim_check_for_string_inputs(string1, string2)
119 |
120 | # if one of the strings is empty return 0
121 | if utils.sim_check_for_empty(string1, string2):
122 | return 0
123 |
124 | raw_score = 1.0 * self.get_raw_score(string1, string2)
125 | sim_score = raw_score / 100
126 | return sim_score
127 |
-------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/partial_token_sort.py: --------------------------------------------------------------------------------
1 | """Fuzzy Wuzzy Partial Token Sort Similarity Measure"""
2 |
3 | from py_stringmatching import utils
4 |
5 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \
6 | SequenceSimilarityMeasure
7 | from py_stringmatching.similarity_measure.partial_ratio import PartialRatio
8 |
9 |
10 | class PartialTokenSort(SequenceSimilarityMeasure):
11 | """Computes Fuzzy Wuzzy partial token sort similarity measure.
12 |
13 | Fuzzy Wuzzy partial token sort ratio raw score is a measure of the strings' similarity as an
14 | int in the range [0, 100]. For two strings X and Y, the score is obtained by
15 | splitting the two strings into tokens and then sorting the tokens. The score is
16 | then the fuzzy wuzzy partial ratio raw score of the transformed strings. Fuzzy Wuzzy partial token
17 | sort sim score is a float in the range [0, 1] and is obtained by dividing the raw score
18 | by 100.
19 |
20 | Note:
21 | In the case where either of strings X or Y is empty, we define the
22 | Fuzzy Wuzzy partial token sort similarity score to be 0.
23 | """
24 | def __init__(self):
25 | pass
26 |
27 | def _process_string_and_sort(self, s, force_ascii, full_process=True):
28 | """Returns a string with tokens sorted. Processes the string if the
29 | full_process flag is enabled. If the force_ascii flag is enabled, then
30 | processing removes non-ascii characters from the string."""
31 | # pull tokens
32 | ts = utils.process_string(s, force_ascii=force_ascii) if full_process else s
33 | tokens = ts.split()
34 |
35 | # sort tokens and join
36 | sorted_string = u" ".join(sorted(tokens))
37 | return sorted_string.strip()
38 |
39 | def get_raw_score(self, string1, string2, force_ascii=True, full_process=True):
40 | """
41 | Computes the Fuzzy Wuzzy partial token sort measure raw score between two strings.
42 | This score is in the range [0,100].
43 |
44 | Args:
45 | string1,string2 (str) : Input strings
46 | force_ascii (boolean) : Flag to remove non-ascii characters or not
47 | full_process (boolean) : Flag to process the string or not. Processing includes
48 | removing non-alphanumeric characters, converting the string to lower case and
49 | removing leading and trailing whitespaces.
50 | 51 | Returns: 52 | Partial Token Sort measure raw score (int) is returned 53 | 54 | Raises: 55 | TypeError: If the inputs are not strings 56 | 57 | Examples: 58 | >>> s = PartialTokenSort() 59 | >>> s.get_raw_score('great is scala', 'java is great') 60 | 81 61 | >>> s.get_raw_score('Sue', 'sue') 62 | 100 63 | >>> s.get_raw_score('C++ and Java', 'Java and Python') 64 | 64 65 | 66 | References: 67 | * https://pypi.python.org/pypi/fuzzywuzzy 68 | """ 69 | # input validations 70 | utils.sim_check_for_none(string1, string2) 71 | utils.sim_check_for_string_inputs(string1, string2) 72 | 73 | # if one of the strings is empty return 0 74 | if utils.sim_check_for_empty(string1, string2): 75 | return 0 76 | 77 | sorted1 = self._process_string_and_sort(string1, force_ascii, full_process=full_process) 78 | sorted2 = self._process_string_and_sort(string2, force_ascii, full_process=full_process) 79 | partialRatio = PartialRatio() 80 | return partialRatio.get_raw_score(sorted1, sorted2) 81 | 82 | def get_sim_score(self, string1, string2, force_ascii=True, full_process=True): 83 | """ 84 | Computes the Fuzzy Wuzzy partial token sort similarity score between two strings. 85 | This score is in the range [0,1]. 86 | 87 | Args: 88 | string1,string2 (str), : Input strings 89 | force_ascii (boolean) : Flag to remove non-ascii characters or not 90 | full_process (boolean) : Flag to process the string or not. Processing includes 91 | removing non alphanumeric characters, converting string to lower case and 92 | removing leading and trailing whitespaces. 93 | 94 | Returns: 95 | Partial Token Sort measure similarity score (float) is returned 96 | 97 | Raises: 98 | TypeError: If the inputs are not strings 99 | 100 | Examples: 101 | >>> s = PartialTokenSort() 102 | >>> s.get_sim_score('great is scala', 'java is great') 103 | 0.81 104 | >>> s.get_sim_score('Sue', 'sue') 105 | 1.0 106 | >>> s.get_sim_score('C++ and Java', 'Java and Python') 107 | 0.64 108 | 109 | References: 110 | * https://pypi.python.org/pypi/fuzzywuzzy 111 | """ 112 | raw_score = 1.0 * self.get_raw_score(string1, string2, force_ascii, full_process) 113 | sim_score = raw_score / 100 114 | return sim_score -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/phonetic_similarity_measure.py: -------------------------------------------------------------------------------- 1 | """Phonetics based similarity measure""" 2 | 3 | from py_stringmatching.similarity_measure.similarity_measure import \ 4 | SimilarityMeasure 5 | 6 | class PhoneticSimilarityMeasure(SimilarityMeasure): 7 | pass 8 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/ratio.py: -------------------------------------------------------------------------------- 1 | """Fuzzy Wuzzy Ratio Similarity Measure""" 2 | 3 | from difflib import SequenceMatcher 4 | from py_stringmatching import utils 5 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \ 6 | SequenceSimilarityMeasure 7 | 8 | 9 | class Ratio(SequenceSimilarityMeasure): 10 | """Computes Fuzzy Wuzzy ratio similarity measure. 11 | 12 | Fuzzy Wuzzy ratio raw score is a measure of the strings similarity as an int in the 13 | range [0, 100]. For two strings X and Y, the score is defined by 14 | int(round((2.0 * M / T) * 100)) where T is the total number of characters in 15 | both strings, and M is the number of matches in the two strings. 
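    Since SequenceMatcher.ratio() in difflib is exactly 2.0 * M / T, a minimal
    sketch of the raw score (ratio_sketch is a hypothetical helper mirroring
    get_raw_score below)::

        from difflib import SequenceMatcher

        def ratio_sketch(s1, s2):
            return int(round(100 * SequenceMatcher(None, s1, s2).ratio()))

        ratio_sketch('Robert', 'Rupert')   # 67, as in the examples below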
Fuzzy Wuzzy ratio 16 | sim score is a float in the range [0, 1] and is obtained by dividing the raw score 17 | by 100. 18 | 19 | Note: 20 | In the case where either of strings X or Y are empty, we define the 21 | Fuzzy Wuzzy ratio similarity score to be 0. 22 | """ 23 | def __init__(self): 24 | pass 25 | 26 | def get_raw_score(self, string1, string2): 27 | """ 28 | Computes the Fuzzy Wuzzy ratio measure raw score between two strings. 29 | This score is in the range [0,100]. 30 | 31 | Args: 32 | string1,string2 (str): Input strings 33 | 34 | Returns: 35 | Ratio measure raw score (int) is returned 36 | 37 | Raises: 38 | TypeError: If the inputs are not strings 39 | 40 | Examples: 41 | >>> s = Ratio() 42 | >>> s.get_raw_score('Robert', 'Rupert') 43 | 67 44 | >>> s.get_raw_score('Sue', 'sue') 45 | 67 46 | >>> s.get_raw_score('example', 'samples') 47 | 71 48 | 49 | References: 50 | * https://pypi.python.org/pypi/fuzzywuzzy 51 | """ 52 | # input validations 53 | utils.sim_check_for_none(string1, string2) 54 | utils.sim_check_for_string_inputs(string1, string2) 55 | 56 | # if one of the strings is empty return 0 57 | if utils.sim_check_for_empty(string1, string2): 58 | return 0 59 | 60 | string1 = utils.convert_to_unicode(string1) 61 | string2 = utils.convert_to_unicode(string2) 62 | 63 | sm = SequenceMatcher(None, string1, string2) 64 | return int(round(100 * sm.ratio())) 65 | 66 | def get_sim_score(self, string1, string2): 67 | """ 68 | Computes the Fuzzy Wuzzy ratio similarity score between two strings. 69 | This score is in the range [0,1]. 70 | 71 | Args: 72 | string1,string2 (str): Input strings 73 | 74 | Returns: 75 | Ratio measure similarity score (float) is returned 76 | 77 | Raises: 78 | TypeError: If the inputs are not strings 79 | 80 | Examples: 81 | >>> s = Ratio() 82 | >>> s.get_sim_score('Robert', 'Rupert') 83 | 0.67 84 | >>> s.get_sim_score('Sue', 'sue') 85 | 0.67 86 | >>> s.get_sim_score('example', 'samples') 87 | 0.71 88 | 89 | References: 90 | * https://pypi.python.org/pypi/fuzzywuzzy 91 | """ 92 | # input validations 93 | utils.sim_check_for_none(string1, string2) 94 | utils.sim_check_for_string_inputs(string1, string2) 95 | 96 | # if one of the strings is empty return 0 97 | if utils.sim_check_for_empty(string1, string2): 98 | return 0 99 | 100 | raw_score = 1.0 * self.get_raw_score(string1, string2) 101 | sim_score = raw_score / 100 102 | return sim_score 103 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/sequence_similarity_measure.py: -------------------------------------------------------------------------------- 1 | """Sequence based similarity measure""" 2 | 3 | from py_stringmatching.similarity_measure.similarity_measure import \ 4 | SimilarityMeasure 5 | 6 | class SequenceSimilarityMeasure(SimilarityMeasure): 7 | pass 8 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/similarity_measure.py: -------------------------------------------------------------------------------- 1 | """Similarity measure""" 2 | 3 | class SimilarityMeasure(object): 4 | pass 5 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/smith_waterman.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from py_stringmatching.similarity_measure.cython.cython_utils import cython_sim_ident 4 | from py_stringmatching import utils 5 | from 
py_stringmatching.similarity_measure.sequence_similarity_measure import \ 6 | SequenceSimilarityMeasure 7 | from py_stringmatching.similarity_measure.cython.cython_smith_waterman import smith_waterman 8 | 9 | 10 | class SmithWaterman(SequenceSimilarityMeasure): 11 | """Computes Smith-Waterman measure. 12 | 13 | The Smith-Waterman algorithm performs local sequence alignment; that is, for determining similar regions 14 | between two strings. Instead of looking at the total sequence, the Smith–Waterman algorithm compares segments of 15 | all possible lengths and optimizes the similarity measure. See the string matching chapter in the DI book (Principles of Data Integration). 16 | 17 | Args: 18 | gap_cost (float): Cost of gap (defaults to 1.0). 19 | sim_func (function): Similarity function to give a score for the correspondence between the characters (defaults 20 | to an identity function, which returns 1 if the two characters are the same and 0 otherwise). 21 | 22 | Attributes: 23 | gap_cost (float): An attribute to store the gap cost. 24 | sim_func (function): An attribute to store the similarity function. 25 | """ 26 | 27 | def __init__(self, gap_cost=1.0, sim_func=cython_sim_ident): 28 | self.gap_cost = gap_cost 29 | self.sim_func = sim_func 30 | super(SmithWaterman, self).__init__() 31 | 32 | def get_raw_score(self, string1, string2): 33 | """Computes the raw Smith-Waterman score between two strings. 34 | 35 | Args: 36 | string1,string2 (str) : Input strings. 37 | 38 | Returns: 39 | Smith-Waterman similarity score (float). 40 | 41 | Raises: 42 | TypeError : If the inputs are not strings or if one of the inputs is None. 43 | 44 | Examples: 45 | >>> sw = SmithWaterman() 46 | >>> sw.get_raw_score('cat', 'hat') 47 | 2.0 48 | >>> sw = SmithWaterman(gap_cost=2.2) 49 | >>> sw.get_raw_score('dva', 'deeve') 50 | 1.0 51 | >>> sw = SmithWaterman(gap_cost=1, sim_func=lambda s1, s2 : (2 if s1 == s2 else -1)) 52 | >>> sw.get_raw_score('dva', 'deeve') 53 | 2.0 54 | >>> sw = SmithWaterman(gap_cost=1.4, sim_func=lambda s1, s2 : (1.5 if s1 == s2 else 0.5)) 55 | >>> sw.get_raw_score('GCATAGCU', 'GATTACA') 56 | 6.5 57 | """ 58 | 59 | # input validations 60 | utils.sim_check_for_none(string1, string2) 61 | 62 | # convert input to unicode. 63 | string1 = utils.convert_to_unicode(string1) 64 | string2 = utils.convert_to_unicode(string2) 65 | 66 | utils.tok_check_for_string_input(string1, string2) 67 | 68 | # Returns smith waterman similarity score from cython function 69 | return smith_waterman(string1,string2,self.gap_cost,self.sim_func) 70 | 71 | def get_gap_cost(self): 72 | """Get gap cost. 73 | 74 | Returns: 75 | Gap cost (float). 76 | """ 77 | return self.gap_cost 78 | 79 | def get_sim_func(self): 80 | """Get similarity function. 81 | 82 | Returns: 83 | Similarity function (function). 84 | """ 85 | return self.sim_func 86 | 87 | def set_gap_cost(self, gap_cost): 88 | """Set gap cost. 89 | 90 | Args: 91 | gap_cost (float): Cost of gap. 92 | """ 93 | self.gap_cost = gap_cost 94 | return True 95 | 96 | def set_sim_func(self, sim_func): 97 | """Set similarity function. 98 | 99 | Args: 100 | sim_func (function): Similarity function to give a score for the correspondence between the characters. 
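        For reference, the Cython routine used by get_raw_score above fills the
        standard Smith-Waterman local-alignment matrix

        :math:`H[i][j] = \\max(0, H[i-1][j-1] + sim\\_func(s_1[i], s_2[j]), H[i-1][j] - gap\\_cost, H[i][j-1] - gap\\_cost)`

        and returns the largest entry of H, that is, the score of the best
        local alignment.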
101 | """ 102 | self.sim_func = sim_func 103 | return True 104 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/soft_tfidf.py: -------------------------------------------------------------------------------- 1 | from math import sqrt 2 | import collections 3 | 4 | from py_stringmatching import utils 5 | from py_stringmatching.similarity_measure.jaro import Jaro 6 | from py_stringmatching.similarity_measure.hybrid_similarity_measure import \ 7 | HybridSimilarityMeasure 8 | 9 | 10 | class SoftTfIdf(HybridSimilarityMeasure): 11 | """Computes soft TF/IDF measure. 12 | 13 | Note: 14 | Currently, this measure is implemented without dampening. This is similar to setting dampen flag to be False in TF-IDF. 15 | We plan to add the dampen flag in the next release. 16 | 17 | Args: 18 | corpus_list (list of lists): Corpus list (default is set to None) of strings. If set to None, 19 | the input list are considered the only corpus. 20 | sim_func (function): Secondary similarity function. This should return a similarity score between two strings (optional), 21 | default is the Jaro similarity measure. 22 | threshold (float): Threshold value for the secondary similarity function (defaults to 0.5). If the similarity 23 | of a token pair exceeds the threshold, then the token pair is considered a match. 24 | 25 | Attributes: 26 | sim_func (function): An attribute to store the secondary similarity function. 27 | threshold (float): An attribute to store the threshold value for the secondary similarity function. 28 | """ 29 | 30 | def __init__(self, corpus_list=None, sim_func=Jaro().get_raw_score, 31 | threshold=0.5): 32 | self.__corpus_list = corpus_list 33 | self.__document_frequency = {} 34 | self.__compute_document_frequency() 35 | self.__corpus_size = 0 if self.__corpus_list is None else ( 36 | len(self.__corpus_list)) 37 | self.sim_func = sim_func 38 | self.threshold = threshold 39 | super(SoftTfIdf, self).__init__() 40 | 41 | def get_raw_score(self, bag1, bag2): 42 | """Computes the raw soft TF/IDF score between two lists given the corpus information. 43 | 44 | Args: 45 | bag1,bag2 (list): Input lists 46 | 47 | Returns: 48 | Soft TF/IDF score between the input lists (float). 49 | 50 | Raises: 51 | TypeError : If the inputs are not lists or if one of the inputs is None. 52 | 53 | Examples: 54 | >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], sim_func=Jaro().get_raw_score, threshold=0.8) 55 | >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c']) 56 | 0.17541160386140586 57 | >>> soft_tfidf = SoftTfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']], threshold=0.9) 58 | >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a']) 59 | 0.5547001962252291 60 | >>> soft_tfidf = SoftTfIdf([['x', 'y'], ['w'], ['q']]) 61 | >>> soft_tfidf.get_raw_score(['a', 'b', 'a'], ['a']) 62 | 0.0 63 | >>> soft_tfidf = SoftTfIdf(sim_func=Affine().get_raw_score, threshold=0.6) 64 | >>> soft_tfidf.get_raw_score(['aa', 'bb', 'a'], ['ab', 'ba']) 65 | 0.81649658092772592 66 | 67 | References: 68 | * the string matching chapter of the "Principles of Data Integration" book. 
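        For reference, the code below computes, without dampening,

        :math:`soft\\_tfidf(X, Y) = \\frac{\\sum_{(x, y) \\in close(X, Y)} v(x) \\cdot v(y) \\cdot sim(x, y)}{\\|V_X\\| \\cdot \\|V_Y\\|}`

        where close(X, Y) pairs each token x of X with its best-matching token y
        of Y whose secondary similarity exceeds the threshold, v(.) is the
        tf * idf weight of a token, and the denominator holds the norms of the
        two weighted vectors.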
69 | """ 70 | 71 | # input validations 72 | utils.sim_check_for_none(bag1, bag2) 73 | utils.sim_check_for_list_or_set_inputs(bag1, bag2) 74 | 75 | # if the strings match exactly return 1.0 76 | if utils.sim_check_for_exact_match(bag1, bag2): 77 | return 1.0 78 | 79 | # if one of the strings is empty return 0 80 | if utils.sim_check_for_empty(bag1, bag2): 81 | return 0 82 | 83 | # term frequency for input strings 84 | tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2) 85 | 86 | # find unique elements in the input lists and their document frequency 87 | local_df = {} 88 | for element in tf_x: 89 | local_df[element] = local_df.get(element, 0) + 1 90 | for element in tf_y: 91 | local_df[element] = local_df.get(element, 0) + 1 92 | 93 | # if corpus is not provided treat input string as corpus 94 | curr_df, corpus_size = (local_df, 2) if self.__corpus_list is None else ( 95 | (self.__document_frequency, self.__corpus_size)) 96 | 97 | # calculating the term sim score against the input string 2, 98 | # construct similarity map 99 | similarity_map = {} 100 | for term_x in tf_x: 101 | max_score = 0.0 102 | for term_y in tf_y: 103 | score = self.sim_func(term_x, term_y) 104 | # adding sim only if it is above threshold and 105 | # highest for this element 106 | if score > self.threshold and score > max_score: 107 | similarity_map[term_x] = (term_x, term_y, score) 108 | max_score = score 109 | 110 | # position of first string, second string and sim score 111 | # in the tuple 112 | first_string_pos = 0 113 | second_string_pos = 1 114 | sim_score_pos = 2 115 | 116 | result, v_x_2, v_y_2 = 0.0, 0.0, 0.0 117 | # soft-tfidf calculation 118 | for element in local_df.keys(): 119 | if curr_df.get(element) is None: 120 | continue 121 | # numerator 122 | if element in similarity_map: 123 | sim = similarity_map[element] 124 | idf_first = corpus_size / curr_df.get(sim[first_string_pos], 1) 125 | idf_second = corpus_size / curr_df.get(sim[second_string_pos], 1) 126 | v_x = idf_first * tf_x.get(sim[first_string_pos], 0) 127 | v_y = idf_second * tf_y.get(sim[second_string_pos], 0) 128 | result += v_x * v_y * sim[sim_score_pos] 129 | # denominator 130 | idf = corpus_size / curr_df[element] 131 | v_x = idf * tf_x.get(element, 0) 132 | v_x_2 += v_x * v_x 133 | v_y = idf * tf_y.get(element, 0) 134 | v_y_2 += v_y * v_y 135 | return result if v_x_2 == 0 else result / (sqrt(v_x_2) * sqrt(v_y_2)) 136 | 137 | def get_corpus_list(self): 138 | """Get corpus list. 139 | 140 | Returns: 141 | corpus list (list of lists). 142 | """ 143 | return self.__corpus_list 144 | 145 | def get_sim_func(self): 146 | """Get secondary similarity function. 147 | 148 | Returns: 149 | secondary similarity function (function). 150 | """ 151 | return self.sim_func 152 | 153 | def get_threshold(self): 154 | """Get threshold used for the secondary similarity function. 155 | 156 | Returns: 157 | threshold (float). 158 | """ 159 | return self.threshold 160 | 161 | def set_threshold(self, threshold): 162 | """Set threshold value for the secondary similarity function. 163 | 164 | Args: 165 | threshold (float): threshold value. 166 | """ 167 | self.threshold = threshold 168 | return True 169 | 170 | def set_sim_func(self, sim_func): 171 | """Set secondary similarity function. 172 | 173 | Args: 174 | sim_func (function): Secondary similarity function. 175 | """ 176 | self.sim_func = sim_func 177 | return True 178 | 179 | def set_corpus_list(self, corpus_list): 180 | """Set corpus list. 
181 |
182 | Args:
183 | corpus_list (list of lists): Corpus list.
184 | """
185 | self.__corpus_list = corpus_list
186 | self.__document_frequency = {}
187 | self.__compute_document_frequency()
188 | self.__corpus_size = 0 if self.__corpus_list is None else (
189 | len(self.__corpus_list))
190 | return True
191 |
192 | def __compute_document_frequency(self):
193 | if self.__corpus_list is not None:
194 | for document in self.__corpus_list:
195 | for element in set(document):
196 | self.__document_frequency[element] = (
197 | self.__document_frequency.get(element, 0) + 1)
198 |
-------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/soundex.py: --------------------------------------------------------------------------------
1 | """Soundex phonetic similarity measure"""
2 |
3 | import re
4 |
5 | from py_stringmatching import utils
6 | from py_stringmatching.similarity_measure.phonetic_similarity_measure import \
7 | PhoneticSimilarityMeasure
8 |
9 |
10 | class Soundex(PhoneticSimilarityMeasure):
11 | """Soundex phonetic similarity measure class.
12 | """
13 | def __init__(self):
14 | super(Soundex, self).__init__()
15 |
16 | def get_raw_score(self, string1, string2):
17 | """
18 | Computes the Soundex phonetic similarity between two strings.
19 |
20 | Phonetic measures such as Soundex match strings based on their sound. These
21 | measures have been especially effective in matching names, since names are
22 | often spelled in different ways that sound the same. For example, Meyer, Meier,
23 | and Mire sound the same, as do Smith, Smithe, and Smythe.
24 |
25 | Soundex is used primarily to match surnames. It does not work as well for names
26 | of East Asian origin, because much of the discriminating power of these names
27 | resides in the vowel sounds, which the code ignores.
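        The method below compares 4-character Soundex codes: keep the first
        letter, drop vowels and H/W/Y from the remainder, map the remaining
        consonants to digit classes (B,F,P,V -> 1; C,G,J,K,Q,S,X,Z -> 2;
        D,T -> 3; L -> 4; M,N -> 5; R -> 6), and truncate. For example, both
        'Robert' and 'Rupert' encode to R163, hence the score of 1 in the
        examples below.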
28 | 29 | Args: 30 | string1,string2 (str): Input strings 31 | 32 | Returns: 33 | Soundex similarity score (int) is returned 34 | 35 | Raises: 36 | TypeError : If the inputs are not strings 37 | 38 | Examples: 39 | >>> s = Soundex() 40 | >>> s.get_raw_score('Robert', 'Rupert') 41 | 1 42 | >>> s.get_raw_score('Sue', 's') 43 | 1 44 | >>> s.get_raw_score('Gough', 'Goff') 45 | 0 46 | >>> s.get_raw_score('a,,li', 'ali') 47 | 1 48 | 49 | """ 50 | # input validations 51 | utils.sim_check_for_none(string1, string2) 52 | utils.sim_check_for_string_inputs(string1, string2) 53 | 54 | # remove all chars but alphanumeric characters 55 | string1 = re.sub("[^a-zA-Z0-9]", "", string1) 56 | string2 = re.sub("[^a-zA-Z0-9]", "", string2) 57 | 58 | utils.sim_check_for_zero_len(string1, string2) 59 | 60 | if utils.sim_check_for_exact_match(string1, string2): 61 | return 1 62 | 63 | string1, string2 = string1.upper(), string2.upper() 64 | first_letter1, first_letter2 = string1[0], string2[0] 65 | string1, string2 = string1[1:], string2[1:] 66 | 67 | # remove occurrences of vowels, 'y', 'w' and 'h' 68 | string1 = re.sub('[AEIOUYWH]', '', string1) 69 | string2 = re.sub('[AEIOUYWH]', '', string2) 70 | 71 | # replace (B,F,P,V)->1 (C,G,J,K,Q,S,X,Z)->2 (D,T)->3 (L)->4 72 | # (M,N)->5 (R)->6 73 | string1 = re.sub('[BFPV]', '1', string1) 74 | string1 = re.sub('[CGJKQSXZ]', '2', string1) 75 | string1 = re.sub('[DT]', '3', string1) 76 | string1 = re.sub('[L]', '4', string1) 77 | string1 = re.sub('[MN]', '5', string1) 78 | string1 = re.sub('[R]', '6', string1) 79 | 80 | string2 = re.sub('[BFPV]', '1', string2) 81 | string2 = re.sub('[CGJKQSXZ]', '2', string2) 82 | string2 = re.sub('[DT]', '3', string2) 83 | string2 = re.sub('[L]', '4', string2) 84 | string2 = re.sub('[MN]', '5', string2) 85 | string2 = re.sub('[R]', '6', string2) 86 | 87 | string1 = first_letter1 + string1[:3] 88 | string2 = first_letter2 + string2[:3] 89 | 90 | return 1 if string1 == string2 else 0 91 | 92 | def get_sim_score(self, string1, string2): 93 | """ 94 | Computes the normalized soundex similarity between two strings. 95 | 96 | Args: 97 | string1,string2 (str): Input strings 98 | 99 | Returns: 100 | Normalized soundex similarity (int) 101 | 102 | Raises: 103 | TypeError : If the inputs are not strings or if one of the inputs is None. 104 | 105 | Examples: 106 | >>> s = Soundex() 107 | >>> s.get_sim_score('Robert', 'Rupert') 108 | 1 109 | >>> s.get_sim_score('Sue', 's') 110 | 1 111 | >>> s.get_sim_score('Gough', 'Goff') 112 | 0 113 | >>> s.get_sim_score('a,,li', 'ali') 114 | 1 115 | 116 | """ 117 | return self.get_raw_score(string1, string2) 118 | -------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/tfidf.py: -------------------------------------------------------------------------------- 1 | from math import log, sqrt 2 | import collections 3 | 4 | from py_stringmatching import utils 5 | from py_stringmatching.similarity_measure.token_similarity_measure import \ 6 | TokenSimilarityMeasure 7 | 8 | 9 | class TfIdf(TokenSimilarityMeasure): 10 | """Computes TF/IDF measure. 11 | 12 | This measure employs the notion of TF/IDF score commonly used in information retrieval (IR) to 13 | find documents that are relevant to keyword queries. The intuition underlying the TF/IDF measure 14 | is that two strings are similar if they share distinguishing terms. 
See the string matching chapter in the book "Principles of Data Integration" 15 | 16 | Args: 17 | corpus_list (list of lists): The corpus that will be used to compute TF and IDF values. This corpus is a list of strings, where each string has been tokenized into a list of tokens (that is, a bag of tokens). The default is set to None. In this case, when we call this TF/IDF measure on two input strings (using get_raw_score or get_sim_score), the corpus is taken to be the list of those two strings. 18 | dampen (boolean): Flag to indicate whether 'log' should be used in TF and IDF formulas (defaults to True). 19 | 20 | Attributes: 21 | dampen (boolean): An attribute to store the dampen flag. 22 | """ 23 | 24 | def __init__(self, corpus_list=None, dampen=True): 25 | self.__corpus_list = corpus_list 26 | self.__document_frequency = {} 27 | self.__compute_document_frequency() 28 | self.__corpus_size = 0 if self.__corpus_list is None else ( 29 | len(self.__corpus_list)) 30 | self.dampen = dampen 31 | super(TfIdf, self).__init__() 32 | 33 | def get_raw_score(self, bag1, bag2): 34 | """Computes the raw TF/IDF score between two lists. 35 | 36 | Args: 37 | bag1,bag2 (list): Input lists. 38 | 39 | Returns: 40 | TF/IDF score between the input lists (float). 41 | 42 | Raises: 43 | TypeError : If the inputs are not lists or if one of the inputs is None. 44 | 45 | Examples: 46 | 47 | >>> # here the corpus is a list of three strings that 48 | >>> # have been tokenized into three lists of tokens 49 | >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']]) 50 | >>> tfidf.get_raw_score(['a', 'b', 'a'], ['b', 'c']) 51 | 0.7071067811865475 52 | >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a']) 53 | 0.0 54 | >>> tfidf = TfIdf([['x', 'y'], ['w'], ['q']]) 55 | >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a']) 56 | 0.0 57 | >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], False) 58 | >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a', 'c']) 59 | 0.25298221281347033 60 | >>> tfidf = TfIdf(dampen=False) 61 | >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a']) 62 | 0.7071067811865475 63 | >>> tfidf = TfIdf() 64 | >>> tfidf.get_raw_score(['a', 'b', 'a'], ['a']) 65 | 0.0 66 | """ 67 | # input validations 68 | utils.sim_check_for_none(bag1, bag2) 69 | utils.sim_check_for_list_or_set_inputs(bag1, bag2) 70 | 71 | # if the strings match exactly return 1.0 72 | if utils.sim_check_for_exact_match(bag1, bag2): 73 | return 1.0 74 | 75 | # if one of the strings is empty return 0 76 | if utils.sim_check_for_empty(bag1, bag2): 77 | return 0 78 | 79 | # term frequency for input strings 80 | tf_x, tf_y = collections.Counter(bag1), collections.Counter(bag2) 81 | 82 | # find unique elements in the input lists and their document frequency 83 | local_df = {} 84 | for element in tf_x: 85 | local_df[element] = local_df.get(element, 0) + 1 86 | for element in tf_y: 87 | local_df[element] = local_df.get(element, 0) + 1 88 | 89 | # if corpus is not provided treat input string as corpus 90 | curr_df, corpus_size = (local_df, 2) if self.__corpus_list is None else ( 91 | (self.__document_frequency, self.__corpus_size)) 92 | 93 | idf_element, v_x, v_y, v_x_y, v_x_2, v_y_2 = (0.0, 0.0, 0.0, 94 | 0.0, 0.0, 0.0) 95 | 96 | # tfidf calculation 97 | for element in local_df.keys(): 98 | df_element = curr_df.get(element) 99 | if df_element is None: 100 | continue 101 | idf_element = corpus_size * 1.0 / df_element 102 | v_x = 0 if element not in tf_x else (log(idf_element) * log(tf_x[element] + 1)) if self.dampen else ( 103 | idf_element * tf_x[element]) 
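            # with dampen=True each weight is log(idf) * log(tf + 1); with
            # dampen=False it is the raw idf * tf product. v_y below mirrors
            # v_x for the second bag, and the final score is the cosine of the
            # two weighted vectors.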
104 | v_y = 0 if element not in tf_y else (log(idf_element) * log(tf_y[element] + 1)) if self.dampen else ( 105 | idf_element * tf_y[element]) 106 | v_x_y += v_x * v_y 107 | v_x_2 += v_x * v_x 108 | v_y_2 += v_y * v_y 109 | 110 | return 0.0 if v_x_y == 0 else v_x_y / (sqrt(v_x_2) * sqrt(v_y_2)) 111 | 112 | def get_sim_score(self, bag1, bag2): 113 | """Computes the normalized TF/IDF similarity score between two lists. Simply call get_raw_score. 114 | 115 | Args: 116 | bag1,bag2 (list): Input lists. 117 | 118 | Returns: 119 | Normalized TF/IDF similarity score between the input lists (float). 120 | 121 | Raises: 122 | TypeError : If the inputs are not lists or if one of the inputs is None. 123 | 124 | Examples: 125 | 126 | >>> # here the corpus is a list of three strings that 127 | >>> # have been tokenized into three lists of tokens 128 | >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a']]) 129 | >>> tfidf.get_sim_score(['a', 'b', 'a'], ['b', 'c']) 130 | 0.7071067811865475 131 | >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a']) 132 | 0.0 133 | >>> tfidf = TfIdf([['x', 'y'], ['w'], ['q']]) 134 | >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a']) 135 | 0.0 136 | >>> tfidf = TfIdf([['a', 'b', 'a'], ['a', 'c'], ['a'], ['b']], False) 137 | >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a', 'c']) 138 | 0.25298221281347033 139 | >>> tfidf = TfIdf(dampen=False) 140 | >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a']) 141 | 0.7071067811865475 142 | >>> tfidf = TfIdf() 143 | >>> tfidf.get_sim_score(['a', 'b', 'a'], ['a']) 144 | 0.0 145 | """ 146 | return self.get_raw_score(bag1, bag2) 147 | 148 | def get_dampen(self): 149 | """Get dampen flag. 150 | 151 | Returns: 152 | dampen flag (boolean). 153 | """ 154 | return self.dampen 155 | 156 | def get_corpus_list(self): 157 | """Get corpus list. 158 | 159 | Returns: 160 | corpus list (list of lists). 161 | """ 162 | return self.__corpus_list 163 | 164 | def set_dampen(self, dampen): 165 | """Set dampen flag. 166 | 167 | Args: 168 | dampen (boolean): Flag to indicate whether 'log' should be applied to TF and IDF formulas. 169 | """ 170 | self.dampen = dampen 171 | return True 172 | 173 | def set_corpus_list(self, corpus_list): 174 | """Set corpus list. 175 | 176 | Args: 177 | corpus_list (list of lists): Corpus list. 
178 | """
179 | self.__corpus_list = corpus_list
180 | self.__document_frequency = {}
181 | self.__compute_document_frequency()
182 | self.__corpus_size = 0 if self.__corpus_list is None else (
183 | len(self.__corpus_list))
184 | return True
185 |
186 | def __compute_document_frequency(self):
187 | if self.__corpus_list is not None:
188 | for document in self.__corpus_list:
189 | for element in set(document):
190 | self.__document_frequency[element] = (
191 | self.__document_frequency.get(element, 0) + 1)
192 |
-------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/token_similarity_measure.py: --------------------------------------------------------------------------------
1 | """Token based similarity measure"""
2 |
3 | from py_stringmatching.similarity_measure.similarity_measure import \
4 | SimilarityMeasure
5 |
6 | class TokenSimilarityMeasure(SimilarityMeasure):
7 | pass
8 |
-------------------------------------------------------------------------------- /py_stringmatching/similarity_measure/token_sort.py: --------------------------------------------------------------------------------
1 | """Fuzzy Wuzzy Token Sort Similarity Measure"""
2 |
3 | from py_stringmatching import utils
4 |
5 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \
6 | SequenceSimilarityMeasure
7 | from py_stringmatching.similarity_measure.ratio import Ratio
8 |
9 |
10 | class TokenSort(SequenceSimilarityMeasure):
11 | """Computes Fuzzy Wuzzy token sort similarity measure.
12 |
13 | Fuzzy Wuzzy token sort ratio raw score is a measure of the strings' similarity as an
14 | int in the range [0, 100]. For two strings X and Y, the score is obtained by
15 | splitting the two strings into tokens and then sorting the tokens. The score is
16 | then the fuzzy wuzzy ratio raw score of the transformed strings. Fuzzy Wuzzy token
17 | sort sim score is a float in the range [0, 1] and is obtained by dividing the raw score
18 | by 100.
19 |
20 | Note:
21 | In the case where either of strings X or Y is empty, we define the
22 | Fuzzy Wuzzy token sort similarity score to be 0.
23 | """
24 | def __init__(self):
25 | pass
26 |
27 | def _process_string_and_sort(self, s, force_ascii, full_process=True):
28 | """Returns a string with tokens sorted. Processes the string if the
29 | full_process flag is enabled. If the force_ascii flag is enabled, then
30 | processing removes non-ascii characters from the string."""
31 | # pull tokens
32 | ts = utils.process_string(s, force_ascii=force_ascii) if full_process else s
33 | tokens = ts.split()
34 |
35 | # sort tokens and join
36 | sorted_string = u" ".join(sorted(tokens))
37 | return sorted_string.strip()
38 |
39 | def get_raw_score(self, string1, string2, force_ascii=True, full_process=True):
40 | """
41 | Computes the Fuzzy Wuzzy token sort measure raw score between two strings.
42 | This score is in the range [0,100].
43 |
44 | Args:
45 | string1,string2 (str) : Input strings
46 | force_ascii (boolean) : Flag to remove non-ascii characters or not
47 | full_process (boolean) : Flag to process the string or not. Processing includes
48 | removing non-alphanumeric characters, converting the string to lower case and
49 | removing leading and trailing whitespaces.
--------------------------------------------------------------------------------
/py_stringmatching/similarity_measure/token_similarity_measure.py:
--------------------------------------------------------------------------------
1 | """Token based similarity measure"""
2 | 
3 | from py_stringmatching.similarity_measure.similarity_measure import \
4 |     SimilarityMeasure
5 | 
6 | class TokenSimilarityMeasure(SimilarityMeasure):
7 |     pass
8 | 
--------------------------------------------------------------------------------
/py_stringmatching/similarity_measure/token_sort.py:
--------------------------------------------------------------------------------
1 | """Fuzzy Wuzzy Token Sort Similarity Measure"""
2 | 
3 | from py_stringmatching import utils
4 | 
5 | from py_stringmatching.similarity_measure.sequence_similarity_measure import \
6 |     SequenceSimilarityMeasure
7 | from py_stringmatching.similarity_measure.ratio import Ratio
8 | 
9 | 
10 | class TokenSort(SequenceSimilarityMeasure):
11 |     """Computes Fuzzy Wuzzy token sort similarity measure.
12 | 
13 |     The Fuzzy Wuzzy token sort ratio raw score is a measure of the strings' similarity as an
14 |     int in the range [0, 100]. For two strings X and Y, the score is obtained by
15 |     splitting the two strings into tokens and then sorting the tokens. The score is
16 |     then the Fuzzy Wuzzy ratio raw score of the transformed strings. The Fuzzy Wuzzy token
17 |     sort similarity score is a float in the range [0, 1], obtained by dividing the raw score
18 |     by 100.
19 | 
20 |     Note:
21 |         In the case where either of strings X or Y is empty, we define the
22 |         Fuzzy Wuzzy token sort similarity score to be 0.
23 |     """
24 |     def __init__(self):
25 |         pass
26 | 
27 |     def _process_string_and_sort(self, s, force_ascii, full_process=True):
28 |         """Returns a string with tokens sorted. Processes the string if the
29 |         full_process flag is enabled. If the force_ascii flag is enabled, then
30 |         processing removes non-ascii characters from the string."""
31 |         # pull tokens
32 |         ts = utils.process_string(s, force_ascii=force_ascii) if full_process else s
33 |         tokens = ts.split()
34 | 
35 |         # sort tokens and join
36 |         sorted_string = u" ".join(sorted(tokens))
37 |         return sorted_string.strip()
38 | 
39 |     def get_raw_score(self, string1, string2, force_ascii=True, full_process=True):
40 |         """
41 |         Computes the Fuzzy Wuzzy token sort measure raw score between two strings.
42 |         This score is in the range [0,100].
43 | 
44 |         Args:
45 |             string1,string2 (str): Input strings
46 |             force_ascii (boolean): Flag to remove non-ascii characters or not
47 |             full_process (boolean): Flag to process the string or not. Processing includes
48 |                 removing non-alphanumeric characters, converting string to lower case and
49 |                 removing leading and trailing whitespaces.
50 | 
51 |         Returns:
52 |             Token Sort measure raw score (int) is returned
53 | 
54 |         Raises:
55 |             TypeError: If the inputs are not strings
56 | 
57 |         Examples:
58 |             >>> s = TokenSort()
59 |             >>> s.get_raw_score('great is scala', 'java is great')
60 |             81
61 |             >>> s.get_raw_score('Sue', 'sue')
62 |             100
63 |             >>> s.get_raw_score('C++ and Java', 'Java and Python')
64 |             64
65 | 
66 |         References:
67 |             * https://pypi.python.org/pypi/fuzzywuzzy
68 |         """
69 |         # input validations
70 |         utils.sim_check_for_none(string1, string2)
71 |         utils.sim_check_for_string_inputs(string1, string2)
72 | 
73 |         # if one of the strings is empty return 0
74 |         if utils.sim_check_for_empty(string1, string2):
75 |             return 0
76 | 
77 |         sorted1 = self._process_string_and_sort(string1, force_ascii, full_process=full_process)
78 |         sorted2 = self._process_string_and_sort(string2, force_ascii, full_process=full_process)
79 |         ratio = Ratio()
80 |         return ratio.get_raw_score(sorted1, sorted2)
81 | 
82 |     def get_sim_score(self, string1, string2, force_ascii=True, full_process=True):
83 |         """
84 |         Computes the Fuzzy Wuzzy token sort similarity score between two strings.
85 |         This score is in the range [0,1].
86 | 
87 |         Args:
88 |             string1,string2 (str): Input strings
89 |             force_ascii (boolean): Flag to remove non-ascii characters or not
90 |             full_process (boolean): Flag to process the string or not. Processing includes
91 |                 removing non-alphanumeric characters, converting string to lower case and
92 |                 removing leading and trailing whitespaces.
93 | 
94 |         Returns:
95 |             Token Sort measure similarity score (float) is returned
96 | 
97 |         Raises:
98 |             TypeError: If the inputs are not strings
99 | 
100 |         Examples:
101 |             >>> s = TokenSort()
102 |             >>> s.get_sim_score('great is scala', 'java is great')
103 |             0.81
104 |             >>> s.get_sim_score('Sue', 'sue')
105 |             1.0
106 |             >>> s.get_sim_score('C++ and Java', 'Java and Python')
107 |             0.64
108 | 
109 |         References:
110 |             * https://pypi.python.org/pypi/fuzzywuzzy
111 |         """
112 |         raw_score = 1.0 * self.get_raw_score(string1, string2, force_ascii, full_process)
113 |         sim_score = raw_score / 100
114 |         return sim_score
115 | 
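As the docstring above describes, TokenSort normalizes, tokenizes, and alphabetically re-joins both strings, then scores them with the plain Ratio measure. A minimal sketch, not part of the repository, checking that equivalence on the docstring example:

from py_stringmatching.similarity_measure.ratio import Ratio
from py_stringmatching.similarity_measure.token_sort import TokenSort

s1, s2 = 'great is scala', 'java is great'
sorted1 = ' '.join(sorted(s1.split()))  # 'great is scala'
sorted2 = ' '.join(sorted(s2.split()))  # 'great is java'

# Both sides evaluate to 81, the value shown in the docstring example.
assert TokenSort().get_raw_score(s1, s2) == Ratio().get_raw_score(sorted1, sorted2)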
--------------------------------------------------------------------------------
/py_stringmatching/similarity_measure/tversky_index.py:
--------------------------------------------------------------------------------
1 | """Tversky index similarity measure"""
2 | 
3 | from py_stringmatching import utils
4 | from py_stringmatching.similarity_measure.token_similarity_measure import \
5 |     TokenSimilarityMeasure
6 | 
7 | 
8 | class TverskyIndex(TokenSimilarityMeasure):
9 |     """Tversky index similarity measure class.
10 | 
11 |     Parameters:
12 |         alpha, beta (float): Tversky index parameters (defaults to 0.5).
13 |     """
14 |     def __init__(self, alpha=0.5, beta=0.5):
15 |         # validate alpha and beta
16 |         utils.sim_check_tversky_parameters(alpha, beta)
17 | 
18 |         self.alpha = alpha
19 |         self.beta = beta
20 |         super(TverskyIndex, self).__init__()
21 | 
22 |     def get_raw_score(self, set1, set2):
23 |         """
24 |         Computes the Tversky index similarity between two sets.
25 | 
26 |         The Tversky index is an asymmetric similarity measure on sets that compares a variant to a prototype. The
27 |         Tversky index can be seen as a generalization of Dice's coefficient and the Tanimoto coefficient.
28 | 
29 |         For sets X and Y, the Tversky index is a number between 0 and 1 given by
30 |         :math:`tversky\\_index(X, Y) = \\frac{|X \\cap Y|}{|X \\cap Y| + \\alpha |X-Y| + \\beta |Y-X|}`
31 |         where :math:`\\alpha, \\beta \\geq 0`.
32 | 
33 |         Args:
34 |             set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.
35 | 
36 |         Returns:
37 |             Tversky index similarity (float)
38 | 
39 |         Raises:
40 |             TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
41 | 
42 |         Examples:
43 |             >>> tvi = TverskyIndex()
44 |             >>> tvi.get_raw_score(['data', 'science'], ['data'])
45 |             0.6666666666666666
46 |             >>> tvi.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
47 |             0.5
48 |             >>> tvi.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
49 |             0.5454545454545454
50 |             >>> tvi = TverskyIndex(0.5, 0.5)
51 |             >>> tvi.get_raw_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
52 |             0.5454545454545454
53 |             >>> tvi = TverskyIndex(beta=0.5)
54 |             >>> tvi.get_raw_score(['data', 'management'], ['data', 'data', 'science'])
55 |             0.5
56 |         """
57 |         # input validations
58 |         utils.sim_check_for_none(set1, set2)
59 |         utils.sim_check_for_list_or_set_inputs(set1, set2)
60 | 
61 |         # if exact match return 1.0
62 |         if utils.sim_check_for_exact_match(set1, set2):
63 |             return 1.0
64 | 
65 |         # if one of the inputs is empty return 0
66 |         if utils.sim_check_for_empty(set1, set2):
67 |             return 0
68 | 
69 |         if not isinstance(set1, set):
70 |             set1 = set(set1)
71 |         if not isinstance(set2, set):
72 |             set2 = set(set2)
73 |         intersection = float(len(set1 & set2))
74 | 
75 |         return 1.0 * intersection / (intersection +
76 |             (self.alpha * len(set1 - set2)) + (self.beta * len(set2 - set1)))
77 | 
78 |     def get_sim_score(self, set1, set2):
79 |         """
80 |         Computes the normalized Tversky index similarity between two sets.
81 | 
82 |         Args:
83 |             set1,set2 (set or list): Input sets (or lists). Input lists are converted to sets.
84 | 
85 |         Returns:
86 |             Normalized Tversky index similarity (float)
87 | 
88 |         Raises:
89 |             TypeError : If the inputs are not sets (or lists) or if one of the inputs is None.
90 | 
91 |         Examples:
92 |             >>> tvi = TverskyIndex()
93 |             >>> tvi.get_sim_score(['data', 'science'], ['data'])
94 |             0.6666666666666666
95 |             >>> tvi.get_sim_score(['data', 'management'], ['data', 'data', 'science'])
96 |             0.5
97 |             >>> tvi.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
98 |             0.5454545454545454
99 |             >>> tvi = TverskyIndex(0.5, 0.5)
100 |             >>> tvi.get_sim_score({1, 1, 2, 3, 4}, {2, 3, 4, 5, 6, 7, 7, 8})
101 |             0.5454545454545454
102 |             >>> tvi = TverskyIndex(beta=0.5)
103 |             >>> tvi.get_sim_score(['data', 'management'], ['data', 'data', 'science'])
104 |             0.5
105 | 
106 |         """
107 |         return self.get_raw_score(set1, set2)
108 | 
109 |     def get_alpha(self):
110 |         """
111 |         Get alpha
112 | 
113 |         Returns:
114 |             alpha (float)
115 |         """
116 |         return self.alpha
117 | 
118 |     def get_beta(self):
119 |         """
120 |         Get beta
121 | 
122 |         Returns:
123 |             beta (float)
124 |         """
125 |         return self.beta
126 | 
127 |     def set_alpha(self, alpha):
128 |         """
129 |         Set alpha
130 | 
131 |         Args:
132 |             alpha (float): Tversky index parameter
133 |         """
134 |         self.alpha = alpha
135 |         return True
136 | 
137 |     def set_beta(self, beta):
138 |         """
139 |         Set beta
140 | 
141 |         Args:
142 |             beta (float): Tversky index parameter
143 |         """
144 |         self.beta = beta
145 |         return True
146 | 
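As the formula in get_raw_score implies, the Tversky index reduces to the Dice coefficient for alpha = beta = 0.5 and to the Jaccard coefficient for alpha = beta = 1. A quick check, not part of the repository:

from py_stringmatching.similarity_measure.tversky_index import TverskyIndex

X, Y = {'data', 'science'}, {'data'}
print(TverskyIndex().get_raw_score(X, Y))          # 0.666... = Dice: 2|X & Y| / (|X| + |Y|)
print(TverskyIndex(1.0, 1.0).get_raw_score(X, Y))  # 0.5      = Jaccard: |X & Y| / |X | Y|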
--------------------------------------------------------------------------------
/py_stringmatching/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhaidgroup/py_stringmatching/658860f0cad65e1e3da07e7039fe9764d4822272/py_stringmatching/tests/__init__.py
--------------------------------------------------------------------------------
/py_stringmatching/tests/test_sim_Soundex.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | 
3 | import unittest
4 | 
5 | from py_stringmatching.similarity_measure.soundex import Soundex
6 | 
7 | from .utils import raises
8 | 
9 | 
10 | class SoundexTestCases(unittest.TestCase):
11 |     def setUp(self):
12 |         self.sdx = Soundex()
13 | 
14 |     def test_valid_input_raw_score(self):
15 |         self.assertEqual(self.sdx.get_raw_score('Robert', 'Rupert'), 1)
16 |         self.assertEqual(self.sdx.get_raw_score('Sue', 'S'), 1)
17 |         self.assertEqual(self.sdx.get_raw_score('robert', 'rupert'), 1)
18 |         self.assertEqual(self.sdx.get_raw_score('Gough', 'goff'), 0)
19 |         self.assertEqual(self.sdx.get_raw_score('gough', 'Goff'), 0)
20 |         self.assertEqual(self.sdx.get_raw_score('ali', 'a,,,li'), 1)
21 |         self.assertEqual(self.sdx.get_raw_score('Jawornicki', 'Yavornitzky'), 0)
22 |         self.assertEqual(self.sdx.get_raw_score('Robert', 'Robert'), 1)
23 |         self.assertEqual(self.sdx.get_raw_score('Ris..h.ab', 'Ris;hab.'), 1)
24 |         self.assertEqual(self.sdx.get_raw_score('gough', 'G2'), 1)
25 |         self.assertEqual(self.sdx.get_raw_score('robert', 'R1:6:3'), 1)
26 | 
27 |     def test_valid_input_sim_score(self):
28 |         self.assertEqual(self.sdx.get_sim_score('Robert', 'Rupert'), 1)
29 |         self.assertEqual(self.sdx.get_sim_score('Sue', 'S'), 1)
30 |         self.assertEqual(self.sdx.get_sim_score('robert', 'rupert'), 1)
31 |         self.assertEqual(self.sdx.get_sim_score('Gough', 'goff'), 0)
32 |         self.assertEqual(self.sdx.get_sim_score('gough', 'Goff'), 0)
33 |         self.assertEqual(self.sdx.get_sim_score('ali', 'a,,,li'), 1)
34 |         self.assertEqual(self.sdx.get_sim_score('Jawornicki', 'Yavornitzky'), 0)
35 |         self.assertEqual(self.sdx.get_sim_score('Robert', 'Robert'), 1)
36 |         self.assertEqual(self.sdx.get_sim_score('Ris..h.ab', 'Ris;hab.'), 1)
37 |         self.assertEqual(self.sdx.get_sim_score('Gough', 'G2'), 1)
38 |         self.assertEqual(self.sdx.get_sim_score('gough', 'G2'), 1)
39 |         self.assertEqual(self.sdx.get_sim_score('robert', 'R1:6:3'), 1)
40 | 
41 |     @raises(TypeError)
42 |     def test_invalid_input1_raw_score(self):
43 |         self.sdx.get_raw_score('a', None)
44 | 
45 |     @raises(TypeError)
46 |     def test_invalid_input2_raw_score(self):
47 |         self.sdx.get_raw_score(None, 'b')
48 | 
49 |     @raises(TypeError)
50 |     def test_invalid_input3_raw_score(self):
51 |         self.sdx.get_raw_score(None, None)
52 | 
53 |     @raises(ValueError)
54 |     def test_invalid_input4_raw_score(self):
55 |         self.sdx.get_raw_score('a', '')
56 | 
57 |     @raises(ValueError)
58 |     def test_invalid_input5_raw_score(self):
59 |         self.sdx.get_raw_score('', 'This is a long string')
60 | 
61 |     @raises(TypeError)
62 |     def test_invalid_input7_raw_score(self):
63 |         self.sdx.get_raw_score('xyz', [''])
64 | 
65 |     @raises(TypeError)
66 |     def test_invalid_input1_sim_score(self):
67 |         self.sdx.get_sim_score('a', None)
68 | 
69 |     @raises(TypeError)
70 |     def test_invalid_input2_sim_score(self):
71 |         self.sdx.get_sim_score(None, 'b')
72 | 
73 |     @raises(TypeError)
74 |     def test_invalid_input3_sim_score(self):
75 |         self.sdx.get_sim_score(None, None)
76 | 
77 |     @raises(ValueError)
78 |     def test_invalid_input4_sim_score(self):
79 |         self.sdx.get_sim_score('a', '')
80 | 
81 |     @raises(ValueError)
82 |     def test_invalid_input5_sim_score(self):
83 |         self.sdx.get_sim_score('', 'This is a long string')
84 | 
85 |     @raises(TypeError)
86 |     def test_invalid_input7_sim_score(self):
87 |         self.sdx.get_sim_score('xyz', [''])
88 | 
89 |     @raises(ValueError)
90 |     def test_invalid_input8_sim_score(self):
91 |         self.sdx.get_sim_score('..,', '..abc.')
92 | 
93 |     @raises(ValueError)
94 |     def test_invalid_input9_sim_score(self):
95 |         self.sdx.get_sim_score('..', '')
96 | 
97 |     @raises(ValueError)
98 |     def test_invalid_input10_sim_score(self):
99 |         self.sdx.get_sim_score('.', '..abc,,')
100 | 
101 |     @raises(TypeError)
102 |     def test_invalid_input11_sim_score(self):
103 |         self.sdx.get_sim_score('abc', 123)
104 | 
--------------------------------------------------------------------------------
/py_stringmatching/tests/utils.py:
--------------------------------------------------------------------------------
1 | # Simplified knockoff of nose.tools.raises
2 | def raises(exc_type):
3 |     def deco(f):
4 |         def raises_wrapper(self):
5 |             with self.assertRaises(exc_type):
6 |                 return f(self)
7 |         return raises_wrapper
8 |     return deco
9 | 
--------------------------------------------------------------------------------
/py_stringmatching/tokenizer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/anhaidgroup/py_stringmatching/658860f0cad65e1e3da07e7039fe9764d4822272/py_stringmatching/tokenizer/__init__.py
--------------------------------------------------------------------------------
/py_stringmatching/tokenizer/alphabetic_tokenizer.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | from py_stringmatching import utils
4 | from py_stringmatching.tokenizer.definition_tokenizer import DefinitionTokenizer
5 | 
6 | 
7 | class AlphabeticTokenizer(DefinitionTokenizer):
8 |     """Returns tokens that are maximal sequences of consecutive alphabetical characters.
9 | 
10 |     Args:
11 |         return_set (boolean): A flag to indicate whether to return a set of tokens instead of a bag of tokens (defaults to False).
12 | 
13 |     Attributes:
14 |         return_set (boolean): An attribute that stores the value for the flag return_set.
15 |     """
16 | 
17 |     def __init__(self, return_set=False):
18 |         self.__al_regex = re.compile('[a-zA-Z]+')
19 |         super(AlphabeticTokenizer, self).__init__(return_set)
20 | 
21 |     def tokenize(self, input_string):
22 |         """Tokenizes input string into alphabetical tokens.
23 | 
24 |         Args:
25 |             input_string (str): The string to be tokenized.
26 | 
27 |         Returns:
28 |             A Python list, which represents a set of tokens if the flag return_set is True, and a bag of tokens otherwise.
29 | 
30 |         Raises:
31 |             TypeError : If the input is not a string.
32 | 
33 |         Examples:
34 |             >>> al_tok = AlphabeticTokenizer()
35 |             >>> al_tok.tokenize('data99science, data#integration.')
36 |             ['data', 'science', 'data', 'integration']
37 |             >>> al_tok.tokenize('99')
38 |             []
39 |             >>> al_tok = AlphabeticTokenizer(return_set=True)
40 |             >>> al_tok.tokenize('data99science, data#integration.')
41 |             ['data', 'science', 'integration']
42 |         """
43 |         utils.tok_check_for_none(input_string)
44 |         utils.tok_check_for_string_input(input_string)
45 | 
46 |         token_list = list(filter(None, self.__al_regex.findall(input_string)))
47 | 
48 |         if self.return_set:
49 |             return utils.convert_bag_to_set(token_list)
50 | 
51 |         return token_list
52 | 
--------------------------------------------------------------------------------
/py_stringmatching/tokenizer/alphanumeric_tokenizer.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | from py_stringmatching import utils
4 | from py_stringmatching.tokenizer.definition_tokenizer import DefinitionTokenizer
5 | 
6 | 
7 | class AlphanumericTokenizer(DefinitionTokenizer):
8 |     """Returns tokens that are maximal sequences of consecutive alphanumeric characters.
9 | 
10 |     Args:
11 |         return_set (boolean): A flag to indicate whether to return a set of
12 |             tokens instead of a bag of tokens (defaults to False).
13 | 
14 |     Attributes:
15 |         return_set (boolean): An attribute to store the value of the flag return_set.
16 |     """
17 | 
18 |     def __init__(self, return_set=False):
19 |         self.__alnum_regex = re.compile('[a-zA-Z0-9]+')
20 |         super(AlphanumericTokenizer, self).__init__(return_set)
21 | 
22 |     def tokenize(self, input_string):
23 |         """Tokenizes input string into alphanumeric tokens.
24 | 
25 |         Args:
26 |             input_string (str): The string to be tokenized.
27 | 
28 |         Returns:
29 |             A Python list, which represents a set of tokens if the flag return_set is True, and a bag of tokens otherwise.
30 | 
31 |         Raises:
32 |             TypeError : If the input is not a string.
33 | 
34 |         Examples:
35 |             >>> alnum_tok = AlphanumericTokenizer()
36 |             >>> alnum_tok.tokenize('data9,(science), data9#.(integration).88')
37 |             ['data9', 'science', 'data9', 'integration', '88']
38 |             >>> alnum_tok.tokenize('#.&')
39 |             []
40 |             >>> alnum_tok = AlphanumericTokenizer(return_set=True)
41 |             >>> alnum_tok.tokenize('data9,(science), data9#.(integration).88')
42 |             ['data9', 'science', 'integration', '88']
43 | 
44 |         """
45 |         utils.tok_check_for_none(input_string)
46 |         utils.tok_check_for_string_input(input_string)
47 | 
48 |         token_list = list(filter(None,
49 |                                  self.__alnum_regex.findall(input_string)))
50 | 
51 |         if self.return_set:
52 |             return utils.convert_bag_to_set(token_list)
53 | 
54 |         return token_list
55 | 
--------------------------------------------------------------------------------
/py_stringmatching/tokenizer/definition_tokenizer.py:
--------------------------------------------------------------------------------
1 | from py_stringmatching.tokenizer.tokenizer import Tokenizer
2 | 
3 | 
4 | class DefinitionTokenizer(Tokenizer):
5 |     """A class of tokenizers that uses a definition to find tokens, as opposed to using delimiters.
6 | 
7 |     Examples of definitions include alphabetical tokens and qgram tokens. Examples of delimiters include white space and punctuation.
8 | 
9 |     Args:
10 |         return_set (boolean): A flag to indicate whether to return a set of
11 |             tokens instead of a bag of tokens (defaults to False).
12 | 
13 |     Attributes:
14 |         return_set (boolean): An attribute to store the flag return_set.
15 |     """
16 | 
17 |     def __init__(self, return_set=False):
18 |         super(DefinitionTokenizer, self).__init__(return_set)
19 | 
--------------------------------------------------------------------------------
/py_stringmatching/tokenizer/delimiter_tokenizer.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | from py_stringmatching import utils
4 | from py_stringmatching.tokenizer.tokenizer import Tokenizer
5 | 
6 | 
7 | class DelimiterTokenizer(Tokenizer):
8 |     """Uses delimiters to find tokens, as opposed to using definitions.
9 | 
10 |     Examples of delimiters include white space and punctuation. Examples of definitions include alphabetical and qgram tokens.
11 | 
12 |     Args:
13 |         delim_set (set): A set of delimiter strings (defaults to space delimiter).
14 |         return_set (boolean): A flag to indicate whether to return a set of
15 |             tokens instead of a bag of tokens (defaults to False).
16 | 
17 |     Attributes:
18 |         return_set (boolean): An attribute to store the value of the flag return_set.
19 |     """
20 | 
21 |     def __init__(self,
22 |                  delim_set=set([' ']), return_set=False):
23 |         self.__delim_set = None
24 |         self.__use_split = None
25 |         self.__delim_str = None
26 |         self.__delim_regex = None
27 |         self._update_delim_set(delim_set)
28 |         super(DelimiterTokenizer, self).__init__(return_set)
29 | 
30 |     def tokenize(self, input_string):
31 |         """Tokenizes input string based on the set of delimiters.
32 | 
33 |         Args:
34 |             input_string (str): The string to be tokenized.
35 | 
36 |         Returns:
37 |             A Python list which is a set or a bag of tokens, depending on whether the return_set flag is set to True or False.
38 | 
39 |         Raises:
40 |             TypeError : If the input is not a string.
41 | 
42 |         Examples:
43 |             >>> delim_tok = DelimiterTokenizer()
44 |             >>> delim_tok.tokenize('data science')
45 |             ['data', 'science']
46 |             >>> delim_tok = DelimiterTokenizer(['$#$'])
47 |             >>> delim_tok.tokenize('data$#$science')
48 |             ['data', 'science']
49 |             >>> delim_tok = DelimiterTokenizer([',', '.'])
50 |             >>> delim_tok.tokenize('data,science.data,integration.')
51 |             ['data', 'science', 'data', 'integration']
52 |             >>> delim_tok = DelimiterTokenizer([',', '.'], return_set=True)
53 |             >>> delim_tok.tokenize('data,science.data,integration.')
54 |             ['data', 'science', 'integration']
55 | 
56 |         """
57 |         utils.tok_check_for_none(input_string)
58 |         utils.tok_check_for_string_input(input_string)
59 | 
60 |         if self.__use_split:
61 |             token_list = list(filter(None,
62 |                                      input_string.split(self.__delim_str)))
63 |         else:
64 |             token_list = list(filter(None,
65 |                                      self.__delim_regex.split(input_string)))
66 | 
67 |         if self.return_set:
68 |             return utils.convert_bag_to_set(token_list)
69 | 
70 |         return token_list
71 | 
72 |     def get_delim_set(self):
73 |         """Gets the current set of delimiters.
74 | 
75 |         Returns:
76 |             A Python set which is the current set of delimiters.
77 |         """
78 |         return self.__delim_set
79 | 
80 |     def set_delim_set(self, delim_set):
81 |         """Sets the current set of delimiters.
82 | 
83 |         Args:
84 |             delim_set (set): A set of delimiter strings.
85 |         """
86 |         return self._update_delim_set(delim_set)
87 | 
88 |     def _update_delim_set(self, delim_set):
89 |         if not isinstance(delim_set, set):
90 |             delim_set = set(delim_set)
91 |         self.__delim_set = delim_set
92 |         # if there is only one delimiter string, use split instead of regex
93 |         self.__use_split = False
94 |         if len(self.__delim_set) == 1:
95 |             self.__delim_str = list(self.__delim_set)[0]
96 |             self.__use_split = True
97 |         else:
98 |             self.__delim_regex = re.compile('|'.join(
99 |                 map(re.escape, self.__delim_set)))
100 |         return True
101 | 
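The dispatch in _update_delim_set above is worth seeing in isolation: a single delimiter is handled with str.split, while several delimiters are escaped and OR-ed into one regular expression. A minimal sketch, not part of the repository:

import re

delim_set = {',', '.'}
delim_regex = re.compile('|'.join(map(re.escape, delim_set)))
print(list(filter(None, delim_regex.split('data,science.data,integration.'))))
# ['data', 'science', 'data', 'integration']

# The single-delimiter fast path is just str.split.
print(list(filter(None, 'data$#$science'.split('$#$'))))  # ['data', 'science']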
--------------------------------------------------------------------------------
/py_stringmatching/tokenizer/tokenizer.py:
--------------------------------------------------------------------------------
1 | class Tokenizer(object):
2 |     """The root class for tokenizers.
3 | 
4 |     Args:
5 |         return_set (boolean): A flag to indicate whether to return a set of
6 |             tokens instead of a bag of tokens (defaults to False).
7 | 
8 |     Attributes:
9 |         return_set (boolean): An attribute to store the flag return_set.
10 |     """
11 | 
12 |     def __init__(self, return_set=False):
13 |         self.return_set = return_set
14 | 
15 |     def get_return_set(self):
16 |         """Gets the value of the return_set flag.
17 | 
18 |         Returns:
19 |             The boolean value of the return_set flag.
20 |         """
21 |         return self.return_set
22 | 
23 |     def set_return_set(self, return_set):
24 |         """Sets the value of the return_set flag.
25 | 
26 |         Args:
27 |             return_set (boolean): a flag to indicate whether to return a set of tokens instead of a bag of tokens.
28 |         """
29 |         self.return_set = return_set
30 |         return True
31 | 
--------------------------------------------------------------------------------
/py_stringmatching/tokenizer/whitespace_tokenizer.py:
--------------------------------------------------------------------------------
1 | from py_stringmatching import utils
2 | from py_stringmatching.tokenizer.delimiter_tokenizer import DelimiterTokenizer
3 | 
4 | 
5 | class WhitespaceTokenizer(DelimiterTokenizer):
6 |     """Segments the input string using whitespace, then returns the segments as tokens.
7 | 
8 |     This tokenizer currently uses Python's split function, so 'whitespace'
9 |     refers to the space character as well as the tab and newline characters.
10 | 
11 |     Args:
12 |         return_set (boolean): A flag to indicate whether to return a set of
13 |             tokens instead of a bag of tokens (defaults to False).
14 | 
15 |     Attributes:
16 |         return_set (boolean): An attribute to store the flag return_set.
17 |     """
18 | 
19 |     def __init__(self, return_set=False):
20 |         super(WhitespaceTokenizer, self).__init__([' ', '\t', '\n'], return_set)
21 | 
22 |     def tokenize(self, input_string):
23 |         """Tokenizes input string based on white space.
24 | 
25 |         Args:
26 |             input_string (str): The string to be tokenized.
27 | 
28 |         Returns:
29 |             A Python list, which is a set or a bag of tokens, depending on whether return_set is True or False.
30 | 
31 |         Raises:
32 |             TypeError : If the input is not a string.
33 | 
34 |         Examples:
35 |             >>> ws_tok = WhitespaceTokenizer()
36 |             >>> ws_tok.tokenize('data science')
37 |             ['data', 'science']
38 |             >>> ws_tok.tokenize('data        science')
39 |             ['data', 'science']
40 |             >>> ws_tok.tokenize('data\tscience')
41 |             ['data', 'science']
42 |             >>> ws_tok = WhitespaceTokenizer(return_set=True)
43 |             >>> ws_tok.tokenize('data science data integration')
44 |             ['data', 'science', 'integration']
45 |         """
46 |         utils.tok_check_for_none(input_string)
47 |         utils.tok_check_for_string_input(input_string)
48 | 
49 |         token_list = list(filter(None, input_string.split()))
50 | 
51 |         if self.return_set:
52 |             return utils.convert_bag_to_set(token_list)
53 | 
54 |         return token_list
55 | 
56 |     def set_delim_set(self, delim_set):
57 |         raise AttributeError('Delimiters cannot be set for WhitespaceTokenizer')
58 | 
--------------------------------------------------------------------------------
/py_stringmatching/utils.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | """
4 | This module defines a list of utility and validation functions.
5 | """ 6 | 7 | 8 | def sim_check_for_none(*args): 9 | if len(args) > 0 and args[0] is None: 10 | raise TypeError("First argument cannot be None") 11 | if len(args) > 1 and args[1] is None: 12 | raise TypeError("Second argument cannot be None") 13 | 14 | 15 | def sim_check_for_empty(*args): 16 | if len(args[0]) == 0 or len(args[1]) == 0: 17 | return True 18 | 19 | 20 | def sim_check_for_same_len(*args): 21 | if len(args[0]) != len(args[1]): 22 | raise ValueError("Undefined for sequences of unequal length") 23 | 24 | 25 | def sim_check_for_string_inputs(*args): 26 | if not isinstance(args[0], str): 27 | raise TypeError('First argument is expected to be a string') 28 | if not isinstance(args[1], str): 29 | raise TypeError('Second argument is expected to be a string') 30 | 31 | 32 | def sim_check_for_list_or_set_inputs(*args): 33 | if not isinstance(args[0], list): 34 | if not isinstance(args[0], set): 35 | raise TypeError('First argument is expected to be a python list or set') 36 | if not isinstance(args[1], list): 37 | if not isinstance(args[1], set): 38 | raise TypeError('Second argument is expected to be a python list or set') 39 | 40 | 41 | def sim_check_tversky_parameters(alpha, beta): 42 | if alpha < 0 or beta < 0: 43 | raise ValueError('Tversky parameters should be greater than or equal to zero') 44 | 45 | 46 | def sim_check_for_exact_match(*args): 47 | if args[0] == args[1]: 48 | return True 49 | 50 | 51 | def sim_check_for_zero_len(*args): 52 | if len(args[0].strip()) == 0 or len(args[1].strip()) == 0: 53 | raise ValueError("Undefined for string of zero length") 54 | 55 | 56 | def tok_check_for_string_input(*args): 57 | for i in range(len(args)): 58 | if not isinstance(args[i], str): 59 | raise TypeError('Input is expected to be a string') 60 | 61 | 62 | def tok_check_for_none(*args): 63 | if args[0] is None: 64 | raise TypeError("First argument cannot be None") 65 | 66 | 67 | def convert_bag_to_set(input_list): 68 | seen_tokens = {} 69 | output_set =[] 70 | for token in input_list: 71 | if seen_tokens.get(token) == None: 72 | output_set.append(token) 73 | seen_tokens[token] = True 74 | return output_set 75 | 76 | 77 | def convert_to_unicode(input_string): 78 | """Convert input string to unicode.""" 79 | if isinstance(input_string, bytes): 80 | return input_string.decode('utf-8') 81 | return input_string 82 | 83 | 84 | def remove_non_ascii_chars(input_string): 85 | remove_chars = str("").join([chr(i) for i in range(128, 256)]) 86 | translation_table = dict((ord(c), None) for c in remove_chars) 87 | return input_string.translate(translation_table) 88 | 89 | 90 | def process_string(input_string, force_ascii=False): 91 | """Process string by 92 | -- removing all but letters and numbers 93 | -- trim whitespace 94 | -- converting string to lower case 95 | if force_ascii == True, force convert to ascii""" 96 | 97 | if force_ascii: 98 | input_string = remove_non_ascii_chars(input_string) 99 | 100 | regex = re.compile(r"(?ui)\W") 101 | 102 | # Keep only Letters and Numbers. 103 | out_string = regex.sub(" ", input_string) 104 | 105 | # Convert String to lowercase. 106 | out_string = out_string.lower() 107 | 108 | # Remove leading and trailing whitespaces. 109 | out_string = out_string.strip() 110 | return out_string 111 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import os 4 | 5 | # check if pip is installed. 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import sys
3 | import os
4 | 
5 | # check if pip is installed. If not, install_and_import will raise an ImportError.
6 | PIP_INSTALLED = True
7 | 
8 | try:
9 |     import pip
10 | except ImportError:
11 |     PIP_INSTALLED = False
12 | 
13 | def install_and_import(package):
14 |     import importlib
15 |     try:
16 |         importlib.import_module(package)
17 |     except ImportError:
18 |         if not PIP_INSTALLED:
19 |             raise ImportError('pip is not installed.')
20 |         pip.main(['install', package])
21 |     finally:
22 |         globals()[package] = importlib.import_module(package)
23 | 
24 | # check if setuptools is installed. If not, install setuptools
25 | # automatically using pip.
26 | install_and_import('setuptools')
27 | 
28 | from setuptools.command.build_ext import build_ext as _build_ext
29 | 
30 | class build_ext(_build_ext):
31 |     def build_extensions(self):
32 |         import pkg_resources
33 |         numpy_incl = pkg_resources.resource_filename('numpy', 'core/include')
34 | 
35 |         for ext in self.extensions:
36 |             if (hasattr(ext, 'include_dirs') and
37 |                     numpy_incl not in ext.include_dirs):
38 |                 ext.include_dirs.append(numpy_incl)
39 |         _build_ext.build_extensions(self)
40 | 
41 | def generate_cython():
42 | 
43 |     from Cython.Build import cythonize
44 | 
45 |     module_list = ["py_stringmatching/similarity_measure/cython/cython_affine.pyx",
46 |                    "py_stringmatching/similarity_measure/cython/cython_jaro.pyx",
47 |                    "py_stringmatching/similarity_measure/cython/cython_jaro_winkler.pyx",
48 |                    "py_stringmatching/similarity_measure/cython/cython_levenshtein.pyx",
49 |                    "py_stringmatching/similarity_measure/cython/cython_needleman_wunsch.pyx",
50 |                    "py_stringmatching/similarity_measure/cython/cython_smith_waterman.pyx",
51 |                    "py_stringmatching/similarity_measure/cython/cython_utils.pyx"
52 |                    ]
53 |     p = cythonize(module_list)
54 | 
55 |     if not p:
56 |         raise RuntimeError("Running cythonize failed!")
57 | 
58 | 
59 | cmdclass = {"build_ext": build_ext}
60 | 
61 | 
62 | if __name__ == "__main__":
63 | 
64 |     no_frills = (len(sys.argv) >= 2 and ('--help' in sys.argv[1:] or
65 |                                          sys.argv[1] in ('--help-commands',
66 |                                                          'egg_info', '--version',
67 |                                                          'clean')))
68 | 
69 |     cwd = os.path.abspath(os.path.dirname(__file__))
70 |     if not os.path.exists(os.path.join(cwd, 'PKG-INFO')) and not no_frills:
71 |         # Generate Cython sources, unless building from source release
72 |         generate_cython()
73 | 
74 |     # specify extensions that need to be compiled
75 |     extensions = [setuptools.Extension("py_stringmatching.similarity_measure.cython.cython_levenshtein",
76 |                                        ["py_stringmatching/similarity_measure/cython/cython_levenshtein.c"],
77 |                                        include_dirs=[]),
78 |                   setuptools.Extension("py_stringmatching.similarity_measure.cython.cython_jaro",
79 |                                        ["py_stringmatching/similarity_measure/cython/cython_jaro.c"],
80 |                                        include_dirs=[]),
81 |                   setuptools.Extension("py_stringmatching.similarity_measure.cython.cython_jaro_winkler",
82 |                                        ["py_stringmatching/similarity_measure/cython/cython_jaro_winkler.c"],
83 |                                        include_dirs=[]),
84 |                   setuptools.Extension("py_stringmatching.similarity_measure.cython.cython_utils",
85 |                                        ["py_stringmatching/similarity_measure/cython/cython_utils.c"],
86 |                                        include_dirs=[]),
87 |                   setuptools.Extension("py_stringmatching.similarity_measure.cython.cython_needleman_wunsch",
88 |                                        ["py_stringmatching/similarity_measure/cython/cython_needleman_wunsch.c"],
89 |                                        include_dirs=[]),
90 |                   setuptools.Extension("py_stringmatching.similarity_measure.cython.cython_smith_waterman",
91 |                                        ["py_stringmatching/similarity_measure/cython/cython_smith_waterman.c"],
92 |                                        include_dirs=[]),
93 |                   setuptools.Extension("py_stringmatching.similarity_measure.cython.cython_affine",
["py_stringmatching/similarity_measure/cython/cython_affine.c"], 95 | include_dirs=[]) 96 | 97 | ] 98 | 99 | # find packages to be included. exclude benchmarks. 100 | packages = setuptools.find_packages(exclude=["benchmarks", "benchmarks.custom_benchmarks"]) 101 | 102 | with open('README.rst') as f: 103 | LONG_DESCRIPTION = f.read() 104 | 105 | setuptools.setup( 106 | name='py-stringmatching', 107 | version='0.4.6', 108 | description='Python library for string matching.', 109 | long_description=LONG_DESCRIPTION, 110 | url='https://sites.google.com/site/anhaidgroup/projects/magellan/py_stringmatching', 111 | author='UW Magellan Team', 112 | author_email='uwmagellan@gmail.com', 113 | license='BSD', 114 | classifiers=[ 115 | 'Development Status :: 4 - Beta', 116 | 'Environment :: Console', 117 | 'Intended Audience :: Developers', 118 | 'Intended Audience :: Science/Research', 119 | 'Intended Audience :: Education', 120 | 'License :: OSI Approved :: BSD License', 121 | 'Operating System :: POSIX', 122 | 'Operating System :: Unix', 123 | 'Operating System :: MacOS', 124 | 'Operating System :: Microsoft :: Windows', 125 | 'Programming Language :: Python', 126 | 'Programming Language :: Python :: 3', 127 | 'Programming Language :: Python :: 3.7', 128 | 'Programming Language :: Python :: 3.8', 129 | 'Programming Language :: Python :: 3.9', 130 | 'Programming Language :: Python :: 3.10', 131 | 'Programming Language :: Python :: 3.11', 132 | 'Programming Language :: Python :: 3.12', 133 | 'Topic :: Scientific/Engineering', 134 | 'Topic :: Utilities', 135 | 'Topic :: Software Development :: Libraries', 136 | ], 137 | packages=packages, 138 | install_requires=[ 139 | 'numpy >= 1.7.0,<2.0', 140 | ], 141 | setup_requires=[ 142 | 'numpy >= 1.7.0,<2.0' 143 | ], 144 | ext_modules=extensions, 145 | cmdclass=cmdclass, 146 | include_package_data=True, 147 | zip_safe=False 148 | ) 149 | --------------------------------------------------------------------------------