├── .github ├── dependabot.yml └── workflows │ └── pythonpackage.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── affinegap ├── __init__.py └── affinegap.pyx ├── pyproject.toml ├── requirements.txt ├── setup.py └── tests └── test_affinegap.py /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "weekly" 7 | ignore: 8 | # Optional: Official actions have moving tags like v1; 9 | # if you use those, you don't need updates. 10 | - dependency-name: "actions/*" 11 | -------------------------------------------------------------------------------- /.github/workflows/pythonpackage.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: [push] 4 | 5 | jobs: 6 | test: 7 | 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | matrix: 11 | os: [windows-latest, macos-latest, ubuntu-latest] 12 | python-version: [3.7, 3.8, 3.9, "3.10"] 13 | 14 | steps: 15 | - uses: actions/checkout@v1 16 | - name: Set up Python ${{ matrix.python-version }} 17 | uses: actions/setup-python@v1 18 | with: 19 | python-version: ${{ matrix.python-version }} 20 | - name: Install dependencies 21 | run: | 22 | pip install --upgrade pip 23 | pip install -e . 24 | - name: Test with pytest 25 | run: | 26 | pip install -r requirements.txt 27 | pytest 28 | - name: Coveralls 29 | env: 30 | COVERALLS_PARALLEL: true 31 | COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} 32 | run: | 33 | pip install https://github.com/bboe/coveralls-python/archive/github_actions.zip 34 | coveralls 35 | - name: Build and publish 36 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') && runner.os != 'Linux' 37 | env: 38 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 39 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 40 | run: | 41 | pip install twine wheel 42 | pip wheel -w dist --no-deps . 43 | twine upload dist/* 44 | continue-on-error: true 45 | finish: 46 | needs: test 47 | runs-on: ubuntu-latest 48 | steps: 49 | - name: Coveralls Finished 50 | uses: coverallsapp/github-action@57daa114ba54fd8e1c8563e8027325c0bf2f5e80 51 | with: 52 | github-token: ${{ secrets.GITHUB_TOKEN }} 53 | parallel-finished: true 54 | wheels: 55 | if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') 56 | needs: test 57 | name: Build wheels on ${{ matrix.os }} 58 | runs-on: ${{ matrix.os }} 59 | strategy: 60 | matrix: 61 | os: [windows-latest, macos-latest, ubuntu-latest] 62 | steps: 63 | - uses: actions/checkout@v1 64 | - uses: actions/setup-python@v2 65 | - name: Set up QEMU 66 | if: runner.os == 'Linux' 67 | uses: docker/setup-qemu-action@v1 68 | with: 69 | platforms: all 70 | - name: Build wheels 71 | uses: pypa/cibuildwheel@v2.3.1 72 | env: 73 | CIBW_ARCHS_MACOS: x86_64 arm64 universal2 74 | CIBW_ARCHS_LINUX: auto aarch64 75 | - name: Build sdist 76 | run: | 77 | pip install numpy cython wheel setuptools 78 | python setup.py sdist 79 | - name: Publish wheels to PyPI 80 | env: 81 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 82 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 83 | run: | 84 | pip install twine 85 | twine upload --skip-existing wheelhouse/*.whl 86 | twine upload dist/* 87 | continue-on-error: true 88 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Forest Gregg and DataMade 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include pyproject.toml 2 | include affinegap/*.pyx 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | affinegap 2 | ========= 3 | 4 | A Cython implementation of the [affine gap penalty string distance](http://en.wikipedia.org/wiki/Gap_penalty#Affine_Gap_Penalty) also known as the [Smith–Waterman algorithm](http://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm) 5 | 6 | Part of the [Dedupe.io](https://dedupe.io/) cloud service and open source toolset for de-duplicating and finding fuzzy matches in your data. 7 | 8 | [![Build Status](https://travis-ci.org/dedupeio/affinegap.svg?branch=master)](https://travis-ci.org/dedupeio/affinegap) 9 | 10 | ## To install 11 | ```bash 12 | pip install affinegap 13 | ``` 14 | 15 | ## To use 16 | ```python 17 | import affinegap 18 | d1 = affinegap.affineGapDistance('foo', 'bar') 19 | d2 = affinegap.affineGapDistance('foo', 'bar', 20 | matchWeight = 1, 21 | mismatchWeight = 11, 22 | gapWeight = 10, 23 | spaceWeight = 7, 24 | abbreviation_scale = .125) 25 | d3 = affinegap.normalizedAffineGapDistance('foo', 'bar') 26 | ``` 27 | 28 | ## To get set up for development 29 | ```bash 30 | git clone https://github.com/dedupeio/affinegap.git 31 | cd affinegap 32 | pip install -r requirements.txt 33 | cython affinegap/*.pyx 34 | python setup.py develop 35 | pytest 36 | ``` 37 | 38 | ## Team 39 | 40 | * Forest Gregg, Dedupeio 41 | 42 | ## Errors and Bugs 43 | 44 | If something is not behaving intuitively, it is a bug and should be reported. 45 | Report it here by creating an issue: https://github.com/dedupeio/affinegap/issues 46 | 47 | Help us fix the problem as quickly as possible by following [Mozilla's guidelines for reporting bugs.](https://developer.mozilla.org/en-US/docs/Mozilla/QA/Bug_writing_guidelines#General_Outline_of_a_Bug_Report) 48 | 49 | ## Patches and Pull Requests 50 | 51 | Your patches are welcome. Here's our suggested workflow: 52 | 53 | * Fork the project. 54 | * Make your feature addition or bug fix. 55 | * Send us a pull request with a description of your work. Bonus points for topic branches! 56 | 57 | ## Copyright and Attribution 58 | 59 | Copyright (c) 2016 Forest Gregg and Dedupeio. Released under the [MIT License](https://github.com/dedupeio/affinegap/blob/master/LICENSE). 60 | -------------------------------------------------------------------------------- /affinegap/__init__.py: -------------------------------------------------------------------------------- 1 | from .affinegap import affineGapDistance 2 | from .affinegap import normalizedAffineGapDistance 3 | -------------------------------------------------------------------------------- /affinegap/affinegap.pyx: -------------------------------------------------------------------------------- 1 | # cython: boundscheck=False, wraparound=False 2 | # cython: cdivision=True 3 | # cython: c_string_type=unicode, c_string_encoding=utf8 4 | # cython: language_level=3 5 | 6 | from libc cimport limits 7 | from libc.stdlib cimport malloc, free 8 | 9 | cpdef float affineGapDistance(str string_a, str string_b, 10 | float matchWeight = 1, 11 | float mismatchWeight = 11, 12 | float gapWeight = 10, 13 | float spaceWeight = 7, 14 | float abbreviation_scale = .125): 15 | """ 16 | Calculate the affine gap distance between two strings 17 | 18 | Default weights are from Alvaro Monge and Charles Elkan, 1996, 19 | "The field matching problem: Algorithms and applications" 20 | http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.23.9685 21 | """ 22 | 23 | cdef int length1 = len(string_a) 24 | cdef int length2 = len(string_b) 25 | 26 | if (string_a == string_b and 27 | matchWeight == min(matchWeight, 28 | mismatchWeight, 29 | gapWeight)): 30 | return matchWeight * length1 31 | 32 | if length1 < length2 : 33 | string_a, string_b = string_b, string_a 34 | length1, length2 = length2, length1 35 | 36 | # Initialize C Arrays 37 | cdef int memory_size = sizeof(float) * (length1+1) 38 | cdef float *D = malloc(memory_size) 39 | cdef float *V_current = malloc(memory_size) 40 | cdef float *V_previous = malloc(memory_size) 41 | 42 | cdef int i, j 43 | cdef float distance 44 | 45 | # Set up Recurrence relations 46 | # 47 | # Base conditions 48 | # V(0,0) = 0 49 | # V(0,j) = gapWeight + spaceWeight * i 50 | # D(0,j) = Infinity 51 | V_current[0] = 0 52 | for j in range(1, length1 + 1) : 53 | V_current[j] = gapWeight + spaceWeight * j 54 | D[j] = limits.INT_MAX 55 | 56 | for i in range(1, length2 +1) : 57 | char2 = string_b[i-1] 58 | # V_previous = V_current 59 | for _ in range(0, length1 + 1) : 60 | V_previous[_] = V_current[_] 61 | 62 | # Base conditions 63 | # V(i,0) = gapWeight + spaceWeight * i 64 | # I(i,0) = Infinity 65 | V_current[0] = gapWeight + spaceWeight * i 66 | I = limits.INT_MAX 67 | 68 | for j in range(1, length1+1) : 69 | char1 = string_a[j-1] 70 | 71 | # I(i,j) is the edit distance if the jth character of string 1 72 | # was inserted into string 2. 73 | # 74 | # I(i,j) = min(I(i,j-1), V(i,j-1) + gapWeight) + spaceWeight 75 | if j <= length2 : 76 | I = min(I, V_current[j-1] + gapWeight) + spaceWeight 77 | else : 78 | # Pay less for abbreviations 79 | # i.e. 'spago (los angeles) to 'spago' 80 | I = (min(I, V_current[j-1] + gapWeight * abbreviation_scale) 81 | + spaceWeight * abbreviation_scale) 82 | 83 | # D(i,j) is the edit distance if the ith character of string 2 84 | # was deleted from string 1 85 | # 86 | # D(i,j) = min((i-1,j), V(i-1,j) + gapWeight) + spaceWeight 87 | D[j] = min(D[j], V_previous[j] + gapWeight) + spaceWeight 88 | 89 | # M(i,j) is the edit distance if the ith and jth characters 90 | # match or mismatch 91 | # 92 | # M(i,j) = V(i-1,j-1) + (matchWeight | misMatchWeight) 93 | if char2 == char1 : 94 | M = V_previous[j-1] + matchWeight 95 | else: 96 | M = V_previous[j-1] + mismatchWeight 97 | 98 | # V(i,j) is the minimum edit distance 99 | # 100 | # V(i,j) = min(E(i,j), F(i,j), G(i,j)) 101 | V_current[j] = min(I, D[j], M) 102 | 103 | distance = V_current[length1] 104 | 105 | free(D) 106 | free(V_current) 107 | free(V_previous) 108 | 109 | return distance 110 | 111 | cpdef float normalizedAffineGapDistance(str string_a, str string_b, 112 | float matchWeight = 1, 113 | float mismatchWeight = 11, 114 | float gapWeight = 10, 115 | float spaceWeight = 7, 116 | float abbreviation_scale = .125) except? 999 : 117 | 118 | cdef int length1 = len(string_a) 119 | cdef int length2 = len(string_b) 120 | 121 | cdef float normalizer = length1 + length2 122 | 123 | if normalizer == 0: 124 | raise ZeroDivisionError('normalizedAffineGapDistance cannot take two empty strings') 125 | 126 | cdef float distance = affineGapDistance(string_a, string_b, 127 | matchWeight, 128 | mismatchWeight, 129 | gapWeight, 130 | spaceWeight, 131 | abbreviation_scale) 132 | 133 | return distance/normalizer 134 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", 3 | "wheel", 4 | "cython"] 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pytest 2 | pytest-cov 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | try: 5 | from setuptools import setup, Extension 6 | except ImportError : 7 | raise ImportError("setuptools module required, please go to https://pypi.python.org/pypi/setuptools and follow the instructions for installing setuptools") 8 | 9 | try: 10 | from Cython.Build import cythonize 11 | use_cython = True 12 | except ImportError: 13 | use_cython = False 14 | 15 | if use_cython: 16 | ext_modules = cythonize([Extension('affinegap.affinegap', 17 | ['affinegap/affinegap.pyx'])]) 18 | else: 19 | ext_modules = [Extension('affinegap.affinegap', 20 | ['affinegap/affinegap.c'])] 21 | 22 | setup( 23 | name='affinegap', 24 | url='https://github.com/datamade/affinegap', 25 | version='1.12', 26 | description='A Cython implementation of the affine gap string distance', 27 | packages=['affinegap'], 28 | ext_modules=ext_modules, 29 | license='The MIT License: http://www.opensource.org/licenses/mit-license.php', 30 | classifiers=[ 31 | 'Development Status :: 5 - Production/Stable', 32 | 'Intended Audience :: Developers', 33 | 'Intended Audience :: Science/Research', 34 | 'License :: OSI Approved :: MIT License', 35 | 'Natural Language :: English', 36 | 'Operating System :: MacOS :: MacOS X', 37 | 'Operating System :: Microsoft :: Windows', 38 | 'Operating System :: POSIX', 39 | 'Programming Language :: Cython', 40 | 'Topic :: Software Development :: Libraries :: Python Modules', 41 | 'Topic :: Scientific/Engineering', 42 | 'Topic :: Scientific/Engineering :: Information Analysis'] 43 | ) 44 | -------------------------------------------------------------------------------- /tests/test_affinegap.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | import affinegap 4 | 5 | class AffineGapTest(unittest.TestCase): 6 | def setUp(self): 7 | self.affineGapDistance = affinegap.affineGapDistance 8 | self.normalizedAffineGapDistance = affinegap.normalizedAffineGapDistance 9 | 10 | def test_affine_gap_correctness(self): 11 | assert self.affineGapDistance('a', u'b', -5, 5, 5, 1, 0.5) == 5 12 | assert self.affineGapDistance('ab', 'cd', -5, 5, 5, 1, 0.5) == 10 13 | assert self.affineGapDistance('ab', 'cde', -5, 5, 5, 1, 0.5) == 13 14 | assert self.affineGapDistance('ab', u'cdë', -5, 5, 5, 1, 0.5) == 13 15 | assert self.affineGapDistance('a', 'cde', -5, 5, 5, 1, 0.5) == 8.5 16 | assert self.affineGapDistance('a', 'cd', -5, 5, 5, 1, 0.5) == 8 17 | assert self.affineGapDistance('b', 'a', -5, 5, 5, 1, 0.5) == 5 18 | assert self.affineGapDistance('a', 'a', -5, 5, 5, 1, 0.5) == -5 19 | assert self.affineGapDistance('a', '', -5, 5, 5, 1, 0.5) == 6 20 | assert self.affineGapDistance('', '', -5, 5, 5, 1, 0.5) == 0 21 | assert self.affineGapDistance('aba', 'aaa', -5, 5, 5, 1, 0.5) == -5 22 | assert self.affineGapDistance('aaa', 'aba', -5, 5, 5, 1, 0.5) == -5 23 | assert self.affineGapDistance('aaa', 'aa', -5, 5, 5, 1, 0.5) == -7 24 | assert self.affineGapDistance('aaa', 'a', -5, 5, 5, 1, 0.5) == -1.5 25 | assert self.affineGapDistance('aaa', '', -5, 5, 5, 1, 0.5) == 8 26 | assert self.affineGapDistance('aaa', 'abba', -5, 5, 5, 1, 0.5) == 1 27 | 28 | def test_normalized_affine_gap_correctness(self): 29 | with self.assertRaises(ZeroDivisionError) : 30 | self.normalizedAffineGapDistance('', '', -5, 5, 5, 1, 0.5) 31 | assert self.normalizedAffineGapDistance('a', '', -5, 5, 5, 1, 0.5) == 6 32 | assert self.normalizedAffineGapDistance('ab', 'ab') == 0.5 33 | assert self.normalizedAffineGapDistance("日本", "日本") == 0.5 34 | 35 | if __name__ == "__main__": 36 | unittest.main() 37 | 38 | --------------------------------------------------------------------------------