├── .gitignore ├── LICENSE ├── Makefile ├── setup.py ├── tests ├── speed.py ├── test_binpat.py └── test_fastcomp.py ├── README.md └── mbleven.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Public domain 2 | 3 | I hereby place my work 'mbleven' into the public domain. 4 | All my copyrights, including all related and neighbouring 5 | rights, are abandoned. 6 | 7 | 2018 Fujimoto Seiji 8 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # Makefile for fastcomp 2 | # 3 | # Mostly for development. 4 | 5 | PYTHON=/usr/bin/env python 6 | 7 | all: 8 | 9 | test: 10 | $(PYTHON) -m unittest discover -v tests 11 | 12 | publish: 13 | $(PYTHON) setup.py sdist register upload 14 | 15 | .PHONY: test 16 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | 4 | setup( 5 | name='mbleven', 6 | version='1.4.0', 7 | py_modules=['mbleven'], 8 | author='Fujimoto Seiji', 9 | author_email='fujimoto@ceptord.net', 10 | url='https://github.com/fujimotos/mbleven', 11 | description='An efficient algorithm for k-bounded (Damerau-)Levenshtein distance', 12 | license='Public domain', 13 | classifiers=[ 14 | 'Development Status :: 4 - Beta', 15 | 'Intended Audience :: Science/Research', 16 | 'Programming Language :: Python :: 2', 17 | 'Programming Language :: Python :: 3', 18 | 'Topic :: Scientific/Engineering', 19 | ] 20 | ) 21 | -------------------------------------------------------------------------------- /tests/speed.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from mbleven import compare 4 | import random 5 | import string 6 | 7 | def randomstr(minlen=5, maxlen=8): 8 | charset = '01' 9 | length = random.randint(minlen, maxlen) 10 | return ''.join(random.choice(charset) for i in range(length)) 11 | 12 | if __name__ == "__main__": 13 | import timeit 14 | 15 | # Set up conditions 16 | setup = """ 17 | from __main__ import compare, randomstr 18 | cases = [(randomstr(), randomstr()) for x in range(1000)]""" 19 | main = "for case in cases: compare(*case)" 20 | loops = 100 21 | 22 | # Run timeit 23 | timer = timeit.Timer(main, setup=setup) 24 | result = timer.repeat(number=loops) 25 | 26 | # Result 27 | best = round(min(result)*1000/loops, 2) 28 | print('{} loops, best of 3: {} msec per loop'.format(loops, best)) 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | mbleven 2 | ======= 3 | 4 | mbleven is a fast, memory-efficient algorithm to compute k-bounded 5 | Levenshtein distance. 6 | 7 | If the upper-bound parameter k is small (especially k=1,2), this 8 | algorithm runs much faster than the Wagner-Fischer algorithm. 9 | 10 | For details, see [the documentation](https://ceptord.net/fastcomp/index.html). 11 | 12 | 13 | IMPORTANT NOTICE 14 | ---------------- 15 | 16 | This is a proof-of-concept implementation to show how mbleven algorithm 17 | works. If you are searching for a practical library to compute Levenshtein 18 | distance, please take a look at [polyleven]( 19 | https://github.com/fujimotos/polyleven). 20 | 21 | 22 | Installation 23 | ------------ 24 | 25 | Clone this repository and run setup.py 26 | 27 | $ git clone https://github.com/fujimotos/mbleven 28 | $ cd mbleven 29 | $ sudo python setup.py install 30 | 31 | 32 | Usage 33 | ----- 34 | 35 | This module provides a function named `compare()`. It takes two strings 36 | as arguments and returns an integer, which is... 37 | 38 | * the exact distance between two strings (if they are within two edit 39 | distance) 40 | * -1 (if they are over two edit distance away) 41 | 42 | Therefore, the return value should be any one of 0, 1, 2 or -1. 43 | 44 | ```python 45 | >>> from mbleven import compare 46 | >>> compare("meet", "meat") 47 | 1 48 | >>> compare("meet", "eat") 49 | 2 50 | >>> compare("meet", "mars") # distance 3 51 | -1 52 | ``` 53 | 54 | You can also measure the similarity using Damerau-Levenshtein distance 55 | by setting `transpose` flag true. 56 | 57 | ```python 58 | >>> compare("meat", "meta", transpose=True) 59 | 1 60 | >>> compare("abc", "ca", transpose=True) 61 | 2 62 | ``` 63 | -------------------------------------------------------------------------------- /mbleven.py: -------------------------------------------------------------------------------- 1 | """An implementation of mbleven algorithm""" 2 | 3 | # 4 | # Constants 5 | 6 | REPLACE = 'r' 7 | INSERT = 'i' 8 | DELETE = 'd' 9 | TRANSPOSE = 't' 10 | 11 | MATRIX = [ 12 | ['id', 'di', 'rr'], 13 | ['dr', 'rd'], 14 | ['dd'] 15 | ] 16 | 17 | MATRIX_T = [ 18 | ['id', 'di', 'rr', 'tt', 'tr', 'rt'], 19 | ['dr', 'rd', 'dt', 'td'], 20 | ['dd'] 21 | ] 22 | 23 | # 24 | # Library API 25 | 26 | def compare(str1, str2, transpose=False): 27 | len1, len2 = len(str1), len(str2) 28 | 29 | if len1 < len2: 30 | len1, len2 = len2, len1 31 | str1, str2 = str2, str1 32 | 33 | if len1 - len2 > 2: 34 | return -1 35 | 36 | if transpose: 37 | models = MATRIX_T[len1-len2] 38 | else: 39 | models = MATRIX[len1-len2] 40 | 41 | res = 3 42 | for model in models: 43 | cost = check_model(str1, str2, len1, len2, model) 44 | if cost < res: 45 | res = cost 46 | 47 | if res == 3: 48 | res = -1 49 | 50 | return res 51 | 52 | 53 | def check_model(str1, str2, len1, len2, model): 54 | """Check if the model can transform str1 into str2""" 55 | 56 | idx1, idx2 = 0, 0 57 | cost, pad = 0, 0 58 | while (idx1 < len1) and (idx2 < len2): 59 | if str1[idx1] != str2[idx2 - pad]: 60 | cost += 1 61 | if 2 < cost: 62 | return cost 63 | 64 | option = model[cost-1] 65 | if option == DELETE: 66 | idx1 += 1 67 | elif option == INSERT: 68 | idx2 += 1 69 | elif option == REPLACE: 70 | idx1 += 1 71 | idx2 += 1 72 | pad = 0 73 | elif option == TRANSPOSE: 74 | if (idx2 + 1) < len2 and str1[idx1] == str2[idx2+1]: 75 | idx1 += 1 76 | idx2 += 1 77 | pad = 1 78 | else: 79 | return 3 80 | else: 81 | idx1 += 1 82 | idx2 += 1 83 | pad = 0 84 | 85 | return cost + (len1 - idx1) + (len2 - idx2) 86 | -------------------------------------------------------------------------------- /tests/test_binpat.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from mbleven import compare 3 | 4 | # 5 | # Testing with binary patterns 6 | 7 | PATTERNS = ['111', '101', '011', '110', '100', '010', '001', '000', 8 | '11', '10', '01', '00', '1', '0', ''] 9 | 10 | MATRIX_L = [ 11 | [ 0, 1, 1, 1, 2, 2, 2,-1, 1, 2, 2,-1, 2,-1,-1], 12 | [ 1, 0, 2, 2, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2,-1], 13 | [ 1, 2, 0, 2,-1, 1, 1, 2, 1, 2, 1, 2, 2, 2,-1], 14 | [ 1, 2, 2, 0, 1, 1,-1, 2, 1, 1, 2, 2, 2, 2,-1], 15 | [ 2, 1,-1, 1, 0, 2, 2, 1, 2, 1, 2, 1, 2, 2,-1], 16 | [ 2, 2, 1, 1, 2, 0, 2, 1, 2, 1, 1, 1, 2, 2,-1], 17 | [ 2, 1, 1,-1, 2, 2, 0, 1, 2, 2, 1, 1, 2, 2,-1], 18 | [-1, 2, 2, 2, 1, 1, 1, 0,-1, 2, 2, 1,-1, 2,-1], 19 | [ 1, 1, 1, 1, 2, 2, 2,-1, 0, 1, 1, 2, 1, 2, 2], 20 | [ 2, 1, 2, 1, 1, 1, 2, 2, 1, 0, 2, 1, 1, 1, 2], 21 | [ 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 0, 1, 1, 1, 2], 22 | [-1, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 0, 2, 1, 2], 23 | [ 2, 2, 2, 2, 2, 2, 2,-1, 1, 1, 1, 2, 0, 1, 1], 24 | [-1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, 1], 25 | [-1,-1,-1,-1,-1,-1,-1,-1, 2, 2, 2, 2, 1, 1, 0] 26 | ] 27 | 28 | MATRIX_DL = [ 29 | [ 0, 1, 1, 1, 2, 2, 2,-1, 1, 2, 2,-1, 2,-1,-1], 30 | [ 1, 0, 1, 1, 1, 2, 1, 2, 1, 1, 1, 2, 2, 2,-1], 31 | [ 1, 1, 0, 2, 2, 1, 1, 2, 1, 2, 1, 2, 2, 2,-1], 32 | [ 1, 1, 2, 0, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2,-1], 33 | [ 2, 1, 2, 1, 0, 1, 2, 1, 2, 1, 2, 1, 2, 2,-1], 34 | [ 2, 2, 1, 1, 1, 0, 1, 1, 2, 1, 1, 1, 2, 2,-1], 35 | [ 2, 1, 1, 2, 2, 1, 0, 1, 2, 2, 1, 1, 2, 2,-1], 36 | [-1, 2, 2, 2, 1, 1, 1, 0,-1, 2, 2, 1,-1, 2,-1], 37 | [ 1, 1, 1, 1, 2, 2, 2,-1, 0, 1, 1, 2, 1, 2, 2], 38 | [ 2, 1, 2, 1, 1, 1, 2, 2, 1, 0, 1, 1, 1, 1, 2], 39 | [ 2, 1, 1, 2, 2, 1, 1, 2, 1, 1, 0, 1, 1, 1, 2], 40 | [-1, 2, 2, 2, 1, 1, 1, 1, 2, 1, 1, 0, 2, 1, 2], 41 | [ 2, 2, 2, 2, 2, 2, 2,-1, 1, 1, 1, 2, 0, 1, 1], 42 | [-1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, 1], 43 | [-1,-1,-1,-1,-1,-1,-1,-1, 2, 2, 2, 2, 1, 1, 0] 44 | ] 45 | 46 | class TestBinaryPattern(unittest.TestCase): 47 | def test_levenshtein(self): 48 | for i, s1 in enumerate(PATTERNS): 49 | for j, s2 in enumerate(PATTERNS): 50 | self.assertEqual(compare(s1, s2), MATRIX_L[i][j]) 51 | 52 | def test_damerau_levenshtein(self): 53 | for i, s1 in enumerate(PATTERNS): 54 | for j, s2 in enumerate(PATTERNS): 55 | self.assertEqual(compare(s1, s2, True), MATRIX_DL[i][j]) 56 | 57 | if __name__ == '__main__': 58 | unittest.main() 59 | -------------------------------------------------------------------------------- /tests/test_fastcomp.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from mbleven import compare 3 | 4 | class TestFastComp(unittest.TestCase): 5 | def test_equal(self): 6 | self.assertEqual(compare('abc', 'abc'), 0) 7 | 8 | def test_insert(self): 9 | self.assertEqual(compare('abc', 'xabc'), 1) 10 | self.assertEqual(compare('abc', 'axbc'), 1) 11 | self.assertEqual(compare('abc', 'abxc'), 1) 12 | self.assertEqual(compare('abc', 'abcx'), 1) 13 | self.assertEqual(compare('abc', 'xxabc'), 2) 14 | self.assertEqual(compare('abc', 'axxbc'), 2) 15 | self.assertEqual(compare('abc', 'abxxc'), 2) 16 | self.assertEqual(compare('abc', 'abcxx'), 2) 17 | self.assertEqual(compare('abc', 'xabcx'), 2) 18 | 19 | def test_replace(self): 20 | self.assertEqual(compare('abc', 'xbc'), 1) 21 | self.assertEqual(compare('abc', 'axc'), 1) 22 | self.assertEqual(compare('abc', 'abx'), 1) 23 | self.assertEqual(compare('abc', 'xxc'), 2) 24 | self.assertEqual(compare('abc', 'axx'), 2) 25 | self.assertEqual(compare('abc', 'xbx'), 2) 26 | 27 | def test_delete(self): 28 | self.assertEqual(compare('abc', 'ab'), 1) 29 | self.assertEqual(compare('abc', 'ac'), 1) 30 | self.assertEqual(compare('abc', 'bc'), 1) 31 | self.assertEqual(compare('a', 'abc'), 2) 32 | self.assertEqual(compare('b', 'abc'), 2) 33 | self.assertEqual(compare('c', 'abc'), 2) 34 | 35 | def test_insert_delete(self): 36 | self.assertEqual(compare('abcde', 'eabcd'), 2) 37 | self.assertEqual(compare('abcde', 'acdeb'), 2) 38 | self.assertEqual(compare('abcde', 'abdec'), 2) 39 | self.assertEqual(compare('ababa', 'babab'), 2) 40 | 41 | def test_transpose(self): 42 | self.assertEqual(compare('abc', 'bac', True), 1) 43 | self.assertEqual(compare('abc', 'acb', True), 1) 44 | self.assertEqual(compare('abc', 'cba', True), 2) 45 | self.assertEqual(compare('abc', 'ba', True), 2) 46 | self.assertEqual(compare('abc', 'ca', True), 2) 47 | 48 | def test_emptystr(self): 49 | self.assertEqual(compare('', ''), 0) 50 | self.assertEqual(compare('', 'a'), 1) 51 | self.assertEqual(compare('', 'ab'), 2) 52 | self.assertEqual(compare('', 'abc'), -1) 53 | self.assertEqual(compare('abc', ''), -1) 54 | 55 | def test_beyond(self): 56 | self.assertEqual(compare('abc', 'def'), -1) 57 | 58 | if __name__ == '__main__': 59 | unittest.main() 60 | --------------------------------------------------------------------------------