├── .gitignore ├── README.md ├── setup.py ├── src └── main.cpp └── tests └── test_pylcs.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.so 2 | MANIFEST 3 | *.py[cod] 4 | *.egg-info 5 | .DS_Store 6 | _build 7 | _generate 8 | build 9 | dist 10 | tmp 11 | .vscode 12 | .idea/ 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pylcs 2 | **pylcs** is a super fast C++ library that uses a dynamic programming (DP) algorithm to solve the two classic LCS problems described below. 3 | 4 | [The longest common subsequence](https://en.wikipedia.org/wiki/Longest_common_subsequence_problem) problem is the problem of finding the longest subsequence common to all sequences in a set of sequences (often just two sequences). 5 | 6 | [The longest common substring](https://en.wikipedia.org/wiki/Longest_common_substring_problem) problem is to find the longest string (or strings) that is a substring (or are substrings) of two or more strings. 7 | 8 | [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance), aka ``edit distance``, is also supported (despite the package name). Example usage is in [tests](./tests/test_pylcs.py). 9 | 10 | Chinese (or any other UTF-8) strings are also supported. 11 | 12 | 13 | Install 14 | ------- 15 | 16 | To install, simply do ``pip install pylcs`` to pull down the latest version from [PyPI](https://pypi.org/project/pylcs/). 
17 | 18 | 19 | Python code example 20 | ------------------- 21 | 22 | ```python 23 | import pylcs 24 | 25 | # finding the longest common subsequence length of string A and string B 26 | A = 'We are shannonai' 27 | B = 'We like shannonai' 28 | pylcs.lcs(A, B) 29 | """ 30 | >>> pylcs.lcs(A, B) 31 | 14 32 | """ 33 | 34 | # finding the longest common subsequence length of string A and a list of string B 35 | A = 'We are shannonai' 36 | B = ['We like shannonai', 'We work in shannonai', 'We are not shannonai'] 37 | pylcs.lcs_of_list(A, B) 38 | """ 39 | >>> pylcs.lcs_of_list(A, B) 40 | [14, 14, 16] 41 | """ 42 | 43 | # finding the longest common substring length of string A and string B 44 | A = 'We are shannonai' 45 | B = 'We like shannonai' 46 | pylcs.lcs2(A, B) 47 | """ 48 | >>> pylcs.lcs2(A, B) 49 | 11 50 | """ 51 | 52 | # finding the longest common substring length of string A and a list of string B 53 | A = 'We are shannonai' 54 | B = ['We like shannonai', 'We work in shannonai', 'We are not shannonai'] 55 | pylcs.lcs2_of_list(A, B) 56 | """ 57 | >>> pylcs.lcs2_of_list(A, B) 58 | [11, 10, 10] 59 | """ 60 | 61 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, Extension 2 | from setuptools.command.build_ext import build_ext 3 | import sys 4 | import setuptools 5 | 6 | 7 | class get_pybind_include(object): 8 | """Helper class to determine the pybind11 include path 9 | The purpose of this class is to postpone importing pybind11 10 | until it is actually installed, so that the ``get_include()`` 11 | method can be invoked. 
""" 12 | 13 | def __init__(self, user=False): 14 | self.user = user 15 | 16 | def __str__(self): 17 | import pybind11 18 | return pybind11.get_include(self.user) 19 | 20 | 21 | ext_modules = [ 22 | Extension( 23 | 'pylcs', 24 | ['src/main.cpp'], 25 | include_dirs=[ 26 | # Path to pybind11 headers 27 | get_pybind_include(), 28 | get_pybind_include(user=True) 29 | ], 30 | language='c++' 31 | ), 32 | ] 33 | 34 | 35 | # As of Python 3.6, CCompiler has a `has_flag` method. 36 | # cf http://bugs.python.org/issue26689 37 | def has_flag(compiler, flagname): 38 | """Return a boolean indicating whether a flag name is supported on 39 | the specified compiler. 40 | """ 41 | import tempfile 42 | with tempfile.NamedTemporaryFile('w', suffix='.cpp') as f: 43 | f.write('int main (int argc, char **argv) { return 0; }') 44 | try: 45 | compiler.compile([f.name], extra_postargs=[flagname]) 46 | except setuptools.distutils.errors.CompileError: 47 | return False 48 | return True 49 | 50 | 51 | def cpp_flag(compiler): 52 | """Return the -std=c++[11/14] compiler flag. 53 | The c++14 is prefered over c++11 (when it is available). 
class BuildExt(build_ext):
    """A custom build extension for adding compiler-specific options."""

    # Base compiler options keyed by distutils compiler type. These lists are
    # class-level state and are treated as read-only templates below.
    c_opts = {
        'msvc': ['/EHsc'],
        'unix': [],
    }

    if sys.platform == 'darwin':
        c_opts['unix'] += ['-stdlib=libc++', '-mmacosx-version-min=10.7']

    def build_extensions(self):
        """Set ``extra_compile_args`` on every extension, then build them.

        For ``unix`` compilers this adds the ``VERSION_INFO`` define, the
        C++ standard flag chosen by ``cpp_flag`` and, when the compiler
        accepts it, ``-fvisibility=hidden``. For ``msvc`` it adds the
        ``VERSION_INFO`` define only.
        """
        ct = self.compiler.compiler_type
        # Copy the template list: appending to the list stored in ``c_opts``
        # would mutate shared class state and pile up duplicate flags if
        # build_extensions ran more than once.
        opts = list(self.c_opts.get(ct, []))
        if ct == 'unix':
            opts.append('-DVERSION_INFO="%s"' % self.distribution.get_version())
            opts.append(cpp_flag(self.compiler))
            if has_flag(self.compiler, '-fvisibility=hidden'):
                opts.append('-fvisibility=hidden')
        elif ct == 'msvc':
            opts.append('/DVERSION_INFO=\\"%s\\"' % self.distribution.get_version())
        for ext in self.extensions:
            ext.extra_compile_args = opts
        build_ext.build_extensions(self)
string &str){ 12 | vector split; 13 | int len = str.length(); 14 | int left = 0; 15 | int right = 1; 16 | 17 | for (int i = 0; i < len; i++){ 18 | if (right >= len || ((str[right] & 0xc0) != 0x80)){ 19 | string s = str.substr(left, right - left); 20 | split.push_back(s); 21 | // printf("%s %d %d\n", s.c_str(), left, right); 22 | left = right; 23 | } 24 | right ++; 25 | } 26 | return split; 27 | } 28 | 29 | 30 | // 最长公共子序列(不连续) 31 | int lcs_length_(const string &str1, const string &str2) { 32 | if (str1 == "" || str2 == "") 33 | return 0; 34 | vector s1 = utf8_split(str1); 35 | vector s2 = utf8_split(str2); 36 | int m = s1.size(); 37 | int n = s2.size(); 38 | vector> dp(m + 1, vector(n + 1)); 39 | int i, j; 40 | // printf("%d %d\n", m, n); 41 | 42 | for (i = 0; i <= m; i++) { 43 | dp[i][0] = 0; 44 | } 45 | for (j = 0; j <= n; j++) { 46 | dp[0][j] = 0; 47 | } 48 | for (i = 1; i <= m; i++) { 49 | for (j = 1; j <= n; j++) { 50 | if (s1[i - 1] == s2[j - 1]) { 51 | dp[i][j] = dp[i - 1][j - 1] + 1; 52 | } else { 53 | if (dp[i - 1][j] >= dp[i][j - 1]) 54 | dp[i][j] = dp[i - 1][j]; 55 | else 56 | dp[i][j] = dp[i][j-1]; 57 | } 58 | } 59 | } 60 | return dp[m][n]; 61 | } 62 | 63 | 64 | // 最长公共子串(连续) 65 | int lcs2_length_(const string &str1, const string &str2) { 66 | if (str1 == "" || str2 == "") 67 | return 0; 68 | vector s1 = utf8_split(str1); 69 | vector s2 = utf8_split(str2); 70 | int m = s1.size(); 71 | int n = s2.size(); 72 | vector> dp(m + 1, vector(n + 1)); 73 | int i, j; 74 | int max = 0; 75 | 76 | for (i = 0; i <= m; i++) { 77 | dp[i][0] = 0; 78 | } 79 | for (j = 0; j <= n; j++) { 80 | dp[0][j] = 0; 81 | } 82 | for (i = 1; i <= m; i++) { 83 | for (j = 1; j <= n; j++) { 84 | if (s1[i - 1] == s2[j - 1]) { 85 | dp[i][j] = dp[i - 1][j - 1] + 1; 86 | if (dp[i][j] > max){ 87 | max = dp[i][j]; 88 | } 89 | } 90 | else { 91 | dp[i][j] = 0; 92 | } 93 | } 94 | } 95 | return max; 96 | } 97 | 98 | 99 | // TODO 返回子序列 100 | int lcs(const string &str1, const string &str2){ 101 | 
return lcs_length_(str1, str2); 102 | } 103 | 104 | 105 | // TODO 返回子串 106 | int lcs2(const string &str1, const string &str2){ 107 | return lcs2_length_(str1, str2); 108 | } 109 | 110 | 111 | vector lcs_of_list(const string &str1, vector &str_list){ 112 | int size = str_list.size(); 113 | vector ls(size); 114 | for (int i = 0; i < size; i++){ 115 | int l = lcs(str1, str_list[i]); 116 | ls[i] = l; 117 | } 118 | return ls; 119 | } 120 | 121 | 122 | vector lcs2_of_list(const string &str1, vector &str_list){ 123 | int size = str_list.size(); 124 | vector ls(size); 125 | for (int i = 0; i < size; i++){ 126 | int l = lcs2(str1, str_list[i]); 127 | ls[i] = l; 128 | } 129 | return ls; 130 | } 131 | 132 | 133 | // 编辑距离 134 | int levenshtein_distance(const string &str1, const string &str2) { 135 | if (str1 == "" || str2 == "") 136 | return 0; 137 | vector s1 = utf8_split(str1); 138 | vector s2 = utf8_split(str2); 139 | int m = s1.size(); 140 | int n = s2.size(); 141 | vector> dp(m + 1, vector(n + 1)); 142 | int i, j; 143 | 144 | for (i = 0; i <= m; i++) { 145 | dp[i][0] = i; 146 | } 147 | for (j = 0; j <= n; j++) { 148 | dp[0][j] = j; 149 | } 150 | for (i = 1; i <= m; i++) { 151 | for (j = 1; j <= n; j++) { 152 | if (s1[i - 1] == s2[j - 1]) { 153 | dp[i][j] = dp[i - 1][j - 1]; 154 | } else { 155 | dp[i][j] = 1 + min({dp[i][j - 1], dp[i - 1][j], dp[i - 1][j - 1]}); 156 | } 157 | } 158 | } 159 | return dp[m][n]; 160 | } 161 | 162 | 163 | vector levenshtein_distance_of_list(const string &str1, vector &str_list){ 164 | int size = str_list.size(); 165 | vector ls(size); 166 | for (int i = 0; i < size; i++){ 167 | int l = levenshtein_distance(str1, str_list[i]); 168 | ls[i] = l; 169 | } 170 | return ls; 171 | } 172 | 173 | 174 | namespace py = pybind11; 175 | 176 | 177 | PYBIND11_MODULE(pylcs, m) { 178 | m.def("lcs", &lcs, R"pbdoc( 179 | Longest common subsequence 180 | )pbdoc"); 181 | 182 | m.def("lcs_of_list", &lcs_of_list, R"pbdoc( 183 | Longest common subsequence of list 184 | 
def test_lcs():
    """Check pylcs.lcs on ASCII and Chinese inputs."""
    cases = [
        ("aaa", "bbb", 0),
        ("aaa", "aabbbaa", 3),
        ("你好", "中国", 0),
        ("aaa你好", "你好呀", 2),
    ]
    for first, second, expected in cases:
        assert pylcs.lcs(first, second) == expected
def test_edit_distance_of_list():
    """Check pylcs.edit_distance_of_list against repeated candidate lists."""
    repeat = 10
    cases = [
        ("aaa", ["bbb"], [3]),
        ("aaa你好", ["你好呀"], [4]),
        ("aaa你好", ["bbb", "你好呀"], [5, 4]),
    ]
    for source, candidates, expected in cases:
        result = pylcs.edit_distance_of_list(source, candidates * repeat)
        assert result == expected * repeat