├── MANIFEST ├── distance ├── _pyimports.py ├── __init__.py ├── _lcsubstrings.py ├── _iterators.py ├── _simpledists.py ├── _fastcomp.py └── _levenshtein.py ├── .gitignore ├── cdistance ├── hamming.c ├── distance.h ├── lcsubstrings.c ├── fastcomp.c ├── levenshtein.c ├── includes.h ├── utarray.h └── distance.c ├── README.md ├── setup.py ├── tests └── tests.py └── LICENSE /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | setup.py 3 | distance/__init__.py 4 | distance/distance.py 5 | -------------------------------------------------------------------------------- /distance/_pyimports.py: -------------------------------------------------------------------------------- 1 | from ._fastcomp import * 2 | from ._lcsubstrings import * 3 | from ._levenshtein import * 4 | from ._simpledists import * 5 | from ._iterators import * 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | __pycache__ 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | -------------------------------------------------------------------------------- /cdistance/hamming.c: -------------------------------------------------------------------------------- 1 | #include "distance.h" 2 | 3 | static Py_ssize_t 4 | hamming(unicode *seq1, unicode *seq2, Py_ssize_t len) 5 | { 6 | Py_ssize_t i, dist = 0; 7 | #ifdef SEQUENCE_COMP 8 | int comp; 9 | #endif 10 | 11 | for (i = 0; i < len; i++) 
{ 12 | #ifdef SEQUENCE_COMP 13 | comp = SEQUENCE_COMP(seq1, i, seq2, i); 14 | if (comp == -1) 15 | return -1; 16 | if (!comp) 17 | #else 18 | if (seq1[i] != seq2[i]) 19 | #endif 20 | dist++; 21 | } 22 | 23 | return dist; 24 | } 25 | -------------------------------------------------------------------------------- /distance/__init__.py: -------------------------------------------------------------------------------- 1 | "Utilities for comparing sequences" 2 | 3 | __all__ = ["hamming", "levenshtein", "nlevenshtein", "jaccard", "sorensen", 4 | "fast_comp", "lcsubstrings", "ilevenshtein", "ifast_comp"] 5 | 6 | try: 7 | from .cdistance import * 8 | except ImportError: 9 | from ._pyimports import * 10 | 11 | from ._pyimports import jaccard, sorensen 12 | 13 | def quick_levenshtein(str1, str2): 14 | return fast_comp(str1, str2, transpositions=False) 15 | 16 | def iquick_levenshtein(str1, strs): 17 | return ifast_comp(str1, str2, transpositions=False) 18 | -------------------------------------------------------------------------------- /cdistance/distance.h: -------------------------------------------------------------------------------- 1 | #ifndef DISTANCE_H 2 | #define DISTANCE_H 3 | 4 | #include "Python.h" 5 | #include "utarray.h" 6 | 7 | // Debugging. This kills the interpreter if an assertion fails. 
8 | 9 | #ifdef DISTANCE_DEBUG 10 | #undef NDEBUG 11 | #include <assert.h> 12 | #endif 13 | 14 | // Compatibility Python 2 && 3 15 | 16 | #if PY_MAJOR_VERSION < 3 17 | #define PyBytes_Check PyString_Check 18 | #define PyBytes_AS_STRING PyString_AS_STRING 19 | #define PyBytes_GET_SIZE PyString_GET_SIZE 20 | #define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE 21 | #endif 22 | 23 | // Aliases for each sequence type 24 | 25 | typedef Py_UNICODE unicode; 26 | 27 | typedef char byte; 28 | 29 | typedef PyObject array; 30 | 31 | typedef union { 32 | unicode *u; 33 | byte *b; 34 | array *a; 35 | } sequence; 36 | 37 | 38 | // Used in distance.c and some other files 39 | 40 | #define SWAP(type, a, b) \ 41 | do { \ 42 | type a##_tmp = a; \ 43 | a = b; \ 44 | b = a##_tmp; \ 45 | } while (0) 46 | 47 | 48 | // Used in lcsubstrings.c and distance.c for dynamic array 49 | 50 | struct pair_t { 51 | Py_ssize_t i; 52 | Py_ssize_t j; 53 | }; 54 | 55 | UT_icd pair_icd = {sizeof(struct pair_t), NULL, NULL, NULL}; 56 | 57 | #endif 58 | -------------------------------------------------------------------------------- /distance/_lcsubstrings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from array import array 4 | 5 | 6 | def lcsubstrings(seq1, seq2, positions=False): 7 | """Find the longest common substring(s) in the sequences `seq1` and `seq2`. 8 | 9 | If positions evaluates to `True` only their positions will be returned, 10 | together with their length, in a tuple: 11 | 12 | (length, [(start pos in seq1, start pos in seq2)..]) 13 | 14 | Otherwise, the substrings themselves will be returned, in a set. 
15 | 16 | Example: 17 | 18 | >>> lcsubstrings("sedentar", "dentist") 19 | {'dent'} 20 | >>> lcsubstrings("sedentar", "dentist", positions=True) 21 | (4, [(2, 0)]) 22 | """ 23 | L1, L2 = len(seq1), len(seq2) 24 | ms = [] 25 | mlen = last = 0 26 | if L1 < L2: 27 | seq1, seq2 = seq2, seq1 28 | L1, L2 = L2, L1 29 | 30 | column = array('L', range(L2)) 31 | 32 | for i in range(L1): 33 | for j in range(L2): 34 | old = column[j] 35 | if seq1[i] == seq2[j]: 36 | if i == 0 or j == 0: 37 | column[j] = 1 38 | else: 39 | column[j] = last + 1 40 | if column[j] > mlen: 41 | mlen = column[j] 42 | ms = [(i, j)] 43 | elif column[j] == mlen: 44 | ms.append((i, j)) 45 | else: 46 | column[j] = 0 47 | last = old 48 | 49 | if positions: 50 | return (mlen, tuple((i - mlen + 1, j - mlen + 1) for i, j in ms if ms)) 51 | return set(seq1[i - mlen + 1:i + 1] for i, _ in ms if ms) 52 | -------------------------------------------------------------------------------- /cdistance/lcsubstrings.c: -------------------------------------------------------------------------------- 1 | #include "distance.h" 2 | 3 | 4 | static UT_array * 5 | lcsubstrings(unicode *seq1, unicode *seq2, 6 | Py_ssize_t len1, Py_ssize_t len2, Py_ssize_t *max_len) 7 | { 8 | Py_ssize_t i, j, mlen = -1; 9 | Py_ssize_t old, last, *column; 10 | UT_array *stack = NULL; 11 | struct pair_t pos; 12 | #ifdef SEQUENCE_COMP 13 | int comp; 14 | #endif 15 | 16 | assert(len1 >= len2); 17 | 18 | utarray_new(stack, &pair_icd); 19 | 20 | if (len2 == 0) { 21 | *max_len = 0; 22 | return stack; 23 | } 24 | 25 | if ((column = (Py_ssize_t *)malloc((len2 + 1) * sizeof(Py_ssize_t))) == NULL) 26 | goto On_Error; 27 | 28 | last = 0; 29 | for (j = 0; j < len2; j++) 30 | column[j] = j; 31 | 32 | for (i = 0; i < len1; i++) { 33 | for (j = 0; j < len2; j++) { 34 | old = column[j]; 35 | #ifdef SEQUENCE_COMP 36 | comp = SEQUENCE_COMP(seq1, i, seq2, j); 37 | if (comp == -1) 38 | goto On_Error; 39 | if (comp) { 40 | #else 41 | if (seq1[i] == seq2[j]) { 42 | 
#endif 43 | column[j] = ((i == 0 || j == 0) ? 1 : (last + 1)); 44 | if (column[j] > mlen) { 45 | mlen = column[j]; 46 | pos.i = i; 47 | pos.j = j; 48 | utarray_clear(stack); 49 | utarray_push_back(stack, &pos); 50 | } 51 | else if (column[j] == mlen) { 52 | pos.i = i; 53 | pos.j = j; 54 | utarray_push_back(stack, &pos); 55 | } 56 | } 57 | else 58 | column[j] = 0; 59 | last = old; 60 | } 61 | } 62 | 63 | free(column); 64 | 65 | *max_len = mlen; 66 | return stack; 67 | 68 | On_Error: 69 | free(column); 70 | utarray_free(stack); 71 | return NULL; 72 | } 73 | -------------------------------------------------------------------------------- /distance/_iterators.py: -------------------------------------------------------------------------------- 1 | from ._pyimports import levenshtein, fast_comp 2 | 3 | def ilevenshtein(seq1, seqs, max_dist=-1): 4 | """Compute the Levenshtein distance between the sequence `seq1` and the series 5 | of sequences `seqs`. 6 | 7 | `seq1`: the reference sequence 8 | `seqs`: a series of sequences (can be a generator) 9 | `max_dist`: if provided and > 0, only the sequences which distance from 10 | the reference sequence is lower or equal to this value will be returned. 11 | 12 | The return value is a series of pairs (distance, sequence). 13 | 14 | The sequence objects in `seqs` are expected to be of the same kind than 15 | the reference sequence in the C implementation; the same holds true for 16 | `ifast_comp`. 17 | """ 18 | for seq2 in seqs: 19 | dist = levenshtein(seq1, seq2, max_dist=max_dist) 20 | if dist != -1: 21 | yield dist, seq2 22 | 23 | 24 | def ifast_comp(seq1, seqs, transpositions=False): 25 | """Return an iterator over all the sequences in `seqs` which distance from 26 | `seq1` is lower or equal to 2. The sequences which distance from the 27 | reference sequence is higher than that are dropped. 28 | 29 | `seq1`: the reference sequence. 
30 | `seqs`: a series of sequences (can be a generator) 31 | `transpositions` has the same sense than in `fast_comp`. 32 | 33 | The return value is a series of pairs (distance, sequence). 34 | 35 | You might want to call `sorted()` on the iterator to get the results in a 36 | significant order: 37 | 38 | >>> g = ifast_comp("foo", ["fo", "bar", "foob", "foo", "foobaz"]) 39 | >>> sorted(g) 40 | [(0, 'foo'), (1, 'fo'), (1, 'foob')] 41 | """ 42 | for seq2 in seqs: 43 | dist = fast_comp(seq1, seq2, transpositions) 44 | if dist != -1: 45 | yield dist, seq2 46 | -------------------------------------------------------------------------------- /distance/_simpledists.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | def hamming(seq1, seq2, normalized=False): 4 | """Compute the Hamming distance between the two sequences `seq1` and `seq2`. 5 | The Hamming distance is the number of differing items in two ordered 6 | sequences of the same length. If the sequences submitted do not have the 7 | same length, an error will be raised. 8 | 9 | If `normalized` evaluates to `False`, the return value will be an integer 10 | between 0 and the length of the sequences provided, edge values included; 11 | otherwise, it will be a float between 0 and 1 included, where 0 means 12 | equal, and 1 totally different. Normalized hamming distance is computed as: 13 | 14 | 0.0 if len(seq1) == 0 15 | hamming_dist / len(seq1) otherwise 16 | """ 17 | L = len(seq1) 18 | if L != len(seq2): 19 | raise ValueError("expected two strings of the same length") 20 | if L == 0: 21 | return 0.0 if normalized else 0 # equal 22 | dist = sum(c1 != c2 for c1, c2 in zip(seq1, seq2)) 23 | if normalized: 24 | return dist / float(L) 25 | return dist 26 | 27 | def jaccard(seq1, seq2): 28 | """Compute the Jaccard distance between the two sequences `seq1` and `seq2`. 29 | They should contain hashable items. 
30 | 31 | The return value is a float between 0 and 1, where 0 means equal, and 1 totally different. 32 | """ 33 | set1, set2 = set(seq1), set(seq2) 34 | return 1 - len(set1 & set2) / float(len(set1 | set2)) 35 | 36 | 37 | def sorensen(seq1, seq2): 38 | """Compute the Sorensen distance between the two sequences `seq1` and `seq2`. 39 | They should contain hashable items. 40 | 41 | The return value is a float between 0 and 1, where 0 means equal, and 1 totally different. 42 | """ 43 | set1, set2 = set(seq1), set(seq2) 44 | return 1 - (2 * len(set1 & set2) / float(len(set1) + len(set2))) 45 | -------------------------------------------------------------------------------- /distance/_fastcomp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | def fast_comp(seq1, seq2, transpositions=False): 4 | """Compute the distance between the two sequences `seq1` and `seq2` up to a 5 | maximum of 2 included, and return it. If the edit distance between the two 6 | sequences is higher than that, -1 is returned. 7 | 8 | If `transpositions` is `True`, transpositions will be taken into account for 9 | the computation of the distance. This can make a difference, e.g.: 10 | 11 | >>> fast_comp("abc", "bac", transpositions=False) 12 | 2 13 | >>> fast_comp("abc", "bac", transpositions=True) 14 | 1 15 | 16 | This is faster than `levenshtein` by an order of magnitude, but on the 17 | other hand is of limited use. 18 | 19 | The algorithm comes from `http://writingarchives.sakura.ne.jp/fastcomp`. 20 | I've added transpositions support to the original code. 
21 | """ 22 | replace, insert, delete = "r", "i", "d" 23 | 24 | L1, L2 = len(seq1), len(seq2) 25 | if L1 < L2: 26 | L1, L2 = L2, L1 27 | seq1, seq2 = seq2, seq1 28 | 29 | ldiff = L1 - L2 30 | if ldiff == 0: 31 | models = (insert+delete, delete+insert, replace+replace) 32 | elif ldiff == 1: 33 | models = (delete+replace, replace+delete) 34 | elif ldiff == 2: 35 | models = (delete+delete,) 36 | else: 37 | return -1 38 | 39 | res = 3 40 | for model in models: 41 | i = j = c = 0 42 | while (i < L1) and (j < L2): 43 | if seq1[i] != seq2[j]: 44 | c = c+1 45 | if 2 < c: 46 | break 47 | 48 | if transpositions and ldiff != 2 \ 49 | and i < L1 - 1 and j < L2 - 1 \ 50 | and seq1[i+1] == seq2[j] and seq1[i] == seq2[j+1]: 51 | i, j = i+2, j+2 52 | else: 53 | cmd = model[c-1] 54 | if cmd == delete: 55 | i = i+1 56 | elif cmd == insert: 57 | j = j+1 58 | else: 59 | assert cmd == replace 60 | i,j = i+1, j+1 61 | else: 62 | i,j = i+1, j+1 63 | 64 | if 2 < c: 65 | continue 66 | elif i < L1: 67 | if L1-i <= model[c:].count(delete): 68 | c = c + (L1-i) 69 | else: 70 | continue 71 | elif j < L2: 72 | if L2-j <= model[c:].count(insert): 73 | c = c + (L2-j) 74 | else: 75 | continue 76 | 77 | if c < res: 78 | res = c 79 | 80 | if res == 3: 81 | res = -1 82 | return res 83 | -------------------------------------------------------------------------------- /cdistance/fastcomp.c: -------------------------------------------------------------------------------- 1 | #include "distance.h" 2 | 3 | 4 | static short 5 | fastcomp(unicode *seq1, unicode *seq2, Py_ssize_t len1, Py_ssize_t len2, int transpositions) 6 | { 7 | char *models[3]; 8 | short m, cnt, res = 3; 9 | Py_ssize_t i, j, c, ldiff; 10 | #ifdef SEQUENCE_COMP 11 | int comp; 12 | #endif 13 | 14 | if (len1 < len2) { 15 | SWAP(unicode *, seq1, seq2); 16 | SWAP(Py_ssize_t, len1, len2); 17 | } 18 | 19 | ldiff = len1 - len2; 20 | switch (ldiff) { 21 | case 0: 22 | models[2] = "id"; 23 | models[1] = "di"; 24 | models[0] = "rr"; 25 | m = 2; 26 | 
break; 27 | case 1: 28 | models[1] = "dr"; 29 | models[0] = "rd"; 30 | m = 1; 31 | break; 32 | case 2: 33 | models[0] = "dd"; 34 | m = 0; 35 | break; 36 | default: 37 | return -1; 38 | } 39 | 40 | for (; m >= 0; m--) { 41 | 42 | i = j = c = 0; 43 | 44 | while (i < len1 && j < len2) 45 | { 46 | #ifdef SEQUENCE_COMP 47 | comp = SEQUENCE_COMP(seq1, i, seq2, j); 48 | if (comp == -1) 49 | return -2; 50 | if (!comp) { 51 | #else 52 | if (seq1[i] != seq2[j]) { 53 | #endif 54 | c++; 55 | if (c > 2) 56 | break; 57 | 58 | /* Transpositions handling. `ldiff`, which is the absolute difference between the length 59 | of the sequences `seq1` and `seq2`, should not be equal to 2 because in this case only 60 | deletions can happen (given that the distance between the two sequences should not be 61 | higher than 2, this is the shortest path). 62 | We do a lookahead to check if a transposition is possible between the current position 63 | and the next one, and, if so, we systematically choose this path over the other alternative 64 | edit operations. We act like so because the cost of a transposition is always the lowest 65 | one in such situations. 
66 | */ 67 | #ifdef SEQUENCE_COMP 68 | if (transpositions && ldiff != 2 && i < (len1 - 1) && j < (len2 - 1)) { 69 | comp = SEQUENCE_COMP(seq1, i + 1, seq2, j); 70 | if (comp == -1) 71 | return -2; 72 | else if (comp) { 73 | comp = SEQUENCE_COMP(seq1, i, seq2, j + 1); 74 | if (comp == -1) 75 | return -2; 76 | else if (comp) { 77 | i = i + 2; 78 | j = j + 2; 79 | continue; 80 | } 81 | } 82 | } 83 | #else 84 | if (transpositions && ldiff != 2 && i < (len1 - 1) && j < (len2 - 1) && \ 85 | seq1[i + 1] == seq2[j] && \ 86 | seq1[i] == seq2[j + 1]) { 87 | i = i + 2; 88 | j = j + 2; 89 | continue; 90 | } 91 | #endif 92 | if (models[m][c - 1] == 'd') 93 | i++; 94 | else if (models[m][c - 1] == 'i') 95 | j++; 96 | else { 97 | i++; 98 | j++; 99 | } 100 | } 101 | else { 102 | i++; 103 | j++; 104 | } 105 | } 106 | 107 | if (c > 2) 108 | continue; 109 | 110 | else if (i < len1) { 111 | if (c == 1) 112 | cnt = (models[m][1] == 'd'); 113 | else 114 | cnt = (models[m][0] == 'd') + (models[m][1] == 'd'); 115 | if (len1 - i <= cnt) { 116 | c = c + (len1 - i); 117 | } 118 | else 119 | continue; 120 | } 121 | else if (j < len2) { 122 | if (len2 - j <= (models[m][c] == 'i')) 123 | c = c + (len2 - j); 124 | else 125 | continue; 126 | } 127 | if (c < res) { 128 | res = c; 129 | } 130 | } 131 | 132 | if (res == 3) 133 | res = -1; 134 | 135 | return res; 136 | } 137 | -------------------------------------------------------------------------------- /cdistance/levenshtein.c: -------------------------------------------------------------------------------- 1 | #include "distance.h" 2 | 3 | #define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c))) 4 | #define MAX3(a, b, c) ((a) > (b) ? ((a) > (c) ? (a) : (c)) : ((b) > (c) ? 
(b) : (c))) 5 | 6 | #ifndef LEVENSHTEIN_C 7 | #define LEVENSHTEIN_C 8 | 9 | static Py_ssize_t 10 | minimum(const Py_ssize_t *column, Py_ssize_t len) 11 | { 12 | Py_ssize_t min; 13 | 14 | assert(len > 0); 15 | min = column[--len]; 16 | while (len-- >= 0) { 17 | if (column[len] < min) 18 | min = column[len]; 19 | } 20 | 21 | return min; 22 | } 23 | 24 | #endif 25 | 26 | static Py_ssize_t 27 | levenshtein(unicode *seq1, unicode *seq2, Py_ssize_t len1, Py_ssize_t len2, Py_ssize_t max_dist) 28 | { 29 | Py_ssize_t i, j; 30 | Py_ssize_t last, old; 31 | Py_ssize_t cost, dist = -2; 32 | Py_ssize_t *column; 33 | 34 | #ifdef SEQUENCE_COMP 35 | int comp; 36 | #endif 37 | 38 | if (len1 < len2) { 39 | SWAP(unicode *, seq1, seq2); 40 | SWAP(Py_ssize_t, len1, len2); 41 | } 42 | 43 | if (max_dist >= 0 && (len1 - len2) > max_dist) 44 | return -1; 45 | else { 46 | if (len1 == 0) 47 | return len2; 48 | if (len2 == 0) 49 | return len1; 50 | } 51 | 52 | if ((column = (Py_ssize_t *) malloc((len2 + 1) * sizeof(Py_ssize_t))) == NULL) 53 | return -2; 54 | 55 | for (j = 1 ; j <= len2; j++) 56 | column[j] = j; 57 | 58 | for (i = 1 ; i <= len1; i++) { 59 | column[0] = i; 60 | for (j = 1, last = i - 1; j <= len2; j++) { 61 | old = column[j]; 62 | #ifdef SEQUENCE_COMP 63 | comp = SEQUENCE_COMP(seq1, i - 1, seq2, j - 1); 64 | if (comp == -1) { 65 | free(column); 66 | return -3; 67 | } 68 | cost = (!comp); 69 | #else 70 | cost = (seq1[i - 1] != seq2[j - 1]); 71 | #endif 72 | column[j] = MIN3( 73 | column[j] + 1, 74 | column[j - 1] + 1, 75 | last + cost 76 | ); 77 | last = old; 78 | } 79 | if (max_dist >= 0 && minimum(column, len2 + 1) > max_dist) { 80 | free(column); 81 | return -1; 82 | } 83 | } 84 | 85 | dist = column[len2]; 86 | 87 | free(column); 88 | 89 | if (max_dist >= 0 && dist > max_dist) 90 | return -1; 91 | return dist; 92 | } 93 | 94 | 95 | static double 96 | nlevenshtein(unicode *seq1, unicode *seq2, Py_ssize_t len1, Py_ssize_t len2, short method) 97 | { 98 | Py_ssize_t i, j; 99 | 100 
| // distance 101 | Py_ssize_t ic, dc, rc; 102 | Py_ssize_t last, old; 103 | Py_ssize_t *column; 104 | Py_ssize_t fdist; 105 | 106 | // length 107 | Py_ssize_t lic, ldc, lrc; 108 | Py_ssize_t llast, lold; 109 | Py_ssize_t *length; 110 | Py_ssize_t flen; 111 | 112 | #ifdef SEQUENCE_COMP 113 | int comp; 114 | #endif 115 | 116 | assert(len1 >= len2); 117 | 118 | if (len1 == 0) // len2 is 0 too, so the two sequences are identical 119 | return 0.0; 120 | if (len2 == 0) // completely different 121 | return 1.0; 122 | 123 | if (method == 1) { 124 | fdist = levenshtein(seq1, seq2, len1, len2, -1); 125 | if (fdist < 0) // error 126 | return fdist; 127 | return fdist / (double)len1; 128 | } 129 | 130 | if ((column = (Py_ssize_t *)malloc((len2 + 1) * sizeof(Py_ssize_t))) == NULL) 131 | return -1; 132 | if ((length = (Py_ssize_t *)malloc((len2 + 1) * sizeof(Py_ssize_t))) == NULL) { 133 | free(column); 134 | return -1; 135 | } 136 | 137 | for (j = 1 ; j <= len2; j++) 138 | column[j] = length[j] = j; 139 | 140 | for (i = 1 ; i <= len1; i++) { 141 | column[0] = length[0] = i; 142 | 143 | for (j = 1, last = llast = i - 1; j <= len2; j++) { 144 | 145 | // distance 146 | old = column[j]; 147 | ic = column[j - 1] + 1; 148 | dc = column[j] + 1; 149 | #ifdef SEQUENCE_COMP 150 | comp = SEQUENCE_COMP(seq1, i - 1, seq2, j - 1); 151 | if (comp == -1) { 152 | free(column); 153 | free(length); 154 | return -2; 155 | } 156 | rc = last + (!comp); 157 | #else 158 | rc = last + (seq1[i - 1] != seq2[j - 1]); 159 | #endif 160 | column[j] = MIN3(ic, dc, rc); 161 | last = old; 162 | 163 | // length 164 | lold = length[j]; 165 | lic = (ic == column[j] ? length[j - 1] + 1 : 0); 166 | ldc = (dc == column[j] ? length[j] + 1 : 0); 167 | lrc = (rc == column[j] ? 
llast + 1 : 0); 168 | length[j] = MAX3(lic, ldc, lrc); 169 | llast = lold; 170 | } 171 | } 172 | 173 | fdist = column[len2]; 174 | flen = length[len2]; 175 | 176 | free(column); 177 | free(length); 178 | 179 | return fdist / (double)flen; 180 | } 181 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | distance - Utilities for comparing sequences 2 | ============================================ 3 | 4 | This package provides helpers for computing similarities between arbitrary sequences. Included metrics are Levenshtein, Hamming, Jaccard, and Sorensen distance, plus some bonuses. All distance computations are implemented in pure Python, and most of them are also implemented in C. 5 | 6 | 7 | Installation 8 | ------------ 9 | 10 | If you don't want or need to use the C extension, just unpack the archive and run, as root: 11 | 12 | # python setup.py install 13 | 14 | For the C extension to work, you need the Python source files, and a C compiler (typically Microsoft Visual C++ 2010 on Windows, and GCC on Mac and Linux). On a Debian-like system, you can get all of these with: 15 | 16 | # apt-get install gcc pythonX.X-dev 17 | 18 | where X.X is the number of your Python version. 19 | 20 | Then you should type: 21 | 22 | # python setup.py install --with-c 23 | 24 | Note the use of the `--with-c` switch. 
25 | 26 | 27 | Usage 28 | ----- 29 | 30 | A common use case for this module is to compare single words for similarity: 31 | 32 | >>> distance.levenshtein("lenvestein", "levenshtein") 33 | 3 34 | >>> distance.hamming("hamming", "hamning") 35 | 1 36 | 37 | If there is not a one-to-one mapping between sounds and glyphs in your language, or if you want to compare not glyphs, but syllables or phonems, you can pass in tuples of characters: 38 | 39 | >>> t1 = ("de", "ci", "si", "ve") 40 | >>> t2 = ("de", "ri", "si", "ve") 41 | >>> distance.levenshtein(t1, t2) 42 | 1 43 | 44 | Comparing lists of strings can also be useful for computing similarities between sentences, paragraphs, etc.: 45 | 46 | >>> sent1 = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'] 47 | >>> sent2 = ['the', 'lazy', 'fox', 'jumps', 'over', 'the', 'crazy', 'dog'] 48 | >>> distance.levenshtein(sent1, sent2) 49 | 3 50 | 51 | Hamming and Levenshtein distance can be normalized, so that the results of several distance measures can be meaningfully compared. Two strategies are available for Levenshtein: either the length of the shortest alignment between the sequences is taken as factor, or the length of the longer one. Example uses: 52 | 53 | >>> distance.hamming("fat", "cat", normalized=True) 54 | 0.3333333333333333 55 | >>> distance.nlevenshtein("abc", "acd", method=1) # shortest alignment 56 | 0.6666666666666666 57 | >>> distance.nlevenshtein("abc", "acd", method=2) # longest alignment 58 | 0.5 59 | 60 | `jaccard` and `sorensen` return a normalized value per default: 61 | 62 | >>> distance.sorensen("decide", "resize") 63 | 0.5555555555555556 64 | >>> distance.jaccard("decide", "resize") 65 | 0.7142857142857143 66 | 67 | As for the bonuses, there is a `fast_comp` function, which computes the distance between two strings up to a value of 2 included. If the distance between the strings is higher than that, -1 is returned. 
This function is of limited use, but on the other hand it is quite faster than `levenshtein`. There is also a `lcsubstrings` function which can be used to find the longest common substrings in two sequences. 68 | 69 | Finally, two convenience iterators `ilevenshtein` and `ifast_comp` are provided, which are intended to be used for filtering from a long list of sequences the ones that are close to a reference one. They both return a series of tuples (distance, sequence). Example: 70 | 71 | >>> tokens = ["fo", "bar", "foob", "foo", "fooba", "foobar"] 72 | >>> sorted(distance.ifast_comp("foo", tokens)) 73 | [(0, 'foo'), (1, 'fo'), (1, 'foob'), (2, 'fooba')] 74 | >>> sorted(distance.ilevenshtein("foo", tokens, max_dist=1)) 75 | [(0, 'foo'), (1, 'fo'), (1, 'foob')] 76 | 77 | `ifast_comp` is particularly efficient, and can handle 1 million tokens without a problem. 78 | 79 | For more informations, see the functions documentation (`help(funcname)`). 80 | 81 | Have fun! 82 | 83 | 84 | Changelog 85 | --------- 86 | 87 | 20/11/13: 88 | * Switched back to using the to-be-deprecated Python unicode api. Good news is that this makes the 89 | C extension compatible with Python 2.7+, and that distance computations on unicode strings is now 90 | much faster. 91 | * Added a C version of `lcsubstrings`. 92 | * Added a new method for computing normalized Levenshtein distance. 93 | * Added some tests. 94 | 95 | 12/11/13: 96 | Expanded `fast_comp` (formerly `quick_levenshtein`) so that it can handle transpositions. 97 | Fixed variable interversions in (C) `levenshtein` which produced sometimes strange results. 98 | 99 | 10/11/13: 100 | Added `quick_levenshtein` and `iquick_levenshtein`. 101 | 102 | 05/11/13: 103 | Added Sorensen and Jaccard metrics, fixed memory issue in Levenshtein. 
104 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Distance - Utilities for comparing sequences 4 | # Copyright (C) 2013 Michaël Meyer 5 | 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 18 | 19 | 20 | import os, sys, ast, _ast, re 21 | from distutils.core import setup, Extension 22 | 23 | this_dir = os.path.dirname(os.path.abspath(__file__)) 24 | pkg_dir = os.path.join(this_dir, "distance") 25 | cpkg_dir = os.path.join(this_dir, "cdistance") 26 | 27 | ctypes = ["unicode", "byte", "array"] 28 | 29 | cfunctions = { 30 | "levenshtein": ["levenshtein", "nlevenshtein"], 31 | "hamming": ["hamming"], 32 | "lcsubstrings": ["lcsubstrings"], 33 | "fastcomp": ["fastcomp"], 34 | } 35 | 36 | sequence_compare = """\ 37 | #define SEQUENCE_COMPARE(s1, i1, s2, i2) \\ 38 | (PyObject_RichCompareBool( \\ 39 | PySequence_Fast_GET_ITEM((s1), (i1)), \\ 40 | PySequence_Fast_GET_ITEM((s2), (i2)), \\ 41 | Py_EQ) \\ 42 | ) 43 | """ 44 | 45 | def make_c_doc(): 46 | buff = [] 47 | py_sources = [f for f in os.listdir(pkg_dir) if f.endswith('.py')] 48 | for file in py_sources: 49 | with open(os.path.join(pkg_dir, file)) as f: 50 | content = f.read() 51 | tree = ast.parse(content) 52 | for doc_string in parse_tree(tree, content): 53 | 
buff.append(doc_string) 54 | join_str = 2 * '\n' 55 | return join_str.join(buff) + '\n' 56 | 57 | 58 | def parse_tree(tree, content): 59 | for node in ast.iter_child_nodes(tree): 60 | if not isinstance(node, _ast.FunctionDef): 61 | continue 62 | doc_string = ast.get_docstring(node) 63 | if not doc_string: 64 | continue 65 | func_def = re.findall("def\s%s\s*(.+?)\s*:" % node.name, content) 66 | assert func_def and len(func_def) == 1 67 | func_def = node.name + func_def[0] + 2 * '\\n\\\n' 68 | doc_string = doc_string.replace('\n', '\\n\\\n').replace('"', '\\"') 69 | doc_string = doc_string.replace('\n' + 8 * ' ', '\n' + 4 * ' ') 70 | doc_string = '#define %s_doc \\\n"%s%s"\n' % (node.name, func_def, doc_string) 71 | yield doc_string 72 | 73 | 74 | def format_header(): 75 | yield sequence_compare 76 | for cfile, cfuncs in cfunctions.items(): 77 | for ctype in ctypes: 78 | if ctype == "array": 79 | yield("#define SEQUENCE_COMP SEQUENCE_COMPARE") 80 | yield('#define unicode %(type)s' % dict(type=ctype)) 81 | for cfunc in cfuncs: 82 | yield("#define %(function)s %(tcode)s%(function)s" % dict(function=cfunc, tcode=ctype[0])) 83 | yield('#include "%(file)s.c"' % dict(file=cfile)) 84 | yield("#undef unicode") 85 | for cfunc in cfuncs: 86 | yield("#undef %(function)s" % dict(function=cfunc)) 87 | if ctype == "array": 88 | yield("#undef SEQUENCE_COMP") 89 | yield("") 90 | 91 | 92 | def prepare(): 93 | with open(os.path.join(cpkg_dir, "includes.h"), "w") as f: 94 | f.write(make_c_doc()) 95 | f.write(4 * '\n') 96 | f.write('\n'.join(format_header())) 97 | 98 | 99 | args = sys.argv[1:] 100 | if "prepare" in args: 101 | prepare() 102 | sys.exit() 103 | 104 | if "--with-c" in args: 105 | args.remove("--with-c") 106 | ext_modules = [Extension('distance.cdistance', sources=["cdistance/distance.c"])] 107 | else: 108 | sys.stderr.write("notice: no C support available\n") 109 | ext_modules = [] 110 | 111 | with open(os.path.join(this_dir, "README.md")) as f: 112 | long_description = 
f.read() 113 | 114 | setup ( 115 | name = 'Distance', 116 | version = '0.1.3', 117 | description = 'Utilities for comparing sequences', 118 | long_description = long_description, 119 | author='Michaël Meyer', 120 | author_email='michaelnm.meyer@gmail.com', 121 | url='https://github.com/doukremt/distance', 122 | ext_modules = ext_modules, 123 | script_args = args, 124 | packages = ['distance'], 125 | classifiers=( 126 | 'Intended Audience :: Developers', 127 | 'Natural Language :: English', 128 | 'License :: OSI Approved :: GNU General Public License (GPL)', 129 | 'Operating System :: OS Independent', 130 | 'Topic :: Software Development :: Libraries :: Python Modules', 131 | 'Programming Language :: C', 132 | 'Programming Language :: Python', 133 | 'Programming Language :: Python :: 3.3', 134 | ) 135 | ) 136 | -------------------------------------------------------------------------------- /distance/_levenshtein.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from array import array 4 | 5 | 6 | def levenshtein(seq1, seq2, normalized=False, max_dist=-1): 7 | """Compute the absolute Levenshtein distance between the two sequences 8 | `seq1` and `seq2`. 9 | 10 | The Levenshtein distance is the minimum number of edit operations necessary 11 | for transforming one sequence into the other. The edit operations allowed are: 12 | 13 | * deletion: ABC -> BC, AC, AB 14 | * insertion: ABC -> ABCD, EABC, AEBC.. 15 | * substitution: ABC -> ABE, ADC, FBC.. 16 | 17 | The `max_dist` parameter controls at which moment we should stop computing the 18 | distance between the provided sequences. If it is a negative integer, the 19 | distance will be computed until the sequences are exhausted; otherwise, the 20 | computation will stop at the moment the calculated distance is higher than 21 | `max_dist`, and then return -1. 
For example: 22 | 23 | >>> levenshtein("abc", "abcd", max_dist=1) # dist = 1 24 | 1 25 | >>> levenshtein("abc", "abcde", max_dist=1) # dist = 2 26 | -1 27 | 28 | This can be a time saver if you're not interested in the exact distance, but 29 | only need to check if the distance between the given sequences is below a 30 | given threshold. 31 | 32 | The `normalized` parameter is here for backward compatibility; providing 33 | it will result in a call to `nlevenshtein`, which should be used directly 34 | instead. 35 | """ 36 | if normalized: 37 | return nlevenshtein(seq1, seq2, method=1) 38 | 39 | if seq1 == seq2: 40 | return 0 41 | 42 | len1, len2 = len(seq1), len(seq2) 43 | if max_dist >= 0 and abs(len1 - len2) > max_dist: 44 | return -1 45 | if len1 == 0: 46 | return len2 47 | if len2 == 0: 48 | return len1 49 | if len1 < len2: 50 | len1, len2 = len2, len1 51 | seq1, seq2 = seq2, seq1 52 | 53 | column = array('L', range(len2 + 1)) 54 | 55 | for x in range(1, len1 + 1): 56 | column[0] = x 57 | last = x - 1 58 | for y in range(1, len2 + 1): 59 | old = column[y] 60 | cost = int(seq1[x - 1] != seq2[y - 1]) 61 | column[y] = min(column[y] + 1, column[y - 1] + 1, last + cost) 62 | last = old 63 | if max_dist >= 0 and min(column) > max_dist: 64 | return -1 65 | 66 | if max_dist >= 0 and column[len2] > max_dist: 67 | # stay consistent, even if we have the exact distance 68 | return -1 69 | return column[len2] 70 | 71 | 72 | def nlevenshtein(seq1, seq2, method=1): 73 | """Compute the normalized Levenshtein distance between `seq1` and `seq2`. 74 | 75 | Two normalization methods are provided. For both of them, the normalized 76 | distance will be a float between 0 and 1, where 0 means equal and 1 77 | completely different. The computation obeys the following patterns: 78 | 79 | 0.0 if seq1 == seq2 80 | 1.0 if len(seq1) == 0 or len(seq2) == 0 81 | edit distance / factor otherwise 82 | 83 | The `method` parameter specifies which normalization factor should be used. 
84 | It can have the value 1 or 2, which correspond to the following: 85 | 86 | 1: the length of the shortest alignment between the sequences 87 | (that is, the length of the longest sequence) 88 | 2: the length of the longest alignment between the sequences 89 | 90 | Which normalization factor should be chosen is a matter of taste. The first 91 | one is cheap to compute. The second one is more costly, but it accounts 92 | better than the first one for parallelisms of symbols between the sequences. 93 | 94 | For the rationale behind the use of the second method, see: 95 | Heeringa, "Measuring Dialect Pronunciation Differences using Levenshtein 96 | Distance", 2004, p. 130 sq, which is available online at: 97 | http://www.let.rug.nl/~heeringa/dialectology/thesis/thesis.pdf 98 | """ 99 | 100 | if seq1 == seq2: 101 | return 0.0 102 | len1, len2 = len(seq1), len(seq2) 103 | if len1 == 0 or len2 == 0: 104 | return 1.0 105 | if len1 < len2: # minimize the arrays size 106 | len1, len2 = len2, len1 107 | seq1, seq2 = seq2, seq1 108 | 109 | if method == 1: 110 | return levenshtein(seq1, seq2) / float(len1) 111 | if method != 2: 112 | raise ValueError("expected either 1 or 2 for `method` parameter") 113 | 114 | column = array('L', range(len2 + 1)) 115 | length = array('L', range(len2 + 1)) 116 | 117 | for x in range(1, len1 + 1): 118 | 119 | column[0] = length[0] = x 120 | last = llast = x - 1 121 | 122 | for y in range(1, len2 + 1): 123 | 124 | # dist 125 | old = column[y] 126 | ic = column[y - 1] + 1 127 | dc = column[y] + 1 128 | rc = last + (seq1[x - 1] != seq2[y - 1]) 129 | column[y] = min(ic, dc, rc) 130 | last = old 131 | 132 | # length 133 | lold = length[y] 134 | lic = length[y - 1] + 1 if ic == column[y] else 0 135 | ldc = length[y] + 1 if dc == column[y] else 0 136 | lrc = llast + 1 if rc == column[y] else 0 137 | length[y] = max(ldc, lic, lrc) 138 | llast = lold 139 | 140 | return column[y] / float(length[y]) 141 | 
-------------------------------------------------------------------------------- /tests/tests.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | from array import array 3 | try: 4 | from distance import cdistance 5 | except ImportError: 6 | cdistance = None 7 | from distance import _pyimports as pydistance 8 | 9 | 10 | if sys.version_info.major < 3: 11 | t_unicode = unicode 12 | t_bytes = lambda s: s 13 | else: 14 | t_unicode = lambda s: s 15 | t_bytes = lambda s: s.encode() 16 | 17 | all_types = [ 18 | ("unicode", t_unicode), 19 | ("bytes", t_bytes), 20 | ("list", list), 21 | ("tuple", tuple), 22 | ] 23 | 24 | 25 | def hamming(func, t, **kwargs): 26 | 27 | # types; only for c 28 | if kwargs["lang"] == "C": 29 | try: 30 | func(1, t("foo")) 31 | except ValueError: 32 | pass 33 | try: 34 | func(t("foo"), 1) 35 | except ValueError: 36 | pass 37 | 38 | # empty string 39 | assert func(t(""), t("")) == 0 40 | 41 | # common 42 | assert func(t("abc"), t("abc")) == 0 43 | assert func(t("abc"), t("abd")) == 1 44 | 45 | # wrong length 46 | try: 47 | func(t("foo"), t("foobar")) 48 | except ValueError: 49 | pass 50 | 51 | try: 52 | func(t(""), t("foo")) 53 | except ValueError: 54 | pass 55 | 56 | # normalization 57 | assert func(t(""), t(""), normalized=True) == 0.0 58 | assert func(t("abc"), t("abc"), normalized=True) == 0.0 59 | assert func(t("ab"), t("ac"), normalized=True) == 0.5 60 | assert func(t("abc"), t("def"), normalized=True) == 1.0 61 | 62 | 63 | def fast_comp(func, t, **kwargs): 64 | 65 | # types; only for c 66 | if kwargs["lang"] == "C": 67 | try: 68 | func(1, t("foo")) 69 | except ValueError: 70 | pass 71 | try: 72 | func(t("foo"), 1) 73 | except ValueError: 74 | pass 75 | 76 | # empty strings 77 | assert func(t(""), t("")) == 0 78 | assert func(t(""), t("a")) == func(t("a"), t("")) == 1 79 | 80 | # edit ops 81 | assert func(t("aa"), t("aa")) == 0 82 | assert func(t("ab"), t("aa")) == 1 83 | assert 
func(t("ab"), t("a")) == 1 84 | assert func(t("ab"), t("abc")) == 1 85 | 86 | # dist limit 87 | assert func(t("a"), t("bcd")) == func(t("bcd"), t("a")) == -1 88 | 89 | # transpositions 90 | assert func(t("abc"), t("bac"), transpositions=True) == \ 91 | func(t("bac"), t("abc"), transpositions=True) == 1 92 | 93 | 94 | 95 | def levenshtein(func, t, **kwargs): 96 | 97 | # types; only for c 98 | if kwargs["lang"] == "C": 99 | try: 100 | func(1, t("foo")) 101 | except ValueError: 102 | pass 103 | try: 104 | func(t("foo"), 1) 105 | except ValueError: 106 | pass 107 | 108 | # empty strings 109 | assert func(t(""), t("")) == 0 110 | assert func(t(""), t("abcd")) == func(t("abcd"), t("")) == 4 111 | 112 | # edit ops 113 | assert func(t("aa"), t("aa")) == 0 114 | assert func(t("ab"), t("aa")) == 1 115 | assert func(t("ab"), t("a")) == 1 116 | assert func(t("ab"), t("abc")) == 1 117 | 118 | # dist limit 119 | assert func(t("a"), t("b"), max_dist=0) == -1 120 | assert func(t("a"), t("b"), max_dist=1) == 1 121 | assert func(t("foo"), t("bar"), max_dist=-1) == 3 122 | 123 | 124 | def nlevenshtein(func, t, **kwargs): 125 | 126 | # types; only for c 127 | if kwargs["lang"] == "C": 128 | try: 129 | func(1, t("foo")) 130 | except ValueError: 131 | pass 132 | try: 133 | func(t("foo"), 1) 134 | except ValueError: 135 | pass 136 | 137 | # empty strings 138 | assert func(t(""), t(""), 1) == func(t(""), t(""), 2) == 0.0 139 | assert func(t(""), t("foo"), 1) == func(t("foo"), t(""), 1) == \ 140 | func(t(""), t("foo"), 2) == func(t("foo"), t(""), 2) == 1.0 141 | 142 | assert func(t("aa"), t("aa"), 1) == func(t("aa"), t("aa"), 2) == 0.0 143 | assert func(t("ab"), t("aa"), 1) == func(t("ab"), t("aa"), 2) == 0.5 144 | assert func(t("ab"), t("a"), 1) == func(t("ab"), t("a"), 2) == 0.5 145 | assert func(t("ab"), t("abc"), 1) == func(t("ab"), t("abc"), 2) == 0.3333333333333333 146 | 147 | # multiple alignments 148 | assert func(t("abc"), t("adb"), 1) == 0.6666666666666666 149 | assert 
func(t("abc"), t("adb"), 2) == 0.5 150 | 151 | 152 | def lcsubstrings(func, t, **kwargs): 153 | 154 | # types; only for c 155 | if kwargs["lang"] == "C": 156 | try: 157 | func(1, t("foo")) 158 | except ValueError: 159 | pass 160 | try: 161 | func(t("foo"), 1) 162 | except ValueError: 163 | pass 164 | 165 | # empty strings 166 | try: 167 | assert func(t(""), t(""), False) == set() 168 | except TypeError: 169 | if t is not list: raise 170 | assert func(t(""), t(""), True) == (0, ()) 171 | try: 172 | assert func(t(""), t("foo"), False) == func(t("foo"), t(""), False) == set() 173 | except TypeError: 174 | if t is not list: raise 175 | assert func(t(""), t("foo"), True) == func(t("foo"), t(""), True) == (0, ()) 176 | 177 | # common 178 | try: 179 | assert func(t("abcd"), t("cdba"), False) == {t('cd')} 180 | except TypeError: 181 | if t is not list: raise 182 | assert func(t("abcd"), t("cdba"), True) == (2, ((2, 0),)) 183 | 184 | # reverse 185 | try: 186 | assert func(t("abcdef"), t("cdba"), False) == func(t("cdba"), t("abcdef"), False) 187 | except TypeError: 188 | if t is not list: raise 189 | assert func(t("abcdef"), t("cdba"), True) == func(t("cdba"), t("abcdef"), True) 190 | 191 | 192 | def itors_common(func, t, **kwargs): 193 | 194 | if kwargs["lang"] == "C": 195 | # types check; only need to do it for C impl to avoid an eventual segfaults. 
196 | try: func(1, t("foo")) 197 | except ValueError: pass 198 | 199 | itor = func(t("foo"), [t("foo"), 3333]) 200 | next(itor) 201 | try: next(itor) 202 | except ValueError: pass 203 | 204 | # values drop 205 | itor = func(t("aa"), [t("aa"), t("abcd"), t("ba")]) 206 | assert next(itor) == (0, t("aa")) 207 | assert next(itor) == (1, t("ba")) 208 | 209 | 210 | def ilevenshtein(func, t, **kwargs): 211 | itors_common(lambda a, b: func(a, b, max_dist=2), t, **kwargs) 212 | 213 | 214 | def ifast_comp(func, t, **kwargs): 215 | itors_common(func, t, **kwargs) 216 | #transpositions 217 | g = func(t("abc"), [t("bac")], transpositions=False) 218 | assert next(g) == (2, t('bac')) 219 | g = func(t("abc"), [t("bac")], transpositions=True) 220 | assert next(g) == (1, t("bac")) 221 | 222 | 223 | write = lambda s: sys.stderr.write(s + '\n') 224 | 225 | tests = ["hamming", "fast_comp", "levenshtein", "lcsubstrings", "nlevenshtein", "ilevenshtein", "ifast_comp"] 226 | 227 | 228 | def run_test(name): 229 | if cdistance: 230 | cfunc = getattr(cdistance, name) 231 | run_lang_test(name, cfunc, "C") 232 | write("") 233 | pyfunc = getattr(pydistance, name) 234 | run_lang_test(name, pyfunc, "py") 235 | if cdistance is None: 236 | write("skipped C tests") 237 | write("") 238 | 239 | 240 | def run_lang_test(name, func, lang): 241 | print("%s (%s)..." 
% (name, lang)) 242 | for tname, typ in all_types: 243 | write("type: %s" % tname) 244 | globals()[name](func, typ, lang=lang) 245 | 246 | if __name__ == "__main__": 247 | args = sys.argv[1:] 248 | if not args: 249 | for test in tests: 250 | run_test(test) 251 | sys.exit() 252 | for name in args: 253 | if name in tests: 254 | run_test(name) 255 | else: 256 | write("no such test: %s" % name) 257 | sys.exit(1) 258 | -------------------------------------------------------------------------------- /cdistance/includes.h: -------------------------------------------------------------------------------- 1 | #define hamming_doc \ 2 | "hamming(seq1, seq2, normalized=False)\n\ 3 | \n\ 4 | Compute the Hamming distance between the two sequences `seq1` and `seq2`.\n\ 5 | The Hamming distance is the number of differing items in two ordered\n\ 6 | sequences of the same length. If the sequences submitted do not have the\n\ 7 | same length, an error will be raised.\n\ 8 | \n\ 9 | If `normalized` evaluates to `False`, the return value will be an integer\n\ 10 | between 0 and the length of the sequences provided, edge values included;\n\ 11 | otherwise, it will be a float between 0 and 1 included, where 0 means\n\ 12 | equal, and 1 totally different. Normalized hamming distance is computed as:\n\ 13 | \n\ 14 | 0.0 if len(seq1) == 0\n\ 15 | hamming_dist / len(seq1) otherwise" 16 | 17 | 18 | #define jaccard_doc \ 19 | "jaccard(seq1, seq2)\n\ 20 | \n\ 21 | Compute the Jaccard distance between the two sequences `seq1` and `seq2`.\n\ 22 | They should contain hashable items.\n\ 23 | \n\ 24 | The return value is a float between 0 and 1, where 0 means equal, and 1 totally different." 25 | 26 | 27 | #define sorensen_doc \ 28 | "sorensen(seq1, seq2)\n\ 29 | \n\ 30 | Compute the Sorensen distance between the two sequences `seq1` and `seq2`.\n\ 31 | They should contain hashable items.\n\ 32 | \n\ 33 | The return value is a float between 0 and 1, where 0 means equal, and 1 totally different." 
34 | 35 | 36 | #define lcsubstrings_doc \ 37 | "lcsubstrings(seq1, seq2, positions=False)\n\ 38 | \n\ 39 | Find the longest common substring(s) in the sequences `seq1` and `seq2`.\n\ 40 | \n\ 41 | If positions evaluates to `True` only their positions will be returned,\n\ 42 | together with their length, in a tuple:\n\ 43 | \n\ 44 | (length, [(start pos in seq1, start pos in seq2)..])\n\ 45 | \n\ 46 | Otherwise, the substrings themselves will be returned, in a set.\n\ 47 | \n\ 48 | Example:\n\ 49 | \n\ 50 | >>> lcsubstrings(\"sedentar\", \"dentist\")\n\ 51 | {'dent'}\n\ 52 | >>> lcsubstrings(\"sedentar\", \"dentist\", positions=True)\n\ 53 | (4, [(2, 0)])" 54 | 55 | 56 | #define ilevenshtein_doc \ 57 | "ilevenshtein(seq1, seqs, max_dist=-1)\n\ 58 | \n\ 59 | Compute the Levenshtein distance between the sequence `seq1` and the series\n\ 60 | of sequences `seqs`.\n\ 61 | \n\ 62 | `seq1`: the reference sequence\n\ 63 | `seqs`: a series of sequences (can be a generator)\n\ 64 | `max_dist`: if provided and > 0, only the sequences which distance from\n\ 65 | the reference sequence is lower or equal to this value will be returned.\n\ 66 | \n\ 67 | The return value is a series of pairs (distance, sequence).\n\ 68 | \n\ 69 | The sequence objects in `seqs` are expected to be of the same kind than\n\ 70 | the reference sequence in the C implementation; the same holds true for\n\ 71 | `ifast_comp`." 72 | 73 | 74 | #define ifast_comp_doc \ 75 | "ifast_comp(seq1, seqs, transpositions=False)\n\ 76 | \n\ 77 | Return an iterator over all the sequences in `seqs` which distance from\n\ 78 | `seq1` is lower or equal to 2. 
The sequences which distance from the\n\ 79 | reference sequence is higher than that are dropped.\n\ 80 | \n\ 81 | `seq1`: the reference sequence.\n\ 82 | `seqs`: a series of sequences (can be a generator)\n\ 83 | `transpositions` has the same sense than in `fast_comp`.\n\ 84 | \n\ 85 | The return value is a series of pairs (distance, sequence).\n\ 86 | \n\ 87 | You might want to call `sorted()` on the iterator to get the results in a\n\ 88 | significant order:\n\ 89 | \n\ 90 | >>> g = ifast_comp(\"foo\", [\"fo\", \"bar\", \"foob\", \"foo\", \"foobaz\"])\n\ 91 | >>> sorted(g)\n\ 92 | [(0, 'foo'), (1, 'fo'), (1, 'foob')]" 93 | 94 | 95 | #define fast_comp_doc \ 96 | "fast_comp(seq1, seq2, transpositions=False)\n\ 97 | \n\ 98 | Compute the distance between the two sequences `seq1` and `seq2` up to a\n\ 99 | maximum of 2 included, and return it. If the edit distance between the two\n\ 100 | sequences is higher than that, -1 is returned.\n\ 101 | \n\ 102 | If `transpositions` is `True`, transpositions will be taken into account for\n\ 103 | the computation of the distance. This can make a difference, e.g.:\n\ 104 | \n\ 105 | >>> fast_comp(\"abc\", \"bac\", transpositions=False)\n\ 106 | 2\n\ 107 | >>> fast_comp(\"abc\", \"bac\", transpositions=True)\n\ 108 | 1\n\ 109 | \n\ 110 | This is faster than `levenshtein` by an order of magnitude, but on the\n\ 111 | other hand is of limited use.\n\ 112 | \n\ 113 | The algorithm comes from `http://writingarchives.sakura.ne.jp/fastcomp`.\n\ 114 | I've added transpositions support to the original code." 115 | 116 | 117 | #define levenshtein_doc \ 118 | "levenshtein(seq1, seq2, max_dist=-1, normalized=False)\n\ 119 | \n\ 120 | Compute the absolute Levenshtein distance between the two sequences\n\ 121 | `seq1` and `seq2`.\n\ 122 | \n\ 123 | The Levenshtein distance is the minimum number of edit operations necessary\n\ 124 | for transforming one sequence into the other. 
The edit operations allowed are:\n\ 125 | \n\ 126 | * deletion: ABC -> BC, AC, AB\n\ 127 | * insertion: ABC -> ABCD, EABC, AEBC..\n\ 128 | * substitution: ABC -> ABE, ADC, FBC..\n\ 129 | \n\ 130 | The `max_dist` parameter controls at which moment we should stop computing the\n\ 131 | distance between the provided sequences. If it is a negative integer, the\n\ 132 | distance will be computed until the sequences are exhausted; otherwise, the\n\ 133 | computation will stop at the moment the calculated distance is higher than\n\ 134 | `max_dist`, and then return -1. For example:\n\ 135 | \n\ 136 | >>> levenshtein(\"abc\", \"abcd\", max_dist=1) # dist = 1\n\ 137 | 1\n\ 138 | >>> levenshtein(\"abc\", \"abcde\", max_dist=1) # dist = 2\n\ 139 | -1\n\ 140 | \n\ 141 | This can be a time saver if you're not interested in the exact distance, but\n\ 142 | only need to check if the distance between the given sequences is below a\n\ 143 | given threshold.\n\ 144 | \n\ 145 | The `normalized` parameter is here for backward compatibility; providing\n\ 146 | it will result in a call to `nlevenshtein`, which should be used directly\n\ 147 | instead. " 148 | 149 | 150 | #define nlevenshtein_doc \ 151 | "nlevenshtein(seq1, seq2, method=1)\n\ 152 | \n\ 153 | Compute the normalized Levenshtein distance between `seq1` and `seq2`.\n\ 154 | \n\ 155 | Two normalization methods are provided. For both of them, the normalized\n\ 156 | distance will be a float between 0 and 1, where 0 means equal and 1\n\ 157 | completely different. 
The computation obeys the following patterns:\n\ 158 | \n\ 159 | 0.0 if seq1 == seq2\n\ 160 | 1.0 if len(seq1) == 0 or len(seq2) == 0\n\ 161 | edit distance / factor otherwise\n\ 162 | \n\ 163 | The `method` parameter specifies which normalization factor should be used.\n\ 164 | It can have the value 1 or 2, which correspond to the following:\n\ 165 | \n\ 166 | 1: the length of the shortest alignment between the sequences\n\ 167 | (that is, the length of the longest sequence)\n\ 168 | 2: the length of the longest alignment between the sequences\n\ 169 | \n\ 170 | Which normalization factor should be chosen is a matter of taste. The first\n\ 171 | one is cheap to compute. The second one is more costly, but it accounts\n\ 172 | better than the first one for parallelisms of symbols between the sequences.\n\ 173 | \n\ 174 | For the rationale behind the use of the second method, see:\n\ 175 | Heeringa, \"Measuring Dialect Pronunciation Differences using Levenshtein\n\ 176 | Distance\", 2004, p. 
130 sq, which is available online at:\n\ 177 | http://www.let.rug.nl/~heeringa/dialectology/thesis/thesis.pdf" 178 | 179 | 180 | 181 | 182 | 183 | #define SEQUENCE_COMPARE(s1, i1, s2, i2) \ 184 | (PyObject_RichCompareBool( \ 185 | PySequence_Fast_GET_ITEM((s1), (i1)), \ 186 | PySequence_Fast_GET_ITEM((s2), (i2)), \ 187 | Py_EQ) \ 188 | ) 189 | 190 | #define unicode unicode 191 | #define hamming uhamming 192 | #include "hamming.c" 193 | #undef unicode 194 | #undef hamming 195 | 196 | #define unicode byte 197 | #define hamming bhamming 198 | #include "hamming.c" 199 | #undef unicode 200 | #undef hamming 201 | 202 | #define SEQUENCE_COMP SEQUENCE_COMPARE 203 | #define unicode array 204 | #define hamming ahamming 205 | #include "hamming.c" 206 | #undef unicode 207 | #undef hamming 208 | #undef SEQUENCE_COMP 209 | 210 | #define unicode unicode 211 | #define levenshtein ulevenshtein 212 | #define nlevenshtein unlevenshtein 213 | #include "levenshtein.c" 214 | #undef unicode 215 | #undef levenshtein 216 | #undef nlevenshtein 217 | 218 | #define unicode byte 219 | #define levenshtein blevenshtein 220 | #define nlevenshtein bnlevenshtein 221 | #include "levenshtein.c" 222 | #undef unicode 223 | #undef levenshtein 224 | #undef nlevenshtein 225 | 226 | #define SEQUENCE_COMP SEQUENCE_COMPARE 227 | #define unicode array 228 | #define levenshtein alevenshtein 229 | #define nlevenshtein anlevenshtein 230 | #include "levenshtein.c" 231 | #undef unicode 232 | #undef levenshtein 233 | #undef nlevenshtein 234 | #undef SEQUENCE_COMP 235 | 236 | #define unicode unicode 237 | #define lcsubstrings ulcsubstrings 238 | #include "lcsubstrings.c" 239 | #undef unicode 240 | #undef lcsubstrings 241 | 242 | #define unicode byte 243 | #define lcsubstrings blcsubstrings 244 | #include "lcsubstrings.c" 245 | #undef unicode 246 | #undef lcsubstrings 247 | 248 | #define SEQUENCE_COMP SEQUENCE_COMPARE 249 | #define unicode array 250 | #define lcsubstrings alcsubstrings 251 | #include "lcsubstrings.c" 
252 | #undef unicode 253 | #undef lcsubstrings 254 | #undef SEQUENCE_COMP 255 | 256 | #define unicode unicode 257 | #define fastcomp ufastcomp 258 | #include "fastcomp.c" 259 | #undef unicode 260 | #undef fastcomp 261 | 262 | #define unicode byte 263 | #define fastcomp bfastcomp 264 | #include "fastcomp.c" 265 | #undef unicode 266 | #undef fastcomp 267 | 268 | #define SEQUENCE_COMP SEQUENCE_COMPARE 269 | #define unicode array 270 | #define fastcomp afastcomp 271 | #include "fastcomp.c" 272 | #undef unicode 273 | #undef fastcomp 274 | #undef SEQUENCE_COMP 275 | -------------------------------------------------------------------------------- /cdistance/utarray.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2008-2013, Troy D. Hanson http://troydhanson.github.com/uthash/ 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 12 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 13 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 14 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 15 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 16 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 17 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 18 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 19 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 20 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 21 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
22 | */ 23 | 24 | /* a dynamic array implementation using macros 25 | */ 26 | #ifndef UTARRAY_H 27 | #define UTARRAY_H 28 | 29 | #define UTARRAY_VERSION 1.9.8 30 | 31 | #ifdef __GNUC__ 32 | #define _UNUSED_ __attribute__ ((__unused__)) 33 | #else 34 | #define _UNUSED_ 35 | #endif 36 | 37 | #include /* size_t */ 38 | #include /* memset, etc */ 39 | #include /* exit */ 40 | 41 | #define oom() exit(-1) 42 | 43 | typedef void (ctor_f)(void *dst, const void *src); 44 | typedef void (dtor_f)(void *elt); 45 | typedef void (init_f)(void *elt); 46 | typedef struct { 47 | size_t sz; 48 | init_f *init; 49 | ctor_f *copy; 50 | dtor_f *dtor; 51 | } UT_icd; 52 | 53 | typedef struct { 54 | unsigned i,n;/* i: index of next available slot, n: num slots */ 55 | UT_icd icd; /* initializer, copy and destructor functions */ 56 | char *d; /* n slots of size icd->sz*/ 57 | } UT_array; 58 | 59 | #define utarray_init(a,_icd) do { \ 60 | memset(a,0,sizeof(UT_array)); \ 61 | (a)->icd=*_icd; \ 62 | } while(0) 63 | 64 | #define utarray_done(a) do { \ 65 | if ((a)->n) { \ 66 | if ((a)->icd.dtor) { \ 67 | size_t _ut_i; \ 68 | for(_ut_i=0; _ut_i < (a)->i; _ut_i++) { \ 69 | (a)->icd.dtor(utarray_eltptr(a,_ut_i)); \ 70 | } \ 71 | } \ 72 | free((a)->d); \ 73 | } \ 74 | (a)->n=0; \ 75 | } while(0) 76 | 77 | #define utarray_new(a,_icd) do { \ 78 | a=(UT_array*)malloc(sizeof(UT_array)); \ 79 | utarray_init(a,_icd); \ 80 | } while(0) 81 | 82 | #define utarray_free(a) do { \ 83 | utarray_done(a); \ 84 | free(a); \ 85 | } while(0) 86 | 87 | #define utarray_reserve(a,by) do { \ 88 | if (((a)->i+by) > ((a)->n)) { \ 89 | while(((a)->i+by) > ((a)->n)) { (a)->n = ((a)->n ? 
(2*(a)->n) : 8); } \ 90 | if ( ((a)->d=(char*)realloc((a)->d, (a)->n*(a)->icd.sz)) == NULL) oom(); \ 91 | } \ 92 | } while(0) 93 | 94 | #define utarray_push_back(a,p) do { \ 95 | utarray_reserve(a,1); \ 96 | if ((a)->icd.copy) { (a)->icd.copy( _utarray_eltptr(a,(a)->i++), p); } \ 97 | else { memcpy(_utarray_eltptr(a,(a)->i++), p, (a)->icd.sz); }; \ 98 | } while(0) 99 | 100 | #define utarray_pop_back(a) do { \ 101 | if ((a)->icd.dtor) { (a)->icd.dtor( _utarray_eltptr(a,--((a)->i))); } \ 102 | else { (a)->i--; } \ 103 | } while(0) 104 | 105 | #define utarray_extend_back(a) do { \ 106 | utarray_reserve(a,1); \ 107 | if ((a)->icd.init) { (a)->icd.init(_utarray_eltptr(a,(a)->i)); } \ 108 | else { memset(_utarray_eltptr(a,(a)->i),0,(a)->icd.sz); } \ 109 | (a)->i++; \ 110 | } while(0) 111 | 112 | #define utarray_len(a) ((a)->i) 113 | 114 | #define utarray_eltptr(a,j) (((j) < (a)->i) ? _utarray_eltptr(a,j) : NULL) 115 | #define _utarray_eltptr(a,j) ((char*)((a)->d + ((a)->icd.sz*(j) ))) 116 | 117 | #define utarray_insert(a,p,j) do { \ 118 | if (j > (a)->i) utarray_resize(a,j); \ 119 | utarray_reserve(a,1); \ 120 | if ((j) < (a)->i) { \ 121 | memmove( _utarray_eltptr(a,(j)+1), _utarray_eltptr(a,j), \ 122 | ((a)->i - (j))*((a)->icd.sz)); \ 123 | } \ 124 | if ((a)->icd.copy) { (a)->icd.copy( _utarray_eltptr(a,j), p); } \ 125 | else { memcpy(_utarray_eltptr(a,j), p, (a)->icd.sz); }; \ 126 | (a)->i++; \ 127 | } while(0) 128 | 129 | #define utarray_inserta(a,w,j) do { \ 130 | if (utarray_len(w) == 0) break; \ 131 | if (j > (a)->i) utarray_resize(a,j); \ 132 | utarray_reserve(a,utarray_len(w)); \ 133 | if ((j) < (a)->i) { \ 134 | memmove(_utarray_eltptr(a,(j)+utarray_len(w)), \ 135 | _utarray_eltptr(a,j), \ 136 | ((a)->i - (j))*((a)->icd.sz)); \ 137 | } \ 138 | if ((a)->icd.copy) { \ 139 | size_t _ut_i; \ 140 | for(_ut_i=0;_ut_i<(w)->i;_ut_i++) { \ 141 | (a)->icd.copy(_utarray_eltptr(a,j+_ut_i), _utarray_eltptr(w,_ut_i)); \ 142 | } \ 143 | } else { \ 144 | 
memcpy(_utarray_eltptr(a,j), _utarray_eltptr(w,0), \ 145 | utarray_len(w)*((a)->icd.sz)); \ 146 | } \ 147 | (a)->i += utarray_len(w); \ 148 | } while(0) 149 | 150 | #define utarray_resize(dst,num) do { \ 151 | size_t _ut_i; \ 152 | if (dst->i > (size_t)(num)) { \ 153 | if ((dst)->icd.dtor) { \ 154 | for(_ut_i=num; _ut_i < dst->i; _ut_i++) { \ 155 | (dst)->icd.dtor(utarray_eltptr(dst,_ut_i)); \ 156 | } \ 157 | } \ 158 | } else if (dst->i < (size_t)(num)) { \ 159 | utarray_reserve(dst,num-dst->i); \ 160 | if ((dst)->icd.init) { \ 161 | for(_ut_i=dst->i; _ut_i < num; _ut_i++) { \ 162 | (dst)->icd.init(utarray_eltptr(dst,_ut_i)); \ 163 | } \ 164 | } else { \ 165 | memset(_utarray_eltptr(dst,dst->i),0,(dst)->icd.sz*(num-dst->i)); \ 166 | } \ 167 | } \ 168 | dst->i = num; \ 169 | } while(0) 170 | 171 | #define utarray_concat(dst,src) do { \ 172 | utarray_inserta((dst),(src),utarray_len(dst)); \ 173 | } while(0) 174 | 175 | #define utarray_erase(a,pos,len) do { \ 176 | if ((a)->icd.dtor) { \ 177 | size_t _ut_i; \ 178 | for(_ut_i=0; _ut_i < len; _ut_i++) { \ 179 | (a)->icd.dtor(utarray_eltptr((a),pos+_ut_i)); \ 180 | } \ 181 | } \ 182 | if ((a)->i > (pos+len)) { \ 183 | memmove( _utarray_eltptr((a),pos), _utarray_eltptr((a),pos+len), \ 184 | (((a)->i)-(pos+len))*((a)->icd.sz)); \ 185 | } \ 186 | (a)->i -= (len); \ 187 | } while(0) 188 | 189 | #define utarray_renew(a,u) do { \ 190 | if (a) utarray_clear(a); \ 191 | else utarray_new((a),(u)); \ 192 | } while(0) 193 | 194 | #define utarray_clear(a) do { \ 195 | if ((a)->i > 0) { \ 196 | if ((a)->icd.dtor) { \ 197 | size_t _ut_i; \ 198 | for(_ut_i=0; _ut_i < (a)->i; _ut_i++) { \ 199 | (a)->icd.dtor(utarray_eltptr(a,_ut_i)); \ 200 | } \ 201 | } \ 202 | (a)->i = 0; \ 203 | } \ 204 | } while(0) 205 | 206 | #define utarray_sort(a,cmp) do { \ 207 | qsort((a)->d, (a)->i, (a)->icd.sz, cmp); \ 208 | } while(0) 209 | 210 | #define utarray_find(a,v,cmp) bsearch((v),(a)->d,(a)->i,(a)->icd.sz,cmp) 211 | 212 | #define utarray_front(a) 
(((a)->i) ? (_utarray_eltptr(a,0)) : NULL) 213 | #define utarray_next(a,e) (((e)==NULL) ? utarray_front(a) : ((((a)->i) > (utarray_eltidx(a,e)+1)) ? _utarray_eltptr(a,utarray_eltidx(a,e)+1) : NULL)) 214 | #define utarray_prev(a,e) (((e)==NULL) ? utarray_back(a) : ((utarray_eltidx(a,e) > 0) ? _utarray_eltptr(a,utarray_eltidx(a,e)-1) : NULL)) 215 | #define utarray_back(a) (((a)->i) ? (_utarray_eltptr(a,(a)->i-1)) : NULL) 216 | #define utarray_eltidx(a,e) (((char*)(e) >= (char*)((a)->d)) ? (((char*)(e) - (char*)((a)->d))/(ssize_t)(a)->icd.sz) : -1) 217 | 218 | /* last we pre-define a few icd for common utarrays of ints and strings */ 219 | static void utarray_str_cpy(void *dst, const void *src) { 220 | char **_src = (char**)src, **_dst = (char**)dst; 221 | *_dst = (*_src == NULL) ? NULL : strdup(*_src); 222 | } 223 | static void utarray_str_dtor(void *elt) { 224 | char **eltc = (char**)elt; 225 | if (*eltc) free(*eltc); 226 | } 227 | static const UT_icd ut_str_icd _UNUSED_ = {sizeof(char*),NULL,utarray_str_cpy,utarray_str_dtor}; 228 | static const UT_icd ut_int_icd _UNUSED_ = {sizeof(int),NULL,NULL,NULL}; 229 | static const UT_icd ut_ptr_icd _UNUSED_ = {sizeof(void*),NULL,NULL,NULL}; 230 | 231 | 232 | #endif /* UTARRAY_H */ 233 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | distance license 2 | ================ 3 | 4 | Copyright (C) 2013 Michaël Meyer 5 | 6 | GNU GENERAL PUBLIC LICENSE 7 | Version 2, June 1991 8 | 9 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 10 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 11 | Everyone is permitted to copy and distribute verbatim copies 12 | of this license document, but changing it is not allowed. 13 | 14 | Preamble 15 | 16 | The licenses for most software are designed to take away your 17 | freedom to share and change it. 
By contrast, the GNU General Public 18 | License is intended to guarantee your freedom to share and change free 19 | software--to make sure the software is free for all its users. This 20 | General Public License applies to most of the Free Software 21 | Foundation's software and to any other program whose authors commit to 22 | using it. (Some other Free Software Foundation software is covered by 23 | the GNU Lesser General Public License instead.) You can apply it to 24 | your programs, too. 25 | 26 | When we speak of free software, we are referring to freedom, not 27 | price. Our General Public Licenses are designed to make sure that you 28 | have the freedom to distribute copies of free software (and charge for 29 | this service if you wish), that you receive source code or can get it 30 | if you want it, that you can change the software or use pieces of it 31 | in new free programs; and that you know you can do these things. 32 | 33 | To protect your rights, we need to make restrictions that forbid 34 | anyone to deny you these rights or to ask you to surrender the rights. 35 | These restrictions translate to certain responsibilities for you if you 36 | distribute copies of the software, or if you modify it. 37 | 38 | For example, if you distribute copies of such a program, whether 39 | gratis or for a fee, you must give the recipients all the rights that 40 | you have. You must make sure that they, too, receive or can get the 41 | source code. And you must show them these terms so they know their 42 | rights. 43 | 44 | We protect your rights with two steps: (1) copyright the software, and 45 | (2) offer you this license which gives you legal permission to copy, 46 | distribute and/or modify the software. 47 | 48 | Also, for each author's protection and ours, we want to make certain 49 | that everyone understands that there is no warranty for this free 50 | software. 
If the software is modified by someone else and passed on, we 51 | want its recipients to know that what they have is not the original, so 52 | that any problems introduced by others will not reflect on the original 53 | authors' reputations. 54 | 55 | Finally, any free program is threatened constantly by software 56 | patents. We wish to avoid the danger that redistributors of a free 57 | program will individually obtain patent licenses, in effect making the 58 | program proprietary. To prevent this, we have made it clear that any 59 | patent must be licensed for everyone's free use or not licensed at all. 60 | 61 | The precise terms and conditions for copying, distribution and 62 | modification follow. 63 | 64 | GNU GENERAL PUBLIC LICENSE 65 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 66 | 67 | 0. This License applies to any program or other work which contains 68 | a notice placed by the copyright holder saying it may be distributed 69 | under the terms of this General Public License. The "Program", below, 70 | refers to any such program or work, and a "work based on the Program" 71 | means either the Program or any derivative work under copyright law: 72 | that is to say, a work containing the Program or a portion of it, 73 | either verbatim or with modifications and/or translated into another 74 | language. (Hereinafter, translation is included without limitation in 75 | the term "modification".) Each licensee is addressed as "you". 76 | 77 | Activities other than copying, distribution and modification are not 78 | covered by this License; they are outside its scope. The act of 79 | running the Program is not restricted, and the output from the Program 80 | is covered only if its contents constitute a work based on the 81 | Program (independent of having been made by running the Program). 82 | Whether that is true depends on what the Program does. 83 | 84 | 1. 
You may copy and distribute verbatim copies of the Program's 85 | source code as you receive it, in any medium, provided that you 86 | conspicuously and appropriately publish on each copy an appropriate 87 | copyright notice and disclaimer of warranty; keep intact all the 88 | notices that refer to this License and to the absence of any warranty; 89 | and give any other recipients of the Program a copy of this License 90 | along with the Program. 91 | 92 | You may charge a fee for the physical act of transferring a copy, and 93 | you may at your option offer warranty protection in exchange for a fee. 94 | 95 | 2. You may modify your copy or copies of the Program or any portion 96 | of it, thus forming a work based on the Program, and copy and 97 | distribute such modifications or work under the terms of Section 1 98 | above, provided that you also meet all of these conditions: 99 | 100 | a) You must cause the modified files to carry prominent notices 101 | stating that you changed the files and the date of any change. 102 | 103 | b) You must cause any work that you distribute or publish, that in 104 | whole or in part contains or is derived from the Program or any 105 | part thereof, to be licensed as a whole at no charge to all third 106 | parties under the terms of this License. 107 | 108 | c) If the modified program normally reads commands interactively 109 | when run, you must cause it, when started running for such 110 | interactive use in the most ordinary way, to print or display an 111 | announcement including an appropriate copyright notice and a 112 | notice that there is no warranty (or else, saying that you provide 113 | a warranty) and that users may redistribute the program under 114 | these conditions, and telling the user how to view a copy of this 115 | License. (Exception: if the Program itself is interactive but 116 | does not normally print such an announcement, your work based on 117 | the Program is not required to print an announcement.) 
118 | 119 | These requirements apply to the modified work as a whole. If 120 | identifiable sections of that work are not derived from the Program, 121 | and can be reasonably considered independent and separate works in 122 | themselves, then this License, and its terms, do not apply to those 123 | sections when you distribute them as separate works. But when you 124 | distribute the same sections as part of a whole which is a work based 125 | on the Program, the distribution of the whole must be on the terms of 126 | this License, whose permissions for other licensees extend to the 127 | entire whole, and thus to each and every part regardless of who wrote it. 128 | 129 | Thus, it is not the intent of this section to claim rights or contest 130 | your rights to work written entirely by you; rather, the intent is to 131 | exercise the right to control the distribution of derivative or 132 | collective works based on the Program. 133 | 134 | In addition, mere aggregation of another work not based on the Program 135 | with the Program (or with a work based on the Program) on a volume of 136 | a storage or distribution medium does not bring the other work under 137 | the scope of this License. 138 | 139 | 3. 
You may copy and distribute the Program (or a work based on it, 140 | under Section 2) in object code or executable form under the terms of 141 | Sections 1 and 2 above provided that you also do one of the following: 142 | 143 | a) Accompany it with the complete corresponding machine-readable 144 | source code, which must be distributed under the terms of Sections 145 | 1 and 2 above on a medium customarily used for software interchange; or, 146 | 147 | b) Accompany it with a written offer, valid for at least three 148 | years, to give any third party, for a charge no more than your 149 | cost of physically performing source distribution, a complete 150 | machine-readable copy of the corresponding source code, to be 151 | distributed under the terms of Sections 1 and 2 above on a medium 152 | customarily used for software interchange; or, 153 | 154 | c) Accompany it with the information you received as to the offer 155 | to distribute corresponding source code. (This alternative is 156 | allowed only for noncommercial distribution and only if you 157 | received the program in object code or executable form with such 158 | an offer, in accord with Subsection b above.) 159 | 160 | The source code for a work means the preferred form of the work for 161 | making modifications to it. For an executable work, complete source 162 | code means all the source code for all modules it contains, plus any 163 | associated interface definition files, plus the scripts used to 164 | control compilation and installation of the executable. However, as a 165 | special exception, the source code distributed need not include 166 | anything that is normally distributed (in either source or binary 167 | form) with the major components (compiler, kernel, and so on) of the 168 | operating system on which the executable runs, unless that component 169 | itself accompanies the executable. 
170 | 171 | If distribution of executable or object code is made by offering 172 | access to copy from a designated place, then offering equivalent 173 | access to copy the source code from the same place counts as 174 | distribution of the source code, even though third parties are not 175 | compelled to copy the source along with the object code. 176 | 177 | 4. You may not copy, modify, sublicense, or distribute the Program 178 | except as expressly provided under this License. Any attempt 179 | otherwise to copy, modify, sublicense or distribute the Program is 180 | void, and will automatically terminate your rights under this License. 181 | However, parties who have received copies, or rights, from you under 182 | this License will not have their licenses terminated so long as such 183 | parties remain in full compliance. 184 | 185 | 5. You are not required to accept this License, since you have not 186 | signed it. However, nothing else grants you permission to modify or 187 | distribute the Program or its derivative works. These actions are 188 | prohibited by law if you do not accept this License. Therefore, by 189 | modifying or distributing the Program (or any work based on the 190 | Program), you indicate your acceptance of this License to do so, and 191 | all its terms and conditions for copying, distributing or modifying 192 | the Program or works based on it. 193 | 194 | 6. Each time you redistribute the Program (or any work based on the 195 | Program), the recipient automatically receives a license from the 196 | original licensor to copy, distribute or modify the Program subject to 197 | these terms and conditions. You may not impose any further 198 | restrictions on the recipients' exercise of the rights granted herein. 199 | You are not responsible for enforcing compliance by third parties to 200 | this License. 201 | 202 | 7. 
If, as a consequence of a court judgment or allegation of patent 203 | infringement or for any other reason (not limited to patent issues), 204 | conditions are imposed on you (whether by court order, agreement or 205 | otherwise) that contradict the conditions of this License, they do not 206 | excuse you from the conditions of this License. If you cannot 207 | distribute so as to satisfy simultaneously your obligations under this 208 | License and any other pertinent obligations, then as a consequence you 209 | may not distribute the Program at all. For example, if a patent 210 | license would not permit royalty-free redistribution of the Program by 211 | all those who receive copies directly or indirectly through you, then 212 | the only way you could satisfy both it and this License would be to 213 | refrain entirely from distribution of the Program. 214 | 215 | If any portion of this section is held invalid or unenforceable under 216 | any particular circumstance, the balance of the section is intended to 217 | apply and the section as a whole is intended to apply in other 218 | circumstances. 219 | 220 | It is not the purpose of this section to induce you to infringe any 221 | patents or other property right claims or to contest validity of any 222 | such claims; this section has the sole purpose of protecting the 223 | integrity of the free software distribution system, which is 224 | implemented by public license practices. Many people have made 225 | generous contributions to the wide range of software distributed 226 | through that system in reliance on consistent application of that 227 | system; it is up to the author/donor to decide if he or she is willing 228 | to distribute software through any other system and a licensee cannot 229 | impose that choice. 230 | 231 | This section is intended to make thoroughly clear what is believed to 232 | be a consequence of the rest of this License. 233 | 234 | 8. 
If the distribution and/or use of the Program is restricted in 235 | certain countries either by patents or by copyrighted interfaces, the 236 | original copyright holder who places the Program under this License 237 | may add an explicit geographical distribution limitation excluding 238 | those countries, so that distribution is permitted only in or among 239 | countries not thus excluded. In such case, this License incorporates 240 | the limitation as if written in the body of this License. 241 | 242 | 9. The Free Software Foundation may publish revised and/or new versions 243 | of the General Public License from time to time. Such new versions will 244 | be similar in spirit to the present version, but may differ in detail to 245 | address new problems or concerns. 246 | 247 | Each version is given a distinguishing version number. If the Program 248 | specifies a version number of this License which applies to it and "any 249 | later version", you have the option of following the terms and conditions 250 | either of that version or of any later version published by the Free 251 | Software Foundation. If the Program does not specify a version number of 252 | this License, you may choose any version ever published by the Free Software 253 | Foundation. 254 | 255 | 10. If you wish to incorporate parts of the Program into other free 256 | programs whose distribution conditions are different, write to the author 257 | to ask for permission. For software which is copyrighted by the Free 258 | Software Foundation, write to the Free Software Foundation; we sometimes 259 | make exceptions for this. Our decision will be guided by the two goals 260 | of preserving the free status of all derivatives of our free software and 261 | of promoting the sharing and reuse of software generally. 262 | 263 | NO WARRANTY 264 | 265 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 266 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN 267 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 268 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 269 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 270 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 271 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 272 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 273 | REPAIR OR CORRECTION. 274 | 275 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 276 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 277 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 278 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 279 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 280 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 281 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 282 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 283 | POSSIBILITY OF SUCH DAMAGES. 284 | 285 | END OF TERMS AND CONDITIONS 286 | 287 | How to Apply These Terms to Your New Programs 288 | 289 | If you develop a new program, and you want it to be of the greatest 290 | possible use to the public, the best way to achieve this is to make it 291 | free software which everyone can redistribute and change under these terms. 292 | 293 | To do so, attach the following notices to the program. It is safest 294 | to attach them to the start of each source file to most effectively 295 | convey the exclusion of warranty; and each file should have at least 296 | the "copyright" line and a pointer to where the full notice is found. 
297 | 298 | {description} 299 | Copyright (C) {year} {fullname} 300 | 301 | This program is free software; you can redistribute it and/or modify 302 | it under the terms of the GNU General Public License as published by 303 | the Free Software Foundation; either version 2 of the License, or 304 | (at your option) any later version. 305 | 306 | This program is distributed in the hope that it will be useful, 307 | but WITHOUT ANY WARRANTY; without even the implied warranty of 308 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 309 | GNU General Public License for more details. 310 | 311 | You should have received a copy of the GNU General Public License along 312 | with this program; if not, write to the Free Software Foundation, Inc., 313 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 314 | 315 | Also add information on how to contact you by electronic and paper mail. 316 | 317 | If the program is interactive, make it output a short notice like this 318 | when it starts in an interactive mode: 319 | 320 | Gnomovision version 69, Copyright (C) year name of author 321 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 322 | This is free software, and you are welcome to redistribute it 323 | under certain conditions; type `show c' for details. 324 | 325 | The hypothetical commands `show w' and `show c' should show the appropriate 326 | parts of the General Public License. Of course, the commands you use may 327 | be called something other than `show w' and `show c'; they could even be 328 | mouse-clicks or menu items--whatever suits your program. 329 | 330 | You should also get your employer (if you work as a programmer) or your 331 | school, if any, to sign a "copyright disclaimer" for the program, if 332 | necessary. Here is a sample; alter the names: 333 | 334 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 335 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 
336 | 337 | {signature of Ty Coon}, 1 April 1989 338 | Ty Coon, President of Vice 339 | 340 | This General Public License does not permit incorporating your program into 341 | proprietary programs. If your program is a subroutine library, you may 342 | consider it more useful to permit linking proprietary applications with the 343 | library. If this is what you want to do, use the GNU Lesser General 344 | Public License instead of this License. 345 | 346 | 347 | fastcomp license 348 | ================ 349 | 350 | MIT LICENSE 351 | 352 | Copyright (c) 2012 Fujimoto 353 | 354 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 355 | 356 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 357 | 358 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
359 | -------------------------------------------------------------------------------- /cdistance/distance.c: -------------------------------------------------------------------------------- 1 | #include "distance.h" 2 | #include "includes.h" 3 | 4 | 5 | static unicode * 6 | get_unicode(PyObject *obj, Py_ssize_t *len) 7 | { 8 | unicode *u; 9 | 10 | if ((u = PyUnicode_AS_UNICODE(obj)) == NULL) { 11 | PyErr_Format(PyExc_RuntimeError, "failed to get unicode representation of object"); 12 | return NULL; 13 | } 14 | *len = PyUnicode_GET_LENGTH(obj); 15 | 16 | return u; 17 | } 18 | 19 | 20 | static byte * 21 | get_byte(PyObject *obj, Py_ssize_t *len) 22 | { 23 | byte *b; 24 | 25 | b = PyBytes_AS_STRING(obj); 26 | *len = PyBytes_GET_SIZE(obj); 27 | 28 | return b; 29 | } 30 | 31 | 32 | static array * 33 | get_array(PyObject *obj, Py_ssize_t *len) 34 | { 35 | array *a; 36 | 37 | if ((a = PySequence_Fast(obj, "we got a problem")) == NULL) 38 | return NULL; 39 | *len = PySequence_Fast_GET_SIZE(a); 40 | 41 | return a; 42 | } 43 | 44 | 45 | static char 46 | get_sequence(PyObject *obj, sequence *seq, Py_ssize_t *len, char type) 47 | { 48 | char t = '\0'; 49 | 50 | if (PyUnicode_Check(obj)) { 51 | t = 'u'; 52 | if ((seq->u = get_unicode(obj, len)) == NULL) 53 | return '\0'; 54 | } else if (PyBytes_Check(obj)) { 55 | t = 'b'; 56 | if ((seq->b = get_byte(obj, len)) == NULL) 57 | return '\0'; 58 | } else if (PySequence_Check(obj)) { 59 | t = 'a'; 60 | if ((seq->a = get_array(obj, len)) == NULL) 61 | return '\0'; 62 | } 63 | 64 | if (!t) { 65 | PyErr_SetString(PyExc_ValueError, "expected a sequence object as first argument"); 66 | return '\0'; 67 | } 68 | if (type && t != type) { 69 | PyErr_SetString(PyExc_ValueError, "type mismatch between the " 70 | "value provided as left argument and one of the elements in " 71 | "the right one, can't process the later"); 72 | if (t == 'a') 73 | Py_DECREF(seq->a); 74 | return '\0'; 75 | } 76 | return t; 77 | } 78 | 79 | 80 | static char 81 | 
get_sequences(PyObject *arg1, PyObject *arg2, sequence *seq1, sequence *seq2, 82 | Py_ssize_t *len1, Py_ssize_t *len2) 83 | { 84 | if (PyUnicode_Check(arg1) && PyUnicode_Check(arg2)) { 85 | 86 | if ((seq1->u = get_unicode(arg1, len1)) == NULL) 87 | return '\0'; 88 | if ((seq2->u = get_unicode(arg2, len2)) == NULL) 89 | return '\0'; 90 | return 'u'; 91 | 92 | } else if (PyBytes_Check(arg1) && PyBytes_Check(arg2)) { 93 | 94 | if ((seq1->b = get_byte(arg1, len1)) == NULL) 95 | return '\0'; 96 | if ((seq2->b = get_byte(arg2, len2)) == NULL) 97 | return '\0'; 98 | return 'b'; 99 | 100 | } else if (PySequence_Check(arg1) && PySequence_Check(arg2)) { 101 | 102 | if ((seq1->a = get_array(arg1, len1)) == NULL) 103 | return '\0'; 104 | if ((seq2->a = get_array(arg2, len2)) == NULL) { 105 | Py_DECREF(seq1->a); /* warning ! */ 106 | return '\0'; 107 | } 108 | return 'a'; 109 | } 110 | 111 | PyErr_SetString(PyExc_ValueError, "expected two sequence objects"); 112 | return '\0'; 113 | } 114 | 115 | 116 | static PyObject * 117 | hamming_py(PyObject *self, PyObject *args, PyObject *kwargs) 118 | { 119 | PyObject *arg1, *arg2, *odo_normalize = NULL; 120 | int do_normalize = 0; 121 | static char *keywords[] = {"seq1", "seq2", "normalized", NULL}; 122 | 123 | char type; 124 | sequence seq1, seq2; 125 | Py_ssize_t len1, len2; 126 | Py_ssize_t dist; 127 | 128 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, 129 | "OO|O:hamming", keywords, &arg1, &arg2, &odo_normalize)) 130 | return NULL; 131 | 132 | if (odo_normalize && (do_normalize = PyObject_IsTrue(odo_normalize)) == -1) 133 | return NULL; 134 | 135 | if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0') 136 | return NULL; 137 | 138 | if (len1 != len2) { 139 | PyErr_SetString(PyExc_ValueError, "expected two objects of the same length"); 140 | if (type == 'a') { 141 | Py_DECREF(seq1.a); 142 | Py_DECREF(seq2.a); 143 | } 144 | return NULL; 145 | } 146 | 147 | switch(type) { 148 | case 'u': 149 | dist = 
uhamming(seq1.u, seq2.u, len1); 150 | break; 151 | case 'b': 152 | dist = bhamming(seq1.b, seq2.b, len1); 153 | break; 154 | default: 155 | dist = ahamming(seq1.a, seq2.a, len1); 156 | Py_DECREF(seq1.a); 157 | Py_DECREF(seq2.a); 158 | } 159 | 160 | if (dist == -1) // comparison failed 161 | return NULL; 162 | 163 | if (do_normalize) { 164 | if (len1 == 0) 165 | return Py_BuildValue("f", 0.0f); 166 | return Py_BuildValue("d", dist / (double)len1); 167 | } 168 | 169 | return Py_BuildValue("n", dist); 170 | } 171 | 172 | 173 | static PyObject * 174 | lcsubstrings_py_make_set(PyObject *arg1, PyObject *arg2, UT_array *stack, Py_ssize_t mlen) 175 | { 176 | PyObject *set, *ss; 177 | struct pair_t *pair; 178 | 179 | if ((set = PySet_New(NULL)) == NULL) { 180 | utarray_free(stack); 181 | return NULL; 182 | } 183 | 184 | for (pair = (struct pair_t*)utarray_front(stack); 185 | pair != NULL; 186 | pair = (struct pair_t*)utarray_next(stack, pair)) { 187 | 188 | ss = PySequence_GetSlice(arg2, pair->j - mlen + 1, pair->j + 1); 189 | if (ss == NULL) 190 | goto On_Error; 191 | if ((PySet_Add(set, ss)) == -1) 192 | goto On_Error; 193 | } 194 | 195 | utarray_free(stack); 196 | return set; 197 | 198 | On_Error: 199 | PySet_Clear(set); 200 | Py_DECREF(set); 201 | utarray_free(stack); 202 | return NULL; 203 | } 204 | 205 | 206 | static PyObject * 207 | lcsubstrings_py_make_tuple(PyObject *arg1, PyObject *arg2, UT_array *stack, Py_ssize_t mlen) 208 | { 209 | PyObject *tp, *stp; 210 | Py_ssize_t i; 211 | struct pair_t *pair; 212 | 213 | if ((stp = PyTuple_New(utarray_len(stack))) == NULL) { 214 | utarray_free(stack); 215 | return NULL; 216 | } 217 | for (i = 0, pair = (struct pair_t*)utarray_front(stack); 218 | pair != NULL; 219 | ++i, pair = (struct pair_t*)utarray_next(stack, pair)) { 220 | PyTuple_SET_ITEM(stp, i, Py_BuildValue("(nn)", pair->i - mlen + 1, pair->j - mlen + 1)); 221 | } 222 | if ((tp = PyTuple_New(2)) == NULL) { 223 | utarray_free(stack); 224 | Py_DECREF(stp); 225 | 
return NULL; 226 | } 227 | PyTuple_SET_ITEM(tp, 0, Py_BuildValue("n", mlen)); 228 | PyTuple_SET_ITEM(tp, 1, stp); 229 | 230 | utarray_free(stack); 231 | 232 | return tp; 233 | } 234 | 235 | 236 | static PyObject * 237 | lcsubstrings_py(PyObject *self, PyObject *args, PyObject *kwargs) 238 | { 239 | PyObject *arg1, *arg2, *opos = NULL; 240 | int positions = 0; 241 | static char *keywords[] = {"seq1", "seq2", "positions", NULL}; 242 | 243 | char type; 244 | sequence seq1, seq2; 245 | Py_ssize_t len1, len2; 246 | UT_array *stack; 247 | Py_ssize_t mlen = -1; 248 | 249 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, 250 | "OO|O:lcsubstrings", keywords, &arg1, &arg2, &opos)) 251 | return NULL; 252 | if (opos && (positions = PyObject_IsTrue(opos)) == -1) 253 | return NULL; 254 | 255 | if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0') 256 | return NULL; 257 | 258 | // special case 259 | if (type == 'a' && (!positions) && (PyList_Check(arg1) || PyList_Check(arg2))) { 260 | Py_DECREF(seq1.a); 261 | Py_DECREF(seq2.a); 262 | PyErr_SetString(PyExc_TypeError, "can't hash lists, pass in tuples instead"); 263 | return NULL; 264 | } 265 | 266 | if (len1 < len2) { 267 | SWAP(PyObject *, arg1, arg2); 268 | SWAP(sequence, seq1, seq2); 269 | SWAP(Py_ssize_t, len1, len2); 270 | } 271 | 272 | switch(type) { 273 | case 'u': 274 | stack = ulcsubstrings(seq1.u, seq2.u, len1, len2, &mlen); 275 | break; 276 | case 'b': 277 | stack = blcsubstrings(seq1.b, seq2.b, len1, len2, &mlen); 278 | break; 279 | default: 280 | stack = alcsubstrings(seq1.a, seq2.a, len1, len2, &mlen); 281 | Py_DECREF(seq1.a); 282 | Py_DECREF(seq2.a); 283 | } 284 | 285 | if (stack == NULL) { 286 | /* memory allocation failed */ 287 | return PyErr_NoMemory(); 288 | } 289 | 290 | if (positions) 291 | return lcsubstrings_py_make_tuple(arg1, arg2, stack, mlen); 292 | return lcsubstrings_py_make_set(arg1, arg2, stack, mlen); 293 | } 294 | 295 | 296 | static PyObject * 297 | nlevenshtein_py(PyObject 
*self, PyObject *args, PyObject *kwargs) 298 | { 299 | PyObject *arg1, *arg2; 300 | short method = 1; 301 | static char *keywords[] = {"seq1", "seq2", "method", NULL}; 302 | 303 | char type; 304 | sequence seq1, seq2; 305 | Py_ssize_t len1, len2; 306 | double dist; 307 | 308 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, 309 | "OO|h:nlevenshtein", keywords, &arg1, &arg2, &method)) 310 | return NULL; 311 | 312 | if (method != 1 && method != 2) { 313 | PyErr_SetString(PyExc_ValueError, "expected either 1 or 2 for `method` parameter"); 314 | return NULL; 315 | } 316 | 317 | if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0') 318 | return NULL; 319 | 320 | if (len1 < len2) { 321 | SWAP(sequence, seq1, seq2); 322 | SWAP(Py_ssize_t, len1, len2); 323 | } 324 | 325 | switch(type) { 326 | case 'u': 327 | dist = unlevenshtein(seq1.u, seq2.u, len1, len2, method); 328 | break; 329 | case 'b': 330 | dist = bnlevenshtein(seq1.b, seq2.b, len1, len2, method); 331 | break; 332 | default: 333 | dist = anlevenshtein(seq1.a, seq2.a, len1, len2, method); 334 | Py_DECREF(seq1.a); 335 | Py_DECREF(seq2.a); 336 | } 337 | 338 | if (dist < 0) { 339 | if (dist == -1) // memory allocation failed 340 | return PyErr_NoMemory(); 341 | return NULL; // comparison failed 342 | } 343 | 344 | return Py_BuildValue("d", dist); 345 | } 346 | 347 | 348 | static PyObject * 349 | levenshtein_py(PyObject *self, PyObject *args, PyObject *kwargs) 350 | { 351 | PyObject *arg1, *arg2, *onorm = NULL; 352 | Py_ssize_t dist = -1; 353 | Py_ssize_t max_dist = -1; 354 | int normalized = 0; 355 | static char *keywords[] = {"seq1", "seq2", "normalized", "max_dist", NULL}; 356 | 357 | char type; 358 | sequence seq1, seq2; 359 | Py_ssize_t len1, len2; 360 | 361 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, 362 | "OO|On:levenshtein", keywords, &arg1, &arg2, &onorm, &max_dist)) 363 | return NULL; 364 | if (onorm && (normalized = PyObject_IsTrue(onorm)) == -1) 365 | return NULL; 366 | 367 | if 
(normalized) { 368 | onorm = NULL; 369 | return nlevenshtein_py(self, args, onorm); 370 | } 371 | 372 | if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0') 373 | return NULL; 374 | 375 | switch(type) { 376 | case 'u': 377 | dist = ulevenshtein(seq1.u, seq2.u, len1, len2, max_dist); 378 | break; 379 | case 'b': 380 | dist = blevenshtein(seq1.b, seq2.b, len1, len2, max_dist); 381 | break; 382 | default: 383 | dist = alevenshtein(seq1.a, seq2.a, len1, len2, max_dist); 384 | Py_DECREF(seq1.a); 385 | Py_DECREF(seq2.a); 386 | } 387 | 388 | if (dist < -1) { 389 | if (dist == -2) 390 | return PyErr_NoMemory(); // memory allocation failed 391 | return NULL; // comparison failed 392 | } 393 | return Py_BuildValue("n", dist); 394 | } 395 | 396 | 397 | static PyObject * 398 | fastcomp_py(PyObject *self, PyObject *args, PyObject *kwargs) 399 | { 400 | PyObject *arg1, *arg2, *otr = NULL; 401 | int transpositions = 0; 402 | static char *keywords[] = {"seq1", "seq2", "transpositions", NULL}; 403 | 404 | char type; 405 | sequence seq1, seq2; 406 | Py_ssize_t len1, len2; 407 | short dist; 408 | 409 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|O:fast_comp", 410 | keywords, &arg1, &arg2, &transpositions)) 411 | return NULL; 412 | if (otr && (transpositions = PyObject_IsTrue(otr)) == -1) 413 | return NULL; 414 | 415 | if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0') 416 | return NULL; 417 | 418 | if (len1 < len2) { 419 | SWAP(sequence, seq1, seq2); 420 | SWAP(Py_ssize_t, len1, len2); 421 | } 422 | 423 | switch(type) { 424 | case 'u': 425 | dist = ufastcomp(seq1.u, seq2.u, len1, len2, transpositions); 426 | break; 427 | case 'b': 428 | dist = bfastcomp(seq1.b, seq2.b, len1, len2, transpositions); 429 | break; 430 | default: 431 | dist = afastcomp(seq1.a, seq2.a, len1, len2, transpositions); 432 | Py_DECREF(seq1.a); 433 | Py_DECREF(seq2.a); 434 | } 435 | 436 | if (dist == -2) // comparison failed 437 | return NULL; 438 | 439 | 
return Py_BuildValue("h", dist); 440 | } 441 | 442 | 443 | 444 | // Iterators (for levenshtein and fastcomp). They share the same structure. 445 | 446 | typedef struct { 447 | PyObject_HEAD 448 | PyObject *itor; 449 | char seqtype; // type of the sequence ('u', 'b', 'a') 450 | sequence seq1; // the sequence itself 451 | Py_ssize_t len1; // its length 452 | PyObject *object; // the corresponding pyobject 453 | int transpos; // only valable for fastcomp 454 | Py_ssize_t max_dist; // only for levenshtein 455 | } ItorState; 456 | 457 | 458 | static void itor_dealloc(ItorState *state) 459 | { 460 | // we got two references for tuples and lists, one for the original python object, 461 | // and one returned by `PySequence_fast` 462 | if (state->seqtype == 'a') 463 | Py_XDECREF(state->seq1.a); 464 | Py_XDECREF(state->object); 465 | Py_XDECREF(state->itor); 466 | Py_TYPE(state)->tp_free(state); 467 | } 468 | 469 | 470 | static PyObject * 471 | ifastcomp_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) 472 | { 473 | PyObject *arg1, *arg2, *itor; 474 | int transpositions = 0; 475 | static char *keywords[] = {"seq1", "seqs", "transpositions", NULL}; 476 | 477 | char seqtype; 478 | sequence seq1; 479 | Py_ssize_t len1; 480 | 481 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|O:ifast_comp", 482 | keywords, &arg1, &arg2, &transpositions)) 483 | return NULL; 484 | if (otr && (transpositions = PyObject_IsTrue(otr)) == -1) 485 | return NULL; 486 | 487 | if ((seqtype = get_sequence(arg1, &seq1, &len1, '\0')) == '\0') 488 | return NULL; 489 | 490 | if ((itor = PyObject_GetIter(arg2)) == NULL) { 491 | PyErr_SetString(PyExc_ValueError, "expected an iterable as second argument"); 492 | return NULL; 493 | } 494 | 495 | ItorState *state = (ItorState *)type->tp_alloc(type, 0); 496 | if (state == NULL) { 497 | Py_DECREF(itor); 498 | return NULL; 499 | } 500 | 501 | Py_INCREF(arg1); 502 | 503 | state->itor = itor; 504 | state->seqtype = seqtype; 505 | state->seq1 = seq1; 506 | 
state->object = arg1; 507 | state->len1 = len1; 508 | state->transpos = transpositions; 509 | 510 | return (PyObject *)state; 511 | } 512 | 513 | 514 | static PyObject * 515 | ilevenshtein_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) 516 | { 517 | PyObject *arg1, *arg2, *itor; 518 | Py_ssize_t max_dist = -1; 519 | static char *keywords[] = {"seq1", "seqs", "max_dist", NULL}; 520 | 521 | char seqtype; 522 | sequence seq1; 523 | Py_ssize_t len1; 524 | 525 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, 526 | "OO|n:ilevenshtein", keywords, &arg1, &arg2, &max_dist)) 527 | return NULL; 528 | 529 | if ((seqtype = get_sequence(arg1, &seq1, &len1, '\0')) == '\0') 530 | return NULL; 531 | 532 | if ((itor = PyObject_GetIter(arg2)) == NULL) { 533 | PyErr_SetString(PyExc_ValueError, "expected an iterable as second argument"); 534 | return NULL; 535 | } 536 | 537 | ItorState *state = (ItorState *)type->tp_alloc(type, 0); 538 | if (state == NULL) { 539 | Py_DECREF(itor); 540 | return NULL; 541 | } 542 | 543 | Py_INCREF(arg1); 544 | 545 | state->itor = itor; 546 | state->seqtype = seqtype; 547 | state->seq1 = seq1; 548 | state->object = arg1; 549 | state->len1 = len1; 550 | state->max_dist = max_dist; 551 | 552 | return (PyObject *)state; 553 | } 554 | 555 | 556 | static PyObject * 557 | ilevenshtein_next(ItorState *state) 558 | { 559 | PyObject *arg2; 560 | sequence seq1, seq2; 561 | Py_ssize_t len2; 562 | 563 | Py_ssize_t dist = -1; 564 | PyObject *rv; 565 | 566 | seq1 = state->seq1; 567 | 568 | while ((arg2 = PyIter_Next(state->itor)) != NULL) { 569 | 570 | if (get_sequence(arg2, &seq2, &len2, state->seqtype) == '\0') { 571 | Py_DECREF(arg2); 572 | return NULL; 573 | } 574 | switch(state->seqtype) { 575 | case 'u': 576 | dist = ulevenshtein(seq1.u, seq2.u, state->len1, len2, state->max_dist); 577 | break; 578 | case 'b': 579 | dist = blevenshtein(seq1.b, seq2.b, state->len1, len2, state->max_dist); 580 | break; 581 | default: 582 | dist = alevenshtein(seq1.a, 
seq2.a, state->len1, len2, state->max_dist); 583 | Py_DECREF(seq2.a); 584 | } 585 | if (dist < -1) { 586 | Py_DECREF(arg2); 587 | if (dist == -2) 588 | return PyErr_NoMemory(); // memory allocation failed 589 | return NULL; // comparison failed 590 | } 591 | if (dist != -1) { 592 | rv = Py_BuildValue("(nO)", dist, arg2); 593 | Py_DECREF(arg2); 594 | return rv; 595 | } 596 | Py_DECREF(arg2); 597 | } 598 | 599 | return NULL; 600 | } 601 | 602 | 603 | static PyObject * 604 | ifastcomp_next(ItorState *state) 605 | { 606 | PyObject *arg2; 607 | sequence seq1, seq2; 608 | Py_ssize_t len2; 609 | 610 | short dist = -1; 611 | PyObject *rv; 612 | 613 | seq1 = state->seq1; 614 | 615 | while ((arg2 = PyIter_Next(state->itor)) != NULL) { 616 | 617 | if (get_sequence(arg2, &seq2, &len2, state->seqtype) == '\0') { 618 | Py_DECREF(arg2); 619 | return NULL; 620 | } 621 | switch(state->seqtype) { 622 | case 'u': 623 | dist = ufastcomp(seq1.u, seq2.u, state->len1, len2, state->transpos); 624 | break; 625 | case 'b': 626 | dist = bfastcomp(seq1.b, seq2.b, state->len1, len2, state->transpos); 627 | break; 628 | default: 629 | dist = afastcomp(seq1.a, seq2.a, state->len1, len2, state->transpos); 630 | Py_DECREF(seq2.a); 631 | } 632 | if (dist == -2) { // comparison failed 633 | Py_DECREF(arg2); 634 | return NULL; 635 | } 636 | if (dist != -1) { 637 | rv = Py_BuildValue("(hO)", dist, arg2); 638 | Py_DECREF(arg2); 639 | return rv; 640 | } 641 | Py_DECREF(arg2); 642 | } 643 | 644 | return NULL; 645 | } 646 | 647 | 648 | PyTypeObject IFastComp_Type = { 649 | PyVarObject_HEAD_INIT(&PyType_Type, 0) 650 | "distance.ifast_comp", /* tp_name */ 651 | sizeof(ItorState), /* tp_basicsize */ 652 | 0, /* tp_itemsize */ 653 | (destructor)itor_dealloc, /* tp_dealloc */ 654 | 0, /* tp_print */ 655 | 0, /* tp_getattr */ 656 | 0, /* tp_setattr */ 657 | 0, /* tp_reserved */ 658 | 0, /* tp_repr */ 659 | 0, /* tp_as_number */ 660 | 0, /* tp_as_sequence */ 661 | 0, /* tp_as_mapping */ 662 | 0, /* tp_hash */ 
	0,                              /* tp_call */
	0,                              /* tp_str */
	0,                              /* tp_getattro */
	0,                              /* tp_setattro */
	0,                              /* tp_as_buffer */
	Py_TPFLAGS_DEFAULT,             /* tp_flags */
	ifast_comp_doc,                 /* tp_doc */
	0,                              /* tp_traverse */
	0,                              /* tp_clear */
	0,                              /* tp_richcompare */
	0,                              /* tp_weaklistoffset */
	PyObject_SelfIter,              /* tp_iter */
	(iternextfunc)ifastcomp_next,   /* tp_iternext */
	0,                              /* tp_methods */
	0,                              /* tp_members */
	0,                              /* tp_getset */
	0,                              /* tp_base */
	0,                              /* tp_dict */
	0,                              /* tp_descr_get */
	0,                              /* tp_descr_set */
	0,                              /* tp_dictoffset */
	0,                              /* tp_init */
	PyType_GenericAlloc,            /* tp_alloc */
	ifastcomp_new,                  /* tp_new */
};


// Python type object for the `ilevenshtein` iterator; same layout as
// IFastComp_Type, but with the levenshtein constructor and iternext slots.
PyTypeObject ILevenshtein_Type = {
	PyVarObject_HEAD_INIT(&PyType_Type, 0)
	"distance.ilevenshtein",        /* tp_name */
	sizeof(ItorState),              /* tp_basicsize */
	0,                              /* tp_itemsize */
	(destructor)itor_dealloc,       /* tp_dealloc */
	0,                              /* tp_print */
	0,                              /* tp_getattr */
	0,                              /* tp_setattr */
	0,                              /* tp_reserved */
	0,                              /* tp_repr */
	0,                              /* tp_as_number */
	0,                              /* tp_as_sequence */
	0,                              /* tp_as_mapping */
	0,                              /* tp_hash */
	0,                              /* tp_call */
	0,                              /* tp_str */
	0,                              /* tp_getattro */
	0,                              /* tp_setattro */
	0,                              /* tp_as_buffer */
	Py_TPFLAGS_DEFAULT,             /* tp_flags */
	ilevenshtein_doc,               /* tp_doc */
	0,                              /* tp_traverse */
	0,                              /* tp_clear */
	0,                              /* tp_richcompare */
	0,                              /* tp_weaklistoffset */
	PyObject_SelfIter,              /* tp_iter */
	(iternextfunc)ilevenshtein_next, /* tp_iternext */
	0,                              /* tp_methods */
	0,                              /* tp_members */
	0,                              /* tp_getset */
	0,                              /* tp_base */
	0,                              /* tp_dict */
	0,                              /* tp_descr_get */
	0,                              /* tp_descr_set */
	0,                              /* tp_dictoffset */
	0,                              /* tp_init */
	PyType_GenericAlloc,            /* tp_alloc */
	ilevenshtein_new,               /* tp_new */
};


// Module-level functions exported by the C extension.
static PyMethodDef
CDistanceMethods[] = {
	{"hamming", (PyCFunction)hamming_py, METH_VARARGS | METH_KEYWORDS, hamming_doc},
	{"levenshtein", (PyCFunction)levenshtein_py, METH_VARARGS | METH_KEYWORDS, levenshtein_doc},
	{"nlevenshtein", (PyCFunction)nlevenshtein_py, METH_VARARGS | METH_KEYWORDS, nlevenshtein_doc},
	{"lcsubstrings", (PyCFunction)lcsubstrings_py, METH_VARARGS | METH_KEYWORDS, lcsubstrings_doc},
	{"fast_comp", (PyCFunction)fastcomp_py, METH_VARARGS | METH_KEYWORDS, fast_comp_doc},
	{NULL, NULL, 0, NULL}   // sentinel
};


#if PY_MAJOR_VERSION >= 3
static struct PyModuleDef cdistancemodule = {
	PyModuleDef_HEAD_INIT, "cdistance", NULL, -1, CDistanceMethods
};
#endif

// Module initialisation, for both python 2 and python 3: create the module,
// finalise the two iterator types, and register them under the module.
#if PY_MAJOR_VERSION >= 3
PyMODINIT_FUNC PyInit_cdistance(void)
#else
PyMODINIT_FUNC initcdistance(void)
#endif
{
	PyObject *module;

#if PY_MAJOR_VERSION >= 3
	if ((module = PyModule_Create(&cdistancemodule)) == NULL)
		return NULL;
#else
	if ((module = Py_InitModule("cdistance", CDistanceMethods)) == NULL)
		return;
#endif

	if (PyType_Ready(&IFastComp_Type) != 0 || PyType_Ready(&ILevenshtein_Type) != 0)
#if PY_MAJOR_VERSION >= 3
		return NULL;
#else
		return;
#endif

	// PyModule_AddObject steals a reference, so take one per (static) type
	// object before handing it over.
	Py_INCREF((PyObject *)&IFastComp_Type);
	Py_INCREF((PyObject *)&ILevenshtein_Type);

	PyModule_AddObject(module, "ifast_comp", (PyObject *)&IFastComp_Type);
	PyModule_AddObject(module, "ilevenshtein", (PyObject *)&ILevenshtein_Type);

#if PY_MAJOR_VERSION >= 3
	return module;
#endif
}