├── MANIFEST ├── distance ├── _pyimports.py ├── __init__.py ├── _lcsubstrings.py ├── _iterators.py ├── _simpledists.py ├── _fastcomp.py └── _levenshtein.py ├── .gitignore ├── cdistance ├── hamming.c ├── distance.h ├── lcsubstrings.c ├── fastcomp.c ├── levenshtein.c ├── includes.h ├── utarray.h └── distance.c ├── README.md ├── setup.py ├── tests └── tests.py └── LICENSE /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | setup.py 3 | distance/__init__.py 4 | distance/distance.py 5 | -------------------------------------------------------------------------------- /distance/_pyimports.py: -------------------------------------------------------------------------------- 1 | from ._fastcomp import * 2 | from ._lcsubstrings import * 3 | from ._levenshtein import * 4 | from ._simpledists import * 5 | from ._iterators import * 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | parts 13 | bin 14 | var 15 | sdist 16 | develop-eggs 17 | .installed.cfg 18 | lib 19 | lib64 20 | __pycache__ 21 | 22 | # Installer logs 23 | pip-log.txt 24 | 25 | # Unit test / coverage reports 26 | .coverage 27 | .tox 28 | nosetests.xml 29 | 30 | # Translations 31 | *.mo 32 | 33 | # Mr Developer 34 | .mr.developer.cfg 35 | .project 36 | .pydevproject 37 | -------------------------------------------------------------------------------- /cdistance/hamming.c: -------------------------------------------------------------------------------- 1 | #include "distance.h" 2 | 3 | static Py_ssize_t 4 | hamming(unicode *seq1, unicode *seq2, Py_ssize_t len) 5 | { 6 | Py_ssize_t i, dist = 0; 7 | #ifdef SEQUENCE_COMP 8 | int comp; 9 | #endif 10 | 11 | for (i = 0; i < len; i++) 
{ 12 | #ifdef SEQUENCE_COMP 13 | comp = SEQUENCE_COMP(seq1, i, seq2, i); 14 | if (comp == -1) 15 | return -1; 16 | if (!comp) 17 | #else 18 | if (seq1[i] != seq2[i]) 19 | #endif 20 | dist++; 21 | } 22 | 23 | return dist; 24 | } 25 | -------------------------------------------------------------------------------- /distance/__init__.py: -------------------------------------------------------------------------------- 1 | "Utilities for comparing sequences" 2 | 3 | __all__ = ["hamming", "levenshtein", "nlevenshtein", "jaccard", "sorensen", 4 | "fast_comp", "lcsubstrings", "ilevenshtein", "ifast_comp"] 5 | 6 | try: 7 | from .cdistance import * 8 | except ImportError: 9 | from ._pyimports import * 10 | 11 | from ._pyimports import jaccard, sorensen 12 | 13 | def quick_levenshtein(str1, str2): 14 | return fast_comp(str1, str2, transpositions=False) 15 | 16 | def iquick_levenshtein(str1, strs): 17 | return ifast_comp(str1, str2, transpositions=False) 18 | -------------------------------------------------------------------------------- /cdistance/distance.h: -------------------------------------------------------------------------------- 1 | #ifndef DISTANCE_H 2 | #define DISTANCE_H 3 | 4 | #include "Python.h" 5 | #include "utarray.h" 6 | 7 | // Debugging. This kills the interpreter if an assertion fails. 
8 | 9 | #ifdef DISTANCE_DEBUG 10 | #undef NDEBUG 11 | #include <assert.h> 12 | #endif 13 | 14 | // Compatibility Python 2 && 3 15 | 16 | #if PY_MAJOR_VERSION < 3 17 | #define PyBytes_Check PyString_Check 18 | #define PyBytes_AS_STRING PyString_AS_STRING 19 | #define PyBytes_GET_SIZE PyString_GET_SIZE 20 | #define PyUnicode_GET_LENGTH PyUnicode_GET_SIZE 21 | #endif 22 | 23 | // Aliases for each sequence type 24 | 25 | typedef Py_UNICODE unicode; 26 | 27 | typedef char byte; 28 | 29 | typedef PyObject array; 30 | 31 | typedef union { 32 | unicode *u; 33 | byte *b; 34 | array *a; 35 | } sequence; 36 | 37 | 38 | // Used in distance.c and some other files 39 | 40 | #define SWAP(type, a, b) \ 41 | do { \ 42 | type a##_tmp = a; \ 43 | a = b; \ 44 | b = a##_tmp; \ 45 | } while (0) 46 | 47 | 48 | // Used in lcsubstrings.c and distance.c for dynamic array 49 | 50 | struct pair_t { 51 | Py_ssize_t i; 52 | Py_ssize_t j; 53 | }; 54 | 55 | UT_icd pair_icd = {sizeof(struct pair_t), NULL, NULL, NULL}; 56 | 57 | #endif 58 | -------------------------------------------------------------------------------- /distance/_lcsubstrings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from array import array 4 | 5 | 6 | def lcsubstrings(seq1, seq2, positions=False): 7 | """Find the longest common substring(s) in the sequences `seq1` and `seq2`. 8 | 9 | If positions evaluates to `True` only their positions will be returned, 10 | together with their length, in a tuple: 11 | 12 | (length, [(start pos in seq1, start pos in seq2)..]) 13 | 14 | Otherwise, the substrings themselves will be returned, in a set. 
15 | 16 | Example: 17 | 18 | >>> lcsubstrings("sedentar", "dentist") 19 | {'dent'} 20 | >>> lcsubstrings("sedentar", "dentist", positions=True) 21 | (4, [(2, 0)]) 22 | """ 23 | L1, L2 = len(seq1), len(seq2) 24 | ms = [] 25 | mlen = last = 0 26 | if L1 < L2: 27 | seq1, seq2 = seq2, seq1 28 | L1, L2 = L2, L1 29 | 30 | column = array('L', range(L2)) 31 | 32 | for i in range(L1): 33 | for j in range(L2): 34 | old = column[j] 35 | if seq1[i] == seq2[j]: 36 | if i == 0 or j == 0: 37 | column[j] = 1 38 | else: 39 | column[j] = last + 1 40 | if column[j] > mlen: 41 | mlen = column[j] 42 | ms = [(i, j)] 43 | elif column[j] == mlen: 44 | ms.append((i, j)) 45 | else: 46 | column[j] = 0 47 | last = old 48 | 49 | if positions: 50 | return (mlen, tuple((i - mlen + 1, j - mlen + 1) for i, j in ms if ms)) 51 | return set(seq1[i - mlen + 1:i + 1] for i, _ in ms if ms) 52 | -------------------------------------------------------------------------------- /cdistance/lcsubstrings.c: -------------------------------------------------------------------------------- 1 | #include "distance.h" 2 | 3 | 4 | static UT_array * 5 | lcsubstrings(unicode *seq1, unicode *seq2, 6 | Py_ssize_t len1, Py_ssize_t len2, Py_ssize_t *max_len) 7 | { 8 | Py_ssize_t i, j, mlen = -1; 9 | Py_ssize_t old, last, *column; 10 | UT_array *stack = NULL; 11 | struct pair_t pos; 12 | #ifdef SEQUENCE_COMP 13 | int comp; 14 | #endif 15 | 16 | assert(len1 >= len2); 17 | 18 | utarray_new(stack, &pair_icd); 19 | 20 | if (len2 == 0) { 21 | *max_len = 0; 22 | return stack; 23 | } 24 | 25 | if ((column = (Py_ssize_t *)malloc((len2 + 1) * sizeof(Py_ssize_t))) == NULL) 26 | goto On_Error; 27 | 28 | last = 0; 29 | for (j = 0; j < len2; j++) 30 | column[j] = j; 31 | 32 | for (i = 0; i < len1; i++) { 33 | for (j = 0; j < len2; j++) { 34 | old = column[j]; 35 | #ifdef SEQUENCE_COMP 36 | comp = SEQUENCE_COMP(seq1, i, seq2, j); 37 | if (comp == -1) 38 | goto On_Error; 39 | if (comp) { 40 | #else 41 | if (seq1[i] == seq2[j]) { 42 | 
#endif 43 | column[j] = ((i == 0 || j == 0) ? 1 : (last + 1)); 44 | if (column[j] > mlen) { 45 | mlen = column[j]; 46 | pos.i = i; 47 | pos.j = j; 48 | utarray_clear(stack); 49 | utarray_push_back(stack, &pos); 50 | } 51 | else if (column[j] == mlen) { 52 | pos.i = i; 53 | pos.j = j; 54 | utarray_push_back(stack, &pos); 55 | } 56 | } 57 | else 58 | column[j] = 0; 59 | last = old; 60 | } 61 | } 62 | 63 | free(column); 64 | 65 | *max_len = mlen; 66 | return stack; 67 | 68 | On_Error: 69 | free(column); 70 | utarray_free(stack); 71 | return NULL; 72 | } 73 | -------------------------------------------------------------------------------- /distance/_iterators.py: -------------------------------------------------------------------------------- 1 | from ._pyimports import levenshtein, fast_comp 2 | 3 | def ilevenshtein(seq1, seqs, max_dist=-1): 4 | """Compute the Levenshtein distance between the sequence `seq1` and the series 5 | of sequences `seqs`. 6 | 7 | `seq1`: the reference sequence 8 | `seqs`: a series of sequences (can be a generator) 9 | `max_dist`: if provided and > 0, only the sequences which distance from 10 | the reference sequence is lower or equal to this value will be returned. 11 | 12 | The return value is a series of pairs (distance, sequence). 13 | 14 | The sequence objects in `seqs` are expected to be of the same kind than 15 | the reference sequence in the C implementation; the same holds true for 16 | `ifast_comp`. 17 | """ 18 | for seq2 in seqs: 19 | dist = levenshtein(seq1, seq2, max_dist=max_dist) 20 | if dist != -1: 21 | yield dist, seq2 22 | 23 | 24 | def ifast_comp(seq1, seqs, transpositions=False): 25 | """Return an iterator over all the sequences in `seqs` which distance from 26 | `seq1` is lower or equal to 2. The sequences which distance from the 27 | reference sequence is higher than that are dropped. 28 | 29 | `seq1`: the reference sequence. 
30 | `seqs`: a series of sequences (can be a generator) 31 | `transpositions` has the same sense than in `fast_comp`. 32 | 33 | The return value is a series of pairs (distance, sequence). 34 | 35 | You might want to call `sorted()` on the iterator to get the results in a 36 | significant order: 37 | 38 | >>> g = ifast_comp("foo", ["fo", "bar", "foob", "foo", "foobaz"]) 39 | >>> sorted(g) 40 | [(0, 'foo'), (1, 'fo'), (1, 'foob')] 41 | """ 42 | for seq2 in seqs: 43 | dist = fast_comp(seq1, seq2, transpositions) 44 | if dist != -1: 45 | yield dist, seq2 46 | -------------------------------------------------------------------------------- /distance/_simpledists.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | def hamming(seq1, seq2, normalized=False): 4 | """Compute the Hamming distance between the two sequences `seq1` and `seq2`. 5 | The Hamming distance is the number of differing items in two ordered 6 | sequences of the same length. If the sequences submitted do not have the 7 | same length, an error will be raised. 8 | 9 | If `normalized` evaluates to `False`, the return value will be an integer 10 | between 0 and the length of the sequences provided, edge values included; 11 | otherwise, it will be a float between 0 and 1 included, where 0 means 12 | equal, and 1 totally different. Normalized hamming distance is computed as: 13 | 14 | 0.0 if len(seq1) == 0 15 | hamming_dist / len(seq1) otherwise 16 | """ 17 | L = len(seq1) 18 | if L != len(seq2): 19 | raise ValueError("expected two strings of the same length") 20 | if L == 0: 21 | return 0.0 if normalized else 0 # equal 22 | dist = sum(c1 != c2 for c1, c2 in zip(seq1, seq2)) 23 | if normalized: 24 | return dist / float(L) 25 | return dist 26 | 27 | def jaccard(seq1, seq2): 28 | """Compute the Jaccard distance between the two sequences `seq1` and `seq2`. 29 | They should contain hashable items. 
30 | 31 | The return value is a float between 0 and 1, where 0 means equal, and 1 totally different. 32 | """ 33 | set1, set2 = set(seq1), set(seq2) 34 | return 1 - len(set1 & set2) / float(len(set1 | set2)) 35 | 36 | 37 | def sorensen(seq1, seq2): 38 | """Compute the Sorensen distance between the two sequences `seq1` and `seq2`. 39 | They should contain hashable items. 40 | 41 | The return value is a float between 0 and 1, where 0 means equal, and 1 totally different. 42 | """ 43 | set1, set2 = set(seq1), set(seq2) 44 | return 1 - (2 * len(set1 & set2) / float(len(set1) + len(set2))) 45 | -------------------------------------------------------------------------------- /distance/_fastcomp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | def fast_comp(seq1, seq2, transpositions=False): 4 | """Compute the distance between the two sequences `seq1` and `seq2` up to a 5 | maximum of 2 included, and return it. If the edit distance between the two 6 | sequences is higher than that, -1 is returned. 7 | 8 | If `transpositions` is `True`, transpositions will be taken into account for 9 | the computation of the distance. This can make a difference, e.g.: 10 | 11 | >>> fast_comp("abc", "bac", transpositions=False) 12 | 2 13 | >>> fast_comp("abc", "bac", transpositions=True) 14 | 1 15 | 16 | This is faster than `levenshtein` by an order of magnitude, but on the 17 | other hand is of limited use. 18 | 19 | The algorithm comes from `http://writingarchives.sakura.ne.jp/fastcomp`. 20 | I've added transpositions support to the original code. 
21 | """ 22 | replace, insert, delete = "r", "i", "d" 23 | 24 | L1, L2 = len(seq1), len(seq2) 25 | if L1 < L2: 26 | L1, L2 = L2, L1 27 | seq1, seq2 = seq2, seq1 28 | 29 | ldiff = L1 - L2 30 | if ldiff == 0: 31 | models = (insert+delete, delete+insert, replace+replace) 32 | elif ldiff == 1: 33 | models = (delete+replace, replace+delete) 34 | elif ldiff == 2: 35 | models = (delete+delete,) 36 | else: 37 | return -1 38 | 39 | res = 3 40 | for model in models: 41 | i = j = c = 0 42 | while (i < L1) and (j < L2): 43 | if seq1[i] != seq2[j]: 44 | c = c+1 45 | if 2 < c: 46 | break 47 | 48 | if transpositions and ldiff != 2 \ 49 | and i < L1 - 1 and j < L2 - 1 \ 50 | and seq1[i+1] == seq2[j] and seq1[i] == seq2[j+1]: 51 | i, j = i+2, j+2 52 | else: 53 | cmd = model[c-1] 54 | if cmd == delete: 55 | i = i+1 56 | elif cmd == insert: 57 | j = j+1 58 | else: 59 | assert cmd == replace 60 | i,j = i+1, j+1 61 | else: 62 | i,j = i+1, j+1 63 | 64 | if 2 < c: 65 | continue 66 | elif i < L1: 67 | if L1-i <= model[c:].count(delete): 68 | c = c + (L1-i) 69 | else: 70 | continue 71 | elif j < L2: 72 | if L2-j <= model[c:].count(insert): 73 | c = c + (L2-j) 74 | else: 75 | continue 76 | 77 | if c < res: 78 | res = c 79 | 80 | if res == 3: 81 | res = -1 82 | return res 83 | -------------------------------------------------------------------------------- /cdistance/fastcomp.c: -------------------------------------------------------------------------------- 1 | #include "distance.h" 2 | 3 | 4 | static short 5 | fastcomp(unicode *seq1, unicode *seq2, Py_ssize_t len1, Py_ssize_t len2, int transpositions) 6 | { 7 | char *models[3]; 8 | short m, cnt, res = 3; 9 | Py_ssize_t i, j, c, ldiff; 10 | #ifdef SEQUENCE_COMP 11 | int comp; 12 | #endif 13 | 14 | if (len1 < len2) { 15 | SWAP(unicode *, seq1, seq2); 16 | SWAP(Py_ssize_t, len1, len2); 17 | } 18 | 19 | ldiff = len1 - len2; 20 | switch (ldiff) { 21 | case 0: 22 | models[2] = "id"; 23 | models[1] = "di"; 24 | models[0] = "rr"; 25 | m = 2; 26 | 
break; 27 | case 1: 28 | models[1] = "dr"; 29 | models[0] = "rd"; 30 | m = 1; 31 | break; 32 | case 2: 33 | models[0] = "dd"; 34 | m = 0; 35 | break; 36 | default: 37 | return -1; 38 | } 39 | 40 | for (; m >= 0; m--) { 41 | 42 | i = j = c = 0; 43 | 44 | while (i < len1 && j < len2) 45 | { 46 | #ifdef SEQUENCE_COMP 47 | comp = SEQUENCE_COMP(seq1, i, seq2, j); 48 | if (comp == -1) 49 | return -2; 50 | if (!comp) { 51 | #else 52 | if (seq1[i] != seq2[j]) { 53 | #endif 54 | c++; 55 | if (c > 2) 56 | break; 57 | 58 | /* Transpositions handling. `ldiff`, which is the absolute difference between the length 59 | of the sequences `seq1` and `seq2`, should not be equal to 2 because in this case only 60 | deletions can happen (given that the distance between the two sequences should not be 61 | higher than 2, this is the shortest path). 62 | We do a lookahead to check if a transposition is possible between the current position 63 | and the next one, and, if so, we systematically choose this path over the other alternative 64 | edit operations. We act like so because the cost of a transposition is always the lowest 65 | one in such situations. 
66 | */ 67 | #ifdef SEQUENCE_COMP 68 | if (transpositions && ldiff != 2 && i < (len1 - 1) && j < (len2 - 1)) { 69 | comp = SEQUENCE_COMP(seq1, i + 1, seq2, j); 70 | if (comp == -1) 71 | return -2; 72 | else if (comp) { 73 | comp = SEQUENCE_COMP(seq1, i, seq2, j + 1); 74 | if (comp == -1) 75 | return -2; 76 | else if (comp) { 77 | i = i + 2; 78 | j = j + 2; 79 | continue; 80 | } 81 | } 82 | } 83 | #else 84 | if (transpositions && ldiff != 2 && i < (len1 - 1) && j < (len2 - 1) && \ 85 | seq1[i + 1] == seq2[j] && \ 86 | seq1[i] == seq2[j + 1]) { 87 | i = i + 2; 88 | j = j + 2; 89 | continue; 90 | } 91 | #endif 92 | if (models[m][c - 1] == 'd') 93 | i++; 94 | else if (models[m][c - 1] == 'i') 95 | j++; 96 | else { 97 | i++; 98 | j++; 99 | } 100 | } 101 | else { 102 | i++; 103 | j++; 104 | } 105 | } 106 | 107 | if (c > 2) 108 | continue; 109 | 110 | else if (i < len1) { 111 | if (c == 1) 112 | cnt = (models[m][1] == 'd'); 113 | else 114 | cnt = (models[m][0] == 'd') + (models[m][1] == 'd'); 115 | if (len1 - i <= cnt) { 116 | c = c + (len1 - i); 117 | } 118 | else 119 | continue; 120 | } 121 | else if (j < len2) { 122 | if (len2 - j <= (models[m][c] == 'i')) 123 | c = c + (len2 - j); 124 | else 125 | continue; 126 | } 127 | if (c < res) { 128 | res = c; 129 | } 130 | } 131 | 132 | if (res == 3) 133 | res = -1; 134 | 135 | return res; 136 | } 137 | -------------------------------------------------------------------------------- /cdistance/levenshtein.c: -------------------------------------------------------------------------------- 1 | #include "distance.h" 2 | 3 | #define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) : ((b) < (c) ? (b) : (c))) 4 | #define MAX3(a, b, c) ((a) > (b) ? ((a) > (c) ? (a) : (c)) : ((b) > (c) ? 
(b) : (c))) 5 | 6 | #ifndef LEVENSHTEIN_C 7 | #define LEVENSHTEIN_C 8 | 9 | static Py_ssize_t 10 | minimum(const Py_ssize_t *column, Py_ssize_t len) 11 | { 12 | Py_ssize_t min; 13 | 14 | assert(len > 0); 15 | min = column[--len]; 16 | while (len-- >= 0) { 17 | if (column[len] < min) 18 | min = column[len]; 19 | } 20 | 21 | return min; 22 | } 23 | 24 | #endif 25 | 26 | static Py_ssize_t 27 | levenshtein(unicode *seq1, unicode *seq2, Py_ssize_t len1, Py_ssize_t len2, Py_ssize_t max_dist) 28 | { 29 | Py_ssize_t i, j; 30 | Py_ssize_t last, old; 31 | Py_ssize_t cost, dist = -2; 32 | Py_ssize_t *column; 33 | 34 | #ifdef SEQUENCE_COMP 35 | int comp; 36 | #endif 37 | 38 | if (len1 < len2) { 39 | SWAP(unicode *, seq1, seq2); 40 | SWAP(Py_ssize_t, len1, len2); 41 | } 42 | 43 | if (max_dist >= 0 && (len1 - len2) > max_dist) 44 | return -1; 45 | else { 46 | if (len1 == 0) 47 | return len2; 48 | if (len2 == 0) 49 | return len1; 50 | } 51 | 52 | if ((column = (Py_ssize_t *) malloc((len2 + 1) * sizeof(Py_ssize_t))) == NULL) 53 | return -2; 54 | 55 | for (j = 1 ; j <= len2; j++) 56 | column[j] = j; 57 | 58 | for (i = 1 ; i <= len1; i++) { 59 | column[0] = i; 60 | for (j = 1, last = i - 1; j <= len2; j++) { 61 | old = column[j]; 62 | #ifdef SEQUENCE_COMP 63 | comp = SEQUENCE_COMP(seq1, i - 1, seq2, j - 1); 64 | if (comp == -1) { 65 | free(column); 66 | return -3; 67 | } 68 | cost = (!comp); 69 | #else 70 | cost = (seq1[i - 1] != seq2[j - 1]); 71 | #endif 72 | column[j] = MIN3( 73 | column[j] + 1, 74 | column[j - 1] + 1, 75 | last + cost 76 | ); 77 | last = old; 78 | } 79 | if (max_dist >= 0 && minimum(column, len2 + 1) > max_dist) { 80 | free(column); 81 | return -1; 82 | } 83 | } 84 | 85 | dist = column[len2]; 86 | 87 | free(column); 88 | 89 | if (max_dist >= 0 && dist > max_dist) 90 | return -1; 91 | return dist; 92 | } 93 | 94 | 95 | static double 96 | nlevenshtein(unicode *seq1, unicode *seq2, Py_ssize_t len1, Py_ssize_t len2, short method) 97 | { 98 | Py_ssize_t i, j; 99 | 100 
| // distance 101 | Py_ssize_t ic, dc, rc; 102 | Py_ssize_t last, old; 103 | Py_ssize_t *column; 104 | Py_ssize_t fdist; 105 | 106 | // length 107 | Py_ssize_t lic, ldc, lrc; 108 | Py_ssize_t llast, lold; 109 | Py_ssize_t *length; 110 | Py_ssize_t flen; 111 | 112 | #ifdef SEQUENCE_COMP 113 | int comp; 114 | #endif 115 | 116 | assert(len1 >= len2); 117 | 118 | if (len1 == 0) // len2 is 0 too, so the two sequences are identical 119 | return 0.0; 120 | if (len2 == 0) // completely different 121 | return 1.0; 122 | 123 | if (method == 1) { 124 | fdist = levenshtein(seq1, seq2, len1, len2, -1); 125 | if (fdist < 0) // error 126 | return fdist; 127 | return fdist / (double)len1; 128 | } 129 | 130 | if ((column = (Py_ssize_t *)malloc((len2 + 1) * sizeof(Py_ssize_t))) == NULL) 131 | return -1; 132 | if ((length = (Py_ssize_t *)malloc((len2 + 1) * sizeof(Py_ssize_t))) == NULL) { 133 | free(column); 134 | return -1; 135 | } 136 | 137 | for (j = 1 ; j <= len2; j++) 138 | column[j] = length[j] = j; 139 | 140 | for (i = 1 ; i <= len1; i++) { 141 | column[0] = length[0] = i; 142 | 143 | for (j = 1, last = llast = i - 1; j <= len2; j++) { 144 | 145 | // distance 146 | old = column[j]; 147 | ic = column[j - 1] + 1; 148 | dc = column[j] + 1; 149 | #ifdef SEQUENCE_COMP 150 | comp = SEQUENCE_COMP(seq1, i - 1, seq2, j - 1); 151 | if (comp == -1) { 152 | free(column); 153 | free(length); 154 | return -2; 155 | } 156 | rc = last + (!comp); 157 | #else 158 | rc = last + (seq1[i - 1] != seq2[j - 1]); 159 | #endif 160 | column[j] = MIN3(ic, dc, rc); 161 | last = old; 162 | 163 | // length 164 | lold = length[j]; 165 | lic = (ic == column[j] ? length[j - 1] + 1 : 0); 166 | ldc = (dc == column[j] ? length[j] + 1 : 0); 167 | lrc = (rc == column[j] ? 
llast + 1 : 0); 168 | length[j] = MAX3(lic, ldc, lrc); 169 | llast = lold; 170 | } 171 | } 172 | 173 | fdist = column[len2]; 174 | flen = length[len2]; 175 | 176 | free(column); 177 | free(length); 178 | 179 | return fdist / (double)flen; 180 | } 181 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | distance - Utilities for comparing sequences 2 | ============================================ 3 | 4 | This package provides helpers for computing similarities between arbitrary sequences. Included metrics are Levenshtein, Hamming, Jaccard, and Sorensen distance, plus some bonuses. All distance computations are implemented in pure Python, and most of them are also implemented in C. 5 | 6 | 7 | Installation 8 | ------------ 9 | 10 | If you don't want or need to use the C extension, just unpack the archive and run, as root: 11 | 12 | # python setup.py install 13 | 14 | For the C extension to work, you need the Python source files, and a C compiler (typically Microsoft Visual C++ 2010 on Windows, and GCC on Mac and Linux). On a Debian-like system, you can get all of these with: 15 | 16 | # apt-get install gcc pythonX.X-dev 17 | 18 | where X.X is the number of your Python version. 19 | 20 | Then you should type: 21 | 22 | # python setup.py install --with-c 23 | 24 | Note the use of the `--with-c` switch. 
25 | 26 | 27 | Usage 28 | ----- 29 | 30 | A common use case for this module is to compare single words for similarity: 31 | 32 | >>> distance.levenshtein("lenvestein", "levenshtein") 33 | 3 34 | >>> distance.hamming("hamming", "hamning") 35 | 1 36 | 37 | If there is not a one-to-one mapping between sounds and glyphs in your language, or if you want to compare not glyphs, but syllables or phonems, you can pass in tuples of characters: 38 | 39 | >>> t1 = ("de", "ci", "si", "ve") 40 | >>> t2 = ("de", "ri", "si", "ve") 41 | >>> distance.levenshtein(t1, t2) 42 | 1 43 | 44 | Comparing lists of strings can also be useful for computing similarities between sentences, paragraphs, etc.: 45 | 46 | >>> sent1 = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'] 47 | >>> sent2 = ['the', 'lazy', 'fox', 'jumps', 'over', 'the', 'crazy', 'dog'] 48 | >>> distance.levenshtein(sent1, sent2) 49 | 3 50 | 51 | Hamming and Levenshtein distance can be normalized, so that the results of several distance measures can be meaningfully compared. Two strategies are available for Levenshtein: either the length of the shortest alignment between the sequences is taken as factor, or the length of the longer one. Example uses: 52 | 53 | >>> distance.hamming("fat", "cat", normalized=True) 54 | 0.3333333333333333 55 | >>> distance.nlevenshtein("abc", "acd", method=1) # shortest alignment 56 | 0.6666666666666666 57 | >>> distance.nlevenshtein("abc", "acd", method=2) # longest alignment 58 | 0.5 59 | 60 | `jaccard` and `sorensen` return a normalized value per default: 61 | 62 | >>> distance.sorensen("decide", "resize") 63 | 0.5555555555555556 64 | >>> distance.jaccard("decide", "resize") 65 | 0.7142857142857143 66 | 67 | As for the bonuses, there is a `fast_comp` function, which computes the distance between two strings up to a value of 2 included. If the distance between the strings is higher than that, -1 is returned. 
This function is of limited use, but on the other hand it is quite faster than `levenshtein`. There is also a `lcsubstrings` function which can be used to find the longest common substrings in two sequences. 68 | 69 | Finally, two convenience iterators `ilevenshtein` and `ifast_comp` are provided, which are intended to be used for filtering from a long list of sequences the ones that are close to a reference one. They both return a series of tuples (distance, sequence). Example: 70 | 71 | >>> tokens = ["fo", "bar", "foob", "foo", "fooba", "foobar"] 72 | >>> sorted(distance.ifast_comp("foo", tokens)) 73 | [(0, 'foo'), (1, 'fo'), (1, 'foob'), (2, 'fooba')] 74 | >>> sorted(distance.ilevenshtein("foo", tokens, max_dist=1)) 75 | [(0, 'foo'), (1, 'fo'), (1, 'foob')] 76 | 77 | `ifast_comp` is particularly efficient, and can handle 1 million tokens without a problem. 78 | 79 | For more informations, see the functions documentation (`help(funcname)`). 80 | 81 | Have fun! 82 | 83 | 84 | Changelog 85 | --------- 86 | 87 | 20/11/13: 88 | * Switched back to using the to-be-deprecated Python unicode api. Good news is that this makes the 89 | C extension compatible with Python 2.7+, and that distance computations on unicode strings is now 90 | much faster. 91 | * Added a C version of `lcsubstrings`. 92 | * Added a new method for computing normalized Levenshtein distance. 93 | * Added some tests. 94 | 95 | 12/11/13: 96 | Expanded `fast_comp` (formerly `quick_levenshtein`) so that it can handle transpositions. 97 | Fixed variable interversions in (C) `levenshtein` which produced sometimes strange results. 98 | 99 | 10/11/13: 100 | Added `quick_levenshtein` and `iquick_levenshtein`. 101 | 102 | 05/11/13: 103 | Added Sorensen and Jaccard metrics, fixed memory issue in Levenshtein. 
104 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Distance - Utilities for comparing sequences 4 | # Copyright (C) 2013 Michaël Meyer 5 | 6 | # This program is free software: you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, either version 3 of the License, or 9 | # (at your option) any later version. 10 | 11 | # This program is distributed in the hope that it will be useful, 12 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 | # GNU General Public License for more details. 15 | 16 | # You should have received a copy of the GNU General Public License 17 | # along with this program. If not, see . 18 | 19 | 20 | import os, sys, ast, _ast, re 21 | from distutils.core import setup, Extension 22 | 23 | this_dir = os.path.dirname(os.path.abspath(__file__)) 24 | pkg_dir = os.path.join(this_dir, "distance") 25 | cpkg_dir = os.path.join(this_dir, "cdistance") 26 | 27 | ctypes = ["unicode", "byte", "array"] 28 | 29 | cfunctions = { 30 | "levenshtein": ["levenshtein", "nlevenshtein"], 31 | "hamming": ["hamming"], 32 | "lcsubstrings": ["lcsubstrings"], 33 | "fastcomp": ["fastcomp"], 34 | } 35 | 36 | sequence_compare = """\ 37 | #define SEQUENCE_COMPARE(s1, i1, s2, i2) \\ 38 | (PyObject_RichCompareBool( \\ 39 | PySequence_Fast_GET_ITEM((s1), (i1)), \\ 40 | PySequence_Fast_GET_ITEM((s2), (i2)), \\ 41 | Py_EQ) \\ 42 | ) 43 | """ 44 | 45 | def make_c_doc(): 46 | buff = [] 47 | py_sources = [f for f in os.listdir(pkg_dir) if f.endswith('.py')] 48 | for file in py_sources: 49 | with open(os.path.join(pkg_dir, file)) as f: 50 | content = f.read() 51 | tree = ast.parse(content) 52 | for doc_string in parse_tree(tree, content): 53 | 
buff.append(doc_string) 54 | join_str = 2 * '\n' 55 | return join_str.join(buff) + '\n' 56 | 57 | 58 | def parse_tree(tree, content): 59 | for node in ast.iter_child_nodes(tree): 60 | if not isinstance(node, _ast.FunctionDef): 61 | continue 62 | doc_string = ast.get_docstring(node) 63 | if not doc_string: 64 | continue 65 | func_def = re.findall("def\s%s\s*(.+?)\s*:" % node.name, content) 66 | assert func_def and len(func_def) == 1 67 | func_def = node.name + func_def[0] + 2 * '\\n\\\n' 68 | doc_string = doc_string.replace('\n', '\\n\\\n').replace('"', '\\"') 69 | doc_string = doc_string.replace('\n' + 8 * ' ', '\n' + 4 * ' ') 70 | doc_string = '#define %s_doc \\\n"%s%s"\n' % (node.name, func_def, doc_string) 71 | yield doc_string 72 | 73 | 74 | def format_header(): 75 | yield sequence_compare 76 | for cfile, cfuncs in cfunctions.items(): 77 | for ctype in ctypes: 78 | if ctype == "array": 79 | yield("#define SEQUENCE_COMP SEQUENCE_COMPARE") 80 | yield('#define unicode %(type)s' % dict(type=ctype)) 81 | for cfunc in cfuncs: 82 | yield("#define %(function)s %(tcode)s%(function)s" % dict(function=cfunc, tcode=ctype[0])) 83 | yield('#include "%(file)s.c"' % dict(file=cfile)) 84 | yield("#undef unicode") 85 | for cfunc in cfuncs: 86 | yield("#undef %(function)s" % dict(function=cfunc)) 87 | if ctype == "array": 88 | yield("#undef SEQUENCE_COMP") 89 | yield("") 90 | 91 | 92 | def prepare(): 93 | with open(os.path.join(cpkg_dir, "includes.h"), "w") as f: 94 | f.write(make_c_doc()) 95 | f.write(4 * '\n') 96 | f.write('\n'.join(format_header())) 97 | 98 | 99 | args = sys.argv[1:] 100 | if "prepare" in args: 101 | prepare() 102 | sys.exit() 103 | 104 | if "--with-c" in args: 105 | args.remove("--with-c") 106 | ext_modules = [Extension('distance.cdistance', sources=["cdistance/distance.c"])] 107 | else: 108 | sys.stderr.write("notice: no C support available\n") 109 | ext_modules = [] 110 | 111 | with open(os.path.join(this_dir, "README.md")) as f: 112 | long_description = 
f.read() 113 | 114 | setup ( 115 | name = 'Distance', 116 | version = '0.1.3', 117 | description = 'Utilities for comparing sequences', 118 | long_description = long_description, 119 | author='Michaël Meyer', 120 | author_email='michaelnm.meyer@gmail.com', 121 | url='https://github.com/doukremt/distance', 122 | ext_modules = ext_modules, 123 | script_args = args, 124 | packages = ['distance'], 125 | classifiers=( 126 | 'Intended Audience :: Developers', 127 | 'Natural Language :: English', 128 | 'License :: OSI Approved :: GNU General Public License (GPL)', 129 | 'Operating System :: OS Independent', 130 | 'Topic :: Software Development :: Libraries :: Python Modules', 131 | 'Programming Language :: C', 132 | 'Programming Language :: Python', 133 | 'Programming Language :: Python :: 3.3', 134 | ) 135 | ) 136 | -------------------------------------------------------------------------------- /distance/_levenshtein.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from array import array 4 | 5 | 6 | def levenshtein(seq1, seq2, normalized=False, max_dist=-1): 7 | """Compute the absolute Levenshtein distance between the two sequences 8 | `seq1` and `seq2`. 9 | 10 | The Levenshtein distance is the minimum number of edit operations necessary 11 | for transforming one sequence into the other. The edit operations allowed are: 12 | 13 | * deletion: ABC -> BC, AC, AB 14 | * insertion: ABC -> ABCD, EABC, AEBC.. 15 | * substitution: ABC -> ABE, ADC, FBC.. 16 | 17 | The `max_dist` parameter controls at which moment we should stop computing the 18 | distance between the provided sequences. If it is a negative integer, the 19 | distance will be computed until the sequences are exhausted; otherwise, the 20 | computation will stop at the moment the calculated distance is higher than 21 | `max_dist`, and then return -1. 
For example: 22 | 23 | >>> levenshtein("abc", "abcd", max_dist=1) # dist = 1 24 | 1 25 | >>> levenshtein("abc", "abcde", max_dist=1) # dist = 2 26 | -1 27 | 28 | This can be a time saver if you're not interested in the exact distance, but 29 | only need to check if the distance between the given sequences is below a 30 | given threshold. 31 | 32 | The `normalized` parameter is here for backward compatibility; providing 33 | it will result in a call to `nlevenshtein`, which should be used directly 34 | instead. 35 | """ 36 | if normalized: 37 | return nlevenshtein(seq1, seq2, method=1) 38 | 39 | if seq1 == seq2: 40 | return 0 41 | 42 | len1, len2 = len(seq1), len(seq2) 43 | if max_dist >= 0 and abs(len1 - len2) > max_dist: 44 | return -1 45 | if len1 == 0: 46 | return len2 47 | if len2 == 0: 48 | return len1 49 | if len1 < len2: 50 | len1, len2 = len2, len1 51 | seq1, seq2 = seq2, seq1 52 | 53 | column = array('L', range(len2 + 1)) 54 | 55 | for x in range(1, len1 + 1): 56 | column[0] = x 57 | last = x - 1 58 | for y in range(1, len2 + 1): 59 | old = column[y] 60 | cost = int(seq1[x - 1] != seq2[y - 1]) 61 | column[y] = min(column[y] + 1, column[y - 1] + 1, last + cost) 62 | last = old 63 | if max_dist >= 0 and min(column) > max_dist: 64 | return -1 65 | 66 | if max_dist >= 0 and column[len2] > max_dist: 67 | # stay consistent, even if we have the exact distance 68 | return -1 69 | return column[len2] 70 | 71 | 72 | def nlevenshtein(seq1, seq2, method=1): 73 | """Compute the normalized Levenshtein distance between `seq1` and `seq2`. 74 | 75 | Two normalization methods are provided. For both of them, the normalized 76 | distance will be a float between 0 and 1, where 0 means equal and 1 77 | completely different. The computation obeys the following patterns: 78 | 79 | 0.0 if seq1 == seq2 80 | 1.0 if len(seq1) == 0 or len(seq2) == 0 81 | edit distance / factor otherwise 82 | 83 | The `method` parameter specifies which normalization factor should be used. 
84 | It can have the value 1 or 2, which correspond to the following: 85 | 86 | 1: the length of the shortest alignment between the sequences 87 | (that is, the length of the longest sequence) 88 | 2: the length of the longest alignment between the sequences 89 | 90 | Which normalization factor should be chosen is a matter of taste. The first 91 | one is cheap to compute. The second one is more costly, but it accounts 92 | better than the first one for parallelisms of symbols between the sequences. 93 | 94 | For the rationale behind the use of the second method, see: 95 | Heeringa, "Measuring Dialect Pronunciation Differences using Levenshtein 96 | Distance", 2004, p. 130 sq, which is available online at: 97 | http://www.let.rug.nl/~heeringa/dialectology/thesis/thesis.pdf 98 | """ 99 | 100 | if seq1 == seq2: 101 | return 0.0 102 | len1, len2 = len(seq1), len(seq2) 103 | if len1 == 0 or len2 == 0: 104 | return 1.0 105 | if len1 < len2: # minimize the arrays size 106 | len1, len2 = len2, len1 107 | seq1, seq2 = seq2, seq1 108 | 109 | if method == 1: 110 | return levenshtein(seq1, seq2) / float(len1) 111 | if method != 2: 112 | raise ValueError("expected either 1 or 2 for `method` parameter") 113 | 114 | column = array('L', range(len2 + 1)) 115 | length = array('L', range(len2 + 1)) 116 | 117 | for x in range(1, len1 + 1): 118 | 119 | column[0] = length[0] = x 120 | last = llast = x - 1 121 | 122 | for y in range(1, len2 + 1): 123 | 124 | # dist 125 | old = column[y] 126 | ic = column[y - 1] + 1 127 | dc = column[y] + 1 128 | rc = last + (seq1[x - 1] != seq2[y - 1]) 129 | column[y] = min(ic, dc, rc) 130 | last = old 131 | 132 | # length 133 | lold = length[y] 134 | lic = length[y - 1] + 1 if ic == column[y] else 0 135 | ldc = length[y] + 1 if dc == column[y] else 0 136 | lrc = llast + 1 if rc == column[y] else 0 137 | length[y] = max(ldc, lic, lrc) 138 | llast = lold 139 | 140 | return column[y] / float(length[y]) 141 | 
-------------------------------------------------------------------------------- /tests/tests.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | from array import array 3 | try: 4 | from distance import cdistance 5 | except ImportError: 6 | cdistance = None 7 | from distance import _pyimports as pydistance 8 | 9 | 10 | if sys.version_info.major < 3: 11 | t_unicode = unicode 12 | t_bytes = lambda s: s 13 | else: 14 | t_unicode = lambda s: s 15 | t_bytes = lambda s: s.encode() 16 | 17 | all_types = [ 18 | ("unicode", t_unicode), 19 | ("bytes", t_bytes), 20 | ("list", list), 21 | ("tuple", tuple), 22 | ] 23 | 24 | 25 | def hamming(func, t, **kwargs): 26 | 27 | # types; only for c 28 | if kwargs["lang"] == "C": 29 | try: 30 | func(1, t("foo")) 31 | except ValueError: 32 | pass 33 | try: 34 | func(t("foo"), 1) 35 | except ValueError: 36 | pass 37 | 38 | # empty string 39 | assert func(t(""), t("")) == 0 40 | 41 | # common 42 | assert func(t("abc"), t("abc")) == 0 43 | assert func(t("abc"), t("abd")) == 1 44 | 45 | # wrong length 46 | try: 47 | func(t("foo"), t("foobar")) 48 | except ValueError: 49 | pass 50 | 51 | try: 52 | func(t(""), t("foo")) 53 | except ValueError: 54 | pass 55 | 56 | # normalization 57 | assert func(t(""), t(""), normalized=True) == 0.0 58 | assert func(t("abc"), t("abc"), normalized=True) == 0.0 59 | assert func(t("ab"), t("ac"), normalized=True) == 0.5 60 | assert func(t("abc"), t("def"), normalized=True) == 1.0 61 | 62 | 63 | def fast_comp(func, t, **kwargs): 64 | 65 | # types; only for c 66 | if kwargs["lang"] == "C": 67 | try: 68 | func(1, t("foo")) 69 | except ValueError: 70 | pass 71 | try: 72 | func(t("foo"), 1) 73 | except ValueError: 74 | pass 75 | 76 | # empty strings 77 | assert func(t(""), t("")) == 0 78 | assert func(t(""), t("a")) == func(t("a"), t("")) == 1 79 | 80 | # edit ops 81 | assert func(t("aa"), t("aa")) == 0 82 | assert func(t("ab"), t("aa")) == 1 83 | assert 
func(t("ab"), t("a")) == 1 84 | assert func(t("ab"), t("abc")) == 1 85 | 86 | # dist limit 87 | assert func(t("a"), t("bcd")) == func(t("bcd"), t("a")) == -1 88 | 89 | # transpositions 90 | assert func(t("abc"), t("bac"), transpositions=True) == \ 91 | func(t("bac"), t("abc"), transpositions=True) == 1 92 | 93 | 94 | 95 | def levenshtein(func, t, **kwargs): 96 | 97 | # types; only for c 98 | if kwargs["lang"] == "C": 99 | try: 100 | func(1, t("foo")) 101 | except ValueError: 102 | pass 103 | try: 104 | func(t("foo"), 1) 105 | except ValueError: 106 | pass 107 | 108 | # empty strings 109 | assert func(t(""), t("")) == 0 110 | assert func(t(""), t("abcd")) == func(t("abcd"), t("")) == 4 111 | 112 | # edit ops 113 | assert func(t("aa"), t("aa")) == 0 114 | assert func(t("ab"), t("aa")) == 1 115 | assert func(t("ab"), t("a")) == 1 116 | assert func(t("ab"), t("abc")) == 1 117 | 118 | # dist limit 119 | assert func(t("a"), t("b"), max_dist=0) == -1 120 | assert func(t("a"), t("b"), max_dist=1) == 1 121 | assert func(t("foo"), t("bar"), max_dist=-1) == 3 122 | 123 | 124 | def nlevenshtein(func, t, **kwargs): 125 | 126 | # types; only for c 127 | if kwargs["lang"] == "C": 128 | try: 129 | func(1, t("foo")) 130 | except ValueError: 131 | pass 132 | try: 133 | func(t("foo"), 1) 134 | except ValueError: 135 | pass 136 | 137 | # empty strings 138 | assert func(t(""), t(""), 1) == func(t(""), t(""), 2) == 0.0 139 | assert func(t(""), t("foo"), 1) == func(t("foo"), t(""), 1) == \ 140 | func(t(""), t("foo"), 2) == func(t("foo"), t(""), 2) == 1.0 141 | 142 | assert func(t("aa"), t("aa"), 1) == func(t("aa"), t("aa"), 2) == 0.0 143 | assert func(t("ab"), t("aa"), 1) == func(t("ab"), t("aa"), 2) == 0.5 144 | assert func(t("ab"), t("a"), 1) == func(t("ab"), t("a"), 2) == 0.5 145 | assert func(t("ab"), t("abc"), 1) == func(t("ab"), t("abc"), 2) == 0.3333333333333333 146 | 147 | # multiple alignments 148 | assert func(t("abc"), t("adb"), 1) == 0.6666666666666666 149 | assert 
func(t("abc"), t("adb"), 2) == 0.5 150 | 151 | 152 | def lcsubstrings(func, t, **kwargs): 153 | 154 | # types; only for c 155 | if kwargs["lang"] == "C": 156 | try: 157 | func(1, t("foo")) 158 | except ValueError: 159 | pass 160 | try: 161 | func(t("foo"), 1) 162 | except ValueError: 163 | pass 164 | 165 | # empty strings 166 | try: 167 | assert func(t(""), t(""), False) == set() 168 | except TypeError: 169 | if t is not list: raise 170 | assert func(t(""), t(""), True) == (0, ()) 171 | try: 172 | assert func(t(""), t("foo"), False) == func(t("foo"), t(""), False) == set() 173 | except TypeError: 174 | if t is not list: raise 175 | assert func(t(""), t("foo"), True) == func(t("foo"), t(""), True) == (0, ()) 176 | 177 | # common 178 | try: 179 | assert func(t("abcd"), t("cdba"), False) == {t('cd')} 180 | except TypeError: 181 | if t is not list: raise 182 | assert func(t("abcd"), t("cdba"), True) == (2, ((2, 0),)) 183 | 184 | # reverse 185 | try: 186 | assert func(t("abcdef"), t("cdba"), False) == func(t("cdba"), t("abcdef"), False) 187 | except TypeError: 188 | if t is not list: raise 189 | assert func(t("abcdef"), t("cdba"), True) == func(t("cdba"), t("abcdef"), True) 190 | 191 | 192 | def itors_common(func, t, **kwargs): 193 | 194 | if kwargs["lang"] == "C": 195 | # types check; only need to do it for C impl to avoid an eventual segfaults. 
196 | try: func(1, t("foo")) 197 | except ValueError: pass 198 | 199 | itor = func(t("foo"), [t("foo"), 3333]) 200 | next(itor) 201 | try: next(itor) 202 | except ValueError: pass 203 | 204 | # values drop 205 | itor = func(t("aa"), [t("aa"), t("abcd"), t("ba")]) 206 | assert next(itor) == (0, t("aa")) 207 | assert next(itor) == (1, t("ba")) 208 | 209 | 210 | def ilevenshtein(func, t, **kwargs): 211 | itors_common(lambda a, b: func(a, b, max_dist=2), t, **kwargs) 212 | 213 | 214 | def ifast_comp(func, t, **kwargs): 215 | itors_common(func, t, **kwargs) 216 | #transpositions 217 | g = func(t("abc"), [t("bac")], transpositions=False) 218 | assert next(g) == (2, t('bac')) 219 | g = func(t("abc"), [t("bac")], transpositions=True) 220 | assert next(g) == (1, t("bac")) 221 | 222 | 223 | write = lambda s: sys.stderr.write(s + '\n') 224 | 225 | tests = ["hamming", "fast_comp", "levenshtein", "lcsubstrings", "nlevenshtein", "ilevenshtein", "ifast_comp"] 226 | 227 | 228 | def run_test(name): 229 | if cdistance: 230 | cfunc = getattr(cdistance, name) 231 | run_lang_test(name, cfunc, "C") 232 | write("") 233 | pyfunc = getattr(pydistance, name) 234 | run_lang_test(name, pyfunc, "py") 235 | if cdistance is None: 236 | write("skipped C tests") 237 | write("") 238 | 239 | 240 | def run_lang_test(name, func, lang): 241 | print("%s (%s)..." 
% (name, lang)) 242 | for tname, typ in all_types: 243 | write("type: %s" % tname) 244 | globals()[name](func, typ, lang=lang) 245 | 246 | if __name__ == "__main__": 247 | args = sys.argv[1:] 248 | if not args: 249 | for test in tests: 250 | run_test(test) 251 | sys.exit() 252 | for name in args: 253 | if name in tests: 254 | run_test(name) 255 | else: 256 | write("no such test: %s" % name) 257 | sys.exit(1) 258 | -------------------------------------------------------------------------------- /cdistance/includes.h: -------------------------------------------------------------------------------- 1 | #define hamming_doc \ 2 | "hamming(seq1, seq2, normalized=False)\n\ 3 | \n\ 4 | Compute the Hamming distance between the two sequences `seq1` and `seq2`.\n\ 5 | The Hamming distance is the number of differing items in two ordered\n\ 6 | sequences of the same length. If the sequences submitted do not have the\n\ 7 | same length, an error will be raised.\n\ 8 | \n\ 9 | If `normalized` evaluates to `False`, the return value will be an integer\n\ 10 | between 0 and the length of the sequences provided, edge values included;\n\ 11 | otherwise, it will be a float between 0 and 1 included, where 0 means\n\ 12 | equal, and 1 totally different. Normalized hamming distance is computed as:\n\ 13 | \n\ 14 | 0.0 if len(seq1) == 0\n\ 15 | hamming_dist / len(seq1) otherwise" 16 | 17 | 18 | #define jaccard_doc \ 19 | "jaccard(seq1, seq2)\n\ 20 | \n\ 21 | Compute the Jaccard distance between the two sequences `seq1` and `seq2`.\n\ 22 | They should contain hashable items.\n\ 23 | \n\ 24 | The return value is a float between 0 and 1, where 0 means equal, and 1 totally different." 25 | 26 | 27 | #define sorensen_doc \ 28 | "sorensen(seq1, seq2)\n\ 29 | \n\ 30 | Compute the Sorensen distance between the two sequences `seq1` and `seq2`.\n\ 31 | They should contain hashable items.\n\ 32 | \n\ 33 | The return value is a float between 0 and 1, where 0 means equal, and 1 totally different." 
34 | 35 | 36 | #define lcsubstrings_doc \ 37 | "lcsubstrings(seq1, seq2, positions=False)\n\ 38 | \n\ 39 | Find the longest common substring(s) in the sequences `seq1` and `seq2`.\n\ 40 | \n\ 41 | If positions evaluates to `True` only their positions will be returned,\n\ 42 | together with their length, in a tuple:\n\ 43 | \n\ 44 | (length, [(start pos in seq1, start pos in seq2)..])\n\ 45 | \n\ 46 | Otherwise, the substrings themselves will be returned, in a set.\n\ 47 | \n\ 48 | Example:\n\ 49 | \n\ 50 | >>> lcsubstrings(\"sedentar\", \"dentist\")\n\ 51 | {'dent'}\n\ 52 | >>> lcsubstrings(\"sedentar\", \"dentist\", positions=True)\n\ 53 | (4, [(2, 0)])" 54 | 55 | 56 | #define ilevenshtein_doc \ 57 | "ilevenshtein(seq1, seqs, max_dist=-1)\n\ 58 | \n\ 59 | Compute the Levenshtein distance between the sequence `seq1` and the series\n\ 60 | of sequences `seqs`.\n\ 61 | \n\ 62 | `seq1`: the reference sequence\n\ 63 | `seqs`: a series of sequences (can be a generator)\n\ 64 | `max_dist`: if provided and > 0, only the sequences which distance from\n\ 65 | the reference sequence is lower or equal to this value will be returned.\n\ 66 | \n\ 67 | The return value is a series of pairs (distance, sequence).\n\ 68 | \n\ 69 | The sequence objects in `seqs` are expected to be of the same kind than\n\ 70 | the reference sequence in the C implementation; the same holds true for\n\ 71 | `ifast_comp`." 72 | 73 | 74 | #define ifast_comp_doc \ 75 | "ifast_comp(seq1, seqs, transpositions=False)\n\ 76 | \n\ 77 | Return an iterator over all the sequences in `seqs` which distance from\n\ 78 | `seq1` is lower or equal to 2. 
The sequences which distance from the\n\ 79 | reference sequence is higher than that are dropped.\n\ 80 | \n\ 81 | `seq1`: the reference sequence.\n\ 82 | `seqs`: a series of sequences (can be a generator)\n\ 83 | `transpositions` has the same sense than in `fast_comp`.\n\ 84 | \n\ 85 | The return value is a series of pairs (distance, sequence).\n\ 86 | \n\ 87 | You might want to call `sorted()` on the iterator to get the results in a\n\ 88 | significant order:\n\ 89 | \n\ 90 | >>> g = ifast_comp(\"foo\", [\"fo\", \"bar\", \"foob\", \"foo\", \"foobaz\"])\n\ 91 | >>> sorted(g)\n\ 92 | [(0, 'foo'), (1, 'fo'), (1, 'foob')]" 93 | 94 | 95 | #define fast_comp_doc \ 96 | "fast_comp(seq1, seq2, transpositions=False)\n\ 97 | \n\ 98 | Compute the distance between the two sequences `seq1` and `seq2` up to a\n\ 99 | maximum of 2 included, and return it. If the edit distance between the two\n\ 100 | sequences is higher than that, -1 is returned.\n\ 101 | \n\ 102 | If `transpositions` is `True`, transpositions will be taken into account for\n\ 103 | the computation of the distance. This can make a difference, e.g.:\n\ 104 | \n\ 105 | >>> fast_comp(\"abc\", \"bac\", transpositions=False)\n\ 106 | 2\n\ 107 | >>> fast_comp(\"abc\", \"bac\", transpositions=True)\n\ 108 | 1\n\ 109 | \n\ 110 | This is faster than `levenshtein` by an order of magnitude, but on the\n\ 111 | other hand is of limited use.\n\ 112 | \n\ 113 | The algorithm comes from `http://writingarchives.sakura.ne.jp/fastcomp`.\n\ 114 | I've added transpositions support to the original code." 115 | 116 | 117 | #define levenshtein_doc \ 118 | "levenshtein(seq1, seq2, max_dist=-1, normalized=False)\n\ 119 | \n\ 120 | Compute the absolute Levenshtein distance between the two sequences\n\ 121 | `seq1` and `seq2`.\n\ 122 | \n\ 123 | The Levenshtein distance is the minimum number of edit operations necessary\n\ 124 | for transforming one sequence into the other. 
The edit operations allowed are:\n\ 125 | \n\ 126 | * deletion: ABC -> BC, AC, AB\n\ 127 | * insertion: ABC -> ABCD, EABC, AEBC..\n\ 128 | * substitution: ABC -> ABE, ADC, FBC..\n\ 129 | \n\ 130 | The `max_dist` parameter controls at which moment we should stop computing the\n\ 131 | distance between the provided sequences. If it is a negative integer, the\n\ 132 | distance will be computed until the sequences are exhausted; otherwise, the\n\ 133 | computation will stop at the moment the calculated distance is higher than\n\ 134 | `max_dist`, and then return -1. For example:\n\ 135 | \n\ 136 | >>> levenshtein(\"abc\", \"abcd\", max_dist=1) # dist = 1\n\ 137 | 1\n\ 138 | >>> levenshtein(\"abc\", \"abcde\", max_dist=1) # dist = 2\n\ 139 | -1\n\ 140 | \n\ 141 | This can be a time saver if you're not interested in the exact distance, but\n\ 142 | only need to check if the distance between the given sequences is below a\n\ 143 | given threshold.\n\ 144 | \n\ 145 | The `normalized` parameter is here for backward compatibility; providing\n\ 146 | it will result in a call to `nlevenshtein`, which should be used directly\n\ 147 | instead. " 148 | 149 | 150 | #define nlevenshtein_doc \ 151 | "nlevenshtein(seq1, seq2, method=1)\n\ 152 | \n\ 153 | Compute the normalized Levenshtein distance between `seq1` and `seq2`.\n\ 154 | \n\ 155 | Two normalization methods are provided. For both of them, the normalized\n\ 156 | distance will be a float between 0 and 1, where 0 means equal and 1\n\ 157 | completely different. 
The computation obeys the following patterns:\n\ 158 | \n\ 159 | 0.0 if seq1 == seq2\n\ 160 | 1.0 if len(seq1) == 0 or len(seq2) == 0\n\ 161 | edit distance / factor otherwise\n\ 162 | \n\ 163 | The `method` parameter specifies which normalization factor should be used.\n\ 164 | It can have the value 1 or 2, which correspond to the following:\n\ 165 | \n\ 166 | 1: the length of the shortest alignment between the sequences\n\ 167 | (that is, the length of the longest sequence)\n\ 168 | 2: the length of the longest alignment between the sequences\n\ 169 | \n\ 170 | Which normalization factor should be chosen is a matter of taste. The first\n\ 171 | one is cheap to compute. The second one is more costly, but it accounts\n\ 172 | better than the first one for parallelisms of symbols between the sequences.\n\ 173 | \n\ 174 | For the rationale behind the use of the second method, see:\n\ 175 | Heeringa, \"Measuring Dialect Pronunciation Differences using Levenshtein\n\ 176 | Distance\", 2004, p. 
130 sq, which is available online at:\n\ 177 | http://www.let.rug.nl/~heeringa/dialectology/thesis/thesis.pdf" 178 | 179 | 180 | 181 | 182 | 183 | #define SEQUENCE_COMPARE(s1, i1, s2, i2) \ 184 | (PyObject_RichCompareBool( \ 185 | PySequence_Fast_GET_ITEM((s1), (i1)), \ 186 | PySequence_Fast_GET_ITEM((s2), (i2)), \ 187 | Py_EQ) \ 188 | ) 189 | 190 | #define unicode unicode 191 | #define hamming uhamming 192 | #include "hamming.c" 193 | #undef unicode 194 | #undef hamming 195 | 196 | #define unicode byte 197 | #define hamming bhamming 198 | #include "hamming.c" 199 | #undef unicode 200 | #undef hamming 201 | 202 | #define SEQUENCE_COMP SEQUENCE_COMPARE 203 | #define unicode array 204 | #define hamming ahamming 205 | #include "hamming.c" 206 | #undef unicode 207 | #undef hamming 208 | #undef SEQUENCE_COMP 209 | 210 | #define unicode unicode 211 | #define levenshtein ulevenshtein 212 | #define nlevenshtein unlevenshtein 213 | #include "levenshtein.c" 214 | #undef unicode 215 | #undef levenshtein 216 | #undef nlevenshtein 217 | 218 | #define unicode byte 219 | #define levenshtein blevenshtein 220 | #define nlevenshtein bnlevenshtein 221 | #include "levenshtein.c" 222 | #undef unicode 223 | #undef levenshtein 224 | #undef nlevenshtein 225 | 226 | #define SEQUENCE_COMP SEQUENCE_COMPARE 227 | #define unicode array 228 | #define levenshtein alevenshtein 229 | #define nlevenshtein anlevenshtein 230 | #include "levenshtein.c" 231 | #undef unicode 232 | #undef levenshtein 233 | #undef nlevenshtein 234 | #undef SEQUENCE_COMP 235 | 236 | #define unicode unicode 237 | #define lcsubstrings ulcsubstrings 238 | #include "lcsubstrings.c" 239 | #undef unicode 240 | #undef lcsubstrings 241 | 242 | #define unicode byte 243 | #define lcsubstrings blcsubstrings 244 | #include "lcsubstrings.c" 245 | #undef unicode 246 | #undef lcsubstrings 247 | 248 | #define SEQUENCE_COMP SEQUENCE_COMPARE 249 | #define unicode array 250 | #define lcsubstrings alcsubstrings 251 | #include "lcsubstrings.c" 
252 | #undef unicode 253 | #undef lcsubstrings 254 | #undef SEQUENCE_COMP 255 | 256 | #define unicode unicode 257 | #define fastcomp ufastcomp 258 | #include "fastcomp.c" 259 | #undef unicode 260 | #undef fastcomp 261 | 262 | #define unicode byte 263 | #define fastcomp bfastcomp 264 | #include "fastcomp.c" 265 | #undef unicode 266 | #undef fastcomp 267 | 268 | #define SEQUENCE_COMP SEQUENCE_COMPARE 269 | #define unicode array 270 | #define fastcomp afastcomp 271 | #include "fastcomp.c" 272 | #undef unicode 273 | #undef fastcomp 274 | #undef SEQUENCE_COMP 275 | -------------------------------------------------------------------------------- /cdistance/utarray.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) 2008-2013, Troy D. Hanson http://troydhanson.github.com/uthash/ 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS 12 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 13 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 14 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER 15 | OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 16 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 17 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 18 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 19 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 20 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 21 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
22 | */ 23 | 24 | /* a dynamic array implementation using macros 25 | */ 26 | #ifndef UTARRAY_H 27 | #define UTARRAY_H 28 | 29 | #define UTARRAY_VERSION 1.9.8 30 | 31 | #ifdef __GNUC__ 32 | #define _UNUSED_ __attribute__ ((__unused__)) 33 | #else 34 | #define _UNUSED_ 35 | #endif 36 | 37 | #include /* size_t */ 38 | #include /* memset, etc */ 39 | #include /* exit */ 40 | 41 | #define oom() exit(-1) 42 | 43 | typedef void (ctor_f)(void *dst, const void *src); 44 | typedef void (dtor_f)(void *elt); 45 | typedef void (init_f)(void *elt); 46 | typedef struct { 47 | size_t sz; 48 | init_f *init; 49 | ctor_f *copy; 50 | dtor_f *dtor; 51 | } UT_icd; 52 | 53 | typedef struct { 54 | unsigned i,n;/* i: index of next available slot, n: num slots */ 55 | UT_icd icd; /* initializer, copy and destructor functions */ 56 | char *d; /* n slots of size icd->sz*/ 57 | } UT_array; 58 | 59 | #define utarray_init(a,_icd) do { \ 60 | memset(a,0,sizeof(UT_array)); \ 61 | (a)->icd=*_icd; \ 62 | } while(0) 63 | 64 | #define utarray_done(a) do { \ 65 | if ((a)->n) { \ 66 | if ((a)->icd.dtor) { \ 67 | size_t _ut_i; \ 68 | for(_ut_i=0; _ut_i < (a)->i; _ut_i++) { \ 69 | (a)->icd.dtor(utarray_eltptr(a,_ut_i)); \ 70 | } \ 71 | } \ 72 | free((a)->d); \ 73 | } \ 74 | (a)->n=0; \ 75 | } while(0) 76 | 77 | #define utarray_new(a,_icd) do { \ 78 | a=(UT_array*)malloc(sizeof(UT_array)); \ 79 | utarray_init(a,_icd); \ 80 | } while(0) 81 | 82 | #define utarray_free(a) do { \ 83 | utarray_done(a); \ 84 | free(a); \ 85 | } while(0) 86 | 87 | #define utarray_reserve(a,by) do { \ 88 | if (((a)->i+by) > ((a)->n)) { \ 89 | while(((a)->i+by) > ((a)->n)) { (a)->n = ((a)->n ? 
(2*(a)->n) : 8); } \ 90 | if ( ((a)->d=(char*)realloc((a)->d, (a)->n*(a)->icd.sz)) == NULL) oom(); \ 91 | } \ 92 | } while(0) 93 | 94 | #define utarray_push_back(a,p) do { \ 95 | utarray_reserve(a,1); \ 96 | if ((a)->icd.copy) { (a)->icd.copy( _utarray_eltptr(a,(a)->i++), p); } \ 97 | else { memcpy(_utarray_eltptr(a,(a)->i++), p, (a)->icd.sz); }; \ 98 | } while(0) 99 | 100 | #define utarray_pop_back(a) do { \ 101 | if ((a)->icd.dtor) { (a)->icd.dtor( _utarray_eltptr(a,--((a)->i))); } \ 102 | else { (a)->i--; } \ 103 | } while(0) 104 | 105 | #define utarray_extend_back(a) do { \ 106 | utarray_reserve(a,1); \ 107 | if ((a)->icd.init) { (a)->icd.init(_utarray_eltptr(a,(a)->i)); } \ 108 | else { memset(_utarray_eltptr(a,(a)->i),0,(a)->icd.sz); } \ 109 | (a)->i++; \ 110 | } while(0) 111 | 112 | #define utarray_len(a) ((a)->i) 113 | 114 | #define utarray_eltptr(a,j) (((j) < (a)->i) ? _utarray_eltptr(a,j) : NULL) 115 | #define _utarray_eltptr(a,j) ((char*)((a)->d + ((a)->icd.sz*(j) ))) 116 | 117 | #define utarray_insert(a,p,j) do { \ 118 | if (j > (a)->i) utarray_resize(a,j); \ 119 | utarray_reserve(a,1); \ 120 | if ((j) < (a)->i) { \ 121 | memmove( _utarray_eltptr(a,(j)+1), _utarray_eltptr(a,j), \ 122 | ((a)->i - (j))*((a)->icd.sz)); \ 123 | } \ 124 | if ((a)->icd.copy) { (a)->icd.copy( _utarray_eltptr(a,j), p); } \ 125 | else { memcpy(_utarray_eltptr(a,j), p, (a)->icd.sz); }; \ 126 | (a)->i++; \ 127 | } while(0) 128 | 129 | #define utarray_inserta(a,w,j) do { \ 130 | if (utarray_len(w) == 0) break; \ 131 | if (j > (a)->i) utarray_resize(a,j); \ 132 | utarray_reserve(a,utarray_len(w)); \ 133 | if ((j) < (a)->i) { \ 134 | memmove(_utarray_eltptr(a,(j)+utarray_len(w)), \ 135 | _utarray_eltptr(a,j), \ 136 | ((a)->i - (j))*((a)->icd.sz)); \ 137 | } \ 138 | if ((a)->icd.copy) { \ 139 | size_t _ut_i; \ 140 | for(_ut_i=0;_ut_i<(w)->i;_ut_i++) { \ 141 | (a)->icd.copy(_utarray_eltptr(a,j+_ut_i), _utarray_eltptr(w,_ut_i)); \ 142 | } \ 143 | } else { \ 144 | 
memcpy(_utarray_eltptr(a,j), _utarray_eltptr(w,0), \ 145 | utarray_len(w)*((a)->icd.sz)); \ 146 | } \ 147 | (a)->i += utarray_len(w); \ 148 | } while(0) 149 | 150 | #define utarray_resize(dst,num) do { \ 151 | size_t _ut_i; \ 152 | if (dst->i > (size_t)(num)) { \ 153 | if ((dst)->icd.dtor) { \ 154 | for(_ut_i=num; _ut_i < dst->i; _ut_i++) { \ 155 | (dst)->icd.dtor(utarray_eltptr(dst,_ut_i)); \ 156 | } \ 157 | } \ 158 | } else if (dst->i < (size_t)(num)) { \ 159 | utarray_reserve(dst,num-dst->i); \ 160 | if ((dst)->icd.init) { \ 161 | for(_ut_i=dst->i; _ut_i < num; _ut_i++) { \ 162 | (dst)->icd.init(utarray_eltptr(dst,_ut_i)); \ 163 | } \ 164 | } else { \ 165 | memset(_utarray_eltptr(dst,dst->i),0,(dst)->icd.sz*(num-dst->i)); \ 166 | } \ 167 | } \ 168 | dst->i = num; \ 169 | } while(0) 170 | 171 | #define utarray_concat(dst,src) do { \ 172 | utarray_inserta((dst),(src),utarray_len(dst)); \ 173 | } while(0) 174 | 175 | #define utarray_erase(a,pos,len) do { \ 176 | if ((a)->icd.dtor) { \ 177 | size_t _ut_i; \ 178 | for(_ut_i=0; _ut_i < len; _ut_i++) { \ 179 | (a)->icd.dtor(utarray_eltptr((a),pos+_ut_i)); \ 180 | } \ 181 | } \ 182 | if ((a)->i > (pos+len)) { \ 183 | memmove( _utarray_eltptr((a),pos), _utarray_eltptr((a),pos+len), \ 184 | (((a)->i)-(pos+len))*((a)->icd.sz)); \ 185 | } \ 186 | (a)->i -= (len); \ 187 | } while(0) 188 | 189 | #define utarray_renew(a,u) do { \ 190 | if (a) utarray_clear(a); \ 191 | else utarray_new((a),(u)); \ 192 | } while(0) 193 | 194 | #define utarray_clear(a) do { \ 195 | if ((a)->i > 0) { \ 196 | if ((a)->icd.dtor) { \ 197 | size_t _ut_i; \ 198 | for(_ut_i=0; _ut_i < (a)->i; _ut_i++) { \ 199 | (a)->icd.dtor(utarray_eltptr(a,_ut_i)); \ 200 | } \ 201 | } \ 202 | (a)->i = 0; \ 203 | } \ 204 | } while(0) 205 | 206 | #define utarray_sort(a,cmp) do { \ 207 | qsort((a)->d, (a)->i, (a)->icd.sz, cmp); \ 208 | } while(0) 209 | 210 | #define utarray_find(a,v,cmp) bsearch((v),(a)->d,(a)->i,(a)->icd.sz,cmp) 211 | 212 | #define utarray_front(a) 
(((a)->i) ? (_utarray_eltptr(a,0)) : NULL) 213 | #define utarray_next(a,e) (((e)==NULL) ? utarray_front(a) : ((((a)->i) > (utarray_eltidx(a,e)+1)) ? _utarray_eltptr(a,utarray_eltidx(a,e)+1) : NULL)) 214 | #define utarray_prev(a,e) (((e)==NULL) ? utarray_back(a) : ((utarray_eltidx(a,e) > 0) ? _utarray_eltptr(a,utarray_eltidx(a,e)-1) : NULL)) 215 | #define utarray_back(a) (((a)->i) ? (_utarray_eltptr(a,(a)->i-1)) : NULL) 216 | #define utarray_eltidx(a,e) (((char*)(e) >= (char*)((a)->d)) ? (((char*)(e) - (char*)((a)->d))/(ssize_t)(a)->icd.sz) : -1) 217 | 218 | /* last we pre-define a few icd for common utarrays of ints and strings */ 219 | static void utarray_str_cpy(void *dst, const void *src) { 220 | char **_src = (char**)src, **_dst = (char**)dst; 221 | *_dst = (*_src == NULL) ? NULL : strdup(*_src); 222 | } 223 | static void utarray_str_dtor(void *elt) { 224 | char **eltc = (char**)elt; 225 | if (*eltc) free(*eltc); 226 | } 227 | static const UT_icd ut_str_icd _UNUSED_ = {sizeof(char*),NULL,utarray_str_cpy,utarray_str_dtor}; 228 | static const UT_icd ut_int_icd _UNUSED_ = {sizeof(int),NULL,NULL,NULL}; 229 | static const UT_icd ut_ptr_icd _UNUSED_ = {sizeof(void*),NULL,NULL,NULL}; 230 | 231 | 232 | #endif /* UTARRAY_H */ 233 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | distance license 2 | ================ 3 | 4 | Copyright (C) 2013 Michaël Meyer 5 | 6 | GNU GENERAL PUBLIC LICENSE 7 | Version 2, June 1991 8 | 9 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 10 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 11 | Everyone is permitted to copy and distribute verbatim copies 12 | of this license document, but changing it is not allowed. 13 | 14 | Preamble 15 | 16 | The licenses for most software are designed to take away your 17 | freedom to share and change it. 
By contrast, the GNU General Public 18 | License is intended to guarantee your freedom to share and change free 19 | software--to make sure the software is free for all its users. This 20 | General Public License applies to most of the Free Software 21 | Foundation's software and to any other program whose authors commit to 22 | using it. (Some other Free Software Foundation software is covered by 23 | the GNU Lesser General Public License instead.) You can apply it to 24 | your programs, too. 25 | 26 | When we speak of free software, we are referring to freedom, not 27 | price. Our General Public Licenses are designed to make sure that you 28 | have the freedom to distribute copies of free software (and charge for 29 | this service if you wish), that you receive source code or can get it 30 | if you want it, that you can change the software or use pieces of it 31 | in new free programs; and that you know you can do these things. 32 | 33 | To protect your rights, we need to make restrictions that forbid 34 | anyone to deny you these rights or to ask you to surrender the rights. 35 | These restrictions translate to certain responsibilities for you if you 36 | distribute copies of the software, or if you modify it. 37 | 38 | For example, if you distribute copies of such a program, whether 39 | gratis or for a fee, you must give the recipients all the rights that 40 | you have. You must make sure that they, too, receive or can get the 41 | source code. And you must show them these terms so they know their 42 | rights. 43 | 44 | We protect your rights with two steps: (1) copyright the software, and 45 | (2) offer you this license which gives you legal permission to copy, 46 | distribute and/or modify the software. 47 | 48 | Also, for each author's protection and ours, we want to make certain 49 | that everyone understands that there is no warranty for this free 50 | software. 
If the software is modified by someone else and passed on, we 51 | want its recipients to know that what they have is not the original, so 52 | that any problems introduced by others will not reflect on the original 53 | authors' reputations. 54 | 55 | Finally, any free program is threatened constantly by software 56 | patents. We wish to avoid the danger that redistributors of a free 57 | program will individually obtain patent licenses, in effect making the 58 | program proprietary. To prevent this, we have made it clear that any 59 | patent must be licensed for everyone's free use or not licensed at all. 60 | 61 | The precise terms and conditions for copying, distribution and 62 | modification follow. 63 | 64 | GNU GENERAL PUBLIC LICENSE 65 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 66 | 67 | 0. This License applies to any program or other work which contains 68 | a notice placed by the copyright holder saying it may be distributed 69 | under the terms of this General Public License. The "Program", below, 70 | refers to any such program or work, and a "work based on the Program" 71 | means either the Program or any derivative work under copyright law: 72 | that is to say, a work containing the Program or a portion of it, 73 | either verbatim or with modifications and/or translated into another 74 | language. (Hereinafter, translation is included without limitation in 75 | the term "modification".) Each licensee is addressed as "you". 76 | 77 | Activities other than copying, distribution and modification are not 78 | covered by this License; they are outside its scope. The act of 79 | running the Program is not restricted, and the output from the Program 80 | is covered only if its contents constitute a work based on the 81 | Program (independent of having been made by running the Program). 82 | Whether that is true depends on what the Program does. 83 | 84 | 1. 
You may copy and distribute verbatim copies of the Program's 85 | source code as you receive it, in any medium, provided that you 86 | conspicuously and appropriately publish on each copy an appropriate 87 | copyright notice and disclaimer of warranty; keep intact all the 88 | notices that refer to this License and to the absence of any warranty; 89 | and give any other recipients of the Program a copy of this License 90 | along with the Program. 91 | 92 | You may charge a fee for the physical act of transferring a copy, and 93 | you may at your option offer warranty protection in exchange for a fee. 94 | 95 | 2. You may modify your copy or copies of the Program or any portion 96 | of it, thus forming a work based on the Program, and copy and 97 | distribute such modifications or work under the terms of Section 1 98 | above, provided that you also meet all of these conditions: 99 | 100 | a) You must cause the modified files to carry prominent notices 101 | stating that you changed the files and the date of any change. 102 | 103 | b) You must cause any work that you distribute or publish, that in 104 | whole or in part contains or is derived from the Program or any 105 | part thereof, to be licensed as a whole at no charge to all third 106 | parties under the terms of this License. 107 | 108 | c) If the modified program normally reads commands interactively 109 | when run, you must cause it, when started running for such 110 | interactive use in the most ordinary way, to print or display an 111 | announcement including an appropriate copyright notice and a 112 | notice that there is no warranty (or else, saying that you provide 113 | a warranty) and that users may redistribute the program under 114 | these conditions, and telling the user how to view a copy of this 115 | License. (Exception: if the Program itself is interactive but 116 | does not normally print such an announcement, your work based on 117 | the Program is not required to print an announcement.) 
118 | 119 | These requirements apply to the modified work as a whole. If 120 | identifiable sections of that work are not derived from the Program, 121 | and can be reasonably considered independent and separate works in 122 | themselves, then this License, and its terms, do not apply to those 123 | sections when you distribute them as separate works. But when you 124 | distribute the same sections as part of a whole which is a work based 125 | on the Program, the distribution of the whole must be on the terms of 126 | this License, whose permissions for other licensees extend to the 127 | entire whole, and thus to each and every part regardless of who wrote it. 128 | 129 | Thus, it is not the intent of this section to claim rights or contest 130 | your rights to work written entirely by you; rather, the intent is to 131 | exercise the right to control the distribution of derivative or 132 | collective works based on the Program. 133 | 134 | In addition, mere aggregation of another work not based on the Program 135 | with the Program (or with a work based on the Program) on a volume of 136 | a storage or distribution medium does not bring the other work under 137 | the scope of this License. 138 | 139 | 3. 
You may copy and distribute the Program (or a work based on it, 140 | under Section 2) in object code or executable form under the terms of 141 | Sections 1 and 2 above provided that you also do one of the following: 142 | 143 | a) Accompany it with the complete corresponding machine-readable 144 | source code, which must be distributed under the terms of Sections 145 | 1 and 2 above on a medium customarily used for software interchange; or, 146 | 147 | b) Accompany it with a written offer, valid for at least three 148 | years, to give any third party, for a charge no more than your 149 | cost of physically performing source distribution, a complete 150 | machine-readable copy of the corresponding source code, to be 151 | distributed under the terms of Sections 1 and 2 above on a medium 152 | customarily used for software interchange; or, 153 | 154 | c) Accompany it with the information you received as to the offer 155 | to distribute corresponding source code. (This alternative is 156 | allowed only for noncommercial distribution and only if you 157 | received the program in object code or executable form with such 158 | an offer, in accord with Subsection b above.) 159 | 160 | The source code for a work means the preferred form of the work for 161 | making modifications to it. For an executable work, complete source 162 | code means all the source code for all modules it contains, plus any 163 | associated interface definition files, plus the scripts used to 164 | control compilation and installation of the executable. However, as a 165 | special exception, the source code distributed need not include 166 | anything that is normally distributed (in either source or binary 167 | form) with the major components (compiler, kernel, and so on) of the 168 | operating system on which the executable runs, unless that component 169 | itself accompanies the executable. 
170 | 171 | If distribution of executable or object code is made by offering 172 | access to copy from a designated place, then offering equivalent 173 | access to copy the source code from the same place counts as 174 | distribution of the source code, even though third parties are not 175 | compelled to copy the source along with the object code. 176 | 177 | 4. You may not copy, modify, sublicense, or distribute the Program 178 | except as expressly provided under this License. Any attempt 179 | otherwise to copy, modify, sublicense or distribute the Program is 180 | void, and will automatically terminate your rights under this License. 181 | However, parties who have received copies, or rights, from you under 182 | this License will not have their licenses terminated so long as such 183 | parties remain in full compliance. 184 | 185 | 5. You are not required to accept this License, since you have not 186 | signed it. However, nothing else grants you permission to modify or 187 | distribute the Program or its derivative works. These actions are 188 | prohibited by law if you do not accept this License. Therefore, by 189 | modifying or distributing the Program (or any work based on the 190 | Program), you indicate your acceptance of this License to do so, and 191 | all its terms and conditions for copying, distributing or modifying 192 | the Program or works based on it. 193 | 194 | 6. Each time you redistribute the Program (or any work based on the 195 | Program), the recipient automatically receives a license from the 196 | original licensor to copy, distribute or modify the Program subject to 197 | these terms and conditions. You may not impose any further 198 | restrictions on the recipients' exercise of the rights granted herein. 199 | You are not responsible for enforcing compliance by third parties to 200 | this License. 201 | 202 | 7. 
If, as a consequence of a court judgment or allegation of patent 203 | infringement or for any other reason (not limited to patent issues), 204 | conditions are imposed on you (whether by court order, agreement or 205 | otherwise) that contradict the conditions of this License, they do not 206 | excuse you from the conditions of this License. If you cannot 207 | distribute so as to satisfy simultaneously your obligations under this 208 | License and any other pertinent obligations, then as a consequence you 209 | may not distribute the Program at all. For example, if a patent 210 | license would not permit royalty-free redistribution of the Program by 211 | all those who receive copies directly or indirectly through you, then 212 | the only way you could satisfy both it and this License would be to 213 | refrain entirely from distribution of the Program. 214 | 215 | If any portion of this section is held invalid or unenforceable under 216 | any particular circumstance, the balance of the section is intended to 217 | apply and the section as a whole is intended to apply in other 218 | circumstances. 219 | 220 | It is not the purpose of this section to induce you to infringe any 221 | patents or other property right claims or to contest validity of any 222 | such claims; this section has the sole purpose of protecting the 223 | integrity of the free software distribution system, which is 224 | implemented by public license practices. Many people have made 225 | generous contributions to the wide range of software distributed 226 | through that system in reliance on consistent application of that 227 | system; it is up to the author/donor to decide if he or she is willing 228 | to distribute software through any other system and a licensee cannot 229 | impose that choice. 230 | 231 | This section is intended to make thoroughly clear what is believed to 232 | be a consequence of the rest of this License. 233 | 234 | 8. 
If the distribution and/or use of the Program is restricted in 235 | certain countries either by patents or by copyrighted interfaces, the 236 | original copyright holder who places the Program under this License 237 | may add an explicit geographical distribution limitation excluding 238 | those countries, so that distribution is permitted only in or among 239 | countries not thus excluded. In such case, this License incorporates 240 | the limitation as if written in the body of this License. 241 | 242 | 9. The Free Software Foundation may publish revised and/or new versions 243 | of the General Public License from time to time. Such new versions will 244 | be similar in spirit to the present version, but may differ in detail to 245 | address new problems or concerns. 246 | 247 | Each version is given a distinguishing version number. If the Program 248 | specifies a version number of this License which applies to it and "any 249 | later version", you have the option of following the terms and conditions 250 | either of that version or of any later version published by the Free 251 | Software Foundation. If the Program does not specify a version number of 252 | this License, you may choose any version ever published by the Free Software 253 | Foundation. 254 | 255 | 10. If you wish to incorporate parts of the Program into other free 256 | programs whose distribution conditions are different, write to the author 257 | to ask for permission. For software which is copyrighted by the Free 258 | Software Foundation, write to the Free Software Foundation; we sometimes 259 | make exceptions for this. Our decision will be guided by the two goals 260 | of preserving the free status of all derivatives of our free software and 261 | of promoting the sharing and reuse of software generally. 262 | 263 | NO WARRANTY 264 | 265 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 266 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN 267 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 268 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 269 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 270 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 271 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 272 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 273 | REPAIR OR CORRECTION. 274 | 275 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 276 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 277 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 278 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 279 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 280 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 281 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 282 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 283 | POSSIBILITY OF SUCH DAMAGES. 284 | 285 | END OF TERMS AND CONDITIONS 286 | 287 | How to Apply These Terms to Your New Programs 288 | 289 | If you develop a new program, and you want it to be of the greatest 290 | possible use to the public, the best way to achieve this is to make it 291 | free software which everyone can redistribute and change under these terms. 292 | 293 | To do so, attach the following notices to the program. It is safest 294 | to attach them to the start of each source file to most effectively 295 | convey the exclusion of warranty; and each file should have at least 296 | the "copyright" line and a pointer to where the full notice is found. 
297 | 298 | {description} 299 | Copyright (C) {year} {fullname} 300 | 301 | This program is free software; you can redistribute it and/or modify 302 | it under the terms of the GNU General Public License as published by 303 | the Free Software Foundation; either version 2 of the License, or 304 | (at your option) any later version. 305 | 306 | This program is distributed in the hope that it will be useful, 307 | but WITHOUT ANY WARRANTY; without even the implied warranty of 308 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 309 | GNU General Public License for more details. 310 | 311 | You should have received a copy of the GNU General Public License along 312 | with this program; if not, write to the Free Software Foundation, Inc., 313 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 314 | 315 | Also add information on how to contact you by electronic and paper mail. 316 | 317 | If the program is interactive, make it output a short notice like this 318 | when it starts in an interactive mode: 319 | 320 | Gnomovision version 69, Copyright (C) year name of author 321 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 322 | This is free software, and you are welcome to redistribute it 323 | under certain conditions; type `show c' for details. 324 | 325 | The hypothetical commands `show w' and `show c' should show the appropriate 326 | parts of the General Public License. Of course, the commands you use may 327 | be called something other than `show w' and `show c'; they could even be 328 | mouse-clicks or menu items--whatever suits your program. 329 | 330 | You should also get your employer (if you work as a programmer) or your 331 | school, if any, to sign a "copyright disclaimer" for the program, if 332 | necessary. Here is a sample; alter the names: 333 | 334 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 335 | `Gnomovision' (which makes passes at compilers) written by James Hacker. 
336 | 337 | {signature of Ty Coon}, 1 April 1989 338 | Ty Coon, President of Vice 339 | 340 | This General Public License does not permit incorporating your program into 341 | proprietary programs. If your program is a subroutine library, you may 342 | consider it more useful to permit linking proprietary applications with the 343 | library. If this is what you want to do, use the GNU Lesser General 344 | Public License instead of this License. 345 | 346 | 347 | fastcomp license 348 | ================ 349 | 350 | MIT LICENSE 351 | 352 | Copyright (c) 2012 Fujimoto 353 | 354 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 355 | 356 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 357 | 358 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON INFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
359 | -------------------------------------------------------------------------------- /cdistance/distance.c: -------------------------------------------------------------------------------- 1 | #include "distance.h" 2 | #include "includes.h" 3 | 4 | 5 | static unicode * 6 | get_unicode(PyObject *obj, Py_ssize_t *len) 7 | { 8 | unicode *u; 9 | 10 | if ((u = PyUnicode_AS_UNICODE(obj)) == NULL) { 11 | PyErr_Format(PyExc_RuntimeError, "failed to get unicode representation of object"); 12 | return NULL; 13 | } 14 | *len = PyUnicode_GET_LENGTH(obj); 15 | 16 | return u; 17 | } 18 | 19 | 20 | static byte * 21 | get_byte(PyObject *obj, Py_ssize_t *len) 22 | { 23 | byte *b; 24 | 25 | b = PyBytes_AS_STRING(obj); 26 | *len = PyBytes_GET_SIZE(obj); 27 | 28 | return b; 29 | } 30 | 31 | 32 | static array * 33 | get_array(PyObject *obj, Py_ssize_t *len) 34 | { 35 | array *a; 36 | 37 | if ((a = PySequence_Fast(obj, "we got a problem")) == NULL) 38 | return NULL; 39 | *len = PySequence_Fast_GET_SIZE(a); 40 | 41 | return a; 42 | } 43 | 44 | 45 | static char 46 | get_sequence(PyObject *obj, sequence *seq, Py_ssize_t *len, char type) 47 | { 48 | char t = '\0'; 49 | 50 | if (PyUnicode_Check(obj)) { 51 | t = 'u'; 52 | if ((seq->u = get_unicode(obj, len)) == NULL) 53 | return '\0'; 54 | } else if (PyBytes_Check(obj)) { 55 | t = 'b'; 56 | if ((seq->b = get_byte(obj, len)) == NULL) 57 | return '\0'; 58 | } else if (PySequence_Check(obj)) { 59 | t = 'a'; 60 | if ((seq->a = get_array(obj, len)) == NULL) 61 | return '\0'; 62 | } 63 | 64 | if (!t) { 65 | PyErr_SetString(PyExc_ValueError, "expected a sequence object as first argument"); 66 | return '\0'; 67 | } 68 | if (type && t != type) { 69 | PyErr_SetString(PyExc_ValueError, "type mismatch between the " 70 | "value provided as left argument and one of the elements in " 71 | "the right one, can't process the later"); 72 | if (t == 'a') 73 | Py_DECREF(seq->a); 74 | return '\0'; 75 | } 76 | return t; 77 | } 78 | 79 | 80 | static char 81 | 
get_sequences(PyObject *arg1, PyObject *arg2, sequence *seq1, sequence *seq2, 82 | Py_ssize_t *len1, Py_ssize_t *len2) 83 | { 84 | if (PyUnicode_Check(arg1) && PyUnicode_Check(arg2)) { 85 | 86 | if ((seq1->u = get_unicode(arg1, len1)) == NULL) 87 | return '\0'; 88 | if ((seq2->u = get_unicode(arg2, len2)) == NULL) 89 | return '\0'; 90 | return 'u'; 91 | 92 | } else if (PyBytes_Check(arg1) && PyBytes_Check(arg2)) { 93 | 94 | if ((seq1->b = get_byte(arg1, len1)) == NULL) 95 | return '\0'; 96 | if ((seq2->b = get_byte(arg2, len2)) == NULL) 97 | return '\0'; 98 | return 'b'; 99 | 100 | } else if (PySequence_Check(arg1) && PySequence_Check(arg2)) { 101 | 102 | if ((seq1->a = get_array(arg1, len1)) == NULL) 103 | return '\0'; 104 | if ((seq2->a = get_array(arg2, len2)) == NULL) { 105 | Py_DECREF(seq1->a); /* warning ! */ 106 | return '\0'; 107 | } 108 | return 'a'; 109 | } 110 | 111 | PyErr_SetString(PyExc_ValueError, "expected two sequence objects"); 112 | return '\0'; 113 | } 114 | 115 | 116 | static PyObject * 117 | hamming_py(PyObject *self, PyObject *args, PyObject *kwargs) 118 | { 119 | PyObject *arg1, *arg2, *odo_normalize = NULL; 120 | int do_normalize = 0; 121 | static char *keywords[] = {"seq1", "seq2", "normalized", NULL}; 122 | 123 | char type; 124 | sequence seq1, seq2; 125 | Py_ssize_t len1, len2; 126 | Py_ssize_t dist; 127 | 128 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, 129 | "OO|O:hamming", keywords, &arg1, &arg2, &odo_normalize)) 130 | return NULL; 131 | 132 | if (odo_normalize && (do_normalize = PyObject_IsTrue(odo_normalize)) == -1) 133 | return NULL; 134 | 135 | if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0') 136 | return NULL; 137 | 138 | if (len1 != len2) { 139 | PyErr_SetString(PyExc_ValueError, "expected two objects of the same length"); 140 | if (type == 'a') { 141 | Py_DECREF(seq1.a); 142 | Py_DECREF(seq2.a); 143 | } 144 | return NULL; 145 | } 146 | 147 | switch(type) { 148 | case 'u': 149 | dist = 
uhamming(seq1.u, seq2.u, len1); 150 | break; 151 | case 'b': 152 | dist = bhamming(seq1.b, seq2.b, len1); 153 | break; 154 | default: 155 | dist = ahamming(seq1.a, seq2.a, len1); 156 | Py_DECREF(seq1.a); 157 | Py_DECREF(seq2.a); 158 | } 159 | 160 | if (dist == -1) // comparison failed 161 | return NULL; 162 | 163 | if (do_normalize) { 164 | if (len1 == 0) 165 | return Py_BuildValue("f", 0.0f); 166 | return Py_BuildValue("d", dist / (double)len1); 167 | } 168 | 169 | return Py_BuildValue("n", dist); 170 | } 171 | 172 | 173 | static PyObject * 174 | lcsubstrings_py_make_set(PyObject *arg1, PyObject *arg2, UT_array *stack, Py_ssize_t mlen) 175 | { 176 | PyObject *set, *ss; 177 | struct pair_t *pair; 178 | 179 | if ((set = PySet_New(NULL)) == NULL) { 180 | utarray_free(stack); 181 | return NULL; 182 | } 183 | 184 | for (pair = (struct pair_t*)utarray_front(stack); 185 | pair != NULL; 186 | pair = (struct pair_t*)utarray_next(stack, pair)) { 187 | 188 | ss = PySequence_GetSlice(arg2, pair->j - mlen + 1, pair->j + 1); 189 | if (ss == NULL) 190 | goto On_Error; 191 | if ((PySet_Add(set, ss)) == -1) 192 | goto On_Error; 193 | } 194 | 195 | utarray_free(stack); 196 | return set; 197 | 198 | On_Error: 199 | PySet_Clear(set); 200 | Py_DECREF(set); 201 | utarray_free(stack); 202 | return NULL; 203 | } 204 | 205 | 206 | static PyObject * 207 | lcsubstrings_py_make_tuple(PyObject *arg1, PyObject *arg2, UT_array *stack, Py_ssize_t mlen) 208 | { 209 | PyObject *tp, *stp; 210 | Py_ssize_t i; 211 | struct pair_t *pair; 212 | 213 | if ((stp = PyTuple_New(utarray_len(stack))) == NULL) { 214 | utarray_free(stack); 215 | return NULL; 216 | } 217 | for (i = 0, pair = (struct pair_t*)utarray_front(stack); 218 | pair != NULL; 219 | ++i, pair = (struct pair_t*)utarray_next(stack, pair)) { 220 | PyTuple_SET_ITEM(stp, i, Py_BuildValue("(nn)", pair->i - mlen + 1, pair->j - mlen + 1)); 221 | } 222 | if ((tp = PyTuple_New(2)) == NULL) { 223 | utarray_free(stack); 224 | Py_DECREF(stp); 225 | 
return NULL; 226 | } 227 | PyTuple_SET_ITEM(tp, 0, Py_BuildValue("n", mlen)); 228 | PyTuple_SET_ITEM(tp, 1, stp); 229 | 230 | utarray_free(stack); 231 | 232 | return tp; 233 | } 234 | 235 | 236 | static PyObject * 237 | lcsubstrings_py(PyObject *self, PyObject *args, PyObject *kwargs) 238 | { 239 | PyObject *arg1, *arg2, *opos = NULL; 240 | int positions = 0; 241 | static char *keywords[] = {"seq1", "seq2", "positions", NULL}; 242 | 243 | char type; 244 | sequence seq1, seq2; 245 | Py_ssize_t len1, len2; 246 | UT_array *stack; 247 | Py_ssize_t mlen = -1; 248 | 249 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, 250 | "OO|O:lcsubstrings", keywords, &arg1, &arg2, &opos)) 251 | return NULL; 252 | if (opos && (positions = PyObject_IsTrue(opos)) == -1) 253 | return NULL; 254 | 255 | if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0') 256 | return NULL; 257 | 258 | // special case 259 | if (type == 'a' && (!positions) && (PyList_Check(arg1) || PyList_Check(arg2))) { 260 | Py_DECREF(seq1.a); 261 | Py_DECREF(seq2.a); 262 | PyErr_SetString(PyExc_TypeError, "can't hash lists, pass in tuples instead"); 263 | return NULL; 264 | } 265 | 266 | if (len1 < len2) { 267 | SWAP(PyObject *, arg1, arg2); 268 | SWAP(sequence, seq1, seq2); 269 | SWAP(Py_ssize_t, len1, len2); 270 | } 271 | 272 | switch(type) { 273 | case 'u': 274 | stack = ulcsubstrings(seq1.u, seq2.u, len1, len2, &mlen); 275 | break; 276 | case 'b': 277 | stack = blcsubstrings(seq1.b, seq2.b, len1, len2, &mlen); 278 | break; 279 | default: 280 | stack = alcsubstrings(seq1.a, seq2.a, len1, len2, &mlen); 281 | Py_DECREF(seq1.a); 282 | Py_DECREF(seq2.a); 283 | } 284 | 285 | if (stack == NULL) { 286 | /* memory allocation failed */ 287 | return PyErr_NoMemory(); 288 | } 289 | 290 | if (positions) 291 | return lcsubstrings_py_make_tuple(arg1, arg2, stack, mlen); 292 | return lcsubstrings_py_make_set(arg1, arg2, stack, mlen); 293 | } 294 | 295 | 296 | static PyObject * 297 | nlevenshtein_py(PyObject 
*self, PyObject *args, PyObject *kwargs) 298 | { 299 | PyObject *arg1, *arg2; 300 | short method = 1; 301 | static char *keywords[] = {"seq1", "seq2", "method", NULL}; 302 | 303 | char type; 304 | sequence seq1, seq2; 305 | Py_ssize_t len1, len2; 306 | double dist; 307 | 308 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, 309 | "OO|h:nlevenshtein", keywords, &arg1, &arg2, &method)) 310 | return NULL; 311 | 312 | if (method != 1 && method != 2) { 313 | PyErr_SetString(PyExc_ValueError, "expected either 1 or 2 for `method` parameter"); 314 | return NULL; 315 | } 316 | 317 | if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0') 318 | return NULL; 319 | 320 | if (len1 < len2) { 321 | SWAP(sequence, seq1, seq2); 322 | SWAP(Py_ssize_t, len1, len2); 323 | } 324 | 325 | switch(type) { 326 | case 'u': 327 | dist = unlevenshtein(seq1.u, seq2.u, len1, len2, method); 328 | break; 329 | case 'b': 330 | dist = bnlevenshtein(seq1.b, seq2.b, len1, len2, method); 331 | break; 332 | default: 333 | dist = anlevenshtein(seq1.a, seq2.a, len1, len2, method); 334 | Py_DECREF(seq1.a); 335 | Py_DECREF(seq2.a); 336 | } 337 | 338 | if (dist < 0) { 339 | if (dist == -1) // memory allocation failed 340 | return PyErr_NoMemory(); 341 | return NULL; // comparison failed 342 | } 343 | 344 | return Py_BuildValue("d", dist); 345 | } 346 | 347 | 348 | static PyObject * 349 | levenshtein_py(PyObject *self, PyObject *args, PyObject *kwargs) 350 | { 351 | PyObject *arg1, *arg2, *onorm = NULL; 352 | Py_ssize_t dist = -1; 353 | Py_ssize_t max_dist = -1; 354 | int normalized = 0; 355 | static char *keywords[] = {"seq1", "seq2", "normalized", "max_dist", NULL}; 356 | 357 | char type; 358 | sequence seq1, seq2; 359 | Py_ssize_t len1, len2; 360 | 361 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, 362 | "OO|On:levenshtein", keywords, &arg1, &arg2, &onorm, &max_dist)) 363 | return NULL; 364 | if (onorm && (normalized = PyObject_IsTrue(onorm)) == -1) 365 | return NULL; 366 | 367 | if 
(normalized) { 368 | onorm = NULL; 369 | return nlevenshtein_py(self, args, onorm); 370 | } 371 | 372 | if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0') 373 | return NULL; 374 | 375 | switch(type) { 376 | case 'u': 377 | dist = ulevenshtein(seq1.u, seq2.u, len1, len2, max_dist); 378 | break; 379 | case 'b': 380 | dist = blevenshtein(seq1.b, seq2.b, len1, len2, max_dist); 381 | break; 382 | default: 383 | dist = alevenshtein(seq1.a, seq2.a, len1, len2, max_dist); 384 | Py_DECREF(seq1.a); 385 | Py_DECREF(seq2.a); 386 | } 387 | 388 | if (dist < -1) { 389 | if (dist == -2) 390 | return PyErr_NoMemory(); // memory allocation failed 391 | return NULL; // comparison failed 392 | } 393 | return Py_BuildValue("n", dist); 394 | } 395 | 396 | 397 | static PyObject * 398 | fastcomp_py(PyObject *self, PyObject *args, PyObject *kwargs) 399 | { 400 | PyObject *arg1, *arg2, *otr = NULL; 401 | int transpositions = 0; 402 | static char *keywords[] = {"seq1", "seq2", "transpositions", NULL}; 403 | 404 | char type; 405 | sequence seq1, seq2; 406 | Py_ssize_t len1, len2; 407 | short dist; 408 | 409 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|O:fast_comp", 410 | keywords, &arg1, &arg2, &transpositions)) 411 | return NULL; 412 | if (otr && (transpositions = PyObject_IsTrue(otr)) == -1) 413 | return NULL; 414 | 415 | if ((type = get_sequences(arg1, arg2, &seq1, &seq2, &len1, &len2)) == '\0') 416 | return NULL; 417 | 418 | if (len1 < len2) { 419 | SWAP(sequence, seq1, seq2); 420 | SWAP(Py_ssize_t, len1, len2); 421 | } 422 | 423 | switch(type) { 424 | case 'u': 425 | dist = ufastcomp(seq1.u, seq2.u, len1, len2, transpositions); 426 | break; 427 | case 'b': 428 | dist = bfastcomp(seq1.b, seq2.b, len1, len2, transpositions); 429 | break; 430 | default: 431 | dist = afastcomp(seq1.a, seq2.a, len1, len2, transpositions); 432 | Py_DECREF(seq1.a); 433 | Py_DECREF(seq2.a); 434 | } 435 | 436 | if (dist == -2) // comparison failed 437 | return NULL; 438 | 439 | 
return Py_BuildValue("h", dist); 440 | } 441 | 442 | 443 | 444 | // Iterators (for levenshtein and fastcomp). They share the same structure. 445 | 446 | typedef struct { 447 | PyObject_HEAD 448 | PyObject *itor; 449 | char seqtype; // type of the sequence ('u', 'b', 'a') 450 | sequence seq1; // the sequence itself 451 | Py_ssize_t len1; // its length 452 | PyObject *object; // the corresponding pyobject 453 | int transpos; // only valable for fastcomp 454 | Py_ssize_t max_dist; // only for levenshtein 455 | } ItorState; 456 | 457 | 458 | static void itor_dealloc(ItorState *state) 459 | { 460 | // we got two references for tuples and lists, one for the original python object, 461 | // and one returned by `PySequence_fast` 462 | if (state->seqtype == 'a') 463 | Py_XDECREF(state->seq1.a); 464 | Py_XDECREF(state->object); 465 | Py_XDECREF(state->itor); 466 | Py_TYPE(state)->tp_free(state); 467 | } 468 | 469 | 470 | static PyObject * 471 | ifastcomp_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) 472 | { 473 | PyObject *arg1, *arg2, *itor; 474 | int transpositions = 0; 475 | static char *keywords[] = {"seq1", "seqs", "transpositions", NULL}; 476 | 477 | char seqtype; 478 | sequence seq1; 479 | Py_ssize_t len1; 480 | 481 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OO|O:ifast_comp", 482 | keywords, &arg1, &arg2, &transpositions)) 483 | return NULL; 484 | if (otr && (transpositions = PyObject_IsTrue(otr)) == -1) 485 | return NULL; 486 | 487 | if ((seqtype = get_sequence(arg1, &seq1, &len1, '\0')) == '\0') 488 | return NULL; 489 | 490 | if ((itor = PyObject_GetIter(arg2)) == NULL) { 491 | PyErr_SetString(PyExc_ValueError, "expected an iterable as second argument"); 492 | return NULL; 493 | } 494 | 495 | ItorState *state = (ItorState *)type->tp_alloc(type, 0); 496 | if (state == NULL) { 497 | Py_DECREF(itor); 498 | return NULL; 499 | } 500 | 501 | Py_INCREF(arg1); 502 | 503 | state->itor = itor; 504 | state->seqtype = seqtype; 505 | state->seq1 = seq1; 506 | 
state->object = arg1; 507 | state->len1 = len1; 508 | state->transpos = transpositions; 509 | 510 | return (PyObject *)state; 511 | } 512 | 513 | 514 | static PyObject * 515 | ilevenshtein_new(PyTypeObject *type, PyObject *args, PyObject *kwargs) 516 | { 517 | PyObject *arg1, *arg2, *itor; 518 | Py_ssize_t max_dist = -1; 519 | static char *keywords[] = {"seq1", "seqs", "max_dist", NULL}; 520 | 521 | char seqtype; 522 | sequence seq1; 523 | Py_ssize_t len1; 524 | 525 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, 526 | "OO|n:ilevenshtein", keywords, &arg1, &arg2, &max_dist)) 527 | return NULL; 528 | 529 | if ((seqtype = get_sequence(arg1, &seq1, &len1, '\0')) == '\0') 530 | return NULL; 531 | 532 | if ((itor = PyObject_GetIter(arg2)) == NULL) { 533 | PyErr_SetString(PyExc_ValueError, "expected an iterable as second argument"); 534 | return NULL; 535 | } 536 | 537 | ItorState *state = (ItorState *)type->tp_alloc(type, 0); 538 | if (state == NULL) { 539 | Py_DECREF(itor); 540 | return NULL; 541 | } 542 | 543 | Py_INCREF(arg1); 544 | 545 | state->itor = itor; 546 | state->seqtype = seqtype; 547 | state->seq1 = seq1; 548 | state->object = arg1; 549 | state->len1 = len1; 550 | state->max_dist = max_dist; 551 | 552 | return (PyObject *)state; 553 | } 554 | 555 | 556 | static PyObject * 557 | ilevenshtein_next(ItorState *state) 558 | { 559 | PyObject *arg2; 560 | sequence seq1, seq2; 561 | Py_ssize_t len2; 562 | 563 | Py_ssize_t dist = -1; 564 | PyObject *rv; 565 | 566 | seq1 = state->seq1; 567 | 568 | while ((arg2 = PyIter_Next(state->itor)) != NULL) { 569 | 570 | if (get_sequence(arg2, &seq2, &len2, state->seqtype) == '\0') { 571 | Py_DECREF(arg2); 572 | return NULL; 573 | } 574 | switch(state->seqtype) { 575 | case 'u': 576 | dist = ulevenshtein(seq1.u, seq2.u, state->len1, len2, state->max_dist); 577 | break; 578 | case 'b': 579 | dist = blevenshtein(seq1.b, seq2.b, state->len1, len2, state->max_dist); 580 | break; 581 | default: 582 | dist = alevenshtein(seq1.a, 
seq2.a, state->len1, len2, state->max_dist); 583 | Py_DECREF(seq2.a); 584 | } 585 | if (dist < -1) { 586 | Py_DECREF(arg2); 587 | if (dist == -2) 588 | return PyErr_NoMemory(); // memory allocation failed 589 | return NULL; // comparison failed 590 | } 591 | if (dist != -1) { 592 | rv = Py_BuildValue("(nO)", dist, arg2); 593 | Py_DECREF(arg2); 594 | return rv; 595 | } 596 | Py_DECREF(arg2); 597 | } 598 | 599 | return NULL; 600 | } 601 | 602 | 603 | static PyObject * 604 | ifastcomp_next(ItorState *state) 605 | { 606 | PyObject *arg2; 607 | sequence seq1, seq2; 608 | Py_ssize_t len2; 609 | 610 | short dist = -1; 611 | PyObject *rv; 612 | 613 | seq1 = state->seq1; 614 | 615 | while ((arg2 = PyIter_Next(state->itor)) != NULL) { 616 | 617 | if (get_sequence(arg2, &seq2, &len2, state->seqtype) == '\0') { 618 | Py_DECREF(arg2); 619 | return NULL; 620 | } 621 | switch(state->seqtype) { 622 | case 'u': 623 | dist = ufastcomp(seq1.u, seq2.u, state->len1, len2, state->transpos); 624 | break; 625 | case 'b': 626 | dist = bfastcomp(seq1.b, seq2.b, state->len1, len2, state->transpos); 627 | break; 628 | default: 629 | dist = afastcomp(seq1.a, seq2.a, state->len1, len2, state->transpos); 630 | Py_DECREF(seq2.a); 631 | } 632 | if (dist == -2) { // comparison failed 633 | Py_DECREF(arg2); 634 | return NULL; 635 | } 636 | if (dist != -1) { 637 | rv = Py_BuildValue("(hO)", dist, arg2); 638 | Py_DECREF(arg2); 639 | return rv; 640 | } 641 | Py_DECREF(arg2); 642 | } 643 | 644 | return NULL; 645 | } 646 | 647 | 648 | PyTypeObject IFastComp_Type = { 649 | PyVarObject_HEAD_INIT(&PyType_Type, 0) 650 | "distance.ifast_comp", /* tp_name */ 651 | sizeof(ItorState), /* tp_basicsize */ 652 | 0, /* tp_itemsize */ 653 | (destructor)itor_dealloc, /* tp_dealloc */ 654 | 0, /* tp_print */ 655 | 0, /* tp_getattr */ 656 | 0, /* tp_setattr */ 657 | 0, /* tp_reserved */ 658 | 0, /* tp_repr */ 659 | 0, /* tp_as_number */ 660 | 0, /* tp_as_sequence */ 661 | 0, /* tp_as_mapping */ 662 | 0, /* tp_hash */ 
	0,                              /* tp_call */
	0,                              /* tp_str */
	0,                              /* tp_getattro */
	0,                              /* tp_setattro */
	0,                              /* tp_as_buffer */
	Py_TPFLAGS_DEFAULT,             /* tp_flags */
	ifast_comp_doc,                 /* tp_doc */
	0,                              /* tp_traverse */
	0,                              /* tp_clear */
	0,                              /* tp_richcompare */
	0,                              /* tp_weaklistoffset */
	PyObject_SelfIter,              /* tp_iter */
	(iternextfunc)ifastcomp_next,   /* tp_iternext */
	0,                              /* tp_methods */
	0,                              /* tp_members */
	0,                              /* tp_getset */
	0,                              /* tp_base */
	0,                              /* tp_dict */
	0,                              /* tp_descr_get */
	0,                              /* tp_descr_set */
	0,                              /* tp_dictoffset */
	0,                              /* tp_init */
	PyType_GenericAlloc,            /* tp_alloc */
	ifastcomp_new,                  /* tp_new */
};


// Python type object for the `ilevenshtein` iterator; same layout as
// IFastComp_Type, but with the levenshtein constructor and iternext slots.
PyTypeObject ILevenshtein_Type = {
	PyVarObject_HEAD_INIT(&PyType_Type, 0)
	"distance.ilevenshtein",        /* tp_name */
	sizeof(ItorState),              /* tp_basicsize */
	0,                              /* tp_itemsize */
	(destructor)itor_dealloc,       /* tp_dealloc */
	0,                              /* tp_print */
	0,                              /* tp_getattr */
	0,                              /* tp_setattr */
	0,                              /* tp_reserved */
	0,                              /* tp_repr */
	0,                              /* tp_as_number */
	0,                              /* tp_as_sequence */
	0,                              /* tp_as_mapping */
	0,                              /* tp_hash */
	0,                              /* tp_call */
	0,                              /* tp_str */
	0,                              /* tp_getattro */
	0,                              /* tp_setattro */
	0,                              /* tp_as_buffer */
	Py_TPFLAGS_DEFAULT,             /* tp_flags */
	ilevenshtein_doc,               /* tp_doc */
	0,                              /* tp_traverse */
	0,                              /* tp_clear */
	0,                              /* tp_richcompare */
	0,                              /* tp_weaklistoffset */
	PyObject_SelfIter,              /* tp_iter */
	(iternextfunc)ilevenshtein_next, /* tp_iternext */
	0,                              /* tp_methods */
	0,                              /* tp_members */
	0,                              /* tp_getset */
	0,                              /* tp_base */
	0,                              /* tp_dict */
	0,                              /* tp_descr_get */
	0,                              /* tp_descr_set */
	0,                              /* tp_dictoffset */
	0,                              /* tp_init */
	PyType_GenericAlloc,            /* tp_alloc */
	ilevenshtein_new,               /* tp_new */
};


// Module-level functions exported by the C extension.
static PyMethodDef
CDistanceMethods[] = {
	{"hamming", (PyCFunction)hamming_py, METH_VARARGS | METH_KEYWORDS, hamming_doc},
	{"levenshtein", (PyCFunction)levenshtein_py, METH_VARARGS | METH_KEYWORDS, levenshtein_doc},
	{"nlevenshtein", (PyCFunction)nlevenshtein_py, METH_VARARGS | METH_KEYWORDS, nlevenshtein_doc},
	{"lcsubstrings", (PyCFunction)lcsubstrings_py, METH_VARARGS | METH_KEYWORDS, lcsubstrings_doc},
	{"fast_comp", (PyCFunction)fastcomp_py, METH_VARARGS | METH_KEYWORDS, fast_comp_doc},
	{NULL, NULL, 0, NULL}   // sentinel
};


#if PY_MAJOR_VERSION >= 3
static struct PyModuleDef cdistancemodule = {
	PyModuleDef_HEAD_INIT, "cdistance", NULL, -1, CDistanceMethods
};
#endif

// Module initialisation, for both python 2 and python 3: create the module,
// finalise the two iterator types, and register them under the module.
#if PY_MAJOR_VERSION >= 3
PyMODINIT_FUNC PyInit_cdistance(void)
#else
PyMODINIT_FUNC initcdistance(void)
#endif
{
	PyObject *module;

#if PY_MAJOR_VERSION >= 3
	if ((module = PyModule_Create(&cdistancemodule)) == NULL)
		return NULL;
#else
	if ((module = Py_InitModule("cdistance", CDistanceMethods)) == NULL)
		return;
#endif

	if (PyType_Ready(&IFastComp_Type) != 0 || PyType_Ready(&ILevenshtein_Type) != 0)
#if PY_MAJOR_VERSION >= 3
		return NULL;
#else
		return;
#endif

	// PyModule_AddObject steals a reference, so take one per (static) type
	// object before handing it over.
	Py_INCREF((PyObject *)&IFastComp_Type);
	Py_INCREF((PyObject *)&ILevenshtein_Type);

	PyModule_AddObject(module, "ifast_comp", (PyObject *)&IFastComp_Type);
	PyModule_AddObject(module, "ilevenshtein", (PyObject *)&ILevenshtein_Type);

#if PY_MAJOR_VERSION >= 3
	return module;
#endif
}