def dominant_set(A, x=None, epsilon=1.0e-4, max_iter=None, verbose=True):
    """Compute the dominant set of the similarity matrix A with the
    replicator dynamics optimization approach. Convergence is reached
    when x changes less than epsilon.

    See: 'Dominant Sets and Pairwise Clustering', by Massimiliano
    Pavan and Marcello Pelillo, PAMI 2007.

    Parameters
    ----------
    A : square similarity matrix, dense or sparse; only ``A.shape`` and
        ``A.dot`` are used.
    x : optional starting vector. Defaults to the uniform vector
        ``1 / A.shape[0]``. The caller's array is never modified.
    epsilon : stop once the Euclidean norm of the change in x between
        two consecutive iterations is not greater than this value.
    max_iter : optional upper bound on the number of iterations; None
        (the default) iterates until convergence.
    verbose : when True (the default) print ``x.size`` and the last step
        size after every iteration.

    Returns
    -------
    The weighted characteristic vector x (entries sum to one after every
    completed iteration).
    """
    if x is None:
        x = np.ones(A.shape[0]) / float(A.shape[0])
    else:
        x = np.asarray(x, dtype=float)

    iteration = 0
    distance = epsilon * 2
    while distance > epsilon:
        if max_iter is not None and iteration >= max_iter:
            break

        # A.dot(x) works both for dense and sparse A (np.dot(A, x) only
        # works for dense A).
        x_new = x * A.dot(x)
        total = x_new.sum()
        if total == 0:
            # The update cannot be normalised (e.g. A is all zeros):
            # keep the last valid x instead of returning NaNs.
            break

        x_new = x_new / total
        # x_new is a fresh array, so x still holds the previous iterate
        # and no explicit copy is needed.
        distance = norm(x_new - x)
        x = x_new
        iteration += 1

        if verbose:
            # Single-argument form prints the same text on Python 2 and 3.
            print("%s %s" % (x.size, distance))

    return x
"""Dominant set clustering: iteratively find the dominant set and then
remove it from the dataset.
"""

import numpy as np
from dominant_set import dominant_set


def main():
    """Peel dominant sets off a synthetic blob dataset, plotting each one.

    Builds a Gaussian similarity matrix from squared Euclidean distances,
    then repeatedly calls ``dominant_set`` on the remaining points, plots
    the points above/below the median positive weight, and drops the
    points above it.
    """
    # Imported here so that importing this module does not require
    # scikit-learn or matplotlib.
    from sklearn.metrics import pairwise_distances
    from sklearn.datasets import make_blobs
    import matplotlib.pyplot as plt

    np.random.seed(1)

    n = 1000
    d = 2

    X, y = make_blobs(n_samples=n, n_features=d, centers=3)

    D = pairwise_distances(X, metric='sqeuclidean')

    # Kernel width: median of all pairwise squared distances.
    sigma2 = np.median(D)

    S = np.exp(-D / sigma2)

    if d == 2:
        plt.figure()
        for yi in np.unique(y):
            plt.plot(X[y == yi, 0], X[y == yi, 1], 'o')

        plt.title('Dataset')

    # S.size is the number of matrix entries, so this stops once roughly
    # three points are left.
    while S.size > 10:
        x = dominant_set(S, epsilon=2e-4)
        cutoff = np.median(x[x > 0])

        in_set = x > cutoff
        remaining = x <= cutoff

        if remaining.all():
            # No point lies above the cutoff (e.g. all positive weights
            # are equal): S would never shrink and the loop would spin
            # forever, so stop peeling here.
            break

        plt.figure()
        plt.plot(X[remaining, 0], X[remaining, 1], 'bo')
        plt.plot(X[in_set, 0], X[in_set, 1], 'ro')
        plt.title("Dominant set")

        # remove the dominant set
        S = S[remaining, :][:, remaining]
        X = X[remaining, :]

    plt.show()


if __name__ == '__main__':
    main()