├── .gitignore ├── README.md ├── distance.py ├── kmeans.png ├── kmeans.py └── test.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | bin/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | eggs/ 16 | lib/ 17 | lib64/ 18 | parts/ 19 | sdist/ 20 | var/ 21 | *.egg-info/ 22 | .installed.cfg 23 | *.egg 24 | 25 | # Installer logs 26 | pip-log.txt 27 | pip-delete-this-directory.txt 28 | 29 | # Unit test / coverage reports 30 | htmlcov/ 31 | .tox/ 32 | .coverage 33 | .cache 34 | nosetests.xml 35 | coverage.xml 36 | 37 | # Translations 38 | *.mo 39 | 40 | # Mr Developer 41 | .mr.developer.cfg 42 | .project 43 | .pydevproject 44 | 45 | # Rope 46 | .ropeproject 47 | 48 | # Django stuff: 49 | *.log 50 | *.pot 51 | 52 | # Sphinx documentation 53 | docs/_build/ 54 | 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | clustering 2 | ========== 3 | 4 | 聚类分析 5 | -------------------------------------------------------------------------------- /distance.py: -------------------------------------------------------------------------------- 1 | from numpy import * 2 | 3 | 4 | def euclid(v1, v2): 5 | return sqrt(sum(power(v1 - v2, 2))) 6 | 7 | -------------------------------------------------------------------------------- /kmeans.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/darlinglele/clustering/5c9975426df267ccac1f727bc432ba5901a5caa9/kmeans.png -------------------------------------------------------------------------------- /kmeans.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pylab as plt 2 | from numpy import * 3 
from random import *
import distance


def kmeans(X, k, dist=distance.euclid):
    """Partition the rows of X into k clusters with iterative k-means.

    Centroids start at uniformly random positions inside the per-column
    [min, max] range of X. Assignment and centroid-update steps then
    alternate until no row changes its cluster label.

    Args:
        X: 2-D numpy matrix/array with one sample per row.
        k: number of clusters.
        dist: callable(row, centroid) returning a distance; its square is
            what gets stored in the result.

    Returns:
        (dist_label, centroids). dist_label is an (m, 2) array whose row i
        holds [squared distance to the assigned centroid, cluster index];
        centroids is a float matrix with one row per cluster.
    """
    m, n = shape(X)
    # Use the array methods for the column bounds so the result does not
    # depend on which min/max the star imports leave in scope.
    bounds = [(float(X[:, col].min()), float(X[:, col].max()))
              for col in range(n)]
    # uniform() produces floats, so the matrix gets a float dtype and the
    # means assigned below are stored without integer truncation. It also
    # accepts lo == hi (a constant column).
    centroids = mat([[uniform(lo, hi) for lo, hi in bounds]
                     for _ in range(k)])

    dist_label = zeros((m, 2))
    # Start from an impossible label so the first assignment pass always
    # registers a change and the centroids are updated at least once.
    dist_label[:, 1] = -1
    cluster_changed = True
    while cluster_changed:
        cluster_changed = False
        for i, x in enumerate(X):
            # Nearest centroid; on ties the lowest index wins.
            best_d, best_j = None, -1
            for j, c in enumerate(centroids):
                d = dist(x, c) ** 2
                if best_d is None or d < best_d:
                    best_d, best_j = d, j
            if best_j != dist_label[i, 1]:
                cluster_changed = True
            dist_label[i] = best_d, best_j
        for j in range(k):
            members = X[nonzero(dist_label[:, 1] == j)[0]]
            # A cluster with no members keeps its previous centroid.
            if len(members) != 0:
                centroids[j, :] = mean(members, axis=0)
    return dist_label, centroids


def bi_kmeans(X, k, dist=distance.euclid):
    """Cluster the rows of X into up to k groups by bisecting k-means.

    Starts with a single cluster holding every row, then repeatedly
    bisects (with kmeans, k=2) the cluster whose split gives the lowest
    total sum of squared errors, until k clusters exist.

    Args:
        X: 2-D numpy matrix/array with one sample per row.
        k: desired number of clusters.
        dist: callable(row, centroid) returning a distance; also forwarded
            to the inner kmeans calls.

    Returns:
        (dist_label, centroids). dist_label is an (m, 2) array whose row i
        holds [squared distance to the assigned centroid, cluster index];
        centroids is a list with one entry per cluster. Fewer than k
        entries are returned when no remaining cluster has at least two
        rows left to split.
    """
    m, n = shape(X)
    centroids = [[mean(X[:, col]) for col in range(n)]]
    dist_label = zeros((m, 2))
    for i, x in enumerate(X):
        dist_label[i] = dist(x, centroids[0]) ** 2, 0

    while len(centroids) < k:
        # Reset the running best every round; otherwise a stale split from
        # an earlier round is re-applied whenever no candidate beats it.
        best_sse = inf
        best_label = -1
        best_rows = None
        best_dist_label = None
        best_sub_centroids = None

        for label in range(len(centroids)):
            rows = nonzero(dist_label[:, 1] == label)[0]
            # A cluster with fewer than two rows cannot be bisected.
            if len(rows) < 2:
                continue

            sub_dist_label, sub_centroids = kmeans(X[rows], 2, dist)

            # Only column 0 holds squared distances; column 1 holds labels
            # and must not leak into the error total.
            other_rows = nonzero(dist_label[:, 1] != label)[0]
            sse = (dist_label[other_rows, 0].sum()
                   + sub_dist_label[:, 0].sum())

            # Remember the split with the lowest total error.
            if sse < best_sse:
                best_sse = sse
                best_label = label
                best_rows = rows
                best_dist_label = sub_dist_label
                best_sub_centroids = sub_centroids

        if best_label < 0:
            # Nothing left that can be split.
            break

        # The first half keeps the old label, the second gets a new one.
        centroids[best_label] = best_sub_centroids[0]
        centroids.append(best_sub_centroids[1])
        new_label = len(centroids) - 1

        # Write the split's distances and labels back to the affected rows.
        for row, (d, sub_label) in zip(best_rows, best_dist_label):
            dist_label[row, 0] = d
            if sub_label == 1:
                dist_label[row, 1] = new_label

    return dist_label, centroids
# /test.py: unit tests for distance.euclid, kmeans and bi_kmeans.
from numpy import *
import unittest
import distance
from kmeans import *

# Imported explicitly rather than relying on what `from kmeans import *`
# happens to re-export.
import matplotlib.pylab as plt
from random import randrange


def create_cluster(centroid, size, distance):
    """Return a list of `size` points scattered around `centroid`.

    Every coordinate of every point is the centroid's coordinate plus a
    random integer drawn from [-distance, distance).
    """
    dims = len(centroid)
    return [
        centroid + array([randrange(-distance, distance, 1)
                          for _ in range(dims)])
        for _ in range(size)
    ]


def create_clusters(centroids, size, distance):
    """Return a matrix stacking one generated cluster per centroid."""
    X = []
    for centroid in centroids:
        X.extend(create_cluster(array(centroid), size, distance))
    return mat(X)


class DistanceTest(unittest.TestCase):
    """Checks distance.euclid on arrays and matrices of several sizes."""

    def test_euclid_array(self):
        a = array((1, 2))
        b = array((3, 4))
        dist = distance.euclid(a, b)
        # Floating-point results are compared approximately.
        self.assertAlmostEqual(sqrt(8), dist)

    def test_euclid_matrix(self):
        a = mat([1, 2])
        b = mat([3, 4])
        dist = distance.euclid(a, b)
        self.assertAlmostEqual(sqrt(8), dist)

    def test_euclid_3dms(self):
        a = mat([1, 2, 3])
        b = mat([4, 5, 6])
        dist = distance.euclid(a, b)
        self.assertAlmostEqual(sqrt(27), dist)

    def test_euclid_1dms(self):
        a = mat([1])
        b = mat([4])
        dist = distance.euclid(a, b)
        self.assertAlmostEqual(3, dist)


class KMeansTest(unittest.TestCase):
    """Runs kmeans and bi_kmeans on synthetic clusters and plots them."""

    def test_kmeans(self):
        k = 4
        X = create_clusters([(20, 30), (20, 60), (30, 45), (40, 60)], 30, 8)
        dist_label, centroids = kmeans(X, k)
        # One [distance, label] row per sample.
        self.assertEqual(len(X), len(dist_label))
        icons = ['b_', 'b.', 'bo', 'b+', 'b*']

        for idx, l in enumerate(dist_label):
            label = int(l[1])
            self.assertTrue(0 <= label < k)
            plt.plot(X[idx, 0], X[idx, 1], icons[label])
        # plt.show() is intentionally not called in this test.

    def test_bi_kmeans(self):
        k = 6
        X = create_clusters(
            [(20, 30), (20, 60), (40, 60), (70, 60), (90, 60), (35, 45)],
            30, 8)
        dist_label, centroids = bi_kmeans(X, k)
        self.assertEqual(len(X), len(dist_label))
        icons = ['b_', 'b.', 'bo', 'b+', 'b*', 'r.', 'r*']
        # Function-call form prints the same text on Python 2 and 3.
        print(dist_label)
        for idx, l in enumerate(dist_label):
            label = int(l[1])
            self.assertTrue(0 <= label < k)
            plt.plot(X[idx, 0], X[idx, 1], icons[label])

        # Display the plotted points for visual inspection.
        plt.show()


if __name__ == '__main__':
    unittest.main()