├── .gitignore ├── LICENSE ├── README.md ├── canopy.py ├── canopy_1.png └── useCanopy.py /.gitignore: -------------------------------------------------------------------------------- 1 | /____pycache____/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Alan Lau 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
# -*- coding: utf-8 -*-
# @Author: Alan Lau
# @Date:   2017-09-05 22:56:16
# @Last Modified by:   Alan Lau
# @Last Modified time: 2017-09-05 22:56:16
"""Canopy clustering over a NumPy point set, plus an optional matplotlib plot."""

import math
import random
from datetime import datetime
from pprint import pprint as p

import numpy as np

try:
    import matplotlib.pyplot as plt
except ImportError:
    # Plotting is optional: clustering itself only needs numpy, so importing
    # ``Canopy`` from this module must not fail when matplotlib is absent.
    plt = None

# Randomly generate 500 two-dimensional points in the [0, 1) plane.
dataset = np.random.rand(500, 2)


class Canopy:
    """Group points into (possibly overlapping) canopies using two radii.

    ``t1`` is the loose radius: every remaining point closer than ``t1`` to a
    center is listed as a member of that center's canopy.  ``t2`` is the tight
    radius: members closer than ``t2`` are removed from the pool, so they can
    never become centers themselves.
    """

    def __init__(self, dataset):
        """Store the points to cluster; thresholds start unset (both 0)."""
        self.dataset = dataset
        self.t1 = 0
        self.t2 = 0

    def setThreshold(self, t1, t2):
        """Set the loose (``t1``) and tight (``t2``) radii.

        The values are only stored when ``t1 > t2``; otherwise a message is
        printed and the previous thresholds are left unchanged.
        """
        if t1 > t2:
            self.t1 = t1
            self.t2 = t2
        else:
            print('t1 needs to be larger than t2!')

    def euclideanDistance(self, vec1, vec2):
        """Return the Euclidean distance between two vectors as a float."""
        # asarray lets plain lists/tuples be passed as well as ndarrays.
        diff = np.asarray(vec1) - np.asarray(vec2)
        return math.sqrt((diff ** 2).sum())

    def getRandIndex(self):
        """Return a random valid row index into the current ``self.dataset``."""
        return random.randint(0, len(self.dataset) - 1)

    def clustering(self):
        """Run canopy clustering and return the canopies found.

        Returns:
            A list of ``(center, members)`` tuples, where ``center`` is one
            data point and ``members`` is a list of the points closer than
            ``t1`` to it (the center itself is not included).  Returns
            ``None`` after printing a message when no threshold was set.

        ``self.dataset`` is used as the shrinking working pool while the
        method runs and is restored afterwards, so the method can be called
        repeatedly on the same instance.
        """
        if self.t1 == 0:
            print('Please set the threshold.')
            return None

        source = self.dataset
        canopies = []  # final (center, members) results
        try:
            self.dataset = np.asarray(source)
            # Loop until the pool is empty so that the last remaining point
            # also becomes a canopy center instead of being dropped.
            while len(self.dataset) > 0:
                rand_index = self.getRandIndex()
                # Pick a random point P as the center and take it out of the
                # pool (np.delete returns a new array; the input is untouched).
                current_center = self.dataset[rand_index]
                self.dataset = np.delete(self.dataset, rand_index, 0)

                # Distance from P to every point still in the pool.
                distances = np.array(
                    [self.euclideanDistance(current_center, datum)
                     for datum in self.dataset],
                    dtype=float)

                within_t1 = distances < self.t1
                # A point is removed only if it is inside both radii.
                within_t2 = within_t1 & (distances < self.t2)

                # Points closer than t1 join P's canopy.
                current_center_list = list(self.dataset[within_t1])
                # Boolean masks behave the same whether or not anything has
                # to be removed, unlike an index list that may be empty.
                self.dataset = self.dataset[~within_t2]

                canopies.append((current_center, current_center_list))
        finally:
            # Hand the caller's data back untouched.
            self.dataset = source
        return canopies


def showCanopy(canopies, dataset, t1, t2):
    """Plot every canopy: its center, its members and its t1/t2 circles.

    Args:
        canopies: list of ``(center, members)`` tuples from
            ``Canopy.clustering``; points are indexed as ``[0]`` (x), ``[1]`` (y).
        dataset: the full point set, used only to choose the axis limits.
        t1: loose radius, drawn as an unfilled circle outline.
        t2: tight radius, drawn as a translucent filled disc.

    Raises:
        ImportError: if matplotlib is not installed.
    """
    if plt is None:
        raise ImportError('matplotlib is required for showCanopy')

    fig = plt.figure()
    sc = fig.add_subplot(111)
    colors = [
        'brown', 'green', 'blue', 'y', 'r', 'tan', 'dodgerblue', 'deeppink',
        'orangered', 'peru', 'blue', 'y', 'r', 'gold', 'dimgray', 'darkorange',
        'peru', 'blue', 'y', 'r', 'cyan', 'tan', 'orchid', 'peru', 'blue', 'y',
        'r', 'sienna'
    ]
    markers = [
        '*', 'h', 'H', '+', 'o', '1', '2', '3', ',', 'v', 'H', '+', '1', '2',
        '^', '<', '>', '.', '4', 'H', '+', '1', '2', 's', 'p', 'x', 'D', 'd',
        '|', '_'
    ]
    for i, (center, components) in enumerate(canopies):
        # Wrap around the style tables so that having more canopies than
        # styles reuses styles instead of raising IndexError.
        marker = markers[i % len(markers)]
        color = colors[i % len(colors)]
        sc.plot(
            center[0],
            center[1],
            marker=marker,
            color=color,
            markersize=10)
        t1_circle = plt.Circle(
            xy=(center[0], center[1]),
            radius=t1,
            color='dodgerblue',
            fill=False)
        t2_circle = plt.Circle(
            xy=(center[0], center[1]), radius=t2, color='skyblue', alpha=0.2)
        sc.add_artist(t1_circle)
        sc.add_artist(t2_circle)
        for component in components:
            sc.plot(
                component[0],
                component[1],
                marker=marker,
                color=color,
                markersize=1.5)
    # One global min/max is applied to both axes, padded by t1 so the outer
    # circles are not clipped.
    maxvalue = np.amax(dataset)
    minvalue = np.amin(dataset)
    plt.xlim(minvalue - t1, maxvalue + t1)
    plt.ylim(minvalue - t1, maxvalue + t1)
    plt.show()


def main():
    """Cluster the module-level demo ``dataset`` and display the result."""
    t1 = 0.6
    t2 = 0.4
    gc = Canopy(dataset)
    gc.setThreshold(t1, t2)
    canopies = gc.clustering()
    print('Get %s initial centers.' % len(canopies))
    showCanopy(canopies, dataset, t1, t2)


if __name__ == '__main__':
    # Report the wall-clock time of the whole demo run.
    t_s = datetime.now()
    main()
    t_e = datetime.now()
    usedtime = t_e - t_s
    print('[%s]' % usedtime)