├── README.md
├── 关联分析(Apriori)
│   ├── correlation_analysis.py
│   └── data.txt
├── 数据分类(决策树)
│   ├── data.txt
│   ├── data_generation.py
│   ├── decision_tree.py
│   └── tree.dot
└── 数据聚类(K-means)
    └── k-means.py
/README.md:
--------------------------------------------------------------------------------
# Data Mining Algorithms
1. [Apriori Association Analysis](#apriori-association-analysis)
2. [Decision Tree Classification](#decision-tree-classification)
3. [K-means Clustering](#k-means-clustering)




## Apriori Association Analysis
#### 1. [Data set](关联分析(Apriori)/data.txt)
The data set models supermarket transactions. The full set of items is
```bash
I = {bread, beer, cake, cream, milk, tea}
```
A single transaction looks like
```bash
Ti = {bread, beer, milk}
```
which is abbreviated to
```bash
Ti = {a, b, d}
```
A sample of the data.txt data set:
```bash
a, d, e,f
a, d, e
c, e
e, f
...
```

#### 2. [Algorithm implementation](关联分析(Apriori)/correlation_analysis.py)
The classic Apriori algorithm scans the transaction set pass by pass: it computes the *k-candidate set Ck*, then drops the itemsets whose **support** (the fraction of transactions that contain the itemset) falls below the threshold to obtain the *k-frequent set Lk*. Only itemsets up to the *3-frequent set* are computed; finally the confidence of each association rule is calculated.
> The k-th candidate set is generated only by joining itemsets from the (k-1)-frequent set, after which the transaction set is scanned to obtain the support of each itemset in Ck.
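
A minimal sketch of this join-and-prune step (simplified from correlation_analysis.py; it assumes `Lk_sub1` is a set of frozensets):
```python
def join_and_prune(Lk_sub1, k):
    """Join (k-1)-itemsets that share their first k-2 items, then prune
    candidates that have an infrequent (k-1)-subset."""
    Ck = set()
    items = list(Lk_sub1)
    for i in range(len(items)):
        for j in range(i + 1, len(items)):
            l1, l2 = sorted(items[i]), sorted(items[j])
            if l1[:k - 2] == l2[:k - 2]:  # join condition
                candidate = items[i] | items[j]
                # Apriori property: every (k-1)-subset must also be frequent
                if all(candidate - frozenset([x]) in Lk_sub1 for x in candidate):
                    Ck.add(candidate)
    return Ck

# e.g. L2 = {frozenset('ad'), frozenset('ae'), frozenset('de')}
# join_and_prune(L2, 3) -> {frozenset({'a', 'd', 'e'})}
```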

#### 3. Output






## Decision Tree Classification
#### 1. [Data set](数据分类(决策树)/data.txt)
Height and weight samples are classified into two classes, fat and thin. The data is self-generated, see [data_generation.py](数据分类(决策树)/data_generation.py); the generator is fairly crude.
A sample of the data set:
```bash
184 77 fat
189 81 fat
178 75 fat
...
```

#### 2. [Algorithm implementation](数据分类(决策树)/decision_tree.py)
The implementation simply calls the scikit-learn library:
```python
from sklearn import tree
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

...

# Split the data: 80% training, 20% test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Build and train a DecisionTreeClassifier
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf.fit(x_train, y_train)

...
```
After printing the results, the decision tree is also saved to [tree.dot](数据分类(决策树)/tree.dot); the `dot` command can then generate a graphic of the tree (or use an [online converter](http://www.webgraphviz.com/)).
```python
# Save the decision tree as a dot file for later rendering
with open("tree.dot", 'w') as f:
    f = tree.export_graphviz(clf, out_file=f)
```
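
For example, with Graphviz installed, the standard `dot` invocation renders the saved tree to an image:
```bash
dot -Tpng tree.dot -o tree.png
```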
#### 3. Output




Decision tree





## K-means Clustering
#### 1. Data set
The data set is the well-known iris point set from the scikit-learn library:
```python
from sklearn import datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target
```
A sample of the data set:
```bash
[1.5 0.2]
[3.2 0.2]
[3.1 0.2]
[4.6 0.2]
...
```

#### 2. [Algorithm implementation](数据聚类(K-means)/k-means.py)
The K-means algorithm requires the number of clusters k to be specified up front; the data samples carry only attributes, not class labels.
Rough steps (a minimal sketch follows the list):
1. Randomly select k samples from the data set X as the initial representative points of the clusters; each representative point stands for one cluster.
2. For every sample point in the data set, compute its distance to each of the k representative points (the Euclidean distance can be used for d) and assign the point to the nearest cluster, completing one clustering pass.
3. Once the data is partitioned, compute the mean of each cluster and take it as that cluster's new representative point, yielding k new representative points.
4. As in step 2, recompute each point's distance to the representative points and assign it to the closest cluster.
5. Repeat steps 3 and 4 until the clusters no longer change (the point assignments are fixed), i.e. the sum-of-squared-error criterion reaches its optimum.
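
A minimal NumPy sketch of these steps (simplified from [k-means.py](数据聚类(K-means)/k-means.py); fixed seed, empty clusters not handled):
```python
import numpy as np

def kmeans_sketch(data, k, max_iter=100, seed=0):
    rng = np.random.default_rng(seed)
    # step 1: pick k random samples as the initial centroids
    centroids = data[rng.choice(len(data), size=k, replace=False)]
    for _ in range(max_iter):
        # steps 2/4: assign each point to its nearest centroid (Euclidean distance)
        dists = np.linalg.norm(data[:, None, :] - centroids[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # step 3: move each centroid to the mean of its cluster
        new_centroids = np.array([data[labels == m].mean(axis=0) for m in range(k)])
        # step 5: stop once the centroids no longer move
        if np.allclose(new_centroids, centroids):
            break
        centroids = new_centroids
    return centroids, labels
```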

#### 3. Output




--------------------------------------------------------------------------------
/关联分析(Apriori)/correlation_analysis.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-
"""
Association analysis - the Apriori algorithm
"""

def load_data_set():
    """Load the data set, a list of transactions, from the external file data.txt."""
    data_set = []
    with open("data.txt", "r") as fd:
        for line in fd.readlines():
            line = line.strip('\n')
            if not line:
                continue
            # split on commas and strip stray spaces (the file mixes ", " and ",")
            data_set.append([item.strip() for item in line.split(',')])
    return data_set

def create_C1(data_set):
    """Build the 1-candidate set C1 directly from the data set."""
    C1 = set()
    for t in data_set:
        for item in t:
            item_set = frozenset([item])
            C1.add(item_set)
    return C1

def is_apriori(Ck_item, Lksub1):
    """Check the Apriori property: every (k-1)-subset of a candidate must be frequent."""
    for item in Ck_item:
        sub_Ck = Ck_item - frozenset([item])
        if sub_Ck not in Lksub1:
            return False
    return True

def create_Ck(Lksub1, k):
    """Generate the candidate set Ck by joining itemsets of the (k-1)-frequent set."""
    Ck = set()
    len_Lksub1 = len(Lksub1)
    list_Lksub1 = list(Lksub1)
    for i in range(len_Lksub1):
        for j in range(i + 1, len_Lksub1):
            l1 = list(list_Lksub1[i])
            l2 = list(list_Lksub1[j])
            l1.sort()
            l2.sort()
            # join two (k-1)-itemsets only if their first k-2 items agree
            if l1[0:k - 2] == l2[0:k - 2]:
                Ck_item = list_Lksub1[i] | list_Lksub1[j]
                if is_apriori(Ck_item, Lksub1):
                    Ck.add(Ck_item)
    return Ck

def generate_Lk_by_Ck(data_set, Ck, min_support, support_data):
    """Generate the frequent set Lk from the candidate set Ck by support counting."""
    Lk = set()
    item_count = {}
    for t in data_set:
        for item in Ck:
            if item.issubset(t):
                if item not in item_count:
                    item_count[item] = 1
                else:
                    item_count[item] += 1
    t_num = float(len(data_set))
    for item in item_count:
        if (item_count[item] / t_num) >= min_support:
            Lk.add(item)
            support_data[item] = item_count[item] / t_num
    return Lk

def generate_L(data_set, k, min_support):
    """Generate all frequent sets up to order k for the given minimum support."""
    support_data = {}
    C1 = create_C1(data_set)
    L1 = generate_Lk_by_Ck(data_set, C1, min_support, support_data)
    Lksub1 = L1.copy()
    L = []
    L.append(Lksub1)
    for i in range(2, k + 1):
        Ci = create_Ck(Lksub1, i)
        Li = generate_Lk_by_Ck(data_set, Ci, min_support, support_data)
        Lksub1 = Li.copy()
        L.append(Lksub1)
    return L, support_data

def generate_big_rules(L, support_data, min_conf):
    """Generate association rules from the frequent sets."""
    big_rule_list = []
    sub_set_list = []
    for i in range(0, len(L)):
        for freq_set in L[i]:
            for sub_set in sub_set_list:
                if sub_set.issubset(freq_set):
                    # conf(A => B) = sup(A u B) / sup(A)
                    conf = support_data[freq_set] / support_data[freq_set - sub_set]
                    big_rule = (freq_set - sub_set, sub_set, conf)
                    if conf >= min_conf and big_rule not in big_rule_list:
                        big_rule_list.append(big_rule)
            sub_set_list.append(freq_set)
    return big_rule_list

if __name__ == "__main__":
    data_set = load_data_set()
    L, support_data = generate_L(data_set, k=3, min_support=0.2)
    big_rules_list = generate_big_rules(L, support_data, min_conf=0.7)
    for Lk in L:
        print("=" * 50)
        print("frequent " + str(len(list(Lk)[0])) + "-itemsets\t\tsupport")
        print("=" * 50)
        for freq_set in Lk:
            print(freq_set, support_data[freq_set])
    print()
    print("Big Rules")
    for item in big_rules_list:
        print(item[0], "=>", item[1], "conf: ", item[2])
--------------------------------------------------------------------------------
/关联分析(Apriori)/data.txt:
--------------------------------------------------------------------------------
1 | a, d, e,f
2 | a, d, e
3 | c, e
4 | e, f
5 | a, c, e
6 | a, f
7 | b, e, f
8 | a, f
9 | a, d, e, f
10 | a, e, f
11 | a, d, e,f
12 | a, d, e
13 | c, e
14 | e, f
15 | a, c, e
16 | a, f
17 | b, e, f
18 | a, f
19 | a, d, e, f
20 | a, e, f
21 | a, d, e,f
22 | a, d, e
23 | c, e
24 | e, f
25 | a, c, e
26 | a, f
27 | b, e, f
28 | a, f
29 | a, d, e, f
30 | a, e, f
31 | a, d, e,f
32 | a, d, e
33 | c, e
34 | e, f
35 | a, c, e
36 | a, f
37 | b, e, f
38 | a, f
39 | a, d, e, f
40 | a, e, f
41 | a, d, e,f
42 | a, d, e
43 | c, e
44 | e, f
45 | a, c, e
46 | a, f
47 | b, e, f
48 | a, f
49 | a, d, e, f
50 | a, e, f
51 | a, d, e,f
52 | a, d, e
53 | c, e
54 | e, f
55 | a, c, e
56 | a, f
57 | b, e, f
58 | a, f
59 | a, d, e, f
60 | a, e, f
61 | a, d, e,f
62 | a, d, e
63 | c, e
64 | e, f
65 | a, c, e
66 | a, f
67 | b, e, f
68 | a, f
69 | a, d, e, f
70 | a, e, f
71 | a, d, e,f
72 | a, d, e
73 | c, e
74 | e, f
75 | a, c, e
76 | a, f
77 | b, e, f
78 | a, f
79 | a, d, e, f
80 | a, e, f
81 | a, d, e,f
82 | a, d, e
83 | c, e
84 | e, f
85 | a, c, e
86 | a, f
87 | b, e, f
88 | a, f
89 | a, d, e, f
90 | a, e, f
91 | a, d, e,f
92 | a, d, e
93 | c, e
94 | e, f
95 | a, c, e
96 | a, f
97 | b, e, f
98 | a, f
99 | a, d, e, f
100 | a, e, f
101 | a, d, e,f
102 | a, d, e
103 | c, e
104 | e, f
105 | a, c, e
106 | a, f
107 | b, e, f
108 | a, f
109 | a, d, e, f
110 | a, e, f
111 | a, d, e,f
112 | a, d, e
113 | c, e
114 | e, f
115 | a, c, e
116 | a, f
117 | b, e, f
118 | a, f
119 | a, d, e, f
120 | a, e, f
121 | a, d, e,f
122 | a, d, e
123 | c, e
124 | e, f
125 | a, c, e
126 | a, f
127 | b, e, f
128 | a, f
129 | a, d, e, f
130 | a, e, f
131 | a, d, e,f
132 | a, d, e
133 | c, e
134 | e, f
135 | a, c, e
136 | a, f
137 | b, e, f
138 | a, f
139 | a, d, e, f
140 | a, e, f
141 | a, d, e,f
142 | a, d, e
143 | c, e
144 | e, f
145 | a, c, e
146 | a, f
147 | b, e, f
148 | a, f
149 | a, d, e, f
150 | a, e, f
151 | a, d, e,f
152 | a, d, e
153 | c, e
154 | e, f
155 | a, c, e
156 | a, f
157 | b, e, f
158 | a, f
159 | a, d, e, f
160 | a, e, f
161 | a, d, e,f
162 | a, d, e
163 | c, e
164 | e, f
165 | a, c, e
166 | a, f
167 | b, e, f
168 | a, f
169 | a, d, e, f
170 | a, e, f
171 | a, d, e,f
172 | a, d, e
173 | c, e
174 | e, f
175 | a, c, e
176 | a, f
177 | b, e, f
178 | a, f
179 | a, d, e, f
180 | a, e, f
181 | a, d, e,f
182 | a, d, e
183 | c, e
184 | e, f
185 | a, c, e
186 | a, f
187 | b, e, f
188 | a, f
189 | a, d, e, f
190 | a, e, f
191 | a, d, e,f
192 | a, d, e
193 | c, e
194 | e, f
195 | a, c, e
196 | a, f
197 | b, e, f
198 | a, f
199 | a, d, e, f
200 | a, e, f
201 | a, d, e,f
202 | a, d, e
203 | c, e
204 | e, f
205 | a, c, e
206 | a, f
207 | b, e, f
208 | a, f
209 | a, d, e, f
210 | a, e, f
211 | a, d, e,f
212 | a, d, e
213 | c, e
214 | e, f
215 | a, c, e
216 | a, f
217 | b, e, f
218 | a, f
219 | a, d, e, f
220 | a, e, f
221 | a, d, e,f
222 | a, d, e
223 | c, e
224 | e, f
225 | a, c, e
226 | a, f
227 | b, e, f
228 | a, f
229 | a, d, e, f
230 | a, e, f
231 | a, d, e,f
232 | a, d, e
233 | c, e
234 | e, f
235 | a, c, e
236 | a, f
237 | b, e, f
238 | a, f
239 | a, d, e, f
240 | a, e, f
241 | a, d, e,f
242 | a, d, e
243 | c, e
244 | e, f
245 | a, c, e
246 | a, f
247 | b, e, f
248 | a, f
249 | a, d, e, f
250 | a, e, f
251 | a, d, e,f
252 | a, d, e
253 | c, e
254 | e, f
255 | a, c, e
256 | a, f
257 | b, e, f
258 | a, f
259 | a, d, e, f
260 | a, e, f
261 | a, d, e,f
262 | a, d, e
263 | c, e
264 | e, f
265 | a, c, e
266 | a, f
267 | b, e, f
268 | a, f
269 | a, d, e, f
270 | a, e, f
--------------------------------------------------------------------------------
/数据分类(决策树)/data.txt:
--------------------------------------------------------------------------------
1 | 184 77 fat
2 | 184 71 thin
3 | 189 81 fat
4 | 178 75 fat
5 | 160 53 thin
6 | 185 75 thin
7 | 180 76 fat
8 | 179 72 fat
9 | 188 83 fat
10 | 165 59 fat
11 | 170 62 thin
12 | 167 62 fat
13 | 177 70 fat
14 | 171 61 thin
15 | 162 54 thin
16 | 178 70 fat
17 | 177 66 thin
18 | 180 72 thin
19 | 164 59 fat
20 | 179 68 thin
21 | 170 63 fat
22 | 167 61 fat
23 | 175 71 fat
24 | 189 84 fat
25 | 174 65 thin
26 | 166 56 thin
27 | 162 52 thin
28 | 182 70 thin
29 | 177 65 thin
30 | 181 73 fat
31 | 176 68 thin
32 | 170 58 thin
33 | 162 58 fat
34 | 171 63 thin
35 | 162 56 thin
36 | 186 82 fat
37 | 189 80 fat
38 | 181 69 thin
39 | 171 68 fat
40 | 171 63 thin
41 | 172 60 thin
42 | 161 53 thin
43 | 188 74 thin
44 | 176 64 thin
45 | 181 69 thin
46 | 169 65 fat
47 | 164 60 fat
48 | 170 62 thin
49 | 169 57 thin
50 | 186 81 fat
51 | 160 53 thin
52 | 183 72 thin
53 | 162 55 thin
54 | 183 75 fat
55 | 183 70 thin
56 | 179 74 fat
57 | 175 63 thin
58 | 179 72 fat
59 | 185 72 thin
60 | 170 63 fat
61 | 180 74 fat
62 | 188 77 thin
63 | 175 63 thin
64 | 165 59 fat
65 | 183 73 thin
66 | 166 57 thin
67 | 160 56 fat
68 | 170 60 thin
69 | 168 62 fat
70 | 176 72 fat
71 | 171 59 thin
72 | 175 66 thin
73 | 167 57 thin
74 | 181 72 thin
75 | 183 74 thin
76 | 169 64 fat
77 | 180 71 thin
78 | 188 80 fat
79 | 165 62 fat
80 | 166 56 thin
81 | 170 65 fat
82 | 180 67 thin
83 | 188 78 thin
84 | 178 71 fat
85 | 163 55 thin
86 | 177 74 fat
87 | 189 80 thin
88 | 188 83 fat
89 | 176 72 fat
90 | 179 72 fat
91 | 174 62 thin
92 | 189 83 fat
93 | 170 67 fat
94 | 184 71 thin
95 | 160 56 fat
96 | 176 73 fat
97 | 172 69 fat
98 | 186 75 thin
99 | 172 67 fat
100 | 184 79 fat
101 | 176 66 thin
102 | 186 82 fat
103 | 174 69 fat
104 | 171 64 fat
105 | 162 51 thin
106 | 175 72 fat
107 | 183 71 thin
108 | 162 56 thin
109 | 166 59 thin
110 | 169 59 thin
111 | 186 82 fat
112 | 166 61 fat
113 | 162 59 fat
114 | 167 62 fat
115 | 178 66 thin
116 | 161 53 thin
117 | 166 61 fat
118 | 174 62 thin
119 | 188 79 thin
120 | 161 57 fat
121 | 170 58 thin
122 | 173 62 thin
123 | 168 65 fat
124 | 182 69 thin
125 | 174 64 thin
126 | 173 64 thin
127 | 160 52 thin
128 | 184 75 thin
129 | 189 77 thin
130 | 188 78 thin
131 | 161 56 fat
132 | 175 70 fat
133 | 182 76 fat
134 | 176 71 fat
135 | 184 79 fat
136 | 189 77 thin
137 | 173 66 fat
138 | 160 52 thin
139 | 163 58 fat
140 | 187 78 fat
141 | 161 51 thin
142 | 172 64 thin
143 | 187 83 fat
144 | 163 54 thin
145 | 185 74 thin
146 | 185 74 thin
147 | 189 77 thin
148 | 160 50 thin
149 | 165 54 thin
150 | 171 63 thin
151 | 184 79 fat
152 | 162 53 thin
153 | 164 55 thin
154 | 174 63 thin
155 | 171 64 fat
156 | 187 82 fat
157 | 184 77 fat
158 | 187 79 fat
159 | 163 59 fat
160 | 173 63 thin
161 | 164 60 fat
162 | 185 77 fat
163 | 183 77 fat
164 | 160 51 thin
165 | 164 60 fat
166 | 163 52 thin
167 | 161 60 fat
168 | 171 61 thin
169 | 187 80 fat
170 | 163 52 thin
171 | 187 79 fat
172 | 162 59 fat
173 | 180 68 thin
174 | 181 75 fat
175 | 167 62 fat
176 | 188 81 fat
177 | 160 51 thin
178 | 179 73 fat
179 | 168 62 fat
180 | 179 75 fat
181 | 172 63 thin
182 | 181 72 thin
183 | 180 72 fat
184 | 185 82 fat
185 | 175 68 fat
186 | 181 78 fat
187 | 189 78 thin
188 | 171 69 fat
189 | 173 66 fat
190 | 188 84 fat
191 |
--------------------------------------------------------------------------------
/数据分类(决策树)/data_generation.py:
--------------------------------------------------------------------------------
import random

# Generate 300 random (height, weight, label) samples.
with open('data_lab2.txt', 'w') as f:
    # f.write('height\tweight\n')
    for i in range(300):
        height = random.randint(1600, 1900) / 10
        weight = (height - 100) * 0.9 + random.randint(-50, 50) / 10
        # standard weight = 0.9 * height - 90
        flag = 0.9 * height - 90
        if flag < weight:
            p = "fat"
        else:
            p = "thin"
        f.write('%d %d %s\n' % (height, weight, p))

--------------------------------------------------------------------------------
/数据分类(决策树)/decision_tree.py:
--------------------------------------------------------------------------------
import numpy as np
import scipy as sp
import pydotplus
from sklearn import tree
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Read the data
data = []
labels = []
with open("data.txt") as ifile:
    for line in ifile:
        if not line.strip():  # skip blank lines
            continue
        tokens = line.strip().split(' ')
        data.append([float(tk) for tk in tokens[:-1]])
        labels.append(tokens[-1])
x = np.array(data)
labels = np.array(labels)
y = np.zeros(labels.shape)


# Encode the fat/thin labels numerically
y[labels == 'fat'] = 1
# Split the data: 80% training, 20% test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Build and train a DecisionTreeClassifier
clf = tree.DecisionTreeClassifier(criterion='entropy')
clf.fit(x_train, y_train)

# Results on the training set
answer = clf.predict(x_train)
print("Training samples:\n ", x_train)
print("Predicted labels: ", answer)
print("Actual labels: ", y_train)
print("Accuracy: ", np.mean(answer == y_train))

print("Feature importances (height, weight):\n", clf.feature_importances_)

# Save the decision tree as a dot file for later rendering
with open("tree.dot", 'w') as f:
    f = tree.export_graphviz(clf, out_file=f)

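# A minimal follow-up sketch (not in the original script): evaluate on the
# held-out 20% test split, using the classification_report import above.
answer_test = clf.predict(x_test)
print("Test accuracy: ", np.mean(answer_test == y_test))
# label encoding above: 0 = thin, 1 = fat
print(classification_report(y_test, answer_test, target_names=['thin', 'fat']))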
--------------------------------------------------------------------------------
/数据分类(决策树)/tree.dot:
--------------------------------------------------------------------------------
1 | digraph Tree {
2 | node [shape=box] ;
3 | 0 [label="X[1] <= 55.5\nentropy = 0.998\nsamples = 152\nvalue = [80, 72]"] ;
4 | 1 [label="entropy = 0.0\nsamples = 17\nvalue = [17, 0]"] ;
5 | 0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
6 | 2 [label="X[1] <= 80.5\nentropy = 0.997\nsamples = 135\nvalue = [63, 72]"] ;
7 | 0 -> 2 [labeldistance=2.5, labelangle=-45, headlabel="False"] ;
8 | 3 [label="X[0] <= 168.5\nentropy = 0.999\nsamples = 122\nvalue = [63, 59]"] ;
9 | 2 -> 3 ;
10 | 4 [label="X[1] <= 57.5\nentropy = 0.795\nsamples = 25\nvalue = [6, 19]"] ;
11 | 3 -> 4 ;
12 | 5 [label="X[0] <= 161.5\nentropy = 0.811\nsamples = 8\nvalue = [6, 2]"] ;
13 | 4 -> 5 ;
14 | 6 [label="entropy = 0.0\nsamples = 2\nvalue = [0, 2]"] ;
15 | 5 -> 6 ;
16 | 7 [label="entropy = 0.0\nsamples = 6\nvalue = [6, 0]"] ;
17 | 5 -> 7 ;
18 | 8 [label="entropy = 0.0\nsamples = 17\nvalue = [0, 17]"] ;
19 | 4 -> 8 ;
20 | 9 [label="X[1] <= 62.5\nentropy = 0.978\nsamples = 97\nvalue = [57, 40]"] ;
21 | 3 -> 9 ;
22 | 10 [label="entropy = 0.0\nsamples = 14\nvalue = [14, 0]"] ;
23 | 9 -> 10 ;
24 | 11 [label="X[0] <= 179.5\nentropy = 0.999\nsamples = 83\nvalue = [43, 40]"] ;
25 | 9 -> 11 ;
26 | 12 [label="X[1] <= 68.5\nentropy = 0.94\nsamples = 42\nvalue = [15, 27]"] ;
27 | 11 -> 12 ;
28 | 13 [label="X[0] <= 172.5\nentropy = 0.983\nsamples = 26\nvalue = [15, 11]"] ;
29 | 12 -> 13 ;
30 | 14 [label="X[0] <= 170.5\nentropy = 0.811\nsamples = 12\nvalue = [3, 9]"] ;
31 | 13 -> 14 ;
32 | 15 [label="entropy = 0.0\nsamples = 5\nvalue = [0, 5]"] ;
33 | 14 -> 15 ;
34 | 16 [label="X[1] <= 63.5\nentropy = 0.985\nsamples = 7\nvalue = [3, 4]"] ;
35 | 14 -> 16 ;
36 | 17 [label="entropy = 0.0\nsamples = 2\nvalue = [2, 0]"] ;
37 | 16 -> 17 ;
38 | 18 [label="X[0] <= 171.5\nentropy = 0.722\nsamples = 5\nvalue = [1, 4]"] ;
39 | 16 -> 18 ;
40 | 19 [label="entropy = 0.0\nsamples = 3\nvalue = [0, 3]"] ;
41 | 18 -> 19 ;
42 | 20 [label="X[1] <= 65.5\nentropy = 1.0\nsamples = 2\nvalue = [1, 1]"] ;
43 | 18 -> 20 ;
44 | 21 [label="entropy = 0.0\nsamples = 1\nvalue = [1, 0]"] ;
45 | 20 -> 21 ;
46 | 22 [label="entropy = 0.0\nsamples = 1\nvalue = [0, 1]"] ;
47 | 20 -> 22 ;
48 | 23 [label="X[1] <= 65.5\nentropy = 0.592\nsamples = 14\nvalue = [12, 2]"] ;
49 | 13 -> 23 ;
50 | 24 [label="entropy = 0.0\nsamples = 7\nvalue = [7, 0]"] ;
51 | 23 -> 24 ;
52 | 25 [label="X[0] <= 175.5\nentropy = 0.863\nsamples = 7\nvalue = [5, 2]"] ;
53 | 23 -> 25 ;
54 | 26 [label="entropy = 0.0\nsamples = 2\nvalue = [0, 2]"] ;
55 | 25 -> 26 ;
56 | 27 [label="entropy = 0.0\nsamples = 5\nvalue = [5, 0]"] ;
57 | 25 -> 27 ;
58 | 28 [label="entropy = 0.0\nsamples = 16\nvalue = [0, 16]"] ;
59 | 12 -> 28 ;
60 | 29 [label="X[1] <= 74.5\nentropy = 0.901\nsamples = 41\nvalue = [28, 13]"] ;
61 | 11 -> 29 ;
62 | 30 [label="X[0] <= 181.5\nentropy = 0.485\nsamples = 19\nvalue = [17, 2]"] ;
63 | 29 -> 30 ;
64 | 31 [label="X[1] <= 72.5\nentropy = 0.811\nsamples = 8\nvalue = [6, 2]"] ;
65 | 30 -> 31 ;
66 | 32 [label="X[1] <= 71.5\nentropy = 0.592\nsamples = 7\nvalue = [6, 1]"] ;
67 | 31 -> 32 ;
68 | 33 [label="entropy = 0.0\nsamples = 3\nvalue = [3, 0]"] ;
69 | 32 -> 33 ;
70 | 34 [label="X[0] <= 180.5\nentropy = 0.811\nsamples = 4\nvalue = [3, 1]"] ;
71 | 32 -> 34 ;
72 | 35 [label="entropy = 1.0\nsamples = 2\nvalue = [1, 1]"] ;
73 | 34 -> 35 ;
74 | 36 [label="entropy = 0.0\nsamples = 2\nvalue = [2, 0]"] ;
75 | 34 -> 36 ;
76 | 37 [label="entropy = 0.0\nsamples = 1\nvalue = [0, 1]"] ;
77 | 31 -> 37 ;
78 | 38 [label="entropy = 0.0\nsamples = 11\nvalue = [11, 0]"] ;
79 | 30 -> 38 ;
80 | 39 [label="X[0] <= 187.5\nentropy = 1.0\nsamples = 22\nvalue = [11, 11]"] ;
81 | 29 -> 39 ;
82 | 40 [label="X[1] <= 75.5\nentropy = 0.684\nsamples = 11\nvalue = [2, 9]"] ;
83 | 39 -> 40 ;
84 | 41 [label="X[0] <= 183.5\nentropy = 1.0\nsamples = 4\nvalue = [2, 2]"] ;
85 | 40 -> 41 ;
86 | 42 [label="entropy = 0.0\nsamples = 2\nvalue = [0, 2]"] ;
87 | 41 -> 42 ;
88 | 43 [label="entropy = 0.0\nsamples = 2\nvalue = [2, 0]"] ;
89 | 41 -> 43 ;
90 | 44 [label="entropy = 0.0\nsamples = 7\nvalue = [0, 7]"] ;
91 | 40 -> 44 ;
92 | 45 [label="X[1] <= 79.5\nentropy = 0.684\nsamples = 11\nvalue = [9, 2]"] ;
93 | 39 -> 45 ;
94 | 46 [label="entropy = 0.0\nsamples = 8\nvalue = [8, 0]"] ;
95 | 45 -> 46 ;
96 | 47 [label="X[0] <= 188.5\nentropy = 0.918\nsamples = 3\nvalue = [1, 2]"] ;
97 | 45 -> 47 ;
98 | 48 [label="entropy = 0.0\nsamples = 1\nvalue = [0, 1]"] ;
99 | 47 -> 48 ;
100 | 49 [label="entropy = 1.0\nsamples = 2\nvalue = [1, 1]"] ;
101 | 47 -> 49 ;
102 | 50 [label="entropy = 0.0\nsamples = 13\nvalue = [0, 13]"] ;
103 | 2 -> 50 ;
104 | }
--------------------------------------------------------------------------------
/数据聚类(K-means)/k-means.py:
--------------------------------------------------------------------------------
from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np

iris = datasets.load_iris()
X, y = iris.data, iris.target

# Keep only two dimensions for easy visualization
data = X[:, [1, 3]]

print(data)

plt.scatter(data[:, 0], data[:, 1])

ck = 3

def rand_center(data, k):
    """Randomly pick k initial representative points (centroids) within the range of the data set."""
    n = data.shape[1]  # features
    centroids = np.zeros((k, n))  # init with (0,0)....
    for i in range(n):
        dmin, dmax = np.min(data[:, i]), np.max(data[:, i])
        centroids[:, i] = dmin + (dmax - dmin) * np.random.rand(k)
    return centroids

# Initial centroids
centroids = rand_center(data, ck)
print(centroids)

def kmeans(data, k=2):
    def _distance(p1, p2):
        """
        Return the Euclidean distance between two points.
        p1 = np.array([0,0]), p2 = np.array([1,1]) => 1.414
        """
        tmp = np.sum((p1 - p2) ** 2)
        return np.sqrt(tmp)

    def _rand_center(data, k):
        """Generate k centers within the range of the data set."""
        n = data.shape[1]  # features
        centroids = np.zeros((k, n))  # init with (0,0)....
        for i in range(n):
            dmin, dmax = np.min(data[:, i]), np.max(data[:, i])
            centroids[:, i] = dmin + (dmax - dmin) * np.random.rand(k)
        return centroids

    def _converged(centroids1, centroids2):
        # if the centroids have not changed, we say 'converged'
        set1 = set([tuple(c) for c in centroids1])
        set2 = set([tuple(c) for c in centroids2])
        return (set1 == set2)

    n = data.shape[0]  # number of entries
    centroids = _rand_center(data, k)
    label = np.zeros(n, dtype=int)  # track the nearest centroid
    assessment = np.zeros(n)  # squared error of each point, to assess the model
    converged = False

    while not converged:
        old_centroids = np.copy(centroids)
        for i in range(n):
            # determine the nearest centroid and track it with label
            min_dist, min_index = np.inf, -1
            for j in range(k):
                dist = _distance(data[i], centroids[j])
                if dist < min_dist:
                    min_dist, min_index = dist, j
                    label[i] = j
            assessment[i] = _distance(data[i], centroids[label[i]]) ** 2

        # update each centroid to the mean of its cluster
        for m in range(k):
            if np.any(label == m):  # keep the old centroid if a cluster is empty
                centroids[m] = np.mean(data[label == m], axis=0)
        converged = _converged(old_centroids, centroids)
    return centroids, label, np.sum(assessment)


# Run several times and keep the best result (k-means can get stuck in local minima)
best_assessment = np.inf
best_centroids = None
best_label = None

for i in range(10):
    centroids, label, assessment = kmeans(data, ck)
    if assessment < best_assessment:
        best_assessment = assessment
        best_centroids = centroids
        best_label = label

data0 = data[best_label == 0]
data1 = data[best_label == 1]
data2 = data[best_label == 2]

# Plot the raw points and the clustered result side by side
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
ax1.scatter(data[:, 0], data[:, 1], c='c', s=30, marker='o')
ax2.scatter(data0[:, 0], data0[:, 1], c='r')
ax2.scatter(data1[:, 0], data1[:, 1], c='c')
ax2.scatter(data2[:, 0], data2[:, 1], c='y')
ax2.scatter(best_centroids[:, 0], best_centroids[:, 1], c='b', s=120, marker='o')
plt.show()
--------------------------------------------------------------------------------