├── .idea
│   ├── Machine_learning_python.iml
│   ├── misc.xml
│   ├── modules.xml
│   ├── vcs.xml
│   └── workspace.xml
├── AdaBoost
│   ├── AdaBoost.log
│   └── AdaBoost.py
├── EM
│   └── EM.py
├── LICENSE
├── README.md
├── k近邻
│   ├── knn.log
│   ├── knn.py
│   ├── knn_kd.py
│   └── knnkd.log
├── mnist
│   └── convert_mnist2csv.py
├── 决策树
│   ├── C4.5.py
│   ├── C45_decision_tree.log
│   ├── CART.py
│   ├── CART_decision_tree.log
│   ├── ID3.py
│   └── ID3_decision_tree.log
├── 感知机算法
│   ├── perceptron.log
│   └── perceptron.py
├── 支持向量机
│   ├── SVM.log
│   └── SVM.py
├── 最大熵模型
│   └── maxEntropy.py
├── 朴素贝叶斯
│   ├── NaiveBayes.log
│   └── NaiveBayes.py
└── 逻辑回归
    ├── LogisticRegression.log
    └── LogisticRegression.py
/AdaBoost/AdaBoost.log:
--------------------------------------------------------------------------------
1 | 08-02 10:47 INFO This is an info message.
2 | 08-02 10:47 INFO Loading data....
3 | 08-02 10:47 INFO Loading data done.
4 | 08-02 10:47 INFO Training the AdaBoost model....
5 | 08-02 10:47 INFO The 0th Tree, The train data's accuracy is:0.9812
6 | 08-02 10:48 INFO The 1th Tree, The train data's accuracy is:0.9812
7 | 08-02 10:48 INFO The 2th Tree, The train data's accuracy is:0.9812
8 | 08-02 10:48 INFO The 3th Tree, The train data's accuracy is:0.9878
9 | 08-02 10:49 INFO The 4th Tree, The train data's accuracy is:0.9862
10 | 08-02 10:49 INFO The 5th Tree, The train data's accuracy is:0.9904
11 | 08-02 10:49 INFO The 6th Tree, The train data's accuracy is:0.988
12 | 08-02 10:49 INFO The 7th Tree, The train data's accuracy is:0.9924
13 | 08-02 10:50 INFO The 8th Tree, The train data's accuracy is:0.9898
14 | 08-02 10:50 INFO The 9th Tree, The train data's accuracy is:0.9939
15 | 08-02 10:50 INFO The 10th Tree, The train data's accuracy is:0.9929
16 | 08-02 10:50 INFO The 11th Tree, The train data's accuracy is:0.9942
17 | 08-02 10:51 INFO The 12th Tree, The train data's accuracy is:0.9934
18 | 08-02 10:51 INFO The 13th Tree, The train data's accuracy is:0.994
19 | 08-02 10:51 INFO The 14th Tree, The train data's accuracy is:0.9951
20 | 08-02 10:51 INFO The 15th Tree, The train data's accuracy is:0.9945
21 | 08-02 10:52 INFO The 16th Tree, The train data's accuracy is:0.9958
22 | 08-02 10:52 INFO The 17th Tree, The train data's accuracy is:0.9952
23 | 08-02 10:52 INFO The 18th Tree, The train data's accuracy is:0.9963
24 | 08-02 10:52 INFO The 19th Tree, The train data's accuracy is:0.9958
25 | 08-02 10:53 INFO The 20th Tree, The train data's accuracy is:0.997
26 | 08-02 10:53 INFO The 21th Tree, The train data's accuracy is:0.9963
27 | 08-02 10:53 INFO The 22th Tree, The train data's accuracy is:0.9965
28 | 08-02 10:54 INFO The 23th Tree, The train data's accuracy is:0.9968
29 | 08-02 10:54 INFO The 24th Tree, The train data's accuracy is:0.9974
30 | 08-02 10:54 INFO The 25th Tree, The train data's accuracy is:0.9974
31 | 08-02 10:54 INFO The 26th Tree, The train data's accuracy is:0.9974
32 | 08-02 10:55 INFO The 27th Tree, The train data's accuracy is:0.9977
33 | 08-02 10:55 INFO The 28th Tree, The train data's accuracy is:0.9978
34 | 08-02 10:55 INFO The 29th Tree, The train data's accuracy is:0.9981
35 | 08-02 10:55 INFO The 30th Tree, The train data's accuracy is:0.9981
36 | 08-02 10:56 INFO The 31th Tree, The train data's accuracy is:0.9985
37 | 08-02 10:56 INFO The 32th Tree, The train data's accuracy is:0.9978
38 | 08-02 10:56 INFO The 33th Tree, The train data's accuracy is:0.9983
39 | 08-02 10:56 INFO The 34th Tree, The train data's accuracy is:0.9982
40 | 08-02 10:57 INFO The 35th Tree, The train data's accuracy is:0.9982
41 | 08-02 10:57 INFO The 36th Tree, The train data's accuracy is:0.998
42 | 08-02 10:57 INFO The 37th Tree, The train data's accuracy is:0.9987
43 | 08-02 10:57 INFO The 38th Tree, The train data's accuracy is:0.9983
44 | 08-02 10:58 INFO The 39th Tree, The train data's accuracy is:0.9986
45 | 08-02 10:58 INFO The 40th Tree, The train data's accuracy is:0.9987
46 | 08-02 10:58 INFO The 41th Tree, The train data's accuracy is:0.9987
47 | 08-02 10:59 INFO The 42th Tree, The train data's accuracy is:0.999
48 | 08-02 10:59 INFO The 43th Tree, The train data's accuracy is:0.9987
49 | 08-02 10:59 INFO The 44th Tree, The train data's accuracy is:0.9991
50 | 08-02 10:59 INFO The 45th Tree, The train data's accuracy is:0.9989
51 | 08-02 11:00 INFO The 46th Tree, The train data's accuracy is:0.9989
52 | 08-02 11:00 INFO The 47th Tree, The train data's accuracy is:0.9987
53 | 08-02 11:00 INFO The 48th Tree, The train data's accuracy is:0.9988
54 | 08-02 11:00 INFO The 49th Tree, The train data's accuracy is:0.9987
55 | 08-02 11:00 INFO accuracy:99.9527
56 | 08-02 11:00 INFO Total Time: 789
57 |
--------------------------------------------------------------------------------
/AdaBoost/AdaBoost.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @time :2020/8/1
3 | # @IDE : pycharm
4 | # @author :lxztju
5 | # @github : https://github.com/lxztju
6 | # @Email : lxztju@163.com
7 |
8 | '''
9 | Implementation of the AdaBoost boosting method.
10 |
11 | Binary classification on two classes taken from the MNIST dataset.
12 |
13 | ----------------
14 | Follows Example 8.1.3 in Li Hang's "Statistical Learning Methods": a threshold split (decision stump) is used as the base classifier.
15 |
16 |
17 | '''
18 |
19 | import numpy as np
20 | import logging
21 | import time
22 |
23 |
24 | def loadData(fileName):
25 | '''
26 | Load the MNIST dataset
27 | :param fileName: path of the dataset to load
28 | :return: dataset and labels as lists
29 | '''
30 | # lists holding the data and labels
31 | dataArr = []
32 | labelArr = []
33 | # open the file
34 | fr = open(fileName, 'r')
35 | # read the file line by line
36 | for line in fr.readlines():
37 | # split each line on the ',' delimiter, returning a list of fields
38 | curLine = line.strip().split(',')
39 |
40 | # MNIST has ten labels, 0-9; since this is a binary task, only the 0 and 1 classes are kept as positive/negative classes
41 | # if int(curLine[0]) != 0 or int(curLine[0]) !=1: continue
42 | if int(curLine[0]) == 0 or int(curLine[0]) == 1:
43 | if int(curLine[0]) == 0:
44 | labelArr.append(1)
45 | else:
46 | labelArr.append(-1)
47 | dataArr.append([int(int(num) > 128) for num in curLine[1:]])
48 |
49 | # return data and label
50 | return dataArr, labelArr
51 |
52 |
53 |
54 | class SingleTree:
55 | def __init__(self, traindataList, trainlabelList):
56 | '''
57 | Build a single-level decision tree (decision stump) as the AdaBoost base classifier
58 | :param traindataList: training data in list format
59 | :param trainlabelList: training labels in list format
60 | Note: the sample weights D are not passed in; they are initialized internally
61 | '''
62 | self.traindataArr = np.array(traindataList)
63 | self.trainlabelArr = np.array(trainlabelList)
64 | self.m, self.n = self.traindataArr.shape
65 | self.D = [1/ self.m] * self.m  # initialize the sample weights to the uniform distribution
66 |
67 |
68 | def calcError(self, prediction, trainlabelArr, D):
69 | '''
70 | Compute the weighted classification error on the training set
71 | :param prediction: the stump's predictions, same length as trainlabelArr
72 | :param trainlabelArr: ground truth
73 | :param D: sample weights over the training data
74 | :return: the weighted training error
75 | '''
76 | # initialize the error
77 | error = 0
78 |
79 | for i in range(trainlabelArr.size):
80 | if prediction[i] != trainlabelArr[i]:
81 | error += D[i]
82 | return error
83 |
84 |
85 | def singleTree(self):
86 | '''
87 | Build a single-level decision tree to serve as the base classifier
88 | :return:
89 | '''
90 | # represent a tree with a dict
91 | # print(self.D)
92 | tree = {}
93 | # candidate split points: every feature is binarized to 0/1 when loaded, so three thresholds suffice: below 0, between 0 and 1, above 1
94 | divides = [-0.5, 0.5, 1.5]
95 | # split rules: for a given feature, 'Less' predicts 1 below the threshold and -1 above it;
96 | # 'Over' predicts -1 at or below the threshold and 1 above it
97 | rules = ['Less', 'Over']
98 | # the largest possible error is 1, so initialize the minimum to 1
99 | min_error = 1
100 | # scan every feature to find the split point, rule and feature that minimize the weighted error
101 | for i in range(self.n):
102 | for divide in divides:
103 |
104 | for rule in rules:
105 | # initialize the prediction vector to all ones
106 | prediction = np.ones(self.m)
107 | if rule == 'Less':
108 | # under 'Less', samples above the threshold are set to -1; samples predicted 1 keep the initial value
109 | prediction[self.traindataArr[:,i] >divide] = -1
110 | else:
111 | # under 'Over', samples at or below the threshold are set to -1
112 | prediction[self.traindataArr[:, i] <= divide] = -1
113 | # compute the weighted error for this feature / threshold / rule combination
114 | error = self.calcError(prediction, self.trainlabelArr, self.D)
115 | # keep the combination with the smallest error
116 | if error < min_error:
117 | # print(prediction, self.traindataArr[:, i], trainlabelList)
118 | tree['error'] = error
119 | tree['rule'] = rule
120 | tree['divide'] = divide
121 | tree['feature'] = i
122 | tree['Gx'] = prediction
123 | min_error = error
124 | # print(tree, error)
125 | return tree
126 |
127 |
128 | class Adaboost(SingleTree):
129 | def __init__(self, traindataList, trainlabelList, treeNum = 50):
130 | super().__init__(traindataList, trainlabelList)
131 |
132 | self.treeNum = treeNum
133 |
134 | self.trees = self.BoostingTree()
135 |
136 |
137 |
138 | def BoostingTree(self):
139 | '''
140 | Build the AdaBoost ensemble
141 | :return: the fitted AdaBoost model (a list of stumps)
142 | '''
143 | # list of trees; each element is one stump, added layer by layer
144 | tree = []
145 | # running ensemble prediction, one entry per training sample
146 | finalPrediction = np.zeros(self.trainlabelArr.size)
147 | # iteratively fit treeNum stumps
148 | for i in range(self.treeNum):
149 | # build one stump
150 | curTree = self.singleTree()
151 | # compute alpha according to Eq. 8.2: alpha = (1/2) * ln((1 - e) / e)
152 | alpha = 1/2 * np.log((1-curTree['error']) / curTree['error'])
153 | # keep this stump's predictions for the weight update below
154 | Gx = curTree['Gx']
155 |
156 | # update the sample weights
157 | # numerator of Eq. 8.4, a vector; for arrays, * and np.multiply both mean elementwise multiplication
158 | # np.dot() would be the dot product
159 | w = self.D * ( np.exp( -1 * alpha * self.trainlabelArr * Gx))
160 | # normalize to get the new weight distribution over the training set
161 | self.D = w / sum(w)
162 | curTree['alpha'] = alpha
163 | # print(curTree)
164 |
165 | tree.append(curTree)
166 |
167 | #################################
168 | # track the boosting accuracy, for early stopping
169 | finalPrediction += alpha * Gx
170 | # print(finalPrediction, self.trainlabelArr, alpha)
171 | correct_num = sum(np.sign(finalPrediction) == self.trainlabelArr)
172 | # print(correct_num, finalPrediction, self.trainlabelArr)
173 | accuracy = correct_num / self.trainlabelArr.size
174 | logging.info("The {}th Tree, The train data's accuracy is:{}".format(i, accuracy))
175 | # stop early once the training accuracy reaches 1
176 | if accuracy == 1:
177 | break
178 | return tree
179 |
180 | def predict(self, x, div, rule, feature):
181 | '''
182 | Compute one base classifier's output for a single sample
183 | :param x: input sample
184 | :param div: split threshold
185 | :param rule: split rule, 'Less' or 'Over'
186 | :param feature: the feature the stump operates on
187 | :return: the predicted label
188 | '''
189 |
190 | if rule == 'Less':
191 | L, H = 1, -1
192 | else:
193 | L, H = -1, 1
194 |
195 | if x[feature] > div:
196 | return H
197 | else:
198 | return L
199 |
200 |
201 |
202 | def testModel(self, testdataList, testlabelList):
203 | '''
204 | Evaluate the accuracy of the AdaBoost model
205 | :param testdataList: test data in list format
206 | :param testlabelList: test labels
207 | :return: accuracy
208 | '''
209 | correct_num = 0
210 |
211 | for i in range(len(testdataList)):
212 | result = 0
213 |
214 | for curTree in self.trees:
215 |
216 | div = curTree['divide']
217 | feature = curTree['feature']
218 | rule = curTree['rule']
219 | alpha = curTree['alpha']
220 | result += alpha * self.predict(testdataList[i], div, rule, feature)
221 |
222 | if np.sign(result) == testlabelList[i]:
223 | correct_num += 1
224 |
225 | return round((correct_num /len(testlabelList)* 100), 4)
226 |
227 |
228 |
229 | if __name__ == '__main__':
230 |
231 | # configure a logging module to save the logs
232 | logging.basicConfig(level=logging.DEBUG,
233 | format='%(asctime)-12s %(levelname)-8s %(message)s',
234 | datefmt='%m-%d %H:%M',
235 | filename='AdaBoost.log',
236 | filemode='w')  # filemode defaults to 'a' (append); 'w' rewrites the file, discarding previous contents
237 | # add a handler that also prints to the console (StreamHandler, vs. FileHandler)
238 | console = logging.StreamHandler()
239 | console.setLevel(logging.INFO)
240 | # set the console output format
241 | formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
242 | console.setFormatter(formatter)
243 | # attach the handler to the root logger
244 | logging.getLogger('').addHandler(console)
245 |
246 | # log through the root logger
247 | logging.info('This is an info message.')
248 |
249 | start = time.time()
250 |
251 | # location of the MNIST dataset
252 | import os
253 | home = os.path.expanduser('~')
254 | train_path = home + '/ML/mnist/mnist_train.csv'
255 | test_path = home + '/ML/mnist/mnist_test.csv'
256 | # train_path = home + '/ML/mnist/mnist_train_samples.csv'
257 | # test_path = home + '/ML/mnist/mnist_test_samples.csv'
258 |
259 | # load the training and test sets
260 | logging.info('Loading data....')
261 |
262 | traindataList, trainlabelList = loadData(train_path)
263 | testdataList, testlabelList = loadData(test_path)
264 | logging.info('Loading data done.')
265 | # print(trainlabelList[:100])
266 | logging.info('Training the AdaBoost model....')
267 |
268 | adaboost = Adaboost(traindataList[:1000], trainlabelList[:1000])
269 |
270 |
271 | # logging.info('Predicting one sample ....')
272 | # prediction = adaboost.predict([testdataList[0]], [testlabelList[0]])
273 | # logging.info('The prediction and the ground truth is : ({}, {})'.format(prediction, testlabelList[0]))
274 |
275 | # evaluate the accuracy of the AdaBoost algorithm on the full test set
276 | # (only the first 1000 training samples are used above, to keep the runtime manageable)
277 | accuracy = adaboost.testModel(testdataList, testlabelList)
278 |
279 | end = time.time()
280 |
281 | logging.info('accuracy:{}'.format(accuracy))
282 | logging.info('Total Time: {}'.format(round(end - start, 4)))
--------------------------------------------------------------------------------
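Note: the commented-out call in AdaBoost.py's `__main__` (`adaboost.predict([testdataList[0]], [testlabelList[0]])`) does not match the signature `predict(self, x, div, rule, feature)`. A minimal single-sample inference sketch that mirrors the loop in `testModel` (the helper name `predict_sample` is hypothetical, not part of the file):

```python
import numpy as np

def predict_sample(adaboost, x):
    # weighted vote of all fitted stumps, exactly as in Adaboost.testModel
    result = 0
    for curTree in adaboost.trees:
        result += curTree['alpha'] * adaboost.predict(
            x, curTree['divide'], curTree['rule'], curTree['feature'])
    return int(np.sign(result))  # +1 encodes digit 0, -1 encodes digit 1

# usage: label = predict_sample(adaboost, testdataList[0])
```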
/EM/EM.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @time :2020/8/3
3 | # @IDE : pycharm
4 | # @author :lxztju
5 | # @github : https://github.com/lxztju
6 | # @Email : lxztju@163.com
7 |
8 |
9 | '''
10 | Simulate data generated by mixing two Gaussian distributions
11 |
12 | ---------------------------------------
13 | first Gaussian
14 | alpha=0.3, mu=0, sigma=1
15 |
16 | second Gaussian
17 |
18 | alpha=0.7, mu=1, sigma=3
19 | '''
20 |
21 |
22 | import numpy as np
23 | import logging
24 | import time
25 |
26 |
27 | def loadData(*args):
28 | '''
29 | Take a parameter list and simulate data from the Gaussian mixture
30 | :param args: the input list is [alpha0, mu0, sigma0, alpha1, mu1, sigma1]
31 | :return: the data generated by the Gaussian mixture
32 | '''
33 | print(args)
34 | alpha0, mu0, sigma0, alpha1, mu1, sigma1 = args[0]
35 |
36 | # number of samples to generate
37 | length = 1000
38 | # data produced by the first Gaussian
39 | data1 = np.random.normal(mu0, sigma0, int(length*alpha0))
40 |
41 | # data produced by the second Gaussian
42 | data2 = np.random.normal(mu1, sigma1, int(length*alpha1))
43 | # concatenate all the data
44 | dataArr = np.append(data1, data2)
45 | # shuffle the data
46 | np.random.shuffle(dataArr)
47 | return dataArr
48 |
49 |
50 |
51 | class EM:
52 | def __init__(self, alpha0, mu0, sigma0, alpha1, mu1, sigma1, dataArr):
53 | '''
54 | Parameters of the Gaussian mixture model
55 | :param alpha0: mixing weight of the first component
56 | :param mu0: mean of the first Gaussian
57 | :param sigma0: standard deviation of the first Gaussian
58 | :param alpha1: mixing weight of the second component
59 | :param mu1: mean of the second Gaussian
60 | :param sigma1: standard deviation of the second Gaussian
61 | '''
62 | self.alpha0 = alpha0
63 | self.mu0 = mu0
64 | self.sigma0 = sigma0
65 | self.alpha1 = alpha1
66 | self.mu1 = mu1
67 | self.sigma1 = sigma1
68 | self.dataArr = dataArr
69 | self.iter = 200
70 | self.train()
71 |
72 |
73 | def getGamma(self, mu, sigma):
74 | return (1 / (np.sqrt(2 * np.pi) * sigma)) * np.exp( -1 * ((self.dataArr - mu) * (self.dataArr - mu)) / (2 * sigma ** 2))
75 |
76 | def E_step(self):
77 | gamma0 = self.alpha0 * self.getGamma(self.mu0, self.sigma0)
78 |
79 | gamma1 = self.alpha1 * self.getGamma(self.mu1, self.sigma1)
80 |
81 | sum_ = gamma0 + gamma1
82 | return gamma0/sum_, gamma1/sum_
83 |
84 |
85 | def M_step(self):
86 | gamma0, gamma1 = self.E_step()
87 | # print(sum(gamma0))
88 | self.mu0 = sum(gamma0 * self.dataArr) / sum(gamma0)
89 | self.mu1 = sum(gamma1 * self.dataArr) / sum(gamma1)
90 |
91 | self.alpha0 = sum(gamma0) / self.dataArr.size
92 | self.alpha1 = sum(gamma1) / self.dataArr.size
93 |
94 | # print(self.alpha0, self.alpha1)
95 | self.sigma0 = np.sqrt(sum(gamma0 * (self.dataArr - self.mu0)*(self.dataArr - self.mu0) ) / sum(gamma0))
96 | self.sigma1 = np.sqrt(sum(gamma1 * (self.dataArr - self.mu1)*(self.dataArr - self.mu1) ) / sum(gamma1))
97 |
98 |
99 |
100 | def train(self):
101 |
102 |
103 | for i in range(self.iter):
104 | self.M_step()
105 | # print(self.alpha0, self.mu0 , self.sigma0)
106 |
107 |
108 | if __name__ == '__main__':
109 |
110 | parameters = [0.3, 0, 1, 0.7, 1, 3]
111 | dataArr = loadData(parameters)
112 | # print(dataArr.shape)
113 | em = EM(0.5, 0, 1, 0.5, 1, 2, dataArr)
114 | print(em.alpha0, em.mu0, em.sigma0, em.alpha1, em.mu1, em.sigma1)
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
--------------------------------------------------------------------------------
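For reference, `E_step` and `M_step` in EM.py implement the standard EM updates for a two-component univariate Gaussian mixture. With components $k \in \{0, 1\}$, samples $y_j$ and $N$ the sample count, the code computes

$$\hat\gamma_{jk} = \frac{\alpha_k\,\phi(y_j \mid \mu_k, \sigma_k)}{\sum_{k'} \alpha_{k'}\,\phi(y_j \mid \mu_{k'}, \sigma_{k'})},\qquad \mu_k = \frac{\sum_j \hat\gamma_{jk}\,y_j}{\sum_j \hat\gamma_{jk}},\qquad \sigma_k^2 = \frac{\sum_j \hat\gamma_{jk}\,(y_j - \mu_k)^2}{\sum_j \hat\gamma_{jk}},\qquad \alpha_k = \frac{1}{N}\sum_j \hat\gamma_{jk}$$

(`getGamma` is the Gaussian density $\phi$, `E_step` returns the responsibilities $\hat\gamma_{jk}$, and `M_step` applies the three parameter updates).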
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 lxztju
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # machine_learning_python
2 | Classic machine learning algorithms implemented in pure Python
3 |
4 | Algorithms implemented so far:
5 |
6 | * Perceptron
7 |
8 | * KNN (k-nearest neighbors)
9 |
10 | * Naive Bayes
11 |
12 | * Decision trees
13 |
14 | ```
15 | --ID3
16 |
17 | --C4.5
18 |
19 | --CART
20 | ```
21 |
22 |
23 |
24 | * Logistic regression
25 |
26 | * Support vector machine (SVM)
27 |
28 |
29 | * AdaBoost
30 | * EM
31 |
32 |
33 |
34 |
35 | Zhihu article: [https://zhuanlan.zhihu.com/p/163688301](https://zhuanlan.zhihu.com/p/163688301)
36 |
37 | While studying these machine learning algorithms I mainly referred to the two repositories below; typing the code out by hand clarified many of the algorithmic details for me. Many thanks to their authors for open-sourcing them.
38 |
39 | References:
40 |
41 | [https://github.com/Dod-o/Statistical-Learning-Method_Code](https://github.com/Dod-o/Statistical-Learning-Method_Code)
42 |
43 | [https://github.com/fengdu78/lihang-code](https://github.com/fengdu78/lihang-code)
--------------------------------------------------------------------------------
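Usage, inferred from the `__main__` blocks of the scripts (the exact commands are an assumption; the author documents only the hard-coded data location `~/ML/mnist/`): download and unpack the four MNIST files, run `python mnist/convert_mnist2csv.py` once to produce `mnist_train.csv` and `mnist_test.csv` under `~/ML/mnist/`, then run any algorithm script directly, e.g. `python AdaBoost/AdaBoost.py`; each script writes a `.log` file next to itself.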
/k近邻/knn.log:
--------------------------------------------------------------------------------
1 | 07-24 21:37 INFO This is an info message.
2 | 07-24 21:37 INFO Loading data....
3 | 07-24 21:37 INFO Loading data done.
4 | 07-24 21:37 INFO test data shape is:(200,784)
5 | 07-24 21:37 INFO train data shape is:(60000,784)
6 | 07-24 21:37 INFO Testing data:(0/200), and correct_num:0
7 | 07-24 21:38 INFO Testing data:(50/200), and correct_num:49
8 | 07-24 21:39 INFO Testing data:(100/200), and correct_num:98
9 | 07-24 21:40 INFO Testing data:(150/200), and correct_num:146
10 | 07-24 21:41 INFO accuracy:0.965
11 | 07-24 21:41 INFO Total Time: 230
12 |
--------------------------------------------------------------------------------
/k近邻/knn.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @time :2020/7/24
3 | # @IDE : pycharm
4 | # @author :lxztju
5 | # @github : https://github.com/lxztju
6 |
7 |
8 | '''
9 | Implements a KNN algorithm; of the two nearest-neighbor search strategies in this folder,
10 | this file performs the linear scan (knn_kd.py builds a kd-tree instead)
11 | ---------------
12 | distance is measured with either the Euclidean or the Manhattan metric
13 | '''
14 |
15 | import numpy as np
16 | import time
17 | import logging
18 |
19 |
20 |
21 |
22 | class Knn:
23 | def __init__(self, k, num_classes, dist_method):
24 | self.k = k
25 | self.num_classes = num_classes
26 | self.dist_method = dist_method
27 |
28 |
29 |
30 | def loadData(self, fileName):
31 | '''
32 | Load the MNIST dataset
33 | :param fileName: path of the dataset to load
34 | :return: dataset and labels as lists
35 | '''
36 | # lists holding the data and labels
37 | dataArr = []; labelArr = []
38 | # open the file
39 | fr = open(fileName, 'r')
40 | # read the file line by line
41 | for line in fr.readlines():
42 | # split each line on the ',' delimiter, returning a list of fields
43 | curLine = line.strip().split(',')
44 |
45 |
46 | labelArr.append(int(curLine[0]))
47 | dataArr.append([int(num) / 255 for num in curLine[1:]])
48 | # store the labels
49 | # [int(num) for num in curLine[1:]] -> convert every element except the first (the label) to int
50 | # [int(num)/255 for num in curLine[1:]] -> divide by 255 to normalize (optional step)
51 |
52 | # return data and label
53 | return dataArr, labelArr
54 |
55 |
56 |
57 |
58 | def calculate_distance(self, x1, x2):
59 | '''
60 | Compute the distance between two vectors
61 | :param x1: first vector, a numpy column vector
62 | :param x2: second vector, a numpy column vector
63 | :param method: 'l2' or 'l1' (read from self.dist_method); l2 is the Euclidean distance, l1 the Manhattan distance
64 | :return: the scalar distance value
65 | '''
66 | if self.dist_method == 'l2':
67 | return np.sqrt(np.sum(np.square(x1 - x2)))
68 | else:
69 | return np.sum(np.abs(x1 - x2))
70 |
71 |
72 |
73 |
74 |
75 | def linear_get_k_cloest(self, dataMat, labelMat, x):
76 | '''
77 | Find the k nearest neighbors of x by a linear scan and vote on the label
78 | :param dataMat: training data in numpy format
79 | :param labelMat: training labels in numpy format
80 | :param x: the query vector
81 | :return: label, the label predicted by knn
82 | '''
83 |
84 | # size of the training data
85 | m, n = dataMat.shape
86 |
87 | ## scan every sample, record its distance to the query, then take the k nearest
88 | dists = [0] * m  # distance of each sample to the query point
89 | for i in range(m):
90 | xi = dataMat[i]
91 |
92 | dist = self.calculate_distance(xi, x)
93 | dists[i] = dist
94 |
95 | # with all distances computed, sort them and take the indices of the k smallest
96 | # argsort returns the indices that sort the array in ascending order
97 | topk_index = np.argsort(np.array(dists))[:self.k]
98 | # print(type(topk_index), topk_index)
99 | # labelList counts the neighbors belonging to each class
100 | labelList = [0] * self.num_classes
101 | for index in topk_index:
102 | labelList[int(labelMat[index])] += 1
103 | # return the majority class among the neighbors
104 | return labelList.index(max(labelList))
105 |
106 |
107 |
108 |
109 | def modelTest(self, traindataArr, trainlabelArr, testdataArr, testlabelArr):
110 | '''
111 | Evaluate the accuracy of the knn model
112 | :param traindataArr: training data in list format
113 | :param trainLabelArr: training labels in list format
114 | :param testdataArr: test data in list format
115 | :param testlabelArr: test labels in list format
116 | :return:
117 | '''
118 |
119 | # convert the data to numpy format for matrix operations
120 | traindataMat = np.mat(traindataArr)
121 | trainlabelMat = np.mat(trainlabelArr).T
122 | testdataMat = np.mat(testdataArr)
123 | testlabelMat = np.mat(testlabelArr).T
124 |
125 | # sizes of the test and training sets
126 | m ,n = testdataMat.shape
127 | m1, n1 = traindataMat.shape
128 | logging.info('test data shape is:({},{})'.format(m,n))
129 | logging.info('train data shape is:({},{})'.format(m1,n1))
130 |
131 |
132 | # number of correctly classified samples
133 | correct_num = 0
134 |
135 | # iterate over all test samples, counting how many are classified correctly
136 | for i in range(m):
137 | xi = testdataMat[i]
138 | yi = testlabelMat[i]
139 | if i % 50 == 0:
140 | logging.info('Testing data:({}/{}), and correct_num:{}'.format(i, m, correct_num))
141 | # count the correctly classified samples
142 | if self.linear_get_k_cloest(traindataMat, trainlabelMat, xi) == yi:
143 | correct_num += 1
144 |
145 | return round(correct_num/m, 4)
146 |
147 |
148 |
149 |
150 |
151 |
152 | if __name__ == '__main__':
153 |
154 | # configure a logging module to save the logs
155 | logging.basicConfig(level=logging.DEBUG,
156 | format='%(asctime)-12s %(levelname)-8s %(message)s',
157 | datefmt='%m-%d %H:%M',
158 | filename='knn.log',
159 | filemode='w')  # filemode defaults to 'a' (append); 'w' rewrites the file, discarding previous contents
160 | # add a handler that also prints to the console (StreamHandler, vs. FileHandler)
161 | console = logging.StreamHandler()
162 | console.setLevel(logging.INFO)
163 | # set the console output format
164 | formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
165 | console.setFormatter(formatter)
166 | # attach the handler to the root logger
167 | logging.getLogger('').addHandler(console)
168 |
169 | # log through the root logger
170 | logging.info('This is an info message.')
171 |
172 |
173 | start = time.time()
174 |
175 |
176 | # location of the MNIST dataset
177 | import os
178 | home = os.path.expanduser('~')
179 | train_path = home + '/ML/mnist/mnist_train.csv'
180 | test_path = home + '/ML/mnist/mnist_test.csv'
181 |
182 | topk = 20
183 | num_classes = 10
184 | dist_method = 'l2'
185 | knn = Knn(topk, num_classes, dist_method)
186 |
187 | # load the training and test sets
188 | logging.info('Loading data....')
189 |
190 | traindataArr, trainlabelArr = knn.loadData(train_path)
191 | testdataArr, testlabelArr = knn.loadData(test_path)
192 | logging.info('Loading data done.')
193 |
194 | # evaluate the accuracy of the knn algorithm
195 | # only the first 200 test samples are used, to keep the runtime manageable
196 | accuracy = knn.modelTest(traindataArr, trainlabelArr, testdataArr[:200], testlabelArr[:200])
197 |
198 |
199 | end = time.time()
200 |
201 | logging.info('accuracy:{}'.format(accuracy))
202 | logging.info('Total Time: {}'.format(round(end-start, 4)))
203 |
--------------------------------------------------------------------------------
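The linear scan in knn.py computes one distance per Python-level iteration, which is why 200 queries against 60000 training samples take roughly 230 s (see knn.log). A vectorized sketch of the same k-NN vote, assuming plain numpy arrays instead of `np.mat` (`knn_predict` is a hypothetical helper, not part of the file):

```python
import numpy as np

def knn_predict(train_x, train_y, x, k=20, num_classes=10):
    # Euclidean distances from x to every training row in one broadcast
    dists = np.sqrt(((train_x - x) ** 2).sum(axis=1))
    topk = np.argsort(dists)[:k]          # indices of the k closest samples
    votes = np.bincount(train_y[topk].astype(int), minlength=num_classes)
    return int(votes.argmax())            # majority class among the neighbors

# usage: knn_predict(np.array(traindataArr), np.array(trainlabelArr), np.array(testdataArr[0]))
```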
/k近邻/knn_kd.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @time :2020/7/24
3 | # @IDE : pycharm
4 | # @author :lxztju
5 | # @github : https://github.com/lxztju
6 |
7 |
8 | '''
9 | Implements a KNN algorithm
10 | that builds a kd-tree to search for the nearest neighbor
11 | ----------------
12 | distance is again measured with the Euclidean metric
13 | '''
14 |
15 |
16 |
17 | import time
18 | import logging
19 |
20 | from collections import namedtuple
21 | from math import sqrt
22 |
23 |
24 | def loadData(fileName):
25 | '''
26 | Load the MNIST dataset
27 | :param fileName: path of the dataset to load
28 | :return: dataset and labels as lists
29 | '''
30 | # lists holding the data and labels
31 | dataArr = [];
32 | labelArr = []
33 | # open the file
34 | fr = open(fileName, 'r')
35 | # read the file line by line
36 | for line in fr.readlines():
37 | # split each line on the ',' delimiter, returning a list of fields
38 | curLine = line.strip().split(',')
39 |
40 | labelArr.append(int(curLine[0]))
41 | dataArr.append([int(num) / 255 for num in curLine[1:]])
42 | # store the labels
43 | # [int(num) for num in curLine[1:]] -> convert every element except the first (the label) to int
44 | # [int(num)/255 for num in curLine[1:]] -> divide by 255 to normalize (optional step)
45 |
46 | # return data and label
47 | return dataArr, labelArr
48 |
49 |
50 | # the main data held by each kd-tree node
51 | class KdNode:
52 | def __init__(self, dom_elt, split, left, right):
53 | self.dom_elt = dom_elt  # a k-dimensional vector (one sample point in k-dimensional space)
54 | self.split = split  # integer (index of the dimension used for splitting)
55 | self.left = left  # kd-tree built from the subspace left of the splitting hyperplane
56 | self.right = right  # kd-tree built from the subspace right of the splitting hyperplane
57 |
58 |
59 | class KdTree:
60 | '''
61 | Build a KD tree over the input space
62 | '''
63 | def __init__(self, data):
64 | # data is the dataset in list format
65 | k = len(data[0])  # data dimensionality
66 |
67 | def CreateNode(split, data_set):  # create a KdNode by splitting data_set on dimension `split`
68 | if not data_set:  # empty dataset
69 | return None
70 | # sort the input list by dimension `split`
71 | data_set.sort(key=lambda x: x[split])
72 | split_pos = len(data_set) // 2  # index of the median
73 | median = data_set[split_pos]  # the median becomes the split point
74 | split_next = (split + 1) % k  # cycle coordinates
75 |
76 | # create the kd-tree recursively
77 | return KdNode(
78 | median,
79 | split,
80 | CreateNode(split_next, data_set[:split_pos]),  # build the left subtree
81 | CreateNode(split_next, data_set[split_pos + 1:]))  # build the right subtree
82 |
83 | self.root = CreateNode(0, data)  # build the kd-tree starting from dimension 0; returns the root
84 |
85 |
86 |
87 |
88 |
89 | class KnnKd():
90 | def __init__(self, kd, traindataArr, trainlabelArr):
91 | self.kd = kd
92 | # a namedtuple holding the nearest point, the nearest distance and the number of visited nodes
93 | self.result = namedtuple("Result_tuple",
94 | "nearest_point nearest_dist nodes_visited")
95 | self.data_label_dict = {''.join([str(j) for j in traindataArr[i]]): trainlabelArr[i] for i in range(len(trainlabelArr)) }
96 |
97 |
98 |
99 | def find_nearest(self, point):
100 | '''
101 | # search the built kd-tree for the sample closest to the target point:
102 | :param point: the query point
103 | :return: the corresponding class label
104 | '''
105 | k = len(point)  # data dimensionality
106 |
107 | def travel(kd_node, target, max_dist):
108 | '''
109 | Recursively search the kd-tree for the given point
110 | :param kd_node: a node of the kd-tree
111 | :param target: the query point
112 | :param max_dist: radius of the hypersphere centered at the query point
113 | :return: the final namedtuple
114 | '''
115 | if kd_node is None:
116 | return self.result([0] * k, float("inf"),
117 | 0)  # in Python, float("inf") and float("-inf") denote positive and negative infinity
118 |
119 | nodes_visited = 1
120 |
121 | s = kd_node.split  # the splitting dimension
122 | pivot = kd_node.dom_elt  # the splitting "axis" point
123 |
124 | if target[s] <= pivot[s]:  # if the target's s-th coordinate is below the split value (target is closer to the left subtree)
125 | nearer_node = kd_node.left  # visit the left subtree next
126 | further_node = kd_node.right  # but remember the right subtree
127 | else:  # target is closer to the right subtree
128 | nearer_node = kd_node.right  # visit the right subtree next
129 | further_node = kd_node.left
130 |
131 | temp1 = travel(nearer_node, target, max_dist)  # descend to the region containing the target
132 |
133 | nearest = temp1.nearest_point  # take this leaf as the "current nearest point"
134 | dist = temp1.nearest_dist  # update the nearest distance
135 |
136 | nodes_visited += temp1.nodes_visited
137 |
138 | if dist < max_dist:
139 | max_dist = dist  # the nearest point lies inside the hypersphere of radius max_dist around the target
140 |
141 | temp_dist = abs(pivot[s] - target[s])  # distance along dimension s between target and splitting hyperplane
142 | if max_dist < temp_dist:  # check whether the hypersphere intersects the hyperplane
143 | return self.result(nearest, dist, nodes_visited)  # no intersection: return directly, nothing more to check
144 |
145 | # ----------------------------------------------------------------------
146 | # compute the Euclidean distance between the target and the split point
147 | temp_dist = sqrt(sum((p1 - p2) ** 2 for p1, p2 in zip(pivot, target)))
148 |
149 | if temp_dist < dist:  # if it is closer
150 | nearest = pivot  # update the nearest point
151 | dist = temp_dist  # update the nearest distance
152 | max_dist = dist  # update the hypersphere radius
153 |
154 | # check whether the other child's region holds an even closer point
155 | temp2 = travel(further_node, target, max_dist)
156 |
157 | nodes_visited += temp2.nodes_visited
158 | if temp2.nearest_dist < dist:  # a closer point exists in the other child
159 | nearest = temp2.nearest_point  # update the nearest point
160 | dist = temp2.nearest_dist  # update the nearest distance
161 |
162 | return self.result(nearest, dist, nodes_visited)
163 |
164 | res = travel(self.kd.root, point, float("inf"))  # recurse from the root
165 | return self.data_label_dict[''.join([str(j)for j in res.nearest_point])]
166 |
167 |
168 |
169 | if __name__ == '__main__':
170 |
171 | # configure a logging module to save the logs
172 | logging.basicConfig(level=logging.DEBUG,
173 | format='%(asctime)-12s %(levelname)-8s %(message)s',
174 | datefmt='%m-%d %H:%M',
175 | filename='knnkd.log',
176 | filemode='w')  # filemode defaults to 'a' (append); 'w' rewrites the file, discarding previous contents
177 | # add a handler that also prints to the console (StreamHandler, vs. FileHandler)
178 | console = logging.StreamHandler()
179 | console.setLevel(logging.INFO)
180 | # set the console output format
181 | formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
182 | console.setFormatter(formatter)
183 | # attach the handler to the root logger
184 | logging.getLogger('').addHandler(console)
185 |
186 | # log through the root logger
187 | logging.info('This is an info message.')
188 |
189 |
190 | start = time.time()
191 |
192 |
193 | # location of the MNIST dataset
194 | import os
195 | home = os.path.expanduser('~')
196 | train_path = home + '/ML/mnist/mnist_train.csv'
197 | test_path = home + '/ML/mnist/mnist_test.csv'
198 |
199 |
200 |
201 | # load the training and test sets
202 | logging.info('Loading data....')
203 |
204 | traindataArr, trainlabelArr = loadData(train_path)
205 | testdataArr, testlabelArr = loadData(test_path)
206 | logging.info('Loading data done.')
207 |
208 |
209 |
210 | # build the KD tree
211 | logging.info('Building Kd Tree...')
212 | kd = KdTree(traindataArr)
213 |
214 | knnkd = KnnKd(kd, traindataArr, trainlabelArr)
215 | logging.info('Classify one image.....')
216 |
217 | print(knnkd.find_nearest(testdataArr[0]), testlabelArr[0])
218 |
219 |
220 | end = time.time()
221 |
222 | # logging.info('accuracy:{}'.format(accuracy))
223 | logging.info('Total Time: {}'.format(round(end-start, 4)))
224 |
--------------------------------------------------------------------------------
/k近邻/knnkd.log:
--------------------------------------------------------------------------------
1 | 07-24 21:36 INFO This is an info message.
2 | 07-24 21:36 INFO Loading data....
3 | 07-24 21:36 INFO Loading data done.
4 | 07-24 21:36 INFO Building Kd Tree...
5 | 07-24 21:36 INFO Classify one image.....
6 | 07-24 21:36 INFO Total Time: 31
7 |
--------------------------------------------------------------------------------
/mnist/convert_mnist2csv.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @time :2020/7/23
3 | # @IDE : pycharm
4 | # @author :lxztju
5 | # @github : https://github.com/lxztju
6 |
7 | '''
8 | Convert the MNIST dataset to csv format for easy inspection
9 | MNIST download: http://yann.lecun.com/exdb/mnist/
10 | after downloading and unpacking, run this file to perform the conversion
11 | -------
12 | '''
13 |
14 |
15 | def convert(img_path, label_path, target_path, n):
16 | img = open(img_path, "rb")
17 | target = open(target_path, "w")
18 | label = open(label_path, "rb")
19 |
20 |
21 | ## first skip past the file headers
22 | # the first 16 bytes of the image file are a header: a 4-byte magic number, 4 bytes for the image count,
23 | # 4 bytes for the rows per image and 4 bytes for the columns per image
24 | # the first 8 bytes of the label file are a header: a 4-byte magic number and 4 bytes for the label count
25 | img.read(16)
26 | label.read(8)
27 | images = []
28 | # s1 = label.read(1)
29 | # print(s1, ord(s1))
30 | for i in range(n):
31 | image = [ord(label.read(1))]
32 | for j in range(28*28):
33 | image.append(ord(img.read(1)))
34 | images.append(image)
35 |
36 | for image in images:
37 | target.write(",".join(str(pix) for pix in image)+"\n")
38 |
39 | img.close()
40 | target.close()
41 | label.close()
42 |
43 | if __name__ == '__main__':
44 | import os
45 | home = os.path.expanduser('~')
46 | path = home + '/ML/mnist/'
47 | convert(path + "train-images.idx3-ubyte", path + "train-labels.idx1-ubyte",
48 | path + "mnist_train.csv", 60000)
49 | convert(path + "t10k-images.idx3-ubyte", path + "t10k-labels.idx1-ubyte",
50 | path + "mnist_test.csv", 10000)
51 |
52 |
53 | convert(path + "train-images.idx3-ubyte", path + "train-labels.idx1-ubyte",
54 | path + "mnist_train_samples.csv", 200)
55 | convert(path + "t10k-images.idx3-ubyte", path + "t10k-labels.idx1-ubyte",
56 | path + "mnist_test_samples.csv", 10)
57 |
58 | # import pandas as pd
59 | # test_data = pd.read_csv('./mnist_test.csv')
60 | # print(test_data.shape)
61 | # print(test_data.head())
62 |
--------------------------------------------------------------------------------
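The header layout described in the comments of `convert` can be verified before converting; the expected magic numbers (2051 for image files, 2049 for label files) come from the MNIST format specification. A small sketch (the helper name `check_mnist_headers` is hypothetical):

```python
import struct

def check_mnist_headers(img_path, label_path):
    # image header: magic, image count, rows, cols (four big-endian uint32)
    with open(img_path, 'rb') as f:
        magic, n, rows, cols = struct.unpack('>IIII', f.read(16))
    assert magic == 2051, 'not an MNIST image file'
    # label header: magic, label count (two big-endian uint32)
    with open(label_path, 'rb') as f:
        lmagic, ln = struct.unpack('>II', f.read(8))
    assert lmagic == 2049 and ln == n, 'label file does not match image file'
    return n, rows, cols
```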
/决策树/C4.5.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @time :2020/7/27
3 | # @IDE : pycharm
4 | # @author :lxztju
5 | # @github : https://github.com/lxztju
6 |
7 | '''
8 | Build a decision tree
9 | C4.5 implementation of a decision tree (no pruning)
10 | C4.5 uses the information gain ratio as the feature selection criterion
11 | -----------------------------
12 | This part is unfinished: when computing the gain ratio, the entropy of the dataset with respect to a feature, H_A(D), is problematic.
13 | For example, the ratio inside the log can be 1, making the whole term 0, so a later computation divides by 0;
14 | similarly, a 0 inside the log also breaks the computation.
15 | How to resolve this properly still needs investigation; warnings will be raised.
16 |
17 | --------------------------
18 | I have not looked closely at how the algorithm is supposed to handle this case, so the implementation may be flawed.
19 | It would be great if someone could help improve and polish this code on GitHub.
20 | '''
21 |
22 | import numpy as np
23 | import logging
24 | import time
25 | import copy
26 |
27 | def loadData(fileName):
28 | '''
29 | Load the MNIST dataset
30 | :param fileName: path of the dataset to load
31 | :return: dataset and labels as lists
32 | '''
33 | # lists holding the data and labels
34 | dataArr = []
35 | labelArr = []
36 | # open the file
37 | fr = open(fileName, 'r')
38 | # read the file line by line
39 | for line in fr.readlines():
40 | # split each line on the ',' delimiter, returning a list of fields
41 | curLine = line.strip().split(',')
42 |
43 | labelArr.append(int(curLine[0]))
44 | # binarize: pixels above 128 become 1, the rest become 0
45 | dataArr.append([int(int(num)>128) for num in curLine[1:]])
46 | # store the labels
47 | # [int(num) for num in curLine[1:]] -> convert every element except the first (the label) to int
48 | # [int(num)/255 for num in curLine[1:]] -> divide by 255 to normalize (optional step)
49 |
50 | # return data and label
51 | return dataArr, labelArr
52 |
53 |
54 |
55 |
56 | class C45DecisionTree:
57 | def __init__(self, traindataList, trainlabelList):
58 | '''
59 | Initialize the decision tree class
60 | :param traindataList: training data in list format
61 | :param trainlabelList: training labels in list format
62 | '''
63 | self.traindataList = traindataList
64 | self.trainlabelList = trainlabelList
65 | self.traindataArr = np.array(self.traindataList)
66 | self.trainlabelArr = np.array(self.trainlabelList)
67 |
68 |
69 | self.tree = self.build_C45tree(self.traindataArr, self.trainlabelArr)
70 |
71 |
72 | def calculate_empirical_entropy(self, trainLabelArr):
73 | '''
74 | Compute the empirical entropy of the training set; the formula follows Li Hang's "Statistical Learning Methods"
75 | :param trainLabelArr: labels in numpy format
76 | :return: the empirical entropy of the training set
77 | '''
78 | # initialize the empirical entropy to 0
79 | H_D = 0
80 | # why not use self.num_classes directly? I wrote it that way at first,
81 | # but if a class is absent from a later subset, log(0) raises an error (the set trick follows the reference implementation linked in README.md)
82 | labels = set([label for label in trainLabelArr])
83 | for label in labels:
84 |
85 | # count the samples of each class, as the formula requires
86 | num = trainLabelArr[trainLabelArr==label].size
87 | # proportion of the dataset occupied by this class
88 | p = num / trainLabelArr.size
89 | # accumulate the empirical entropy
90 | H_D += -1 *(p) * np.log2(p)
91 |
92 | return H_D
93 |
94 | def calculate_HDA(self, traindataArr, A):
95 | '''
96 | Compute the entropy of the dataset with respect to feature A, H_A(D)
97 | :param traindataArr: training data, numpy format
98 | :param A: the feature A
99 | :return: the entropy H_A(D)
100 | '''
101 | HDA = 0
102 | features = set([feature for feature in traindataArr[:,A]])
103 | for feature in features:
104 | if traindataArr[:, A][traindataArr[:, A]== feature].size == 0:
105 | print(traindataArr, traindataArr.shape, features)
106 | p = traindataArr[:, A][traindataArr[:, A]== feature].size / traindataArr[:, A].size
107 | if p == 1:  # a single-valued feature gives H_A(D) = 0; 1 is substituted here to dodge the zero denominator noted in the module docstring
108 | HDA = 1
109 | else:
110 | HDA += -1 * p * np.log2(p)
111 |
112 | return HDA
113 |
114 |
115 |
116 | def calculate_empirical_conditional_entropy(self, trainfeatureArr, trainlabelarr):
117 | '''
118 | Compute the empirical conditional entropy
119 | :param trainfeatureArr: a single feature column extracted from the dataset, numpy format
120 | :param trainlabelarr: labels in numpy format
121 | :return: the empirical conditional entropy
122 | '''
123 |
124 | # the conditional entropy is computed per feature, analyzing every value the feature takes
125 | # as with the empirical entropy, a set enumerates the feature's distinct values
126 | features = set([feature for feature in trainfeatureArr])
127 | H_D_A = 0
128 | for feature in features:
129 | # count the samples taking this value
130 | Di = trainfeatureArr[trainfeatureArr == feature].size
131 | Di_D = Di / trainfeatureArr.size
132 |
133 | # conditional entropy contribution when the chosen feature takes this value
134 |
135 | H_D_A += Di_D * self.calculate_empirical_entropy(trainlabelarr[trainfeatureArr == feature])
136 |
137 | return H_D_A
138 |
139 |
140 |
141 |
142 | def calculate_information_gain_ratio(self, traindataArr, trainlabelArr):
143 | '''
144 | :param traindataArr: current dataset as a numpy array; it keeps shrinking as branching deepens the tree
145 | :param trainlabelArr: labels of the current dataset, numpy format
146 | Compute the maximum information gain ratio
147 | :return: the maximum gain ratio and its corresponding feature.
148 | '''
149 | # number of features in the current dataset
150 | num_features = traindataArr.shape[1]
151 | max_feature, max_gain = 0, 0
152 | # empirical entropy of the current dataset
153 | H_D = self.calculate_empirical_entropy(trainlabelArr)
154 | # empirical conditional entropy of each feature
155 | for i in range(num_features):
156 | trainfeatureArr = traindataArr[:,i]
157 | H_D_i = self.calculate_empirical_conditional_entropy(trainfeatureArr, trainlabelArr)
158 | G_D_A = H_D - H_D_i
159 | H_A_D = self.calculate_HDA(traindataArr, i)
160 | # if H_A_D == 0: return
161 | gain = G_D_A / H_A_D
162 | if gain > max_gain:
163 | max_gain = gain
164 | max_feature = i
165 | # return the maximum gain ratio and its feature
166 | return max_feature, max_gain
167 |
168 |
169 |
170 | def updateDataSet(self, traindataArr, trainlabelArr, A, a):
171 | '''
172 | While the tree is being built, the dataset must be updated on the fly
173 | :param traindataArr: dataset to update, numpy format
174 | :param trainlabelArr: labels to update, numpy format
175 | :param A: the feature to remove
176 | :param a: for the removed feature A, samples whose value is a are the ones to keep (e.g. if the feature is "has a job" and a means employed,
177 | then all employed samples are kept)
178 | :return: the new dataset and labels, numpy format
179 | '''
180 | newdataArr = np.delete(traindataArr[traindataArr[:,A] == a], A, axis=1)
181 | newlabelArr = trainlabelArr[traindataArr[:,A] == a]
182 | return newdataArr, newlabelArr
183 |
184 |
185 | def majorClass(self, trainlabelArr):
186 | '''
187 | Find the most frequent class among the labels
188 | :param trainlabelArr: training labels, numpy format
189 | :return: the majority class
190 | '''
191 | label = list(trainlabelArr)
192 | return max(label, key=label.count)
193 |
194 |
195 | def build_C45tree(self, traindataArr, trainlabelArr):
196 | '''
197 | Recursively build the decision tree on the dataset
198 | :param traindataArr: the dataset rooted at the current node, numpy
199 | :param trainlabelArr: the labels rooted at the current node, numpy
200 | :return: the value of the node
201 | '''
202 | epsilon = 0.1
203 |
204 |
205 | # logging.info('Starting create a new Node. Now there are {} samples'.format(trainlabelArr.size))
206 |
207 |
208 | classDict = set(trainlabelArr)
209 | # print(classDict)
210 |
211 | if len(classDict) == 1:
212 | return int(classDict.pop())
213 | if len(traindataArr.shape) == 1:
214 |
215 | return self.majorClass(trainlabelArr)
216 |
217 | Ag, G_D_Ag_r = self.calculate_information_gain_ratio(traindataArr, trainlabelArr)
218 | # print(Ag, G_D_Ag_r)
219 | if G_D_Ag_r < epsilon:
220 | return self.majorClass(trainlabelArr)
221 |
222 | tree = {Ag:{}}
223 |
224 | features = set(feature for feature in traindataArr[:, Ag])
225 | for feature in features:
226 | a = int(feature)
227 | newdataArr, newlabelArr = self.updateDataSet(traindataArr, trainlabelArr, Ag, a)
228 |
229 | tree[Ag][a] = self.build_C45tree(newdataArr, newlabelArr)
230 | # print(tree)
231 | return tree
232 |
233 | def predict(self, testdataList):
234 | '''
235 | Use the finished decision tree to classify one test sample
236 | :param testdataList: one row of test data, list format
237 | :return: the class label
238 | '''
239 | tree = copy.deepcopy(self.tree)
240 | while True:
241 | if type(tree).__name__ != 'dict':
242 | return tree
243 | # print(tree.items())
244 | (key, value), = tree.items()
245 |
246 | if type(tree[key]).__name__ == 'dict':
247 | dataval = testdataList[key]
248 |
249 | del testdataList[key]
250 | tree = value[dataval]
251 |
252 | if type(tree).__name__ != 'dict':
253 | return tree
254 |
255 | else:
256 | return value
257 |
258 |
259 |
260 | def testModel(self, testdataList, testlabelList):
261 | '''
262 | Evaluate the accuracy of the decision tree model
263 | :param testdataList: test data
264 | :param testlabelList: test labels
265 | :return: accuracy
266 | '''
267 | #
268 | correct_num = 0
269 |
270 | for i in range(len(testdataList)):
271 | prediction = self.predict(testdataList[i])
272 | if prediction == testlabelList[i]:
273 | correct_num += 1
274 |
275 | return round(correct_num/len(testlabelList), 4)
276 |
277 |
278 |
279 |
280 |
281 |
282 | if __name__ == '__main__':
283 |
284 | # configure a logging module to save the logs
285 | logging.basicConfig(level=logging.DEBUG,
286 | format='%(asctime)-12s %(levelname)-8s %(message)s',
287 | datefmt='%m-%d %H:%M',
288 | filename='C45_decision_tree.log',
289 | filemode='w')  # filemode defaults to 'a' (append); 'w' rewrites the file, discarding previous contents
290 | # add a handler that also prints to the console (StreamHandler, vs. FileHandler)
291 | console = logging.StreamHandler()
292 | console.setLevel(logging.INFO)
293 | # set the console output format
294 | formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
295 | console.setFormatter(formatter)
296 | # attach the handler to the root logger
297 | logging.getLogger('').addHandler(console)
298 |
299 | # log through the root logger
300 | logging.info('This is an info message.')
301 |
302 |
303 | start = time.time()
304 |
305 |
306 | # location of the MNIST dataset
307 | import os
308 | home = os.path.expanduser('~')
309 | # train_path = home + '/ML/mnist/mnist_train.csv'
310 | # test_path = home + '/ML/mnist/mnist_test.csv'
311 | train_path = home + '/ML/mnist/mnist_train_samples.csv'
312 | test_path = home + '/ML/mnist/mnist_test_samples.csv'
313 |
314 | # load the training and test sets
315 | logging.info('Loading data....')
316 |
317 | traindataArr, trainlabelArr =loadData(train_path)
318 | testdataArr, testlabelArr = loadData(test_path)
319 | logging.info('Loading data done.')
320 |
321 | logging.info('Building a decision tree.')
322 | C45 = C45DecisionTree(traindataArr, trainlabelArr)
323 |
324 | logging.info('Using decision tree to predict one sample.')
325 |
326 | prediction = C45.predict(testdataArr[0])
327 | logging.info('Testing processing Done,and the prediction and label are : ({},{})'.format(str(prediction), str(testlabelArr[0])))
328 |
329 | # evaluate the accuracy of the decision tree algorithm
330 | # only the first 200 test samples are used, to keep the runtime manageable
331 | logging.info('Testing the decision model.')
332 | accuracy = C45.testModel(testdataArr[:200], testlabelArr[:200])
333 |
334 |
335 | end = time.time()
336 |
337 | logging.info('accuracy:{}'.format(accuracy))
338 | logging.info('Total Time: {}'.format(round(end-start, 4)))
339 |
340 |
--------------------------------------------------------------------------------
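One way to sidestep the H_A(D) issue described in the module docstring of C4.5.py, sketched here as an assumption rather than the book's prescribed remedy: a feature whose split entropy is zero takes a single value on the current subset and cannot split the node, so it can simply be skipped inside `calculate_information_gain_ratio` instead of producing a zero denominator:

```python
# sketch of a guarded loop body for calculate_information_gain_ratio:
H_A_D = self.calculate_HDA(traindataArr, i)
if H_A_D == 0:
    continue              # single-valued feature: no split possible, skip it
gain = (H_D - H_D_i) / H_A_D
```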
/决策树/C45_decision_tree.log:
--------------------------------------------------------------------------------
1 | 07-27 15:47 INFO This is an info message.
2 | 07-27 15:47 INFO Loading data....
3 | 07-27 15:47 INFO Loading data done.
4 | 07-27 15:47 INFO Building a decision tree.
5 | 07-27 15:47 INFO Using decision tree to predict one sample.
6 | 07-27 15:47 INFO Testing processing Done,and the prediction and label are : (7,7)
7 | 07-27 15:47 INFO Testing the decision model.
8 | 07-27 15:47 INFO accuracy:0.4
9 | 07-27 15:47 INFO Total Time: 2
10 |
--------------------------------------------------------------------------------
/决策树/CART.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @time :2020/7/28
3 | # @IDE : pycharm
4 | # @author :lxztju
5 | # @github : https://github.com/lxztju
6 |
7 | '''
8 | Implements the CART classification tree
9 | The main difference from ID3 and C4.5 is that the Gini index is used to select features
10 | -----------
11 | no pruning
12 | '''
13 |
14 | import numpy as np
15 | import logging
16 | import time
17 | import copy
18 |
19 | def loadData(fileName):
20 | '''
21 | Load the MNIST dataset
22 | :param fileName: path of the dataset to load
23 | :return: dataset and labels as lists
24 | '''
25 | # lists holding the data and labels
26 | dataArr = []
27 | labelArr = []
28 | # open the file
29 | fr = open(fileName, 'r')
30 | # read the file line by line
31 | for line in fr.readlines():
32 | # split each line on the ',' delimiter, returning a list of fields
33 | curLine = line.strip().split(',')
34 |
35 | labelArr.append(int(curLine[0]))
36 | # binarize: pixels above 128 become 1, the rest become 0
37 | dataArr.append([int(int(num)>128) for num in curLine[1:]])
38 | # store the labels
39 | # [int(num) for num in curLine[1:]] -> convert every element except the first (the label) to int
40 | # [int(num)/255 for num in curLine[1:]] -> divide by 255 to normalize (optional step)
41 |
42 | # return data and label
43 | return dataArr, labelArr
44 |
45 |
46 |
47 |
48 | class CARTDecisionTree:
49 | def __init__(self, traindataList, trainlabelList):
50 | '''
51 | Initialize the decision tree class
52 | :param traindataList: training data in list format
53 | :param trainlabelList: training labels in list format
54 | '''
55 | self.traindataList = traindataList
56 | self.trainlabelList = trainlabelList
57 | self.traindataArr = np.array(self.traindataList)
58 | self.trainlabelArr = np.array(self.trainlabelList)
59 |
60 |
61 | self.tree = self.build_CARTtree(self.traindataArr, self.trainlabelArr)
62 | print(self.tree)
63 | def calculate_Gini(self, trainlabelArr):
64 | '''
65 | Compute the Gini index of dataset D
66 | # :param traindataArr: training data, numpy format
67 | :param trainlabelArr: training labels, numpy format
68 | :return: the Gini index
69 | '''
70 | D = trainlabelArr.size
71 | labels = set([label for label in trainlabelArr])
72 | Gini = 1
73 | for label in labels:
74 | Ck = trainlabelArr[trainlabelArr==label].size
75 | Gini -= ( Ck /D) ** 2
76 | return Gini
77 |
78 |
79 |
80 | def calculate_Gini_feature(self, trainfeatureArr, trainlabelArr, a):
81 | '''
82 | Compute the Gini index of the dataset given that feature A takes value a
83 | :param trainfeatureArr: one feature column of the training data, numpy format
84 | :param trainlabelArr: training labels, numpy format
85 | :param a: one particular value of feature A
86 | :return: the Gini index
87 | '''
88 | D1 = trainfeatureArr[trainfeatureArr == a].size
89 | D = trainfeatureArr.size
90 | D2 = D - D1
91 | d1 = trainlabelArr[trainfeatureArr == a]
92 | d2 = trainlabelArr[trainfeatureArr != a]
93 |
94 | Gini_D_A = abs(D1/D) * self.calculate_Gini(d1) + abs(D2/D) * self.calculate_Gini( d2 )
95 |
96 | return Gini_D_A
97 |
98 | def calculate_min_Gini(self, traindataArr, trainlabelArr):
99 | '''
100 | Find the smallest Gini index and the corresponding feature
101 | :param traindataArr: training data, numpy format
102 | :param trainlabelArr: training labels, numpy format
103 | :return: the smallest Gini index together with its feature and split value
104 | '''
105 | num_features = traindataArr.shape[1]
106 | min_Gini = float('inf')
107 | feature = -1
108 | v = -1
109 | for i in range(num_features):
110 | trainfeatureArr = traindataArr[:, i]
111 | values = set([value for value in trainfeatureArr])
112 | for value in values:
113 | gini = self.calculate_Gini_feature(trainfeatureArr, trainlabelArr, value)
114 | if gini < min_Gini:
115 | min_Gini = gini
116 | feature = i
117 | v = value
118 |
119 | return feature, v, min_Gini
120 |
121 |
122 |
123 |
124 | def updateDataSetleft(self, traindataArr, trainlabelArr, A, a):
125 | '''
126 | While the tree is being built, the dataset must be updated on the fly
127 | :param traindataArr: dataset to update, numpy format
128 | :param trainlabelArr: labels to update, numpy format
129 | :param A: the feature to remove
130 | :param a: for the removed feature A, samples whose value is a are the ones to keep (e.g. if the feature is "has a job" and a means employed,
131 | then all employed samples are kept)
132 | :return: the new dataset and labels, numpy format
133 | '''
134 | newdataArr = np.delete(traindataArr[traindataArr[:,A] == a], A, axis=1)
135 | newlabelArr = trainlabelArr[traindataArr[:,A] == a]
136 | return newdataArr, newlabelArr
137 |
138 |
139 | def updateDataSetright(self, traindataArr, trainlabelArr, A, a):
140 | '''
141 | While the tree is being built, the dataset must be updated on the fly
142 | :param traindataArr: dataset to update, numpy format
143 | :param trainlabelArr: labels to update, numpy format
144 | :param A: the feature to remove
145 | :param a: this is the complementary split: samples whose value of feature A differs from a are kept
146 | (the left split above keeps the samples equal to a)
147 | :return: the new dataset and labels, numpy format
148 | '''
149 | newdataArr = np.delete(traindataArr[traindataArr[:,A] != a], A, axis=1)
150 | newlabelArr = trainlabelArr[traindataArr[:,A] != a]
151 | return newdataArr, newlabelArr
152 |
153 |
154 | def majorClass(self, trainlabelArr):
155 | '''
156 | Find the most frequent class among the labels
157 | :param trainlabelArr: training labels, numpy format
158 | :return: the majority class
159 | '''
160 | label = list(trainlabelArr)
161 | return max(label, key=label.count)
162 |
163 |
164 | def build_CARTtree(self, traindataArr, trainlabelArr):
165 | '''
166 | Recursively build the decision tree on the dataset
167 | :param traindataArr: the dataset rooted at the current node, numpy
168 | :param trainlabelArr: the labels rooted at the current node, numpy
169 | :return: the value of the node
170 | '''
171 | # threshold on the Gini index
172 | epsilon = 0.1
173 | # minimum number of samples a node needs before it is split further
174 | node_thresh = 5
175 |
176 |
177 | # logging.info('Starting create a new Node. Now there are {} samples'.format(trainlabelArr.size))
178 |
179 | # inspect the classes present; if only one class remains, return it
180 | classDict = set(trainlabelArr)
181 | # print(classDict)
182 | if len(classDict) == 1:
183 | return int(classDict.pop())
184 | # print(traindataArr.shape)
185 | # if no features are left to split on, return the majority class of these samples
186 | if len(traindataArr.shape) == 1:
187 | return self.majorClass(trainlabelArr)
188 | # find the smallest Gini index together with its feature and split value
189 | Ag, a, Gini = self.calculate_min_Gini(traindataArr, trainlabelArr)
190 | # print(Ag, Gini)
191 | # if the Gini index falls below the threshold, the node is pure enough: return the majority class without splitting
192 | if Gini < epsilon:
193 | return self.majorClass(trainlabelArr)
194 |
195 | if trainlabelArr.size < node_thresh:
196 | return self.majorClass(trainlabelArr)
197 |
198 | tree = {Ag:{}}
199 | # build the decision tree recursively
200 |
201 |
202 | newdataArrleft, newlabelArrleft = self.updateDataSetleft(traindataArr, trainlabelArr, Ag, a)
203 | newdataArrright, newlabelArrright = self.updateDataSetright(traindataArr, trainlabelArr, Ag, a)
204 | # print(newlabelArrleft.size, newlabelArrright.size, trainlabelArr.size)
205 | if newlabelArrleft.size > 0:
206 | tree[Ag][a] = {'left': self.build_CARTtree(newdataArrleft, newlabelArrleft)}
207 | if newlabelArrright.size > 0:
208 | tree[Ag][a]['right'] = self.build_CARTtree(newdataArrright, newlabelArrright)
209 |
210 | # print(tree)
211 | return tree
212 |
213 | def predict(self, testdataList):
214 | '''
215 | Use the finished decision tree to classify one test sample
216 | :param testdataList: one row of test data, list format
217 | :return: the class label
218 | '''
219 | tree = copy.deepcopy(self.tree)
220 | # print(tree)
221 | while True:
222 | if type(tree).__name__ != 'dict':
223 | return tree
224 | # print(tree.items())
225 | (key, value), = tree.items()
226 |
227 | if type(tree[key]).__name__ == 'dict':
228 | dataval = testdataList[key]
229 |
230 | del testdataList[key]
231 |
232 | k = list(value.keys())
233 | if dataval not in k:
234 | tree = value[k[0]]['right']
235 | else:
236 | tree = value[dataval]['left']
237 |
238 | if type(tree).__name__ != 'dict':
239 | return tree
240 |
241 | else:
242 | return value
243 |
244 |
245 |
246 | def testModel(self, testdataList, testlabelList):
247 | '''
248 | Evaluate the accuracy of the decision tree model
249 | :param testdataList: test data
250 | :param testlabelList: test labels
251 | :return: accuracy
252 | '''
253 | #
254 | correct_num = 0
255 |
256 | for i in range(len(testdataList)):
257 | prediction = self.predict(testdataList[i])
258 | if prediction == testlabelList[i]:
259 | correct_num += 1
260 |
261 | return round(correct_num/len(testlabelList), 4)
262 |
263 |
264 |
265 |
266 |
267 |
268 | if __name__ == '__main__':
269 |
270 | # configure a logging module to save the logs
271 | logging.basicConfig(level=logging.DEBUG,
272 | format='%(asctime)-12s %(levelname)-8s %(message)s',
273 | datefmt='%m-%d %H:%M',
274 | filename='CART_decision_tree.log',
275 | filemode='w')  # filemode defaults to 'a' (append); 'w' rewrites the file, discarding previous contents
276 | # add a handler that also prints to the console (StreamHandler, vs. FileHandler)
277 | console = logging.StreamHandler()
278 | console.setLevel(logging.INFO)
279 | # set the console output format
280 | formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
281 | console.setFormatter(formatter)
282 | # attach the handler to the root logger
283 | logging.getLogger('').addHandler(console)
284 |
285 | # log through the root logger
286 | logging.info('This is an info message.')
287 |
288 |
289 | start = time.time()
290 |
291 |
292 | # location of the mnist data set
293 | import os
294 | home = os.path.expanduser('~')
295 | # train_path = home + '/ML/mnist/mnist_train.csv'
296 | # test_path = home + '/ML/mnist/mnist_test.csv'
297 | train_path = home + '/ML/mnist/mnist_train_samples.csv'
298 | test_path = home + '/ML/mnist/mnist_test_samples.csv'
299 |
300 | # load the training and test sets
301 | logging.info('Loading data....')
302 |
303 | traindataArr, trainlabelArr = loadData(train_path)
304 | testdataArr, testlabelArr = loadData(test_path)
305 | logging.info('Loading data done.')
306 |
307 | logging.info('Building a decision tree.')
308 | CART = CARTDecisionTree(traindataArr, trainlabelArr)
309 |
310 | logging.info('Using decision tree to predict one sample.')
311 |
312 | prediction = CART.predict(testdataArr[0])
313 | logging.info('Testing process done, and the prediction and label are: ({},{})'.format(str(prediction), str(testlabelArr[0])))
314 |
315 | # evaluate the accuracy of the decision tree
316 | # only the first 200 test samples are used, to keep the running time down
317 | logging.info('Testing the decision model.')
318 | accuracy = CART.testModel(testdataArr[:200], testlabelArr[:200])
319 |
320 |
321 | end = time.time()
322 |
323 | logging.info('accuracy:{}'.format(accuracy))
324 | logging.info('Total Time: {}'.format(round(end-start, 4)))
325 |
326 |
--------------------------------------------------------------------------------
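The tree construction above selects its splits by minimizing the Gini index: Gini(D) = 1 - sum_k p_k^2, and for a binary split on feature A at value a, Gini(D, A=a) = |D1|/|D| * Gini(D1) + |D2|/|D| * Gini(D2). A minimal standalone sketch of those two formulas (the helper names here are made up for illustration, not taken from CART.py):

    import numpy as np

    def gini(labels):
        # Gini(D) = 1 - sum_k p_k^2 over the class proportions p_k
        _, counts = np.unique(labels, return_counts=True)
        p = counts / labels.size
        return float(1.0 - np.sum(p ** 2))

    def gini_of_split(feature_column, labels, a):
        # weighted Gini of the binary split "feature == a" vs. "feature != a",
        # the quantity that a min-Gini search such as calculate_min_Gini scans
        left = labels[feature_column == a]
        right = labels[feature_column != a]
        w_left = left.size / labels.size
        return w_left * gini(left) + (1 - w_left) * gini(right)

    # a pure node scores 0, a 50/50 binary node scores 0.5
    assert gini(np.array([1, 1, 1])) == 0.0
    assert gini(np.array([0, 1])) == 0.5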
/决策树/CART_decision_tree.log:
--------------------------------------------------------------------------------
1 | 07-28 16:08 INFO This is an info message.
2 | 07-28 16:08 INFO Loading data....
3 | 07-28 16:08 INFO Loading data done.
4 | 07-28 16:08 INFO Building a decision tree.
5 | 07-28 16:08 INFO Using decision tree to predict one sample.
6 | 07-28 16:08 INFO Testing processing Done,and the prediction and label are : (7,7)
7 | 07-28 16:08 INFO Testing /the decision model.
8 | 07-28 16:08 INFO accuracy:0.5
9 | 07-28 16:08 INFO Total Time: 1
10 |
--------------------------------------------------------------------------------
/决策树/ID3.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @time :2020/7/25
3 | # @IDE : pycharm
4 | # @author :lxztju
5 | # @github : https://github.com/lxztju
6 |
7 | '''
8 | Build a decision tree
9 | ID3 implementation of a decision tree (no pruning)
10 | ID3 uses information gain as the feature-selection criterion
11 | '''
12 |
13 | import numpy as np
14 | import logging
15 | import time
16 | import copy
17 |
18 | def loadData(fileName):
19 | '''
20 | Load the Mnist data set
21 | :param fileName: path of the data set to load
22 | :return: data set and labels as lists
23 | '''
24 | # lists holding the data and the labels
25 | dataArr = []
26 | labelArr = []
27 | # open the file
28 | fr = open(fileName, 'r')
29 | # read the file line by line
30 | for line in fr.readlines():
31 | # split each line on the ',' delimiter, returning a list of fields
32 | curLine = line.strip().split(',')
33 |
34 | labelArr.append(int(curLine[0]))
35 | # binarize: pixels greater than 128 become 1, the rest become 0
36 | dataArr.append([int(int(num)>128) for num in curLine[1:]])
37 | # store the labels
38 | # [int(num) for num in curLine[1:]] -> converts every element except the first (the label) to int
39 | # [int(num)/255 for num in curLine[1:]] -> divides every value by 255 to normalize (optional step)
40 |
41 | # return data and label
42 | return dataArr, labelArr
43 |
44 |
45 |
46 |
47 | class ID3DecisionTree:
48 | def __init__(self, traindataList, trainlabelList):
49 | '''
50 | Initialize the decision tree class
51 | :param traindataList: training data as a list
52 | :param trainlabelList: training labels as a list
53 | '''
54 | self.traindataList = traindataList
55 | self.trainlabelList = trainlabelList
56 | self.traindataArr = np.array(self.traindataList)
57 | self.trainlabelArr = np.array(self.trainlabelList)
58 |
59 |
60 | self.tree = self.build_ID3tree(self.traindataArr, self.trainlabelArr)
61 |
62 |
63 | def calculate_empirical_entropy(self, trainLabelArr):
64 | '''
65 | Compute the empirical entropy of the training labels (formulas follow Li Hang's "Statistical Learning Methods")
66 | :param trainLabelArr: labels in numpy format
67 | :return: the empirical entropy of the training set
68 | '''
69 | # initialize the empirical entropy to 0
70 | H_D = 0
71 | # self.num_classes is deliberately not used here (that was the first attempt): if some class
72 | # does not appear in the current subset, log(0) would raise an error, so iterate over a set of the labels instead (see the set-based implementation linked in README.md)
73 | labels = set([label for label in trainLabelArr])
74 | for label in labels:
75 |
76 | # count the samples of each class, as the formula requires
77 | num = trainLabelArr[trainLabelArr==label].size
78 | # fraction of the data set occupied by this class
79 | p = num / trainLabelArr.size
80 | # accumulate the empirical entropy
81 | H_D += -1 *(p) * np.log2(p)
82 |
83 | return H_D
84 |
85 |
86 |
87 |
88 | def calculate_empirical_conditional_entropy(self, trainfeatureArr, trainlabelarr):
89 | '''
90 | Compute the empirical conditional entropy
91 | :param trainfeatureArr: one feature column extracted from the data set, numpy format
92 | :param trainlabelarr: labels in numpy format
93 | :return: the empirical conditional entropy
94 | '''
95 |
96 | # the conditional entropy is computed feature by feature; as with the empirical
97 | # entropy, use a set to enumerate the distinct values the feature takes
98 | features = set([feature for feature in trainfeatureArr])
99 | H_D_A = 0
100 | for feature in features:
101 | # number of samples taking this feature value
102 | Di = trainfeatureArr[trainfeatureArr == feature].size
103 | Di_D = Di / trainfeatureArr.size
104 |
105 | # accumulate the weighted entropy of the labels restricted to this feature value
106 |
107 | H_D_A += Di_D * self.calculate_empirical_entropy(trainlabelarr[trainfeatureArr == feature])
108 |
109 | return H_D_A
110 |
111 |
112 | def calculate_information_gain(self, traindataArr, trainlabelArr):
113 | '''
114 | Compute the maximum information gain
115 | :param traindataArr: current data set, numpy format; it shrinks as branching makes the tree deeper
116 | :param trainlabelArr: labels of the current data set, numpy format
117 | :return: the maximum information gain and the corresponding feature
118 | '''
119 | # number of features of the current data set
120 | num_features = traindataArr.shape[1]
121 | max_feature, max_G_D_A = 0, 0
122 | # empirical entropy of the current data set
123 | H_D = self.calculate_empirical_entropy(trainlabelArr)
124 | # empirical conditional entropy for each feature
125 | for i in range(num_features):
126 | trainfeatureArr = traindataArr[:,i]
127 | H_D_i = self.calculate_empirical_conditional_entropy(trainfeatureArr, trainlabelArr)
128 | G_D_A = H_D - H_D_i
129 | if G_D_A > max_G_D_A:
130 | max_G_D_A = G_D_A
131 | max_feature = i
132 | # return the maximum information gain and its feature
133 | return max_feature, max_G_D_A
134 |
135 |
136 | def updateDataSet(self, traindataArr, trainlabelArr, A, a):
137 | '''
138 | Update the data set at each split while the tree is being built
139 | :param traindataArr: data set to update, numpy format
140 | :param trainlabelArr: labels to update, numpy format
141 | :param A: the feature to remove
142 | :param a: samples whose feature A takes the value a are kept (for instance, if feature A
143 | is "has a job" and a means "yes", all samples with a job are retained)
144 | :return: the new data set and labels, numpy format
145 | '''
146 | newdataArr = np.delete(traindataArr[traindataArr[:,A] == a], A, axis=1)
147 | newlabelArr = trainlabelArr[traindataArr[:,A] == a]
148 | return newdataArr, newlabelArr
149 |
150 |
151 | def majorClass(self, trainlabelArr):
152 | '''
153 | Find the class that occurs most often in the labels
154 | :param trainlabelArr: labels of the training set, numpy format
155 | :return: the majority class
156 | '''
157 | label = list(trainlabelArr)
158 | return max(label, key=label.count)
159 |
160 |
161 | def build_ID3tree(self, traindataArr, trainlabelArr):
162 | '''
163 | Recursively build the decision tree on the given data set
164 | :param traindataArr: data set rooted at the current node, numpy
165 | :param trainlabelArr: labels of that data set, numpy
166 | :return: the value of the node
167 | '''
168 | # threshold on the information gain
169 | epsilon = 0.1
170 |
171 |
172 | # logging.info('Starting create a new Node. Now there are {} samples'.format(trainlabelArr.size))
173 |
174 | # if only one class remains in the current data set, return that class
175 | classDict = set(trainlabelArr)
176 | # print(classDict)
177 | if len(classDict) == 1:
178 | return int(classDict.pop())
179 | # print(traindataArr.shape)
180 | # if no features are left to split on, return the majority class of the remaining samples
181 | if len(traindataArr.shape) == 1:
182 | return self.majorClass(trainlabelArr)
183 | # compute the maximum information gain and the corresponding feature
184 | Ag, G_D_Ag = self.calculate_information_gain(traindataArr, trainlabelArr)
185 | # print(Ag, G_D_Ag)
186 | # if the maximum information gain is below the threshold, splitting is unnecessary: return the majority class
187 | if G_D_Ag < epsilon:
188 | return self.majorClass(trainlabelArr)
189 |
190 | tree = {Ag:{}}
191 | # recursively build a subtree for every value of the chosen feature
192 | features = set(feature for feature in traindataArr[:, Ag])
193 | for feature in features:
194 | a = int(feature)
195 | newdataArr, newlabelArr = self.updateDataSet(traindataArr, trainlabelArr, Ag, a)
196 |
197 | tree[Ag][a] = self.build_ID3tree(newdataArr, newlabelArr)
198 | # print(tree)
199 | return tree
200 |
201 | def predict(self, testdataList):
202 | '''
203 | Predict the label of one test sample with the built tree
204 | :param testdataList: one test sample, list format
205 | :return: the predicted class
206 | '''
207 | tree = copy.deepcopy(self.tree)
208 | while True:
209 | if type(tree).__name__ != 'dict':
210 | return tree
211 | # print(tree.items())
212 | (key, value), = tree.items()
213 |
214 | if type(tree[key]).__name__ == 'dict':
215 | dataval = testdataList[key]
216 |
217 | del testdataList[key]  # drop the consumed feature so the remaining indices match the pruned training data
218 | tree = value[dataval]  # assumes this feature value occurred during training; an unseen value would raise a KeyError
219 |
220 | if type(tree).__name__ != 'dict':
221 | return tree
222 |
223 | else:
224 | return value
225 |
226 |
227 |
228 | def testModel(self, testdataList, testlabelList):
229 | '''
230 | Evaluate the accuracy of the decision tree model
231 | :param testdataList: test set data
232 | :param testlabelList: labels of the test set
233 | :return: accuracy
234 | '''
235 | # count how many test samples are predicted correctly
236 | correct_num = 0
237 |
238 | for i in range(len(testdataList)):
239 | prediction = self.predict(testdataList[i])
240 | if prediction == testlabelList[i]:
241 | correct_num += 1
242 |
243 | return round(correct_num/len(testlabelList), 4)
244 |
245 |
246 |
247 |
248 |
249 |
250 | if __name__ == '__main__':
251 |
252 | # configure a logging module to save the log
253 | logging.basicConfig(level=logging.DEBUG,
254 | format='%(asctime)-12s %(levelname)-8s %(message)s',
255 | datefmt='%m-%d %H:%M',
256 | filename='ID3_decision_tree.log',
257 | filemode='w') # filemode defaults to 'a' (append); 'w' rewrites the file, discarding its previous contents
258 | # add a handler that also prints log records to the console (StreamHandler vs. FileHandler)
259 | console = logging.StreamHandler()
260 | console.setLevel(logging.INFO)
261 | # set the console output format
262 | formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
263 | console.setFormatter(formatter)
264 | # attach the handler to the root logger
265 | logging.getLogger('').addHandler(console)
266 |
267 | # log a message through the root logger
268 | logging.info('This is an info message.')
269 |
270 |
271 | start = time.time()
272 |
273 |
274 | # location of the mnist data set
275 | import os
276 | home = os.path.expanduser('~')
277 | train_path = home + '/ML/mnist/mnist_train.csv'
278 | test_path = home + '/ML/mnist/mnist_test.csv'
279 | # train_path = home + '/ML/mnist/mnist_train_samples.csv'
280 | # test_path = home + '/ML/mnist/mnist_test_samples.csv'
281 |
282 | # load the training and test sets
283 | logging.info('Loading data....')
284 |
285 | traindataArr, trainlabelArr = loadData(train_path)
286 | testdataArr, testlabelArr = loadData(test_path)
287 | logging.info('Loading data done.')
288 |
289 | logging.info('Building a decision tree.')
290 | ID3 = ID3DecisionTree(traindataArr, trainlabelArr)
291 |
292 | logging.info('Using decision tree to predict one sample.')
293 |
294 | prediction = ID3.predict(testdataArr[0])
295 | logging.info('Testing process done, and the prediction and label are: ({},{})'.format(str(prediction), str(testlabelArr[0])))
296 |
297 | # evaluate the accuracy of the decision tree
298 | # only the first 200 test samples are used, to keep the running time down
299 | logging.info('Testing the decision model.')
300 | accuracy = ID3.testModel(testdataArr[:200], testlabelArr[:200])
301 |
302 |
303 | end = time.time()
304 |
305 | logging.info('accuracy:{}'.format(accuracy))
306 | logging.info('Total Time: {}'.format(round(end-start, 4)))
307 |
308 |
--------------------------------------------------------------------------------
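To make the quantities above concrete: for the labels [0, 0, 1, 1] the empirical entropy is H(D) = -(0.5*log2 0.5 + 0.5*log2 0.5) = 1 bit; a feature that separates the two classes exactly has conditional entropy H(D|A) = 0 and gain g(D, A) = 1, while a feature independent of the labels has gain 0. A standalone sketch of g(D, A) = H(D) - H(D|A) (helper names invented for illustration):

    import numpy as np

    def entropy(labels):
        # H(D) = -sum_k p_k * log2(p_k)
        _, counts = np.unique(labels, return_counts=True)
        p = counts / labels.size
        return float(-np.sum(p * np.log2(p)))

    def information_gain(feature_column, labels):
        # g(D, A) = H(D) - sum_v |D_v|/|D| * H(D_v)
        cond = 0.0
        for v in np.unique(feature_column):
            subset = labels[feature_column == v]
            cond += (subset.size / labels.size) * entropy(subset)
        return entropy(labels) - cond

    labels = np.array([0, 0, 1, 1])
    print(information_gain(np.array([0, 0, 1, 1]), labels))  # 1.0: splits the classes exactly
    print(information_gain(np.array([0, 1, 0, 1]), labels))  # 0.0: independent of the labels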
/决策树/ID3_decision_tree.log:
--------------------------------------------------------------------------------
1 | 07-27 15:51 INFO This is an info message.
2 | 07-27 15:51 INFO Loading data....
3 | 07-27 15:51 INFO Loading data done.
4 | 07-27 15:51 INFO Building a decision tree.
5 | 07-27 15:54 INFO Using decision tree to predict one sample.
6 | 07-27 15:54 INFO Testing processing Done,and the prediction and label are : (7,7)
7 | 07-27 15:54 INFO Testing the decision model.
8 | 07-27 15:54 INFO accuracy:0.87
9 | 07-27 15:54 INFO Total Time: 176
10 |
--------------------------------------------------------------------------------
/感知机算法/perceptron.log:
--------------------------------------------------------------------------------
1 | 07-24 15:23 INFO this is an info message.
2 | 07-24 15:23 INFO Loading data....
3 | 07-24 15:23 INFO Loading data done.
4 | 07-24 15:23 INFO Start training...
5 | 07-24 15:23 INFO train data shape is:(12665,784)
6 | 07-24 15:23 INFO Iteration:0 / 50
7 | 07-24 15:23 INFO Iteration:1 / 50
8 | 07-24 15:23 INFO Iteration:2 / 50
9 | 07-24 15:23 INFO Iteration:3 / 50
10 | 07-24 15:23 INFO Iteration:4 / 50
11 | 07-24 15:23 INFO Iteration:5 / 50
12 | 07-24 15:23 INFO Iteration:6 / 50
13 | 07-24 15:23 INFO Iteration:7 / 50
14 | 07-24 15:23 INFO Iteration:8 / 50
15 | 07-24 15:23 INFO Iteration:9 / 50
16 | 07-24 15:23 INFO Iteration:10 / 50
17 | 07-24 15:23 INFO Iteration:11 / 50
18 | 07-24 15:23 INFO Iteration:12 / 50
19 | 07-24 15:23 INFO Iteration:13 / 50
20 | 07-24 15:23 INFO Iteration:14 / 50
21 | 07-24 15:23 INFO Iteration:15 / 50
22 | 07-24 15:23 INFO Iteration:16 / 50
23 | 07-24 15:23 INFO Iteration:17 / 50
24 | 07-24 15:23 INFO Iteration:18 / 50
25 | 07-24 15:23 INFO Iteration:19 / 50
26 | 07-24 15:23 INFO Iteration:20 / 50
27 | 07-24 15:23 INFO Iteration:21 / 50
28 | 07-24 15:23 INFO Iteration:22 / 50
29 | 07-24 15:23 INFO Iteration:23 / 50
30 | 07-24 15:23 INFO Iteration:24 / 50
31 | 07-24 15:23 INFO Iteration:25 / 50
32 | 07-24 15:23 INFO Iteration:26 / 50
33 | 07-24 15:23 INFO Iteration:27 / 50
34 | 07-24 15:23 INFO Iteration:28 / 50
35 | 07-24 15:23 INFO Iteration:29 / 50
36 | 07-24 15:23 INFO Iteration:30 / 50
37 | 07-24 15:23 INFO Iteration:31 / 50
38 | 07-24 15:23 INFO Iteration:32 / 50
39 | 07-24 15:23 INFO Iteration:33 / 50
40 | 07-24 15:23 INFO Iteration:34 / 50
41 | 07-24 15:23 INFO Iteration:35 / 50
42 | 07-24 15:23 INFO Iteration:36 / 50
43 | 07-24 15:23 INFO Iteration:37 / 50
44 | 07-24 15:23 INFO Iteration:38 / 50
45 | 07-24 15:23 INFO Iteration:39 / 50
46 | 07-24 15:23 INFO Iteration:40 / 50
47 | 07-24 15:23 INFO Iteration:41 / 50
48 | 07-24 15:23 INFO Iteration:42 / 50
49 | 07-24 15:23 INFO Iteration:43 / 50
50 | 07-24 15:23 INFO Iteration:44 / 50
51 | 07-24 15:23 INFO Iteration:45 / 50
52 | 07-24 15:23 INFO Iteration:46 / 50
53 | 07-24 15:23 INFO Iteration:47 / 50
54 | 07-24 15:23 INFO Iteration:48 / 50
55 | 07-24 15:23 INFO Iteration:49 / 50
56 | 07-24 15:23 INFO Training done.
57 | 07-24 15:23 INFO Testing this model.
58 | 07-24 15:23 INFO accuracy:0.9916
59 | 07-24 15:23 INFO Total Time:18.173146724700928
60 |
--------------------------------------------------------------------------------
/感知机算法/perceptron.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @time :2020/7/23
3 | # @IDE : pycharm
4 | # @author :lxztju
5 | # @github : https://github.com/lxztju
6 |
7 |
8 | '''
9 | mnist has ten classes; since the perceptron is a binary classifier,
10 | only the samples of classes 0 and 1 are selected for training
11 | '''
12 |
13 | import numpy as np
14 | import time
15 | import logging
16 |
17 |
18 | def loadData(fileName):
19 | '''
20 | Load the Mnist data set
21 | :param fileName: path of the data set to load
22 | :return: data set and labels as lists
23 | '''
24 | # lists holding the data and the labels
25 | dataArr = []
26 | labelArr = []
27 | # open the file
28 | fr = open(fileName, 'r')
29 | # read the file line by line
30 | for line in fr.readlines():
31 | # split each line on the ',' delimiter, returning a list of fields
32 | curLine = line.strip().split(',')
33 |
34 | # Mnist has the ten labels 0-9; for this binary task only classes 0 and 1 are kept, as the positive and negative class
35 | # if int(curLine[0]) != 0 or int(curLine[0]) !=1: continue
36 | if int(curLine[0]) == 0 or int(curLine[0]) == 1:
37 | if int(curLine[0]) == 0:
38 | labelArr.append(1)
39 | else:
40 | labelArr.append(-1)
41 | dataArr.append([int(num) / 255 for num in curLine[1:]])
42 | # store the labels
43 | # [int(num) for num in curLine[1:]] -> converts every element except the first (the label) to int
44 | # [int(num)/255 for num in curLine[1:]] -> divides every value by 255 to normalize (optional step)
45 | # dataArr.append([int(num)/255 for num in curLine[1:]])
46 |
47 | # return data and label
48 | return dataArr, labelArr
49 |
50 |
51 | class Perceptron:
52 | def __init__(self):
53 | pass
54 |
55 |
56 |
57 |
58 |
59 |
60 | def perceptron(self, dataArr, labelArr, iters):
61 | '''
62 | Train the perceptron; the loss function is the number of misclassified points
63 | :param dataArr: training data, list format
64 | :param labelArr: training labels, list format
65 | :param iters: number of iterations (the data set may not be linearly separable, so a fixed iteration count is needed)
66 | :return: w, b, the parameters of the separating hyperplane
67 | '''
68 |
69 | # convert the data to numpy matrices for convenient matrix algebra
70 | dataMat = np.mat(dataArr)
71 | labelMat = np.mat(labelArr).T
72 | # print(dataMat.shape)
73 | # print(labelMat.shape)
74 | # dimensions of the training data
75 | m, n = dataMat.shape
76 | logging.info('train data shape is:({},{})'.format(m,n))
77 |
78 | # initialize w and b
79 | W = np.random.randn(1, n)
80 | b = 0
81 |
82 | # set the learning rate (step size)
83 | lr = 0.0001
84 |
85 | # iterative training
86 | for iteration in range(iters):
87 |
88 | # update the weights with SGD: each step picks one misclassified sample and updates w and b
89 | # there are m samples in total
90 | for i in range(m):
91 | # pick one sample
92 | xi = dataMat[i]
93 | yi = labelMat[i]
94 | # if it is classified correctly, move on to the next sample
95 | if yi * (W * xi.T + b) > 0: continue
96 | # a misclassified sample was found: update the model parameters
97 | W = W + lr * yi * xi
98 | b = b + lr * yi
99 |
100 | logging.info("Iteration:{} / {}".format(iteration, iters))
101 |
102 | return W, b
103 |
104 |
105 |
106 | def testPerceptron(self, dataArr, labelArr, W, b):
107 | '''
108 | Evaluate the accuracy of the trained perceptron
109 | :param dataArr: test data, list format
110 | :param labelArr: test labels, list format
111 | :param W: normal vector of the separating hyperplane
112 | :param b: bias of the perceptron
113 | :return: accuracy of the perceptron on the test set
114 | '''
115 |
116 | # convert the data to numpy matrices for convenient matrix algebra
117 | dataMat = np.mat(dataArr)
118 | labelMat = np.mat(labelArr).T
119 |
120 | # dimensions of the test set
121 | m, n = dataMat.shape
122 |
123 | # number of correctly classified samples
124 | correct_num = 0
125 |
126 | # go through every test sample and count the correctly classified ones
127 | for i in range(m):
128 | xi = dataMat[i]
129 | yi = labelMat[i]
130 |
131 | if (W * xi.T + b) * yi > 0:
132 | correct_num += 1
133 |
134 | return round(correct_num/m, 4)
135 |
136 |
137 |
138 |
139 |
140 |
141 | if __name__ == '__main__':
142 |
143 |
144 | # configure a logging module to save the log
145 | logging.basicConfig(level=logging.DEBUG,
146 | format='%(asctime)-12s %(levelname)-8s %(message)s',
147 | datefmt='%m-%d %H:%M',
148 | filename='perceptron.log',
149 | filemode='w') # filemode defaults to 'a' (append); 'w' rewrites the file, discarding its previous contents
150 | # add a handler that also prints log records to the console (StreamHandler vs. FileHandler)
151 | console = logging.StreamHandler()
152 | console.setLevel(logging.INFO)
153 | # set the console output format
154 | formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
155 | console.setFormatter(formatter)
156 | # attach the handler to the root logger
157 | logging.getLogger('').addHandler(console)
158 |
159 | # log a message through the root logger
160 | logging.info('this is an info message.')
161 |
162 | ######################################################
163 |
164 | start = time.time()
165 |
166 |
167 | # location of the mnist data set
168 | import os
169 | home = os.path.expanduser('~')
170 | train_path = home + '/ML/mnist/mnist_train.csv'
171 | test_path = home + '/ML/mnist/mnist_test.csv'
172 |
173 | # load the training and test sets
174 | logging.info('Loading data....')
175 |
176 |
177 |
178 | p = Perceptron()
179 |
180 | train_data_array, train_label_array = loadData(train_path)
181 | test_data_array, test_label_array = loadData(test_path)
182 | logging.info('Loading data done.')
183 |
184 | # train the perceptron
185 | logging.info('Start training...')
186 | iters = 50
187 | w, b = p.perceptron(train_data_array, train_label_array, iters)
188 | logging.info('Training done.')
189 |
190 | # evaluate the accuracy of the perceptron
191 | logging.info('Testing this model.')
192 | accuracy = p.testPerceptron(test_data_array, test_label_array, w, b)
193 |
194 | end = time.time()
195 |
196 | logging.info('accuracy:{}'.format(accuracy))
197 | logging.info('Total Time:{}'.format(end-start))
--------------------------------------------------------------------------------
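The entire learning rule in perceptron.py is the pair of updates W = W + lr*yi*xi and b = b + lr*yi, applied whenever yi*(W.xi + b) is not positive. A standalone restatement on a toy separable set (zero initialization and lr = 1 so convergence is quick; the class above uses random initialization and lr = 0.0001):

    import numpy as np

    def train_perceptron(X, y, lr=1.0, epochs=10):
        # on each misclassified point (y * (w.x + b) <= 0), move the hyperplane
        # toward it: w <- w + lr*y*x, b <- b + lr*y
        w = np.zeros(X.shape[1])
        b = 0.0
        for _ in range(epochs):
            for xi, yi in zip(X, y):
                if yi * (np.dot(w, xi) + b) <= 0:
                    w += lr * yi * xi
                    b += lr * yi
        return w, b

    X = np.array([[1.0, 1.0], [2.0, 1.0], [-1.0, -1.0], [-2.0, -1.0]])
    y = np.array([1, 1, -1, -1])
    w, b = train_perceptron(X, y)
    print(all(np.sign(X @ w + b) == y))  # True: the toy set is linearly separable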
/支持向量机/SVM.log:
--------------------------------------------------------------------------------
1 | 08-01 15:20 INFO This is an info message.
2 | 08-01 15:20 INFO Loading data....
3 | 08-01 15:20 INFO Loading data done.
4 | 08-01 15:20 INFO Training the SVM model....
5 | 08-01 15:20 INFO Construct The Gaussian Kernel: (0/1000).
6 | 08-01 15:20 INFO Construct The Gaussian Kernel: (100/1000).
7 | 08-01 15:20 INFO Construct The Gaussian Kernel: (200/1000).
8 | 08-01 15:20 INFO Construct The Gaussian Kernel: (300/1000).
9 | 08-01 15:20 INFO Construct The Gaussian Kernel: (400/1000).
10 | 08-01 15:20 INFO Construct The Gaussian Kernel: (500/1000).
11 | 08-01 15:20 INFO Construct The Gaussian Kernel: (600/1000).
12 | 08-01 15:20 INFO Construct The Gaussian Kernel: (700/1000).
13 | 08-01 15:20 INFO Construct The Gaussian Kernel: (800/1000).
14 | 08-01 15:20 INFO Construct The Gaussian Kernel: (900/1000).
15 | 08-01 15:20 INFO Iter:0/13
16 | 08-01 15:20 INFO Iter:1/13
17 | 08-01 15:20 INFO Training process is Done !!!!
18 | 08-01 15:20 INFO Predicting one sample ....
19 | 08-01 15:20 INFO The prediction and the ground truth is : (-1.0, -1)
20 | 08-01 15:20 INFO Testing processing: (0/2115) and the currect prediction:0
21 | 08-01 15:20 INFO Testing processing: (100/2115) and the currect prediction:98
22 | 08-01 15:20 INFO Testing processing: (200/2115) and the currect prediction:197
23 | 08-01 15:20 INFO Testing processing: (300/2115) and the currect prediction:297
24 | 08-01 15:20 INFO Testing processing: (400/2115) and the currect prediction:397
25 | 08-01 15:20 INFO Testing processing: (500/2115) and the currect prediction:495
26 | 08-01 15:20 INFO Testing processing: (600/2115) and the currect prediction:594
27 | 08-01 15:20 INFO Testing processing: (700/2115) and the currect prediction:693
28 | 08-01 15:20 INFO Testing processing: (800/2115) and the currect prediction:790
29 | 08-01 15:20 INFO Testing processing: (900/2115) and the currect prediction:888
30 | 08-01 15:20 INFO Testing processing: (1000/2115) and the currect prediction:987
31 | 08-01 15:20 INFO Testing processing: (1100/2115) and the currect prediction:1086
32 | 08-01 15:20 INFO Testing processing: (1200/2115) and the currect prediction:1186
33 | 08-01 15:20 INFO Testing processing: (1300/2115) and the currect prediction:1286
34 | 08-01 15:20 INFO Testing processing: (1400/2115) and the currect prediction:1385
35 | 08-01 15:20 INFO Testing processing: (1500/2115) and the currect prediction:1485
36 | 08-01 15:20 INFO Testing processing: (1600/2115) and the currect prediction:1584
37 | 08-01 15:20 INFO Testing processing: (1700/2115) and the currect prediction:1684
38 | 08-01 15:20 INFO Testing processing: (1800/2115) and the currect prediction:1784
39 | 08-01 15:20 INFO Testing processing: (1900/2115) and the currect prediction:1883
40 | 08-01 15:20 INFO Testing processing: (2000/2115) and the currect prediction:1983
41 | 08-01 15:20 INFO Testing processing: (2100/2115) and the currect prediction:2080
42 | 08-01 15:20 INFO accuracy:99.0544
43 | 08-01 15:20 INFO Total Time: 31
44 |
--------------------------------------------------------------------------------
/支持向量机/SVM.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @time :2020/7/30
3 | # @IDE : pycharm
4 | # @author :lxztju
5 | # @github : https://github.com/lxztju
6 | # @Emial : lxztju@163.com
7 |
8 | '''
9 | Python implementation of an SVM:
10 | a non-linear binary classifier with soft margin and kernel function
11 |
12 | trained with the SMO algorithm
13 | '''
14 |
15 | import numpy as np
16 | import logging
17 | import time
18 | import random
19 | import math
20 |
21 | def loadData(fileName):
22 | '''
23 | Load the Mnist data set
24 | :param fileName: path of the data set to load
25 | :return: data set and labels as lists
26 | '''
27 | # lists holding the data and the labels
28 | dataArr = []
29 | labelArr = []
30 | # open the file
31 | fr = open(fileName, 'r')
32 | # read the file line by line
33 | for line in fr.readlines():
34 | # split each line on the ',' delimiter, returning a list of fields
35 | curLine = line.strip().split(',')
36 |
37 | # Mnist has the ten labels 0-9; for this binary task only classes 0 and 1 are kept, as the positive and negative class
38 | # if int(curLine[0]) != 0 or int(curLine[0]) !=1: continue
39 | if int(curLine[0]) == 0 or int(curLine[0]) == 1:
40 | if int(curLine[0]) == 0:
41 | labelArr.append(1)
42 | else:
43 | labelArr.append(-1)
44 | dataArr.append([int(num) / 255 for num in curLine[1:]])
45 | # store the labels
46 | # [int(num) for num in curLine[1:]] -> converts every element except the first (the label) to int
47 | # [int(num)/255 for num in curLine[1:]] -> divides every value by 255 to normalize (optional step)
48 | # dataArr.append([int(num)/255 for num in curLine[1:]])
49 |
50 | # return data and label
51 | return dataArr, labelArr
52 |
53 |
54 |
55 | class SVM:
56 | def __init__(self, traindataList, trainlabelList, sigma = 10, C = 200, toler = 0.001):
57 | '''
58 | Initialize the parameters of the SVM class
59 | :param traindataList: training data, list format
60 | :param trainlabelList: training labels, list format
61 | :param sigma: parameter of the Gaussian kernel
62 | :param C: penalty parameter of the soft margin
63 | :param toler: numerical tolerance
64 | '''
65 | self.traindataArr = np.array(traindataList) # training data as a numpy array
66 | self.trainlabelArr = np.array(trainlabelList).T # training labels as a numpy array, transposed into a column vector
67 | self.m, self.n = self.traindataArr.shape # m is the number of training samples, n the number of features
68 |
69 | self.sigma = sigma # parameter of the Gaussian kernel
70 | self.C = C # penalty parameter of the soft margin
71 | self.toler = toler # numerical tolerance
72 | self.b = 0 # bias term of the SVM
73 | self.alpha = [1] * self.traindataArr.shape[0] # the alphas of the SVM dual problem
74 | self.kernel = self.calcKernel() # kernel matrix
75 | self.E = [self.calc_Ei(i) for i in range(self.m)] # the Ei values used during SMO
76 | # print(self.E)
77 | self.supportVecIndex = [] # indices of the support vectors
78 |
79 |
80 |
81 |
82 | def calcKernel(self):
83 | '''
84 | Compute the kernel matrix, using a Gaussian kernel
85 | :return: the Gaussian kernel matrix
86 | '''
87 |
88 | # the Gaussian kernel matrix has size m x m
89 | K = [[0] * self.m for _ in range(self.m)]
90 |
91 | # iterate over Xi, the x in the kernel formula
92 | for i in range(self.m):
93 |
94 | if i % 100 == 0:
95 | logging.info('Construct The Gaussian Kernel: ({}/{}).'.format(i, self.m))
96 |
97 | Xi = self.traindataArr[i]
98 | # iterate over Xj, the z in the formula
99 | for j in range(self.m):
100 | Xj = self.traindataArr[j]
101 | # compute ||xi-xj||^2
102 | diff = np.dot((Xi - Xj), (Xi - Xj).T)
103 | # fill in the Gaussian kernel entry
104 | K[i][j] = np.exp((-1/2) * (diff/(self.sigma ** 2 )))
105 |
106 | # return the Gaussian kernel
107 | return K
108 |
109 |
110 |
111 | def calc_gxi(self, i):
112 | '''
113 | Compute g(xi) according to formula 7.104 in the book
114 | :param i: index of x
115 | :return: the value of g(xi)
116 | '''
117 | gxi = 0
118 | for j in range(len(self.alpha)):
119 | gxi += self.alpha[j] * self.trainlabelArr[j] * self.kernel[i][j]
120 |
121 | return gxi + self.b
122 |
123 |
124 |
125 | def calc_Ei(self, i):
126 | '''
127 | Compute Ei = g(xi) - yi
128 | :param i: 下标
129 | :return: Ei
130 | '''
131 | gxi = self.calc_gxi(i)
132 | return gxi - self.trainlabelArr[i]
133 |
134 |
135 |
136 | def isSatisfyKKT(self, i):
137 | '''
138 | Check whether the i-th alpha satisfies the KKT conditions; in SMO
139 | the first variable is chosen as an alpha that violates the KKT conditions
140 | :param i: index of the alpha
141 | :return: True or False
142 | '''
143 | gxi = self.calc_gxi(i)
144 | yi = self.trainlabelArr[i]
145 | multiply = gxi * yi
146 | alpha_i = self.alpha[i]
147 |
148 | # the book tests alpha == 0 exactly; the comparison is relaxed here with the tolerance
149 | # if alpha_i == 0:
150 | if (abs(self.alpha[i]) < self.toler) and (multiply >= 1):
151 | return True
152 | # the remaining cases use the relaxed comparison as well
153 | # if alpha_i == self.C:
154 | if abs(self.alpha[i] - self.C) < self.toler and (multiply <= 1):
155 | return True
156 |
157 | #if 0 < alpha_i < self.C:
158 | if (self.alpha[i] > -self.toler) and (self.alpha[i] < (self.C + self.toler)) and (multiply < 1 + self.toler):
159 | return True
160 |
161 | return False
162 |
163 |
164 |
165 | def getAlpha(self):
166 | '''
167 | Choose the two variables for one SMO step
168 | :return: E1, E2, i, j
169 | '''
170 | # first scan the support-vector candidates (0 < alpha < C); if they all satisfy the KKT conditions, fall back to the whole data set
171 | index_list = [i for i in range(self.m) if 0 < self.alpha[i] < self.C]
172 | non_satisfy_list = [i for i in range(self.m) if i not in index_list]
173 | index_list.extend(non_satisfy_list)
174 |
175 | for i in index_list:
176 | if self.isSatisfyKKT(i):
177 | continue
178 | E1 = self.E[i]
179 |
180 | # if E1 is positive, take the smallest E as E2 so that |E1-E2| is as large as possible
181 | E = {k:v for v, k in enumerate(self.E)} # maps each E value to an index (duplicate values collapse to the last one)
182 | E_ = sorted(E.items(), key=lambda item: item[0])
183 |
184 | if E1 >= 0:
185 | j = E_[0][1]
186 | # if the chosen j equals i, take the next candidate; leaving i == j would break the later update
187 | if j == i:
188 | j = E_[1][1]
189 | # j = min(range(self.m), key = lambda x:self.E[x])
190 | # if E1 is negative, take the largest E as E2 so that |E1-E2| is as large as possible
191 | else:
192 | j = E_[-1][1]
193 | if j == i:
194 | j = E_[-2][1]
195 | # j = max(range(self.m), key = lambda x:self.E[x])
196 | # print(type(i), type(j))
197 | j = int(j)
198 | E2 = self.E[j]
199 | return E1, E2, i, j
200 |
201 |
202 | def train(self, iter = 100):
203 | '''
204 | Train the SVM classifier
205 | :param iter: maximal number of iterations
206 | :return: None; the SVM is trained in place
207 | '''
208 | iterStep = 0 # iteration counter; training is forced to stop if it has not converged after iter steps
209 | parameterChanged = 1 # flag marking whether any parameter changed; if nothing changes, the algorithm has converged
210 |
211 | # iterative SMO training
212 | while iterStep < iter and parameterChanged > 0:
213 | logging.info('Iter:{}/{}'.format(iterStep, iter))
214 |
215 | iterStep += 1
216 | # reset the change flag; it is set back to 1 whenever a parameter is updated in this pass
217 | parameterChanged = 0
218 |
219 | # the two variables selected by SMO for this step
220 | E1, E2, i, j = self.getAlpha()
221 |
222 | y1 = self.trainlabelArr[i]
223 | y2 = self.trainlabelArr[j]
224 |
225 | alpha1Old = self.alpha[i]
226 | alpha2Old = self.alpha[j]
227 |
228 | # compute the clipping bounds L and H
229 | if y1 == y2:
230 | L = max(0, alpha2Old+alpha1Old-self.C)
231 | H = min(self.C, alpha2Old + alpha1Old)
232 | else:
233 | L = max(0, alpha2Old-alpha1Old)
234 | H = min(self.C, self.C + alpha2Old - alpha1Old)
235 |
236 | if L == H:
237 | continue
238 | # print(L, H, alpha1Old, alpha2Old)
239 | k11 = self.kernel[i][i]
240 | k22 = self.kernel[j][j]
241 | k12 = self.kernel[i][j]
242 | k21 = self.kernel[j][i]
243 |
244 | eta = (k11 + k22 - 2*k12)
245 |
246 | # eta appears in a denominator below, so skip the step if it is not positive
247 | if eta <= 0:
248 | continue
249 |
250 | alpha2NewUnc = alpha2Old + y2 * (E1-E2)/ eta
251 | # print(E1, E2, eta, alpha2Old, alpha2NewUnc)
252 | if alpha2NewUnc < L:
253 | alpha2New = L
254 | elif alpha2NewUnc > H:
255 | alpha2New = H
256 | else:
257 | alpha2New = alpha2NewUnc
258 | # print(alpha2New, alpha2Old)
259 | alpha1New = alpha1Old + y1 * y2 * (alpha2Old - alpha2New)
260 |
261 | b1New = -1 * E1 - y1 * k11 * (alpha1New - alpha1Old) \
262 | - y2 * k21 * (alpha2New - alpha2Old) + self.b
263 |
264 | b2New = -1 * E2 - y1 * k12 * (alpha1New - alpha1Old) \
265 | - y2 * k22 * (alpha2New - alpha2Old) + self.b
266 |
267 | # choose the new b according to whether alpha1 or alpha2 lies strictly inside (0, C)
268 | if (alpha1New > 0) and (alpha1New < self.C):
269 | bNew = b1New
270 | elif (alpha2New > 0) and (alpha2New < self.C):
271 | bNew = b2New
272 | else:
273 | bNew = (b1New + b2New) / 2
274 |
275 | self.alpha[i] = alpha1New
276 | self.alpha[j] = alpha2New
277 | self.b = bNew
278 |
279 | self.E[i] = self.calc_Ei(i)
280 | self.E[j] = self.calc_Ei(j)
281 | # parameterChanged = 1
282 | # print(abs(alpha2New - alpha2Old))
283 | # if alpha2 moved by less than 1e-5, treat the parameters as unchanged and leave parameterChanged at 0;
284 | # otherwise set it to 1
285 | if abs(alpha2New - alpha2Old) >= 0.00001:
286 | parameterChanged = 1
287 | # break
288 | # after training, scan all alphas once more to collect the support vectors
289 | for i in range(self.m):
290 | # alpha > 0 marks a support vector
291 | if self.alpha[i] > 0:
292 | # store the index of this support vector
293 | self.supportVecIndex.append(i)
294 |
295 | logging.info('Training process is Done !!!!')
296 |
297 |
298 | def predict(self, x):
299 | '''
300 | Compute the prediction for a single sample
301 | :param x: the sample to predict, list format
302 | :return: the predicted label
303 | '''
304 | x = np.array(x)
305 |
306 | result = 0
307 | ## only the support vectors contribute to the decision function
308 | for i in self.supportVecIndex:
309 | x1 = self.traindataArr[i]
310 | diff = np.dot((x1 - x), (x1 - x).T)
311 | k = np.exp((-1/2) * diff /(self.sigma ** 2))
312 | result += self.alpha[i] * self.trainlabelArr[i] * k
313 | result += self.b
314 | return np.sign(result)
315 |
316 |
317 | def testModel(self, testdataList, testlabelList):
318 | '''
319 | Evaluate the accuracy of the model
320 | :param testdataList: test data, list format
321 | :param testlabelList: labels of the test set, list format
322 | :return: the prediction accuracy
323 | '''
324 | correct_num = 0
325 |
326 | for i in range(len(testlabelList)):
327 | # print(self.predict(testdataList[i]))
328 | if i % 100== 0:
329 | logging.info('Testing processing: ({}/{}) and the correct predictions:{}'.format(i, len(testdataList), correct_num))
330 | if self.predict(testdataList[i]) == testlabelList[i]:
331 | correct_num += 1
332 | return round(correct_num / len(testlabelList)* 100, 4)
333 |
334 |
335 |
336 |
337 |
338 | if __name__ == '__main__':
339 | # configure a logging module to save the log
340 | logging.basicConfig(level=logging.DEBUG,
341 | format='%(asctime)-12s %(levelname)-8s %(message)s',
342 | datefmt='%m-%d %H:%M',
343 | filename='SVM.log',
344 | filemode='w') # filemode defaults to 'a' (append); 'w' rewrites the file, discarding its previous contents
345 | # add a handler that also prints log records to the console (StreamHandler vs. FileHandler)
346 | console = logging.StreamHandler()
347 | console.setLevel(logging.INFO)
348 | # set the console output format
349 | formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
350 | console.setFormatter(formatter)
351 | # attach the handler to the root logger
352 | logging.getLogger('').addHandler(console)
353 |
354 | # log a message through the root logger
355 | logging.info('This is an info message.')
356 |
357 | start = time.time()
358 |
359 | # location of the mnist data set
360 | import os
361 | home = os.path.expanduser('~')
362 | train_path = home + '/ML/mnist/mnist_train.csv'
363 | test_path = home + '/ML/mnist/mnist_test.csv'
364 | # train_path = home + '/ML/mnist/mnist_train_samples.csv'
365 | # test_path = home + '/ML/mnist/mnist_test_samples.csv'
366 |
367 | # load the training and test sets
368 | logging.info('Loading data....')
369 |
370 | traindataList, trainlabelList = loadData(train_path)
371 | testdataList, testlabelList = loadData(test_path)
372 | logging.info('Loading data done.')
373 |
374 | logging.info('Training the SVM model....')
375 |
376 | svm = SVM(traindataList[:1000], trainlabelList[:1000])  # only the first 1000 training samples: the m x m kernel matrix grows quadratically with m
377 |
378 |
379 | svm.train()
380 |
381 | logging.info('Predicting one sample ....')
382 | prediction = svm.predict(testdataList[0])
383 | logging.info('The prediction and the ground truth are: ({}, {})'.format(prediction, testlabelList[0]))
384 |
385 | # evaluate the accuracy of the SVM on the full test set
387 | accuracy = svm.testModel(testdataList, testlabelList)
388 |
389 | end = time.time()
390 |
391 | logging.info('accuracy:{}'.format(accuracy))
392 | logging.info('Total Time: {}'.format(round(end - start, 4)))
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 |
403 |
404 |
405 |
406 |
407 |
408 |
409 |
410 |
411 |
412 |
413 |
414 |
--------------------------------------------------------------------------------
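calcKernel above fills the m x m matrix K[i][j] = exp(-||xi - xj||^2 / (2*sigma^2)) with two nested Python loops, which is slow in pure Python for m = 1000. The same matrix can be built in one shot with numpy broadcasting via ||xi - xj||^2 = ||xi||^2 + ||xj||^2 - 2*xi.xj; a sketch under the same sigma convention (the function name is made up here):

    import numpy as np

    def gaussian_kernel_matrix(X, sigma):
        # pairwise squared distances for all (i, j) at once
        sq = np.sum(X ** 2, axis=1)
        d2 = sq[:, None] + sq[None, :] - 2.0 * (X @ X.T)
        return np.exp(-d2 / (2.0 * sigma ** 2))

    X = np.random.rand(5, 3)
    K = gaussian_kernel_matrix(X, sigma=10)
    # cross-check one entry against the per-pair formula used in calcKernel
    i, j = 1, 3
    diff = X[i] - X[j]
    assert np.isclose(K[i, j], np.exp(-np.dot(diff, diff) / (2 * 10 ** 2)))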
/最大熵模型/maxEntropy.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @time :2020/7/29
3 | # @IDE : pycharm
4 | # @author :lxztju
5 | # @github : https://github.com/lxztju
6 |
7 |
8 | '''
9 | Build a maximum entropy model, to be trained and tested on the mnist data set
10 |
11 | --------------------------
12 | unfinished ("not figured out yet"): the class below only stores the data, so the predict/testModel calls in __main__ fail as written
13 | '''
14 | import numpy as np
15 | import logging
16 | import time
17 |
18 |
19 | def loadData(filename):
20 | '''
21 | Load the mnist data set
22 | :param filename: path of the data set to load
23 | :return: the loaded data and labels as lists
24 | '''
25 | dataList = []
26 | labelList = []
27 |
28 | f = open(filename, 'r')
29 |
30 | for line in f.readlines():
31 |
32 | curdata = line.strip().split(',')
33 |
34 | labelList.append(int(curdata[0]))
35 |
36 | dataList.append([int(int(value)>128) for value in curdata[1:]])
37 |
38 | return dataList, labelList
39 |
40 |
41 |
42 |
43 | class MaxEntropy:
44 | def __init__(self, traindataList, trainlabelList):
45 |
46 | self.traindataArr = np.array(traindataList)
47 | self.trainlabelArr = np.array(trainlabelList)
48 |
49 |
50 |
51 | if __name__ == '__main__':
52 | # configure a logging module to save the log
53 | logging.basicConfig(level=logging.DEBUG,
54 | format='%(asctime)-12s %(levelname)-8s %(message)s',
55 | datefmt='%m-%d %H:%M',
56 | filename='maxEntropy.log',
57 | filemode='w') # filemode defaults to 'a' (append); 'w' rewrites the file, discarding its previous contents
58 | # add a handler that also prints log records to the console (StreamHandler vs. FileHandler)
59 | console = logging.StreamHandler()
60 | console.setLevel(logging.INFO)
61 | # set the console output format
62 | formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
63 | console.setFormatter(formatter)
64 | # attach the handler to the root logger
65 | logging.getLogger('').addHandler(console)
66 |
67 | # log a message through the root logger
68 | logging.info('This is an info message.')
69 |
70 | start = time.time()
71 |
72 | # location of the mnist data set
73 | import os
74 | home = os.path.expanduser('~')
75 | train_path = home + '/ML/mnist/mnist_train.csv'
76 | test_path = home + '/ML/mnist/mnist_test.csv'
77 | # train_path = home + '/ML/mnist/mnist_train_samples.csv'
78 | # test_path = home + '/ML/mnist/mnist_test_samples.csv'
79 |
80 | # load the training and test sets
81 | logging.info('Loading data....')
82 |
83 | traindataArr, trainlabelArr = loadData(train_path)
84 | testdataArr, testlabelArr = loadData(test_path)
85 | logging.info('Loading data done.')
86 |
87 | logging.info('Building a MaxEntropy model.')
88 | maxEntropy = MaxEntropy(traindataArr, trainlabelArr)
89 |
90 | logging.info('Using the MaxEntropy model to predict one sample.')
91 |
92 | prediction = maxEntropy.predict(testdataArr[0] + [1])  # predict is not implemented above, so this call fails as written (see the sketch after this file)
93 | logging.info('Testing process done, and the prediction and label are: ({},{})'.format(str(prediction), str(testlabelArr[0])))
96 |
97 | # evaluate the accuracy of the model
98 | # only the first 200 test samples are used, to keep the running time down
99 | logging.info('Testing the MaxEntropy model.')
100 | accuracy = maxEntropy.testModel(testdataArr[:200], testlabelArr[:200])
101 |
102 | end = time.time()
103 |
104 | logging.info('accuracy:{}'.format(accuracy))
105 | logging.info('Total Time: {}'.format(round(end - start, 4)))
--------------------------------------------------------------------------------
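Since MaxEntropy above stores the data but implements no inference, here is a minimal, purely illustrative sketch of the conditional maximum entropy form the missing predict would compute, P(y|x) = exp(sum_i w_i f_i(x, y)) / Z(x), using indicator features on (feature index, pixel value, label) triples. The weight array w and its training procedure (e.g. IIS or gradient ascent) are exactly the parts the file leaves unimplemented, so everything below is an assumption about shapes, not the author's design:

    import numpy as np

    def maxent_predict(x, w, num_classes):
        # w[y][j][v]: weight of the indicator feature "x_j == v and label == y" (hypothetical layout)
        # score(y) = sum_j w[y][j][x[j]]; P(y|x) = exp(score(y)) / Z(x)
        scores = np.array([sum(w[y][j][x[j]] for j in range(len(x)))
                           for y in range(num_classes)])
        scores -= scores.max()                        # stabilize the softmax
        probs = np.exp(scores) / np.exp(scores).sum()
        return int(np.argmax(probs))

    # with all-zero weights every class is equally likely; argmax then returns class 0
    w = np.zeros((10, 784, 2))
    x = [0] * 784
    print(maxent_predict(x, w, 10))  # 0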
/朴素贝叶斯/NaiveBayes.log:
--------------------------------------------------------------------------------
1 | 07-25 15:03 INFO This is an info message.
2 | 07-25 15:03 INFO Loading data....
3 | 07-25 15:03 INFO Loading data done.
4 | 07-25 15:03 INFO Getting the prior distribution.
5 | 07-25 15:04 INFO Getting the Conditional probability distribution.
6 | 07-25 15:04 INFO Testing the testdata: (0/200.
7 | 07-25 15:04 INFO Testing the testdata: (50/200.
8 | 07-25 15:04 INFO Testing the testdata: (100/200.
9 | 07-25 15:04 INFO Testing the testdata: (150/200.
10 | 07-25 15:04 INFO accuracy:0.88
11 | 07-25 15:04 INFO Total Time: 48
12 |
--------------------------------------------------------------------------------
/朴素贝叶斯/NaiveBayes.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @time :2020/7/25
3 | # @IDE : pycharm
4 | # @author :lxztju
5 | # @github : https://github.com/lxztju
6 |
7 | '''
8 | Implement a naive Bayes classifier
9 | and test the model on the mnist data set
10 | '''
11 |
12 |
13 |
14 | import numpy as np
15 | import logging
16 | import time
17 |
18 |
19 |
20 |
21 | def loadData(fileName):
22 | '''
23 | Load the Mnist data set
24 | :param fileName: path of the data set to load
25 | :return: data set and labels as lists
26 | '''
27 | # lists holding the data and the labels
28 | dataArr = []
29 | labelArr = []
30 | # open the file
31 | fr = open(fileName, 'r')
32 | # read the file line by line
33 | for line in fr.readlines():
34 | # split each line on the ',' delimiter, returning a list of fields
35 | curLine = line.strip().split(',')
36 |
37 | labelArr.append(int(curLine[0]))
38 | # binarize: pixels greater than 128 become 1, the rest become 0
39 | dataArr.append([int(int(num)>128) for num in curLine[1:]])
40 | # store the labels
41 | # [int(num) for num in curLine[1:]] -> converts every element except the first (the label) to int
42 | # [int(num)/255 for num in curLine[1:]] -> divides every value by 255 to normalize (optional step)
43 |
44 | # return data and label
45 | return dataArr, labelArr
46 |
47 |
48 | class NavieBayes:
49 | def __init__(self, num_classes, num_features, traindataArr, trainlabelArr):
50 | '''
51 | Initialize the naive Bayes classifier
52 | :param num_classes: number of classes
53 | :param num_features: number of features (feature dimension)
54 | :param traindataArr: training data
55 | :param trainlabelArr: training labels
56 | '''
57 | self.num_classes = num_classes
58 | self.num_features = num_features
59 | self.traindataArr, self.trainlabelArr = traindataArr, trainlabelArr
60 | self.py, self.px_y = self.getProbability()
61 |
62 |
63 |
64 | def naviebayes(self, x):
65 | '''
66 | Estimate the log-posterior of each class with naive Bayes;
67 | self.py holds the log priors and self.px_y the log conditional probabilities
68 | :param x: the sample to classify
69 | :return: the predicted class
71 | '''
72 | p= [0] * self.num_classes
73 |
74 | # compute the score of each class
75 | for i in range(self.num_classes):
76 | # the probabilities from getProbability are already log values, so the product becomes a sum
77 | sum = 0
78 | for j in range(self.num_features):
79 | sum += self.px_y[i][j][x[j]]
80 | p[i] = sum + self.py[i]
81 | return p.index(max(p))
82 |
83 |
84 | def getProbability(self):
85 | '''
86 | Compute the prior and conditional probability distributions of the training set
87 | (the data comes from self.traindataArr and self.trainlabelArr rather than parameters)
88 |
89 | :return: the prior distribution py and the conditional distribution px_y
90 | '''
91 |
92 | # first compute the prior distribution py; initialize the py array
93 | py = np.zeros((self.num_classes, 1))
94 |
95 | for i in range(self.num_classes):
96 | # without smoothing (a class that never occurs would get probability 0):
97 | # np.mat(self.trainlabelArr == i) marks the entries equal to i as True and the rest as False
98 | # py[i] = np.sum(np.mat(self.trainlabelArr == i)) / (len(self.trainlabelArr))
99 |
100 | # with Laplace smoothing: add 1 to the class count and num_classes to the total
101 | py[i] = (np.sum(self.trainlabelArr == i) + 1) / (len(self.trainlabelArr) + self.num_classes)
102 |
103 | # the posterior estimate (formula 4.7, section 4.1) is a product of many factors, which raises two problems: 1. a single
104 | # zero factor makes the whole product zero; the smoothing above already takes care of that. 2. with many features (784
105 | # here, so 785 factors including the prior, all between 0 and 1) the product is a tiny number close to 0 and can
106 | # underflow during comparison. Taking the log fixes this: log is monotonically increasing, so the ordering is
107 | # preserved, and the product becomes a sum, which also simplifies the computation.
108 | py = np.log(py)
109 |
110 | logging.info('Getting the prior distribution.')
111 |
112 |
113 | # compute the conditional distribution px_y; initialize the px_y array
114 | # num_classes classes and num_features features, each feature taking the two values 0 and 1
115 | px_y = np.zeros((self.num_classes, self.num_features, 2))
116 |
117 | # iterate over the training labels
118 | for i in range(len(self.trainlabelArr)):
119 | # label of the current sample
120 | label = self.trainlabelArr[i]
121 | # the sample itself
122 | x = self.traindataArr[i]
123 | # iterate over every feature dimension of the sample
124 | for j in range(self.num_features):
125 | # increment the matching cell of the count matrix
126 | # these are plain counts for now; the conditional probabilities are derived from them below
127 | px_y[label][j][x[j]] += 1
128 |
129 | for label in range(self.num_classes):
130 | for j in range(self.num_features):
131 | # counts of feature j taking the values 0 and 1 respectively
132 | px_y0 = px_y[label][j][0]
133 | px_y1 = px_y[label][j][1]
134 |
135 | # conditional probabilities with Laplace smoothing, stored as log values
136 | px_y[label][j][0] = np.log((px_y0 +1) / (px_y0 + px_y1 + 2))
137 | px_y[label][j][1] = np.log((px_y1 +1) / (px_y0 + px_y1 + 2))
138 | logging.info('Getting the Conditional probability distribution.')
139 |
140 | return py, px_y
141 |
142 | def testModel(self,dataArr, labelArr):
143 | '''
144 | Evaluate the trained classifier on the test set
145 | (the prior and conditional distributions were computed in getProbability)
146 |
147 | :param dataArr: test set data
148 | :param labelArr: labels of the test set
149 | :return: accuracy
150 | '''
151 | correct_num = 0
152 | for i in range(len(dataArr)):
153 | if i %50 == 0:
154 | logging.info('Testing the testdata: ({}/{}).'.format(i, len(labelArr)))
155 |
156 | label = self.naviebayes(dataArr[i])
157 | if label == labelArr[i]:
158 | correct_num += 1
159 | return round(correct_num / len(labelArr), 4)
160 |
161 |
162 |
163 |
164 |
165 |
166 | if __name__ == '__main__':
167 |
168 | # configure a logging module to save the log
169 | logging.basicConfig(level=logging.DEBUG,
170 | format='%(asctime)-12s %(levelname)-8s %(message)s',
171 | datefmt='%m-%d %H:%M',
172 | filename='NaiveBayes.log',
173 | filemode='w') # filemode defaults to 'a' (append); 'w' rewrites the file, discarding its previous contents
174 | # add a handler that also prints log records to the console (StreamHandler vs. FileHandler)
175 | console = logging.StreamHandler()
176 | console.setLevel(logging.INFO)
177 | # set the console output format
178 | formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
179 | console.setFormatter(formatter)
180 | # attach the handler to the root logger
181 | logging.getLogger('').addHandler(console)
182 |
183 | # log a message through the root logger
184 | logging.info('This is an info message.')
185 |
186 |
187 | start = time.time()
188 |
189 |
190 | # location of the mnist data set
191 | import os
192 | home = os.path.expanduser('~')
193 | train_path = home + '/ML/mnist/mnist_train.csv'
194 | test_path = home + '/ML/mnist/mnist_test.csv'
195 |
196 | # load the training and test sets
197 | logging.info('Loading data....')
198 |
199 | traindataArr, trainlabelArr = loadData(train_path)
200 | testdataArr, testlabelArr = loadData(test_path)
201 | logging.info('Loading data done.')
202 |
203 | num_classes = 10
204 | num_features = 28 * 28
205 | Naviebayes = NavieBayes(num_classes, num_features, traindataArr, trainlabelArr)
206 |
207 |
208 |
209 | # evaluate the accuracy of the naive Bayes classifier
210 | # only the first 200 test samples are used, to keep the running time down
211 | accuracy = Naviebayes.testModel(testdataArr[:200], testlabelArr[:200])
212 |
213 |
214 | end = time.time()
215 |
216 | logging.info('accuracy:{}'.format(accuracy))
217 | logging.info('Total Time: {}'.format(round(end-start, 4)))
218 |
--------------------------------------------------------------------------------
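The smoothing in getProbability is worth a worked number: with the Laplace estimate (count + 1) / (total + K), a class never seen among 100 samples with K = 10 classes still gets prior (0 + 1)/(100 + 10), about 0.0091, rather than 0, so the later np.log never sees a zero. A quick standalone check of the smoothed conditional estimate computed in getProbability:

    import numpy as np

    # suppose that, within some class, feature j was 0 in 3 samples and 1 in 7
    px_y0, px_y1 = 3.0, 7.0
    p0 = (px_y0 + 1) / (px_y0 + px_y1 + 2)   # smoothed P(x_j = 0 | y)
    p1 = (px_y1 + 1) / (px_y0 + px_y1 + 2)   # smoothed P(x_j = 1 | y)
    print(p0, p1, p0 + p1)                   # 0.333..., 0.666..., 1.0
    print(np.log(p0) + np.log(p1))           # log probabilities add instead of multiplying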
/逻辑回归/LogisticRegression.log:
--------------------------------------------------------------------------------
1 | 07-28 22:11 INFO This is an info message.
2 | 07-28 22:11 INFO Loading data....
3 | 07-28 22:11 INFO Loading data done.
4 | 07-28 22:11 INFO Building a LogisticRegression model.
5 | 07-28 22:12 INFO Using LogisticRegression to predict one sample.
6 | 07-28 22:12 INFO Testing processing Done,and the prediction and label are : (0,0)
7 | 07-28 22:12 INFO Testing the LogisticRegression model.
8 | 07-28 22:12 INFO accuracy:1.0
9 | 07-28 22:12 INFO Total Time: 29
10 |
--------------------------------------------------------------------------------
/逻辑回归/LogisticRegression.py:
--------------------------------------------------------------------------------
1 | # -*- coding:utf-8 -*-
2 | # @time :2020/7/28
3 | # @IDE : pycharm
4 | # @author :lxztju
5 | # @github : https://github.com/lxztju
6 |
7 |
8 | '''
9 | Binary logistic regression classifier
10 | ------------------
11 | evaluating exp inside the sigmoid can overflow when wx is large,
12 | so a corrected sigmoid is used to prevent overflow
13 | -----
14 | corrected sigmoid:
15 | wx = np.dot(self.w, x)
16 | if wx >= 0:
17 | probabilty = 1 /(1+ np.exp(-wx))
18 | else:
19 | e = np.exp(wx)
20 | probabilty = e / (1 + e)
21 | '''
22 |
23 |
24 | import numpy as np
25 | import logging
26 | import time
27 |
28 |
29 | def loadData(fileName):
30 | '''
31 | Load the Mnist data set
32 | :param fileName: path of the data set to load
33 | :return: data set and labels as lists
34 | '''
35 | # lists holding the data and the labels
36 | dataArr = []
37 | labelArr = []
38 | # open the file
39 | fr = open(fileName, 'r')
40 | # read the file line by line
41 | for line in fr.readlines():
42 | # split each line on the ',' delimiter, returning a list of fields
43 | curLine = line.strip().split(',')
44 |
45 | # Mnist has the ten labels 0-9; for this binary task only classes 0 and 1 are kept, as the positive and negative class
46 | # if int(curLine[0]) != 0 or int(curLine[0]) !=1: continue
47 | if int(curLine[0]) == 0 or int(curLine[0]) == 1:
48 | if int(curLine[0]) == 0:
49 | labelArr.append(1)
50 | else:
51 | labelArr.append(0)
52 | dataArr.append([int(num) / 255 for num in curLine[1:]])
53 | # store the labels
54 | # [int(num) for num in curLine[1:]] -> converts every element except the first (the label) to int
55 | # [int(num)/255 for num in curLine[1:]] -> divides every value by 255 to normalize (optional step)
56 | # dataArr.append([int(num)/255 for num in curLine[1:]])
57 |
58 | # return data and label
59 | return dataArr, labelArr
60 |
61 |
62 |
63 | class LogisticRegression:
64 | def __init__(self, traindataList, trainlabelList):
65 | for i in range(len(traindataList)):  # append a constant 1 to every sample so the bias term is absorbed into w
66 | traindataList[i].append(1)
67 |
68 | self.traindataArr = np.array(traindataList)
69 | self.trainlabelArr = np.array(trainlabelList)
70 | # print(self.traindataArr.shape)
71 | self.w = np.zeros(self.traindataArr.shape[1])
72 | self.num_samples, self.num_features = self.traindataArr.shape
73 | self.train()
74 |
75 | def train(self, lr= 0.01, max_epoch= 200):
76 | '''
77 | Train the logistic regression classifier
78 | :param lr: learning rate (step size)
79 | :param max_epoch: maximal number of iterations
80 | :return: None; the classifier weights are learned in place
81 | '''
82 |
83 | for _ in range(max_epoch):
84 | grad = 0
85 | for i in range(self.num_samples):
86 | xi = self.traindataArr[i]
87 | yi = self.trainlabelArr[i]
88 | wx = np.dot(xi, self.w)
89 |
90 | ## corrected sigmoid, to prevent overflow in exp
91 | if wx >= 0:
92 | grad += xi * yi -1.0/(1+np.exp(-wx)) * xi
93 | else:
94 | e = np.exp(wx)
95 | grad += xi * yi - ( e / (1+e) ) * xi
96 | self.w += lr * grad
97 |
98 |
99 |
100 |
101 |
102 | def predict(self, x):
103 | '''
104 | Predict the label of x with the logistic regression model
105 | :param x: the input sample, array-like
106 | :return: label
107 | '''
108 | wx = np.dot(self.w, x)
109 | if wx >= 0:
110 | probabilty = 1 /(1+ np.exp(-wx))
111 | else:
112 | e = np.exp(wx)
113 | probabilty = e / (1 + e)
114 | if probabilty > 0.5:
115 | return 1
116 | else:
117 | return 0
118 |
119 | def testModel(self, testdataArr, testlabelArr):
120 | '''
121 | Evaluate the accuracy of the model
122 | :param testdataArr: test data, list of samples
123 | :param testlabelArr: test labels
124 | :return: accuracy
125 | '''
126 | # testdataArr = np.array(testdataArr)
127 | correct_num = 0
128 | for i in range(len(testdataArr)):
129 | # print(testdataArr[i].shape)
130 | if self.predict(testdataArr[i] + [1]) == testlabelArr[i]:
131 | correct_num += 1
132 | return round(correct_num / len(testdataArr), 4 )
133 |
134 |
135 |
136 |
137 | if __name__ == '__main__':
138 |
139 | # configure a logging module to save the log
140 | logging.basicConfig(level=logging.DEBUG,
141 | format='%(asctime)-12s %(levelname)-8s %(message)s',
142 | datefmt='%m-%d %H:%M',
143 | filename='LogisticRegression.log',
144 | filemode='w') # filemode defaults to 'a' (append); 'w' rewrites the file, discarding its previous contents
145 | # add a handler that also prints log records to the console (StreamHandler vs. FileHandler)
146 | console = logging.StreamHandler()
147 | console.setLevel(logging.INFO)
148 | # set the console output format
149 | formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
150 | console.setFormatter(formatter)
151 | # attach the handler to the root logger
152 | logging.getLogger('').addHandler(console)
153 |
154 | # log a message through the root logger
155 | logging.info('This is an info message.')
156 |
157 |
158 | start = time.time()
159 |
160 |
161 | # location of the mnist data set
162 | import os
163 | home = os.path.expanduser('~')
164 | train_path = home + '/ML/mnist/mnist_train.csv'
165 | test_path = home + '/ML/mnist/mnist_test.csv'
166 | # train_path = home + '/ML/mnist/mnist_train_samples.csv'
167 | # test_path = home + '/ML/mnist/mnist_test_samples.csv'
168 |
169 | # load the training and test sets
170 | logging.info('Loading data....')
171 |
172 | traindataArr, trainlabelArr = loadData(train_path)
173 | testdataArr, testlabelArr = loadData(test_path)
174 | logging.info('Loading data done.')
175 |
176 | logging.info('Building a LogisticRegression model.')
177 | logisiticRegression = LogisticRegression(traindataArr, trainlabelArr)
178 |
179 | logging.info('Using LogisticRegression to predict one sample.')
180 |
181 | prediction = logisiticRegression.predict(testdataArr[0] + [1])
182 | logging.info('Testing process done, and the prediction and label are: ({},{})'.format(str(prediction), str(testlabelArr[0])))
183 |
184 | # evaluate the accuracy of the logistic regression classifier
185 | # only the first 200 test samples are used, to keep the running time down
186 | logging.info('Testing the LogisticRegression model.')
187 | accuracy = logisiticRegression.testModel(testdataArr[:200], testlabelArr[:200])
188 |
189 |
190 | end = time.time()
191 |
192 | logging.info('accuracy:{}'.format(accuracy))
193 | logging.info('Total Time: {}'.format(round(end-start, 4)))
194 |
195 |
--------------------------------------------------------------------------------
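The corrected sigmoid used throughout LogisticRegression.py is a standard overflow guard: for wx >= 0 evaluate 1/(1 + exp(-wx)), and for wx < 0 evaluate exp(wx)/(1 + exp(wx)); in both branches exp only ever receives a non-positive argument. A standalone check that the two branches agree and survive extreme inputs:

    import numpy as np

    def stable_sigmoid(wx):
        # both branches exponentiate a non-positive number, so exp cannot overflow
        if wx >= 0:
            return 1.0 / (1.0 + np.exp(-wx))
        e = np.exp(wx)
        return e / (1.0 + e)

    print(stable_sigmoid(0.0))      # 0.5
    print(stable_sigmoid(1000.0))   # 1.0, no overflow
    print(stable_sigmoid(-1000.0))  # 0.0, where the naive 1/(1+exp(1000)) would overflow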