├── AdaBoost
│   ├── AdaBoost.log
│   └── AdaBoost.py
├── EM
│   └── EM.py
├── LICENSE
├── README.md
├── k近邻
│   ├── knn.log
│   ├── knn.py
│   ├── knn_kd.py
│   └── knnkd.log
├── mnist
│   └── convert_mnist2csv.py
├── 决策树
│   ├── C4.5.py
│   ├── C45_decision_tree.log
│   ├── CART.py
│   ├── CART_decision_tree.log
│   ├── ID3.py
│   └── ID3_decision_tree.log
├── 感知机算法
│   ├── perceptron.log
│   └── perceptron.py
├── 支持向量机
│   ├── SVM.log
│   └── SVM.py
├── 最大熵模型
│   └── maxEntropy.py
├── 朴素贝叶斯
│   ├── NaiveBayes.log
│   └── NaiveBayes.py
└── 逻辑回归
    ├── LogisticRegression.log
    └── LogisticRegression.py

-------------------------------------------------------------------------------- /AdaBoost/AdaBoost.log: --------------------------------------------------------------------------------

1 | 08-02 10:47 INFO This is an info message.
2 | 08-02 10:47 INFO Loading data....
3 | 08-02 10:47 INFO Loading data done.
4 | 08-02 10:47 INFO Training the AdaBoost model....
5 | 08-02 10:47 INFO The 0th Tree, The train data's accuracy is:0.9812
6 | 08-02 10:48 INFO The 1th Tree, The train data's accuracy is:0.9812
7 | 08-02 10:48 INFO The 2th Tree, The train data's accuracy is:0.9812
8 | 08-02 10:48 INFO The 3th Tree, The train data's accuracy is:0.9878
9 | 08-02 10:49 INFO The 4th Tree, The train data's accuracy is:0.9862
10 | 08-02 10:49 INFO The 5th Tree, The train data's accuracy is:0.9904
11 | 08-02 10:49 INFO The 6th Tree, The train data's accuracy is:0.988
12 | 08-02 10:49 INFO The 7th Tree, The train data's accuracy is:0.9924
13 | 08-02 10:50 INFO The 8th Tree, The train data's accuracy is:0.9898
14 | 08-02 10:50 INFO The 9th Tree, The train data's accuracy is:0.9939
15 | 08-02 10:50 INFO The 10th Tree, The train data's accuracy is:0.9929
16 | 08-02 10:50 INFO The 11th Tree, The train data's accuracy is:0.9942
17 | 08-02 10:51 INFO The 12th Tree, The train data's accuracy is:0.9934
18 | 08-02 10:51 INFO The 13th Tree, The train data's accuracy is:0.994
19 | 08-02 10:51 INFO The 14th Tree, The train data's accuracy is:0.9951
20 | 08-02 10:51 INFO The 15th Tree, The train data's accuracy is:0.9945
21 | 08-02 10:52 INFO The 16th Tree, The train data's accuracy is:0.9958
22 | 08-02 10:52 INFO The 17th Tree, The train data's accuracy is:0.9952
23 | 08-02 10:52 INFO The 18th Tree, The train data's accuracy is:0.9963
24 | 08-02 10:52 INFO The 19th Tree, The train data's accuracy is:0.9958
25 | 08-02 10:53 INFO The 20th Tree, The train data's accuracy is:0.997
26 | 08-02 10:53 INFO The 21th Tree, The train data's accuracy is:0.9963
27 | 08-02 10:53 INFO The 22th Tree, The train data's accuracy is:0.9965
28 | 08-02 10:54 INFO The 23th Tree, The train data's accuracy is:0.9968
29 | 08-02 10:54 INFO The 24th Tree, The train data's accuracy is:0.9974
30 | 08-02 10:54 INFO The 25th Tree, The train data's accuracy is:0.9974
31 | 08-02 10:54 INFO The 26th Tree, The train data's accuracy is:0.9974
32 | 08-02 10:55 INFO The 27th Tree, The train data's accuracy is:0.9977
33 | 08-02 10:55 INFO The 28th Tree, The train data's accuracy is:0.9978
34 | 08-02 10:55 INFO The 29th Tree, The train data's accuracy is:0.9981
35 | 08-02 10:55 INFO The 30th Tree, The train data's accuracy is:0.9981
36 | 08-02 10:56 INFO The 31th Tree, The train data's accuracy is:0.9985
37 | 08-02 10:56 INFO The 32th Tree, The train data's accuracy is:0.9978
38 | 08-02 10:56 INFO The 33th Tree, The train data's accuracy is:0.9983
39 | 08-02 10:56 INFO The 34th Tree, The train data's accuracy is:0.9982
40 | 08-02 10:57 INFO The 35th Tree, The train data's accuracy is:0.9982
41 | 08-02 10:57 INFO The 36th Tree, The train data's accuracy is:0.998
42 | 08-02 10:57 INFO The 37th Tree, The train data's accuracy is:0.9987
43 | 08-02 10:57 INFO The 38th Tree, The train data's accuracy is:0.9983
44 | 08-02 10:58 INFO The 39th Tree, The train data's accuracy is:0.9986
45 | 08-02 10:58 INFO The 40th Tree, The train data's accuracy is:0.9987
46 | 08-02 10:58 INFO The 41th Tree, The train data's accuracy is:0.9987
47 | 08-02 10:59 INFO The 42th Tree, The train data's accuracy is:0.999
48 | 08-02 10:59 INFO The 43th Tree, The train data's accuracy is:0.9987
49 | 08-02 10:59 INFO The 44th Tree, The train data's accuracy is:0.9991
50 | 08-02 10:59 INFO The 45th Tree, The train data's accuracy is:0.9989
51 | 08-02 11:00 INFO The 46th Tree, The train data's accuracy is:0.9989
52 | 08-02 11:00 INFO The 47th Tree, The train data's accuracy is:0.9987
53 | 08-02 11:00 INFO The 48th Tree, The train data's accuracy is:0.9988
54 | 08-02 11:00 INFO The 49th Tree, The train data's accuracy is:0.9987
55 | 08-02 11:00 INFO accuracy:99.9527
56 | 08-02 11:00 INFO Total Time: 789
57 |

-------------------------------------------------------------------------------- /AdaBoost/AdaBoost.py: --------------------------------------------------------------------------------

1 | # -*- coding:utf-8 -*-
2 | # @time   : 2020/8/1
3 | # @IDE    : pycharm
4 | # @author : lxztju
5 | # @github : https://github.com/lxztju
6 | # @Email  : lxztju@163.com
7 |
8 | '''
9 | Implement the AdaBoost boosting method.
10 |
11 | Classify two of the classes from the mnist dataset.
12 |
13 | ----------------
14 | Following example 8.1.3 in "Statistical Learning Methods", threshold
15 | splits (single-level decision trees, i.e. stumps) serve as base classifiers.
16 | '''
17 |
18 | import numpy as np
19 | import logging
20 | import time
21 |
22 |
23 | def loadData(fileName):
24 |     '''
25 |     Load the Mnist dataset
26 |     :param fileName: path of the dataset to load
27 |     :return: dataset and labels as lists
28 |     '''
29 |     # lists holding the data and the labels
30 |     dataArr = []
31 |     labelArr = []
32 |     # open the file
33 |     fr = open(fileName, 'r')
34 |     # read the file line by line
35 |     for line in fr.readlines():
36 |         # split each line on the ',' delimiter, returning a list of fields
37 |         curLine = line.strip().split(',')
38 |
39 |         # Mnist has ten labels, 0-9; since this is a binary task, only the
40 |         # 0 and 1 classes are kept as the positive and negative classes
41 |         if int(curLine[0]) == 0 or int(curLine[0]) == 1:
42 |             if int(curLine[0]) == 0:
43 |                 labelArr.append(1)
44 |             else:
45 |                 labelArr.append(-1)
46 |             dataArr.append([int(int(num) > 128) for num in curLine[1:]])
47 |
48 |     # return data and label
49 |     return dataArr, labelArr
50 |
51 |
52 | class SingleTree:
53 |     def __init__(self, traindataList, trainlabelList):
54 |         '''
55 |         Single-level decision tree (stump), the AdaBoost base classifier
56 |         :param traindataList: training data as a list
57 |         :param trainlabelList: training labels as a list
58 |         '''
59 |         self.traindataArr = np.array(traindataList)
60 |         self.trainlabelArr = np.array(trainlabelList)
61 |         self.m, self.n = self.traindataArr.shape
62 |         self.D = [1 / self.m] * self.m  # initialize the sample weights uniformly
63 |
64 |     def calcError(self, prediction, trainlabelArr, D):
65 |         '''
66 |         Weighted classification error on the training set
67 |         :param prediction: stump predictions, same length as trainlabelArr
68 |         :param trainlabelArr: ground truth
69 |         :param D: training sample weights
70 |         :return: the weighted training error
71 |         '''
72 |         error = 0
73 |         for i in range(trainlabelArr.size):
74 |             if prediction[i] != trainlabelArr[i]:
75 |                 error += D[i]
76 |         return error
77 |
78 |     def singleTree(self):
79 |         '''
80 |         Build the single-level tree used as the base classifier
81 |         '''
82 |         # a dict holds the tree
83 |         tree = {}
84 |         # every feature is 0 or 1 after loading, so three split points suffice:
85 |         # below 0, between 0 and 1, above 1
86 |         divides = [-0.5, 0.5, 1.5]
87 |         # two split rules: 'Less' predicts 1 below the threshold and -1 above it;
88 |         # 'Over' predicts -1 at or below the threshold and 1 above it
89 |         rules = ['Less', 'Over']
90 |         # the largest possible error is 1, so initialize the minimum to 1
91 |         min_error = 1
92 |         # visit every feature, looking for the split point, rule and feature
93 |         # that minimize the weighted error
94 |         for i in range(self.n):
95 |             for divide in divides:
96 |                 for rule in rules:
97 |                     # initialize the predictions to 1
98 |                     prediction = np.ones(self.m)
99 |                     if rule == 'Less':
100 |                         # for 'Less', samples above the threshold get -1; the ones
101 |                         # predicted 1 are already covered by the initialization
102 |                         prediction[self.traindataArr[:, i] > divide] = -1
103 |                     else:
104 |                         # for 'Over', samples at or below the threshold get -1
105 |                         prediction[self.traindataArr[:, i] <= divide] = -1
106 |                     # weighted error for this feature / split point / rule
107 |                     error = self.calcError(prediction, self.trainlabelArr, self.D)
108 |                     # keep the tree with the smallest error
109 |                     if error < min_error:
110 |                         tree['error'] = error
111 |                         tree['rule'] = rule
112 |                         tree['divide'] = divide
113 |                         tree['feature'] = i
114 |                         tree['Gx'] = prediction
115 |                         min_error = error
116 |         return tree
117 |
118 |
119 | class Adaboost(SingleTree):
120 |     def __init__(self, traindataList, trainlabelList, treeNum=50):
121 |         super().__init__(traindataList, trainlabelList)
122 |         self.treeNum = treeNum
123 |         self.trees = self.BoostingTree()
124 |
125 |     def BoostingTree(self):
126 |         '''
127 |         Build the AdaBoost model
128 |         :return: the finished list of boosted trees
129 |         '''
130 |         # list of trees, one element per layer, front to back
131 |         tree = []
132 |         # running boosted prediction for every training sample
133 |         finalPrediction = np.zeros(self.trainlabelArr.size)
134 |         # iteratively generate treeNum layers
135 |         for i in range(self.treeNum):
136 |             # build the single-layer tree
137 |             curTree = self.singleTree()
138 |             # compute alpha according to formula 8.2
139 |             alpha = 1 / 2 * np.log((1 - curTree['error']) / curTree['error'])
140 |             # keep this layer's predictions for the weight update below
141 |             Gx = curTree['Gx']
142 |
143 |             # update the sample weights
144 |             # numerator of formula 8.4; for arrays, * and np.multiply are
145 |             # element-wise products, while np.dot() is the dot product
146 |             w = self.D * (np.exp(-1 * alpha * self.trainlabelArr * Gx))
147 |             # normalized weight distribution over the training set
148 |             self.D = w / sum(w)
149 |             curTree['alpha'] = alpha
150 |
151 |             tree.append(curTree)
152 |
153 |             #################################
154 |             # track the boosted accuracy so training can stop early
155 |             finalPrediction += alpha * Gx
156 |             correct_num = sum(np.sign(finalPrediction) == self.trainlabelArr)
157 |             accuracy = correct_num / self.trainlabelArr.size
158 |             logging.info("The {}th Tree, The train data's accuracy is:{}".format(i, accuracy))
159 |             # stop early once the training accuracy reaches 1
160 |             if accuracy == 1:
161 |                 break
162 |         return tree
163 |
164 |     def predict(self, x, div, rule, feature):
165 |         '''
166 |         Output of a single base classifier for one sample
167 |         :param x: the input sample
168 |         :param div: the split threshold
169 |         :param rule: the split rule, 'Less' or 'Over'
170 |         :param feature: the feature the stump operates on
171 |         :return: the predicted label
172 |         '''
173 |         if rule == 'Less':
174 |             L, H = 1, -1
175 |         else:
176 |             L, H = -1, 1
177 |
178 |         if x[feature] > div:
179 |             return H
180 |         else:
181 |             return L
182 |
183 |     def testModel(self, testdataList, testlabelList):
184 |         '''
185 |         Measure the accuracy of the AdaBoost model
186 |         :param testdataList: test data as a list
187 |         :param testlabelList: test labels as a list
188 |         :return: accuracy (percent)
189 |         '''
190 |         correct_num = 0
191 |
192 |         for i in range(len(testdataList)):
193 |             result = 0
194 |             for curTree in self.trees:
195 |                 div = curTree['divide']
196 |                 feature = curTree['feature']
197 |                 rule = curTree['rule']
198 |                 alpha = curTree['alpha']
199 |                 result += alpha * self.predict(testdataList[i], div, rule, feature)
200 |
201 |             if np.sign(result) == testlabelList[i]:
202 |                 correct_num += 1
203 |
204 |         return round((correct_num / len(testlabelList) * 100), 4)
205 |
206 |
207 | if __name__ == '__main__':
208 |
209 |     # set up a logging module to save the logs
210 |     logging.basicConfig(level=logging.DEBUG,
211 |                         format='%(asctime)-12s %(levelname)-8s %(message)s',
212 |                         datefmt='%m-%d %H:%M',
213 |                         filename='AdaBoost.log',
214 |                         filemode='w')  # filemode defaults to 'a' (append); 'w' rewrites the file, discarding earlier content
215 |     # a handler that also prints to the console (StreamHandler vs FileHandler)
216 |     console = logging.StreamHandler()
217 |     console.setLevel(logging.INFO)
218 |     # console output format
219 |     formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
220 |     console.setFormatter(formatter)
221 |     # attach the handler to the root logger
222 |     logging.getLogger('').addHandler(console)
223 |
224 |     # root logger output
225 |     logging.info('This is an info message.')
226 |
227 |     start = time.time()
228 |
229 |     # location of the mnist dataset
230 |     import os
231 |     home = os.path.expanduser('~')
232 |     train_path = home + '/ML/mnist/mnist_train.csv'
233 |     test_path = home + '/ML/mnist/mnist_test.csv'
234 |     # train_path = home + '/ML/mnist/mnist_train_samples.csv'
235 |     # test_path = home + '/ML/mnist/mnist_test_samples.csv'
236 |
237 |     # load the training and test sets
238 |     logging.info('Loading data....')
239 |
240 |     traindataList, trainlabelList = loadData(train_path)
241 |     testdataList, testlabelList = loadData(test_path)
242 |     logging.info('Loading data done.')
243 |     # print(trainlabelList[:100])
244 |     logging.info('Training the AdaBoost model....')
245 |
246 |     # only the first 1000 training samples are used, to keep the runtime short
247 |     adaboost = Adaboost(traindataList[:1000], trainlabelList[:1000])
248 |
249 |     # predict a single sample with one base classifier, e.g.:
250 |     # tree0 = adaboost.trees[0]
251 |     # prediction = adaboost.predict(testdataList[0], tree0['divide'], tree0['rule'], tree0['feature'])
252 |
253 |     # measure the accuracy of the AdaBoost model on the full test set
254 |     accuracy = adaboost.testModel(testdataList, testlabelList)
255 |
256 |     end = time.time()
257 |
258 |     logging.info('accuracy:{}'.format(accuracy))
259 |     logging.info('Total Time: {}'.format(round(end - start, 4)))
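A quick standalone sanity check of the two updates at the heart of BoostingTree above. This sketch is not part of the original script; the five-sample weights and stump predictions are made up for illustration. It exercises formula 8.2 (the tree weight alpha) and formula 8.4 (the sample-weight update):

import numpy as np

# Hypothetical toy state: 5 samples with uniform weights, and a stump
# that gets sample 3 wrong, so its weighted error is 0.2.
D = np.array([0.2, 0.2, 0.2, 0.2, 0.2])    # current sample weights
y = np.array([1, 1, -1, 1, -1])            # ground-truth labels
Gx = np.array([1, 1, -1, -1, -1])          # stump predictions (sample 3 is wrong)

error = D[y != Gx].sum()                   # weighted error e = 0.2
alpha = 0.5 * np.log((1 - error) / error)  # formula 8.2: alpha = 1/2 * ln((1-e)/e)

w = D * np.exp(-alpha * y * Gx)            # numerator of formula 8.4
D = w / w.sum()                            # normalize; the misclassified sample
print(alpha, D)                            # gains weight: D[3] goes from 0.2 to 0.5

Correctly classified samples are scaled by exp(-alpha) and misclassified ones by exp(alpha), which is exactly why the next stump concentrates on the samples the current ensemble gets wrong.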
-------------------------------------------------------------------------------- /EM/EM.py: --------------------------------------------------------------------------------

1 | # -*- coding:utf-8 -*-
2 | # @time   : 2020/8/3
3 | # @IDE    : pycharm
4 | # @author : lxztju
5 | # @github : https://github.com/lxztju
6 | # @Email  : lxztju@163.com
7 |
8 | '''
9 | Mix two Gaussian distributions to simulate generated data.
10 |
11 | ---------------------------------------
12 | First Gaussian:  alpha=0.3, mu=0, sigma=1
13 | Second Gaussian: alpha=0.7, mu=1, sigma=3
14 | '''
15 |
16 | import numpy as np
17 | import logging
18 | import time
19 |
20 |
21 | def loadData(*args):
22 |     '''
23 |     Generate data from a Gaussian mixture, given a parameter list.
24 |     :param args: the input list is [alpha0, mu0, sigma0, alpha1, mu1, sigma1]
25 |     :return: the data generated by the Gaussian mixture
26 |     '''
27 |     print(args)
28 |     alpha0, mu0, sigma0, alpha1, mu1, sigma1 = args[0]
29 |
30 |     # number of samples to generate
31 |     length = 1000
32 |     # data produced by the first Gaussian
33 |     data1 = np.random.normal(mu0, sigma0, int(length * alpha0))
34 |     # data produced by the second Gaussian
35 |     data2 = np.random.normal(mu1, sigma1, int(length * alpha1))
36 |     # concatenate both parts
37 |     dataArr = np.append(data1, data2)
38 |     # shuffle the data
39 |     np.random.shuffle(dataArr)
40 |     return dataArr
41 |
42 |
43 | class EM:
44 |     def __init__(self, alpha0, mu0, sigma0, alpha1, mu1, sigma1, dataArr):
45 |         '''
46 |         Parameters of the Gaussian mixture model
47 |         :param alpha0: mixing weight of the first Gaussian
48 |         :param mu0: mean of the first Gaussian
49 |         :param sigma0: standard deviation of the first Gaussian
50 |         :param alpha1: mixing weight of the second Gaussian
51 |         :param mu1: mean of the second Gaussian
52 |         :param sigma1: standard deviation of the second Gaussian
53 |         '''
54 |         self.alpha0 = alpha0
55 |         self.mu0 = mu0
56 |         self.sigma0 = sigma0
57 |         self.alpha1 = alpha1
58 |         self.mu1 = mu1
59 |         self.sigma1 = sigma1
60 |         self.dataArr = dataArr
61 |         self.iter = 200
62 |         self.train()
63 |
64 |     def getGamma(self, mu, sigma):
65 |         # Gaussian density of every sample under N(mu, sigma^2)
66 |         return (1 / (np.sqrt(2 * np.pi) * sigma)) * np.exp(-1 * ((self.dataArr - mu) * (self.dataArr - mu)) / (2 * sigma ** 2))
67 |
68 |     def E_step(self):
69 |         # responsibility of each component for every sample
70 |         gamma0 = self.alpha0 * self.getGamma(self.mu0, self.sigma0)
71 |         gamma1 = self.alpha1 * self.getGamma(self.mu1, self.sigma1)
72 |         sum_ = gamma0 + gamma1
73 |         return gamma0 / sum_, gamma1 / sum_
74 |
75 |     def M_step(self):
76 |         gamma0, gamma1 = self.E_step()
77 |         # update the means
78 |         self.mu0 = sum(gamma0 * self.dataArr) / sum(gamma0)
79 |         self.mu1 = sum(gamma1 * self.dataArr) / sum(gamma1)
80 |         # update the mixing weights
81 |         self.alpha0 = sum(gamma0) / self.dataArr.size
82 |         self.alpha1 = sum(gamma1) / self.dataArr.size
83 |         # update the standard deviations
84 |         self.sigma0 = np.sqrt(sum(gamma0 * (self.dataArr - self.mu0) * (self.dataArr - self.mu0)) / sum(gamma0))
85 |         self.sigma1 = np.sqrt(sum(gamma1 * (self.dataArr - self.mu1) * (self.dataArr - self.mu1)) / sum(gamma1))
86 |
87 |     def train(self):
88 |         for i in range(self.iter):
89 |             self.M_step()
90 |             # print(self.alpha0, self.mu0, self.sigma0)
91 |
92 |
93 | if __name__ == '__main__':
94 |
95 |     parameters = [0.3, 0, 1, 0.7, 1, 3]
96 |     dataArr = loadData(parameters)
97 |     # print(dataArr.shape)
98 |     em = EM(0.5, 0, 1, 0.5, 1, 2, dataArr)
99 |     print(em.alpha0, em.mu0, em.sigma0, em.alpha1, em.mu1, em.sigma1)

-------------------------------------------------------------------------------- /LICENSE: --------------------------------------------------------------------------------

1 | MIT License
2 |
3 | Copyright (c) 2020 lxztju
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |

-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------

1 | # machine_learning_python
2 | Classic machine learning algorithms implemented in pure Python.
3 |
4 | Algorithms completed so far:
5 |
6 | * Perceptron
7 |
8 | * KNN (k-nearest neighbors)
9 |
10 | * Naive Bayes
11 |
12 | * Decision trees
13 |   * ID3
14 |   * C4.5
15 |   * CART
16 |
17 | * Logistic regression
18 |
19 | * Support vector machine
20 |
21 | * AdaBoost
22 | * EM
23 |
24 | Zhihu article: [https://zhuanlan.zhihu.com/p/163688301](https://zhuanlan.zhihu.com/p/163688301)
25 |
26 | My study of these algorithms mainly followed the two repositories below; typing the code out by hand clarified many of the algorithmic details for me. Many thanks to the authors for open-sourcing their work.
27 |
28 | References:
29 |
30 | [https://github.com/Dod-o/Statistical-Learning-Method_Code](https://github.com/Dod-o/Statistical-Learning-Method_Code)
31 |
32 | [https://github.com/fengdu78/lihang-code](https://github.com/fengdu78/lihang-code)
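One thing the README does not show is how to actually run the code. A minimal quick-start sketch (an editorial addition, assuming the raw MNIST files from http://yann.lecun.com/exdb/mnist/ have been unpacked into ~/ML/mnist/, which is the default path every script expects):

```
# one-time: convert the raw MNIST files into the CSVs the scripts read
python mnist/convert_mnist2csv.py
# then run any single-file implementation directly, e.g.
python AdaBoost/AdaBoost.py
python k近邻/knn.py
```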
-------------------------------------------------------------------------------- /k近邻/knn.log: --------------------------------------------------------------------------------

1 | 07-24 21:37 INFO This is an info message.
2 | 07-24 21:37 INFO Loading data....
3 | 07-24 21:37 INFO Loading data done.
4 | 07-24 21:37 INFO test data shape is:(200,784)
5 | 07-24 21:37 INFO train data shape is:(60000,784)
6 | 07-24 21:37 INFO Testing data:(0/200), and correct_num:0
7 | 07-24 21:38 INFO Testing data:(50/200), and correct_num:49
8 | 07-24 21:39 INFO Testing data:(100/200), and correct_num:98
9 | 07-24 21:40 INFO Testing data:(150/200), and correct_num:146
10 | 07-24 21:41 INFO accuracy:0.965
11 | 07-24 21:41 INFO Total Time: 230
12 |

-------------------------------------------------------------------------------- /k近邻/knn.py: --------------------------------------------------------------------------------

1 | # -*- coding:utf-8 -*-
2 | # @time   : 2020/7/24
3 | # @IDE    : pycharm
4 | # @author : lxztju
5 | # @github : https://github.com/lxztju
6 |
7 |
8 | '''
9 | Implement a KNN classifier with linear (brute-force) nearest-neighbour search.
10 | (A kd-tree based search is implemented separately in knn_kd.py.)
11 | ---------------
12 | Distances are measured with the Euclidean or the Manhattan metric.
13 | '''
14 |
15 | import numpy as np
16 | import time
17 | import logging
18 |
19 |
20 | class Knn:
21 |     def __init__(self, k, num_classes, dist_method):
22 |         self.k = k
23 |         self.num_classes = num_classes
24 |         self.dist_method = dist_method
25 |
26 |     def loadData(self, fileName):
27 |         '''
28 |         Load the Mnist dataset
29 |         :param fileName: path of the dataset to load
30 |         :return: dataset and labels as lists
31 |         '''
32 |         # lists holding the data and the labels
33 |         dataArr = []; labelArr = []
34 |         # open the file
35 |         fr = open(fileName, 'r')
36 |         # read the file line by line
37 |         for line in fr.readlines():
38 |             # split each line on the ',' delimiter, returning a list of fields
39 |             curLine = line.strip().split(',')
40 |             labelArr.append(int(curLine[0]))
41 |             dataArr.append([int(num) / 255 for num in curLine[1:]])
42 |             # [int(num) for num in curLine[1:]] -> cast every element except the first (the label) to int
43 |             # [int(num)/255 for num in curLine[1:]] -> divide by 255 to normalize (an optional step)
44 |
45 |         # return data and label
46 |         return dataArr, labelArr
47 |
48 |     def calculate_distance(self, x1, x2):
49 |         '''
50 |         Compute the distance between two vectors
51 |         :param x1: the first vector, in numpy format
52 |         :param x2: the second vector, in numpy format
53 |         :return: the distance, a scalar; 'l2' is Euclidean, 'l1' is Manhattan
54 |         '''
55 |         if self.dist_method == 'l2':
56 |             return np.sqrt(np.sum(np.square(x1 - x2)))
57 |         else:
58 |             return np.sum(np.abs(x1 - x2))
59 |
60 |     def linear_get_k_cloest(self, dataMat, labelMat, x):
61 |         '''
62 |         Linearly scan the training set, find the k nearest neighbours of x,
63 |         and let them vote on the label.
64 |         :param dataMat: training data in numpy format
65 |         :param labelMat: training labels in numpy format
66 |         :param x: the query vector
67 |         :return: label, the label predicted by knn
68 |         '''
69 |         # size of the training data
70 |         m, n = dataMat.shape
71 |         # visit every sample and record its distance to the query point
72 |         dists = [0] * m
73 |         for i in range(m):
74 |             xi = dataMat[i]
75 |             dist = self.calculate_distance(xi, x)
76 |             dists[i] = dist
77 |
78 |         # sort all the distances and take the indices of the k closest points
79 |         # (argsort returns the indices that sort the array ascending)
80 |         topk_index = np.argsort(np.array(dists))[:self.k]
81 |         # labelList counts the neighbours belonging to each class
82 |         labelList = [0] * self.num_classes
83 |         for index in topk_index:
84 |             labelList[int(labelMat[index])] += 1
85 |         # return the majority class
86 |         return labelList.index(max(labelList))
87 |
88 |     def modelTest(self, traindataArr, trainlabelArr, testdataArr, testlabelArr):
89 |         '''
90 |         Measure the accuracy of the knn model
91 |         :param traindataArr: training data as a list
92 |         :param trainlabelArr: training labels as a list
93 |         :param testdataArr: test data as a list
94 |         :param testlabelArr: test labels as a list
95 |         :return: accuracy
96 |         '''
97 |         # convert the data to numpy format for matrix operations
98 |         traindataMat = np.mat(traindataArr)
99 |         trainlabelMat = np.mat(trainlabelArr).T
100 |         testdataMat = np.mat(testdataArr)
101 |         testlabelMat = np.mat(testlabelArr).T
102 |
103 |         # sizes of the test and training sets
104 |         m, n = testdataMat.shape
105 |         m1, n1 = traindataMat.shape
106 |         logging.info('test data shape is:({},{})'.format(m, n))
107 |         logging.info('train data shape is:({},{})'.format(m1, n1))
108 |
109 |         # number of correctly classified samples
110 |         correct_num = 0
111 |
112 |         # walk over all test samples and count the correctly classified ones
113 |         for i in range(m):
114 |             xi = testdataMat[i]
115 |             yi = testlabelMat[i]
116 |             if i % 50 == 0:
117 |                 logging.info('Testing data:({}/{}), and correct_num:{}'.format(i, m, correct_num))
118 |             if self.linear_get_k_cloest(traindataMat, trainlabelMat, xi) == yi:
119 |                 correct_num += 1
120 |
121 |         return round(correct_num / m, 4)
122 |
123 |
124 | if __name__ == '__main__':
125 |
126 |     # set up a logging module to save the logs
127 |     logging.basicConfig(level=logging.DEBUG,
128 |                         format='%(asctime)-12s %(levelname)-8s %(message)s',
129 |                         datefmt='%m-%d %H:%M',
130 |                         filename='knn.log',
131 |                         filemode='w')  # filemode defaults to 'a' (append); 'w' rewrites the file, discarding earlier content
132 |     # a handler that also prints to the console (StreamHandler vs FileHandler)
133 |     console = logging.StreamHandler()
134 |     console.setLevel(logging.INFO)
135 |     # console output format
136 |     formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
137 |     console.setFormatter(formatter)
138 |     # attach the handler to the root logger
139 |     logging.getLogger('').addHandler(console)
140 |
141 |     # root logger output
142 |     logging.info('This is an info message.')
143 |
144 |     start = time.time()
145 |
146 |     # location of the mnist dataset
147 |     import os
148 |     home = os.path.expanduser('~')
149 |     train_path = home + '/ML/mnist/mnist_train.csv'
150 |     test_path = home + '/ML/mnist/mnist_test.csv'
151 |
152 |     topk = 20
153 |     num_classes = 10
154 |     dist_method = 'l2'
155 |     knn = Knn(topk, num_classes, dist_method)
156 |
157 |     # load the training and test sets
158 |     logging.info('Loading data....')
159 |     traindataArr, trainlabelArr = knn.loadData(train_path)
160 |     testdataArr, testlabelArr = knn.loadData(test_path)
161 |     logging.info('Loading data done.')
162 |
163 |     # measure the accuracy of the knn algorithm
164 |     # only the first 200 test samples are used, to keep the runtime short
165 |     accuracy = knn.modelTest(traindataArr, trainlabelArr, testdataArr[:200], testlabelArr[:200])
166 |
167 |     end = time.time()
168 |
169 |     logging.info('accuracy:{}'.format(accuracy))
170 |     logging.info('Total Time: {}'.format(round(end - start, 4)))
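An aside that is not part of the original file: the linear scan above computes one distance per Python-level loop iteration. Assuming the training data is kept as plain NumPy arrays (a hypothetical trainX of shape (m, 784) and trainY of shape (m,)), the same k-nearest vote can be written as a single broadcasted expression, which is typically much faster:

import numpy as np

def topk_vote(trainX, trainY, x, k=20, num_classes=10):
    # L2 distances from x to every training row in one vectorized step
    dists = np.sqrt(((trainX - x) ** 2).sum(axis=1))
    # indices of the k smallest distances
    topk = np.argsort(dists)[:k]
    # majority vote over the labels of the k nearest neighbours
    votes = np.bincount(trainY[topk].astype(int), minlength=num_classes)
    return int(votes.argmax())

The logic mirrors linear_get_k_cloest exactly; only the per-row distance loop is replaced by broadcasting.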
-------------------------------------------------------------------------------- /k近邻/knn_kd.py: --------------------------------------------------------------------------------

1 | # -*- coding:utf-8 -*-
2 | # @time   : 2020/7/24
3 | # @IDE    : pycharm
4 | # @author : lxztju
5 | # @github : https://github.com/lxztju
6 |
7 |
8 | '''
9 | Implement a KNN classifier,
10 | building a kd-tree to search for the nearest neighbour.
11 | ----------------
12 | Distances are again measured with the Euclidean metric.
13 | '''
14 |
15 |
16 |
17 | import time
18 | import logging
19 |
20 | from collections import namedtuple
21 | from math import sqrt
22 |
23 |
24 | def loadData(fileName):
25 |     '''
26 |     Load the Mnist dataset
27 |     :param fileName: path of the dataset to load
28 |     :return: dataset and labels as lists
29 |     '''
30 |     # lists holding the data and the labels
31 |     dataArr = []
32 |     labelArr = []
33 |     # open the file
34 |     fr = open(fileName, 'r')
35 |     # read the file line by line
36 |     for line in fr.readlines():
37 |         # split each line on the ',' delimiter, returning a list of fields
38 |         curLine = line.strip().split(',')
39 |
40 |         labelArr.append(int(curLine[0]))
41 |         dataArr.append([int(num) / 255 for num in curLine[1:]])
42 |         # [int(num) for num in curLine[1:]] -> cast every element except the first (the label) to int
43 |         # [int(num)/255 for num in curLine[1:]] -> divide by 255 to normalize (an optional step)
44 |
45 |
46 |     # return data and label
47 |     return dataArr, labelArr
48
| 49 | 50 | # kd-tree每个结点中主要包含的数据结构如下 51 | class KdNode: 52 | def __init__(self, dom_elt, split, left, right): 53 | self.dom_elt = dom_elt # k维向量节点(k维空间中的一个样本点) 54 | self.split = split # 整数(进行分割维度的序号) 55 | self.left = left # 该结点分割超平面左子空间构成的kd-tree 56 | self.right = right # 该结点分割超平面右子空间构成的kd-tree 57 | 58 | 59 | class KdTree: 60 | ''' 61 | 对于输入空间构建KD树 62 | ''' 63 | def __init__(self, data): 64 | # data为list格式的数据 65 | k = len(data[0]) # 数据维度 66 | 67 | def CreateNode(split, data_set): # 按第split维划分数据集dataset创建KdNode 68 | if not data_set: # 数据集为空 69 | return None 70 | # 对于输入的列表版找第split维进行排序 71 | data_set.sort(key=lambda x: x[split]) 72 | split_pos = len(data_set) // 2 # 找到中位数的索引 73 | median = data_set[split_pos] # 中位数分割点 74 | split_next = (split + 1) % k # cycle coordinates 75 | 76 | # 递归的创建kd树 77 | return KdNode( 78 | median, 79 | split, 80 | CreateNode(split_next, data_set[:split_pos]), # 创建左子树 81 | CreateNode(split_next, data_set[split_pos + 1:])) # 创建右子树 82 | 83 | self.root = CreateNode(0, data) # 从第0维分量开始构建kd树,返回根节点 84 | 85 | 86 | 87 | 88 | 89 | class KnnKd(): 90 | def __init__(self, kd, traindataArr, trainlabelArr): 91 | self.kd = kd 92 | # 定义一个namedtuple,分别存放最近坐标点、最近距离和访问过的节点数 93 | self.result = namedtuple("Result_tuple", 94 | "nearest_point nearest_dist nodes_visited") 95 | self.data_label_dict = {''.join([str(j) for j in traindataArr[i]]): trainlabelArr[i] for i in range(len(trainlabelArr)) } 96 | 97 | 98 | 99 | def find_nearest(self, point): 100 | ''' 101 | # 对构建好的kd树进行搜索,寻找与目标点最近的样本点: 102 | :param point: 待查找的某个节点 103 | :return: 返回对应的类别 104 | ''' 105 | k = len(point) # 数据维度 106 | 107 | def travel(kd_node, target, max_dist): 108 | ''' 109 | 递归在kd树中进行搜索,对应的point 110 | :param kd_node: kd树的节点 111 | :param target: 待查找的节点 112 | :param max_dist: 以待查找节点为圆心的超球的半径 113 | :return: 返回最终的numed_tuple 114 | ''' 115 | if kd_node is None: 116 | return self.result([0] * k, float("inf"), 117 | 0) # python中用float("inf")和float("-inf")表示正负无穷 118 | 119 | nodes_visited = 1 120 | 121 | s = kd_node.split # 进行分割的维度 122 | pivot = kd_node.dom_elt # 进行分割的“轴” 123 | 124 | if target[s] <= pivot[s]: # 如果目标点第s维小于分割轴的对应值(目标离左子树更近) 125 | nearer_node = kd_node.left # 下一个访问节点为左子树根节点 126 | further_node = kd_node.right # 同时记录下右子树 127 | else: # 目标离右子树更近 128 | nearer_node = kd_node.right # 下一个访问节点为右子树根节点 129 | further_node = kd_node.left 130 | 131 | temp1 = travel(nearer_node, target, max_dist) # 进行遍历找到包含目标点的区域 132 | 133 | nearest = temp1.nearest_point # 以此叶结点作为“当前最近点” 134 | dist = temp1.nearest_dist # 更新最近距离 135 | 136 | nodes_visited += temp1.nodes_visited 137 | 138 | if dist < max_dist: 139 | max_dist = dist # 最近点将在以目标点为球心,max_dist为半径的超球体内 140 | 141 | temp_dist = abs(pivot[s] - target[s]) # 第s维上目标点与分割超平面的距离 142 | if max_dist < temp_dist: # 判断超球体是否与超平面相交 143 | return self.result(nearest, dist, nodes_visited) # 不相交则可以直接返回,不用继续判断 144 | 145 | # ---------------------------------------------------------------------- 146 | # 计算目标点与分割点的欧氏距离 147 | temp_dist = sqrt(sum((p1 - p2) ** 2 for p1, p2 in zip(pivot, target))) 148 | 149 | if temp_dist < dist: # 如果“更近” 150 | nearest = pivot # 更新最近点 151 | dist = temp_dist # 更新最近距离 152 | max_dist = dist # 更新超球体半径 153 | 154 | # 检查另一个子结点对应的区域是否有更近的点 155 | temp2 = travel(further_node, target, max_dist) 156 | 157 | nodes_visited += temp2.nodes_visited 158 | if temp2.nearest_dist < dist: # 如果另一个子结点内存在更近距离 159 | nearest = temp2.nearest_point # 更新最近点 160 | dist = temp2.nearest_dist # 更新最近距离 161 | 162 | return self.result(nearest, dist, nodes_visited) 163 | 164 | res = travel(self.kd.root, point, 
float("inf")) # 从根节点开始递归 165 | return self.data_label_dict[''.join([str(j)for j in res.nearest_point])] 166 | 167 | 168 | 169 | if __name__ == '__main__': 170 | 171 | # 定义一个日志模块来保存日志 172 | logging.basicConfig(level=logging.DEBUG, 173 | format='%(asctime)-12s %(levelname)-8s %(message)s', 174 | datefmt='%m-%d %H:%M', 175 | filename='knnkd.log', 176 | filemode='w') # filemode默认为a,追加信息到日志文件,指定为‘w',重新写入 文件,之前的文件信息丢失 177 | # 定义一个handler来将信息输出到控制台,StreamHandler与FileHandler 178 | console = logging.StreamHandler() 179 | console.setLevel(logging.INFO) 180 | # 设置在控制台输出格式[- 181 | formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s') 182 | console.setFormatter(formatter) 183 | # 将handler加入到根记录器 184 | logging.getLogger('').addHandler(console) 185 | 186 | # 根记录器输出信息 187 | logging.info('This is an info message.') 188 | 189 | 190 | start = time.time() 191 | 192 | 193 | # mnist数据集的存储位置 194 | import os 195 | home = os.path.expanduser('~') 196 | train_path = home + '/ML/mnist/mnist_train.csv' 197 | test_path = home + '/ML/mnist/mnist_train.csv' 198 | 199 | 200 | 201 | # 读取训练与测试集 202 | logging.info('Loading data....') 203 | 204 | traindataArr, trainlabelArr = loadData(train_path) 205 | testdataArr, testlabelArr = loadData(test_path) 206 | logging.info('Loading data done.') 207 | 208 | 209 | 210 | # 构建KD树 211 | logging.info('Building Kd Tree...') 212 | kd = KdTree(traindataArr) 213 | 214 | knnkd = KnnKd(kd, traindataArr, trainlabelArr) 215 | logging.info('Classify one image.....') 216 | 217 | print(knnkd.find_nearest(testdataArr[0]), testlabelArr[0]) 218 | 219 | 220 | end = time.time() 221 | 222 | # logging.info('accuracy:{}'.format(accuracy)) 223 | logging.info('Total Time: {}'.format(round(end-start), 4)) 224 | -------------------------------------------------------------------------------- /k近邻/knnkd.log: -------------------------------------------------------------------------------- 1 | 07-24 21:36 INFO This is an info message. 2 | 07-24 21:36 INFO Loading data.... 3 | 07-24 21:36 INFO Loading data done. 4 | 07-24 21:36 INFO Building Kd Tree... 5 | 07-24 21:36 INFO Classify one image..... 6 | 07-24 21:36 INFO Total Time: 31 7 | -------------------------------------------------------------------------------- /mnist/convert_mnist2csv.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @time :2020/7/23 3 | # @IDE : pycharm 4 | # @author :lxztju 5 | # @github : https://github.com/lxztju 6 | 7 | ''' 8 | 将mnist数据集转换为csv格式便于查看 9 | mnist数据集下载地址:http://yann.lecun.com/exdb/mnist/ 10 | 下载解压之后,运行这个文件即可转换 11 | ------- 12 | ''' 13 | 14 | 15 | def convert(img_path, label_path, target_path, n): 16 | img = open(img_path, "rb") 17 | target = open(target_path, "w") 18 | label = open(label_path, "rb") 19 | 20 | 21 | ## 开头要先让指针滑动一部分 22 | # 图像文件的前16个字节是头, 包含了4个字节的幻数, 4个字节表示图像数量 23 | # 4个字节表示单个图像的行数, 4个字节表示单个图像的列数. 
24 | # 标记文件的前8个字节是头, 包含了4个字节的幻数, 4个字节表示标记数量 25 | img.read(16) 26 | label.read(8) 27 | images = [] 28 | # s1 = label.read(1) 29 | # print(s1, ord(s1)) 30 | for i in range(n): 31 | image = [ord(label.read(1))] 32 | for j in range(28*28): 33 | image.append(ord(img.read(1))) 34 | images.append(image) 35 | 36 | for image in images: 37 | target.write(",".join(str(pix) for pix in image)+"\n") 38 | 39 | img.close() 40 | target.close() 41 | label.close() 42 | 43 | if __name__ == '__main__': 44 | import os 45 | home = os.path.expanduser('~') 46 | path = home + '/ML/mnist/' 47 | convert(path + "train-images.idx3-ubyte", path + "train-labels.idx1-ubyte", 48 | path + "mnist_train.csv", 60000) 49 | convert(path + "t10k-images.idx3-ubyte", path + "t10k-labels.idx1-ubyte", 50 | path + "mnist_test.csv", 10000) 51 | 52 | 53 | convert(path + "train-images.idx3-ubyte", path + "train-labels.idx1-ubyte", 54 | path + "mnist_train_samples.csv", 200) 55 | convert(path + "t10k-images.idx3-ubyte", path + "t10k-labels.idx1-ubyte", 56 | path + "mnist_test_samples.csv", 10) 57 | 58 | # import pandas as pd 59 | # test_data = pd.read_csv('./mnist_test.csv') 60 | # print(test_data.shape) 61 | # print(test_data.head()) 62 | -------------------------------------------------------------------------------- /决策树/C4.5.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @time :2020/7/27 3 | # @IDE : pycharm 4 | # @author :lxztju 5 | # @github : https://github.com/lxztju 6 | 7 | ''' 8 | 构建决策树 9 | C4.5算法实现决策树(不剪枝) 10 | C4.5采用信息增益比作为特征选择的标准 11 | ----------------------------- 12 | 这部分尚未完成,因为在实际代码实现过程和,计算信息增益比时,数据集关于特征的熵HA(D), 13 | 这部分的计算出现问题,例如log项,内部的比值会出现1,整个值为0,后边计算时分母为0 14 | 同时这一项如果log内部如果出现0,计算也会出现错误。 15 | 后边还要研究一下如何解决这个问题,会报出warning 16 | 17 | -------------------------- 18 | 自己没有仔细看过这个算法怎么解决这个问题,所以实现可能有点问题。 19 | 有大神帮忙在github上一起改一改这个代码,完善一下,就更好了 20 | ''' 21 | 22 | import numpy as np 23 | import logging 24 | import time 25 | import copy 26 | 27 | def loadData(fileName): 28 | ''' 29 | 加载Mnist数据集 30 | :param fileName:要加载的数据集路径 31 | :return: list形式的数据集及标记 32 | ''' 33 | # 存放数据及标记的list 34 | dataArr = [] 35 | labelArr = [] 36 | # 打开文件 37 | fr = open(fileName, 'r') 38 | # 将文件按行读取 39 | for line in fr.readlines(): 40 | # 对每一行数据按切割福','进行切割,返回字段列表 41 | curLine = line.strip().split(',') 42 | 43 | labelArr.append(int(curLine[0])) 44 | # 进行二值化处理,将大于128的标记为1, 小于128的标记为0 45 | dataArr.append([int(int(num)>128) for num in curLine[1:]]) 46 | # 存放标记 47 | # [int(num) for num in curLine[1:]] -> 遍历每一行中除了以第一个元素(标记)外将所有元素转换成int类型 48 | # [int(num)/255 for num in curLine[1:]] -> 将所有数据除255归一化(非必须步骤,可以不归一化) 49 | 50 | # 返回data和label 51 | return dataArr, labelArr 52 | 53 | 54 | 55 | 56 | class C45DecisionTree: 57 | def __init__(self, traindataList, trainlabelList): 58 | ''' 59 | 初始化决策树类 60 | :param traindataList: 训练数据集的list形式 61 | :param trainlabelList: 训练数据集的label的list形式 62 | ''' 63 | self.traindataList = traindataList 64 | self.trainlabelList = trainlabelList 65 | self.traindataArr = np.array(self.traindataList) 66 | self.trainlabelArr = np.array(self.trainlabelList) 67 | 68 | 69 | self.tree = self.build_C45tree(self.traindataArr, self.trainlabelArr) 70 | 71 | 72 | def calculate_empirical_entropy(self, trainLabelArr): 73 | ''' 74 | 计算训练数据集的经验熵,公式参考李航老师统计学习方法 75 | :param trainLabelArr: numpy格式的label 76 | :return: 返回训练集的经验熵 77 | ''' 78 | # 初始化经验熵为0 79 | H_D = 0 80 | # 这里为什么不采用self.num_classes直接调用,我刚开始也是这么写的 81 | # 后来发现如果在后期的计算中,某个类别不出现,那么log0会出现错误(参考README.md参考链接中大佬的利用set的实现) 82 | labels = 
set([label for label in trainLabelArr]) 83 | for label in labels: 84 | 85 | # 根据公式需要计算每个类别的数目 86 | num = trainLabelArr[trainLabelArr==label].size 87 | # 计算每个类别占据数目占据整个数据集的比例 88 | p = num / trainLabelArr.size 89 | # 计算经验熵 90 | H_D += -1 *(p) * np.log2(p) 91 | 92 | return H_D 93 | 94 | def calculate_HDA(self, traindataArr, A): 95 | ''' 96 | 计算数据集关于特征的A 97 | :param traindataArr: 训练数据集, numpy格式 98 | :param A: 特征A 99 | :return: 返回Ha(D)熵值 100 | ''' 101 | HDA = 0 102 | features = set([feature for feature in traindataArr[:,A]]) 103 | for feature in features: 104 | if traindataArr[:, A][traindataArr[:, A]== feature].size == 0: 105 | print(traindataArr, traindataArr.shape, features) 106 | p = traindataArr[:, A][traindataArr[:, A]== feature].size / traindataArr[:, A].size 107 | if p == 1: 108 | HDA = 1 109 | else: 110 | HDA += -1 * p * np.log2(p) 111 | 112 | return HDA 113 | 114 | 115 | 116 | def calculate_empirical_conditional_entropy(self, trainfeatureArr, trainlabelarr): 117 | ''' 118 | 计算经验条件熵 119 | :param trainfeatureArr: numpy格式的从数据集中抽离出某一个特征列 120 | :param trainlabelabelArr: numpy格式的label 121 | :return: 经验条件熵 122 | ''' 123 | 124 | # 经验熵是对每个特征进行计算,因此应该返回一个列表,对于每个特征都进行计算分析 125 | # 桶计算经验熵时一样,采用set来选取特针的不同取值 126 | features = set([feature for feature in trainfeatureArr]) 127 | H_D_A = 0 128 | for feature in features: 129 | # 计算取不同值时所包含的样本的数目 130 | Di = trainfeatureArr[trainfeatureArr == feature].size 131 | Di_D = Di / trainfeatureArr.size 132 | 133 | # 计算对于选取的特征取feature值时的条件熵 134 | 135 | H_D_A += Di_D * self.calculate_empirical_entropy(trainlabelarr[trainfeatureArr == feature]) 136 | 137 | return H_D_A 138 | 139 | 140 | 141 | 142 | def calculate_information_gain_ratio(self, traindataArr, trainlabelArr): 143 | ''' 144 | :param traindataArr: 当前数据集的数组,numpy格式,因为每次在构建决策树机型分支的过程中,随着决策树层数的加深当前数据集会比越变越小 145 | :param trainlabelArr: 当前数据集的label数组,numpy格式 146 | 计算最大的信息增益 147 | :return: 最大的信息增益及其对应的特征。 148 | ''' 149 | # 获取当前数据集的特征数目 150 | num_features = traindataArr.shape[1] 151 | max_feature, max_gain = 0, 0 152 | # 计算当前数据集的经验熵 153 | H_D = self.calculate_empirical_entropy(trainlabelArr) 154 | # 计算每个特征的经验条件熵 155 | for i in range(num_features): 156 | trainfeatureArr = traindataArr[:,i] 157 | H_D_i = self.calculate_empirical_conditional_entropy(trainfeatureArr, trainlabelArr) 158 | G_D_A = H_D - H_D_i 159 | H_A_D = self.calculate_HDA(traindataArr, i) 160 | # if H_A_D == 0: return 161 | gain = G_D_A / H_A_D 162 | if gain > max_gain: 163 | max_gain = gain 164 | max_feature = i 165 | # 返回最大的信息增益,及其特征 166 | return max_feature, max_gain 167 | 168 | 169 | 170 | def updateDataSet(self, traindataArr, trainlabelArr, A, a): 171 | ''' 172 | 在构建决策树的过程中,需要实时更新决策树的数据集 173 | :param traindataArr: 待更新的数据集,numpy格式 174 | :param trainlabelArr: 待更新的数据集label, numpy格式 175 | :param A: 需要删除的特征 176 | :param a: 对于需要删除的特征A,如果其取值为a,那说明这个样本需要保留(解释一下,例如对于是否有工作这个特征,a为有工作 177 | 那么所有有工作的样本需要保留。 178 | :return: 返回新的数据集及标签,numpy格式 179 | ''' 180 | newdataArr = np.delete(traindataArr[traindataArr[:,A] == a], A, axis=1) 181 | newlabelArr = trainlabelArr[traindataArr[:,A] == a] 182 | return newdataArr, newlabelArr 183 | 184 | 185 | def majorClass(self, trainlabelArr): 186 | ''' 187 | 在label中找到数量最多的类别 188 | :param trainlabelArr: 训练数据集的label, numpy格式的 189 | :return: 返回最大的类别 190 | ''' 191 | label = list(trainlabelArr) 192 | return max(label, key=label.count) 193 | 194 | 195 | def build_C45tree(self, traindataArr, trainlabelArr): 196 | ''' 197 | 在数据集上递归构建决策树 198 | :param traindataArr: 当前节点为根节点对应的数据集 numpy 199 | :param trainlabelArr: 当前节点为根节点对应的数据集label 
numpy 200 | :return: 返回节点的值 201 | ''' 202 | epsilon = 0.1 203 | 204 | 205 | # logging.info('Starting create a new Node. Now there are {} samples'.format(trainlabelArr.size)) 206 | 207 | 208 | classDict = set(trainlabelArr) 209 | # print(classDict) 210 | 211 | if len(classDict) == 1: 212 | return int(classDict.pop()) 213 | if len(traindataArr.shape) == 1: 214 | 215 | return self.majorClass(trainlabelArr) 216 | 217 | Ag, G_D_Ag_r = self.calculate_information_gain_ratio(traindataArr, trainlabelArr) 218 | # print(Ag, G_D_Ag_r) 219 | if G_D_Ag_r < epsilon: 220 | return self.majorClass(trainlabelArr) 221 | 222 | tree = {Ag:{}} 223 | 224 | features = set(feature for feature in traindataArr[:, Ag]) 225 | for feature in features: 226 | a = int(feature) 227 | newdataArr, newlabelArr = self.updateDataSet(traindataArr, trainlabelArr, Ag, a) 228 | 229 | tree[Ag][a] = self.build_C45tree(newdataArr, newlabelArr) 230 | # print(tree) 231 | return tree 232 | 233 | def predict(self, testdataList): 234 | ''' 235 | 使用构建完成的决策树来预测对应的测试数据 236 | :param testdataList: 输入的行测试数据,list格式 237 | :return: 返回类别 238 | ''' 239 | tree = copy.deepcopy(self.tree) 240 | while True: 241 | if type(tree).__name__ != 'dict': 242 | return tree 243 | # print(tree.items()) 244 | (key, value), = tree.items() 245 | 246 | if type(tree[key]).__name__ == 'dict': 247 | dataval = testdataList[key] 248 | 249 | del testdataList[key] 250 | tree = value[dataval] 251 | 252 | if type(tree).__name__ != 'dict': 253 | return tree 254 | 255 | else: 256 | return value 257 | 258 | 259 | 260 | def testModel(self, testdataList, testlabelList): 261 | ''' 262 | 测试决策树模型的准确率 263 | :param testdataList: 输入测试集的数据 264 | :param testlabelList: 输入测试集数据的label 265 | :return: 准确率accuracy 266 | ''' 267 | # 268 | correct_num = 0 269 | 270 | for i in range(len(testdataList)): 271 | prediction = self.predict(testdataList[i]) 272 | if prediction == testlabelList[i]: 273 | correct_num += 1 274 | 275 | return round(correct_num/len(testlabelList), 4) 276 | 277 | 278 | 279 | 280 | 281 | 282 | if __name__ == '__main__': 283 | 284 | # 定义一个日志模块来保存日志 285 | logging.basicConfig(level=logging.DEBUG, 286 | format='%(asctime)-12s %(levelname)-8s %(message)s', 287 | datefmt='%m-%d %H:%M', 288 | filename='C45_decision_tree.log', 289 | filemode='w') # filemode默认为a,追加信息到日志文件,指定为‘w',重新写入 文件,之前的文件信息丢失 290 | # 定义一个handler来将信息输出到控制台,StreamHandler与FileHandler 291 | console = logging.StreamHandler() 292 | console.setLevel(logging.INFO) 293 | # 设置在控制台输出格式[- 294 | formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s') 295 | console.setFormatter(formatter) 296 | # 将handler加入到根记录器 297 | logging.getLogger('').addHandler(console) 298 | 299 | # 根记录器输出信息 300 | logging.info('This is an info message.') 301 | 302 | 303 | start = time.time() 304 | 305 | 306 | # mnist数据集的存储位置 307 | import os 308 | home = os.path.expanduser('~') 309 | # train_path = home + '/ML/mnist/mnist_train.csv' 310 | # test_path = home + '/ML/mnist/mnist_test.csv' 311 | train_path = home + '/ML/mnist/mnist_train_samples.csv' 312 | test_path = home + '/ML/mnist/mnist_test_samples.csv' 313 | 314 | # 读取训练与测试集 315 | logging.info('Loading data....') 316 | 317 | traindataArr, trainlabelArr =loadData(train_path) 318 | testdataArr, testlabelArr = loadData(test_path) 319 | logging.info('Loading data done.') 320 | 321 | logging.info('Building a decision tree.') 322 | C45 = C45DecisionTree(traindataArr, trainlabelArr) 323 | 324 | logging.info('Using decision tree to predict one sample.') 325 | 326 | prediction = 
C45.predict(testdataArr[0]) 327 | logging.info('Testing processing Done,and the prediction and label are : ({},{})'.format(str(prediction), str(testlabelArr[0]))) 328 | 329 | #测试朴决策树算法的准确率 330 | # 挑选测试集的前200个进行测试,防止运行时间过长 331 | logging.info('Testing the decision model.') 332 | accuracy = C45.testModel(testdataArr[:200], testlabelArr[:200]) 333 | 334 | 335 | end = time.time() 336 | 337 | logging.info('accuracy:{}'.format(accuracy)) 338 | logging.info('Total Time: {}'.format(round(end-start), 4)) 339 | 340 | -------------------------------------------------------------------------------- /决策树/C45_decision_tree.log: -------------------------------------------------------------------------------- 1 | 07-27 15:47 INFO This is an info message. 2 | 07-27 15:47 INFO Loading data.... 3 | 07-27 15:47 INFO Loading data done. 4 | 07-27 15:47 INFO Building a decision tree. 5 | 07-27 15:47 INFO Using decision tree to predict one sample. 6 | 07-27 15:47 INFO Testing processing Done,and the prediction and label are : (7,7) 7 | 07-27 15:47 INFO Testing the decision model. 8 | 07-27 15:47 INFO accuracy:0.4 9 | 07-27 15:47 INFO Total Time: 2 10 | -------------------------------------------------------------------------------- /决策树/CART.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @time :2020/7/28 3 | # @IDE : pycharm 4 | # @author :lxztju 5 | # @github : https://github.com/lxztju 6 | 7 | ''' 8 | 实现CART分类树 9 | 这个算法与ID3和C4.5算法的主要不同在于采用Gini指数来选择特征 10 | ----------- 11 | 未剪枝 12 | ''' 13 | 14 | import numpy as np 15 | import logging 16 | import time 17 | import copy 18 | 19 | def loadData(fileName): 20 | ''' 21 | 加载Mnist数据集 22 | :param fileName:要加载的数据集路径 23 | :return: list形式的数据集及标记 24 | ''' 25 | # 存放数据及标记的list 26 | dataArr = [] 27 | labelArr = [] 28 | # 打开文件 29 | fr = open(fileName, 'r') 30 | # 将文件按行读取 31 | for line in fr.readlines(): 32 | # 对每一行数据按切割福','进行切割,返回字段列表 33 | curLine = line.strip().split(',') 34 | 35 | labelArr.append(int(curLine[0])) 36 | # 进行二值化处理,将大于128的标记为1, 小于128的标记为0 37 | dataArr.append([int(int(num)>128) for num in curLine[1:]]) 38 | # 存放标记 39 | # [int(num) for num in curLine[1:]] -> 遍历每一行中除了以第一个元素(标记)外将所有元素转换成int类型 40 | # [int(num)/255 for num in curLine[1:]] -> 将所有数据除255归一化(非必须步骤,可以不归一化) 41 | 42 | # 返回data和label 43 | return dataArr, labelArr 44 | 45 | 46 | 47 | 48 | class CARTDecisionTree: 49 | def __init__(self, traindataList, trainlabelList): 50 | ''' 51 | 初始化决策树类 52 | :param traindataList: 训练数据集的list形式 53 | :param trainlabelList: 训练数据集的label的list形式 54 | ''' 55 | self.traindataList = traindataList 56 | self.trainlabelList = trainlabelList 57 | self.traindataArr = np.array(self.traindataList) 58 | self.trainlabelArr = np.array(self.trainlabelList) 59 | 60 | 61 | self.tree = self.build_CARTtree(self.traindataArr, self.trainlabelArr) 62 | print(self.tree) 63 | def calculate_Gini(self, trainlabelArr): 64 | ''' 65 | 计算数据集D的Gini指数 66 | # :param traindataArr: 训练数据集 numpy格式 67 | :param trainlabelArr: 训练数据集label numpy格式 68 | :return: 返回Gini指数 69 | ''' 70 | D = trainlabelArr.size 71 | labels = set([label for label in trainlabelArr]) 72 | Gini = 1 73 | for label in labels: 74 | Ck = trainlabelArr[trainlabelArr==label].size 75 | Gini -= ( Ck /D) ** 2 76 | return Gini 77 | 78 | 79 | 80 | def calculate_Gini_feature(self, trainfeatureArr, trainlabelArr, a): 81 | ''' 82 | 计算数据集针对特征A的取值为a时的Gini指数 83 | :param trainfeatureArr: 切分后的训练数据集某一个特征列 numpy格式 84 | :param trainlabelArr: 训练数据集label numpy格式 85 | :param a: 特征A的某一个取值 86 | :return: 
返回基尼指数 87 | ''' 88 | D1 = trainfeatureArr[trainfeatureArr == a].size 89 | D = trainfeatureArr.size 90 | D2 = D - D1 91 | d1 = trainlabelArr[trainfeatureArr == a] 92 | d2 = trainlabelArr[trainfeatureArr != a] 93 | 94 | Gini_D_A = abs(D1/D) * self.calculate_Gini(d1) + abs(D2/D) * self.calculate_Gini( d2 ) 95 | 96 | return Gini_D_A 97 | 98 | def calculate_min_Gini(self, traindataArr, trainlabelArr): 99 | ''' 100 | 计算最小的Gini指数与对应的特征 101 | :param traindataArr: 训练数据集 numpy格式 102 | :param trainlabelArr: 训练数据集的label numpy格式 103 | :return: 返回最小的Gini指数与对应的特征 104 | ''' 105 | num_features = traindataArr.shape[1] 106 | min_Gini = float('inf') 107 | feature = -1 108 | v = -1 109 | for i in range(num_features): 110 | trainfeatureArr = traindataArr[:, i] 111 | values = set([value for value in trainfeatureArr]) 112 | for value in values: 113 | gini = self.calculate_Gini_feature(trainfeatureArr, trainlabelArr, value) 114 | if gini < min_Gini: 115 | min_Gini = gini 116 | feature = i 117 | v = value 118 | 119 | return feature, v, min_Gini 120 | 121 | 122 | 123 | 124 | def updateDataSetleft(self, traindataArr, trainlabelArr, A, a): 125 | ''' 126 | 在构建决策树的过程中,需要实时更新决策树的数据集 127 | :param traindataArr: 待更新的数据集,numpy格式 128 | :param trainlabelArr: 待更新的数据集label, numpy格式 129 | :param A: 需要删除的特征 130 | :param a: 对于需要删除的特征A,如果其取值为a,那说明这个样本需要保留(解释一下,例如对于是否有工作这个特征,a为有工作 131 | 那么所有有工作的样本需要保留。 132 | :return: 返回新的数据集及标签,numpy格式 133 | ''' 134 | newdataArr = np.delete(traindataArr[traindataArr[:,A] == a], A, axis=1) 135 | newlabelArr = trainlabelArr[traindataArr[:,A] == a] 136 | return newdataArr, newlabelArr 137 | 138 | 139 | def updateDataSetright(self, traindataArr, trainlabelArr, A, a): 140 | ''' 141 | 在构建决策树的过程中,需要实时更新决策树的数据集 142 | :param traindataArr: 待更新的数据集,numpy格式 143 | :param trainlabelArr: 待更新的数据集label, numpy格式 144 | :param A: 需要删除的特征 145 | :param a: 对于需要删除的特征A,如果其取值为a,那说明这个样本需要保留(解释一下,例如对于是否有工作这个特征,a为有工作 146 | 那么所有有工作的样本需要保留。 147 | :return: 返回新的数据集及标签,numpy格式 148 | ''' 149 | newdataArr = np.delete(traindataArr[traindataArr[:,A] != a], A, axis=1) 150 | newlabelArr = trainlabelArr[traindataArr[:,A] != a] 151 | return newdataArr, newlabelArr 152 | 153 | 154 | def majorClass(self, trainlabelArr): 155 | ''' 156 | 在label中找到数量最多的类别 157 | :param trainlabelArr: 训练数据集的label, numpy格式的 158 | :return: 返回最大的类别 159 | ''' 160 | label = list(trainlabelArr) 161 | return max(label, key=label.count) 162 | 163 | 164 | def build_CARTtree(self, traindataArr, trainlabelArr): 165 | ''' 166 | 在数据集上递归构建决策树 167 | :param traindataArr: 当前节点为根节点对应的数据集 numpy 168 | :param trainlabelArr: 当前节点为根节点对应的数据集label numpy 169 | :return: 返回节点的值 170 | ''' 171 | # 信息增益的阈值 172 | epsilon = 0.1 173 | 174 | node_thresh = 5 175 | 176 | 177 | # logging.info('Starting create a new Node. 
Now there are {} samples'.format(trainlabelArr.size)) 178 | 179 | # 判断数据集此时的类别,如果只有一类,就范会对应的类别 180 | classDict = set(trainlabelArr) 181 | # print(classDict) 182 | if len(classDict) == 1: 183 | return int(classDict.pop()) 184 | # print(traindataArr.shape) 185 | # 判断数据集此时的的特征数目,如果没有特征集,那就说明没有特征进行分割,就放会这些样本中数目最多的类别 186 | if len(traindataArr.shape) == 1: 187 | return self.majorClass(trainlabelArr) 188 | # 计算最大增益及其对应的特征 189 | Ag, a, Gini = self.calculate_min_Gini(traindataArr, trainlabelArr) 190 | # print(Ag, Gini) 191 | # 如果最大的信息增益小于设定的阈值,就直接返回数目最多的类,不必要进行分割 192 | if Gini < epsilon: 193 | return self.majorClass(trainlabelArr) 194 | 195 | if trainlabelArr.size < node_thresh: 196 | return self.majorClass(trainlabelArr) 197 | 198 | tree = {Ag:{}} 199 | # 递归构建决策树 200 | 201 | 202 | newdataArrleft, newlabelArrleft = self.updateDataSetleft(traindataArr, trainlabelArr, Ag, a) 203 | newdataArrright, newlabelArrright = self.updateDataSetright(traindataArr, trainlabelArr, Ag, a) 204 | # print(newlabelArrleft.size, newlabelArrright.size, trainlabelArr.size) 205 | if newlabelArrleft.size > 0: 206 | tree[Ag][a] = {'left': self.build_CARTtree(newdataArrleft, newlabelArrleft)} 207 | if newlabelArrright.size > 0: 208 | tree[Ag][a]['right'] = self.build_CARTtree(newdataArrright, newlabelArrright) 209 | 210 | # print(tree) 211 | return tree 212 | 213 | def predict(self, testdataList): 214 | ''' 215 | 使用构建完成的决策树来预测对应的测试数据 216 | :param testdataList: 输入的行测试数据,list格式 217 | :return: 返回类别 218 | ''' 219 | tree = copy.deepcopy(self.tree) 220 | # print(tree) 221 | while True: 222 | if type(tree).__name__ != 'dict': 223 | return tree 224 | # print(tree.items()) 225 | (key, value), = tree.items() 226 | 227 | if type(tree[key]).__name__ == 'dict': 228 | dataval = testdataList[key] 229 | 230 | del testdataList[key] 231 | 232 | k = list(value.keys()) 233 | if dataval not in k: 234 | tree = value[k[0]]['right'] 235 | else: 236 | tree = value[dataval]['left'] 237 | 238 | if type(tree).__name__ != 'dict': 239 | return tree 240 | 241 | else: 242 | return value 243 | 244 | 245 | 246 | def testModel(self, testdataList, testlabelList): 247 | ''' 248 | 测试决策树模型的准确率 249 | :param testdataList: 输入测试集的数据 250 | :param testlabelList: 输入测试集数据的label 251 | :return: 准确率accuracy 252 | ''' 253 | # 254 | correct_num = 0 255 | 256 | for i in range(len(testdataList)): 257 | prediction = self.predict(testdataList[i]) 258 | if prediction == testlabelList[i]: 259 | correct_num += 1 260 | 261 | return round(correct_num/len(testlabelList), 4) 262 | 263 | 264 | 265 | 266 | 267 | 268 | if __name__ == '__main__': 269 | 270 | # 定义一个日志模块来保存日志 271 | logging.basicConfig(level=logging.DEBUG, 272 | format='%(asctime)-12s %(levelname)-8s %(message)s', 273 | datefmt='%m-%d %H:%M', 274 | filename='CART_decision_tree.log', 275 | filemode='w') # filemode默认为a,追加信息到日志文件,指定为‘w',重新写入 文件,之前的文件信息丢失 276 | # 定义一个handler来将信息输出到控制台,StreamHandler与FileHandler 277 | console = logging.StreamHandler() 278 | console.setLevel(logging.INFO) 279 | # 设置在控制台输出格式[- 280 | formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s') 281 | console.setFormatter(formatter) 282 | # 将handler加入到根记录器 283 | logging.getLogger('').addHandler(console) 284 | 285 | # 根记录器输出信息 286 | logging.info('This is an info message.') 287 | 288 | 289 | start = time.time() 290 | 291 | 292 | # mnist数据集的存储位置 293 | import os 294 | home = os.path.expanduser('~') 295 | # train_path = home + '/ML/mnist/mnist_train.csv' 296 | # test_path = home + '/ML/mnist/mnist_test.csv' 297 | train_path = home + 
'/ML/mnist/mnist_train_samples.csv' 298 | test_path = home + '/ML/mnist/mnist_test_samples.csv' 299 | 300 | # 读取训练与测试集 301 | logging.info('Loading data....') 302 | 303 | traindataArr, trainlabelArr =loadData(train_path) 304 | testdataArr, testlabelArr = loadData(test_path) 305 | logging.info('Loading data done.') 306 | 307 | logging.info('Building a decision tree.') 308 | CART = CARTDecisionTree(traindataArr, trainlabelArr) 309 | 310 | logging.info('Using decision tree to predict one sample.') 311 | 312 | prediction = CART.predict(testdataArr[0]) 313 | logging.info('Testing processing Done,and the prediction and label are : ({},{})'.format(str(prediction), str(testlabelArr[0]))) 314 | 315 | # 测试朴决策树算法的准确率 316 | # 挑选测试集的前200个进行测试,防止运行时间过长 317 | logging.info('Testing /the decision model.') 318 | accuracy = CART.testModel(testdataArr[:200], testlabelArr[:200]) 319 | 320 | 321 | end = time.time() 322 | 323 | logging.info('accuracy:{}'.format(accuracy)) 324 | logging.info('Total Time: {}'.format(round(end-start), 4)) 325 | 326 | -------------------------------------------------------------------------------- /决策树/CART_decision_tree.log: -------------------------------------------------------------------------------- 1 | 07-28 16:08 INFO This is an info message. 2 | 07-28 16:08 INFO Loading data.... 3 | 07-28 16:08 INFO Loading data done. 4 | 07-28 16:08 INFO Building a decision tree. 5 | 07-28 16:08 INFO Using decision tree to predict one sample. 6 | 07-28 16:08 INFO Testing processing Done,and the prediction and label are : (7,7) 7 | 07-28 16:08 INFO Testing /the decision model. 8 | 07-28 16:08 INFO accuracy:0.5 9 | 07-28 16:08 INFO Total Time: 1 10 | -------------------------------------------------------------------------------- /决策树/ID3.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @time :2020/7/25 3 | # @IDE : pycharm 4 | # @author :lxztju 5 | # @github : https://github.com/lxztju 6 | 7 | ''' 8 | 构建决策树 9 | ID3算法实现决策树(不剪枝) 10 | ID3采用信息增益作为特征选择的标准 11 | ''' 12 | 13 | import numpy as np 14 | import logging 15 | import time 16 | import copy 17 | 18 | def loadData(fileName): 19 | ''' 20 | 加载Mnist数据集 21 | :param fileName:要加载的数据集路径 22 | :return: list形式的数据集及标记 23 | ''' 24 | # 存放数据及标记的list 25 | dataArr = [] 26 | labelArr = [] 27 | # 打开文件 28 | fr = open(fileName, 'r') 29 | # 将文件按行读取 30 | for line in fr.readlines(): 31 | # 对每一行数据按切割福','进行切割,返回字段列表 32 | curLine = line.strip().split(',') 33 | 34 | labelArr.append(int(curLine[0])) 35 | # 进行二值化处理,将大于128的标记为1, 小于128的标记为0 36 | dataArr.append([int(int(num)>128) for num in curLine[1:]]) 37 | # 存放标记 38 | # [int(num) for num in curLine[1:]] -> 遍历每一行中除了以第一个元素(标记)外将所有元素转换成int类型 39 | # [int(num)/255 for num in curLine[1:]] -> 将所有数据除255归一化(非必须步骤,可以不归一化) 40 | 41 | # 返回data和label 42 | return dataArr, labelArr 43 | 44 | 45 | 46 | 47 | class ID3DecisionTree: 48 | def __init__(self, traindataList, trainlabelList): 49 | ''' 50 | 初始化决策树类 51 | :param traindataList: 训练数据集的list形式 52 | :param trainlabelList: 训练数据集的label的list形式 53 | ''' 54 | self.traindataList = traindataList 55 | self.trainlabelList = trainlabelList 56 | self.traindataArr = np.array(self.traindataList) 57 | self.trainlabelArr = np.array(self.trainlabelList) 58 | 59 | 60 | self.tree = self.build_ID3tree(self.traindataArr, self.trainlabelArr) 61 | 62 | 63 | def calculate_empirical_entropy(self, trainLabelArr): 64 | ''' 65 | 计算训练数据集的经验熵,公式参考李航老师统计学习方法 66 | :param trainLabelArr: numpy格式的label 67 | :return: 返回训练集的经验熵 68 | ''' 69 | # 初始化经验熵为0 
70 | H_D = 0 71 | # 这里为什么不采用self.num_classes直接调用,我刚开始也是这么写的 72 | # 后来发现如果在后期的计算中,某个类别不出现,那么log0会出现错误(参考README.md参考链接中大佬的利用set的实现) 73 | labels = set([label for label in trainLabelArr]) 74 | for label in labels: 75 | 76 | # 根据公式需要计算每个类别的数目 77 | num = trainLabelArr[trainLabelArr==label].size 78 | # 计算每个类别占据数目占据整个数据集的比例 79 | p = num / trainLabelArr.size 80 | # 计算经验熵 81 | H_D += -1 *(p) * np.log2(p) 82 | 83 | return H_D 84 | 85 | 86 | 87 | 88 | def calculate_empirical_conditional_entropy(self, trainfeatureArr, trainlabelarr): 89 | ''' 90 | 计算经验条件熵 91 | :param trainfeatureArr: numpy格式的从数据集中抽离出某一个特征列 92 | :param trainlabelabelArr: numpy格式的label 93 | :return: 经验条件熵 94 | ''' 95 | 96 | # 经验熵是对每个特征进行计算,因此应该返回一个列表,对于每个特征都进行计算分析 97 | # 桶计算经验熵时一样,采用set来选取特针的不同取值 98 | features = set([feature for feature in trainfeatureArr]) 99 | H_D_A = 0 100 | for feature in features: 101 | # 计算取不同值时所包含的样本的数目 102 | Di = trainfeatureArr[trainfeatureArr == feature].size 103 | Di_D = Di / trainfeatureArr.size 104 | 105 | # 计算对于选取的特征取feature值时的条件熵 106 | 107 | H_D_A += Di_D * self.calculate_empirical_entropy(trainlabelarr[trainfeatureArr == feature]) 108 | 109 | return H_D_A 110 | 111 | 112 | def calculate_information_gain(self, traindataArr, trainlabelArr): 113 | ''' 114 | :param traindataArr: 当前数据集的数组,numpy格式,因为每次在构建决策树机型分支的过程中,随着决策树层数的加深当前数据集会比越变越小 115 | :param trainlabelArr: 当前数据集的label数组,numpy格式 116 | 计算最大的信息增益 117 | :return: 最大的信息增益及其对应的特征。 118 | ''' 119 | # 获取当前数据集的特征数目 120 | num_features = traindataArr.shape[1] 121 | max_feature, max_G_D_A = 0, 0 122 | # 计算当前数据集的经验熵 123 | H_D = self.calculate_empirical_entropy(trainlabelArr) 124 | # 计算每个特征的经验条件熵 125 | for i in range(num_features): 126 | trainfeatureArr = traindataArr[:,i] 127 | H_D_i = self.calculate_empirical_conditional_entropy(trainfeatureArr, trainlabelArr) 128 | G_D_A = H_D - H_D_i 129 | if G_D_A > max_G_D_A: 130 | max_G_D_A = G_D_A 131 | max_feature = i 132 | # 返回最大的信息增益,及其特征 133 | return max_feature, max_G_D_A 134 | 135 | 136 | def updateDataSet(self, traindataArr, trainlabelArr, A, a): 137 | ''' 138 | 在构建决策树的过程中,需要实时更新决策树的数据集 139 | :param traindataArr: 待更新的数据集,numpy格式 140 | :param trainlabelArr: 待更新的数据集label, numpy格式 141 | :param A: 需要删除的特征 142 | :param a: 对于需要删除的特征A,如果其取值为a,那说明这个样本需要保留(解释一下,例如对于是否有工作这个特征,a为有工作 143 | 那么所有有工作的样本需要保留。 144 | :return: 返回新的数据集及标签,numpy格式 145 | ''' 146 | newdataArr = np.delete(traindataArr[traindataArr[:,A] == a], A, axis=1) 147 | newlabelArr = trainlabelArr[traindataArr[:,A] == a] 148 | return newdataArr, newlabelArr 149 | 150 | 151 | def majorClass(self, trainlabelArr): 152 | ''' 153 | 在label中找到数量最多的类别 154 | :param trainlabelArr: 训练数据集的label, numpy格式的 155 | :return: 返回最大的类别 156 | ''' 157 | label = list(trainlabelArr) 158 | return max(label, key=label.count) 159 | 160 | 161 | def build_ID3tree(self, traindataArr, trainlabelArr): 162 | ''' 163 | 在数据集上递归构建决策树 164 | :param traindataArr: 当前节点为根节点对应的数据集 numpy 165 | :param trainlabelArr: 当前节点为根节点对应的数据集label numpy 166 | :return: 返回节点的值 167 | ''' 168 | # 信息增益的阈值 169 | epsilon = 0.1 170 | 171 | 172 | # logging.info('Starting create a new Node. 
Now there are {} samples'.format(trainlabelArr.size)) 173 | 174 | # 判断数据集此时的类别,如果只有一类,就范会对应的类别 175 | classDict = set(trainlabelArr) 176 | # print(classDict) 177 | if len(classDict) == 1: 178 | return int(classDict.pop()) 179 | # print(traindataArr.shape) 180 | # 判断数据集此时的的特征数目,如果没有特征集,那就说明没有特征进行分割,就放会这些样本中数目最多的类别 181 | if len(traindataArr.shape) == 1: 182 | return self.majorClass(trainlabelArr) 183 | # 计算最大增益及其对应的特征 184 | Ag, G_D_Ag = self.calculate_information_gain(traindataArr, trainlabelArr) 185 | # print(Ag, G_D_Ag) 186 | # 如果最大的信息增益小于设定的阈值,就直接返回数目最多的类,不必要进行分割 187 | if G_D_Ag < epsilon: 188 | return self.majorClass(trainlabelArr) 189 | 190 | tree = {Ag:{}} 191 | # 递归构建决策树 192 | features = set(feature for feature in traindataArr[:, Ag]) 193 | for feature in features: 194 | a = int(feature) 195 | newdataArr, newlabelArr = self.updateDataSet(traindataArr, trainlabelArr, Ag, a) 196 | 197 | tree[Ag][a] = self.build_ID3tree(newdataArr, newlabelArr) 198 | # print(tree) 199 | return tree 200 | 201 | def predict(self, testdataList): 202 | ''' 203 | 使用构建完成的决策树来预测对应的测试数据 204 | :param testdataList: 输入的行测试数据,list格式 205 | :return: 返回类别 206 | ''' 207 | tree = copy.deepcopy(self.tree) 208 | while True: 209 | if type(tree).__name__ != 'dict': 210 | return tree 211 | # print(tree.items()) 212 | (key, value), = tree.items() 213 | 214 | if type(tree[key]).__name__ == 'dict': 215 | dataval = testdataList[key] 216 | 217 | del testdataList[key] 218 | tree = value[dataval] 219 | 220 | if type(tree).__name__ != 'dict': 221 | return tree 222 | 223 | else: 224 | return value 225 | 226 | 227 | 228 | def testModel(self, testdataList, testlabelList): 229 | ''' 230 | 测试决策树模型的准确率 231 | :param testdataList: 输入测试集的数据 232 | :param testlabelList: 输入测试集数据的label 233 | :return: 准确率accuracy 234 | ''' 235 | # 236 | correct_num = 0 237 | 238 | for i in range(len(testdataList)): 239 | prediction = self.predict(testdataList[i]) 240 | if prediction == testlabelList[i]: 241 | correct_num += 1 242 | 243 | return round(correct_num/len(testlabelList), 4) 244 | 245 | 246 | 247 | 248 | 249 | 250 | if __name__ == '__main__': 251 | 252 | # 定义一个日志模块来保存日志 253 | logging.basicConfig(level=logging.DEBUG, 254 | format='%(asctime)-12s %(levelname)-8s %(message)s', 255 | datefmt='%m-%d %H:%M', 256 | filename='ID3_decision_tree.log', 257 | filemode='w') # filemode默认为a,追加信息到日志文件,指定为‘w',重新写入 文件,之前的文件信息丢失 258 | # 定义一个handler来将信息输出到控制台,StreamHandler与FileHandler 259 | console = logging.StreamHandler() 260 | console.setLevel(logging.INFO) 261 | # 设置在控制台输出格式[- 262 | formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s') 263 | console.setFormatter(formatter) 264 | # 将handler加入到根记录器 265 | logging.getLogger('').addHandler(console) 266 | 267 | # 根记录器输出信息 268 | logging.info('This is an info message.') 269 | 270 | 271 | start = time.time() 272 | 273 | 274 | # mnist数据集的存储位置 275 | import os 276 | home = os.path.expanduser('~') 277 | train_path = home + '/ML/mnist/mnist_train.csv' 278 | test_path = home + '/ML/mnist/mnist_test.csv' 279 | # train_path = home + '/ML/mnist/mnist_train_samples.csv' 280 | # test_path = home + '/ML/mnist/mnist_test_samples.csv' 281 | 282 | # 读取训练与测试集 283 | logging.info('Loading data....') 284 | 285 | traindataArr, trainlabelArr =loadData(train_path) 286 | testdataArr, testlabelArr = loadData(test_path) 287 | logging.info('Loading data done.') 288 | 289 | logging.info('Building a decision tree.') 290 | ID3 = ID3DecisionTree(traindataArr, trainlabelArr) 291 | 292 | logging.info('Using decision tree to predict one 
sample.') 293 | 294 | prediction = ID3.predict(testdataArr[0]) 295 | logging.info('Testing processing Done,and the prediction and label are : ({},{})'.format(str(prediction), str(testlabelArr[0]))) 296 | 297 | #测试朴决策树算法的准确率 298 | # 挑选测试集的前200个进行测试,防止运行时间过长 299 | logging.info('Testing the decision model.') 300 | accuracy = ID3.testModel(testdataArr[:200], testlabelArr[:200]) 301 | 302 | 303 | end = time.time() 304 | 305 | logging.info('accuracy:{}'.format(accuracy)) 306 | logging.info('Total Time: {}'.format(round(end-start), 4)) 307 | 308 | -------------------------------------------------------------------------------- /决策树/ID3_decision_tree.log: -------------------------------------------------------------------------------- 1 | 07-27 15:51 INFO This is an info message. 2 | 07-27 15:51 INFO Loading data.... 3 | 07-27 15:51 INFO Loading data done. 4 | 07-27 15:51 INFO Building a decision tree. 5 | 07-27 15:54 INFO Using decision tree to predict one sample. 6 | 07-27 15:54 INFO Testing processing Done,and the prediction and label are : (7,7) 7 | 07-27 15:54 INFO Testing the decision model. 8 | 07-27 15:54 INFO accuracy:0.87 9 | 07-27 15:54 INFO Total Time: 176 10 | -------------------------------------------------------------------------------- /感知机算法/perceptron.log: -------------------------------------------------------------------------------- 1 | 07-24 15:23 INFO this is an info message. 2 | 07-24 15:23 INFO Loading data.... 3 | 07-24 15:23 INFO Loading data done. 4 | 07-24 15:23 INFO Start training... 5 | 07-24 15:23 INFO train data shape is:(12665,784) 6 | 07-24 15:23 INFO Iteration:0 / 50 7 | 07-24 15:23 INFO Iteration:1 / 50 8 | 07-24 15:23 INFO Iteration:2 / 50 9 | 07-24 15:23 INFO Iteration:3 / 50 10 | 07-24 15:23 INFO Iteration:4 / 50 11 | 07-24 15:23 INFO Iteration:5 / 50 12 | 07-24 15:23 INFO Iteration:6 / 50 13 | 07-24 15:23 INFO Iteration:7 / 50 14 | 07-24 15:23 INFO Iteration:8 / 50 15 | 07-24 15:23 INFO Iteration:9 / 50 16 | 07-24 15:23 INFO Iteration:10 / 50 17 | 07-24 15:23 INFO Iteration:11 / 50 18 | 07-24 15:23 INFO Iteration:12 / 50 19 | 07-24 15:23 INFO Iteration:13 / 50 20 | 07-24 15:23 INFO Iteration:14 / 50 21 | 07-24 15:23 INFO Iteration:15 / 50 22 | 07-24 15:23 INFO Iteration:16 / 50 23 | 07-24 15:23 INFO Iteration:17 / 50 24 | 07-24 15:23 INFO Iteration:18 / 50 25 | 07-24 15:23 INFO Iteration:19 / 50 26 | 07-24 15:23 INFO Iteration:20 / 50 27 | 07-24 15:23 INFO Iteration:21 / 50 28 | 07-24 15:23 INFO Iteration:22 / 50 29 | 07-24 15:23 INFO Iteration:23 / 50 30 | 07-24 15:23 INFO Iteration:24 / 50 31 | 07-24 15:23 INFO Iteration:25 / 50 32 | 07-24 15:23 INFO Iteration:26 / 50 33 | 07-24 15:23 INFO Iteration:27 / 50 34 | 07-24 15:23 INFO Iteration:28 / 50 35 | 07-24 15:23 INFO Iteration:29 / 50 36 | 07-24 15:23 INFO Iteration:30 / 50 37 | 07-24 15:23 INFO Iteration:31 / 50 38 | 07-24 15:23 INFO Iteration:32 / 50 39 | 07-24 15:23 INFO Iteration:33 / 50 40 | 07-24 15:23 INFO Iteration:34 / 50 41 | 07-24 15:23 INFO Iteration:35 / 50 42 | 07-24 15:23 INFO Iteration:36 / 50 43 | 07-24 15:23 INFO Iteration:37 / 50 44 | 07-24 15:23 INFO Iteration:38 / 50 45 | 07-24 15:23 INFO Iteration:39 / 50 46 | 07-24 15:23 INFO Iteration:40 / 50 47 | 07-24 15:23 INFO Iteration:41 / 50 48 | 07-24 15:23 INFO Iteration:42 / 50 49 | 07-24 15:23 INFO Iteration:43 / 50 50 | 07-24 15:23 INFO Iteration:44 / 50 51 | 07-24 15:23 INFO Iteration:45 / 50 52 | 07-24 15:23 INFO Iteration:46 / 50 53 | 07-24 15:23 INFO Iteration:47 / 50 54 | 07-24 15:23 INFO Iteration:48 / 50 55 | 07-24 15:23 INFO 
Iteration:49 / 50 56 | 07-24 15:23 INFO Training done. 57 | 07-24 15:23 INFO Testing this model. 58 | 07-24 15:23 INFO accuracy:0.9916 59 | 07-24 15:23 INFO Total Time:18.173146724700928 60 | -------------------------------------------------------------------------------- /感知机算法/perceptron.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @time :2020/7/23 3 | # @IDE : pycharm 4 | # @author :lxztju 5 | # @github : https://github.com/lxztju 6 | 7 | 8 | ''' 9 | mnist为10分类,因为感知机算法为二分类 10 | 因此挑选其中0,1这两类的数据进行训练 11 | ''' 12 | 13 | import numpy as np 14 | import time 15 | import logging 16 | 17 | 18 | def loadData(fileName): 19 | ''' 20 | 加载Mnist数据集 21 | :param fileName:要加载的数据集路径 22 | :return: list形式的数据集及标记 23 | ''' 24 | # 存放数据及标记的list 25 | dataArr = [] 26 | labelArr = [] 27 | # 打开文件 28 | fr = open(fileName, 'r') 29 | # 将文件按行读取 30 | for line in fr.readlines(): 31 | # 对每一行数据按切割福','进行切割,返回字段列表 32 | curLine = line.strip().split(',') 33 | 34 | # Mnsit有0-9是个标记,由于是二分类任务,所以仅仅挑选其中的0和1两类作为正负类进行分类 35 | # if int(curLine[0]) != 0 or int(curLine[0]) !=1: continue 36 | if int(curLine[0]) == 0 or int(curLine[0]) == 1: 37 | if int(curLine[0]) == 0: 38 | labelArr.append(1) 39 | else: 40 | labelArr.append(-1) 41 | dataArr.append([int(num) / 255 for num in curLine[1:]]) 42 | # 存放标记 43 | # [int(num) for num in curLine[1:]] -> 遍历每一行中除了以第一个元素(标记)外将所有元素转换成int类型 44 | # [int(num)/255 for num in curLine[1:]] -> 将所有数据除255归一化(非必须步骤,可以不归一化) 45 | # dataArr.append([int(num)/255 for num in curLine[1:]]) 46 | 47 | # 返回data和label 48 | return dataArr, labelArr 49 | 50 | 51 | class Perceptron: 52 | def __init__(self): 53 | pass 54 | 55 | 56 | 57 | 58 | 59 | 60 | def perceptron(self, dataArr, labelArr, iters): 61 | ''' 62 | 构建爱呢感知机算法,其中loss function采用错误分类点的个数 63 | :param dataArr: 输入list格式的训练集 64 | :param labelArr: 输入list格式的训练集标签数据 65 | :param iters: 需要迭代的次数(因为数据集不保证线性可分,因此需要设置一定的迭代次数) 66 | :return: w, b 返回超平面的参数 67 | ''' 68 | 69 | # 数据转换为numpy格式,方便进行矩阵运算 70 | dataMat = np.mat(dataArr) 71 | labelMat = np.mat(labelArr).T 72 | # print(dataMat.shape) 73 | # print(labelMat.shape) 74 | # 训练数据的维度大小 75 | m, n = dataMat.shape 76 | logging.info('train data shape is:({},{})'.format(m,n)) 77 | 78 | # 初始化为w,b 79 | W = np.random.randn(1, n) 80 | b = 0 81 | 82 | # 设置学习率(迭代步长) 83 | lr = 0.0001 84 | 85 | # 进行迭代训练 86 | for iteration in range(iters): 87 | 88 | # 采用sgd的方法进行权重的更新,每次选取一个错误样本更新w, b 89 | # 一共含有m个样本 90 | for i in range(m): 91 | # 选择某个样本 92 | xi = dataMat[i] 93 | yi = labelMat[i] 94 | # 如果分类正确,那么继续寻找下一个样本 95 | if yi * (W * xi.T + b) > 0: continue 96 | # 找到错误样本,更新模型参数 97 | W = W + lr * yi * xi 98 | b = b + lr * yi 99 | 100 | logging.info("Iteration:{} / {}".format(iteration, iters)) 101 | 102 | return W, b 103 | 104 | 105 | 106 | def testPerceptron(self, dataArr, labelArr, W, b): 107 | ''' 108 | 测试训练得到的感知机模型的准确性 109 | :param dataArr: 输入list格式的测试集数据 110 | :param labelArr: 输入list格式的测试集数据标签 111 | :param w: 感知器模型超平面的法相量参数 112 | :param b: 感知机模型的偏置 113 | :return: 感知机模型在测试集的准确率 114 | ''' 115 | 116 | # 数据转换为numpy格式,方便进行矩阵运算 117 | dataMat = np.mat(dataArr) 118 | labelMat = np.mat(labelArr).T 119 | 120 | # 测试集的维度大小 121 | m ,n = dataMat.shape 122 | 123 | # 正确分类的样本的数目 124 | correct_num = 0 125 | 126 | # 遍历所有的测试样本,查找其中的正确分类样本个数 127 | for i in range(m): 128 | xi = dataMat[i] 129 | yi = labelMat[i] 130 | 131 | if (W * xi.T + b) * yi > 0: 132 | correct_num += 1 133 | 134 | return round(correct_num/m, 4) 135 | 136 | 137 | 138 | 139 | 140 | 141 | if __name__ == '__main__': 142 | 143 | 144 | # 
Set up a logging module that saves messages to a file
145 |     logging.basicConfig(level=logging.DEBUG,
146 |                         format='%(asctime)-12s %(levelname)-8s %(message)s',
147 |                         datefmt='%m-%d %H:%M',
148 |                         filename='perceptron.log',
149 |                         filemode='w') # filemode defaults to 'a' (append); 'w' rewrites the file and discards its previous contents
150 |     # Define a handler that also prints messages to the console (StreamHandler, as opposed to FileHandler)
151 |     console = logging.StreamHandler()
152 |     console.setLevel(logging.INFO)
153 |     # Set the console output format
154 |     formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
155 |     console.setFormatter(formatter)
156 |     # Attach the handler to the root logger
157 |     logging.getLogger('').addHandler(console)
158 | 
159 |     # Log through the root logger
160 |     logging.info('this is an info message.')
161 | 
162 |     ######################################################
163 | 
164 |     start = time.time()
165 | 
166 | 
167 |     # Location of the mnist dataset
168 |     import os
169 |     home = os.path.expanduser('~')
170 |     train_path = home + '/ML/mnist/mnist_train.csv'
171 |     test_path = home + '/ML/mnist/mnist_test.csv'
172 | 
173 |     # Load the training and test sets
174 |     logging.info('Loading data....')
175 | 
176 | 
177 | 
178 |     p = Perceptron()
179 | 
180 |     train_data_array, train_label_array = loadData(train_path)
181 |     test_data_array, test_label_array = loadData(test_path)
182 |     logging.info('Loading data done.')
183 | 
184 |     # Train the perceptron
185 |     logging.info('Start training...')
186 |     iters = 50
187 |     w, b = p.perceptron(train_data_array, train_label_array, iters)
188 |     logging.info('Training done.')
189 | 
190 |     # Evaluate the accuracy of the perceptron
191 |     logging.info('Testing this model.')
192 |     accuracy = p.testPerceptron(test_data_array, test_label_array, w, b)
193 | 
194 |     end = time.time()
195 | 
196 |     logging.info('accuracy:{}'.format(accuracy))
197 |     logging.info('Total Time:{}'.format(end-start))
-------------------------------------------------------------------------------- /支持向量机/SVM.log: --------------------------------------------------------------------------------
1 | 08-01 15:20 INFO This is an info message.
2 | 08-01 15:20 INFO Loading data....
3 | 08-01 15:20 INFO Loading data done.
4 | 08-01 15:20 INFO Training the SVM model....
5 | 08-01 15:20 INFO Construct The Gaussian Kernel: (0/1000).
6 | 08-01 15:20 INFO Construct The Gaussian Kernel: (100/1000).
7 | 08-01 15:20 INFO Construct The Gaussian Kernel: (200/1000).
8 | 08-01 15:20 INFO Construct The Gaussian Kernel: (300/1000).
9 | 08-01 15:20 INFO Construct The Gaussian Kernel: (400/1000).
10 | 08-01 15:20 INFO Construct The Gaussian Kernel: (500/1000).
11 | 08-01 15:20 INFO Construct The Gaussian Kernel: (600/1000).
12 | 08-01 15:20 INFO Construct The Gaussian Kernel: (700/1000).
13 | 08-01 15:20 INFO Construct The Gaussian Kernel: (800/1000).
14 | 08-01 15:20 INFO Construct The Gaussian Kernel: (900/1000).
15 | 08-01 15:20 INFO Iter:0/13
16 | 08-01 15:20 INFO Iter:1/13
17 | 08-01 15:20 INFO Training process is Done !!!!
18 | 08-01 15:20 INFO Predicting one sample ....
19 | 08-01 15:20 INFO The prediction and the ground truth is : (-1.0, -1) 20 | 08-01 15:20 INFO Testing processing: (0/2115) and the currect prediction:0 21 | 08-01 15:20 INFO Testing processing: (100/2115) and the currect prediction:98 22 | 08-01 15:20 INFO Testing processing: (200/2115) and the currect prediction:197 23 | 08-01 15:20 INFO Testing processing: (300/2115) and the currect prediction:297 24 | 08-01 15:20 INFO Testing processing: (400/2115) and the currect prediction:397 25 | 08-01 15:20 INFO Testing processing: (500/2115) and the currect prediction:495 26 | 08-01 15:20 INFO Testing processing: (600/2115) and the currect prediction:594 27 | 08-01 15:20 INFO Testing processing: (700/2115) and the currect prediction:693 28 | 08-01 15:20 INFO Testing processing: (800/2115) and the currect prediction:790 29 | 08-01 15:20 INFO Testing processing: (900/2115) and the currect prediction:888 30 | 08-01 15:20 INFO Testing processing: (1000/2115) and the currect prediction:987 31 | 08-01 15:20 INFO Testing processing: (1100/2115) and the currect prediction:1086 32 | 08-01 15:20 INFO Testing processing: (1200/2115) and the currect prediction:1186 33 | 08-01 15:20 INFO Testing processing: (1300/2115) and the currect prediction:1286 34 | 08-01 15:20 INFO Testing processing: (1400/2115) and the currect prediction:1385 35 | 08-01 15:20 INFO Testing processing: (1500/2115) and the currect prediction:1485 36 | 08-01 15:20 INFO Testing processing: (1600/2115) and the currect prediction:1584 37 | 08-01 15:20 INFO Testing processing: (1700/2115) and the currect prediction:1684 38 | 08-01 15:20 INFO Testing processing: (1800/2115) and the currect prediction:1784 39 | 08-01 15:20 INFO Testing processing: (1900/2115) and the currect prediction:1883 40 | 08-01 15:20 INFO Testing processing: (2000/2115) and the currect prediction:1983 41 | 08-01 15:20 INFO Testing processing: (2100/2115) and the currect prediction:2080 42 | 08-01 15:20 INFO accuracy:99.0544 43 | 08-01 15:20 INFO Total Time: 31 44 | -------------------------------------------------------------------------------- /支持向量机/SVM.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @time :2020/7/30 3 | # @IDE : pycharm 4 | # @author :lxztju 5 | # @github : https://github.com/lxztju 6 | # @Emial : lxztju@163.com 7 | 8 | ''' 9 | SVM的python实现 10 | 实现软间隔与核函数的非线性SVM二分类器 11 | 12 | 利用SMO算法进行训练 13 | ''' 14 | 15 | import numpy as np 16 | import logging 17 | import time 18 | import random 19 | import math 20 | 21 | def loadData(fileName): 22 | ''' 23 | 加载Mnist数据集 24 | :param fileName:要加载的数据集路径 25 | :return: list形式的数据集及标记 26 | ''' 27 | # 存放数据及标记的list 28 | dataArr = [] 29 | labelArr = [] 30 | # 打开文件 31 | fr = open(fileName, 'r') 32 | # 将文件按行读取 33 | for line in fr.readlines(): 34 | # 对每一行数据按切割福','进行切割,返回字段列表 35 | curLine = line.strip().split(',') 36 | 37 | # Mnsit有0-9是个标记,由于是二分类任务,所以仅仅挑选其中的0和1两类作为正负类进行分类 38 | # if int(curLine[0]) != 0 or int(curLine[0]) !=1: continue 39 | if int(curLine[0]) == 0 or int(curLine[0]) == 1: 40 | if int(curLine[0]) == 0: 41 | labelArr.append(1) 42 | else: 43 | labelArr.append(-1) 44 | dataArr.append([int(num) / 255 for num in curLine[1:]]) 45 | # 存放标记 46 | # [int(num) for num in curLine[1:]] -> 遍历每一行中除了以第一个元素(标记)外将所有元素转换成int类型 47 | # [int(num)/255 for num in curLine[1:]] -> 将所有数据除255归一化(非必须步骤,可以不归一化) 48 | # dataArr.append([int(num)/255 for num in curLine[1:]]) 49 | 50 | # 返回data和label 51 | return dataArr, labelArr 52 | 53 | 54 | 55 | class SVM: 56 | def 
__init__(self, traindataList, trainlabelList, sigma = 10, C = 200, toler = 0.001): 57 | ''' 58 | SVM类的参数初始化 59 | :param traindataList: 训练数据集的LIst格式 60 | :param trainlabelList: 训练数据集label的List格式 61 | :param sigma: 高斯核的参数 62 | :param C: 软间隔的惩罚参数 63 | :param toler: 松弛变量 64 | ''' 65 | self.traindataArr = np.array(traindataList) # 训练数据集转换为array格式 66 | self.trainlabelArr = np.array(trainlabelList).T # 训练数据集的label转换为array格式,进行转置变成列向量 67 | self.m, self.n = self.traindataArr.shape # m为训练集的样本个数, n为特征的个数 68 | 69 | self.sigma = sigma # 高斯核中的参数 70 | self.C = C #软间隔的惩罚参数 71 | self.toler = toler # 松弛变量 72 | self.b = 0 # SVM中的偏置项 73 | self.alpha = [1] * self.traindataArr.shape[0] # SVM对偶问题中的alpha 74 | self.kernel = self.calcKernel() # 核函数矩阵 75 | self.E = [self.calc_Ei(i) for i in range(self.m)] #SMO运算过程中的Ei 76 | # print(self.E) 77 | self.supportVecIndex = [] # 保存支持向量的索引 78 | 79 | 80 | 81 | 82 | def calcKernel(self): 83 | ''' 84 | 计算核函数矩阵,采用高斯核 85 | :return: 高斯核矩阵 86 | ''' 87 | 88 | # 高斯核矩阵的大小为m×m 89 | K = [[0] * self.m for _ in range(self.m)] 90 | 91 | # 遍历Xi, 这个相当于核函数方程中的x 92 | for i in range(self.m): 93 | 94 | if i % 100 == 0: 95 | logging.info('Construct The Gaussian Kernel: ({}/{}).'.format(i, self.m)) 96 | 97 | Xi = self.traindataArr[i] 98 | #遍历Xj,相当于公式中的Z 99 | for j in range(self.m): 100 | Xj = self.traindataArr[j] 101 | # 计算||xi-xj||^2 102 | diff = np.dot((Xi - Xj), (Xi - Xj).T) 103 | # nisan高斯核参数矩阵 104 | K[i][j] = np.exp((-1/2) * (diff/(self.sigma ** 2 ))) 105 | 106 | # 返回高斯核 107 | return K 108 | 109 | 110 | 111 | def calc_gxi(self, i): 112 | ''' 113 | 根据7.104的公式计算g(xi) 114 | :param i: x的下标 115 | :return: 返回g(xi)的值 116 | ''' 117 | gxi = 0 118 | for j in range(len(self.alpha)): 119 | gxi += self.alpha[j] * self.trainlabelArr[j] * self.kernel[i][j] 120 | 121 | return gxi + self.b 122 | 123 | 124 | 125 | def calc_Ei(self, i): 126 | ''' 127 | 计算公式7.104,计算Ei 128 | :param i: 下标 129 | :return: Ei 130 | ''' 131 | gxi = self.calc_gxi(i) 132 | return gxi - self.trainlabelArr[i] 133 | 134 | 135 | 136 | def isSatisfyKKT(self, i): 137 | ''' 138 | 判断第i个alpha是否满足KKT条件, 因为在SMO算法中 139 | 第一个alpha的选取采用最不符合KKT条件的哪一个 140 | :param i: alpha的下标i 141 | :return: True or False 142 | ''' 143 | gxi = self.calc_gxi(i) 144 | yi = self.trainlabelArr[i] 145 | multiply = gxi * yi 146 | alpha_i = self.alpha[i] 147 | 148 | # 书中采用的是alpha等于0,但是可以进行松弛操作 149 | # if alpha_i == 0: 150 | if (abs(self.alpha[i]) < self.toler) and (multiply >= 1): 151 | return True 152 | # 哦嗯样均采用松弛之后的 153 | # if alpha_i == self.C: 154 | if abs(self.alpha[i] - self.C) < self.toler and (multiply <= 1): 155 | return True 156 | 157 | #if 0 < alpha_i < self.C: 158 | if (self.alpha[i] > -self.toler) and (self.alpha[i] < (self.C + self.toler)) and (multiply < 1 + self.toler): 159 | return True 160 | 161 | return False 162 | 163 | 164 | 165 | def getAlpha(self): 166 | ''' 167 | SMO算法的2个变量 168 | :return: 返回E1, E2, i, j 169 | ''' 170 | # 首先遍历所有支持向量点,如果全部满足KKt条件,然后再去所有的数据集中查找 171 | index_list = [i for i in range(self.m) if 0 < self.alpha[i] < self.C] 172 | non_satisfy_list = [i for i in range(self.m) if i not in index_list] 173 | index_list.extend(non_satisfy_list) 174 | 175 | for i in index_list: 176 | if self.isSatisfyKKT(i): 177 | continue 178 | E1 = self.E[i] 179 | 180 | # 如果E1为正,你那么找到最小的E作为E2保证|E1-E2|最大 181 | E = {k:v for v, k in enumerate(self.E)} 182 | E_ = sorted(E.items(), key=lambda item: item[0]) 183 | 184 | if E1 >= 0: 185 | j = E_[0][1] 186 | # 如果找到的j与i相同,此时i代表的值最小,因此选择下一个值,如果不进行处理,使得i, j相同,那么后边会出现错误 187 | if j == i: 188 | j = E_[1][1] 189 | # j = 
min(range(self.m), key = lambda x:self.E[x])
190 |         # If E1 is negative, pick the largest E as E2 so that |E1-E2| is as large as possible
191 |         else:
192 |             j = E_[-1][1]
193 |             if j == i:
194 |                 j = E_[-2][1]
195 |             # j = max(range(self.m), key = lambda x:self.E[x])
196 |         # print(type(i), type(j))
197 |         j = int(j)
198 |         E2 = self.E[j]
199 |         return E1, E2, i, j
200 | 
201 | 
202 |     def train(self, iter = 100):
203 |         '''
204 |         Train the SVM classifier
205 |         :param iter: maximum number of iterations
206 |         :return: no return value; trains the SVM in place
207 |         '''
208 |         iterStep = 0 # iteration counter; training is stopped once it exceeds the limit even without convergence
209 |         parameterChanged = 1 # flag marking whether any parameter changed; 1 if it did, otherwise the algorithm has converged
210 | 
211 |         # Iteratively train the SVM
212 |         while iterStep < iter and parameterChanged > 0:
213 |             logging.info('Iter:{}/{}'.format(iterStep, iter))
214 | 
215 |             iterStep += 1
216 |             # Reset the change flag to 0; if a parameter changes below, training is still making progress and the flag is set back to 1
217 |             parameterChanged = 0
218 | 
219 |             # Pick the two variables to update with SMO
220 |             E1, E2, i, j = self.getAlpha()
221 | 
222 |             y1 = self.trainlabelArr[i]
223 |             y2 = self.trainlabelArr[j]
224 | 
225 |             alpha1Old = self.alpha[i]
226 |             alpha2Old = self.alpha[j]
227 | 
228 |             # Compute the clipping bounds L and H
229 |             if y1 == y2:
230 |                 L = max(0, alpha2Old+alpha1Old-self.C)
231 |                 H = min(self.C, alpha2Old + alpha1Old)
232 |             else:
233 |                 L = max(0, alpha2Old-alpha1Old)
234 |                 H = min(self.C, self.C + alpha2Old - alpha1Old)
235 | 
236 |             if L == H:
237 |                 continue
238 |             # print(L, H, alpha1Old, alpha2Old)
239 |             k11 = self.kernel[i][i]
240 |             k22 = self.kernel[j][j]
241 |             k12 = self.kernel[i][j]
242 |             k21 = self.kernel[j][i]
243 | 
244 |             eta = (k11 + k22 - 2*k12)
245 | 
246 |             # If eta is not positive, it would break the division below
247 |             if eta <= 0:
248 |                 continue
249 | 
250 |             alpha2NewUnc = alpha2Old + y2 * (E1-E2)/ eta
251 |             # print(E1, E2, eta, alpha2Old, alpha2NewUnc)
252 |             if alpha2NewUnc < L:
253 |                 alpha2New = L
254 |             elif alpha2NewUnc > H:
255 |                 alpha2New = H
256 |             else:
257 |                 alpha2New = alpha2NewUnc
258 |             # print(alpha2New, alpha2Old)
259 |             alpha1New = alpha1Old + y1 * y2 * (alpha2Old - alpha2New)
260 | 
261 |             b1New = -1 * E1 - y1 * k11 * (alpha1New - alpha1Old) \
262 |                     - y2 * k21*(alpha2New - alpha2Old) + self.b
263 | 
264 |             b2New = -1 * E2 - y1 * k12 * (alpha1New - alpha1Old) \
265 |                     - y2 * k22 * (alpha2New - alpha2Old) + self.b
266 | 
267 |             # Choose the new b according to where α1 and α2 fall in [0, C]
268 |             if (alpha1New > 0) and (alpha1New < self.C):
269 |                 bNew = b1New
270 |             elif (alpha2New > 0) and (alpha2New < self.C):
271 |                 bNew = b2New
272 |             else:
273 |                 bNew = (b1New + b2New) / 2
274 | 
275 |             self.alpha[i] = alpha1New
276 |             self.alpha[j] = alpha2New
277 |             self.b = bNew
278 | 
279 |             self.E[i] = self.calc_Ei(i)
280 |             self.E[j] = self.calc_Ei(j)
281 |             # parameterChanged = 1
282 |             # print(abs(alpha2New - alpha2Old))
283 |             # If the change in α2 is tiny, treat the parameter as unchanged and leave parameterChanged alone;
284 |             # otherwise set it back to 1
285 |             if abs(alpha2New - alpha2Old) >= 0.00001:
286 |                 parameterChanged = 1
287 |             # break
288 |         # After training, sweep over α once more to collect the support vectors
289 |         for i in range(self.m):
290 |             # If α > 0, the sample is a support vector
291 |             if self.alpha[i] > 0:
292 |                 # Save the index of the support vector
293 |                 self.supportVecIndex.append(i)
294 | 
295 |         logging.info('Training process is Done !!!!')
296 | 
297 | 
298 |     def predict(self, x):
299 |         '''
300 |         Compute the output for a single input sample
301 |         :param x: the sample to predict, in list format
302 |         :return: the predicted label
303 |         '''
304 |         x = np.array(x)
305 | 
306 |         result = 0
307 |         ## Only the support vectors contribute to the decision function
308 |         for i in self.supportVecIndex:
309 |             x1 = self.traindataArr[i]
310 |             diff = np.dot((x1 - x), (x1 - x).T)
311 |             k = np.exp((-1/2) * diff /(self.sigma ** 2))
312 |             result += self.alpha[i] * self.trainlabelArr[i] * k
313 |         result += self.b
314 |         return np.sign(result)
315 | 
316 | 
317 |     def testModel(self, testdataList, testlabelList):
318 |         '''
319 |         Evaluate the accuracy of the model
320 |         :param testdataList: the test set samples, in list format
321 |         :param testlabelList: 
输入测试集的label, list格式 322 | :return: 返回预测的准确率 323 | ''' 324 | correct_num = 0 325 | 326 | for i in range(len(testlabelList)): 327 | # print(self.predict(testdataList[i])) 328 | if i % 100== 0: 329 | logging.info('Testing processing: ({}/{}) and the currect prediction:{}'.format(i, len(testdataList), correct_num)) 330 | if self.predict(testdataList[i]) == testlabelList[i]: 331 | correct_num += 1 332 | return round(correct_num / len(testlabelList)* 100, 4) 333 | 334 | 335 | 336 | 337 | 338 | if __name__ == '__main__': 339 | # 定义一个日志模块来保存日志 340 | logging.basicConfig(level=logging.DEBUG, 341 | format='%(asctime)-12s %(levelname)-8s %(message)s', 342 | datefmt='%m-%d %H:%M', 343 | filename='SVM.log', 344 | filemode='w') # filemode默认为a,追加信息到日志文件,指定为‘w',重新写入 文件,之前的文件信息丢失 345 | # 定义一个handler来将信息输出到控制台,StreamHandler与FileHandler 346 | console = logging.StreamHandler() 347 | console.setLevel(logging.INFO) 348 | # 设置在控制台输出格式[- 349 | formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s') 350 | console.setFormatter(formatter) 351 | # 将handler加入到根记录器 352 | logging.getLogger('').addHandler(console) 353 | 354 | # 根记录器输出信息 355 | logging.info('This is an info message.') 356 | 357 | start = time.time() 358 | 359 | # mnist数据集的存储位置 360 | import os 361 | home = os.path.expanduser('~') 362 | train_path = home + '/ML/mnist/mnist_train.csv' 363 | test_path = home + '/ML/mnist/mnist_test.csv' 364 | # train_path = home + '/ML/mnist/mnist_train_samples.csv' 365 | # test_path = home + '/ML/mnist/mnist_test_samples.csv' 366 | 367 | # 读取训练与测试集 368 | logging.info('Loading data....') 369 | 370 | traindataList, trainlabelList = loadData(train_path) 371 | testdataList, testlabelList = loadData(test_path) 372 | logging.info('Loading data done.') 373 | 374 | logging.info('Training the SVM model....') 375 | 376 | svm = SVM(traindataList[:1000], trainlabelList[:1000]) 377 | 378 | 379 | svm.train() 380 | 381 | logging.info('Predicting one sample ....') 382 | prediction = svm.predict(testdataList[0]) 383 | logging.info('The prediction and the ground truth is : ({}, {})'.format(prediction, testlabelList[0])) 384 | 385 | # 测试SVM算法的准确率 386 | # 挑选测试集的前200个进行测试,防止运行时间过长 387 | accuracy = svm.testModel(testdataList, testlabelList) 388 | 389 | end = time.time() 390 | 391 | logging.info('accuracy:{}'.format(accuracy)) 392 | logging.info('Total Time: {}'.format(round(end - start), 4)) 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | -------------------------------------------------------------------------------- /最大熵模型/maxEntropy.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @time :2020/7/29 3 | # @IDE : pycharm 4 | # @author :lxztju 5 | # @github : https://github.com/lxztju 6 | 7 | 8 | '''' 9 | 构建最大熵模型,并采用mnist数据集进行训练策测试 10 | 11 | -------------------------- 12 | 没搞懂 13 | ''' 14 | import numpy as np 15 | import logging 16 | import time 17 | 18 | 19 | def loadData(filename): 20 | ''' 21 | 加载mnist数据集 22 | :param filename: 待加载的数据集路径 23 | :return: 返回加载后的数据集list 24 | ''' 25 | dataList = [] 26 | labelList = [] 27 | 28 | f = open(filename, 'r') 29 | 30 | for line in f.readlines(): 31 | 32 | curdata = line.strip().split(',') 33 | 34 | labelList.append(int(curdata[0])) 35 | 36 | dataList.append([int(int(value)>128) for value in curdata[1:]]) 37 | 38 | return dataList, labelList 39 | 40 | 41 | 42 | 43 | class MaxEntropy: 44 | def __init__(self, traindataList, trainlabelList): 45 
| 
46 |         self.traindataArr = np.array(traindataList)
47 |         self.trainlabelArr = np.array(trainlabelList)
48 | 
49 | 
50 | 
51 | if __name__ == '__main__':
52 |     # Set up a logging module that saves messages to a file
53 |     logging.basicConfig(level=logging.DEBUG,
54 |                         format='%(asctime)-12s %(levelname)-8s %(message)s',
55 |                         datefmt='%m-%d %H:%M',
56 |                         filename='maxEntropy.log',
57 |                         filemode='w') # filemode defaults to 'a' (append); 'w' rewrites the file and discards its previous contents
58 |     # Define a handler that also prints messages to the console (StreamHandler, as opposed to FileHandler)
59 |     console = logging.StreamHandler()
60 |     console.setLevel(logging.INFO)
61 |     # Set the console output format
62 |     formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
63 |     console.setFormatter(formatter)
64 |     # Attach the handler to the root logger
65 |     logging.getLogger('').addHandler(console)
66 | 
67 |     # Log through the root logger
68 |     logging.info('This is an info message.')
69 | 
70 |     start = time.time()
71 | 
72 |     # Location of the mnist dataset
73 |     import os
74 |     home = os.path.expanduser('~')
75 |     train_path = home + '/ML/mnist/mnist_train.csv'
76 |     test_path = home + '/ML/mnist/mnist_test.csv'
77 |     # train_path = home + '/ML/mnist/mnist_train_samples.csv'
78 |     # test_path = home + '/ML/mnist/mnist_test_samples.csv'
79 | 
80 |     # Load the training and test sets
81 |     logging.info('Loading data....')
82 | 
83 |     traindataArr, trainlabelArr = loadData(train_path)
84 |     testdataArr, testlabelArr = loadData(test_path)
85 |     logging.info('Loading data done.')
86 | 
87 |     logging.info('Building a MaxEntropy model.')
88 |     maxEntropy = MaxEntropy(traindataArr, trainlabelArr)
89 | 
90 |     logging.info('Using MaxEntropy to predict one sample.')
91 |     # NOTE: MaxEntropy only has an __init__ so far (the file header admits it is unfinished); the predict and testModel calls below will fail until those methods are implemented
92 |     prediction = maxEntropy.predict(testdataArr[0] + [1])
93 |     logging.info('Testing processing Done,and the prediction and label are : ({},{})'.format(str(prediction),
94 |                                                                                              str(testlabelArr[
95 |                                                                                                      0])))
96 | 
97 |     # Evaluate the accuracy of the model
98 |     # Only the first 200 test samples are used, to keep the running time down
99 |     logging.info('Testing the MaxEntropy model.')
100 |     accuracy = maxEntropy.testModel(testdataArr[:200], testlabelArr[:200])
101 | 
102 |     end = time.time()
103 | 
104 |     logging.info('accuracy:{}'.format(accuracy))
105 |     logging.info('Total Time: {}'.format(round(end - start, 4)))
-------------------------------------------------------------------------------- /朴素贝叶斯/NaiveBayes.log: --------------------------------------------------------------------------------
1 | 07-25 15:03 INFO This is an info message.
2 | 07-25 15:03 INFO Loading data....
3 | 07-25 15:03 INFO Loading data done.
4 | 07-25 15:03 INFO Getting the prior distribution.
5 | 07-25 15:04 INFO Getting the Conditional probability distribution.
6 | 07-25 15:04 INFO Testing the testdata: (0/200.
7 | 07-25 15:04 INFO Testing the testdata: (50/200.
8 | 07-25 15:04 INFO Testing the testdata: (100/200.
9 | 07-25 15:04 INFO Testing the testdata: (150/200.
10 | 07-25 15:04 INFO accuracy:0.88 11 | 07-25 15:04 INFO Total Time: 48 12 | -------------------------------------------------------------------------------- /朴素贝叶斯/NaiveBayes.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @time :2020/7/25 3 | # @IDE : pycharm 4 | # @author :lxztju 5 | # @github : https://github.com/lxztju 6 | 7 | ''' 8 | 实现朴素贝叶斯分类器 9 | 并采用mnist数据集测试模型 10 | ''' 11 | 12 | 13 | 14 | import numpy as np 15 | import logging 16 | import time 17 | 18 | 19 | 20 | 21 | def loadData(fileName): 22 | ''' 23 | 加载Mnist数据集 24 | :param fileName:要加载的数据集路径 25 | :return: list形式的数据集及标记 26 | ''' 27 | # 存放数据及标记的list 28 | dataArr = [] 29 | labelArr = [] 30 | # 打开文件 31 | fr = open(fileName, 'r') 32 | # 将文件按行读取 33 | for line in fr.readlines(): 34 | # 对每一行数据按切割福','进行切割,返回字段列表 35 | curLine = line.strip().split(',') 36 | 37 | labelArr.append(int(curLine[0])) 38 | # 进行二值化处理,将大于128的标记为1, 小于128的标记为0 39 | dataArr.append([int(int(num)>128) for num in curLine[1:]]) 40 | # 存放标记 41 | # [int(num) for num in curLine[1:]] -> 遍历每一行中除了以第一个元素(标记)外将所有元素转换成int类型 42 | # [int(num)/255 for num in curLine[1:]] -> 将所有数据除255归一化(非必须步骤,可以不归一化) 43 | 44 | # 返回data和label 45 | return dataArr, labelArr 46 | 47 | 48 | class NavieBayes: 49 | def __init__(self, num_classes, num_features, traindataArr, trianlabelArr): 50 | ''' 51 | 初始化朴素贝叶斯分类器类 52 | :param num_classes: 类别数目 53 | :param num_features: 特征维度 54 | :param traindataArr: 训练集 55 | :param trianlabelArr: 训练集标签 56 | ''' 57 | self.num_classes = num_classes 58 | self.num_features = num_features 59 | self.traindataArr, self.trainlabelArr = traindataArr, trainlabelArr 60 | self.py, self.px_y = self.getProbability() 61 | 62 | 63 | 64 | def naviebayes(self, x): 65 | ''' 66 | 利用朴素贝叶斯进行概率估计 67 | :param py: 先验概率 68 | :param pxy: 条件概率 69 | :param x: 待测样本点 70 | :return: 返回类别 71 | ''' 72 | p= [0] * self.num_classes 73 | 74 | # 计算每个类别的概率 75 | for i in range(self.num_classes): 76 | # 由于在getProbaility中计算得到的概率值已经经过了log运算,因此这里的概率值可以采用连加的形式 77 | sum = 0 78 | for j in range(self.num_features): 79 | sum += self.px_y[i][j][x[j]] 80 | p[i] = sum + self.py[i] 81 | return p.index(max(p)) 82 | 83 | 84 | def getProbability(self): 85 | ''' 86 | 计算所有训练集的先验与条件概率 87 | :param dataArr: 输入的训练样本集(list格式) 88 | :param labelArr: 输入的训练样本的label (list格式) 89 | :return: 返回训练集的先验概率分布与条件概率分布 90 | ''' 91 | 92 | # 首先计算先验分布py,初始化py数组 93 | py = np.zeros((self.num_classes, 1)) 94 | 95 | for i in range(self.num_classes): 96 | # 不考虑出现概率值为0的情况 97 | # np.mat(self.trainlabelArr == i)会让对应与等于i的为True, 不等的为False 98 | # py[i] = np.sum(np.mat(self.trainlabelArr == i)) / (len(self.trainlabelArr)) 99 | 100 | # 考虑概率值为0的情况,采用laplace平滑 101 | py[i] = np.sum(np.mat(self.trainlabelArr == i) + 1) / (len(self.trainlabelArr) + self.num_classes) 102 | 103 | # 最后求后验概率估计的时候,形式是各项的相乘(“4.1 朴素贝叶斯法的学习” 式4.7),这里存在两个问题:1.某一项为0时,结果为0. 
104 |         # That problem is fixed by adding a constant to the numerator and the denominator (Laplace smoothing), which is already done above. 2. When there are many features (here the 784 pixel features
105 |         # plus one prior term, about 785 factors are multiplied, all between 0 and 1), the product is a tiny number close to 0. In theory the magnitudes could still be compared, but
106 |         # at run time they are very likely to underflow and become incomparable, because the values are simply too small. So the values are deliberately put through log. log is increasing on its domain,
107 |         # i.e. the larger x is, the larger log(x) is, so the ordering of the original values is preserved and taking log does not change the result. Moreover, log turns the product into a sum, which simplifies the computation.
108 |         py = np.log(py)
109 | 
110 |         logging.info('Getting the prior distribution.')
111 | 
112 | 
113 |         # Compute the conditional probability distribution pxy; initialize the pxy array
114 |         # There are num_classes classes and num_features features, and every feature takes one of two values, 0 or 1
115 |         px_y = np.zeros((self.num_classes, self.num_features, 2))
116 | 
117 |         # Iterate over the label set
118 |         for i in range(len(self.trainlabelArr)):
119 |             # Label used in the current iteration
120 |             label = self.trainlabelArr[i]
121 |             # Sample to process in the current iteration
122 |             x = self.traindataArr[i]
123 |             # Iterate over every feature dimension of this sample
124 |             for j in range(self.num_features):
125 |                 # Add 1 at the corresponding position of the matrix
126 |                 # This does not compute the conditional probabilities yet; it only accumulates counts, which are turned into probabilities in the next step
127 |                 px_y[label][j][x[j]] += 1
128 | 
129 |         for label in range(self.num_classes):
130 |             for j in range(self.num_features):
131 |                 # Count how often the j-th feature is 0 and how often it is 1
132 |                 px_y0 = px_y[label][j][0]
133 |                 px_y1 = px_y[label][j][1]
134 | 
135 |                 # Compute the conditional probabilities (Laplace-smoothed, stored in log space)
136 |                 px_y[label][j][0] = np.log((px_y0 +1) / (px_y0 + px_y1 + 2))
137 |                 px_y[label][j][1] = np.log((px_y1 +1) / (px_y0 + px_y1 + 2))
138 |         logging.info('Getting the Conditional probability distribution.')
139 | 
140 |         return py, px_y
141 | 
142 |     def testModel(self,dataArr, labelArr):
143 |         '''
144 |         Evaluate the trained model on a test set
145 |         :param py: prior distribution
146 |         :param pxy: conditional distribution
147 |         :param dataArr: test set samples
148 |         :param labelArr: labels of the test set
149 |         :return: accuracy
150 |         '''
151 |         correct_num = 0
152 |         for i in range(len(dataArr)):
153 |             if i %50 == 0:
154 |                 logging.info('Testing the testdata: ({}/{}).'.format(i, len(labelArr)))
155 | 
156 |             label = self.naviebayes(dataArr[i])
157 |             if label == labelArr[i]:
158 |                 correct_num += 1
159 |         return round(correct_num / len(labelArr), 4)
160 | 
161 | 
162 | 
163 | 
164 | 
165 | 
166 | if __name__ == '__main__':
167 | 
168 |     # Set up a logging module that saves messages to a file
169 |     logging.basicConfig(level=logging.DEBUG,
170 |                         format='%(asctime)-12s %(levelname)-8s %(message)s',
171 |                         datefmt='%m-%d %H:%M',
172 |                         filename='NaiveBayes.log',
173 |                         filemode='w') # filemode defaults to 'a' (append); 'w' rewrites the file and discards its previous contents
174 |     # Define a handler that also prints messages to the console (StreamHandler, as opposed to FileHandler)
175 |     console = logging.StreamHandler()
176 |     console.setLevel(logging.INFO)
177 |     # Set the console output format
178 |     formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
179 |     console.setFormatter(formatter)
180 |     # Attach the handler to the root logger
181 |     logging.getLogger('').addHandler(console)
182 | 
183 |     # Log through the root logger
184 |     logging.info('This is an info message.')
185 | 
186 | 
187 |     start = time.time()
188 | 
189 | 
190 |     # Location of the mnist dataset
191 |     import os
192 |     home = os.path.expanduser('~')
193 |     train_path = home + '/ML/mnist/mnist_train.csv'
194 |     test_path = home + '/ML/mnist/mnist_test.csv'
195 | 
196 |     # Load the training and test sets
197 |     logging.info('Loading data....')
198 | 
199 |     traindataArr, trainlabelArr =loadData(train_path)
200 |     testdataArr, testlabelArr = loadData(test_path)
201 |     logging.info('Loading data done.')
202 | 
203 |     num_classes = 10
204 |     num_features = 28 * 28
205 |     Naviebayes = NavieBayes(num_classes, num_features,traindataArr, trainlabelArr)
206 | 
207 | 
208 | 
209 |     # Evaluate the accuracy of the naive Bayes model
210 |     # Only the first 200 test samples are used, to keep the running time down
211 |     accuracy = Naviebayes.testModel(testdataArr[:200], testlabelArr[:200])
212 | 
213 | 
214 |     end = time.time()
215 | 
216 |     logging.info('accuracy:{}'.format(accuracy))
217 |     logging.info('Total Time: {}'.format(round(end-start, 4)))
218 | 
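A minimal standalone check of the log trick described in the getProbability comments above (a sketch, not part of the repository; the 785 factors mirror the 784 pixel features plus the prior mentioned there):

import numpy as np

# 785 per-factor probabilities, each between 0 and 1, as in the posterior product described above
probs = np.full(785, 0.1)

print(np.prod(probs))         # 0.0 -> the raw product underflows float64, so classes become incomparable
print(np.sum(np.log(probs)))  # about -1807.5 -> finite, and the argmax over classes is unchanged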
-------------------------------------------------------------------------------- /逻辑回归/LogisticRegression.log: -------------------------------------------------------------------------------- 1 | 07-28 22:11 INFO This is an info message. 2 | 07-28 22:11 INFO Loading data.... 3 | 07-28 22:11 INFO Loading data done. 4 | 07-28 22:11 INFO Building a LogisticRegression model. 5 | 07-28 22:12 INFO Using LogisticRegression to predict one sample. 6 | 07-28 22:12 INFO Testing processing Done,and the prediction and label are : (0,0) 7 | 07-28 22:12 INFO Testing the LogisticRegression model. 8 | 07-28 22:12 INFO accuracy:1.0 9 | 07-28 22:12 INFO Total Time: 29 10 | -------------------------------------------------------------------------------- /逻辑回归/LogisticRegression.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @time :2020/7/28 3 | # @IDE : pycharm 4 | # @author :lxztju 5 | # @github : https://github.com/lxztju 6 | 7 | 8 | ''' 9 | 逻辑斯蒂二分类器 10 | ------------------ 11 | 在计算sigmoid的exp的时候可能会出现数值较大,溢出 12 | 因此采用修正的sigmoid防止溢出 13 | ----- 14 | 修正后的sigmoid: 15 | wx = np.dot(self.w, x) 16 | if wx >= 0: 17 | probabilty = 1 /(1+ np.exp(-wx)) 18 | else: 19 | e = np.exp(wx) 20 | probabilty = e / (1 + e) 21 | ''' 22 | 23 | 24 | import numpy as np 25 | import logging 26 | import time 27 | 28 | 29 | def loadData(fileName): 30 | ''' 31 | 加载Mnist数据集 32 | :param fileName:要加载的数据集路径 33 | :return: list形式的数据集及标记 34 | ''' 35 | # 存放数据及标记的list 36 | dataArr = [] 37 | labelArr = [] 38 | # 打开文件 39 | fr = open(fileName, 'r') 40 | # 将文件按行读取 41 | for line in fr.readlines(): 42 | # 对每一行数据按切割福','进行切割,返回字段列表 43 | curLine = line.strip().split(',') 44 | 45 | # Mnsit有0-9是个标记,由于是二分类任务,所以仅仅挑选其中的0和1两类作为正负类进行分类 46 | # if int(curLine[0]) != 0 or int(curLine[0]) !=1: continue 47 | if int(curLine[0]) == 0 or int(curLine[0]) == 1: 48 | if int(curLine[0]) == 0: 49 | labelArr.append(1) 50 | else: 51 | labelArr.append(0) 52 | dataArr.append([int(num) / 255 for num in curLine[1:]]) 53 | # 存放标记 54 | # [int(num) for num in curLine[1:]] -> 遍历每一行中除了以第一个元素(标记)外将所有元素转换成int类型 55 | # [int(num)/255 for num in curLine[1:]] -> 将所有数据除255归一化(非必须步骤,可以不归一化) 56 | # dataArr.append([int(num)/255 for num in curLine[1:]]) 57 | 58 | # 返回data和label 59 | return dataArr, labelArr 60 | 61 | 62 | 63 | class LogisticRegression: 64 | def __init__(self, traindataList, trainlabelList): 65 | for i in range(len(traindataList)): 66 | traindataList[i].append(1) 67 | 68 | self.traindataArr = np.array(traindataList) 69 | self.trainlabelArr = np.array(trainlabelList) 70 | # print(self.traindataArr.shape) 71 | self.w = np.zeros(self.traindataArr.shape[1]) 72 | self.num_samples, self.num_features = self.traindataArr.shape 73 | self.train() 74 | 75 | def train(self, lr= 0.01, max_epoch= 200): 76 | ''' 77 | 训练得到逻辑斯蒂分类器 78 | :param lr: 学习率步长 79 | :param max_epoch: 最大的迭代次数 80 | :return: None,得到逻辑斯蒂分类器的权重 81 | ''' 82 | 83 | for _ in range(max_epoch): 84 | grad = 0 85 | for i in range(self.num_samples): 86 | xi = self.traindataArr[i] 87 | yi = self.trainlabelArr[i] 88 | wx = np.dot(xi, self.w) 89 | 90 | ## 对sigmoid进行修正,防止溢出 91 | if wx >= 0: 92 | grad += xi * yi -1.0/(1+np.exp(-wx)) * xi 93 | else: 94 | e = np.exp(wx) 95 | grad += xi * yi - ( e / (1+e) ) * xi 96 | self.w += lr * grad 97 | 98 | 99 | 100 | 101 | 102 | def predict(self, x): 103 | ''' 104 | 输入x,利用逻辑斯蒂回归进行预测 105 | :param x: 输入的x,numpy格式的array 106 | :return: label 107 | ''' 108 | wx = np.dot(self.w, x) 109 | if wx >= 0: 110 | probabilty = 1 /(1+ np.exp(-wx)) 
111 |         else:
112 |             e = np.exp(wx)
113 |             probabilty = e / (1 + e)
114 |         if probabilty > 0.5:
115 |             return 1
116 |         else:
117 |             return 0
118 | 
119 |     def testModel(self, testdataArr, testlabelArr):
120 |         '''
121 |         Evaluate the accuracy of the model
122 |         :param testdataArr: numpy array
123 |         :param testlabelArr: numpy array
124 |         :return: accuracy
125 |         '''
126 |         # testdataArr = np.array(testdataArr)
127 |         correct_num = 0
128 |         for i in range(len(testdataArr)):
129 |             # print(testdataArr[i].shape)
130 |             if self.predict(testdataArr[i] + [1]) == testlabelArr[i]:
131 |                 correct_num += 1
132 |         return round(correct_num / len(testdataArr), 4 )
133 | 
134 | 
135 | 
136 | 
137 | if __name__ == '__main__':
138 | 
139 |     # Set up a logging module that saves messages to a file
140 |     logging.basicConfig(level=logging.DEBUG,
141 |                         format='%(asctime)-12s %(levelname)-8s %(message)s',
142 |                         datefmt='%m-%d %H:%M',
143 |                         filename='LogisticRegression.log',
144 |                         filemode='w') # filemode defaults to 'a' (append); 'w' rewrites the file and discards its previous contents
145 |     # Define a handler that also prints messages to the console (StreamHandler, as opposed to FileHandler)
146 |     console = logging.StreamHandler()
147 |     console.setLevel(logging.INFO)
148 |     # Set the console output format
149 |     formatter = logging.Formatter('%(asctime)-12s: %(levelname)-8s %(message)s')
150 |     console.setFormatter(formatter)
151 |     # Attach the handler to the root logger
152 |     logging.getLogger('').addHandler(console)
153 | 
154 |     # Log through the root logger
155 |     logging.info('This is an info message.')
156 | 
157 | 
158 |     start = time.time()
159 | 
160 | 
161 |     # Location of the mnist dataset
162 |     import os
163 |     home = os.path.expanduser('~')
164 |     train_path = home + '/ML/mnist/mnist_train.csv'
165 |     test_path = home + '/ML/mnist/mnist_test.csv'
166 |     # train_path = home + '/ML/mnist/mnist_train_samples.csv'
167 |     # test_path = home + '/ML/mnist/mnist_test_samples.csv'
168 | 
169 |     # Load the training and test sets
170 |     logging.info('Loading data....')
171 | 
172 |     traindataArr, trainlabelArr =loadData(train_path)
173 |     testdataArr, testlabelArr = loadData(test_path)
174 |     logging.info('Loading data done.')
175 | 
176 |     logging.info('Building a LogisticRegression model.')
177 |     logisiticRegression = LogisticRegression(traindataArr, trainlabelArr)
178 | 
179 |     logging.info('Using LogisticRegression to predict one sample.')
180 | 
181 |     prediction = logisiticRegression.predict(testdataArr[0] + [1])
182 |     logging.info('Testing processing Done,and the prediction and label are : ({},{})'.format(str(prediction), str(testlabelArr[0])))
183 | 
184 |     # Evaluate the accuracy of the logistic regression model
185 |     # Only the first 200 test samples are used, to keep the running time down
186 |     logging.info('Testing the LogisticRegression model.')
187 |     accuracy = logisiticRegression.testModel(testdataArr[:200], testlabelArr[:200])
188 | 
189 | 
190 |     end = time.time()
191 | 
192 |     logging.info('accuracy:{}'.format(accuracy))
193 |     logging.info('Total Time: {}'.format(round(end-start, 4)))
194 | 
195 | 
--------------------------------------------------------------------------------
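A minimal standalone check of the overflow-safe sigmoid described in the header of LogisticRegression.py (a sketch using the same branching, not part of the repository):

import numpy as np

def stable_sigmoid(wx):
    # Same idea as LogisticRegression.predict: never evaluate np.exp on a large positive argument
    if wx >= 0:
        return 1.0 / (1.0 + np.exp(-wx))
    e = np.exp(wx)
    return e / (1.0 + e)

print(stable_sigmoid(1000.0))   # 1.0 (the naive 1/(1+np.exp(-wx)) form would also be fine here)
print(stable_sigmoid(-1000.0))  # 0.0 (the naive form would overflow in np.exp(1000) and emit a RuntimeWarning)
print(stable_sigmoid(0.0))      # 0.5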