├── Ch02-KNN
│   ├── 2.1.py
│   ├── 2.2.1.py
│   ├── 2.2.2.py
│   ├── 2.2.3.py
│   ├── 2.2.4.py
│   ├── 2.2.5.py
│   └── 2.3.2.py
├── Ch03-DecisionTree
│   ├── 3.2.1-1.py
│   ├── 3.2.1-2.py
│   ├── 3.2.2.py
│   ├── 3.3.py
│   ├── 3.4.py
│   ├── 3.5.1.py
│   ├── 3.5.2.py
│   ├── 3.6.2-1.py
│   ├── 3.6.2-2.py
│   └── 3.6.2-3.py
├── Ch04-NaiveBayes
│   ├── 4.7.1.py
│   ├── 4.7.2.py
│   ├── 4.7.3.py
│   ├── 4.8.1.py
│   ├── 4.8.2.py
│   ├── 4.9.1.py
│   ├── 4.9.2-1.py
│   ├── 4.9.2-2.py
│   ├── 4.9.2-3.py
│   └── 4.9.2-4.py
├── Ch05-Logistic
│   ├── 5.4.1.py
│   ├── 5.4.2.py
│   ├── 5.4.3.py
│   ├── 5.4.4.py
│   ├── 5.4.5.py
│   ├── 5.5.2-1.py
│   ├── 5.5.2-2.py
│   └── 5.6.py
├── Ch06-SVM
│   ├── 6.3.py
│   ├── 6.4.py
│   ├── 6.5.1.py
│   ├── 6.5.2.py
│   ├── 6.6.py
│   └── 6.7.py
├── Ch07-AdaBoost
│   ├── 7.3.1.py
│   ├── 7.3.2.py
│   ├── 7.4.1.py
│   ├── 7.4.2.py
│   ├── 7.5.py
│   ├── 7.6.py
│   └── 7.8.py
├── Ch08-Regression
│   ├── 8.2.1.py
│   ├── 8.2.2.py
│   ├── 8.2.3.py
│   ├── 8.3.py
│   ├── 8.4.py
│   ├── 8.5.1.py
│   ├── 8.5.3.py
│   ├── 8.6.1.py
│   ├── 8.6.2-1.py
│   ├── 8.6.2-2.py
│   ├── 8.6.2-3.py
│   └── 8.7.py
├── Ch09-Regression Trees
│   ├── 9.3.py
│   ├── 9.4.1.py
│   ├── 9.4.2.py
│   ├── 9.4.3.py
│   ├── 9.4.4.py
│   ├── 9.4.5.py
│   ├── 9.5.1-1.py
│   ├── 9.5.1-2.py
│   ├── 9.5.2.py
│   ├── 9.6.1.py
│   ├── 9.6.2.py
│   ├── 9.7.1.py
│   ├── 9.7.2.py
│   └── 9.8.py
├── Machine Learning in Action.pdf
├── README.md
├── 机器学习实战.pdf
├── 机器学习实战总目录.md
└── 机器学习实战数据集.zip

/Ch02-KNN/2.1.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2019/10/8 13:35
# @Author : GXl
# @File : 2.1.py
# @Software: win10 Tensorflow1.13.1 python3.5.6


import numpy as np
import operator

"""
Parameters:
    无
Returns:
    group - 数据集
    labels - 分类标签
"""
# 函数说明:创建数据集
def createDataSet():
    #六组二维特征
    group = np.array([[3,104],[2,100],[1,81],[101,10],[99,5],[98,2]])
    #六组特征的标签
    labels = ['爱情片','爱情片','爱情片','动作片','动作片','动作片']
    return group, labels

"""
Parameters:
    inX - 用于分类的数据(测试集)
    dataSet - 用于训练的数据(训练集)
    labels - 分类标签
    k - kNN算法参数,选择距离最小的k个点
Returns:
    sortedClassCount[0][0] - 分类结果
"""
# 函数说明:kNN算法,分类器
def classify0(inX, dataSet, labels, k):
    #numpy函数shape[0]返回dataSet的行数
    dataSetSize = dataSet.shape[0]
    #在列向量方向上重复inX共1次(横向),行向量方向上重复inX共dataSetSize次(纵向)
    diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet
    #二维特征相减后平方
    sqDiffMat = diffMat**2
    #sum()所有元素相加,sum(0)列相加,sum(1)行相加
    sqDistances = sqDiffMat.sum(axis=1)
    #开方,计算出距离
    distances = sqDistances**0.5
    #返回distances中元素从小到大排序后的索引值
    sortedDistIndices = distances.argsort()
    #定一个记录类别次数的字典
    classCount = {}
    for i in range(k):
        #取出前k个元素的类别
        voteIlabel = labels[sortedDistIndices[i]]
        #dict.get(key,default=None),字典的get()方法,返回指定键的值,如果值不在字典中返回默认值。
        #计算类别次数
        classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1
    #python3中用items()替换python2中的iteritems()
    #key=operator.itemgetter(1)根据字典的值进行排序
    #key=operator.itemgetter(0)根据字典的键进行排序
    #reverse降序排序字典
    sortedClassCount = sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
    #返回次数最多的类别,即所要分类的类别
    return sortedClassCount[0][0]


if __name__ == '__main__':
    #创建数据集
    group, labels = createDataSet()
    #测试集
    test = [101,20]
    #kNN分类
    test_class = classify0(test, group, labels, 3)
    #打印分类结果
    print(test_class)
--------------------------------------------------------------------------------
/Ch02-KNN/2.2.1.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python 2 | # -*- 
coding: utf-8 -*- 3 | # @Time : 2019/10/8 13:35 4 | # @Author : GXl 5 | # @File : 2.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | 11 | """ 12 | Parameters: 13 | filename - 文件名 14 | Returns: 15 | returnMat - 特征矩阵 16 | classLabelVector - 分类Label向量 17 | """ 18 | # 函数说明:打开并解析文件,对数据进行分类:1代表不喜欢,2代表魅力一般,3代表极具魅力 19 | def file2matrix(filename): 20 | #打开文件 21 | fr = open(filename) 22 | #读取文件所有内容 23 | arrayOLines = fr.readlines() 24 | #得到文件行数 25 | numberOfLines = len(arrayOLines) 26 | #返回的NumPy矩阵,解析完成的数据:numberOfLines行,3列 27 | returnMat = np.zeros((numberOfLines,3)) 28 | #返回的分类标签向量 29 | classLabelVector = [] 30 | #行的索引值 31 | index = 0 32 | for line in arrayOLines: 33 | #s.strip(rm),当rm空时,默认删除空白符(包括'\n','\r','\t',' ') 34 | line = line.strip() 35 | #使用s.split(str="",num=string,cout(str))将字符串根据'\t'分隔符进行切片。 36 | listFromLine = line.split('\t') 37 | #将数据前三列提取出来,存放到returnMat的NumPy矩阵中,也就是特征矩阵 38 | returnMat[index,:] = listFromLine[0:3] 39 | #根据文本中标记的喜欢的程度进行分类,1代表不喜欢,2代表魅力一般,3代表极具魅力 40 | if listFromLine[-1] == 'didntLike': 41 | classLabelVector.append(1) 42 | elif listFromLine[-1] == 'smallDoses': 43 | classLabelVector.append(2) 44 | elif listFromLine[-1] == 'largeDoses': 45 | classLabelVector.append(3) 46 | index += 1 47 | return returnMat, classLabelVector 48 | 49 | 50 | if __name__ == '__main__': 51 | #打开的文件名 52 | filename = "datingTestSet.txt" 53 | #打开并处理数据 54 | datingDataMat, datingLabels = file2matrix(filename) 55 | print(datingDataMat) 56 | print(datingLabels) -------------------------------------------------------------------------------- /Ch02-KNN/2.2.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 13:36 4 | # @Author : GXl 5 | # @File : 2.2.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | from matplotlib.font_manager import FontProperties 10 | import matplotlib.lines as mlines 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | 14 | """ 15 | Parameters: 16 | filename - 文件名 17 | Returns: 18 | returnMat - 特征矩阵 19 | classLabelVector - 分类Label向量 20 | """ 21 | # 函数说明:打开并解析文件,对数据进行分类:1代表不喜欢,2代表魅力一般,3代表极具魅力 22 | def file2matrix(filename): 23 | #打开文件 24 | fr = open(filename) 25 | #读取文件所有内容 26 | arrayOLines = fr.readlines() 27 | #得到文件行数 28 | numberOfLines = len(arrayOLines) 29 | #返回的NumPy矩阵,解析完成的数据:numberOfLines行,3列 30 | returnMat = np.zeros((numberOfLines,3)) 31 | #返回的分类标签向量 32 | classLabelVector = [] 33 | #行的索引值 34 | index = 0 35 | for line in arrayOLines: 36 | #s.strip(rm),当rm空时,默认删除空白符(包括'\n','\r','\t',' ') 37 | line = line.strip() 38 | #使用s.split(str="",num=string,cout(str))将字符串根据'\t'分隔符进行切片。 39 | listFromLine = line.split('\t') 40 | #将数据前三列提取出来,存放到returnMat的NumPy矩阵中,也就是特征矩阵 41 | returnMat[index,:] = listFromLine[0:3] 42 | #根据文本中标记的喜欢的程度进行分类,1代表不喜欢,2代表魅力一般,3代表极具魅力 43 | if listFromLine[-1] == 'didntLike': 44 | classLabelVector.append(1) 45 | elif listFromLine[-1] == 'smallDoses': 46 | classLabelVector.append(2) 47 | elif listFromLine[-1] == 'largeDoses': 48 | classLabelVector.append(3) 49 | index += 1 50 | return returnMat, classLabelVector 51 | 52 | """ 53 | Parameters: 54 | datingDataMat - 特征矩阵 55 | datingLabels - 分类Label 56 | Returns: 57 | 无 58 | """ 59 | # 函数说明:可视化数据 60 | def showdatas(datingDataMat, datingLabels): 61 | #设置汉字格式 62 | font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) 63 | #将fig画布分隔成1行1列,不共享x轴和y轴,fig画布的大小为(13,8) 64 | #当nrow=2,nclos=2时,代表fig画布被分为四个区域,axs[0][0]表示第一行第一个区域 65 
| fig, axs = plt.subplots(nrows=2, ncols=2,sharex=False, sharey=False, figsize=(13,8)) 66 | 67 | numberOfLabels = len(datingLabels) 68 | LabelsColors = [] 69 | for i in datingLabels: 70 | if i == 1: 71 | LabelsColors.append('black') 72 | if i == 2: 73 | LabelsColors.append('orange') 74 | if i == 3: 75 | LabelsColors.append('red') 76 | #画出散点图,以datingDataMat矩阵的第一(飞行常客例程)、第二列(玩游戏)数据画散点数据,散点大小为15,透明度为0.5 77 | axs[0][0].scatter(x=datingDataMat[:,0], y=datingDataMat[:,1], color=LabelsColors,s=15, alpha=.5) 78 | #设置标题,x轴label,y轴label 79 | axs0_title_text = axs[0][0].set_title(u'每年获得的飞行常客里程数与玩视频游戏所消耗时间占比',FontProperties=font) 80 | axs0_xlabel_text = axs[0][0].set_xlabel(u'每年获得的飞行常客里程数',FontProperties=font) 81 | axs0_ylabel_text = axs[0][0].set_ylabel(u'玩视频游戏所消耗时间占',FontProperties=font) 82 | plt.setp(axs0_title_text, size=9, weight='bold', color='red') 83 | plt.setp(axs0_xlabel_text, size=7, weight='bold', color='black') 84 | plt.setp(axs0_ylabel_text, size=7, weight='bold', color='black') 85 | 86 | #画出散点图,以datingDataMat矩阵的第一(飞行常客例程)、第三列(冰激凌)数据画散点数据,散点大小为15,透明度为0.5 87 | axs[0][1].scatter(x=datingDataMat[:,0], y=datingDataMat[:,2], color=LabelsColors,s=15, alpha=.5) 88 | #设置标题,x轴label,y轴label 89 | axs1_title_text = axs[0][1].set_title(u'每年获得的飞行常客里程数与每周消费的冰激淋公升数',FontProperties=font) 90 | axs1_xlabel_text = axs[0][1].set_xlabel(u'每年获得的飞行常客里程数',FontProperties=font) 91 | axs1_ylabel_text = axs[0][1].set_ylabel(u'每周消费的冰激淋公升数',FontProperties=font) 92 | plt.setp(axs1_title_text, size=9, weight='bold', color='red') 93 | plt.setp(axs1_xlabel_text, size=7, weight='bold', color='black') 94 | plt.setp(axs1_ylabel_text, size=7, weight='bold', color='black') 95 | 96 | #画出散点图,以datingDataMat矩阵的第二(玩游戏)、第三列(冰激凌)数据画散点数据,散点大小为15,透明度为0.5 97 | axs[1][0].scatter(x=datingDataMat[:,1], y=datingDataMat[:,2], color=LabelsColors,s=15, alpha=.5) 98 | #设置标题,x轴label,y轴label 99 | axs2_title_text = axs[1][0].set_title(u'玩视频游戏所消耗时间占比与每周消费的冰激淋公升数',FontProperties=font) 100 | axs2_xlabel_text = axs[1][0].set_xlabel(u'玩视频游戏所消耗时间占比',FontProperties=font) 101 | axs2_ylabel_text = axs[1][0].set_ylabel(u'每周消费的冰激淋公升数',FontProperties=font) 102 | plt.setp(axs2_title_text, size=9, weight='bold', color='red') 103 | plt.setp(axs2_xlabel_text, size=7, weight='bold', color='black') 104 | plt.setp(axs2_ylabel_text, size=7, weight='bold', color='black') 105 | #设置图例 106 | didntLike = mlines.Line2D([], [], color='black', marker='.', 107 | markersize=6, label='didntLike') 108 | smallDoses = mlines.Line2D([], [], color='orange', marker='.', 109 | markersize=6, label='smallDoses') 110 | largeDoses = mlines.Line2D([], [], color='red', marker='.', 111 | markersize=6, label='largeDoses') 112 | #添加图例 113 | axs[0][0].legend(handles=[didntLike,smallDoses,largeDoses]) 114 | axs[0][1].legend(handles=[didntLike,smallDoses,largeDoses]) 115 | axs[1][0].legend(handles=[didntLike,smallDoses,largeDoses]) 116 | #显示图片 117 | plt.show() 118 | 119 | 120 | if __name__ == '__main__': 121 | #打开的文件名 122 | filename = "datingTestSet.txt" 123 | #打开并处理数据 124 | datingDataMat, datingLabels = file2matrix(filename) 125 | showdatas(datingDataMat, datingLabels) -------------------------------------------------------------------------------- /Ch02-KNN/2.2.3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 13:38 4 | # @Author : GXl 5 | # @File : 2.2.3.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | 11 | """ 12 | Parameters: 13 | 
filename - 文件名 14 | Returns: 15 | returnMat - 特征矩阵 16 | classLabelVector - 分类Label向量 17 | """ 18 | # 函数说明:打开并解析文件,对数据进行分类:1代表不喜欢,2代表魅力一般,3代表极具魅力 19 | def file2matrix(filename): 20 | #打开文件 21 | fr = open(filename) 22 | #读取文件所有内容 23 | arrayOLines = fr.readlines() 24 | #得到文件行数 25 | numberOfLines = len(arrayOLines) 26 | #返回的NumPy矩阵,解析完成的数据:numberOfLines行,3列 27 | returnMat = np.zeros((numberOfLines,3)) 28 | #返回的分类标签向量 29 | classLabelVector = [] 30 | #行的索引值 31 | index = 0 32 | for line in arrayOLines: 33 | #s.strip(rm),当rm空时,默认删除空白符(包括'\n','\r','\t',' ') 34 | line = line.strip() 35 | #使用s.split(str="",num=string,cout(str))将字符串根据'\t'分隔符进行切片。 36 | listFromLine = line.split('\t') 37 | #将数据前三列提取出来,存放到returnMat的NumPy矩阵中,也就是特征矩阵 38 | returnMat[index,:] = listFromLine[0:3] 39 | #根据文本中标记的喜欢的程度进行分类,1代表不喜欢,2代表魅力一般,3代表极具魅力 40 | if listFromLine[-1] == 'didntLike': 41 | classLabelVector.append(1) 42 | elif listFromLine[-1] == 'smallDoses': 43 | classLabelVector.append(2) 44 | elif listFromLine[-1] == 'largeDoses': 45 | classLabelVector.append(3) 46 | index += 1 47 | return returnMat, classLabelVector 48 | 49 | """ 50 | Parameters: 51 | dataSet - 特征矩阵 52 | Returns: 53 | normDataSet - 归一化后的特征矩阵 54 | ranges - 数据范围 55 | minVals - 数据最小值 56 | """ 57 | # 函数说明:对数据进行归一化 58 | def autoNorm(dataSet): 59 | #获得数据的最小值 60 | minVals = dataSet.min(0) 61 | maxVals = dataSet.max(0) 62 | #最大值和最小值的范围 63 | ranges = maxVals - minVals 64 | #shape(dataSet)返回dataSet的矩阵行列数 65 | normDataSet = np.zeros(np.shape(dataSet)) 66 | #返回dataSet的行数 67 | m = dataSet.shape[0] 68 | #原始值减去最小值 69 | normDataSet = dataSet - np.tile(minVals, (m, 1)) 70 | #除以最大和最小值的差,得到归一化数据 71 | normDataSet = normDataSet / np.tile(ranges, (m, 1)) 72 | #返回归一化数据结果,数据范围,最小值 73 | return normDataSet, ranges, minVals 74 | 75 | 76 | if __name__ == '__main__': 77 | #打开的文件名 78 | filename = "datingTestSet.txt" 79 | #打开并处理数据 80 | datingDataMat, datingLabels = file2matrix(filename) 81 | normDataSet, ranges, minVals = autoNorm(datingDataMat) 82 | print(normDataSet) 83 | print(ranges) 84 | print(minVals) -------------------------------------------------------------------------------- /Ch02-KNN/2.2.4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 13:39 4 | # @Author : GXl 5 | # @File : 2.2.4.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | import operator 11 | 12 | """ 13 | Parameters: 14 | inX - 用于分类的数据(测试集) 15 | dataSet - 用于训练的数据(训练集) 16 | labes - 分类标签 17 | k - kNN算法参数,选择距离最小的k个点 18 | Returns: 19 | sortedClassCount[0][0] - 分类结果 20 | """ 21 | # 函数说明:kNN算法,分类器 22 | def classify0(inX, dataSet, labels, k): 23 | #numpy函数shape[0]返回dataSet的行数 24 | dataSetSize = dataSet.shape[0] 25 | #在列向量方向上重复inX共1次(横向),行向量方向上重复inX共dataSetSize次(纵向) 26 | diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet 27 | #二维特征相减后平方 28 | sqDiffMat = diffMat**2 29 | #sum()所有元素相加,sum(0)列相加,sum(1)行相加 30 | sqDistances = sqDiffMat.sum(axis=1) 31 | #开方,计算出距离 32 | distances = sqDistances**0.5 33 | #返回distances中元素从小到大排序后的索引值 34 | sortedDistIndices = distances.argsort() 35 | #定一个记录类别次数的字典 36 | classCount = {} 37 | for i in range(k): 38 | #取出前k个元素的类别 39 | voteIlabel = labels[sortedDistIndices[i]] 40 | #dict.get(key,default=None),字典的get()方法,返回指定键的值,如果值不在字典中返回默认值。 41 | #计算类别次数 42 | classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1 43 | #python3中用items()替换python2中的iteritems() 44 | #key=operator.itemgetter(1)根据字典的值进行排序 45 | #key=operator.itemgetter(0)根据字典的键进行排序 46 | 
#reverse降序排序字典 47 | sortedClassCount = sorted(classCount.items(),key=operator.itemgetter(1),reverse=True) 48 | #返回次数最多的类别,即所要分类的类别 49 | return sortedClassCount[0][0] 50 | 51 | """ 52 | Parameters: 53 | filename - 文件名 54 | Returns: 55 | returnMat - 特征矩阵 56 | classLabelVector - 分类Label向量 57 | """ 58 | # 函数说明:打开并解析文件,对数据进行分类:1代表不喜欢,2代表魅力一般,3代表极具魅力 59 | def file2matrix(filename): 60 | #打开文件 61 | fr = open(filename) 62 | #读取文件所有内容 63 | arrayOLines = fr.readlines() 64 | #得到文件行数 65 | numberOfLines = len(arrayOLines) 66 | #返回的NumPy矩阵,解析完成的数据:numberOfLines行,3列 67 | returnMat = np.zeros((numberOfLines,3)) 68 | #返回的分类标签向量 69 | classLabelVector = [] 70 | #行的索引值 71 | index = 0 72 | for line in arrayOLines: 73 | #s.strip(rm),当rm空时,默认删除空白符(包括'\n','\r','\t',' ') 74 | line = line.strip() 75 | #使用s.split(str="",num=string,cout(str))将字符串根据'\t'分隔符进行切片。 76 | listFromLine = line.split('\t') 77 | #将数据前三列提取出来,存放到returnMat的NumPy矩阵中,也就是特征矩阵 78 | returnMat[index,:] = listFromLine[0:3] 79 | #根据文本中标记的喜欢的程度进行分类,1代表不喜欢,2代表魅力一般,3代表极具魅力 80 | if listFromLine[-1] == 'didntLike': 81 | classLabelVector.append(1) 82 | elif listFromLine[-1] == 'smallDoses': 83 | classLabelVector.append(2) 84 | elif listFromLine[-1] == 'largeDoses': 85 | classLabelVector.append(3) 86 | index += 1 87 | return returnMat, classLabelVector 88 | 89 | """ 90 | Parameters: 91 | dataSet - 特征矩阵 92 | Returns: 93 | normDataSet - 归一化后的特征矩阵 94 | ranges - 数据范围 95 | minVals - 数据最小值 96 | """ 97 | # 函数说明:对数据进行归一化 98 | def autoNorm(dataSet): 99 | #获得数据的最小值 100 | minVals = dataSet.min(0) 101 | maxVals = dataSet.max(0) 102 | #最大值和最小值的范围 103 | ranges = maxVals - minVals 104 | #shape(dataSet)返回dataSet的矩阵行列数 105 | normDataSet = np.zeros(np.shape(dataSet)) 106 | #返回dataSet的行数 107 | m = dataSet.shape[0] 108 | #原始值减去最小值 109 | normDataSet = dataSet - np.tile(minVals, (m, 1)) 110 | #除以最大和最小值的差,得到归一化数据 111 | normDataSet = normDataSet / np.tile(ranges, (m, 1)) 112 | #返回归一化数据结果,数据范围,最小值 113 | return normDataSet, ranges, minVals 114 | 115 | """ 116 | Parameters: 117 | 无 118 | Returns: 119 | normDataSet - 归一化后的特征矩阵 120 | ranges - 数据范围 121 | minVals - 数据最小值 122 | """ 123 | # 函数说明:分类器测试函数 124 | def datingClassTest(): 125 | #打开的文件名 126 | filename = "datingTestSet.txt" 127 | #将返回的特征矩阵和分类向量分别存储到datingDataMat和datingLabels中 128 | datingDataMat, datingLabels = file2matrix(filename) 129 | #取所有数据的百分之十 130 | hoRatio = 0.10 131 | #数据归一化,返回归一化后的矩阵,数据范围,数据最小值 132 | normMat, ranges, minVals = autoNorm(datingDataMat) 133 | #获得normMat的行数 134 | m = normMat.shape[0] 135 | #百分之十的测试数据的个数 136 | numTestVecs = int(m * hoRatio) 137 | #分类错误计数 138 | errorCount = 0.0 139 | 140 | for i in range(numTestVecs): 141 | #前numTestVecs个数据作为测试集,后m-numTestVecs个数据作为训练集 142 | classifierResult = classify0(normMat[i,:], normMat[numTestVecs:m,:], 143 | datingLabels[numTestVecs:m], 4) 144 | print("分类结果:%d\t真实类别:%d" % (classifierResult, datingLabels[i])) 145 | if classifierResult != datingLabels[i]: 146 | errorCount += 1.0 147 | print("错误率:%f%%" %(errorCount/float(numTestVecs)*100)) 148 | 149 | 150 | if __name__ == '__main__': 151 | datingClassTest() -------------------------------------------------------------------------------- /Ch02-KNN/2.2.5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 13:40 4 | # @Author : GXl 5 | # @File : 2.2.5.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | import operator 11 | 12 | """ 13 | Parameters: 14 | inX - 用于分类的数据(测试集) 15 | dataSet 
- 用于训练的数据(训练集) 16 | labes - 分类标签 17 | k - kNN算法参数,选择距离最小的k个点 18 | Returns: 19 | sortedClassCount[0][0] - 分类结果 20 | """ 21 | # 函数说明:kNN算法,分类器 22 | def classify0(inX, dataSet, labels, k): 23 | #numpy函数shape[0]返回dataSet的行数 24 | dataSetSize = dataSet.shape[0] 25 | #在列向量方向上重复inX共1次(横向),行向量方向上重复inX共dataSetSize次(纵向) 26 | diffMat = np.tile(inX, (dataSetSize, 1)) - dataSet 27 | #二维特征相减后平方 28 | sqDiffMat = diffMat**2 29 | #sum()所有元素相加,sum(0)列相加,sum(1)行相加 30 | sqDistances = sqDiffMat.sum(axis=1) 31 | #开方,计算出距离 32 | distances = sqDistances**0.5 33 | #返回distances中元素从小到大排序后的索引值 34 | sortedDistIndices = distances.argsort() 35 | #定一个记录类别次数的字典 36 | classCount = {} 37 | for i in range(k): 38 | #取出前k个元素的类别 39 | voteIlabel = labels[sortedDistIndices[i]] 40 | #dict.get(key,default=None),字典的get()方法,返回指定键的值,如果值不在字典中返回默认值。 41 | #计算类别次数 42 | classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1 43 | #python3中用items()替换python2中的iteritems() 44 | #key=operator.itemgetter(1)根据字典的值进行排序 45 | #key=operator.itemgetter(0)根据字典的键进行排序 46 | #reverse降序排序字典 47 | sortedClassCount = sorted(classCount.items(),key=operator.itemgetter(1),reverse=True) 48 | #返回次数最多的类别,即所要分类的类别 49 | return sortedClassCount[0][0] 50 | 51 | """ 52 | Parameters: 53 | filename - 文件名 54 | Returns: 55 | returnMat - 特征矩阵 56 | classLabelVector - 分类Label向量 57 | """ 58 | # 函数说明:打开并解析文件,对数据进行分类:1代表不喜欢,2代表魅力一般,3代表极具魅力 59 | def file2matrix(filename): 60 | #打开文件 61 | fr = open(filename) 62 | #读取文件所有内容 63 | arrayOLines = fr.readlines() 64 | #得到文件行数 65 | numberOfLines = len(arrayOLines) 66 | #返回的NumPy矩阵,解析完成的数据:numberOfLines行,3列 67 | returnMat = np.zeros((numberOfLines,3)) 68 | #返回的分类标签向量 69 | classLabelVector = [] 70 | #行的索引值 71 | index = 0 72 | for line in arrayOLines: 73 | #s.strip(rm),当rm空时,默认删除空白符(包括'\n','\r','\t',' ') 74 | line = line.strip() 75 | #使用s.split(str="",num=string,cout(str))将字符串根据'\t'分隔符进行切片。 76 | listFromLine = line.split('\t') 77 | #将数据前三列提取出来,存放到returnMat的NumPy矩阵中,也就是特征矩阵 78 | returnMat[index,:] = listFromLine[0:3] 79 | #根据文本中标记的喜欢的程度进行分类,1代表不喜欢,2代表魅力一般,3代表极具魅力 80 | if listFromLine[-1] == 'didntLike': 81 | classLabelVector.append(1) 82 | elif listFromLine[-1] == 'smallDoses': 83 | classLabelVector.append(2) 84 | elif listFromLine[-1] == 'largeDoses': 85 | classLabelVector.append(3) 86 | index += 1 87 | return returnMat, classLabelVector 88 | 89 | """ 90 | Parameters: 91 | dataSet - 特征矩阵 92 | Returns: 93 | normDataSet - 归一化后的特征矩阵 94 | ranges - 数据范围 95 | minVals - 数据最小值 96 | """ 97 | # 函数说明:对数据进行归一化 98 | def autoNorm(dataSet): 99 | #获得数据的最小值 100 | minVals = dataSet.min(0) 101 | maxVals = dataSet.max(0) 102 | #最大值和最小值的范围 103 | ranges = maxVals - minVals 104 | #shape(dataSet)返回dataSet的矩阵行列数 105 | normDataSet = np.zeros(np.shape(dataSet)) 106 | #返回dataSet的行数 107 | m = dataSet.shape[0] 108 | #原始值减去最小值 109 | normDataSet = dataSet - np.tile(minVals, (m, 1)) 110 | #除以最大和最小值的差,得到归一化数据 111 | normDataSet = normDataSet / np.tile(ranges, (m, 1)) 112 | #返回归一化数据结果,数据范围,最小值 113 | return normDataSet, ranges, minVals 114 | 115 | # 函数说明:通过输入一个人的三维特征,进行分类输出 116 | def classifyPerson(): 117 | #输出结果 118 | resultList = ['讨厌','有些喜欢','非常喜欢'] 119 | #三维特征用户输入 120 | precentTats = float(input("玩视频游戏所耗时间百分比:")) 121 | ffMiles = float(input("每年获得的飞行常客里程数:")) 122 | iceCream = float(input("每周消费的冰激淋公升数:")) 123 | #打开的文件名 124 | filename = "datingTestSet.txt" 125 | #打开并处理数据 126 | datingDataMat, datingLabels = file2matrix(filename) 127 | #训练集归一化 128 | normMat, ranges, minVals = autoNorm(datingDataMat) 129 | #生成NumPy数组,测试集 130 | inArr = np.array([precentTats, ffMiles, iceCream]) 131 | 
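    # Note (not in the original code): file2matrix() stores the columns in the order
    # (flight miles, game-time %, ice cream), so this input vector most likely needs to be
    # np.array([ffMiles, precentTats, iceCream]) for each feature to line up with the
    # training column it is normalized against below.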
#测试集归一化 132 | norminArr = (inArr - minVals) / ranges 133 | #返回分类结果 134 | classifierResult = classify0(norminArr, normMat, datingLabels, 3) 135 | #打印结果 136 | print("你可能%s这个人" % (resultList[classifierResult-1])) 137 | 138 | 139 | if __name__ == '__main__': 140 | classifyPerson() -------------------------------------------------------------------------------- /Ch02-KNN/2.3.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 13:41 4 | # @Author : GXl 5 | # @File : 2.3.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | import operator 11 | from os import listdir 12 | from sklearn.neighbors import KNeighborsClassifier as kNN 13 | 14 | """ 15 | Parameters: 16 | filename - 文件名 17 | Returns: 18 | returnVect - 返回的二进制图像的1x1024向量 19 | """ 20 | # 函数说明:将32x32的二进制图像转换为1x1024向量。 21 | def img2vector(filename): 22 | #创建1x1024零向量 23 | returnVect = np.zeros((1, 1024)) 24 | #打开文件 25 | fr = open(filename) 26 | #按行读取 27 | for i in range(32): 28 | #读一行数据 29 | lineStr = fr.readline() 30 | #每一行的前32个元素依次添加到returnVect中 31 | for j in range(32): 32 | returnVect[0, 32*i+j] = int(lineStr[j]) 33 | #返回转换后的1x1024向量 34 | return returnVect 35 | 36 | # 函数说明:手写数字分类测试 37 | def handwritingClassTest(): 38 | #测试集的Labels 39 | hwLabels = [] 40 | #返回trainingDigits目录下的文件名 41 | trainingFileList = listdir('trainingDigits') 42 | #返回文件夹下文件的个数 43 | m = len(trainingFileList) 44 | #初始化训练的Mat矩阵,测试集 45 | trainingMat = np.zeros((m, 1024)) 46 | #从文件名中解析出训练集的类别 47 | for i in range(m): 48 | #获得文件的名字 49 | fileNameStr = trainingFileList[i] 50 | #获得分类的数字 51 | classNumber = int(fileNameStr.split('_')[0]) 52 | #将获得的类别添加到hwLabels中 53 | hwLabels.append(classNumber) 54 | #将每一个文件的1x1024数据存储到trainingMat矩阵中 55 | trainingMat[i,:] = img2vector('trainingDigits/%s' % (fileNameStr)) 56 | #构建kNN分类器 57 | neigh = kNN(n_neighbors = 3, algorithm = 'auto') 58 | #拟合模型, trainingMat为测试矩阵,hwLabels为对应的标签 59 | neigh.fit(trainingMat, hwLabels) 60 | #返回testDigits目录下的文件列表 61 | testFileList = listdir('testDigits') 62 | #错误检测计数 63 | errorCount = 0.0 64 | #测试数据的数量 65 | mTest = len(testFileList) 66 | #从文件中解析出测试集的类别并进行分类测试 67 | for i in range(mTest): 68 | #获得文件的名字 69 | fileNameStr = testFileList[i] 70 | #获得分类的数字 71 | classNumber = int(fileNameStr.split('_')[0]) 72 | #获得测试集的1x1024向量,用于训练 73 | vectorUnderTest = img2vector('testDigits/%s' % (fileNameStr)) 74 | #获得预测结果 75 | # classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3) 76 | classifierResult = neigh.predict(vectorUnderTest) 77 | print("分类返回结果为%d\t真实结果为%d" % (classifierResult, classNumber)) 78 | if(classifierResult != classNumber): 79 | errorCount += 1.0 80 | print("总共错了%d个数据\n错误率为%f%%" % (errorCount, errorCount/mTest * 100)) 81 | 82 | 83 | if __name__ == '__main__': 84 | handwritingClassTest() -------------------------------------------------------------------------------- /Ch03-DecisionTree/3.2.1-1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 13:45 4 | # @Author : GXl 5 | # @File : 3.2.1-1.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | from math import log 10 | 11 | """ 12 | Parameters: 13 | 无 14 | Returns: 15 | dataSet - 数据集 16 | labels - 分类属性 17 | """ 18 | # 函数说明:创建测试数据集 19 | def createDataSet(): 20 | dataSet = [[0, 0, 0, 0, 'no'],#数据集 21 | [0, 0, 0, 1, 'no'], 22 | [0, 1, 0, 1, 'yes'], 23 | [0, 1, 1, 0, 'yes'], 24 | [0, 
0, 0, 0, 'no'], 25 | [1, 0, 0, 0, 'no'], 26 | [1, 0, 0, 1, 'no'], 27 | [1, 1, 1, 1, 'yes'], 28 | [1, 0, 1, 2, 'yes'], 29 | [1, 0, 1, 2, 'yes'], 30 | [2, 0, 1, 2, 'yes'], 31 | [2, 0, 1, 1, 'yes'], 32 | [2, 1, 0, 1, 'yes'], 33 | [2, 1, 0, 2, 'yes'], 34 | [2, 0, 0, 0, 'no']] 35 | labels = ['年龄', '有工作', '有自己的房子', '信贷情况']#分类属性 36 | return dataSet, labels#返回数据集和分类属性 37 | 38 | """ 39 | Parameters: 40 | dataSet - 数据集 41 | Returns: 42 | shannonEnt - 经验熵(香农熵) 43 | """ 44 | # 函数说明:计算给定数据集的经验熵(香农熵) 45 | def calcShannonEnt(dataSet): 46 | numEntires = len(dataSet) #返回数据集的行数 47 | labelCounts = {} #保存每个标签(Label)出现次数的字典 48 | for featVec in dataSet: #对每组特征向量进行统计 49 | currentLabel = featVec[-1] #提取标签(Label)信息 50 | if currentLabel not in labelCounts.keys(): #如果标签(Label)没有放入统计次数的字典,添加进去 51 | labelCounts[currentLabel] = 0 52 | labelCounts[currentLabel] += 1 #Label计数 53 | shannonEnt = 0.0 #经验熵(香农熵) 54 | for key in labelCounts: #计算香农熵 55 | prob = float(labelCounts[key]) / numEntires #选择该标签(Label)的概率 56 | shannonEnt -= prob * log(prob, 2) #利用公式计算 57 | return shannonEnt #返回经验熵(香农熵) 58 | 59 | 60 | if __name__ == '__main__': 61 | dataSet, features = createDataSet() 62 | print(dataSet) 63 | print(calcShannonEnt(dataSet)) -------------------------------------------------------------------------------- /Ch03-DecisionTree/3.2.1-2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 13:47 4 | # @Author : GXl 5 | # @File : 3.2.1-2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | from math import log 10 | 11 | """ 12 | Parameters: 13 | dataSet - 数据集 14 | Returns: 15 | shannonEnt - 经验熵(香农熵) 16 | """ 17 | # 函数说明:计算给定数据集的经验熵(香农熵) 18 | def calcShannonEnt(dataSet): 19 | numEntires = len(dataSet) #返回数据集的行数 20 | labelCounts = {} #保存每个标签(Label)出现次数的字典 21 | for featVec in dataSet: #对每组特征向量进行统计 22 | currentLabel = featVec[-1] #提取标签(Label)信息 23 | if currentLabel not in labelCounts.keys(): #如果标签(Label)没有放入统计次数的字典,添加进去 24 | labelCounts[currentLabel] = 0 25 | labelCounts[currentLabel] += 1 #Label计数 26 | shannonEnt = 0.0 #经验熵(香农熵) 27 | for key in labelCounts: #计算香农熵 28 | prob = float(labelCounts[key]) / numEntires #选择该标签(Label)的概率 29 | shannonEnt -= prob * log(prob, 2) #利用公式计算 30 | return shannonEnt #返回经验熵(香农熵) 31 | 32 | """ 33 | Parameters: 34 | 无 35 | Returns: 36 | dataSet - 数据集 37 | labels - 分类属性 38 | """ 39 | # 函数说明:创建测试数据集 40 | def createDataSet(): 41 | dataSet = [[0, 0, 0, 0, 'no'],#数据集 42 | [0, 0, 0, 1, 'no'], 43 | [0, 1, 0, 1, 'yes'], 44 | [0, 1, 1, 0, 'yes'], 45 | [0, 0, 0, 0, 'no'], 46 | [1, 0, 0, 0, 'no'], 47 | [1, 0, 0, 1, 'no'], 48 | [1, 1, 1, 1, 'yes'], 49 | [1, 0, 1, 2, 'yes'], 50 | [1, 0, 1, 2, 'yes'], 51 | [2, 0, 1, 2, 'yes'], 52 | [2, 0, 1, 1, 'yes'], 53 | [2, 1, 0, 1, 'yes'], 54 | [2, 1, 0, 2, 'yes'], 55 | [2, 0, 0, 0, 'no']] 56 | labels = ['年龄', '有工作', '有自己的房子', '信贷情况']#分类属性 57 | return dataSet, labels#返回数据集和分类属性 58 | 59 | """ 60 | Parameters: 61 | dataSet - 待划分的数据集 62 | axis - 划分数据集的特征 63 | value - 需要返回的特征的值 64 | Returns: 65 | 无 66 | """ 67 | # 函数说明:按照给定特征划分数据集 68 | def splitDataSet(dataSet, axis, value): 69 | retDataSet = [] #创建返回的数据集列表 70 | for featVec in dataSet: #遍历数据集 71 | if featVec[axis] == value: 72 | reducedFeatVec = featVec[:axis] #去掉axis特征 73 | reducedFeatVec.extend(featVec[axis+1:])#将符合条件的添加到返回的数据集 74 | retDataSet.append(reducedFeatVec) 75 | return retDataSet #返回划分后的数据集 76 | 77 | """ 78 | Parameters: 79 | dataSet - 数据集 80 | Returns: 81 | bestFeature - 信息增益最大的(最优)特征的索引值 82 | """ 83 
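# Worked check (a hand calculation from the 15-sample loan dataset above, not part of the original file):
# 9 of the 15 labels are 'yes' and 6 are 'no', so the base entropy is
#     H(D) = -(9/15)*log2(9/15) - (6/15)*log2(6/15) ≈ 0.971
# Splitting on feature 2 (有自己的房子) gives one subset of 6 samples (all 'yes', entropy 0)
# and one of 9 samples (3 'yes', 6 'no', entropy ≈ 0.918), so the conditional entropy is
# (6/15)*0 + (9/15)*0.918 ≈ 0.551 and the information gain is 0.971 - 0.551 ≈ 0.420,
# the largest of the four features, which is why index 2 is returned by the function below.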
| # 函数说明:选择最优特征 84 | def chooseBestFeatureToSplit(dataSet): 85 | numFeatures = len(dataSet[0]) - 1 #特征数量 86 | baseEntropy = calcShannonEnt(dataSet) #计算数据集的香农熵 87 | bestInfoGain = 0.0 #信息增益 88 | bestFeature = -1 #最优特征的索引值 89 | for i in range(numFeatures): #遍历所有特征 90 | #获取dataSet的第i个所有特征 91 | featList = [example[i] for example in dataSet] 92 | uniqueVals = set(featList) #创建set集合{},元素不可重复 93 | newEntropy = 0.0 #经验条件熵 94 | for value in uniqueVals: #计算信息增益 95 | subDataSet = splitDataSet(dataSet, i, value) #subDataSet划分后的子集 96 | prob = len(subDataSet) / float(len(dataSet)) #计算子集的概率 97 | newEntropy += prob * calcShannonEnt(subDataSet)#根据公式计算经验条件熵 98 | infoGain = baseEntropy - newEntropy #信息增益 99 | print("第%d个特征的增益为%.3f" % (i, infoGain)) #打印每个特征的信息增益 100 | if (infoGain > bestInfoGain): #计算信息增益 101 | bestInfoGain = infoGain #更新信息增益,找到最大的信息增益 102 | bestFeature = i #记录信息增益最大的特征的索引值 103 | return bestFeature #返回信息增益最大的特征的索引值 104 | 105 | 106 | if __name__ == '__main__': 107 | dataSet, features = createDataSet() 108 | print("最优特征索引值:" + str(chooseBestFeatureToSplit(dataSet))) -------------------------------------------------------------------------------- /Ch03-DecisionTree/3.2.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 13:52 4 | # @Author : GXl 5 | # @File : 3.2.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | from math import log 10 | import operator 11 | 12 | """ 13 | Parameters: 14 | dataSet - 数据集 15 | Returns: 16 | shannonEnt - 经验熵(香农熵) 17 | """ 18 | # 函数说明:计算给定数据集的经验熵(香农熵) 19 | def calcShannonEnt(dataSet): 20 | numEntires = len(dataSet) #返回数据集的行数 21 | labelCounts = {} #保存每个标签(Label)出现次数的字典 22 | for featVec in dataSet: #对每组特征向量进行统计 23 | currentLabel = featVec[-1] #提取标签(Label)信息 24 | if currentLabel not in labelCounts.keys(): #如果标签(Label)没有放入统计次数的字典,添加进去 25 | labelCounts[currentLabel] = 0 26 | labelCounts[currentLabel] += 1 #Label计数 27 | shannonEnt = 0.0 #经验熵(香农熵) 28 | for key in labelCounts: #计算香农熵 29 | prob = float(labelCounts[key]) / numEntires#选择该标签(Label)的概率 30 | shannonEnt -= prob * log(prob, 2) #利用公式计算 31 | return shannonEnt #返回经验熵(香农熵) 32 | 33 | """ 34 | Parameters: 35 | 无 36 | Returns: 37 | dataSet - 数据集 38 | labels - 特征标签 39 | """ 40 | # 函数说明:创建测试数据集 41 | def createDataSet(): 42 | dataSet = [[0, 0, 0, 0, 'no'],#数据集 43 | [0, 0, 0, 1, 'no'], 44 | [0, 1, 0, 1, 'yes'], 45 | [0, 1, 1, 0, 'yes'], 46 | [0, 0, 0, 0, 'no'], 47 | [1, 0, 0, 0, 'no'], 48 | [1, 0, 0, 1, 'no'], 49 | [1, 1, 1, 1, 'yes'], 50 | [1, 0, 1, 2, 'yes'], 51 | [1, 0, 1, 2, 'yes'], 52 | [2, 0, 1, 2, 'yes'], 53 | [2, 0, 1, 1, 'yes'], 54 | [2, 1, 0, 1, 'yes'], 55 | [2, 1, 0, 2, 'yes'], 56 | [2, 0, 0, 0, 'no']] 57 | labels = ['年龄', '有工作', '有自己的房子', '信贷情况']#特征标签 58 | return dataSet, labels#返回数据集和分类属性 59 | 60 | """ 61 | Parameters: 62 | dataSet - 待划分的数据集 63 | axis - 划分数据集的特征 64 | value - 需要返回的特征的值 65 | Returns: 66 | 无 67 | """ 68 | # 函数说明:按照给定特征划分数据集 69 | def splitDataSet(dataSet, axis, value): 70 | retDataSet = [] #创建返回的数据集列表 71 | for featVec in dataSet: #遍历数据集 72 | if featVec[axis] == value: 73 | reducedFeatVec = featVec[:axis] #去掉axis特征 74 | reducedFeatVec.extend(featVec[axis+1:])#将符合条件的添加到返回的数据集 75 | retDataSet.append(reducedFeatVec) 76 | return retDataSet #返回划分后的数据集 77 | 78 | """ 79 | Parameters: 80 | dataSet - 数据集 81 | Returns: 82 | bestFeature - 信息增益最大的(最优)特征的索引值 83 | """ 84 | # 函数说明:选择最优特征 85 | def chooseBestFeatureToSplit(dataSet): 86 | numFeatures = len(dataSet[0]) - 1 #特征数量 87 | 
baseEntropy = calcShannonEnt(dataSet) #计算数据集的香农熵 88 | bestInfoGain = 0.0 #信息增益 89 | bestFeature = -1 #最优特征的索引值 90 | for i in range(numFeatures): #遍历所有特征 91 | #获取dataSet的第i个所有特征 92 | featList = [example[i] for example in dataSet] 93 | uniqueVals = set(featList) #创建set集合{},元素不可重复 94 | newEntropy = 0.0 #经验条件熵 95 | for value in uniqueVals: #计算信息增益 96 | subDataSet = splitDataSet(dataSet, i, value) #subDataSet划分后的子集 97 | prob = len(subDataSet) / float(len(dataSet)) #计算子集的概率 98 | newEntropy += prob * calcShannonEnt(subDataSet)#根据公式计算经验条件熵 99 | infoGain = baseEntropy - newEntropy #信息增益 100 | # print("第%d个特征的增益为%.3f" % (i, infoGain)) #打印每个特征的信息增益 101 | if (infoGain > bestInfoGain): #计算信息增益 102 | bestInfoGain = infoGain #更新信息增益,找到最大的信息增益 103 | bestFeature = i #记录信息增益最大的特征的索引值 104 | return bestFeature #返回信息增益最大的特征的索引值 105 | 106 | """ 107 | Parameters: 108 | classList - 类标签列表 109 | Returns: 110 | sortedClassCount[0][0] - 出现此处最多的元素(类标签) 111 | """ 112 | # 函数说明:统计classList中出现此处最多的元素(类标签) 113 | def majorityCnt(classList): 114 | classCount = {} 115 | for vote in classList:#统计classList中每个元素出现的次数 116 | if vote not in classCount.keys():classCount[vote] = 0 117 | classCount[vote] += 1 118 | sortedClassCount = sorted(classCount.items(), key = operator.itemgetter(1), reverse = True)#根据字典的值降序排序 119 | return sortedClassCount[0][0]#返回classList中出现次数最多的元素 120 | 121 | """ 122 | Parameters: 123 | dataSet - 训练数据集 124 | labels - 分类属性标签 125 | featLabels - 存储选择的最优特征标签 126 | Returns: 127 | myTree - 决策树 128 | """ 129 | # 函数说明:创建决策树 130 | def createTree(dataSet, labels, featLabels): 131 | classList = [example[-1] for example in dataSet] #取分类标签(是否放贷:yes or no) 132 | if classList.count(classList[0]) == len(classList): #如果类别完全相同则停止继续划分 133 | return classList[0] 134 | if len(dataSet[0]) == 1: #遍历完所有特征时返回出现次数最多的类标签 135 | return majorityCnt(classList) 136 | bestFeat = chooseBestFeatureToSplit(dataSet) #选择最优特征 137 | bestFeatLabel = labels[bestFeat] #最优特征的标签 138 | featLabels.append(bestFeatLabel) 139 | myTree = {bestFeatLabel:{}} #根据最优特征的标签生成树 140 | del(labels[bestFeat]) #删除已经使用特征标签 141 | featValues = [example[bestFeat] for example in dataSet]#得到训练集中所有最优特征的属性值 142 | uniqueVals = set(featValues) #去掉重复的属性值 143 | for value in uniqueVals: #遍历特征,创建决策树。 144 | myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), labels, featLabels) 145 | return myTree 146 | 147 | 148 | if __name__ == '__main__': 149 | dataSet, labels = createDataSet() 150 | featLabels = [] 151 | myTree = createTree(dataSet, labels, featLabels) 152 | print(myTree) -------------------------------------------------------------------------------- /Ch03-DecisionTree/3.5.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 13:57 4 | # @Author : GXl 5 | # @File : 3.5.1.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import pickle 10 | 11 | """ 12 | Parameters: 13 | inputTree - 已经生成的决策树 14 | filename - 决策树的存储文件名 15 | Returns: 16 | 无 17 | """ 18 | # 函数说明:存储决策树 19 | def storeTree(inputTree, filename): 20 | with open(filename, 'wb') as fw: 21 | pickle.dump(inputTree, fw) 22 | 23 | 24 | if __name__ == '__main__': 25 | myTree = {'有自己的房子': {0: {'有工作': {0: 'no', 1: 'yes'}}, 1: 'yes'}} 26 | storeTree(myTree, 'classifierStorage.txt') -------------------------------------------------------------------------------- /Ch03-DecisionTree/3.5.2.py: -------------------------------------------------------------------------------- 1 | 
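# Note on the file below (not the author's code): grabTree() opens the pickle file without
# closing it. A minimal variant that releases the file handle would be, as an assumption:
#     def grabTree(filename):
#         with open(filename, 'rb') as fr:
#             return pickle.load(fr)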
#!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 13:58 4 | # @Author : GXl 5 | # @File : 3.5.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import pickle 10 | 11 | """ 12 | Parameters: 13 | filename - 决策树的存储文件名 14 | Returns: 15 | pickle.load(fr) - 决策树字典 16 | """ 17 | # 函数说明:读取决策树 18 | def grabTree(filename): 19 | fr = open(filename, 'rb') 20 | return pickle.load(fr) 21 | 22 | 23 | if __name__ == '__main__': 24 | myTree = grabTree('classifierStorage.txt') 25 | print(myTree) -------------------------------------------------------------------------------- /Ch03-DecisionTree/3.6.2-1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 13:58 4 | # @Author : GXl 5 | # @File : 3.6.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import pandas as pd 10 | 11 | if __name__ == '__main__': 12 | with open('lenses.txt', 'r') as fr: #加载文件 13 | lenses = [inst.strip().split('\t') for inst in fr.readlines()]#处理文件 14 | lenses_target = [] #提取每组数据的类别,保存在列表里 15 | for each in lenses: 16 | lenses_target.append(each[-1]) 17 | 18 | lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate'] #特征标签 19 | lenses_list = [] #保存lenses数据的临时列表 20 | lenses_dict = {} #保存lenses数据的字典,用于生成pandas 21 | for each_label in lensesLabels: #提取信息,生成字典 22 | for each in lenses: 23 | lenses_list.append(each[lensesLabels.index(each_label)]) 24 | lenses_dict[each_label] = lenses_list 25 | lenses_list = [] 26 | print(lenses_dict) #打印字典信息 27 | lenses_pd = pd.DataFrame(lenses_dict) #生成pandas.DataFrame 28 | print(lenses_pd) -------------------------------------------------------------------------------- /Ch03-DecisionTree/3.6.2-2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 13:59 4 | # @Author : GXl 5 | # @File : 3.6.2-2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import pandas as pd 10 | from sklearn.preprocessing import LabelEncoder 11 | 12 | if __name__ == '__main__': 13 | with open('lenses.txt', 'r') as fr: #加载文件 14 | lenses = [inst.strip().split('\t') for inst in fr.readlines()]#处理文件 15 | lenses_target = [] #提取每组数据的类别,保存在列表里 16 | for each in lenses: 17 | lenses_target.append(each[-1]) 18 | 19 | lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate'] #特征标签 20 | lenses_list = [] #保存lenses数据的临时列表 21 | lenses_dict = {} #保存lenses数据的字典,用于生成pandas 22 | for each_label in lensesLabels: #提取信息,生成字典 23 | for each in lenses: 24 | lenses_list.append(each[lensesLabels.index(each_label)]) 25 | lenses_dict[each_label] = lenses_list 26 | lenses_list = [] 27 | # print(lenses_dict) #打印字典信息 28 | lenses_pd = pd.DataFrame(lenses_dict) #生成pandas.DataFrame 29 | print(lenses_pd) #打印pandas.DataFrame 30 | le = LabelEncoder() #创建LabelEncoder()对象,用于序列化 31 | for col in lenses_pd.columns: #为每一列序列化 32 | lenses_pd[col] = le.fit_transform(lenses_pd[col]) 33 | print(lenses_pd) -------------------------------------------------------------------------------- /Ch03-DecisionTree/3.6.2-3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 14:01 4 | # @Author : GXl 5 | # @File : 3.6.2-3.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import pandas as pd 10 | import numpy as np 11 | import matplotlib.pyplot as plt 12 | 13 | from 
sklearn.preprocessing import LabelEncoder, OneHotEncoder 14 | from sklearn.externals.six import StringIO 15 | from sklearn import tree 16 | 17 | if __name__ == '__main__': 18 | with open('lenses.txt', 'r') as fr: #加载文件 19 | lenses = [inst.strip().split('\t') for inst in fr.readlines()]#处理文件 20 | lenses_target = [] #提取每组数据的类别,保存在列表里 21 | for each in lenses: 22 | lenses_target.append(each[-1]) 23 | print(lenses_target) 24 | 25 | lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate'] #特征标签 26 | lenses_list = [] #保存lenses数据的临时列表 27 | lenses_dict = {} #保存lenses数据的字典,用于生成pandas 28 | for each_label in lensesLabels: #提取信息,生成字典 29 | for each in lenses: 30 | lenses_list.append(each[lensesLabels.index(each_label)]) 31 | lenses_dict[each_label] = lenses_list 32 | lenses_list = [] 33 | # print(lenses_dict) #打印字典信息 34 | lenses_pd = pd.DataFrame(lenses_dict) #生成pandas.DataFrame 35 | # print(lenses_pd) #打印pandas.DataFrame 36 | le = LabelEncoder() #创建LabelEncoder()对象,用于序列化 37 | for col in lenses_pd.columns: #序列化 38 | lenses_pd[col] = le.fit_transform(lenses_pd[col]) 39 | # print(lenses_pd) #打印编码信息 40 | 41 | clf = tree.DecisionTreeClassifier(max_depth = 4) #创建DecisionTreeClassifier()类 42 | # clf = clf.fit(lenses_pd.values.tolist(), lenses_target) #使用数据,构建决策树 43 | tree.plot_tree(clf.fit(lenses_pd.values.tolist(), lenses_target), filled=True) 44 | plt.show() -------------------------------------------------------------------------------- /Ch04-NaiveBayes/4.7.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 14:06 4 | # @Author : GXl 5 | # @File : 4.7.1.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | ''' 10 | Parameters: 11 | 无 12 | Returns: 13 | postingList - 实验样本切分的词条 14 | classVec - 类别标签向量 15 | ''' 16 | # 函数说明:创建实验样本 17 | def loadDataSet(): 18 | postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], #切分的词条 19 | ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 20 | ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 21 | ['stop', 'posting', 'stupid', 'worthless', 'garbage'], 22 | ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 23 | ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']] 24 | classVec = [0,1,0,1,0,1]#类别标签向量,1代表侮辱性词汇,0代表不是 25 | return postingList,classVec 26 | 27 | ''' 28 | Parameters: 29 | vocabList - createVocabList返回的列表 30 | inputSet - 切分的词条列表 31 | Returns: 32 | returnVec - 文档向量,词集模型 33 | ''' 34 | # 函数说明:根据vocabList词汇表,将inputSet向量化,向量的每个元素为1或0 35 | def setOfWords2Vec(vocabList, inputSet): 36 | returnVec = [0] * len(vocabList) #创建一个其中所含元素都为0的向量 37 | for word in inputSet: #遍历每个词条 38 | if word in vocabList: #如果词条存在于词汇表中,则置1 39 | returnVec[vocabList.index(word)] = 1 40 | else: print("the word: %s is not in my Vocabulary!" 
% word) 41 | return returnVec #返回文档向量 42 | 43 | ''' 44 | Parameters: 45 | dataSet - 整理的样本数据集 46 | Returns: 47 | vocabSet - 返回不重复的词条列表,也就是词汇表 48 | ''' 49 | # 函数说明:将切分的实验样本词条整理成不重复的词条列表,也就是词汇表 50 | def createVocabList(dataSet): 51 | vocabSet = set([]) #创建一个空的不重复列表 52 | for document in dataSet: 53 | vocabSet = vocabSet | set(document) #取并集 54 | return list(vocabSet) 55 | 56 | 57 | if __name__ == '__main__': 58 | postingList, classVec = loadDataSet() 59 | print('postingList:\n',postingList) 60 | myVocabList = createVocabList(postingList) 61 | print('myVocabList:\n',myVocabList) 62 | trainMat = [] 63 | for postinDoc in postingList: 64 | trainMat.append(setOfWords2Vec(myVocabList, postinDoc)) 65 | print('trainMat:\n', trainMat) -------------------------------------------------------------------------------- /Ch04-NaiveBayes/4.7.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 14:07 4 | # @Author : GXl 5 | # @File : 4.7.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | 11 | ''' 12 | Parameters: 13 | 无 14 | Returns: 15 | postingList - 实验样本切分的词条 16 | classVec - 类别标签向量 17 | ''' 18 | # 函数说明:创建实验样本 19 | def loadDataSet(): 20 | postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], #切分的词条 21 | ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 22 | ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 23 | ['stop', 'posting', 'stupid', 'worthless', 'garbage'], 24 | ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 25 | ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']] 26 | classVec = [0,1,0,1,0,1]#类别标签向量,1代表侮辱性词汇,0代表不是 27 | return postingList,classVec 28 | 29 | ''' 30 | Parameters: 31 | vocabList - createVocabList返回的列表 32 | inputSet - 切分的词条列表 33 | Returns: 34 | returnVec - 文档向量,词集模型 35 | ''' 36 | # 函数说明:根据vocabList词汇表,将inputSet向量化,向量的每个元素为1或0 37 | def setOfWords2Vec(vocabList, inputSet): 38 | returnVec = [0] * len(vocabList) #创建一个其中所含元素都为0的向量 39 | for word in inputSet: #遍历每个词条 40 | if word in vocabList: #如果词条存在于词汇表中,则置1 41 | returnVec[vocabList.index(word)] = 1 42 | else: print("the word: %s is not in my Vocabulary!" 
% word) 43 | return returnVec #返回文档向量 44 | 45 | ''' 46 | Parameters: 47 | dataSet - 整理的样本数据集 48 | Returns: 49 | vocabSet - 返回不重复的词条列表,也就是词汇表 50 | ''' 51 | # 函数说明:将切分的实验样本词条整理成不重复的词条列表,也就是词汇表 52 | def createVocabList(dataSet): 53 | vocabSet = set([]) #创建一个空的不重复列表 54 | for document in dataSet: 55 | vocabSet = vocabSet | set(document) #取并集 56 | return list(vocabSet) 57 | 58 | ''' 59 | Parameters: 60 | trainMatrix - 训练文档矩阵,即setOfWords2Vec返回的returnVec构成的矩阵 61 | trainCategory - 训练类别标签向量,即loadDataSet返回的classVec 62 | Returns: 63 | p0Vect - 侮辱类的条件概率数组 64 | p1Vect - 非侮辱类的条件概率数组 65 | pAbusive - 文档属于侮辱类的概率 66 | ''' 67 | # 函数说明:朴素贝叶斯分类器训练函数 68 | def trainNB0(trainMatrix,trainCategory): 69 | numTrainDocs = len(trainMatrix) #计算训练的文档数目 70 | numWords = len(trainMatrix[0]) #计算每篇文档的词条数 71 | pAbusive = sum(trainCategory)/float(numTrainDocs) #文档属于侮辱类的概率 72 | p0Num = np.zeros(numWords); p1Num = np.zeros(numWords)#创建numpy.zeros数组,词条出现数初始化为0 73 | p0Denom = 0.0; p1Denom = 0.0 #分母初始化为0 74 | for i in range(numTrainDocs): 75 | if trainCategory[i] == 1: #统计属于侮辱类的条件概率所需的数据,即P(w0|1),P(w1|1),P(w2|1)··· 76 | p1Num += trainMatrix[i] 77 | p1Denom += sum(trainMatrix[i]) 78 | else: #统计属于非侮辱类的条件概率所需的数据,即P(w0|0),P(w1|0),P(w2|0)··· 79 | p0Num += trainMatrix[i] 80 | p0Denom += sum(trainMatrix[i]) 81 | p1Vect = p1Num/p1Denom 82 | p0Vect = p0Num/p0Denom 83 | return p0Vect,p1Vect,pAbusive#返回属于侮辱类的条件概率数组,属于非侮辱类的条件概率数组,文档属于侮辱类的概率 84 | 85 | 86 | if __name__ == '__main__': 87 | postingList, classVec = loadDataSet() 88 | myVocabList = createVocabList(postingList) 89 | print('myVocabList:\n', myVocabList) 90 | trainMat = [] 91 | for postinDoc in postingList: 92 | trainMat.append(setOfWords2Vec(myVocabList, postinDoc)) 93 | p0V, p1V, pAb = trainNB0(trainMat, classVec) 94 | print('p0V:\n', p0V) 95 | print('p1V:\n', p1V) 96 | print('classVec:\n', classVec) 97 | print('pAb:\n', pAb) -------------------------------------------------------------------------------- /Ch04-NaiveBayes/4.7.3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 14:08 4 | # @Author : GXl 5 | # @File : 4.7.3.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | 11 | ''' 12 | Parameters: 13 | 无 14 | Returns: 15 | postingList - 实验样本切分的词条 16 | classVec - 类别标签向量 17 | ''' 18 | # 函数说明:创建实验样本 19 | def loadDataSet(): 20 | postingList=[['my', 'dog', 'has', 'flea', 'problems', 'help', 'please'], #切分的词条 21 | ['maybe', 'not', 'take', 'him', 'to', 'dog', 'park', 'stupid'], 22 | ['my', 'dalmation', 'is', 'so', 'cute', 'I', 'love', 'him'], 23 | ['stop', 'posting', 'stupid', 'worthless', 'garbage'], 24 | ['mr', 'licks', 'ate', 'my', 'steak', 'how', 'to', 'stop', 'him'], 25 | ['quit', 'buying', 'worthless', 'dog', 'food', 'stupid']] 26 | classVec = [0,1,0,1,0,1]#类别标签向量,1代表侮辱性词汇,0代表不是 27 | return postingList,classVec 28 | 29 | ''' 30 | Parameters: 31 | vocabList - createVocabList返回的列表 32 | inputSet - 切分的词条列表 33 | Returns: 34 | returnVec - 文档向量,词集模型 35 | ''' 36 | # 函数说明:根据vocabList词汇表,将inputSet向量化,向量的每个元素为1或0 37 | def setOfWords2Vec(vocabList, inputSet): 38 | returnVec = [0] * len(vocabList) #创建一个其中所含元素都为0的向量 39 | for word in inputSet: #遍历每个词条 40 | if word in vocabList: #如果词条存在于词汇表中,则置1 41 | returnVec[vocabList.index(word)] = 1 42 | else: print("the word: %s is not in my Vocabulary!" 
% word) 43 | return returnVec #返回文档向量 44 | 45 | ''' 46 | Parameters: 47 | dataSet - 整理的样本数据集 48 | Returns: 49 | vocabSet - 返回不重复的词条列表,也就是词汇表 50 | ''' 51 | # 函数说明:将切分的实验样本词条整理成不重复的词条列表,也就是词汇表 52 | def createVocabList(dataSet): 53 | vocabSet = set([]) #创建一个空的不重复列表 54 | for document in dataSet: 55 | vocabSet = vocabSet | set(document) #取并集 56 | return list(vocabSet) 57 | 58 | ''' 59 | Parameters: 60 | trainMatrix - 训练文档矩阵,即setOfWords2Vec返回的returnVec构成的矩阵 61 | trainCategory - 训练类别标签向量,即loadDataSet返回的classVec 62 | Returns: 63 | p0Vect - 侮辱类的条件概率数组 64 | p1Vect - 非侮辱类的条件概率数组 65 | pAbusive - 文档属于侮辱类的概率 66 | ''' 67 | # 函数说明:朴素贝叶斯分类器训练函数 68 | def trainNB0(trainMatrix,trainCategory): 69 | numTrainDocs = len(trainMatrix) #计算训练的文档数目 70 | numWords = len(trainMatrix[0]) #计算每篇文档的词条数 71 | pAbusive = sum(trainCategory)/float(numTrainDocs) #文档属于侮辱类的概率 72 | p0Num = np.ones(numWords); p1Num = np.ones(numWords)#创建numpy.ones数组,词条出现数初始化为1,拉普拉斯平滑 73 | p0Denom = 2.0; p1Denom = 2.0 #分母初始化为2,拉普拉斯平滑 74 | for i in range(numTrainDocs): 75 | if trainCategory[i] == 1:#统计属于侮辱类的条件概率所需的数据,即P(w0|1),P(w1|1),P(w2|1)··· 76 | p1Num += trainMatrix[i] 77 | p1Denom += sum(trainMatrix[i]) 78 | else: #统计属于非侮辱类的条件概率所需的数据,即P(w0|0),P(w1|0),P(w2|0)··· 79 | p0Num += trainMatrix[i] 80 | p0Denom += sum(trainMatrix[i]) 81 | p1Vect = np.log(p1Num/p1Denom) #取对数,防止下溢出 82 | p0Vect = np.log(p0Num/p0Denom) 83 | #返回属于侮辱类的条件概率数组,属于非侮辱类的条件概率数组,文档属于侮辱类的概率 84 | return p0Vect,p1Vect,pAbusive 85 | 86 | if __name__ == '__main__': 87 | postingList, classVec = loadDataSet() 88 | myVocabList = createVocabList(postingList) 89 | print('myVocabList:\n', myVocabList) 90 | trainMat = [] 91 | for postinDoc in postingList: 92 | trainMat.append(setOfWords2Vec(myVocabList, postinDoc)) 93 | p0V, p1V, pAb = trainNB0(trainMat, classVec) 94 | print('p0V:\n', p0V) 95 | print('p1V:\n', p1V) 96 | print('classVec:\n', classVec) 97 | print('pAb:\n', pAb) -------------------------------------------------------------------------------- /Ch04-NaiveBayes/4.8.1.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import re 3 | 4 | # 函数说明:接收一个大字符串并将其解析为字符串列表 5 | def textParse(bigString): #将字符串转换为字符列表 6 | #将特殊符号作为切分标志进行字符串切分,即非字母、非数字 7 | listOfTokens = re.split(r'\W*', bigString) 8 | return [tok.lower() for tok in listOfTokens if len(tok) > 2]#除了单个字母,例如大写的I,其它单词变成小写 9 | 10 | ''' 11 | Parameters: 12 | dataSet - 整理的样本数据集 13 | Returns: 14 | vocabSet - 返回不重复的词条列表,也就是词汇表 15 | ''' 16 | # 函数说明:将切分的实验样本词条整理成不重复的词条列表,也就是词汇表 17 | def createVocabList(dataSet): 18 | vocabSet = set([]) #创建一个空的不重复列表 19 | for document in dataSet: 20 | vocabSet = vocabSet | set(document) #取并集 21 | return list(vocabSet) 22 | 23 | if __name__ == '__main__': 24 | docList = []; classList = [] 25 | for i in range(1, 26): #遍历25个txt文件 26 | wordList = textParse(open('email/spam/%d.txt' % i, 'r').read())#读取每个垃圾邮件,并字符串转换成字符串列表 27 | docList.append(wordList) 28 | classList.append(1) #标记垃圾邮件,1表示垃圾文件 29 | wordList = textParse(open('email/ham/%d.txt' % i, 'r').read()) #读取每个非垃圾邮件,并字符串转换成字符串列表 30 | docList.append(wordList) 31 | classList.append(0) #标记非垃圾邮件,1表示垃圾文件 32 | vocabList = createVocabList(docList) #创建词汇表,不重复 33 | print(vocabList) -------------------------------------------------------------------------------- /Ch04-NaiveBayes/4.9.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 14:24 4 | # @Author : GXl 5 | # @File : 4.9.1.py 6 | # @Software: win10 
Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | # -*- coding: UTF-8 -*- 10 | import os 11 | import jieba 12 | 13 | def TextProcessing(folder_path): 14 | folder_list = os.listdir(folder_path) #查看folder_path下的文件 15 | data_list = [] #训练集 16 | class_list = [] 17 | 18 | #遍历每个子文件夹 19 | for folder in folder_list: 20 | new_folder_path = os.path.join(folder_path, folder)#根据子文件夹,生成新的路径 21 | files = os.listdir(new_folder_path) #存放子文件夹下的txt文件的列表 22 | 23 | j = 1 24 | #遍历每个txt文件 25 | for file in files: 26 | if j > 100: #每类txt样本数最多100个 27 | break 28 | with open(os.path.join(new_folder_path, file), 'r', encoding = 'utf-8') as f:#打开txt文件 29 | raw = f.read() 30 | 31 | word_cut = jieba.cut(raw, cut_all = False) #精简模式,返回一个可迭代的generator 32 | word_list = list(word_cut) #generator转换为list 33 | 34 | data_list.append(word_list) 35 | class_list.append(folder) 36 | j += 1 37 | print(data_list) 38 | print(class_list) 39 | 40 | 41 | if __name__ == '__main__': 42 | #文本预处理 43 | folder_path = './SogouC/Sample' #训练集存放地址 44 | TextProcessing(folder_path) -------------------------------------------------------------------------------- /Ch04-NaiveBayes/4.9.2-1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 14:25 4 | # @Author : GXl 5 | # @File : 4.9.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | # -*- coding: UTF-8 -*- 10 | import os 11 | import random 12 | import jieba 13 | 14 | """ 15 | 函数说明:中文文本处理 16 | 17 | Parameters: 18 | folder_path - 文本存放的路径 19 | test_size - 测试集占比,默认占所有数据集的百分之20 20 | Returns: 21 | all_words_list - 按词频降序排序的训练集列表 22 | train_data_list - 训练集列表 23 | test_data_list - 测试集列表 24 | train_class_list - 训练集标签列表 25 | test_class_list - 测试集标签列表 26 | """ 27 | def TextProcessing(folder_path, test_size = 0.2): 28 | folder_list = os.listdir(folder_path) #查看folder_path下的文件 29 | data_list = [] #数据集数据 30 | class_list = [] #数据集类别 31 | 32 | #遍历每个子文件夹 33 | for folder in folder_list: 34 | new_folder_path = os.path.join(folder_path, folder) #根据子文件夹,生成新的路径 35 | files = os.listdir(new_folder_path) #存放子文件夹下的txt文件的列表 36 | 37 | j = 1 38 | #遍历每个txt文件 39 | for file in files: 40 | if j > 100: #每类txt样本数最多100个 41 | break 42 | with open(os.path.join(new_folder_path, file), 'r', encoding = 'utf-8') as f: #打开txt文件 43 | raw = f.read() 44 | 45 | word_cut = jieba.cut(raw, cut_all = False) #精简模式,返回一个可迭代的generator 46 | word_list = list(word_cut) #generator转换为list 47 | 48 | data_list.append(word_list) #添加数据集数据 49 | class_list.append(folder) #添加数据集类别 50 | j += 1 51 | 52 | data_class_list = list(zip(data_list, class_list)) #zip压缩合并,将数据与标签对应压缩 53 | random.shuffle(data_class_list) #将data_class_list乱序 54 | index = int(len(data_class_list) * test_size) + 1 #训练集和测试集切分的索引值 55 | train_list = data_class_list[index:] #训练集 56 | test_list = data_class_list[:index] #测试集 57 | train_data_list, train_class_list = zip(*train_list) #训练集解压缩 58 | test_data_list, test_class_list = zip(*test_list) #测试集解压缩 59 | 60 | all_words_dict = {} #统计训练集词频 61 | for word_list in train_data_list: 62 | for word in word_list: 63 | if word in all_words_dict.keys(): 64 | all_words_dict[word] += 1 65 | else: 66 | all_words_dict[word] = 1 67 | 68 | #根据键的值倒序排序 69 | all_words_tuple_list = sorted(all_words_dict.items(), key = lambda f:f[1], reverse = True) 70 | all_words_list, all_words_nums = zip(*all_words_tuple_list) #解压缩 71 | all_words_list = list(all_words_list) #转换成列表 72 | return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list 
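
# A minimal alternative sketch (an assumption, not part of the original file): the manual
# shuffle-and-slice split above can also be done with scikit-learn; `stratify` keeps roughly
# the same class proportions in the train and test sets. `data_list` and `class_list` are
# assumed to be the lists built inside TextProcessing() before it splits them.
from sklearn.model_selection import train_test_split

def split_lists(data_list, class_list, test_size=0.2, seed=None):
    # Returns the four lists in the same order TextProcessing() returns its split results.
    train_data, test_data, train_class, test_class = train_test_split(
        data_list, class_list, test_size=test_size, random_state=seed, stratify=class_list)
    return train_data, test_data, train_class, test_class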
73 | 74 | if __name__ == '__main__': 75 | #文本预处理 76 | folder_path = './SogouC/Sample' #训练集存放地址 77 | all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2) 78 | print(all_words_list) -------------------------------------------------------------------------------- /Ch04-NaiveBayes/4.9.2-2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 14:26 4 | # @Author : GXl 5 | # @File : 4.9.3.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | # -*- coding: UTF-8 -*- 10 | import os 11 | import random 12 | import jieba 13 | 14 | """ 15 | 函数说明:中文文本处理 16 | 17 | Parameters: 18 | folder_path - 文本存放的路径 19 | test_size - 测试集占比,默认占所有数据集的百分之20 20 | Returns: 21 | all_words_list - 按词频降序排序的训练集列表 22 | train_data_list - 训练集列表 23 | test_data_list - 测试集列表 24 | train_class_list - 训练集标签列表 25 | test_class_list - 测试集标签列表 26 | """ 27 | def TextProcessing(folder_path, test_size = 0.2): 28 | folder_list = os.listdir(folder_path) #查看folder_path下的文件 29 | data_list = [] #数据集数据 30 | class_list = [] #数据集类别 31 | 32 | #遍历每个子文件夹 33 | for folder in folder_list: 34 | new_folder_path = os.path.join(folder_path, folder) #根据子文件夹,生成新的路径 35 | files = os.listdir(new_folder_path) #存放子文件夹下的txt文件的列表 36 | 37 | j = 1 38 | #遍历每个txt文件 39 | for file in files: 40 | if j > 100: #每类txt样本数最多100个 41 | break 42 | with open(os.path.join(new_folder_path, file), 'r', encoding = 'utf-8') as f: #打开txt文件 43 | raw = f.read() 44 | 45 | word_cut = jieba.cut(raw, cut_all = False) #精简模式,返回一个可迭代的generator 46 | word_list = list(word_cut) #generator转换为list 47 | 48 | data_list.append(word_list) #添加数据集数据 49 | class_list.append(folder) #添加数据集类别 50 | j += 1 51 | 52 | data_class_list = list(zip(data_list, class_list)) #zip压缩合并,将数据与标签对应压缩 53 | random.shuffle(data_class_list) #将data_class_list乱序 54 | index = int(len(data_class_list) * test_size) + 1 #训练集和测试集切分的索引值 55 | train_list = data_class_list[index:] #训练集 56 | test_list = data_class_list[:index] #测试集 57 | train_data_list, train_class_list = zip(*train_list) #训练集解压缩 58 | test_data_list, test_class_list = zip(*test_list) #测试集解压缩 59 | 60 | all_words_dict = {} #统计训练集词频 61 | for word_list in train_data_list: 62 | for word in word_list: 63 | if word in all_words_dict.keys(): 64 | all_words_dict[word] += 1 65 | else: 66 | all_words_dict[word] = 1 67 | 68 | #根据键的值倒序排序 69 | all_words_tuple_list = sorted(all_words_dict.items(), key = lambda f:f[1], reverse = True) 70 | all_words_list, all_words_nums = zip(*all_words_tuple_list) #解压缩 71 | all_words_list = list(all_words_list) #转换成列表 72 | return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list 73 | 74 | """ 75 | 函数说明:读取文件里的内容,并去重 76 | 77 | Parameters: 78 | words_file - 文件路径 79 | Returns: 80 | words_set - 读取的内容的set集合 81 | """ 82 | def MakeWordsSet(words_file): 83 | words_set = set() #创建set集合 84 | with open(words_file, 'r', encoding = 'utf-8') as f: #打开文件 85 | for line in f.readlines(): #一行一行读取 86 | word = line.strip() #去回车 87 | if len(word) > 0: #有文本,则添加到words_set中 88 | words_set.add(word) 89 | return words_set #返回处理结果 90 | 91 | """ 92 | 函数说明:文本特征选取 93 | 94 | Parameters: 95 | all_words_list - 训练集所有文本列表 96 | deleteN - 删除词频最高的deleteN个词 97 | stopwords_set - 指定的结束语 98 | Returns: 99 | feature_words - 特征集 100 | """ 101 | def words_dict(all_words_list, deleteN, stopwords_set = set()): 102 | feature_words = [] #特征列表 103 | n = 1 104 | for t in range(deleteN, 
len(all_words_list), 1): 105 | if n > 1000: #feature_words的维度为1000 106 | break 107 | #如果这个词不是数字,并且不是指定的结束语,并且单词长度大于1小于5,那么这个词就可以作为特征词 108 | if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5: 109 | feature_words.append(all_words_list[t]) 110 | n += 1 111 | return feature_words 112 | 113 | if __name__ == '__main__': 114 | #文本预处理 115 | folder_path = './SogouC/Sample' #训练集存放地址 116 | all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2) 117 | 118 | #生成stopwords_set 119 | stopwords_file = './stopwords_cn.txt' 120 | stopwords_set = MakeWordsSet(stopwords_file) 121 | 122 | feature_words = words_dict(all_words_list, 100, stopwords_set) 123 | print(feature_words) -------------------------------------------------------------------------------- /Ch04-NaiveBayes/4.9.2-4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 14:27 4 | # @Author : GXl 5 | # @File : 4.9.5.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | # -*- coding: UTF-8 -*- 10 | from sklearn.naive_bayes import MultinomialNB 11 | import matplotlib.pyplot as plt 12 | import os 13 | import random 14 | import jieba 15 | 16 | """ 17 | 函数说明:中文文本处理 18 | 19 | Parameters: 20 | folder_path - 文本存放的路径 21 | test_size - 测试集占比,默认占所有数据集的百分之20 22 | Returns: 23 | all_words_list - 按词频降序排序的训练集列表 24 | train_data_list - 训练集列表 25 | test_data_list - 测试集列表 26 | train_class_list - 训练集标签列表 27 | test_class_list - 测试集标签列表 28 | """ 29 | def TextProcessing(folder_path, test_size = 0.2): 30 | folder_list = os.listdir(folder_path) #查看folder_path下的文件 31 | data_list = [] #数据集数据 32 | class_list = [] #数据集类别 33 | 34 | #遍历每个子文件夹 35 | for folder in folder_list: 36 | new_folder_path = os.path.join(folder_path, folder) #根据子文件夹,生成新的路径 37 | files = os.listdir(new_folder_path) #存放子文件夹下的txt文件的列表 38 | 39 | j = 1 40 | #遍历每个txt文件 41 | for file in files: 42 | if j > 100: #每类txt样本数最多100个 43 | break 44 | with open(os.path.join(new_folder_path, file), 'r', encoding = 'utf-8') as f: #打开txt文件 45 | raw = f.read() 46 | 47 | word_cut = jieba.cut(raw, cut_all = False) #精简模式,返回一个可迭代的generator 48 | word_list = list(word_cut) #generator转换为list 49 | 50 | data_list.append(word_list) #添加数据集数据 51 | class_list.append(folder) #添加数据集类别 52 | j += 1 53 | 54 | data_class_list = list(zip(data_list, class_list)) #zip压缩合并,将数据与标签对应压缩 55 | random.shuffle(data_class_list) #将data_class_list乱序 56 | index = int(len(data_class_list) * test_size) + 1 #训练集和测试集切分的索引值 57 | train_list = data_class_list[index:] #训练集 58 | test_list = data_class_list[:index] #测试集 59 | train_data_list, train_class_list = zip(*train_list) #训练集解压缩 60 | test_data_list, test_class_list = zip(*test_list) #测试集解压缩 61 | 62 | all_words_dict = {} #统计训练集词频 63 | for word_list in train_data_list: 64 | for word in word_list: 65 | if word in all_words_dict.keys(): 66 | all_words_dict[word] += 1 67 | else: 68 | all_words_dict[word] = 1 69 | 70 | #根据键的值倒序排序 71 | all_words_tuple_list = sorted(all_words_dict.items(), key = lambda f:f[1], reverse = True) 72 | all_words_list, all_words_nums = zip(*all_words_tuple_list) #解压缩 73 | all_words_list = list(all_words_list) #转换成列表 74 | return all_words_list, train_data_list, test_data_list, train_class_list, test_class_list 75 | 76 | """ 77 | 函数说明:读取文件里的内容,并去重 78 | 79 | Parameters: 80 | words_file - 文件路径 81 | Returns: 82 | words_set - 读取的内容的set集合 83 | """ 84 | 
def MakeWordsSet(words_file): 85 | words_set = set() #创建set集合 86 | with open(words_file, 'r', encoding = 'utf-8') as f: #打开文件 87 | for line in f.readlines(): #一行一行读取 88 | word = line.strip() #去回车 89 | if len(word) > 0: #有文本,则添加到words_set中 90 | words_set.add(word) 91 | return words_set #返回处理结果 92 | 93 | """ 94 | 函数说明:根据feature_words将文本向量化 95 | 96 | Parameters: 97 | train_data_list - 训练集 98 | test_data_list - 测试集 99 | feature_words - 特征集 100 | Returns: 101 | train_feature_list - 训练集向量化列表 102 | test_feature_list - 测试集向量化列表 103 | """ 104 | def TextFeatures(train_data_list, test_data_list, feature_words): 105 | def text_features(text, feature_words): #出现在特征集中,则置1 106 | text_words = set(text) 107 | features = [1 if word in text_words else 0 for word in feature_words] 108 | return features 109 | train_feature_list = [text_features(text, feature_words) for text in train_data_list] 110 | test_feature_list = [text_features(text, feature_words) for text in test_data_list] 111 | return train_feature_list, test_feature_list #返回结果 112 | 113 | 114 | """ 115 | 函数说明:文本特征选取 116 | 117 | Parameters: 118 | all_words_list - 训练集所有文本列表 119 | deleteN - 删除词频最高的deleteN个词 120 | stopwords_set - 指定的结束语 121 | Returns: 122 | feature_words - 特征集 123 | """ 124 | def words_dict(all_words_list, deleteN, stopwords_set = set()): 125 | feature_words = [] #特征列表 126 | n = 1 127 | for t in range(deleteN, len(all_words_list), 1): 128 | if n > 1000: #feature_words的维度为1000 129 | break 130 | #如果这个词不是数字,并且不是指定的结束语,并且单词长度大于1小于5,那么这个词就可以作为特征词 131 | if not all_words_list[t].isdigit() and all_words_list[t] not in stopwords_set and 1 < len(all_words_list[t]) < 5: 132 | feature_words.append(all_words_list[t]) 133 | n += 1 134 | return feature_words 135 | 136 | """ 137 | 函数说明:新闻分类器 138 | 139 | Parameters: 140 | train_feature_list - 训练集向量化的特征文本 141 | test_feature_list - 测试集向量化的特征文本 142 | train_class_list - 训练集分类标签 143 | test_class_list - 测试集分类标签 144 | Returns: 145 | test_accuracy - 分类器精度 146 | """ 147 | def TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list): 148 | classifier = MultinomialNB().fit(train_feature_list, train_class_list) 149 | test_accuracy = classifier.score(test_feature_list, test_class_list) 150 | return test_accuracy 151 | 152 | if __name__ == '__main__': 153 | #文本预处理 154 | folder_path = './SogouC/Sample' #训练集存放地址 155 | all_words_list, train_data_list, test_data_list, train_class_list, test_class_list = TextProcessing(folder_path, test_size=0.2) 156 | 157 | # 生成stopwords_set 158 | stopwords_file = './stopwords_cn.txt' 159 | stopwords_set = MakeWordsSet(stopwords_file) 160 | 161 | test_accuracy_list = [] 162 | feature_words = words_dict(all_words_list, 450, stopwords_set) 163 | train_feature_list, test_feature_list = TextFeatures(train_data_list, test_data_list, feature_words) 164 | test_accuracy = TextClassifier(train_feature_list, test_feature_list, train_class_list, test_class_list) 165 | test_accuracy_list.append(test_accuracy) 166 | ave = lambda c: sum(c) / len(c) -------------------------------------------------------------------------------- /Ch05-Logistic/5.4.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 14:40 4 | # @Author : GXl 5 | # @File : 5.4.1.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | 12 | """ 13 | Parameters: 14 | 无 15 | Returns: 16 | dataMat - 数据列表 17 | labelMat - 标签列表 18 | 
""" 19 | # 函数说明:加载数据 20 | def loadDataSet(): 21 | dataMat = [] #创建数据列表 22 | labelMat = [] #创建标签列表 23 | fr = open('testSet.txt') #打开文件 24 | for line in fr.readlines(): #逐行读取 25 | lineArr = line.strip().split() #去回车,放入列表 26 | dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])]) #添加数据 27 | labelMat.append(int(lineArr[2])) #添加标签 28 | fr.close() #关闭文件 29 | return dataMat, labelMat #返回 30 | 31 | # 函数说明:绘制数据集 32 | def plotDataSet(): 33 | dataMat, labelMat = loadDataSet() #加载数据集 34 | dataArr = np.array(dataMat) #转换成numpy的array数组 35 | n = np.shape(dataMat)[0] #数据个数 36 | xcord1 = []; ycord1 = [] #正样本 37 | xcord2 = []; ycord2 = [] #负样本 38 | for i in range(n): #根据数据集标签进行分类 39 | if int(labelMat[i]) == 1: 40 | xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2]) #1为正样本 41 | else: 42 | xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2]) #0为负样本 43 | fig = plt.figure() 44 | ax = fig.add_subplot(111) #添加subplot 45 | ax.scatter(xcord1, ycord1, s = 20, c = 'red', marker = 's',alpha=.5)#绘制正样本 46 | ax.scatter(xcord2, ycord2, s = 20, c = 'green',alpha=.5) #绘制负样本 47 | plt.title('DataSet') #绘制title 48 | plt.xlabel('x'); plt.ylabel('y') #绘制label 49 | plt.show() #显示 50 | 51 | 52 | if __name__ == '__main__': 53 | plotDataSet() -------------------------------------------------------------------------------- /Ch05-Logistic/5.4.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 14:40 4 | # @Author : GXl 5 | # @File : 5.4.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | 11 | ''' 12 | Parameters: 13 | 无 14 | Returns: 15 | dataMat - 数据列表 16 | labelMat - 标签列表 17 | ''' 18 | # 函数说明:加载数据 19 | def loadDataSet(): 20 | dataMat = [] #创建数据列表 21 | labelMat = [] #创建标签列表 22 | fr = open('testSet.txt') #打开文件 23 | for line in fr.readlines(): #逐行读取 24 | lineArr = line.strip().split() #去回车,放入列表 25 | dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])]) #添加数据 26 | labelMat.append(int(lineArr[2])) #添加标签 27 | fr.close() #关闭文件 28 | return dataMat, labelMat #返回 29 | 30 | ''' 31 | Parameters: 32 | inX - 数据 33 | Returns: 34 | sigmoid函数 35 | ''' 36 | # 函数说明:sigmoid函数 37 | def sigmoid(inX): 38 | return 1.0 / (1 + np.exp(-inX)) 39 | 40 | ''' 41 | Parameters: 42 | dataMatIn - 数据集 43 | classLabels - 数据标签 44 | Returns: 45 | ''' 46 | # 函数说明:梯度上升算法 47 | def gradAscent(dataMatIn, classLabels): 48 | dataMatrix = np.mat(dataMatIn) #转换成numpy的mat 49 | labelMat = np.mat(classLabels).transpose() #转换成numpy的mat,并进行转置 50 | m, n = np.shape(dataMatrix) #返回dataMatrix的大小。m为行数,n为列数。 51 | alpha = 0.001 #移动步长,也就是学习速率,控制更新的幅度。 52 | maxCycles = 500 #最大迭代次数 53 | weights = np.ones((n,1)) 54 | for k in range(maxCycles): 55 | h = sigmoid(dataMatrix * weights) #梯度上升矢量化公式 56 | error = labelMat - h 57 | weights = weights + alpha * dataMatrix.transpose() * error 58 | return weights.getA() #将矩阵转换为数组,返回权重数组 59 | 60 | 61 | if __name__ == '__main__': 62 | dataMat, labelMat = loadDataSet() 63 | print(gradAscent(dataMat, labelMat)) -------------------------------------------------------------------------------- /Ch05-Logistic/5.4.3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 14:41 4 | # @Author : GXl 5 | # @File : 5.4.3.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | 12 | ''' 13 | Parameters: 14 | 无 15 | Returns: 
16 | dataMat - 数据列表 17 | labelMat - 标签列表 18 | ''' 19 | # 函数说明:加载数据 20 | def loadDataSet(): 21 | dataMat = [] #创建数据列表 22 | labelMat = [] #创建标签列表 23 | fr = open('testSet.txt') #打开文件 24 | for line in fr.readlines(): #逐行读取 25 | lineArr = line.strip().split() #去回车,放入列表 26 | dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])]) #添加数据 27 | labelMat.append(int(lineArr[2])) #添加标签 28 | fr.close() #关闭文件 29 | return dataMat, labelMat #返回 30 | 31 | ''' 32 | Parameters: 33 | inX - 数据 34 | Returns: 35 | sigmoid函数 36 | ''' 37 | # 函数说明:sigmoid函数 38 | def sigmoid(inX): 39 | return 1.0 / (1 + np.exp(-inX)) 40 | 41 | ''' 42 | Parameters: 43 | dataMatIn - 数据集 44 | classLabels - 数据标签 45 | Returns: 46 | weights.getA() - 求得的权重数组(最优参数) 47 | ''' 48 | # 函数说明:梯度上升算法 49 | def gradAscent(dataMatIn, classLabels): 50 | dataMatrix = np.mat(dataMatIn) #转换成numpy的mat 51 | labelMat = np.mat(classLabels).transpose() #转换成numpy的mat,并进行转置 52 | m, n = np.shape(dataMatrix) #返回dataMatrix的大小。m为行数,n为列数。 53 | alpha = 0.001 #移动步长,也就是学习速率,控制更新的幅度。 54 | maxCycles = 500 #最大迭代次数 55 | weights = np.ones((n,1)) 56 | for k in range(maxCycles): 57 | h = sigmoid(dataMatrix * weights) #梯度上升矢量化公式 58 | error = labelMat - h 59 | weights = weights + alpha * dataMatrix.transpose() * error 60 | return weights.getA() #将矩阵转换为数组,返回权重数组 61 | 62 | ''' 63 | Parameters: 64 | weights - 权重参数数组 65 | Returns: 66 | 无 67 | ''' 68 | # 函数说明:绘制数据集 69 | def plotBestFit(weights): 70 | dataMat, labelMat = loadDataSet() #加载数据集 71 | dataArr = np.array(dataMat) #转换成numpy的array数组 72 | n = np.shape(dataMat)[0] #数据个数 73 | xcord1 = []; ycord1 = [] #正样本 74 | xcord2 = []; ycord2 = [] #负样本 75 | for i in range(n): #根据数据集标签进行分类 76 | if int(labelMat[i]) == 1: 77 | xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2]) #1为正样本 78 | else: 79 | xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2]) #0为负样本 80 | fig = plt.figure() 81 | ax = fig.add_subplot(111) #添加subplot 82 | ax.scatter(xcord1, ycord1, s = 20, c = 'red', marker = 's',alpha=.5)#绘制正样本 83 | ax.scatter(xcord2, ycord2, s = 20, c = 'green',alpha=.5) #绘制负样本 84 | x = np.arange(-3.0, 3.0, 0.1) 85 | y = (-weights[0] - weights[1] * x) / weights[2] 86 | ax.plot(x, y) 87 | plt.title('BestFit') #绘制title 88 | plt.xlabel('X1'); plt.ylabel('X2') #绘制label 89 | plt.show() 90 | 91 | 92 | if __name__ == '__main__': 93 | dataMat, labelMat = loadDataSet() 94 | weights = gradAscent(dataMat, labelMat) 95 | plotBestFit(weights) -------------------------------------------------------------------------------- /Ch05-Logistic/5.4.4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 14:42 4 | # @Author : GXl 5 | # @File : 5.4.4.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | from matplotlib.font_manager import FontProperties 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | import random 13 | 14 | ''' 15 | Parameters: 16 | 无 17 | Returns: 18 | dataMat - 数据列表 19 | labelMat - 标签列表 20 | ''' 21 | # 函数说明:加载数据 22 | def loadDataSet(): 23 | dataMat = [] #创建数据列表 24 | labelMat = [] #创建标签列表 25 | fr = open('testSet.txt') #打开文件 26 | for line in fr.readlines(): #逐行读取 27 | lineArr = line.strip().split() #去回车,放入列表 28 | dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])]) #添加数据 29 | labelMat.append(int(lineArr[2])) #添加标签 30 | fr.close() #关闭文件 31 | return dataMat, labelMat #返回 32 | 33 | ''' 34 | Parameters: 35 | inX - 数据 36 | Returns: 37 | sigmoid函数 38 | ''' 39 | # 函数说明:sigmoid函数 40 | def sigmoid(inX): 41 | 
return 1.0 / (1 + np.exp(-inX)) 42 | 43 | ''' 44 | Parameters: 45 | weights - 权重参数数组 46 | Returns: 47 | 无 48 | ''' 49 | # 函数说明:绘制数据集 50 | def plotBestFit(weights): 51 | dataMat, labelMat = loadDataSet() #加载数据集 52 | dataArr = np.array(dataMat) #转换成numpy的array数组 53 | n = np.shape(dataMat)[0] #数据个数 54 | xcord1 = []; ycord1 = [] #正样本 55 | xcord2 = []; ycord2 = [] #负样本 56 | for i in range(n): #根据数据集标签进行分类 57 | if int(labelMat[i]) == 1: 58 | xcord1.append(dataArr[i,1]); ycord1.append(dataArr[i,2]) #1为正样本 59 | else: 60 | xcord2.append(dataArr[i,1]); ycord2.append(dataArr[i,2]) #0为负样本 61 | fig = plt.figure() 62 | ax = fig.add_subplot(111) #添加subplot 63 | ax.scatter(xcord1, ycord1, s = 20, c = 'red', marker = 's',alpha=.5)#绘制正样本 64 | ax.scatter(xcord2, ycord2, s = 20, c = 'green',alpha=.5) #绘制负样本 65 | x = np.arange(-3.0, 3.0, 0.1) 66 | y = (-weights[0] - weights[1] * x) / weights[2] 67 | ax.plot(x, y) 68 | plt.title('BestFit') #绘制title 69 | plt.xlabel('X1'); plt.ylabel('X2') #绘制label 70 | plt.show() 71 | 72 | ''' 73 | Parameters: 74 | dataMatrix - 数据数组 75 | classLabels - 数据标签 76 | numIter - 迭代次数 77 | Returns: 78 | weights - 求得的回归系数数组(最优参数) 79 | ''' 80 | # 函数说明:改进的随机梯度上升算法 81 | def stocGradAscent1(dataMatrix, classLabels, numIter=150): 82 | m,n = np.shape(dataMatrix) #返回dataMatrix的大小。m为行数,n为列数。 83 | weights = np.ones(n) #参数初始化 84 | for j in range(numIter): 85 | dataIndex = list(range(m)) 86 | for i in range(m): 87 | alpha = 4/(1.0+j+i)+0.01 #降低alpha的大小,每次减小1/(j+i)。 88 | randIndex = int(random.uniform(0,len(dataIndex))) #随机选取样本 89 | h = sigmoid(sum(dataMatrix[randIndex]*weights)) #选择随机选取的一个样本,计算h 90 | error = classLabels[randIndex] - h #计算误差 91 | weights = weights + alpha * error * dataMatrix[randIndex]#更新回归系数 92 | del(dataIndex[randIndex]) #删除已经使用的样本 93 | return weights #返回 94 | 95 | 96 | if __name__ == '__main__': 97 | dataMat, labelMat = loadDataSet() 98 | weights = stocGradAscent1(np.array(dataMat), labelMat) 99 | plotBestFit(weights) -------------------------------------------------------------------------------- /Ch05-Logistic/5.4.5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 14:43 4 | # @Author : GXl 5 | # @File : 5.4.5.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | from matplotlib.font_manager import FontProperties 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | import random 13 | 14 | ''' 15 | Parameters: 16 | 无 17 | Returns: 18 | dataMat - 数据列表 19 | labelMat - 标签列表 20 | ''' 21 | # 函数说明:加载数据 22 | def loadDataSet(): 23 | dataMat = [] #创建数据列表 24 | labelMat = [] #创建标签列表 25 | fr = open('testSet.txt') #打开文件 26 | for line in fr.readlines(): #逐行读取 27 | lineArr = line.strip().split() #去回车,放入列表 28 | dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])]) #添加数据 29 | labelMat.append(int(lineArr[2])) #添加标签 30 | fr.close() #关闭文件 31 | return dataMat, labelMat #返回 32 | 33 | ''' 34 | Parameters: 35 | inX - 数据 36 | Returns: 37 | sigmoid函数 38 | ''' 39 | # 函数说明:sigmoid函数 40 | def sigmoid(inX): 41 | return 1.0 / (1 + np.exp(-inX)) 42 | 43 | ''' 44 | Parameters: 45 | dataMatIn - 数据集 46 | classLabels - 数据标签 47 | Returns: 48 | ''' 49 | # 函数说明:梯度上升算法 50 | def gradAscent(dataMatIn, classLabels): 51 | dataMatrix = np.mat(dataMatIn) #转换成numpy的mat 52 | labelMat = np.mat(classLabels).transpose() #转换成numpy的mat,并进行转置 53 | m, n = np.shape(dataMatrix) #返回dataMatrix的大小。m为行数,n为列数。 54 | alpha = 0.001 #移动步长,也就是学习速率,控制更新的幅度。 55 | maxCycles = 500 #最大迭代次数 56 | weights = 
np.ones((n,1)); weights_array = np.array([]) #参数初始化,并记录每次迭代的回归系数 57 | for k in range(maxCycles): 58 | h = sigmoid(dataMatrix * weights) #梯度上升矢量化公式 59 | error = labelMat - h 60 | weights = weights + alpha * dataMatrix.transpose() * error; weights_array = np.append(weights_array, weights) #更新并记录回归系数 61 | return weights.getA(), weights_array.reshape(maxCycles, n) #将矩阵转换为数组,返回权重数组及每次迭代的记录 62 | 63 | ''' 64 | Parameters: 65 | dataMatrix - 数据数组 66 | classLabels - 数据标签 67 | numIter - 迭代次数 68 | Returns: 69 | weights - 求得的回归系数数组(最优参数);weights_array - 每次更新的回归系数记录 70 | ''' 71 | # 函数说明:改进的随机梯度上升算法 72 | def stocGradAscent1(dataMatrix, classLabels, numIter=150): 73 | m,n = np.shape(dataMatrix) #返回dataMatrix的大小。m为行数,n为列数。 74 | weights = np.ones(n); weights_array = np.array([]) #参数初始化,并记录回归系数的更新历史 75 | for j in range(numIter): 76 | dataIndex = list(range(m)) 77 | for i in range(m): 78 | alpha = 4/(1.0+j+i)+0.01 #降低alpha的大小,每次减小1/(j+i)。 79 | randIndex = int(random.uniform(0,len(dataIndex))) #随机选取样本 80 | h = sigmoid(sum(dataMatrix[randIndex]*weights)) #选择随机选取的一个样本,计算h 81 | error = classLabels[randIndex] - h #计算误差 82 | weights = weights + alpha * error * dataMatrix[randIndex]; weights_array = np.append(weights_array, weights, axis=0) #更新并记录回归系数 83 | del(dataIndex[randIndex]) #删除已经使用的样本 84 | return weights, weights_array.reshape(numIter*m, n) #返回回归系数及其更新记录 85 | 86 | ''' 87 | Parameters: 88 | weights_array1 - 回归系数数组1 89 | weights_array2 - 回归系数数组2 90 | Returns: 91 | 无 92 | ''' 93 | # 函数说明:绘制回归系数与迭代次数的关系 94 | def plotWeights(weights_array1,weights_array2): 95 | #设置汉字格式 96 | font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) 97 | #将fig画布分隔成3行2列,不共享x轴和y轴,fig画布的大小为(20,10) 98 | #当nrows=3,ncols=2时,代表fig画布被分为六个区域,axs[0][0]表示第一行第一列 99 | fig, axs = plt.subplots(nrows=3, ncols=2,sharex=False, sharey=False, figsize=(20,10)) 100 | x1 = np.arange(0, len(weights_array1), 1) 101 | #绘制w0与迭代次数的关系 102 | axs[0][0].plot(x1,weights_array1[:,0]) 103 | axs0_title_text = axs[0][0].set_title(u'改进的随机梯度上升算法:回归系数与迭代次数关系',FontProperties=font) 104 | axs0_ylabel_text = axs[0][0].set_ylabel(u'W0',FontProperties=font) 105 | plt.setp(axs0_title_text, size=20, weight='bold', color='black') 106 | plt.setp(axs0_ylabel_text, size=20, weight='bold', color='black') 107 | #绘制w1与迭代次数的关系 108 | axs[1][0].plot(x1,weights_array1[:,1]) 109 | axs1_ylabel_text = axs[1][0].set_ylabel(u'W1',FontProperties=font) 110 | plt.setp(axs1_ylabel_text, size=20, weight='bold', color='black') 111 | #绘制w2与迭代次数的关系 112 | axs[2][0].plot(x1,weights_array1[:,2]) 113 | axs2_xlabel_text = axs[2][0].set_xlabel(u'迭代次数',FontProperties=font) 114 | axs2_ylabel_text = axs[2][0].set_ylabel(u'W2',FontProperties=font) 115 | plt.setp(axs2_xlabel_text, size=20, weight='bold', color='black') 116 | plt.setp(axs2_ylabel_text, size=20, weight='bold', color='black') 117 | 118 | 119 | x2 = np.arange(0, len(weights_array2), 1) 120 | #绘制w0与迭代次数的关系 121 | axs[0][1].plot(x2,weights_array2[:,0]) 122 | axs0_title_text = axs[0][1].set_title(u'梯度上升算法:回归系数与迭代次数关系',FontProperties=font) 123 | axs0_ylabel_text = axs[0][1].set_ylabel(u'W0',FontProperties=font) 124 | plt.setp(axs0_title_text, size=20, weight='bold', color='black') 125 | plt.setp(axs0_ylabel_text, size=20, weight='bold', color='black') 126 | #绘制w1与迭代次数的关系 127 | axs[1][1].plot(x2,weights_array2[:,1]) 128 | axs1_ylabel_text = axs[1][1].set_ylabel(u'W1',FontProperties=font) 129 | plt.setp(axs1_ylabel_text, size=20, weight='bold', color='black') 130 | #绘制w2与迭代次数的关系 131 | axs[2][1].plot(x2,weights_array2[:,2]) 132 | axs2_xlabel_text = axs[2][1].set_xlabel(u'迭代次数',FontProperties=font) 133 | axs2_ylabel_text = axs[2][1].set_ylabel(u'W2',FontProperties=font) 134 | plt.setp(axs2_xlabel_text, size=20, weight='bold', color='black') 135 | plt.setp(axs2_ylabel_text, size=20, weight='bold', color='black') 136 | 137 | plt.show() 138 | 139 | 140 | if __name__ == '__main__': 141 
| dataMat, labelMat = loadDataSet() 142 | weights1,weights_array1 = stocGradAscent1(np.array(dataMat), labelMat) 143 | 144 | weights2,weights_array2 = gradAscent(dataMat, labelMat) 145 | plotWeights(weights_array1, weights_array2) -------------------------------------------------------------------------------- /Ch05-Logistic/5.5.2-1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 14:44 4 | # @Author : GXl 5 | # @File : 5.5.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | import random 11 | 12 | ''' 13 | Parameters: 14 | inX - 数据 15 | Returns: 16 | sigmoid函数 17 | ''' 18 | # 函数说明:sigmoid函数 19 | def sigmoid(inX): 20 | return 1.0 / (1 + np.exp(-inX)) 21 | 22 | ''' 23 | Parameters: 24 | dataMatrix - 数据数组 25 | classLabels - 数据标签 26 | numIter - 迭代次数 27 | Returns: 28 | weights - 求得的回归系数数组(最优参数) 29 | ''' 30 | # 函数说明:改进的随机梯度上升算法 31 | def stocGradAscent1(dataMatrix, classLabels, numIter=150): 32 | m,n = np.shape(dataMatrix) #返回dataMatrix的大小。m为行数,n为列数。 33 | weights = np.ones(n) #参数初始化 34 | #存储每次更新的回归系数 35 | for j in range(numIter): 36 | dataIndex = list(range(m)) 37 | for i in range(m): 38 | alpha = 4/(1.0+j+i)+0.01 #降低alpha的大小,每次减小1/(j+i)。 39 | randIndex = int(random.uniform(0,len(dataIndex))) #随机选取样本 40 | h = sigmoid(sum(dataMatrix[randIndex]*weights)) #选择随机选取的一个样本,计算h 41 | error = classLabels[randIndex] - h #计算误差 42 | weights = weights + alpha * error * dataMatrix[randIndex] #更新回归系数 43 | del(dataIndex[randIndex]) #删除已经使用的样本 44 | return weights #返回 45 | 46 | # 函数说明:使用Python写的Logistic分类器做预测 47 | def colicTest(): 48 | frTrain = open('horseColicTraining.txt') #打开训练集 49 | frTest = open('horseColicTest.txt') #打开测试集 50 | trainingSet = []; trainingLabels = [] 51 | for line in frTrain.readlines(): 52 | currLine = line.strip().split('\t') 53 | lineArr = [] 54 | for i in range(len(currLine)-1): 55 | lineArr.append(float(currLine[i])) 56 | trainingSet.append(lineArr) 57 | trainingLabels.append(float(currLine[-1])) 58 | trainWeights = stocGradAscent1(np.array(trainingSet), trainingLabels, 500) #使用改进的随即上升梯度训练 59 | errorCount = 0; numTestVec = 0.0 60 | for line in frTest.readlines(): 61 | numTestVec += 1.0 62 | currLine = line.strip().split('\t') 63 | lineArr =[] 64 | for i in range(len(currLine)-1): 65 | lineArr.append(float(currLine[i])) 66 | if int(classifyVector(np.array(lineArr), trainWeights))!= int(currLine[-1]): 67 | errorCount += 1 68 | errorRate = (float(errorCount)/numTestVec) * 100 #错误率计算 69 | print("测试集错误率为: %.2f%%" % errorRate) 70 | 71 | ''' 72 | Parameters: 73 | inX - 特征向量 74 | weights - 回归系数 75 | Returns: 76 | 分类结果 77 | ''' 78 | # 函数说明:分类函数 79 | def classifyVector(inX, weights): 80 | prob = sigmoid(sum(inX*weights)) 81 | if prob > 0.5: return 1.0 82 | else: return 0.0 83 | 84 | 85 | if __name__ == '__main__': 86 | colicTest() -------------------------------------------------------------------------------- /Ch05-Logistic/5.5.2-2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 14:45 4 | # @Author : GXl 5 | # @File : 5.5.2-2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | import random 11 | 12 | ''' 13 | Parameters: 14 | inX - 数据 15 | Returns: 16 | sigmoid函数 17 | ''' 18 | # 函数说明:sigmoid函数 19 | def sigmoid(inX): 20 | return 1.0 / (1 + np.exp(-inX)) 21 | 22 | ''' 23 | Parameters: 24 | 
dataMatIn - 数据集 25 | classLabels - 数据标签 26 | Returns: 27 | weights.getA() - 求得的权重数组(最优参数) 28 | ''' 29 | # 函数说明:梯度上升算法 30 | def gradAscent(dataMatIn, classLabels): 31 | dataMatrix = np.mat(dataMatIn) #转换成numpy的mat 32 | labelMat = np.mat(classLabels).transpose() #转换成numpy的mat,并进行转置 33 | m, n = np.shape(dataMatrix) #返回dataMatrix的大小。m为行数,n为列数。 34 | alpha = 0.01 #移动步长,也就是学习速率,控制更新的幅度。 35 | maxCycles = 500 #最大迭代次数 36 | weights = np.ones((n,1)) 37 | for k in range(maxCycles): 38 | h = sigmoid(dataMatrix * weights) #梯度上升矢量化公式 39 | error = labelMat - h 40 | weights = weights + alpha * dataMatrix.transpose() * error 41 | return weights.getA() #将矩阵转换为数组,并返回 42 | 43 | # 函数说明:使用Python写的Logistic分类器做预测 44 | def colicTest(): 45 | frTrain = open('horseColicTraining.txt') #打开训练集 46 | frTest = open('horseColicTest.txt') #打开测试集 47 | trainingSet = []; trainingLabels = [] 48 | for line in frTrain.readlines(): 49 | currLine = line.strip().split('\t') 50 | lineArr = [] 51 | for i in range(len(currLine)-1): 52 | lineArr.append(float(currLine[i])) 53 | trainingSet.append(lineArr) 54 | trainingLabels.append(float(currLine[-1])) 55 | trainWeights = gradAscent(np.array(trainingSet), trainingLabels) #使用梯度上升算法训练 56 | errorCount = 0; numTestVec = 0.0 57 | for line in frTest.readlines(): 58 | numTestVec += 1.0 59 | currLine = line.strip().split('\t') 60 | lineArr = [] 61 | for i in range(len(currLine)-1): 62 | lineArr.append(float(currLine[i])) 63 | if int(classifyVector(np.array(lineArr), trainWeights[:,0])) != int(currLine[-1]): 64 | errorCount += 1 65 | errorRate = (float(errorCount)/numTestVec) * 100 #错误率计算 66 | print("测试集错误率为: %.2f%%" % errorRate) 67 | 68 | ''' 69 | Parameters: 70 | inX - 特征向量 71 | weights - 回归系数 72 | Returns: 73 | 分类结果 74 | ''' 75 | # 函数说明:分类函数 76 | def classifyVector(inX, weights): 77 | prob = sigmoid(sum(inX*weights)) 78 | if prob > 0.5: return 1.0 79 | else: return 0.0 80 | 81 | 82 | if __name__ == '__main__': 83 | colicTest() -------------------------------------------------------------------------------- /Ch05-Logistic/5.6.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 14:49 4 | # @Author : GXl 5 | # @File : 5.6.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | from sklearn.linear_model import LogisticRegression 10 | 11 | # 函数说明:使用Sklearn构建Logistic回归分类器 12 | def colicSklearn(): 13 | frTrain = open('horseColicTraining.txt') #打开训练集 14 | frTest = open('horseColicTest.txt') #打开测试集 15 | trainingSet = []; trainingLabels = [] 16 | testSet = []; testLabels = [] 17 | for line in frTrain.readlines(): 18 | currLine = line.strip().split('\t') 19 | lineArr = [] 20 | for i in range(len(currLine)-1): 21 | lineArr.append(float(currLine[i])) 22 | trainingSet.append(lineArr) 23 | trainingLabels.append(float(currLine[-1])) 24 | for line in frTest.readlines(): 25 | currLine = line.strip().split('\t') 26 | lineArr = [] 27 | for i in range(len(currLine)-1): 28 | lineArr.append(float(currLine[i])) 29 | testSet.append(lineArr) 30 | testLabels.append(float(currLine[-1])) 31 | classifier = LogisticRegression(solver='liblinear',max_iter=10).fit(trainingSet, trainingLabels) 32 | test_accuracy = classifier.score(testSet, testLabels) * 100 33 | print('正确率:%f%%' % test_accuracy) 34 | 35 | 36 | if __name__ == '__main__': 37 | colicSklearn() -------------------------------------------------------------------------------- /Ch06-SVM/6.5.1.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:05 4 | # @Author : GXl 5 | # @File : 6.5.1.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | # -*-coding:utf-8 -*- 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | 13 | """ 14 | Parameters: 15 | fileName - 文件名 16 | Returns: 17 | dataMat - 数据矩阵 18 | labelMat - 数据标签 19 | """ 20 | # 读取数据 21 | def loadDataSet(fileName): 22 | dataMat = []; labelMat = [] 23 | fr = open(fileName) 24 | for line in fr.readlines():#逐行读取,滤除空格等 25 | lineArr = line.strip().split('\t') 26 | dataMat.append([float(lineArr[0]), float(lineArr[1])])#添加数据 27 | labelMat.append(float(lineArr[2]))#添加标签 28 | return dataMat,labelMat 29 | 30 | """ 31 | 数据可视化 32 | Parameters: 33 | dataMat - 数据矩阵 34 | labelMat - 数据标签 35 | Returns: 36 | 无 37 | """ 38 | def showDataSet(dataMat, labelMat): 39 | data_plus = []#正样本 40 | data_minus = []#负样本 41 | for i in range(len(dataMat)): 42 | if labelMat[i] > 0: 43 | data_plus.append(dataMat[i]) 44 | else: 45 | data_minus.append(dataMat[i]) 46 | data_plus_np = np.array(data_plus)#转换为numpy矩阵 47 | data_minus_np = np.array(data_minus)#转换为numpy矩阵 48 | plt.scatter(np.transpose(data_plus_np)[0], np.transpose(data_plus_np)[1])#正样本散点图 49 | plt.scatter(np.transpose(data_minus_np)[0], np.transpose(data_minus_np)[1])#负样本散点图 50 | plt.show() 51 | 52 | if __name__ == '__main__': 53 | dataArr,labelArr = loadDataSet('testSetRBF.txt')#加载训练集 54 | showDataSet(dataArr, labelArr) -------------------------------------------------------------------------------- /Ch06-SVM/6.6.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:10 4 | # @Author : GXl 5 | # @File : 6.6.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | -------------------------------------------------------------------------------- /Ch06-SVM/6.7.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | import numpy as np 3 | import operator 4 | from os import listdir 5 | from sklearn.svm import SVC 6 | 7 | """ 8 | Parameters: 9 | filename - 文件名 10 | Returns: 11 | returnVect - 返回的二进制图像的1x1024向量 12 | """ 13 | # 将32x32的二进制图像转换为1x1024向量 14 | def img2vector(filename): 15 | #创建1x1024零向量 16 | returnVect = np.zeros((1, 1024)) 17 | #打开文件 18 | fr = open(filename) 19 | #按行读取 20 | for i in range(32): 21 | #读一行数据 22 | lineStr = fr.readline() 23 | #每一行的前32个元素依次添加到returnVect中 24 | for j in range(32): 25 | returnVect[0, 32*i+j] = int(lineStr[j]) 26 | #返回转换后的1x1024向量 27 | return returnVect 28 | 29 | # 手写数字分类测试 30 | def handwritingClassTest(): 31 | #测试集的Labels 32 | hwLabels = [] 33 | #返回trainingDigits目录下的文件名 34 | trainingFileList = listdir('trainingDigits') 35 | #返回文件夹下文件的个数 36 | m = len(trainingFileList) 37 | #初始化训练的Mat矩阵,测试集 38 | trainingMat = np.zeros((m, 1024)) 39 | #从文件名中解析出训练集的类别 40 | for i in range(m): 41 | #获得文件的名字 42 | fileNameStr = trainingFileList[i] 43 | #获得分类的数字 44 | classNumber = int(fileNameStr.split('_')[0]) 45 | #将获得的类别添加到hwLabels中 46 | hwLabels.append(classNumber) 47 | #将每一个文件的1x1024数据存储到trainingMat矩阵中 48 | trainingMat[i,:] = img2vector('trainingDigits/%s' % (fileNameStr)) 49 | clf = SVC(C=200,kernel='rbf') 50 | clf.fit(trainingMat,hwLabels) 51 | #返回testDigits目录下的文件列表 52 | testFileList = listdir('testDigits') 53 | #错误检测计数 54 | errorCount = 0.0 55 | #测试数据的数量 56 | mTest = 
len(testFileList) 57 | #从文件中解析出测试集的类别并进行分类测试 58 | for i in range(mTest): 59 | #获得文件的名字 60 | fileNameStr = testFileList[i] 61 | #获得分类的数字 62 | classNumber = int(fileNameStr.split('_')[0]) 63 | #获得测试集的1x1024向量,用于训练 64 | vectorUnderTest = img2vector('testDigits/%s' % (fileNameStr)) 65 | #获得预测结果 66 | # classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3) 67 | classifierResult = clf.predict(vectorUnderTest) 68 | print("分类返回结果为%d\t真实结果为%d" % (classifierResult, classNumber)) 69 | if(classifierResult != classNumber): 70 | errorCount += 1.0 71 | print("总共错了%d个数据\n错误率为%f%%" % (errorCount, errorCount/mTest * 100)) 72 | 73 | 74 | if __name__ == '__main__': 75 | handwritingClassTest() -------------------------------------------------------------------------------- /Ch07-AdaBoost/7.3.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:22 4 | # @Author : GXl 5 | # @File : 7.4.1.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | """ 13 | Parameters: 14 | 无 15 | Returns: 16 | dataMat - 数据矩阵 17 | classLabels - 数据标签 18 | """ 19 | 20 | 21 | # 创建单层决策树的数据集 22 | def loadSimpData(): 23 | datMat = np.matrix([[1., 2.1], 24 | [1.5, 1.6], 25 | [1.3, 1.], 26 | [1., 1.], 27 | [2., 1.]]) 28 | classLabels = [1.0, 1.0, -1.0, -1.0, 1.0] 29 | return datMat, classLabels 30 | 31 | 32 | """ 33 | Parameters: 34 | dataMat - 数据矩阵 35 | labelMat - 数据标签 36 | Returns: 37 | 无 38 | """ 39 | 40 | 41 | # 数据可视化 42 | def showDataSet(dataMat, labelMat): 43 | data_plus = [] # 正样本 44 | data_minus = [] # 负样本 45 | for i in range(len(dataMat)): 46 | if labelMat[i] > 0: 47 | data_plus.append(dataMat[i]) 48 | else: 49 | data_minus.append(dataMat[i]) 50 | data_plus_np = np.array(data_plus) # 转换为numpy矩阵 51 | data_minus_np = np.array(data_minus) # 转换为numpy矩阵 52 | plt.scatter(np.transpose(data_plus_np)[0], np.transpose(data_plus_np)[1]) # 正样本散点图 53 | plt.scatter(np.transpose(data_minus_np)[0], np.transpose(data_minus_np)[1]) # 负样本散点图 54 | plt.show() 55 | 56 | 57 | if __name__ == '__main__': 58 | dataArr, classLabels = loadSimpData() 59 | showDataSet(dataArr, classLabels) -------------------------------------------------------------------------------- /Ch07-AdaBoost/7.3.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:23 4 | # @Author : GXl 5 | # @File : 7.3.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | """ 13 | Parameters: 14 | 无 15 | Returns: 16 | dataMat - 数据矩阵 17 | classLabels - 数据标签 18 | """ 19 | # 创建单层决策树的数据集 20 | def loadSimpData(): 21 | datMat = np.matrix([[ 1. , 2.1], 22 | [ 1.5, 1.6], 23 | [ 1.3, 1. ], 24 | [ 1. , 1. ], 25 | [ 2. , 1. 
]]) 26 | classLabels = [1.0, 1.0, -1.0, -1.0, 1.0] 27 | return datMat,classLabels 28 | 29 | """ 30 | Parameters: 31 | dataMatrix - 数据矩阵 32 | dimen - 第dimen列,也就是第几个特征 33 | threshVal - 阈值 34 | threshIneq - 标志 35 | Returns: 36 | retArray - 分类结果 37 | """ 38 | # 单层决策树分类函数 39 | def stumpClassify(dataMatrix,dimen,threshVal,threshIneq): 40 | retArray = np.ones((np.shape(dataMatrix)[0],1)) #初始化retArray为1 41 | if threshIneq == 'lt': 42 | retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 #如果小于阈值,则赋值为-1 43 | else: 44 | retArray[dataMatrix[:,dimen] > threshVal] = -1.0 #如果大于阈值,则赋值为-1 45 | return retArray 46 | 47 | """ 48 | Parameters: 49 | dataArr - 数据矩阵 50 | classLabels - 数据标签 51 | D - 样本权重 52 | Returns: 53 | bestStump - 最佳单层决策树信息 54 | minError - 最小误差 55 | bestClasEst - 最佳的分类结果 56 | """ 57 | # 找到数据集上最佳的单层决策树 58 | def buildStump(dataArr,classLabels,D): 59 | dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T 60 | m,n = np.shape(dataMatrix) 61 | numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1))) 62 | minError = float('inf') #最小误差初始化为正无穷大 63 | for i in range(n): #遍历所有特征 64 | rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值 65 | stepSize = (rangeMax - rangeMin) / numSteps #计算步长 66 | for j in range(-1, int(numSteps) + 1): 67 | for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than 68 | threshVal = (rangeMin + float(j) * stepSize) #计算阈值 69 | predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)#计算分类结果 70 | errArr = np.mat(np.ones((m,1))) #初始化误差矩阵 71 | errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0 72 | weightedError = D.T * errArr #计算误差 73 | print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)) 74 | if weightedError < minError: #找到误差最小的分类方式 75 | minError = weightedError 76 | bestClasEst = predictedVals.copy() 77 | bestStump['dim'] = i 78 | bestStump['thresh'] = threshVal 79 | bestStump['ineq'] = inequal 80 | return bestStump,minError,bestClasEst 81 | 82 | 83 | if __name__ == '__main__': 84 | dataArr,classLabels = loadSimpData() 85 | D = np.mat(np.ones((5, 1)) / 5) 86 | bestStump,minError,bestClasEst = buildStump(dataArr,classLabels,D) 87 | print('bestStump:\n', bestStump) 88 | print('minError:\n', minError) 89 | print('bestClasEst:\n', bestClasEst) -------------------------------------------------------------------------------- /Ch07-AdaBoost/7.4.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:24 4 | # @Author : GXl 5 | # @File : 7.4.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | """ 13 | Parameters: 14 | 无 15 | Returns: 16 | dataMat - 数据矩阵 17 | classLabels - 数据标签 18 | """ 19 | # 创建单层决策树的数据集 20 | def loadSimpData(): 21 | datMat = np.matrix([[ 1. , 2.1], 22 | [ 1.5, 1.6], 23 | [ 1.3, 1. ], 24 | [ 1. , 1. ], 25 | [ 2. , 1. 
]]) 26 | classLabels = [1.0, 1.0, -1.0, -1.0, 1.0] 27 | return datMat,classLabels 28 | 29 | """ 30 | Parameters: 31 | dataMatrix - 数据矩阵 32 | dimen - 第dimen列,也就是第几个特征 33 | threshVal - 阈值 34 | threshIneq - 标志 35 | Returns: 36 | retArray - 分类结果 37 | """ 38 | # 单层决策树分类函数 39 | def stumpClassify(dataMatrix,dimen,threshVal,threshIneq): 40 | retArray = np.ones((np.shape(dataMatrix)[0],1)) #初始化retArray为1 41 | if threshIneq == 'lt': 42 | retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 #如果小于阈值,则赋值为-1 43 | else: 44 | retArray[dataMatrix[:,dimen] > threshVal] = -1.0 #如果大于阈值,则赋值为-1 45 | return retArray 46 | 47 | """ 48 | Parameters: 49 | dataArr - 数据矩阵 50 | classLabels - 数据标签 51 | D - 样本权重 52 | Returns: 53 | bestStump - 最佳单层决策树信息 54 | minError - 最小误差 55 | bestClasEst - 最佳的分类结果 56 | """ 57 | # 找到数据集上最佳的单层决策树 58 | def buildStump(dataArr,classLabels,D): 59 | dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T 60 | m,n = np.shape(dataMatrix) 61 | numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1))) 62 | minError = float('inf') #最小误差初始化为正无穷大 63 | for i in range(n): #遍历所有特征 64 | rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值 65 | stepSize = (rangeMax - rangeMin) / numSteps #计算步长 66 | for j in range(-1, int(numSteps) + 1): 67 | for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than 68 | threshVal = (rangeMin + float(j) * stepSize) #计算阈值 69 | predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal) #计算分类结果 70 | errArr = np.mat(np.ones((m,1))) #初始化误差矩阵 71 | errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0 72 | weightedError = D.T * errArr #计算误差 73 | print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)) 74 | if weightedError < minError: #找到误差最小的分类方式 75 | minError = weightedError 76 | bestClasEst = predictedVals.copy() 77 | bestStump['dim'] = i 78 | bestStump['thresh'] = threshVal 79 | bestStump['ineq'] = inequal 80 | return bestStump, minError, bestClasEst 81 | 82 | def adaBoostTrainDS(dataArr, classLabels, numIt = 40): 83 | weakClassArr = [] 84 | m = np.shape(dataArr)[0] 85 | D = np.mat(np.ones((m, 1)) / m) #初始化权重 86 | aggClassEst = np.mat(np.zeros((m,1))) 87 | for i in range(numIt): 88 | bestStump, error, classEst = buildStump(dataArr, classLabels, D) #构建单层决策树 89 | print("D:",D.T) 90 | alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16))) #计算弱学习算法权重alpha,使error不等于0,因为分母不能为0 91 | bestStump['alpha'] = alpha #存储弱学习算法权重 92 | weakClassArr.append(bestStump) #存储单层决策树 93 | print("classEst: ", classEst.T) 94 | expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst) #计算e的指数项 95 | D = np.multiply(D, np.exp(expon)) 96 | D = D / D.sum() #根据样本权重公式,更新样本权重 97 | #计算AdaBoost误差,当误差为0的时候,退出循环 98 | aggClassEst += alpha * classEst 99 | print("aggClassEst: ", aggClassEst.T) 100 | aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m,1)))#计算误差 101 | errorRate = aggErrors.sum() / m 102 | print("total error: ", errorRate) 103 | if errorRate == 0.0: break #误差为0,退出循环 104 | return weakClassArr, aggClassEst 105 | 106 | 107 | if __name__ == '__main__': 108 | dataArr,classLabels = loadSimpData() 109 | weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, classLabels) 110 | print(weakClassArr) 111 | print(aggClassEst) -------------------------------------------------------------------------------- /Ch07-AdaBoost/7.4.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- 
coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:25 4 | # @Author : GXl 5 | # @File : 7.4.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | """ 13 | Parameters: 14 | 无 15 | Returns: 16 | dataMat - 数据矩阵 17 | classLabels - 数据标签 18 | """ 19 | 20 | 21 | # 创建单层决策树的数据集 22 | def loadSimpData(): 23 | datMat = np.matrix([[1., 2.1], 24 | [1.5, 1.6], 25 | [1.3, 1.], 26 | [1., 1.], 27 | [2., 1.]]) 28 | classLabels = [1.0, 1.0, -1.0, -1.0, 1.0] 29 | return datMat, classLabels 30 | 31 | 32 | """ 33 | Parameters: 34 | dataMat - 数据矩阵 35 | labelMat - 数据标签 36 | Returns: 37 | 无 38 | """ 39 | 40 | 41 | # 数据可视化 42 | def showDataSet(dataMat, labelMat): 43 | data_plus = [] # 正样本 44 | data_minus = [] # 负样本 45 | for i in range(len(dataMat)): 46 | if labelMat[i] > 0: 47 | data_plus.append(dataMat[i]) 48 | else: 49 | data_minus.append(dataMat[i]) 50 | data_plus_np = np.array(data_plus) # 转换为numpy矩阵 51 | data_minus_np = np.array(data_minus) # 转换为numpy矩阵 52 | plt.scatter(np.transpose(data_plus_np)[0], np.transpose(data_plus_np)[1]) # 正样本散点图 53 | plt.scatter(np.transpose(data_minus_np)[0], np.transpose(data_minus_np)[1]) # 负样本散点图 54 | plt.show() 55 | 56 | 57 | """ 58 | Parameters: 59 | dataMatrix - 数据矩阵 60 | dimen - 第dimen列,也就是第几个特征 61 | threshVal - 阈值 62 | threshIneq - 标志 63 | Returns: 64 | retArray - 分类结果 65 | """ 66 | 67 | 68 | # 单层决策树分类函数 69 | def stumpClassify(dataMatrix, dimen, threshVal, threshIneq): 70 | retArray = np.ones((np.shape(dataMatrix)[0], 1)) # 初始化retArray为1 71 | if threshIneq == 'lt': 72 | retArray[dataMatrix[:, dimen] <= threshVal] = -1.0 # 如果小于阈值,则赋值为-1 73 | else: 74 | retArray[dataMatrix[:, dimen] > threshVal] = -1.0 # 如果大于阈值,则赋值为-1 75 | return retArray 76 | 77 | 78 | """ 79 | Parameters: 80 | dataArr - 数据矩阵 81 | classLabels - 数据标签 82 | D - 样本权重 83 | Returns: 84 | bestStump - 最佳单层决策树信息 85 | minError - 最小误差 86 | bestClasEst - 最佳的分类结果 87 | """ 88 | 89 | 90 | # 找到数据集上最佳的单层决策树 91 | def buildStump(dataArr, classLabels, D): 92 | dataMatrix = np.mat(dataArr); 93 | labelMat = np.mat(classLabels).T 94 | m, n = np.shape(dataMatrix) 95 | numSteps = 10.0; 96 | bestStump = {}; 97 | bestClasEst = np.mat(np.zeros((m, 1))) 98 | minError = float('inf') # 最小误差初始化为正无穷大 99 | for i in range(n): # 遍历所有特征 100 | rangeMin = dataMatrix[:, i].min(); 101 | rangeMax = dataMatrix[:, i].max() # 找到特征中最小的值和最大值 102 | stepSize = (rangeMax - rangeMin) / numSteps # 计算步长 103 | for j in range(-1, int(numSteps) + 1): 104 | for inequal in ['lt', 'gt']: # 大于和小于的情况,均遍历。lt:less than,gt:greater than 105 | threshVal = (rangeMin + float(j) * stepSize) # 计算阈值 106 | predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal) # 计算分类结果 107 | errArr = np.mat(np.ones((m, 1))) # 初始化误差矩阵 108 | errArr[predictedVals == labelMat] = 0 # 分类正确的,赋值为0 109 | weightedError = D.T * errArr # 计算误差 110 | # print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)) 111 | if weightedError < minError: # 找到误差最小的分类方式 112 | minError = weightedError 113 | bestClasEst = predictedVals.copy() 114 | bestStump['dim'] = i 115 | bestStump['thresh'] = threshVal 116 | bestStump['ineq'] = inequal 117 | return bestStump, minError, bestClasEst 118 | 119 | 120 | """ 121 | Parameters: 122 | dataArr - 数据矩阵 123 | classLabels - 数据标签 124 | numIt - 最大迭代次数 125 | Returns: 126 | weakClassArr - 训练好的分类器 127 | aggClassEst - 类别估计累计值 128 | """ 129 | 130 | 131 | # 使用AdaBoost算法提升弱分类器性能 132 | def adaBoostTrainDS(dataArr, classLabels, numIt=40): 133 | 
weakClassArr = [] 134 | m = np.shape(dataArr)[0] 135 | D = np.mat(np.ones((m, 1)) / m) # 初始化权重 136 | aggClassEst = np.mat(np.zeros((m, 1))) 137 | for i in range(numIt): 138 | bestStump, error, classEst = buildStump(dataArr, classLabels, D) # 构建单层决策树 139 | # print("D:",D.T) 140 | alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16))) # 计算弱学习算法权重alpha,使error不等于0,因为分母不能为0 141 | bestStump['alpha'] = alpha # 存储弱学习算法权重 142 | weakClassArr.append(bestStump) # 存储单层决策树 143 | # print("classEst: ", classEst.T) 144 | expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst) # 计算e的指数项 145 | D = np.multiply(D, np.exp(expon)) 146 | D = D / D.sum() # 根据样本权重公式,更新样本权重 147 | # 计算AdaBoost误差,当误差为0的时候,退出循环 148 | aggClassEst += alpha * classEst # 计算类别估计累计值 149 | # print("aggClassEst: ", aggClassEst.T) 150 | aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m, 1))) # 计算误差 151 | errorRate = aggErrors.sum() / m 152 | # print("total error: ", errorRate) 153 | if errorRate == 0.0: break # 误差为0,退出循环 154 | return weakClassArr, aggClassEst 155 | 156 | 157 | """ 158 | Parameters: 159 | datToClass - 待分类样例 160 | classifierArr - 训练好的分类器 161 | Returns: 162 | 分类结果 163 | """ 164 | 165 | 166 | # AdaBoost分类函数 167 | def adaClassify(datToClass, classifierArr): 168 | dataMatrix = np.mat(datToClass) 169 | m = np.shape(dataMatrix)[0] 170 | aggClassEst = np.mat(np.zeros((m, 1))) 171 | for i in range(len(classifierArr)): # 遍历所有分类器,进行分类 172 | classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'], classifierArr[i]['thresh'], 173 | classifierArr[i]['ineq']) 174 | aggClassEst += classifierArr[i]['alpha'] * classEst 175 | print(aggClassEst) 176 | return np.sign(aggClassEst) 177 | 178 | 179 | if __name__ == '__main__': 180 | dataArr, classLabels = loadSimpData() 181 | weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, classLabels) 182 | print(adaClassify([[0, 0], [5, 5]], weakClassArr)) -------------------------------------------------------------------------------- /Ch07-AdaBoost/7.5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:26 4 | # @Author : GXl 5 | # @File : 7.5.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | def loadDataSet(fileName): 14 | numFeat = len((open(fileName).readline().split('\t'))) 15 | dataMat = []; 16 | labelMat = [] 17 | fr = open(fileName) 18 | for line in fr.readlines(): 19 | lineArr = [] 20 | curLine = line.strip().split('\t') 21 | for i in range(numFeat - 1): 22 | lineArr.append(float(curLine[i])) 23 | dataMat.append(lineArr) 24 | labelMat.append(float(curLine[-1])) 25 | return dataMat, labelMat 26 | 27 | 28 | """ 29 | Parameters: 30 | dataMatrix - 数据矩阵 31 | dimen - 第dimen列,也就是第几个特征 32 | threshVal - 阈值 33 | threshIneq - 标志 34 | Returns: 35 | retArray - 分类结果 36 | """ 37 | 38 | 39 | # 单层决策树分类函数 40 | def stumpClassify(dataMatrix, dimen, threshVal, threshIneq): 41 | retArray = np.ones((np.shape(dataMatrix)[0], 1)) # 初始化retArray为1 42 | if threshIneq == 'lt': 43 | retArray[dataMatrix[:, dimen] <= threshVal] = -1.0 # 如果小于阈值,则赋值为-1 44 | else: 45 | retArray[dataMatrix[:, dimen] > threshVal] = -1.0 # 如果大于阈值,则赋值为-1 46 | return retArray 47 | 48 | 49 | """ 50 | Parameters: 51 | dataArr - 数据矩阵 52 | classLabels - 数据标签 53 | D - 样本权重 54 | Returns: 55 | bestStump - 最佳单层决策树信息 56 | minError - 最小误差 57 | bestClasEst - 最佳的分类结果 58 | """ 59 | 60 | 61 | # 找到数据集上最佳的单层决策树 62 | def 
buildStump(dataArr, classLabels, D): 63 | dataMatrix = np.mat(dataArr); 64 | labelMat = np.mat(classLabels).T 65 | m, n = np.shape(dataMatrix) 66 | numSteps = 10.0; 67 | bestStump = {}; 68 | bestClasEst = np.mat(np.zeros((m, 1))) 69 | minError = float('inf') # 最小误差初始化为正无穷大 70 | for i in range(n): # 遍历所有特征 71 | rangeMin = dataMatrix[:, i].min(); 72 | rangeMax = dataMatrix[:, i].max() # 找到特征中最小的值和最大值 73 | stepSize = (rangeMax - rangeMin) / numSteps # 计算步长 74 | for j in range(-1, int(numSteps) + 1): 75 | for inequal in ['lt', 'gt']: # 大于和小于的情况,均遍历。lt:less than,gt:greater than 76 | threshVal = (rangeMin + float(j) * stepSize) # 计算阈值 77 | predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal) # 计算分类结果 78 | errArr = np.mat(np.ones((m, 1))) # 初始化误差矩阵 79 | errArr[predictedVals == labelMat] = 0 # 分类正确的,赋值为0 80 | weightedError = D.T * errArr # 计算误差 81 | # print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)) 82 | if weightedError < minError: # 找到误差最小的分类方式 83 | minError = weightedError 84 | bestClasEst = predictedVals.copy() 85 | bestStump['dim'] = i 86 | bestStump['thresh'] = threshVal 87 | bestStump['ineq'] = inequal 88 | return bestStump, minError, bestClasEst 89 | 90 | 91 | """ 92 | Parameters: 93 | dataArr - 数据矩阵 94 | classLabels - 数据标签 95 | numIt - 最大迭代次数 96 | Returns: 97 | weakClassArr - 训练好的分类器 98 | aggClassEst - 类别估计累计值 99 | """ 100 | 101 | 102 | # 使用AdaBoost算法提升弱分类器性能 103 | def adaBoostTrainDS(dataArr, classLabels, numIt=40): 104 | weakClassArr = [] 105 | m = np.shape(dataArr)[0] 106 | D = np.mat(np.ones((m, 1)) / m) # 初始化权重 107 | aggClassEst = np.mat(np.zeros((m, 1))) 108 | for i in range(numIt): 109 | bestStump, error, classEst = buildStump(dataArr, classLabels, D) # 构建单层决策树 110 | # print("D:",D.T) 111 | alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16))) # 计算弱学习算法权重alpha,使error不等于0,因为分母不能为0 112 | bestStump['alpha'] = alpha # 存储弱学习算法权重 113 | weakClassArr.append(bestStump) # 存储单层决策树 114 | # print("classEst: ", classEst.T) 115 | expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst) # 计算e的指数项 116 | D = np.multiply(D, np.exp(expon)) 117 | D = D / D.sum() # 根据样本权重公式,更新样本权重 118 | # 计算AdaBoost误差,当误差为0的时候,退出循环 119 | aggClassEst += alpha * classEst # 计算类别估计累计值 120 | # print("aggClassEst: ", aggClassEst.T) 121 | aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m, 1))) # 计算误差 122 | errorRate = aggErrors.sum() / m 123 | # print("total error: ", errorRate) 124 | if errorRate == 0.0: break # 误差为0,退出循环 125 | return weakClassArr, aggClassEst 126 | 127 | 128 | """ 129 | Parameters: 130 | datToClass - 待分类样例 131 | classifierArr - 训练好的分类器 132 | Returns: 133 | 分类结果 134 | """ 135 | 136 | 137 | # AdaBoost分类函数 138 | def adaClassify(datToClass, classifierArr): 139 | dataMatrix = np.mat(datToClass) 140 | m = np.shape(dataMatrix)[0] 141 | aggClassEst = np.mat(np.zeros((m, 1))) 142 | print(len(classifierArr)) 143 | for i in range(len(classifierArr)): # 遍历所有分类器,进行分类 144 | classEst = stumpClassify(dataMatrix, classifierArr[i]['dim'], classifierArr[i]['thresh'], 145 | classifierArr[i]['ineq']) 146 | aggClassEst += classifierArr[i]['alpha'] * classEst 147 | # print(aggClassEst) 148 | return np.sign(aggClassEst) 149 | 150 | 151 | if __name__ == '__main__': 152 | dataArr, LabelArr = loadDataSet('horseColicTraining2.txt') 153 | weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, LabelArr) 154 | testArr, testLabelArr = loadDataSet('horseColicTest2.txt') 155 | print(weakClassArr) 156 | 
predictions = adaClassify(dataArr, weakClassArr) 157 | errArr = np.mat(np.ones((len(dataArr), 1))) 158 | print('训练集的错误率:%.3f%%' % float(errArr[predictions != np.mat(LabelArr).T].sum() / len(dataArr) * 100)) 159 | predictions = adaClassify(testArr, weakClassArr) 160 | errArr = np.mat(np.ones((len(testArr), 1))) 161 | print('测试集的错误率:%.3f%%' % float(errArr[predictions != np.mat(testLabelArr).T].sum() / len(testArr) * 100)) -------------------------------------------------------------------------------- /Ch07-AdaBoost/7.6.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:28 4 | # @Author : GXl 5 | # @File : 7.6.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | from sklearn.ensemble import AdaBoostClassifier 11 | from sklearn.tree import DecisionTreeClassifier 12 | 13 | def loadDataSet(fileName): 14 | numFeat = len((open(fileName).readline().split('\t'))) 15 | dataMat = []; labelMat = [] 16 | fr = open(fileName) 17 | for line in fr.readlines(): 18 | lineArr = [] 19 | curLine = line.strip().split('\t') 20 | for i in range(numFeat - 1): 21 | lineArr.append(float(curLine[i])) 22 | dataMat.append(lineArr) 23 | labelMat.append(float(curLine[-1])) 24 | return dataMat, labelMat 25 | 26 | 27 | if __name__ == '__main__': 28 | dataArr, classLabels = loadDataSet('horseColicTraining2.txt') 29 | testArr, testLabelArr = loadDataSet('horseColicTest2.txt') 30 | bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 2), algorithm = "SAMME", n_estimators = 10) 31 | bdt.fit(dataArr, classLabels) 32 | predictions = bdt.predict(dataArr) 33 | errArr = np.mat(np.ones((len(dataArr), 1))) 34 | print('训练集的错误率:%.3f%%' % float(errArr[predictions != classLabels].sum() / len(dataArr) * 100)) 35 | predictions = bdt.predict(testArr) 36 | errArr = np.mat(np.ones((len(testArr), 1))) 37 | print('测试集的错误率:%.3f%%' % float(errArr[predictions != testLabelArr].sum() / len(testArr) * 100)) -------------------------------------------------------------------------------- /Ch07-AdaBoost/7.8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:29 4 | # @Author : GXl 5 | # @File : 7.8.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | import matplotlib.pyplot as plt 11 | from matplotlib.font_manager import FontProperties 12 | 13 | def loadDataSet(fileName): 14 | numFeat = len((open(fileName).readline().split('\t'))) 15 | dataMat = []; labelMat = [] 16 | fr = open(fileName) 17 | for line in fr.readlines(): 18 | lineArr = [] 19 | curLine = line.strip().split('\t') 20 | for i in range(numFeat - 1): 21 | lineArr.append(float(curLine[i])) 22 | dataMat.append(lineArr) 23 | labelMat.append(float(curLine[-1])) 24 | 25 | return dataMat, labelMat 26 | 27 | """ 28 | Parameters: 29 | dataMatrix - 数据矩阵 30 | dimen - 第dimen列,也就是第几个特征 31 | threshVal - 阈值 32 | threshIneq - 标志 33 | Returns: 34 | retArray - 分类结果 35 | """ 36 | # 单层决策树分类函数 37 | def stumpClassify(dataMatrix,dimen,threshVal,threshIneq): 38 | retArray = np.ones((np.shape(dataMatrix)[0],1)) #初始化retArray为1 39 | if threshIneq == 'lt': 40 | retArray[dataMatrix[:,dimen] <= threshVal] = -1.0 #如果小于阈值,则赋值为-1 41 | else: 42 | retArray[dataMatrix[:,dimen] > threshVal] = -1.0 #如果大于阈值,则赋值为-1 43 | return retArray 44 | 45 | """ 46 | Parameters: 47 | dataArr - 数据矩阵 48 | classLabels - 数据标签 49 | D - 样本权重 
50 | Returns: 51 | bestStump - 最佳单层决策树信息 52 | minError - 最小误差 53 | bestClasEst - 最佳的分类结果 54 | """ 55 | # 找到数据集上最佳的单层决策树 56 | def buildStump(dataArr,classLabels,D): 57 | dataMatrix = np.mat(dataArr); labelMat = np.mat(classLabels).T 58 | m,n = np.shape(dataMatrix) 59 | numSteps = 10.0; bestStump = {}; bestClasEst = np.mat(np.zeros((m,1))) 60 | minError = float('inf') #最小误差初始化为正无穷大 61 | for i in range(n): #遍历所有特征 62 | rangeMin = dataMatrix[:,i].min(); rangeMax = dataMatrix[:,i].max() #找到特征中最小的值和最大值 63 | stepSize = (rangeMax - rangeMin) / numSteps #计算步长 64 | for j in range(-1, int(numSteps) + 1): 65 | for inequal in ['lt', 'gt']: #大于和小于的情况,均遍历。lt:less than,gt:greater than 66 | threshVal = (rangeMin + float(j) * stepSize) #计算阈值 67 | predictedVals = stumpClassify(dataMatrix, i, threshVal, inequal)#计算分类结果 68 | errArr = np.mat(np.ones((m,1))) #初始化误差矩阵 69 | errArr[predictedVals == labelMat] = 0 #分类正确的,赋值为0 70 | weightedError = D.T * errArr #计算误差 71 | # print("split: dim %d, thresh %.2f, thresh ineqal: %s, the weighted error is %.3f" % (i, threshVal, inequal, weightedError)) 72 | if weightedError < minError: #找到误差最小的分类方式 73 | minError = weightedError 74 | bestClasEst = predictedVals.copy() 75 | bestStump['dim'] = i 76 | bestStump['thresh'] = threshVal 77 | bestStump['ineq'] = inequal 78 | return bestStump, minError, bestClasEst 79 | 80 | """ 81 | Parameters: 82 | dataArr - 数据矩阵 83 | classLabels - 数据标签 84 | numIt - 最大迭代次数 85 | Returns: 86 | weakClassArr - 训练好的分类器 87 | aggClassEst - 类别估计累计值 88 | """ 89 | # 使用AdaBoost算法训练分类器 90 | def adaBoostTrainDS(dataArr, classLabels, numIt = 40): 91 | weakClassArr = [] 92 | m = np.shape(dataArr)[0] 93 | D = np.mat(np.ones((m, 1)) / m) #初始化权重 94 | aggClassEst = np.mat(np.zeros((m,1))) 95 | for i in range(numIt): 96 | bestStump, error, classEst = buildStump(dataArr, classLabels, D) #构建单层决策树 97 | # print("D:",D.T) 98 | alpha = float(0.5 * np.log((1.0 - error) / max(error, 1e-16))) #计算弱学习算法权重alpha,使error不等于0,因为分母不能为0 99 | bestStump['alpha'] = alpha #存储弱学习算法权重 100 | weakClassArr.append(bestStump) #存储单层决策树 101 | # print("classEst: ", classEst.T) 102 | expon = np.multiply(-1 * alpha * np.mat(classLabels).T, classEst)#计算e的指数项 103 | D = np.multiply(D, np.exp(expon)) 104 | D = D / D.sum() #根据样本权重公式,更新样本权重 105 | #计算AdaBoost误差,当误差为0的时候,退出循环 106 | aggClassEst += alpha * classEst #计算类别估计累计值 107 | # print("aggClassEst: ", aggClassEst.T) 108 | aggErrors = np.multiply(np.sign(aggClassEst) != np.mat(classLabels).T, np.ones((m,1)))#计算误差 109 | errorRate = aggErrors.sum() / m 110 | # print("total error: ", errorRate) 111 | if errorRate == 0.0: break #误差为0,退出循环 112 | return weakClassArr, aggClassEst 113 | 114 | """ 115 | Parameters: 116 | predStrengths - 分类器的预测强度 117 | classLabels - 类别 118 | Returns: 119 | 无 120 | """ 121 | # 绘制ROC 122 | def plotROC(predStrengths, classLabels): 123 | font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) 124 | cur = (1.0, 1.0) #绘制光标的位置 125 | ySum = 0.0 #用于计算AUC 126 | numPosClas = np.sum(np.array(classLabels) == 1.0) #统计正类的数量 127 | yStep = 1 / float(numPosClas) #y轴步长 128 | xStep = 1 / float(len(classLabels) - numPosClas) #x轴步长 129 | 130 | sortedIndicies = predStrengths.argsort() #预测强度排序 131 | 132 | fig = plt.figure() 133 | fig.clf() 134 | ax = plt.subplot(111) 135 | for index in sortedIndicies.tolist()[0]: 136 | if classLabels[index] == 1.0: 137 | delX = 0; delY = yStep 138 | else: 139 | delX = xStep; delY = 0 140 | ySum += cur[1] #高度累加 141 | ax.plot([cur[0], cur[0] - delX], [cur[1], cur[1] - delY], c = 'b')#绘制ROC 142 | cur = (cur[0] - 
delX, cur[1] - delY) #更新绘制光标的位置 143 | ax.plot([0,1], [0,1], 'b--') 144 | plt.title('AdaBoost马疝病检测系统的ROC曲线', FontProperties = font) 145 | plt.xlabel('假阳率', FontProperties = font) 146 | plt.ylabel('真阳率', FontProperties = font) 147 | ax.axis([0, 1, 0, 1]) 148 | print('AUC面积为:', ySum * xStep) #计算AUC 149 | plt.show() 150 | 151 | 152 | if __name__ == '__main__': 153 | dataArr, LabelArr = loadDataSet('horseColicTraining2.txt') 154 | weakClassArr, aggClassEst = adaBoostTrainDS(dataArr, LabelArr) 155 | plotROC(aggClassEst.T, LabelArr) -------------------------------------------------------------------------------- /Ch08-Regression/8.2.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:30 4 | # @Author : GXl 5 | # @File : 8.2.1.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | 12 | # 加载数据 13 | def loadDataSet(fileName): 14 | """ 15 | Parameters: 16 | fileName - 文件名 17 | Returns: 18 | xArr - x数据集 19 | yArr - y数据集 20 | """ 21 | numFeat = len(open(fileName).readline().split('\t')) - 1 22 | xArr = []; yArr = [] 23 | fr = open(fileName) 24 | for line in fr.readlines(): 25 | lineArr =[] 26 | curLine = line.strip().split('\t') 27 | for i in range(numFeat): 28 | lineArr.append(float(curLine[i])) 29 | xArr.append(lineArr) 30 | yArr.append(float(curLine[-1])) 31 | return xArr, yArr 32 | 33 | # 绘制数据集 34 | def plotDataSet(): 35 | xArr, yArr = loadDataSet('ex0.txt') #加载数据集 36 | n = len(xArr) #数据个数 37 | xcord = []; ycord = [] #样本点 38 | for i in range(n): 39 | xcord.append(xArr[i][1]); ycord.append(yArr[i]) #样本点 40 | fig = plt.figure() 41 | ax = fig.add_subplot(111) #添加subplot 42 | ax.scatter(xcord, ycord, s = 20, c = 'blue',alpha = .5) #绘制样本点 43 | plt.title('DataSet') #绘制title 44 | plt.xlabel('X') 45 | plt.ylabel('Y') 46 | plt.show() 47 | 48 | 49 | if __name__ == '__main__': 50 | plotDataSet() 51 | -------------------------------------------------------------------------------- /Ch08-Regression/8.2.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:31 4 | # @Author : GXl 5 | # @File : 8.2.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | 12 | # 加载数据 13 | def loadDataSet(fileName): 14 | """ 15 | Parameters: 16 | fileName - 文件名 17 | Returns: 18 | xArr - x数据集 19 | yArr - y数据集 20 | """ 21 | numFeat = len(open(fileName).readline().split('\t')) - 1 22 | xArr = []; yArr = [] 23 | fr = open(fileName) 24 | for line in fr.readlines(): 25 | lineArr =[] 26 | curLine = line.strip().split('\t') 27 | for i in range(numFeat): 28 | lineArr.append(float(curLine[i])) 29 | xArr.append(lineArr) 30 | yArr.append(float(curLine[-1])) 31 | return xArr, yArr 32 | 33 | # 计算回归系数w 34 | def standRegres(xArr,yArr): 35 | """ 36 | Parameters: 37 | xArr - x数据集 38 | yArr - y数据集 39 | Returns: 40 | ws - 回归系数 41 | """ 42 | xMat = np.mat(xArr); yMat = np.mat(yArr).T 43 | xTx = xMat.T * xMat #根据文中推导的公示计算回归系数 44 | if np.linalg.det(xTx) == 0.0: 45 | print("矩阵为奇异矩阵,不能求逆") 46 | return 47 | ws = xTx.I * (xMat.T*yMat) 48 | return ws 49 | 50 | # 绘制回归曲线和数据点 51 | def plotRegression(): 52 | xArr, yArr = loadDataSet('ex0.txt') #加载数据集 53 | ws = standRegres(xArr, yArr) #计算回归系数 54 | xMat = np.mat(xArr) #创建xMat矩阵 55 | yMat = np.mat(yArr) #创建yMat矩阵 56 | xCopy = xMat.copy() #深拷贝xMat矩阵 
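    # Added note: xCopy is sorted in the next step so that ax.plot connects the fitted points in increasing x order; plotting against the unsorted sample order would draw the regression line as a zig-zag.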
57 | xCopy.sort(0) #排序 58 | yHat = xCopy * ws #计算对应的y值 59 | fig = plt.figure() 60 | ax = fig.add_subplot(111) #添加subplot 61 | ax.plot(xCopy[:, 1], yHat, c = 'red') #绘制回归曲线 62 | ax.scatter(xMat[:,1].flatten().A[0], yMat.flatten().A[0], s = 20, c = 'blue',alpha = .5) #绘制样本点 63 | plt.title('DataSet') #绘制title 64 | plt.xlabel('X') 65 | plt.ylabel('Y') 66 | plt.show() 67 | 68 | 69 | if __name__ == '__main__': 70 | plotRegression() 71 | -------------------------------------------------------------------------------- /Ch08-Regression/8.2.3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:31 4 | # @Author : GXl 5 | # @File : 8.2.3.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | 11 | # 加载数据 12 | def loadDataSet(fileName): 13 | """ 14 | Parameters: 15 | fileName - 文件名 16 | Returns: 17 | xArr - x数据集 18 | yArr - y数据集 19 | """ 20 | numFeat = len(open(fileName).readline().split('\t')) - 1 21 | xArr = []; yArr = [] 22 | fr = open(fileName) 23 | for line in fr.readlines(): 24 | lineArr =[] 25 | curLine = line.strip().split('\t') 26 | for i in range(numFeat): 27 | lineArr.append(float(curLine[i])) 28 | xArr.append(lineArr) 29 | yArr.append(float(curLine[-1])) 30 | return xArr, yArr 31 | 32 | # 计算回归系数w 33 | def standRegres(xArr,yArr): 34 | """ 35 | Parameters: 36 | xArr - x数据集 37 | yArr - y数据集 38 | Returns: 39 | ws - 回归系数 40 | """ 41 | xMat = np.mat(xArr); yMat = np.mat(yArr).T 42 | xTx = xMat.T * xMat #根据文中推导的公示计算回归系数 43 | if np.linalg.det(xTx) == 0.0: 44 | print("矩阵为奇异矩阵,不能求逆") 45 | return 46 | ws = xTx.I * (xMat.T*yMat) 47 | return ws 48 | 49 | if __name__ == '__main__': 50 | xArr, yArr = loadDataSet('ex0.txt') #加载数据集 51 | ws = standRegres(xArr, yArr) #计算回归系数 52 | xMat = np.mat(xArr) #创建xMat矩阵 53 | yMat = np.mat(yArr) #创建yMat矩阵 54 | yHat = xMat * ws 55 | print(np.corrcoef(yHat.T, yMat)) 56 | -------------------------------------------------------------------------------- /Ch08-Regression/8.3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:32 4 | # @Author : GXl 5 | # @File : 8.3.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | from matplotlib.font_manager import FontProperties 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | 13 | 14 | # 加载数据 15 | def loadDataSet(fileName): 16 | """ 17 | Parameters: 18 | fileName - 文件名 19 | Returns: 20 | xArr - x数据集 21 | yArr - y数据集 22 | """ 23 | numFeat = len(open(fileName).readline().split('\t')) - 1 24 | xArr = []; 25 | yArr = [] 26 | fr = open(fileName) 27 | for line in fr.readlines(): 28 | lineArr = [] 29 | curLine = line.strip().split('\t') 30 | for i in range(numFeat): 31 | lineArr.append(float(curLine[i])) 32 | xArr.append(lineArr) 33 | yArr.append(float(curLine[-1])) 34 | return xArr, yArr 35 | 36 | 37 | # 绘制多条局部加权回归曲线 38 | def plotlwlrRegression(): 39 | font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) 40 | xArr, yArr = loadDataSet('ex0.txt') # 加载数据集 41 | yHat_1 = lwlrTest(xArr, xArr, yArr, 1.0) # 根据局部加权线性回归计算yHat 42 | yHat_2 = lwlrTest(xArr, xArr, yArr, 0.01) # 根据局部加权线性回归计算yHat 43 | yHat_3 = lwlrTest(xArr, xArr, yArr, 0.003) # 根据局部加权线性回归计算yHat 44 | xMat = np.mat(xArr) # 创建xMat矩阵 45 | yMat = np.mat(yArr) # 创建yMat矩阵 46 | srtInd = xMat[:, 1].argsort(0) # 排序,返回索引值 47 | xSort = xMat[srtInd][:, 0, :] 48 | fig, axs = plt.subplots(nrows=3, 
ncols=1, sharex=False, sharey=False, figsize=(10, 8)) 49 | axs[0].plot(xSort[:, 1], yHat_1[srtInd], c='red') # 绘制回归曲线 50 | axs[1].plot(xSort[:, 1], yHat_2[srtInd], c='red') # 绘制回归曲线 51 | axs[2].plot(xSort[:, 1], yHat_3[srtInd], c='red') # 绘制回归曲线 52 | axs[0].scatter(xMat[:, 1].flatten().A[0], yMat.flatten().A[0], s=20, c='blue', alpha=.5) # 绘制样本点 53 | axs[1].scatter(xMat[:, 1].flatten().A[0], yMat.flatten().A[0], s=20, c='blue', alpha=.5) # 绘制样本点 54 | axs[2].scatter(xMat[:, 1].flatten().A[0], yMat.flatten().A[0], s=20, c='blue', alpha=.5) # 绘制样本点 55 | # 设置标题,x轴label,y轴label 56 | axs0_title_text = axs[0].set_title(u'局部加权回归曲线,k=1.0', FontProperties=font) 57 | axs1_title_text = axs[1].set_title(u'局部加权回归曲线,k=0.01', FontProperties=font) 58 | axs2_title_text = axs[2].set_title(u'局部加权回归曲线,k=0.003', FontProperties=font) 59 | plt.setp(axs0_title_text, size=8, weight='bold', color='red') 60 | plt.setp(axs1_title_text, size=8, weight='bold', color='red') 61 | plt.setp(axs2_title_text, size=8, weight='bold', color='red') 62 | plt.xlabel('X') 63 | plt.show() 64 | 65 | 66 | # 使用局部加权线性回归计算回归系数w 67 | def lwlr(testPoint, xArr, yArr, k=1.0): 68 | """ 69 | Parameters: 70 | testPoint - 测试样本点 71 | xArr - x数据集 72 | yArr - y数据集 73 | k - 高斯核的k,自定义参数 74 | Returns: 75 | ws - 回归系数 76 | """ 77 | xMat = np.mat(xArr); 78 | yMat = np.mat(yArr).T 79 | m = np.shape(xMat)[0] 80 | weights = np.mat(np.eye((m))) # 创建权重对角矩阵 81 | for j in range(m): # 遍历数据集计算每个样本的权重 82 | diffMat = testPoint - xMat[j, :] 83 | weights[j, j] = np.exp(diffMat * diffMat.T / (-2.0 * k ** 2)) 84 | xTx = xMat.T * (weights * xMat) 85 | if np.linalg.det(xTx) == 0.0: 86 | print("矩阵为奇异矩阵,不能求逆") 87 | return 88 | ws = xTx.I * (xMat.T * (weights * yMat)) # 计算回归系数 89 | return testPoint * ws 90 | 91 | 92 | # 局部加权线性回归测试 93 | def lwlrTest(testArr, xArr, yArr, k=1.0): 94 | """ 95 | Parameters: 96 | testArr - 测试数据集 97 | xArr - x数据集 98 | yArr - y数据集 99 | k - 高斯核的k,自定义参数 100 | Returns: 101 | ws - 回归系数 102 | """ 103 | m = np.shape(testArr)[0] # 计算测试数据集大小 104 | yHat = np.zeros(m) 105 | for i in range(m): # 对每个样本点进行预测 106 | yHat[i] = lwlr(testArr[i], xArr, yArr, k) 107 | return yHat 108 | 109 | 110 | if __name__ == '__main__': 111 | plotlwlrRegression() 112 | -------------------------------------------------------------------------------- /Ch08-Regression/8.4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:32 4 | # @Author : GXl 5 | # @File : 8.4.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | from matplotlib.font_manager import FontProperties 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | 13 | # 加载数据 14 | def loadDataSet(fileName): 15 | """ 16 | Parameters: 17 | fileName - 文件名 18 | Returns: 19 | xArr - x数据集 20 | yArr - y数据集 21 | """ 22 | numFeat = len(open(fileName).readline().split('\t')) - 1 23 | xArr = []; yArr = [] 24 | fr = open(fileName) 25 | for line in fr.readlines(): 26 | lineArr =[] 27 | curLine = line.strip().split('\t') 28 | for i in range(numFeat): 29 | lineArr.append(float(curLine[i])) 30 | xArr.append(lineArr) 31 | yArr.append(float(curLine[-1])) 32 | return xArr, yArr 33 | 34 | # 使用局部加权线性回归计算回归系数w 35 | def lwlr(testPoint, xArr, yArr, k = 1.0): 36 | """ 37 | Parameters: 38 | testPoint - 测试样本点 39 | xArr - x数据集 40 | yArr - y数据集 41 | k - 高斯核的k,自定义参数 42 | Returns: 43 | ws - 回归系数 44 | """ 45 | xMat = np.mat(xArr); yMat = np.mat(yArr).T 46 | m = np.shape(xMat)[0] 47 | weights = np.mat(np.eye((m))) #创建权重对角矩阵 
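    # Added note: the loop below fills a Gaussian kernel, weights[j, j] = exp(-||testPoint - x_j||**2 / (2 * k**2)), so samples far from testPoint contribute almost nothing and a smaller k gives a more local fit.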
48 | for j in range(m): #遍历数据集计算每个样本的权重 49 | diffMat = testPoint - xMat[j, :] 50 | weights[j, j] = np.exp(diffMat * diffMat.T/(-2.0 * k**2)) 51 | xTx = xMat.T * (weights * xMat) 52 | if np.linalg.det(xTx) == 0.0: 53 | print("矩阵为奇异矩阵,不能求逆") 54 | return 55 | ws = xTx.I * (xMat.T * (weights * yMat)) #计算回归系数 56 | return testPoint * ws 57 | 58 | # 局部加权线性回归测试 59 | def lwlrTest(testArr, xArr, yArr, k=1.0): 60 | """ 61 | Parameters: 62 | testArr - 测试数据集,测试集 63 | xArr - x数据集,训练集 64 | yArr - y数据集,训练集 65 | k - 高斯核的k,自定义参数 66 | Returns: 67 | ws - 回归系数 68 | """ 69 | m = np.shape(testArr)[0] #计算测试数据集大小 70 | yHat = np.zeros(m) 71 | for i in range(m): #对每个样本点进行预测 72 | yHat[i] = lwlr(testArr[i],xArr,yArr,k) 73 | return yHat 74 | 75 | # 计算回归系数w 76 | def standRegres(xArr,yArr): 77 | """ 78 | Parameters: 79 | xArr - x数据集 80 | yArr - y数据集 81 | Returns: 82 | ws - 回归系数 83 | """ 84 | xMat = np.mat(xArr); yMat = np.mat(yArr).T 85 | xTx = xMat.T * xMat #根据文中推导的公示计算回归系数 86 | if np.linalg.det(xTx) == 0.0: 87 | print("矩阵为奇异矩阵,不能求逆") 88 | return 89 | ws = xTx.I * (xMat.T*yMat) 90 | return ws 91 | 92 | 93 | def rssError(yArr, yHatArr): 94 | """ 95 | 误差大小评价函数 96 | Parameters: 97 | yArr - 真实数据 98 | yHatArr - 预测数据 99 | Returns: 100 | 误差大小 101 | """ 102 | return ((yArr - yHatArr) **2).sum() 103 | 104 | 105 | if __name__ == '__main__': 106 | abX, abY = loadDataSet('abalone.txt') 107 | print('训练集与测试集相同:局部加权线性回归,核k的大小对预测的影响:') 108 | yHat01 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 0.1) 109 | yHat1 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 1) 110 | yHat10 = lwlrTest(abX[0:99], abX[0:99], abY[0:99], 10) 111 | print('k=0.1时,误差大小为:',rssError(abY[0:99], yHat01.T)) 112 | print('k=1 时,误差大小为:',rssError(abY[0:99], yHat1.T)) 113 | print('k=10 时,误差大小为:',rssError(abY[0:99], yHat10.T)) 114 | print('') 115 | print('训练集与测试集不同:局部加权线性回归,核k的大小是越小越好吗?更换数据集,测试结果如下:') 116 | yHat01 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 0.1) 117 | yHat1 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 1) 118 | yHat10 = lwlrTest(abX[100:199], abX[0:99], abY[0:99], 10) 119 | print('k=0.1时,误差大小为:',rssError(abY[100:199], yHat01.T)) 120 | print('k=1 时,误差大小为:',rssError(abY[100:199], yHat1.T)) 121 | print('k=10 时,误差大小为:',rssError(abY[100:199], yHat10.T)) 122 | print('') 123 | print('训练集与测试集不同:简单的线性归回与k=1时的局部加权线性回归对比:') 124 | print('k=1时,误差大小为:', rssError(abY[100:199], yHat1.T)) 125 | ws = standRegres(abX[0:99], abY[0:99]) 126 | yHat = np.mat(abX[100:199]) * ws 127 | print('简单的线性回归误差大小:', rssError(abY[100:199], yHat.T.A)) 128 | -------------------------------------------------------------------------------- /Ch08-Regression/8.5.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:33 4 | # @Author : GXl 5 | # @File : 8.5.1.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | from matplotlib.font_manager import FontProperties 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | 13 | # 加载数据 14 | def loadDataSet(fileName): 15 | """ 16 | Parameters: 17 | fileName - 文件名 18 | Returns: 19 | xArr - x数据集 20 | yArr - y数据集 21 | """ 22 | numFeat = len(open(fileName).readline().split('\t')) - 1 23 | xArr = []; yArr = [] 24 | fr = open(fileName) 25 | for line in fr.readlines(): 26 | lineArr =[] 27 | curLine = line.strip().split('\t') 28 | for i in range(numFeat): 29 | lineArr.append(float(curLine[i])) 30 | xArr.append(lineArr) 31 | yArr.append(float(curLine[-1])) 32 | return xArr, yArr 33 | 34 | # 岭回归 35 | def ridgeRegres(xMat, yMat, lam 
= 0.2): 36 | """ 37 | Parameters: 38 | xMat - x数据集 39 | yMat - y数据集 40 | lam - 缩减系数 41 | Returns: 42 | ws - 回归系数 43 | """ 44 | xTx = xMat.T * xMat 45 | denom = xTx + np.eye(np.shape(xMat)[1]) * lam 46 | if np.linalg.det(denom) == 0.0: 47 | print("矩阵为奇异矩阵,不能转置") 48 | return 49 | ws = denom.I * (xMat.T * yMat) 50 | return ws 51 | 52 | # 岭回归测试 53 | def ridgeTest(xArr, yArr): 54 | """ 55 | Parameters: 56 | xMat - x数据集 57 | yMat - y数据集 58 | Returns: 59 | wMat - 回归系数矩阵 60 | """ 61 | xMat = np.mat(xArr); yMat = np.mat(yArr).T 62 | #数据标准化 63 | yMean = np.mean(yMat, axis = 0) #行与行操作,求均值 64 | yMat = yMat - yMean #数据减去均值 65 | xMeans = np.mean(xMat, axis = 0) #行与行操作,求均值 66 | xVar = np.var(xMat, axis = 0) #行与行操作,求方差 67 | xMat = (xMat - xMeans) / xVar #数据减去均值除以方差实现标准化 68 | numTestPts = 30 #30个不同的lambda测试 69 | wMat = np.zeros((numTestPts, np.shape(xMat)[1])) #初始回归系数矩阵 70 | for i in range(numTestPts): #改变lambda计算回归系数 71 | ws = ridgeRegres(xMat, yMat, np.exp(i - 10)) #lambda以e的指数变化,最初是一个非常小的数, 72 | wMat[i, :] = ws.T #计算回归系数矩阵 73 | return wMat 74 | 75 | # 绘制岭回归系数矩阵 76 | def plotwMat(): 77 | font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) 78 | abX, abY = loadDataSet('abalone.txt') 79 | redgeWeights = ridgeTest(abX, abY) 80 | fig = plt.figure() 81 | ax = fig.add_subplot(111) 82 | ax.plot(redgeWeights) 83 | ax_title_text = ax.set_title(u'log(lambada)与回归系数的关系', FontProperties = font) 84 | ax_xlabel_text = ax.set_xlabel(u'log(lambada)', FontProperties = font) 85 | ax_ylabel_text = ax.set_ylabel(u'回归系数', FontProperties = font) 86 | plt.setp(ax_title_text, size = 20, weight = 'bold', color = 'red') 87 | plt.setp(ax_xlabel_text, size = 10, weight = 'bold', color = 'black') 88 | plt.setp(ax_ylabel_text, size = 10, weight = 'bold', color = 'black') 89 | plt.show() 90 | 91 | 92 | if __name__ == '__main__': 93 | plotwMat() 94 | -------------------------------------------------------------------------------- /Ch08-Regression/8.5.3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:33 4 | # @Author : GXl 5 | # @File : 8.5.3.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | from matplotlib.font_manager import FontProperties 10 | import matplotlib.pyplot as plt 11 | import numpy as np 12 | 13 | 14 | # 加载数据 15 | def loadDataSet(fileName): 16 | """ 17 | Parameters: 18 | fileName - 文件名 19 | Returns: 20 | xArr - x数据集 21 | yArr - y数据集 22 | """ 23 | numFeat = len(open(fileName).readline().split('\t')) - 1 24 | xArr = []; 25 | yArr = [] 26 | fr = open(fileName) 27 | for line in fr.readlines(): 28 | lineArr = [] 29 | curLine = line.strip().split('\t') 30 | for i in range(numFeat): 31 | lineArr.append(float(curLine[i])) 32 | xArr.append(lineArr) 33 | yArr.append(float(curLine[-1])) 34 | return xArr, yArr 35 | 36 | 37 | # 数据标准化 38 | def regularize(xMat, yMat): 39 | """ 40 | Parameters: 41 | xMat - x数据集 42 | yMat - y数据集 43 | Returns: 44 | inxMat - 标准化后的x数据集 45 | inyMat - 标准化后的y数据集 46 | """ 47 | inxMat = xMat.copy() # 数据拷贝 48 | inyMat = yMat.copy() 49 | yMean = np.mean(yMat, 0) # 行与行操作,求均值 50 | inyMat = yMat - yMean # 数据减去均值 51 | inMeans = np.mean(inxMat, 0) # 行与行操作,求均值 52 | inVar = np.var(inxMat, 0) # 行与行操作,求方差 53 | inxMat = (inxMat - inMeans) / inVar # 数据减去均值除以方差实现标准化 54 | return inxMat, inyMat 55 | 56 | 57 | # 计算平方误差 58 | def rssError(yArr, yHatArr): 59 | """ 60 | Parameters: 61 | yArr - 预测值 62 | yHatArr - 真实值 63 | Returns: 64 | """ 65 | return ((yArr - yHatArr) ** 2).sum() 66 | 67 
| 68 | # 前向逐步线性回归 69 | def stageWise(xArr, yArr, eps=0.01, numIt=100): 70 | """ 71 | Parameters: 72 | xArr - x输入数据 73 | yArr - y预测数据 74 | eps - 每次迭代需要调整的步长 75 | numIt - 迭代次数 76 | Returns: 77 | returnMat - numIt次迭代的回归系数矩阵 78 | """ 79 | xMat = np.mat(xArr); 80 | yMat = np.mat(yArr).T # 数据集 81 | xMat, yMat = regularize(xMat, yMat) # 数据标准化 82 | m, n = np.shape(xMat) 83 | returnMat = np.zeros((numIt, n)) # 初始化numIt次迭代的回归系数矩阵 84 | ws = np.zeros((n, 1)) # 初始化回归系数矩阵 85 | wsTest = ws.copy() 86 | wsMax = ws.copy() 87 | for i in range(numIt): # 迭代numIt次 88 | # print(ws.T) #打印当前回归系数矩阵 89 | lowestError = float('inf'); # 正无穷 90 | for j in range(n): # 遍历每个特征的回归系数 91 | for sign in [-1, 1]: 92 | wsTest = ws.copy() 93 | wsTest[j] += eps * sign # 微调回归系数 94 | yTest = xMat * wsTest # 计算预测值 95 | rssE = rssError(yMat.A, yTest.A) # 计算平方误差 96 | if rssE < lowestError: # 如果误差更小,则更新当前的最佳回归系数 97 | lowestError = rssE 98 | wsMax = wsTest 99 | ws = wsMax.copy() 100 | returnMat[i, :] = ws.T # 记录numIt次迭代的回归系数矩阵 101 | return returnMat 102 | 103 | 104 | # 绘制岭回归系数矩阵 105 | def plotstageWiseMat(): 106 | font = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=14) 107 | xArr, yArr = loadDataSet('abalone.txt') 108 | returnMat = stageWise(xArr, yArr, 0.005, 1000) 109 | fig = plt.figure() 110 | ax = fig.add_subplot(111) 111 | ax.plot(returnMat) 112 | ax_title_text = ax.set_title(u'前向逐步回归:迭代次数与回归系数的关系', FontProperties=font) 113 | ax_xlabel_text = ax.set_xlabel(u'迭代次数', FontProperties=font) 114 | ax_ylabel_text = ax.set_ylabel(u'回归系数', FontProperties=font) 115 | plt.setp(ax_title_text, size=15, weight='bold', color='red') 116 | plt.setp(ax_xlabel_text, size=10, weight='bold', color='black') 117 | plt.setp(ax_ylabel_text, size=10, weight='bold', color='black') 118 | plt.show() 119 | 120 | 121 | if __name__ == '__main__': 122 | plotstageWiseMat() 123 | -------------------------------------------------------------------------------- /Ch08-Regression/8.6.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:34 4 | # @Author : GXl 5 | # @File : 8.6.1.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | from bs4 import BeautifulSoup 10 | 11 | # 从页面读取数据,生成retX和retY列表 12 | def scrapePage(retX, retY, inFile, yr, numPce, origPrc): 13 | """ 14 | Parameters: 15 | retX - 数据X 16 | retY - 数据Y 17 | inFile - HTML文件 18 | yr - 年份 19 | numPce - 乐高部件数目 20 | origPrc - 原价 21 | Returns: 22 | 无 23 | """ 24 | # 打开并读取HTML文件 25 | with open(inFile, encoding='utf-8') as f: 26 | html = f.read() 27 | soup = BeautifulSoup(html) 28 | i = 1 29 | # 根据HTML页面结构进行解析 30 | currentRow = soup.find_all('table', r="%d" % i) 31 | while (len(currentRow) != 0): 32 | currentRow = soup.find_all('table', r="%d" % i) 33 | title = currentRow[0].find_all('a')[1].text 34 | lwrTitle = title.lower() 35 | # 查找是否有全新标签 36 | if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1): 37 | newFlag = 1.0 38 | else: 39 | newFlag = 0.0 40 | # 查找是否已经标志出售,我们只收集已出售的数据 41 | soldUnicde = currentRow[0].find_all('td')[3].find_all('span') 42 | if len(soldUnicde) == 0: 43 | print("商品 #%d 没有出售" % i) 44 | else: 45 | # 解析页面获取当前价格 46 | soldPrice = currentRow[0].find_all('td')[4] 47 | priceStr = soldPrice.text 48 | priceStr = priceStr.replace('$', '') 49 | priceStr = priceStr.replace(',', '') 50 | if len(soldPrice) > 1: 51 | priceStr = priceStr.replace('Free shipping', '') 52 | sellingPrice = float(priceStr) 53 | # 去掉不完整的套装价格 54 | if sellingPrice > origPrc * 0.5: 55 | 
print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice)) 56 | retX.append([yr, numPce, newFlag, origPrc]) 57 | retY.append(sellingPrice) 58 | i += 1 59 | currentRow = soup.find_all('table', r="%d" % i) 60 | 61 | # 依次读取六种乐高套装的数据,并生成数据矩阵 62 | def setDataCollect(retX, retY): 63 | # 2006年的乐高8288,部件数目800,原价49.99 64 | scrapePage(retX, retY, './setHtml/lego8288.html', 2006, 800, 49.99) 65 | # 2002年的乐高10030,部件数目3096,原价269.99 66 | scrapePage(retX, retY, './setHtml/lego10030.html', 2002, 3096, 269.99) 67 | # 2007年的乐高10179,部件数目5195,原价499.99 68 | scrapePage(retX, retY, './setHtml/lego10179.html', 2007, 5195, 499.99) 69 | # 2007年的乐高10181,部件数目3428,原价199.99 70 | scrapePage(retX, retY, './setHtml/lego10181.html', 2007, 3428, 199.99) 71 | # 2008年的乐高10189,部件数目5922,原价299.99 72 | scrapePage(retX, retY, './setHtml/lego10189.html', 2008, 5922, 299.99) 73 | # 2009年的乐高10196,部件数目3263,原价249.99 74 | scrapePage(retX, retY, './setHtml/lego10196.html', 2009, 3263, 249.99) 75 | 76 | 77 | if __name__ == '__main__': 78 | lgX = [] 79 | lgY = [] 80 | setDataCollect(lgX, lgY) 81 | -------------------------------------------------------------------------------- /Ch08-Regression/8.6.2-1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:34 4 | # @Author : GXl 5 | # @File : 8.6.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | from bs4 import BeautifulSoup 11 | 12 | 13 | # 页面读取数据,生成retX和retY列表 14 | def scrapePage(retX, retY, inFile, yr, numPce, origPrc): 15 | """ 16 | Parameters: 17 | retX - 数据X 18 | retY - 数据Y 19 | inFile - HTML文件 20 | yr - 年份 21 | numPce - 乐高部件数目 22 | origPrc - 原价 23 | Returns: 24 | 无 25 | """ 26 | # 打开并读取HTML文件 27 | with open(inFile, encoding='utf-8') as f: 28 | html = f.read() 29 | soup = BeautifulSoup(html) 30 | i = 1 31 | # 根据HTML页面结构进行解析 32 | currentRow = soup.find_all('table', r="%d" % i) 33 | while (len(currentRow) != 0): 34 | currentRow = soup.find_all('table', r="%d" % i) 35 | title = currentRow[0].find_all('a')[1].text 36 | lwrTitle = title.lower() 37 | # 查找是否有全新标签 38 | if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1): 39 | newFlag = 1.0 40 | else: 41 | newFlag = 0.0 42 | # 查找是否已经标志出售,我们只收集已出售的数据 43 | soldUnicde = currentRow[0].find_all('td')[3].find_all('span') 44 | if len(soldUnicde) == 0: 45 | print("商品 #%d 没有出售" % i) 46 | else: 47 | # 解析页面获取当前价格 48 | soldPrice = currentRow[0].find_all('td')[4] 49 | priceStr = soldPrice.text 50 | priceStr = priceStr.replace('$', '') 51 | priceStr = priceStr.replace(',', '') 52 | if len(soldPrice) > 1: 53 | priceStr = priceStr.replace('Free shipping', '') 54 | sellingPrice = float(priceStr) 55 | # 去掉不完整的套装价格 56 | if sellingPrice > origPrc * 0.5: 57 | print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice)) 58 | retX.append([yr, numPce, newFlag, origPrc]) 59 | retY.append(sellingPrice) 60 | i += 1 61 | currentRow = soup.find_all('table', r="%d" % i) 62 | 63 | 64 | # 依次读取六种乐高套装的数据,并生成数据矩阵 65 | def setDataCollect(retX, retY): 66 | # 2006年的乐高8288,部件数目800,原价49.99 67 | scrapePage(retX, retY, './setHtml/lego8288.html', 2006, 800, 49.99) 68 | # 2002年的乐高10030,部件数目3096,原价269.99 69 | scrapePage(retX, retY, './setHtml/lego10030.html', 2002, 3096, 269.99) 70 | # 2007年的乐高10179,部件数目5195,原价499.99 71 | scrapePage(retX, retY, './setHtml/lego10179.html', 2007, 5195, 499.99) 72 | # 2007年的乐高10181,部件数目3428,原价199.99 73 | scrapePage(retX, retY, './setHtml/lego10181.html', 2007, 3428, 
199.99) 74 | # 2008年的乐高10189,部件数目5922,原价299.99 75 | scrapePage(retX, retY, './setHtml/lego10189.html', 2008, 5922, 299.99) 76 | # 2009年的乐高10196,部件数目3263,原价249.99 77 | scrapePage(retX, retY, './setHtml/lego10196.html', 2009, 3263, 249.99) 78 | 79 | 80 | # 数据标准化 81 | def regularize(xMat, yMat): 82 | """ 83 | Parameters: 84 | xMat - x数据集 85 | yMat - y数据集 86 | Returns: 87 | inxMat - 标准化后的x数据集 88 | inyMat - 标准化后的y数据集 89 | """ 90 | inxMat = xMat.copy() # 数据拷贝 91 | inyMat = yMat.copy() 92 | yMean = np.mean(yMat, 0) # 行与行操作,求均值 93 | inyMat = yMat - yMean # 数据减去均值 94 | inMeans = np.mean(inxMat, 0) # 行与行操作,求均值 95 | inVar = np.var(inxMat, 0) # 行与行操作,求方差 96 | # print(inxMat) 97 | print(inMeans) 98 | # print(inVar) 99 | inxMat = (inxMat - inMeans) / inVar # 数据减去均值除以方差实现标准化 100 | return inxMat, inyMat 101 | 102 | 103 | # 计算平方误差 104 | def rssError(yArr, yHatArr): 105 | """ 106 | Parameters: 107 | yArr - 预测值 108 | yHatArr - 真实值 109 | Returns: 110 | 111 | """ 112 | return ((yArr - yHatArr) ** 2).sum() 113 | 114 | 115 | # 计算回归系数w 116 | def standRegres(xArr, yArr): 117 | """ 118 | Parameters: 119 | xArr - x数据集 120 | yArr - y数据集 121 | Returns: 122 | ws - 回归系数 123 | """ 124 | xMat = np.mat(xArr); 125 | yMat = np.mat(yArr).T 126 | xTx = xMat.T * xMat # 根据文中推导的公示计算回归系数 127 | if np.linalg.det(xTx) == 0.0: 128 | print("矩阵为奇异矩阵,不能转置") 129 | return 130 | ws = xTx.I * (xMat.T * yMat) 131 | return ws 132 | 133 | 134 | # 使用简单的线性回归 135 | def useStandRegres(): 136 | lgX = [] 137 | lgY = [] 138 | setDataCollect(lgX, lgY) 139 | data_num, features_num = np.shape(lgX) 140 | lgX1 = np.mat(np.ones((data_num, features_num + 1))) 141 | lgX1[:, 1:5] = np.mat(lgX) 142 | ws = standRegres(lgX1, lgY) 143 | print('%f%+f*年份%+f*部件数量%+f*是否为全新%+f*原价' % (ws[0], ws[1], ws[2], ws[3], ws[4])) 144 | 145 | 146 | if __name__ == '__main__': 147 | useStandRegres() 148 | -------------------------------------------------------------------------------- /Ch08-Regression/8.6.2-3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:35 4 | # @Author : GXl 5 | # @File : 8.6.2-3.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | from bs4 import BeautifulSoup 11 | import random 12 | 13 | 14 | # 从页面读取数据,生成retX和retY列表 15 | def scrapePage(retX, retY, inFile, yr, numPce, origPrc): 16 | """ 17 | Parameters: 18 | retX - 数据X 19 | retY - 数据Y 20 | inFile - HTML文件 21 | yr - 年份 22 | numPce - 乐高部件数目 23 | origPrc - 原价 24 | Returns: 25 | 无 26 | """ 27 | # 打开并读取HTML文件 28 | with open(inFile, encoding='utf-8') as f: 29 | html = f.read() 30 | soup = BeautifulSoup(html) 31 | i = 1 32 | # 根据HTML页面结构进行解析 33 | currentRow = soup.find_all('table', r="%d" % i) 34 | while (len(currentRow) != 0): 35 | currentRow = soup.find_all('table', r="%d" % i) 36 | title = currentRow[0].find_all('a')[1].text 37 | lwrTitle = title.lower() 38 | # 查找是否有全新标签 39 | if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1): 40 | newFlag = 1.0 41 | else: 42 | newFlag = 0.0 43 | # 查找是否已经标志出售,我们只收集已出售的数据 44 | soldUnicde = currentRow[0].find_all('td')[3].find_all('span') 45 | if len(soldUnicde) == 0: 46 | print("商品 #%d 没有出售" % i) 47 | else: 48 | # 解析页面获取当前价格 49 | soldPrice = currentRow[0].find_all('td')[4] 50 | priceStr = soldPrice.text 51 | priceStr = priceStr.replace('$', '') 52 | priceStr = priceStr.replace(',', '') 53 | if len(soldPrice) > 1: 54 | priceStr = priceStr.replace('Free shipping', '') 55 | sellingPrice = float(priceStr) 56 | # 
去掉不完整的套装价格 57 | if sellingPrice > origPrc * 0.5: 58 | print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice)) 59 | retX.append([yr, numPce, newFlag, origPrc]) 60 | retY.append(sellingPrice) 61 | i += 1 62 | currentRow = soup.find_all('table', r="%d" % i) 63 | 64 | 65 | # 岭回归 66 | def ridgeRegres(xMat, yMat, lam=0.2): 67 | """ 68 | Parameters: 69 | xMat - x数据集 70 | yMat - y数据集 71 | lam - 缩减系数 72 | Returns: 73 | ws - 回归系数 74 | """ 75 | xTx = xMat.T * xMat 76 | denom = xTx + np.eye(np.shape(xMat)[1]) * lam 77 | if np.linalg.det(denom) == 0.0: 78 | print("矩阵为奇异矩阵,不能转置") 79 | return 80 | ws = denom.I * (xMat.T * yMat) 81 | return ws 82 | 83 | 84 | # 依次读取六种乐高套装的数据,并生成数据矩阵 85 | def setDataCollect(retX, retY): 86 | # 2006年的乐高8288,部件数目800,原价49.99 87 | scrapePage(retX, retY, './setHtml/lego8288.html', 2006, 800, 49.99) 88 | # 2002年的乐高10030,部件数目3096,原价269.99 89 | scrapePage(retX, retY, './setHtml/lego10030.html', 2002, 3096, 269.99) 90 | # 2007年的乐高10179,部件数目5195,原价499.99 91 | scrapePage(retX, retY, './setHtml/lego10179.html', 2007, 5195, 499.99) 92 | # 2007年的乐高10181,部件数目3428,原价199.99 93 | scrapePage(retX, retY, './setHtml/lego10181.html', 2007, 3428, 199.99) 94 | # 2008年的乐高10189,部件数目5922,原价299.99 95 | scrapePage(retX, retY, './setHtml/lego10189.html', 2008, 5922, 299.99) 96 | # 2009年的乐高10196,部件数目3263,原价249.99 97 | scrapePage(retX, retY, './setHtml/lego10196.html', 2009, 3263, 249.99) 98 | 99 | 100 | # 数据标准化 101 | def regularize(xMat, yMat): 102 | """ 103 | Parameters: 104 | xMat - x数据集 105 | yMat - y数据集 106 | Returns: 107 | inxMat - 标准化后的x数据集 108 | inyMat - 标准化后的y数据集 109 | """ 110 | inxMat = xMat.copy() # 数据拷贝 111 | inyMat = yMat.copy() 112 | yMean = np.mean(yMat, 0) # 行与行操作,求均值 113 | inyMat = yMat - yMean # 数据减去均值 114 | inMeans = np.mean(inxMat, 0) # 行与行操作,求均值 115 | inVar = np.var(inxMat, 0) # 行与行操作,求方差 116 | # print(inxMat) 117 | print(inMeans) 118 | # print(inVar) 119 | inxMat = (inxMat - inMeans) / inVar # 数据减去均值除以方差实现标准化 120 | return inxMat, inyMat 121 | 122 | 123 | # 计算平方误差 124 | def rssError(yArr, yHatArr): 125 | """ 126 | Parameters: 127 | yArr - 预测值 128 | yHatArr - 真实值 129 | Returns: 130 | 131 | """ 132 | return ((yArr - yHatArr) ** 2).sum() 133 | 134 | 135 | # 计算回归系数w 136 | def standRegres(xArr, yArr): 137 | """ 138 | Parameters: 139 | xArr - x数据集 140 | yArr - y数据集 141 | Returns: 142 | ws - 回归系数 143 | """ 144 | xMat = np.mat(xArr); 145 | yMat = np.mat(yArr).T 146 | xTx = xMat.T * xMat # 根据文中推导的公示计算回归系数 147 | if np.linalg.det(xTx) == 0.0: 148 | print("矩阵为奇异矩阵,不能转置") 149 | return 150 | ws = xTx.I * (xMat.T * yMat) 151 | return ws 152 | 153 | 154 | # 岭回归测试 155 | def ridgeTest(xArr, yArr): 156 | """ 157 | Parameters: 158 | xMat - x数据集 159 | yMat - y数据集 160 | Returns: 161 | wMat - 回归系数矩阵 162 | """ 163 | xMat = np.mat(xArr); 164 | yMat = np.mat(yArr).T 165 | # 数据标准化 166 | yMean = np.mean(yMat, axis=0) # 行与行操作,求均值 167 | yMat = yMat - yMean # 数据减去均值 168 | xMeans = np.mean(xMat, axis=0) # 行与行操作,求均值 169 | xVar = np.var(xMat, axis=0) # 行与行操作,求方差 170 | xMat = (xMat - xMeans) / xVar # 数据减去均值除以方差实现标准化 171 | numTestPts = 30 # 30个不同的lambda测试 172 | wMat = np.zeros((numTestPts, np.shape(xMat)[1])) # 初始回归系数矩阵 173 | for i in range(numTestPts): # 改变lambda计算回归系数 174 | ws = ridgeRegres(xMat, yMat, np.exp(i - 10)) # lambda以e的指数变化,最初是一个非常小的数, 175 | wMat[i, :] = ws.T # 计算回归系数矩阵 176 | return wMat 177 | 178 | 179 | if __name__ == '__main__': 180 | lgX = [] 181 | lgY = [] 182 | setDataCollect(lgX, lgY) 183 | print(ridgeTest(lgX, lgY)) 184 | 
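# Added illustration (not part of the original 8.6.2 scripts): ridgeTest above probes
# lambda = e**(i - 10) for i = 0..29, so the printed 30 x 4 matrix traces each
# standardized coefficient from almost no shrinkage (row 0) to very heavy shrinkage
# (row 29). Below is a minimal sketch of picking one lambda from that grid with a
# simple hold-out split, reusing the ridgeRegres defined above; the helper name and
# the choice to standardize with training-set statistics are assumptions made here
# purely for illustration.
def pickLambdaHoldout(xArr, yArr, testRatio=0.3, seed=0):
    rng = np.random.RandomState(seed)                       # fixed seed for a repeatable split
    indices = rng.permutation(len(xArr))
    cut = int(len(xArr) * (1 - testRatio))
    trainIdx, testIdx = indices[:cut], indices[cut:]
    xMat = np.mat(xArr); yMat = np.mat(yArr).T
    xMean = np.mean(xMat[trainIdx], axis=0)                  # standardize with training statistics only
    xVar = np.var(xMat[trainIdx], axis=0)
    yMean = np.mean(yMat[trainIdx], axis=0)
    xTrain = (xMat[trainIdx] - xMean) / xVar
    yTrain = yMat[trainIdx] - yMean
    xTest = (xMat[testIdx] - xMean) / xVar
    bestLam, bestErr = None, float('inf')
    for i in range(30):                                      # same lambda grid as ridgeTest
        lam = np.exp(i - 10)
        ws = ridgeRegres(xTrain, yTrain, lam)
        if ws is None:                                       # singular-matrix guard in ridgeRegres
            continue
        yHat = xTest * ws + yMean                            # undo the mean-centering of y
        err = float(np.power(yMat[testIdx] - yHat, 2).sum())
        if err < bestErr:
            bestLam, bestErr = lam, err
    return bestLam, bestErr
# Example use: bestLam, bestErr = pickLambdaHoldout(lgX, lgY) after setDataCollect(lgX, lgY).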
-------------------------------------------------------------------------------- /Ch08-Regression/8.7.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:35 4 | # @Author : GXl 5 | # @File : 8.7.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | from bs4 import BeautifulSoup 11 | import random 12 | 13 | # 从页面读取数据,生成retX和retY列表 14 | def scrapePage(retX, retY, inFile, yr, numPce, origPrc): 15 | """ 16 | Parameters: 17 | retX - 数据X 18 | retY - 数据Y 19 | inFile - HTML文件 20 | yr - 年份 21 | numPce - 乐高部件数目 22 | origPrc - 原价 23 | Returns: 24 | 无 25 | """ 26 | # 打开并读取HTML文件 27 | with open(inFile, encoding='utf-8') as f: 28 | html = f.read() 29 | soup = BeautifulSoup(html) 30 | i = 1 31 | # 根据HTML页面结构进行解析 32 | currentRow = soup.find_all('table', r = "%d" % i) 33 | while(len(currentRow) != 0): 34 | currentRow = soup.find_all('table', r = "%d" % i) 35 | title = currentRow[0].find_all('a')[1].text 36 | lwrTitle = title.lower() 37 | # 查找是否有全新标签 38 | if (lwrTitle.find('new') > -1) or (lwrTitle.find('nisb') > -1): 39 | newFlag = 1.0 40 | else: 41 | newFlag = 0.0 42 | # 查找是否已经标志出售,我们只收集已出售的数据 43 | soldUnicde = currentRow[0].find_all('td')[3].find_all('span') 44 | if len(soldUnicde) == 0: 45 | print("商品 #%d 没有出售" % i) 46 | else: 47 | # 解析页面获取当前价格 48 | soldPrice = currentRow[0].find_all('td')[4] 49 | priceStr = soldPrice.text 50 | priceStr = priceStr.replace('$','') 51 | priceStr = priceStr.replace(',','') 52 | if len(soldPrice) > 1: 53 | priceStr = priceStr.replace('Free shipping', '') 54 | sellingPrice = float(priceStr) 55 | # 去掉不完整的套装价格 56 | if sellingPrice > origPrc * 0.5: 57 | print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice)) 58 | retX.append([yr, numPce, newFlag, origPrc]) 59 | retY.append(sellingPrice) 60 | i += 1 61 | currentRow = soup.find_all('table', r = "%d" % i) 62 | 63 | # 依次读取六种乐高套装的数据,并生成数据矩阵 64 | def setDataCollect(retX, retY): 65 | # 2006年的乐高8288,部件数目800,原价49.99 66 | scrapePage(retX, retY, './setHtml/lego8288.html', 2006, 800, 49.99) 67 | # 2002年的乐高10030,部件数目3096,原价269.99 68 | scrapePage(retX, retY, './setHtml/lego10030.html', 2002, 3096, 269.99) 69 | # 2007年的乐高10179,部件数目5195,原价499.99 70 | scrapePage(retX, retY, './setHtml/lego10179.html', 2007, 5195, 499.99) 71 | # 2007年的乐高10181,部件数目3428,原价199.99 72 | scrapePage(retX, retY, './setHtml/lego10181.html', 2007, 3428, 199.99) 73 | # 2008年的乐高10189,部件数目5922,原价299.99 74 | scrapePage(retX, retY, './setHtml/lego10189.html', 2008, 5922, 299.99) 75 | # 2009年的乐高10196,部件数目3263,原价249.99 76 | scrapePage(retX, retY, './setHtml/lego10196.html', 2009, 3263, 249.99) 77 | 78 | # 使用sklearn 79 | def usesklearn(): 80 | from sklearn import linear_model 81 | reg = linear_model.Ridge(alpha = .5) 82 | lgX = [] 83 | lgY = [] 84 | setDataCollect(lgX, lgY) 85 | reg.fit(lgX, lgY) 86 | print('%f%+f*年份%+f*部件数量%+f*是否为全新%+f*原价' % (reg.intercept_, reg.coef_[0], reg.coef_[1], reg.coef_[2], reg.coef_[3])) 87 | 88 | 89 | if __name__ == '__main__': 90 | usesklearn() 91 | -------------------------------------------------------------------------------- /Ch09-Regression Trees/9.3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:36 4 | # @Author : GXl 5 | # @File : 9.3.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | 11 | # 
函数说明:根据给定特征和特征值,通过数组过滤的方式切分数据集合 12 | """ 13 | Parameters: 14 | dataSet - 数据集合 15 | feature - 待切分的特征 16 | value - 特征的某个值 17 | """ 18 | def binSplitDataSet(dataSet, feature, value): 19 | mat0 = dataSet[np.nonzero(dataSet[:,feature] > value)[0],:] 20 | mat1 = dataSet[np.nonzero(dataSet[:,feature] <= value)[0],:] 21 | return mat0, mat1 22 | 23 | 24 | if __name__ == '__main__': 25 | testMat = np.mat(np.eye(4)) 26 | print("testMat:", testMat) 27 | mat0, mat1 = binSplitDataSet(testMat, 1, 0.5) 28 | print("mat0:", mat0) 29 | print("mat1:", mat1) 30 | -------------------------------------------------------------------------------- /Ch09-Regression Trees/9.4.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:37 4 | # @Author : GXl 5 | # @File : 9.4.1.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | from numpy import * 10 | import matplotlib.pyplot as plt 11 | 12 | # 函数说明:加载数据 13 | """ 14 | Parameters: 15 | filename - 文件名 16 | """ 17 | def loadDataSet(fileName): 18 | dataMat = [] 19 | fr = open(fileName) 20 | for line in fr.readlines(): 21 | curLine = line.strip().split('\t') 22 | fltLine = list(map(float, curLine)) 23 | dataMat.append(fltLine) 24 | return dataMat 25 | 26 | # 函数说明:绘制数据集分布 27 | """ 28 | Parameters: 29 | filename - 文件名 30 | """ 31 | def plotDataSet(filename): 32 | dataMat = loadDataSet(filename) 33 | n = len(dataMat) # 数据个数 34 | xcord = [] 35 | ycord = [] # 样本点 36 | for i in range(n): 37 | xcord.append(dataMat[i][0]) 38 | ycord.append(dataMat[i][1]) # 样本点 39 | fig = plt.figure() 40 | ax = fig.add_subplot(111) # 添加subplot 41 | ax.scatter(xcord, ycord, s=20, c='blue', alpha=.5) # 绘制样本点 42 | plt.title('DataSet') # 绘制title 43 | plt.xlabel('X') 44 | plt.show() 45 | 46 | 47 | if __name__ == '__main__': 48 | filename = 'ex00.txt' 49 | plotDataSet(filename) 50 | -------------------------------------------------------------------------------- /Ch09-Regression Trees/9.4.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:37 4 | # @Author : GXl 5 | # @File : 9.4.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | 11 | # 函数说明:加载数据 12 | """ 13 | Parameters: 14 | fileName - 文件名 15 | """ 16 | def loadDataSet(fileName): 17 | dataMat = [] 18 | fr = open(fileName) 19 | for line in fr.readlines(): 20 | curLine = line.strip().split('\t') 21 | fltLine = list(map(float, curLine)) #转化为float类型 22 | dataMat.append(fltLine) 23 | return dataMat 24 | 25 | # 函数说明:根据特征切分数据集合 26 | """ 27 | Parameters: 28 | dataSet - 数据集合 29 | feature - 带切分的特征 30 | value - 该特征的值 31 | """ 32 | def binSplitDataSet(dataSet, feature, value): 33 | mat0 = dataSet[np.nonzero(dataSet[:,feature] > value)[0],:] 34 | mat1 = dataSet[np.nonzero(dataSet[:,feature] <= value)[0],:] 35 | return mat0, mat1 36 | 37 | # 函数说明:生成叶结点 38 | """ 39 | Parameters: 40 | dataSet - 数据集合 41 | """ 42 | def regLeaf(dataSet): 43 | return np.mean(dataSet[:,-1]) 44 | 45 | # 函数说明:误差估计函数 46 | """ 47 | Parameters: 48 | dataSet - 数据集合 49 | """ 50 | def regErr(dataSet): 51 | return np.var(dataSet[:,-1]) * np.shape(dataSet)[0] 52 | 53 | # 函数说明:找到数据的最佳二元切分方式函数 54 | """ 55 | Parameters: 56 | dataSet - 数据集合 57 | leafType - 生成叶结点 58 | regErr - 误差估计函数 59 | ops - 用户定义的参数构成的元组 60 | """ 61 | def chooseBestSplit(dataSet, leafType = regLeaf, errType = regErr, ops = (1,4)): 62 | import types 63 | 
#tolS允许的误差下降值,tolN切分的最少样本数 64 | tolS = ops[0]; tolN = ops[1] 65 | #如果当前所有值相等,则退出。(根据set的特性) 66 | if len(set(dataSet[:,-1].T.tolist()[0])) == 1: 67 | return None, leafType(dataSet) 68 | #统计数据集合的行m和列n 69 | m, n = np.shape(dataSet) 70 | #默认最后一个特征为最佳切分特征,计算其误差估计 71 | S = errType(dataSet) 72 | #分别为最佳误差,最佳特征切分的索引值,最佳特征值 73 | bestS = float('inf'); bestIndex = 0; bestValue = 0 74 | #遍历所有特征列 75 | for featIndex in range(n - 1): 76 | #遍历所有特征值 77 | for splitVal in set(dataSet[:,featIndex].T.A.tolist()[0]): 78 | #根据特征和特征值切分数据集 79 | mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal) 80 | #如果数据少于tolN,则退出 81 | if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): continue 82 | #计算误差估计 83 | newS = errType(mat0) + errType(mat1) 84 | #如果误差估计更小,则更新特征索引值和特征值 85 | if newS < bestS: 86 | bestIndex = featIndex 87 | bestValue = splitVal 88 | bestS = newS 89 | #如果误差减少不大则退出 90 | if (S - bestS) < tolS: 91 | return None, leafType(dataSet) 92 | #根据最佳的切分特征和特征值切分数据集合 93 | mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue) 94 | #如果切分出的数据集很小则退出 95 | if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): 96 | return None, leafType(dataSet) 97 | #返回最佳切分特征和特征值 98 | return bestIndex, bestValue 99 | 100 | if __name__ == '__main__': 101 | myDat = loadDataSet('ex00.txt') 102 | myMat = np.mat(myDat) 103 | feat, val = chooseBestSplit(myMat, regLeaf, regErr, (1, 4)) 104 | print(feat) 105 | print(val) 106 | -------------------------------------------------------------------------------- /Ch09-Regression Trees/9.4.3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:37 4 | # @Author : GXl 5 | # @File : 9.4.3.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | 11 | # 函数说明:加载数据 12 | """ 13 | Parameters: 14 | fileName - 文件名 15 | """ 16 | 17 | 18 | def loadDataSet(fileName): 19 | dataMat = [] 20 | fr = open(fileName) 21 | for line in fr.readlines(): 22 | curLine = line.strip().split('\t') 23 | fltLine = list(map(float, curLine)) # 转化为float类型 24 | dataMat.append(fltLine) 25 | return dataMat 26 | 27 | 28 | # 函数说明:根据特征切分数据集合 29 | """ 30 | Parameters: 31 | dataSet - 数据集合 32 | feature - 带切分的特征 33 | value - 该特征的值 34 | """ 35 | 36 | 37 | def binSplitDataSet(dataSet, feature, value): 38 | mat0 = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :] 39 | mat1 = dataSet[np.nonzero(dataSet[:, feature] <= value)[0], :] 40 | return mat0, mat1 41 | 42 | 43 | # 函数说明:生成叶结点 44 | """ 45 | Parameters: 46 | dataSet - 数据集合 47 | """ 48 | 49 | 50 | def regLeaf(dataSet): 51 | return np.mean(dataSet[:, -1]) 52 | 53 | 54 | # 函数说明:误差估计函数 55 | """ 56 | Parameters: 57 | dataSet - 数据集合 58 | """ 59 | 60 | 61 | def regErr(dataSet): 62 | return np.var(dataSet[:, -1]) * np.shape(dataSet)[0] 63 | 64 | 65 | # 函数说明:找到数据的最佳二元切分方式函数 66 | """ 67 | Parameters: 68 | dataSet - 数据集合 69 | leafType - 生成叶结点 70 | regErr - 误差估计函数 71 | ops - 用户定义的参数构成的元组 72 | """ 73 | 74 | 75 | def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)): 76 | import types 77 | # tolS允许的误差下降值,tolN切分的最少样本数 78 | tolS = ops[0]; 79 | tolN = ops[1] 80 | # 如果当前所有值相等,则退出。(根据set的特性) 81 | if len(set(dataSet[:, -1].T.tolist()[0])) == 1: 82 | return None, leafType(dataSet) 83 | # 统计数据集合的行m和列n 84 | m, n = np.shape(dataSet) 85 | # 默认最后一个特征为最佳切分特征,计算其误差估计 86 | S = errType(dataSet) 87 | # 分别为最佳误差,最佳特征切分的索引值,最佳特征值 88 | bestS = float('inf'); 89 | bestIndex = 0; 90 | bestValue = 0 91 | # 遍历所有特征列 92 | for featIndex in range(n - 
1): 93 | # 遍历所有特征值 94 | for splitVal in set(dataSet[:, featIndex].T.A.tolist()[0]): 95 | # 根据特征和特征值切分数据集 96 | mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal) 97 | # 如果数据少于tolN,则退出 98 | if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): continue 99 | # 计算误差估计 100 | newS = errType(mat0) + errType(mat1) 101 | # 如果误差估计更小,则更新特征索引值和特征值 102 | if newS < bestS: 103 | bestIndex = featIndex 104 | bestValue = splitVal 105 | bestS = newS 106 | # 如果误差减少不大则退出 107 | if (S - bestS) < tolS: 108 | return None, leafType(dataSet) 109 | # 根据最佳的切分特征和特征值切分数据集合 110 | mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue) 111 | # 如果切分出的数据集很小则退出 112 | if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): 113 | return None, leafType(dataSet) 114 | # 返回最佳切分特征和特征值 115 | return bestIndex, bestValue 116 | 117 | 118 | # 函数说明:树构建函数 119 | """ 120 | Parameters: 121 | dataSet - 数据集合 122 | leafType - 建立叶结点的函数 123 | errType - 误差计算函数 124 | ops - 包含树构建所有其他参数的元组 125 | """ 126 | 127 | 128 | def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)): 129 | # 选择最佳切分特征和特征值 130 | feat, val = chooseBestSplit(dataSet, leafType, errType, ops) 131 | # r如果没有特征,则返回特征值 132 | if feat == None: return val 133 | # 回归树 134 | retTree = {} 135 | retTree['spInd'] = feat 136 | retTree['spVal'] = val 137 | # 分成左数据集和右数据集 138 | lSet, rSet = binSplitDataSet(dataSet, feat, val) 139 | # 创建左子树和右子树 140 | retTree['left'] = createTree(lSet, leafType, errType, ops) 141 | retTree['right'] = createTree(rSet, leafType, errType, ops) 142 | return retTree 143 | 144 | 145 | if __name__ == '__main__': 146 | myDat = loadDataSet('ex00.txt') 147 | myMat = np.mat(myDat) 148 | print(createTree(myMat)) 149 | -------------------------------------------------------------------------------- /Ch09-Regression Trees/9.4.4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:37 4 | # @Author : GXl 5 | # @File : 9.4.4.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | 12 | # 函数说明:加载数据 13 | """ 14 | Parameters: 15 | filename - 文件名 16 | """ 17 | def loadDataSet(fileName): 18 | dataMat = [] 19 | fr = open(fileName) 20 | for line in fr.readlines(): 21 | curLine = line.strip().split('\t') 22 | fltLine = list(map(float, curLine)) #转化为float类型 23 | dataMat.append(fltLine) 24 | return dataMat 25 | 26 | # 函数说明:绘制数据集 27 | """ 28 | Parameters: 29 | filename - 文件名 30 | """ 31 | def plotDataSet(filename): 32 | dataMat = loadDataSet(filename) #加载数据集 33 | n = len(dataMat) #数据个数 34 | xcord = []; ycord = [] #样本点 35 | for i in range(n): 36 | xcord.append(dataMat[i][1]); ycord.append(dataMat[i][2]) #样本点 37 | fig = plt.figure() 38 | ax = fig.add_subplot(111) #添加subplot 39 | ax.scatter(xcord, ycord, s = 20, c = 'blue',alpha = .5) #绘制样本点 40 | plt.title('DataSet') #绘制title 41 | plt.xlabel('X') 42 | plt.show() 43 | 44 | if __name__ == '__main__': 45 | filename = 'ex0.txt' 46 | plotDataSet(filename) 47 | -------------------------------------------------------------------------------- /Ch09-Regression Trees/9.4.5.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:38 4 | # @Author : GXl 5 | # @File : 9.4.5.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | 11 | # 函数说明:加载数据 12 | """ 13 | Parameters: 14 | fileName - 文件名 15 | """ 
16 | 17 | 18 | def loadDataSet(fileName): 19 | dataMat = [] 20 | fr = open(fileName) 21 | for line in fr.readlines(): 22 | curLine = line.strip().split('\t') 23 | fltLine = list(map(float, curLine)) # 转化为float类型 24 | dataMat.append(fltLine) 25 | return dataMat 26 | 27 | 28 | # 函数说明:根据特征切分数据集合 29 | """ 30 | Parameters: 31 | dataSet - 数据集合 32 | feature - 带切分的特征 33 | value - 该特征的值 34 | """ 35 | 36 | 37 | def binSplitDataSet(dataSet, feature, value): 38 | mat0 = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :] 39 | mat1 = dataSet[np.nonzero(dataSet[:, feature] <= value)[0], :] 40 | return mat0, mat1 41 | 42 | 43 | # 函数说明:生成叶结点 44 | """ 45 | Parameters: 46 | dataSet - 数据集合 47 | """ 48 | 49 | 50 | def regLeaf(dataSet): 51 | return np.mean(dataSet[:, -1]) 52 | 53 | 54 | # 函数说明:误差估计函数 55 | """ 56 | Parameters: 57 | dataSet - 数据集合 58 | """ 59 | 60 | 61 | def regErr(dataSet): 62 | return np.var(dataSet[:, -1]) * np.shape(dataSet)[0] 63 | 64 | 65 | # 函数说明:找到数据的最佳二元切分方式函数 66 | """ 67 | Parameters: 68 | dataSet - 数据集合 69 | leafType - 生成叶结点 70 | regErr - 误差估计函数 71 | ops - 用户定义的参数构成的元组 72 | """ 73 | 74 | 75 | def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)): 76 | import types 77 | # tolS允许的误差下降值,tolN切分的最少样本数 78 | tolS = ops[0]; 79 | tolN = ops[1] 80 | # 如果当前所有值相等,则退出。(根据set的特性) 81 | if len(set(dataSet[:, -1].T.tolist()[0])) == 1: 82 | return None, leafType(dataSet) 83 | # 统计数据集合的行m和列n 84 | m, n = np.shape(dataSet) 85 | # 默认最后一个特征为最佳切分特征,计算其误差估计 86 | S = errType(dataSet) 87 | # 分别为最佳误差,最佳特征切分的索引值,最佳特征值 88 | bestS = float('inf'); 89 | bestIndex = 0; 90 | bestValue = 0 91 | # 遍历所有特征列 92 | for featIndex in range(n - 1): 93 | # 遍历所有特征值 94 | for splitVal in set(dataSet[:, featIndex].T.A.tolist()[0]): 95 | # 根据特征和特征值切分数据集 96 | mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal) 97 | # 如果数据少于tolN,则退出 98 | if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): continue 99 | # 计算误差估计 100 | newS = errType(mat0) + errType(mat1) 101 | # 如果误差估计更小,则更新特征索引值和特征值 102 | if newS < bestS: 103 | bestIndex = featIndex 104 | bestValue = splitVal 105 | bestS = newS 106 | # 如果误差减少不大则退出 107 | if (S - bestS) < tolS: 108 | return None, leafType(dataSet) 109 | # 根据最佳的切分特征和特征值切分数据集合 110 | mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue) 111 | # 如果切分出的数据集很小则退出 112 | if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): 113 | return None, leafType(dataSet) 114 | # 返回最佳切分特征和特征值 115 | return bestIndex, bestValue 116 | 117 | 118 | # 函数说明:树构建函数 119 | """ 120 | Parameters: 121 | dataSet - 数据集合 122 | leafType - 建立叶结点的函数 123 | errType - 误差计算函数 124 | ops - 包含树构建所有其他参数的元组 125 | """ 126 | 127 | 128 | def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)): 129 | # 选择最佳切分特征和特征值 130 | feat, val = chooseBestSplit(dataSet, leafType, errType, ops) 131 | # r如果没有特征,则返回特征值 132 | if feat == None: return val 133 | # 回归树 134 | retTree = {} 135 | retTree['spInd'] = feat 136 | retTree['spVal'] = val 137 | # 分成左数据集和右数据集 138 | lSet, rSet = binSplitDataSet(dataSet, feat, val) 139 | # 创建左子树和右子树 140 | retTree['left'] = createTree(lSet, leafType, errType, ops) 141 | retTree['right'] = createTree(rSet, leafType, errType, ops) 142 | return retTree 143 | 144 | 145 | if __name__ == '__main__': 146 | myDat = loadDataSet('ex0.txt') 147 | myMat = np.mat(myDat) 148 | print(createTree(myMat)) 149 | -------------------------------------------------------------------------------- /Ch09-Regression Trees/9.5.1-1.py: -------------------------------------------------------------------------------- 1 | 
#!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:38 4 | # @Author : GXl 5 | # @File : 9.5.1-1.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | 12 | # 函数说明:加载数据 13 | """ 14 | Parameters: 15 | fileName - 文件名 16 | """ 17 | def loadDataSet(fileName): 18 | 19 | dataMat = [] 20 | fr = open(fileName) 21 | for line in fr.readlines(): 22 | curLine = line.strip().split('\t') 23 | fltLine = list(map(float, curLine)) #转化为float类型 24 | dataMat.append(fltLine) 25 | return dataMat 26 | 27 | # 函数说明:绘制数据集 28 | """ 29 | Parameters: 30 | filename - 文件名 31 | """ 32 | def plotDataSet(filename): 33 | dataMat = loadDataSet(filename) #加载数据集 34 | n = len(dataMat) #数据个数 35 | xcord = []; ycord = [] #样本点 36 | for i in range(n): 37 | xcord.append(dataMat[i][0]); ycord.append(dataMat[i][1]) #样本点 38 | fig = plt.figure() 39 | ax = fig.add_subplot(111) #添加subplot 40 | ax.scatter(xcord, ycord, s = 20, c = 'blue',alpha = .5) #绘制样本点 41 | plt.title('DataSet') #绘制title 42 | plt.xlabel('X') 43 | plt.show() 44 | 45 | 46 | if __name__ == '__main__': 47 | filename = 'ex2.txt' 48 | plotDataSet(filename) 49 | -------------------------------------------------------------------------------- /Ch09-Regression Trees/9.5.1-2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:38 4 | # @Author : GXl 5 | # @File : 9.5.1-2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | 11 | # 函数说明:加载数据 12 | """ 13 | Parameters: 14 | fileName - 文件名 15 | """ 16 | def loadDataSet(fileName): 17 | dataMat = [] 18 | fr = open(fileName) 19 | for line in fr.readlines(): 20 | curLine = line.strip().split('\t') 21 | fltLine = list(map(float, curLine)) # 转化为float类型 22 | dataMat.append(fltLine) 23 | return dataMat 24 | 25 | # 函数说明:根据特征切分数据集合 26 | """ 27 | Parameters: 28 | dataSet - 数据集合 29 | feature - 带切分的特征 30 | value - 该特征的值 31 | """ 32 | def binSplitDataSet(dataSet, feature, value): 33 | mat0 = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :] 34 | mat1 = dataSet[np.nonzero(dataSet[:, feature] <= value)[0], :] 35 | return mat0, mat1 36 | 37 | # 函数说明:生成叶结点 38 | """ 39 | Parameters: 40 | dataSet - 数据集合 41 | """ 42 | def regLeaf(dataSet): 43 | return np.mean(dataSet[:, -1]) 44 | 45 | # 函数说明:误差估计函数 46 | """ 47 | Parameters: 48 | dataSet - 数据集合 49 | """ 50 | def regErr(dataSet): 51 | return np.var(dataSet[:, -1]) * np.shape(dataSet)[0] 52 | 53 | # 函数说明:找到数据的最佳二元切分方式函数 54 | """ 55 | Parameters: 56 | dataSet - 数据集合 57 | leafType - 生成叶结点 58 | regErr - 误差估计函数 59 | ops - 用户定义的参数构成的元组 60 | """ 61 | def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)): 62 | import types 63 | # tolS允许的误差下降值,tolN切分的最少样本数 64 | tolS = ops[0]; 65 | tolN = ops[1] 66 | # 如果当前所有值相等,则退出。(根据set的特性) 67 | if len(set(dataSet[:, -1].T.tolist()[0])) == 1: 68 | return None, leafType(dataSet) 69 | # 统计数据集合的行m和列n 70 | m, n = np.shape(dataSet) 71 | # 默认最后一个特征为最佳切分特征,计算其误差估计 72 | S = errType(dataSet) 73 | # 分别为最佳误差,最佳特征切分的索引值,最佳特征值 74 | bestS = float('inf'); 75 | bestIndex = 0; 76 | bestValue = 0 77 | # 遍历所有特征列 78 | for featIndex in range(n - 1): 79 | # 遍历所有特征值 80 | for splitVal in set(dataSet[:, featIndex].T.A.tolist()[0]): 81 | # 根据特征和特征值切分数据集 82 | mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal) 83 | # 如果数据少于tolN,则退出 84 | if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): continue 85 | # 计算误差估计 86 | newS = 
errType(mat0) + errType(mat1) 87 | # 如果误差估计更小,则更新特征索引值和特征值 88 | if newS < bestS: 89 | bestIndex = featIndex 90 | bestValue = splitVal 91 | bestS = newS 92 | # 如果误差减少不大则退出 93 | if (S - bestS) < tolS: 94 | return None, leafType(dataSet) 95 | # 根据最佳的切分特征和特征值切分数据集合 96 | mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue) 97 | # 如果切分出的数据集很小则退出 98 | if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): 99 | return None, leafType(dataSet) 100 | # 返回最佳切分特征和特征值 101 | return bestIndex, bestValue 102 | 103 | # 函数说明:树构建函数 104 | """ 105 | Parameters: 106 | dataSet - 数据集合 107 | leafType - 建立叶结点的函数 108 | errType - 误差计算函数 109 | ops - 包含树构建所有其他参数的元组 110 | """ 111 | def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)): 112 | # 选择最佳切分特征和特征值 113 | feat, val = chooseBestSplit(dataSet, leafType, errType, ops) 114 | # r如果没有特征,则返回特征值 115 | if feat == None: return val 116 | # 回归树 117 | retTree = {} 118 | retTree['spInd'] = feat 119 | retTree['spVal'] = val 120 | # 分成左数据集和右数据集 121 | lSet, rSet = binSplitDataSet(dataSet, feat, val) 122 | # 创建左子树和右子树 123 | retTree['left'] = createTree(lSet, leafType, errType, ops) 124 | retTree['right'] = createTree(rSet, leafType, errType, ops) 125 | return retTree 126 | 127 | 128 | if __name__ == '__main__': 129 | myDat = loadDataSet('ex2.txt') 130 | myMat = np.mat(myDat) 131 | print(createTree(myMat)) 132 | -------------------------------------------------------------------------------- /Ch09-Regression Trees/9.5.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:39 4 | # @Author : GXl 5 | # @File : 9.5.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | 12 | # 函数说明:加载数据 13 | """ 14 | Parameters: 15 | fileName - 文件名 16 | """ 17 | def loadDataSet(fileName): 18 | dataMat = [] 19 | fr = open(fileName) 20 | for line in fr.readlines(): 21 | curLine = line.strip().split('\t') 22 | fltLine = list(map(float, curLine)) #转化为float类型 23 | dataMat.append(fltLine) 24 | return dataMat 25 | 26 | # 函数说明:绘制数据集 27 | """ 28 | Parameters: 29 | filename - 文件名 30 | """ 31 | def plotDataSet(filename): 32 | dataMat = loadDataSet(filename) #加载数据集 33 | n = len(dataMat) #数据个数 34 | xcord = []; ycord = [] #样本点 35 | for i in range(n): 36 | xcord.append(dataMat[i][0]); ycord.append(dataMat[i][1]) #样本点 37 | fig = plt.figure() 38 | ax = fig.add_subplot(111) #添加subplot 39 | ax.scatter(xcord, ycord, s = 20, c = 'blue',alpha = .5) #绘制样本点 40 | plt.title('DataSet') #绘制title 41 | plt.xlabel('X') 42 | plt.show() 43 | 44 | # 函数说明:根据特征切分数据集合 45 | """ 46 | Parameters: 47 | dataSet - 数据集合 48 | feature - 带切分的特征 49 | value - 该特征的值 50 | """ 51 | def binSplitDataSet(dataSet, feature, value): 52 | mat0 = dataSet[np.nonzero(dataSet[:,feature] > value)[0],:] 53 | mat1 = dataSet[np.nonzero(dataSet[:,feature] <= value)[0],:] 54 | return mat0, mat1 55 | 56 | # 函数说明:生成叶结点 57 | """ 58 | Parameters: 59 | dataSet - 数据集合 60 | """ 61 | def regLeaf(dataSet): 62 | return np.mean(dataSet[:,-1]) 63 | 64 | # 函数说明:误差估计函数 65 | """ 66 | Parameters: 67 | dataSet - 数据集合 68 | """ 69 | def regErr(dataSet): 70 | return np.var(dataSet[:,-1]) * np.shape(dataSet)[0] 71 | 72 | # 函数说明:找到数据的最佳二元切分方式函数 73 | """ 74 | Parameters: 75 | dataSet - 数据集合 76 | leafType - 生成叶结点 77 | regErr - 误差估计函数 78 | ops - 用户定义的参数构成的元组 79 | """ 80 | def chooseBestSplit(dataSet, leafType = regLeaf, errType = regErr, ops = (1,4)): 81 | import types 82 | 
#tolS允许的误差下降值,tolN切分的最少样本数 83 | tolS = ops[0]; tolN = ops[1] 84 | #如果当前所有值相等,则退出。(根据set的特性) 85 | if len(set(dataSet[:,-1].T.tolist()[0])) == 1: 86 | return None, leafType(dataSet) 87 | #统计数据集合的行m和列n 88 | m, n = np.shape(dataSet) 89 | #默认最后一个特征为最佳切分特征,计算其误差估计 90 | S = errType(dataSet) 91 | #分别为最佳误差,最佳特征切分的索引值,最佳特征值 92 | bestS = float('inf'); bestIndex = 0; bestValue = 0 93 | #遍历所有特征列 94 | for featIndex in range(n - 1): 95 | #遍历所有特征值 96 | for splitVal in set(dataSet[:,featIndex].T.A.tolist()[0]): 97 | #根据特征和特征值切分数据集 98 | mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal) 99 | #如果数据少于tolN,则退出 100 | if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): continue 101 | #计算误差估计 102 | newS = errType(mat0) + errType(mat1) 103 | #如果误差估计更小,则更新特征索引值和特征值 104 | if newS < bestS: 105 | bestIndex = featIndex 106 | bestValue = splitVal 107 | bestS = newS 108 | #如果误差减少不大则退出 109 | if (S - bestS) < tolS: 110 | return None, leafType(dataSet) 111 | #根据最佳的切分特征和特征值切分数据集合 112 | mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue) 113 | #如果切分出的数据集很小则退出 114 | if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): 115 | return None, leafType(dataSet) 116 | #返回最佳切分特征和特征值 117 | return bestIndex, bestValue 118 | 119 | # 函数说明:树构建函数 120 | """ 121 | Parameters: 122 | dataSet - 数据集合 123 | leafType - 建立叶结点的函数 124 | errType - 误差计算函数 125 | ops - 包含树构建所有其他参数的元组 126 | """ 127 | def createTree(dataSet, leafType = regLeaf, errType = regErr, ops = (1, 4)): 128 | #选择最佳切分特征和特征值 129 | feat, val = chooseBestSplit(dataSet, leafType, errType, ops) 130 | #r如果没有特征,则返回特征值 131 | if feat == None: return val 132 | #回归树 133 | retTree = {} 134 | retTree['spInd'] = feat 135 | retTree['spVal'] = val 136 | #分成左数据集和右数据集 137 | lSet, rSet = binSplitDataSet(dataSet, feat, val) 138 | #创建左子树和右子树 139 | retTree['left'] = createTree(lSet, leafType, errType, ops) 140 | retTree['right'] = createTree(rSet, leafType, errType, ops) 141 | return retTree 142 | 143 | # 函数说明:判断测试输入变量是否是一棵树 144 | """ 145 | Parameters: 146 | obj - 测试对象 147 | """ 148 | def isTree(obj): 149 | import types 150 | return (type(obj).__name__ == 'dict') 151 | 152 | # 函数说明:对树进行塌陷处理(即返回树平均值) 153 | """ 154 | Parameters: 155 | tree - 树 156 | """ 157 | def getMean(tree): 158 | if isTree(tree['right']): tree['right'] = getMean(tree['right']) 159 | if isTree(tree['left']): tree['left'] = getMean(tree['left']) 160 | return (tree['left'] + tree['right']) / 2.0 161 | 162 | # 函数说明:后剪枝 163 | """ 164 | Parameters: 165 | tree - 树 166 | test - 测试集 167 | """ 168 | def prune(tree, testData): 169 | #如果测试集为空,则对树进行塌陷处理 170 | if np.shape(testData)[0] == 0: return getMean(tree) 171 | #如果有左子树或者右子树,则切分数据集 172 | if (isTree(tree['right']) or isTree(tree['left'])): 173 | lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal']) 174 | #处理左子树(剪枝) 175 | if isTree(tree['left']): tree['left'] = prune(tree['left'], lSet) 176 | #处理右子树(剪枝) 177 | if isTree(tree['right']): tree['right'] = prune(tree['right'], rSet) 178 | #如果当前结点的左右结点为叶结点 179 | if not isTree(tree['left']) and not isTree(tree['right']): 180 | lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal']) 181 | #计算没有合并的误差 182 | errorNoMerge = np.sum(np.power(lSet[:,-1] - tree['left'],2)) + np.sum(np.power(rSet[:,-1] - tree['right'],2)) 183 | #计算合并的均值 184 | treeMean = (tree['left'] + tree['right']) / 2.0 185 | #计算合并的误差 186 | errorMerge = np.sum(np.power(testData[:,-1] - treeMean, 2)) 187 | #如果合并的误差小于没有合并的误差,则合并 188 | if errorMerge < errorNoMerge: 189 | return treeMean 190 | else: return tree 191 | else: return tree 192 | 
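# Added illustration (not part of the original listing): prune() keeps a merge only
# when errorMerge < errorNoMerge on the held-out test data, so the returned tree can
# be much smaller than the one createTree() built. A tiny helper (name assumed here)
# to count leaf nodes makes that shrinkage visible:
def numLeaves(tree):
    if not isTree(tree):                 # a leaf is stored as a plain float
        return 1
    return numLeaves(tree['left']) + numLeaves(tree['right'])
# e.g. print(numLeaves(tree)) before and after prune(tree, test_Mat) in __main__ below.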
193 | if __name__ == '__main__': 194 | train_filename = 'ex2.txt' 195 | train_Data = loadDataSet(train_filename) 196 | train_Mat = np.mat(train_Data) 197 | tree = createTree(train_Mat) 198 | print(tree) 199 | test_filename = 'ex2test.txt' 200 | test_Data = loadDataSet(test_filename) 201 | test_Mat = np.mat(test_Data) 202 | print(prune(tree, test_Mat)) 203 | -------------------------------------------------------------------------------- /Ch09-Regression Trees/9.6.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:39 4 | # @Author : GXl 5 | # @File : 9.6.1.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | 12 | # 函数说明:加载数据 13 | """ 14 | Parameters: 15 | fileName - 文件名 16 | """ 17 | def loadDataSet(fileName): 18 | 19 | dataMat = [] 20 | fr = open(fileName) 21 | for line in fr.readlines(): 22 | curLine = line.strip().split('\t') 23 | fltLine = list(map(float, curLine)) #转化为float类型 24 | dataMat.append(fltLine) 25 | return dataMat 26 | 27 | # 函数说明:绘制数据集 28 | """ 29 | Parameters: 30 | filename - 文件名 31 | """ 32 | def plotDataSet(filename): 33 | dataMat = loadDataSet(filename) #加载数据集 34 | n = len(dataMat) #数据个数 35 | xcord = []; ycord = [] #样本点 36 | for i in range(n): 37 | xcord.append(dataMat[i][0]); ycord.append(dataMat[i][1]) #样本点 38 | fig = plt.figure() 39 | ax = fig.add_subplot(111) #添加subplot 40 | ax.scatter(xcord, ycord, s = 20, c = 'blue',alpha = .5) #绘制样本点 41 | plt.title('DataSet') #绘制title 42 | plt.xlabel('X') 43 | plt.show() 44 | 45 | 46 | if __name__ == '__main__': 47 | filename = 'exp2.txt' 48 | plotDataSet(filename) 49 | -------------------------------------------------------------------------------- /Ch09-Regression Trees/9.6.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:39 4 | # @Author : GXl 5 | # @File : 9.6.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import numpy as np 10 | 11 | # 函数说明:加载数据 12 | """ 13 | Parameters: 14 | fileName - 文件名 15 | """ 16 | def loadDataSet(fileName): 17 | dataMat = [] 18 | fr = open(fileName) 19 | for line in fr.readlines(): 20 | curLine = line.strip().split('\t') 21 | fltLine = list(map(float, curLine)) # 转化为float类型 22 | dataMat.append(fltLine) 23 | return dataMat 24 | 25 | # 函数说明:根据特征切分数据集合 26 | """ 27 | Parameters: 28 | dataSet - 数据集合 29 | feature - 带切分的特征 30 | value - 该特征的值 31 | """ 32 | def binSplitDataSet(dataSet, feature, value): 33 | mat0 = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :] 34 | mat1 = dataSet[np.nonzero(dataSet[:, feature] <= value)[0], :] 35 | return mat0, mat1 36 | 37 | # 函数说明:生成叶结点 38 | """ 39 | Parameters: 40 | dataSet - 数据集合 41 | """ 42 | def regLeaf(dataSet): 43 | return np.mean(dataSet[:, -1]) 44 | 45 | # 函数说明:误差估计函数 46 | """ 47 | Parameters: 48 | dataSet - 数据集合 49 | """ 50 | def regErr(dataSet): 51 | return np.var(dataSet[:, -1]) * np.shape(dataSet)[0] 52 | 53 | # 函数说明:找到数据的最佳二元切分方式函数 54 | """ 55 | Parameters: 56 | dataSet - 数据集合 57 | leafType - 生成叶结点 58 | regErr - 误差估计函数 59 | ops - 用户定义的参数构成的元组 60 | """ 61 | def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)): 62 | import types 63 | # tolS允许的误差下降值,tolN切分的最少样本数 64 | tolS = ops[0]; 65 | tolN = ops[1] 66 | # 如果当前所有值相等,则退出。(根据set的特性) 67 | if len(set(dataSet[:, -1].T.tolist()[0])) == 1: 68 | return None, 
leafType(dataSet) 69 | # 统计数据集合的行m和列n 70 | m, n = np.shape(dataSet) 71 | # 默认最后一个特征为最佳切分特征,计算其误差估计 72 | S = errType(dataSet) 73 | # 分别为最佳误差,最佳特征切分的索引值,最佳特征值 74 | bestS = float('inf'); 75 | bestIndex = 0; 76 | bestValue = 0 77 | # 遍历所有特征列 78 | for featIndex in range(n - 1): 79 | # 遍历所有特征值 80 | for splitVal in set(dataSet[:, featIndex].T.A.tolist()[0]): 81 | # 根据特征和特征值切分数据集 82 | mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal) 83 | # 如果数据少于tolN,则退出 84 | if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): continue 85 | # 计算误差估计 86 | newS = errType(mat0) + errType(mat1) 87 | # 如果误差估计更小,则更新特征索引值和特征值 88 | if newS < bestS: 89 | bestIndex = featIndex 90 | bestValue = splitVal 91 | bestS = newS 92 | # 如果误差减少不大则退出 93 | if (S - bestS) < tolS: 94 | return None, leafType(dataSet) 95 | # 根据最佳的切分特征和特征值切分数据集合 96 | mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue) 97 | # 如果切分出的数据集很小则退出 98 | if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): 99 | return None, leafType(dataSet) 100 | # 返回最佳切分特征和特征值 101 | return bestIndex, bestValue 102 | 103 | # 函数说明:树构建函数 104 | """ 105 | Parameters: 106 | dataSet - 数据集合 107 | leafType - 建立叶结点的函数 108 | errType - 误差计算函数 109 | ops - 包含树构建所有其他参数的元组 110 | """ 111 | def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)): 112 | # 选择最佳切分特征和特征值 113 | feat, val = chooseBestSplit(dataSet, leafType, errType, ops) 114 | # r如果没有特征,则返回特征值 115 | if feat == None: return val 116 | # 回归树 117 | retTree = {} 118 | retTree['spInd'] = feat 119 | retTree['spVal'] = val 120 | # 分成左数据集和右数据集 121 | lSet, rSet = binSplitDataSet(dataSet, feat, val) 122 | # 创建左子树和右子树 123 | retTree['left'] = createTree(lSet, leafType, errType, ops) 124 | retTree['right'] = createTree(rSet, leafType, errType, ops) 125 | return retTree 126 | 127 | def linearSolve(dataSet): #helper function used in two places 128 | m,n = np.shape(dataSet) 129 | X = np.mat(np.ones((m,n))); Y = np.mat(np.ones((m,1)))#create a copy of data with 1 in 0th postion 130 | X[:,1:n] = dataSet[:,0:n-1]; Y = dataSet[:,-1]#and strip out Y 131 | xTx = X.T*X 132 | if np.linalg.det(xTx) == 0.0: 133 | raise NameError('This matrix is singular, cannot do inverse,\n\ 134 | try increasing the second value of ops') 135 | ws = xTx.I * (X.T * Y) 136 | return ws,X,Y 137 | 138 | def modelLeaf(dataSet):#create linear model and return coeficients 139 | ws,X,Y = linearSolve(dataSet) 140 | return ws 141 | 142 | def modelErr(dataSet): 143 | ws,X,Y = linearSolve(dataSet) 144 | yHat = X * ws 145 | return np.sum(np.power(Y - yHat,2)) 146 | 147 | 148 | if __name__ == '__main__': 149 | myDat = loadDataSet('exp2.txt') 150 | myMat = np.mat(myDat) 151 | print(createTree(myMat, modelLeaf, modelErr,(1,10))) 152 | -------------------------------------------------------------------------------- /Ch09-Regression Trees/9.7.1.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:40 4 | # @Author : GXl 5 | # @File : 9.7.1.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import matplotlib.pyplot as plt 10 | import numpy as np 11 | 12 | # 函数说明:加载数据 13 | """ 14 | Parameters: 15 | fileName - 文件名 16 | """ 17 | def loadDataSet(fileName): 18 | 19 | dataMat = [] 20 | fr = open(fileName) 21 | for line in fr.readlines(): 22 | curLine = line.strip().split('\t') 23 | fltLine = list(map(float, curLine)) #转化为float类型 24 | dataMat.append(fltLine) 25 | return dataMat 26 | 27 | # 函数说明:绘制数据集 28 | """ 29 | Parameters: 30 | 
filename - 文件名 31 | """ 32 | def plotDataSet(filename): 33 | dataMat = loadDataSet(filename) #加载数据集 34 | n = len(dataMat) #数据个数 35 | xcord = []; ycord = [] #样本点 36 | for i in range(n): 37 | xcord.append(dataMat[i][0]); ycord.append(dataMat[i][1]) #样本点 38 | fig = plt.figure() 39 | ax = fig.add_subplot(111) #添加subplot 40 | ax.scatter(xcord, ycord, s = 20, c = 'blue',alpha = .5) #绘制样本点 41 | plt.title('DataSet') #绘制title 42 | plt.xlabel('X') 43 | plt.show() 44 | 45 | 46 | if __name__ == '__main__': 47 | filename = 'bikeSpeedVsIq_train.txt' 48 | plotDataSet(filename) 49 | -------------------------------------------------------------------------------- /Ch09-Regression Trees/9.7.2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:40 4 | # @Author : GXl 5 | # @File : 9.7.2.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | import matplotlib.pyplot as plt 10 | import regTrees 11 | import numpy as np 12 | 13 | # 函数说明:加载数据 14 | """ 15 | Parameters: 16 | fileName - 文件名 17 | """ 18 | def loadDataSet(fileName): 19 | dataMat = [] 20 | fr = open(fileName) 21 | for line in fr.readlines(): 22 | curLine = line.strip().split('\t') 23 | fltLine = list(map(float, curLine)) # 转化为float类型 24 | dataMat.append(fltLine) 25 | return dataMat 26 | 27 | # 函数说明:根据特征切分数据集合 28 | """ 29 | Parameters: 30 | dataSet - 数据集合 31 | feature - 带切分的特征 32 | value - 该特征的值 33 | """ 34 | def binSplitDataSet(dataSet, feature, value): 35 | mat0 = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :] 36 | mat1 = dataSet[np.nonzero(dataSet[:, feature] <= value)[0], :] 37 | return mat0, mat1 38 | 39 | # 函数说明:生成叶结点 40 | """ 41 | Parameters: 42 | dataSet - 数据集合 43 | """ 44 | def regLeaf(dataSet): 45 | return np.mean(dataSet[:, -1]) 46 | 47 | # 函数说明:误差估计函数 48 | """ 49 | Parameters: 50 | dataSet - 数据集合 51 | """ 52 | def regErr(dataSet): 53 | return np.var(dataSet[:, -1]) * np.shape(dataSet)[0] 54 | 55 | # 函数说明:找到数据的最佳二元切分方式函数 56 | """ 57 | Parameters: 58 | dataSet - 数据集合 59 | leafType - 生成叶结点 60 | regErr - 误差估计函数 61 | ops - 用户定义的参数构成的元组 62 | """ 63 | def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)): 64 | import types 65 | # tolS允许的误差下降值,tolN切分的最少样本数 66 | tolS = ops[0]; 67 | tolN = ops[1] 68 | # 如果当前所有值相等,则退出。(根据set的特性) 69 | if len(set(dataSet[:, -1].T.tolist()[0])) == 1: 70 | return None, leafType(dataSet) 71 | # 统计数据集合的行m和列n 72 | m, n = np.shape(dataSet) 73 | # 默认最后一个特征为最佳切分特征,计算其误差估计 74 | S = errType(dataSet) 75 | # 分别为最佳误差,最佳特征切分的索引值,最佳特征值 76 | bestS = float('inf'); 77 | bestIndex = 0; 78 | bestValue = 0 79 | # 遍历所有特征列 80 | for featIndex in range(n - 1): 81 | # 遍历所有特征值 82 | for splitVal in set(dataSet[:, featIndex].T.A.tolist()[0]): 83 | # 根据特征和特征值切分数据集 84 | mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal) 85 | # 如果数据少于tolN,则退出 86 | if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): continue 87 | # 计算误差估计 88 | newS = errType(mat0) + errType(mat1) 89 | # 如果误差估计更小,则更新特征索引值和特征值 90 | if newS < bestS: 91 | bestIndex = featIndex 92 | bestValue = splitVal 93 | bestS = newS 94 | # 如果误差减少不大则退出 95 | if (S - bestS) < tolS: 96 | return None, leafType(dataSet) 97 | # 根据最佳的切分特征和特征值切分数据集合 98 | mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue) 99 | # 如果切分出的数据集很小则退出 100 | if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN): 101 | return None, leafType(dataSet) 102 | # 返回最佳切分特征和特征值 103 | return bestIndex, bestValue 104 | 105 | # 函数说明:树构建函数 106 | """ 107 | Parameters: 
108 | dataSet - 数据集合 109 | leafType - 建立叶结点的函数 110 | errType - 误差计算函数 111 | ops - 包含树构建所有其他参数的元组 112 | """ 113 | def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)): 114 | # 选择最佳切分特征和特征值 115 | feat, val = chooseBestSplit(dataSet, leafType, errType, ops) 116 | # r如果没有特征,则返回特征值 117 | if feat == None: return val 118 | # 回归树 119 | retTree = {} 120 | retTree['spInd'] = feat 121 | retTree['spVal'] = val 122 | # 分成左数据集和右数据集 123 | lSet, rSet = binSplitDataSet(dataSet, feat, val) 124 | # 创建左子树和右子树 125 | retTree['left'] = createTree(lSet, leafType, errType, ops) 126 | retTree['right'] = createTree(rSet, leafType, errType, ops) 127 | return retTree 128 | 129 | def linearSolve(dataSet): #helper function used in two places 130 | m,n = np.shape(dataSet) 131 | X = np.mat(np.ones((m,n))); Y = np.mat(np.ones((m,1)))#create a copy of data with 1 in 0th postion 132 | X[:,1:n] = dataSet[:,0:n-1]; Y = dataSet[:,-1]#and strip out Y 133 | xTx = X.T*X 134 | if np.linalg.det(xTx) == 0.0: 135 | raise NameError('This matrix is singular, cannot do inverse,\n\ 136 | try increasing the second value of ops') 137 | ws = xTx.I * (X.T * Y) 138 | return ws,X,Y 139 | 140 | def modelLeaf(dataSet):#create linear model and return coeficients 141 | ws,X,Y = linearSolve(dataSet) 142 | return ws 143 | 144 | def modelErr(dataSet): 145 | ws,X,Y = linearSolve(dataSet) 146 | yHat = X * ws 147 | return np.sum(np.power(Y - yHat,2)) 148 | 149 | def isTree(obj): 150 | return (type(obj).__name__=='dict') 151 | 152 | def regTreeEval(model, inDat): 153 | return float(model) 154 | 155 | def modelTreeEval(model, inDat): 156 | n = np.shape(inDat)[1] 157 | X = np.mat(np.ones((1,n+1))) 158 | X[:,1:n+1]=inDat 159 | return float(X*model) 160 | 161 | def treeForeCast(tree, inData, modelEval=regTreeEval): 162 | if not isTree(tree): return modelEval(tree, inData) 163 | if inData[tree['spInd']] > tree['spVal']: 164 | if isTree(tree['left']): return treeForeCast(tree['left'], inData, modelEval) 165 | else: return modelEval(tree['left'], inData) 166 | else: 167 | if isTree(tree['right']): return treeForeCast(tree['right'], inData, modelEval) 168 | else: return modelEval(tree['right'], inData) 169 | 170 | def createForeCast(tree, testData, modelEval=regTreeEval): 171 | m=len(testData) 172 | yHat = np.mat(np.zeros((m,1))) 173 | for i in range(m): 174 | yHat[i,0] = treeForeCast(tree, np.mat(testData[i]), modelEval) 175 | return yHat 176 | 177 | 178 | if __name__ == '__main__': 179 | trainMat = np.mat(loadDataSet("bikeSpeedVsIq_train.txt")) 180 | testMat = np.mat(loadDataSet("bikeSpeedVsIq_test.txt")) 181 | myTree = createTree(trainMat, ops=(1, 20)) 182 | yHat = createForeCast(myTree, testMat[:,0]) 183 | print(np.corrcoef(yHat, testMat[:,1], rowvar=0)[0, 1]) 184 | -------------------------------------------------------------------------------- /Ch09-Regression Trees/9.8.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # @Time : 2019/10/8 15:40 4 | # @Author : GXl 5 | # @File : 9.8.py 6 | # @Software: win10 Tensorflow1.13.1 python3.5.6 7 | 8 | 9 | from numpy import * 10 | 11 | from tkinter import * 12 | import regTrees 13 | 14 | import matplotlib 15 | 16 | matplotlib.use('TkAgg') 17 | from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg 18 | from matplotlib.figure import Figure 19 | 20 | 21 | # 绘制树 22 | def reDraw(tolS, tolN): 23 | reDraw.f.clf() # clear the figure 24 | reDraw.a = reDraw.f.add_subplot(111) 25 | if chkBtnVar.get(): 26 | if 
tolN < 2: tolN = 2 27 | myTree = regTrees.createTree(reDraw.rawDat, regTrees.modelLeaf, \ 28 | regTrees.modelErr, (tolS, tolN)) 29 | yHat = regTrees.createForeCast(myTree, reDraw.testDat, \ 30 | regTrees.modelTreeEval) 31 | else: 32 | myTree = regTrees.createTree(reDraw.rawDat, ops=(tolS, tolN)) 33 | yHat = regTrees.createForeCast(myTree, reDraw.testDat) 34 | reDraw.a.scatter(array(reDraw.rawDat[:, 0]), array(reDraw.rawDat[:, 1]), s=5) # 离散型散点图 35 | reDraw.a.plot(reDraw.testDat, yHat, linewidth=2.0) # 构建yHat的连续曲线 36 | reDraw.canvas.draw() 37 | 38 | 39 | def getInputs(): 40 | try: 41 | tolN = int(tolNentry.get()) 42 | except: 43 | tolN = 10 44 | print("enter Integer for tolN") 45 | tolNentry.delete(0, END) 46 | tolNentry.insert(0, '10') 47 | try: 48 | tolS = float(tolSentry.get()) 49 | except: 50 | tolS = 1.0 51 | print("enter Float for tolS") 52 | tolSentry.delete(0, END) 53 | tolSentry.insert(0, '1.0') 54 | return tolN, tolS 55 | 56 | 57 | # 理解用户输入并防止程序崩溃 58 | def drawNewTree(): 59 | tolN, tolS = getInputs() # 从输入框中获取值 60 | reDraw(tolS, tolN) # 生成图 61 | 62 | 63 | # Tk类型的根部件 64 | root = Tk() 65 | 66 | # 创造画布 67 | reDraw.f = Figure(figsize=(5, 4), dpi=100) 68 | # 调用Agg,把Agg呈现在画布上 69 | # Agg是一个C++的库,可以从图像创建光栅图 70 | reDraw.canvas = FigureCanvasTkAgg(reDraw.f, master=root) 71 | reDraw.canvas.draw() 72 | reDraw.canvas.get_tk_widget().grid(row=0, columnspan=3) 73 | 74 | Label(root, text="tolN").grid(row=1, column=0) 75 | # 文本输入框1 76 | tolNentry = Entry(root) 77 | tolNentry.grid(row=1, column=1) 78 | tolNentry.insert(0, '10') 79 | Label(root, text="tolS").grid(row=2, column=0) 80 | # 文本输入框2 81 | tolSentry = Entry(root) 82 | tolSentry.grid(row=2, column=1) 83 | tolSentry.insert(0, '1.0') 84 | # 初始化与reDraw()关联的全局变量 85 | Button(root, text="ReDraw", command=drawNewTree).grid(row=1, column=2, rowspan=3) 86 | # 按钮整数值 87 | chkBtnVar = IntVar() 88 | # 复选按钮 89 | chkBtn = Checkbutton(root, text="Model Tree", variable=chkBtnVar) 90 | chkBtn.grid(row=3, column=0, columnspan=2) 91 | 92 | reDraw.rawDat = mat(regTrees.loadDataSet('sine.txt')) 93 | reDraw.testDat = arange(min(reDraw.rawDat[:, 0]), max(reDraw.rawDat[:, 0]), 0.01) 94 | reDraw(1.0, 10) 95 | 96 | root.mainloop() 97 | -------------------------------------------------------------------------------- /Machine Learning in Action.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeFuirnever/Machine-Learning-in-Action/0d77031a24d639a499f64f9b5190db504712269a/Machine Learning in Action.pdf -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ⭐ Machine-Learning-in-Action(更新ing) 2 | 3 | # 🎄 种树最好的时间是10年前,其次是现在!!! 
4 | 5 | - #### 📝📝📝 [欢迎关注我的 CSDN 博客](https://blog.csdn.net/tefuirnever) 6 | - #### 📚📚📚 [机器学习实战数据集](https://github.com/TeFuirnever/Machine-Learning-in-Action/blob/master/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E5%AE%9E%E6%88%98%E6%95%B0%E6%8D%AE%E9%9B%86.zip) 7 | 8 | 目录 9 | --- 10 | 11 | - #### [《机器学习实战》博客 - 总目录](https://blog.csdn.net/TeFuirnever/article/details/99701256) 12 | 13 | - #### [《机器学习实战》代码](https://github.com/TeFuirnever/Machine-Learning-in-Action) 14 | 15 | --- 16 | 17 | - #### [第1章 - 机器学习基础 - 博客](https://blog.csdn.net/TeFuirnever/article/details/99734084) 18 | 19 | --- 20 | 21 | - #### [第2章 - k-近邻算法 - 博客](https://blog.csdn.net/TeFuirnever/article/details/99739021) 22 | 23 | - #### [Ch02-KNN - 代码](https://github.com/TeFuirnever/Machine-Learning-in-Action/tree/master/Ch02-KNN) 24 | 25 | --- 26 | 27 | - #### [第3章 - 决策树 - 博客](https://blog.csdn.net/TeFuirnever/article/details/99955515) 28 | 29 | - #### [Ch03-DecisionTree - 代码](https://github.com/TeFuirnever/Machine-Learning-in-Action/tree/master/Ch03-DecisionTree) 30 | 31 | --- 32 | 33 | - #### [第4章 - 基于概率论的分类方法:朴素贝叶斯 - 博客](https://blog.csdn.net/TeFuirnever/article/details/100108341) 34 | 35 | - #### [Ch04-NaiveBayes - 代码](https://github.com/TeFuirnever/Machine-Learning-in-Action/tree/master/Ch04-NaiveBayes) 36 | 37 | --- 38 | 39 | - #### [第5章 - Logistic 回归 - 博客](https://blog.csdn.net/TeFuirnever/article/details/100159150) 40 | 41 | - #### [Ch05-Logistic - 代码](https://github.com/TeFuirnever/Machine-Learning-in-Action/tree/master/Ch05-Logistic) 42 | 43 | --- 44 | 45 | - #### [第6章 - 支持向量机 - 博客](https://blog.csdn.net/TeFuirnever/article/details/99701322) 46 | 47 | - #### [Ch06-SVM - 代码](https://github.com/TeFuirnever/Machine-Learning-in-Action/tree/master/Ch06-SVM) 48 | 49 | --- 50 | 51 | - #### [第7章 - 利用AdaBoost 元算法提高分类性能 - 博客](https://blog.csdn.net/TeFuirnever/article/details/100191706) 52 | 53 | - #### [Ch07-AdaBoost - 代码](https://github.com/TeFuirnever/Machine-Learning-in-Action/tree/master/Ch07-AdaBoost) 54 | 55 | --- 56 | 57 | - #### [第8章 - 预测数值型数据:回归 - 博客](https://blog.csdn.net/TeFuirnever/article/details/100572055) 58 | 59 | - #### [Ch08-Regression - 代码](https://github.com/TeFuirnever/Machine-Learning-in-Action/tree/master/Ch08-Regression) 60 | 61 | --- 62 | 63 | - #### [第9章 - 树回归 - 博客](https://blog.csdn.net/TeFuirnever/article/details/101294837) 64 | 65 | - #### [Ch09-Regression Trees - 代码](https://github.com/TeFuirnever/Machine-Learning-in-Action/tree/master/Ch09-Regression%20Trees) 66 | 67 | --- 68 | 69 | - #### 第10章 - 利用K-均值聚类算法对未标注数据分组 - 博客 70 | 71 | - #### 72 | 73 | --- 74 | 75 | - #### 第11章 - 使用Apriori 算法进行关联分析 - 博客 76 | 77 | - #### 78 | 79 | --- 80 | 81 | - #### 第12章 - 使用FP-growth 算法来高效发现频繁项集 - 博客 82 | 83 | - #### 84 | 85 | --- 86 | 87 | - #### 第13章 - 利用PCA 来简化数据 - 博客 88 | 89 | - #### 90 | 91 | --- 92 | 93 | - #### 第14章 - 利用SVD 简化数据 - 博客 94 | 95 | - #### 96 | 97 | --- 98 | 99 | - #### 第15章 - 大数据与MapReduce - 博客 100 | 101 | - #### 102 | 103 | # 📢 全部本人所写✏,仅供参考📜 104 | # 📢 建议使用博客和代码一起完成学习📒 105 | # 📢 鉴于水平有限,如有问题,可以博客留言🌈 106 | -------------------------------------------------------------------------------- /机器学习实战.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeFuirnever/Machine-Learning-in-Action/0d77031a24d639a499f64f9b5190db504712269a/机器学习实战.pdf -------------------------------------------------------------------------------- /机器学习实战总目录.md: -------------------------------------------------------------------------------- 1 | # 
欢迎关注我的[[CSDN博客](https://blog.csdn.net/tefuirnever)] 2 | 3 | ## 目录 4 | 5 | - 好好看书,好好写博客,好好码代码,好好搞深度学习,好好搞机器学习,希望能坚持下去 :) 6 | 7 | ### 《机器学习实战》读书笔记及代码 - 总目录 8 | 9 | - [https://blog.csdn.net/TeFuirnever/article/details/99701256](https://blog.csdn.net/TeFuirnever/article/details/99701256) 10 | 11 | ### 第1章 - 机器学习基础 12 | 13 | - 读书笔记 [https://blog.csdn.net/TeFuirnever/article/details/99734084](https://blog.csdn.net/TeFuirnever/article/details/99734084) 14 | 15 | ### 第2章 - k-近邻算法 16 | 17 | - 读书笔记 [https://blog.csdn.net/TeFuirnever/article/details/99739021](https://blog.csdn.net/TeFuirnever/article/details/99739021) 18 | 19 | ### 第3章 - 决策树 20 | 21 | - 读书笔记 [https://blog.csdn.net/TeFuirnever/article/details/99955515](https://blog.csdn.net/TeFuirnever/article/details/99955515) 22 | 23 | ### 第4章 - 基于概率论的分类方法:朴素贝叶斯 24 | 25 | - 读书笔记 [https://blog.csdn.net/TeFuirnever/article/details/100108341](https://blog.csdn.net/TeFuirnever/article/details/100108341) 26 | 27 | ### 第5章 - Logistic 回归 28 | 29 | - 读书笔记 [https://blog.csdn.net/TeFuirnever/article/details/100159150](https://blog.csdn.net/TeFuirnever/article/details/100159150) 30 | 31 | ### 第6章 - 支持向量机 32 | 33 | - 读书笔记 [https://blog.csdn.net/TeFuirnever/article/details/99701322](https://blog.csdn.net/TeFuirnever/article/details/99701322) 34 | 35 | ### 第7章 - 利用AdaBoost 元算法提高分类性能 36 | 37 | - 读书笔记 [https://blog.csdn.net/TeFuirnever/article/details/100191706](https://blog.csdn.net/TeFuirnever/article/details/100191706) 38 | 39 | ### 第8章 - 预测数值型数据:回归 40 | 41 | - 读书笔记 [https://blog.csdn.net/TeFuirnever/article/details/100572055](https://blog.csdn.net/TeFuirnever/article/details/100572055) 42 | 43 | ### 第9章 - 树回归 44 | 45 | - 读书笔记 [https://blog.csdn.net/TeFuirnever/article/details/101294837](https://blog.csdn.net/TeFuirnever/article/details/101294837) 46 | 47 | ### 第10章 - 利用K-均值聚类算法对未标注数据分组 48 | 49 | - 读书笔记 50 | 51 | ### 第11章 - 使用Apriori 算法进行关联分析 52 | 53 | - 读书笔记 54 | 55 | ### 第12章 - 使用FP-growth 算法来高效发现频繁项集 56 | 57 | - 读书笔记 58 | 59 | ### 第13章 - 利用PCA 来简化数据 60 | 61 | - 读书笔记 62 | 63 | ### 第14章 - 利用SVD 简化数据 64 | 65 | - 读书笔记 66 | 67 | ### 第15章 - 大数据与MapReduce 68 | 69 | - 读书笔记 70 | 71 | ### 参考文章 72 | - 《机器学习实战》 73 | -------------------------------------------------------------------------------- /机器学习实战数据集.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TeFuirnever/Machine-Learning-in-Action/0d77031a24d639a499f64f9b5190db504712269a/机器学习实战数据集.zip --------------------------------------------------------------------------------