├── README.md
├── 人才价格计算器.ipynb
├── 造个轮子-决策树(ID3).ipynb
├── 造个轮子-Naive Bayes.ipynb
├── 造个轮子-KNN.ipynb
└── AdaBoost.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # Machine_Learning_Algorithm_Wheel
2 | **机器学习算法纯手工轮子,主要目的在于深入理解算法原理和参数,包含:**
3 | ### 1. 分类
4 | - KNN
5 | - Naive Bayes
6 | - Decision Tree
7 | - Logistic Regression
8 | - SVM
9 | - AdaBoost
10 | ### 2. 回归
11 | - Linear Regression/Ridge Regression/Lasso Regression
12 | - Regression Tree:回归树/模型树
13 | ### 3. 聚类
14 | - KMeans/BiKMeans
15 | ### 4. 关联分析
16 | - Apriori:频繁项集、关联规则
17 | - FP-Growth:频繁项集
18 | ### 5. 降维
19 | - PCA
20 | - SVD
21 |
--------------------------------------------------------------------------------
/人才价格计算器.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### 人才价格计算器\n",
8 | " - 输入:求职者——求职职位、工作年限、学历;招聘方——公司规模\n",
9 | " - 输出:在指定公司规模、求职职位和经验、学历的条件下,市场水平的月薪均值\n",
10 | " \n",
11 | " - 原理:KNN分类器改进版(改进了分类器输出只能是离散类别的问题)——认为离输入x最近的k个点对应的月薪均值的均值是x的输出\n",
12 | " - k默认为训练样本数/100+1\n",
13 | " - 距离度量为欧氏距离\n",
14 | " - 决策规则:加权距离最近的k个点的y的均值\n",
15 | " - 加权距离:根据不同岗位下,学历、经验、公司规模与薪水的相关系数,计算输入x与训练集x的加权距离\n",
16 | " - y的均值:为了改进输出为离散值的问题,认为离输入x最近的k个点对应的月薪均值的均值是x的输出\n",
17 | " \n",
18 | " - 不足:\n",
19 | " - 输入的工作年限为个体值,但用于计算距离的工作年限为范围均值,会导致在范围交界点的个体值错误归类\n",
20 | " - 改进:将输入的个体值映射到各个工作年限要求范围均值上去(已改进)\n",
21 | " - 没有将职位描述这个信息量丰富的长文本纳入建模\n",
22 | " - 改进:可加入职位描述关键词与求职者能力关键词匹配程度,影响月薪水平\n",
23 | " - 训练样本太少,总共只有2398条数据,分岗位来看就更少了,因为样本量不足,会导致预测结果不准确\n",
24 | " - 改进:从boss、猎聘等网站拓展样本数量"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 9,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import pandas as pd\n",
34 | "import numpy as np\n",
35 | "import operator"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 13,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "# 人才价格计算器\n",
45 | "\n",
46 | "def KNNClassify(newinput, dataset, labels, data,position,k=None): \n",
47 | " # 若未设定k值,则默认为10\n",
48 | " if k==None:\n",
49 | " k=10\n",
50 | "\n",
51 | " # 由于每个职位各因素与薪水相关性不同,因此引入不同职位的相关系数,放入距离加权计算中\n",
52 | " # 计算不同岗位类型下,公司规模、经验、学历与薪水的相关系数,用于后面计算k个最近点的加权距离\n",
53 | " corr=data[data.classified_zw==position][['salary_mean','companysize_mean','workyear_mean','degree_num']].corr()\n",
54 | " corr=np.array(corr)\n",
55 | " \n",
56 | " #salary_companysize=corr[1][0]\n",
57 | " #salary_workyear=corr[2][0]\n",
58 | " #salary_degree=corr[3][0]\n",
59 | " weight=np.array([corr[1][0],corr[2][0],corr[3][0]])\n",
60 | " \n",
61 | " numSamples=dataset.shape[0]\n",
62 | " \"\"\"step1: 计算待分类数据与训练集各数据点的距离(欧氏距离:距离差值平方和开根号)\"\"\"\n",
63 | " diff=np.tile(newinput,(numSamples,1)) - dataset # 凸显numpy数组的高效性——元素级的运算\n",
64 | " squaredist=(diff**2)/weight # 相关系数越大,因素越重要,表现为加权距离小,所以要除以相关系数\n",
65 | " distance = (squaredist.sum(axis=1))**0.5 # axis=1,按行累加\n",
66 | " \n",
67 | " \"\"\"step2:将距离按升序排序,并取距离最近的k个近邻点\"\"\"\n",
68 | " # 对数组distance按升序排序,返回数组排序后的值对应的索引值\n",
69 | " sortedDistance=distance.argsort() \n",
70 | " \n",
71 | " # 定义一个空字典,存放k个近邻点的分类计数\n",
72 | " classCount={}\n",
73 | " \n",
74 | " # 对k个近邻点分类计数,多数表决法\n",
75 | " for i in range(k):\n",
76 | " # 第i个近邻点在distance数组中的索引,对应的分类\n",
77 | " votelabel=labels[sortedDistance[i]]\n",
78 | " \n",
79 | " # votelabel作为字典的key,对相同的key值累加(多数表决法)\n",
80 | " classCount[votelabel]=classCount.get(votelabel,0)+1 \n",
81 | " \n",
82 | " # 求k个近邻点的y值的均值,作为新输入x的预测输出y\n",
83 | " predict=0\n",
84 | " point_k=0\n",
85 | " for key,value in classCount.items():\n",
86 | " predict+=key*value\n",
87 | " point_k+=value\n",
88 | " predict=predict/point_k\n",
89 | " \n",
90 | " return position,predict\n",
91 | "\n",
92 | "\n",
93 | "# data:清洗后的数据集,如拉勾成都地区招聘岗位的数据,lagou_origin。这里对于data要求比较死,如果更换数据,需要维护代码\n",
94 | "# position:求职职位:数据分析师,算法工程师,java工程师、等\n",
95 | "# newinput=[\"companysize_mean\",\"workyear_mean\",\"degree_num\"]\n",
96 | "def talent_calculator(data,k=None):\n",
97 | "\n",
98 | " # 交互,用户输入职位、公司规模、经验、学历\n",
99 | " print (\"可选职位:java工程师、前端、数据分析师、算法工程师\")\n",
100 | " print (\"产品运营、测试工程师、产品经理、数据挖掘、建模工程师、爬虫工程师、产业研究员职位也可计算,但由于数据量较小,结果准确性差\")\n",
101 | " position=str(input(\"请输入职位:\"))\n",
102 | " companysize=float(input(\"请输入公司规模(人):\"))\n",
103 | " workyear=float(input(\"请输入求职者工作年限(年):\"))\n",
104 | " degree=float(input(\"请输入求职者学历(0-不限,1-大专,2-本科,3-硕士,4-博士):\"))\n",
105 | " \n",
106 | " # 将输入的个体值映射到各个工作年限要求范围均值上去(避免在范围交界点的工作年限归类错误)\n",
107 | " # 若未输入工作年限,则默认为0年\n",
108 | " if workyear==None:\n",
109 | " workyear=0\n",
110 | " elif workyear>0 and workyear<1:\n",
111 | " workyear=0.5\n",
112 | " elif workyear>=1 and workyear<3:\n",
113 | " workyear=2\n",
114 | " elif workyear>=3 and workyear<5:\n",
115 | " workyear=4\n",
116 | " elif workyear>=5 and workyear<10:\n",
117 | " workyear=7.5\n",
118 | " elif workyear>=10:\n",
119 | " workyear=10\n",
120 | " else:\n",
121 | " workyear=workyear\n",
122 | " \n",
123 | " newinput=[companysize,workyear,degree]\n",
124 | " \n",
125 | " # 建立训练集\n",
126 | " data_x=data[data.classified_zw==position].loc[:,[\"companysize_mean\",\"workyear_mean\",\"degree_num\"]]\n",
127 | " data_y=data[data.classified_zw==position].loc[:,[\"salary_mean\"]]\n",
128 | "\n",
129 | " np.random.seed(7)\n",
130 | " indices=np.random.permutation(len(data_x))\n",
131 | "\n",
132 | " data_x_train=np.array(data_x.iloc[indices])\n",
133 | " data_y_train=np.array(data_y.iloc[indices])\n",
134 | "\n",
135 | " data_y_train.shape=(len(data_x),)\n",
136 | " \n",
137 | " dataset=data_x_train\n",
138 | " labels=data_y_train\n",
139 | " \n",
140 | " # 若未设定k值,则默认为输入岗位的职位数量/100+1\n",
141 | " if k==None:\n",
142 | " k=int(len(data_x)/100)+1\n",
143 | " \n",
144 | " # KNN分类器(改进版)\n",
145 | " # 由于每个职位各因素与薪水相关性不同,因此引入不同职位的相关系数,放入距离加权计算中\n",
146 | " # 计算不同岗位类型下,公司规模、经验、学历与薪水的相关系数,用于后面计算k个最近点的加权距离\n",
147 | " corr=data[data.classified_zw==position][['salary_mean','companysize_mean','workyear_mean','degree_num']].corr()\n",
148 | " corr=np.array(corr)\n",
149 | " \n",
150 | " #salary_companysize=corr[1][0]\n",
151 | " #salary_workyear=corr[2][0]\n",
152 | " #salary_degree=corr[3][0]\n",
153 | " weight=np.array([corr[1][0],corr[2][0],corr[3][0]])\n",
154 | " \n",
155 | " numSamples=dataset.shape[0]\n",
156 | " \"\"\"step1: 计算待分类数据与训练集各数据点的距离(欧氏距离:距离差值平方和开根号)\"\"\"\n",
157 | " diff=np.tile(newinput,(numSamples,1)) - dataset # 凸显numpy数组的高效性——元素级的运算\n",
158 | " squaredist=(diff**2)/weight # 相关系数越大,因素越重要,表现为加权距离小,所以要除以相关系数\n",
159 | " distance = (squaredist.sum(axis=1))**0.5 # axis=1,按行累加\n",
160 | " \n",
161 | " \"\"\"step2:将距离按升序排序,并取距离最近的k个近邻点\"\"\"\n",
162 | " # 对数组distance按升序排序,返回数组排序后的值对应的索引值\n",
163 | " sortedDistance=distance.argsort() \n",
164 | " \n",
165 | " # 定义一个空字典,存放k个近邻点的分类计数\n",
166 | " classCount={}\n",
167 | " \n",
168 | " # 对k个近邻点分类计数,多数表决法\n",
169 | " for i in range(k):\n",
170 | " # 第i个近邻点在distance数组中的索引,对应的分类\n",
171 | " votelabel=labels[sortedDistance[i]]\n",
172 | " \n",
173 | " # votelabel作为字典的key,对相同的key值累加(多数表决法)\n",
174 | " classCount[votelabel]=classCount.get(votelabel,0)+1 \n",
175 | " \n",
176 | " # 求k个近邻点的y值的均值,作为新输入x的预测输出y\n",
177 | " predict=0\n",
178 | " point_k=0\n",
179 | " for key,value in classCount.items():\n",
180 | " predict+=key*value\n",
181 | " point_k+=value\n",
182 | " predict=round(predict/point_k,2)\n",
183 | " \n",
184 | " print (\"\\n求职岗位为:\",position,\"\\n人才价格为\",predict,\"K\")\n",
185 | " \n",
186 | " return position,predict"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {},
192 | "source": [
193 | "#### 使用人才价格计算器"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 18,
199 | "metadata": {},
200 | "outputs": [
201 | {
202 | "name": "stdout",
203 | "output_type": "stream",
204 | "text": [
205 | "可选职位:java工程师、前端、数据分析师、算法工程师\n",
206 | "产品运营、测试工程师、产品经理、数据挖掘、建模工程师、爬虫工程师、产业研究员职位也可计算,但由于数据量较小,结果准确性差\n",
207 | "请输入职位:数据分析师\n",
208 | "请输入公司规模(人):60\n",
209 | "请输入求职者工作年限(年):5\n",
210 | "请输入求职者学历(0-不限,1-大专,2-本科,3-硕士,4-博士):2\n",
211 | "\n",
212 | "求职岗位为: 数据分析师 \n",
213 | "人才价格为 22.5 K\n"
214 | ]
215 | },
216 | {
217 | "data": {
218 | "text/plain": [
219 | "('数据分析师', 22.5)"
220 | ]
221 | },
222 | "execution_count": 18,
223 | "metadata": {},
224 | "output_type": "execute_result"
225 | }
226 | ],
227 | "source": [
228 | "# 载入原始数据集\n",
229 | "lagou_orgin=pd.read_csv(r\"E:\\python\\data\\lagou\\lagou2018_chuli.csv\",encoding=\"utf-8\",delimiter=\"\\t\")\n",
230 | "# 填入原始数据集后,调用人才价格计算器\n",
231 | "talent_calculator(lagou_orgin)"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": null,
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "\n"
241 | ]
242 | }
243 | ],
244 | "metadata": {
245 | "kernelspec": {
246 | "display_name": "Python 3",
247 | "language": "python",
248 | "name": "python3"
249 | },
250 | "language_info": {
251 | "codemirror_mode": {
252 | "name": "ipython",
253 | "version": 3
254 | },
255 | "file_extension": ".py",
256 | "mimetype": "text/x-python",
257 | "name": "python",
258 | "nbconvert_exporter": "python",
259 | "pygments_lexer": "ipython3",
260 | "version": "3.6.4"
261 | }
262 | },
263 | "nbformat": 4,
264 | "nbformat_minor": 2
265 | }
266 |
--------------------------------------------------------------------------------
/造个轮子-决策树(ID3).ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### 实现思路(ID3)\n",
8 | " 1. 自定义信息熵计算函数,用于计算数据集的信息熵\n",
9 | " 2. 自定义数据划分函数,用于根据指定特征的指定取值,划分数据集\n",
10 | " 3. step2的子数据集作为输入给step1的函数,可以计算出按某指定特征的某指定取值(A=ai)划分的数据集的信息熵H(Di),同时计算按某指定特征的某指定取值(A=ai)划分的数据集的样本概率|Di|/|D|\n",
11 | " 4. 遍历该特征各个取值,计算各取值下划分的数据集的信息熵H(Di)和样本概率|Di|/|D|,相乘,再求和得到特征A对数据集D的经验条件熵H(D|A)\n",
12 | " 5. 计算特征A对数据集的信息增益g(D,A)=H(D)-H(D|A)\n",
13 | " 6. 以此类推,计算各特征对数据集的信息增益,取信息增益最大的特征为最佳划分特征,得到树T1\n",
14 | " 7. 对T1各结点继续step3-6,选择信息增益最大的特征,继续划分数据,得到新的决策树\n",
15 | " 8. 直到信息增益小于阈值,或无特征可划分,或每个分支下的所有实例都具有相同的分类,决策树完成\n",
16 | "- **注意,ID3一直在分支,容易过拟合,因此需要对决策树剪枝,提高对测试集数据预测的性能**"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 1,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "import pandas as pd\n",
26 | "import numpy as np\n",
27 | "import operator\n",
28 | "from math import log\n",
29 | "\n",
30 | "\n",
31 | "\"\"\"\n",
32 | "信息熵计算函数,用于计算数据集的信息熵\n",
33 | "输入:数据集,每一行是一条数据,最后一列是各条数据集的类别\n",
34 | "输出:该数据集的信息熵\n",
35 | "思路:\n",
36 | "建立一个字典,对数据集各数据的类别计数,\n",
37 | "从而计算各类别出现频率(作为概率pi),\n",
38 | "最后调用信息熵公式计算 H(D)=-求和(pi*logpi)\n",
39 | "\"\"\"\n",
40 | "def calEntropy(dataset):\n",
41 | " n=len(dataset)\n",
42 | " labelCounts={}\n",
43 | " \n",
44 | " #对数据集各数据的类别计数\n",
45 | " for data in dataset:\n",
46 | " datalabel=data[-1] #取data最后一列,类别列\n",
47 | " if datalabel not in labelCounts.keys():\n",
48 | " labelCounts[datalabel]=0\n",
49 | " labelCounts[datalabel]+=1\n",
50 | " \n",
51 | " entropy=0.0\n",
52 | " \n",
53 | " #计算各类别出现频率(作为概率pi),调用信息熵公式计算 H(D)=-求和(pi*logpi)\n",
54 | " for key in labelCounts.keys():\n",
55 | " prob=float(labelCounts[key])/n\n",
56 | " entropy -= prob*log(prob,2)\n",
57 | " return entropy\n",
58 | "\n",
59 | "\n",
60 | "\"\"\"\n",
61 | "数据划分函数,用于根据指定特征的指定取值,划分数据集\n",
62 | "输入:数据集、特征所在列索引、特征取值\n",
63 | "输出:满足指定特征等于指定取值的数据子集\n",
64 | "\"\"\"\n",
65 | "def splitDataset(dataset,index,value):\n",
66 | " subDataset=[]\n",
67 | " for data in dataset:\n",
68 | " if data[index]==value:\n",
69 | " #抽取除了data[index]的内容(一个特征用于计算其对数据集的经验条件熵时,不需要此特征在子数据集中)\n",
70 | " splitData=data[:index] #取索引之前的元素\n",
71 | " splitData.extend(data[index+1:]) #再合并索引之后的元素\n",
72 | " subDataset.append(splitData)\n",
73 | " return subDataset\n",
74 | "\n",
75 | "\n",
76 | "\"\"\"\n",
77 | "选择信息增益最大的特征作为数据集划分特征\n",
78 | "输入:数据集\n",
79 | "输出:该数据集的最佳划分特征\n",
80 | "\"\"\"\n",
81 | "def chooseFeature(dataset):\n",
82 | " #初始化\n",
83 | " numFeature=len(dataset[0])-1 #因为最后一列是类别\n",
84 | " baseEntropy=calEntropy(dataset) #H(D)\n",
85 | " bestInfoGain=0.0\n",
86 | " bestFeatureIndex=-1\n",
87 | " \n",
88 | " #创建特征A各取值a的列表\n",
89 | " for i in range(numFeature):\n",
90 | " featureList=[data[i] for data in dataset]\n",
91 | " uniqueValue=set(featureList)\n",
92 | " empEntropy=0.0 #初始化特征A对数据集D的经验条件熵H(D|A)\n",
93 | " \n",
94 | " #计算特征A各取值a的信息熵H(Di)和样本概率|Di|/|D|,并相乘\n",
95 | " for value in uniqueValue:\n",
96 | " subDataset=splitDataset(dataset,i,value) #(列索引为i的特征)特征A取value值所划分的子数据集\n",
97 | " prob=len(subDataset)/float(len(dataset)) #计算|Di|/|D|\n",
98 | " empEntropy += prob*calEntropy(subDataset) #H(D|A)\n",
99 | " \n",
100 | " #取信息增益最大的特征为最佳划分特征\n",
101 | " infoGain=baseEntropy-empEntropy #信息增益\n",
102 | " if infoGain>bestInfoGain:\n",
103 | " bestInfoGain=infoGain\n",
104 | " bestFeatureIndex=i\n",
105 | " return bestFeatureIndex\n",
106 | "\n",
107 | "\n",
108 | "\"\"\"\n",
109 | "对数据集各数据类别进行计数排序\n",
110 | "\"\"\"\n",
111 | "def majorClass(classList):\n",
112 | " classCount={}\n",
113 | " for vote in classList:\n",
114 | " if vote not in classCount.keys():\n",
115 | " classCount[vote]=0\n",
116 | " classCount[vote]+=1\n",
117 | " \n",
118 | " #对classCount按value降序排序\n",
119 | " sortedClassCount=sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)\n",
120 | " return sortedClassCount[0][0] #返回类别最大的类别名\n",
121 | "\n",
122 | "\n",
123 | "\n",
124 | "\"\"\"\n",
125 | "主函数:递归构建决策树\n",
126 | "输入:数据集(list类型),数据集特征列表(按在数据集的位置排序)(list类型)\n",
127 | "输出:该数据集的决策树\n",
128 | "思路:【递归】\n",
129 | " 1. 若数据集属于同一类,则返回该类别,划分停止\n",
130 | " 2. 若数据集所有特征已经遍历,返回当前计数最多的类别为该结点类别,划分停止\n",
131 | " 3. 否则继续分支,调用chooseFeature()函数,选择当前数据集最优特征\n",
132 | " 4. 遍历当前最优特征各属性值,划分数据集,并递归调用自身createTree()构建子数据集的决策树\n",
133 | " 5. 完成\n",
134 | "\"\"\"\n",
135 | "def createTree(dataset,featureLabels):\n",
136 | " classList=[data[-1] for data in dataset] #取数据集各数据类别\n",
137 | " \n",
138 | " #若数据集属于同一类,则返回该类别,划分停止\n",
139 | " if classList.count(classList[0])==len(classList):\n",
140 | " return classList[0]\n",
141 | " \n",
142 | " #若数据集所有特征已经遍历,返回当前计数最多的类别为该结点类别,划分停止\n",
143 | " if len(dataset[0])==1:\n",
144 | " return majorClass(classList)\n",
145 | " \n",
146 | " #否则继续分支,调用chooseFeature()函数,选择当前数据集最优特征\n",
147 | " bestFeatureIndex=chooseFeature(dataset)\n",
148 | " bestFeature=featureLabels[bestFeatureIndex]\n",
149 | " \n",
150 | " #用于存储决策树,字典结构存储树的所有信息,并可体现包含关系\n",
151 | " desitionTree={bestFeature:{}} \n",
152 | " del(featureLabels[bestFeatureIndex]) #删除已被用于划分数据的特征\n",
153 | " \n",
154 | " #得到当前最优划分特征的各属性值\n",
155 | " featureValues=[data[bestFeatureIndex] for data in dataset]\n",
156 | " uniqueValues=set(featureValues)\n",
157 | " \n",
158 | " #遍历当前最优特征各属性值,划分数据集,并递归调用自身createTree()构建子数据集的决策树\n",
159 | " for value in uniqueValues:\n",
160 | " #得到已删除当前最优划分特征的特征列表,用于递归调用\n",
161 | " subFeatureLabels=featureLabels[:] \n",
162 | " \n",
163 | " #用当前最优划分特征的指定值分割子数据集,用于递归调用\n",
164 | " subData=splitDataset(dataset,bestFeatureIndex,value) \n",
165 | " desitionTree[bestFeature][value]=createTree(subData,subFeatureLabels)\n",
166 | " return desitionTree"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "#### 测试\n",
174 | "- 西瓜分类数据集测试"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 2,
180 | "metadata": {},
181 | "outputs": [
182 | {
183 | "data": {
184 | "text/plain": [
185 | "(17, 7)"
186 | ]
187 | },
188 | "execution_count": 2,
189 | "metadata": {},
190 | "output_type": "execute_result"
191 | }
192 | ],
193 | "source": [
194 | "watermalon=pd.read_csv(r\"D:\\python\\data\\watermalon.txt\",sep=\"\\t\")\n",
195 | "watermalon.shape"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 5,
201 | "metadata": {},
202 | "outputs": [
203 | {
204 | "data": {
205 | "text/plain": [
206 | "[['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'],\n",
207 | " ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '是'],\n",
208 | " ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'],\n",
209 | " ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '是'],\n",
210 | " ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'],\n",
211 | " ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '是'],\n",
212 | " ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '是'],\n",
213 | " ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '是'],\n",
214 | " ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '否'],\n",
215 | " ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '否'],\n",
216 | " ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '否'],\n",
217 | " ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '否'],\n",
218 | " ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '否'],\n",
219 | " ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '否'],\n",
220 | " ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '否'],\n",
221 | " ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '否'],\n",
222 | " ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '否']]"
223 | ]
224 | },
225 | "execution_count": 5,
226 | "metadata": {},
227 | "output_type": "execute_result"
228 | }
229 | ],
230 | "source": [
231 | "watermalon_list=np.array(watermalon).tolist() #构建数据集\n",
232 | "watermalon_list"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 6,
238 | "metadata": {},
239 | "outputs": [
240 | {
241 | "data": {
242 | "text/plain": [
243 | "{'纹理': {'模糊': '否',\n",
244 | " '清晰': {'根蒂': {'硬挺': '否',\n",
245 | " '稍蜷': {'色泽': {'乌黑': {'触感': {'硬滑': '是', '软粘': '否'}}, '青绿': '是'}},\n",
246 | " '蜷缩': '是'}},\n",
247 | " '稍糊': {'触感': {'硬滑': '否', '软粘': '是'}}}}"
248 | ]
249 | },
250 | "execution_count": 6,
251 | "metadata": {},
252 | "output_type": "execute_result"
253 | }
254 | ],
255 | "source": [
256 | "features=watermalon.columns.tolist()[0:-1] #提取特征列表\n",
257 | "my_tree=createTree(watermalon_list,features)\n",
258 | "my_tree"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": []
267 | }
268 | ],
269 | "metadata": {
270 | "kernelspec": {
271 | "display_name": "Python 3",
272 | "language": "python",
273 | "name": "python3"
274 | },
275 | "language_info": {
276 | "codemirror_mode": {
277 | "name": "ipython",
278 | "version": 3
279 | },
280 | "file_extension": ".py",
281 | "mimetype": "text/x-python",
282 | "name": "python",
283 | "nbconvert_exporter": "python",
284 | "pygments_lexer": "ipython3",
285 | "version": "3.6.4"
286 | }
287 | },
288 | "nbformat": 4,
289 | "nbformat_minor": 2
290 | }
291 |
--------------------------------------------------------------------------------
/造个轮子-Naive Bayes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### 自定义NB模型\n",
8 | "#### 输入\n",
9 | "- X_train,Y_train,X_test,Y_test\n",
10 | " - 其中,X为文本特征向量化处理后(CountVectorizer)的array数组,Y为X对应的分类标签list,X的行与Y一一对应\n",
11 | " \n",
12 | "#### 输出\n",
13 | "- X_test各样本的预测分类结果Y_predict,以及分类准确率\n",
14 | "\n",
15 | "#### 过程\n",
16 | "- 利用训练集各特征词出现的频率和对应标签概率,训练NB模型各概率参数\n",
17 | "- 求测试集各特征在训练集对应的条件概率P(X|Y=ck)\n",
18 | "- 将测试集各特征在训练集对应的条件概率连乘,再乘以先验概率P(Y=ck),得到测试集各样本后验概率(未归一化),取后验概率最大的标签类别为该测试样本类别\n",
19 | "\n",
20 | "#### 1. 利用训练集,训练概率参数(拉普拉斯平滑)[类似mnb.fit()]\n",
21 | "- 先验概率:P(Y=ck)\n",
22 | "- 条件概率:P(X1=0|Y=ck),P(X1=1|Y=ck),P(X2=0|Y=ck)……\n",
23 | "\n",
24 | "#### 2. 将测试集各特征向量值带入训练的概率参数中,计算后验概率,取使后验概率最大的Y=ck为测试样本的分类[类似mnb.predict(), mnb.predict_proba()]\n",
25 | "- 测试集样本特征向量为0时,不将刚才训练的对应概率参数纳入计算\n",
26 | "- 测试集样本特征向量>=1时(即测试样本出现该特征向量的词),将刚才训练的特征向量对应的概率参数纳入计算\n",
27 | "- 分别计算垃圾邮件下和正常邮件下每个样本的后验概率,取后验概率最大的类别为样本分类\n",
28 | "\n",
29 | "#### 3. 计算分类准确率 [类似mnb.score()]"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 30,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "\"\"\"\n",
39 | "输入:X_train,Y_train,X_test,Y_test\n",
40 | " 其中,X为文本特征向量化处理后(CountVectorizer)的array,Y为X对应的分类标签list,XY一一对应\n",
41 | "输出:X_test各样本的预测分类结果Y_predict,分类准确率\n",
42 | " 其中,0-正常邮件,1-垃圾邮件\n",
43 | "\"\"\"\n",
44 | "def wheel_nb(X_train,Y_train,X_test,Y_test):\n",
45 | " import pandas as pd\n",
46 | " import numpy as np\n",
47 | " import re\n",
48 | " \n",
49 | " #先将训练集的内容和标签合为一个dataframe\n",
50 | " d={\"content\":X_train.tolist(),\"label\":Y_train}\n",
51 | " emails_train=pd.DataFrame(data=d)\n",
52 | "\n",
53 | " #将正常邮件(Y=0)和垃圾邮件(Y=1)分为两个子集\n",
54 | " normal=emails_train[emails_train.label==0]\n",
55 | " normal.reset_index(inplace=True,drop=True) #重置normal索引,作用于原表,丢弃之前的索引\n",
56 | " spam=emails_train[emails_train.label==1]\n",
57 | " spam.reset_index(inplace=True,drop=True) #重置spam索引,作用于原表,丢弃之前的索引\n",
58 | "\n",
59 | " \"\"\"计算Y_train=0、1的条件概率(拉普拉斯平滑)\"\"\"\n",
60 | " Py0=(len(normal)+1)/(len(emails_train)+2)\n",
61 | " Py1=(len(spam)+1)/(len(emails_train)+2)\n",
62 | "\n",
63 | " \"\"\"计算X_train各特征向量取各特征值时的先验概率(拉普拉斯平滑)\"\"\"\n",
64 | " \"\"\"计算垃圾邮件中,各特征向量的先验概率\"\"\"\n",
65 | " vd=len(spam.content[0]) #特征向量的维度\n",
66 | " spam_count_dict={} #用于保存content特征向量按列累加的结果\n",
67 | " spam_count_prob={} #用于保存垃圾邮件中各特征向量出现的概率\n",
68 | "\n",
69 | " #求content各特征向量按列累加的结果,用于计算各向量在训练集中出现的概率\n",
70 | " for i in range(len(spam)):\n",
71 | " for j in range(vd):\n",
72 | " spam_count_dict[j]=spam_count_dict.get(j,0)+spam.content[i][j] #计算垃圾邮件中各特征向量出现的次数,即,求content各特征向量count按列累加的结果\n",
73 | "\n",
74 | " for j in range(vd):\n",
75 | " spam_count_prob[j]=(spam_count_dict.get(j,0)+1)/(len(spam)+2)#计算垃圾邮件中各特征向量出现的概率(拉普拉斯平滑)\n",
76 | "\n",
77 | " \"\"\"计算正常邮件中,各特征向量的先验概率\"\"\"\n",
78 | " normal_count_dict={} #用于保存content特征向量按列累加的结果\n",
79 | " normal_count_prob={} #用于保存正常邮件中各特征向量出现的概率\n",
80 | "\n",
81 | " #求content各特征向量按列累加的结果,用于计算各向量在训练集中出现的概率\n",
82 | " for i in range(len(normal)):\n",
83 | " for j in range(vd):\n",
84 | " normal_count_dict[j]=normal_count_dict.get(j,0)+normal.content[i][j] #计算垃圾邮件中各特征向量出现的次数,即,求content各特征向量count按列累加的结果\n",
85 | "\n",
86 | " for j in range(vd):\n",
87 | " normal_count_prob[j]=(normal_count_dict.get(j,0)+1)/(len(normal)+2)#计算垃圾邮件中各特征向量出现的概率(拉普拉斯平滑)\n",
88 | "\n",
89 | " \"\"\"计算各测试样本的后验概率\"\"\"\n",
90 | " test_classify={} #用于保存测试集各样本的后验概率 P(Y|X)=P(Y)*P(X|Y)/P(X)\n",
91 | " Px_spam={} #用于保存测试集各样本在垃圾邮件下的先验概率 P(X|Y)\n",
92 | " Px_normal={} #用于保存测试集各样本在正常邮件下的先验概率 P(X|Y)\n",
93 | "\n",
94 | " for i in range(X_test.shape[0]):\n",
95 | " for j in range(X_test.shape[1]):\n",
96 | " if X_test[i][j]!=0:\n",
97 | " Px_spam[i]=Px_spam.get(i,1)*spam_count_prob.get(j)#计算垃圾邮件下,各测试样本的后验概率\n",
98 | " Px_normal[i]=Px_normal.get(i,1)*normal_count_prob.get(j)#计算正常邮件下,各测试样本的后验概率\n",
99 | "\n",
100 | " test_classify[i]=Py0*Px_normal.get(i,0),Py1*Px_spam.get(i,0) #后验概率P(Y|X)=P(Y)*P(X|Y)/P(X)\n",
101 | "\n",
102 | " #比较各样本属于不同分类时(正常/垃圾)的后验概率,去后验概率大的为样本分类结果\n",
103 | " results={} #用于存放邮件判定结果\n",
104 | " for key,value in test_classify.items():\n",
105 | " if value[0]<=value[1]: #value[0]-样本为正常邮件的后验概率,value[1]-样本为垃圾邮件的后验概率\n",
106 | " results[key]=1\n",
107 | " else:\n",
108 | " results[key]=0\n",
109 | "\n",
110 | " \"\"\"计算分类准确率\"\"\"\n",
111 | " count=0 #计数,统计被正确分类的邮件数量\n",
112 | " for key,value in results.items():\n",
113 | " if value==Y_test[key]:\n",
114 | " count+=1\n",
115 | " score=count/len(Y_test)\n",
116 | " \n",
117 | " print (\"测试样本预测分类为(按索引排序):\")\n",
118 | " print (results.values(),\"\\n\")\n",
119 | " print (\"测试样本实际分类为(按索引排序):\")\n",
120 | " print (Y_test,\"\\n\")\n",
121 | " print (\"NB模型分类准确率为:{0}%\".format(score*100))\n",
122 | "\n",
123 | " return results,score"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "### 测试NB模型"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 36,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "data": {
140 | "text/html": [
141 | "
\n",
142 | "\n",
155 | "
\n",
156 | " \n",
157 | " \n",
158 | " | \n",
159 | " content | \n",
160 | " type | \n",
161 | "
\n",
162 | " \n",
163 | " \n",
164 | " \n",
165 | " | 0 | \n",
166 | " 招商银行信用卡电子账单2018年6月?-?07/13?¥1,540.00?$?0.00?¥1... | \n",
167 | " 0 | \n",
168 | "
\n",
169 | " \n",
170 | " | 1 | \n",
171 | " 密码重置邮件-来自智联招聘? | \n",
172 | " 0 | \n",
173 | "
\n",
174 | " \n",
175 | " | 2 | \n",
176 | " 信用管家消费提醒?-?尊敬的邓莎女士:?您好,感谢您选择招商银行信用卡!?¥2189?¥58... | \n",
177 | " 0 | \n",
178 | "
\n",
179 | " \n",
180 | " | 3 | \n",
181 | " Apple 提供的收据?-?收据?APPLE?ID?348708632@qq.com付款信息... | \n",
182 | " 0 | \n",
183 | "
\n",
184 | " \n",
185 | " | 4 | \n",
186 | " 信用管家消费提醒?-?尊敬的邓莎女士:?您好,感谢您选择招商银行信用卡!?¥1540?¥64... | \n",
187 | " 0 | \n",
188 | "
\n",
189 | " \n",
190 | " | 5 | \n",
191 | " 6月20日徐晨阳《硅谷创新机制解密》报告?-?各位校友:?通知请见:https://www.... | \n",
192 | " 0 | \n",
193 | "
\n",
194 | " \n",
195 | " | 6 | \n",
196 | " 中国科学技术大学六十周年校庆纪念活动 校友邀请函?-??尊敬的校友:?您好!红专并进一甲子,... | \n",
197 | " 0 | \n",
198 | "
\n",
199 | " \n",
200 | " | 7 | \n",
201 | " 少女心晒一“夏”,ELLE Club等你解锁夏季最潮玩法!(?-?如果您不能正常浏览此邮件,... | \n",
202 | " 1 | \n",
203 | "
\n",
204 | " \n",
205 | " | 8 | \n",
206 | " 网上购票系统--用户支付通知?-??尊敬的?邓女士:?您好!?您于2018年06月04日在中... | \n",
207 | " 0 | \n",
208 | "
\n",
209 | " \n",
210 | "
\n",
211 | "
"
212 | ],
213 | "text/plain": [
214 | " content type\n",
215 | "0 招商银行信用卡电子账单2018年6月?-?07/13?¥1,540.00?$?0.00?¥1... 0\n",
216 | "1 密码重置邮件-来自智联招聘? 0\n",
217 | "2 信用管家消费提醒?-?尊敬的邓莎女士:?您好,感谢您选择招商银行信用卡!?¥2189?¥58... 0\n",
218 | "3 Apple 提供的收据?-?收据?APPLE?ID?348708632@qq.com付款信息... 0\n",
219 | "4 信用管家消费提醒?-?尊敬的邓莎女士:?您好,感谢您选择招商银行信用卡!?¥1540?¥64... 0\n",
220 | "5 6月20日徐晨阳《硅谷创新机制解密》报告?-?各位校友:?通知请见:https://www.... 0\n",
221 | "6 中国科学技术大学六十周年校庆纪念活动 校友邀请函?-??尊敬的校友:?您好!红专并进一甲子,... 0\n",
222 | "7 少女心晒一“夏”,ELLE Club等你解锁夏季最潮玩法!(?-?如果您不能正常浏览此邮件,... 1\n",
223 | "8 网上购票系统--用户支付通知?-??尊敬的?邓女士:?您好!?您于2018年06月04日在中... 0"
224 | ]
225 | },
226 | "execution_count": 36,
227 | "metadata": {},
228 | "output_type": "execute_result"
229 | }
230 | ],
231 | "source": [
232 | "#读取数据\n",
233 | "emails=pd.read_csv(r\"E:\\python\\data\\emails_spam.csv\",encoding=\"utf-8\")\n",
234 | "emails.head(9)"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 37,
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "#清洗数据\n",
244 | "def text_format():\n",
245 | " import jieba\n",
246 | " import re\n",
247 | " import pandas as pd\n",
248 | " \n",
249 | " print (\"待处理文本格式要求:utf-8编码格式,仅包含待处理文本,每行为一条文本\")\n",
250 | " text_path=input(\"请输入待清洗文本路径+名字:\")\n",
251 | " \n",
252 | " #加载用户自定义词典用于分词\n",
253 | " userdict_path=input(\"请输入自定义分词词典路径+名字(可不输入):\")\n",
254 | " if userdict_path !=\"\":\n",
255 | " jieba.load_userdict(userdict_path)\n",
256 | " \n",
257 | " #根据用户输入地址,读取文件\n",
258 | " with open(text_path,\"r\",encoding=\"utf-8\") as file:\n",
259 | " text=file.readlines()\n",
260 | " for i in range(len(text)):\n",
261 | " text[i]=text[i].strip()\n",
262 | " \n",
263 | " #定义一个空列表,用于存放分词后的文本,长度和text一致\n",
264 | " text_word=[[] for i in range(len(text))]\n",
265 | " \n",
266 | " splitter=re.compile(r\"\\W+|\\d+|[a-z]+\") #正则匹配,去除文本中的符号、数字、字母等非中文字符的元素\n",
267 | " for i in range(len(text)):\n",
268 | " text[i]=splitter.split(text[i].lower())\n",
269 | " text[i]=[word for word in text[i] if len(word)>1] #每条文本已经被分为一段一段的句子,每条文本此时是一个list,先去除其中字段长度小于等于1的单词\n",
270 | " for word in text[i]:\n",
271 | " text_word[i].extend(jieba.lcut(word))\n",
272 | " text_word[i]=\" \".join(text_word[i]) #为了便于TfidfVectorizer等文本向量化处理,将每条标题用元素用空格连起来\n",
273 | " \n",
274 | " return text_word"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 38,
280 | "metadata": {},
281 | "outputs": [
282 | {
283 | "name": "stdout",
284 | "output_type": "stream",
285 | "text": [
286 | "待处理文本格式要求:utf-8编码格式,仅包含待处理文本,每行为一条文本\n",
287 | "请输入待清洗文本路径+名字:E:\\python\\data\\emails_spam.txt\n",
288 | "请输入自定义分词词典路径+名字(可不输入):\n"
289 | ]
290 | }
291 | ],
292 | "source": [
293 | "emails_format=text_format()"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": 39,
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "from sklearn.cross_validation import train_test_split\n",
303 | "from sklearn.feature_extraction.text import CountVectorizer\n",
304 | "\n",
305 | "#建立训练集、测试集\n",
306 | "label=emails.type.tolist()\n",
307 | "X_train,X_test,Y_train,Y_test=train_test_split(emails_format,label,test_size=0.2,random_state=7)\n",
308 | "\n",
309 | "#加载并处理停用词典\n",
310 | "with open(r\"E:\\python\\data\\stopwords.txt\",\"r\",encoding=\"utf-8\") as file:\n",
311 | " stop_words=file.readlines()\n",
312 | "for i in range(len(stop_words)):\n",
313 | " stop_words[i]=stop_words[i].strip(\"\\n\")\n",
314 | " \n",
315 | "#构成词袋模型,记录各个词出现的次数\n",
316 | "cv=CountVectorizer(stop_words=stop_words)\n",
317 | "X_train_count=cv.fit_transform(X_train)\n",
318 | "X_test_count=cv.transform(X_test)"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {},
324 | "source": [
325 | "#### 将数据带入NB模型进行测试"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 40,
331 | "metadata": {},
332 | "outputs": [
333 | {
334 | "name": "stdout",
335 | "output_type": "stream",
336 | "text": [
337 | "测试样本预测分类为(按索引排序):\n",
338 | "dict_values([0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) \n",
339 | "\n",
340 | "测试样本实际分类为(按索引排序):\n",
341 | "[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0] \n",
342 | "\n",
343 | "NB模型分类准确率为:86.66666666666667%\n"
344 | ]
345 | },
346 | {
347 | "data": {
348 | "text/plain": [
349 | "({0: 0,\n",
350 | " 1: 0,\n",
351 | " 2: 0,\n",
352 | " 3: 1,\n",
353 | " 4: 0,\n",
354 | " 5: 1,\n",
355 | " 6: 0,\n",
356 | " 7: 0,\n",
357 | " 8: 0,\n",
358 | " 9: 0,\n",
359 | " 10: 0,\n",
360 | " 11: 0,\n",
361 | " 12: 0,\n",
362 | " 13: 0,\n",
363 | " 14: 0},\n",
364 | " 0.8666666666666667)"
365 | ]
366 | },
367 | "execution_count": 40,
368 | "metadata": {},
369 | "output_type": "execute_result"
370 | }
371 | ],
372 | "source": [
373 | "#将数据带入NB模型进行测试\n",
374 | "wheel_nb(X_train_count.toarray(),Y_train,X_test_count.toarray(),Y_test)"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": null,
380 | "metadata": {},
381 | "outputs": [],
382 | "source": []
383 | }
384 | ],
385 | "metadata": {
386 | "kernelspec": {
387 | "display_name": "Python 3",
388 | "language": "python",
389 | "name": "python3"
390 | },
391 | "language_info": {
392 | "codemirror_mode": {
393 | "name": "ipython",
394 | "version": 3
395 | },
396 | "file_extension": ".py",
397 | "mimetype": "text/x-python",
398 | "name": "python",
399 | "nbconvert_exporter": "python",
400 | "pygments_lexer": "ipython3",
401 | "version": "3.6.4"
402 | }
403 | },
404 | "nbformat": 4,
405 | "nbformat_minor": 2
406 | }
407 |
--------------------------------------------------------------------------------
/造个轮子-KNN.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 二、KNN的自定义函数实现\n",
8 | " - 算法实现: (小数据量,线性扫描)\n",
9 | " - https://www.cnblogs.com/hemiy/p/6155425.html\n",
10 | " 1. 输入x与训练集各点的距离distance\n",
11 | " 2. 按distance排序,取distance最近的k个点(k为用户输入)\n",
12 | " 3. 对k个点的类归类计数,x归为多数类(多数表决)\n",
13 | " 4. or 对k个点按1/distance权重归类计数,x归为计数大的类(加权表决)\n",
14 | "\n",
15 | " - 对于大数据量,线性扫描效率极低,于是采用kd树储存训练集,通过搜索kd树的方法寻找输入的近邻,将输入归类(算法如何实现?自定义函数2)"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": null,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "import pandas as pd\n",
25 | "import numpy as np\n",
26 | "import operator"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "### 自定义KNN分类器\n",
34 | " - newInput: 新输入的待分类数据(x_test),**本分类器一次只能对一个新输入分类**\n",
35 | " - dataset:输入的训练数据集(x_train),array类型,**每一行为一个输入训练集**\n",
36 | " - labels:输入训练集对应的类别标签(y_train),**格式为['A','B']而不是[['A'],['B']]**\n",
37 | " - k:近邻数\n",
38 | " - weight:决策规则,\"uniform\" 多数表决法,\"distance\" 距离加权表决法"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
47 | "# newInput: 新输入的待分类数据(x_test),本分类器一次只能对一个新输入分类\n",
48 | "# dataset:输入的训练数据集(x_train),array类型,每一行为一个输入训练集\n",
49 | "# labels:输入训练集对应的类别标签(y_train),格式为['A','B']而不是[['A'],['B']]\n",
50 | "# k:近邻数\n",
51 | "# weight:决策规则,\"uniform\" 多数表决法,\"distance\" 距离加权表决法\n",
52 | "\n",
53 | "def KNNClassify(newInput, dataset, labels, k, weight):\n",
54 | " numSamples=dataset.shape[0]\n",
55 | " \n",
56 | " \"\"\"step1: 计算待分类数据与训练集各数据点的距离(欧氏距离:距离差值平方和开根号)\"\"\"\n",
57 | " diff=np.tile(newInput,(numSamples,1)) - dataset # 凸显numpy数组的高效性——元素级的运算\n",
58 | " squaredist=diff**2\n",
59 | " distance = (squaredist.sum(axis=1))**0.5 # axis=1,按行累加\n",
60 | " \n",
61 | " \"\"\"step2:将距离按升序排序,并取距离最近的k个近邻点\"\"\"\n",
62 | " # 对数组distance按升序排序,返回数组排序后的值对应的索引值\n",
63 | " sortedDistance=distance.argsort() \n",
64 | " \n",
65 | " # 定义一个空字典,存放k个近邻点的分类计数\n",
66 | " classCount={}\n",
67 | " \n",
68 | " # 对k个近邻点分类计数,多数表决法\n",
69 | " for i in range(k):\n",
70 | " # 第i个近邻点在distance数组中的索引,对应的分类\n",
71 | " votelabel=labels[sortedDistance[i]]\n",
72 | " if weight==\"uniform\":\n",
73 | " # votelabel作为字典的key,对相同的key值累加(多数表决法)\n",
74 | " classCount[votelabel]=classCount.get(votelabel,0)+1 \n",
75 | " elif weight==\"distance\":\n",
76 | " # 对相同的key值按距离加权累加(加权表决法)\n",
77 | " classCount[votelabel]=classCount.get(votelabel,0)+(1/distance[sortedDistance[i]])\n",
78 | " else:\n",
79 | " print (\"分类决策规则错误!\")\n",
80 | " print (\"\\\"uniform\\\"多数表决法\\\"distance\\\"距离加权表决法\")\n",
81 | " break \n",
82 | " \n",
83 | " # 对k个近邻点的分类计数按降序排序,返回得票数最多的分类结果\n",
84 | " sortedClassCount=sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)\n",
85 | " if weight==\"uniform\":\n",
86 | " print (\"新输入到训练集的最近%d个点的计数为:\"%k,\"\\n\",classCount)\n",
87 | " print (\"新输入的类别是:\", sortedClassCount[0][0])\n",
88 | " \n",
89 | " elif weight==\"distance\":\n",
90 | " print (\"新输入到训练集的最近%d个点的距离加权计数为:\"%k,\"\\n\",classCount)\n",
91 | " print (\"新输入的类别是:\", sortedClassCount[0][0])\n",
92 | " \n",
93 | " return sortedClassCount[0][0]"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "#### 鸢尾花数据集分类测试"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [
108 | {
109 | "data": {
110 | "text/html": [
111 | "\n",
112 | "\n",
125 | "
\n",
126 | " \n",
127 | " \n",
128 | " | \n",
129 | " sepallength | \n",
130 | " sepalwidth | \n",
131 | " petallength | \n",
132 | " petalwidth | \n",
133 | " species | \n",
134 | "
\n",
135 | " \n",
136 | " \n",
137 | " \n",
138 | " | 0 | \n",
139 | " 5.1 | \n",
140 | " 3.5 | \n",
141 | " 1.4 | \n",
142 | " 0.2 | \n",
143 | " Iris-setosa | \n",
144 | "
\n",
145 | " \n",
146 | " | 1 | \n",
147 | " 4.9 | \n",
148 | " 3.0 | \n",
149 | " 1.4 | \n",
150 | " 0.2 | \n",
151 | " Iris-setosa | \n",
152 | "
\n",
153 | " \n",
154 | " | 2 | \n",
155 | " 4.7 | \n",
156 | " 3.2 | \n",
157 | " 1.3 | \n",
158 | " 0.2 | \n",
159 | " Iris-setosa | \n",
160 | "
\n",
161 | " \n",
162 | " | 3 | \n",
163 | " 4.6 | \n",
164 | " 3.1 | \n",
165 | " 1.5 | \n",
166 | " 0.2 | \n",
167 | " Iris-setosa | \n",
168 | "
\n",
169 | " \n",
170 | " | 4 | \n",
171 | " 5.0 | \n",
172 | " 3.6 | \n",
173 | " 1.4 | \n",
174 | " 0.2 | \n",
175 | " Iris-setosa | \n",
176 | "
\n",
177 | " \n",
178 | "
\n",
179 | "
"
180 | ],
181 | "text/plain": [
182 | " sepallength sepalwidth petallength petalwidth species\n",
183 | "0 5.1 3.5 1.4 0.2 Iris-setosa\n",
184 | "1 4.9 3.0 1.4 0.2 Iris-setosa\n",
185 | "2 4.7 3.2 1.3 0.2 Iris-setosa\n",
186 | "3 4.6 3.1 1.5 0.2 Iris-setosa\n",
187 | "4 5.0 3.6 1.4 0.2 Iris-setosa"
188 | ]
189 | },
190 | "execution_count": 137,
191 | "metadata": {},
192 | "output_type": "execute_result"
193 | }
194 | ],
195 | "source": [
196 | "iris=pd.read_csv(\"E:\\python\\practice\\iris.txt\")\n",
197 | "iris.head()"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "### 建立训练集、测试集\n",
205 | " - 注意训练集x、y的格式"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {},
212 | "outputs": [
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "[5.1 3.8 1.5 0.3]\n",
218 | "['Iris-setosa']\n"
219 | ]
220 | },
221 | {
222 | "data": {
223 | "text/plain": [
224 | "array(['Iris-virginica', 'Iris-versicolor', 'Iris-setosa',\n",
225 | " 'Iris-versicolor', 'Iris-virginica', 'Iris-setosa',\n",
226 | " 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',\n",
227 | " 'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',\n",
228 | " 'Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-versicolor',\n",
229 | " 'Iris-virginica', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',\n",
230 | " 'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor',\n",
231 | " 'Iris-virginica', 'Iris-virginica', 'Iris-virginica',\n",
232 | " 'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',\n",
233 | " 'Iris-virginica', 'Iris-virginica', 'Iris-versicolor',\n",
234 | " 'Iris-setosa', 'Iris-virginica', 'Iris-versicolor', 'Iris-setosa',\n",
235 | " 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',\n",
236 | " 'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',\n",
237 | " 'Iris-virginica', 'Iris-versicolor', 'Iris-setosa',\n",
238 | " 'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',\n",
239 | " 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',\n",
240 | " 'Iris-setosa', 'Iris-virginica', 'Iris-versicolor',\n",
241 | " 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',\n",
242 | " 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica',\n",
243 | " 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',\n",
244 | " 'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',\n",
245 | " 'Iris-virginica', 'Iris-setosa', 'Iris-virginica', 'Iris-setosa',\n",
246 | " 'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor',\n",
247 | " 'Iris-setosa', 'Iris-versicolor', 'Iris-setosa', 'Iris-virginica',\n",
248 | " 'Iris-virginica', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',\n",
249 | " 'Iris-versicolor', 'Iris-virginica', 'Iris-setosa',\n",
250 | " 'Iris-virginica', 'Iris-virginica', 'Iris-versicolor',\n",
251 | " 'Iris-setosa', 'Iris-versicolor', 'Iris-setosa', 'Iris-virginica',\n",
252 | " 'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',\n",
253 | " 'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',\n",
254 | " 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',\n",
255 | " 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica',\n",
256 | " 'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',\n",
257 | " 'Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',\n",
258 | " 'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',\n",
259 | " 'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',\n",
260 | " 'Iris-virginica', 'Iris-virginica', 'Iris-versicolor',\n",
261 | " 'Iris-virginica', 'Iris-setosa', 'Iris-versicolor',\n",
262 | " 'Iris-versicolor'], dtype=object)"
263 | ]
264 | },
265 | "execution_count": 138,
266 | "metadata": {},
267 | "output_type": "execute_result"
268 | }
269 | ],
270 | "source": [
271 | "iris_x=iris.iloc[:,[0,1,2,3]]\n",
272 | "iris_y=iris.iloc[:,[4]]\n",
273 | "\n",
274 | "np.random.seed(7)\n",
275 | "indices=np.random.permutation(len(iris_x))\n",
276 | "\n",
277 | "iris_x_train=iris_x.iloc[indices[0:130]]\n",
278 | "iris_y_train=iris_y.iloc[indices[0:130]]\n",
279 | "\n",
280 | "iris_x_test=iris_x.iloc[indices[130:150]]\n",
281 | "iris_y_test=iris_y.iloc[indices[130:150]]\n",
282 | "\n",
283 | "# 将dataframe格式的数据转换为numpy array格式,便于调用函数计算\n",
284 | "iris_x_train=np.array(iris_x_train)\n",
285 | "iris_y_train=np.array(iris_y_train)\n",
286 | "\n",
287 | "iris_x_test=np.array(iris_x_test)\n",
288 | "iris_y_test=np.array(iris_y_test) \n",
289 | "\n",
290 | "print (iris_x_test[1])\n",
291 | "print (iris_y_test[1])\n",
292 | "\n",
293 | "\"\"\"运行错误测试:\n",
294 | "dis=(((np.tile(iris_x_test[1],(130,1))-iris_x_train)**2).sum(axis=1))**0.5\n",
295 | "sortdis=dis.argsort()\n",
296 | "cc={}\n",
297 | "for i in range(10):\n",
298 | " votel=iris_y_train[sortdis[i]]\n",
299 | " cc[votel]=cc.get(votel,0)+1\n",
300 | "\n",
301 | "sortedcc=sorted(cc,key=operator.itemgetter(1),reversed=True)\n",
302 | "sortedcc[0][0]\"\"\"\n",
303 | "\n",
304 | "# 将labels的形状设置为(130,)\n",
305 | "iris_y_train.shape=(130,)\n",
306 | "iris_y_train"
307 | ]
308 | },
309 | {
310 | "cell_type": "markdown",
311 | "metadata": {},
312 | "source": [
313 | "### 将训练集、测试集代入自定义KNN分类器进行分类"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 139,
319 | "metadata": {},
320 | "outputs": [
321 | {
322 | "name": "stdout",
323 | "output_type": "stream",
324 | "text": [
325 | "新输入到训练集的最近20个点的距离加权计数为: \n",
326 | " {'Iris-versicolor': 45.596003202769246}\n",
327 | "新输入的类别是: Iris-versicolor\n",
328 | "Iris-versicolor\n",
329 | "新输入的实际类别是: ['Iris-versicolor']\n",
330 | "\n",
331 | "\n",
332 | "预测准确!\n"
333 | ]
334 | }
335 | ],
336 | "source": [
337 | "# 将训练集、测试集代入自定义KNN分类器进行分类\n",
338 | "test_index=12\n",
339 | "predict=KNNClassify(iris_x_test[test_index],iris_x_train,iris_y_train,20,\"distance\")\n",
340 | "print (predict)\n",
341 | "print (\"新输入的实际类别是:\", iris_y_test[test_index])\n",
342 | "print (\"\\n\")\n",
343 | "\n",
344 | "if predict==iris_y_test[test_index]:\n",
345 | " print (\"预测准确!\")\n",
346 | "else:\n",
347 | " print (\"预测错误!\")"
348 | ]
349 | },
350 | {
351 | "cell_type": "markdown",
352 | "metadata": {},
353 | "source": [
354 | "#### 另一组简单的测试数据分类"
355 | ]
356 | },
357 | {
358 | "cell_type": "code",
359 | "execution_count": 140,
360 | "metadata": {},
361 | "outputs": [
362 | {
363 | "name": "stdout",
364 | "output_type": "stream",
365 | "text": [
366 | "新输入到训练集的最近4个点的距离加权计数为: \n",
367 | " {'A': 9.472135954999581, 'B': 1.4018812887604746}\n",
368 | "新输入的类别是: A\n"
369 | ]
370 | },
371 | {
372 | "data": {
373 | "text/plain": [
374 | "'A'"
375 | ]
376 | },
377 | "execution_count": 140,
378 | "metadata": {},
379 | "output_type": "execute_result"
380 | }
381 | ],
382 | "source": [
383 | "# 另一组简单的测试数据分类\n",
384 | "group = np.array([[1.0, 0.9], [1.0, 1.0], [0.1, 0.2], [0.0, 0.1]])\n",
385 | "labels = np.array(['A', 'A', 'B', 'B'])\n",
386 | "testX = np.array([1.2, 1.0])\n",
387 | "\n",
388 | "KNNClassify(testX,group,labels,4,\"distance\")"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 141,
394 | "metadata": {},
395 | "outputs": [
396 | {
397 | "data": {
398 | "text/plain": [
399 | "(4,)"
400 | ]
401 | },
402 | "execution_count": 141,
403 | "metadata": {},
404 | "output_type": "execute_result"
405 | }
406 | ],
407 | "source": [
408 | "labels = np.array(['A', 'A', 'B', 'B'])\n",
409 | "labels.shape"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": 142,
415 | "metadata": {},
416 | "outputs": [
417 | {
418 | "data": {
419 | "text/plain": [
420 | "(130,)"
421 | ]
422 | },
423 | "execution_count": 142,
424 | "metadata": {},
425 | "output_type": "execute_result"
426 | }
427 | ],
428 | "source": [
429 | "iris_y_train.shape"
430 | ]
431 | }
432 | ],
433 | "metadata": {
434 | "kernelspec": {
435 | "display_name": "Python 3",
436 | "language": "python",
437 | "name": "python3"
438 | },
439 | "language_info": {
440 | "codemirror_mode": {
441 | "name": "ipython",
442 | "version": 3
443 | },
444 | "file_extension": ".py",
445 | "mimetype": "text/x-python",
446 | "name": "python",
447 | "nbconvert_exporter": "python",
448 | "pygments_lexer": "ipython3",
449 | "version": "3.6.4"
450 | }
451 | },
452 | "nbformat": 4,
453 | "nbformat_minor": 2
454 | }
455 |
--------------------------------------------------------------------------------
/AdaBoost.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "from numpy import *\n",
11 | "import time"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "### 自定义函数实现AdaBoost\n",
19 | "- 包含函数\n",
20 | " 1. stumpClassify() 通过阈值对数据分类\n",
21 | " 2. buildStump() 生成单层决策树,需要调用stumpClassify()\n",
22 | " 3. adaBoostTrainDS() 训练出多个弱决策树分类器,需要调用buildStump() \n",
23 | " 4. adaClassify() 利用训练好的弱分类器分类数据\n",
24 | " 5. loadDataSet() 加载数据集"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "#### 1. 构建单层决策树\n",
32 | "- 遍历数据集的每个特征:\n",
33 | " - 遍历特征的每个步长:\n",
34 | " - 遍历步长的每个阈值对比方式:\n",
35 | " - 计算每次迭代的weightedError\n",
36 | "- 认为weightedError最小的点(特征,阈值,方式)是最佳决策点,以此构建一棵决策树桩(stump)"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 4,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
45 | "#通过阈值对数据分类+1 -1\n",
46 | "#dimen为dataMat的列索引值,即特征位置;threshIneq为阈值对比方式,大于或小于\n",
47 | "def stumpClassify(dataMatrix,dimen,threshVal,threshIneq):\n",
48 | " retArray=ones((shape(dataMatrix)[0],1))#注意,有两个()\n",
49 | " #阈值的模式,将小于某一阈值的特征归类为-1\n",
50 | " if threshIneq=='lt':#less than\n",
51 | " retArray[dataMatrix[:,dimen]<=threshVal]=-1.0\n",
52 | " #将大于某一阈值的特征归类为-1\n",
53 | " else:#greater than\n",
54 | " retArray[dataMatrix[:,dimen]>threshVal]=-1.0\n",
55 | " return retArray\n",
56 | "\n",
57 | "#单层决策树生成函数\n",
58 | "#D为各样本权重,shape=(m,1);label为样本标签,shape=(1,m)\n",
59 | "def buildStump(dataArr,classLabels,D):\n",
60 | "#将数据集和标签列表转为矩阵形式\n",
61 | " dataMatrix=mat(dataArr);labelMat=mat(classLabels).T\n",
62 | " m,n=shape(dataMatrix)\n",
63 | " #步长或区间总数 最优决策树信息 最优单层决策树预测结果\n",
64 | " numSteps=10.0;bestStump={};bestClasEst=mat(zeros((m,1))) #注意,有两个()\n",
65 | " #最小错误率初始化为+∞\n",
66 | " minError=inf\n",
67 | " \n",
68 | " #遍历数据集的每个特征:遍历特征的每个步长:遍历步长的每个阈值对比方式\n",
69 | " for i in range(n):\n",
70 | " #找出列中特征值的最小值和最大值\n",
71 | " rangeMin=dataMatrix[:,i].min();rangeMax=dataMatrix[:,i].max()\n",
72 | " #求取步长大小或者说区间间隔\n",
73 | " stepSize=(rangeMax-rangeMin)/numSteps\n",
74 | " #遍历各个步长区间\n",
75 | " for j in range(-1,int(numSteps)+1):\n",
76 | " #两种阈值过滤模式\n",
77 | " for inequal in ['lt','gt']:\n",
78 | " threshVal=rangeMin+float(j)*stepSize\n",
79 | " #选定阈值后,调用阈值过滤函数分类预测\n",
80 | " predictedVals=\\\n",
81 | " stumpClassify(dataMatrix,i,threshVal,inequal)\n",
82 | " #初始化错误向量\n",
83 | " errArr=mat(ones((m,1)))\n",
84 | " #将错误向量中分类正确项置0\n",
85 | " errArr[predictedVals==labelMat]=0\n",
86 | " #计算\"加权\"的错误率\n",
87 | " weigthedError=D.T*errArr\n",
88 | " #print (\"分割特征为第{0}个,分割阈值为{1},分割方式为{2},weight error为{3}\"\\\n",
89 | " # .format(i+1,threshVal,threshIneq,weightedErr))\n",
90 | " if weigthedError阈值时认为是+1.0类\n",
454 | " 将该阈值样本的假设类别-1.0与该阈值样本的真实类别classLabels[index]对比\n",
455 | " 若真实类别为1.0,则拉低真阳率(漏报率);若真实类别为-1.0,则拉低假阳率(误报率)\n",
456 | " \"\"\"\n",
457 | " for index in sortedIndex.tolist()[0]:\n",
458 | " #若判别为最小的样本真实为正例(说明正例被错判,FN),则减小Y值(TP/(TP+FN))\n",
459 | " if classLabels[index]==1.0: \n",
460 | " delX=0.0;delY=yStep\n",
461 | " #若判别为最小的样本真实为反例(说明反例被对判,TN),则减小X值(FP/(FP+TN))\n",
462 | " else:\n",
463 | " delX=xStep;delY=0.0\n",
464 | " ySum+=cur[1] #x每移动一个xStep,ySum就叠加一个当时的y值,用于计算曲线下矩形面积\n",
465 | " \n",
466 | " ax.plot([cur[0],cur[0]-delX],[cur[1],cur[1]-delY],c='r') #绘制ROC曲线\n",
467 | " cur=(cur[0]-delX,cur[1]-delY) #更新光标位置\n",
468 | " \n",
469 | " ax.plot([0,1],[0,1],'b--') #[0,1],[0,1]表示x,y值的一一对应关系,即(0,0)(1,1)两个点,并用线连接起来\n",
470 | " plt.xlabel(\"假阳率\",fontsize=16,fontweight=\"bold\")\n",
471 | " plt.ylabel(\"真阳率\",fontsize=16,fontweight=\"bold\")\n",
472 | " plt.title(\"AdaBoost疝病马预测ROC曲线\",fontsize=20,fontweight=\"bold\")\n",
473 | " ax.axis([0,1,0,1])\n",
474 | " plt.show()\n",
475 | " print(\"ROC曲线的AUC为:\",ySum*xStep)"
476 | ]
477 | },
478 | {
479 | "cell_type": "code",
480 | "execution_count": 21,
481 | "metadata": {},
482 | "outputs": [],
483 | "source": [
484 | "dataArr,labelArr=loadDataSet(r'D:\\DM\\python\\data\\MLiA_SourceCode\\machinelearninginaction\\Ch07\\horseColicTraining2.txt')\n",
485 | "classifierArr,trainErrorRate,aggClassEst=adaBoostTrainDS(dataArr,labelArr,50)"
486 | ]
487 | },
488 | {
489 | "cell_type": "code",
490 | "execution_count": 59,
491 | "metadata": {
492 | "collapsed": true
493 | },
494 | "outputs": [
495 | {
496 | "data": {
497 | "text/plain": [
498 | "[176,\n",
499 | " 35,\n",
500 | " 40,\n",
501 | " 192,\n",
502 | " 272,\n",
503 | " 49,\n",
504 | " 101,\n",
505 | " 16,\n",
506 | " 83,\n",
507 | " 181,\n",
508 | " 225,\n",
509 | " 44,\n",
510 | " 230,\n",
511 | " 172,\n",
512 | " 256,\n",
513 | " 170,\n",
514 | " 59,\n",
515 | " 179,\n",
516 | " 97,\n",
517 | " 168,\n",
518 | " 227,\n",
519 | " 244,\n",
520 | " 285,\n",
521 | " 296,\n",
522 | " 150,\n",
523 | " 239,\n",
524 | " 141,\n",
525 | " 295,\n",
526 | " 121,\n",
527 | " 45,\n",
528 | " 291,\n",
529 | " 46,\n",
530 | " 173,\n",
531 | " 43,\n",
532 | " 234,\n",
533 | " 73,\n",
534 | " 245,\n",
535 | " 246,\n",
536 | " 210,\n",
537 | " 86,\n",
538 | " 30,\n",
539 | " 134,\n",
540 | " 159,\n",
541 | " 48,\n",
542 | " 290,\n",
543 | " 109,\n",
544 | " 113,\n",
545 | " 133,\n",
546 | " 178,\n",
547 | " 204,\n",
548 | " 128,\n",
549 | " 108,\n",
550 | " 280,\n",
551 | " 219,\n",
552 | " 99,\n",
553 | " 110,\n",
554 | " 281,\n",
555 | " 80,\n",
556 | " 58,\n",
557 | " 252,\n",
558 | " 8,\n",
559 | " 292,\n",
560 | " 171,\n",
561 | " 200,\n",
562 | " 253,\n",
563 | " 229,\n",
564 | " 262,\n",
565 | " 123,\n",
566 | " 15,\n",
567 | " 64,\n",
568 | " 273,\n",
569 | " 249,\n",
570 | " 36,\n",
571 | " 261,\n",
572 | " 62,\n",
573 | " 203,\n",
574 | " 151,\n",
575 | " 207,\n",
576 | " 216,\n",
577 | " 0,\n",
578 | " 54,\n",
579 | " 91,\n",
580 | " 184,\n",
581 | " 140,\n",
582 | " 75,\n",
583 | " 177,\n",
584 | " 221,\n",
585 | " 70,\n",
586 | " 135,\n",
587 | " 78,\n",
588 | " 104,\n",
589 | " 209,\n",
590 | " 57,\n",
591 | " 72,\n",
592 | " 271,\n",
593 | " 34,\n",
594 | " 3,\n",
595 | " 117,\n",
596 | " 195,\n",
597 | " 297,\n",
598 | " 4,\n",
599 | " 19,\n",
600 | " 294,\n",
601 | " 241,\n",
602 | " 10,\n",
603 | " 242,\n",
604 | " 162,\n",
605 | " 270,\n",
606 | " 147,\n",
607 | " 238,\n",
608 | " 143,\n",
609 | " 31,\n",
610 | " 55,\n",
611 | " 93,\n",
612 | " 126,\n",
613 | " 237,\n",
614 | " 247,\n",
615 | " 37,\n",
616 | " 254,\n",
617 | " 286,\n",
618 | " 84,\n",
619 | " 68,\n",
620 | " 282,\n",
621 | " 18,\n",
622 | " 63,\n",
623 | " 164,\n",
624 | " 287,\n",
625 | " 174,\n",
626 | " 28,\n",
627 | " 186,\n",
628 | " 278,\n",
629 | " 39,\n",
630 | " 218,\n",
631 | " 167,\n",
632 | " 25,\n",
633 | " 258,\n",
634 | " 74,\n",
635 | " 196,\n",
636 | " 263,\n",
637 | " 274,\n",
638 | " 26,\n",
639 | " 232,\n",
640 | " 251,\n",
641 | " 131,\n",
642 | " 20,\n",
643 | " 56,\n",
644 | " 118,\n",
645 | " 188,\n",
646 | " 79,\n",
647 | " 13,\n",
648 | " 226,\n",
649 | " 66,\n",
650 | " 114,\n",
651 | " 17,\n",
652 | " 215,\n",
653 | " 124,\n",
654 | " 24,\n",
655 | " 41,\n",
656 | " 190,\n",
657 | " 160,\n",
658 | " 206,\n",
659 | " 156,\n",
660 | " 130,\n",
661 | " 265,\n",
662 | " 51,\n",
663 | " 82,\n",
664 | " 250,\n",
665 | " 266,\n",
666 | " 1,\n",
667 | " 268,\n",
668 | " 154,\n",
669 | " 65,\n",
670 | " 201,\n",
671 | " 298,\n",
672 | " 42,\n",
673 | " 269,\n",
674 | " 205,\n",
675 | " 193,\n",
676 | " 211,\n",
677 | " 33,\n",
678 | " 53,\n",
679 | " 127,\n",
680 | " 163,\n",
681 | " 7,\n",
682 | " 87,\n",
683 | " 9,\n",
684 | " 243,\n",
685 | " 106,\n",
686 | " 231,\n",
687 | " 146,\n",
688 | " 275,\n",
689 | " 220,\n",
690 | " 144,\n",
691 | " 96,\n",
692 | " 105,\n",
693 | " 180,\n",
694 | " 27,\n",
695 | " 90,\n",
696 | " 14,\n",
697 | " 102,\n",
698 | " 185,\n",
699 | " 198,\n",
700 | " 138,\n",
701 | " 187,\n",
702 | " 139,\n",
703 | " 217,\n",
704 | " 119,\n",
705 | " 32,\n",
706 | " 284,\n",
707 | " 259,\n",
708 | " 189,\n",
709 | " 264,\n",
710 | " 11,\n",
711 | " 212,\n",
712 | " 81,\n",
713 | " 88,\n",
714 | " 111,\n",
715 | " 112,\n",
716 | " 228,\n",
717 | " 129,\n",
718 | " 169,\n",
719 | " 222,\n",
720 | " 6,\n",
721 | " 255,\n",
722 | " 157,\n",
723 | " 60,\n",
724 | " 267,\n",
725 | " 94,\n",
726 | " 233,\n",
727 | " 155,\n",
728 | " 136,\n",
729 | " 236,\n",
730 | " 2,\n",
731 | " 52,\n",
732 | " 50,\n",
733 | " 23,\n",
734 | " 5,\n",
735 | " 95,\n",
736 | " 276,\n",
737 | " 240,\n",
738 | " 22,\n",
739 | " 132,\n",
740 | " 166,\n",
741 | " 103,\n",
742 | " 145,\n",
743 | " 152,\n",
744 | " 92,\n",
745 | " 137,\n",
746 | " 120,\n",
747 | " 197,\n",
748 | " 148,\n",
749 | " 61,\n",
750 | " 161,\n",
751 | " 76,\n",
752 | " 12,\n",
753 | " 235,\n",
754 | " 71,\n",
755 | " 142,\n",
756 | " 38,\n",
757 | " 98,\n",
758 | " 199,\n",
759 | " 213,\n",
760 | " 288,\n",
761 | " 100,\n",
762 | " 158,\n",
763 | " 69,\n",
764 | " 279,\n",
765 | " 122,\n",
766 | " 260,\n",
767 | " 47,\n",
768 | " 224,\n",
769 | " 182,\n",
770 | " 107,\n",
771 | " 289,\n",
772 | " 115,\n",
773 | " 149,\n",
774 | " 125,\n",
775 | " 191,\n",
776 | " 223,\n",
777 | " 85,\n",
778 | " 277,\n",
779 | " 77,\n",
780 | " 67,\n",
781 | " 165,\n",
782 | " 21,\n",
783 | " 248,\n",
784 | " 183,\n",
785 | " 175,\n",
786 | " 257,\n",
787 | " 89,\n",
788 | " 214,\n",
789 | " 202,\n",
790 | " 283,\n",
791 | " 194,\n",
792 | " 208,\n",
793 | " 29,\n",
794 | " 116,\n",
795 | " 153,\n",
796 | " 293]"
797 | ]
798 | },
799 | "execution_count": 59,
800 | "metadata": {},
801 | "output_type": "execute_result"
802 | }
803 | ],
804 | "source": [
805 | "argsort(aggClassEst.T).tolist()[0]"
806 | ]
807 | },
808 | {
809 | "cell_type": "code",
810 | "execution_count": 44,
811 | "metadata": {},
812 | "outputs": [
813 | {
814 | "data": {
815 | "text/plain": [
816 | "((299, 1), (299, 1))"
817 | ]
818 | },
819 | "execution_count": 44,
820 | "metadata": {},
821 | "output_type": "execute_result"
822 | }
823 | ],
824 | "source": [
825 | "shape(aggClassEst),shape(mat(labelArr).T)"
826 | ]
827 | },
828 | {
829 | "cell_type": "code",
830 | "execution_count": 63,
831 | "metadata": {},
832 | "outputs": [
833 | {
834 | "data": {
835 | "image/png": "\n",
836 | "text/plain": [
837 | ""
838 | ]
839 | },
840 | "metadata": {},
841 | "output_type": "display_data"
842 | },
843 | {
844 | "name": "stdout",
845 | "output_type": "stream",
846 | "text": [
847 | "ROC曲线的AUC为: 0.8953941870182941\n"
848 | ]
849 | }
850 | ],
851 | "source": [
852 | "plotROC(aggClassEst,mat(labelArr).T)"
853 | ]
854 | },
855 | {
856 | "cell_type": "markdown",
857 | "metadata": {},
858 | "source": [
859 | "### sklearn实现AdaBoost,用于疝病马数据集"
860 | ]
861 | },
862 | {
863 | "cell_type": "code",
864 | "execution_count": 1,
865 | "metadata": {},
866 | "outputs": [],
867 | "source": [
868 | "from sklearn.ensemble import AdaBoostClassifier"
869 | ]
870 | },
871 | {
872 | "cell_type": "code",
873 | "execution_count": 27,
874 | "metadata": {},
875 | "outputs": [],
876 | "source": [
877 | "dataArr,labelArr=loadDataSet(r'D:\\DM\\python\\data\\MLiA_SourceCode\\machinelearninginaction\\Ch07\\horseColicTraining2.txt')\n",
878 | "dataTest,labelTest=loadDataSet(r'D:\\DM\\python\\data\\MLiA_SourceCode\\machinelearninginaction\\Ch07\\horseColicTest2.txt')\n",
879 | "abc=AdaBoostClassifier(n_estimators=50)"
880 | ]
881 | },
882 | {
883 | "cell_type": "code",
884 | "execution_count": 28,
885 | "metadata": {},
886 | "outputs": [
887 | {
888 | "data": {
889 | "text/plain": [
890 | "0.7761194029850746"
891 | ]
892 | },
893 | "execution_count": 28,
894 | "metadata": {},
895 | "output_type": "execute_result"
896 | }
897 | ],
898 | "source": [
899 | "#训练AdaBoost分类器,并在测试集上预测、评估\n",
900 | "abc.fit(dataArr,labelArr)\n",
901 | "Y_pred=abc.predict(dataTest)\n",
902 | "abc.score(dataTest,labelTest)"
903 | ]
904 | },
905 | {
906 | "cell_type": "code",
907 | "execution_count": 33,
908 | "metadata": {},
909 | "outputs": [
910 | {
911 | "name": "stdout",
912 | "output_type": "stream",
913 | "text": [
914 | "[-1. 1.]\n",
915 | "[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.\n",
916 | " 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.\n",
917 | " 1. 1.]\n",
918 | "[0.28428094 0.33214454 0.39257738 0.40697642 0.41930599 0.40339763\n",
919 | " 0.4357326 0.42456092 0.44568626 0.42683673 0.45293997 0.45518735\n",
920 | " 0.4225583 0.43627423 0.43044037 0.42723727 0.4744538 0.431832\n",
921 | " 0.44713836 0.44962627 0.46277843 0.43343921 0.43638023 0.47462149\n",
922 | " 0.45972328 0.46639559 0.43662019 0.42964436 0.43240759 0.44019402\n",
923 | " 0.48131686 0.48714723 0.47491754 0.45261708 0.45541079 0.45172395\n",
924 | " 0.44629907 0.4741625 0.47014168 0.45891498 0.44262397 0.48395967\n",
925 | " 0.4574864 0.45985565 0.45594619 0.45687237 0.45398568 0.46546841\n",
926 | " 0.47404671 0.45379753]\n"
927 | ]
928 | }
929 | ],
930 | "source": [
931 | "#返回模型属性\n",
932 | "print (abc.classes_) #返回类别标签\n",
933 | "print (abc.estimator_weights_ ) #返回训练得到的各分类器权重 alpham\n",
934 | "print (abc.estimator_errors_) #返回训练得到的各分类器分类误差\n",
935 | "#print (abc.estimators_) #返回训练得到的弱分类器,列表格式"
936 | ]
937 | },
938 | {
939 | "cell_type": "code",
940 | "execution_count": null,
941 | "metadata": {},
942 | "outputs": [],
943 | "source": []
944 | }
945 | ],
946 | "metadata": {
947 | "kernelspec": {
948 | "display_name": "Python 3",
949 | "language": "python",
950 | "name": "python3"
951 | },
952 | "language_info": {
953 | "codemirror_mode": {
954 | "name": "ipython",
955 | "version": 3
956 | },
957 | "file_extension": ".py",
958 | "mimetype": "text/x-python",
959 | "name": "python",
960 | "nbconvert_exporter": "python",
961 | "pygments_lexer": "ipython3",
962 | "version": "3.6.4"
963 | }
964 | },
965 | "nbformat": 4,
966 | "nbformat_minor": 2
967 | }
968 |
--------------------------------------------------------------------------------