├── README.md
├── 人才价格计算器.ipynb
├── 造个轮子-决策树(ID3).ipynb
├── 造个轮子-Naive Bayes.ipynb
├── 造个轮子-KNN.ipynb
└── AdaBoost.ipynb
/README.md:
--------------------------------------------------------------------------------
1 | # Machine_Learning_Algorithm_Wheel
2 | **机器学习算法纯手工轮子,主要目的在于深入理解算法原理和参数,包含:**
3 | ### 1. 分类
4 | - KNN
5 | - Naive Bayes
6 | - Decision Tree
7 | - Logistic Regression
8 | - SVM
9 | - AdaBoost
10 | ### 2. 回归
11 | - Linear Regression/Ridge Regression/Lasso Regression
12 | - Regression Tree:回归树/模型树
13 | ### 3. 聚类
14 | - KMeans/BiKMeans
15 | ### 4. 关联分析
16 | - Apriori:频繁项集、关联规则
17 | - FP-Growth:频繁项集
18 | ### 5. 降维
19 | - PCA
20 | - SVD
21 |
--------------------------------------------------------------------------------
/人才价格计算器.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### 人才价格计算器\n",
8 | " - 输入:求职者——求职职位、工作年限、学历;招聘方——公司规模\n",
9 | " - 输出:在指定公司规模、求职职位和经验、学历的条件下,市场水平的月薪均值\n",
10 | " \n",
11 | " - 原理:KNN分类器改进版(改进输出为离散值)——认为离输入x最近的k个点对应的月薪均值的均值是x的输出\n",
12 | " - k默认为训练样本数/100+1\n",
13 | " - 距离度量为欧氏距离\n",
14 | " - 决策规则:加权距离最近的k个点的y的均值\n",
15 | " - 加权距离:根据不同岗位下,学历、经验、公司规模与薪水的相关系数,计算输入x与训练集x的加权距离\n",
16 | " - y的均值:为了改进输出为离散值的问题,认为离输入x最近的k个点对应的月薪均值的均值是x的输出\n",
17 | " \n",
18 | " - 不足:\n",
19 | " - 输入的工作年限为个体值,但用于计算距离的工作年限为范围均值,会导致在范围交界点的个体值错误归类\n",
20 | " - 改进:将输入的个体值映射到各个工作年限要求范围均值上去(已改进)\n",
21 | " - 没有将职位描述这个信息量丰富的长文本纳入建模\n",
22 | " - 改进:可加入职位描述关键词与求职者能力关键词匹配程度,影响月薪水平\n",
23 | " - 训练样本太少,总共只有2398条数据,分岗位来看就更少了,因为样本量不足,会导致预测结果不准确\n",
24 | " - 改进:从boss、猎聘等网站拓展样本数量"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 9,
30 | "metadata": {},
31 | "outputs": [],
32 | "source": [
33 | "import pandas as pd\n",
34 | "import numpy as np\n",
35 | "import operator"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 13,
41 | "metadata": {},
42 | "outputs": [],
43 | "source": [
44 | "# 人才价格计算器\n",
45 | "\n",
46 | "def KNNClassify(newinput, dataset, labels, data,position,k=None): \n",
47 | " # 若未设定k值,则默认为10\n",
48 | " if k==None:\n",
49 | " k=10\n",
50 | "\n",
51 | " # 由于每个职位各因素与薪水相关性不同,因此引入不同职位的相关系数,放入距离加权计算中\n",
52 | " # 计算不同岗位类型下,公司规模、经验、学历与薪水的相关系数,用于后面计算k个最近点的加权距离\n",
53 | " corr=data[data.classified_zw==position][['salary_mean','companysize_mean','workyear_mean','degree_num']].corr()\n",
54 | " corr=np.array(corr)\n",
55 | " \n",
56 | " #salary_companysize=corr[1][0]\n",
57 | " #salary_workyear=corr[2][0]\n",
58 | " #salary_degree=corr[3][0]\n",
59 | " weight=np.array([corr[1][0],corr[2][0],corr[3][0]])\n",
60 | " \n",
61 | " numSamples=dataset.shape[0]\n",
62 | " \"\"\"step1: 计算待分类数据与训练集各数据点的距离(欧氏距离:距离差值平方和开根号)\"\"\"\n",
63 | " diff=np.tile(newinput,(numSamples,1)) - dataset # 凸显numpy数组的高效性——元素级的运算\n",
64 | " squaredist=(diff**2)/weight # 相关系数越大,因素越重要,表现为加权距离小,所以要除以相关系数\n",
65 | " distance = (squaredist.sum(axis=1))**0.5 # axis=1,按行累加\n",
66 | " \n",
67 | " \"\"\"step2:将距离按升序排序,并取距离最近的k个近邻点\"\"\"\n",
68 | " # 对数组distance按升序排序,返回数组排序后的值对应的索引值\n",
69 | " sortedDistance=distance.argsort() \n",
70 | " \n",
71 | " # 定义一个空字典,存放k个近邻点的分类计数\n",
72 | " classCount={}\n",
73 | " \n",
74 | " # 对k个近邻点分类计数,多数表决法\n",
75 | " for i in range(k):\n",
76 | " # 第i个近邻点在distance数组中的索引,对应的分类\n",
77 | " votelabel=labels[sortedDistance[i]]\n",
78 | " \n",
79 | " # votelabel作为字典的key,对相同的key值累加(多数表决法)\n",
80 | " classCount[votelabel]=classCount.get(votelabel,0)+1 \n",
81 | " \n",
82 | " # 求k个近邻点的y值的均值,作为新输入x的预测输出y\n",
83 | " predict=0\n",
84 | " point_k=0\n",
85 | " for key,value in classCount.items():\n",
86 | " predict+=key*value\n",
87 | " point_k+=value\n",
88 | " predict=predict/point_k\n",
89 | " \n",
90 | " return position,predict\n",
91 | "\n",
92 | "\n",
93 | "# data:清洗后的数据集,如拉勾成都地区招聘岗位的数据,lagou_origin。这里对于data要求比较死,如果更换数据,需要维护代码\n",
94 | "# position:求职职位:数据分析师,算法工程师,java工程师、等\n",
95 | "# newinput=[\"companysize_mean\",\"workyear_mean\",\"degree_num\"]\n",
96 | "def talent_calculator(data,k=None):\n",
97 | "\n",
98 | " # 交互,用户输入职位、公司规模、经验、学历\n",
99 | " print (\"可选职位:java工程师、前端、数据分析师、算法工程师\")\n",
100 | " print (\"产品运营、测试工程师、产品经理、数据挖掘、建模工程师、爬虫工程师、产业研究员职位也可计算,但由于数据量较小,结果准确性差\")\n",
101 | " position=str(input(\"请输入职位:\"))\n",
102 | " companysize=float(input(\"请输入公司规模(人):\"))\n",
103 | " workyear=float(input(\"请输入求职者工作年限(年):\"))\n",
104 | " degree=float(input(\"请输入求职者学历(0-不限,1-大专,2-本科,3-硕士,4-博士):\"))\n",
105 | " \n",
106 | " # 将输入的个体值映射到各个工作年限要求范围均值上去(避免在范围交界点的工作年限归类错误)\n",
107 |     "    # 若未输入工作年限,则默认为0年(NOTE(review): workyear 来自 float(input()),不会为 None,下方 None 分支实际不会触发——待确认)\n",
108 | " if workyear==None:\n",
109 | " workyear=0\n",
110 | " elif workyear>0 and workyear<1:\n",
111 | " workyear=0.5\n",
112 | " elif workyear>=1 and workyear<3:\n",
113 | " workyear=2\n",
114 | " elif workyear>=3 and workyear<5:\n",
115 | " workyear=4\n",
116 | " elif workyear>=5 and workyear<10:\n",
117 | " workyear=7.5\n",
118 | " elif workyear>=10:\n",
119 | " workyear=10\n",
120 | " else:\n",
121 | " workyear=workyear\n",
122 | " \n",
123 | " newinput=[companysize,workyear,degree]\n",
124 | " \n",
125 | " # 建立训练集\n",
126 | " data_x=data[data.classified_zw==position].loc[:,[\"companysize_mean\",\"workyear_mean\",\"degree_num\"]]\n",
127 | " data_y=data[data.classified_zw==position].loc[:,[\"salary_mean\"]]\n",
128 | "\n",
129 | " np.random.seed(7)\n",
130 | " indices=np.random.permutation(len(data_x))\n",
131 | "\n",
132 | " data_x_train=np.array(data_x.iloc[indices])\n",
133 | " data_y_train=np.array(data_y.iloc[indices])\n",
134 | "\n",
135 | " data_y_train.shape=(len(data_x),)\n",
136 | " \n",
137 | " dataset=data_x_train\n",
138 | " labels=data_y_train\n",
139 | " \n",
140 | " # 若未设定k值,则默认为输入岗位的职位数量/100+1\n",
141 | " if k==None:\n",
142 | " k=int(len(data_x)/100)+1\n",
143 | " \n",
144 | " # KNN分类器(改进版)\n",
145 | " # 由于每个职位各因素与薪水相关性不同,因此引入不同职位的相关系数,放入距离加权计算中\n",
146 | " # 计算不同岗位类型下,公司规模、经验、学历与薪水的相关系数,用于后面计算k个最近点的加权距离\n",
147 | " corr=data[data.classified_zw==position][['salary_mean','companysize_mean','workyear_mean','degree_num']].corr()\n",
148 | " corr=np.array(corr)\n",
149 | " \n",
150 | " #salary_companysize=corr[1][0]\n",
151 | " #salary_workyear=corr[2][0]\n",
152 | " #salary_degree=corr[3][0]\n",
153 | " weight=np.array([corr[1][0],corr[2][0],corr[3][0]])\n",
154 | " \n",
155 | " numSamples=dataset.shape[0]\n",
156 | " \"\"\"step1: 计算待分类数据与训练集各数据点的距离(欧氏距离:距离差值平方和开根号)\"\"\"\n",
157 | " diff=np.tile(newinput,(numSamples,1)) - dataset # 凸显numpy数组的高效性——元素级的运算\n",
158 | " squaredist=(diff**2)/weight # 相关系数越大,因素越重要,表现为加权距离小,所以要除以相关系数\n",
159 | " distance = (squaredist.sum(axis=1))**0.5 # axis=1,按行累加\n",
160 | " \n",
161 | " \"\"\"step2:将距离按升序排序,并取距离最近的k个近邻点\"\"\"\n",
162 | " # 对数组distance按升序排序,返回数组排序后的值对应的索引值\n",
163 | " sortedDistance=distance.argsort() \n",
164 | " \n",
165 | " # 定义一个空字典,存放k个近邻点的分类计数\n",
166 | " classCount={}\n",
167 | " \n",
168 | " # 对k个近邻点分类计数,多数表决法\n",
169 | " for i in range(k):\n",
170 | " # 第i个近邻点在distance数组中的索引,对应的分类\n",
171 | " votelabel=labels[sortedDistance[i]]\n",
172 | " \n",
173 | " # votelabel作为字典的key,对相同的key值累加(多数表决法)\n",
174 | " classCount[votelabel]=classCount.get(votelabel,0)+1 \n",
175 | " \n",
176 | " # 求k个近邻点的y值的均值,作为新输入x的预测输出y\n",
177 | " predict=0\n",
178 | " point_k=0\n",
179 | " for key,value in classCount.items():\n",
180 | " predict+=key*value\n",
181 | " point_k+=value\n",
182 | " predict=round(predict/point_k,2)\n",
183 | " \n",
184 | " print (\"\\n求职岗位为:\",position,\"\\n人才价格为\",predict,\"K\")\n",
185 | " \n",
186 | " return position,predict"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {},
192 | "source": [
193 | "#### 使用人才价格计算器"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 18,
199 | "metadata": {},
200 | "outputs": [
201 | {
202 | "name": "stdout",
203 | "output_type": "stream",
204 | "text": [
205 | "可选职位:java工程师、前端、数据分析师、算法工程师\n",
206 | "产品运营、测试工程师、产品经理、数据挖掘、建模工程师、爬虫工程师、产业研究员职位也可计算,但由于数据量较小,结果准确性差\n",
207 | "请输入职位:数据分析师\n",
208 | "请输入公司规模(人):60\n",
209 | "请输入求职者工作年限(年):5\n",
210 | "请输入求职者学历(0-不限,1-大专,2-本科,3-硕士,4-博士):2\n",
211 | "\n",
212 | "求职岗位为: 数据分析师 \n",
213 | "人才价格为 22.5 K\n"
214 | ]
215 | },
216 | {
217 | "data": {
218 | "text/plain": [
219 | "('数据分析师', 22.5)"
220 | ]
221 | },
222 | "execution_count": 18,
223 | "metadata": {},
224 | "output_type": "execute_result"
225 | }
226 | ],
227 | "source": [
228 | "# 载入原始数据集\n",
229 | "lagou_orgin=pd.read_csv(r\"E:\\python\\data\\lagou\\lagou2018_chuli.csv\",encoding=\"utf-8\",delimiter=\"\\t\")\n",
230 | "# 填入原始数据集后,调用人才价格计算器\n",
231 | "talent_calculator(lagou_orgin)"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": null,
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "\n"
241 | ]
242 | }
243 | ],
244 | "metadata": {
245 | "kernelspec": {
246 | "display_name": "Python 3",
247 | "language": "python",
248 | "name": "python3"
249 | },
250 | "language_info": {
251 | "codemirror_mode": {
252 | "name": "ipython",
253 | "version": 3
254 | },
255 | "file_extension": ".py",
256 | "mimetype": "text/x-python",
257 | "name": "python",
258 | "nbconvert_exporter": "python",
259 | "pygments_lexer": "ipython3",
260 | "version": "3.6.4"
261 | }
262 | },
263 | "nbformat": 4,
264 | "nbformat_minor": 2
265 | }
266 |
--------------------------------------------------------------------------------
/造个轮子-决策树(ID3).ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### 实现思路(ID3)\n",
8 | " 1. 自定义信息熵计算函数,用于计算数据集的信息熵\n",
9 | " 2. 自定义数据划分函数,用于根据指定特征的指定取值,划分数据集\n",
10 |     " 3. step2的子数据集作为输入给step1的函数,可以计算出按某指定特征的某指定取值(A=ai)划分的数据集的信息熵H(Di),同时计算按某指定特征的某指定取值(A=ai)划分的数据集的样本概率|Di|/|D|\n",
11 |     " 4. 遍历该特征各个取值,计算各取值下划分的数据集的信息熵H(Di)和样本概率|Di|/|D|,相乘,再求和得到特征A对数据集D的经验条件熵H(D|A)\n",
12 | " 5. 计算特征A对数据集的信息增益g(D,A)=H(D)-H(D|A)\n",
13 | " 6. 以此类推,计算各特征对数据集的信息增益,取信息增益最大的特征为最佳划分特征,得到树T1\n",
14 | " 7. 对T1各结点继续step3-6,选择信息增益最大的特征,继续划分数据,得到新的决策树\n",
15 | " 8. 直到信息增益小于阈值,或无特征可划分,或每个分支下的所有实例都具有相同的分类,决策树完成\n",
16 | "- **注意,ID3一直在分支,容易过拟合,因此需要对决策树剪枝,提高对测试集数据预测的性能**"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": 1,
22 | "metadata": {},
23 | "outputs": [],
24 | "source": [
25 | "import pandas as pd\n",
26 | "import numpy as np\n",
27 | "import operator\n",
28 | "from math import log\n",
29 | "\n",
30 | "\n",
31 | "\"\"\"\n",
32 | "信息熵计算函数,用于计算数据集的信息熵\n",
33 | "输入:数据集,每一行是一条数据,最后一列是各条数据集的类别\n",
34 | "输出:该数据集的信息熵\n",
35 | "思路:\n",
36 | "建立一个字典,对数据集各数据的类别计数,\n",
37 | "从而计算各类别出现频率(作为概率pi),\n",
38 | "最后调用信息熵公式计算 H(D)=-求和(pi*logpi)\n",
39 | "\"\"\"\n",
40 | "def calEntropy(dataset):\n",
41 | " n=len(dataset)\n",
42 | " labelCounts={}\n",
43 | " \n",
44 | " #对数据集各数据的类别计数\n",
45 | " for data in dataset:\n",
46 | " datalabel=data[-1] #取data最后一列,类别列\n",
47 | " if datalabel not in labelCounts.keys():\n",
48 | " labelCounts[datalabel]=0\n",
49 | " labelCounts[datalabel]+=1\n",
50 | " \n",
51 | " entropy=0.0\n",
52 | " \n",
53 | " #计算各类别出现频率(作为概率pi),调用信息熵公式计算 H(D)=-求和(pi*logpi)\n",
54 | " for key in labelCounts.keys():\n",
55 | " prob=float(labelCounts[key])/n\n",
56 | " entropy -= prob*log(prob,2)\n",
57 | " return entropy\n",
58 | "\n",
59 | "\n",
60 | "\"\"\"\n",
61 | "数据划分函数,用于根据指定特征的指定取值,划分数据集\n",
62 | "输入:数据集、特征所在列索引、特征取值\n",
63 | "输出:满足指定特征等于指定取值的数据子集\n",
64 | "\"\"\"\n",
65 | "def splitDataset(dataset,index,value):\n",
66 | " subDataset=[]\n",
67 | " for data in dataset:\n",
68 | " if data[index]==value:\n",
69 | " #抽取除了data[index]的内容(一个特征用于计算其对数据集的经验条件熵时,不需要此特征在子数据集中)\n",
70 | " splitData=data[:index] #取索引之前的元素\n",
71 | " splitData.extend(data[index+1:]) #再合并索引之后的元素\n",
72 | " subDataset.append(splitData)\n",
73 | " return subDataset\n",
74 | "\n",
75 | "\n",
76 | "\"\"\"\n",
77 | "选择信息增益最大的特征作为数据集划分特征\n",
78 | "输入:数据集\n",
79 | "输出:该数据集的最佳划分特征\n",
80 | "\"\"\"\n",
81 | "def chooseFeature(dataset):\n",
82 | " #初始化\n",
83 | " numFeature=len(dataset[0])-1 #因为最后一列是类别\n",
84 | " baseEntropy=calEntropy(dataset) #H(D)\n",
85 | " bestInfoGain=0.0\n",
86 | " bestFeatureIndex=-1\n",
87 | " \n",
88 | " #创建特征A各取值a的列表\n",
89 | " for i in range(numFeature):\n",
90 | " featureList=[data[i] for data in dataset]\n",
91 | " uniqueValue=set(featureList)\n",
92 | " empEntropy=0.0 #初始化特征A对数据集D的经验条件熵H(D|A)\n",
93 | " \n",
94 | " #计算特征A各取值a的信息熵H(Di)和样本概率|Di|/|D|,并相乘\n",
95 | " for value in uniqueValue:\n",
96 | " subDataset=splitDataset(dataset,i,value) #(列索引为i的特征)特征A取value值所划分的子数据集\n",
97 | " prob=len(subDataset)/float(len(dataset)) #计算|Di|/|D|\n",
98 | " empEntropy += prob*calEntropy(subDataset) #H(D|A)\n",
99 | " \n",
100 | " #取信息增益最大的特征为最佳划分特征\n",
101 | " infoGain=baseEntropy-empEntropy #信息增益\n",
102 | " if infoGain>bestInfoGain:\n",
103 | " bestInfoGain=infoGain\n",
104 | " bestFeatureIndex=i\n",
105 | " return bestFeatureIndex\n",
106 | "\n",
107 | "\n",
108 | "\"\"\"\n",
109 | "对数据集各数据类别进行计数排序\n",
110 | "\"\"\"\n",
111 | "def majorClass(classList):\n",
112 | " classCount={}\n",
113 | " for vote in classList:\n",
114 | " if vote not in classCount.keys():\n",
115 | " classCount[vote]=0\n",
116 | " classCount[vote]+=1\n",
117 | " \n",
118 | " #对classCount按value降序排序\n",
119 | " sortedClassCount=sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)\n",
120 | " return sortedClassCount[0][0] #返回类别最大的类别名\n",
121 | "\n",
122 | "\n",
123 | "\n",
124 | "\"\"\"\n",
125 | "主函数:递归构建决策树\n",
126 | "输入:数据集(list类型),数据集特征列表(按在数据集的位置排序)(list类型)\n",
127 | "输出:该数据集的决策树\n",
128 | "思路:【递归】\n",
129 | " 1. 若数据集属于同一类,则返回该类别,划分停止\n",
130 | " 2. 若数据集所有特征已经遍历,返回当前计数最多的类别为该结点类别,划分停止\n",
131 | " 3. 否则继续分支,调用chooseFeature()函数,选择当前数据集最优特征\n",
132 | " 4. 遍历当前最优特征各属性值,划分数据集,并递归调用自身createTree()构建子数据集的决策树\n",
133 | " 5. 完成\n",
134 | "\"\"\"\n",
135 | "def createTree(dataset,featureLabels):\n",
136 | " classList=[data[-1] for data in dataset] #取数据集各数据类别\n",
137 | " \n",
138 | " #若数据集属于同一类,则返回该类别,划分停止\n",
139 | " if classList.count(classList[0])==len(classList):\n",
140 | " return classList[0]\n",
141 | " \n",
142 | " #若数据集所有特征已经遍历,返回当前计数最多的类别为该结点类别,划分停止\n",
143 | " if len(dataset[0])==1:\n",
144 | " return majorClass(classList)\n",
145 | " \n",
146 | " #否则继续分支,调用chooseFeature()函数,选择当前数据集最优特征\n",
147 | " bestFeatureIndex=chooseFeature(dataset)\n",
148 | " bestFeature=featureLabels[bestFeatureIndex]\n",
149 | " \n",
150 | " #用于存储决策树,字典结构存储树的所有信息,并可体现包含关系\n",
151 | " desitionTree={bestFeature:{}} \n",
152 | " del(featureLabels[bestFeatureIndex]) #删除已被用于划分数据的特征\n",
153 | " \n",
154 | " #得到当前最优划分特征的各属性值\n",
155 | " featureValues=[data[bestFeatureIndex] for data in dataset]\n",
156 | " uniqueValues=set(featureValues)\n",
157 | " \n",
158 | " #遍历当前最优特征各属性值,划分数据集,并递归调用自身createTree()构建子数据集的决策树\n",
159 | " for value in uniqueValues:\n",
160 | " #得到已删除当前最优划分特征的特征列表,用于递归调用\n",
161 | " subFeatureLabels=featureLabels[:] \n",
162 | " \n",
163 | " #用当前最优划分特征的指定值分割子数据集,用于递归调用\n",
164 | " subData=splitDataset(dataset,bestFeatureIndex,value) \n",
165 | " desitionTree[bestFeature][value]=createTree(subData,subFeatureLabels)\n",
166 | " return desitionTree"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {},
172 | "source": [
173 | "#### 测试\n",
174 | "- 西瓜分类数据集测试"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 2,
180 | "metadata": {},
181 | "outputs": [
182 | {
183 | "data": {
184 | "text/plain": [
185 | "(17, 7)"
186 | ]
187 | },
188 | "execution_count": 2,
189 | "metadata": {},
190 | "output_type": "execute_result"
191 | }
192 | ],
193 | "source": [
194 | "watermalon=pd.read_csv(r\"D:\\python\\data\\watermalon.txt\",sep=\"\\t\")\n",
195 | "watermalon.shape"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 5,
201 | "metadata": {},
202 | "outputs": [
203 | {
204 | "data": {
205 | "text/plain": [
206 | "[['青绿', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'],\n",
207 | " ['乌黑', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '是'],\n",
208 | " ['乌黑', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'],\n",
209 | " ['青绿', '蜷缩', '沉闷', '清晰', '凹陷', '硬滑', '是'],\n",
210 | " ['浅白', '蜷缩', '浊响', '清晰', '凹陷', '硬滑', '是'],\n",
211 | " ['青绿', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '是'],\n",
212 | " ['乌黑', '稍蜷', '浊响', '稍糊', '稍凹', '软粘', '是'],\n",
213 | " ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '硬滑', '是'],\n",
214 | " ['乌黑', '稍蜷', '沉闷', '稍糊', '稍凹', '硬滑', '否'],\n",
215 | " ['青绿', '硬挺', '清脆', '清晰', '平坦', '软粘', '否'],\n",
216 | " ['浅白', '硬挺', '清脆', '模糊', '平坦', '硬滑', '否'],\n",
217 | " ['浅白', '蜷缩', '浊响', '模糊', '平坦', '软粘', '否'],\n",
218 | " ['青绿', '稍蜷', '浊响', '稍糊', '凹陷', '硬滑', '否'],\n",
219 | " ['浅白', '稍蜷', '沉闷', '稍糊', '凹陷', '硬滑', '否'],\n",
220 | " ['乌黑', '稍蜷', '浊响', '清晰', '稍凹', '软粘', '否'],\n",
221 | " ['浅白', '蜷缩', '浊响', '模糊', '平坦', '硬滑', '否'],\n",
222 | " ['青绿', '蜷缩', '沉闷', '稍糊', '稍凹', '硬滑', '否']]"
223 | ]
224 | },
225 | "execution_count": 5,
226 | "metadata": {},
227 | "output_type": "execute_result"
228 | }
229 | ],
230 | "source": [
231 | "watermalon_list=np.array(watermalon).tolist() #构建数据集\n",
232 | "watermalon_list"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 6,
238 | "metadata": {},
239 | "outputs": [
240 | {
241 | "data": {
242 | "text/plain": [
243 | "{'纹理': {'模糊': '否',\n",
244 | " '清晰': {'根蒂': {'硬挺': '否',\n",
245 | " '稍蜷': {'色泽': {'乌黑': {'触感': {'硬滑': '是', '软粘': '否'}}, '青绿': '是'}},\n",
246 | " '蜷缩': '是'}},\n",
247 | " '稍糊': {'触感': {'硬滑': '否', '软粘': '是'}}}}"
248 | ]
249 | },
250 | "execution_count": 6,
251 | "metadata": {},
252 | "output_type": "execute_result"
253 | }
254 | ],
255 | "source": [
256 | "features=watermalon.columns.tolist()[0:-1] #提取特征列表\n",
257 | "my_tree=createTree(watermalon_list,features)\n",
258 | "my_tree"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": []
267 | }
268 | ],
269 | "metadata": {
270 | "kernelspec": {
271 | "display_name": "Python 3",
272 | "language": "python",
273 | "name": "python3"
274 | },
275 | "language_info": {
276 | "codemirror_mode": {
277 | "name": "ipython",
278 | "version": 3
279 | },
280 | "file_extension": ".py",
281 | "mimetype": "text/x-python",
282 | "name": "python",
283 | "nbconvert_exporter": "python",
284 | "pygments_lexer": "ipython3",
285 | "version": "3.6.4"
286 | }
287 | },
288 | "nbformat": 4,
289 | "nbformat_minor": 2
290 | }
291 |
--------------------------------------------------------------------------------
/造个轮子-Naive Bayes.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "### 自定义NB模型\n",
8 | "#### 输入\n",
9 | "- X_train,Y_train,X_test,Y_test\n",
10 |     " - 其中,X为文本特征向量化处理后(CountVectorizer)的array数组,Y为X对应的分类标签list,X的行与Y一一对应\n",
11 | " \n",
12 | "#### 输出\n",
13 | "- X_test各样本的预测分类结果Y_predict,以及分类准确率\n",
14 | "\n",
15 | "#### 过程\n",
16 | "- 利用训练集各特征词出现的频率和对应标签概率,训练NB模型各概率参数\n",
17 |     "- 求测试集各特征在训练集对应的条件概率\n",
18 |     "- 将测试集各特征在训练集对应的条件概率乘以先验概率P(Y=ck),得到测试集各样本后验概率,取后验概率最大的标签类别为该测试样本类别\n",
19 | "\n",
20 | "#### 1. 利用训练集,训练概率参数(拉普拉斯平滑)[类似mnb.fit()]\n",
21 |     "- 先验概率:P(Y=ck)\n",
22 |     "- 条件概率:P(X1=0|Y=ck),P(X1=1|Y=ck),P(X2=0|Y=ck)……\n",
23 | "\n",
24 | "#### 2. 将测试集各特征向量值带入训练的概率参数中,计算后验概率,取使后验概率最大的Y=ck为测试样本的分类[类似mnb.predict(), mnb.predict_proba()]\n",
25 | "- 测试集样本特征向量为0时,不将刚才训练的对应概率参数纳入计算\n",
26 | "- 测试集样本特征向量>=1时(即测试样本出现该特征向量的词),将刚才训练的特征向量对应的概率参数纳入计算\n",
27 | "- 分别计算垃圾邮件下和正常邮件下每个样本的后验概率,取后验概率最大的类别为样本分类\n",
28 | "\n",
29 | "#### 3. 计算分类准确率 [类似mnb.score()]"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 30,
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "\"\"\"\n",
39 | "输入:X_train,Y_train,X_test,Y_test\n",
40 | " 其中,X为文本特征向量化处理后(CountVectorizer)的array,Y为X对应的分类标签list,XY一一对应\n",
41 | "输出:X_test各样本的预测分类结果Y_predict,分类准确率\n",
42 | " 其中,0-正常邮件,1-垃圾邮件\n",
43 | "\"\"\"\n",
44 | "def wheel_nb(X_train,Y_train,X_test,Y_test):\n",
45 | " import pandas as pd\n",
46 | " import numpy as np\n",
47 | " import re\n",
48 | " \n",
49 | " #先将训练集的内容和标签合为一个dataframe\n",
50 | " d={\"content\":X_train.tolist(),\"label\":Y_train}\n",
51 | " emails_train=pd.DataFrame(data=d)\n",
52 | "\n",
53 | " #将正常邮件(Y=0)和垃圾邮件(Y=1)分为两个子集\n",
54 | " normal=emails_train[emails_train.label==0]\n",
55 | " normal.reset_index(inplace=True,drop=True) #重置normal索引,作用于原表,丢弃之前的索引\n",
56 | " spam=emails_train[emails_train.label==1]\n",
57 | " spam.reset_index(inplace=True,drop=True) #重置spam索引,作用于原表,丢弃之前的索引\n",
58 | "\n",
59 |     "    \"\"\"计算Y_train=0、1的先验概率(拉普拉斯平滑)\"\"\"\n",
60 | " Py0=(len(normal)+1)/(len(emails_train)+2)\n",
61 | " Py1=(len(spam)+1)/(len(emails_train)+2)\n",
62 | "\n",
63 |     "    \"\"\"计算X_train各特征向量取各特征值时的条件概率(拉普拉斯平滑)\"\"\"\n",
64 |     "    \"\"\"计算垃圾邮件中,各特征向量的条件概率\"\"\"\n",
65 | " vd=len(spam.content[0]) #特征向量的维度\n",
66 | " spam_count_dict={} #用于保存content特征向量按列累加的结果\n",
67 | " spam_count_prob={} #用于保存垃圾邮件中各特征向量出现的概率\n",
68 | "\n",
69 | " #求content各特征向量按列累加的结果,用于计算各向量在训练集中出现的概率\n",
70 | " for i in range(len(spam)):\n",
71 | " for j in range(vd):\n",
72 | " spam_count_dict[j]=spam_count_dict.get(j,0)+spam.content[i][j] #计算垃圾邮件中各特征向量出现的次数,即,求content各特征向量count按列累加的结果\n",
73 | "\n",
74 | " for j in range(vd):\n",
75 | " spam_count_prob[j]=(spam_count_dict.get(j,0)+1)/(len(spam)+2)#计算垃圾邮件中各特征向量出现的概率(拉普拉斯平滑)\n",
76 | "\n",
77 |     "    \"\"\"计算正常邮件中,各特征向量的条件概率\"\"\"\n",
78 | " normal_count_dict={} #用于保存content特征向量按列累加的结果\n",
79 | " normal_count_prob={} #用于保存正常邮件中各特征向量出现的概率\n",
80 | "\n",
81 | " #求content各特征向量按列累加的结果,用于计算各向量在训练集中出现的概率\n",
82 | " for i in range(len(normal)):\n",
83 | " for j in range(vd):\n",
84 |     "            normal_count_dict[j]=normal_count_dict.get(j,0)+normal.content[i][j] #计算正常邮件中各特征向量出现的次数,即,求content各特征向量count按列累加的结果\n",
85 | "\n",
86 | " for j in range(vd):\n",
87 |     "        normal_count_prob[j]=(normal_count_dict.get(j,0)+1)/(len(normal)+2)#计算正常邮件中各特征向量出现的概率(拉普拉斯平滑)\n",
88 | "\n",
89 | " \"\"\"计算各测试样本的后验概率\"\"\"\n",
90 | " test_classify={} #用于保存测试集各样本的后验概率 P(Y|X)=P(Y)*P(X|Y)/P(X)\n",
91 |     "    Px_spam={} #用于保存测试集各样本在垃圾邮件下的条件概率 P(X|Y)\n",
92 |     "    Px_normal={} #用于保存测试集各样本在正常邮件下的条件概率 P(X|Y)\n",
93 | "\n",
94 | " for i in range(X_test.shape[0]):\n",
95 | " for j in range(X_test.shape[1]):\n",
96 | " if X_test[i][j]!=0:\n",
97 | " Px_spam[i]=Px_spam.get(i,1)*spam_count_prob.get(j)#计算垃圾邮件下,各测试样本的后验概率\n",
98 | " Px_normal[i]=Px_normal.get(i,1)*normal_count_prob.get(j)#计算正常邮件下,各测试样本的后验概率\n",
99 | "\n",
100 | " test_classify[i]=Py0*Px_normal.get(i,0),Py1*Px_spam.get(i,0) #后验概率P(Y|X)=P(Y)*P(X|Y)/P(X)\n",
101 | "\n",
102 |     "    #比较各样本属于不同分类时(正常/垃圾)的后验概率,取后验概率大的为样本分类结果\n",
103 | " results={} #用于存放邮件判定结果\n",
104 | " for key,value in test_classify.items():\n",
105 | " if value[0]<=value[1]: #value[0]-样本为正常邮件的后验概率,value[1]-样本为垃圾邮件的后验概率\n",
106 | " results[key]=1\n",
107 | " else:\n",
108 | " results[key]=0\n",
109 | "\n",
110 | " \"\"\"计算分类准确率\"\"\"\n",
111 | " count=0 #计数,统计被正确分类的邮件数量\n",
112 | " for key,value in results.items():\n",
113 | " if value==Y_test[key]:\n",
114 | " count+=1\n",
115 | " score=count/len(Y_test)\n",
116 | " \n",
117 | " print (\"测试样本预测分类为(按索引排序):\")\n",
118 | " print (results.values(),\"\\n\")\n",
119 | " print (\"测试样本实际分类为(按索引排序):\")\n",
120 | " print (Y_test,\"\\n\")\n",
121 | " print (\"NB模型分类准确率为:{0}%\".format(score*100))\n",
122 | "\n",
123 | " return results,score"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "### 测试NB模型"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 36,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "data": {
140 | "text/html": [
141 | "
\n",
142 | "\n",
155 | "
\n",
156 | " \n",
157 | " \n",
158 | " | \n",
159 | " content | \n",
160 | " type | \n",
161 | "
\n",
162 | " \n",
163 | " \n",
164 | " \n",
165 | " | 0 | \n",
166 | " 招商银行信用卡电子账单2018年6月?-?07/13?¥1,540.00?$?0.00?¥1... | \n",
167 | " 0 | \n",
168 | "
\n",
169 | " \n",
170 | " | 1 | \n",
171 | " 密码重置邮件-来自智联招聘? | \n",
172 | " 0 | \n",
173 | "
\n",
174 | " \n",
175 | " | 2 | \n",
176 | " 信用管家消费提醒?-?尊敬的邓莎女士:?您好,感谢您选择招商银行信用卡!?¥2189?¥58... | \n",
177 | " 0 | \n",
178 | "
\n",
179 | " \n",
180 | " | 3 | \n",
181 | " Apple 提供的收据?-?收据?APPLE?ID?348708632@qq.com付款信息... | \n",
182 | " 0 | \n",
183 | "
\n",
184 | " \n",
185 | " | 4 | \n",
186 | " 信用管家消费提醒?-?尊敬的邓莎女士:?您好,感谢您选择招商银行信用卡!?¥1540?¥64... | \n",
187 | " 0 | \n",
188 | "
\n",
189 | " \n",
190 | " | 5 | \n",
191 | " 6月20日徐晨阳《硅谷创新机制解密》报告?-?各位校友:?通知请见:https://www.... | \n",
192 | " 0 | \n",
193 | "
\n",
194 | " \n",
195 | " | 6 | \n",
196 | " 中国科学技术大学六十周年校庆纪念活动 校友邀请函?-??尊敬的校友:?您好!红专并进一甲子,... | \n",
197 | " 0 | \n",
198 | "
\n",
199 | " \n",
200 | " | 7 | \n",
201 | " 少女心晒一“夏”,ELLE Club等你解锁夏季最潮玩法!(?-?如果您不能正常浏览此邮件,... | \n",
202 | " 1 | \n",
203 | "
\n",
204 | " \n",
205 | " | 8 | \n",
206 | " 网上购票系统--用户支付通知?-??尊敬的?邓女士:?您好!?您于2018年06月04日在中... | \n",
207 | " 0 | \n",
208 | "
\n",
209 | " \n",
210 | "
\n",
211 | "
"
212 | ],
213 | "text/plain": [
214 | " content type\n",
215 | "0 招商银行信用卡电子账单2018年6月?-?07/13?¥1,540.00?$?0.00?¥1... 0\n",
216 | "1 密码重置邮件-来自智联招聘? 0\n",
217 | "2 信用管家消费提醒?-?尊敬的邓莎女士:?您好,感谢您选择招商银行信用卡!?¥2189?¥58... 0\n",
218 | "3 Apple 提供的收据?-?收据?APPLE?ID?348708632@qq.com付款信息... 0\n",
219 | "4 信用管家消费提醒?-?尊敬的邓莎女士:?您好,感谢您选择招商银行信用卡!?¥1540?¥64... 0\n",
220 | "5 6月20日徐晨阳《硅谷创新机制解密》报告?-?各位校友:?通知请见:https://www.... 0\n",
221 | "6 中国科学技术大学六十周年校庆纪念活动 校友邀请函?-??尊敬的校友:?您好!红专并进一甲子,... 0\n",
222 | "7 少女心晒一“夏”,ELLE Club等你解锁夏季最潮玩法!(?-?如果您不能正常浏览此邮件,... 1\n",
223 | "8 网上购票系统--用户支付通知?-??尊敬的?邓女士:?您好!?您于2018年06月04日在中... 0"
224 | ]
225 | },
226 | "execution_count": 36,
227 | "metadata": {},
228 | "output_type": "execute_result"
229 | }
230 | ],
231 | "source": [
232 | "#读取数据\n",
233 | "emails=pd.read_csv(r\"E:\\python\\data\\emails_spam.csv\",encoding=\"utf-8\")\n",
234 | "emails.head(9)"
235 | ]
236 | },
237 | {
238 | "cell_type": "code",
239 | "execution_count": 37,
240 | "metadata": {},
241 | "outputs": [],
242 | "source": [
243 | "#清洗数据\n",
244 | "def text_format():\n",
245 | " import jieba\n",
246 | " import re\n",
247 | " import pandas as pd\n",
248 | " \n",
249 | " print (\"待处理文本格式要求:utf-8编码格式,仅包含待处理文本,每行为一条文本\")\n",
250 | " text_path=input(\"请输入待清洗文本路径+名字:\")\n",
251 | " \n",
252 | " #加载用户自定义词典用于分词\n",
253 | " userdict_path=input(\"请输入自定义分词词典路径+名字(可不输入):\")\n",
254 | " if userdict_path !=\"\":\n",
255 | " jieba.load_userdict(userdict_path)\n",
256 | " \n",
257 | " #根据用户输入地址,读取文件\n",
258 | " with open(text_path,\"r\",encoding=\"utf-8\") as file:\n",
259 | " text=file.readlines()\n",
260 | " for i in range(len(text)):\n",
261 | " text[i]=text[i].strip()\n",
262 | " \n",
263 | " #定义一个空列表,用于存放分词后的文本,长度和text一致\n",
264 | " text_word=[[] for i in range(len(text))]\n",
265 | " \n",
266 | " splitter=re.compile(r\"\\W+|\\d+|[a-z]+\") #正则匹配,去除文本中的符号、数字、字母等非中文字符的元素\n",
267 | " for i in range(len(text)):\n",
268 | " text[i]=splitter.split(text[i].lower())\n",
269 | " text[i]=[word for word in text[i] if len(word)>1] #每条文本已经被分为一段一段的句子,每条文本此时是一个list,先去除其中字段长度小于等于1的单词\n",
270 | " for word in text[i]:\n",
271 | " text_word[i].extend(jieba.lcut(word))\n",
272 | " text_word[i]=\" \".join(text_word[i]) #为了便于TfidfVectorizer等文本向量化处理,将每条标题用元素用空格连起来\n",
273 | " \n",
274 | " return text_word"
275 | ]
276 | },
277 | {
278 | "cell_type": "code",
279 | "execution_count": 38,
280 | "metadata": {},
281 | "outputs": [
282 | {
283 | "name": "stdout",
284 | "output_type": "stream",
285 | "text": [
286 | "待处理文本格式要求:utf-8编码格式,仅包含待处理文本,每行为一条文本\n",
287 | "请输入待清洗文本路径+名字:E:\\python\\data\\emails_spam.txt\n",
288 | "请输入自定义分词词典路径+名字(可不输入):\n"
289 | ]
290 | }
291 | ],
292 | "source": [
293 | "emails_format=text_format()"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": 39,
299 | "metadata": {},
300 | "outputs": [],
301 | "source": [
302 | "from sklearn.cross_validation import train_test_split\n",
303 | "from sklearn.feature_extraction.text import CountVectorizer\n",
304 | "\n",
305 | "#建立训练集、测试集\n",
306 | "label=emails.type.tolist()\n",
307 | "X_train,X_test,Y_train,Y_test=train_test_split(emails_format,label,test_size=0.2,random_state=7)\n",
308 | "\n",
309 | "#加载并处理停用词典\n",
310 | "with open(r\"E:\\python\\data\\stopwords.txt\",\"r\",encoding=\"utf-8\") as file:\n",
311 | " stop_words=file.readlines()\n",
312 | "for i in range(len(stop_words)):\n",
313 | " stop_words[i]=stop_words[i].strip(\"\\n\")\n",
314 | " \n",
315 | "#构成词袋模型,记录各个词出现的次数\n",
316 | "cv=CountVectorizer(stop_words=stop_words)\n",
317 | "X_train_count=cv.fit_transform(X_train)\n",
318 | "X_test_count=cv.transform(X_test)"
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {},
324 | "source": [
325 | "#### 将数据带入NB模型进行测试"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 40,
331 | "metadata": {},
332 | "outputs": [
333 | {
334 | "name": "stdout",
335 | "output_type": "stream",
336 | "text": [
337 | "测试样本预测分类为(按索引排序):\n",
338 | "dict_values([0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]) \n",
339 | "\n",
340 | "测试样本实际分类为(按索引排序):\n",
341 | "[0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0] \n",
342 | "\n",
343 | "NB模型分类准确率为:86.66666666666667%\n"
344 | ]
345 | },
346 | {
347 | "data": {
348 | "text/plain": [
349 | "({0: 0,\n",
350 | " 1: 0,\n",
351 | " 2: 0,\n",
352 | " 3: 1,\n",
353 | " 4: 0,\n",
354 | " 5: 1,\n",
355 | " 6: 0,\n",
356 | " 7: 0,\n",
357 | " 8: 0,\n",
358 | " 9: 0,\n",
359 | " 10: 0,\n",
360 | " 11: 0,\n",
361 | " 12: 0,\n",
362 | " 13: 0,\n",
363 | " 14: 0},\n",
364 | " 0.8666666666666667)"
365 | ]
366 | },
367 | "execution_count": 40,
368 | "metadata": {},
369 | "output_type": "execute_result"
370 | }
371 | ],
372 | "source": [
373 | "#将数据带入NB模型进行测试\n",
374 | "wheel_nb(X_train_count.toarray(),Y_train,X_test_count.toarray(),Y_test)"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": null,
380 | "metadata": {},
381 | "outputs": [],
382 | "source": []
383 | }
384 | ],
385 | "metadata": {
386 | "kernelspec": {
387 | "display_name": "Python 3",
388 | "language": "python",
389 | "name": "python3"
390 | },
391 | "language_info": {
392 | "codemirror_mode": {
393 | "name": "ipython",
394 | "version": 3
395 | },
396 | "file_extension": ".py",
397 | "mimetype": "text/x-python",
398 | "name": "python",
399 | "nbconvert_exporter": "python",
400 | "pygments_lexer": "ipython3",
401 | "version": "3.6.4"
402 | }
403 | },
404 | "nbformat": 4,
405 | "nbformat_minor": 2
406 | }
407 |
--------------------------------------------------------------------------------
/造个轮子-KNN.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "## 二、KNN的自定义函数实现\n",
8 | " - 算法实现: (小数据量,线性扫描)\n",
9 | " - https://www.cnblogs.com/hemiy/p/6155425.html\n",
10 | " 1. 输入x与训练集各点的距离distance\n",
11 | " 2. 按distance排序,取distance最近的k个点(k为用户输入)\n",
12 | " 3. 对k个点的类归类计数,x归为多数类(多数表决)\n",
13 | " 4. or 对k个点按1/square(distance)权重归类计数,x归为计数大的类(加权表决)\n",
14 | "\n",
15 | " - 对于大数据量,线性扫描效率极低,于是采用kd树储存训练集,通过搜索kd树的方法寻找输入的近邻,将输入归类(算法如何实现?自定义函数2)"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": null,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "import pandas as pd\n",
25 | "import numpy as np\n",
26 | "import operator"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "### 自定义KNN分类器\n",
34 | " - newInput: 新输入的待分类数据(x_test),**本分类器一次只能对一个新输入分类**\n",
35 | " - dataset:输入的训练数据集(x_train),array类型,**每一行为一个输入训练集**\n",
36 | " - labels:输入训练集对应的类别标签(y_train),**格式为['A','B']而不是[['A'],['B']]**\n",
37 | " - k:近邻数\n",
38 | " - weight:决策规则,\"uniform\" 多数表决法,\"distance\" 距离加权表决法"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": [
def KNNClassify(newInput, dataset, labels, k, weight):
    """Classify one sample with a k-nearest-neighbour vote.

    Parameters
    ----------
    newInput : 1-D array
        The single sample to classify (x_test); one call classifies one sample.
    dataset : 2-D array
        Training samples (x_train), one row per sample.
    labels : 1-D array
        Class labels aligned with ``dataset`` rows (shape ``(m,)``, e.g.
        ``['A', 'B']`` rather than ``[['A'], ['B']]``).
    k : int
        Number of nearest neighbours to consult.
    weight : str
        Decision rule: ``"uniform"`` for majority vote, ``"distance"`` for
        votes weighted by 1/distance.

    Returns
    -------
    The winning class label.

    Raises
    ------
    ValueError
        If ``weight`` is neither ``"uniform"`` nor ``"distance"``.
    """
    # Validate the decision rule up front. The previous version printed a
    # message and broke out of the vote loop, after which indexing the empty
    # vote dict raised an uninformative IndexError.
    if weight not in ("uniform", "distance"):
        raise ValueError("分类决策规则错误!\"uniform\"多数表决法\"distance\"距离加权表决法")

    numSamples = dataset.shape[0]

    # Step 1: Euclidean distance from the new sample to every training row
    # (vectorised: element-wise difference, square, row-sum, square root).
    diff = np.tile(newInput, (numSamples, 1)) - dataset
    distance = ((diff ** 2).sum(axis=1)) ** 0.5  # axis=1: sum across features

    # Step 2: indices of training rows ordered by increasing distance.
    sortedDistance = distance.argsort()

    # Step 3: accumulate votes from the k nearest neighbours.
    classCount = {}
    for i in range(k):
        idx = sortedDistance[i]
        votelabel = labels[idx]
        if weight == "uniform":
            # Plain majority vote: each neighbour contributes 1.
            classCount[votelabel] = classCount.get(votelabel, 0) + 1
        else:  # weight == "distance"
            d = distance[idx]
            if d == 0:
                # The query coincides with a training sample; its label wins
                # outright.  This avoids the division-by-zero / infinite
                # weight the previous version produced.
                print("新输入的类别是:", votelabel)
                return votelabel
            # Closer neighbours get proportionally larger votes.
            classCount[votelabel] = classCount.get(votelabel, 0) + 1 / d

    # Step 4: the class with the largest (possibly weighted) vote wins.
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    if weight == "uniform":
        print("新输入到训练集的最近%d个点的计数为:" % k, "\n", classCount)
    else:
        print("新输入到训练集的最近%d个点的距离加权计数为:" % k, "\n", classCount)
    print("新输入的类别是:", sortedClassCount[0][0])
    return sortedClassCount[0][0]
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "#### 鸢尾花数据集分类测试"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": null,
106 | "metadata": {},
107 | "outputs": [
108 | {
109 | "data": {
110 | "text/html": [
111 | "\n",
112 | "\n",
125 | "
\n",
126 | " \n",
127 | " \n",
128 | " | \n",
129 | " sepallength | \n",
130 | " sepalwidth | \n",
131 | " petallength | \n",
132 | " petalwidth | \n",
133 | " species | \n",
134 | "
\n",
135 | " \n",
136 | " \n",
137 | " \n",
138 | " | 0 | \n",
139 | " 5.1 | \n",
140 | " 3.5 | \n",
141 | " 1.4 | \n",
142 | " 0.2 | \n",
143 | " Iris-setosa | \n",
144 | "
\n",
145 | " \n",
146 | " | 1 | \n",
147 | " 4.9 | \n",
148 | " 3.0 | \n",
149 | " 1.4 | \n",
150 | " 0.2 | \n",
151 | " Iris-setosa | \n",
152 | "
\n",
153 | " \n",
154 | " | 2 | \n",
155 | " 4.7 | \n",
156 | " 3.2 | \n",
157 | " 1.3 | \n",
158 | " 0.2 | \n",
159 | " Iris-setosa | \n",
160 | "
\n",
161 | " \n",
162 | " | 3 | \n",
163 | " 4.6 | \n",
164 | " 3.1 | \n",
165 | " 1.5 | \n",
166 | " 0.2 | \n",
167 | " Iris-setosa | \n",
168 | "
\n",
169 | " \n",
170 | " | 4 | \n",
171 | " 5.0 | \n",
172 | " 3.6 | \n",
173 | " 1.4 | \n",
174 | " 0.2 | \n",
175 | " Iris-setosa | \n",
176 | "
\n",
177 | " \n",
178 | "
\n",
179 | "
"
180 | ],
181 | "text/plain": [
182 | " sepallength sepalwidth petallength petalwidth species\n",
183 | "0 5.1 3.5 1.4 0.2 Iris-setosa\n",
184 | "1 4.9 3.0 1.4 0.2 Iris-setosa\n",
185 | "2 4.7 3.2 1.3 0.2 Iris-setosa\n",
186 | "3 4.6 3.1 1.5 0.2 Iris-setosa\n",
187 | "4 5.0 3.6 1.4 0.2 Iris-setosa"
188 | ]
189 | },
190 | "execution_count": 137,
191 | "metadata": {},
192 | "output_type": "execute_result"
193 | }
194 | ],
195 | "source": [
196 | "iris=pd.read_csv(\"E:\\python\\practice\\iris.txt\")\n",
197 | "iris.head()"
198 | ]
199 | },
200 | {
201 | "cell_type": "markdown",
202 | "metadata": {},
203 | "source": [
204 | "### 建立训练集、测试集\n",
205 | " - 注意训练集x、y的格式"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": null,
211 | "metadata": {},
212 | "outputs": [
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "[5.1 3.8 1.5 0.3]\n",
218 | "['Iris-setosa']\n"
219 | ]
220 | },
221 | {
222 | "data": {
223 | "text/plain": [
224 | "array(['Iris-virginica', 'Iris-versicolor', 'Iris-setosa',\n",
225 | " 'Iris-versicolor', 'Iris-virginica', 'Iris-setosa',\n",
226 | " 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',\n",
227 | " 'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',\n",
228 | " 'Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-versicolor',\n",
229 | " 'Iris-virginica', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',\n",
230 | " 'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor',\n",
231 | " 'Iris-virginica', 'Iris-virginica', 'Iris-virginica',\n",
232 | " 'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',\n",
233 | " 'Iris-virginica', 'Iris-virginica', 'Iris-versicolor',\n",
234 | " 'Iris-setosa', 'Iris-virginica', 'Iris-versicolor', 'Iris-setosa',\n",
235 | " 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',\n",
236 | " 'Iris-virginica', 'Iris-versicolor', 'Iris-virginica',\n",
237 | " 'Iris-virginica', 'Iris-versicolor', 'Iris-setosa',\n",
238 | " 'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',\n",
239 | " 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',\n",
240 | " 'Iris-setosa', 'Iris-virginica', 'Iris-versicolor',\n",
241 | " 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',\n",
242 | " 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica',\n",
243 | " 'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',\n",
244 | " 'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',\n",
245 | " 'Iris-virginica', 'Iris-setosa', 'Iris-virginica', 'Iris-setosa',\n",
246 | " 'Iris-versicolor', 'Iris-virginica', 'Iris-versicolor',\n",
247 | " 'Iris-setosa', 'Iris-versicolor', 'Iris-setosa', 'Iris-virginica',\n",
248 | " 'Iris-virginica', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',\n",
249 | " 'Iris-versicolor', 'Iris-virginica', 'Iris-setosa',\n",
250 | " 'Iris-virginica', 'Iris-virginica', 'Iris-versicolor',\n",
251 | " 'Iris-setosa', 'Iris-versicolor', 'Iris-setosa', 'Iris-virginica',\n",
252 | " 'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',\n",
253 | " 'Iris-versicolor', 'Iris-virginica', 'Iris-virginica',\n",
254 | " 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa', 'Iris-virginica',\n",
255 | " 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor', 'Iris-virginica',\n",
256 | " 'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',\n",
257 | " 'Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa',\n",
258 | " 'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',\n",
259 | " 'Iris-setosa', 'Iris-versicolor', 'Iris-versicolor',\n",
260 | " 'Iris-virginica', 'Iris-virginica', 'Iris-versicolor',\n",
261 | " 'Iris-virginica', 'Iris-setosa', 'Iris-versicolor',\n",
262 | " 'Iris-versicolor'], dtype=object)"
263 | ]
264 | },
265 | "execution_count": 138,
266 | "metadata": {},
267 | "output_type": "execute_result"
268 | }
269 | ],
270 | "source": [
271 | "iris_x=iris.iloc[:,[0,1,2,3]]\n",
272 | "iris_y=iris.iloc[:,[4]]\n",
273 | "\n",
274 | "np.random.seed(7)\n",
275 | "indices=np.random.permutation(len(iris_x))\n",
276 | "\n",
277 | "iris_x_train=iris_x.iloc[indices[0:130]]\n",
278 | "iris_y_train=iris_y.iloc[indices[0:130]]\n",
279 | "\n",
280 | "iris_x_test=iris_x.iloc[indices[130:150]]\n",
281 | "iris_y_test=iris_y.iloc[indices[130:150]]\n",
282 | "\n",
283 | "# 将dataframe格式的数据转换为numpy array格式,便于调用函数计算\n",
284 | "iris_x_train=np.array(iris_x_train)\n",
285 | "iris_y_train=np.array(iris_y_train)\n",
286 | "\n",
287 | "iris_x_test=np.array(iris_x_test)\n",
288 | "iris_y_test=np.array(iris_y_test) \n",
289 | "\n",
290 | "print (iris_x_test[1])\n",
291 | "print (iris_y_test[1])\n",
292 | "\n",
293 | "\"\"\"运行错误测试:\n",
294 | "dis=(((np.tile(iris_x_test[1],(130,1))-iris_x_train)**2).sum(axis=1))**0.5\n",
295 | "sortdis=dis.argsort()\n",
296 | "cc={}\n",
297 | "for i in range(10):\n",
298 | " votel=iris_y_train[sortdis[i]]\n",
299 | " cc[votel]=cc.get(votel,0)+1\n",
300 | "\n",
301 | "sortedcc=sorted(cc,key=operator.itemgetter(1),reversed=True)\n",
302 | "sortedcc[0][0]\"\"\"\n",
303 | "\n",
304 | "# 将labels的形状设置为(130,)\n",
305 | "iris_y_train.shape=(130,)\n",
306 | "iris_y_train"
307 | ]
308 | },
309 | {
310 | "cell_type": "markdown",
311 | "metadata": {},
312 | "source": [
313 | "### 将训练集、测试集带入自定义KNN分类器进行分类"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": 139,
319 | "metadata": {},
320 | "outputs": [
321 | {
322 | "name": "stdout",
323 | "output_type": "stream",
324 | "text": [
325 | "新输入到训练集的最近20个点的距离加权计数为: \n",
326 | " {'Iris-versicolor': 45.596003202769246}\n",
327 | "新输入的类别是: Iris-versicolor\n",
328 | "Iris-versicolor\n",
329 | "新输入的实际类别是: ['Iris-versicolor']\n",
330 | "\n",
331 | "\n",
332 | "预测准确!\n"
333 | ]
334 | }
335 | ],
336 | "source": [
337 | "# 将训练集、测试集带入自定义KNN分类器进行分类\n",
338 | "test_index=12\n",
339 | "predict=KNNClassify(iris_x_test[test_index],iris_x_train,iris_y_train,20,\"distance\")\n",
340 | "print (predict)\n",
341 | "print (\"新输入的实际类别是:\", iris_y_test[test_index])\n",
342 | "print (\"\\n\")\n",
343 | "\n",
344 | "if predict==iris_y_test[test_index]:\n",
345 | " print (\"预测准确!\")\n",
346 | "else:\n",
347 | " print (\"预测错误!\")"
348 | ]
349 | },
350 | {
351 | "cell_type": "markdown",
352 | "metadata": {},
353 | "source": [
354 | "#### 另一组简单的测试数据分类"
355 | ]
356 | },
357 | {
358 | "cell_type": "code",
359 | "execution_count": 140,
360 | "metadata": {},
361 | "outputs": [
362 | {
363 | "name": "stdout",
364 | "output_type": "stream",
365 | "text": [
366 | "新输入到训练集的最近4个点的距离加权计数为: \n",
367 | " {'A': 9.472135954999581, 'B': 1.4018812887604746}\n",
368 | "新输入的类别是: A\n"
369 | ]
370 | },
371 | {
372 | "data": {
373 | "text/plain": [
374 | "'A'"
375 | ]
376 | },
377 | "execution_count": 140,
378 | "metadata": {},
379 | "output_type": "execute_result"
380 | }
381 | ],
382 | "source": [
383 | "# 另一组简单的测试数据分类\n",
384 | "group = np.array([[1.0, 0.9], [1.0, 1.0], [0.1, 0.2], [0.0, 0.1]])\n",
385 | "labels = np.array(['A', 'A', 'B', 'B'])\n",
386 | "testX = np.array([1.2, 1.0])\n",
387 | "\n",
388 | "KNNClassify(testX,group,labels,4,\"distance\")"
389 | ]
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 141,
394 | "metadata": {},
395 | "outputs": [
396 | {
397 | "data": {
398 | "text/plain": [
399 | "(4,)"
400 | ]
401 | },
402 | "execution_count": 141,
403 | "metadata": {},
404 | "output_type": "execute_result"
405 | }
406 | ],
407 | "source": [
408 | "labels = np.array(['A', 'A', 'B', 'B'])\n",
409 | "labels.shape"
410 | ]
411 | },
412 | {
413 | "cell_type": "code",
414 | "execution_count": 142,
415 | "metadata": {},
416 | "outputs": [
417 | {
418 | "data": {
419 | "text/plain": [
420 | "(130,)"
421 | ]
422 | },
423 | "execution_count": 142,
424 | "metadata": {},
425 | "output_type": "execute_result"
426 | }
427 | ],
428 | "source": [
429 | "iris_y_train.shape"
430 | ]
431 | }
432 | ],
433 | "metadata": {
434 | "kernelspec": {
435 | "display_name": "Python 3",
436 | "language": "python",
437 | "name": "python3"
438 | },
439 | "language_info": {
440 | "codemirror_mode": {
441 | "name": "ipython",
442 | "version": 3
443 | },
444 | "file_extension": ".py",
445 | "mimetype": "text/x-python",
446 | "name": "python",
447 | "nbconvert_exporter": "python",
448 | "pygments_lexer": "ipython3",
449 | "version": "3.6.4"
450 | }
451 | },
452 | "nbformat": 4,
453 | "nbformat_minor": 2
454 | }
455 |
--------------------------------------------------------------------------------
/AdaBoost.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "from numpy import *\n",
11 | "import time"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "### 自定义函数实现AdaBoost\n",
19 | "- 包含函数\n",
20 | " 1. stumpClassify() 通过阈值对数据分类\n",
21 | " 2. buildStump() 生成单层决策树,需要调用stumpClassify()\n",
22 | " 3. adaBoostTrainDS() 训练出多个弱决策树分类器,需要调用buildStump() \n",
23 | " 4. adaClassify() 利用训练好的弱分类器分类数据\n",
24 | " 5. loadDataSet() 加载数据集"
25 | ]
26 | },
27 | {
28 | "cell_type": "markdown",
29 | "metadata": {},
30 | "source": [
31 | "#### 1. 构建单层决策树\n",
32 | "- 遍历数据集的每个特征:\n",
33 | " - 遍历特征的每个步长:\n",
34 | " - 遍历步长的每个阈值对比方式:\n",
35 | " - 计算每次迭代的weightedError\n",
36 | "- 认为weightedError最小的点(特征,阈值,方式)是最佳决策点,以此构建一棵决策树桩(stump)"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 4,
42 | "metadata": {},
43 | "outputs": [],
44 | "source": [
# Threshold-based weak classifier: label each sample +1.0 or -1.0.
def stumpClassify(dataMatrix, dimen, threshVal, threshIneq):
    """Label samples +1/-1 by comparing one feature column to a threshold.

    dataMatrix : matrix/array of samples, one row per sample.
    dimen      : column index of the feature being tested.
    threshVal  : threshold to compare the feature against.
    threshIneq : 'lt' ("less than") marks values <= threshVal as -1.0;
                 any other value ("greater than" mode) marks values
                 > threshVal as -1.0.  Everything else keeps +1.0.
    """
    # Start with every sample labelled +1.0, then flip the side that
    # falls on the negative half of the threshold.
    predictions = ones((shape(dataMatrix)[0], 1))
    column = dataMatrix[:, dimen]
    if threshIneq == 'lt':
        predictions[column <= threshVal] = -1.0
    else:
        predictions[column > threshVal] = -1.0
    return predictions
56 | "\n",
57 | "#单层决策树生成函数\n",
58 | "#D为各样本权重,shape=(m,1);label为样本标签,shape=(1,m)\n",
59 | "def buildStump(dataArr,classLabels,D):\n",
60 | "#将数据集和标签列表转为矩阵形式\n",
61 | " dataMatrix=mat(dataArr);labelMat=mat(classLabels).T\n",
62 | " m,n=shape(dataMatrix)\n",
63 | " #步长或区间总数 最优决策树信息 最优单层决策树预测结果\n",
64 | " numSteps=10.0;bestStump={};bestClasEst=mat(zeros((m,1))) #注意,有两个()\n",
65 | " #最小错误率初始化为+∞\n",
66 | " minError=inf\n",
67 | " \n",
68 | " #遍历数据集的每个特征:遍历特征的每个步长:遍历步长的每个阈值对比方式\n",
69 | " for i in range(n):\n",
70 | " #找出列中特征值的最小值和最大值\n",
71 | " rangeMin=dataMatrix[:,i].min();rangeMax=dataMatrix[:,i].max()\n",
72 | " #求取步长大小或者说区间间隔\n",
73 | " stepSize=(rangeMax-rangeMin)/numSteps\n",
74 | " #遍历各个步长区间\n",
75 | " for j in range(-1,int(numSteps)+1):\n",
76 | " #两种阈值过滤模式\n",
77 | " for inequal in ['lt','gt']:\n",
78 | " threshVal=rangeMin+float(j)*stepSize\n",
79 | " #选定阈值后,调用阈值过滤函数分类预测\n",
80 | " predictedVals=\\\n",
81 | " stumpClassify(dataMatrix,i,threshVal,inequal)\n",
82 | " #初始化错误向量\n",
83 | " errArr=mat(ones((m,1)))\n",
84 | " #将错误向量中分类正确项置0\n",
85 | " errArr[predictedVals==labelMat]=0\n",
86 | " #计算\"加权\"的错误率\n",
87 | " weigthedError=D.T*errArr\n",
88 | " #print (\"分割特征为第{0}个,分割阈值为{1},分割方式为{2},weight error为{3}\"\\\n",
89 | " # .format(i+1,threshVal,threshIneq,weightedErr))\n",
90 | " if weigthedError阈值时认为是+1.0类\n",
454 | " 将该阈值样本的假设类别-1.0与该阈值样本的真实类别classLabels[index]对比\n",
455 | " 若真实类别为1.0,则拉低真阳率(漏报率);若真实类别为-1.0,则拉低假阳率(误报率)\n",
456 | " \"\"\"\n",
457 | " for index in sortedIndex.tolist()[0]:\n",
458 | " #若判别为最小的样本真实为正例(说明正例被错判,FN),则减小Y值(TP/(TP+FN))\n",
459 | " if classLabels[index]==1.0: \n",
460 | " delX=0.0;delY=yStep\n",
461 | " #若判别为最小的样本真实为反例(说明反例被对判,TN),则减小X值(FP/(FP+TN))\n",
462 | " else:\n",
463 | " delX=xStep;delY=0.0\n",
464 | " ySum+=cur[1] #x每移动一个xStep,ySum就叠加一个当时的y值,用于计算曲线下矩形面积\n",
465 | " \n",
466 | " ax.plot([cur[0],cur[0]-delX],[cur[1],cur[1]-delY],c='r') #绘制ROC曲线\n",
467 | " cur=(cur[0]-delX,cur[1]-delY) #更新光标位置\n",
468 | " \n",
469 | " ax.plot([0,1],[0,1],'b--') #[0,1],[0,1]表示x,y值的一一对应关系,即(0,0)(1,1)两个点,并用线连接起来\n",
470 | " plt.xlabel(\"假阳率\",fontsize=16,fontweight=\"bold\")\n",
471 | " plt.ylabel(\"真阳率\",fontsize=16,fontweight=\"bold\")\n",
472 | " plt.title(\"AdaBoost疝病马预测ROC曲线\",fontsize=20,fontweight=\"bold\")\n",
473 | " ax.axis([0,1,0,1])\n",
474 | " plt.show()\n",
475 | " print(\"ROC曲线的AUC为:\",ySum*xStep)"
476 | ]
477 | },
478 | {
479 | "cell_type": "code",
480 | "execution_count": 21,
481 | "metadata": {},
482 | "outputs": [],
483 | "source": [
484 | "dataArr,labelArr=loadDataSet(r'D:\\DM\\python\\data\\MLiA_SourceCode\\machinelearninginaction\\Ch07\\horseColicTraining2.txt')\n",
485 | "classifierArr,trainErrorRate,aggClassEst=adaBoostTrainDS(dataArr,labelArr,50)"
486 | ]
487 | },
488 | {
489 | "cell_type": "code",
490 | "execution_count": 59,
491 | "metadata": {
492 | "collapsed": true
493 | },
494 | "outputs": [
495 | {
496 | "data": {
497 | "text/plain": [
498 | "[176,\n",
499 | " 35,\n",
500 | " 40,\n",
501 | " 192,\n",
502 | " 272,\n",
503 | " 49,\n",
504 | " 101,\n",
505 | " 16,\n",
506 | " 83,\n",
507 | " 181,\n",
508 | " 225,\n",
509 | " 44,\n",
510 | " 230,\n",
511 | " 172,\n",
512 | " 256,\n",
513 | " 170,\n",
514 | " 59,\n",
515 | " 179,\n",
516 | " 97,\n",
517 | " 168,\n",
518 | " 227,\n",
519 | " 244,\n",
520 | " 285,\n",
521 | " 296,\n",
522 | " 150,\n",
523 | " 239,\n",
524 | " 141,\n",
525 | " 295,\n",
526 | " 121,\n",
527 | " 45,\n",
528 | " 291,\n",
529 | " 46,\n",
530 | " 173,\n",
531 | " 43,\n",
532 | " 234,\n",
533 | " 73,\n",
534 | " 245,\n",
535 | " 246,\n",
536 | " 210,\n",
537 | " 86,\n",
538 | " 30,\n",
539 | " 134,\n",
540 | " 159,\n",
541 | " 48,\n",
542 | " 290,\n",
543 | " 109,\n",
544 | " 113,\n",
545 | " 133,\n",
546 | " 178,\n",
547 | " 204,\n",
548 | " 128,\n",
549 | " 108,\n",
550 | " 280,\n",
551 | " 219,\n",
552 | " 99,\n",
553 | " 110,\n",
554 | " 281,\n",
555 | " 80,\n",
556 | " 58,\n",
557 | " 252,\n",
558 | " 8,\n",
559 | " 292,\n",
560 | " 171,\n",
561 | " 200,\n",
562 | " 253,\n",
563 | " 229,\n",
564 | " 262,\n",
565 | " 123,\n",
566 | " 15,\n",
567 | " 64,\n",
568 | " 273,\n",
569 | " 249,\n",
570 | " 36,\n",
571 | " 261,\n",
572 | " 62,\n",
573 | " 203,\n",
574 | " 151,\n",
575 | " 207,\n",
576 | " 216,\n",
577 | " 0,\n",
578 | " 54,\n",
579 | " 91,\n",
580 | " 184,\n",
581 | " 140,\n",
582 | " 75,\n",
583 | " 177,\n",
584 | " 221,\n",
585 | " 70,\n",
586 | " 135,\n",
587 | " 78,\n",
588 | " 104,\n",
589 | " 209,\n",
590 | " 57,\n",
591 | " 72,\n",
592 | " 271,\n",
593 | " 34,\n",
594 | " 3,\n",
595 | " 117,\n",
596 | " 195,\n",
597 | " 297,\n",
598 | " 4,\n",
599 | " 19,\n",
600 | " 294,\n",
601 | " 241,\n",
602 | " 10,\n",
603 | " 242,\n",
604 | " 162,\n",
605 | " 270,\n",
606 | " 147,\n",
607 | " 238,\n",
608 | " 143,\n",
609 | " 31,\n",
610 | " 55,\n",
611 | " 93,\n",
612 | " 126,\n",
613 | " 237,\n",
614 | " 247,\n",
615 | " 37,\n",
616 | " 254,\n",
617 | " 286,\n",
618 | " 84,\n",
619 | " 68,\n",
620 | " 282,\n",
621 | " 18,\n",
622 | " 63,\n",
623 | " 164,\n",
624 | " 287,\n",
625 | " 174,\n",
626 | " 28,\n",
627 | " 186,\n",
628 | " 278,\n",
629 | " 39,\n",
630 | " 218,\n",
631 | " 167,\n",
632 | " 25,\n",
633 | " 258,\n",
634 | " 74,\n",
635 | " 196,\n",
636 | " 263,\n",
637 | " 274,\n",
638 | " 26,\n",
639 | " 232,\n",
640 | " 251,\n",
641 | " 131,\n",
642 | " 20,\n",
643 | " 56,\n",
644 | " 118,\n",
645 | " 188,\n",
646 | " 79,\n",
647 | " 13,\n",
648 | " 226,\n",
649 | " 66,\n",
650 | " 114,\n",
651 | " 17,\n",
652 | " 215,\n",
653 | " 124,\n",
654 | " 24,\n",
655 | " 41,\n",
656 | " 190,\n",
657 | " 160,\n",
658 | " 206,\n",
659 | " 156,\n",
660 | " 130,\n",
661 | " 265,\n",
662 | " 51,\n",
663 | " 82,\n",
664 | " 250,\n",
665 | " 266,\n",
666 | " 1,\n",
667 | " 268,\n",
668 | " 154,\n",
669 | " 65,\n",
670 | " 201,\n",
671 | " 298,\n",
672 | " 42,\n",
673 | " 269,\n",
674 | " 205,\n",
675 | " 193,\n",
676 | " 211,\n",
677 | " 33,\n",
678 | " 53,\n",
679 | " 127,\n",
680 | " 163,\n",
681 | " 7,\n",
682 | " 87,\n",
683 | " 9,\n",
684 | " 243,\n",
685 | " 106,\n",
686 | " 231,\n",
687 | " 146,\n",
688 | " 275,\n",
689 | " 220,\n",
690 | " 144,\n",
691 | " 96,\n",
692 | " 105,\n",
693 | " 180,\n",
694 | " 27,\n",
695 | " 90,\n",
696 | " 14,\n",
697 | " 102,\n",
698 | " 185,\n",
699 | " 198,\n",
700 | " 138,\n",
701 | " 187,\n",
702 | " 139,\n",
703 | " 217,\n",
704 | " 119,\n",
705 | " 32,\n",
706 | " 284,\n",
707 | " 259,\n",
708 | " 189,\n",
709 | " 264,\n",
710 | " 11,\n",
711 | " 212,\n",
712 | " 81,\n",
713 | " 88,\n",
714 | " 111,\n",
715 | " 112,\n",
716 | " 228,\n",
717 | " 129,\n",
718 | " 169,\n",
719 | " 222,\n",
720 | " 6,\n",
721 | " 255,\n",
722 | " 157,\n",
723 | " 60,\n",
724 | " 267,\n",
725 | " 94,\n",
726 | " 233,\n",
727 | " 155,\n",
728 | " 136,\n",
729 | " 236,\n",
730 | " 2,\n",
731 | " 52,\n",
732 | " 50,\n",
733 | " 23,\n",
734 | " 5,\n",
735 | " 95,\n",
736 | " 276,\n",
737 | " 240,\n",
738 | " 22,\n",
739 | " 132,\n",
740 | " 166,\n",
741 | " 103,\n",
742 | " 145,\n",
743 | " 152,\n",
744 | " 92,\n",
745 | " 137,\n",
746 | " 120,\n",
747 | " 197,\n",
748 | " 148,\n",
749 | " 61,\n",
750 | " 161,\n",
751 | " 76,\n",
752 | " 12,\n",
753 | " 235,\n",
754 | " 71,\n",
755 | " 142,\n",
756 | " 38,\n",
757 | " 98,\n",
758 | " 199,\n",
759 | " 213,\n",
760 | " 288,\n",
761 | " 100,\n",
762 | " 158,\n",
763 | " 69,\n",
764 | " 279,\n",
765 | " 122,\n",
766 | " 260,\n",
767 | " 47,\n",
768 | " 224,\n",
769 | " 182,\n",
770 | " 107,\n",
771 | " 289,\n",
772 | " 115,\n",
773 | " 149,\n",
774 | " 125,\n",
775 | " 191,\n",
776 | " 223,\n",
777 | " 85,\n",
778 | " 277,\n",
779 | " 77,\n",
780 | " 67,\n",
781 | " 165,\n",
782 | " 21,\n",
783 | " 248,\n",
784 | " 183,\n",
785 | " 175,\n",
786 | " 257,\n",
787 | " 89,\n",
788 | " 214,\n",
789 | " 202,\n",
790 | " 283,\n",
791 | " 194,\n",
792 | " 208,\n",
793 | " 29,\n",
794 | " 116,\n",
795 | " 153,\n",
796 | " 293]"
797 | ]
798 | },
799 | "execution_count": 59,
800 | "metadata": {},
801 | "output_type": "execute_result"
802 | }
803 | ],
804 | "source": [
805 | "argsort(aggClassEst.T).tolist()[0]"
806 | ]
807 | },
808 | {
809 | "cell_type": "code",
810 | "execution_count": 44,
811 | "metadata": {},
812 | "outputs": [
813 | {
814 | "data": {
815 | "text/plain": [
816 | "((299, 1), (299, 1))"
817 | ]
818 | },
819 | "execution_count": 44,
820 | "metadata": {},
821 | "output_type": "execute_result"
822 | }
823 | ],
824 | "source": [
825 | "shape(aggClassEst),shape(mat(labelArr).T)"
826 | ]
827 | },
828 | {
829 | "cell_type": "code",
830 | "execution_count": 63,
831 | "metadata": {},
832 | "outputs": [
833 | {
834 | "data": {
835 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAf0AAAH5CAYAAACLXeeeAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMS4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvNQv5yAAAIABJREFUeJzs3XecVNX5x/HPw8ICgiIoYhBLRNRgrKCJioAIRI0F7N2gxpZEExMj0WjUGI2x1/zsDXsXxYggoMaKDWOLqGADRYp0Fnaf3x/nTnYYZndnd2fnzp35vl+vee3MnTt3nm3z3HPOc841d0dERERKX6u4AxAREZHCUNIXEREpE0r6IiIiZUJJX0REpEwo6YuIiJQJJX2RBDGzLmZ2pJltlafjbW5mvzOz9fNxvLiZWetmvHYbM9skn/Hki5m1NrM1zaxd2raOZraJma3ZiONYy0QoSaGkL5IsOwB3Avvl6XhbAZcDu+f6AjP7qZltZ2Y/jk4aGrqt1ZTAzGx1M2tvZqt8TlnQzsxWT9tWCUw0syvMrG20rauZ9chy65blLccAf2tKrAWwIzAXGJm2bSjwMXBkI47zipldln7yAGBmPzezGWZ2YfR4tejnKSWmyWfFIhKLF4DlwK7AuamNUQuuLdDW3b9Pf0HU+q0rqXeOvu5vZjPSXwa0Bz509ykZr3m5kTH/Grguc6OZHUw44VgGVGU8PQK4Ddgs2reuY38EbB7db0VIgr8F+pnZfsAooH+W171LOOFJtzS6FaMl0dfZadsWZ3ytl5kNIJw0vuLumd/nt8C6wBIzWw14H5hA+D1ICVHSFylCZnYkoUVfl/5mlm1lrUVAx4xtFcDoBt7yZ9Et03lAZtLfj5CkqwgnIHUZCPwFmF7H8y8Bv4yOswxYDbiPkIA+i+Lx6LkBwP3AnsDrhJOSdtFXAKJENsLMXiW0iI2QLB8Bzkh734uB9QHM7CpgrrufC9RE71eMUkl6Sdq2FdHX6hyPcRYhmZ+e5blZ0dfv3X2xmV0A3GRmj7n7442OVoqWkr60iKi1MBv43N03y/E1Dtzh7r9oxPtsREgQ6RYDbwLnuvv4XI8VNzNbF9je3UdT2/LtD3yQw8tbEVr6bdKOty/wgrvPMbNq4EbgtFzDiY63IvMJd380Ov4BQG/gPnf/b5bvZ8vo7rRsb+DuXwBfmFkPd//SzG4kJPjB7j4z41ip7uiX3X1efYG7+/+Z2e3uvtTMVgALgC+Adu7+vZktICR4gD7AzGzHMbNpwIZpm74HpgK3Aje5+/K0fTcFLgJ2I/wOJgNnufuLGcdsDfyBcLLTA/gEuMTdb0vbx4h6ONL8MPq6rpmlejZSdRg/SNuWMjP952Rmg4AhQH93rzKzk4G33D3Va5P6PdcAuPvNZvY+8F8zG+Duk7L9jCR5lPSlpQwhtMQ2NbNNsyWFPHuBkNQA1gZOBv5lZtu7+9st/N45MbNhwDZRqzKbi4B9zWwzapP+t+7+XZZjPQK0dvd96niv9sC1wCdm9jNCD8AKQiJfBlS5e02210avN8LnQyczW+ruqyR/YBBwEmEsPJsfRF/raukTjff/x8zGA8OAn0UnA6kEmSqs24FwErludHKU8klG8u3s7nOzdF9vEr3P7zO2ryDLiU2a7wnDExZ9PwcThip2Ao6I3nND4N+EHpYrgXnA74DxZraru7+Udrw7gMMIww4vAnsAt5rZmu5+RbRPG+o+0ftrdEt3YXRLdxxwSxTf6sDNwG3u/oKZdScMq0w3s+3cfVHa67aMTgi2IvzMtwEWRvHV+fciCeLuuumW9xvhQ8aj2+9zfI0DtzfyfTbK9jpCK6gGeCjun0VaTLeHf7k6n9+ckOzvBPaKvq+edez7PGFstr73+ykhwV9FaM1eRUhI3sjbRnUc/97o+G3qeP4WQtd5Qz+XPYA5hHH2yrTt6+UQ2wZp+59KaNFvl7btyejn
3iPa/7jo8SvR8xMJPRUQWvG3p712GqHFnB5rG+Cd6Fjrpv9egX3T9vsx4WTihbRtqd/p9WnbjDDMsQDomLa9GpiYcTwHjkvbNjja9ou0bb+Ith0RPV4dmEQ4Ydqc0HPxT8JwwU+B86Pf40fR61YA/yH8Df4e2Af4EWBx///olp+bqvcl76JW4s+BpwkfnHsXOgYPrcXvCB9YieDuHxJa50cC/RrYfTkh4dZ3vFeAAwnj6hA+1LcjfPD/AOga3faPnv9N2rauhBOnXsBXdbxFF+ALT2tpZ1gX+Lqu+KLq+0p3f5rw/b5I6IlISX1/R7q7EX6XP4ruHxo9l96if4lQv/C8mWUWLqbGvRfUFU8uou/12ejhRtHf+jBghqeNfbv7f4BXgJ3NbO1o87HR10vS9nPgMUIvwU/T3qq5repU70VrYHvC7+oDwv/jiYQi0NcIvQ6VhF6yxcCV7v5jdz/K3S9z9yfc/QNCr52UACV9aQnbEz7wJ0S3fmbWOX0HMzvGzD4ys6Vm9oaZ7ZjtQGa2u5m9YGbzzOw7M3vGzLZtKIBo7vJaZIzXmtnWZvakmS0ws/lm9piZ9c7y+h+a2b1mNtfMFpvZeDPbKct+m5vZw9F0p8Vm9q6ZHZv2/EAz86he4ehom6fdNso45LnA3cBb0WM3s55mNtLMeqXtl2rppt6nnZldamYrnWBFH9rzah/6p+7+ubvPdPfvPAwdpBLn7NS26Palu0+tJ6l3o7YALJt1gW/qef52YFn0s3mPkIzmRz+X5aTVJ0SuIZxIpvtf1b+7v05InHOIftZpUj+rJTRfaix9JrAB0InQOs70AaElv0X0eGfgG3fPrEF5lVCkmB5brsV59XL3uYQhmD0JdQE3EnpULnP3GnffxN33d/fLCD+3Thb0NLPhFqY/TiEMn0kJ0Ji+tIRU4plA+GAcQejCvQfAzE4CricUO/2R0O2Y+WGeKj56Engb+BOhRfJrwlj9Zr5yQVfbtBZVd+Dv0f0r0463EzCO0MV9AaFVeCrwalSs9Ga03yaEaWltCF3i8wkfnBPNbJi7j4n26wA8R/hgv5LQszAYuNnM5rr7I4QP/tQ86uOBXVh5XvVKSdPd5wNHmNlx0aZlhNb2RYQTgY8zf06RakJ37DKyV+obUBMVxGUu5pKas75axng5wCJ3r6t1vC7wYZYispTuwKyM56d57Xj72YRK+qWEZLSM8Pu9nlDJn/m+7WmgqNHdP4/+bjJ7GNaIvjY26Vva39XqwAHR7QV3n2Zm20fPzc7y2tS2rtG4elfC33xmzJMIXfDZ3jv1GV0RfW2Vtq1VPdvSj39HdLBtCN3/A1IncmbWkzBEtiWhNX8wcAih52EJ4WTsNcKJiZQAJX1pCXsTCqDeoralvTdwj5lVEKaBzSR8+CwGMLP5rDytCkILaTRwort/E+33MfAUoZAqvYDskOiW4sBvPFTCp9xI6Pbc3t2/io53H6GV9n+EwiUIib5ztN9b0X63EqY73WRmG0UfmpsTusl/6+5XRa+9xcxmEbVSo7hHRccYDOzi7qOy/dDMbD3Ch+7R1M4hX0pYlAVWncte+826Lw89zbXd3Wa2TvQzOQlYJ/rehxOdfGVxc5ZtV5Cl4t/CgjlrE6bS1ZeI1814vnfqsbv/7wTGzBYBS939QzP7MWGOfqb2QIMFoe4+1cLiO4up/YxLFRWm1jCoWOWF2a3DyidmTjhBPSZ6nOr2zva7SW1rTzhhgMaddPRn1SmRN0S3dLdEtzpFJx2jgJujYZ+UfxO+x6mEk5SewOGEaZpTgR3dXa38EqLufckrC8u5bk2YS92ZkISmA7tHrZHNCC2ex1IJP7JKwnH3a9x9OFBtYcWwvxCSEIQPqnRjCTMGhhI+kF8ErjKz30Rx9SScRDyYSvjRe3wKPA5sb2bdo5bwUGBSKuFH+80ldEd3JwxfQCh++g44ycwON7PNzMzc/RR3vz/3nxpEJ0PjgUsJrfnUh/hSase2c5lD7mnH
e53w85pASICp+fAQEkrn6JZ6r53StnUm9B5kTVJR13Brd7dcbtSudFffcECqHmQwaT00aTpRx/S6jGP0Aj4ysyHU1ghsHH1N/e5T2436TwDmEv6uhgBfRu+/X+oklNqTrGyr16W2LaF2jH6VhpaF5XW3yRwCI7Swe0W3PaNtZ6ZtSw1hnJG2LfPEmWg4bDzhZ/CCmR1lZn8ws4HAvkBnd9+UsLZBa8K0yA8JvWvPm1m234UklFr6km+prv3BrPoBvwu1LZcZGc99mXmgqFv4huh1Swhjka8Bm2Z53xnuPi7ttXcRWiuXm9kDhBYnhOruTJ9HX9clJIHWOeyHuy80sz0IPRc3AB2AOWZ2PzAy6qrPibtXR7UAS9z9TTO7PPqec1ptrY7jnQ98SjgBOokwZpsq8FqQGh4xsz7RtjuB3dz982g7afunTiTaRzHWO+Yc9QRUEqrx5xO6653aXou64naixYDMrFPG8dan9neQkjnuD6E2YA1Cy/VbQl3BEGCeu39lZp9E3weEhF/fcrNVqb8rM/s7odDyOGpXGEwl/65ZXts1bZ85hJOobPsNI/RsHAQ8mLZ9ibtPjd471aMwK23bRtG2b9O2fZvl+D+h9kT1XsLPZSbwnrtPTNsv1YW/R/TzPpdQjHh5lmNKQinpS76lkv7BhA86CC3/S6PnUt22mR9+2dZCf4zQuutPaH1Um1lfovnR9XH3FRYWF/kRoYAp9eHcI8vuqW2pD+cVOeyXep/JwM+jFmpPwgf4xYREctwqR6g/5n+nPewLTHV3tyZeI8XdU/O0N4o2fU1Gb4GFpVm3IRTJDSe07HZO6w1J3/9HhBOv+pbFzTSdMGbcGZjf0MlChvRK/u0IP9MP69mHaPz9BODhqEbjYDNrQzjJnGhmOwDXRj03EJL/Sseox82E1u8ZZnaTu1cRTg7nEsbEM/Um/Pze97AgznvAFhbmvKfXo6QWAFrlxDdPniDUsbxDqKmoq/dmhpm9RJjzvxahd2CfjB45STh170veRIVtuwIfuPsD7j4uaiVdT2i17k0Yz51HWIQm/cN2RMax1iIMBYx39xejhG+ElcxyjSXVuvkiagl9CBxgZj9I228jQqJ+092/ij4QxwEDLO1KdlGr8xeEFtLr0bYTzOxLM+vjwVR3v5SQGFP1Aem+SztWfbF3JrTOUmOvzf0/Tf0c3mXliv8OhFqGJYT52gcRTmz+laWrGULX+DGE6XIHNnA7FDiK2gvEdCOHrvkMywkt6o8IxXOLgdeiJP4GYYphZm/KOYRx9nPSth1KSGJPAs8AT6Vazu7ex91zutiQuy+jdgnfY6JtDjwKrGNhrX/gf6sR7kAo+EsV9I0i9CyckrZfG8LPax61Mzbyyt2/dvf7PEy9W2pmm5rZEWZ2XXrhpoUFnRYRflaTgZ8r4ZcetfQln4YSWk1PpW909yVm9hxh7v4mhPHdSwgtr3sIY5HHZrxmtpl9STg5OJ0wHn0woQUMtd2zKRub2RGED9XuhA/6DQgtvlSr9XjC2P/rZnYNIZmeSkiEJ6Yd67eEOd+TLKzNnqreX4cwnpsq0JoEXA3ca2Y3E5L6Twg9G1dn+fk8Saiwv93MniDMnd7U3U/I2O84Qqs2NR/8f+PP0YlSarnd9tF4bUdWXW8/XX9CJfyH1K5w146wJv3mhOWKvwO+M7MrCNPeVuk2j1rH2QrscrEZYaghZ9H7/drMehBWWHwS2JawuMw+7n5t+v5mtl20XyrBYWYdCTM1pkSxzyL0IF1N+HtorJsIrf2RZnZLVNB5HuGE9s4o2c8jrG+/nJXH2K8i9Kb8JUq27xJmcmxBKAbN+8V+om76vxL+x35I+D2sTihonEw0NdDMtiYM7/yIcJK1BaEG4AMz25Uwn/8v7l7nuguSEF4EKwTpVho3wprkDuya5bmToudOT3s8lZDM/0M4YVhpZT1C8hxPSFhzCHOZ+xG635+K9tmIVVdpm00Yn/wN4apz6XFsS6j6Xxgd
9wngx1ni7Rm93zxCC3MC0C/Lfj8lFAJ+TSjq+oTwIVtZx8/oSMI0qGXRsW/MeL4X4SRjLmGteAjL3da3Kt2K6DUO/DnjeJWERJdadW5YtF9/QvJ8nbQV9QgnA20IY+IOnJOHv4vUSnQX17PPi8CjWbZ3IbSA5xFO4voQKvi/AX6atl+r6O/Iga3Stt8bbRuctu26aNt+0eNNo7+/xYTq9tR+08hYkS/anjpRTF8drxfwcNrfyyRg5yyv7UCYfvk5tX/7R2bZb3kDv/OGboekHesyQm/O08CvCEMRFj23cfR3UBV9vz8l9Mp8SbimxRDC/4EDO8X9GaNb82+xB6CbbrqFW5Rwp0YfsJenbe9GuOLcLoSx7c0I3fBdSDupiV53XsYxT4627x89PjB6vE30ePUscZxNuGCRE6ZLNvb7aEuY3vg14WQtlYh+kmXfuwjDKdXANRnPDYmS4zzSTiQJ6ww8HyXXfdK2bwNcnfZ4RPS+mcftRJiqtm30+Pxovxrg4CL4O7AontcIPTGNuf2JtGV4077f1bK8z+Do5+6EXpDOac/9iDCck/rdPRX3z0W3/NzUvS9SJDxcFe4swnoCf0/b/g1hLDoXmcMeuxMSZ2qJ2JUq1T37wjvTCT0irwMP5Pi+6cdcZmYvEOovZhFa8be6e7YFXuYTFm56CvhHamO0LsIIwsJMu7v7+2nHn2dhmd2ngP8zswnuvsDDhZVOSdvvNjObQegtSo/ve8LqeCm3E35G49x9WmO/3xaQGs5Z7GHqXM6i7xfS/g6i73cV7j7OzP4AvOhhNcP05z6IZnVcQOj1ObkxcUjxSnXxiEiRsDxelTAqfvyxu7/biNd0BDb3MDOhqe/bFljhjavWT399B8I1Ae7x7Ff4S8XZo7GJUaScKemLiIiUiYJP2TOzblHXX13PtzGz0Wb2bzM7pq79REREpHEKmvSjub93ECpY6/Ib4A1335kwp3r1evYVERGRHBW6kK+aMNf68Xr2GUjtgh7PE+ZlT0jfwcyOJ5pj26FDhz6bb17XRb5ERASAKVOgpgbaRzV+y6LLMLRtu+rj+p4r1L7FGFPs8bdj2tJuLKlpy+Ka975z92zLOteroEnfo7XIG1jCswO1F8WYQ5blWd39RkKFM3379vXJk5tcbyQiUh4GDgxfJ06MMwppoupqOPpoeONu+Mtf4LzzbHpTjlOMU/YWEqabfE9YZWxhvOGIiBSRbbeFWbNgk01g6tSwbZNoocX6Hr/9NnSsb+FGKWatWsF668Hf/gZnngnnnde04xRj0n+DsOraQ4QV2V6pf3cRkTIyaxYsbEJbqGNH6Nro3mCJWVUVfPEF9OwJf/87NPH6W/8Ta9I3s0FAb195De07gDFmtgvhKlXZFvQQEUmO9NY5rNwCb0xrHULC79hR3fRlYOlSOOAAmDwZPvoIOtV7qa7cxJL03X1g9PU54LmM56ab2RBCa/+cpi7uISJSNJraOs9GLfaysHgxDB8OY8fCDTfkJ+FDcXbv4+FKTo1e/lNEpCilWulqnUsOFi2CvfcOfy633gojRjT4kpwVZdIXEWmUpha3FWpfFdFJI5x3HkyaBHfdBYcfnt9jK+mLSPLls/u8JahLXhrh3HNh6FAYPDj/x1bSF5Hi0NTWOqi4TRJvzhwYORIuvRTWWKNlEj7EsPa+iEhWzWmtqyUtCTZrFgwaBHfeGUaCWpJa+iJSHFTsJmVo5szQqv/kE3jiCejfv2XfT0lfRAqjvrnqoGI3KTtffQW77RYW3xkzBnbdteXfU0lfRAqjoe57ddFLmVm2DCoq4JlnoF+/wrynkr6ItJz01r2K7UQA+OabcH678cbh4ocVFYV7bxXyiUjLSW/dqyUvwtSp0LdvuGgOFDbhg1r6ItLS1LoXAeDDD0OV/vLlcOih8cSglr6IiEgL+89/YMAAqKmBCRNg663jiUMtfRFpOanKfJEytnQp7L576Mp/7jnYfPP4YlHSF5HGaczKeZqGJ0K7
dnDzzdCzJ/TqFW8sSvoi0jiNWTlPxXtSxl55BT79FA47LLT0i4GSvog0rvWuqXciDXrhBdhzT1hvPTjgAKisjDuiQIV8IqLWu0geTZgQWvbrrRfG8Isl4YNa+iICWvdeJE/GjoV99w3j9+PHQ7ducUe0MiV9kXKUuQ6+Cu5E8uK112CzzeDZZ4uzQ0zd+yLlKLM7X132Is2yaFH4etZZ8NJLxfvvpJa+SLlSMZ5IXjzwAJxyShi/790bVlst7ojqppa+iIhIE40aFZbU7dUL1l8/7mgappa+SDnSSnkizXbbbXDssTBwIIweDR06xB1Rw5T0RcqBCvdE8mrMGDjmGBg6FB59tLi79NOpe1+kHKhwTySvBg2C886Dxx9PTsIHtfRFkiuz9Z6+cp5W0RNpEXfeCXvvDZ07wznnxB1N46mlL5JUWkVPpKAuvBCOPhquuCLuSJpOLX2RYlbfmvhqvYsUhHvoyj/vPDjiiGS28FOU9EWKWX2tebXeRVqcO5x5Jvz97zBiBNx0E1RUxB1V0ynpixQzrYkvEqt58+C+++CEE+D666FVwgfFlfRFilmqS19ECqqmJrTyO3cO6+mvvTaYxR1V8ynpi4iIpKmpgRNPhOrq0J1fSqNoSvoiTdGY6XL1PW5o31SxnogURHV1WGXvjjvCxXNKoXWfTklfpCkaM12uOVSsJ1IwK1aEKXn33APnnw9nnx13RPmnpC/SFCqwEyk5xx4bEv7f/w5nnBF3NC1DSV+kKVRgJ1JyjjgCttsOTj017khaTsInH4iIiDTd0qXh4jkAQ4aUdsIHtfRFajVlLXsRSazFi2HffWHCBPjww/K44rSSvkiK1rIXKRsLF4YL50yaBLfdVh4JH5T0RWqpOE+kLMyfD3vuCa+8AqNGwWGHxR1R4SjpS/nK7M5/+2112YuUgccfh1dfDcvrHnBA3NEUlpK+lK/M7nx12YuUNPew2M6RR8JPfgKbbhp3RIWnpC+lTZemFRHCx8ABB8Dll0OfPuWZ8EFJX0qdLk0rUvZmzoTddoPPPoO5c+OOJl5K+lLaVJwnUta++goGDQpfn34aBgyIO6J4KelLadPKeSJla8aMkOS//RaeeQZ23jnuiOKnFflERKQkrbUW7LgjPPusEn6KWvpSWjKn4WnlPJGyM3UqdOoUSnbuuivuaIqLWvpSWjQNT6SsffAB9O9fXgvuNIZa+lJ6NA1PpCz95z+hSt8Mrrwy7miKk1r6IiKSeG+9BQMHQuvWYT39LbaIO6LipJa+lJZyuWqGiPyPO5x0EnToAM89Bz17xh1R8VLSl+JQ38p59T3OfE7r54uUHTN46CFYsQI22ijuaIqbuvelODTmsrb1UeGeSNl4/nk4/nioroYePZTwc6GWvsSjrql1KsATkRyMHw977w0bbhiW1l177bgjSga19CUemlonIk30zDOw115h7H7iRCX8xlBLX+KhNfFFpAmefBL23x969w4r7SnhN46SvsRDa+KLSBOssQbstBM88gh07hx3NMmj7n0RESl6qXZC//5hWp4SftMo6UthbLttKK8dODDc8lGpLyJl4a67YPPN4dFHw2OzeONJMiV9KQwV7olIE9x6Kxx9dLhE7tChcUeTfBrTl8JQ4Z6INNI//wknnwy77x7G8Nu3jzui5FPSl8JQ4Z6INMI774SEv/fe8OCD0LZt3BGVBiV9EREpOltvHcbw99wTKivjjqZ0aExfRESKxj/+AS+/HO4PG6aEn29K+iIiEjt3OOccOOMMGDUq7mhKl7r3pTB0yVsRqYM7/OlPcPHFcOyxcPXVcUdUupT0pTBUyCciWbjDaafBlVfCSSfBtddCK/VBtxj9aEVEJDY1NfDVV3DqqXDddUr4LU0tfRERKbiaGpg3D7p0gXvugYoKrbRXCDqnEhGRgqquhmOOgZ13Dgt1tm6thF8oaulL/my7bVhuN1W0lxrH32ST8J/dsWN8sYlIUVixAo46Cu69F84/Xx8Lhaak
L/mTub5+Oq21L1L2qqrgsMPg4YdDpf4f/xh3ROVHSV/yR+vri0g9Ro4MCf+KK+C3v407mvKkpC/5o2l5IlKPM86A7baDI46IO5LypUI+ERFpMYsXw9/+BsuXQ7duSvhxU0tfmi6zcE/FeiKSZuFC2GsveOGFUKk/cGDcEYmSvjRdZuGeivVEJPL99+EKea++CnffrYRfLJT0pelUuCciWcydCz/7Gbz1Ftx/P+y/f9wRSUrBx/TN7BYze9nM/lzH853NbIyZTTazGwodnzTC1Kkq3hORVXz2Wbg9/LASfrEpaNI3s/2ACnffEdjYzHpl2e1I4G537wusbmZ9CxmjiIg0zZIl4et228Gnn8I++8Qbj6yq0C39gcAD0f2xQL8s+8wGfmxmawLrA19k7mBmx0c9AZNnzZrVUrFKpm23hR49wuDcwIF1L8QjImVnxgzo06f2srirrx5vPJJdoZN+B+Cr6P4coFuWfV4ENgROAT6I9luJu9/o7n3dvW9XFY4Vjgr3RCSLL7+EAQPg889h663jjkbqU+hCvoVA++h+R7KfdPwFONHd55vZacAI4MYCxSf1UeGeiGSYPh0GDYLvvoOxY2GnneKOSOpT6Jb+G9R26W8NTMuyT2dgSzOrAH4CeGFCkwapcE9E0ixaFFr4c+bAs88q4SdBoZP+Y8CRZnY5cBDwnpldkLHPRYSW/fdAF+DewoYoIiK56NABzjoLxo+HHXaIOxrJRUG796Mu+4HAEOAf7j4TeCdjn9eALQoZl4iI5O7990OJz4AB8Mtfxh2NNEbBF+dx97nUVvCLiEiCTJkCgwdDp07wwQfQWku8JYp+XZK7VCGfiJSlN9+EIUOgfXt46ikl/CTSr6zcZV40J1Wol+3x22/rgjoiZeq118LSup06wXPPwcYbxx2RNIUurVvuMufe10fz8kXK1m23QZcuMGmSEn6SqaVfjtJb96nL4WruvYhkUVMDrVrBNdeEqXnrrBN3RNIcaumXo/TWvVrvIlKHcePCOvpffx3G75Xwk08t/XKl1r2I1OPpp2H4cNh0UxXslRK19EVEZCVPPAHDhkHv3jBhglr4pUTnb+VIU+9EpA7PPAP77x+69f/1L+jcOe6IJJ+U9EtRQ9PwNPVOROrghosUAAAgAElEQVTQpw8cfTRcdlmYnielRd37paihaXgq3hORDM8+C8uWwdprw803K+GXKrX0S5UK9UQkR7fcEtbQP+88OPvsuKORlqSWvohIGbv+ejjuuLDa3h/+EHc00tLU0i9FKtQTkRxceSX87newzz7wwAPQtm3cEUlLU9IvRanCPRGROsyaFbrz998f7rkHKivjjkgKQUlfRKQMde0Kr7wCPXtq8Z1yojF9EZEy4R4K9S68MDzebDMl/HKjpC8iUgbc4Ywz4IIL4LPPwmMpPzrHK0Uq5BORNO6hYO+qq+Dkk8MV88zijkrioKRfilTIJyJpfvMbuO66kPgvu0wJv5ype19EpMRttx2MHKmEL2rpl4709fYXLtTa+iJlbsUK+M9/YJtt4Jhj4o5GioVa+qUifb19ra0vUtaWL4cjjoAdd4Tp0+OORoqJWvqlIlW8p/X2RcpaVRUccgg8+ihccglsuGHcEUkxUdIvFSreEyl7S5fCgQfCk0+GSv1TTok7Iik2SvoiIiXipptCwv/nP+HEE+OORoqRkr6ISIn41a9gq61gwIC4I5FipUI+EZEEW7AgFO1Nnw6tWinhS/2U9EVEEur77+FnP4P77oO33447GkkCde+LiCTQ3LkwdCi88w48+CDsu2/cEUkSKOmXCq23L1I2vvsOhgyB99+HRx6BvfaKOyJJCiX9UqEpeyJlo3Vr6NABnngidO+L5EpJX0QkIWbOhE6dYM014YUXtI6+NJ6SflKlr7UPWm9fpMR98QUMGhT+9R94QAlfmkbV+0mVvtY+aL19kRI2bVqYivftt+HyuCJNpZZ+UmmtfZGy8MknsOuuYT7+uHGw/fZxRyRJ
pqSfVCrcEyl57nDAAbB4MTz3XOjaF2kOJX0RkSJlBrfdBhUVsOWWcUcjpUBj+iIiRWbKFLj44nB/m22U8CV/lPRFRIrIm2+GMfxrr4U5c+KORkqNuveTSivwiZScV18Ni+2suSZMmABdusQdkZQaJf04Zc61TxXnZXuc+dzbb2tevkgJefFF2HNPWGedULS3wQZxRySlSN37ccqca98YmpcvUlK++AJ69IBJk5TwpeWopV9o6a371Cp6mmsvUrbmzoXOneHQQ2H//aGyMu6IpJSppV9o6a17tdZFytqYMbDRRmH8HpTwpeWppV9oWklPRIDHH4cDDwzT8bbaKu5opFwo6be0zGI9FeCJlL0HH4TDDoM+feBf/wrV+iKFoO79lqYL44hImtdfh0MOgZ/+FMaOVcKXwlJLP9/quuStuvNFBOjbF664Ao45Rp1+Unhq6eebWvYiksVdd4XlNszglFOU8CUeaum3BLXsRSTNddfBr38Nxx0HN90UdzRSztTSFxFpQZdfHhL+vvuG9fRF4qSWfr5pTXwRiVx0EZx5Zpiad/fd0KZN3BFJuVPSz7fUGvkiUtaqqmD0aDj8cLj9dmitT1spAvozFBHJI/eQ8Nu2hWeegdVWg4qKuKMSCZT0myJ9Wl7m1e9SU/REpOy4w+mnw5tvhiV2V1897ohEVqZCvqao7+p4mqInUpbc4dRT4bLLYIstQktfpNiopd8UWj9fRNLU1MDJJ8MNN8Bpp8Gll4b5+CLFRkk/F1o/X0TqccYZIeGPHAkXXqiEL8VLST8XWmVPROoxYkT4SDj9dCV8KW5K+rnSKnsikmb5crj//jAlr3fvcBMpdirkExFppGXL4KCD4Mgj4d//jjsakdyppZ8LrbInIpGlS+GAA+Cpp+Caa6Bfv7gjEsmdkn4utMqeiACLF8Pw4TB2bCjcO/74uCMSaRwlfRGRHL36aijtufXWULwnkjRK+iIiDaipgVatYNddQ8ff+uvHHZFI06iQT0SkHvPmwYAB8Oij4bESviSZWvq5UCGfSFmaMweGDoUpU+KORCQ/lPRzoUI+kbIzaxYMGQIffACPPAJ77RV3RCLNp6QvIpJh/vwwfv/JJzB6dGjti5QCJX0RkQyrrw577x1a+oMGxR2NSP4o6YuIRL74AhYtgs03h4suijsakfxT0hcRAT77LLTq27eHd9+Fioq4IxLJPyV9ESl7U6eGhL9wITz0kBK+lC4l/Vxoyp5Iyfrww5Dwly+HCRNg663jjkik5Sjpp2y7bZijs8kmtVP0Usn+7bfDpXVFpOT85S9hxb2JE2GLLeKORqRlKemnzJoV+vay6dgRunYtbDwiUhC33AIzZ6pDT8qDkn66jh3D6b6IlLTJk+H88+Gee8K/vRK+lAslfREpK6+8Aj/7GXTpEpbZ1cidlJOCX3DHzG4xs5fN7M8N7He9me1dqLjYZBOd7ouUuBdeCAvurLMOPP88bLBB3BGJFFZBk76Z7QdUuPuOwMZm1quO/XYB1nX30QULbupUrbEvUsImTYLdd4cePcJ9XS1PylGhW/oDgQei+2OBfpk7mFkb4CZgmpntm+0gZna8mU02s8mzZs1qqVhFpIR07w79+oWyne7d445GJB6FTvodgK+i+3OAbln2OQp4H/gHsIOZ/SZzB3e/0d37unvfrqqqF5F6vPsuuEOvXvDMM9At26eOSJkodNJfCLSP7nes4/23BW5095nAKGDXAsUmIiXm0UehTx+46qq4IxEpDoVO+m9Q26W/NTAtyz5TgY2j+32B6S0fFirkEykxDzwABx4Ykv6IEXFHI1IcCj1l7zHgBTPrDuwBHGJmF7h7eiX/LcCtZnYI0AY4IC/vnL7iHmjVPZESNmoUHH007LQTjBkTLpUrIgVO+u4+38wGAkOAf0Rd+O9k7LMAODDvb17finugVfdESsSMGfDLX8KAATB6NHToEHdEIsWj4IvzuPtcaiv4C0sr7omUvB/8AMaODd36q60W
dzQixaXgi/OIiLSEa6+Fe+8N93fZRQlfJJvySfoq1BMpWZddBr/5TajWd487GpHiVT5r72u1PZGSdOGFcNZZcNBBoYDPLO6IRIpX+bT0RaSkuMO554aEf8QRcPfd0KZN3FGJFDclfRFJrKqqMAf/9tuhdfn0W4o0mf5NRCRR3OGrr8KFc/72t/C4lZovIjnRv4qIJEZNTSjY23Zb+PrrMH6vhC+SO/27iEgi1NTAiSfCddfBL34R5uOLSOOUT/e+puuJJFZ1NRx7LNxxRyjc++tfVaUv0hTlk/Q1ZU8ksa64IiT8886Dc86JOxqR5CqfpC8iifWrX0H37nDYYXFHIpJsGtMXkaK0bBmMHAnffw/t2yvhi+SDkr6IFJ2lS2H4cLj44nDxHBHJj/Lp3lchn0giLF4M++4L48fDDTfAgfm/0LZI2SqfpK9CPpGit3Ah7L03TJoEt94apuaJSP40u3vfzFqZ2S75CEZEytucOfD55+HCOUr4IvnXYNI3s0oz+5MF7dO2tzOzo4G2wDMtGaSIlLYFC8LiOxtsAO+9p6I9kZaSS0u/FXA68BsgfYbsKOBMwIDl+Q9NRMrB7NkwcCD89rfhcbt2sYYjUtJyGdOvAhYBY4DJZvYy0AsYCPRx98VmVt1yIeaJCvlEis6338KQIfDRR3DBBXFHI1L6Gkz67l5jZsvdfaqZ/Q74HHgLeA3Y18weaOkg80KFfCJFZcYMGDwYPvsMnnwy3BeRltXY6v2Z7v62mW0LXA30Bl7Of1giUsqqq2GPPWD6dHj6aRgwIO6IRMpDzknfzHYAHjaz/YD1gU+Btdz9dTNd+kJEcldRARddBGusATvvHHc0IuWj3qRvZj8F/hQ9fAu4BHgMOBw4EPjezC4H2kdfIRT+tXP3E1smZBFJqk8/hddfh4MPDi19ESmshlr6GwMdgDbAo8C5wKnALYAD84GehET/w+g1FUDx1d+qkE8kVh9/DIMGhTX1d98dOnWKOyKR8lNv0nf3e4B7zOxLQoK/mJDsdwMeB1YDjgU+dvfhLRxr86iQTyQ2H3wQEv6KFWF5XSV8kXjkuiJflbsfBswFOgFLgQOANYANCScCIiKrePfdUKjnDhMnwlZbxR2RSPlq7DK8/wf8CJhN6Prv6+5v5D0qESkZzz4LbdqE9fS32CLuaETKWy7L8BrQ1sy6APcRxvc7EKbsrdOy4YlIUi2P1uk87bTQ2t9ss3jjEZHcWvptCWP3uwP3uvt/3P1T4CjgTjPrCVS2YIwikjAvvwybbgrvvBMed+kSbzwiEuSS9FcAvya08kemNrr708CVQA3hxEBEhOefh6FDQ5e+kr1IccllGd4VwN3Rw0UZz10Udf/3aYHY8ktT9kRa3PjxsM8+4Wp548dD9+5xRyQi6Rq7DO8q3N2BKXmIJf+23RZmzQoJ/+23oWPHuCMSKVmvvw577RX+3caNg27d4o5IRDI1mPTN7AZgGaGbvyEVQI27/665geXFrFmwcGG437EjdO0abzwiJWyrreCkk+DMM2HtteOORkSyyaWl/0vgjoxtRwF3Zmw7FLgXWJyHuPKnY8cwOVhEWsQzz0DfvrDWWnD55Q3vLyLxyal7391HpD82s6OzbBuWuU1EStv998Phh8Oxx8INN8QdjYg0JJekn221vVy3xUvFeyIt5q674Be/gH794NJL445GRHLR7EK+oqb19kVaxK23wnHHwa67whNPQIcOcUckIrlo7DK8IlLmli6Fiy4Kc/GffFIJXyRJcmnpm5l9msO2TmnbtnL3hc0PT0SKiTu0axfW0e/SJdwXkeTIJekfRbiq3vIc9q0A2kX7i0gJueQSeO+90LWvRXdEkimXpN8TGOXun6RvNLOdgb0JF9+5191faoH4mkeFfCJ5ccEFcPbZcMghUFMDrTQwKJJIufzrngO8b2avmNnpZrajmT0BPAl0J1xqd6yZrdmSgTbJ1Kkq5hNpBnc455yQ8I88EkaNgtalXf4rUtJy
SfrzgXWBG4HdgOcJF9np6e5HAXsAVcCglgpSROJx7rnw17+Gefi33QYVFXFHJCLNkdM8fXefC9wK3GpmnaPHqSeXm9kvgQktFWTO0tfah7AEr9bbF2my/v3h1FPDSnvq0hdJvlyS/upm9lz6hnBhvVX8Kn27uxe+5Z++1j5ovX2RJqipgX//G3bZBXbbLdxEpDTkkvR/S6jGr6bhVfcMaAu0aWZcTZNq4WutfZEmqamBE06Am2+GV1+FHXaIOyIRyacGk767X9fQPma2FfCeu1fnJaqmUtGeSJNVV8Mxx8Cdd8Kf/wzbbx93RCKSbw2O0pnZT8yszv3MrAJ4C+hqZm3MbPN8BigiLW/5cjjiiJDwzz8/FO9lH8UTkSTLpTTnXsKCO1lFrXsDlgFHAOOiEwERSYgxY+C+++Dii8P0PBEpTbmM6VcBy8zs3OhxTZZ9nDDm/1vgodi7+UWkUfbdF157TV36IqUul6SfSvKnAlOAfsArwE+BjwnJHmBLwup9mq8vkgBLloRL4/7hDyHZK+GLlL7GzLx1YCihK3+/6OvlwPnR/WHA/e4+O99Bikh+LVoEe+8NDz4I778fdzQiUihNWVDTWXXqngP/R5iuFx+ttS/SoAULYK+94MUX4fbb4aij4o5IRAqlzqQfVezfBHQB+hOuoPe/p7O8ZJa7z89veI2kKXsi9VqwAHbfPczBv/vucAEdESkf9XXvtyFcUKcjMCb6KiIJ1q4d9OgB99+vhC9Sjups6bv7MmAPM3ufUKQ3q4FjbW5mG7r7g/kMUESab/ZsWLECunULCV9EylOuhXxex9d0Q4ALmx2RiOTVt9/CrruGwr2abBNuRaRs5FrIZ9Ht1ejruGj7WdRO6bsJON7M9nD3p/MaZa5UyCeykhkzwgVzpk2D0aN1pTyRcpdr0nfgguj+7RnPGXAZ4aI8VwC/BOJJ+irkE/mfL7+EQYPg66/h6adhwIC4IxKRuOWS9CuBdu5+WbYnLVxP9zJCdf+dwF/MrI27L89fmCLSWCecAN98A2PHwk47xR2NiBSDXJL+ddSuupdNO6JL6rr7TDMbpIQvEr+bbgqt/L59445ERIpFgyN87n5FVMlf1/NLgB8C30SP38pfeCLSGP/9L5xySqjU795dCV9EVtaUFflW4e7T83GcZlMhn5Sx998PRXvV1fC738EPfxh3RCJSbPKS9IuGCvmkTE2ZAoMHQ0UFTJyohC8i2WkCj0jCvflmmIdfWQmTJkHv3nFHJCLFSklfJOGWLIF11w0Jf9NN445GRIpZaXXvi5SRmTNDst9559C9X1HR8GtEpLyppS+SQJMmQa9ecOed4bESvojkQklfJGHGjYM99oANNoAhQ+KORkSSpLS69zVlT0rc00/D8OFh7H7cOFhnnbgjEpEkSX7SnzIFBg4M999+Gzp2jDUckZYyfToMGwZbbAHPPgtrrRV3RCKSNMnv3k+/VmjHjtC1a3yxiLSgDTeE226D8eOV8EWkaczd446hWfpWVvrkqqq4wxBpMffdB+utB7vsEnckIlIszOwNd2/0QtvJb+mLlLA77oDDD4dLLok7EhEpBclP+m3bxh2BSIu4+WYYMQIGDQqtfRGR5kp+0l9W5wUARRLr+uvhl7+E3XeH0aNhtdXijkhESkHyk75IiXGHF16AffaBRx+Fdu3ijkhESkXyp+yJlJBFi6BDh7DSnnu4iI6ISL4UvKVvZreY2ctm9ucG9utmZm8VKi6RuP31r9CnD8yeDW3aKOGLSP4VNOmb2X5AhbvvCGxsZr3q2f1SoH2DB1UhnyScO5x9NpxzDvzkJ7DmmnFHJCKlqtAt/YHAA9H9sUC/bDuZ2SBgETCzjuePN7PJZja5ZsmSlohTpCDc4Ywz4IIL4LjjwuI7uniOiLSUQif9DsBX0f05QLfMHcysEjgbGFnXQdz9Rnfv6+59W7VSLaIk12WXhTn4J58MN9wA+nMWkZZU6EK+hdR22Xck+0nHSOB6d59n
ZgULTCQORx0VVpI+/XTQn7uItLRCtyveoLZLf2tgWpZ9BgO/MrOJwDZmdnNhQhMpjOrqMA9/+fJwlbw//lEJX0QKo9At/ceAF8ysO7AHcIiZXeDu/6vkd/f+qftmNtHdj6v3iCrkkwRZsQKOOQbuuitcNOfgg+OOSETKSUGTvrvPN7OBwBDgH+4+E3innv0HNnhQrcgnCbF8ORx5JNx/fyjcU8IXkUIr+OI87j6X2gp+kbJQVQWHHBJW2LvkEvjDH+KOSETKkVbkEymAqVPhuefgqqvglFPijkZEypWSvkgLWrECWreG3r3h44+ha9e4IxKRcqZZwSItZNEi+NnP4NJLw2MlfBGJm5K+SAtYsAD22AMmToR11407GhGRIPnd+5qyJ0Xm++9Dwn/tNbjnHlXpi0jxSH7S15Q9KSIrVsDQofDWW/DggzB8eNwRiYjUSn7SFykirVvDscdC9+6w115xRyMisjIlfZE8+OYb+O9/YZdd4Pjj445GRCQ7JX2RZvr6a9htN5g9Gz77DDp0iDsiEZHskp/0VcgnMfriCxg0CGbOhDFjlPBFpLglP+mrkE9iMm1aSPizZ8PYsbDjjnFHJCJSv+QnfZGYXH89zJ0L48bB9tvHHY2ISMPM3eOOoVn6Vlb65KqquMOQMuIOZlBdHVr7PXvGHZGIlBsze8Pd+zb2dVqRT6QR3nsP+vWDL7+EigolfBFJluR376uQTwrknXdg8GBo0wYWLow7GhGRxkt+S1+FfFIAb7wBu+4K7drBpEmw+eZxRyQi0njJT/oiLeytt8I8/DXWgOefh1694o5IRKRplPRFGrDhhiHpP/88/PCHcUcjItJ0SvoidXjjjTB61KULPPwwbLBB3BGJiDRP8pO+CvmkBYwdG6r0R46MOxIRkfxJftJXIZ/k2ZgxsM8+sNlmcOaZcUcjIpI/yU/6Inn0+OMwbBj8+Mfw3HPQtWvcEYmI5I+Svkhk0SI44QTYbruwtG6XLnFHJCKSX8lfnEckTzp0gGefDdX6a6wRdzQiIvmnlr6UvTvugAsuCPe33FIJX0RKl5K+lLWbboIRI8IqeytWxB2NiEjLSn7S15Q9aaLrroPjj4fdd4fRo6G1BrtEpMQlP+lryp40wRVXwK9/DfvuC48+GtbUFxEpdclP+iJN0KULHHQQPPigOotEpHyYu8cdQ7P0raz0yVVVcYchCeAOU6fWXjDHHczijUlEpCnM7A1379vY16mlL2XBHf7851CdP2VK2KaELyLlJvlJX32z0gB3OP10uPBCOProsNqeiEg5Sn69sgr5pB7ucOqpcM01oXDv6qvVwheR8pX8lr5IPe67LyT83/9eCV9EJPktfZF6HHxwGAEaPlwJX0RELX0pOStWhJb9tGnQqhXst58SvogIlELSVyGfpFm+HA47DC6/HMaMiTsaEZHikvzufRXySWTZMjjkEHjsMbjsMjj55LgjEhEpLslP+iLA0qWw//6hdZ+q1BcRkZUp6UtJqKqC2bPhhhvCRXRERGRVSvqSaIsWhSK9NdaAF1/UlfJEROqjQj5JrPnzw2VxDzggLMKjhC8iUr/kf0yqkK8szZsHe+wBr78O996rKXkiIrlIftKXsjNnDgwdGi6c89BDMGxY3BGJiCSDkr4kzqGHwrvvwqOPws9/Hnc0IiLJoaQviXPppTBjRmjti4hI7pJfyCdl4euv4YorQsHellsq4YuINIVa+lL0vvgCBg2Cb74J6+hvuGHcEYmIJFPyW/qaslfSPvsM+veHWbNg7FglfBGR5kh+S19T9krW1Kmhhb9wIYwfD336xB2RiEiyJT/pS8maMiUsrzthAmy9ddzRiIgkn7l73DE0S9/KSp9cVRV3GJJHS5ZA+/bh/sKF0LFjvPGIiBQbM3vD3fs29nXJH9OXkvL229CzJzz9dHishC8ikj/JT/oq5CsZkyeHMfzWraFXr7ijEREpPclP+irkKwmvvAK77QadOsHzz8Mmm8QdkYhI
6Ul+0pfE++QTGDIEunYNCX+jjeKOSESkNCnpS+w23hhGjoRJk2D99eOORkSkdCnpS2zGjYOPPw6XxT3rLFhvvbgjEhEpbclP+irkS6SnngpXyPvd7+KORESkfCQ/6auQL3EefRSGDw8XzrnzzrijEREpH8lP+pIoDzwABx4YltQdNw66dIk7IhGR8qGkLwVTUwPXXw877hgunrPmmnFHJCJSXrT2vhREdTVUVMATT4SvHTrEHZGISPlJfktfhXxF74YbYPBgWLQI1lhDCV9EJC7JT/oq5Ctq11wDJ54YEn1FRdzRiIiUt+QnfSlal14Kp5wSKvUfeQTatYs7IhGR8qakLy3i6qvh9NPh4IPh/vuhsjLuiERERElfWsTQoaGVP2oUtGkTdzQiIgJK+pJH7vDkk+Hr5pvDVVeFy+SKiEhxUNKXvHCH3/8e9t4bHnss7mhERCSb5LfDNGUvdjU1oSv/uuvC12HD4o5IRESySX5LX1P2YlVTE6bkXXcd/OEPcOWV4ap5IiJSfJKf9CVWb70Ft90WLo37j38o4YuIFLPkd+9LrPr0gXfegd69445EREQaopa+NNry5XDYYfDQQ+GxEr6ISDIkP+mrkK+gli0Ll8a991748su4oxERkcZIfve+CvkKZulS2H9/GDMGrr0WfvWruCMSEZHGSH7Sl4KoqoJ99oFx48JV844/Pu6IRESksQqe9M3sFqA38JS7X5Dl+U7AfUAFsAg42N2rChulZGrTBrbdNozl/+IXcUcjIiJNUdAxfTPbD6hw9x2Bjc2sV5bdDgcud/ehwExg90LGKCubPx8++ihMxbv4YiV8EZEkK3Qh30Dggej+WKBf5g7ufr27Pxs97Ap8m7mPmR1vZpPNbPLyVsmvRSxW8+bBkCEweDAsWRJ3NCIi0lyFzpgdgK+i+3OAbnXtaGY7Ap3d/ZXM59z9Rnfv6+5929TUtEykZW72bNhtt7D4zrXXQvv2cUckIiLNVegx/YVAKn10pI6TDjPrAlwD7F+guCTNt9+GFv5HH4WL5+y5Z9wRiYhIPhS6pf8GtV36WwPTMncws0rgQeBP7j69cKFJyjnnwMcfw+jRSvgiIqXE3L1wb2a2BvACMB7YAzgEONDd/5y2z0nAhcA70aZ/uvv9dR2zb2WlT65ScX8+LVoE770HO+wQdyQiIpKNmb3h7n0b/bpCJn0AM+sMDAGed/eZzT1e39VX98kLFjQ/sDI3fTqMHBnm4K+xRtzRiIhIfZqa9As+T9/d51Jbwd98WpGv2T79FAYNCtX6n30GW28dd0QiItISNN+tzH38MQwYAAsWwHPPKeGLiJQyLcNbxj74IEzLW74cJkyArbaKOyIREWlJaumXscpK6N4dJk5UwhcRKQdq6Zeh6dNhgw2gZ094/fWwxK6IiJQ+tfTLzOuvwzbbwPnnh8dK+CIi5SP5Sb9t27gjSIyXXw7r6HfuDEcfHXc0IiJSaMlP+pqyl5Pnn4ehQ6FbN5g0CTbaKO6IRESk0DSmXwbmz4dhw6BHjzAt7wc/iDsiERGJg5J+GVhjDbj//lCh363O6xqKiEipU9IvYaNHh0V3DjssXDVPRETKW/LH9FXIl9Ujj8B++8E110B1ddzRiIhIMUh+0lch3yruvx8OOgi23x7+9S+oqIg7IhERKQbJT/qykrvuCt35O+8MzzwDnTrFHZGIiBQLJf0SM3UqDBwIY8bA6qvHHY2IiBQTc/e4Y2iWvpWVPrmqKu4wYjdnDnTpAu7hAjqVlXFHJCIiLcXM3nD3vo19XfJb+irk4+qrYbPNQivfTAlfRESyS37SL/NCvksugVNPhf79w0V0RERE6pL8pF/GLrgA/vhHOOQQuO8+tfBFRKR+SvoJNWoUnH02HHlkuN+mTdwRiYhIsdOKfAl1wAEwezb8+teahy8iIrlJfku/jAr53OGKK2DuXGjXLozlK+GLiEiukp/0y6SQr6YmtOpPOw3uvDPuaEREJInUvZ8ANTVw
wglw881w+ulwyilxRyQiIkmU/JZ+iauuhhEjQsL/85/h4ovDXHwREZHGUku/yH33HUyaBOefH6r1RUUej7wAAAzqSURBVEREmkpJv0gtXx6K9Lp1g3fe0YVzRESk+dS9X4SWLYP99w/j+O5K+CIikh/JT/olNmVvyRIYNgxGj4btttP4vYiI5E/yu/dLaMreokWw777w3HOhcO/YY+OOSERESknyk36JcA+r7E2YAHfcEZbXFRERySdz97hjaJa+lZU+uaoq7jDyYuxYmDMnXEBHRESkLmb2hrv3bezr1NKP2dy5MHEiDB8OQ4fGHY2IiJSy5Cf9BBfyffddSPQffACffALdu8cdkYiIlLLkJ/2EFvJ9+y0MHgz//S889pgSvoiItLzkJ/0EmjEDdtsNpk2Dp54K90VERFqakn4MnngCPv8cnn4aBgyIOxoRESkXqt4voJoaaBUth/TFF7D++vHGIyIiydTU6n2tyFcgn3wSVth7883wWAlfREQKLfnd+wko5Pvvf2HQoLDEroiISFySn/SL3Pvvh0K96uowH3/LLeOOSEREylXyu/eL2NSpMHBguK+ELyIicVPSb0EbbBAuoDNpEvTuHXc0IiJS7pLfvV+EhXxvvhkS/tprw003xR2NiIhIkPyWfpEV8r30UujSP/74uCMRERFZWfKTfhGZNCmspf+DH8DVV8cdjYiIyMqU9PNk3DjYYw/YcMNQtNejR9wRiYiIrExJPw+qq+G002CTTWDChNDSFxERKTbJL+QrAhUVMGYMtG8Pa60VdzQiIiLZqaXfDA8/DMccE1r6PXoo4YuISHFLftKPacrevffCwQfDRx9peV0REUmG5Cf9GKbs3XEHHHEE9OsHzzwDHTsWPAQREZFGS37SL7DbboMRI8IFdMaMUcIXEZHkUNJvpI03hv32g9GjYbXV4o5GREQkd0r6OXrnnfB1wAB46CFo1y7eeERERBor+Um/AIV8F18M22wTxu9FRESSKvlJv4UL+f76Vxg5Eg49FHbbrUXfSkREpEUlP+m3EHc4+2w45xw46ii46y5oraWMREQkwZT06/Dvf8MFF8Bxx4WK/YqKuCMSERFpHrVd69CvH4wdG7r0W+nUSERESkDy01keC/lqauD3v4eXXw6PhwxRwhcRkdKR/JZ+ngr5qqvh+OPh1luhUyfYcce8HFZERKRoJD/p58GKFWGVvVGjQuHe2WfHHZGIiEj+lX3SX748VOffd18o3DvrrLgjEhERaRlln/QhJP5//ANOPz3uSERERFpO8pN+Ewv5li6F+fNhnXXggQdUsCciIqUv+amuCYV8S5bAsGHhSnlVVUr4IiJSHpLf0m+kRYtgn31gwgS4+WaorIw7IhERkcIoq6S/YAH8/Odhtb0774Qjjog7IhERkcIpq6R/yinw0ktwzz1w8MFxRyMiIlJY5u5xx9AsfSsrfXJVVU77zpwJb74Je+7ZwkGJiIi0IDN7w937NvZ1JV/C9t13cOaZYQGedddVwhcRkfKV/KRfz5S9b76BXXeFK66AKVMKGJOIiEgRSv6Yfh1T9r7+Olwhb/p0ePJJ2G67AsclIiJSZJKf9LP44oswB3/mTPjXv6B//7gjEhERiV9JJv0ZM8KKe2PH6mp5IiIiKSWV9L//PlwWd4cdYOrUJq/QKyLy/+3df+xVdR3H8ecrBL+IKDQapEuLJrqAQtcUDeaXzISVC5RGGxnJimxJ84/+QCe5XGutmssyMkhbzbIMi3JaAQWITVuQo/kj0hWKqBUyNH8EAe/++HxuXL7e7/ee+4V74J77emxn33vP+Xw/5/N9c7jvcz/nnM/HrJIqcyPf1q0wcSLceushq83MzCzr/KS/Zw+PPgoXXphmy5s27Wg3yMzM7NhUetKXdJukByVdfzhlal6LHnp706Q5GzbApElHtLlmZmaVUWrSl3QZMCQizgfGSzpjMGXqbd33dnp6UsI/66z2tNvMzKwKyr6Rrxe4K79eDUwD
nmi1jKRFwKL8ds8zz+iRCRPa0FqrNwbYebQbUXGOcfs5xu3nGJfjzMH8UtlJfwSwI7/eBTQaMqdpmYhYDiwHkLRpMOMPW2sc5/ZzjNvPMW4/x7gckjYN5vfKvqb/MjA8vz6xn/0XKWNmZmYtKjuhbiZ11wO8C9g2yDJmZmbWorK791cBGyWdAswCPiLpixFx/QBlpjapc3l7mmp9OM7t5xi3n2Pcfo5xOQYVZ0XEkW7IwDuURgMXA/dHxPODLWNmZmatKT3pm5mZ2dHhm+TMzMy6RMck/SM9kp+9XrP4STpZ0q8krZb0c0nDym5jFRQ9TiWNlfRwWe2qkhZivEzSpWW1q0oKfF6MlnSfpE2SvlN2+6oifw5sHGD7UEn3SPq9pIXN6uuIpN+OkfzsUAXjNx+4KSLeDzwPzCyzjVXQ4nH6NQ4+vmoFFY2xpOnAuIi4p9QGVkDBGF8B/DA/sz9Skp/db1G+v+37pPFr+rMY2BwR7wHmSho5UJ0dkfRpPErfYMpY/3ppEr+IWBYRa/LbNwH/LKdpldJLgeNU0nuBV0gnV9aaXprEWNJQYAWwTdKHymtaZfTS/Dh+AZgkaRTwFmB7OU2rlP3APOClAcr0cvDf4n5gwJOrTkn6fUfpGzvIMta/wvGTdD4wOiIeKqNhFdM0zvmyyVJgSYntqpIix/LHgMeArwDnSlpcUtuqokiMHwBOBz4LPJ7LWQsi4qWIeLFJsZZyX6ckfY/k136F4ifpjcA3gabXjqyhInFeAiyLiN2ltapaisT4bGB5fiT4DmBGSW2riiIxvgG4KiJuBP4CXFlS27pNS7mvUxKjR/Jrv6bxy99AfwpcGxFPlde0SilynL4P+Iyk9cAUSd8tp2mVUSTGTwLj8+t3Az6eW1MkxqOByZKGAOcBfj68PVrKfR3xnL6kk4CNwG/JI/kBH64fya9BmakFukUsKxjjTwNfArbkVd+OiJ+U3dZOViTOfcqvj4je8lrY+QoeyyOB20ldoUOBuRGxo0F11kDBGJ8LfI/Uxf8gMCciXj4Kze14tc+BfK/POyLilrptpwP3AWuBC0i5b3+/dXVC0geP5FcGx68cjnP7Ocbt5xgfO/Kw9dOA3zT7stsxSd/MzMwOT6dc0zczM7PD5KRvZmbWJZz0zbqEpNmSLuhnW0+ZwypLmp/HezCzEjnpm3WPpcAcSQ9JirplCfB14F+SdkvaJ+lTAJKuzu935+WApC9IuqJPHfXLitoO87jgwyX1/axZCFxXv0LSkFz2+DbHwaxrOembdQFJpwGTScl9DzAjIkQa13tPRFwVESdHxCjSSGq1R6v+AzyQ148nDQt6R17/VESofqnVV7frecBzwA5J2/KyAziT9Ax3bd024GngWeBz7YyFWTc77mg3wMxKsYA0KccOSUUe2dnf5yfAbODhiHhC0sQBfndf3es7889fR8ROAEl3AqNyfVOAP0bEgTxpywdIIz6aWRv4m75ZxUk6DvgEh34DX5eT/wKgR9JaSf+WtJv0vG9tVq/6E4R5wN0t7n44MBX4q6SLJa0ExpGGaL0O2ADMylOCbgLeDPS0uA8zK8jf9M2q70rSmNz1ZkTE+tobSTcDB0hd/a/rCchzLlwE3NjKjvMIbFdLWg7sBS4nzRi2hnRCMCci7pV0NrAgIla1Ur+ZtcZJ36zC8rX8LwPLgOkNtvcAJwGXkXoCQhLApoh4pFYuInZJuhe4FvhgXn16P5cKbm6w7pW8TAaOB9YBHwVOlXQOsBO4SJIanXSY2ZHh7n2zanuW1JW+uc/6Wvf+a6S5zr8KzCddZ78JOKNBXQuB6ZJqJw/93ch3CEmzgD+QegpGAL/Idd0NXEIaA/+tpBOKlZJOGPyfa2YDcdI3q7CI2Fc/OUed2t37w4E/kSbsuCUiZpO64Tc1qOsF4JfApUX3L+k64Aek+esfJ028MhL4BvAiafa1icA5pNnu3km6d8DM2sDd+2ZdSqkfX/ntWmCmpD8Dr0bE9gbl
hwKnkK7JF6n/DcCPgbsi4sm87jzS1J+3AX+LiGvybGzbI+I5SVNIlxiGDDRTmJkNjpO+WXdaV/f6bcDPgM+T7tb/Vt/CkuYCK0jX428AxhTYx2TSZYW9kvb22TYCOCDp43X7gNT7OIzU7b+h2J9iZkU56Zt1pxkRsT5/e98XESFpNela+7gG5dcBnwR+l2/qmw2clh/xq3cC6aZBImIL/XzGSFoFbIuIa47Q32NmBfiavll3GMLB/+9Daysj4r/AiZKWAjOBLcDtksbmIsrlXoiIlRGxq66+pyNiVP0C/ChvM7NjkJO+WXfo4eCgN/8f2z5PwLMDmEC6mW4aadjcxySdWl+2QX2NvEp63r+Z+pMQMyuJ/EisWXeTNDYi/tFn3ZiI2ClpMXB5RPT22T4M6ImIQjf1NdjnGuDvEbFosO02s9Y56ZuZmXUJd6+ZmZl1CSd9MzOzLuGkb2Zm1iWc9M3MzLqEk76ZmVmXcNI3MzPrEv8DDZyCi7D0kN0AAAAASUVORK5CYII=\n",
836 | "text/plain": [
837 | ""
838 | ]
839 | },
840 | "metadata": {},
841 | "output_type": "display_data"
842 | },
843 | {
844 | "name": "stdout",
845 | "output_type": "stream",
846 | "text": [
847 | "ROC曲线的AUC为: 0.8953941870182941\n"
848 | ]
849 | }
850 | ],
851 | "source": [
852 | "plotROC(aggClassEst,mat(labelArr).T)"
853 | ]
854 | },
855 | {
856 | "cell_type": "markdown",
857 | "metadata": {},
858 | "source": [
859 | "### sklearn实现AdaBoost,用于疝病马数据集"
860 | ]
861 | },
862 | {
863 | "cell_type": "code",
864 | "execution_count": 1,
865 | "metadata": {},
866 | "outputs": [],
867 | "source": [
868 | "from sklearn.ensemble import AdaBoostClassifier"
869 | ]
870 | },
871 | {
872 | "cell_type": "code",
873 | "execution_count": 27,
874 | "metadata": {},
875 | "outputs": [],
876 | "source": [
877 | "dataArr,labelArr=loadDataSet(r'D:\\DM\\python\\data\\MLiA_SourceCode\\machinelearninginaction\\Ch07\\horseColicTraining2.txt')\n",
878 | "dataTest,labelTest=loadDataSet(r'D:\\DM\\python\\data\\MLiA_SourceCode\\machinelearninginaction\\Ch07\\horseColicTest2.txt')\n",
879 | "abc=AdaBoostClassifier(n_estimators=50)"
880 | ]
881 | },
882 | {
883 | "cell_type": "code",
884 | "execution_count": 28,
885 | "metadata": {},
886 | "outputs": [
887 | {
888 | "data": {
889 | "text/plain": [
890 | "0.7761194029850746"
891 | ]
892 | },
893 | "execution_count": 28,
894 | "metadata": {},
895 | "output_type": "execute_result"
896 | }
897 | ],
898 | "source": [
899 | "#训练弱分类器\n",
900 | "abc.fit(dataArr,labelArr)\n",
901 | "Y_pred=abc.predict(dataTest)\n",
902 | "abc.score(dataTest,labelTest)"
903 | ]
904 | },
905 | {
906 | "cell_type": "code",
907 | "execution_count": 33,
908 | "metadata": {},
909 | "outputs": [
910 | {
911 | "name": "stdout",
912 | "output_type": "stream",
913 | "text": [
914 | "[-1. 1.]\n",
915 | "[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.\n",
916 | " 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.\n",
917 | " 1. 1.]\n",
918 | "[0.28428094 0.33214454 0.39257738 0.40697642 0.41930599 0.40339763\n",
919 | " 0.4357326 0.42456092 0.44568626 0.42683673 0.45293997 0.45518735\n",
920 | " 0.4225583 0.43627423 0.43044037 0.42723727 0.4744538 0.431832\n",
921 | " 0.44713836 0.44962627 0.46277843 0.43343921 0.43638023 0.47462149\n",
922 | " 0.45972328 0.46639559 0.43662019 0.42964436 0.43240759 0.44019402\n",
923 | " 0.48131686 0.48714723 0.47491754 0.45261708 0.45541079 0.45172395\n",
924 | " 0.44629907 0.4741625 0.47014168 0.45891498 0.44262397 0.48395967\n",
925 | " 0.4574864 0.45985565 0.45594619 0.45687237 0.45398568 0.46546841\n",
926 | " 0.47404671 0.45379753]\n"
927 | ]
928 | }
929 | ],
930 | "source": [
931 | "#返回模型属性\n",
932 | "print (abc.classes_) #返回类别标签\n",
933 | "print (abc.estimator_weights_ ) #返回训练得到的各分类器权重 alpham\n",
933 |     "print (abc.estimator_weights_ ) #返回训练得到的各分类器权重 alpha_m\n",
935 | "#print (abc.estimators_) #返回训练得到的弱分类器,列表格式"
936 | ]
937 | },
938 | {
939 | "cell_type": "code",
940 | "execution_count": null,
941 | "metadata": {},
942 | "outputs": [],
943 | "source": []
944 | }
945 | ],
946 | "metadata": {
947 | "kernelspec": {
948 | "display_name": "Python 3",
949 | "language": "python",
950 | "name": "python3"
951 | },
952 | "language_info": {
953 | "codemirror_mode": {
954 | "name": "ipython",
955 | "version": 3
956 | },
957 | "file_extension": ".py",
958 | "mimetype": "text/x-python",
959 | "name": "python",
960 | "nbconvert_exporter": "python",
961 | "pygments_lexer": "ipython3",
962 | "version": "3.6.4"
963 | }
964 | },
965 | "nbformat": 4,
966 | "nbformat_minor": 2
967 | }
968 |
--------------------------------------------------------------------------------