├── Adaboost └── 【HP20190519】Adaboost算法代码学习.md ├── CART └── 【HP20190525】CART算法代码实现.md ├── GBDT ├── GBDT_XGBoost_LGBM算法原理v1.1.pdf ├── GBDT_python3_code │ ├── GBDT_model.md │ ├── TempLinkoping2016.txt │ ├── readme.md │ └── utils │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── bools.cpython-37.pyc │ │ ├── data_manipulation.cpython-37.pyc │ │ ├── data_operation.cpython-37.pyc │ │ ├── dates.cpython-37.pyc │ │ ├── enum.cpython-37.pyc │ │ ├── lists.cpython-37.pyc │ │ ├── loss_functions.cpython-37.pyc │ │ ├── math.cpython-37.pyc │ │ ├── misc.cpython-37.pyc │ │ └── objects.cpython-37.pyc │ │ ├── bools.py │ │ ├── data_manipulation.py │ │ ├── data_operation.py │ │ ├── dates.py │ │ ├── decision_tree │ │ ├── __pycache__ │ │ │ ├── decision_tree_model.cpython-35.pyc │ │ │ └── decision_tree_model.cpython-37.pyc │ │ ├── decision_tree_classifier_example.py │ │ ├── decision_tree_model.py │ │ └── decision_tree_regressor_example.py │ │ ├── dicts │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── chained_dict.cpython-37.pyc │ │ │ ├── helpers.cpython-37.pyc │ │ │ └── limited_dict.cpython-37.pyc │ │ ├── chained_dict.py │ │ ├── helpers.py │ │ └── limited_dict.py │ │ ├── enum.py │ │ ├── lists.py │ │ ├── loss_functions.py │ │ ├── math.py │ │ ├── misc.py │ │ └── objects.py ├── readme.md └── 【HP20190706】《统计学习方法》第一版例题8.2代码实现.md ├── README.md ├── Stacking ├── 20190630_titanic_Stacking.ipynb ├── Stacking_learn_beta.ipynb ├── Stacking_learn_beta.md ├── changelog.md ├── kaggle_titanic_data │ ├── gender_submission.csv │ ├── test.csv │ └── train.csv └── 两层stacking结构理解beta.pdf ├── Xgboost ├── XGBoost_code │ ├── XGBoost算法代码简易实现.md │ ├── output_21_4.png │ └── output_24_4.png ├── readme.md ├── 【HP20190616】xgboost_titanic.md └── 一张图说明XGBoost算法.jpg └── kaggle初学者应该如何参加机器学习比赛.ipynb /Adaboost/【HP20190519】Adaboost算法代码学习.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ```python 4 | import numpy as np 5 | import pandas as pd 6 | ``` 7 | 8 | 9 | ```python 10 | # 通过一个函数来加载特征矩阵和类别标签 11 | def loadSimpleData(): 12 | print ("-----加载数据的特征矩阵和类别标签------") 13 | datMat = np.matrix([[1,2.1],[2,1.1],[1.3,1],[1,1],[2,1]]) 14 | classLabels = [1,1,-1,-1,1] 15 | print("展示特征矩阵") 16 | print(datMat) 17 | print("类别标签") 18 | print(classLabels) 19 | return datMat, classLabels 20 | ``` 21 | 22 | 23 | ```python 24 | datMat,classLabels=loadSimpleData() 25 | ``` 26 | 27 | -----加载数据的特征矩阵和类别标签------ 28 | 展示特征矩阵 29 | [[1. 2.1] 30 | [2. 1.1] 31 | [1.3 1. ] 32 | [1. 1. ] 33 | [2. 1. 
]] 34 | 类别标签 35 | [1, 1, -1, -1, 1] 36 | 37 | 38 | 39 | ```python 40 | # 定义单层决策树的阈值过滤函数 41 | # 接着是树的分类函数。这个函数在下面的循环里要用到 42 | # 作用很简单,就是比对每一列的特征值和阈值,返回比对的结果。四个参数分别是(输入矩阵,第几列,阈值,lt小于或gt大于) 43 | def stumpClassify(dataMatrix,dimen,threshVal,threshIneq): 44 | #对数据集每一列的各个特征进行阈值过滤,这里是构建一个行数与数据特征集相等、元素全部为1的向量 45 | retArray=np.ones((np.shape(dataMatrix)[0],1)) 46 | #阈值的模式,将小于等于某一阈值的特征归类为-1,下面的逻辑判断将要对每一个元素值与阈值进行比较,如果小于等于这个值,那么就更新这个元素为-1 47 | if threshIneq=='lt': 48 | retArray[dataMatrix[:,dimen]<=threshVal]=-1.0 49 | #否则将大于某一阈值的特征归类为-1 50 | else: 51 | retArray[dataMatrix[:,dimen]>threshVal]=-1.0 52 | return retArray 53 | ``` 54 | 55 | 56 | ```python 57 | # 定义单层决策树函数,这个单层决策树函数会遍历一系列候选阈值,然后用阈值去判断当前特征列的每一个元素是否大于阈值,据此输出类别预测 58 | def buildStump(dataArr,classLabels,D): 59 | # 将数据集和标签列表转为矩阵形式,其中标签定义为转置矩阵 60 | dataMatrix=np.mat(dataArr);labelMat=np.mat(classLabels).T 61 | # 计算矩阵的行数和列数 62 | m,n=np.shape(dataMatrix) 63 | # 对关键变量进行初始化,设置步长或区间总数,定义一个字典保存最优决策树信息(迭代的时候这里就用来保存每一次的最新的预测结果) 64 | # 初始化生成一个m行1列的全0元素的矩阵,这里是为后期最优单层决策树预测结果生成初始化值 65 | numSteps=10.0;bestStump={};bestClasEst=np.mat(np.zeros((m,1))) 66 | # 最小错误率初始化为+∞,inf是指正无穷的意思,其中错误率应该是一个正实数 67 | minError=np.inf 68 | # 遍历每一列的特征值,这里的n是原始特征矩阵的列数,机器学习实战中的n=2,那么就是分别遍历两列的特征值 69 | for i in range(n): 70 | #找出并定义每列中特征值的最小值和最大值 71 | rangeMin=dataMatrix[:,i].min();rangeMax=dataMatrix[:,i].max() 72 | #求取步长大小或者说区间间隔,这里的步长设置其实只是其中一种定义步长的算法 73 | stepSize=(rangeMax-rangeMin)/numSteps 74 | #遍历各个步长区间,这里实际上是遍历[-1,10]这个闭区间所有的整数 75 | for j in range(-1,int(numSteps)+1): 76 | #两种阈值过滤模式 77 | for inequal in ['lt','gt']: 78 | #阈值计算公式:最小值+j(-1<=j<=numSteps)*步长 79 | # 以机器学习实战的第一列为准,rangeMin=1,j=-1,stepSize=(2-1)/10=0.1,那么阈值threshVal=[1+(-1)*0.1]=0.9 80 | # 阈值从略低于最小值扫到略高于最大值,这样两端就能覆盖把全部样本判为同一类的极端切分,从而不会漏掉任何候选划分 81 | threshVal=(rangeMin+float(j)*stepSize) 82 | #选定阈值后,调用阈值过滤函数分类预测,\表示续行符,如果取消,就需要将下一行的内容接回当前行,否则会报错 83 | # stumpClassify(矩阵,第1列,0.9的阈值,lt),初次运行predictedVals的结果为array([[1.],[1.],[1.],[1.],[1.]]) 84 | predictedVals=\ 85 | stumpClassify(dataMatrix,i,threshVal,inequal) 86 | # 既然predictedVals已经预测出来,那么就需要去比较预测的结果和真实结果的值的错误率,那么初始化错误向量 87 | # 这里初始化出来的向量是一个元素都为1的5行1列的矩阵,用来对错误的情况进行比对 88 | errArr=np.mat(np.ones((m,1))) 89 | #将错误向量中分类正确项置0 90 | errArr[predictedVals==labelMat]=0 91 | #计算加权的错误率,这里用到了矩阵乘法,矩阵与矩阵相乘,还是一个矩阵,但是一个行向量和一个列向量相乘,是一个数 92 | weightedError=D.T*errArr 93 | #打印相关信息,可省略 94 | print("当前遍历的列数为 %d, 阈值 %.2f, 当前大于小于类型: %s, 加权错误率= %.3f" %\ 95 | (i, threshVal, inequal, weightedError)) 96 | #如果当前错误率小于当前最小错误率,将当前错误率作为最小错误率 97 | #存储相关信息 98 | if weightedError < minError: 99 | minError = weightedError 100 | bestClasEst = predictedVals.copy() 101 | bestStump['dim'] = i 102 | bestStump['thresh'] = threshVal 103 | bestStump['ineq'] = inequal 104 | # 返回保存最优单层决策树信息的字典、最小加权错误率和最优的类别预测向量 105 | return bestStump,minError,bestClasEst 106 | ``` -------------------------------------------------------------------------------- /CART/【HP20190525】CART算法代码实现.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ```python 4 | import numpy as np 5 | ``` 6 | 7 | 8 | 9 | ```python 10 | # 加载数据集,逐行读取文本文件,将每一行按制表符切分后映射为浮点数列表 11 | def loaddataset(filename): 12 | datamat = [] 13 | fr = open(filename) 14 | for line in fr.readlines(): 15 | curline = line.strip().split('\t') 16 | # python3中map返回的是迭代器,需要用list转换为列表 17 | fltline = list(map(float, curline)) 18 | datamat.append(fltline) 19 | return datamat 20 | ``` 21 | 22 | 23 | ```python 24 | # 定义二元切分函数,通过数组过滤的方式将数据集合切分为两个子集并返回 25 | # 三个参数分别是(数据集,待切分的特征索引,该特征的某个取值) 26 | # np.nonzero返回满足条件的样本的行索引,据此对矩阵按行切片 27 | # 大于切分值的样本划入mat0,小于等于切分值的样本划入mat1 28 | # 例如对4阶单位阵按第1列、阈值0.5切分,只有第1行(从0计数)会进入mat0 29 | 30 | 31 | 32 | 33 | 34 | 35 | def binsplitdataset(dataset, feature, value): 36 | mat0 = dataset[np.nonzero(dataset[:, feature] > value)[0],:] 37 | mat1 = dataset[np.nonzero(dataset[:, feature] <= value)[0],:] 38 | return mat0,mat1 39 | ``` 40 | 41 | 42 | ```python 43 | # np.eye生成对角矩阵 44 | testmat = np.mat(np.eye(4)) 45 | ``` 46 | 47 | 48 | ```python 49 | testmat 50 | ``` 51 | 52 | 53 | 54 | 55 | matrix([[1., 0., 0., 0.], 56 | [0., 1., 0., 0.], 57 | [0., 0., 1., 0.], 58 | [0., 0., 0., 1.]]) 59 | 60 | 61 | 62 | 63 | ```python 64 | mat0, mat1 = binsplitdataset(testmat,1,0.5) 65 | ``` 66 | 67 | 68 | ```python 69 | print('---展示按照某个特征的大于阈值部分切片的矩阵第一部分---') 70 | print(mat0) 71 | print('---展示按照某个特征的小于等于阈值部分切片的矩阵第二部分---') 72 | print(mat1) 73 | ``` 74 | 75 | ---展示按照某个特征的大于阈值部分切片的矩阵第一部分--- 76 | [[0. 1. 0. 0.]] 77 | ---展示按照某个特征的小于等于阈值部分切片的矩阵第二部分--- 78 | [[1. 0. 0. 0.] 79 | [0. 0. 1. 0.] 80 | [0. 0. 0. 1.]]
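这里补充一个小例子(演示数据为笔者自拟,仅作说明),展示 binsplitdataset 内部依赖的 np.nonzero 布尔行索引是如何工作的:

```python
# 布尔条件得到与行数等长的真值结果,np.nonzero取出条件为True的行索引
demo = np.mat([[1.0, 0.2],
               [2.0, 0.8],
               [3.0, 0.5]])
rows = np.nonzero(demo[:, 1] > 0.4)[0]
print(rows)           # [1 2],第2列大于0.4的是第1、2行(从0计数)
print(demo[rows, :])  # 按行索引切片,取出这两行样本
```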
81 | 82 | 83 | 84 | ```python 85 | # 对数据集取出目标变量列,同时对目标变量求均值,比如上述案例的对角矩阵中,目标变量是最后一列,求均值等于0.25 86 | def regleaf(dataset): 87 | return np.mean(dataset[:,-1]) 88 | ``` 89 | 90 | 91 | ```python 92 | def regerr(dataset): 93 | # np.var函数是求方差,方差是指所有数与数组均值之差的平方和的均值,方差表达的是数组内的数的差异度 94 | # 这里表达的是目标变量的方差乘以数据集的样本个数(即行数),返回的是总方差 95 | # 实际含义为计算目标变量的平方误差 96 | return np.var(dataset[:,-1]) * np.shape(dataset)[0] 97 | ``` 98 | 99 | 100 | ```python 101 | # 定义函数来寻找数据集的最佳二元切分方式,其中leaftype给出叶节点取值函数、errtype给出误差计算函数,并用元组参数ops控制函数停止的时机 102 | def chooseBestSplit(dataset, leaftype=regleaf, errtype=regerr, ops=(1,4)): 103 | # 使用两个变量来作为控制阈值,tolS是允许的误差下降值,tolN是切分的最少样本数 104 | tolS=ops[0]; tolN=ops[1] 105 | # 先将目标变量转置为一个列表,然后选取这个列表中的不重复的元素组合成为一个集合,然后计算这个集合的元素数量并判断是否唯一 106 | if len(set(dataset[:,-1].T.tolist()[0])) == 1: 107 | # 在Python中,有一个特殊的表示,None,它就是空 108 | # 如果所有目标变量的值都相等,那就不用预测了,退出,那么返回空值,并直接计算目标变量的均值 109 | return None, leaftype(dataset) 110 | # 计算数据集的行数和列数,行数代表数据集大小,列数代表特征值数量和目标变量的和 111 | m,n = np.shape(dataset) 112 | # 计算数据的平方误差 113 | S = errtype(dataset) 114 | # 初始化:bestS定义为正无穷,最佳切分特征索引和最佳切分值定义为0 115 | bestS = np.inf; bestIndex = 0; bestValue = 0 116 | # 遍历所有特征,这里n-1就是所有的特征数量 117 | # 对每一个特征 118 | for featIndex in range(n-1): 119 | # 对每一个特征里面的特征值 120 | for splitVal in set((dataset[:,featIndex].T.A.tolist())[0]): 121 | # 通过数据集的特征和特征值来对数据进行切分 122 | mat0,mat1 = binsplitdataset(dataset, featIndex, splitVal) 123 | # continue 语句用来告诉Python跳过当前循环的剩余语句,然后继续进行下一轮循环 124 | if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):continue 125 | # 计算切分的两个单元分别的平方误差和 126 | newS=errtype(mat0) + errtype(mat1) 127 | # 更新平方误差值,并更新前期初始化的值为新的特征和新的特征值 128 | if newS < bestS: 129 | bestIndex = featIndex 130 | bestValue = splitVal 131 | bestS = newS 132 | # 假如数据的误差减小量低于阈值,则退出,并计算数据集的目标变量的均值 133 | if (S-bestS) < tolS: 134 | return None, leaftype(dataset) 135 | # 通过数据集的当前的特征和特征值来对数据进行切分 136 | mat0, mat1 = binsplitdataset(dataset, bestIndex, bestValue) 137 | # 如果切分出的数据集很小且低于阈值,则退出,并计算数据集的目标变量的均值 138 | if (np.shape(mat0)[0]< tolN) or (np.shape(mat1)[0]< tolN): 139 | return None, leaftype(dataset) 140 | # 返回最佳的切分特征和最佳的切分特征值 141 | return bestIndex,bestValue 142 | ``` 143 | 144 | 145 | ```python 146 | # 生成最终的回归树,树的后三个参数决定了树的类型,分别为给出建立叶节点的函数、计算误差的函数、设置树生成的一个控制阈值 147 | def createtree(dataset, leaftype=regleaf, errtype=regerr, ops=(1,4)): 148 | # 返回最佳的切分特征和最佳的切分特征值 149 | feat, val = chooseBestSplit(dataset,leaftype, errtype, ops) 150 | if feat is None:return val 151 | rettree = {} 152 | rettree['spind'] = feat 153 | rettree['spval'] = val 154 | lset, rset = binsplitdataset(dataset, feat, val) 155 | # 这里是递归调用本函数 156 | rettree['left'] = createtree(lset, leaftype, errtype, ops) 157 | rettree['right'] = createtree(rset, leaftype, errtype, ops) 158 | return rettree 159 | ``` 160 | 161 | 162 | ```python 163 | myDat = loaddataset(r'D:\jupyter_notebook\machinelearninginaction\Ch09\ex00.txt') 164 | ``` 165 | 166 | 167 | ```python 168 | myMat = np.mat(myDat) 169 | myMat 170 | ``` 171 | 172 | 173 | 174 | 175 | matrix([[ 3.609800e-02, 1.550960e-01], 176 | [ 9.933490e-01, 1.077553e+00], 177 | [ 5.308970e-01, 8.934620e-01], 178 | [ 7.123860e-01, 5.648580e-01], 179 | [ 3.435540e-01, -3.717000e-01], 180 | [ 9.801600e-02, -3.327600e-01], 181 | [ 6.911150e-01, 8.343910e-01], 182 | [ 9.135800e-02, 9.993500e-02], 183 | [ 7.270980e-01, 1.000567e+00], 184 | [ 9.519490e-01, 9.452550e-01], 185 | [ 7.685960e-01, 7.602190e-01], 186 | [ 5.413140e-01, 8.937480e-01], 187 | [ 1.463660e-01, 3.428300e-02], 188 | [ 6.731950e-01, 9.150770e-01], 189 | [ 1.835100e-01, 1.848430e-01], 190 | [ 3.395630e-01, 
2.067830e-01], 191 | [ 5.179210e-01, 1.493586e+00], 192 | [ 7.037550e-01, 1.101678e+00], 193 | [ 8.307000e-03, 6.997600e-02], 194 | [ 2.439090e-01, -2.946700e-02], 195 | [ 3.069640e-01, -1.773210e-01], 196 | [ 3.649200e-02, 4.081550e-01], 197 | [ 2.955110e-01, 2.882000e-03], 198 | [ 8.375220e-01, 1.229373e+00], 199 | [ 2.020540e-01, -8.774400e-02], 200 | [ 9.193840e-01, 1.029889e+00], 201 | [ 3.772010e-01, -2.435500e-01], 202 | [ 8.148250e-01, 1.095206e+00], 203 | [ 6.112700e-01, 9.820360e-01], 204 | [ 7.224300e-02, -4.209830e-01], 205 | [ 4.102300e-01, 3.317220e-01], 206 | [ 8.690770e-01, 1.114825e+00], 207 | [ 6.205990e-01, 1.334421e+00], 208 | [ 1.011490e-01, 6.883400e-02], 209 | [ 8.208020e-01, 1.325907e+00], 210 | [ 5.200440e-01, 9.619830e-01], 211 | [ 4.881300e-01, -9.779100e-02], 212 | [ 8.198230e-01, 8.352640e-01], 213 | [ 9.750220e-01, 6.735790e-01], 214 | [ 9.531120e-01, 1.064690e+00], 215 | [ 4.759760e-01, -1.637070e-01], 216 | [ 2.731470e-01, -4.552190e-01], 217 | [ 8.045860e-01, 9.240330e-01], 218 | [ 7.479500e-02, -3.496920e-01], 219 | [ 6.253360e-01, 6.236960e-01], 220 | [ 6.562180e-01, 9.585060e-01], 221 | [ 8.340780e-01, 1.010580e+00], 222 | [ 7.819300e-01, 1.074488e+00], 223 | [ 9.849000e-03, 5.659400e-02], 224 | [ 3.022170e-01, -1.486500e-01], 225 | [ 6.782870e-01, 9.077270e-01], 226 | [ 1.805060e-01, 1.036760e-01], 227 | [ 1.936410e-01, -3.275890e-01], 228 | [ 3.434790e-01, 1.752640e-01], 229 | [ 1.458090e-01, 1.369790e-01], 230 | [ 9.967570e-01, 1.035533e+00], 231 | [ 5.902100e-01, 1.336661e+00], 232 | [ 2.380700e-01, -3.584590e-01], 233 | [ 5.613620e-01, 1.070529e+00], 234 | [ 3.775970e-01, 8.850500e-02], 235 | [ 9.914200e-02, 2.528000e-02], 236 | [ 5.395580e-01, 1.053846e+00], 237 | [ 7.902400e-01, 5.332140e-01], 238 | [ 2.422040e-01, 2.093590e-01], 239 | [ 1.523240e-01, 1.328580e-01], 240 | [ 2.526490e-01, -5.561300e-02], 241 | [ 8.959300e-01, 1.077275e+00], 242 | [ 1.333000e-01, -2.231430e-01], 243 | [ 5.597630e-01, 1.253151e+00], 244 | [ 6.436650e-01, 1.024241e+00], 245 | [ 8.772410e-01, 7.970050e-01], 246 | [ 6.137650e-01, 1.621091e+00], 247 | [ 6.457620e-01, 1.026886e+00], 248 | [ 6.513760e-01, 1.315384e+00], 249 | [ 6.977180e-01, 1.212434e+00], 250 | [ 7.425270e-01, 1.087056e+00], 251 | [ 9.010560e-01, 1.055900e+00], 252 | [ 3.623140e-01, -5.564640e-01], 253 | [ 9.482680e-01, 6.318620e-01], 254 | [ 2.340000e-04, 6.090300e-02], 255 | [ 7.500780e-01, 9.062910e-01], 256 | [ 3.254120e-01, -2.192450e-01], 257 | [ 7.268280e-01, 1.017112e+00], 258 | [ 3.480130e-01, 4.893900e-02], 259 | [ 4.581210e-01, -6.145600e-02], 260 | [ 2.807380e-01, -2.288800e-01], 261 | [ 5.677040e-01, 9.690580e-01], 262 | [ 7.509180e-01, 7.481040e-01], 263 | [ 5.758050e-01, 8.990900e-01], 264 | [ 5.079400e-01, 1.107265e+00], 265 | [ 7.176900e-02, -1.109460e-01], 266 | [ 5.535200e-01, 1.391273e+00], 267 | [ 4.011520e-01, -1.216400e-01], 268 | [ 4.066490e-01, -3.663170e-01], 269 | [ 6.521210e-01, 1.004346e+00], 270 | [ 3.478370e-01, -1.534050e-01], 271 | [ 8.193100e-02, -2.697560e-01], 272 | [ 8.216480e-01, 1.280895e+00], 273 | [ 4.801400e-02, 6.449600e-02], 274 | [ 1.309620e-01, 1.842410e-01], 275 | [ 7.734220e-01, 1.125943e+00], 276 | [ 7.896250e-01, 5.526140e-01], 277 | [ 9.699400e-02, 2.271670e-01], 278 | [ 6.257910e-01, 1.244731e+00], 279 | [ 5.895750e-01, 1.185812e+00], 280 | [ 3.231810e-01, 1.808110e-01], 281 | [ 8.224430e-01, 1.086648e+00], 282 | [ 3.603230e-01, -2.048300e-01], 283 | [ 9.501530e-01, 1.022906e+00], 284 | [ 5.275050e-01, 8.795600e-01], 285 | [ 8.600490e-01, 7.174900e-01], 
286 | [ 7.044000e-03, 9.415000e-02], 287 | [ 4.383670e-01, 3.401400e-02], 288 | [ 5.745730e-01, 1.066130e+00], 289 | [ 5.366890e-01, 8.672840e-01], 290 | [ 7.821670e-01, 8.860490e-01], 291 | [ 9.898880e-01, 7.442070e-01], 292 | [ 7.614740e-01, 1.058262e+00], 293 | [ 9.854250e-01, 1.227946e+00], 294 | [ 1.325430e-01, -3.293720e-01], 295 | [ 3.469860e-01, -1.503890e-01], 296 | [ 7.687840e-01, 8.997050e-01], 297 | [ 8.489210e-01, 1.170959e+00], 298 | [ 4.492800e-01, 6.909800e-02], 299 | [ 6.617200e-02, 5.243900e-02], 300 | [ 8.137190e-01, 7.066010e-01], 301 | [ 6.619230e-01, 7.670400e-01], 302 | [ 5.294910e-01, 1.022206e+00], 303 | [ 8.464550e-01, 7.200300e-01], 304 | [ 4.486560e-01, 2.697400e-02], 305 | [ 7.950720e-01, 9.657210e-01], 306 | [ 1.181560e-01, -7.740900e-02], 307 | [ 8.424800e-02, -1.954700e-02], 308 | [ 8.458150e-01, 9.526170e-01], 309 | [ 5.769460e-01, 1.234129e+00], 310 | [ 7.720830e-01, 1.299018e+00], 311 | [ 6.966480e-01, 8.454230e-01], 312 | [ 5.950120e-01, 1.213435e+00], 313 | [ 6.486750e-01, 1.287407e+00], 314 | [ 8.970940e-01, 1.240209e+00], 315 | [ 5.529900e-01, 1.036158e+00], 316 | [ 3.329820e-01, 2.100840e-01], 317 | [ 6.561500e-02, -3.069700e-01], 318 | [ 2.786610e-01, 2.536280e-01], 319 | [ 7.731680e-01, 1.140917e+00], 320 | [ 2.036930e-01, -6.403600e-02], 321 | [ 3.556880e-01, -1.193990e-01], 322 | [ 9.888520e-01, 1.069062e+00], 323 | [ 5.187350e-01, 1.037179e+00], 324 | [ 5.145630e-01, 1.156648e+00], 325 | [ 9.764140e-01, 8.629110e-01], 326 | [ 9.190740e-01, 1.123413e+00], 327 | [ 6.977770e-01, 8.278050e-01], 328 | [ 9.280970e-01, 8.832250e-01], 329 | [ 9.002720e-01, 9.968710e-01], 330 | [ 3.441020e-01, -6.153900e-02], 331 | [ 1.480490e-01, 2.042980e-01], 332 | [ 1.300520e-01, -2.616700e-02], 333 | [ 3.020010e-01, 3.171350e-01], 334 | [ 3.371000e-01, 2.633200e-02], 335 | [ 3.149240e-01, -1.952000e-03], 336 | [ 2.696810e-01, -1.659710e-01], 337 | [ 1.960050e-01, -4.884700e-02], 338 | [ 1.290610e-01, 3.051070e-01], 339 | [ 9.367830e-01, 1.026258e+00], 340 | [ 3.055400e-01, -1.159910e-01], 341 | [ 6.839210e-01, 1.414382e+00], 342 | [ 6.223980e-01, 7.663300e-01], 343 | [ 9.025320e-01, 8.616010e-01], 344 | [ 7.125030e-01, 9.334900e-01], 345 | [ 5.900620e-01, 7.055310e-01], 346 | [ 7.231200e-01, 1.307248e+00], 347 | [ 1.882180e-01, 1.136850e-01], 348 | [ 6.436010e-01, 7.825520e-01], 349 | [ 5.202070e-01, 1.209557e+00], 350 | [ 2.331150e-01, -3.481470e-01], 351 | [ 4.656250e-01, -1.529400e-01], 352 | [ 8.845120e-01, 1.117833e+00], 353 | [ 6.632000e-01, 7.016340e-01], 354 | [ 2.688570e-01, 7.344700e-02], 355 | [ 7.292340e-01, 9.319560e-01], 356 | [ 4.296640e-01, -1.886590e-01], 357 | [ 7.371890e-01, 1.200781e+00], 358 | [ 3.785950e-01, -2.960940e-01], 359 | [ 9.301730e-01, 1.035645e+00], 360 | [ 7.743010e-01, 8.367630e-01], 361 | [ 2.739400e-01, -8.571300e-02], 362 | [ 8.244420e-01, 1.082153e+00], 363 | [ 6.260110e-01, 8.405440e-01], 364 | [ 6.793900e-01, 1.307217e+00], 365 | [ 5.782520e-01, 9.218850e-01], 366 | [ 7.855410e-01, 1.165296e+00], 367 | [ 5.974090e-01, 9.747700e-01], 368 | [ 1.408300e-02, -1.325250e-01], 369 | [ 6.638700e-01, 1.187129e+00], 370 | [ 5.523810e-01, 1.369630e+00], 371 | [ 6.838860e-01, 9.999850e-01], 372 | [ 2.103340e-01, -6.899000e-03], 373 | [ 6.045290e-01, 1.212685e+00], 374 | [ 2.507440e-01, 4.629700e-02]]) 375 | 376 | 377 | 378 | 379 | ```python 380 | createtree(myMat) 381 | ``` 382 | 383 | 384 | 385 | 386 | {'spind': 0, 387 | 'spval': 0.48813, 388 | 'left': 1.0180967672413792, 389 | 'right': -0.04465028571428572} 390 | 391 | 392 | 393 | 394 
| ```python 395 | import matplotlib.pyplot as plt 396 | myDat=loaddataset(r'D:\jupyter_notebook\machinelearninginaction\Ch09\ex00.txt') 397 | myMat=np.mat(myDat) 398 | createtree(myMat) 399 | plt.plot(myMat[:,0],myMat[:,1],'ro') 400 | plt.show() 401 | ``` 402 | 403 | 404 | 405 | 406 | 407 | 408 | ```python 409 | myDat1 = np.mat(loaddataset(r'D:\jupyter_notebook\machinelearninginaction\Ch09\ex0.txt')) 410 | createtree(myDat1) 411 | ``` 412 | 413 | 414 | 415 | 416 | {'spind': 1, 417 | 'spval': 0.39435, 418 | 'left': {'spind': 1, 419 | 'spval': 0.582002, 420 | 'left': {'spind': 1, 421 | 'spval': 0.797583, 422 | 'left': 3.9871632, 423 | 'right': 2.9836209534883724}, 424 | 'right': 1.980035071428571}, 425 | 'right': {'spind': 1, 426 | 'spval': 0.197834, 427 | 'left': 1.0289583666666666, 428 | 'right': -0.023838155555555553}} 429 | 430 | 431 | 432 | 433 | ```python 434 | import matplotlib.pyplot as plt 435 | myDat1 = np.mat(loaddataset(r'D:\jupyter_notebook\machinelearninginaction\Ch09\ex0.txt')) 436 | myMat1=np.mat(myDat1) 437 | createtree(myMat1) 438 | plt.plot(myMat1[:,1],myMat1[:,2],'ro') 439 | plt.show() 440 | 441 | ``` 442 | 443 | 444 | 445 | ```python 446 | 447 | ``` 448 | 
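作为补充,下面给出一个由笔者自拟的简单预测函数草稿,演示如何用 createtree 返回的嵌套字典(spind为切分特征索引,spval为切分值)对单个样本递归地做预测;函数名 treeforecast 仅作示意,并非原书代码:

```python
# 自拟的示意函数:利用createtree生成的嵌套字典对单个样本递归预测
def treeforecast(tree, inrow):
    # 叶节点存的是目标变量的均值(浮点数)而非字典,直接返回
    if not isinstance(tree, dict):
        return float(tree)
    # 大于切分值走左子树,否则走右子树,与binsplitdataset的切分方向保持一致
    if inrow[tree['spind']] > tree['spval']:
        return treeforecast(tree['left'], inrow)
    return treeforecast(tree['right'], inrow)

# 用法示意:对ex00数据训练出的树,x=0.6大于切分值0.48813,落入左叶子,预测值约为1.018
# treeforecast(createtree(myMat), [0.6])
```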
-------------------------------------------------------------------------------- /GBDT/GBDT_XGBoost_LGBM算法原理v1.1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_XGBoost_LGBM算法原理v1.1.pdf -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/GBDT_model.md: -------------------------------------------------------------------------------- 1 | #### 基于面向对象思路实现的代码,若无基础需先补充面向对象知识 2 | 3 | 推荐面向对象知识补充路径: 4 | 5 | https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528 6 | 7 | 8 | ```python 9 | 10 | # __future__模块,把下一个新版本的特性导入到当前版本,于是我们就可以在当前版本中测试一些新版本的特性,解决python2中运行python3代码的兼容性问题 11 | # 如果某个版本中出现了某个新的功能特性,而且这个特性和当前版本中使用的不兼容 12 | # 也就是它在该版本中不是语言标准,那么我如果想要使用的话就需要从future模块导入 13 | # division 表示精确除法 14 | from __future__ import division, print_function 15 | import numpy as np 16 | # 显示完成的进度条;后文以progressbar.ProgressBar的形式调用,因此这里同时导入模块本身 17 | import progressbar; from progressbar import * 18 | 19 | 20 | # 这段代码主要展示progressbar进度条的作用,在于展示任务的完成进度并显示出来,与本项目无任何关系 21 | import time 22 | from progressbar import * 23 | 24 | total = 1000 25 | 26 | def dosomework(): 27 | time.sleep(0.01) 28 | 29 | progress = ProgressBar() 30 | for i in progress(range(1000)): 31 | dosomework() 32 | 33 | 34 | 35 | 36 | 37 | 38 | # 导入辅助函数,这里的辅助函数全部在模块中,如果代码报错,需要自行对辅助函数的py文件和类进行整理 39 | 40 | # 训练集和测试集划分函数、数据处理标准化函数、将整型的类别标签转为onehot编码函数 41 | from utils.data_manipulation import train_test_split, standardize, to_categorical 42 | 43 | # 导入均方误差函数、分类准确率(accuracy)评估函数 44 | from utils.data_operation import mean_squared_error, accuracy_score 45 | 46 | # GBDT需要用到决策树的回归树模块,这也是GBDT的核心基础算法之一,如果对决策树不熟悉,需要先学习决策树decision_tree库下面的代码 47 | from utils.decision_tree.decision_tree_model import RegressionTree 48 | 49 | # 导入进度条调度函数,方便展示模型训练进度和倒计时 50 | from utils.misc import bar_widgets 51 | 52 | # 导入平方损失函数、交叉熵损失函数(用于多分类损失评估)、softmax损失函数 53 | from utils.loss_functions import SquareLoss, CrossEntropy, SoftMaxLoss 54 | 55 | 56 | 57 | 58 | 59 | 60 | # 这里定义GBDT的核心算法父类,后面的分类和回归算法直接继承父类的函数方法 61 | class GBDT(object): 62 | """使用一组回归树来逐步拟合损失函数的负梯度,从而完成训练和预测。 63 | 参数: 64 | ----------- 65 | n_estimators: int 66 | 树的数量 67 | The number of classification trees that are used. 68 | learning_rate: float 69 | 梯度下降的学习率 70 | The step length that will be taken when following the negative gradient during 71 | training. 72 | min_samples_split: int 73 | 每棵子树的节点的最小数目(小于后不继续切割) 74 | The minimum number of samples needed to make a split when building a tree. 75 | min_impurity: float 76 | 每棵子树的最小纯度(小于后不继续切割) 77 | The minimum impurity required to split the tree further. 78 | max_depth: int 79 | 每棵子树的最大层数(大于后不继续切割) 80 | The maximum depth of a tree. 81 | regression: boolean 82 | 是否为回归问题 83 | True or false depending on if we're doing regression or classification. 84 | """ 85 | 86 | def __init__(self, n_estimators, learning_rate, min_samples_split, 87 | min_impurity, max_depth, regression): 88 | 89 | # self表示实例本身,在__init__方法内部,就可以把各种属性绑定到self 90 | # 由于类可以起到模板的作用,因此,可以在创建实例的时候,把一些我们认为必须绑定的属性强制填写进去。 91 | # 通过定义一个特殊的__init__方法,在创建实例的时候,把上述的属性(参数)绑定到self 92 | 93 | # 树的棵数、梯度下降学习率α、每棵子树节点最小数量、每棵子树的最小纯度、每棵子树最大深度、是否为回归问题 94 | 95 | self.n_estimators = n_estimators 96 | self.learning_rate = learning_rate 97 | self.min_samples_split = min_samples_split 98 | self.min_impurity = min_impurity 99 | self.max_depth = max_depth 100 | self.regression = regression 101 | 102 | # 进度条 progressbar 103 | self.bar = progressbar.ProgressBar(widgets=bar_widgets) 104 | 105 | # 定义损失函数为平方损失 106 | self.loss = SquareLoss() 107 | 108 | # 如果是分类问题,则使用SoftMax损失 109 | if not self.regression: 110 | self.loss = SoftMaxLoss() 111 | 112 | # 分类问题也使用回归树,利用残差去学习概率 113 | self.trees = [] 114 | 115 | # 构建每一棵回归树,并按上述定义的参数来约束每棵树的生长 116 | for i in range(self.n_estimators): 117 | self.trees.append(RegressionTree(min_samples_split=self.min_samples_split, 118 | min_impurity=self.min_impurity, 119 | max_depth=self.max_depth)) 120 | 121 | 122 | # GBDT的核心代码 123 | def fit(self, X, y): 124 | # 让第一棵树去拟合模型 125 | self.trees[0].fit(X, y) 126 | y_pred = self.trees[0].predict(X) 127 | for i in self.bar(range(1, self.n_estimators)): 128 | gradient = self.loss.gradient(y, y_pred) 129 | self.trees[i].fit(X, gradient) 130 | y_pred -= np.multiply(self.learning_rate, self.trees[i].predict(X)) 131 | 132 | # np.multiply 是逐元素乘法(哈达玛积),并非矩阵乘法,这里用学习率对每棵树的预测结果做逐元素缩放 133 | def predict(self, X): 134 | y_pred = self.trees[0].predict(X) 135 | for i in range(1, self.n_estimators): 136 | y_pred -= np.multiply(self.learning_rate, self.trees[i].predict(X)) 137 | 138 | # 判断是分类问题则转换预测值的表达方式 139 | if not self.regression: 140 | # 如果是分类问题,则转换为概率分布 141 | y_pred = np.exp(y_pred) / np.expand_dims(np.sum(np.exp(y_pred), axis=1), axis=1) 142 | # 将标签设置为最大化概率的值 143 | y_pred = np.argmax(y_pred, axis=1) 144 | return y_pred 145 | 146 | 147 | 148 | 149 | 150 | # 对np.argmax的功能做一次演示,按行搜索最大值的索引,当前行存在多个并列最大值取第一个所在位置索引 151 | import numpy as np 152 | a = np.array([[1, 5, 5, 2], 153 | [9, 6, 2, 8], 154 | [3, 7, 9, 1]]) 155 | 156 | c=np.argmax(a, axis=1) 157 | print(c) 158 | 159 | [1 0 2] 160 | 161 | 162 | # 对np.multiply功能做一次演示,它对ndarray和matrix都是对应位置的元素相乘(逐元素乘法) 163 | A = np.array([[1, 2], 164 | [3, 4]]) 165 | B = np.array([[0, 1], 166 | [2, 3]]) 167 | C = np.multiply(A, B) 168 | 169 | print(C) 170 | 171 | print(np.multiply(np.mat(A),np.mat(B))) 172 | 173 | 174 | [[ 0 2] 175 | [ 6 12]] 176 | 177 | [[ 0 2] 178 | [ 6 12]] 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 
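# 下面补充一段由笔者自拟的演示代码,手工走一遍上面fit方法中的一轮梯度提升迭代,
# 帮助理解"新的树拟合当前损失的梯度,再按学习率修正累计预测值"这件事。
# 说明:这里按平方损失直接手写梯度 -(y - y_pred) 作示意,具体实现以utils.loss_functions中的SquareLoss为准

import numpy as np

y_true = np.array([3.0, -0.5, 2.0])   # 真实值(自拟小样本)
y_pred = np.array([2.0,  0.0, 1.0])   # 假设这是第一棵树的预测
gradient = -(y_true - y_pred)         # 平方损失对预测值的梯度,即负残差

# 假设第二棵树完美拟合了gradient,则按学习率0.5反向修正预测值
y_pred -= np.multiply(0.5, gradient)
print(y_pred)                         # [ 2.5  -0.25  1.5 ],每一轮都在向y_true靠近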
187 | # GBDT回归算法 188 | class GBDTRegressor(GBDT): 189 | def __init__(self, n_estimators=200, learning_rate=0.5, min_samples_split=2, 190 | min_var_red=1e-7, max_depth=4, debug=False): 191 | super(GBDTRegressor, self).__init__(n_estimators=n_estimators, 192 | learning_rate=learning_rate, 193 | min_samples_split=min_samples_split, 194 | min_impurity=min_var_red, 195 | max_depth=max_depth, 196 | regression=True) 197 | 198 | 199 | 200 | 201 | 202 | 203 | # GBDT分类算法 204 | # 在类中提前定义好训练的参数 205 | class GBDTClassifier(GBDT): 206 | def __init__(self, n_estimators=200, learning_rate=.5, min_samples_split=2, 207 | min_info_gain=1e-7, max_depth=2, debug=False): 208 | super(GBDTClassifier, self).__init__(n_estimators=n_estimators, 209 | learning_rate=learning_rate, 210 | min_samples_split=min_samples_split, 211 | min_impurity=min_info_gain, 212 | max_depth=max_depth, 213 | regression=False) 214 | def fit(self, X, y): 215 | 216 | # 对多分类label进行one_hot编码 217 | y = to_categorical(y) 218 | super(GBDTClassifier, self).fit(X, y) 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | # 分类算法的具体测试实例 230 | 231 | from __future__ import division, print_function 232 | import numpy as np 233 | from sklearn import datasets 234 | import matplotlib.pyplot as plt 235 | 236 | from utils.misc import Plot 237 | 238 | 239 | def main(): 240 | 241 | print ("-- Gradient Boosting Classification --") 242 | 243 | # 利用自带的鸢尾花数据集 244 | data = datasets.load_iris() 245 | X = data.data 246 | y = data.target 247 | 248 | # 划分训练集和测试集,测试集比例40% 249 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) 250 | print(y_train) 251 | 252 | clf = GBDTClassifier() 253 | clf.fit(X_train, y_train) 254 | y_pred = clf.predict(X_test) 255 | 256 | accuracy = accuracy_score(y_test, y_pred) 257 | 258 | print ("Accuracy:", accuracy) 259 | 260 | 261 | Plot().plot_in_2d(X_test, y_pred, 262 | title="Gradient Boosting", 263 | accuracy=accuracy, 264 | legend_labels=data.target_names) 265 | 266 | 267 | 268 | if __name__ == "__main__": 269 | main() 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | # 回归算法的具体测试实例,此处需要导入文件TempLinkoping2016.txt进行测试 281 | # from __future__ import division, print_function 282 | # import numpy as np 283 | import pandas as pd 284 | # import matplotlib.pyplot as plt 285 | # import progressbar 286 | 287 | # from utils import train_test_split, standardize, to_categorical 288 | # from utils import mean_squared_error, accuracy_score, Plot 289 | # from utils.loss_functions import SquareLoss 290 | # from utils.misc import bar_widgets 291 | # from gradient_boosting_decision_tree.gbdt_model import GBDTRegressor 292 | 293 | def main(): 294 | print ("-- Gradient Boosting Regression --") 295 | 296 | # Load temperature data 297 | data = pd.read_csv(r'D:\Git\Machine_Learning_in_Action_for_smallwhite\GBDT\GBDT_python3_code\TempLinkoping2016.txt', sep="\t") 298 | # 旧版pandas的as_matrix()方法在新版本中已被移除,这里改用.values取出底层numpy数组 299 | time = np.atleast_2d(data["time"].values).T 300 | temp = np.atleast_2d(data["temp"].values).T 301 | 302 | X = time.reshape((-1, 1)) # Time. Fraction of the year [0, 1] 303 | X = np.insert(X, 0, values=1, axis=1) # Insert bias term 304 | y = temp[:, 0] # Temperature. 
Reduce to one-dim 305 | 306 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) 307 | 308 | model = GBDTRegressor() 309 | model.fit(X_train, y_train) 310 | y_pred = model.predict(X_test) 311 | 312 | y_pred_line = model.predict(X) 313 | 314 | # Color map 315 | cmap = plt.get_cmap('viridis') 316 | 317 | mse = mean_squared_error(y_test, y_pred) 318 | 319 | print ("Mean Squared Error:", mse) 320 | 321 | # Plot the results 322 | m1 = plt.scatter(366 * X_train[:, 1], y_train, color=cmap(0.9), s=10) 323 | m2 = plt.scatter(366 * X_test[:, 1], y_test, color=cmap(0.5), s=10) 324 | m3 = plt.scatter(366 * X_test[:, 1], y_pred, color='black', s=10) 325 | plt.suptitle("Regression Tree") 326 | plt.title("MSE: %.2f" % mse, fontsize=10) 327 | plt.xlabel('Day') 328 | plt.ylabel('Temperature in Celcius') 329 | plt.legend((m1, m2, m3), ("Training data", "Test data", "Prediction"), loc='lower right') 330 | plt.show() 331 | 332 | 333 | if __name__ == "__main__": 334 | main() 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | ``` -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/TempLinkoping2016.txt: -------------------------------------------------------------------------------- 1 | time temp 2 | 0.00273224 0.1 3 | 0.005464481 -4.5 4 | 0.008196721 -6.3 5 | 0.010928962 -9.6 6 | 0.013661202 -9.9 7 | 0.016393443 -17.1 8 | 0.019125683 -11.6 9 | 0.021857923 -6.2 10 | 0.024590164 -6.4 11 | 0.027322404 -0.5 12 | 0.030054645 0.5 13 | 0.032786885 -2.4 14 | 0.035519126 -7.5 15 | 0.038251366 -16.8 16 | 0.040983607 -16.6 17 | 0.043715847 -14.6 18 | 0.046448087 -9.6 19 | 0.049180328 -5.8 20 | 0.051912568 -8.6 21 | 0.054644809 -9.0 22 | 0.057377049 -9.7 23 | 0.06010929 -6.9 24 | 0.06284153 -3.9 25 | 0.06557377 1.4 26 | 0.068306011 1.9 27 | 0.071038251 4.3 28 | 0.073770492 6.9 29 | 0.076502732 4.3 30 | 0.079234973 5.9 31 | 0.081967213 3.8 32 | 0.084699454 1.5 33 | 0.087431694 0.1 34 | 0.090163934 4.6 35 | 0.092896175 0.8 36 | 0.095628415 -0.5 37 | 0.098360656 -1.0 38 | 0.101092896 4.2 39 | 0.103825137 6.6 40 | 0.106557377 4.8 41 | 0.109289617 4.7 42 | 0.112021858 1.3 43 | 0.114754098 0.9 44 | 0.117486339 -2.8 45 | 0.120218579 -3.3 46 | 0.12295082 -5.3 47 | 0.12568306 -6.8 48 | 0.128415301 -5.1 49 | 0.131147541 -2.6 50 | 0.133879781 -0.5 51 | 0.136612022 -0.5 52 | 0.139344262 0.1 53 | 0.142076503 1.7 54 | 0.144808743 2.4 55 | 0.147540984 -0.9 56 | 0.150273224 -1.3 57 | 0.153005464 -1.4 58 | 0.155737705 -0.1 59 | 0.158469945 -0.7 60 | 0.161202186 -2.6 61 | 0.163934426 -4.1 62 | 0.166666667 -2.7 63 | 0.169398907 0.7 64 | 0.172131148 2.0 65 | 0.174863388 1.7 66 | 0.177595628 0.9 67 | 0.180327869 0.3 68 | 0.183060109 0.9 69 | 0.18579235 1.1 70 | 0.18852459 0.1 71 | 0.191256831 -0.9 72 | 0.193989071 0.2 73 | 0.196721311 0.1 74 | 0.199453552 1.0 75 | 0.202185792 3.4 76 | 0.204918033 5.2 77 | 0.207650273 4.9 78 | 0.210382514 4.9 79 | 0.213114754 2.2 80 | 0.215846995 2.9 81 | 0.218579235 5.3 82 | 0.221311475 3.7 83 | 0.224043716 3.4 84 | 0.226775956 2.1 85 | 0.229508197 1.8 86 | 0.232240437 4.3 87 | 0.234972678 7.0 88 | 0.237704918 7.7 89 | 0.240437158 6.2 90 | 0.243169399 7.5 91 | 0.245901639 4.9 92 | 0.24863388 4.4 93 | 0.25136612 3.8 94 | 0.254098361 6.4 95 | 0.256830601 8.0 96 | 0.259562842 7.9 97 | 0.262295082 8.9 98 | 0.265027322 6.6 99 | 0.267759563 6.5 100 | 0.270491803 5.8 101 | 0.273224044 5.6 102 | 0.275956284 4.7 103 | 0.278688525 5.5 104 | 0.281420765 5.5 105 | 0.284153005 5.8 106 | 0.286885246 5.3 107 | 0.289617486 6.9 108 | 0.292349727 5.9 
109 | 0.295081967 6.1 110 | 0.297814208 6.6 111 | 0.300546448 6.7 112 | 0.303278689 6.5 113 | 0.306010929 7.0 114 | 0.308743169 5.8 115 | 0.31147541 3.0 116 | 0.31420765 2.5 117 | 0.316939891 2.4 118 | 0.319672131 4.3 119 | 0.322404372 2.8 120 | 0.325136612 3.6 121 | 0.327868852 6.8 122 | 0.330601093 9.1 123 | 0.333333333 8.4 124 | 0.336065574 9.3 125 | 0.338797814 13.3 126 | 0.341530055 10.6 127 | 0.344262295 10.5 128 | 0.346994536 11.8 129 | 0.349726776 14.7 130 | 0.352459016 16.2 131 | 0.355191257 16.4 132 | 0.357923497 16.9 133 | 0.360655738 12.3 134 | 0.363387978 10.2 135 | 0.366120219 11.2 136 | 0.368852459 6.1 137 | 0.371584699 6.4 138 | 0.37431694 6.1 139 | 0.37704918 10.4 140 | 0.379781421 10.3 141 | 0.382513661 11.9 142 | 0.385245902 12.9 143 | 0.387978142 12.5 144 | 0.390710383 17.5 145 | 0.393442623 19.9 146 | 0.396174863 19.3 147 | 0.398907104 11.4 148 | 0.401639344 9.7 149 | 0.404371585 10.7 150 | 0.407103825 13.0 151 | 0.409836066 12.4 152 | 0.412568306 16.3 153 | 0.415300546 19.2 154 | 0.418032787 19.2 155 | 0.420765027 19.8 156 | 0.423497268 19.5 157 | 0.426229508 16.6 158 | 0.428961749 13.0 159 | 0.431693989 12.6 160 | 0.43442623 17.6 161 | 0.43715847 13.7 162 | 0.43989071 11.3 163 | 0.442622951 10.2 164 | 0.445355191 10.2 165 | 0.448087432 11.6 166 | 0.450819672 14.2 167 | 0.453551913 14.4 168 | 0.456284153 17.4 169 | 0.459016393 13.1 170 | 0.461748634 17.4 171 | 0.464480874 15.9 172 | 0.467213115 15.9 173 | 0.469945355 15.5 174 | 0.472677596 16.4 175 | 0.475409836 16.7 176 | 0.478142077 18.2 177 | 0.480874317 20.9 178 | 0.483606557 22.2 179 | 0.486338798 19.1 180 | 0.489071038 16.3 181 | 0.491803279 16.6 182 | 0.494535519 15.1 183 | 0.49726776 14.5 184 | 0.5 17.4 185 | 0.50273224 16.5 186 | 0.505464481 13.7 187 | 0.508196721 14.0 188 | 0.510928962 14.2 189 | 0.513661202 15.6 190 | 0.516393443 15.7 191 | 0.519125683 15.6 192 | 0.521857923 16.2 193 | 0.524590164 16.3 194 | 0.527322404 18.3 195 | 0.530054645 16.6 196 | 0.532786885 16.1 197 | 0.535519126 15.9 198 | 0.538251366 16.0 199 | 0.540983607 15.9 200 | 0.543715847 16.0 201 | 0.546448087 15.7 202 | 0.549180328 17.2 203 | 0.551912568 19.9 204 | 0.554644809 21.0 205 | 0.557377049 19.4 206 | 0.56010929 20.4 207 | 0.56284153 23.1 208 | 0.56557377 23.0 209 | 0.568306011 19.9 210 | 0.571038251 17.6 211 | 0.573770492 18.8 212 | 0.576502732 17.8 213 | 0.579234973 18.6 214 | 0.581967213 16.4 215 | 0.584699454 15.2 216 | 0.587431694 15.3 217 | 0.590163934 16.0 218 | 0.592896175 18.0 219 | 0.595628415 17.7 220 | 0.598360656 16.0 221 | 0.601092896 16.4 222 | 0.603825137 16.7 223 | 0.606557377 14.3 224 | 0.609289617 12.2 225 | 0.612021858 10.0 226 | 0.614754098 12.0 227 | 0.617486339 16.2 228 | 0.620218579 15.9 229 | 0.62295082 14.5 230 | 0.62568306 15.3 231 | 0.628415301 13.3 232 | 0.631147541 14.5 233 | 0.633879781 15.5 234 | 0.636612022 15.3 235 | 0.639344262 17.3 236 | 0.642076503 15.3 237 | 0.644808743 16.4 238 | 0.647540984 17.0 239 | 0.650273224 20.2 240 | 0.653005464 22.4 241 | 0.655737705 18.1 242 | 0.658469945 11.6 243 | 0.661202186 14.6 244 | 0.663934426 13.5 245 | 0.666666667 17.9 246 | 0.669398907 16.4 247 | 0.672131148 15.5 248 | 0.674863388 15.9 249 | 0.677595628 14.1 250 | 0.680327869 13.2 251 | 0.683060109 14.5 252 | 0.68579235 19.0 253 | 0.68852459 18.3 254 | 0.691256831 18.8 255 | 0.693989071 16.8 256 | 0.696721311 16.8 257 | 0.699453552 14.3 258 | 0.702185792 18.4 259 | 0.704918033 18.3 260 | 0.707650273 18.4 261 | 0.710382514 14.9 262 | 0.713114754 11.4 263 | 0.715846995 12.6 264 | 0.718579235 14.0 265 | 
0.721311475 14.8 266 | 0.724043716 9.9 267 | 0.726775956 11.4 268 | 0.729508197 12.9 269 | 0.732240437 12.1 270 | 0.734972678 12.8 271 | 0.737704918 13.5 272 | 0.740437158 12.9 273 | 0.743169399 14.0 274 | 0.745901639 14.6 275 | 0.74863388 12.0 276 | 0.75136612 10.5 277 | 0.754098361 9.5 278 | 0.756830601 7.6 279 | 0.759562842 6.4 280 | 0.762295082 7.0 281 | 0.765027322 8.1 282 | 0.767759563 8.1 283 | 0.770491803 7.6 284 | 0.773224044 7.4 285 | 0.775956284 7.2 286 | 0.778688525 7.0 287 | 0.781420765 6.4 288 | 0.784153005 5.8 289 | 0.786885246 5.5 290 | 0.789617486 6.4 291 | 0.792349727 7.3 292 | 0.795081967 7.4 293 | 0.797814208 7.8 294 | 0.800546448 7.9 295 | 0.803278689 6.9 296 | 0.806010929 6.1 297 | 0.808743169 3.7 298 | 0.81147541 5.3 299 | 0.81420765 6.1 300 | 0.816939891 4.3 301 | 0.819672131 3.3 302 | 0.822404372 8.8 303 | 0.825136612 9.8 304 | 0.827868852 6.4 305 | 0.830601093 4.6 306 | 0.833333333 5.2 307 | 0.836065574 5.5 308 | 0.838797814 1.4 309 | 0.841530055 0.5 310 | 0.844262295 -2.6 311 | 0.846994536 2.4 312 | 0.849726776 -0.8 313 | 0.852459016 -3.3 314 | 0.855191257 -2.8 315 | 0.857923497 -3.5 316 | 0.860655738 -2.8 317 | 0.863387978 -2.2 318 | 0.866120219 -0.3 319 | 0.868852459 0.0 320 | 0.871584699 2.3 321 | 0.87431694 4.9 322 | 0.87704918 3.1 323 | 0.879781421 3.6 324 | 0.882513661 5.2 325 | 0.885245902 3.8 326 | 0.887978142 3.2 327 | 0.890710383 7.7 328 | 0.893442623 7.8 329 | 0.896174863 6.9 330 | 0.898907104 2.7 331 | 0.901639344 2.8 332 | 0.904371585 6.6 333 | 0.907103825 1.9 334 | 0.909836066 -1.4 335 | 0.912568306 2.2 336 | 0.915300546 1.9 337 | 0.918032787 -1.3 338 | 0.920765027 -1.6 339 | 0.923497268 -3.2 340 | 0.926229508 -2.7 341 | 0.928961749 3.7 342 | 0.931693989 -3.2 343 | 0.93442623 -0.2 344 | 0.93715847 9.3 345 | 0.93989071 7.1 346 | 0.942622951 3.2 347 | 0.945355191 1.1 348 | 0.948087432 -6.0 349 | 0.950819672 1.7 350 | 0.953551913 -1.3 351 | 0.956284153 -2.2 352 | 0.959016393 -1.2 353 | 0.961748634 1.0 354 | 0.964480874 1.7 355 | 0.967213115 3.7 356 | 0.969945355 4.7 357 | 0.972677596 -0.3 358 | 0.975409836 3.5 359 | 0.978142077 3.4 360 | 0.980874317 3.9 361 | 0.983606557 4.5 362 | 0.986338798 5.3 363 | 0.989071038 2.7 364 | 0.991803279 -0.4 365 | 0.994535519 4.3 366 | 0.99726776 7.0 367 | 1 9.3 -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/readme.md: -------------------------------------------------------------------------------- 1 | 20190710 2 | 3 | v1.0 添加GBDT的辅助函数,为运行GBDT算法代码,需要下载辅助函数文件夹utils到您的python或者anaconda安装目录,例如将utils文件夹下载并拷贝到anaconda为\Anaconda3\Lib\site-packages目录下。 4 | 5 | 添加模型的主程序文件,可以直接在jupyter_notebook中运行 6 | 7 | 添加模型的回归和分类问题代码和测试数据 8 | 9 | 代码参考作者地址:https://github.com/RRdmlearning/Machine-Learning-From-Scratch 10 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__init__.py -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/bools.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/bools.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/data_manipulation.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/data_manipulation.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/data_operation.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/data_operation.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/dates.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/dates.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/enum.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/enum.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/lists.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/lists.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/loss_functions.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/loss_functions.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/math.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/math.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/misc.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/misc.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/objects.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/objects.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/bools.py: -------------------------------------------------------------------------------- 1 | try: 2 | reduce 3 | except NameError: 4 | from functools import reduce 5 | 6 | def xor(*things): 7 | return reduce(lambda x, y: bool(x) ^ bool(y), things) 8 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/data_manipulation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from itertools import combinations_with_replacement 3 | import numpy as np 4 | import math 5 | import sys 6 | 7 | 8 | def shuffle_data(X, y, seed=None): 9 | """ Random shuffle of the samples in X and y """ 10 | if seed: 11 | np.random.seed(seed) 12 | idx = np.arange(X.shape[0]) 13 | np.random.shuffle(idx) 14 | return X[idx], y[idx] 15 | 16 | 17 | def batch_iterator(X, y=None, batch_size=64): 18 | """ Simple batch generator """ 19 | n_samples = X.shape[0] 20 | for i in np.arange(0, n_samples, batch_size): 21 | begin, end = i, min(i+batch_size, n_samples) 22 | if y is not None: 23 | yield X[begin:end], y[begin:end] 24 | else: 25 | yield X[begin:end] 26 | 27 | 28 | def divide_on_feature(X, feature_i, threshold): 29 | """ Divide dataset based on if sample value on feature index is larger than 30 | the given threshold """ 31 | split_func = None 32 | if isinstance(threshold, int) or isinstance(threshold, float): 33 | split_func = lambda sample: sample[feature_i] >= threshold 34 | else: 35 | split_func = lambda sample: sample[feature_i] == threshold 36 | 37 | X_1 = np.array([sample for sample in X if split_func(sample)]) 38 | X_2 = np.array([sample for sample in X if not split_func(sample)]) 39 | 40 | return np.array([X_1, X_2]) 41 | 42 | 43 | def polynomial_features(X, degree): 44 | n_samples, n_features = np.shape(X) 45 | 46 | def index_combinations(): 47 | combs = [combinations_with_replacement(range(n_features), i) for i in range(0, degree + 1)] 48 | flat_combs = [item for sublist in combs for item in sublist] 49 | return flat_combs 50 | 51 | combinations = index_combinations() 52 | n_output_features = len(combinations) 53 | X_new = np.empty((n_samples, n_output_features)) 54 | 55 | for i, index_combs in enumerate(combinations): 56 | X_new[:, i] = np.prod(X[:, index_combs], axis=1) 57 | 58 | return X_new 59 | 60 | 61 | def get_random_subsets(X, y, n_subsets, 
replacements=True): 62 | """ Return random subsets (with replacements) of the data """ 63 | n_samples = np.shape(X)[0] 64 | # Concatenate x and y and do a random shuffle 65 | X_y = np.concatenate((X, y.reshape((1, len(y))).T), axis=1) 66 | np.random.shuffle(X_y) 67 | subsets = [] 68 | 69 | # Uses 50% of training samples without replacements 70 | subsample_size = int(n_samples // 2) 71 | if replacements: 72 | subsample_size = n_samples # 100% with replacements 73 | 74 | for _ in range(n_subsets): 75 | idx = np.random.choice( 76 | range(n_samples), 77 | size=np.shape(range(subsample_size)), 78 | replace=replacements) 79 | X = X_y[idx][:, :-1] 80 | y = X_y[idx][:, -1] 81 | subsets.append([X, y]) 82 | return subsets 83 | 84 | 85 | def normalize(X, axis=-1, order=2): 86 | """ Normalize the dataset X """ 87 | l2 = np.atleast_1d(np.linalg.norm(X, order, axis)) 88 | l2[l2 == 0] = 1 89 | return X / np.expand_dims(l2, axis) 90 | 91 | 92 | def standardize(X): 93 | """ Standardize the dataset X """ 94 | X_std = X 95 | mean = X.mean(axis=0) 96 | std = X.std(axis=0) 97 | for col in range(np.shape(X)[1]): 98 | if std[col]: 99 | X_std[:, col] = (X_std[:, col] - mean[col]) / std[col] 100 | # X_std = (X - X.mean(axis=0)) / X.std(axis=0) 101 | return X_std 102 | 103 | 104 | def train_test_split(X, y, test_size=0.5, shuffle=True, seed=None): 105 | """ Split the data into train and test sets """ 106 | if shuffle: 107 | X, y = shuffle_data(X, y, seed) 108 | # Split the training data from test data in the ratio specified in 109 | # test_size 110 | split_i = len(y) - int(len(y) // (1 / test_size)) 111 | X_train, X_test = X[:split_i], X[split_i:] 112 | y_train, y_test = y[:split_i], y[split_i:] 113 | 114 | return X_train, X_test, y_train, y_test 115 | 116 | 117 | def k_fold_cross_validation_sets(X, y, k, shuffle=True): 118 | """ Split the data into k sets of training / test data """ 119 | if shuffle: 120 | X, y = shuffle_data(X, y) 121 | 122 | n_samples = len(y) 123 | left_overs = {} 124 | n_left_overs = (n_samples % k) 125 | if n_left_overs != 0: 126 | left_overs["X"] = X[-n_left_overs:] 127 | left_overs["y"] = y[-n_left_overs:] 128 | X = X[:-n_left_overs] 129 | y = y[:-n_left_overs] 130 | 131 | X_split = np.split(X, k) 132 | y_split = np.split(y, k) 133 | sets = [] 134 | for i in range(k): 135 | X_test, y_test = X_split[i], y_split[i] 136 | X_train = np.concatenate(X_split[:i] + X_split[i + 1:], axis=0) 137 | y_train = np.concatenate(y_split[:i] + y_split[i + 1:], axis=0) 138 | sets.append([X_train, X_test, y_train, y_test]) 139 | 140 | # Add left over samples to last set as training samples 141 | if n_left_overs != 0: 142 | np.append(sets[-1][0], left_overs["X"], axis=0) 143 | np.append(sets[-1][2], left_overs["y"], axis=0) 144 | 145 | return np.array(sets) 146 | 147 | 148 | def to_categorical(x, n_col=None): 149 | """ One-hot encoding of nominal values """ 150 | if not n_col: 151 | n_col = np.amax(x) + 1 152 | one_hot = np.zeros((x.shape[0], n_col)) 153 | one_hot[np.arange(x.shape[0]), x] = 1 154 | return one_hot 155 | 156 | 157 | def to_nominal(x): 158 | """ Conversion from one-hot encoding to nominal """ 159 | return np.argmax(x, axis=1) 160 | 161 | 162 | def make_diagonal(x): 163 | """ Converts a vector into an diagonal matrix """ 164 | m = np.zeros((len(x), len(x))) 165 | for i in range(len(m[0])): 166 | m[i, i] = x[i] 167 | return m 168 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/data_operation.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import math 4 | import sys 5 | 6 | 7 | def calculate_entropy(y): 8 | """ Calculate the entropy of label array y """ 9 | log2 = lambda x: math.log(x) / math.log(2) 10 | unique_labels = np.unique(y) 11 | entropy = 0 12 | for label in unique_labels: 13 | count = len(y[y == label]) 14 | p = count / len(y) 15 | entropy += -p * log2(p) 16 | return entropy 17 | 18 | 19 | def mean_squared_error(y_true, y_pred): 20 | """ Returns the mean squared error between y_true and y_pred """ 21 | mse = np.mean(np.power(y_true - y_pred, 2)) 22 | return mse 23 | 24 | 25 | def calculate_variance(X): 26 | """ Return the variance of the features in dataset X """ 27 | mean = np.ones(np.shape(X)) * X.mean(0) 28 | n_samples = np.shape(X)[0] 29 | variance = (1 / n_samples) * np.diag((X - mean).T.dot(X - mean)) 30 | 31 | return variance 32 | 33 | 34 | def calculate_std_dev(X): 35 | """ Calculate the standard deviations of the features in dataset X """ 36 | std_dev = np.sqrt(calculate_variance(X)) 37 | return std_dev 38 | 39 | 40 | def euclidean_distance(x1, x2): 41 | """ Calculates the l2 distance between two vectors """ 42 | distance = 0 43 | # Squared distance between each coordinate 44 | for i in range(len(x1)): 45 | distance += pow((x1[i] - x2[i]), 2) 46 | return math.sqrt(distance) 47 | 48 | 49 | def accuracy_score(y_true, y_pred): 50 | """ Compare y_true to y_pred and return the accuracy """ 51 | accuracy = np.sum(y_true == y_pred, axis=0) / len(y_true) 52 | return accuracy 53 | 54 | 55 | def calculate_covariance_matrix(X, Y=None): 56 | """ Calculate the covariance matrix for the dataset X """ 57 | if Y is None: 58 | Y = X 59 | n_samples = np.shape(X)[0] 60 | covariance_matrix = (1 / (n_samples-1)) * (X - X.mean(axis=0)).T.dot(Y - Y.mean(axis=0)) 61 | 62 | return np.array(covariance_matrix, dtype=float) 63 | 64 | 65 | def calculate_correlation_matrix(X, Y=None): 66 | """ Calculate the correlation matrix for the dataset X """ 67 | if Y is None: 68 | Y = X 69 | n_samples = np.shape(X)[0] 70 | covariance = (1 / n_samples) * (X - X.mean(0)).T.dot(Y - Y.mean(0)) 71 | std_dev_X = np.expand_dims(calculate_std_dev(X), 1) 72 | std_dev_y = np.expand_dims(calculate_std_dev(Y), 1) 73 | correlation_matrix = np.divide(covariance, std_dev_X.dot(std_dev_y.T)) 74 | 75 | return np.array(correlation_matrix, dtype=float) 76 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dates.py: -------------------------------------------------------------------------------- 1 | """Useful things to do with dates""" 2 | import datetime 3 | 4 | 5 | def date_from_string(string, format_string=None): 6 | """Runs through a few common string formats for datetimes, 7 | and attempts to coerce them into a datetime. 
Alternatively, 8 | format_string can provide either a single string to attempt 9 | or an iterable of strings to attempt.""" 10 | 11 | if isinstance(format_string, str): 12 | return datetime.datetime.strptime(string, format_string).date() 13 | 14 | elif format_string is None: 15 | format_string = [ 16 | "%Y-%m-%d", 17 | "%m-%d-%Y", 18 | "%m/%d/%Y", 19 | "%d/%m/%Y", 20 | ] 21 | 22 | for format in format_string: 23 | try: 24 | return datetime.datetime.strptime(string, format).date() 25 | except ValueError: 26 | continue 27 | 28 | raise ValueError("Could not produce date from string: {}".format(string)) 29 | 30 | 31 | def to_datetime(plain_date, hours=0, minutes=0, seconds=0, ms=0): 32 | """given a datetime.date, gives back a datetime.datetime""" 33 | # don't mess with datetimes 34 | if isinstance(plain_date, datetime.datetime): 35 | return plain_date 36 | return datetime.datetime( 37 | plain_date.year, 38 | plain_date.month, 39 | plain_date.day, 40 | hours, 41 | minutes, 42 | seconds, 43 | ms, 44 | ) 45 | 46 | 47 | class TimePeriod(object): 48 | 49 | def __init__(self, earliest, latest): 50 | if not isinstance(earliest, datetime.date) and earliest is not None: 51 | raise TypeError("Earliest must be a date or None") 52 | if not isinstance(latest, datetime.date) and latest is not None: 53 | raise TypeError("Latest must be a date or None") 54 | 55 | # convert dates to datetimes, for to have better resolution 56 | if earliest is not None: 57 | earliest = to_datetime(earliest) 58 | if latest is not None: 59 | latest = to_datetime(latest, 23, 59, 59) 60 | 61 | if earliest is not None and latest is not None and earliest >= latest: 62 | raise ValueError("Earliest must be earlier than latest") 63 | 64 | self._earliest = earliest 65 | self._latest = latest 66 | 67 | def __contains__(self, key): 68 | if isinstance(key, datetime.date): 69 | key = to_datetime(key) 70 | 71 | if self._latest is None: 72 | upper_bounded = True 73 | else: 74 | upper_bounded = key <= self._latest 75 | 76 | if self._earliest is None: 77 | lower_bounded = True 78 | else: 79 | lower_bounded = self._earliest <= key 80 | 81 | return upper_bounded and lower_bounded 82 | 83 | elif isinstance(key, TimePeriod): 84 | if self._latest is None: 85 | upper_bounded = True 86 | elif key._latest is None: 87 | upper_bounded = False 88 | else: 89 | upper_bounded = self._latest >= key._latest 90 | 91 | if self._earliest is None: 92 | lower_bounded = True 93 | elif key._earliest is None: 94 | lower_bounded = False 95 | else: 96 | lower_bounded = self._earliest <= key._earliest 97 | 98 | return upper_bounded and lower_bounded 99 | 100 | def contains(self, other): 101 | return other in self 102 | 103 | def overlaps(self, other): 104 | """does another datetime overlap with this one? this is a symmetric 105 | property. 
106 | 107 | TP1 |------------| 108 | -------------------------------------------------> time 109 | TP2 |--------------| 110 | 111 | TP1.overlaps(TP2) == TP2.overlaps(TP1) == True 112 | 113 | args: 114 | other - a TimePeriod 115 | """ 116 | 117 | return self._latest in other or self._earliest in other 118 | 119 | def __eq__(self, other): 120 | return (self._earliest == other._earliest) and (self._latest == other._latest) 121 | 122 | def __hash__(self): 123 | return hash((self._earliest, self._latest)) 124 | 125 | def __repr__(self): 126 | return "<{}: {}-{}>".format( 127 | self.__class__.__name__, 128 | self._earliest, 129 | self._latest, 130 | ) 131 | 132 | @classmethod 133 | def get_containing_period(cls, *periods): 134 | """Given a bunch of TimePeriods, return a TimePeriod that most closely 135 | contains them.""" 136 | 137 | if any(not isinstance(period, TimePeriod) for period in periods): 138 | raise TypeError("periods must all be TimePeriods: {}".format(periods)) 139 | 140 | latest = datetime.datetime.min 141 | earliest = datetime.datetime.max 142 | 143 | for period in periods: 144 | # the best we can do to conain None is None! 145 | if period._latest is None: 146 | latest = None 147 | elif latest is not None and period._latest > latest: 148 | latest = period._latest 149 | 150 | if period._earliest is None: 151 | earliest = None 152 | elif earliest is not None and period._earliest < earliest: 153 | earliest = period._earliest 154 | 155 | return TimePeriod(earliest, latest) 156 | 157 | 158 | class DiscontinuousTimePeriod(object): 159 | """A bunch of TimePeriods""" 160 | 161 | def __init__(self, *periods): 162 | if any(not isinstance(period, TimePeriod) for period in periods): 163 | raise TypeError("periods must all be TimePeriods: {}".format(periods)) 164 | 165 | periods = set(periods) 166 | 167 | no_overlaps_periods = [] 168 | for period in periods: 169 | for other_period in periods: 170 | if id(other_period) == id(period): 171 | continue 172 | 173 | # periods that overlap should be combined 174 | if period.overlaps(other_period): 175 | period = TimePeriod.get_containing_period(period, other_period) 176 | 177 | no_overlaps_periods.append(period) 178 | 179 | no_equals_periods = [] 180 | reference = set(no_overlaps_periods) 181 | for period in no_overlaps_periods: 182 | # clean out duplicated periods 183 | if any(other_period == period and other_period is not period for other_period in reference): 184 | reference.remove(period) 185 | else: 186 | no_equals_periods.append(period) 187 | 188 | no_contains_periods = [] 189 | for period in no_equals_periods: 190 | # don't need to keep periods that are wholly contained 191 | skip = False 192 | for other_period in no_equals_periods: 193 | if id(other_period) == id(period): 194 | continue 195 | 196 | if period in other_period: 197 | skip = True 198 | 199 | if not skip: 200 | no_contains_periods.append(period) 201 | self._periods = no_contains_periods 202 | 203 | def __contains__(self, other): 204 | if isinstance(other, (datetime.date, TimePeriod)): 205 | for period in self._periods: 206 | if other in period: 207 | return True 208 | 209 | 210 | def days_ago(days, give_datetime=True): 211 | delta = datetime.timedelta(days=days) 212 | dt = datetime.datetime.now() - delta 213 | if give_datetime: 214 | return dt 215 | else: 216 | return dt.date() 217 | 218 | 219 | def days_ahead(days, give_datetime=True): 220 | delta = datetime.timedelta(days=days) 221 | dt = datetime.datetime.now() + delta 222 | if give_datetime: 223 | return dt 224 | else: 225 | 
return dt.date() 226 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/decision_tree/__pycache__/decision_tree_model.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/decision_tree/__pycache__/decision_tree_model.cpython-35.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/decision_tree/__pycache__/decision_tree_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/decision_tree/__pycache__/decision_tree_model.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/decision_tree/decision_tree_classifier_example.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | import numpy as np 3 | from sklearn import datasets 4 | import matplotlib.pyplot as plt 5 | import sys 6 | import os 7 | 8 | # Import helper functions 9 | from utils import train_test_split, standardize, accuracy_score 10 | from utils import mean_squared_error, calculate_variance, Plot 11 | from decision_tree.decision_tree_model import ClassificationTree 12 | 13 | def main(): 14 | 15 | print ("-- Classification Tree --") 16 | 17 | data = datasets.load_iris() 18 | X = data.data 19 | y = data.target 20 | 21 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) 22 | 23 | clf = ClassificationTree() 24 | clf.fit(X_train, y_train) 25 | y_pred = clf.predict(X_test) 26 | 27 | accuracy = accuracy_score(y_test, y_pred) 28 | 29 | print ("Accuracy:", accuracy) 30 | 31 | Plot().plot_in_2d(X_test, y_pred, 32 | title="Decision Tree", 33 | accuracy=accuracy, 34 | legend_labels=data.target_names) 35 | 36 | 37 | if __name__ == "__main__": 38 | main() -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/decision_tree/decision_tree_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | import numpy as np 3 | 4 | from utils.data_manipulation import divide_on_feature, train_test_split, standardize 5 | from utils.data_operation import calculate_entropy, accuracy_score, calculate_variance, mean_squared_error 6 | 7 | 8 | class DecisionNode(): 9 | """Class that represents a decision node or leaf in the decision tree 10 | 11 | Parameters: 12 | ----------- 13 | feature_i: int 14 | Feature index which we want to use as the threshold measure. 15 | threshold: float 16 | The value that we will compare feature values at feature_i against to 17 | determine the prediction. 18 | value: float 19 | The class prediction if classification tree, or float value if regression tree. 20 | true_branch: DecisionNode 21 | Next decision node for samples where features value met the threshold. 22 | false_branch: DecisionNode 23 | Next decision node for samples where features value did not meet the threshold. 
24 | """ 25 | 26 | def __init__(self, feature_i=None, threshold=None, 27 | value=None, true_branch=None, false_branch=None): 28 | self.feature_i = feature_i # Index for the feature that is tested 29 | self.threshold = threshold # Threshold value for feature 30 | self.value = value # Value if the node is a leaf in the tree 31 | self.true_branch = true_branch # 'Left' subtree 32 | self.false_branch = false_branch # 'Right' subtree 33 | 34 | 35 | # Super class of RegressionTree and ClassificationTree 36 | class DecisionTree(object): 37 | """Super class of RegressionTree and ClassificationTree. 38 | 39 | Parameters: 40 | ----------- 41 | min_samples_split: int 42 | The minimum number of samples needed to make a split when building a tree. 43 | min_impurity: float 44 | The minimum impurity required to split the tree further. 45 | max_depth: int 46 | The maximum depth of a tree. 47 | loss: function 48 | Loss function that is used for Gradient Boosting models to calculate impurity. 49 | """ 50 | 51 | def __init__(self, min_samples_split=2, min_impurity=1e-7, 52 | max_depth=float("inf"), loss=None): 53 | self.root = None # Root node in dec. tree 54 | # Minimum n of samples to justify split 55 | self.min_samples_split = min_samples_split 56 | # The minimum impurity to justify split 57 | self.min_impurity = min_impurity 58 | # The maximum depth to grow the tree to 59 | self.max_depth = max_depth 60 | # Function to calculate impurity (classif.=>info gain, regr=>variance reduct.) 61 | # 切割树的方法,gini,方差等 62 | self._impurity_calculation = None 63 | # Function to determine prediction of y at leaf 64 | # 树节点取值的方法,分类树:选取出现最多次数的值,回归树:取所有值的平均值 65 | self._leaf_value_calculation = None 66 | # If y is one-hot encoded (multi-dim) or not (one-dim) 67 | self.one_dim = None 68 | # If Gradient Boost 69 | self.loss = loss 70 | 71 | def fit(self, X, y, loss=None): 72 | """ Build decision tree """ 73 | self.one_dim = len(np.shape(y)) == 1 74 | self.root = self._build_tree(X, y) 75 | self.loss = None 76 | 77 | def _build_tree(self, X, y, current_depth=0): 78 | """ Recursive method which builds out the decision tree and splits X and respective y 79 | on the feature of X which (based on impurity) best separates the data""" 80 | largest_impurity = 0 81 | best_criteria = None # Feature index and threshold 82 | best_sets = None # Subsets of the data 83 | 84 | # Check if expansion of y is needed 85 | if len(np.shape(y)) == 1: 86 | y = np.expand_dims(y, axis=1) 87 | 88 | # Add y as last column of X 89 | Xy = np.concatenate((X, y), axis=1) 90 | 91 | n_samples, n_features = np.shape(X) 92 | 93 | if n_samples >= self.min_samples_split and current_depth <= self.max_depth: 94 | # Calculate the impurity for each feature 95 | for feature_i in range(n_features): 96 | # All values of feature_i 97 | feature_values = np.expand_dims(X[:, feature_i], axis=1) 98 | unique_values = np.unique(feature_values) 99 | 100 | # Iterate through all unique values of feature column i and 101 | # calculate the impurity 102 | for threshold in unique_values: 103 | # Divide X and y depending on if the feature value of X at index feature_i 104 | # meets the threshold 105 | Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold) 106 | 107 | if len(Xy1) > 0 and len(Xy2) > 0: 108 | # Select the y-values of the two sets 109 | y1 = Xy1[:, n_features:] 110 | y2 = Xy2[:, n_features:] 111 | 112 | # Calculate impurity 113 | impurity = self._impurity_calculation(y, y1, y2) 114 | 115 | # If this threshold resulted in a higher information gain than previously 116 | # 
recorded save the threshold value and the feature 117 | # index 118 | if impurity > largest_impurity: 119 | largest_impurity = impurity 120 | best_criteria = {"feature_i": feature_i, "threshold": threshold} 121 | best_sets = { 122 | "leftX": Xy1[:, :n_features], # X of left subtree 123 | "lefty": Xy1[:, n_features:], # y of left subtree 124 | "rightX": Xy2[:, :n_features], # X of right subtree 125 | "righty": Xy2[:, n_features:] # y of right subtree 126 | } 127 | 128 | if largest_impurity > self.min_impurity: 129 | # Build subtrees for the right and left branches 130 | true_branch = self._build_tree(best_sets["leftX"], best_sets["lefty"], current_depth + 1) 131 | false_branch = self._build_tree(best_sets["rightX"], best_sets["righty"], current_depth + 1) 132 | return DecisionNode(feature_i=best_criteria["feature_i"], threshold=best_criteria[ 133 | "threshold"], true_branch=true_branch, false_branch=false_branch) 134 | 135 | # We're at leaf => determine value 136 | leaf_value = self._leaf_value_calculation(y) 137 | return DecisionNode(value=leaf_value) 138 | 139 | def predict_value(self, x, tree=None): 140 | """ Do a recursive search down the tree and make a prediction of the data sample by the 141 | value of the leaf that we end up at """ 142 | 143 | if tree is None: 144 | tree = self.root 145 | 146 | # If we have a value (i.e we're at a leaf) => return value as the prediction 147 | if tree.value is not None: 148 | return tree.value 149 | 150 | # Choose the feature that we will test 151 | feature_value = x[tree.feature_i] 152 | 153 | # Determine if we will follow left or right branch 154 | branch = tree.false_branch 155 | if isinstance(feature_value, int) or isinstance(feature_value, float): 156 | if feature_value >= tree.threshold: 157 | branch = tree.true_branch 158 | elif feature_value == tree.threshold: 159 | branch = tree.true_branch 160 | 161 | # Test subtree 162 | return self.predict_value(x, branch) 163 | 164 | def predict(self, X): 165 | """ Classify samples one by one and return the set of labels """ 166 | y_pred = [] 167 | for x in X: 168 | y_pred.append(self.predict_value(x)) 169 | return y_pred 170 | 171 | def print_tree(self, tree=None, indent=" "): 172 | """ Recursively print the decision tree """ 173 | if not tree: 174 | tree = self.root 175 | 176 | # If we're at leaf => print the label 177 | if tree.value is not None: 178 | print(tree.value) 179 | # Go deeper down the tree 180 | else: 181 | # Print test 182 | print("%s:%s? 
" % (tree.feature_i, tree.threshold)) 183 | # Print the true scenario 184 | print("%sT->" % (indent), end="") 185 | self.print_tree(tree.true_branch, indent + indent) 186 | # Print the false scenario 187 | print("%sF->" % (indent), end="") 188 | self.print_tree(tree.false_branch, indent + indent) 189 | 190 | 191 | class ClassificationTree(DecisionTree): 192 | def _calculate_information_gain(self, y, y1, y2): 193 | # Calculate information gain 194 | p = len(y1) / len(y) 195 | entropy = calculate_entropy(y) 196 | info_gain = entropy - p * \ 197 | calculate_entropy(y1) - (1 - p) * \ 198 | calculate_entropy(y2) 199 | # print("info_gain",info_gain) 200 | return info_gain 201 | 202 | def _majority_vote(self, y): 203 | most_common = None 204 | max_count = 0 205 | for label in np.unique(y): 206 | # Count number of occurences of samples with label 207 | count = len(y[y == label]) 208 | if count > max_count: 209 | most_common = label 210 | max_count = count 211 | # print("most_common :",most_common) 212 | return most_common 213 | 214 | def fit(self, X, y): 215 | self._impurity_calculation = self._calculate_information_gain 216 | self._leaf_value_calculation = self._majority_vote 217 | super(ClassificationTree, self).fit(X, y) 218 | 219 | 220 | class RegressionTree(DecisionTree): 221 | def _calculate_variance_reduction(self, y, y1, y2): 222 | var_tot = calculate_variance(y) 223 | var_1 = calculate_variance(y1) 224 | var_2 = calculate_variance(y2) 225 | frac_1 = len(y1) / len(y) 226 | frac_2 = len(y2) / len(y) 227 | 228 | # Calculate the variance reduction 229 | variance_reduction = var_tot - (frac_1 * var_1 + frac_2 * var_2) 230 | 231 | return sum(variance_reduction) 232 | 233 | def _mean_of_y(self, y): 234 | value = np.mean(y, axis=0) 235 | return value if len(value) > 1 else value[0] 236 | 237 | def fit(self, X, y): 238 | self._impurity_calculation = self._calculate_variance_reduction 239 | self._leaf_value_calculation = self._mean_of_y 240 | super(RegressionTree, self).fit(X, y) 241 | 242 | 243 | class XGBoostRegressionTree(DecisionTree): 244 | """ 245 | Regression tree for XGBoost 246 | - Reference - 247 | http://xgboost.readthedocs.io/en/latest/model.html 248 | """ 249 | 250 | def _split(self, y): 251 | """ y contains y_true in left half of the middle column and 252 | y_pred in the right half. 
Split and return the two matrices """ 253 | col = int(np.shape(y)[1] / 2) 254 | y, y_pred = y[:, :col], y[:, col:] 255 | return y, y_pred 256 | 257 | def _gain(self, y, y_pred): 258 | nominator = np.power((y * self.loss.gradient(y, y_pred)).sum(), 2) 259 | denominator = self.loss.hess(y, y_pred).sum() 260 | return 0.5 * (nominator / denominator) 261 | 262 | def _gain_by_taylor(self, y, y1, y2): 263 | # Split 264 | y, y_pred = self._split(y) 265 | y1, y1_pred = self._split(y1) 266 | y2, y2_pred = self._split(y2) 267 | 268 | true_gain = self._gain(y1, y1_pred) 269 | false_gain = self._gain(y2, y2_pred) 270 | gain = self._gain(y, y_pred) 271 | return true_gain + false_gain - gain 272 | 273 | def _approximate_update(self, y): 274 | # y split into y, y_pred 275 | y, y_pred = self._split(y) 276 | # Newton's Method 277 | gradient = np.sum(y * self.loss.gradient(y, y_pred), axis=0) 278 | hessian = np.sum(self.loss.hess(y, y_pred), axis=0) 279 | update_approximation = gradient / hessian 280 | 281 | return update_approximation 282 | 283 | def fit(self, X, y): 284 | self._impurity_calculation = self._gain_by_taylor 285 | self._leaf_value_calculation = self._approximate_update 286 | super(XGBoostRegressionTree, self).fit(X, y) -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/decision_tree/decision_tree_regressor_example.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import pandas as pd 5 | 6 | from utils import train_test_split, standardize, accuracy_score 7 | from utils import mean_squared_error, calculate_variance, Plot 8 | from decision_tree.decision_tree_model import RegressionTree 9 | 10 | def main(): 11 | 12 | print ("-- Regression Tree --") 13 | 14 | # Load temperature data 15 | data = pd.read_csv('../TempLinkoping2016.txt', sep="\t") 16 | 17 | time = np.atleast_2d(data["time"].as_matrix()).T 18 | temp = np.atleast_2d(data["temp"].as_matrix()).T 19 | 20 | X = standardize(time) # Time. Fraction of the year [0, 1] 21 | y = temp[:, 0] # Temperature. Reduce to one-dim 22 | 23 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 24 | 25 | model = RegressionTree() 26 | model.fit(X_train, y_train) 27 | y_pred = model.predict(X_test) 28 | 29 | y_pred_line = model.predict(X) 30 | 31 | # Color map 32 | cmap = plt.get_cmap('viridis') 33 | 34 | mse = mean_squared_error(y_test, y_pred) 35 | 36 | print ("Mean Squared Error:", mse) 37 | 38 | # Plot the results 39 | # Plot the results 40 | m1 = plt.scatter(366 * X_train, y_train, color=cmap(0.9), s=10) 41 | m2 = plt.scatter(366 * X_test, y_test, color=cmap(0.5), s=10) 42 | m3 = plt.scatter(366 * X_test, y_pred, color='black', s=10) 43 | plt.suptitle("Regression Tree") 44 | plt.title("MSE: %.2f" % mse, fontsize=10) 45 | plt.xlabel('Day') 46 | plt.ylabel('Temperature in Celcius') 47 | plt.legend((m1, m2, m3), ("Training data", "Test data", "Prediction"), loc='lower right') 48 | plt.show() 49 | 50 | 51 | if __name__ == "__main__": 52 | main() -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dicts/__init__.py: -------------------------------------------------------------------------------- 1 | """Helper functinos for dealing with dicts. 2 | 3 | Things you always wished you could do more succinctly! 
4 | """ 5 | from .limited_dict import LimitedDict 6 | from .chained_dict import ChainedDict 7 | from .helpers import * 8 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dicts/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/dicts/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dicts/__pycache__/chained_dict.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/dicts/__pycache__/chained_dict.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dicts/__pycache__/helpers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/dicts/__pycache__/helpers.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dicts/__pycache__/limited_dict.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/dicts/__pycache__/limited_dict.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dicts/chained_dict.py: -------------------------------------------------------------------------------- 1 | from collections import MutableMapping 2 | from itertools import chain 3 | 4 | 5 | class ChainedDict(MutableMapping): 6 | 7 | def __init__(self, parent=None, **kwargs): 8 | self.__parent = parent 9 | self.__deleted_keys = set() 10 | self.__data = kwargs 11 | 12 | def __contains__(self, key): 13 | if self.__parent is not None: 14 | return ( 15 | (key in self.__data or key in self.__parent) 16 | and key not in self.__deleted_keys 17 | ) 18 | return key in self.__data 19 | 20 | def __getitem__(self, key): 21 | try: 22 | return self.__data[key] 23 | except KeyError: 24 | if self.__parent is not None and key not in self.__deleted_keys: 25 | return self.__parent[key] 26 | else: 27 | raise 28 | 29 | def __setitem__(self, key, val): 30 | self.__data[key] = val 31 | self.__deleted_keys.discard(key) 32 | 33 | def __delitem__(self, key): 34 | if key in self: 35 | self.__deleted_keys.add(key) 36 | try: 37 | del self.__data[key] 38 | except KeyError: 39 | pass 40 | else: 41 | raise KeyError(key) 42 | 43 | def __repr__(self): 44 | return "{}({})".format(self.__class__.__name__, dict(self.items())) 45 | 46 | def __iter__(self): 47 | return self.keys() 48 | 49 | def __len__(self): 50 | return len(list(self.keys())) 51 | 52 | def iterkeys(self): 53 | yielded = set(self.__deleted_keys) 54 | if self.__parent is None: 55 | iterable = self.__data.keys() 56 | else: 57 | iterable = chain(self.__parent.keys(), self.__data.keys()) 58 | 59 | for key in 
iterable: 60 | if key in yielded: 61 | continue 62 | yield key 63 | yielded.add(key) 64 | 65 | keys = iterkeys 66 | 67 | def iteritems(self): 68 | for key in self.iterkeys(): 69 | yield key, self[key] 70 | 71 | items = iteritems 72 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dicts/helpers.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | 4 | def from_keyed_iterable(iterable, key, filter_func=None): 5 | """Construct a dictionary out of an iterable, using an attribute name as 6 | the key. Optionally provide a filter function, to determine what should be 7 | kept in the dictionary.""" 8 | 9 | generated = {} 10 | 11 | for element in iterable: 12 | try: 13 | k = getattr(element, key) 14 | except AttributeError: 15 | raise RuntimeError("{} does not have the keyed attribute: {}".format( 16 | element, key 17 | )) 18 | 19 | if filter_func is None or filter_func(element): 20 | if k in generated: 21 | generated[k] += [element] 22 | else: 23 | generated[k] = [element] 24 | 25 | return generated 26 | 27 | 28 | def subtract_by_key(dict_a, dict_b): 29 | """given two dicts, a and b, this function returns c = a - b, where 30 | a - b is defined as the key difference between a and b. 31 | 32 | e.g., 33 | {1:None, 2:3, 3:"yellow", 4:True} - {2:4, 1:"green"} = 34 | {3:"yellow", 4:True} 35 | 36 | """ 37 | difference_dict = {} 38 | for key in dict_a: 39 | if key not in dict_b: 40 | difference_dict[key] = dict_a[key] 41 | 42 | return difference_dict 43 | 44 | 45 | def subtract(dict_a, dict_b, strict=False): 46 | """a stricter form of subtract_by_key(), this version will only remove an 47 | entry from dict_a if the key is in dict_b *and* the value at that key 48 | matches""" 49 | if not strict: 50 | return subtract_by_key(dict_a, dict_b) 51 | 52 | difference_dict = {} 53 | for key in dict_a: 54 | if key not in dict_b or dict_b[key] != dict_a[key]: 55 | difference_dict[key] = dict_a[key] 56 | 57 | return difference_dict 58 | 59 | 60 | WinnowedResult = namedtuple("WinnowedResult", ['has', 'has_not']) 61 | def winnow_by_keys(dct, keys=None, filter_func=None): 62 | """separates a dict into has-keys and not-has-keys pairs, using either 63 | a list of keys or a filtering function.""" 64 | has = {} 65 | has_not = {} 66 | 67 | for key in dct: 68 | key_passes_check = False 69 | if keys is not None: 70 | key_passes_check = key in keys 71 | elif filter_func is not None: 72 | key_passes_check = filter_func(key) 73 | 74 | if key_passes_check: 75 | has[key] = dct[key] 76 | else: 77 | has_not[key] = dct[key] 78 | 79 | return WinnowedResult(has, has_not) 80 | 81 | 82 | def intersection(dict_a, dict_b, strict=True): 83 | intersection_dict = {} 84 | 85 | for key in dict_a: 86 | if key in dict_b: 87 | if not strict or dict_a[key] == dict_b[key]: 88 | intersection_dict[key] = dict_a[key] 89 | 90 | return intersection_dict 91 | 92 | 93 | def setdefaults(dct, defaults): 94 | """Given a target dct and a dict of {key:default value} pairs, 95 | calls setdefault for all of those pairs.""" 96 | for key in defaults: 97 | dct.setdefault(key, defaults[key]) 98 | 99 | return dct 100 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dicts/limited_dict.py: -------------------------------------------------------------------------------- 1 | from collections import MutableMapping 2 | 3 | 4 | class LimitedDict(MutableMapping): 5 | def 
__init__(self, args=None, **kwargs): 6 | keys = kwargs.pop('keys', []) 7 | self.__keys = keys 8 | 9 | self.__data = {} 10 | 11 | if args: 12 | kwargs.update((key, val) for key, val in args) 13 | 14 | for key, val in kwargs.items(): 15 | self[key] = val 16 | 17 | def __setitem__(self, key, val): 18 | if key not in self.__keys: 19 | raise KeyError("Illegal key: {}".format(key)) 20 | 21 | self.__data[key] = val 22 | 23 | def __getitem__(self, key): 24 | return self.__data[key] 25 | 26 | def __iter__(self): 27 | return self.__data.__iter__() 28 | 29 | def __delitem__(self, key): 30 | del self.__data[key] 31 | 32 | def __len__(self): 33 | return len(self.__data) 34 | 35 | def __repr__(self): 36 | return "{}({}, {})".format(self.__class__.__name__, self.defined_keys, self.__data) 37 | 38 | @property 39 | def defined_keys(self): 40 | return self.__keys 41 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/enum.py: -------------------------------------------------------------------------------- 1 | """Who hasn't needed a good, old-fashioned enum now and then?""" 2 | 3 | 4 | class _enum(object): 5 | 6 | def __call__(self, enum_name, *args, **kwargs): 7 | if args and kwargs: 8 | raise TypeError("enums can only be made from args XOR kwargs") 9 | 10 | enum_items = {} 11 | 12 | counter = 0 13 | for name, val in kwargs.items(): 14 | if val is None: 15 | val = counter 16 | counter += 1 17 | elif isinstance(val, int): 18 | counter = val + 1 19 | 20 | enum_items[name] = val 21 | 22 | for val, name in enumerate(args, start=counter): 23 | enum_items[name] = val 24 | 25 | return type(enum_name, (Enum,), enum_items) 26 | 27 | def from_iterable(self, iterable): 28 | return self(*iterable) 29 | 30 | def from_dict(self, dct): 31 | return self(**dct) 32 | 33 | def __iter__(self): 34 | for k, v in self.__enum_items.items(): 35 | yield k, v 36 | 37 | def __repr__(self): 38 | return "<{}: {}>".format(self.__class__.__name__, self.__enum_items.values()) 39 | enum = _enum() 40 | 41 | 42 | class EnumItem(object): 43 | 44 | def __init__(self, parent, name, value): 45 | self.__parent = parent 46 | self.__name = name 47 | self.__value = value 48 | 49 | def __repr__(self): 50 | return "<{}: {} [{}]>".format(self.__class__.__name__, self.name, self.value) 51 | 52 | def __eq__(self, other): 53 | if isinstance(other, self.__class__): 54 | if self.parent.is_strict and self.parent != other.parent: 55 | raise ValueError("can't compare EnumItems from different enums") 56 | return self.value == other.value 57 | 58 | return self.value == other 59 | 60 | @property 61 | def value(self): 62 | return self.__value 63 | 64 | @property 65 | def name(self): 66 | return self.__name 67 | 68 | @property 69 | def parent(self): 70 | return self.__parent 71 | 72 | 73 | class _EnumMeta(type): 74 | def __new__(cls, name, bases, attr_dict): 75 | 76 | options = attr_dict.pop('Options', object) 77 | 78 | attr_dict['__strict__'] = getattr(options, "strict_compare", True) 79 | 80 | new_enum = super(_EnumMeta, cls).__new__(cls, name, bases, {}) 81 | 82 | enum_items = {} 83 | 84 | for attr_name, attr_value in attr_dict.items(): 85 | if attr_name.startswith('__'): 86 | super(_EnumMeta, cls).__setattr__(new_enum, attr_name, attr_value) 87 | continue 88 | 89 | if getattr(options, 'force_uppercase', False): 90 | attr_dict.pop(attr_name) 91 | attr_name = attr_name.upper() 92 | 93 | enum_item = EnumItem(new_enum, attr_name, attr_value) 94 | 95 | enum_items[attr_name] = enum_item 96 | super(_EnumMeta, 
cls).__setattr__(new_enum, attr_name, enum_item) 97 | 98 | if getattr(options, "frozen", True): 99 | super(_EnumMeta, cls).__setattr__(new_enum, '__frozen__', True) 100 | else: 101 | super(_EnumMeta, cls).__setattr__(new_enum, '__frozen__', False) 102 | 103 | if getattr(options, "strict", False): 104 | super(_EnumMeta, cls).__setattr__(new_enum, '__strict__', True) 105 | else: 106 | super(_EnumMeta, cls).__setattr__(new_enum, '__strict__', False) 107 | 108 | super(_EnumMeta, cls).__setattr__(new_enum, '__enum_item_map__', enum_items) 109 | 110 | return new_enum 111 | 112 | def __setattr__(cls, name, val): 113 | if getattr(cls, "__frozen__", False): 114 | raise TypeError("can't set attributes on a frozen enum") 115 | 116 | if name in cls.__enum_item_map__: 117 | val = EnumItem(cls, name, val) 118 | cls.__enum_item_map__[name] = val 119 | 120 | super(_EnumMeta, cls).__setattr__(name, val) 121 | 122 | @property 123 | def is_strict(cls): 124 | return getattr(cls, "__strict__", True) 125 | 126 | def get_name_value_map(cls): 127 | e = cls.__enum_item_map__ 128 | return dict((e[i].name, e[i].value) for i in e) 129 | 130 | 131 | class Enum(_EnumMeta("EnumBase", (object, ), {})): 132 | pass 133 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/lists.py: -------------------------------------------------------------------------------- 1 | """List-related functions""" 2 | 3 | 4 | def unlist(list_thing, complain=True): 5 | """transforms [Something] -> Something. By default, raises a ValueError for 6 | any other list values.""" 7 | if complain and len(list_thing) > 1: 8 | raise ValueError("More than one element in {}".format(list_thing)) 9 | elif len(list_thing) == 1: 10 | return list_thing[0] 11 | 12 | if complain: 13 | raise ValueError("Nothing in {}".format(list_thing)) 14 | return None 15 | 16 | 17 | def flatten(iterable): 18 | """Fully flattens an iterable: 19 | In: flatten([1,2,3,4,[5,6,[7,8]]]) 20 | Out: [1,2,3,4,5,6,7,8] 21 | """ 22 | container = iterable.__class__ 23 | 24 | placeholder = [] 25 | for item in iterable: 26 | try: 27 | placeholder.extend(flatten(item)) 28 | except TypeError: 29 | placeholder.append(item) 30 | 31 | return container(placeholder) 32 | 33 | 34 | def flat_map(iterable, func): 35 | """func must take an item and return an interable that contains that 36 | item. 
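e.g. flat_map([1, 2], lambda x: [x, x * 10]) -> [1, 10, 2, 20];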
this is flatmap in the classic mode""" 37 | results = [] 38 | for element in iterable: 39 | result = func(element) 40 | if len(result) > 0: 41 | results.extend(result) 42 | return results 43 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/loss_functions.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | from utils.data_operation import accuracy_score 4 | 5 | class Loss(object): 6 | def loss(self, y_true, y_pred): 7 | return NotImplementedError() 8 | 9 | def gradient(self, y, y_pred): 10 | raise NotImplementedError() 11 | 12 | def acc(self, y, y_pred): 13 | return 0 14 | 15 | class SquareLoss(Loss): 16 | def __init__(self): pass 17 | 18 | def loss(self, y, y_pred): 19 | return 0.5 * np.power((y - y_pred), 2) 20 | 21 | def gradient(self, y, y_pred): 22 | return -(y - y_pred) 23 | 24 | class CrossEntropy(Loss): 25 | def __init__(self): pass 26 | 27 | def loss(self, y, p): 28 | # Avoid division by zero 29 | p = np.clip(p, 1e-15, 1 - 1e-15) 30 | return - y * np.log(p) - (1 - y) * np.log(1 - p) 31 | 32 | def acc(self, y, p): 33 | return accuracy_score(np.argmax(y, axis=1), np.argmax(p, axis=1)) 34 | 35 | def gradient(self, y, p): 36 | # Avoid division by zero 37 | p = np.clip(p, 1e-15, 1 - 1e-15) 38 | return - (y / p) + (1 - y) / (1 - p) 39 | 40 | 41 | class SoftMaxLoss(Loss): 42 | def gradient(self, y, p): 43 | return y - p -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/math.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import operator 3 | 4 | # py3 doesn't include reduce as a builtin 5 | try: 6 | reduce 7 | except NameError: 8 | from functools import reduce 9 | 10 | 11 | def product(sequence, initial=1): 12 | """like the built-in sum, but for multiplication.""" 13 | if not isinstance(sequence, collections.Iterable): 14 | raise TypeError("'{}' object is not iterable".format(type(sequence).__name__)) 15 | 16 | return reduce(operator.mul, sequence, initial) 17 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/misc.py: -------------------------------------------------------------------------------- 1 | import progressbar 2 | from mpl_toolkits.mplot3d import Axes3D 3 | import matplotlib.pyplot as plt 4 | import matplotlib.cm as cmx 5 | import matplotlib.colors as colors 6 | import numpy as np 7 | 8 | from utils.data_operation import calculate_covariance_matrix 9 | from utils.data_operation import calculate_correlation_matrix 10 | from utils.data_manipulation import standardize 11 | 12 | bar_widgets = [ 13 | 'Training: ', progressbar.Percentage(), ' ', progressbar.Bar(marker="-", left="[", right="]"), 14 | ' ', progressbar.ETA() 15 | ] 16 | 17 | class Plot(): 18 | def __init__(self): 19 | self.cmap = plt.get_cmap('viridis') 20 | 21 | def _transform(self, X, dim): 22 | covariance = calculate_covariance_matrix(X) 23 | eigenvalues, eigenvectors = np.linalg.eig(covariance) 24 | # Sort eigenvalues and eigenvector by largest eigenvalues 25 | idx = eigenvalues.argsort()[::-1] 26 | eigenvalues = eigenvalues[idx][:dim] 27 | eigenvectors = np.atleast_1d(eigenvectors[:, idx])[:, :dim] 28 | # Project the data onto principal components 29 | X_transformed = X.dot(eigenvectors) 30 | 31 | return X_transformed 32 | 33 | 34 | def plot_regression(self, lines, title, 
axis_labels=None, mse=None, scatter=None, legend={"type": "lines", "loc": "lower right"}): 35 | 36 | if scatter: 37 | scatter_plots = scatter_labels = [] 38 | for s in scatter: 39 | scatter_plots += [plt.scatter(s["x"], s["y"], color=s["color"], s=s["size"])] 40 | scatter_labels += [s["label"]] 41 | scatter_plots = tuple(scatter_plots) 42 | scatter_labels = tuple(scatter_labels) 43 | 44 | for l in lines: 45 | li = plt.plot(l["x"], l["y"], color=s["color"], linewidth=l["width"], label=l["label"]) 46 | 47 | if mse: 48 | plt.suptitle(title) 49 | plt.title("MSE: %.2f" % mse, fontsize=10) 50 | else: 51 | plt.title(title) 52 | 53 | if axis_labels: 54 | plt.xlabel(axis_labels["x"]) 55 | plt.ylabel(axis_labels["y"]) 56 | 57 | if legend["type"] == "lines": 58 | plt.legend(loc="lower_left") 59 | elif legend["type"] == "scatter" and scatter: 60 | plt.legend(scatter_plots, scatter_labels, loc=legend["loc"]) 61 | 62 | plt.show() 63 | 64 | 65 | 66 | # Plot the dataset X and the corresponding labels y in 2D using PCA. 67 | def plot_in_2d(self, X, y=None, title=None, accuracy=None, legend_labels=None): 68 | X_transformed = self._transform(X, dim=2) 69 | x1 = X_transformed[:, 0] 70 | x2 = X_transformed[:, 1] 71 | class_distr = [] 72 | 73 | y = np.array(y).astype(int) 74 | 75 | colors = [self.cmap(i) for i in np.linspace(0, 1, len(np.unique(y)))] 76 | 77 | # Plot the different class distributions 78 | for i, l in enumerate(np.unique(y)): 79 | _x1 = x1[y == l] 80 | _x2 = x2[y == l] 81 | _y = y[y == l] 82 | class_distr.append(plt.scatter(_x1, _x2, color=colors[i])) 83 | 84 | # Plot legend 85 | if not legend_labels is None: 86 | plt.legend(class_distr, legend_labels, loc=1) 87 | 88 | # Plot title 89 | if title: 90 | if accuracy: 91 | perc = 100 * accuracy 92 | plt.suptitle(title) 93 | plt.title("Accuracy: %.1f%%" % perc, fontsize=10) 94 | else: 95 | plt.title(title) 96 | 97 | # Axis labels 98 | plt.xlabel('Principal Component 1') 99 | plt.ylabel('Principal Component 2') 100 | 101 | plt.show() 102 | 103 | # Plot the dataset X and the corresponding labels y in 3D using PCA. 104 | def plot_in_3d(self, X, y=None): 105 | X_transformed = self._transform(X, dim=3) 106 | x1 = X_transformed[:, 0] 107 | x2 = X_transformed[:, 1] 108 | x3 = X_transformed[:, 2] 109 | fig = plt.figure() 110 | ax = fig.add_subplot(111, projection='3d') 111 | ax.scatter(x1, x2, x3, c=y) 112 | plt.show() 113 | 114 | 115 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/objects.py: -------------------------------------------------------------------------------- 1 | _get_attr_raise_on_attribute_error = "RAISE ON EXCEPTION" 2 | 3 | def get_attr(obj, string_rep, default=_get_attr_raise_on_attribute_error, separator="."): 4 | """ getattr via a chain of attributes like so: 5 | >>> import datetime 6 | >>> some_date = datetime.date.today() 7 | >>> get_attr(some_date, "month.numerator.__doc__") 8 | 'int(x[, base]) -> integer\n\nConvert a string or number to an integer, ... 
9 | """ 10 | attribute_chain = string_rep.split(separator) 11 | 12 | current_obj = obj 13 | 14 | for attr in attribute_chain: 15 | try: 16 | current_obj = getattr(current_obj, attr) 17 | except AttributeError: 18 | if default is _get_attr_raise_on_attribute_error: 19 | raise AttributeError( 20 | "Bad attribute \"{}\" in chain: \"{}\"".format(attr, string_rep) 21 | ) 22 | return default 23 | 24 | return current_obj 25 | 26 | 27 | class ImmutableWrapper(object): 28 | _obj = None 29 | _recursive = None 30 | 31 | def __init__(self, obj, recursive): 32 | self._obj = obj 33 | self._recursive = recursive 34 | 35 | def __setattr__(self, name, val): 36 | if name == "_obj" and self._obj is None: 37 | object.__setattr__(self, name, val) 38 | return 39 | elif name == "_recursive" and self._recursive is None: 40 | object.__setattr__(self, name, val) 41 | return 42 | 43 | raise AttributeError("This object has been marked as immutable; you cannot set its attributes.") 44 | 45 | def __getattr__(self, name): 46 | if self._recursive: 47 | return immutable(getattr(self._obj, name), recursive=self._recursive) 48 | 49 | return getattr(self._obj, name) 50 | 51 | def __repr__(self): 52 | return "".format(self._obj.__class__.__name__, self._obj.__repr__()) 53 | 54 | 55 | def immutable(obj, recursive=True): 56 | """wraps the argument in a pass-through class that disallows all attribute 57 | setting. If the `recursive` flag is true, all attribute accesses will 58 | return an immutable-wrapped version of the "real" attribute.""" 59 | return ImmutableWrapper(obj, recursive) 60 | -------------------------------------------------------------------------------- /GBDT/readme.md: -------------------------------------------------------------------------------- 1 | #### 文档引用说明 2 | 3 | - 本目录下的GBDT文档分享来源于阿里星wepon大神 4 | - 其github地址为https://github.com/wepe 5 | - 知乎也可以搜索到大佬ID进行相关学习 6 | - 该文档结合代码进行理解,将有助于明白GBDT在具体的分类和回归任务中如何将基函数回归树做相关的转换应用 7 | - 该文档后半部分还包含XGBoost算法详述、LightGBM算法简述,可用于进阶学习 8 | - 如对简易代码实现的面向对象中的super函数使用方法理解困难的,建议学习该方法的使用后再理解。建议地址https://blog.csdn.net/qq_26442553/article/details/81775449 9 | -------------------------------------------------------------------------------- /GBDT/【HP20190706】《统计学习方法》第一版例题8.2代码实现.md: -------------------------------------------------------------------------------- 1 | ## 李航统计学习方法8.2例题代码实现 2 | 3 | 写代码一开始没有头绪,但是一般解决问题的办法有三个: 4 | 5 | - **思考代码实现顺序步骤,先写出伪代码** 6 | - 如果想不出实现,不知道代码的使用方法,比如本案例的数组的调用方法,如何查找和使用索引等,那么**就可以尽量的去参考别人的写法,学习和借鉴**,学习借鉴得越多,掌握的代码的具体细节方法就越多,在伪代码拆分一个具体的项目的时候,可用的每一步方案就越多 7 | - **多写,多写,多写**!!!编程没有捷径,多写,实现不了的时候多思考,然后多借鉴,遭遇到具体问题多请教大神 8 | 9 | 10 | 伪代码如下: 11 | 12 | 输入变量: 13 | 14 | 1.x数组,相当于特征 15 | 16 | 2.y数组,相当于目标变量 17 | 18 | 3.分界点,这里的分界点是需要for循环来处理的 19 | 20 | 4.返回的结果,输出各个分界点的最小误差损失,以及对应的所有分界点的最小误差分界点 21 | 22 | 23 | 24 | ```python 25 | import pandas as pd 26 | import numpy as np 27 | import math 28 | 29 | 30 | # 生成数组列表 31 | x = list(range(1,11,1)) 32 | # 打印目标变量 33 | y = [5.56,5.70,5.91,6.40,6.80,7.05,8.9,8.7,9.00,9.05] 34 | 35 | # 定义数据的切分点数组 36 | spliting_points=list(range(1,10,1)) 37 | spliting_points = [i + 0.5 for i in spliting_points] 38 | spliting_points 39 | 40 | # 封装一个函数来实现P149页的内容 41 | # 这里的思路是一定要梳理出来哪些是需要循环调用的变量 42 | # 另外这里如何去通过一列数组的索引去获取另外一列数组的值列表,值得学习 43 | # 最后需要学习的是如何通过for循环开实现∑求和 44 | 45 | def Spliting_list(x_array,y_array,spliting_array): 46 | for s in spliting_array: 47 | # 通过x的索引位置来获取y的列表分组 48 | # math.floor()向下取整 49 | R1 =y_array[:x_array.index(math.floor(s))+1] 50 | R2 =y_array[x_array.index(math.floor(s))+1:] 51 | c1 = round(np.mean(R1),2) 52 | c2 = 
round(np.mean(R2),2) 53 | ms_1 = 0 54 | ms_2 = 0 55 | # 跳出for循环开始计算最小误差 56 | for i in R1: 57 | ms_1 += (i-c1)**2 58 | for j in R2: 59 | ms_2 += (j-c2)**2 60 | ms = round((ms_1 + ms_2),2) 61 | K = print([s,c1,c2,ms]) 62 | return K 63 | 64 | # 初步运行结果,已经实现了P149的全部内容,下一个问题是如何根据所求内容求误差的最小值 65 | 66 | Spliting_list(x,y,spliting_points) 67 | 68 | [1.5, 5.56, 7.5, 15.72] 69 | [2.5, 5.63, 7.73, 12.08] 70 | [3.5, 5.72, 7.99, 8.37] 71 | [4.5, 5.89, 8.25, 5.78] 72 | [5.5, 6.07, 8.54, 3.91] 73 | [6.5, 6.24, 8.91, 1.93] 74 | [7.5, 6.62, 8.92, 8.01] 75 | [8.5, 6.88, 9.02, 11.74] 76 | [9.5, 7.11, 9.05, 15.74] 77 | 78 | # 函数封装的迭代,主要思考如何将该数据中的最小误差所对应的c1\c2\R1\R2存储下来 79 | 80 | def Spliting_list(x_array,y_array,spliting_array): 81 | ms_list = [] 82 | min_lose= np.inf 83 | for s in spliting_array: 84 | # 通过x的索引位置来获取y的列表分组 85 | # math.floor()向下取整 86 | R1 =y_array[:x_array.index(math.floor(s))+1] 87 | R2 =y_array[x_array.index(math.floor(s))+1:] 88 | c1 = round(np.mean(R1),2) 89 | c2 = round(np.mean(R2),2) 90 | ms_1 = 0 91 | ms_2 = 0 92 | # for循环开始计算最小误差 93 | for i in R1: 94 | ms_1 += (i-c1)**2 95 | for j in R2: 96 | ms_2 += (j-c2)**2 97 | ms = round((ms_1 + ms_2),2) 98 | # 如何来存储最佳的C1和C2呢 99 | if ms < min_lose: 100 | # 更新最小误差,这一步特别重要 101 | min_lose = ms 102 | best_c1 = c1 103 | best_c2 = c2 104 | best_R1 = R1 105 | best_R2 = R2 106 | # 对所有所求的ms添加一个列表进行存储 107 | ms_list.append(ms) 108 | # 对所有ms求得最小误差 109 | K = min(ms_list) 110 | return K,best_c1,best_c2,best_R1,best_R2 111 | 112 | # 函数的封装迭代二,思考如何实现R1根据给到的分组求出最小的残差,用于下一阶段的数据拟合 113 | 114 | 115 | def Spliting_list(x_array,y_array,spliting_array): 116 | ms_list = [] 117 | min_lose= np.inf 118 | for s in spliting_array: 119 | # 通过x的索引位置来获取y的列表分组 120 | # math.floor()向下取整 121 | R1 =y_array[:x_array.index(math.floor(s))+1] 122 | R2 =y_array[x_array.index(math.floor(s))+1:] 123 | c1 = round(np.mean(R1),2) 124 | c2 = round(np.mean(R2),2) 125 | ms_1 = 0 126 | ms_2 = 0 127 | # for循环开始计算最小误差 128 | for i in R1: 129 | ms_1 += (i-c1)**2 130 | for j in R2: 131 | ms_2 += (j-c2)**2 132 | ms = round((ms_1 + ms_2),2) 133 | # 如何来存储最佳的C1和C2呢 134 | if ms < min_lose: 135 | # 更新最小误差,这一步特别重要 136 | min_lose = ms 137 | best_c1 = c1 138 | best_c2 = c2 139 | best_R1 = R1 140 | best_R2 = R2 141 | # 计算残差 142 | R1_loss = [m - c1 for m in best_R1] 143 | R2_loss = [n - c2 for n in best_R2] 144 | R_loss = np.hstack((R1_loss,R2_loss)) 145 | # 对所有所求的ms添加一个列表进行存储 146 | ms_list.append(ms) 147 | # 对所有ms求得最小误差 148 | K = min(ms_list) 149 | return K,best_c1,best_c2,R_loss 150 | 151 | TD = Spliting_list(x,y,spliting_points) 152 | 153 | (1.93, 154 | 6.24, 155 | 8.91, 156 | array([-0.68, -0.54, -0.33, 0.16, 0.56, 0.81, -0.01, -0.21, 0.09, 157 | 0.14])) 158 | 159 | 160 | 161 | # 平方误差损失SSE 162 | while SSE > 0.18: 163 | SSE = np.inf 164 | loss_min = Spliting_list(x,y,spliting_points)[0] 165 | R_loss = Spliting_list(x,y,spliting_points)[3] 166 | if loss_min < SSE: 167 | SSE = loss_min 168 | y = R_loss 169 | print(SSE) 170 | 171 | 1.93 172 | 0.79 173 | 0.47 174 | 0.3 175 | 0.23 176 | 0.17 177 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ### 面向机器学习初学者的最全注释版本的机器学习实战的代码 3 | 4 | #### 代码说明 5 | 6 | - 本代码会对每一行机器学习的某个算法的代码进行注释,确保刚接触算法代码的初学者也能够看懂程序到底在运行什么; 7 | 8 | - 本代码基于python3版本,因此和所涉书籍《机器学习实战》的python2代码在细节上存在差异。 9 | #### v5.0 20190722 添加XGBoost算法简易代码实现 10 | 11 | #### v4.0 20190710,添加GBDT算法代码实现,添加Stacking的PPT讲解,便于理解Stacking模型融合的主要流程,并通过实践Kaggle关于泰坦尼克号数据集的模型融合代码,以工程形式展示Stacking的整个过程 
12 | 13 | #### v3.0 20190617 添加XGBoost的泰坦尼克号数据集调包实践,后期将更新XGBoost的理解内容 14 | 15 | #### v2.0 20190529 添加CART回归算法的核心代码 16 | 17 | #### v1.0 20190529 添加AdaBoost算法核心代码 18 | -------------------------------------------------------------------------------- /Stacking/Stacking_learn_beta.md: -------------------------------------------------------------------------------- 1 | 2 | ## kaggle泰坦尼克号机器学习stacking模型融合 3 | 4 | 5 | ```python 6 | import numpy as np 7 | import pandas as pd 8 | import re 9 | import sklearn 10 | import os 11 | # 显示当前路径 12 | os.getcwd() 13 | ``` 14 | 15 | 16 | 17 | 18 | 'D:\\jupyter_notebook' 19 | 20 | 21 | 22 | 23 | ```python 24 | # 导入数据 25 | train_ = pd.read_csv('D:/jupyter_notebook/titanic/train.csv') 26 | test_ = pd.read_csv('D:/jupyter_notebook/titanic/test.csv') 27 | ``` 28 | 29 | 30 | ```python 31 | # 为方便进行数据处理,将训练集和测试集合并进行数据处理 32 | train_['number'] = 1 33 | test_['number'] = 0 34 | datamart = pd.concat([train_, test_], axis=0, join='outer') 35 | ``` 36 | 37 | C:\Users\IBM\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version 38 | of pandas will change to not sort by default. 39 | 40 | To accept the future behavior, pass 'sort=False'. 41 | 42 | To retain the current behavior and silence the warning, pass 'sort=True'. 43 | 44 | after removing the cwd from sys.path. 45 | 46 | 47 | ### 1.根据原始特征进行特征处理,训练集和测试集合并处理 48 | 49 | 50 | ```python 51 | #根据原始特征的观察构建新特征 52 | # 计算名字的长度 53 | datamart['Name_length'] = datamart['Name'].apply(len) 54 | # 将旅客是否住在头等舱二值化 55 | datamart['Has_Cabin'] = datamart["Cabin"].apply(lambda x: 0 if type(x) == float else 1) 56 | # 构建新特征家庭总人数 57 | datamart['FamilySize'] = datamart['SibSp'] + datamart['Parch'] + 1 58 | # 构建新特征是否独居 59 | datamart['IsAlone'] = 0 60 | datamart.loc[datamart['FamilySize'] == 1, 'IsAlone'] = 1 61 | # 查看乘客登船口岸存在缺失值 62 | datamart['Embarked'].isnull().value_counts() 63 | # 对乘客登船口岸进行固定值填充缺失值 64 | datamart['Embarked'] = datamart['Embarked'].fillna('S') 65 | # 对票价进行中位数填充缺失值 66 | datamart['Fare'] = datamart['Fare'].fillna(datamart['Fare'].median()) 67 | # 生成绝对票价分区,qcut是根据分区分位定义,将每一个值划为到具体的分区区间中去,此处定义为四分位值 68 | datamart['CategoricalFare'] = pd.qcut(datamart['Fare'], 4) 69 | # 生成新变量年龄平均值、年龄标准差 70 | age_avg = datamart['Age'].mean() 71 | age_std = datamart['Age'].std() 72 | # 计算年龄是否有缺失值并统计 73 | age_null_count = datamart['Age'].isnull().sum() 74 | # np.random.randint()产生离散均匀分布的整数,size是产生的元素数量,前面分别为最小值和最大值区间 75 | age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count) 76 | # 对年龄用生成的一些新数值进行填充 77 | datamart['Age'][np.isnan(datamart['Age'])] = age_null_random_list 78 | # 转换变量类型为数值类型,便于后期计算 79 | datamart['Age'] = datamart['Age'].astype(int) 80 | # 对年龄生成新的分箱变量中来代替,即将年龄绝对值转换为离散类别 81 | datamart['CategoricalAge'] = pd.cut(datamart['Age'], 5) 82 | 83 | # 定义正则表达式函数导出旅客的Title 84 | def get_title(name): 85 | # re.search()方法扫描整个字符串,并返回第一个成功的匹配。如果匹配失败,则返回None 86 | title_search = re.search('([A-Za-z]+)\.',name) 87 | if title_search: 88 | return title_search.group(1) 89 | return '' 90 | 91 | # 取出姓名中尊称部分 92 | datamart['Title'] = datamart['Name'].apply(get_title) 93 | 94 | # 对姓名的称呼部分做统一 95 | datamart['Title'] = datamart['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major' 96 | , 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare') 97 | datamart['Title'] = datamart['Title'].replace('Mlle', 'Miss') 98 | datamart['Title'] = datamart['Title'].replace('Ms', 'Miss') 99 | datamart['Title'] = datamart['Title'].replace('Mme', 'Mrs') 100 | 
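# The replace() calls above collapse every rare honorific into one of five
# groups (Mr / Miss / Mrs / Master / Rare); datamart['Title'].value_counts()
# is a quick way to confirm no stray title slipped through.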
101 | # 对性别从离散型替换为数值型 102 | datamart['Sex'] = datamart['Sex'].map( {'female': 0, 'male': 1} ).astype(int) 103 | 104 | # 对姓名的称呼部分做数值型变换 105 | title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5} 106 | # 先定义一个字典,然后通过map函数传入字典进行替换 107 | datamart['Title'] = datamart['Title'].map(title_mapping) 108 | # 最后对缺失值替换为0 109 | datamart['Title'] = datamart['Title'].fillna(0) 110 | 111 | # 替换登船口岸 112 | datamart['Embarked'] = datamart['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int) 113 | 114 | # 替换票价的四分位数,该步骤应该有更好的办法做数据处理 115 | # loc函数取出列中某类元素的数据集 116 | datamart.loc[ datamart['Fare'] <= 7.91, 'Fare'] = 0 117 | datamart.loc[(datamart['Fare'] > 7.91) & (datamart['Fare'] <= 14.454), 'Fare'] = 1 118 | datamart.loc[(datamart['Fare'] > 14.454) & (datamart['Fare'] <= 31), 'Fare'] = 2 119 | datamart.loc[ datamart['Fare'] > 31, 'Fare'] = 3 120 | datamart['Fare'] = datamart['Fare'].astype(int) 121 | 122 | # 对年龄进行分段 123 | datamart.loc[ datamart['Age'] <= 16, 'Age'] = 0 124 | datamart.loc[(datamart['Age'] > 16) & (datamart['Age'] <= 32), 'Age'] = 1 125 | datamart.loc[(datamart['Age'] > 32) & (datamart['Age'] <= 48), 'Age'] = 2 126 | datamart.loc[(datamart['Age'] > 48) & (datamart['Age'] <= 64), 'Age'] = 3 127 | datamart.loc[datamart['Age'] > 64, 'Age'] = 4 128 | 129 | 130 | # 特征选择,先对处理过的不需要的特征进行删除,定义一个列表,然后批量删除 131 | drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp'] 132 | datamart = datamart.drop(drop_elements, axis = 1) 133 | datamart = datamart.drop(['CategoricalAge', 'CategoricalFare'], axis = 1) 134 | # test_ = test_.drop(drop_elements, axis = 1) 135 | 136 | datamart.head() 137 | ``` 138 | 139 | C:\Users\IBM\Anaconda3\lib\site-packages\ipykernel_launcher.py:27: SettingWithCopyWarning: 140 | A value is trying to be set on a copy of a slice from a DataFrame 141 | 142 | See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy 143 | 144 | 145 | 146 | 147 | 148 |
|   | Age | Embarked | Fare | Parch | Pclass | Sex | Survived | number | Name_length | Has_Cabin | FamilySize | IsAlone | Title |
|---|-----|----------|------|-------|--------|-----|----------|--------|-------------|-----------|------------|---------|-------|
| 0 | 1 | 0 | 0 | 0 | 3 | 1 | 0.0 | 1 | 23 | 0 | 2 | 0 | 1 |
| 1 | 2 | 1 | 3 | 0 | 1 | 0 | 1.0 | 1 | 51 | 1 | 2 | 0 | 3 |
| 2 | 1 | 0 | 1 | 0 | 3 | 0 | 1.0 | 1 | 22 | 0 | 1 | 1 | 2 |
| 3 | 2 | 0 | 3 | 0 | 1 | 0 | 1.0 | 1 | 44 | 1 | 2 | 0 | 3 |
| 4 | 2 | 0 | 1 | 0 | 3 | 1 | 0.0 | 1 | 24 | 0 | 1 | 1 | 1 |
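The `SettingWithCopyWarning` printed above comes from the chained assignment `datamart['Age'][np.isnan(datamart['Age'])] = age_null_random_list` in the feature-engineering cell. A minimal sketch of the single-step `.loc` form that avoids the warning, assuming it replaces that line (same variables as above):

```python
# One .loc call selects and writes in a single operation, so pandas knows the
# assignment targets datamart itself and raises no SettingWithCopyWarning.
datamart.loc[datamart['Age'].isnull(), 'Age'] = age_null_random_list
```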
### 2.对特征处理后的测试集和训练集分开


```python
# 通过loc方法选取训练集的数据
train_new = datamart.loc[datamart['number'] == 1]
# 对number列进行删除
train_new = train_new.drop(['number'],axis=1)
```


```python
train_new.head()
```

|   | Age | Embarked | Fare | Parch | Pclass | Sex | Survived | Name_length | Has_Cabin | FamilySize | IsAlone | Title |
|---|-----|----------|------|-------|--------|-----|----------|-------------|-----------|------------|---------|-------|
| 0 | 1 | 0 | 0 | 0 | 3 | 1 | 0.0 | 23 | 0 | 2 | 0 | 1 |
| 1 | 2 | 1 | 3 | 0 | 1 | 0 | 1.0 | 51 | 1 | 2 | 0 | 3 |
| 2 | 1 | 0 | 1 | 0 | 3 | 0 | 1.0 | 22 | 0 | 1 | 1 | 2 |
| 3 | 2 | 0 | 3 | 0 | 1 | 0 | 1.0 | 44 | 1 | 2 | 0 | 3 |
| 4 | 2 | 0 | 1 | 0 | 3 | 1 | 0.0 | 24 | 0 | 1 | 1 | 1 |
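A quick sanity check that the indicator-column split recovered the original Kaggle partition (assuming the standard train.csv with 891 rows and the 12 engineered columns shown above):

```python
# Expect 891 rows here; the test half built in the next cell should have 418.
print(train_new.shape)  # (891, 12)
```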
```python
test_new = datamart.loc[datamart['number'] == 0]
drop_columns = ['number','Survived']
test_new = test_new.drop(drop_columns,axis=1)
```


```python
test_new.head()
```

|   | Age | Embarked | Fare | Parch | Pclass | Sex | Name_length | Has_Cabin | FamilySize | IsAlone | Title |
|---|-----|----------|------|-------|--------|-----|-------------|-----------|------------|---------|-------|
| 0 | 2 | 2 | 0 | 0 | 3 | 1 | 16 | 0 | 1 | 1 | 1 |
| 1 | 2 | 0 | 0 | 0 | 3 | 0 | 32 | 0 | 2 | 0 | 3 |
| 2 | 3 | 2 | 1 | 0 | 2 | 1 | 25 | 0 | 1 | 1 | 1 |
| 3 | 1 | 0 | 1 | 0 | 3 | 1 | 16 | 0 | 1 | 1 | 1 |
| 4 | 1 | 0 | 1 | 1 | 3 | 0 | 44 | 0 | 3 | 0 | 3 |
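With both frames cleaned, the remaining step before training the first-level models imported in the next section is to detach the label from the features. A minimal illustrative sketch (the names `y_train`, `x_train` and `x_test` are not from the original notebook):

```python
# Illustrative only: split off the target and convert to NumPy arrays,
# which is the form the sklearn ensemble classifiers below expect.
y_train = train_new['Survived'].ravel()                # label vector
x_train = train_new.drop(['Survived'], axis=1).values  # training features
x_test = test_new.values                               # test features
```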
520 | 521 | 522 | 523 | ### 3.导入机器学习库对数据特征进行探索 524 | 525 | 526 | ```python 527 | import sklearn 528 | import plotly.offline as py 529 | py.init_notebook_mode(connected=True) 530 | import plotly.graph_objs as go 531 | import plotly.tools as tls 532 | 533 | from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier 534 | from sklearn.svm import SVC 535 | from sklearn.model_selection import KFold 536 | ``` 537 | 538 | 539 | 554 | 555 | 556 | 557 | 558 | ```python 559 | 560 | ``` 561 | -------------------------------------------------------------------------------- /Stacking/changelog.md: -------------------------------------------------------------------------------- 1 | 20190621 2 | v1.0 本目录主要用于更新Stacking模型融合集成学习方法,结合kaggle泰坦尼克数据集进行相关机器学习方法搭建,同时会涉及到python对象方法的简单学习 3 | -------------------------------------------------------------------------------- /Stacking/kaggle_titanic_data/gender_submission.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0 3 | 893,1 4 | 894,0 5 | 895,0 6 | 896,1 7 | 897,0 8 | 898,1 9 | 899,0 10 | 900,1 11 | 901,0 12 | 902,0 13 | 903,0 14 | 904,1 15 | 905,0 16 | 906,1 17 | 907,1 18 | 908,0 19 | 909,0 20 | 910,1 21 | 911,1 22 | 912,0 23 | 913,0 24 | 914,1 25 | 915,0 26 | 916,1 27 | 917,0 28 | 918,1 29 | 919,0 30 | 920,0 31 | 921,0 32 | 922,0 33 | 923,0 34 | 924,1 35 | 925,1 36 | 926,0 37 | 927,0 38 | 928,1 39 | 929,1 40 | 930,0 41 | 931,0 42 | 932,0 43 | 933,0 44 | 934,0 45 | 935,1 46 | 936,1 47 | 937,0 48 | 938,0 49 | 939,0 50 | 940,1 51 | 941,1 52 | 942,0 53 | 943,0 54 | 944,1 55 | 945,1 56 | 946,0 57 | 947,0 58 | 948,0 59 | 949,0 60 | 950,0 61 | 951,1 62 | 952,0 63 | 953,0 64 | 954,0 65 | 955,1 66 | 956,0 67 | 957,1 68 | 958,1 69 | 959,0 70 | 960,0 71 | 961,1 72 | 962,1 73 | 963,0 74 | 964,1 75 | 965,0 76 | 966,1 77 | 967,0 78 | 968,0 79 | 969,1 80 | 970,0 81 | 971,1 82 | 972,0 83 | 973,0 84 | 974,0 85 | 975,0 86 | 976,0 87 | 977,0 88 | 978,1 89 | 979,1 90 | 980,1 91 | 981,0 92 | 982,1 93 | 983,0 94 | 984,1 95 | 985,0 96 | 986,0 97 | 987,0 98 | 988,1 99 | 989,0 100 | 990,1 101 | 991,0 102 | 992,1 103 | 993,0 104 | 994,0 105 | 995,0 106 | 996,1 107 | 997,0 108 | 998,0 109 | 999,0 110 | 1000,0 111 | 1001,0 112 | 1002,0 113 | 1003,1 114 | 1004,1 115 | 1005,1 116 | 1006,1 117 | 1007,0 118 | 1008,0 119 | 1009,1 120 | 1010,0 121 | 1011,1 122 | 1012,1 123 | 1013,0 124 | 1014,1 125 | 1015,0 126 | 1016,0 127 | 1017,1 128 | 1018,0 129 | 1019,1 130 | 1020,0 131 | 1021,0 132 | 1022,0 133 | 1023,0 134 | 1024,1 135 | 1025,0 136 | 1026,0 137 | 1027,0 138 | 1028,0 139 | 1029,0 140 | 1030,1 141 | 1031,0 142 | 1032,1 143 | 1033,1 144 | 1034,0 145 | 1035,0 146 | 1036,0 147 | 1037,0 148 | 1038,0 149 | 1039,0 150 | 1040,0 151 | 1041,0 152 | 1042,1 153 | 1043,0 154 | 1044,0 155 | 1045,1 156 | 1046,0 157 | 1047,0 158 | 1048,1 159 | 1049,1 160 | 1050,0 161 | 1051,1 162 | 1052,1 163 | 1053,0 164 | 1054,1 165 | 1055,0 166 | 1056,0 167 | 1057,1 168 | 1058,0 169 | 1059,0 170 | 1060,1 171 | 1061,1 172 | 1062,0 173 | 1063,0 174 | 1064,0 175 | 1065,0 176 | 1066,0 177 | 1067,1 178 | 1068,1 179 | 1069,0 180 | 1070,1 181 | 1071,1 182 | 1072,0 183 | 1073,0 184 | 1074,1 185 | 1075,0 186 | 1076,1 187 | 1077,0 188 | 1078,1 189 | 1079,0 190 | 1080,1 191 | 1081,0 192 | 1082,0 193 | 1083,0 194 | 1084,0 195 | 1085,0 196 | 1086,0 197 | 1087,0 198 | 1088,0 199 | 1089,1 200 | 1090,0 201 | 1091,1 202 | 1092,1 203 | 1093,0 204 | 1094,0 205 | 1095,1 206 | 1096,0 207 | 1097,0 208 | 
1098,1 209 | 1099,0 210 | 1100,1 211 | 1101,0 212 | 1102,0 213 | 1103,0 214 | 1104,0 215 | 1105,1 216 | 1106,1 217 | 1107,0 218 | 1108,1 219 | 1109,0 220 | 1110,1 221 | 1111,0 222 | 1112,1 223 | 1113,0 224 | 1114,1 225 | 1115,0 226 | 1116,1 227 | 1117,1 228 | 1118,0 229 | 1119,1 230 | 1120,0 231 | 1121,0 232 | 1122,0 233 | 1123,1 234 | 1124,0 235 | 1125,0 236 | 1126,0 237 | 1127,0 238 | 1128,0 239 | 1129,0 240 | 1130,1 241 | 1131,1 242 | 1132,1 243 | 1133,1 244 | 1134,0 245 | 1135,0 246 | 1136,0 247 | 1137,0 248 | 1138,1 249 | 1139,0 250 | 1140,1 251 | 1141,1 252 | 1142,1 253 | 1143,0 254 | 1144,0 255 | 1145,0 256 | 1146,0 257 | 1147,0 258 | 1148,0 259 | 1149,0 260 | 1150,1 261 | 1151,0 262 | 1152,0 263 | 1153,0 264 | 1154,1 265 | 1155,1 266 | 1156,0 267 | 1157,0 268 | 1158,0 269 | 1159,0 270 | 1160,1 271 | 1161,0 272 | 1162,0 273 | 1163,0 274 | 1164,1 275 | 1165,1 276 | 1166,0 277 | 1167,1 278 | 1168,0 279 | 1169,0 280 | 1170,0 281 | 1171,0 282 | 1172,1 283 | 1173,0 284 | 1174,1 285 | 1175,1 286 | 1176,1 287 | 1177,0 288 | 1178,0 289 | 1179,0 290 | 1180,0 291 | 1181,0 292 | 1182,0 293 | 1183,1 294 | 1184,0 295 | 1185,0 296 | 1186,0 297 | 1187,0 298 | 1188,1 299 | 1189,0 300 | 1190,0 301 | 1191,0 302 | 1192,0 303 | 1193,0 304 | 1194,0 305 | 1195,0 306 | 1196,1 307 | 1197,1 308 | 1198,0 309 | 1199,0 310 | 1200,0 311 | 1201,1 312 | 1202,0 313 | 1203,0 314 | 1204,0 315 | 1205,1 316 | 1206,1 317 | 1207,1 318 | 1208,0 319 | 1209,0 320 | 1210,0 321 | 1211,0 322 | 1212,0 323 | 1213,0 324 | 1214,0 325 | 1215,0 326 | 1216,1 327 | 1217,0 328 | 1218,1 329 | 1219,0 330 | 1220,0 331 | 1221,0 332 | 1222,1 333 | 1223,0 334 | 1224,0 335 | 1225,1 336 | 1226,0 337 | 1227,0 338 | 1228,0 339 | 1229,0 340 | 1230,0 341 | 1231,0 342 | 1232,0 343 | 1233,0 344 | 1234,0 345 | 1235,1 346 | 1236,0 347 | 1237,1 348 | 1238,0 349 | 1239,1 350 | 1240,0 351 | 1241,1 352 | 1242,1 353 | 1243,0 354 | 1244,0 355 | 1245,0 356 | 1246,1 357 | 1247,0 358 | 1248,1 359 | 1249,0 360 | 1250,0 361 | 1251,1 362 | 1252,0 363 | 1253,1 364 | 1254,1 365 | 1255,0 366 | 1256,1 367 | 1257,1 368 | 1258,0 369 | 1259,1 370 | 1260,1 371 | 1261,0 372 | 1262,0 373 | 1263,1 374 | 1264,0 375 | 1265,0 376 | 1266,1 377 | 1267,1 378 | 1268,1 379 | 1269,0 380 | 1270,0 381 | 1271,0 382 | 1272,0 383 | 1273,0 384 | 1274,1 385 | 1275,1 386 | 1276,0 387 | 1277,1 388 | 1278,0 389 | 1279,0 390 | 1280,0 391 | 1281,0 392 | 1282,0 393 | 1283,1 394 | 1284,0 395 | 1285,0 396 | 1286,0 397 | 1287,1 398 | 1288,0 399 | 1289,1 400 | 1290,0 401 | 1291,0 402 | 1292,1 403 | 1293,0 404 | 1294,1 405 | 1295,0 406 | 1296,0 407 | 1297,0 408 | 1298,0 409 | 1299,0 410 | 1300,1 411 | 1301,1 412 | 1302,1 413 | 1303,1 414 | 1304,1 415 | 1305,0 416 | 1306,1 417 | 1307,0 418 | 1308,0 419 | 1309,0 420 | -------------------------------------------------------------------------------- /Stacking/kaggle_titanic_data/test.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 2 | 892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q 3 | 893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47,1,0,363272,7,,S 4 | 894,2,"Myles, Mr. Thomas Francis",male,62,0,0,240276,9.6875,,Q 5 | 895,3,"Wirz, Mr. Albert",male,27,0,0,315154,8.6625,,S 6 | 896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22,1,1,3101298,12.2875,,S 7 | 897,3,"Svensson, Mr. Johan Cervin",male,14,0,0,7538,9.225,,S 8 | 898,3,"Connolly, Miss. Kate",female,30,0,0,330972,7.6292,,Q 9 | 899,2,"Caldwell, Mr. 
Albert Francis",male,26,1,1,248738,29,,S 10 | 900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18,0,0,2657,7.2292,,C 11 | 901,3,"Davies, Mr. John Samuel",male,21,2,0,A/4 48871,24.15,,S 12 | 902,3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S 13 | 903,1,"Jones, Mr. Charles Cresson",male,46,0,0,694,26,,S 14 | 904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",female,23,1,0,21228,82.2667,B45,S 15 | 905,2,"Howard, Mr. Benjamin",male,63,1,0,24065,26,,S 16 | 906,1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance Toogood)",female,47,1,0,W.E.P. 5734,61.175,E31,S 17 | 907,2,"del Carlo, Mrs. Sebastiano (Argenia Genovesi)",female,24,1,0,SC/PARIS 2167,27.7208,,C 18 | 908,2,"Keane, Mr. Daniel",male,35,0,0,233734,12.35,,Q 19 | 909,3,"Assaf, Mr. Gerios",male,21,0,0,2692,7.225,,C 20 | 910,3,"Ilmakangas, Miss. Ida Livija",female,27,1,0,STON/O2. 3101270,7.925,,S 21 | 911,3,"Assaf Khalil, Mrs. Mariana (Miriam"")""",female,45,0,0,2696,7.225,,C 22 | 912,1,"Rothschild, Mr. Martin",male,55,1,0,PC 17603,59.4,,C 23 | 913,3,"Olsen, Master. Artur Karl",male,9,0,1,C 17368,3.1708,,S 24 | 914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S 25 | 915,1,"Williams, Mr. Richard Norris II",male,21,0,1,PC 17597,61.3792,,C 26 | 916,1,"Ryerson, Mrs. Arthur Larned (Emily Maria Borie)",female,48,1,3,PC 17608,262.375,B57 B59 B63 B66,C 27 | 917,3,"Robins, Mr. Alexander A",male,50,1,0,A/5. 3337,14.5,,S 28 | 918,1,"Ostby, Miss. Helene Ragnhild",female,22,0,1,113509,61.9792,B36,C 29 | 919,3,"Daher, Mr. Shedid",male,22.5,0,0,2698,7.225,,C 30 | 920,1,"Brady, Mr. John Bertram",male,41,0,0,113054,30.5,A21,S 31 | 921,3,"Samaan, Mr. Elias",male,,2,0,2662,21.6792,,C 32 | 922,2,"Louch, Mr. Charles Alexander",male,50,1,0,SC/AH 3085,26,,S 33 | 923,2,"Jefferys, Mr. Clifford Thomas",male,24,2,0,C.A. 31029,31.5,,S 34 | 924,3,"Dean, Mrs. Bertram (Eva Georgetta Light)",female,33,1,2,C.A. 2315,20.575,,S 35 | 925,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,W./C. 6607,23.45,,S 36 | 926,1,"Mock, Mr. Philipp Edmund",male,30,1,0,13236,57.75,C78,C 37 | 927,3,"Katavelas, Mr. Vassilios (Catavelas Vassilios"")""",male,18.5,0,0,2682,7.2292,,C 38 | 928,3,"Roth, Miss. Sarah A",female,,0,0,342712,8.05,,S 39 | 929,3,"Cacic, Miss. Manda",female,21,0,0,315087,8.6625,,S 40 | 930,3,"Sap, Mr. Julius",male,25,0,0,345768,9.5,,S 41 | 931,3,"Hee, Mr. Ling",male,,0,0,1601,56.4958,,S 42 | 932,3,"Karun, Mr. Franz",male,39,0,1,349256,13.4167,,C 43 | 933,1,"Franklin, Mr. Thomas Parham",male,,0,0,113778,26.55,D34,S 44 | 934,3,"Goldsmith, Mr. Nathan",male,41,0,0,SOTON/O.Q. 3101263,7.85,,S 45 | 935,2,"Corbett, Mrs. Walter H (Irene Colvin)",female,30,0,0,237249,13,,S 46 | 936,1,"Kimball, Mrs. Edwin Nelson Jr (Gertrude Parsons)",female,45,1,0,11753,52.5542,D19,S 47 | 937,3,"Peltomaki, Mr. Nikolai Johannes",male,25,0,0,STON/O 2. 3101291,7.925,,S 48 | 938,1,"Chevre, Mr. Paul Romaine",male,45,0,0,PC 17594,29.7,A9,C 49 | 939,3,"Shaughnessy, Mr. Patrick",male,,0,0,370374,7.75,,Q 50 | 940,1,"Bucknell, Mrs. William Robert (Emma Eliza Ward)",female,60,0,0,11813,76.2917,D15,C 51 | 941,3,"Coutts, Mrs. William (Winnie Minnie"" Treanor)""",female,36,0,2,C.A. 37671,15.9,,S 52 | 942,1,"Smith, Mr. Lucien Philip",male,24,1,0,13695,60,C31,S 53 | 943,2,"Pulbaum, Mr. Franz",male,27,0,0,SC/PARIS 2168,15.0333,,C 54 | 944,2,"Hocking, Miss. Ellen Nellie""""",female,20,2,1,29105,23,,S 55 | 945,1,"Fortune, Miss. Ethel Flora",female,28,3,2,19950,263,C23 C25 C27,S 56 | 946,2,"Mangiavacchi, Mr. 
Serafino Emilio",male,,0,0,SC/A.3 2861,15.5792,,C 57 | 947,3,"Rice, Master. Albert",male,10,4,1,382652,29.125,,Q 58 | 948,3,"Cor, Mr. Bartol",male,35,0,0,349230,7.8958,,S 59 | 949,3,"Abelseth, Mr. Olaus Jorgensen",male,25,0,0,348122,7.65,F G63,S 60 | 950,3,"Davison, Mr. Thomas Henry",male,,1,0,386525,16.1,,S 61 | 951,1,"Chaudanson, Miss. Victorine",female,36,0,0,PC 17608,262.375,B61,C 62 | 952,3,"Dika, Mr. Mirko",male,17,0,0,349232,7.8958,,S 63 | 953,2,"McCrae, Mr. Arthur Gordon",male,32,0,0,237216,13.5,,S 64 | 954,3,"Bjorklund, Mr. Ernst Herbert",male,18,0,0,347090,7.75,,S 65 | 955,3,"Bradley, Miss. Bridget Delia",female,22,0,0,334914,7.725,,Q 66 | 956,1,"Ryerson, Master. John Borie",male,13,2,2,PC 17608,262.375,B57 B59 B63 B66,C 67 | 957,2,"Corey, Mrs. Percy C (Mary Phyllis Elizabeth Miller)",female,,0,0,F.C.C. 13534,21,,S 68 | 958,3,"Burns, Miss. Mary Delia",female,18,0,0,330963,7.8792,,Q 69 | 959,1,"Moore, Mr. Clarence Bloomfield",male,47,0,0,113796,42.4,,S 70 | 960,1,"Tucker, Mr. Gilbert Milligan Jr",male,31,0,0,2543,28.5375,C53,C 71 | 961,1,"Fortune, Mrs. Mark (Mary McDougald)",female,60,1,4,19950,263,C23 C25 C27,S 72 | 962,3,"Mulvihill, Miss. Bertha E",female,24,0,0,382653,7.75,,Q 73 | 963,3,"Minkoff, Mr. Lazar",male,21,0,0,349211,7.8958,,S 74 | 964,3,"Nieminen, Miss. Manta Josefina",female,29,0,0,3101297,7.925,,S 75 | 965,1,"Ovies y Rodriguez, Mr. Servando",male,28.5,0,0,PC 17562,27.7208,D43,C 76 | 966,1,"Geiger, Miss. Amalie",female,35,0,0,113503,211.5,C130,C 77 | 967,1,"Keeping, Mr. Edwin",male,32.5,0,0,113503,211.5,C132,C 78 | 968,3,"Miles, Mr. Frank",male,,0,0,359306,8.05,,S 79 | 969,1,"Cornell, Mrs. Robert Clifford (Malvina Helen Lamson)",female,55,2,0,11770,25.7,C101,S 80 | 970,2,"Aldworth, Mr. Charles Augustus",male,30,0,0,248744,13,,S 81 | 971,3,"Doyle, Miss. Elizabeth",female,24,0,0,368702,7.75,,Q 82 | 972,3,"Boulos, Master. Akar",male,6,1,1,2678,15.2458,,C 83 | 973,1,"Straus, Mr. Isidor",male,67,1,0,PC 17483,221.7792,C55 C57,S 84 | 974,1,"Case, Mr. Howard Brown",male,49,0,0,19924,26,,S 85 | 975,3,"Demetri, Mr. Marinko",male,,0,0,349238,7.8958,,S 86 | 976,2,"Lamb, Mr. John Joseph",male,,0,0,240261,10.7083,,Q 87 | 977,3,"Khalil, Mr. Betros",male,,1,0,2660,14.4542,,C 88 | 978,3,"Barry, Miss. Julia",female,27,0,0,330844,7.8792,,Q 89 | 979,3,"Badman, Miss. Emily Louisa",female,18,0,0,A/4 31416,8.05,,S 90 | 980,3,"O'Donoghue, Ms. Bridget",female,,0,0,364856,7.75,,Q 91 | 981,2,"Wells, Master. Ralph Lester",male,2,1,1,29103,23,,S 92 | 982,3,"Dyker, Mrs. Adolf Fredrik (Anna Elisabeth Judith Andersson)",female,22,1,0,347072,13.9,,S 93 | 983,3,"Pedersen, Mr. Olaf",male,,0,0,345498,7.775,,S 94 | 984,1,"Davidson, Mrs. Thornton (Orian Hays)",female,27,1,2,F.C. 12750,52,B71,S 95 | 985,3,"Guest, Mr. Robert",male,,0,0,376563,8.05,,S 96 | 986,1,"Birnbaum, Mr. Jakob",male,25,0,0,13905,26,,C 97 | 987,3,"Tenglin, Mr. Gunnar Isidor",male,25,0,0,350033,7.7958,,S 98 | 988,1,"Cavendish, Mrs. Tyrell William (Julia Florence Siegel)",female,76,1,0,19877,78.85,C46,S 99 | 989,3,"Makinen, Mr. Kalle Edvard",male,29,0,0,STON/O 2. 3101268,7.925,,S 100 | 990,3,"Braf, Miss. Elin Ester Maria",female,20,0,0,347471,7.8542,,S 101 | 991,3,"Nancarrow, Mr. William Henry",male,33,0,0,A./5. 3338,8.05,,S 102 | 992,1,"Stengel, Mrs. Charles Emil Henry (Annie May Morris)",female,43,1,0,11778,55.4417,C116,C 103 | 993,2,"Weisz, Mr. Leopold",male,27,1,0,228414,26,,S 104 | 994,3,"Foley, Mr. William",male,,0,0,365235,7.75,,Q 105 | 995,3,"Johansson Palmquist, Mr. 
Oskar Leander",male,26,0,0,347070,7.775,,S 106 | 996,3,"Thomas, Mrs. Alexander (Thamine Thelma"")""",female,16,1,1,2625,8.5167,,C 107 | 997,3,"Holthen, Mr. Johan Martin",male,28,0,0,C 4001,22.525,,S 108 | 998,3,"Buckley, Mr. Daniel",male,21,0,0,330920,7.8208,,Q 109 | 999,3,"Ryan, Mr. Edward",male,,0,0,383162,7.75,,Q 110 | 1000,3,"Willer, Mr. Aaron (Abi Weller"")""",male,,0,0,3410,8.7125,,S 111 | 1001,2,"Swane, Mr. George",male,18.5,0,0,248734,13,F,S 112 | 1002,2,"Stanton, Mr. Samuel Ward",male,41,0,0,237734,15.0458,,C 113 | 1003,3,"Shine, Miss. Ellen Natalia",female,,0,0,330968,7.7792,,Q 114 | 1004,1,"Evans, Miss. Edith Corse",female,36,0,0,PC 17531,31.6792,A29,C 115 | 1005,3,"Buckley, Miss. Katherine",female,18.5,0,0,329944,7.2833,,Q 116 | 1006,1,"Straus, Mrs. Isidor (Rosalie Ida Blun)",female,63,1,0,PC 17483,221.7792,C55 C57,S 117 | 1007,3,"Chronopoulos, Mr. Demetrios",male,18,1,0,2680,14.4542,,C 118 | 1008,3,"Thomas, Mr. John",male,,0,0,2681,6.4375,,C 119 | 1009,3,"Sandstrom, Miss. Beatrice Irene",female,1,1,1,PP 9549,16.7,G6,S 120 | 1010,1,"Beattie, Mr. Thomson",male,36,0,0,13050,75.2417,C6,C 121 | 1011,2,"Chapman, Mrs. John Henry (Sara Elizabeth Lawry)",female,29,1,0,SC/AH 29037,26,,S 122 | 1012,2,"Watt, Miss. Bertha J",female,12,0,0,C.A. 33595,15.75,,S 123 | 1013,3,"Kiernan, Mr. John",male,,1,0,367227,7.75,,Q 124 | 1014,1,"Schabert, Mrs. Paul (Emma Mock)",female,35,1,0,13236,57.75,C28,C 125 | 1015,3,"Carver, Mr. Alfred John",male,28,0,0,392095,7.25,,S 126 | 1016,3,"Kennedy, Mr. John",male,,0,0,368783,7.75,,Q 127 | 1017,3,"Cribb, Miss. Laura Alice",female,17,0,1,371362,16.1,,S 128 | 1018,3,"Brobeck, Mr. Karl Rudolf",male,22,0,0,350045,7.7958,,S 129 | 1019,3,"McCoy, Miss. Alicia",female,,2,0,367226,23.25,,Q 130 | 1020,2,"Bowenur, Mr. Solomon",male,42,0,0,211535,13,,S 131 | 1021,3,"Petersen, Mr. Marius",male,24,0,0,342441,8.05,,S 132 | 1022,3,"Spinner, Mr. Henry John",male,32,0,0,STON/OQ. 369943,8.05,,S 133 | 1023,1,"Gracie, Col. Archibald IV",male,53,0,0,113780,28.5,C51,C 134 | 1024,3,"Lefebre, Mrs. Frank (Frances)",female,,0,4,4133,25.4667,,S 135 | 1025,3,"Thomas, Mr. Charles P",male,,1,0,2621,6.4375,,C 136 | 1026,3,"Dintcheff, Mr. Valtcho",male,43,0,0,349226,7.8958,,S 137 | 1027,3,"Carlsson, Mr. Carl Robert",male,24,0,0,350409,7.8542,,S 138 | 1028,3,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,,C 139 | 1029,2,"Schmidt, Mr. August",male,26,0,0,248659,13,,S 140 | 1030,3,"Drapkin, Miss. Jennie",female,23,0,0,SOTON/OQ 392083,8.05,,S 141 | 1031,3,"Goodwin, Mr. Charles Frederick",male,40,1,6,CA 2144,46.9,,S 142 | 1032,3,"Goodwin, Miss. Jessie Allis",female,10,5,2,CA 2144,46.9,,S 143 | 1033,1,"Daniels, Miss. Sarah",female,33,0,0,113781,151.55,,S 144 | 1034,1,"Ryerson, Mr. Arthur Larned",male,61,1,3,PC 17608,262.375,B57 B59 B63 B66,C 145 | 1035,2,"Beauchamp, Mr. Henry James",male,28,0,0,244358,26,,S 146 | 1036,1,"Lindeberg-Lind, Mr. Erik Gustaf (Mr Edward Lingrey"")""",male,42,0,0,17475,26.55,,S 147 | 1037,3,"Vander Planke, Mr. Julius",male,31,3,0,345763,18,,S 148 | 1038,1,"Hilliard, Mr. Herbert Henry",male,,0,0,17463,51.8625,E46,S 149 | 1039,3,"Davies, Mr. Evan",male,22,0,0,SC/A4 23568,8.05,,S 150 | 1040,1,"Crafton, Mr. John Bertram",male,,0,0,113791,26.55,,S 151 | 1041,2,"Lahtinen, Rev. William",male,30,1,1,250651,26,,S 152 | 1042,1,"Earnshaw, Mrs. Boulton (Olive Potter)",female,23,0,1,11767,83.1583,C54,C 153 | 1043,3,"Matinoff, Mr. Nicola",male,,0,0,349255,7.8958,,C 154 | 1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S 155 | 1045,3,"Klasen, Mrs. 
(Hulda Kristina Eugenia Lofqvist)",female,36,0,2,350405,12.1833,,S 156 | 1046,3,"Asplund, Master. Filip Oscar",male,13,4,2,347077,31.3875,,S 157 | 1047,3,"Duquemin, Mr. Joseph",male,24,0,0,S.O./P.P. 752,7.55,,S 158 | 1048,1,"Bird, Miss. Ellen",female,29,0,0,PC 17483,221.7792,C97,S 159 | 1049,3,"Lundin, Miss. Olga Elida",female,23,0,0,347469,7.8542,,S 160 | 1050,1,"Borebank, Mr. John James",male,42,0,0,110489,26.55,D22,S 161 | 1051,3,"Peacock, Mrs. Benjamin (Edith Nile)",female,26,0,2,SOTON/O.Q. 3101315,13.775,,S 162 | 1052,3,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q 163 | 1053,3,"Touma, Master. Georges Youssef",male,7,1,1,2650,15.2458,,C 164 | 1054,2,"Wright, Miss. Marion",female,26,0,0,220844,13.5,,S 165 | 1055,3,"Pearce, Mr. Ernest",male,,0,0,343271,7,,S 166 | 1056,2,"Peruschitz, Rev. Joseph Maria",male,41,0,0,237393,13,,S 167 | 1057,3,"Kink-Heilmann, Mrs. Anton (Luise Heilmann)",female,26,1,1,315153,22.025,,S 168 | 1058,1,"Brandeis, Mr. Emil",male,48,0,0,PC 17591,50.4958,B10,C 169 | 1059,3,"Ford, Mr. Edward Watson",male,18,2,2,W./C. 6608,34.375,,S 170 | 1060,1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genevieve Fosdick)",female,,0,0,17770,27.7208,,C 171 | 1061,3,"Hellstrom, Miss. Hilda Maria",female,22,0,0,7548,8.9625,,S 172 | 1062,3,"Lithman, Mr. Simon",male,,0,0,S.O./P.P. 251,7.55,,S 173 | 1063,3,"Zakarian, Mr. Ortin",male,27,0,0,2670,7.225,,C 174 | 1064,3,"Dyker, Mr. Adolf Fredrik",male,23,1,0,347072,13.9,,S 175 | 1065,3,"Torfa, Mr. Assad",male,,0,0,2673,7.2292,,C 176 | 1066,3,"Asplund, Mr. Carl Oscar Vilhelm Gustafsson",male,40,1,5,347077,31.3875,,S 177 | 1067,2,"Brown, Miss. Edith Eileen",female,15,0,2,29750,39,,S 178 | 1068,2,"Sincock, Miss. Maude",female,20,0,0,C.A. 33112,36.75,,S 179 | 1069,1,"Stengel, Mr. Charles Emil Henry",male,54,1,0,11778,55.4417,C116,C 180 | 1070,2,"Becker, Mrs. Allen Oliver (Nellie E Baumgardner)",female,36,0,3,230136,39,F4,S 181 | 1071,1,"Compton, Mrs. Alexander Taylor (Mary Eliza Ingersoll)",female,64,0,2,PC 17756,83.1583,E45,C 182 | 1072,2,"McCrie, Mr. James Matthew",male,30,0,0,233478,13,,S 183 | 1073,1,"Compton, Mr. Alexander Taylor Jr",male,37,1,1,PC 17756,83.1583,E52,C 184 | 1074,1,"Marvin, Mrs. Daniel Warner (Mary Graham Carmichael Farquarson)",female,18,1,0,113773,53.1,D30,S 185 | 1075,3,"Lane, Mr. Patrick",male,,0,0,7935,7.75,,Q 186 | 1076,1,"Douglas, Mrs. Frederick Charles (Mary Helene Baxter)",female,27,1,1,PC 17558,247.5208,B58 B60,C 187 | 1077,2,"Maybery, Mr. Frank Hubert",male,40,0,0,239059,16,,S 188 | 1078,2,"Phillips, Miss. Alice Frances Louisa",female,21,0,1,S.O./P.P. 2,21,,S 189 | 1079,3,"Davies, Mr. Joseph",male,17,2,0,A/4 48873,8.05,,S 190 | 1080,3,"Sage, Miss. Ada",female,,8,2,CA. 2343,69.55,,S 191 | 1081,2,"Veal, Mr. James",male,40,0,0,28221,13,,S 192 | 1082,2,"Angle, Mr. William A",male,34,1,0,226875,26,,S 193 | 1083,1,"Salomon, Mr. Abraham L",male,,0,0,111163,26,,S 194 | 1084,3,"van Billiard, Master. Walter John",male,11.5,1,1,A/5. 851,14.5,,S 195 | 1085,2,"Lingane, Mr. John",male,61,0,0,235509,12.35,,Q 196 | 1086,2,"Drew, Master. Marshall Brines",male,8,0,2,28220,32.5,,S 197 | 1087,3,"Karlsson, Mr. Julius Konrad Eugen",male,33,0,0,347465,7.8542,,S 198 | 1088,1,"Spedden, Master. Robert Douglas",male,6,0,2,16966,134.5,E34,C 199 | 1089,3,"Nilsson, Miss. Berta Olivia",female,18,0,0,347066,7.775,,S 200 | 1090,2,"Baimbrigge, Mr. Charles Robert",male,23,0,0,C.A. 31030,10.5,,S 201 | 1091,3,"Rasmussen, Mrs. (Lena Jacobsen Solvang)",female,,0,0,65305,8.1125,,S 202 | 1092,3,"Murphy, Miss. 
Nora",female,,0,0,36568,15.5,,Q 203 | 1093,3,"Danbom, Master. Gilbert Sigvard Emanuel",male,0.33,0,2,347080,14.4,,S 204 | 1094,1,"Astor, Col. John Jacob",male,47,1,0,PC 17757,227.525,C62 C64,C 205 | 1095,2,"Quick, Miss. Winifred Vera",female,8,1,1,26360,26,,S 206 | 1096,2,"Andrew, Mr. Frank Thomas",male,25,0,0,C.A. 34050,10.5,,S 207 | 1097,1,"Omont, Mr. Alfred Fernand",male,,0,0,F.C. 12998,25.7417,,C 208 | 1098,3,"McGowan, Miss. Katherine",female,35,0,0,9232,7.75,,Q 209 | 1099,2,"Collett, Mr. Sidney C Stuart",male,24,0,0,28034,10.5,,S 210 | 1100,1,"Rosenbaum, Miss. Edith Louise",female,33,0,0,PC 17613,27.7208,A11,C 211 | 1101,3,"Delalic, Mr. Redjo",male,25,0,0,349250,7.8958,,S 212 | 1102,3,"Andersen, Mr. Albert Karvin",male,32,0,0,C 4001,22.525,,S 213 | 1103,3,"Finoli, Mr. Luigi",male,,0,0,SOTON/O.Q. 3101308,7.05,,S 214 | 1104,2,"Deacon, Mr. Percy William",male,17,0,0,S.O.C. 14879,73.5,,S 215 | 1105,2,"Howard, Mrs. Benjamin (Ellen Truelove Arman)",female,60,1,0,24065,26,,S 216 | 1106,3,"Andersson, Miss. Ida Augusta Margareta",female,38,4,2,347091,7.775,,S 217 | 1107,1,"Head, Mr. Christopher",male,42,0,0,113038,42.5,B11,S 218 | 1108,3,"Mahon, Miss. Bridget Delia",female,,0,0,330924,7.8792,,Q 219 | 1109,1,"Wick, Mr. George Dennick",male,57,1,1,36928,164.8667,,S 220 | 1110,1,"Widener, Mrs. George Dunton (Eleanor Elkins)",female,50,1,1,113503,211.5,C80,C 221 | 1111,3,"Thomson, Mr. Alexander Morrison",male,,0,0,32302,8.05,,S 222 | 1112,2,"Duran y More, Miss. Florentina",female,30,1,0,SC/PARIS 2148,13.8583,,C 223 | 1113,3,"Reynolds, Mr. Harold J",male,21,0,0,342684,8.05,,S 224 | 1114,2,"Cook, Mrs. (Selena Rogers)",female,22,0,0,W./C. 14266,10.5,F33,S 225 | 1115,3,"Karlsson, Mr. Einar Gervasius",male,21,0,0,350053,7.7958,,S 226 | 1116,1,"Candee, Mrs. Edward (Helen Churchill Hungerford)",female,53,0,0,PC 17606,27.4458,,C 227 | 1117,3,"Moubarek, Mrs. George (Omine Amenia"" Alexander)""",female,,0,2,2661,15.2458,,C 228 | 1118,3,"Asplund, Mr. Johan Charles",male,23,0,0,350054,7.7958,,S 229 | 1119,3,"McNeill, Miss. Bridget",female,,0,0,370368,7.75,,Q 230 | 1120,3,"Everett, Mr. Thomas James",male,40.5,0,0,C.A. 6212,15.1,,S 231 | 1121,2,"Hocking, Mr. Samuel James Metcalfe",male,36,0,0,242963,13,,S 232 | 1122,2,"Sweet, Mr. George Frederick",male,14,0,0,220845,65,,S 233 | 1123,1,"Willard, Miss. Constance",female,21,0,0,113795,26.55,,S 234 | 1124,3,"Wiklund, Mr. Karl Johan",male,21,1,0,3101266,6.4958,,S 235 | 1125,3,"Linehan, Mr. Michael",male,,0,0,330971,7.8792,,Q 236 | 1126,1,"Cumings, Mr. John Bradley",male,39,1,0,PC 17599,71.2833,C85,C 237 | 1127,3,"Vendel, Mr. Olof Edvin",male,20,0,0,350416,7.8542,,S 238 | 1128,1,"Warren, Mr. Frank Manley",male,64,1,0,110813,75.25,D37,C 239 | 1129,3,"Baccos, Mr. Raffull",male,20,0,0,2679,7.225,,C 240 | 1130,2,"Hiltunen, Miss. Marta",female,18,1,1,250650,13,,S 241 | 1131,1,"Douglas, Mrs. Walter Donald (Mahala Dutton)",female,48,1,0,PC 17761,106.425,C86,C 242 | 1132,1,"Lindstrom, Mrs. Carl Johan (Sigrid Posse)",female,55,0,0,112377,27.7208,,C 243 | 1133,2,"Christy, Mrs. (Alice Frances)",female,45,0,2,237789,30,,S 244 | 1134,1,"Spedden, Mr. Frederic Oakley",male,45,1,1,16966,134.5,E34,C 245 | 1135,3,"Hyman, Mr. Abraham",male,,0,0,3470,7.8875,,S 246 | 1136,3,"Johnston, Master. William Arthur Willie""""",male,,1,2,W./C. 6607,23.45,,S 247 | 1137,1,"Kenyon, Mr. Frederick R",male,41,1,0,17464,51.8625,D21,S 248 | 1138,2,"Karnes, Mrs. J Frank (Claire Bennett)",female,22,0,0,F.C.C. 13534,21,,S 249 | 1139,2,"Drew, Mr. 
James Vivian",male,42,1,1,28220,32.5,,S 250 | 1140,2,"Hold, Mrs. Stephen (Annie Margaret Hill)",female,29,1,0,26707,26,,S 251 | 1141,3,"Khalil, Mrs. Betros (Zahie Maria"" Elias)""",female,,1,0,2660,14.4542,,C 252 | 1142,2,"West, Miss. Barbara J",female,0.92,1,2,C.A. 34651,27.75,,S 253 | 1143,3,"Abrahamsson, Mr. Abraham August Johannes",male,20,0,0,SOTON/O2 3101284,7.925,,S 254 | 1144,1,"Clark, Mr. Walter Miller",male,27,1,0,13508,136.7792,C89,C 255 | 1145,3,"Salander, Mr. Karl Johan",male,24,0,0,7266,9.325,,S 256 | 1146,3,"Wenzel, Mr. Linhart",male,32.5,0,0,345775,9.5,,S 257 | 1147,3,"MacKay, Mr. George William",male,,0,0,C.A. 42795,7.55,,S 258 | 1148,3,"Mahon, Mr. John",male,,0,0,AQ/4 3130,7.75,,Q 259 | 1149,3,"Niklasson, Mr. Samuel",male,28,0,0,363611,8.05,,S 260 | 1150,2,"Bentham, Miss. Lilian W",female,19,0,0,28404,13,,S 261 | 1151,3,"Midtsjo, Mr. Karl Albert",male,21,0,0,345501,7.775,,S 262 | 1152,3,"de Messemaeker, Mr. Guillaume Joseph",male,36.5,1,0,345572,17.4,,S 263 | 1153,3,"Nilsson, Mr. August Ferdinand",male,21,0,0,350410,7.8542,,S 264 | 1154,2,"Wells, Mrs. Arthur Henry (Addie"" Dart Trevaskis)""",female,29,0,2,29103,23,,S 265 | 1155,3,"Klasen, Miss. Gertrud Emilia",female,1,1,1,350405,12.1833,,S 266 | 1156,2,"Portaluppi, Mr. Emilio Ilario Giuseppe",male,30,0,0,C.A. 34644,12.7375,,C 267 | 1157,3,"Lyntakoff, Mr. Stanko",male,,0,0,349235,7.8958,,S 268 | 1158,1,"Chisholm, Mr. Roderick Robert Crispin",male,,0,0,112051,0,,S 269 | 1159,3,"Warren, Mr. Charles William",male,,0,0,C.A. 49867,7.55,,S 270 | 1160,3,"Howard, Miss. May Elizabeth",female,,0,0,A. 2. 39186,8.05,,S 271 | 1161,3,"Pokrnic, Mr. Mate",male,17,0,0,315095,8.6625,,S 272 | 1162,1,"McCaffry, Mr. Thomas Francis",male,46,0,0,13050,75.2417,C6,C 273 | 1163,3,"Fox, Mr. Patrick",male,,0,0,368573,7.75,,Q 274 | 1164,1,"Clark, Mrs. Walter Miller (Virginia McDowell)",female,26,1,0,13508,136.7792,C89,C 275 | 1165,3,"Lennon, Miss. Mary",female,,1,0,370371,15.5,,Q 276 | 1166,3,"Saade, Mr. Jean Nassr",male,,0,0,2676,7.225,,C 277 | 1167,2,"Bryhl, Miss. Dagmar Jenny Ingeborg ",female,20,1,0,236853,26,,S 278 | 1168,2,"Parker, Mr. Clifford Richard",male,28,0,0,SC 14888,10.5,,S 279 | 1169,2,"Faunthorpe, Mr. Harry",male,40,1,0,2926,26,,S 280 | 1170,2,"Ware, Mr. John James",male,30,1,0,CA 31352,21,,S 281 | 1171,2,"Oxenham, Mr. Percy Thomas",male,22,0,0,W./C. 14260,10.5,,S 282 | 1172,3,"Oreskovic, Miss. Jelka",female,23,0,0,315085,8.6625,,S 283 | 1173,3,"Peacock, Master. Alfred Edward",male,0.75,1,1,SOTON/O.Q. 3101315,13.775,,S 284 | 1174,3,"Fleming, Miss. Honora",female,,0,0,364859,7.75,,Q 285 | 1175,3,"Touma, Miss. Maria Youssef",female,9,1,1,2650,15.2458,,C 286 | 1176,3,"Rosblom, Miss. Salli Helena",female,2,1,1,370129,20.2125,,S 287 | 1177,3,"Dennis, Mr. William",male,36,0,0,A/5 21175,7.25,,S 288 | 1178,3,"Franklin, Mr. Charles (Charles Fardon)",male,,0,0,SOTON/O.Q. 3101314,7.25,,S 289 | 1179,1,"Snyder, Mr. John Pillsbury",male,24,1,0,21228,82.2667,B45,S 290 | 1180,3,"Mardirosian, Mr. Sarkis",male,,0,0,2655,7.2292,F E46,C 291 | 1181,3,"Ford, Mr. Arthur",male,,0,0,A/5 1478,8.05,,S 292 | 1182,1,"Rheims, Mr. George Alexander Lucien",male,,0,0,PC 17607,39.6,,S 293 | 1183,3,"Daly, Miss. Margaret Marcella Maggie""""",female,30,0,0,382650,6.95,,Q 294 | 1184,3,"Nasr, Mr. Mustafa",male,,0,0,2652,7.2292,,C 295 | 1185,1,"Dodge, Dr. Washington",male,53,1,1,33638,81.8583,A34,S 296 | 1186,3,"Wittevrongel, Mr. Camille",male,36,0,0,345771,9.5,,S 297 | 1187,3,"Angheloff, Mr. Minko",male,26,0,0,349202,7.8958,,S 298 | 1188,2,"Laroche, Miss. 
Louise",female,1,1,2,SC/Paris 2123,41.5792,,C 299 | 1189,3,"Samaan, Mr. Hanna",male,,2,0,2662,21.6792,,C 300 | 1190,1,"Loring, Mr. Joseph Holland",male,30,0,0,113801,45.5,,S 301 | 1191,3,"Johansson, Mr. Nils",male,29,0,0,347467,7.8542,,S 302 | 1192,3,"Olsson, Mr. Oscar Wilhelm",male,32,0,0,347079,7.775,,S 303 | 1193,2,"Malachard, Mr. Noel",male,,0,0,237735,15.0458,D,C 304 | 1194,2,"Phillips, Mr. Escott Robert",male,43,0,1,S.O./P.P. 2,21,,S 305 | 1195,3,"Pokrnic, Mr. Tome",male,24,0,0,315092,8.6625,,S 306 | 1196,3,"McCarthy, Miss. Catherine Katie""""",female,,0,0,383123,7.75,,Q 307 | 1197,1,"Crosby, Mrs. Edward Gifford (Catherine Elizabeth Halstead)",female,64,1,1,112901,26.55,B26,S 308 | 1198,1,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.55,C22 C26,S 309 | 1199,3,"Aks, Master. Philip Frank",male,0.83,0,1,392091,9.35,,S 310 | 1200,1,"Hays, Mr. Charles Melville",male,55,1,1,12749,93.5,B69,S 311 | 1201,3,"Hansen, Mrs. Claus Peter (Jennie L Howard)",female,45,1,0,350026,14.1083,,S 312 | 1202,3,"Cacic, Mr. Jego Grga",male,18,0,0,315091,8.6625,,S 313 | 1203,3,"Vartanian, Mr. David",male,22,0,0,2658,7.225,,C 314 | 1204,3,"Sadowitz, Mr. Harry",male,,0,0,LP 1588,7.575,,S 315 | 1205,3,"Carr, Miss. Jeannie",female,37,0,0,368364,7.75,,Q 316 | 1206,1,"White, Mrs. John Stuart (Ella Holmes)",female,55,0,0,PC 17760,135.6333,C32,C 317 | 1207,3,"Hagardon, Miss. Kate",female,17,0,0,AQ/3. 30631,7.7333,,Q 318 | 1208,1,"Spencer, Mr. William Augustus",male,57,1,0,PC 17569,146.5208,B78,C 319 | 1209,2,"Rogers, Mr. Reginald Harry",male,19,0,0,28004,10.5,,S 320 | 1210,3,"Jonsson, Mr. Nils Hilding",male,27,0,0,350408,7.8542,,S 321 | 1211,2,"Jefferys, Mr. Ernest Wilfred",male,22,2,0,C.A. 31029,31.5,,S 322 | 1212,3,"Andersson, Mr. Johan Samuel",male,26,0,0,347075,7.775,,S 323 | 1213,3,"Krekorian, Mr. Neshan",male,25,0,0,2654,7.2292,F E57,C 324 | 1214,2,"Nesson, Mr. Israel",male,26,0,0,244368,13,F2,S 325 | 1215,1,"Rowe, Mr. Alfred G",male,33,0,0,113790,26.55,,S 326 | 1216,1,"Kreuchen, Miss. Emilie",female,39,0,0,24160,211.3375,,S 327 | 1217,3,"Assam, Mr. Ali",male,23,0,0,SOTON/O.Q. 3101309,7.05,,S 328 | 1218,2,"Becker, Miss. Ruth Elizabeth",female,12,2,1,230136,39,F4,S 329 | 1219,1,"Rosenshine, Mr. George (Mr George Thorne"")""",male,46,0,0,PC 17585,79.2,,C 330 | 1220,2,"Clarke, Mr. Charles Valentine",male,29,1,0,2003,26,,S 331 | 1221,2,"Enander, Mr. Ingvar",male,21,0,0,236854,13,,S 332 | 1222,2,"Davies, Mrs. John Morgan (Elizabeth Agnes Mary White) ",female,48,0,2,C.A. 33112,36.75,,S 333 | 1223,1,"Dulles, Mr. William Crothers",male,39,0,0,PC 17580,29.7,A18,C 334 | 1224,3,"Thomas, Mr. Tannous",male,,0,0,2684,7.225,,C 335 | 1225,3,"Nakid, Mrs. Said (Waika Mary"" Mowad)""",female,19,1,1,2653,15.7417,,C 336 | 1226,3,"Cor, Mr. Ivan",male,27,0,0,349229,7.8958,,S 337 | 1227,1,"Maguire, Mr. John Edward",male,30,0,0,110469,26,C106,S 338 | 1228,2,"de Brito, Mr. Jose Joaquim",male,32,0,0,244360,13,,S 339 | 1229,3,"Elias, Mr. Joseph",male,39,0,2,2675,7.2292,,C 340 | 1230,2,"Denbury, Mr. Herbert",male,25,0,0,C.A. 31029,31.5,,S 341 | 1231,3,"Betros, Master. Seman",male,,0,0,2622,7.2292,,C 342 | 1232,2,"Fillbrook, Mr. Joseph Charles",male,18,0,0,C.A. 15185,10.5,,S 343 | 1233,3,"Lundstrom, Mr. Thure Edvin",male,32,0,0,350403,7.5792,,S 344 | 1234,3,"Sage, Mr. John George",male,,1,9,CA. 2343,69.55,,S 345 | 1235,1,"Cardeza, Mrs. James Warburton Martinez (Charlotte Wardle Drake)",female,58,0,1,PC 17755,512.3292,B51 B53 B55,C 346 | 1236,3,"van Billiard, Master. James William",male,,1,1,A/5. 
851,14.5,,S 347 | 1237,3,"Abelseth, Miss. Karen Marie",female,16,0,0,348125,7.65,,S 348 | 1238,2,"Botsford, Mr. William Hull",male,26,0,0,237670,13,,S 349 | 1239,3,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38,0,0,2688,7.2292,,C 350 | 1240,2,"Giles, Mr. Ralph",male,24,0,0,248726,13.5,,S 351 | 1241,2,"Walcroft, Miss. Nellie",female,31,0,0,F.C.C. 13528,21,,S 352 | 1242,1,"Greenfield, Mrs. Leo David (Blanche Strouse)",female,45,0,1,PC 17759,63.3583,D10 D12,C 353 | 1243,2,"Stokes, Mr. Philip Joseph",male,25,0,0,F.C.C. 13540,10.5,,S 354 | 1244,2,"Dibden, Mr. William",male,18,0,0,S.O.C. 14879,73.5,,S 355 | 1245,2,"Herman, Mr. Samuel",male,49,1,2,220845,65,,S 356 | 1246,3,"Dean, Miss. Elizabeth Gladys Millvina""""",female,0.17,1,2,C.A. 2315,20.575,,S 357 | 1247,1,"Julian, Mr. Henry Forbes",male,50,0,0,113044,26,E60,S 358 | 1248,1,"Brown, Mrs. John Murray (Caroline Lane Lamson)",female,59,2,0,11769,51.4792,C101,S 359 | 1249,3,"Lockyer, Mr. Edward",male,,0,0,1222,7.8792,,S 360 | 1250,3,"O'Keefe, Mr. Patrick",male,,0,0,368402,7.75,,Q 361 | 1251,3,"Lindell, Mrs. Edvard Bengtsson (Elin Gerda Persson)",female,30,1,0,349910,15.55,,S 362 | 1252,3,"Sage, Master. William Henry",male,14.5,8,2,CA. 2343,69.55,,S 363 | 1253,2,"Mallet, Mrs. Albert (Antoinette Magnin)",female,24,1,1,S.C./PARIS 2079,37.0042,,C 364 | 1254,2,"Ware, Mrs. John James (Florence Louise Long)",female,31,0,0,CA 31352,21,,S 365 | 1255,3,"Strilic, Mr. Ivan",male,27,0,0,315083,8.6625,,S 366 | 1256,1,"Harder, Mrs. George Achilles (Dorothy Annan)",female,25,1,0,11765,55.4417,E50,C 367 | 1257,3,"Sage, Mrs. John (Annie Bullen)",female,,1,9,CA. 2343,69.55,,S 368 | 1258,3,"Caram, Mr. Joseph",male,,1,0,2689,14.4583,,C 369 | 1259,3,"Riihivouri, Miss. Susanna Juhantytar Sanni""""",female,22,0,0,3101295,39.6875,,S 370 | 1260,1,"Gibson, Mrs. Leonard (Pauline C Boeson)",female,45,0,1,112378,59.4,,C 371 | 1261,2,"Pallas y Castello, Mr. Emilio",male,29,0,0,SC/PARIS 2147,13.8583,,C 372 | 1262,2,"Giles, Mr. Edgar",male,21,1,0,28133,11.5,,S 373 | 1263,1,"Wilson, Miss. Helen Alice",female,31,0,0,16966,134.5,E39 E41,C 374 | 1264,1,"Ismay, Mr. Joseph Bruce",male,49,0,0,112058,0,B52 B54 B56,S 375 | 1265,2,"Harbeck, Mr. William H",male,44,0,0,248746,13,,S 376 | 1266,1,"Dodge, Mrs. Washington (Ruth Vidaver)",female,54,1,1,33638,81.8583,A34,S 377 | 1267,1,"Bowen, Miss. Grace Scott",female,45,0,0,PC 17608,262.375,,C 378 | 1268,3,"Kink, Miss. Maria",female,22,2,0,315152,8.6625,,S 379 | 1269,2,"Cotterill, Mr. Henry Harry""""",male,21,0,0,29107,11.5,,S 380 | 1270,1,"Hipkins, Mr. William Edward",male,55,0,0,680,50,C39,S 381 | 1271,3,"Asplund, Master. Carl Edgar",male,5,4,2,347077,31.3875,,S 382 | 1272,3,"O'Connor, Mr. Patrick",male,,0,0,366713,7.75,,Q 383 | 1273,3,"Foley, Mr. Joseph",male,26,0,0,330910,7.8792,,Q 384 | 1274,3,"Risien, Mrs. Samuel (Emma)",female,,0,0,364498,14.5,,S 385 | 1275,3,"McNamee, Mrs. Neal (Eileen O'Leary)",female,19,1,0,376566,16.1,,S 386 | 1276,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S 387 | 1277,2,"Herman, Miss. Kate",female,24,1,2,220845,65,,S 388 | 1278,3,"Aronsson, Mr. Ernst Axel Algot",male,24,0,0,349911,7.775,,S 389 | 1279,2,"Ashby, Mr. John",male,57,0,0,244346,13,,S 390 | 1280,3,"Canavan, Mr. Patrick",male,21,0,0,364858,7.75,,Q 391 | 1281,3,"Palsson, Master. Paul Folke",male,6,3,1,349909,21.075,,S 392 | 1282,1,"Payne, Mr. Vivian Ponsonby",male,23,0,0,12749,93.5,B24,S 393 | 1283,1,"Lines, Mrs. Ernest H (Elizabeth Lindsey James)",female,51,0,1,PC 17592,39.4,D28,S 394 | 1284,3,"Abbott, Master. 
Eugene Joseph",male,13,0,2,C.A. 2673,20.25,,S 395 | 1285,2,"Gilbert, Mr. William",male,47,0,0,C.A. 30769,10.5,,S 396 | 1286,3,"Kink-Heilmann, Mr. Anton",male,29,3,1,315153,22.025,,S 397 | 1287,1,"Smith, Mrs. Lucien Philip (Mary Eloise Hughes)",female,18,1,0,13695,60,C31,S 398 | 1288,3,"Colbert, Mr. Patrick",male,24,0,0,371109,7.25,,Q 399 | 1289,1,"Frolicher-Stehli, Mrs. Maxmillian (Margaretha Emerentia Stehli)",female,48,1,1,13567,79.2,B41,C 400 | 1290,3,"Larsson-Rondberg, Mr. Edvard A",male,22,0,0,347065,7.775,,S 401 | 1291,3,"Conlon, Mr. Thomas Henry",male,31,0,0,21332,7.7333,,Q 402 | 1292,1,"Bonnell, Miss. Caroline",female,30,0,0,36928,164.8667,C7,S 403 | 1293,2,"Gale, Mr. Harry",male,38,1,0,28664,21,,S 404 | 1294,1,"Gibson, Miss. Dorothy Winifred",female,22,0,1,112378,59.4,,C 405 | 1295,1,"Carrau, Mr. Jose Pedro",male,17,0,0,113059,47.1,,S 406 | 1296,1,"Frauenthal, Mr. Isaac Gerald",male,43,1,0,17765,27.7208,D40,C 407 | 1297,2,"Nourney, Mr. Alfred (Baron von Drachstedt"")""",male,20,0,0,SC/PARIS 2166,13.8625,D38,C 408 | 1298,2,"Ware, Mr. William Jeffery",male,23,1,0,28666,10.5,,S 409 | 1299,1,"Widener, Mr. George Dunton",male,50,1,1,113503,211.5,C80,C 410 | 1300,3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q 411 | 1301,3,"Peacock, Miss. Treasteall",female,3,1,1,SOTON/O.Q. 3101315,13.775,,S 412 | 1302,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.75,,Q 413 | 1303,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37,1,0,19928,90,C78,Q 414 | 1304,3,"Henriksson, Miss. Jenny Lovisa",female,28,0,0,347086,7.775,,S 415 | 1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S 416 | 1306,1,"Oliva y Ocana, Dona. Fermina",female,39,0,0,PC 17758,108.9,C105,C 417 | 1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S 418 | 1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S 419 | 1309,3,"Peter, Master. 
Michael J",male,,1,1,2668,22.3583,,C 420 | -------------------------------------------------------------------------------- /Stacking/两层stacking结构理解beta.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/Stacking/两层stacking结构理解beta.pdf -------------------------------------------------------------------------------- /Xgboost/XGBoost_code/XGBoost算法代码简易实现.md: -------------------------------------------------------------------------------- 1 | 2 | #### 1.部分重要np函数实例,便于理解后面的np调用 3 | 当你看不懂部分代码的时候,请结合引用作者的文档 4 | https://www.zhihu.com/people/chen-zhen-64-12/columns 5 | 6 | 7 | ```python 8 | from __future__ import division, print_function 9 | import numpy as np 10 | import pandas as pd 11 | ``` 12 | 13 | np.ones_like测试实例 14 | 15 | 16 | ```python 17 | test = np.array([[1,2,3],[4,5,6]]) 18 | test_one = np.ones_like(test) 19 | print(test_one) 20 | print(test_one.sum()) 21 | ``` 22 | 23 | [[1 1 1] 24 | [1 1 1]] 25 | 6 26 | 27 | 28 | reshape测试实例 29 | 30 | 31 | ```python 32 | K = np.arange(6) 33 | print(K) 34 | T = K.reshape((3,2)) 35 | print(T) 36 | ``` 37 | 38 | [0 1 2 3 4 5] 39 | [[0 1] 40 | [2 3] 41 | [4 5]] 42 | 43 | 44 | shape测试实例 45 | 46 | 47 | ```python 48 | K = np.arange(6) 49 | print(K) 50 | type(np.shape(K)[0]/2) 51 | ``` 52 | 53 | [0 1 2 3 4 5] 54 | 55 | 56 | 57 | 58 | 59 | float 60 | 61 | 62 | 63 | 测试数组相减求和 64 | 65 | 66 | ```python 67 | def gain(y,y_pred): 68 | K = (y - y_pred).sum() 69 | return K 70 | y = np.array([[1,2],[1,2]]) 71 | y_pred = np.array([[2,3],[2,9]]) 72 | gain(y,y_pred) 73 | ``` 74 | 75 | 76 | 77 | 78 | -10 79 | 80 | 81 | 82 | 定义一个三元二次方程理解np.power的作用 83 | 84 | 85 | ```python 86 | def funciton_test(x_1,x_2,x_3): 87 | y = np.power(x_1,2)+3*x_2+x_3 88 | return y 89 | funciton_test(2,1,3) 90 | ``` 91 | 92 | 93 | 94 | 95 | 10 96 | 97 | 98 | 99 | 测试reshape的用途 100 | 101 | 102 | ```python 103 | z = np.array([[1, 2, 3, 4], 104 | [5, 6, 7, 8], 105 | [9, 10, 11, 12], 106 | [13, 14, 15, 16]]) 107 | print('当前的行列数:',z.shape) 108 | A = np.reshape(z,(8,-1)) 109 | print('reshape后的行列数:',A.shape) 110 | print(A) 111 | ``` 112 | 113 | 当前的行列数: (4, 4) 114 | reshape后的行列数: (8, 2) 115 | [[ 1 2] 116 | [ 3 4] 117 | [ 5 6] 118 | [ 7 8] 119 | [ 9 10] 120 | [11 12] 121 | [13 14] 122 | [15 16]] 123 | 124 | 125 | #### 2.XGBoost正式代码部分 126 | 127 | __future__模块,把下一个新版本的特性导入到当前版本,于是我们就可以在当前版本中测试一些新版本的特性,解决python2中运行pytho3兼容性问题 128 | 129 | 如果某个版本中出现了某个新的功能特性,而且这个特性和当前版本中使用的不兼容 130 | 131 | 也就是它在该版本中不是语言标准,那么我如果想要使用的话就需要从future模块导入 132 | 133 | division 表示精确除法 134 | 135 | progressbar显示完成的进度条 136 | 137 | 138 | ```python 139 | # xgboost算法也将决策树算法作为基函数进行使用 140 | # 导入进度条调度函数,方便展示模型训练进度和倒计时 141 | from utils.decision_tree.decision_tree_model import DecisionTree 142 | from utils.misc import bar_widgets 143 | import progressbar 144 | ``` 145 | 146 | 147 | ```python 148 | # 最小二乘损失 1/2(x-x_0)^2,看不懂下面的两个函数就请对这个函数进行一阶导数求导和二阶导数求导,其中x_0是常数项 149 | class LeastSquaresLoss(): 150 | """Least squares loss""" 151 | 152 | # 定义梯度函数(最小二乘的一阶导数),参数包括真实值和预测值 153 | def gradient(self, actual, predicted): 154 | return actual - predicted 155 | 156 | # 定义海塞函数(最小二乘的二阶导数),参数包括真实值和预测值 157 | # np.ones_like返回一个用1填充所有元素的同型数组或者同型矩阵,因为最小二乘损失的二阶导数是 158 | def hess(self, actual, predicted): 159 | return np.ones_like(actual) 160 | ``` 161 | 162 | 163 | ```python 164 | isinstance(LeastSquaresLoss,object) 165 | ``` 166 | 167 | 168 | 169 | 170 | True 171 | 172 | 173 | 174 | 175 | ```python 176 | # 
XGBoost回归树,从父类决策树继承,是决策树的子类 177 | # 特别说明一点,GBDT和XGBoost在分类问题上都是先调用回归树,然后通过sigmoid函数对输出值做概率转换判断分类 178 | # 有些时候,你会看到以一个下划线开头的实例变量名,比如_name,这样的实例变量外部是可以访问的 179 | # 但按照约定俗成的规定,当你看到这样的变量时,意思是,“虽然我可以被访问,但是,请把我视为私有变量,不要随意访问”。 180 | class XGBoostRegressionTree(DecisionTree): 181 | """ 182 | Regression tree for XGBoost 183 | - 参考文档 - 184 | http://xgboost.readthedocs.io/en/latest/model.html 185 | """ 186 | 187 | # y输入是一个矩阵,np.shape计算矩阵的行数和列数,此处取矩阵列数的一半作为划分点 188 | # 此处划分的目的在于将label划分为两部分 189 | def _split(self, y): 190 | """ y contains y_true in left half of the middle column and 191 | y_pred in the right half. Split and return the two matrices """ 192 | col = int(np.shape(y)[1]/2) 193 | y, y_pred = y[:, :col], y[:, col:] 194 | return y, y_pred 195 | 196 | # 定义打分函数增益值,此处忽略正则化参数λ 197 | # 函数计算切分后的数据集的Gain值 198 | # 这个类本身并没有定义loss属性,一阶导数和二阶导数来自实例化时传入的loss对象,在下面的XGBoost类里损失函数被定义为最小二乘损失 199 | def _gain(self, y, y_pred): 200 | 201 | # 假设这里的函数是平方误差,那么梯度就是残差,这里的结果就是对矩阵求元素对应位置相减,然后对所有元素求和,最后求平方 202 | nominator = np.power((self.loss.gradient(y, y_pred)).sum(), 2) 203 | # hess返回一个与y同型、元素全部为1的数组(最小二乘损失的二阶导数恒为1),求和后即样本数 204 | denominator = self.loss.hess(y, y_pred).sum() 205 | return 0.5 * (nominator / denominator) 206 | 207 | # 该函数通过调用_gain()来计算树节点的纯度,并以此来作为树是否分割的标准 208 | # 对输入的三个参数均执行相同的切分操作,切分为两部分 209 | 210 | def _gain_by_taylor(self, y, y1, y2): 211 | # Split 212 | y, y_pred = self._split(y) 213 | y1, y1_pred = self._split(y1) 214 | y2, y2_pred = self._split(y2) 215 | 216 | # 对三个切分好的参数分别计算最终的增益系数 217 | true_gain = self._gain(y1, y1_pred) 218 | false_gain = self._gain(y2, y2_pred) 219 | gain = self._gain(y, y_pred) 220 | return true_gain + false_gain - gain 221 | 222 | 223 | # 此处忽略了正则化参数λ,因此函数名为近似更新 224 | # 将approximate_update()作为估算子节点取值的方法 225 | # xgboost的树切割完成后,每个叶子节点的取值都已经可以计算,这里返回每个叶节点的预测分数 226 | 227 | def _approximate_update(self, y): 228 | # y split into y, y_pred 229 | y, y_pred = self._split(y) 230 | gradient = np.sum(self.loss.gradient(y, y_pred),axis=0) 231 | hessian = np.sum(self.loss.hess(y, y_pred), axis=0) 232 | # 这里特别注意计算梯度的时候,使用的是最小二乘损失,即(真实值-预测值)的平方,这个地方y是真实值,y_pred是预测值 233 | # 由于gradient取的是(真实值-预测值)而不是损失函数严格意义上的一阶导数(预测值-真实值),所以这里计算update_approximation时没有负号,按照XGBoost的公式推导本来是有负号的 234 | update_approximation = gradient / hessian 235 | return update_approximation 236 | 237 | # 将_gain_by_taylor()作为切割树的标准,将_approximate_update()作为估算子节点取值的方法,传递回给DecisionTree,并以此来构建决策树 238 | # 很多人会看不懂下面这个super函数,这里看起来是继承了XGBoostRegressionTree本身,实际上并不是 239 | # super(XGBoostRegressionTree, self)拿到的是父类DecisionTree,这里调用的是父类的fit方法 240 | # XGBoostRegressionTree(DecisionTree)这个类是个子类,大家有没有发现这个子类并没有定义__init__方法 241 | # 那是因为,在单层继承中,子类若不自定义__init__,将直接继承父类的__init__来完成实例化 242 | 243 | # 这里训练完成后返回的决策树的相关参数,也就是模型 244 | def fit(self, X, y): 245 | self._impurity_calculation = self._gain_by_taylor 246 | self._leaf_value_calculation = self._approximate_update 247 | super(XGBoostRegressionTree, self).fit(X, y) 248 | ``` 249 | 250 |
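上面的 `_gain` 和 `_approximate_update` 都令正则化参数λ=0。下面补充一个极简示意(非原仓库实现,`lam` 是为说明而假设的参数名),展示若把被忽略的λ加回来,叶子权重和结构分数的公式会如何变化:

```python
import numpy as np

# 按XGBoost论文,叶子最优权重 w* 与叶子结构分数 score 为:
#   w* = G / (H + λ)(本仓库的gradient取的是(y - y_pred),故无负号)
#   score = G**2 / (2 * (H + λ))
# λ=0 时分别退化为上面 _approximate_update 和 _gain 的实现
def leaf_weight(G, H, lam=1.0):
    return G / (H + lam)

def leaf_score(G, H, lam=1.0):
    return 0.5 * G ** 2 / (H + lam)

# 最小二乘损失下:G = Σ(y - y_pred),H = 二阶导数(恒为1)之和 = 样本数
y = np.array([5.56, 5.70, 5.91]); y_pred = np.zeros(3)
G, H = (y - y_pred).sum(), float(len(y))
print(leaf_weight(G, H, lam=0.0))  # λ=0,对应 _approximate_update 的结果
print(leaf_score(G, H, lam=1.0))   # λ>0 使分数变小,从而抑制分裂,起到类似剪枝的作用
```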
251 | ```python 252 | # 定义XGBoost模型类。docstring沿用了原仓库的classifier说法,但基学习器是XGBoostRegressionTree,分类问题也是先回归再做概率转换 253 | class XGBoost(object): 254 | """The XGBoost classifier. 255 | 256 | 参考文档: http://xgboost.readthedocs.io/en/latest/model.html 257 | 258 | Parameters: 259 | ----------- 260 | n_estimators: int 261 | 树的数量 262 | The number of classification trees that are used. 263 | learning_rate: float 264 | 梯度下降的学习率 265 | The step length that will be taken when following the negative gradient during 266 | training. 267 | min_samples_split: int 268 | 每棵子树的节点的最小数目(小于后不继续切割) 269 | The minimum number of samples needed to make a split when building a tree. 270 | min_impurity: float 271 | 每棵子树的最小纯度(小于后不继续切割) 272 | The minimum impurity required to split the tree further. 273 | max_depth: int 274 | 每棵子树的最大层数(大于后不继续切割) 275 | The maximum depth of a tree. 276 | """ 277 | 278 | # 构建一个含有n_estimators棵XGBoostRegressionTree的类 279 | def __init__(self, n_estimators=200, learning_rate=0.01, min_samples_split=2, 280 | min_impurity=1e-7, max_depth=2): 281 | self.n_estimators = n_estimators # 树最大生成数量 282 | self.learning_rate = learning_rate # 权重更新步长 283 | self.min_samples_split = min_samples_split # 每棵子树的节点的最小数目(小于后不继续切割) 284 | self.min_impurity = min_impurity # 每棵子树的最小纯度(小于后不继续切割),标准是最小方差 285 | self.max_depth = max_depth # 每棵子树的最大层数(大于后不继续切割) 286 | 287 | self.bar = progressbar.ProgressBar(widgets=bar_widgets) 288 | 289 | # 定义损失函数为最小二乘损失 290 | self.loss = LeastSquaresLoss() 291 | 292 | # 初始化回归树 293 | # for _ in range()表示不关心具体元素内容,就是简单的让下面的循环执行range()次,_表示占位符 294 | self.trees = [] 295 | for _ in range(n_estimators): 296 | tree = XGBoostRegressionTree( 297 | min_samples_split=self.min_samples_split, 298 | min_impurity=min_impurity, 299 | max_depth=self.max_depth, 300 | loss=self.loss) 301 | 302 | self.trees.append(tree) 303 | 304 | 305 | # np.concatenate 按轴向将两个数组组成一个新数组 306 | # 先对X计算数据的样本总量m 307 | # 再将y重塑为m行的列向量准备用于训练 308 | def fit(self, X, y): 309 | # y = to_categorical(y) 310 | m = X.shape[0] 311 | y = np.reshape(y, (m, -1)) 312 | # 生成一个全部为0的、与y行列数相同的数组,该数组主要用于预测值的初始化 313 | # np.concatenate 无法对一维数组按axis=1拼接,但对二维及以上的数组可以;这里按axis=1拼接是把y和y_pred横向拼成一个矩阵 314 | y_pred = np.zeros(np.shape(y)) 315 | for i in self.bar(range(self.n_estimators)): 316 | tree = self.trees[i] 317 | y_and_pred = np.concatenate((y, y_pred), axis=1) 318 | tree.fit(X, y_and_pred) 319 | # 这里是调用了决策树基函数的predict方法,逐个对样本进行预测并返回预测值 320 | update_pred = tree.predict(X) 321 | update_pred = np.reshape(update_pred, (m, -1)) 322 | # 加法模型,预测值是当前轮和上一轮叠加的结果 323 | y_pred += update_pred 324 | 325 | def predict(self, X): 326 | y_pred = None 327 | m = X.shape[0] 328 | # 开始预测 329 | for tree in self.trees: 330 | # 估计梯度和更新预测值 331 | update_pred = tree.predict(X) 332 | update_pred = np.reshape(update_pred, (m, -1)) 333 | if y_pred is None: 334 | # 初始化一个全部为0的同型数组或者同型矩阵 335 | y_pred = np.zeros_like(update_pred) 336 | y_pred += update_pred 337 | 338 | return y_pred 339 | ``` 340 | 341 | 342 | ```python 343 | # 开始使用实例进行测试,实例数据TempLinkoping2016.txt在GBDT文件夹可以找到 344 | import matplotlib.pyplot as plt 345 | from utils.data_manipulation import train_test_split, standardize, to_categorical, normalize 346 | from utils.data_operation import mean_squared_error, accuracy_score 347 | 348 | def main(): 349 | print ("-- XGBoost --") 350 | 351 | # 加载温度数据集,本问题为回归问题 352 | data = pd.read_csv('D:\Machine-Learning-From-Scratch-master\TempLinkoping2016.txt', sep="\t") 353 | 354 | time = np.atleast_2d(data["time"].values).T 355 | temp = np.atleast_2d(data["temp"].values).T 356 | 357 | X = time.reshape((-1, 1)) # Time. Fraction of the year [0, 1] 358 | X = np.insert(X, 0, values=1, axis=1) # 插入偏差项 359 | y = temp[:, 0] # Temperature. 
Reduce to one-dim 360 | 361 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 362 | #print(y_train) 363 | model = XGBoost() 364 | model.fit(X_train, y_train) 365 | y_pred = model.predict(X_test) 366 | 367 | y_pred_line = model.predict(X) 368 | print(y_test[0:5]) 369 | # Color map 370 | cmap = plt.get_cmap('viridis') 371 | 372 | mse = mean_squared_error(y_test, y_pred) 373 | 374 | print ("Mean Squared Error:", mse) 375 | 376 | # Plot the results 377 | m1 = plt.scatter(366 * X_train[:, 1], y_train, color=cmap(0.9), s=10) 378 | m2 = plt.scatter(366 * X_test[:, 1], y_test, color=cmap(0.5), s=10) 379 | m3 = plt.scatter(366 * X_test[:, 1], y_pred, color='black', s=10) 380 | plt.suptitle("Regression Tree") 381 | plt.title("MSE: %.2f" % mse, fontsize=10) 382 | plt.xlabel('Day') 383 | plt.ylabel('Temperature in Celsius') 384 | plt.legend((m1, m2, m3), ("Training data", "Test data", "Prediction"), loc='lower right') 385 | plt.show() 386 | 387 | 388 | 389 | # 小明.py 390 | # 朋友眼中你是小明(__name__ == '小明'), 你自己眼中你是你自己(__name__ == '__main__') 391 | # 你编程很好, 朋友调你去帮他写程序(import 小明, 这时你在朋友眼中: __name__ == '小明') 392 | # 但你晚上也会打开xx网站, 做一些自己的事情(直接运行小明.py, __name__ == '__main__') 393 | # 也就是说,当你被别的文件导入的时候,你的__name__并不是'__main__',而是导入的模块名称,因此被别人导入的时候,下面的代码不会被执行 394 | if __name__ == "__main__": 395 | main() 396 | ``` 397 | 398 | Training: 0% [ ] ETA: --:--:-- 399 | 400 | -- XGBoost -- 401 | 402 | 403 | Training: 100% [------------------------------------------------] Time: 0:00:47 404 | 405 | 406 | [18.8 6.1 -0.8 17.6 5.2] 407 | Mean Squared Error: 122.01499106753589 408 | 409 | 410 | 411 | ![png](output_21_4.png) 412 | 413 | 414 | 415 | ```python 416 | data = pd.read_csv('D:\Machine-Learning-From-Scratch-master\TempLinkoping2016.txt', sep="\t") 417 | ``` 418 | 419 | 420 | ```python 421 | data.head(1) 422 | ``` 423 | 424 | 425 | 426 | 427 |
428 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 |
timetemp
00.0027320.1
457 |
458 | 459 | 460 | 461 | 462 | ```python 463 | import matplotlib.pyplot as plt 464 | 465 | from utils.data_manipulation import train_test_split, standardize, to_categorical, normalize 466 | from utils.data_operation import mean_squared_error, accuracy_score 467 | 468 | def main(): 469 | print ("-- XGBoost --") 470 | 471 | # 加载《统计学习方法》例8.2 472 | x = np.array(range(1,11,1)) 473 | y = np.array([5.56,5.70,5.91,6.40,6.80,7.05,8.9,8.7,9.00,9.05]) 474 | data = pd.DataFrame([x,y]).T 475 | data.columns=['x','y'] 476 | 477 | X = np.atleast_2d(data["x"].values).T 478 | Y = np.atleast_2d(data["y"].values).T 479 | 480 | X = X.reshape((-1, 1)) 481 | X = np.insert(X, 0, values=1, axis=1) 482 | y = Y[:, 0] 483 | 484 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 485 | #print(y_train) 486 | model = XGBoost() 487 | model.fit(X_train, y_train) 488 | y_pred = model.predict(X_test) 489 | 490 | y_pred_line = model.predict(X) 491 | print(y_test[0:5]) 492 | # Color map 493 | cmap = plt.get_cmap('viridis') 494 | 495 | mse = mean_squared_error(y_test, y_pred) 496 | 497 | print ("Mean Squared Error:", mse) 498 | 499 | # Plot the results 500 | m1 = plt.scatter(366 * X_train[:, 1], y_train, color=cmap(0.9), s=10) 501 | m2 = plt.scatter(366 * X_test[:, 1], y_test, color=cmap(0.5), s=10) 502 | m3 = plt.scatter(366 * X_test[:, 1], y_pred, color='black', s=10) 503 | plt.suptitle("Regression Tree") 504 | plt.title("MSE: %.2f" % mse, fontsize=10) 505 | plt.xlabel('X') 506 | plt.ylabel('Y') 507 | plt.legend((m1, m2, m3), ("Training data", "Test data", "Prediction"), loc='lower right') 508 | plt.show() 509 | 510 | 511 | if __name__ == "__main__": 512 | main() 513 | ``` 514 | 515 | Training: 44% [--------------------- ] ETA: 0:00:00 516 | 517 | -- XGBoost -- 518 | 519 | 520 | Training: 100% [------------------------------------------------] Time: 0:00:00 521 | 522 | 523 | [8.9 9. 
] 524 | Mean Squared Error: 1.838750000000001 525 | 526 | 527 | 528 | ![png](output_24_4.png) 529 | 530 | 531 | 532 | ```python 533 | print('\n'.join([''.join([('365'[(x-y) % len('365')] if 534 | ((x*0.05)**2+(y*0.1)**2-1)**3-(x*0.05)**2*(y*0.1)**3 <= 0 else ' ') 535 | for x in range(-30, 30)]) for y in range(30, -30, -1)])) 536 | ``` 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 653653653 365365365 557 | 65365365365365365 36536536536536536 558 | 36536536536536536536536536536536536536536 559 | 3653653653653653653653653653653653653653653 560 | 365365365365365365365365365365365365365365365 561 | 653653653653653653653653653653653653653653653 562 | 536536536536536536536536536536536536536536536 563 | 365365365365365365365365365365365365365365365 564 | 653653653653653653653653653653653653653653653 565 | 536536536536536536536536536536536536536536536 566 | 6536536536536536536536536536536536536536536 567 | 36536536536536536536536536536536536536536 568 | 65365365365365365365365365365365365365365 569 | 6536536536536536536536536536536536536 570 | 36536536536536536536536536536536536 571 | 536536536536536536536536536536536 572 | 53653653653653653653653653653 573 | 5365365365365365365365365 574 | 536536536536536536536 575 | 365365365365365 576 | 653653653 577 | 536 578 | 6 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | ```python 602 | 603 | ``` 604 | -------------------------------------------------------------------------------- /Xgboost/XGBoost_code/output_21_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/Xgboost/XGBoost_code/output_21_4.png -------------------------------------------------------------------------------- /Xgboost/XGBoost_code/output_24_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/Xgboost/XGBoost_code/output_24_4.png -------------------------------------------------------------------------------- /Xgboost/readme.md: -------------------------------------------------------------------------------- 1 | #### XGBoost学习方法 2 | 如果有大佬看到代码内容有问题并愿意指正,请及时联系cethuang@gmail.com,诚挚感谢指导 3 | 4 | 要学习代码,建议先看如下内容!!! 5 | 6 | v1.2 20190722 7 | 8 | - XGBoost的代码实现主要分为以下步骤 9 | 10 | 1. 代码主要使用的损失函数为最小二乘损失,这个损失函数的使用简单,理解方便,不清楚的可以在网上自行搜索资料学习透彻后再进行下一步 11 | 2. 代码主要计算了最小二乘损失的梯度和海赛函数,梯度实际上是函数的一阶导数,海赛实际上是函数的二阶导数,是一阶导函数的导数。这样组成的函数可以构建二阶泰勒公式,泰勒公式主要用于定义一个新的更方便求解极值的函数用于近似原函数。 12 | 3. XGBoost的相比GBDT更大的进步不仅仅在于使用了二阶泰勒公式(GBDT使用的是梯度),能够更好的拟合损失函数。另外一个大的特点是在损失函数中加入了正则化项,实际上,正则化项常见于逻辑斯蒂回归、SVM、神经网络等,这些正则化项主要是对参数θ进行正则化。而XGBoost的正则化主要是对叶子节点进行正则化,这实际上是希望对叶子节点进行惩罚,如果叶子节点生成过多,那么可以通过正则化项对叶子节点这一项的总值进行缩小,那么实际上就起到了剪枝的作用;同时XGBoost还对决策树的分裂次数进行正则化,这实际上是希望分裂出来的决策树相对简单,那么可以更好的解决对未知数据预测能力的过拟合问题。根据《统计学习方法》决策树相关章节的论述,决策树模型的损失函数同样也加入了正则化项目,但是决策树仅仅对叶节点加入了正则化,并未对决策树的分裂次数进行正则化。 13 | 4. 本代码实现为了简单,方便理解,在所有的公式中,都没有加入正则化项,也就是令λ=0,w=0。至于不明白正则化作用的,建议学习的内容是吴恩达的网易云课堂的机器学习正则化相关视频,可以清晰的理解正则化项到底对损失函数做了什么。 14 | 5. 本代码基于GBDT迭代生成,因此相关的测试数据均可以在本仓库下的GBDT下学习后再进行,强烈不建议没有掌握GBDT相关知识的同学直接上来学习XGBoost。 15 | 6. 正确的学习路径应该是决策树-CART回归树-Boosting-AdaBoost-GBT-GBDT-XGBoost-LightGBM。XGBoost相关的列采样方案建议还需要学习补充随机森林相关算法知识。决策树到GBT相关知识非常建议学习《统计学习方法》相关章节,内容翔实,案例简单。 16 | 7. 
XGBoost算法的核心代码主要是基于面向对象设计,对super超类继承等方法不熟悉的同学还需要补充面向对象相关方法,方可理解本代码实现。本代码实现注释了非常多的内容,虽然繁琐,但是方便初学者理解。 17 | 8. 学习本代码可以先结合本仓库GBDT下的GBDT_XGBoost_LGBM算法原理v1.1.PPT对XGBoost和相关公式推导有一个概念理解,然后再进行代码阅读。 18 | 9. 本代码中定义的Gain增益系数,和决策树的Gini基尼系数是两个概念,请勿混淆,至于具体的XGBoost的Gain增益值是如何计算并打分的,请参照第8点的材料进行阅读。 19 | 20 | 21 | v1.1 20190715 22 | 23 | - 添加知乎上一张图说明XGBoost算法的图例,可以结合wepon的算法材料学习 24 | 25 | 26 | 27 | v1.0 20190713 28 | 29 | - 本方法仅建议机器学习初学者参考,大佬求放过 30 | - 对于初学者,不要看到各种Kaggle比赛、腾讯广告算法大赛的大佬用了LGBM(XGBoost变种)等算法,就想着跳过经典机器学习算法,直接去学最牛逼的算法 31 | - 所有牛逼的算法都有一个牛逼的爹。XGBoost很牛逼,是因为有多个牛逼的爹。列举部分如下:决策树爹、正则爹、泰勒公式爹、随机森林爹(列采样基因)、排序爹(特征预排序让分裂点查找可以并行,并非Boosting本身并行)、梯度下降爹…… 32 | - 你应该先去看李航《统计学习方法》中关于CART决策树的基础理论,掌握一棵决策树是如何遍历所有切分点然后找到最佳切分点的 33 | - 你应该先去学习梯度下降算法,至少应该看懂一元函数和多元函数求偏导数的方法,为此你还需要去复习一下极限理论 34 | - 你应该先去学习决策树的损失函数,了解决策树的损失函数是如何控制分裂点的 35 | - 你应该先去学习正则化的思想,强烈建议去看网易云课堂的吴恩达机器学习关于正则化的相关解释,非常适合我这种白痴,至少应该懂得惩罚项到底在干嘛。我一开始看到惩罚项,以为是前面的损失函数做错了什么事情,例如私下河塘洗澡之类的需要受到惩罚,损失函数到底做错了什么。正则化有L1和L2两种,至少应该理解经典的等高线图的交叉点概念 36 | - 先学习Boosting算法,掌握加法模型和Bagging算法的不同之处在哪儿,比如和随机森林的区别点在什么地方 37 | - 然后去掌握提升树算法,理解提升树在拟合数据的时候使用残差的概念,实际上就是在求平方损失函数的负梯度。 38 | - 然后去掌握梯度提升树算法,理解为何需要升级成这个算法:在面临什么场景的时候提升树算法就不灵了,需要使用梯度来求解类似SoftMax这样的损失函数 39 | - 然后你理解了上面的GBDT,你可以开始尝试去学习XGBoost。别急,看蛋疼的原论文的时候很痛苦,找一下知乎,找一下github上可能是东半球最大的学习组织(罗永浩?)等地方,慢慢地啃,一步步地来,才是适合我这样的白痴的学习方法。 40 | -------------------------------------------------------------------------------- /Xgboost/【HP20190616】xgboost_titanic.md: -------------------------------------------------------------------------------- 1 | 2 | ## kaggle泰坦尼克号机器学习xgboost 3 | 4 | 5 | ```python 6 | import numpy as np 7 | import pandas as pd 8 | import re 9 | import sklearn 10 | import os 11 | # 显示当前路径 12 | os.getcwd() 13 | ``` 14 | 15 | 16 | 17 | 18 | 'D:\\jupyter_notebook' 19 | 20 | 21 | 22 | 23 | ```python 24 | # 导入数据 25 | train_ = pd.read_csv('D:/jupyter_notebook/titanic/train.csv') 26 | test_ = pd.read_csv('D:/jupyter_notebook/titanic/test.csv') 27 | ``` 28 | 29 | 30 | ```python 31 | # 根据原始特征的观察构建新特征 32 | # 计算名字的长度 33 | train_['Name_length'] = train_['Name'].apply(len) 34 | # 将旅客是否有客舱号记录二值化(Cabin缺失时为浮点型NaN,故用type判断) 35 | train_['Has_Cabin'] = train_["Cabin"].apply(lambda x: 0 if type(x) == float else 1) 36 | # 构建新特征家庭总人数 37 | train_['FamilySize'] = train_['SibSp'] + train_['Parch'] + 1 38 | # 构建新特征是否独居 39 | train_['IsAlone'] = 0 40 | train_.loc[train_['FamilySize'] == 1, 'IsAlone'] = 1 41 | # 查看乘客登船口岸是否存在缺失值 42 | train_['Embarked'].isnull().value_counts() 43 | # 对乘客登船口岸用固定值填充缺失值 44 | train_['Embarked'] = train_['Embarked'].fillna('S') 45 | # 对票价用中位数填充缺失值 46 | train_['Fare'] = train_['Fare'].fillna(train_['Fare'].median()) 47 | # 生成绝对票价分区,qcut按分位数切分,将每一个值划分到具体的分区区间中去,此处定义为四分位 48 | train_['CategoricalFare'] = pd.qcut(train_['Fare'], 4) 49 | # 生成新变量年龄平均值、年龄标准差 50 | age_avg = train_['Age'].mean() 51 | age_std = train_['Age'].std() 52 | # 统计年龄的缺失值数量 53 | age_null_count = train_['Age'].isnull().sum() 54 | # np.random.randint()产生离散均匀分布的整数,size是产生的元素数量,前两个参数分别为区间的最小值和最大值 55 | age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count) 56 | # 对年龄用生成的一些新数值进行填充 57 | train_['Age'][np.isnan(train_['Age'])] = age_null_random_list 58 | # 转换变量类型为数值类型,便于后期计算 59 | train_['Age'] = train_['Age'].astype(int) 60 | # 对年龄生成新的分箱变量来代替,即将年龄绝对值转换为离散类别 61 | train_['CategoricalAge'] = pd.cut(train_['Age'], 5) 62 | 63 | # 定义正则表达式函数导出旅客的Title 64 | def get_title(name): 65 | # re.search()方法扫描整个字符串,并返回第一个成功的匹配。如果匹配失败,则返回None 66 | title_search = re.search('([A-Za-z]+)\.',name) 67 | if title_search: 68 | return title_search.group(1) 69 | return '' 70 | 71 | # 
取出姓名中尊称部分 72 | train_['Title'] = train_['Name'].apply(get_title) 73 | 74 | # 对姓名的称呼部分做统一 75 | train_['Title'] = train_['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major' 76 | , 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare') 77 | train_['Title'] = train_['Title'].replace('Mlle', 'Miss') 78 | train_['Title'] = train_['Title'].replace('Ms', 'Miss') 79 | train_['Title'] = train_['Title'].replace('Mme', 'Mrs') 80 | 81 | # 对性别从离散型替换为数值型 82 | train_['Sex'] = train_['Sex'].map( {'female': 0, 'male': 1} ).astype(int) 83 | 84 | # 对姓名的称呼部分做数值型变换 85 | title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5} 86 | # 先定义一个字典,然后通过map函数传入字典进行替换 87 | train_['Title'] = train_['Title'].map(title_mapping) 88 | # 最后对缺失值替换为0 89 | train_['Title'] = train_['Title'].fillna(0) 90 | 91 | # 替换登船口岸 92 | train_['Embarked'] = train_['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int) 93 | 94 | # 替换票价的四分位数,该步骤应该有更好的办法做数据处理 95 | # loc函数取出列中某类元素的数据集 96 | train_.loc[ train_['Fare'] <= 7.91, 'Fare'] = 0 97 | train_.loc[(train_['Fare'] > 7.91) & (train_['Fare'] <= 14.454), 'Fare'] = 1 98 | train_.loc[(train_['Fare'] > 14.454) & (train_['Fare'] <= 31), 'Fare'] = 2 99 | train_.loc[ train_['Fare'] > 31, 'Fare'] = 3 100 | train_['Fare'] = train_['Fare'].astype(int) 101 | 102 | # 对年龄进行分段 103 | train_.loc[ train_['Age'] <= 16, 'Age'] = 0 104 | train_.loc[(train_['Age'] > 16) & (train_['Age'] <= 32), 'Age'] = 1 105 | train_.loc[(train_['Age'] > 32) & (train_['Age'] <= 48), 'Age'] = 2 106 | train_.loc[(train_['Age'] > 48) & (train_['Age'] <= 64), 'Age'] = 3 107 | train_.loc[train_['Age'] > 64, 'Age'] = 4 108 | 109 | 110 | # 特征选择,先对处理过的不需要的特征进行删除,定义一个列表,然后批量删除 111 | drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp'] 112 | train_ = train_.drop(drop_elements, axis = 1) 113 | train_ = train_.drop(['CategoricalAge', 'CategoricalFare'], axis = 1) 114 | # test_ = test_.drop(drop_elements, axis = 1) 115 | 116 | train_.head() 117 | ``` 118 | 119 | C:\Users\IBM\Anaconda3\lib\site-packages\ipykernel_launcher.py:27: SettingWithCopyWarning: 120 | A value is trying to be set on a copy of a slice from a DataFrame 121 | 122 | See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy 123 | 124 | 125 | 126 | 127 | 128 |
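注:上面 `train_['Age'][np.isnan(train_['Age'])] = age_null_random_list` 这种链式索引赋值,正是触发上方 SettingWithCopyWarning 的原因。一个惯用的等价写法(仅为示意,沿用上面已定义的变量)是用 `.loc` 一步完成条件赋值:

```python
# 等价写法示意:布尔掩码 + .loc 单步赋值,不会触发 SettingWithCopyWarning
# 掩码为True的行数等于 age_null_count,与右侧随机数组长度一致
train_.loc[train_['Age'].isnull(), 'Age'] = age_null_random_list
```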
129 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 |
SurvivedPclassSexAgeParchFareEmbarkedName_lengthHas_CabinFamilySizeIsAloneTitle
00311000230201
11102031511203
21301010220112
31102030441203
40312010240111
238 |
239 | 240 | 241 | 242 | 243 | ```python 244 | #根据原始特征的观察构建新特征 245 | # 计算名字的长度 246 | test_['Name_length'] = test_['Name'].apply(len) 247 | # 将旅客是否住在头等舱二值化 248 | test_['Has_Cabin'] = test_["Cabin"].apply(lambda x: 0 if type(x) == float else 1) 249 | # 构建新特征家庭总人数 250 | test_['FamilySize'] = test_['SibSp'] + test_['Parch'] + 1 251 | # 构建新特征是否独居 252 | test_['IsAlone'] = 0 253 | test_.loc[test_['FamilySize'] == 1, 'IsAlone'] = 1 254 | # 查看乘客登船口岸存在缺失值 255 | test_['Embarked'].isnull().value_counts() 256 | # 对乘客登船口岸进行固定值填充缺失值 257 | test_['Embarked'] = test_['Embarked'].fillna('S') 258 | # 对票价进行中位数填充缺失值 259 | test_['Fare'] = test_['Fare'].fillna(test_['Fare'].median()) 260 | # 生成绝对票价分区,qcut是根据分区分位定义,将每一个值划为到具体的分区区间中去,此处定义为四分位值 261 | test_['CategoricalFare'] = pd.qcut(test_['Fare'], 4) 262 | # 生成新变量年龄平均值、年龄标准差 263 | age_avg = test_['Age'].mean() 264 | age_std = test_['Age'].std() 265 | # 计算年龄是否有缺失值并统计 266 | age_null_count = test_['Age'].isnull().sum() 267 | # np.random.randint()产生离散均匀分布的整数,size是产生的元素数量,前面分别为最小值和最大值区间 268 | age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count) 269 | # 对年龄用生成的一些新数值进行填充 270 | test_['Age'][np.isnan(test_['Age'])] = age_null_random_list 271 | # 转换变量类型为数值类型,便于后期计算 272 | test_['Age'] = test_['Age'].astype(int) 273 | # 对年龄生成新的分箱变量中来代替,即将年龄绝对值转换为离散类别 274 | test_['CategoricalAge'] = pd.cut(test_['Age'], 5) 275 | 276 | # 定义正则表达式函数导出旅客的Title 277 | def get_title(name): 278 | # re.search()方法扫描整个字符串,并返回第一个成功的匹配。如果匹配失败,则返回None 279 | title_search = re.search('([A-Za-z]+)\.',name) 280 | if title_search: 281 | return title_search.group(1) 282 | return '' 283 | 284 | # 取出姓名中尊称部分 285 | test_['Title'] = test_['Name'].apply(get_title) 286 | 287 | # 对姓名的称呼部分做统一 288 | test_['Title'] = test_['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major' 289 | , 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare') 290 | test_['Title'] = test_['Title'].replace('Mlle', 'Miss') 291 | test_['Title'] = test_['Title'].replace('Ms', 'Miss') 292 | test_['Title'] = test_['Title'].replace('Mme', 'Mrs') 293 | 294 | # 对性别从离散型替换为数值型 295 | test_['Sex'] = test_['Sex'].map( {'female': 0, 'male': 1} ).astype(int) 296 | 297 | # 对姓名的称呼部分做数值型变换 298 | title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5} 299 | # 先定义一个字典,然后通过map函数传入字典进行替换 300 | test_['Title'] = test_['Title'].map(title_mapping) 301 | # 最后对缺失值替换为0 302 | test_['Title'] = test_['Title'].fillna(0) 303 | 304 | # 替换登船口岸 305 | test_['Embarked'] = test_['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int) 306 | 307 | # 替换票价的四分位数,该步骤应该有更好的办法做数据处理 308 | # loc函数取出列中某类元素的数据集 309 | test_.loc[ test_['Fare'] <= 7.91, 'Fare'] = 0 310 | test_.loc[(test_['Fare'] > 7.91) & (test_['Fare'] <= 14.454), 'Fare'] = 1 311 | test_.loc[(test_['Fare'] > 14.454) & (test_['Fare'] <= 31), 'Fare'] = 2 312 | test_.loc[ test_['Fare'] > 31, 'Fare'] = 3 313 | test_['Fare'] = test_['Fare'].astype(int) 314 | 315 | # 对年龄进行分段 316 | test_.loc[ test_['Age'] <= 16, 'Age'] = 0 317 | test_.loc[(test_['Age'] > 16) & (test_['Age'] <= 32), 'Age'] = 1 318 | test_.loc[(test_['Age'] > 32) & (test_['Age'] <= 48), 'Age'] = 2 319 | test_.loc[(test_['Age'] > 48) & (test_['Age'] <= 64), 'Age'] = 3 320 | test_.loc[test_['Age'] > 64, 'Age'] = 4 321 | 322 | 323 | # 特征选择,先对处理过的不需要的特征进行删除,定义一个列表,然后批量删除 324 | drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp'] 325 | test_ = test_.drop(drop_elements, axis = 1) 326 | test_ = test_.drop(['CategoricalAge', 'CategoricalFare'], axis = 1) 327 | # test_ = test_.drop(drop_elements, axis = 
1) 328 | 329 | test_.head() 330 | ``` 331 | 332 | C:\Users\IBM\Anaconda3\lib\site-packages\ipykernel_launcher.py:27: SettingWithCopyWarning: 333 | A value is trying to be set on a copy of a slice from a DataFrame 334 | 335 | See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy 336 | 337 | 338 | 339 | 340 | 341 |
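注:test_ 的特征工程与前面 train_ 的代码几乎逐行重复。一个可选的整理思路(仅为示意草稿,函数名 build_features 为假设)是把公共流程封装成函数,对两个数据集复用,避免两份代码改动后不同步:

```python
# 示意:封装公共特征工程,train_/test_ 各调用一次(仅列出部分步骤,其余同上)
def build_features(df):
    df = df.copy()
    df['Name_length'] = df['Name'].apply(len)
    df['Has_Cabin'] = df['Cabin'].notnull().astype(int)
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    # ……年龄填充、分箱、Title提取等步骤同上,略
    return df

# train_ = build_features(pd.read_csv('D:/jupyter_notebook/titanic/train.csv'))
# test_ = build_features(pd.read_csv('D:/jupyter_notebook/titanic/test.csv'))
```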
342 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 |
PclassSexAgeParchFareEmbarkedName_lengthHas_CabinFamilySizeIsAloneTitle
0312002160111
1302000320203
2213012250111
3311010160111
4301110440303
445 |
446 | 447 | 448 | 449 | 450 | ```python 451 | import xgboost as xgb 452 | import pandas as pd 453 | import numpy as np 454 | import sklearn 455 | import os 456 | from sklearn.model_selection import train_test_split # 导入测试集和验证集划分函数 457 | ``` 458 | 459 | 460 | ```python 461 | X = train_.drop("Survived",axis= 1) # 提取不带标签的数据集 462 | Y = train_["Survived"] # 提取数据集的标签,数据集的标签一般是指用于预测的label 463 | ``` 464 | 465 | 466 | ```python 467 | X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3, random_state = 0) 468 | ``` 469 | 470 | 471 | ```python 472 | dtrain = xgb.DMatrix(X_train, label=Y_train) 473 | dtest = xgb.DMatrix(X_test, label=Y_test) 474 | ``` 475 | 476 | 477 | ```python 478 | watchlist = [(dtest, 'eval'), (dtrain, 'train')] 479 | param = {'max_depth':3, 'eta':1, 'silent':1, 'objective':'multi:softmax', 'num_class':3} 480 | 481 | bst = xgb.train(param, dtrain, num_boost_round=10, evals=watchlist) 482 | y_hat = bst.predict(dtest) 483 | result = Y_test.values.reshape(1, -1) == y_hat 484 | print('the accuracy:\t', float(np.sum(result)) / len(y_hat)) 485 | ``` 486 | 487 | [0] eval-merror:0.201493 train-merror:0.160514 488 | [1] eval-merror:0.16791 train-merror:0.157303 489 | [2] eval-merror:0.175373 train-merror:0.144462 490 | [3] eval-merror:0.182836 train-merror:0.136437 491 | [4] eval-merror:0.171642 train-merror:0.138042 492 | [5] eval-merror:0.160448 train-merror:0.133226 493 | [6] eval-merror:0.160448 train-merror:0.126806 494 | [7] eval-merror:0.171642 train-merror:0.125201 495 | [8] eval-merror:0.164179 train-merror:0.11878 496 | [9] eval-merror:0.164179 train-merror:0.117175 497 | the accuracy: 0.835820895522388 498 | 499 | 500 | 501 | ```python 502 | 503 | ``` 504 | -------------------------------------------------------------------------------- /Xgboost/一张图说明XGBoost算法.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/Xgboost/一张图说明XGBoost算法.jpg -------------------------------------------------------------------------------- /kaggle初学者应该如何参加机器学习比赛.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 1.问题建模" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### 1.1 业务理解\n", 15 | "理解赛题的具体含义,理解业务才能构造出与业务相关性高的特征." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "### 1.2 赛题数据\n", 23 | "拿到数据首先明确数值特征部分和类别特征部分. 然后思考能够构造出哪些特征,并且考虑哪些特征或者数据是不能够使用的." 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "### 1.3 评价指标 AUC\n", 31 | "本数据题采用的是AUC作为评价指标,二分类问题" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "### 1.4 是否存在线下验证\n", 39 | "时序验证 :一般选择最近邻的1-3天\n", 40 | "\n", 41 | "交叉验证:与k相关大, k偏小则性能不稳定. k偏大则计算量大." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## 2. 
## 2. Exploratory data analysis
Usually done with visualization and statistical tests.

### 2.1 Dataset size and field types
How large is the dataset, and what type is each field?

### 2.2 Missing values
How severe is the missingness, and do any missing values carry special meaning?

### 2.3 Feature redundancy
For example, recording height in both m and cm as two separate features is redundant.

### 2.4 Time information
Watch for potential temporal leakage. This problem does have time features, and according to the baseline the training and test sets were collected over two different time periods.

### 2.5 Label distribution
Is there class imbalance? This problem is severely imbalanced: fraud cases are far rarer than non-fraud cases.

### 2.6 Train and test distributions
Does the training set contain fields the test set lacks, or do the two sets contain different fields?

## 3. Feature engineering

### 3.1 Data preprocessing

#### 3.1.1 Visualizing numerical features: plot them as scatter plots and remove the outliers.

#### 3.1.2 Handling missing values

Missing values are not all dirty data; some carry concrete business meaning and should be filled according to that meaning.

The rest are genuinely missing: fill them with statistics, or leave them unfilled, since tree models can handle missing values (see the sketch below).
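A minimal sketch of this, with made-up column names (`age`, `cabin`):

```python
import numpy as np
import pandas as pd

# Toy data for illustration only.
df = pd.DataFrame({"age": [22.0, np.nan, 35.0, np.nan],
                   "cabin": ["C85", None, None, "E46"]})

print(df.isnull().mean())  # how severe is the missingness per column?

# Missingness with business meaning: encode it explicitly as a flag.
df["has_cabin"] = df["cabin"].notnull().astype(int)

# Genuinely missing numeric values: fill with a statistic...
df["age"] = df["age"].fillna(df["age"].median())
# ...or simply leave the NaNs, since tree models such as XGBoost handle them natively.
```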
#### 3.1.3 Handling erroneous values
Besides values that obviously violate logic or common sense, note that some datasets use a specific sentinel value to stand in for missing data; watch out for these.

Obvious error: a body weight of 1000 kg. Hidden error: by business convention, -1 in some particular feature may have been defined to mean "missing"; such hidden errors also need handling.

#### 3.1.4 Handling false labels
If the training set contains obviously wrong labels (given the business context), delete those samples directly, or replace them using statistical features.

When the label is inconsistent with the evaluation metric, transform the label numerically according to the actual requirement.

### 3.2 Feature extraction

#### 3.2.1 Categorical features

#### 3.2.1.1 Encoding
String features that the model cannot learn from directly can be encoded (ordinal encoding vs. one-hot encoding; the choice depends on whether the feature carries a meaningful order).

#### 3.2.1.2 Count encoding
Counting occurrences of each category reflects its overall popularity. The statistic is very sensitive to outliers, e.g. one extremely large category, which can hurt the model's generalization.

#### 3.2.1.3 Count-rank encoding
Rank the categories by their counts instead; this shrinks the weight of outliers and reduces their influence.
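A minimal sketch of these three encodings on a toy categorical column (`city` is a made-up name):

```python
import pandas as pd

df = pd.DataFrame({"city": ["bj", "sh", "bj", "gz", "bj", "sh"]})

# 3.2.1.1 Ordinal encoding (fine for tree models) and one-hot (for orderless features).
df["city_code"] = df["city"].astype("category").cat.codes
onehot = pd.get_dummies(df["city"], prefix="city")

# 3.2.1.2 Count encoding: the overall popularity of each category.
df["city_count"] = df["city"].map(df["city"].value_counts())

# 3.2.1.3 Count-rank encoding: ranking the counts damps extreme categories.
df["city_count_rank"] = df["city_count"].rank(method="dense")
```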
#### 3.2.1.4 Target encoding
Build features from the label, e.g. statistics of the label per group (in the housing-rent competition, the mean price within each plate).

Doing this naively easily overfits, so use cross-validation:

split the data into several folds, treat n-1 folds as known data and the remaining fold as unknown, construct the feature on the known data, and assign it to the unknown data. This avoids overfitting and improves generalization.

#### 3.2.1.5 Cross combinations
Crossing categorical features with each other yields a finer category granularity. Combining categorical with numerical features captures per-category statistics, e.g. the mean value within each category.

#### 3.2.1.6 Preventing overfitting
Build features with cross statistics (e.g. split the data into 5 parts and build each part from the other 4, repeating five times to assemble a complete set).

Build time-series features (use information from the previous one or two days). In these settings some category values may not appear in both sets

(e.g. plate62 in the rent competition and action type5 in the JD competition); those can be filled with statistical features.
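A minimal sketch of out-of-fold target encoding in the spirit of 3.2.1.4 and 3.2.1.6; `plate` and `price` are made-up names echoing the rent-competition example.

```python
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

df = pd.DataFrame({"plate": ["a", "a", "b", "b", "a", "c"],
                   "price": [10.0, 12.0, 30.0, 28.0, 11.0, 50.0]})

df["plate_te"] = np.nan
for tr_idx, va_idx in KFold(n_splits=3, shuffle=True, random_state=0).split(df):
    # Statistics come from the "known" folds only...
    means = df.iloc[tr_idx].groupby("plate")["price"].mean()
    # ...and are assigned to the held-out fold, so no row sees its own label.
    df.loc[df.index[va_idx], "plate_te"] = df.iloc[va_idx]["plate"].map(means).values

# Categories absent from the known folds stay NaN; fill with a global statistic.
df["plate_te"] = df["plate_te"].fillna(df["price"].mean())
```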
#### 3.2.2 Numerical features

#### 3.2.2.1 Bucketing: convert numerical features into discrete ones, e.g. by quantiles

#### 3.2.2.2 Business-driven feature crosses
Add, subtract, multiply, and divide different features.

#### 3.2.2.3 Cross combinations
Cross the categorical and numerical features to build new features.

#### 3.2.2.4 Time features
These capture periodicity and trend, and the more recent the information, the better it works.

Date variables can be one-hot encoded.

Time-series features can be built with historical shifts (lags) and sliding-window statistics.

#### 3.2.2.5 Multi-valued features

Multi-valued features can be fully expanded with one-hot encoding, summarized by term-frequency counts, or reduced in dimension with word2vec, embeddings, and so on.
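A minimal sketch of these numeric-feature tricks; all column names here are illustrative.

```python
import pandas as pd

df = pd.DataFrame({"fare": [7.3, 71.3, 8.1, 53.1, 8.5],
                   "age": [22, 38, 26, 35, 35],
                   "city": ["bj", "sh", "bj", "sh", "bj"],
                   "dt": pd.to_datetime(["2019-06-01", "2019-06-02",
                                         "2019-06-02", "2019-06-03", "2019-06-04"])})

# 3.2.2.1 Bucketing by quantiles.
df["fare_bin"] = pd.qcut(df["fare"], q=2, labels=False)

# 3.2.2.2 Business-driven arithmetic crosses.
df["fare_per_age"] = df["fare"] / df["age"]

# 3.2.2.3 Category x numeric cross: per-category statistics.
df["city_fare_mean"] = df.groupby("city")["fare"].transform("mean")

# 3.2.2.4 Time features: calendar parts, lags, and sliding windows.
df["dow"] = df["dt"].dt.dayofweek
df["fare_lag1"] = df["fare"].shift(1)
df["fare_roll3"] = df["fare"].rolling(3, min_periods=1).mean()
```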
### 3.3 Feature selection
No method here is guaranteed to be the best; it must be analyzed case by case. Some algorithms, such as XGBoost and LightGBM, effectively perform feature selection on their own.

A further note: some Kaggle workflows now automate hyperparameter tuning with helper libraries, reportedly choosing parameters with Bayesian optimization.

As experienced competitors put it: try several approaches during a competition; no single one is always best.

#### 3.3.1 Filter methods
Chi-square tests and mutual information measure the relevance between x and y; correlation coefficients measure the relevance between features.

#### 3.3.2 Wrapper methods
Forward search: fix a model, start from a few basic features, then add features one by one and keep those that help. A heuristic method that may get stuck in a local optimum.

Backward search: remove features from the model one by one. More time-consuming, and hard to apply when the data is large.

#### 3.3.3 Embedded methods
Select features by the importance scores returned by a tree model. Essentially any boosting model can be used for feature selection, and many baselines do exactly this.
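A minimal sketch of the embedded approach; `X` (a feature DataFrame) and `y` (labels) are assumed inputs, and keeping the top 20 features is an arbitrary illustrative choice.

```python
# Minimal sketch of embedded feature selection (3.3.3).
# `X` (feature DataFrame) and `y` (labels) are assumed to exist already.
import pandas as pd
from xgboost import XGBClassifier

model = XGBClassifier(n_estimators=100, max_depth=3)
model.fit(X, y)

# Rank features by the model's importance scores and keep the strongest ones.
importance = pd.Series(model.feature_importances_, index=X.columns)
top = importance.sort_values(ascending=False).head(20).index
X_selected = X[top]
```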
## 4. Essential models and model fusion

### 4.1 XGBoost and LightGBM

Low demands on feature preprocessing.

Friendly to both categorical and continuous features.

Missing values need no imputation.

### 4.2 Model fusion
In theory: make the individual models strong yet as different as possible. With diversity in features, samples, and model families, fusion achieves better results.

In practice, prepare several feature sets, train on samples with little overlap, and use models from different theoretical families to reach this goal.

Fusion during training is built into tree models, so it can be achieved through tuning.

Result-level fusion: voting (classification), averaging (regression), and stacking.

## 5. Competition wrap-up
A good post-competition summary is even more important than the competition itself.

Summarize promptly after the contest: your overall approach, the key code, your shortcomings, and which experiments you still need to try.

Study the strong solutions: don't stay confined to your own way of thinking; see how others reasoned and what is worth borrowing, and compare to find your own gaps.

For beginners: patience and persistence are essential. Having chosen to compete, invest the time and keep learning; go step by step rather than aiming for a top 10 in one shot.

--------------------------------------------------------------------------------