├── Adaboost └── 【HP20190519】Adaboost算法代码学习.md ├── CART └── 【HP20190525】CART算法代码实现.md ├── GBDT ├── GBDT_XGBoost_LGBM算法原理v1.1.pdf ├── GBDT_python3_code │ ├── GBDT_model.md │ ├── TempLinkoping2016.txt │ ├── readme.md │ └── utils │ │ ├── __init__.py │ │ ├── __pycache__ │ │ ├── __init__.cpython-37.pyc │ │ ├── bools.cpython-37.pyc │ │ ├── data_manipulation.cpython-37.pyc │ │ ├── data_operation.cpython-37.pyc │ │ ├── dates.cpython-37.pyc │ │ ├── enum.cpython-37.pyc │ │ ├── lists.cpython-37.pyc │ │ ├── loss_functions.cpython-37.pyc │ │ ├── math.cpython-37.pyc │ │ ├── misc.cpython-37.pyc │ │ └── objects.cpython-37.pyc │ │ ├── bools.py │ │ ├── data_manipulation.py │ │ ├── data_operation.py │ │ ├── dates.py │ │ ├── decision_tree │ │ ├── __pycache__ │ │ │ ├── decision_tree_model.cpython-35.pyc │ │ │ └── decision_tree_model.cpython-37.pyc │ │ ├── decision_tree_classifier_example.py │ │ ├── decision_tree_model.py │ │ └── decision_tree_regressor_example.py │ │ ├── dicts │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-37.pyc │ │ │ ├── chained_dict.cpython-37.pyc │ │ │ ├── helpers.cpython-37.pyc │ │ │ └── limited_dict.cpython-37.pyc │ │ ├── chained_dict.py │ │ ├── helpers.py │ │ └── limited_dict.py │ │ ├── enum.py │ │ ├── lists.py │ │ ├── loss_functions.py │ │ ├── math.py │ │ ├── misc.py │ │ └── objects.py ├── readme.md └── 【HP20190706】《统计学习方法》第一版例题8.2代码实现.md ├── README.md ├── Stacking ├── 20190630_titanic_Stacking.ipynb ├── Stacking_learn_beta.ipynb ├── Stacking_learn_beta.md ├── changelog.md ├── kaggle_titanic_data │ ├── gender_submission.csv │ ├── test.csv │ └── train.csv └── 两层stacking结构理解beta.pdf ├── Xgboost ├── XGBoost_code │ ├── XGBoost算法代码简易实现.md │ ├── output_21_4.png │ └── output_24_4.png ├── readme.md ├── 【HP20190616】xgboost_titanic.md └── 一张图说明XGBoost算法.jpg └── kaggle初学者应该如何参加机器学习比赛.ipynb /Adaboost/【HP20190519】Adaboost算法代码学习.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ```python 4 | import numpy as np 5 | import pandas as pd 6 | ``` 7 | 8 | 9 | ```python 10 | # 通过一个函数来加载特征矩阵和类别标签 11 | def loadSimpleData(): 12 | print ("-----加载数据的特征矩阵和类别标签------") 13 | datMat = np.matrix([[1,2.1],[2,1.1],[1.3,1],[1,1],[2,1]]) 14 | classLabels = [1,1,-1,-1,1] 15 | print("展示特征矩阵") 16 | print(datMat) 17 | print("类别标签") 18 | print(classLabels) 19 | return datMat, classLabels 20 | ``` 21 | 22 | 23 | ```python 24 | datMat,classLabels=loadSimpleData() 25 | ``` 26 | 27 | -----加载数据的特征矩阵和类别标签------ 28 | 展示特征矩阵 29 | [[1. 2.1] 30 | [2. 1.1] 31 | [1.3 1. ] 32 | [1. 1. ] 33 | [2. 1. 
]] 34 | 类别标签 35 | [1, 1, -1, -1, 1] 36 | 37 | 38 | 39 | ```python 40 | # 定义单层决策树的阈值过滤函数 41 | # 接着是树的分类函数。这个函数在下面的循环里要用到 42 | # 作用很简单,就是比对每一列的特征值和阈值,返回比对的结果。四个参数分别是(输入矩阵,第几列,阈值,lt小于或gt大于) 43 | def stumpClassify(dataMatrix,dimen,threshVal,threshIneq): 44 | #对数据集每一列的各个特征进行阈值过滤,这里是构建一个行数与数据特征集相等、元素全部为1的向量 45 | retArray=np.ones((np.shape(dataMatrix)[0],1)) 46 | #阈值的模式,将小于等于某一阈值的特征归类为-1,下面的逻辑判断将要对每一个元素值与阈值进行比较,如果小于等于这个值,那么就更新这个元素为-1 47 | if threshIneq=='lt': 48 | retArray[dataMatrix[:,dimen]<=threshVal]=-1.0 49 | #否则将大于某一阈值的特征归类为-1 50 | else: 51 | retArray[dataMatrix[:,dimen]>threshVal]=-1.0 52 | return retArray 53 | ``` 54 | 55 | 56 | ```python 57 | # 定义单层决策树函数,这个单层决策树函数会遍历一系列候选阈值,然后用阈值去判断当前特征列的每一个元素是否大于阈值,据此输出类别预测 58 | def buildStump(dataArr,classLabels,D): 59 | # 将数据集和标签列表转为矩阵形式,其中标签定义为转置矩阵 60 | dataMatrix=np.mat(dataArr);labelMat=np.mat(classLabels).T 61 | # 计算矩阵的行数和列数 62 | m,n=np.shape(dataMatrix) 63 | # 对关键变量进行初始化,设置步长或区间总数,定义一个字典保存最优决策树信息(迭代的时候这里就用来保存每一次的最新的预测结果) 64 | # 初始化生成一个m行1列的全0元素的矩阵,这里是为后期最优单层决策树预测结果生成初始化值 65 | numSteps=10.0;bestStump={};bestClasEst=np.mat(np.zeros((m,1))) 66 | # 最小错误率初始化为+∞,inf是指正无穷的意思,其中错误率应该是一个正实数 67 | minError=np.inf 68 | # 遍历每一列的特征值,这里的n是原始特征矩阵的列数,机器学习实战中的n=2,那么就是分别遍历两列的特征值 69 | for i in range(n): 70 | #找出并定义每列中特征值的最小值和最大值 71 | rangeMin=dataMatrix[:,i].min();rangeMax=dataMatrix[:,i].max() 72 | #求取步长大小或者说区间间隔,这里的步长设置其实只是其中一种定义步长的算法 73 | stepSize=(rangeMax-rangeMin)/numSteps 74 | #遍历各个步长区间,这里实际上是遍历[-1,10]这个闭区间所有的整数 75 | for j in range(-1,int(numSteps)+1): 76 | #两种阈值过滤模式 77 | for inequal in ['lt','gt']: 78 | #阈值计算公式:最小值+j(-1<=j<=numSteps)*步长 79 | # 以机器学习实战的第一列为准,rangeMin=1,j=-1,stepSize=(2-1)/10=0.1,那么阈值threshVal=[1+(-1)*0.1]=0.9 80 | # 阈值从略低于最小值扫到略高于最大值,这样两端就能覆盖把全部样本判为同一类的极端切分,从而不会漏掉任何候选划分 81 | threshVal=(rangeMin+float(j)*stepSize) 82 | #选定阈值后,调用阈值过滤函数分类预测,\表示续行符,如果取消,就需要将下一行的内容接回当前行,否则会报错 83 | # stumpClassify(矩阵,第1列,0.9的阈值,lt),初次运行predictedVals的结果为array([[1.],[1.],[1.],[1.],[1.]]) 84 | predictedVals=\ 85 | stumpClassify(dataMatrix,i,threshVal,inequal) 86 | # 既然predictedVals已经预测出来,那么就需要去比较预测的结果和真实结果的值的错误率,那么初始化错误向量 87 | # 这里初始化出来的向量是一个元素都为1的5行1列的矩阵,用来对错误的情况进行比对 88 | errArr=np.mat(np.ones((m,1))) 89 | #将错误向量中分类正确项置0 90 | errArr[predictedVals==labelMat]=0 91 | #计算加权的错误率,这里用到了矩阵乘法,矩阵与矩阵相乘,还是一个矩阵,但是一个行向量和一个列向量相乘,是一个数 92 | weightedError=D.T*errArr 93 | #打印相关信息,可省略 94 | print("当前遍历的列数为 %d, 阈值 %.2f, 当前大于小于类型: %s, 加权错误率= %.3f" %\ 95 | (i, threshVal, inequal, weightedError)) 96 | #如果当前错误率小于当前最小错误率,将当前错误率作为最小错误率 97 | #存储相关信息 98 | if weightedError < minError: 99 | minError = weightedError 100 | bestClasEst = predictedVals.copy() 101 | bestStump['dim'] = i 102 | bestStump['thresh'] = threshVal 103 | bestStump['ineq'] = inequal 104 | # 返回保存最优单层决策树信息的字典、最小加权错误率和最优的类别预测向量 105 | return bestStump,minError,bestClasEst 106 | ``` -------------------------------------------------------------------------------- /CART/【HP20190525】CART算法代码实现.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ```python 4 | import numpy as np 5 | ``` 6 | 7 | 8 | 9 | ```python 10 | # 加载数据集,逐行读取文本文件,将每一行按制表符切分后映射为浮点数列表 11 | def loaddataset(filename): 12 | datamat = [] 13 | fr = open(filename) 14 | for line in fr.readlines(): 15 | curline = line.strip().split('\t') 16 | # python3中map返回的是迭代器,需要用list转换为列表 17 | fltline = list(map(float, curline)) 18 | datamat.append(fltline) 19 | return datamat 20 | ``` 21 | 22 | 23 | ```python 24 | # 定义二元切分函数,通过数组过滤的方式将数据集合切分为两个子集并返回 25 | # 三个参数分别是(数据集,待切分的特征索引,该特征的某个取值) 26 | # np.nonzero返回满足条件的样本的行索引,据此对矩阵按行切片 27 | # 大于切分值的样本划入mat0,小于等于切分值的样本划入mat1 28 | # 例如对4阶单位阵按第1列、阈值0.5切分,只有第1行(从0计数)会进入mat0 29 | 30 | 31 | 32 | 33 | 34 | 35 | def binsplitdataset(dataset, feature, value): 36 | mat0 = dataset[np.nonzero(dataset[:, feature] > value)[0],:] 37 | mat1 = dataset[np.nonzero(dataset[:, feature] <= value)[0],:] 38 | return mat0,mat1 39 | ``` 40 | 41 | 42 | ```python 43 | # np.eye生成对角矩阵 44 | testmat = np.mat(np.eye(4)) 45 | ``` 46 | 47 | 48 | ```python 49 | testmat 50 | ``` 51 | 52 | 53 | 54 | 55 | matrix([[1., 0., 0., 0.], 56 | [0., 1., 0., 0.], 57 | [0., 0., 1., 0.], 58 | [0., 0., 0., 1.]]) 59 | 60 | 61 | 62 | 63 | ```python 64 | mat0, mat1 = binsplitdataset(testmat,1,0.5) 65 | ``` 66 | 67 | 68 | ```python 69 | print('---展示按照某个特征的大于阈值部分切片的矩阵第一部分---') 70 | print(mat0) 71 | print('---展示按照某个特征的小于等于阈值部分切片的矩阵第二部分---') 72 | print(mat1) 73 | ``` 74 | 75 | ---展示按照某个特征的大于阈值部分切片的矩阵第一部分--- 76 | [[0. 1. 0. 0.]] 77 | ---展示按照某个特征的小于等于阈值部分切片的矩阵第二部分--- 78 | [[1. 0. 0. 0.] 79 | [0. 0. 1. 0.] 80 | [0. 0. 0. 1.]]
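这里补充一个小例子(演示数据为笔者自拟,仅作说明),展示 binsplitdataset 内部依赖的 np.nonzero 布尔行索引是如何工作的:

```python
# 布尔条件得到与行数等长的真值结果,np.nonzero取出条件为True的行索引
demo = np.mat([[1.0, 0.2],
               [2.0, 0.8],
               [3.0, 0.5]])
rows = np.nonzero(demo[:, 1] > 0.4)[0]
print(rows)           # [1 2],第2列大于0.4的是第1、2行(从0计数)
print(demo[rows, :])  # 按行索引切片,取出这两行样本
```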
81 | 82 | 83 | 84 | ```python 85 | # 对数据集取出目标变量列,同时对目标变量求均值,比如上述案例的对角矩阵中,目标变量是最后一列,求均值等于0.25 86 | def regleaf(dataset): 87 | return np.mean(dataset[:,-1]) 88 | ``` 89 | 90 | 91 | ```python 92 | def regerr(dataset): 93 | # np.var函数是求方差,方差是指所有数与数组均值之差的平方和的均值,方差表达的是数组内的数的差异度 94 | # 这里表达的是目标变量的方差乘以数据集的样本个数(即行数),返回的是总方差 95 | # 实际含义为计算目标变量的平方误差 96 | return np.var(dataset[:,-1]) * np.shape(dataset)[0] 97 | ``` 98 | 99 | 100 | ```python 101 | # 定义函数来寻找数据集的最佳二元切分方式,其中leaftype给出叶节点取值函数、errtype给出误差计算函数,并用元组参数ops控制函数停止的时机 102 | def chooseBestSplit(dataset, leaftype=regleaf, errtype=regerr, ops=(1,4)): 103 | # 使用两个变量来作为控制阈值,tolS是允许的误差下降值,tolN是切分的最少样本数 104 | tolS=ops[0]; tolN=ops[1] 105 | # 先将目标变量转置为一个列表,然后选取这个列表中的不重复的元素组合成为一个集合,然后计算这个集合的元素数量并判断是否唯一 106 | if len(set(dataset[:,-1].T.tolist()[0])) == 1: 107 | # 在Python中,有一个特殊的表示,None,它就是空 108 | # 如果所有目标变量的值都相等,那就不用预测了,退出,那么返回空值,并直接计算目标变量的均值 109 | return None, leaftype(dataset) 110 | # 计算数据集的行数和列数,行数代表数据集大小,列数代表特征值数量和目标变量的和 111 | m,n = np.shape(dataset) 112 | # 计算数据的平方误差 113 | S = errtype(dataset) 114 | # 初始化:bestS定义为正无穷,最佳切分特征索引和最佳切分值定义为0 115 | bestS = np.inf; bestIndex = 0; bestValue = 0 116 | # 遍历所有特征,这里n-1就是所有的特征数量 117 | # 对每一个特征 118 | for featIndex in range(n-1): 119 | # 对每一个特征里面的特征值 120 | for splitVal in set((dataset[:,featIndex].T.A.tolist())[0]): 121 | # 通过数据集的特征和特征值来对数据进行切分 122 | mat0,mat1 = binsplitdataset(dataset, featIndex, splitVal) 123 | # continue 语句用来告诉Python跳过当前循环的剩余语句,然后继续进行下一轮循环 124 | if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):continue 125 | # 计算切分的两个单元分别的平方误差和 126 | newS=errtype(mat0) + errtype(mat1) 127 | # 更新平方误差值,并更新前期初始化的值为新的特征和新的特征值 128 | if newS < bestS: 129 | bestIndex = featIndex 130 | bestValue = splitVal 131 | bestS = newS 132 | # 假如数据的误差减小量低于阈值,则退出,并计算数据集的目标变量的均值 133 | if (S-bestS) < tolS: 134 | return None, leaftype(dataset) 135 | # 通过数据集的当前的特征和特征值来对数据进行切分 136 | mat0, mat1 = binsplitdataset(dataset, bestIndex, bestValue) 137 | # 如果切分出的数据集很小且低于阈值,则退出,并计算数据集的目标变量的均值 138 | if (np.shape(mat0)[0]< tolN) or (np.shape(mat1)[0]< tolN): 139 | return None, leaftype(dataset) 140 | # 返回最佳的切分特征和最佳的切分特征值 141 | return bestIndex,bestValue 142 | ``` 143 | 144 | 145 | ```python 146 | # 生成最终的回归树,树的后三个参数决定了树的类型,分别为给出建立叶节点的函数、计算误差的函数、设置树生成的一个控制阈值 147 | def createtree(dataset, leaftype=regleaf, errtype=regerr, ops=(1,4)): 148 | # 返回最佳的切分特征和最佳的切分特征值 149 | feat, val = chooseBestSplit(dataset,leaftype, errtype, ops) 150 | if feat is None:return val 151 | rettree = {} 152 | rettree['spind'] = feat 153 | rettree['spval'] = val 154 | lset, rset = binsplitdataset(dataset, feat, val) 155 | # 这里是递归调用本函数 156 | rettree['left'] = createtree(lset, leaftype, errtype, ops) 157 | rettree['right'] = createtree(rset, leaftype, errtype, ops) 158 | return rettree 159 | ``` 160 | 161 | 162 | ```python 163 | myDat = loaddataset(r'D:\jupyter_notebook\machinelearninginaction\Ch09\ex00.txt') 164 | ``` 165 | 166 | 167 | ```python 168 | myMat = np.mat(myDat) 169 | myMat 170 | ``` 171 | 172 | 173 | 174 | 175 | matrix([[ 3.609800e-02, 1.550960e-01], 176 | [ 9.933490e-01, 1.077553e+00], 177 | [ 5.308970e-01, 8.934620e-01], 178 | [ 7.123860e-01, 5.648580e-01], 179 | [ 3.435540e-01, -3.717000e-01], 180 | [ 9.801600e-02, -3.327600e-01], 181 | [ 6.911150e-01, 8.343910e-01], 182 | [ 9.135800e-02, 9.993500e-02], 183 | [ 7.270980e-01, 1.000567e+00], 184 | [ 9.519490e-01, 9.452550e-01], 185 | [ 7.685960e-01, 7.602190e-01], 186 | [ 5.413140e-01, 8.937480e-01], 187 | [ 1.463660e-01, 3.428300e-02], 188 | [ 6.731950e-01, 9.150770e-01], 189 | [ 1.835100e-01, 1.848430e-01], 190 | [ 3.395630e-01, 
2.067830e-01], 191 | [ 5.179210e-01, 1.493586e+00], 192 | [ 7.037550e-01, 1.101678e+00], 193 | [ 8.307000e-03, 6.997600e-02], 194 | [ 2.439090e-01, -2.946700e-02], 195 | [ 3.069640e-01, -1.773210e-01], 196 | [ 3.649200e-02, 4.081550e-01], 197 | [ 2.955110e-01, 2.882000e-03], 198 | [ 8.375220e-01, 1.229373e+00], 199 | [ 2.020540e-01, -8.774400e-02], 200 | [ 9.193840e-01, 1.029889e+00], 201 | [ 3.772010e-01, -2.435500e-01], 202 | [ 8.148250e-01, 1.095206e+00], 203 | [ 6.112700e-01, 9.820360e-01], 204 | [ 7.224300e-02, -4.209830e-01], 205 | [ 4.102300e-01, 3.317220e-01], 206 | [ 8.690770e-01, 1.114825e+00], 207 | [ 6.205990e-01, 1.334421e+00], 208 | [ 1.011490e-01, 6.883400e-02], 209 | [ 8.208020e-01, 1.325907e+00], 210 | [ 5.200440e-01, 9.619830e-01], 211 | [ 4.881300e-01, -9.779100e-02], 212 | [ 8.198230e-01, 8.352640e-01], 213 | [ 9.750220e-01, 6.735790e-01], 214 | [ 9.531120e-01, 1.064690e+00], 215 | [ 4.759760e-01, -1.637070e-01], 216 | [ 2.731470e-01, -4.552190e-01], 217 | [ 8.045860e-01, 9.240330e-01], 218 | [ 7.479500e-02, -3.496920e-01], 219 | [ 6.253360e-01, 6.236960e-01], 220 | [ 6.562180e-01, 9.585060e-01], 221 | [ 8.340780e-01, 1.010580e+00], 222 | [ 7.819300e-01, 1.074488e+00], 223 | [ 9.849000e-03, 5.659400e-02], 224 | [ 3.022170e-01, -1.486500e-01], 225 | [ 6.782870e-01, 9.077270e-01], 226 | [ 1.805060e-01, 1.036760e-01], 227 | [ 1.936410e-01, -3.275890e-01], 228 | [ 3.434790e-01, 1.752640e-01], 229 | [ 1.458090e-01, 1.369790e-01], 230 | [ 9.967570e-01, 1.035533e+00], 231 | [ 5.902100e-01, 1.336661e+00], 232 | [ 2.380700e-01, -3.584590e-01], 233 | [ 5.613620e-01, 1.070529e+00], 234 | [ 3.775970e-01, 8.850500e-02], 235 | [ 9.914200e-02, 2.528000e-02], 236 | [ 5.395580e-01, 1.053846e+00], 237 | [ 7.902400e-01, 5.332140e-01], 238 | [ 2.422040e-01, 2.093590e-01], 239 | [ 1.523240e-01, 1.328580e-01], 240 | [ 2.526490e-01, -5.561300e-02], 241 | [ 8.959300e-01, 1.077275e+00], 242 | [ 1.333000e-01, -2.231430e-01], 243 | [ 5.597630e-01, 1.253151e+00], 244 | [ 6.436650e-01, 1.024241e+00], 245 | [ 8.772410e-01, 7.970050e-01], 246 | [ 6.137650e-01, 1.621091e+00], 247 | [ 6.457620e-01, 1.026886e+00], 248 | [ 6.513760e-01, 1.315384e+00], 249 | [ 6.977180e-01, 1.212434e+00], 250 | [ 7.425270e-01, 1.087056e+00], 251 | [ 9.010560e-01, 1.055900e+00], 252 | [ 3.623140e-01, -5.564640e-01], 253 | [ 9.482680e-01, 6.318620e-01], 254 | [ 2.340000e-04, 6.090300e-02], 255 | [ 7.500780e-01, 9.062910e-01], 256 | [ 3.254120e-01, -2.192450e-01], 257 | [ 7.268280e-01, 1.017112e+00], 258 | [ 3.480130e-01, 4.893900e-02], 259 | [ 4.581210e-01, -6.145600e-02], 260 | [ 2.807380e-01, -2.288800e-01], 261 | [ 5.677040e-01, 9.690580e-01], 262 | [ 7.509180e-01, 7.481040e-01], 263 | [ 5.758050e-01, 8.990900e-01], 264 | [ 5.079400e-01, 1.107265e+00], 265 | [ 7.176900e-02, -1.109460e-01], 266 | [ 5.535200e-01, 1.391273e+00], 267 | [ 4.011520e-01, -1.216400e-01], 268 | [ 4.066490e-01, -3.663170e-01], 269 | [ 6.521210e-01, 1.004346e+00], 270 | [ 3.478370e-01, -1.534050e-01], 271 | [ 8.193100e-02, -2.697560e-01], 272 | [ 8.216480e-01, 1.280895e+00], 273 | [ 4.801400e-02, 6.449600e-02], 274 | [ 1.309620e-01, 1.842410e-01], 275 | [ 7.734220e-01, 1.125943e+00], 276 | [ 7.896250e-01, 5.526140e-01], 277 | [ 9.699400e-02, 2.271670e-01], 278 | [ 6.257910e-01, 1.244731e+00], 279 | [ 5.895750e-01, 1.185812e+00], 280 | [ 3.231810e-01, 1.808110e-01], 281 | [ 8.224430e-01, 1.086648e+00], 282 | [ 3.603230e-01, -2.048300e-01], 283 | [ 9.501530e-01, 1.022906e+00], 284 | [ 5.275050e-01, 8.795600e-01], 285 | [ 8.600490e-01, 7.174900e-01], 
286 | [ 7.044000e-03, 9.415000e-02], 287 | [ 4.383670e-01, 3.401400e-02], 288 | [ 5.745730e-01, 1.066130e+00], 289 | [ 5.366890e-01, 8.672840e-01], 290 | [ 7.821670e-01, 8.860490e-01], 291 | [ 9.898880e-01, 7.442070e-01], 292 | [ 7.614740e-01, 1.058262e+00], 293 | [ 9.854250e-01, 1.227946e+00], 294 | [ 1.325430e-01, -3.293720e-01], 295 | [ 3.469860e-01, -1.503890e-01], 296 | [ 7.687840e-01, 8.997050e-01], 297 | [ 8.489210e-01, 1.170959e+00], 298 | [ 4.492800e-01, 6.909800e-02], 299 | [ 6.617200e-02, 5.243900e-02], 300 | [ 8.137190e-01, 7.066010e-01], 301 | [ 6.619230e-01, 7.670400e-01], 302 | [ 5.294910e-01, 1.022206e+00], 303 | [ 8.464550e-01, 7.200300e-01], 304 | [ 4.486560e-01, 2.697400e-02], 305 | [ 7.950720e-01, 9.657210e-01], 306 | [ 1.181560e-01, -7.740900e-02], 307 | [ 8.424800e-02, -1.954700e-02], 308 | [ 8.458150e-01, 9.526170e-01], 309 | [ 5.769460e-01, 1.234129e+00], 310 | [ 7.720830e-01, 1.299018e+00], 311 | [ 6.966480e-01, 8.454230e-01], 312 | [ 5.950120e-01, 1.213435e+00], 313 | [ 6.486750e-01, 1.287407e+00], 314 | [ 8.970940e-01, 1.240209e+00], 315 | [ 5.529900e-01, 1.036158e+00], 316 | [ 3.329820e-01, 2.100840e-01], 317 | [ 6.561500e-02, -3.069700e-01], 318 | [ 2.786610e-01, 2.536280e-01], 319 | [ 7.731680e-01, 1.140917e+00], 320 | [ 2.036930e-01, -6.403600e-02], 321 | [ 3.556880e-01, -1.193990e-01], 322 | [ 9.888520e-01, 1.069062e+00], 323 | [ 5.187350e-01, 1.037179e+00], 324 | [ 5.145630e-01, 1.156648e+00], 325 | [ 9.764140e-01, 8.629110e-01], 326 | [ 9.190740e-01, 1.123413e+00], 327 | [ 6.977770e-01, 8.278050e-01], 328 | [ 9.280970e-01, 8.832250e-01], 329 | [ 9.002720e-01, 9.968710e-01], 330 | [ 3.441020e-01, -6.153900e-02], 331 | [ 1.480490e-01, 2.042980e-01], 332 | [ 1.300520e-01, -2.616700e-02], 333 | [ 3.020010e-01, 3.171350e-01], 334 | [ 3.371000e-01, 2.633200e-02], 335 | [ 3.149240e-01, -1.952000e-03], 336 | [ 2.696810e-01, -1.659710e-01], 337 | [ 1.960050e-01, -4.884700e-02], 338 | [ 1.290610e-01, 3.051070e-01], 339 | [ 9.367830e-01, 1.026258e+00], 340 | [ 3.055400e-01, -1.159910e-01], 341 | [ 6.839210e-01, 1.414382e+00], 342 | [ 6.223980e-01, 7.663300e-01], 343 | [ 9.025320e-01, 8.616010e-01], 344 | [ 7.125030e-01, 9.334900e-01], 345 | [ 5.900620e-01, 7.055310e-01], 346 | [ 7.231200e-01, 1.307248e+00], 347 | [ 1.882180e-01, 1.136850e-01], 348 | [ 6.436010e-01, 7.825520e-01], 349 | [ 5.202070e-01, 1.209557e+00], 350 | [ 2.331150e-01, -3.481470e-01], 351 | [ 4.656250e-01, -1.529400e-01], 352 | [ 8.845120e-01, 1.117833e+00], 353 | [ 6.632000e-01, 7.016340e-01], 354 | [ 2.688570e-01, 7.344700e-02], 355 | [ 7.292340e-01, 9.319560e-01], 356 | [ 4.296640e-01, -1.886590e-01], 357 | [ 7.371890e-01, 1.200781e+00], 358 | [ 3.785950e-01, -2.960940e-01], 359 | [ 9.301730e-01, 1.035645e+00], 360 | [ 7.743010e-01, 8.367630e-01], 361 | [ 2.739400e-01, -8.571300e-02], 362 | [ 8.244420e-01, 1.082153e+00], 363 | [ 6.260110e-01, 8.405440e-01], 364 | [ 6.793900e-01, 1.307217e+00], 365 | [ 5.782520e-01, 9.218850e-01], 366 | [ 7.855410e-01, 1.165296e+00], 367 | [ 5.974090e-01, 9.747700e-01], 368 | [ 1.408300e-02, -1.325250e-01], 369 | [ 6.638700e-01, 1.187129e+00], 370 | [ 5.523810e-01, 1.369630e+00], 371 | [ 6.838860e-01, 9.999850e-01], 372 | [ 2.103340e-01, -6.899000e-03], 373 | [ 6.045290e-01, 1.212685e+00], 374 | [ 2.507440e-01, 4.629700e-02]]) 375 | 376 | 377 | 378 | 379 | ```python 380 | createtree(myMat) 381 | ``` 382 | 383 | 384 | 385 | 386 | {'spind': 0, 387 | 'spval': 0.48813, 388 | 'left': 1.0180967672413792, 389 | 'right': -0.04465028571428572} 390 | 391 | 392 | 393 | 394 
| ```python 395 | import matplotlib.pyplot as plt 396 | myDat=loaddataset(r'D:\jupyter_notebook\machinelearninginaction\Ch09\ex00.txt') 397 | myMat=np.mat(myDat) 398 | createtree(myMat) 399 | plt.plot(myMat[:,0],myMat[:,1],'ro') 400 | plt.show() 401 | ``` 402 | 403 | 404 | 405 | 406 | 407 | 408 | ```python 409 | myDat1 = np.mat(loaddataset(r'D:\jupyter_notebook\machinelearninginaction\Ch09\ex0.txt')) 410 | createtree(myDat1) 411 | ``` 412 | 413 | 414 | 415 | 416 | {'spind': 1, 417 | 'spval': 0.39435, 418 | 'left': {'spind': 1, 419 | 'spval': 0.582002, 420 | 'left': {'spind': 1, 421 | 'spval': 0.797583, 422 | 'left': 3.9871632, 423 | 'right': 2.9836209534883724}, 424 | 'right': 1.980035071428571}, 425 | 'right': {'spind': 1, 426 | 'spval': 0.197834, 427 | 'left': 1.0289583666666666, 428 | 'right': -0.023838155555555553}} 429 | 430 | 431 | 432 | 433 | ```python 434 | import matplotlib.pyplot as plt 435 | myDat1 = np.mat(loaddataset(r'D:\jupyter_notebook\machinelearninginaction\Ch09\ex0.txt')) 436 | myMat1=np.mat(myDat1) 437 | createtree(myMat1) 438 | plt.plot(myMat1[:,1],myMat1[:,2],'ro') 439 | plt.show() 440 | 441 | ``` 442 | 443 | 444 | 445 | ```python 446 | 447 | ``` 448 | 
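作为补充,下面给出一个由笔者自拟的简单预测函数草稿,演示如何用 createtree 返回的嵌套字典(spind为切分特征索引,spval为切分值)对单个样本递归地做预测;函数名 treeforecast 仅作示意,并非原书代码:

```python
# 自拟的示意函数:利用createtree生成的嵌套字典对单个样本递归预测
def treeforecast(tree, inrow):
    # 叶节点存的是目标变量的均值(浮点数)而非字典,直接返回
    if not isinstance(tree, dict):
        return float(tree)
    # 大于切分值走左子树,否则走右子树,与binsplitdataset的切分方向保持一致
    if inrow[tree['spind']] > tree['spval']:
        return treeforecast(tree['left'], inrow)
    return treeforecast(tree['right'], inrow)

# 用法示意:对ex00数据训练出的树,x=0.6大于切分值0.48813,落入左叶子,预测值约为1.018
# treeforecast(createtree(myMat), [0.6])
```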
-------------------------------------------------------------------------------- /GBDT/GBDT_XGBoost_LGBM算法原理v1.1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_XGBoost_LGBM算法原理v1.1.pdf -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/GBDT_model.md: -------------------------------------------------------------------------------- 1 | #### 基于面向对象思路实现的代码,若无基础需先补充面向对象知识 2 | 3 | 推荐面向对象知识补充路径: 4 | 5 | https://www.liaoxuefeng.com/wiki/1016959663602400/1017495723838528 6 | 7 | 8 | ```python 9 | 10 | # __future__模块,把下一个新版本的特性导入到当前版本,于是我们就可以在当前版本中测试一些新版本的特性,解决python2中运行python3代码的兼容性问题 11 | # 如果某个版本中出现了某个新的功能特性,而且这个特性和当前版本中使用的不兼容 12 | # 也就是它在该版本中不是语言标准,那么我如果想要使用的话就需要从future模块导入 13 | # division 表示精确除法 14 | from __future__ import division, print_function 15 | import numpy as np 16 | # 显示完成的进度条;后文以progressbar.ProgressBar的形式调用,因此这里同时导入模块本身 17 | import progressbar; from progressbar import * 18 | 19 | 20 | # 这段代码主要展示progressbar进度条的作用,在于展示任务的完成进度并显示出来,与本项目无任何关系 21 | import time 22 | from progressbar import * 23 | 24 | total = 1000 25 | 26 | def dosomework(): 27 | time.sleep(0.01) 28 | 29 | progress = ProgressBar() 30 | for i in progress(range(1000)): 31 | dosomework() 32 | 33 | 34 | 35 | 36 | 37 | 38 | # 导入辅助函数,这里的辅助函数全部在模块中,如果代码报错,需要自行对辅助函数的py文件和类进行整理 39 | 40 | # 训练集和测试集划分函数、数据处理标准化函数、将整型的类别标签转为onehot编码函数 41 | from utils.data_manipulation import train_test_split, standardize, to_categorical 42 | 43 | # 导入均方误差函数、分类准确率(accuracy)评估函数 44 | from utils.data_operation import mean_squared_error, accuracy_score 45 | 46 | # GBDT需要用到决策树的回归树模块,这也是GBDT的核心基础算法之一,如果对决策树不熟悉,需要先学习决策树decision_tree库下面的代码 47 | from utils.decision_tree.decision_tree_model import RegressionTree 48 | 49 | # 导入进度条调度函数,方便展示模型训练进度和倒计时 50 | from utils.misc import bar_widgets 51 | 52 | # 导入平方损失函数、交叉熵损失函数(用于多分类损失评估)、softmax损失函数 53 | from utils.loss_functions import SquareLoss, CrossEntropy, SoftMaxLoss 54 | 55 | 56 | 57 | 58 | 59 | 60 | # 这里定义GBDT的核心算法父类,后面的分类和回归算法直接继承父类的函数方法 61 | class GBDT(object): 62 | """使用一组回归树来逐步拟合损失函数的负梯度,从而完成训练和预测。 63 | 参数: 64 | ----------- 65 | n_estimators: int 66 | 树的数量 67 | The number of classification trees that are used. 68 | learning_rate: float 69 | 梯度下降的学习率 70 | The step length that will be taken when following the negative gradient during 71 | training. 72 | min_samples_split: int 73 | 每棵子树的节点的最小数目(小于后不继续切割) 74 | The minimum number of samples needed to make a split when building a tree. 75 | min_impurity: float 76 | 每棵子树的最小纯度(小于后不继续切割) 77 | The minimum impurity required to split the tree further. 78 | max_depth: int 79 | 每棵子树的最大层数(大于后不继续切割) 80 | The maximum depth of a tree. 81 | regression: boolean 82 | 是否为回归问题 83 | True or false depending on if we're doing regression or classification. 84 | """ 85 | 86 | def __init__(self, n_estimators, learning_rate, min_samples_split, 87 | min_impurity, max_depth, regression): 88 | 89 | # self表示实例本身,在__init__方法内部,就可以把各种属性绑定到self 90 | # 由于类可以起到模板的作用,因此,可以在创建实例的时候,把一些我们认为必须绑定的属性强制填写进去。 91 | # 通过定义一个特殊的__init__方法,在创建实例的时候,把上述的属性(参数)绑定到self 92 | 93 | # 树的棵数、梯度下降学习率α、每棵子树节点最小数量、每棵子树的最小纯度、每棵子树最大深度、是否为回归问题 94 | 95 | self.n_estimators = n_estimators 96 | self.learning_rate = learning_rate 97 | self.min_samples_split = min_samples_split 98 | self.min_impurity = min_impurity 99 | self.max_depth = max_depth 100 | self.regression = regression 101 | 102 | # 进度条 progressbar 103 | self.bar = progressbar.ProgressBar(widgets=bar_widgets) 104 | 105 | # 定义损失函数为平方损失 106 | self.loss = SquareLoss() 107 | 108 | # 如果是分类问题,则使用SoftMax损失 109 | if not self.regression: 110 | self.loss = SoftMaxLoss() 111 | 112 | # 分类问题也使用回归树,利用残差去学习概率 113 | self.trees = [] 114 | 115 | # 构建每一棵回归树,并按上述定义的参数来约束每棵树的生长 116 | for i in range(self.n_estimators): 117 | self.trees.append(RegressionTree(min_samples_split=self.min_samples_split, 118 | min_impurity=self.min_impurity, 119 | max_depth=self.max_depth)) 120 | 121 | 122 | # GBDT的核心代码 123 | def fit(self, X, y): 124 | # 让第一棵树去拟合模型 125 | self.trees[0].fit(X, y) 126 | y_pred = self.trees[0].predict(X) 127 | for i in self.bar(range(1, self.n_estimators)): 128 | gradient = self.loss.gradient(y, y_pred) 129 | self.trees[i].fit(X, gradient) 130 | y_pred -= np.multiply(self.learning_rate, self.trees[i].predict(X)) 131 | 132 | # np.multiply 是逐元素乘法(哈达玛积),并非矩阵乘法,这里用学习率对每棵树的预测结果做逐元素缩放 133 | def predict(self, X): 134 | y_pred = self.trees[0].predict(X) 135 | for i in range(1, self.n_estimators): 136 | y_pred -= np.multiply(self.learning_rate, self.trees[i].predict(X)) 137 | 138 | # 判断是分类问题则转换预测值的表达方式 139 | if not self.regression: 140 | # 如果是分类问题,则转换为概率分布 141 | y_pred = np.exp(y_pred) / np.expand_dims(np.sum(np.exp(y_pred), axis=1), axis=1) 142 | # 将标签设置为最大化概率的值 143 | y_pred = np.argmax(y_pred, axis=1) 144 | return y_pred 145 | 146 | 147 | 148 | 149 | 150 | # 对np.argmax的功能做一次演示,按行搜索最大值的索引,当前行存在多个并列最大值取第一个所在位置索引 151 | import numpy as np 152 | a = np.array([[1, 5, 5, 2], 153 | [9, 6, 2, 8], 154 | [3, 7, 9, 1]]) 155 | 156 | c=np.argmax(a, axis=1) 157 | print(c) 158 | 159 | [1 0 2] 160 | 161 | 162 | # 对np.multiply功能做一次演示,它对ndarray和matrix都是对应位置的元素相乘(逐元素乘法) 163 | A = np.array([[1, 2], 164 | [3, 4]]) 165 | B = np.array([[0, 1], 166 | [2, 3]]) 167 | C = np.multiply(A, B) 168 | 169 | print(C) 170 | 171 | print(np.multiply(np.mat(A),np.mat(B))) 172 | 173 | 174 | [[ 0 2] 175 | [ 6 12]] 176 | 177 | [[ 0 2] 178 | [ 6 12]] 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 
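# 下面补充一段由笔者自拟的演示代码,手工走一遍上面fit方法中的一轮梯度提升迭代,
# 帮助理解"新的树拟合当前损失的梯度,再按学习率修正累计预测值"这件事。
# 说明:这里按平方损失直接手写梯度 -(y - y_pred) 作示意,具体实现以utils.loss_functions中的SquareLoss为准

import numpy as np

y_true = np.array([3.0, -0.5, 2.0])   # 真实值(自拟小样本)
y_pred = np.array([2.0,  0.0, 1.0])   # 假设这是第一棵树的预测
gradient = -(y_true - y_pred)         # 平方损失对预测值的梯度,即负残差

# 假设第二棵树完美拟合了gradient,则按学习率0.5反向修正预测值
y_pred -= np.multiply(0.5, gradient)
print(y_pred)                         # [ 2.5  -0.25  1.5 ],每一轮都在向y_true靠近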
187 | # GBDT回归算法 188 | class GBDTRegressor(GBDT): 189 | def __init__(self, n_estimators=200, learning_rate=0.5, min_samples_split=2, 190 | min_var_red=1e-7, max_depth=4, debug=False): 191 | super(GBDTRegressor, self).__init__(n_estimators=n_estimators, 192 | learning_rate=learning_rate, 193 | min_samples_split=min_samples_split, 194 | min_impurity=min_var_red, 195 | max_depth=max_depth, 196 | regression=True) 197 | 198 | 199 | 200 | 201 | 202 | 203 | # GBDT分类算法 204 | # 在类中提前定义好训练的参数 205 | class GBDTClassifier(GBDT): 206 | def __init__(self, n_estimators=200, learning_rate=.5, min_samples_split=2, 207 | min_info_gain=1e-7, max_depth=2, debug=False): 208 | super(GBDTClassifier, self).__init__(n_estimators=n_estimators, 209 | learning_rate=learning_rate, 210 | min_samples_split=min_samples_split, 211 | min_impurity=min_info_gain, 212 | max_depth=max_depth, 213 | regression=False) 214 | def fit(self, X, y): 215 | 216 | # 对多分类label进行one_hot编码 217 | y = to_categorical(y) 218 | super(GBDTClassifier, self).fit(X, y) 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | # 分类算法的具体测试实例 230 | 231 | from __future__ import division, print_function 232 | import numpy as np 233 | from sklearn import datasets 234 | import matplotlib.pyplot as plt 235 | 236 | from utils.misc import Plot 237 | 238 | 239 | def main(): 240 | 241 | print ("-- Gradient Boosting Classification --") 242 | 243 | # 利用自带的鸢尾花数据集 244 | data = datasets.load_iris() 245 | X = data.data 246 | y = data.target 247 | 248 | # 划分训练集和测试集,测试集比例40% 249 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) 250 | print(y_train) 251 | 252 | clf = GBDTClassifier() 253 | clf.fit(X_train, y_train) 254 | y_pred = clf.predict(X_test) 255 | 256 | accuracy = accuracy_score(y_test, y_pred) 257 | 258 | print ("Accuracy:", accuracy) 259 | 260 | 261 | Plot().plot_in_2d(X_test, y_pred, 262 | title="Gradient Boosting", 263 | accuracy=accuracy, 264 | legend_labels=data.target_names) 265 | 266 | 267 | 268 | if __name__ == "__main__": 269 | main() 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | # 回归算法的具体测试实例,此处需要导入文件TempLinkoping2016.txt进行测试 281 | # from __future__ import division, print_function 282 | # import numpy as np 283 | import pandas as pd 284 | # import matplotlib.pyplot as plt 285 | # import progressbar 286 | 287 | # from utils import train_test_split, standardize, to_categorical 288 | # from utils import mean_squared_error, accuracy_score, Plot 289 | # from utils.loss_functions import SquareLoss 290 | # from utils.misc import bar_widgets 291 | # from gradient_boosting_decision_tree.gbdt_model import GBDTRegressor 292 | 293 | def main(): 294 | print ("-- Gradient Boosting Regression --") 295 | 296 | # Load temperature data 297 | data = pd.read_csv(r'D:\Git\Machine_Learning_in_Action_for_smallwhite\GBDT\GBDT_python3_code\TempLinkoping2016.txt', sep="\t") 298 | # 旧版pandas的as_matrix()方法在新版本中已被移除,这里改用.values取出底层numpy数组 299 | time = np.atleast_2d(data["time"].values).T 300 | temp = np.atleast_2d(data["temp"].values).T 301 | 302 | X = time.reshape((-1, 1)) # Time. Fraction of the year [0, 1] 303 | X = np.insert(X, 0, values=1, axis=1) # Insert bias term 304 | y = temp[:, 0] # Temperature. 
Reduce to one-dim 305 | 306 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5) 307 | 308 | model = GBDTRegressor() 309 | model.fit(X_train, y_train) 310 | y_pred = model.predict(X_test) 311 | 312 | y_pred_line = model.predict(X) 313 | 314 | # Color map 315 | cmap = plt.get_cmap('viridis') 316 | 317 | mse = mean_squared_error(y_test, y_pred) 318 | 319 | print ("Mean Squared Error:", mse) 320 | 321 | # Plot the results 322 | m1 = plt.scatter(366 * X_train[:, 1], y_train, color=cmap(0.9), s=10) 323 | m2 = plt.scatter(366 * X_test[:, 1], y_test, color=cmap(0.5), s=10) 324 | m3 = plt.scatter(366 * X_test[:, 1], y_pred, color='black', s=10) 325 | plt.suptitle("Regression Tree") 326 | plt.title("MSE: %.2f" % mse, fontsize=10) 327 | plt.xlabel('Day') 328 | plt.ylabel('Temperature in Celcius') 329 | plt.legend((m1, m2, m3), ("Training data", "Test data", "Prediction"), loc='lower right') 330 | plt.show() 331 | 332 | 333 | if __name__ == "__main__": 334 | main() 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | ``` -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/TempLinkoping2016.txt: -------------------------------------------------------------------------------- 1 | time temp 2 | 0.00273224 0.1 3 | 0.005464481 -4.5 4 | 0.008196721 -6.3 5 | 0.010928962 -9.6 6 | 0.013661202 -9.9 7 | 0.016393443 -17.1 8 | 0.019125683 -11.6 9 | 0.021857923 -6.2 10 | 0.024590164 -6.4 11 | 0.027322404 -0.5 12 | 0.030054645 0.5 13 | 0.032786885 -2.4 14 | 0.035519126 -7.5 15 | 0.038251366 -16.8 16 | 0.040983607 -16.6 17 | 0.043715847 -14.6 18 | 0.046448087 -9.6 19 | 0.049180328 -5.8 20 | 0.051912568 -8.6 21 | 0.054644809 -9.0 22 | 0.057377049 -9.7 23 | 0.06010929 -6.9 24 | 0.06284153 -3.9 25 | 0.06557377 1.4 26 | 0.068306011 1.9 27 | 0.071038251 4.3 28 | 0.073770492 6.9 29 | 0.076502732 4.3 30 | 0.079234973 5.9 31 | 0.081967213 3.8 32 | 0.084699454 1.5 33 | 0.087431694 0.1 34 | 0.090163934 4.6 35 | 0.092896175 0.8 36 | 0.095628415 -0.5 37 | 0.098360656 -1.0 38 | 0.101092896 4.2 39 | 0.103825137 6.6 40 | 0.106557377 4.8 41 | 0.109289617 4.7 42 | 0.112021858 1.3 43 | 0.114754098 0.9 44 | 0.117486339 -2.8 45 | 0.120218579 -3.3 46 | 0.12295082 -5.3 47 | 0.12568306 -6.8 48 | 0.128415301 -5.1 49 | 0.131147541 -2.6 50 | 0.133879781 -0.5 51 | 0.136612022 -0.5 52 | 0.139344262 0.1 53 | 0.142076503 1.7 54 | 0.144808743 2.4 55 | 0.147540984 -0.9 56 | 0.150273224 -1.3 57 | 0.153005464 -1.4 58 | 0.155737705 -0.1 59 | 0.158469945 -0.7 60 | 0.161202186 -2.6 61 | 0.163934426 -4.1 62 | 0.166666667 -2.7 63 | 0.169398907 0.7 64 | 0.172131148 2.0 65 | 0.174863388 1.7 66 | 0.177595628 0.9 67 | 0.180327869 0.3 68 | 0.183060109 0.9 69 | 0.18579235 1.1 70 | 0.18852459 0.1 71 | 0.191256831 -0.9 72 | 0.193989071 0.2 73 | 0.196721311 0.1 74 | 0.199453552 1.0 75 | 0.202185792 3.4 76 | 0.204918033 5.2 77 | 0.207650273 4.9 78 | 0.210382514 4.9 79 | 0.213114754 2.2 80 | 0.215846995 2.9 81 | 0.218579235 5.3 82 | 0.221311475 3.7 83 | 0.224043716 3.4 84 | 0.226775956 2.1 85 | 0.229508197 1.8 86 | 0.232240437 4.3 87 | 0.234972678 7.0 88 | 0.237704918 7.7 89 | 0.240437158 6.2 90 | 0.243169399 7.5 91 | 0.245901639 4.9 92 | 0.24863388 4.4 93 | 0.25136612 3.8 94 | 0.254098361 6.4 95 | 0.256830601 8.0 96 | 0.259562842 7.9 97 | 0.262295082 8.9 98 | 0.265027322 6.6 99 | 0.267759563 6.5 100 | 0.270491803 5.8 101 | 0.273224044 5.6 102 | 0.275956284 4.7 103 | 0.278688525 5.5 104 | 0.281420765 5.5 105 | 0.284153005 5.8 106 | 0.286885246 5.3 107 | 0.289617486 6.9 108 | 0.292349727 5.9 
109 | 0.295081967 6.1 110 | 0.297814208 6.6 111 | 0.300546448 6.7 112 | 0.303278689 6.5 113 | 0.306010929 7.0 114 | 0.308743169 5.8 115 | 0.31147541 3.0 116 | 0.31420765 2.5 117 | 0.316939891 2.4 118 | 0.319672131 4.3 119 | 0.322404372 2.8 120 | 0.325136612 3.6 121 | 0.327868852 6.8 122 | 0.330601093 9.1 123 | 0.333333333 8.4 124 | 0.336065574 9.3 125 | 0.338797814 13.3 126 | 0.341530055 10.6 127 | 0.344262295 10.5 128 | 0.346994536 11.8 129 | 0.349726776 14.7 130 | 0.352459016 16.2 131 | 0.355191257 16.4 132 | 0.357923497 16.9 133 | 0.360655738 12.3 134 | 0.363387978 10.2 135 | 0.366120219 11.2 136 | 0.368852459 6.1 137 | 0.371584699 6.4 138 | 0.37431694 6.1 139 | 0.37704918 10.4 140 | 0.379781421 10.3 141 | 0.382513661 11.9 142 | 0.385245902 12.9 143 | 0.387978142 12.5 144 | 0.390710383 17.5 145 | 0.393442623 19.9 146 | 0.396174863 19.3 147 | 0.398907104 11.4 148 | 0.401639344 9.7 149 | 0.404371585 10.7 150 | 0.407103825 13.0 151 | 0.409836066 12.4 152 | 0.412568306 16.3 153 | 0.415300546 19.2 154 | 0.418032787 19.2 155 | 0.420765027 19.8 156 | 0.423497268 19.5 157 | 0.426229508 16.6 158 | 0.428961749 13.0 159 | 0.431693989 12.6 160 | 0.43442623 17.6 161 | 0.43715847 13.7 162 | 0.43989071 11.3 163 | 0.442622951 10.2 164 | 0.445355191 10.2 165 | 0.448087432 11.6 166 | 0.450819672 14.2 167 | 0.453551913 14.4 168 | 0.456284153 17.4 169 | 0.459016393 13.1 170 | 0.461748634 17.4 171 | 0.464480874 15.9 172 | 0.467213115 15.9 173 | 0.469945355 15.5 174 | 0.472677596 16.4 175 | 0.475409836 16.7 176 | 0.478142077 18.2 177 | 0.480874317 20.9 178 | 0.483606557 22.2 179 | 0.486338798 19.1 180 | 0.489071038 16.3 181 | 0.491803279 16.6 182 | 0.494535519 15.1 183 | 0.49726776 14.5 184 | 0.5 17.4 185 | 0.50273224 16.5 186 | 0.505464481 13.7 187 | 0.508196721 14.0 188 | 0.510928962 14.2 189 | 0.513661202 15.6 190 | 0.516393443 15.7 191 | 0.519125683 15.6 192 | 0.521857923 16.2 193 | 0.524590164 16.3 194 | 0.527322404 18.3 195 | 0.530054645 16.6 196 | 0.532786885 16.1 197 | 0.535519126 15.9 198 | 0.538251366 16.0 199 | 0.540983607 15.9 200 | 0.543715847 16.0 201 | 0.546448087 15.7 202 | 0.549180328 17.2 203 | 0.551912568 19.9 204 | 0.554644809 21.0 205 | 0.557377049 19.4 206 | 0.56010929 20.4 207 | 0.56284153 23.1 208 | 0.56557377 23.0 209 | 0.568306011 19.9 210 | 0.571038251 17.6 211 | 0.573770492 18.8 212 | 0.576502732 17.8 213 | 0.579234973 18.6 214 | 0.581967213 16.4 215 | 0.584699454 15.2 216 | 0.587431694 15.3 217 | 0.590163934 16.0 218 | 0.592896175 18.0 219 | 0.595628415 17.7 220 | 0.598360656 16.0 221 | 0.601092896 16.4 222 | 0.603825137 16.7 223 | 0.606557377 14.3 224 | 0.609289617 12.2 225 | 0.612021858 10.0 226 | 0.614754098 12.0 227 | 0.617486339 16.2 228 | 0.620218579 15.9 229 | 0.62295082 14.5 230 | 0.62568306 15.3 231 | 0.628415301 13.3 232 | 0.631147541 14.5 233 | 0.633879781 15.5 234 | 0.636612022 15.3 235 | 0.639344262 17.3 236 | 0.642076503 15.3 237 | 0.644808743 16.4 238 | 0.647540984 17.0 239 | 0.650273224 20.2 240 | 0.653005464 22.4 241 | 0.655737705 18.1 242 | 0.658469945 11.6 243 | 0.661202186 14.6 244 | 0.663934426 13.5 245 | 0.666666667 17.9 246 | 0.669398907 16.4 247 | 0.672131148 15.5 248 | 0.674863388 15.9 249 | 0.677595628 14.1 250 | 0.680327869 13.2 251 | 0.683060109 14.5 252 | 0.68579235 19.0 253 | 0.68852459 18.3 254 | 0.691256831 18.8 255 | 0.693989071 16.8 256 | 0.696721311 16.8 257 | 0.699453552 14.3 258 | 0.702185792 18.4 259 | 0.704918033 18.3 260 | 0.707650273 18.4 261 | 0.710382514 14.9 262 | 0.713114754 11.4 263 | 0.715846995 12.6 264 | 0.718579235 14.0 265 | 
0.721311475 14.8 266 | 0.724043716 9.9 267 | 0.726775956 11.4 268 | 0.729508197 12.9 269 | 0.732240437 12.1 270 | 0.734972678 12.8 271 | 0.737704918 13.5 272 | 0.740437158 12.9 273 | 0.743169399 14.0 274 | 0.745901639 14.6 275 | 0.74863388 12.0 276 | 0.75136612 10.5 277 | 0.754098361 9.5 278 | 0.756830601 7.6 279 | 0.759562842 6.4 280 | 0.762295082 7.0 281 | 0.765027322 8.1 282 | 0.767759563 8.1 283 | 0.770491803 7.6 284 | 0.773224044 7.4 285 | 0.775956284 7.2 286 | 0.778688525 7.0 287 | 0.781420765 6.4 288 | 0.784153005 5.8 289 | 0.786885246 5.5 290 | 0.789617486 6.4 291 | 0.792349727 7.3 292 | 0.795081967 7.4 293 | 0.797814208 7.8 294 | 0.800546448 7.9 295 | 0.803278689 6.9 296 | 0.806010929 6.1 297 | 0.808743169 3.7 298 | 0.81147541 5.3 299 | 0.81420765 6.1 300 | 0.816939891 4.3 301 | 0.819672131 3.3 302 | 0.822404372 8.8 303 | 0.825136612 9.8 304 | 0.827868852 6.4 305 | 0.830601093 4.6 306 | 0.833333333 5.2 307 | 0.836065574 5.5 308 | 0.838797814 1.4 309 | 0.841530055 0.5 310 | 0.844262295 -2.6 311 | 0.846994536 2.4 312 | 0.849726776 -0.8 313 | 0.852459016 -3.3 314 | 0.855191257 -2.8 315 | 0.857923497 -3.5 316 | 0.860655738 -2.8 317 | 0.863387978 -2.2 318 | 0.866120219 -0.3 319 | 0.868852459 0.0 320 | 0.871584699 2.3 321 | 0.87431694 4.9 322 | 0.87704918 3.1 323 | 0.879781421 3.6 324 | 0.882513661 5.2 325 | 0.885245902 3.8 326 | 0.887978142 3.2 327 | 0.890710383 7.7 328 | 0.893442623 7.8 329 | 0.896174863 6.9 330 | 0.898907104 2.7 331 | 0.901639344 2.8 332 | 0.904371585 6.6 333 | 0.907103825 1.9 334 | 0.909836066 -1.4 335 | 0.912568306 2.2 336 | 0.915300546 1.9 337 | 0.918032787 -1.3 338 | 0.920765027 -1.6 339 | 0.923497268 -3.2 340 | 0.926229508 -2.7 341 | 0.928961749 3.7 342 | 0.931693989 -3.2 343 | 0.93442623 -0.2 344 | 0.93715847 9.3 345 | 0.93989071 7.1 346 | 0.942622951 3.2 347 | 0.945355191 1.1 348 | 0.948087432 -6.0 349 | 0.950819672 1.7 350 | 0.953551913 -1.3 351 | 0.956284153 -2.2 352 | 0.959016393 -1.2 353 | 0.961748634 1.0 354 | 0.964480874 1.7 355 | 0.967213115 3.7 356 | 0.969945355 4.7 357 | 0.972677596 -0.3 358 | 0.975409836 3.5 359 | 0.978142077 3.4 360 | 0.980874317 3.9 361 | 0.983606557 4.5 362 | 0.986338798 5.3 363 | 0.989071038 2.7 364 | 0.991803279 -0.4 365 | 0.994535519 4.3 366 | 0.99726776 7.0 367 | 1 9.3 -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/readme.md: -------------------------------------------------------------------------------- 1 | 20190710 2 | 3 | v1.0 添加GBDT的辅助函数,为运行GBDT算法代码,需要下载辅助函数文件夹utils到您的python或者anaconda安装目录,例如将utils文件夹下载并拷贝到anaconda为\Anaconda3\Lib\site-packages目录下。 4 | 5 | 添加模型的主程序文件,可以直接在jupyter_notebook中运行 6 | 7 | 添加模型的回归和分类问题代码和测试数据 8 | 9 | 代码参考作者地址:https://github.com/RRdmlearning/Machine-Learning-From-Scratch 10 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__init__.py -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/bools.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/bools.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/data_manipulation.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/data_manipulation.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/data_operation.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/data_operation.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/dates.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/dates.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/enum.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/enum.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/lists.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/lists.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/loss_functions.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/loss_functions.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/math.cpython-37.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/math.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/misc.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/misc.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/__pycache__/objects.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/__pycache__/objects.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/bools.py: -------------------------------------------------------------------------------- 1 | try: 2 | reduce 3 | except NameError: 4 | from functools import reduce 5 | 6 | def xor(*things): 7 | return reduce(lambda x, y: bool(x) ^ bool(y), things) 8 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/data_manipulation.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from itertools import combinations_with_replacement 3 | import numpy as np 4 | import math 5 | import sys 6 | 7 | 8 | def shuffle_data(X, y, seed=None): 9 | """ Random shuffle of the samples in X and y """ 10 | if seed: 11 | np.random.seed(seed) 12 | idx = np.arange(X.shape[0]) 13 | np.random.shuffle(idx) 14 | return X[idx], y[idx] 15 | 16 | 17 | def batch_iterator(X, y=None, batch_size=64): 18 | """ Simple batch generator """ 19 | n_samples = X.shape[0] 20 | for i in np.arange(0, n_samples, batch_size): 21 | begin, end = i, min(i+batch_size, n_samples) 22 | if y is not None: 23 | yield X[begin:end], y[begin:end] 24 | else: 25 | yield X[begin:end] 26 | 27 | 28 | def divide_on_feature(X, feature_i, threshold): 29 | """ Divide dataset based on if sample value on feature index is larger than 30 | the given threshold """ 31 | split_func = None 32 | if isinstance(threshold, int) or isinstance(threshold, float): 33 | split_func = lambda sample: sample[feature_i] >= threshold 34 | else: 35 | split_func = lambda sample: sample[feature_i] == threshold 36 | 37 | X_1 = np.array([sample for sample in X if split_func(sample)]) 38 | X_2 = np.array([sample for sample in X if not split_func(sample)]) 39 | 40 | return np.array([X_1, X_2]) 41 | 42 | 43 | def polynomial_features(X, degree): 44 | n_samples, n_features = np.shape(X) 45 | 46 | def index_combinations(): 47 | combs = [combinations_with_replacement(range(n_features), i) for i in range(0, degree + 1)] 48 | flat_combs = [item for sublist in combs for item in sublist] 49 | return flat_combs 50 | 51 | combinations = index_combinations() 52 | n_output_features = len(combinations) 53 | X_new = np.empty((n_samples, n_output_features)) 54 | 55 | for i, index_combs in enumerate(combinations): 56 | X_new[:, i] = np.prod(X[:, index_combs], axis=1) 57 | 58 | return X_new 59 | 60 | 61 | def get_random_subsets(X, y, n_subsets, 
replacements=True): 62 | """ Return random subsets (with replacements) of the data """ 63 | n_samples = np.shape(X)[0] 64 | # Concatenate x and y and do a random shuffle 65 | X_y = np.concatenate((X, y.reshape((1, len(y))).T), axis=1) 66 | np.random.shuffle(X_y) 67 | subsets = [] 68 | 69 | # Uses 50% of training samples without replacements 70 | subsample_size = int(n_samples // 2) 71 | if replacements: 72 | subsample_size = n_samples # 100% with replacements 73 | 74 | for _ in range(n_subsets): 75 | idx = np.random.choice( 76 | range(n_samples), 77 | size=np.shape(range(subsample_size)), 78 | replace=replacements) 79 | X = X_y[idx][:, :-1] 80 | y = X_y[idx][:, -1] 81 | subsets.append([X, y]) 82 | return subsets 83 | 84 | 85 | def normalize(X, axis=-1, order=2): 86 | """ Normalize the dataset X """ 87 | l2 = np.atleast_1d(np.linalg.norm(X, order, axis)) 88 | l2[l2 == 0] = 1 89 | return X / np.expand_dims(l2, axis) 90 | 91 | 92 | def standardize(X): 93 | """ Standardize the dataset X """ 94 | X_std = X 95 | mean = X.mean(axis=0) 96 | std = X.std(axis=0) 97 | for col in range(np.shape(X)[1]): 98 | if std[col]: 99 | X_std[:, col] = (X_std[:, col] - mean[col]) / std[col] 100 | # X_std = (X - X.mean(axis=0)) / X.std(axis=0) 101 | return X_std 102 | 103 | 104 | def train_test_split(X, y, test_size=0.5, shuffle=True, seed=None): 105 | """ Split the data into train and test sets """ 106 | if shuffle: 107 | X, y = shuffle_data(X, y, seed) 108 | # Split the training data from test data in the ratio specified in 109 | # test_size 110 | split_i = len(y) - int(len(y) // (1 / test_size)) 111 | X_train, X_test = X[:split_i], X[split_i:] 112 | y_train, y_test = y[:split_i], y[split_i:] 113 | 114 | return X_train, X_test, y_train, y_test 115 | 116 | 117 | def k_fold_cross_validation_sets(X, y, k, shuffle=True): 118 | """ Split the data into k sets of training / test data """ 119 | if shuffle: 120 | X, y = shuffle_data(X, y) 121 | 122 | n_samples = len(y) 123 | left_overs = {} 124 | n_left_overs = (n_samples % k) 125 | if n_left_overs != 0: 126 | left_overs["X"] = X[-n_left_overs:] 127 | left_overs["y"] = y[-n_left_overs:] 128 | X = X[:-n_left_overs] 129 | y = y[:-n_left_overs] 130 | 131 | X_split = np.split(X, k) 132 | y_split = np.split(y, k) 133 | sets = [] 134 | for i in range(k): 135 | X_test, y_test = X_split[i], y_split[i] 136 | X_train = np.concatenate(X_split[:i] + X_split[i + 1:], axis=0) 137 | y_train = np.concatenate(y_split[:i] + y_split[i + 1:], axis=0) 138 | sets.append([X_train, X_test, y_train, y_test]) 139 | 140 | # Add left over samples to last set as training samples 141 | if n_left_overs != 0: 142 | np.append(sets[-1][0], left_overs["X"], axis=0) 143 | np.append(sets[-1][2], left_overs["y"], axis=0) 144 | 145 | return np.array(sets) 146 | 147 | 148 | def to_categorical(x, n_col=None): 149 | """ One-hot encoding of nominal values """ 150 | if not n_col: 151 | n_col = np.amax(x) + 1 152 | one_hot = np.zeros((x.shape[0], n_col)) 153 | one_hot[np.arange(x.shape[0]), x] = 1 154 | return one_hot 155 | 156 | 157 | def to_nominal(x): 158 | """ Conversion from one-hot encoding to nominal """ 159 | return np.argmax(x, axis=1) 160 | 161 | 162 | def make_diagonal(x): 163 | """ Converts a vector into an diagonal matrix """ 164 | m = np.zeros((len(x), len(x))) 165 | for i in range(len(m[0])): 166 | m[i, i] = x[i] 167 | return m 168 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/data_operation.py: 
-------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | import math 4 | import sys 5 | 6 | 7 | def calculate_entropy(y): 8 | """ Calculate the entropy of label array y """ 9 | log2 = lambda x: math.log(x) / math.log(2) 10 | unique_labels = np.unique(y) 11 | entropy = 0 12 | for label in unique_labels: 13 | count = len(y[y == label]) 14 | p = count / len(y) 15 | entropy += -p * log2(p) 16 | return entropy 17 | 18 | 19 | def mean_squared_error(y_true, y_pred): 20 | """ Returns the mean squared error between y_true and y_pred """ 21 | mse = np.mean(np.power(y_true - y_pred, 2)) 22 | return mse 23 | 24 | 25 | def calculate_variance(X): 26 | """ Return the variance of the features in dataset X """ 27 | mean = np.ones(np.shape(X)) * X.mean(0) 28 | n_samples = np.shape(X)[0] 29 | variance = (1 / n_samples) * np.diag((X - mean).T.dot(X - mean)) 30 | 31 | return variance 32 | 33 | 34 | def calculate_std_dev(X): 35 | """ Calculate the standard deviations of the features in dataset X """ 36 | std_dev = np.sqrt(calculate_variance(X)) 37 | return std_dev 38 | 39 | 40 | def euclidean_distance(x1, x2): 41 | """ Calculates the l2 distance between two vectors """ 42 | distance = 0 43 | # Squared distance between each coordinate 44 | for i in range(len(x1)): 45 | distance += pow((x1[i] - x2[i]), 2) 46 | return math.sqrt(distance) 47 | 48 | 49 | def accuracy_score(y_true, y_pred): 50 | """ Compare y_true to y_pred and return the accuracy """ 51 | accuracy = np.sum(y_true == y_pred, axis=0) / len(y_true) 52 | return accuracy 53 | 54 | 55 | def calculate_covariance_matrix(X, Y=None): 56 | """ Calculate the covariance matrix for the dataset X """ 57 | if Y is None: 58 | Y = X 59 | n_samples = np.shape(X)[0] 60 | covariance_matrix = (1 / (n_samples-1)) * (X - X.mean(axis=0)).T.dot(Y - Y.mean(axis=0)) 61 | 62 | return np.array(covariance_matrix, dtype=float) 63 | 64 | 65 | def calculate_correlation_matrix(X, Y=None): 66 | """ Calculate the correlation matrix for the dataset X """ 67 | if Y is None: 68 | Y = X 69 | n_samples = np.shape(X)[0] 70 | covariance = (1 / n_samples) * (X - X.mean(0)).T.dot(Y - Y.mean(0)) 71 | std_dev_X = np.expand_dims(calculate_std_dev(X), 1) 72 | std_dev_y = np.expand_dims(calculate_std_dev(Y), 1) 73 | correlation_matrix = np.divide(covariance, std_dev_X.dot(std_dev_y.T)) 74 | 75 | return np.array(correlation_matrix, dtype=float) 76 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dates.py: -------------------------------------------------------------------------------- 1 | """Useful things to do with dates""" 2 | import datetime 3 | 4 | 5 | def date_from_string(string, format_string=None): 6 | """Runs through a few common string formats for datetimes, 7 | and attempts to coerce them into a datetime. 
Alternatively, 8 | format_string can provide either a single string to attempt 9 | or an iterable of strings to attempt.""" 10 | 11 | if isinstance(format_string, str): 12 | return datetime.datetime.strptime(string, format_string).date() 13 | 14 | elif format_string is None: 15 | format_string = [ 16 | "%Y-%m-%d", 17 | "%m-%d-%Y", 18 | "%m/%d/%Y", 19 | "%d/%m/%Y", 20 | ] 21 | 22 | for format in format_string: 23 | try: 24 | return datetime.datetime.strptime(string, format).date() 25 | except ValueError: 26 | continue 27 | 28 | raise ValueError("Could not produce date from string: {}".format(string)) 29 | 30 | 31 | def to_datetime(plain_date, hours=0, minutes=0, seconds=0, ms=0): 32 | """given a datetime.date, gives back a datetime.datetime""" 33 | # don't mess with datetimes 34 | if isinstance(plain_date, datetime.datetime): 35 | return plain_date 36 | return datetime.datetime( 37 | plain_date.year, 38 | plain_date.month, 39 | plain_date.day, 40 | hours, 41 | minutes, 42 | seconds, 43 | ms, 44 | ) 45 | 46 | 47 | class TimePeriod(object): 48 | 49 | def __init__(self, earliest, latest): 50 | if not isinstance(earliest, datetime.date) and earliest is not None: 51 | raise TypeError("Earliest must be a date or None") 52 | if not isinstance(latest, datetime.date) and latest is not None: 53 | raise TypeError("Latest must be a date or None") 54 | 55 | # convert dates to datetimes, for to have better resolution 56 | if earliest is not None: 57 | earliest = to_datetime(earliest) 58 | if latest is not None: 59 | latest = to_datetime(latest, 23, 59, 59) 60 | 61 | if earliest is not None and latest is not None and earliest >= latest: 62 | raise ValueError("Earliest must be earlier than latest") 63 | 64 | self._earliest = earliest 65 | self._latest = latest 66 | 67 | def __contains__(self, key): 68 | if isinstance(key, datetime.date): 69 | key = to_datetime(key) 70 | 71 | if self._latest is None: 72 | upper_bounded = True 73 | else: 74 | upper_bounded = key <= self._latest 75 | 76 | if self._earliest is None: 77 | lower_bounded = True 78 | else: 79 | lower_bounded = self._earliest <= key 80 | 81 | return upper_bounded and lower_bounded 82 | 83 | elif isinstance(key, TimePeriod): 84 | if self._latest is None: 85 | upper_bounded = True 86 | elif key._latest is None: 87 | upper_bounded = False 88 | else: 89 | upper_bounded = self._latest >= key._latest 90 | 91 | if self._earliest is None: 92 | lower_bounded = True 93 | elif key._earliest is None: 94 | lower_bounded = False 95 | else: 96 | lower_bounded = self._earliest <= key._earliest 97 | 98 | return upper_bounded and lower_bounded 99 | 100 | def contains(self, other): 101 | return other in self 102 | 103 | def overlaps(self, other): 104 | """does another datetime overlap with this one? this is a symmetric 105 | property. 
106 | 107 | TP1 |------------| 108 | -------------------------------------------------> time 109 | TP2 |--------------| 110 | 111 | TP1.overlaps(TP2) == TP2.overlaps(TP1) == True 112 | 113 | args: 114 | other - a TimePeriod 115 | """ 116 | 117 | return self._latest in other or self._earliest in other 118 | 119 | def __eq__(self, other): 120 | return (self._earliest == other._earliest) and (self._latest == other._latest) 121 | 122 | def __hash__(self): 123 | return hash((self._earliest, self._latest)) 124 | 125 | def __repr__(self): 126 | return "<{}: {}-{}>".format( 127 | self.__class__.__name__, 128 | self._earliest, 129 | self._latest, 130 | ) 131 | 132 | @classmethod 133 | def get_containing_period(cls, *periods): 134 | """Given a bunch of TimePeriods, return a TimePeriod that most closely 135 | contains them.""" 136 | 137 | if any(not isinstance(period, TimePeriod) for period in periods): 138 | raise TypeError("periods must all be TimePeriods: {}".format(periods)) 139 | 140 | latest = datetime.datetime.min 141 | earliest = datetime.datetime.max 142 | 143 | for period in periods: 144 | # the best we can do to conain None is None! 145 | if period._latest is None: 146 | latest = None 147 | elif latest is not None and period._latest > latest: 148 | latest = period._latest 149 | 150 | if period._earliest is None: 151 | earliest = None 152 | elif earliest is not None and period._earliest < earliest: 153 | earliest = period._earliest 154 | 155 | return TimePeriod(earliest, latest) 156 | 157 | 158 | class DiscontinuousTimePeriod(object): 159 | """A bunch of TimePeriods""" 160 | 161 | def __init__(self, *periods): 162 | if any(not isinstance(period, TimePeriod) for period in periods): 163 | raise TypeError("periods must all be TimePeriods: {}".format(periods)) 164 | 165 | periods = set(periods) 166 | 167 | no_overlaps_periods = [] 168 | for period in periods: 169 | for other_period in periods: 170 | if id(other_period) == id(period): 171 | continue 172 | 173 | # periods that overlap should be combined 174 | if period.overlaps(other_period): 175 | period = TimePeriod.get_containing_period(period, other_period) 176 | 177 | no_overlaps_periods.append(period) 178 | 179 | no_equals_periods = [] 180 | reference = set(no_overlaps_periods) 181 | for period in no_overlaps_periods: 182 | # clean out duplicated periods 183 | if any(other_period == period and other_period is not period for other_period in reference): 184 | reference.remove(period) 185 | else: 186 | no_equals_periods.append(period) 187 | 188 | no_contains_periods = [] 189 | for period in no_equals_periods: 190 | # don't need to keep periods that are wholly contained 191 | skip = False 192 | for other_period in no_equals_periods: 193 | if id(other_period) == id(period): 194 | continue 195 | 196 | if period in other_period: 197 | skip = True 198 | 199 | if not skip: 200 | no_contains_periods.append(period) 201 | self._periods = no_contains_periods 202 | 203 | def __contains__(self, other): 204 | if isinstance(other, (datetime.date, TimePeriod)): 205 | for period in self._periods: 206 | if other in period: 207 | return True 208 | 209 | 210 | def days_ago(days, give_datetime=True): 211 | delta = datetime.timedelta(days=days) 212 | dt = datetime.datetime.now() - delta 213 | if give_datetime: 214 | return dt 215 | else: 216 | return dt.date() 217 | 218 | 219 | def days_ahead(days, give_datetime=True): 220 | delta = datetime.timedelta(days=days) 221 | dt = datetime.datetime.now() + delta 222 | if give_datetime: 223 | return dt 224 | else: 225 | 
return dt.date() 226 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/decision_tree/__pycache__/decision_tree_model.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/decision_tree/__pycache__/decision_tree_model.cpython-35.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/decision_tree/__pycache__/decision_tree_model.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/decision_tree/__pycache__/decision_tree_model.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/decision_tree/decision_tree_classifier_example.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | import numpy as np 3 | from sklearn import datasets 4 | import matplotlib.pyplot as plt 5 | import sys 6 | import os 7 | 8 | # Import helper functions 9 | from utils import train_test_split, standardize, accuracy_score 10 | from utils import mean_squared_error, calculate_variance, Plot 11 | from decision_tree.decision_tree_model import ClassificationTree 12 | 13 | def main(): 14 | 15 | print ("-- Classification Tree --") 16 | 17 | data = datasets.load_iris() 18 | X = data.data 19 | y = data.target 20 | 21 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4) 22 | 23 | clf = ClassificationTree() 24 | clf.fit(X_train, y_train) 25 | y_pred = clf.predict(X_test) 26 | 27 | accuracy = accuracy_score(y_test, y_pred) 28 | 29 | print ("Accuracy:", accuracy) 30 | 31 | Plot().plot_in_2d(X_test, y_pred, 32 | title="Decision Tree", 33 | accuracy=accuracy, 34 | legend_labels=data.target_names) 35 | 36 | 37 | if __name__ == "__main__": 38 | main() -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/decision_tree/decision_tree_model.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | import numpy as np 3 | 4 | from utils.data_manipulation import divide_on_feature, train_test_split, standardize 5 | from utils.data_operation import calculate_entropy, accuracy_score, calculate_variance, mean_squared_error 6 | 7 | 8 | class DecisionNode(): 9 | """Class that represents a decision node or leaf in the decision tree 10 | 11 | Parameters: 12 | ----------- 13 | feature_i: int 14 | Feature index which we want to use as the threshold measure. 15 | threshold: float 16 | The value that we will compare feature values at feature_i against to 17 | determine the prediction. 18 | value: float 19 | The class prediction if classification tree, or float value if regression tree. 20 | true_branch: DecisionNode 21 | Next decision node for samples where features value met the threshold. 22 | false_branch: DecisionNode 23 | Next decision node for samples where features value did not meet the threshold. 
24 | """ 25 | 26 | def __init__(self, feature_i=None, threshold=None, 27 | value=None, true_branch=None, false_branch=None): 28 | self.feature_i = feature_i # Index for the feature that is tested 29 | self.threshold = threshold # Threshold value for feature 30 | self.value = value # Value if the node is a leaf in the tree 31 | self.true_branch = true_branch # 'Left' subtree 32 | self.false_branch = false_branch # 'Right' subtree 33 | 34 | 35 | # Super class of RegressionTree and ClassificationTree 36 | class DecisionTree(object): 37 | """Super class of RegressionTree and ClassificationTree. 38 | 39 | Parameters: 40 | ----------- 41 | min_samples_split: int 42 | The minimum number of samples needed to make a split when building a tree. 43 | min_impurity: float 44 | The minimum impurity required to split the tree further. 45 | max_depth: int 46 | The maximum depth of a tree. 47 | loss: function 48 | Loss function that is used for Gradient Boosting models to calculate impurity. 49 | """ 50 | 51 | def __init__(self, min_samples_split=2, min_impurity=1e-7, 52 | max_depth=float("inf"), loss=None): 53 | self.root = None # Root node in dec. tree 54 | # Minimum n of samples to justify split 55 | self.min_samples_split = min_samples_split 56 | # The minimum impurity to justify split 57 | self.min_impurity = min_impurity 58 | # The maximum depth to grow the tree to 59 | self.max_depth = max_depth 60 | # Function to calculate impurity (classif.=>info gain, regr=>variance reduct.) 61 | # 切割树的方法,gini,方差等 62 | self._impurity_calculation = None 63 | # Function to determine prediction of y at leaf 64 | # 树节点取值的方法,分类树:选取出现最多次数的值,回归树:取所有值的平均值 65 | self._leaf_value_calculation = None 66 | # If y is one-hot encoded (multi-dim) or not (one-dim) 67 | self.one_dim = None 68 | # If Gradient Boost 69 | self.loss = loss 70 | 71 | def fit(self, X, y, loss=None): 72 | """ Build decision tree """ 73 | self.one_dim = len(np.shape(y)) == 1 74 | self.root = self._build_tree(X, y) 75 | self.loss = None 76 | 77 | def _build_tree(self, X, y, current_depth=0): 78 | """ Recursive method which builds out the decision tree and splits X and respective y 79 | on the feature of X which (based on impurity) best separates the data""" 80 | largest_impurity = 0 81 | best_criteria = None # Feature index and threshold 82 | best_sets = None # Subsets of the data 83 | 84 | # Check if expansion of y is needed 85 | if len(np.shape(y)) == 1: 86 | y = np.expand_dims(y, axis=1) 87 | 88 | # Add y as last column of X 89 | Xy = np.concatenate((X, y), axis=1) 90 | 91 | n_samples, n_features = np.shape(X) 92 | 93 | if n_samples >= self.min_samples_split and current_depth <= self.max_depth: 94 | # Calculate the impurity for each feature 95 | for feature_i in range(n_features): 96 | # All values of feature_i 97 | feature_values = np.expand_dims(X[:, feature_i], axis=1) 98 | unique_values = np.unique(feature_values) 99 | 100 | # Iterate through all unique values of feature column i and 101 | # calculate the impurity 102 | for threshold in unique_values: 103 | # Divide X and y depending on if the feature value of X at index feature_i 104 | # meets the threshold 105 | Xy1, Xy2 = divide_on_feature(Xy, feature_i, threshold) 106 | 107 | if len(Xy1) > 0 and len(Xy2) > 0: 108 | # Select the y-values of the two sets 109 | y1 = Xy1[:, n_features:] 110 | y2 = Xy2[:, n_features:] 111 | 112 | # Calculate impurity 113 | impurity = self._impurity_calculation(y, y1, y2) 114 | 115 | # If this threshold resulted in a higher information gain than previously 116 | # 
recorded save the threshold value and the feature 117 | # index 118 | if impurity > largest_impurity: 119 | largest_impurity = impurity 120 | best_criteria = {"feature_i": feature_i, "threshold": threshold} 121 | best_sets = { 122 | "leftX": Xy1[:, :n_features], # X of left subtree 123 | "lefty": Xy1[:, n_features:], # y of left subtree 124 | "rightX": Xy2[:, :n_features], # X of right subtree 125 | "righty": Xy2[:, n_features:] # y of right subtree 126 | } 127 | 128 | if largest_impurity > self.min_impurity: 129 | # Build subtrees for the right and left branches 130 | true_branch = self._build_tree(best_sets["leftX"], best_sets["lefty"], current_depth + 1) 131 | false_branch = self._build_tree(best_sets["rightX"], best_sets["righty"], current_depth + 1) 132 | return DecisionNode(feature_i=best_criteria["feature_i"], threshold=best_criteria[ 133 | "threshold"], true_branch=true_branch, false_branch=false_branch) 134 | 135 | # We're at leaf => determine value 136 | leaf_value = self._leaf_value_calculation(y) 137 | return DecisionNode(value=leaf_value) 138 | 139 | def predict_value(self, x, tree=None): 140 | """ Do a recursive search down the tree and make a prediction of the data sample by the 141 | value of the leaf that we end up at """ 142 | 143 | if tree is None: 144 | tree = self.root 145 | 146 | # If we have a value (i.e we're at a leaf) => return value as the prediction 147 | if tree.value is not None: 148 | return tree.value 149 | 150 | # Choose the feature that we will test 151 | feature_value = x[tree.feature_i] 152 | 153 | # Determine if we will follow left or right branch 154 | branch = tree.false_branch 155 | if isinstance(feature_value, int) or isinstance(feature_value, float): 156 | if feature_value >= tree.threshold: 157 | branch = tree.true_branch 158 | elif feature_value == tree.threshold: 159 | branch = tree.true_branch 160 | 161 | # Test subtree 162 | return self.predict_value(x, branch) 163 | 164 | def predict(self, X): 165 | """ Classify samples one by one and return the set of labels """ 166 | y_pred = [] 167 | for x in X: 168 | y_pred.append(self.predict_value(x)) 169 | return y_pred 170 | 171 | def print_tree(self, tree=None, indent=" "): 172 | """ Recursively print the decision tree """ 173 | if not tree: 174 | tree = self.root 175 | 176 | # If we're at leaf => print the label 177 | if tree.value is not None: 178 | print(tree.value) 179 | # Go deeper down the tree 180 | else: 181 | # Print test 182 | print("%s:%s? 
" % (tree.feature_i, tree.threshold)) 183 | # Print the true scenario 184 | print("%sT->" % (indent), end="") 185 | self.print_tree(tree.true_branch, indent + indent) 186 | # Print the false scenario 187 | print("%sF->" % (indent), end="") 188 | self.print_tree(tree.false_branch, indent + indent) 189 | 190 | 191 | class ClassificationTree(DecisionTree): 192 | def _calculate_information_gain(self, y, y1, y2): 193 | # Calculate information gain 194 | p = len(y1) / len(y) 195 | entropy = calculate_entropy(y) 196 | info_gain = entropy - p * \ 197 | calculate_entropy(y1) - (1 - p) * \ 198 | calculate_entropy(y2) 199 | # print("info_gain",info_gain) 200 | return info_gain 201 | 202 | def _majority_vote(self, y): 203 | most_common = None 204 | max_count = 0 205 | for label in np.unique(y): 206 | # Count number of occurences of samples with label 207 | count = len(y[y == label]) 208 | if count > max_count: 209 | most_common = label 210 | max_count = count 211 | # print("most_common :",most_common) 212 | return most_common 213 | 214 | def fit(self, X, y): 215 | self._impurity_calculation = self._calculate_information_gain 216 | self._leaf_value_calculation = self._majority_vote 217 | super(ClassificationTree, self).fit(X, y) 218 | 219 | 220 | class RegressionTree(DecisionTree): 221 | def _calculate_variance_reduction(self, y, y1, y2): 222 | var_tot = calculate_variance(y) 223 | var_1 = calculate_variance(y1) 224 | var_2 = calculate_variance(y2) 225 | frac_1 = len(y1) / len(y) 226 | frac_2 = len(y2) / len(y) 227 | 228 | # Calculate the variance reduction 229 | variance_reduction = var_tot - (frac_1 * var_1 + frac_2 * var_2) 230 | 231 | return sum(variance_reduction) 232 | 233 | def _mean_of_y(self, y): 234 | value = np.mean(y, axis=0) 235 | return value if len(value) > 1 else value[0] 236 | 237 | def fit(self, X, y): 238 | self._impurity_calculation = self._calculate_variance_reduction 239 | self._leaf_value_calculation = self._mean_of_y 240 | super(RegressionTree, self).fit(X, y) 241 | 242 | 243 | class XGBoostRegressionTree(DecisionTree): 244 | """ 245 | Regression tree for XGBoost 246 | - Reference - 247 | http://xgboost.readthedocs.io/en/latest/model.html 248 | """ 249 | 250 | def _split(self, y): 251 | """ y contains y_true in left half of the middle column and 252 | y_pred in the right half. 
Split and return the two matrices """ 253 | col = int(np.shape(y)[1] / 2) 254 | y, y_pred = y[:, :col], y[:, col:] 255 | return y, y_pred 256 | 257 | def _gain(self, y, y_pred): 258 | nominator = np.power((y * self.loss.gradient(y, y_pred)).sum(), 2) 259 | denominator = self.loss.hess(y, y_pred).sum() 260 | return 0.5 * (nominator / denominator) 261 | 262 | def _gain_by_taylor(self, y, y1, y2): 263 | # Split 264 | y, y_pred = self._split(y) 265 | y1, y1_pred = self._split(y1) 266 | y2, y2_pred = self._split(y2) 267 | 268 | true_gain = self._gain(y1, y1_pred) 269 | false_gain = self._gain(y2, y2_pred) 270 | gain = self._gain(y, y_pred) 271 | return true_gain + false_gain - gain 272 | 273 | def _approximate_update(self, y): 274 | # y split into y, y_pred 275 | y, y_pred = self._split(y) 276 | # Newton's Method 277 | gradient = np.sum(y * self.loss.gradient(y, y_pred), axis=0) 278 | hessian = np.sum(self.loss.hess(y, y_pred), axis=0) 279 | update_approximation = gradient / hessian 280 | 281 | return update_approximation 282 | 283 | def fit(self, X, y): 284 | self._impurity_calculation = self._gain_by_taylor 285 | self._leaf_value_calculation = self._approximate_update 286 | super(XGBoostRegressionTree, self).fit(X, y) -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/decision_tree/decision_tree_regressor_example.py: -------------------------------------------------------------------------------- 1 | from __future__ import division, print_function 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import pandas as pd 5 | 6 | from utils import train_test_split, standardize, accuracy_score 7 | from utils import mean_squared_error, calculate_variance, Plot 8 | from decision_tree.decision_tree_model import RegressionTree 9 | 10 | def main(): 11 | 12 | print ("-- Regression Tree --") 13 | 14 | # Load temperature data 15 | data = pd.read_csv('../TempLinkoping2016.txt', sep="\t") 16 | 17 | time = np.atleast_2d(data["time"].as_matrix()).T 18 | temp = np.atleast_2d(data["temp"].as_matrix()).T 19 | 20 | X = standardize(time) # Time. Fraction of the year [0, 1] 21 | y = temp[:, 0] # Temperature. Reduce to one-dim 22 | 23 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 24 | 25 | model = RegressionTree() 26 | model.fit(X_train, y_train) 27 | y_pred = model.predict(X_test) 28 | 29 | y_pred_line = model.predict(X) 30 | 31 | # Color map 32 | cmap = plt.get_cmap('viridis') 33 | 34 | mse = mean_squared_error(y_test, y_pred) 35 | 36 | print ("Mean Squared Error:", mse) 37 | 38 | # Plot the results 39 | # Plot the results 40 | m1 = plt.scatter(366 * X_train, y_train, color=cmap(0.9), s=10) 41 | m2 = plt.scatter(366 * X_test, y_test, color=cmap(0.5), s=10) 42 | m3 = plt.scatter(366 * X_test, y_pred, color='black', s=10) 43 | plt.suptitle("Regression Tree") 44 | plt.title("MSE: %.2f" % mse, fontsize=10) 45 | plt.xlabel('Day') 46 | plt.ylabel('Temperature in Celcius') 47 | plt.legend((m1, m2, m3), ("Training data", "Test data", "Prediction"), loc='lower right') 48 | plt.show() 49 | 50 | 51 | if __name__ == "__main__": 52 | main() -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dicts/__init__.py: -------------------------------------------------------------------------------- 1 | """Helper functinos for dealing with dicts. 2 | 3 | Things you always wished you could do more succinctly! 
4 | """ 5 | from .limited_dict import LimitedDict 6 | from .chained_dict import ChainedDict 7 | from .helpers import * 8 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dicts/__pycache__/__init__.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/dicts/__pycache__/__init__.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dicts/__pycache__/chained_dict.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/dicts/__pycache__/chained_dict.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dicts/__pycache__/helpers.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/dicts/__pycache__/helpers.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dicts/__pycache__/limited_dict.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/GBDT/GBDT_python3_code/utils/dicts/__pycache__/limited_dict.cpython-37.pyc -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dicts/chained_dict.py: -------------------------------------------------------------------------------- 1 | from collections import MutableMapping 2 | from itertools import chain 3 | 4 | 5 | class ChainedDict(MutableMapping): 6 | 7 | def __init__(self, parent=None, **kwargs): 8 | self.__parent = parent 9 | self.__deleted_keys = set() 10 | self.__data = kwargs 11 | 12 | def __contains__(self, key): 13 | if self.__parent is not None: 14 | return ( 15 | (key in self.__data or key in self.__parent) 16 | and key not in self.__deleted_keys 17 | ) 18 | return key in self.__data 19 | 20 | def __getitem__(self, key): 21 | try: 22 | return self.__data[key] 23 | except KeyError: 24 | if self.__parent is not None and key not in self.__deleted_keys: 25 | return self.__parent[key] 26 | else: 27 | raise 28 | 29 | def __setitem__(self, key, val): 30 | self.__data[key] = val 31 | self.__deleted_keys.discard(key) 32 | 33 | def __delitem__(self, key): 34 | if key in self: 35 | self.__deleted_keys.add(key) 36 | try: 37 | del self.__data[key] 38 | except KeyError: 39 | pass 40 | else: 41 | raise KeyError(key) 42 | 43 | def __repr__(self): 44 | return "{}({})".format(self.__class__.__name__, dict(self.items())) 45 | 46 | def __iter__(self): 47 | return self.keys() 48 | 49 | def __len__(self): 50 | return len(list(self.keys())) 51 | 52 | def iterkeys(self): 53 | yielded = set(self.__deleted_keys) 54 | if self.__parent is None: 55 | iterable = self.__data.keys() 56 | else: 57 | iterable = chain(self.__parent.keys(), self.__data.keys()) 58 | 59 | for key in 
iterable: 60 | if key in yielded: 61 | continue 62 | yield key 63 | yielded.add(key) 64 | 65 | keys = iterkeys 66 | 67 | def iteritems(self): 68 | for key in self.iterkeys(): 69 | yield key, self[key] 70 | 71 | items = iteritems 72 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dicts/helpers.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | 3 | 4 | def from_keyed_iterable(iterable, key, filter_func=None): 5 | """Construct a dictionary out of an iterable, using an attribute name as 6 | the key. Optionally provide a filter function, to determine what should be 7 | kept in the dictionary.""" 8 | 9 | generated = {} 10 | 11 | for element in iterable: 12 | try: 13 | k = getattr(element, key) 14 | except AttributeError: 15 | raise RuntimeError("{} does not have the keyed attribute: {}".format( 16 | element, key 17 | )) 18 | 19 | if filter_func is None or filter_func(element): 20 | if k in generated: 21 | generated[k] += [element] 22 | else: 23 | generated[k] = [element] 24 | 25 | return generated 26 | 27 | 28 | def subtract_by_key(dict_a, dict_b): 29 | """given two dicts, a and b, this function returns c = a - b, where 30 | a - b is defined as the key difference between a and b. 31 | 32 | e.g., 33 | {1:None, 2:3, 3:"yellow", 4:True} - {2:4, 1:"green"} = 34 | {3:"yellow", 4:True} 35 | 36 | """ 37 | difference_dict = {} 38 | for key in dict_a: 39 | if key not in dict_b: 40 | difference_dict[key] = dict_a[key] 41 | 42 | return difference_dict 43 | 44 | 45 | def subtract(dict_a, dict_b, strict=False): 46 | """a stricter form of subtract_by_key(), this version will only remove an 47 | entry from dict_a if the key is in dict_b *and* the value at that key 48 | matches""" 49 | if not strict: 50 | return subtract_by_key(dict_a, dict_b) 51 | 52 | difference_dict = {} 53 | for key in dict_a: 54 | if key not in dict_b or dict_b[key] != dict_a[key]: 55 | difference_dict[key] = dict_a[key] 56 | 57 | return difference_dict 58 | 59 | 60 | WinnowedResult = namedtuple("WinnowedResult", ['has', 'has_not']) 61 | def winnow_by_keys(dct, keys=None, filter_func=None): 62 | """separates a dict into has-keys and not-has-keys pairs, using either 63 | a list of keys or a filtering function.""" 64 | has = {} 65 | has_not = {} 66 | 67 | for key in dct: 68 | key_passes_check = False 69 | if keys is not None: 70 | key_passes_check = key in keys 71 | elif filter_func is not None: 72 | key_passes_check = filter_func(key) 73 | 74 | if key_passes_check: 75 | has[key] = dct[key] 76 | else: 77 | has_not[key] = dct[key] 78 | 79 | return WinnowedResult(has, has_not) 80 | 81 | 82 | def intersection(dict_a, dict_b, strict=True): 83 | intersection_dict = {} 84 | 85 | for key in dict_a: 86 | if key in dict_b: 87 | if not strict or dict_a[key] == dict_b[key]: 88 | intersection_dict[key] = dict_a[key] 89 | 90 | return intersection_dict 91 | 92 | 93 | def setdefaults(dct, defaults): 94 | """Given a target dct and a dict of {key:default value} pairs, 95 | calls setdefault for all of those pairs.""" 96 | for key in defaults: 97 | dct.setdefault(key, defaults[key]) 98 | 99 | return dct 100 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/dicts/limited_dict.py: -------------------------------------------------------------------------------- 1 | from collections import MutableMapping 2 | 3 | 4 | class LimitedDict(MutableMapping): 5 | def 
__init__(self, args=None, **kwargs): 6 | keys = kwargs.pop('keys', []) 7 | self.__keys = keys 8 | 9 | self.__data = {} 10 | 11 | if args: 12 | kwargs.update((key, val) for key, val in args) 13 | 14 | for key, val in kwargs.items(): 15 | self[key] = val 16 | 17 | def __setitem__(self, key, val): 18 | if key not in self.__keys: 19 | raise KeyError("Illegal key: {}".format(key)) 20 | 21 | self.__data[key] = val 22 | 23 | def __getitem__(self, key): 24 | return self.__data[key] 25 | 26 | def __iter__(self): 27 | return self.__data.__iter__() 28 | 29 | def __delitem__(self, key): 30 | del self.__data[key] 31 | 32 | def __len__(self): 33 | return len(self.__data) 34 | 35 | def __repr__(self): 36 | return "{}({}, {})".format(self.__class__.__name__, self.defined_keys, self.__data) 37 | 38 | @property 39 | def defined_keys(self): 40 | return self.__keys 41 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/enum.py: -------------------------------------------------------------------------------- 1 | """Who hasn't needed a good, old-fashioned enum now and then?""" 2 | 3 | 4 | class _enum(object): 5 | 6 | def __call__(self, enum_name, *args, **kwargs): 7 | if args and kwargs: 8 | raise TypeError("enums can only be made from args XOR kwargs") 9 | 10 | enum_items = {} 11 | 12 | counter = 0 13 | for name, val in kwargs.items(): 14 | if val is None: 15 | val = counter 16 | counter += 1 17 | elif isinstance(val, int): 18 | counter = val + 1 19 | 20 | enum_items[name] = val 21 | 22 | for val, name in enumerate(args, start=counter): 23 | enum_items[name] = val 24 | 25 | return type(enum_name, (Enum,), enum_items) 26 | 27 | def from_iterable(self, iterable): 28 | return self(*iterable) 29 | 30 | def from_dict(self, dct): 31 | return self(**dct) 32 | 33 | def __iter__(self): 34 | for k, v in self.__enum_items.items(): 35 | yield k, v 36 | 37 | def __repr__(self): 38 | return "<{}: {}>".format(self.__class__.__name__, self.__enum_items.values()) 39 | enum = _enum() 40 | 41 | 42 | class EnumItem(object): 43 | 44 | def __init__(self, parent, name, value): 45 | self.__parent = parent 46 | self.__name = name 47 | self.__value = value 48 | 49 | def __repr__(self): 50 | return "<{}: {} [{}]>".format(self.__class__.__name__, self.name, self.value) 51 | 52 | def __eq__(self, other): 53 | if isinstance(other, self.__class__): 54 | if self.parent.is_strict and self.parent != other.parent: 55 | raise ValueError("can't compare EnumItems from different enums") 56 | return self.value == other.value 57 | 58 | return self.value == other 59 | 60 | @property 61 | def value(self): 62 | return self.__value 63 | 64 | @property 65 | def name(self): 66 | return self.__name 67 | 68 | @property 69 | def parent(self): 70 | return self.__parent 71 | 72 | 73 | class _EnumMeta(type): 74 | def __new__(cls, name, bases, attr_dict): 75 | 76 | options = attr_dict.pop('Options', object) 77 | 78 | attr_dict['__strict__'] = getattr(options, "strict_compare", True) 79 | 80 | new_enum = super(_EnumMeta, cls).__new__(cls, name, bases, {}) 81 | 82 | enum_items = {} 83 | 84 | for attr_name, attr_value in attr_dict.items(): 85 | if attr_name.startswith('__'): 86 | super(_EnumMeta, cls).__setattr__(new_enum, attr_name, attr_value) 87 | continue 88 | 89 | if getattr(options, 'force_uppercase', False): 90 | attr_dict.pop(attr_name) 91 | attr_name = attr_name.upper() 92 | 93 | enum_item = EnumItem(new_enum, attr_name, attr_value) 94 | 95 | enum_items[attr_name] = enum_item 96 | super(_EnumMeta, 
cls).__setattr__(new_enum, attr_name, enum_item) 97 | 98 | if getattr(options, "frozen", True): 99 | super(_EnumMeta, cls).__setattr__(new_enum, '__frozen__', True) 100 | else: 101 | super(_EnumMeta, cls).__setattr__(new_enum, '__frozen__', False) 102 | 103 | if getattr(options, "strict", False): 104 | super(_EnumMeta, cls).__setattr__(new_enum, '__strict__', True) 105 | else: 106 | super(_EnumMeta, cls).__setattr__(new_enum, '__strict__', False) 107 | 108 | super(_EnumMeta, cls).__setattr__(new_enum, '__enum_item_map__', enum_items) 109 | 110 | return new_enum 111 | 112 | def __setattr__(cls, name, val): 113 | if getattr(cls, "__frozen__", False): 114 | raise TypeError("can't set attributes on a frozen enum") 115 | 116 | if name in cls.__enum_item_map__: 117 | val = EnumItem(cls, name, val) 118 | cls.__enum_item_map__[name] = val 119 | 120 | super(_EnumMeta, cls).__setattr__(name, val) 121 | 122 | @property 123 | def is_strict(cls): 124 | return getattr(cls, "__strict__", True) 125 | 126 | def get_name_value_map(cls): 127 | e = cls.__enum_item_map__ 128 | return dict((e[i].name, e[i].value) for i in e) 129 | 130 | 131 | class Enum(_EnumMeta("EnumBase", (object, ), {})): 132 | pass 133 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/lists.py: -------------------------------------------------------------------------------- 1 | """List-related functions""" 2 | 3 | 4 | def unlist(list_thing, complain=True): 5 | """transforms [Something] -> Something. By default, raises a ValueError for 6 | any other list values.""" 7 | if complain and len(list_thing) > 1: 8 | raise ValueError("More than one element in {}".format(list_thing)) 9 | elif len(list_thing) == 1: 10 | return list_thing[0] 11 | 12 | if complain: 13 | raise ValueError("Nothing in {}".format(list_thing)) 14 | return None 15 | 16 | 17 | def flatten(iterable): 18 | """Fully flattens an iterable: 19 | In: flatten([1,2,3,4,[5,6,[7,8]]]) 20 | Out: [1,2,3,4,5,6,7,8] 21 | """ 22 | container = iterable.__class__ 23 | 24 | placeholder = [] 25 | for item in iterable: 26 | try: 27 | placeholder.extend(flatten(item)) 28 | except TypeError: 29 | placeholder.append(item) 30 | 31 | return container(placeholder) 32 | 33 | 34 | def flat_map(iterable, func): 35 | """func must take an item and return an interable that contains that 36 | item. 
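e.g. flat_map([1, 2], lambda x: [x, x * 10]) -> [1, 10, 2, 20];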
this is flatmap in the classic mode""" 37 | results = [] 38 | for element in iterable: 39 | result = func(element) 40 | if len(result) > 0: 41 | results.extend(result) 42 | return results 43 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/loss_functions.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | from utils.data_operation import accuracy_score 4 | 5 | class Loss(object): 6 | def loss(self, y_true, y_pred): 7 | return NotImplementedError() 8 | 9 | def gradient(self, y, y_pred): 10 | raise NotImplementedError() 11 | 12 | def acc(self, y, y_pred): 13 | return 0 14 | 15 | class SquareLoss(Loss): 16 | def __init__(self): pass 17 | 18 | def loss(self, y, y_pred): 19 | return 0.5 * np.power((y - y_pred), 2) 20 | 21 | def gradient(self, y, y_pred): 22 | return -(y - y_pred) 23 | 24 | class CrossEntropy(Loss): 25 | def __init__(self): pass 26 | 27 | def loss(self, y, p): 28 | # Avoid division by zero 29 | p = np.clip(p, 1e-15, 1 - 1e-15) 30 | return - y * np.log(p) - (1 - y) * np.log(1 - p) 31 | 32 | def acc(self, y, p): 33 | return accuracy_score(np.argmax(y, axis=1), np.argmax(p, axis=1)) 34 | 35 | def gradient(self, y, p): 36 | # Avoid division by zero 37 | p = np.clip(p, 1e-15, 1 - 1e-15) 38 | return - (y / p) + (1 - y) / (1 - p) 39 | 40 | 41 | class SoftMaxLoss(Loss): 42 | def gradient(self, y, p): 43 | return y - p -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/math.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import operator 3 | 4 | # py3 doesn't include reduce as a builtin 5 | try: 6 | reduce 7 | except NameError: 8 | from functools import reduce 9 | 10 | 11 | def product(sequence, initial=1): 12 | """like the built-in sum, but for multiplication.""" 13 | if not isinstance(sequence, collections.Iterable): 14 | raise TypeError("'{}' object is not iterable".format(type(sequence).__name__)) 15 | 16 | return reduce(operator.mul, sequence, initial) 17 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/misc.py: -------------------------------------------------------------------------------- 1 | import progressbar 2 | from mpl_toolkits.mplot3d import Axes3D 3 | import matplotlib.pyplot as plt 4 | import matplotlib.cm as cmx 5 | import matplotlib.colors as colors 6 | import numpy as np 7 | 8 | from utils.data_operation import calculate_covariance_matrix 9 | from utils.data_operation import calculate_correlation_matrix 10 | from utils.data_manipulation import standardize 11 | 12 | bar_widgets = [ 13 | 'Training: ', progressbar.Percentage(), ' ', progressbar.Bar(marker="-", left="[", right="]"), 14 | ' ', progressbar.ETA() 15 | ] 16 | 17 | class Plot(): 18 | def __init__(self): 19 | self.cmap = plt.get_cmap('viridis') 20 | 21 | def _transform(self, X, dim): 22 | covariance = calculate_covariance_matrix(X) 23 | eigenvalues, eigenvectors = np.linalg.eig(covariance) 24 | # Sort eigenvalues and eigenvector by largest eigenvalues 25 | idx = eigenvalues.argsort()[::-1] 26 | eigenvalues = eigenvalues[idx][:dim] 27 | eigenvectors = np.atleast_1d(eigenvectors[:, idx])[:, :dim] 28 | # Project the data onto principal components 29 | X_transformed = X.dot(eigenvectors) 30 | 31 | return X_transformed 32 | 33 | 34 | def plot_regression(self, lines, title, 
axis_labels=None, mse=None, scatter=None, legend={"type": "lines", "loc": "lower right"}): 35 | 36 | if scatter: 37 | scatter_plots = scatter_labels = [] 38 | for s in scatter: 39 | scatter_plots += [plt.scatter(s["x"], s["y"], color=s["color"], s=s["size"])] 40 | scatter_labels += [s["label"]] 41 | scatter_plots = tuple(scatter_plots) 42 | scatter_labels = tuple(scatter_labels) 43 | 44 | for l in lines: 45 | li = plt.plot(l["x"], l["y"], color=s["color"], linewidth=l["width"], label=l["label"]) 46 | 47 | if mse: 48 | plt.suptitle(title) 49 | plt.title("MSE: %.2f" % mse, fontsize=10) 50 | else: 51 | plt.title(title) 52 | 53 | if axis_labels: 54 | plt.xlabel(axis_labels["x"]) 55 | plt.ylabel(axis_labels["y"]) 56 | 57 | if legend["type"] == "lines": 58 | plt.legend(loc="lower_left") 59 | elif legend["type"] == "scatter" and scatter: 60 | plt.legend(scatter_plots, scatter_labels, loc=legend["loc"]) 61 | 62 | plt.show() 63 | 64 | 65 | 66 | # Plot the dataset X and the corresponding labels y in 2D using PCA. 67 | def plot_in_2d(self, X, y=None, title=None, accuracy=None, legend_labels=None): 68 | X_transformed = self._transform(X, dim=2) 69 | x1 = X_transformed[:, 0] 70 | x2 = X_transformed[:, 1] 71 | class_distr = [] 72 | 73 | y = np.array(y).astype(int) 74 | 75 | colors = [self.cmap(i) for i in np.linspace(0, 1, len(np.unique(y)))] 76 | 77 | # Plot the different class distributions 78 | for i, l in enumerate(np.unique(y)): 79 | _x1 = x1[y == l] 80 | _x2 = x2[y == l] 81 | _y = y[y == l] 82 | class_distr.append(plt.scatter(_x1, _x2, color=colors[i])) 83 | 84 | # Plot legend 85 | if not legend_labels is None: 86 | plt.legend(class_distr, legend_labels, loc=1) 87 | 88 | # Plot title 89 | if title: 90 | if accuracy: 91 | perc = 100 * accuracy 92 | plt.suptitle(title) 93 | plt.title("Accuracy: %.1f%%" % perc, fontsize=10) 94 | else: 95 | plt.title(title) 96 | 97 | # Axis labels 98 | plt.xlabel('Principal Component 1') 99 | plt.ylabel('Principal Component 2') 100 | 101 | plt.show() 102 | 103 | # Plot the dataset X and the corresponding labels y in 3D using PCA. 104 | def plot_in_3d(self, X, y=None): 105 | X_transformed = self._transform(X, dim=3) 106 | x1 = X_transformed[:, 0] 107 | x2 = X_transformed[:, 1] 108 | x3 = X_transformed[:, 2] 109 | fig = plt.figure() 110 | ax = fig.add_subplot(111, projection='3d') 111 | ax.scatter(x1, x2, x3, c=y) 112 | plt.show() 113 | 114 | 115 | -------------------------------------------------------------------------------- /GBDT/GBDT_python3_code/utils/objects.py: -------------------------------------------------------------------------------- 1 | _get_attr_raise_on_attribute_error = "RAISE ON EXCEPTION" 2 | 3 | def get_attr(obj, string_rep, default=_get_attr_raise_on_attribute_error, separator="."): 4 | """ getattr via a chain of attributes like so: 5 | >>> import datetime 6 | >>> some_date = datetime.date.today() 7 | >>> get_attr(some_date, "month.numerator.__doc__") 8 | 'int(x[, base]) -> integer\n\nConvert a string or number to an integer, ... 
9 | """ 10 | attribute_chain = string_rep.split(separator) 11 | 12 | current_obj = obj 13 | 14 | for attr in attribute_chain: 15 | try: 16 | current_obj = getattr(current_obj, attr) 17 | except AttributeError: 18 | if default is _get_attr_raise_on_attribute_error: 19 | raise AttributeError( 20 | "Bad attribute \"{}\" in chain: \"{}\"".format(attr, string_rep) 21 | ) 22 | return default 23 | 24 | return current_obj 25 | 26 | 27 | class ImmutableWrapper(object): 28 | _obj = None 29 | _recursive = None 30 | 31 | def __init__(self, obj, recursive): 32 | self._obj = obj 33 | self._recursive = recursive 34 | 35 | def __setattr__(self, name, val): 36 | if name == "_obj" and self._obj is None: 37 | object.__setattr__(self, name, val) 38 | return 39 | elif name == "_recursive" and self._recursive is None: 40 | object.__setattr__(self, name, val) 41 | return 42 | 43 | raise AttributeError("This object has been marked as immutable; you cannot set its attributes.") 44 | 45 | def __getattr__(self, name): 46 | if self._recursive: 47 | return immutable(getattr(self._obj, name), recursive=self._recursive) 48 | 49 | return getattr(self._obj, name) 50 | 51 | def __repr__(self): 52 | return "".format(self._obj.__class__.__name__, self._obj.__repr__()) 53 | 54 | 55 | def immutable(obj, recursive=True): 56 | """wraps the argument in a pass-through class that disallows all attribute 57 | setting. If the `recursive` flag is true, all attribute accesses will 58 | return an immutable-wrapped version of the "real" attribute.""" 59 | return ImmutableWrapper(obj, recursive) 60 | -------------------------------------------------------------------------------- /GBDT/readme.md: -------------------------------------------------------------------------------- 1 | #### 文档引用说明 2 | 3 | - 本目录下的GBDT文档分享来源于阿里星wepon大神 4 | - 其github地址为https://github.com/wepe 5 | - 知乎也可以搜索到大佬ID进行相关学习 6 | - 该文档结合代码进行理解,将有助于明白GBDT在具体的分类和回归任务中如何将基函数回归树做相关的转换应用 7 | - 该文档后半部分还包含XGBoost算法详述、LightGBM算法简述,可用于进阶学习 8 | - 如对简易代码实现的面向对象中的super函数使用方法理解困难的,建议学习该方法的使用后再理解。建议地址https://blog.csdn.net/qq_26442553/article/details/81775449 9 | -------------------------------------------------------------------------------- /GBDT/【HP20190706】《统计学习方法》第一版例题8.2代码实现.md: -------------------------------------------------------------------------------- 1 | ## 李航统计学习方法8.2例题代码实现 2 | 3 | 写代码一开始没有头绪,但是一般解决问题的办法有三个: 4 | 5 | - **思考代码实现顺序步骤,先写出伪代码** 6 | - 如果想不出实现,不知道代码的使用方法,比如本案例的数组的调用方法,如何查找和使用索引等,那么**就可以尽量的去参考别人的写法,学习和借鉴**,学习借鉴得越多,掌握的代码的具体细节方法就越多,在伪代码拆分一个具体的项目的时候,可用的每一步方案就越多 7 | - **多写,多写,多写**!!!编程没有捷径,多写,实现不了的时候多思考,然后多借鉴,遭遇到具体问题多请教大神 8 | 9 | 10 | 伪代码如下: 11 | 12 | 输入变量: 13 | 14 | 1.x数组,相当于特征 15 | 16 | 2.y数组,相当于目标变量 17 | 18 | 3.分界点,这里的分界点是需要for循环来处理的 19 | 20 | 4.返回的结果,输出各个分界点的最小误差损失,以及对应的所有分界点的最小误差分界点 21 | 22 | 23 | 24 | ```python 25 | import pandas as pd 26 | import numpy as np 27 | import math 28 | 29 | 30 | # 生成数组列表 31 | x = list(range(1,11,1)) 32 | # 打印目标变量 33 | y = [5.56,5.70,5.91,6.40,6.80,7.05,8.9,8.7,9.00,9.05] 34 | 35 | # 定义数据的切分点数组 36 | spliting_points=list(range(1,10,1)) 37 | spliting_points = [i + 0.5 for i in spliting_points] 38 | spliting_points 39 | 40 | # 封装一个函数来实现P149页的内容 41 | # 这里的思路是一定要梳理出来哪些是需要循环调用的变量 42 | # 另外这里如何去通过一列数组的索引去获取另外一列数组的值列表,值得学习 43 | # 最后需要学习的是如何通过for循环开实现∑求和 44 | 45 | def Spliting_list(x_array,y_array,spliting_array): 46 | for s in spliting_array: 47 | # 通过x的索引位置来获取y的列表分组 48 | # math.floor()向下取整 49 | R1 =y_array[:x_array.index(math.floor(s))+1] 50 | R2 =y_array[x_array.index(math.floor(s))+1:] 51 | c1 = round(np.mean(R1),2) 52 | c2 = 
round(np.mean(R2),2) 53 | ms_1 = 0 54 | ms_2 = 0 55 | # 跳出for循环开始计算最小误差 56 | for i in R1: 57 | ms_1 += (i-c1)**2 58 | for j in R2: 59 | ms_2 += (j-c2)**2 60 | ms = round((ms_1 + ms_2),2) 61 | K = print([s,c1,c2,ms]) 62 | return K 63 | 64 | # 初步运行结果,已经实现了P149的全部内容,下一个问题是如何根据所求内容求误差的最小值 65 | 66 | Spliting_list(x,y,spliting_points) 67 | 68 | [1.5, 5.56, 7.5, 15.72] 69 | [2.5, 5.63, 7.73, 12.08] 70 | [3.5, 5.72, 7.99, 8.37] 71 | [4.5, 5.89, 8.25, 5.78] 72 | [5.5, 6.07, 8.54, 3.91] 73 | [6.5, 6.24, 8.91, 1.93] 74 | [7.5, 6.62, 8.92, 8.01] 75 | [8.5, 6.88, 9.02, 11.74] 76 | [9.5, 7.11, 9.05, 15.74] 77 | 78 | # 函数封装的迭代,主要思考如何将该数据中的最小误差所对应的c1\c2\R1\R2存储下来 79 | 80 | def Spliting_list(x_array,y_array,spliting_array): 81 | ms_list = [] 82 | min_lose= np.inf 83 | for s in spliting_array: 84 | # 通过x的索引位置来获取y的列表分组 85 | # math.floor()向下取整 86 | R1 =y_array[:x_array.index(math.floor(s))+1] 87 | R2 =y_array[x_array.index(math.floor(s))+1:] 88 | c1 = round(np.mean(R1),2) 89 | c2 = round(np.mean(R2),2) 90 | ms_1 = 0 91 | ms_2 = 0 92 | # for循环开始计算最小误差 93 | for i in R1: 94 | ms_1 += (i-c1)**2 95 | for j in R2: 96 | ms_2 += (j-c2)**2 97 | ms = round((ms_1 + ms_2),2) 98 | # 如何来存储最佳的C1和C2呢 99 | if ms < min_lose: 100 | # 更新最小误差,这一步特别重要 101 | min_lose = ms 102 | best_c1 = c1 103 | best_c2 = c2 104 | best_R1 = R1 105 | best_R2 = R2 106 | # 对所有所求的ms添加一个列表进行存储 107 | ms_list.append(ms) 108 | # 对所有ms求得最小误差 109 | K = min(ms_list) 110 | return K,best_c1,best_c2,best_R1,best_R2 111 | 112 | # 函数的封装迭代二,思考如何实现R1根据给到的分组求出最小的残差,用于下一阶段的数据拟合 113 | 114 | 115 | def Spliting_list(x_array,y_array,spliting_array): 116 | ms_list = [] 117 | min_lose= np.inf 118 | for s in spliting_array: 119 | # 通过x的索引位置来获取y的列表分组 120 | # math.floor()向下取整 121 | R1 =y_array[:x_array.index(math.floor(s))+1] 122 | R2 =y_array[x_array.index(math.floor(s))+1:] 123 | c1 = round(np.mean(R1),2) 124 | c2 = round(np.mean(R2),2) 125 | ms_1 = 0 126 | ms_2 = 0 127 | # for循环开始计算最小误差 128 | for i in R1: 129 | ms_1 += (i-c1)**2 130 | for j in R2: 131 | ms_2 += (j-c2)**2 132 | ms = round((ms_1 + ms_2),2) 133 | # 如何来存储最佳的C1和C2呢 134 | if ms < min_lose: 135 | # 更新最小误差,这一步特别重要 136 | min_lose = ms 137 | best_c1 = c1 138 | best_c2 = c2 139 | best_R1 = R1 140 | best_R2 = R2 141 | # 计算残差 142 | R1_loss = [m - c1 for m in best_R1] 143 | R2_loss = [n - c2 for n in best_R2] 144 | R_loss = np.hstack((R1_loss,R2_loss)) 145 | # 对所有所求的ms添加一个列表进行存储 146 | ms_list.append(ms) 147 | # 对所有ms求得最小误差 148 | K = min(ms_list) 149 | return K,best_c1,best_c2,R_loss 150 | 151 | TD = Spliting_list(x,y,spliting_points) 152 | 153 | (1.93, 154 | 6.24, 155 | 8.91, 156 | array([-0.68, -0.54, -0.33, 0.16, 0.56, 0.81, -0.01, -0.21, 0.09, 157 | 0.14])) 158 | 159 | 160 | 161 | # 平方误差损失SSE 162 | while SSE > 0.18: 163 | SSE = np.inf 164 | loss_min = Spliting_list(x,y,spliting_points)[0] 165 | R_loss = Spliting_list(x,y,spliting_points)[3] 166 | if loss_min < SSE: 167 | SSE = loss_min 168 | y = R_loss 169 | print(SSE) 170 | 171 | 1.93 172 | 0.79 173 | 0.47 174 | 0.3 175 | 0.23 176 | 0.17 177 | ``` -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | ### 面向机器学习初学者的最全注释版本的机器学习实战的代码 3 | 4 | #### 代码说明 5 | 6 | - 本代码会对每一行机器学习的某个算法的代码进行注释,确保刚接触算法代码的初学者也能够看懂程序到底在运行什么; 7 | 8 | - 本代码基于python3版本,因此和所涉书籍《机器学习实战》的python2代码在细节上存在差异。 9 | #### v5.0 20190722 添加XGBoost算法简易代码实现 10 | 11 | #### v4.0 20190710,添加GBDT算法代码实现,添加Stacking的PPT讲解,便于理解Stacking模型融合的主要流程,并通过实践Kaggle关于泰坦尼克号数据集的模型融合代码,以工程形式展示Stacking的整个过程 
12 | 13 | #### v3.0 20190617 添加XGBoost的泰坦尼克号数据集调包实践,后期将更新XGBoost的理解内容 14 | 15 | #### v2.0 20190529 添加CART回归算法的核心代码 16 | 17 | #### v1.0 20190529 添加AdaBoost算法核心代码 18 | -------------------------------------------------------------------------------- /Stacking/Stacking_learn_beta.md: -------------------------------------------------------------------------------- 1 | 2 | ## kaggle泰坦尼克号机器学习stacking模型融合 3 | 4 | 5 | ```python 6 | import numpy as np 7 | import pandas as pd 8 | import re 9 | import sklearn 10 | import os 11 | # 显示当前路径 12 | os.getcwd() 13 | ``` 14 | 15 | 16 | 17 | 18 | 'D:\\jupyter_notebook' 19 | 20 | 21 | 22 | 23 | ```python 24 | # 导入数据 25 | train_ = pd.read_csv('D:/jupyter_notebook/titanic/train.csv') 26 | test_ = pd.read_csv('D:/jupyter_notebook/titanic/test.csv') 27 | ``` 28 | 29 | 30 | ```python 31 | # 为方便进行数据处理,将训练集和测试集合并进行数据处理 32 | train_['number'] = 1 33 | test_['number'] = 0 34 | datamart = pd.concat([train_, test_], axis=0, join='outer') 35 | ``` 36 | 37 | C:\Users\IBM\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version 38 | of pandas will change to not sort by default. 39 | 40 | To accept the future behavior, pass 'sort=False'. 41 | 42 | To retain the current behavior and silence the warning, pass 'sort=True'. 43 | 44 | after removing the cwd from sys.path. 45 | 46 | 47 | ### 1.根据原始特征进行特征处理,训练集和测试集合并处理 48 | 49 | 50 | ```python 51 | #根据原始特征的观察构建新特征 52 | # 计算名字的长度 53 | datamart['Name_length'] = datamart['Name'].apply(len) 54 | # 将旅客是否住在头等舱二值化 55 | datamart['Has_Cabin'] = datamart["Cabin"].apply(lambda x: 0 if type(x) == float else 1) 56 | # 构建新特征家庭总人数 57 | datamart['FamilySize'] = datamart['SibSp'] + datamart['Parch'] + 1 58 | # 构建新特征是否独居 59 | datamart['IsAlone'] = 0 60 | datamart.loc[datamart['FamilySize'] == 1, 'IsAlone'] = 1 61 | # 查看乘客登船口岸存在缺失值 62 | datamart['Embarked'].isnull().value_counts() 63 | # 对乘客登船口岸进行固定值填充缺失值 64 | datamart['Embarked'] = datamart['Embarked'].fillna('S') 65 | # 对票价进行中位数填充缺失值 66 | datamart['Fare'] = datamart['Fare'].fillna(datamart['Fare'].median()) 67 | # 生成绝对票价分区,qcut是根据分区分位定义,将每一个值划为到具体的分区区间中去,此处定义为四分位值 68 | datamart['CategoricalFare'] = pd.qcut(datamart['Fare'], 4) 69 | # 生成新变量年龄平均值、年龄标准差 70 | age_avg = datamart['Age'].mean() 71 | age_std = datamart['Age'].std() 72 | # 计算年龄是否有缺失值并统计 73 | age_null_count = datamart['Age'].isnull().sum() 74 | # np.random.randint()产生离散均匀分布的整数,size是产生的元素数量,前面分别为最小值和最大值区间 75 | age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count) 76 | # 对年龄用生成的一些新数值进行填充 77 | datamart['Age'][np.isnan(datamart['Age'])] = age_null_random_list 78 | # 转换变量类型为数值类型,便于后期计算 79 | datamart['Age'] = datamart['Age'].astype(int) 80 | # 对年龄生成新的分箱变量中来代替,即将年龄绝对值转换为离散类别 81 | datamart['CategoricalAge'] = pd.cut(datamart['Age'], 5) 82 | 83 | # 定义正则表达式函数导出旅客的Title 84 | def get_title(name): 85 | # re.search()方法扫描整个字符串,并返回第一个成功的匹配。如果匹配失败,则返回None 86 | title_search = re.search('([A-Za-z]+)\.',name) 87 | if title_search: 88 | return title_search.group(1) 89 | return '' 90 | 91 | # 取出姓名中尊称部分 92 | datamart['Title'] = datamart['Name'].apply(get_title) 93 | 94 | # 对姓名的称呼部分做统一 95 | datamart['Title'] = datamart['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major' 96 | , 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare') 97 | datamart['Title'] = datamart['Title'].replace('Mlle', 'Miss') 98 | datamart['Title'] = datamart['Title'].replace('Ms', 'Miss') 99 | datamart['Title'] = datamart['Title'].replace('Mme', 'Mrs') 100 | 
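# The replace() calls above collapse every rare honorific into one of five
# groups (Mr / Miss / Mrs / Master / Rare); datamart['Title'].value_counts()
# is a quick way to confirm no stray title slipped through.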
101 | # 对性别从离散型替换为数值型 102 | datamart['Sex'] = datamart['Sex'].map( {'female': 0, 'male': 1} ).astype(int) 103 | 104 | # 对姓名的称呼部分做数值型变换 105 | title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5} 106 | # 先定义一个字典,然后通过map函数传入字典进行替换 107 | datamart['Title'] = datamart['Title'].map(title_mapping) 108 | # 最后对缺失值替换为0 109 | datamart['Title'] = datamart['Title'].fillna(0) 110 | 111 | # 替换登船口岸 112 | datamart['Embarked'] = datamart['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int) 113 | 114 | # 替换票价的四分位数,该步骤应该有更好的办法做数据处理 115 | # loc函数取出列中某类元素的数据集 116 | datamart.loc[ datamart['Fare'] <= 7.91, 'Fare'] = 0 117 | datamart.loc[(datamart['Fare'] > 7.91) & (datamart['Fare'] <= 14.454), 'Fare'] = 1 118 | datamart.loc[(datamart['Fare'] > 14.454) & (datamart['Fare'] <= 31), 'Fare'] = 2 119 | datamart.loc[ datamart['Fare'] > 31, 'Fare'] = 3 120 | datamart['Fare'] = datamart['Fare'].astype(int) 121 | 122 | # 对年龄进行分段 123 | datamart.loc[ datamart['Age'] <= 16, 'Age'] = 0 124 | datamart.loc[(datamart['Age'] > 16) & (datamart['Age'] <= 32), 'Age'] = 1 125 | datamart.loc[(datamart['Age'] > 32) & (datamart['Age'] <= 48), 'Age'] = 2 126 | datamart.loc[(datamart['Age'] > 48) & (datamart['Age'] <= 64), 'Age'] = 3 127 | datamart.loc[datamart['Age'] > 64, 'Age'] = 4 128 | 129 | 130 | # 特征选择,先对处理过的不需要的特征进行删除,定义一个列表,然后批量删除 131 | drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp'] 132 | datamart = datamart.drop(drop_elements, axis = 1) 133 | datamart = datamart.drop(['CategoricalAge', 'CategoricalFare'], axis = 1) 134 | # test_ = test_.drop(drop_elements, axis = 1) 135 | 136 | datamart.head() 137 | ``` 138 | 139 | C:\Users\IBM\Anaconda3\lib\site-packages\ipykernel_launcher.py:27: SettingWithCopyWarning: 140 | A value is trying to be set on a copy of a slice from a DataFrame 141 | 142 | See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy 143 | 144 | 145 | 146 | 147 | 148 |
|   | Age | Embarked | Fare | Parch | Pclass | Sex | Survived | number | Name_length | Has_Cabin | FamilySize | IsAlone | Title |
|---|-----|----------|------|-------|--------|-----|----------|--------|-------------|-----------|------------|---------|-------|
| 0 | 1 | 0 | 0 | 0 | 3 | 1 | 0.0 | 1 | 23 | 0 | 2 | 0 | 1 |
| 1 | 2 | 1 | 3 | 0 | 1 | 0 | 1.0 | 1 | 51 | 1 | 2 | 0 | 3 |
| 2 | 1 | 0 | 1 | 0 | 3 | 0 | 1.0 | 1 | 22 | 0 | 1 | 1 | 2 |
| 3 | 2 | 0 | 3 | 0 | 1 | 0 | 1.0 | 1 | 44 | 1 | 2 | 0 | 3 |
| 4 | 2 | 0 | 1 | 0 | 3 | 1 | 0.0 | 1 | 24 | 0 | 1 | 1 | 1 |
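The `SettingWithCopyWarning` printed above comes from the chained assignment `datamart['Age'][np.isnan(datamart['Age'])] = age_null_random_list` in the feature-engineering cell. A minimal sketch of the single-step `.loc` form that avoids the warning, assuming it replaces that line (same variables as above):

```python
# One .loc call selects and writes in a single operation, so pandas knows the
# assignment targets datamart itself and raises no SettingWithCopyWarning.
datamart.loc[datamart['Age'].isnull(), 'Age'] = age_null_random_list
```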
### 2.对特征处理后的测试集和训练集分开


```python
# 通过loc方法选取训练集的数据
train_new = datamart.loc[datamart['number'] == 1]
# 对number列进行删除
train_new = train_new.drop(['number'],axis=1)
```


```python
train_new.head()
```

|   | Age | Embarked | Fare | Parch | Pclass | Sex | Survived | Name_length | Has_Cabin | FamilySize | IsAlone | Title |
|---|-----|----------|------|-------|--------|-----|----------|-------------|-----------|------------|---------|-------|
| 0 | 1 | 0 | 0 | 0 | 3 | 1 | 0.0 | 23 | 0 | 2 | 0 | 1 |
| 1 | 2 | 1 | 3 | 0 | 1 | 0 | 1.0 | 51 | 1 | 2 | 0 | 3 |
| 2 | 1 | 0 | 1 | 0 | 3 | 0 | 1.0 | 22 | 0 | 1 | 1 | 2 |
| 3 | 2 | 0 | 3 | 0 | 1 | 0 | 1.0 | 44 | 1 | 2 | 0 | 3 |
| 4 | 2 | 0 | 1 | 0 | 3 | 1 | 0.0 | 24 | 0 | 1 | 1 | 1 |
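A quick sanity check that the indicator-column split recovered the original Kaggle partition (assuming the standard train.csv with 891 rows and the 12 engineered columns shown above):

```python
# Expect 891 rows here; the test half built in the next cell should have 418.
print(train_new.shape)  # (891, 12)
```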
```python
test_new = datamart.loc[datamart['number'] == 0]
drop_columns = ['number','Survived']
test_new = test_new.drop(drop_columns,axis=1)
```


```python
test_new.head()
```

|   | Age | Embarked | Fare | Parch | Pclass | Sex | Name_length | Has_Cabin | FamilySize | IsAlone | Title |
|---|-----|----------|------|-------|--------|-----|-------------|-----------|------------|---------|-------|
| 0 | 2 | 2 | 0 | 0 | 3 | 1 | 16 | 0 | 1 | 1 | 1 |
| 1 | 2 | 0 | 0 | 0 | 3 | 0 | 32 | 0 | 2 | 0 | 3 |
| 2 | 3 | 2 | 1 | 0 | 2 | 1 | 25 | 0 | 1 | 1 | 1 |
| 3 | 1 | 0 | 1 | 0 | 3 | 1 | 16 | 0 | 1 | 1 | 1 |
| 4 | 1 | 0 | 1 | 1 | 3 | 0 | 44 | 0 | 3 | 0 | 3 |
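With both frames cleaned, the remaining step before training the first-level models imported in the next section is to detach the label from the features. A minimal illustrative sketch (the names `y_train`, `x_train` and `x_test` are not from the original notebook):

```python
# Illustrative only: split off the target and convert to NumPy arrays,
# which is the form the sklearn ensemble classifiers below expect.
y_train = train_new['Survived'].ravel()                # label vector
x_train = train_new.drop(['Survived'], axis=1).values  # training features
x_test = test_new.values                               # test features
```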
520 | 521 | 522 | 523 | ### 3.导入机器学习库对数据特征进行探索 524 | 525 | 526 | ```python 527 | import sklearn 528 | import plotly.offline as py 529 | py.init_notebook_mode(connected=True) 530 | import plotly.graph_objs as go 531 | import plotly.tools as tls 532 | 533 | from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier 534 | from sklearn.svm import SVC 535 | from sklearn.model_selection import KFold 536 | ``` 537 | 538 | 539 | 554 | 555 | 556 | 557 | 558 | ```python 559 | 560 | ``` 561 | -------------------------------------------------------------------------------- /Stacking/changelog.md: -------------------------------------------------------------------------------- 1 | 20190621 2 | v1.0 本目录主要用于更新Stacking模型融合集成学习方法,结合kaggle泰坦尼克数据集进行相关机器学习方法搭建,同时会涉及到python对象方法的简单学习 3 | -------------------------------------------------------------------------------- /Stacking/kaggle_titanic_data/gender_submission.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived 2 | 892,0 3 | 893,1 4 | 894,0 5 | 895,0 6 | 896,1 7 | 897,0 8 | 898,1 9 | 899,0 10 | 900,1 11 | 901,0 12 | 902,0 13 | 903,0 14 | 904,1 15 | 905,0 16 | 906,1 17 | 907,1 18 | 908,0 19 | 909,0 20 | 910,1 21 | 911,1 22 | 912,0 23 | 913,0 24 | 914,1 25 | 915,0 26 | 916,1 27 | 917,0 28 | 918,1 29 | 919,0 30 | 920,0 31 | 921,0 32 | 922,0 33 | 923,0 34 | 924,1 35 | 925,1 36 | 926,0 37 | 927,0 38 | 928,1 39 | 929,1 40 | 930,0 41 | 931,0 42 | 932,0 43 | 933,0 44 | 934,0 45 | 935,1 46 | 936,1 47 | 937,0 48 | 938,0 49 | 939,0 50 | 940,1 51 | 941,1 52 | 942,0 53 | 943,0 54 | 944,1 55 | 945,1 56 | 946,0 57 | 947,0 58 | 948,0 59 | 949,0 60 | 950,0 61 | 951,1 62 | 952,0 63 | 953,0 64 | 954,0 65 | 955,1 66 | 956,0 67 | 957,1 68 | 958,1 69 | 959,0 70 | 960,0 71 | 961,1 72 | 962,1 73 | 963,0 74 | 964,1 75 | 965,0 76 | 966,1 77 | 967,0 78 | 968,0 79 | 969,1 80 | 970,0 81 | 971,1 82 | 972,0 83 | 973,0 84 | 974,0 85 | 975,0 86 | 976,0 87 | 977,0 88 | 978,1 89 | 979,1 90 | 980,1 91 | 981,0 92 | 982,1 93 | 983,0 94 | 984,1 95 | 985,0 96 | 986,0 97 | 987,0 98 | 988,1 99 | 989,0 100 | 990,1 101 | 991,0 102 | 992,1 103 | 993,0 104 | 994,0 105 | 995,0 106 | 996,1 107 | 997,0 108 | 998,0 109 | 999,0 110 | 1000,0 111 | 1001,0 112 | 1002,0 113 | 1003,1 114 | 1004,1 115 | 1005,1 116 | 1006,1 117 | 1007,0 118 | 1008,0 119 | 1009,1 120 | 1010,0 121 | 1011,1 122 | 1012,1 123 | 1013,0 124 | 1014,1 125 | 1015,0 126 | 1016,0 127 | 1017,1 128 | 1018,0 129 | 1019,1 130 | 1020,0 131 | 1021,0 132 | 1022,0 133 | 1023,0 134 | 1024,1 135 | 1025,0 136 | 1026,0 137 | 1027,0 138 | 1028,0 139 | 1029,0 140 | 1030,1 141 | 1031,0 142 | 1032,1 143 | 1033,1 144 | 1034,0 145 | 1035,0 146 | 1036,0 147 | 1037,0 148 | 1038,0 149 | 1039,0 150 | 1040,0 151 | 1041,0 152 | 1042,1 153 | 1043,0 154 | 1044,0 155 | 1045,1 156 | 1046,0 157 | 1047,0 158 | 1048,1 159 | 1049,1 160 | 1050,0 161 | 1051,1 162 | 1052,1 163 | 1053,0 164 | 1054,1 165 | 1055,0 166 | 1056,0 167 | 1057,1 168 | 1058,0 169 | 1059,0 170 | 1060,1 171 | 1061,1 172 | 1062,0 173 | 1063,0 174 | 1064,0 175 | 1065,0 176 | 1066,0 177 | 1067,1 178 | 1068,1 179 | 1069,0 180 | 1070,1 181 | 1071,1 182 | 1072,0 183 | 1073,0 184 | 1074,1 185 | 1075,0 186 | 1076,1 187 | 1077,0 188 | 1078,1 189 | 1079,0 190 | 1080,1 191 | 1081,0 192 | 1082,0 193 | 1083,0 194 | 1084,0 195 | 1085,0 196 | 1086,0 197 | 1087,0 198 | 1088,0 199 | 1089,1 200 | 1090,0 201 | 1091,1 202 | 1092,1 203 | 1093,0 204 | 1094,0 205 | 1095,1 206 | 1096,0 207 | 1097,0 208 | 
1098,1 209 | 1099,0 210 | 1100,1 211 | 1101,0 212 | 1102,0 213 | 1103,0 214 | 1104,0 215 | 1105,1 216 | 1106,1 217 | 1107,0 218 | 1108,1 219 | 1109,0 220 | 1110,1 221 | 1111,0 222 | 1112,1 223 | 1113,0 224 | 1114,1 225 | 1115,0 226 | 1116,1 227 | 1117,1 228 | 1118,0 229 | 1119,1 230 | 1120,0 231 | 1121,0 232 | 1122,0 233 | 1123,1 234 | 1124,0 235 | 1125,0 236 | 1126,0 237 | 1127,0 238 | 1128,0 239 | 1129,0 240 | 1130,1 241 | 1131,1 242 | 1132,1 243 | 1133,1 244 | 1134,0 245 | 1135,0 246 | 1136,0 247 | 1137,0 248 | 1138,1 249 | 1139,0 250 | 1140,1 251 | 1141,1 252 | 1142,1 253 | 1143,0 254 | 1144,0 255 | 1145,0 256 | 1146,0 257 | 1147,0 258 | 1148,0 259 | 1149,0 260 | 1150,1 261 | 1151,0 262 | 1152,0 263 | 1153,0 264 | 1154,1 265 | 1155,1 266 | 1156,0 267 | 1157,0 268 | 1158,0 269 | 1159,0 270 | 1160,1 271 | 1161,0 272 | 1162,0 273 | 1163,0 274 | 1164,1 275 | 1165,1 276 | 1166,0 277 | 1167,1 278 | 1168,0 279 | 1169,0 280 | 1170,0 281 | 1171,0 282 | 1172,1 283 | 1173,0 284 | 1174,1 285 | 1175,1 286 | 1176,1 287 | 1177,0 288 | 1178,0 289 | 1179,0 290 | 1180,0 291 | 1181,0 292 | 1182,0 293 | 1183,1 294 | 1184,0 295 | 1185,0 296 | 1186,0 297 | 1187,0 298 | 1188,1 299 | 1189,0 300 | 1190,0 301 | 1191,0 302 | 1192,0 303 | 1193,0 304 | 1194,0 305 | 1195,0 306 | 1196,1 307 | 1197,1 308 | 1198,0 309 | 1199,0 310 | 1200,0 311 | 1201,1 312 | 1202,0 313 | 1203,0 314 | 1204,0 315 | 1205,1 316 | 1206,1 317 | 1207,1 318 | 1208,0 319 | 1209,0 320 | 1210,0 321 | 1211,0 322 | 1212,0 323 | 1213,0 324 | 1214,0 325 | 1215,0 326 | 1216,1 327 | 1217,0 328 | 1218,1 329 | 1219,0 330 | 1220,0 331 | 1221,0 332 | 1222,1 333 | 1223,0 334 | 1224,0 335 | 1225,1 336 | 1226,0 337 | 1227,0 338 | 1228,0 339 | 1229,0 340 | 1230,0 341 | 1231,0 342 | 1232,0 343 | 1233,0 344 | 1234,0 345 | 1235,1 346 | 1236,0 347 | 1237,1 348 | 1238,0 349 | 1239,1 350 | 1240,0 351 | 1241,1 352 | 1242,1 353 | 1243,0 354 | 1244,0 355 | 1245,0 356 | 1246,1 357 | 1247,0 358 | 1248,1 359 | 1249,0 360 | 1250,0 361 | 1251,1 362 | 1252,0 363 | 1253,1 364 | 1254,1 365 | 1255,0 366 | 1256,1 367 | 1257,1 368 | 1258,0 369 | 1259,1 370 | 1260,1 371 | 1261,0 372 | 1262,0 373 | 1263,1 374 | 1264,0 375 | 1265,0 376 | 1266,1 377 | 1267,1 378 | 1268,1 379 | 1269,0 380 | 1270,0 381 | 1271,0 382 | 1272,0 383 | 1273,0 384 | 1274,1 385 | 1275,1 386 | 1276,0 387 | 1277,1 388 | 1278,0 389 | 1279,0 390 | 1280,0 391 | 1281,0 392 | 1282,0 393 | 1283,1 394 | 1284,0 395 | 1285,0 396 | 1286,0 397 | 1287,1 398 | 1288,0 399 | 1289,1 400 | 1290,0 401 | 1291,0 402 | 1292,1 403 | 1293,0 404 | 1294,1 405 | 1295,0 406 | 1296,0 407 | 1297,0 408 | 1298,0 409 | 1299,0 410 | 1300,1 411 | 1301,1 412 | 1302,1 413 | 1303,1 414 | 1304,1 415 | 1305,0 416 | 1306,1 417 | 1307,0 418 | 1308,0 419 | 1309,0 420 | -------------------------------------------------------------------------------- /Stacking/kaggle_titanic_data/test.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked 2 | 892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q 3 | 893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47,1,0,363272,7,,S 4 | 894,2,"Myles, Mr. Thomas Francis",male,62,0,0,240276,9.6875,,Q 5 | 895,3,"Wirz, Mr. Albert",male,27,0,0,315154,8.6625,,S 6 | 896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22,1,1,3101298,12.2875,,S 7 | 897,3,"Svensson, Mr. Johan Cervin",male,14,0,0,7538,9.225,,S 8 | 898,3,"Connolly, Miss. Kate",female,30,0,0,330972,7.6292,,Q 9 | 899,2,"Caldwell, Mr. 
Albert Francis",male,26,1,1,248738,29,,S 10 | 900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18,0,0,2657,7.2292,,C 11 | 901,3,"Davies, Mr. John Samuel",male,21,2,0,A/4 48871,24.15,,S 12 | 902,3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S 13 | 903,1,"Jones, Mr. Charles Cresson",male,46,0,0,694,26,,S 14 | 904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",female,23,1,0,21228,82.2667,B45,S 15 | 905,2,"Howard, Mr. Benjamin",male,63,1,0,24065,26,,S 16 | 906,1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance Toogood)",female,47,1,0,W.E.P. 5734,61.175,E31,S 17 | 907,2,"del Carlo, Mrs. Sebastiano (Argenia Genovesi)",female,24,1,0,SC/PARIS 2167,27.7208,,C 18 | 908,2,"Keane, Mr. Daniel",male,35,0,0,233734,12.35,,Q 19 | 909,3,"Assaf, Mr. Gerios",male,21,0,0,2692,7.225,,C 20 | 910,3,"Ilmakangas, Miss. Ida Livija",female,27,1,0,STON/O2. 3101270,7.925,,S 21 | 911,3,"Assaf Khalil, Mrs. Mariana (Miriam"")""",female,45,0,0,2696,7.225,,C 22 | 912,1,"Rothschild, Mr. Martin",male,55,1,0,PC 17603,59.4,,C 23 | 913,3,"Olsen, Master. Artur Karl",male,9,0,1,C 17368,3.1708,,S 24 | 914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S 25 | 915,1,"Williams, Mr. Richard Norris II",male,21,0,1,PC 17597,61.3792,,C 26 | 916,1,"Ryerson, Mrs. Arthur Larned (Emily Maria Borie)",female,48,1,3,PC 17608,262.375,B57 B59 B63 B66,C 27 | 917,3,"Robins, Mr. Alexander A",male,50,1,0,A/5. 3337,14.5,,S 28 | 918,1,"Ostby, Miss. Helene Ragnhild",female,22,0,1,113509,61.9792,B36,C 29 | 919,3,"Daher, Mr. Shedid",male,22.5,0,0,2698,7.225,,C 30 | 920,1,"Brady, Mr. John Bertram",male,41,0,0,113054,30.5,A21,S 31 | 921,3,"Samaan, Mr. Elias",male,,2,0,2662,21.6792,,C 32 | 922,2,"Louch, Mr. Charles Alexander",male,50,1,0,SC/AH 3085,26,,S 33 | 923,2,"Jefferys, Mr. Clifford Thomas",male,24,2,0,C.A. 31029,31.5,,S 34 | 924,3,"Dean, Mrs. Bertram (Eva Georgetta Light)",female,33,1,2,C.A. 2315,20.575,,S 35 | 925,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,W./C. 6607,23.45,,S 36 | 926,1,"Mock, Mr. Philipp Edmund",male,30,1,0,13236,57.75,C78,C 37 | 927,3,"Katavelas, Mr. Vassilios (Catavelas Vassilios"")""",male,18.5,0,0,2682,7.2292,,C 38 | 928,3,"Roth, Miss. Sarah A",female,,0,0,342712,8.05,,S 39 | 929,3,"Cacic, Miss. Manda",female,21,0,0,315087,8.6625,,S 40 | 930,3,"Sap, Mr. Julius",male,25,0,0,345768,9.5,,S 41 | 931,3,"Hee, Mr. Ling",male,,0,0,1601,56.4958,,S 42 | 932,3,"Karun, Mr. Franz",male,39,0,1,349256,13.4167,,C 43 | 933,1,"Franklin, Mr. Thomas Parham",male,,0,0,113778,26.55,D34,S 44 | 934,3,"Goldsmith, Mr. Nathan",male,41,0,0,SOTON/O.Q. 3101263,7.85,,S 45 | 935,2,"Corbett, Mrs. Walter H (Irene Colvin)",female,30,0,0,237249,13,,S 46 | 936,1,"Kimball, Mrs. Edwin Nelson Jr (Gertrude Parsons)",female,45,1,0,11753,52.5542,D19,S 47 | 937,3,"Peltomaki, Mr. Nikolai Johannes",male,25,0,0,STON/O 2. 3101291,7.925,,S 48 | 938,1,"Chevre, Mr. Paul Romaine",male,45,0,0,PC 17594,29.7,A9,C 49 | 939,3,"Shaughnessy, Mr. Patrick",male,,0,0,370374,7.75,,Q 50 | 940,1,"Bucknell, Mrs. William Robert (Emma Eliza Ward)",female,60,0,0,11813,76.2917,D15,C 51 | 941,3,"Coutts, Mrs. William (Winnie Minnie"" Treanor)""",female,36,0,2,C.A. 37671,15.9,,S 52 | 942,1,"Smith, Mr. Lucien Philip",male,24,1,0,13695,60,C31,S 53 | 943,2,"Pulbaum, Mr. Franz",male,27,0,0,SC/PARIS 2168,15.0333,,C 54 | 944,2,"Hocking, Miss. Ellen Nellie""""",female,20,2,1,29105,23,,S 55 | 945,1,"Fortune, Miss. Ethel Flora",female,28,3,2,19950,263,C23 C25 C27,S 56 | 946,2,"Mangiavacchi, Mr. 
Serafino Emilio",male,,0,0,SC/A.3 2861,15.5792,,C 57 | 947,3,"Rice, Master. Albert",male,10,4,1,382652,29.125,,Q 58 | 948,3,"Cor, Mr. Bartol",male,35,0,0,349230,7.8958,,S 59 | 949,3,"Abelseth, Mr. Olaus Jorgensen",male,25,0,0,348122,7.65,F G63,S 60 | 950,3,"Davison, Mr. Thomas Henry",male,,1,0,386525,16.1,,S 61 | 951,1,"Chaudanson, Miss. Victorine",female,36,0,0,PC 17608,262.375,B61,C 62 | 952,3,"Dika, Mr. Mirko",male,17,0,0,349232,7.8958,,S 63 | 953,2,"McCrae, Mr. Arthur Gordon",male,32,0,0,237216,13.5,,S 64 | 954,3,"Bjorklund, Mr. Ernst Herbert",male,18,0,0,347090,7.75,,S 65 | 955,3,"Bradley, Miss. Bridget Delia",female,22,0,0,334914,7.725,,Q 66 | 956,1,"Ryerson, Master. John Borie",male,13,2,2,PC 17608,262.375,B57 B59 B63 B66,C 67 | 957,2,"Corey, Mrs. Percy C (Mary Phyllis Elizabeth Miller)",female,,0,0,F.C.C. 13534,21,,S 68 | 958,3,"Burns, Miss. Mary Delia",female,18,0,0,330963,7.8792,,Q 69 | 959,1,"Moore, Mr. Clarence Bloomfield",male,47,0,0,113796,42.4,,S 70 | 960,1,"Tucker, Mr. Gilbert Milligan Jr",male,31,0,0,2543,28.5375,C53,C 71 | 961,1,"Fortune, Mrs. Mark (Mary McDougald)",female,60,1,4,19950,263,C23 C25 C27,S 72 | 962,3,"Mulvihill, Miss. Bertha E",female,24,0,0,382653,7.75,,Q 73 | 963,3,"Minkoff, Mr. Lazar",male,21,0,0,349211,7.8958,,S 74 | 964,3,"Nieminen, Miss. Manta Josefina",female,29,0,0,3101297,7.925,,S 75 | 965,1,"Ovies y Rodriguez, Mr. Servando",male,28.5,0,0,PC 17562,27.7208,D43,C 76 | 966,1,"Geiger, Miss. Amalie",female,35,0,0,113503,211.5,C130,C 77 | 967,1,"Keeping, Mr. Edwin",male,32.5,0,0,113503,211.5,C132,C 78 | 968,3,"Miles, Mr. Frank",male,,0,0,359306,8.05,,S 79 | 969,1,"Cornell, Mrs. Robert Clifford (Malvina Helen Lamson)",female,55,2,0,11770,25.7,C101,S 80 | 970,2,"Aldworth, Mr. Charles Augustus",male,30,0,0,248744,13,,S 81 | 971,3,"Doyle, Miss. Elizabeth",female,24,0,0,368702,7.75,,Q 82 | 972,3,"Boulos, Master. Akar",male,6,1,1,2678,15.2458,,C 83 | 973,1,"Straus, Mr. Isidor",male,67,1,0,PC 17483,221.7792,C55 C57,S 84 | 974,1,"Case, Mr. Howard Brown",male,49,0,0,19924,26,,S 85 | 975,3,"Demetri, Mr. Marinko",male,,0,0,349238,7.8958,,S 86 | 976,2,"Lamb, Mr. John Joseph",male,,0,0,240261,10.7083,,Q 87 | 977,3,"Khalil, Mr. Betros",male,,1,0,2660,14.4542,,C 88 | 978,3,"Barry, Miss. Julia",female,27,0,0,330844,7.8792,,Q 89 | 979,3,"Badman, Miss. Emily Louisa",female,18,0,0,A/4 31416,8.05,,S 90 | 980,3,"O'Donoghue, Ms. Bridget",female,,0,0,364856,7.75,,Q 91 | 981,2,"Wells, Master. Ralph Lester",male,2,1,1,29103,23,,S 92 | 982,3,"Dyker, Mrs. Adolf Fredrik (Anna Elisabeth Judith Andersson)",female,22,1,0,347072,13.9,,S 93 | 983,3,"Pedersen, Mr. Olaf",male,,0,0,345498,7.775,,S 94 | 984,1,"Davidson, Mrs. Thornton (Orian Hays)",female,27,1,2,F.C. 12750,52,B71,S 95 | 985,3,"Guest, Mr. Robert",male,,0,0,376563,8.05,,S 96 | 986,1,"Birnbaum, Mr. Jakob",male,25,0,0,13905,26,,C 97 | 987,3,"Tenglin, Mr. Gunnar Isidor",male,25,0,0,350033,7.7958,,S 98 | 988,1,"Cavendish, Mrs. Tyrell William (Julia Florence Siegel)",female,76,1,0,19877,78.85,C46,S 99 | 989,3,"Makinen, Mr. Kalle Edvard",male,29,0,0,STON/O 2. 3101268,7.925,,S 100 | 990,3,"Braf, Miss. Elin Ester Maria",female,20,0,0,347471,7.8542,,S 101 | 991,3,"Nancarrow, Mr. William Henry",male,33,0,0,A./5. 3338,8.05,,S 102 | 992,1,"Stengel, Mrs. Charles Emil Henry (Annie May Morris)",female,43,1,0,11778,55.4417,C116,C 103 | 993,2,"Weisz, Mr. Leopold",male,27,1,0,228414,26,,S 104 | 994,3,"Foley, Mr. William",male,,0,0,365235,7.75,,Q 105 | 995,3,"Johansson Palmquist, Mr. 
Oskar Leander",male,26,0,0,347070,7.775,,S 106 | 996,3,"Thomas, Mrs. Alexander (Thamine Thelma"")""",female,16,1,1,2625,8.5167,,C 107 | 997,3,"Holthen, Mr. Johan Martin",male,28,0,0,C 4001,22.525,,S 108 | 998,3,"Buckley, Mr. Daniel",male,21,0,0,330920,7.8208,,Q 109 | 999,3,"Ryan, Mr. Edward",male,,0,0,383162,7.75,,Q 110 | 1000,3,"Willer, Mr. Aaron (Abi Weller"")""",male,,0,0,3410,8.7125,,S 111 | 1001,2,"Swane, Mr. George",male,18.5,0,0,248734,13,F,S 112 | 1002,2,"Stanton, Mr. Samuel Ward",male,41,0,0,237734,15.0458,,C 113 | 1003,3,"Shine, Miss. Ellen Natalia",female,,0,0,330968,7.7792,,Q 114 | 1004,1,"Evans, Miss. Edith Corse",female,36,0,0,PC 17531,31.6792,A29,C 115 | 1005,3,"Buckley, Miss. Katherine",female,18.5,0,0,329944,7.2833,,Q 116 | 1006,1,"Straus, Mrs. Isidor (Rosalie Ida Blun)",female,63,1,0,PC 17483,221.7792,C55 C57,S 117 | 1007,3,"Chronopoulos, Mr. Demetrios",male,18,1,0,2680,14.4542,,C 118 | 1008,3,"Thomas, Mr. John",male,,0,0,2681,6.4375,,C 119 | 1009,3,"Sandstrom, Miss. Beatrice Irene",female,1,1,1,PP 9549,16.7,G6,S 120 | 1010,1,"Beattie, Mr. Thomson",male,36,0,0,13050,75.2417,C6,C 121 | 1011,2,"Chapman, Mrs. John Henry (Sara Elizabeth Lawry)",female,29,1,0,SC/AH 29037,26,,S 122 | 1012,2,"Watt, Miss. Bertha J",female,12,0,0,C.A. 33595,15.75,,S 123 | 1013,3,"Kiernan, Mr. John",male,,1,0,367227,7.75,,Q 124 | 1014,1,"Schabert, Mrs. Paul (Emma Mock)",female,35,1,0,13236,57.75,C28,C 125 | 1015,3,"Carver, Mr. Alfred John",male,28,0,0,392095,7.25,,S 126 | 1016,3,"Kennedy, Mr. John",male,,0,0,368783,7.75,,Q 127 | 1017,3,"Cribb, Miss. Laura Alice",female,17,0,1,371362,16.1,,S 128 | 1018,3,"Brobeck, Mr. Karl Rudolf",male,22,0,0,350045,7.7958,,S 129 | 1019,3,"McCoy, Miss. Alicia",female,,2,0,367226,23.25,,Q 130 | 1020,2,"Bowenur, Mr. Solomon",male,42,0,0,211535,13,,S 131 | 1021,3,"Petersen, Mr. Marius",male,24,0,0,342441,8.05,,S 132 | 1022,3,"Spinner, Mr. Henry John",male,32,0,0,STON/OQ. 369943,8.05,,S 133 | 1023,1,"Gracie, Col. Archibald IV",male,53,0,0,113780,28.5,C51,C 134 | 1024,3,"Lefebre, Mrs. Frank (Frances)",female,,0,4,4133,25.4667,,S 135 | 1025,3,"Thomas, Mr. Charles P",male,,1,0,2621,6.4375,,C 136 | 1026,3,"Dintcheff, Mr. Valtcho",male,43,0,0,349226,7.8958,,S 137 | 1027,3,"Carlsson, Mr. Carl Robert",male,24,0,0,350409,7.8542,,S 138 | 1028,3,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,,C 139 | 1029,2,"Schmidt, Mr. August",male,26,0,0,248659,13,,S 140 | 1030,3,"Drapkin, Miss. Jennie",female,23,0,0,SOTON/OQ 392083,8.05,,S 141 | 1031,3,"Goodwin, Mr. Charles Frederick",male,40,1,6,CA 2144,46.9,,S 142 | 1032,3,"Goodwin, Miss. Jessie Allis",female,10,5,2,CA 2144,46.9,,S 143 | 1033,1,"Daniels, Miss. Sarah",female,33,0,0,113781,151.55,,S 144 | 1034,1,"Ryerson, Mr. Arthur Larned",male,61,1,3,PC 17608,262.375,B57 B59 B63 B66,C 145 | 1035,2,"Beauchamp, Mr. Henry James",male,28,0,0,244358,26,,S 146 | 1036,1,"Lindeberg-Lind, Mr. Erik Gustaf (Mr Edward Lingrey"")""",male,42,0,0,17475,26.55,,S 147 | 1037,3,"Vander Planke, Mr. Julius",male,31,3,0,345763,18,,S 148 | 1038,1,"Hilliard, Mr. Herbert Henry",male,,0,0,17463,51.8625,E46,S 149 | 1039,3,"Davies, Mr. Evan",male,22,0,0,SC/A4 23568,8.05,,S 150 | 1040,1,"Crafton, Mr. John Bertram",male,,0,0,113791,26.55,,S 151 | 1041,2,"Lahtinen, Rev. William",male,30,1,1,250651,26,,S 152 | 1042,1,"Earnshaw, Mrs. Boulton (Olive Potter)",female,23,0,1,11767,83.1583,C54,C 153 | 1043,3,"Matinoff, Mr. Nicola",male,,0,0,349255,7.8958,,C 154 | 1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S 155 | 1045,3,"Klasen, Mrs. 
(Hulda Kristina Eugenia Lofqvist)",female,36,0,2,350405,12.1833,,S 156 | 1046,3,"Asplund, Master. Filip Oscar",male,13,4,2,347077,31.3875,,S 157 | 1047,3,"Duquemin, Mr. Joseph",male,24,0,0,S.O./P.P. 752,7.55,,S 158 | 1048,1,"Bird, Miss. Ellen",female,29,0,0,PC 17483,221.7792,C97,S 159 | 1049,3,"Lundin, Miss. Olga Elida",female,23,0,0,347469,7.8542,,S 160 | 1050,1,"Borebank, Mr. John James",male,42,0,0,110489,26.55,D22,S 161 | 1051,3,"Peacock, Mrs. Benjamin (Edith Nile)",female,26,0,2,SOTON/O.Q. 3101315,13.775,,S 162 | 1052,3,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q 163 | 1053,3,"Touma, Master. Georges Youssef",male,7,1,1,2650,15.2458,,C 164 | 1054,2,"Wright, Miss. Marion",female,26,0,0,220844,13.5,,S 165 | 1055,3,"Pearce, Mr. Ernest",male,,0,0,343271,7,,S 166 | 1056,2,"Peruschitz, Rev. Joseph Maria",male,41,0,0,237393,13,,S 167 | 1057,3,"Kink-Heilmann, Mrs. Anton (Luise Heilmann)",female,26,1,1,315153,22.025,,S 168 | 1058,1,"Brandeis, Mr. Emil",male,48,0,0,PC 17591,50.4958,B10,C 169 | 1059,3,"Ford, Mr. Edward Watson",male,18,2,2,W./C. 6608,34.375,,S 170 | 1060,1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genevieve Fosdick)",female,,0,0,17770,27.7208,,C 171 | 1061,3,"Hellstrom, Miss. Hilda Maria",female,22,0,0,7548,8.9625,,S 172 | 1062,3,"Lithman, Mr. Simon",male,,0,0,S.O./P.P. 251,7.55,,S 173 | 1063,3,"Zakarian, Mr. Ortin",male,27,0,0,2670,7.225,,C 174 | 1064,3,"Dyker, Mr. Adolf Fredrik",male,23,1,0,347072,13.9,,S 175 | 1065,3,"Torfa, Mr. Assad",male,,0,0,2673,7.2292,,C 176 | 1066,3,"Asplund, Mr. Carl Oscar Vilhelm Gustafsson",male,40,1,5,347077,31.3875,,S 177 | 1067,2,"Brown, Miss. Edith Eileen",female,15,0,2,29750,39,,S 178 | 1068,2,"Sincock, Miss. Maude",female,20,0,0,C.A. 33112,36.75,,S 179 | 1069,1,"Stengel, Mr. Charles Emil Henry",male,54,1,0,11778,55.4417,C116,C 180 | 1070,2,"Becker, Mrs. Allen Oliver (Nellie E Baumgardner)",female,36,0,3,230136,39,F4,S 181 | 1071,1,"Compton, Mrs. Alexander Taylor (Mary Eliza Ingersoll)",female,64,0,2,PC 17756,83.1583,E45,C 182 | 1072,2,"McCrie, Mr. James Matthew",male,30,0,0,233478,13,,S 183 | 1073,1,"Compton, Mr. Alexander Taylor Jr",male,37,1,1,PC 17756,83.1583,E52,C 184 | 1074,1,"Marvin, Mrs. Daniel Warner (Mary Graham Carmichael Farquarson)",female,18,1,0,113773,53.1,D30,S 185 | 1075,3,"Lane, Mr. Patrick",male,,0,0,7935,7.75,,Q 186 | 1076,1,"Douglas, Mrs. Frederick Charles (Mary Helene Baxter)",female,27,1,1,PC 17558,247.5208,B58 B60,C 187 | 1077,2,"Maybery, Mr. Frank Hubert",male,40,0,0,239059,16,,S 188 | 1078,2,"Phillips, Miss. Alice Frances Louisa",female,21,0,1,S.O./P.P. 2,21,,S 189 | 1079,3,"Davies, Mr. Joseph",male,17,2,0,A/4 48873,8.05,,S 190 | 1080,3,"Sage, Miss. Ada",female,,8,2,CA. 2343,69.55,,S 191 | 1081,2,"Veal, Mr. James",male,40,0,0,28221,13,,S 192 | 1082,2,"Angle, Mr. William A",male,34,1,0,226875,26,,S 193 | 1083,1,"Salomon, Mr. Abraham L",male,,0,0,111163,26,,S 194 | 1084,3,"van Billiard, Master. Walter John",male,11.5,1,1,A/5. 851,14.5,,S 195 | 1085,2,"Lingane, Mr. John",male,61,0,0,235509,12.35,,Q 196 | 1086,2,"Drew, Master. Marshall Brines",male,8,0,2,28220,32.5,,S 197 | 1087,3,"Karlsson, Mr. Julius Konrad Eugen",male,33,0,0,347465,7.8542,,S 198 | 1088,1,"Spedden, Master. Robert Douglas",male,6,0,2,16966,134.5,E34,C 199 | 1089,3,"Nilsson, Miss. Berta Olivia",female,18,0,0,347066,7.775,,S 200 | 1090,2,"Baimbrigge, Mr. Charles Robert",male,23,0,0,C.A. 31030,10.5,,S 201 | 1091,3,"Rasmussen, Mrs. (Lena Jacobsen Solvang)",female,,0,0,65305,8.1125,,S 202 | 1092,3,"Murphy, Miss. 
Nora",female,,0,0,36568,15.5,,Q 203 | 1093,3,"Danbom, Master. Gilbert Sigvard Emanuel",male,0.33,0,2,347080,14.4,,S 204 | 1094,1,"Astor, Col. John Jacob",male,47,1,0,PC 17757,227.525,C62 C64,C 205 | 1095,2,"Quick, Miss. Winifred Vera",female,8,1,1,26360,26,,S 206 | 1096,2,"Andrew, Mr. Frank Thomas",male,25,0,0,C.A. 34050,10.5,,S 207 | 1097,1,"Omont, Mr. Alfred Fernand",male,,0,0,F.C. 12998,25.7417,,C 208 | 1098,3,"McGowan, Miss. Katherine",female,35,0,0,9232,7.75,,Q 209 | 1099,2,"Collett, Mr. Sidney C Stuart",male,24,0,0,28034,10.5,,S 210 | 1100,1,"Rosenbaum, Miss. Edith Louise",female,33,0,0,PC 17613,27.7208,A11,C 211 | 1101,3,"Delalic, Mr. Redjo",male,25,0,0,349250,7.8958,,S 212 | 1102,3,"Andersen, Mr. Albert Karvin",male,32,0,0,C 4001,22.525,,S 213 | 1103,3,"Finoli, Mr. Luigi",male,,0,0,SOTON/O.Q. 3101308,7.05,,S 214 | 1104,2,"Deacon, Mr. Percy William",male,17,0,0,S.O.C. 14879,73.5,,S 215 | 1105,2,"Howard, Mrs. Benjamin (Ellen Truelove Arman)",female,60,1,0,24065,26,,S 216 | 1106,3,"Andersson, Miss. Ida Augusta Margareta",female,38,4,2,347091,7.775,,S 217 | 1107,1,"Head, Mr. Christopher",male,42,0,0,113038,42.5,B11,S 218 | 1108,3,"Mahon, Miss. Bridget Delia",female,,0,0,330924,7.8792,,Q 219 | 1109,1,"Wick, Mr. George Dennick",male,57,1,1,36928,164.8667,,S 220 | 1110,1,"Widener, Mrs. George Dunton (Eleanor Elkins)",female,50,1,1,113503,211.5,C80,C 221 | 1111,3,"Thomson, Mr. Alexander Morrison",male,,0,0,32302,8.05,,S 222 | 1112,2,"Duran y More, Miss. Florentina",female,30,1,0,SC/PARIS 2148,13.8583,,C 223 | 1113,3,"Reynolds, Mr. Harold J",male,21,0,0,342684,8.05,,S 224 | 1114,2,"Cook, Mrs. (Selena Rogers)",female,22,0,0,W./C. 14266,10.5,F33,S 225 | 1115,3,"Karlsson, Mr. Einar Gervasius",male,21,0,0,350053,7.7958,,S 226 | 1116,1,"Candee, Mrs. Edward (Helen Churchill Hungerford)",female,53,0,0,PC 17606,27.4458,,C 227 | 1117,3,"Moubarek, Mrs. George (Omine Amenia"" Alexander)""",female,,0,2,2661,15.2458,,C 228 | 1118,3,"Asplund, Mr. Johan Charles",male,23,0,0,350054,7.7958,,S 229 | 1119,3,"McNeill, Miss. Bridget",female,,0,0,370368,7.75,,Q 230 | 1120,3,"Everett, Mr. Thomas James",male,40.5,0,0,C.A. 6212,15.1,,S 231 | 1121,2,"Hocking, Mr. Samuel James Metcalfe",male,36,0,0,242963,13,,S 232 | 1122,2,"Sweet, Mr. George Frederick",male,14,0,0,220845,65,,S 233 | 1123,1,"Willard, Miss. Constance",female,21,0,0,113795,26.55,,S 234 | 1124,3,"Wiklund, Mr. Karl Johan",male,21,1,0,3101266,6.4958,,S 235 | 1125,3,"Linehan, Mr. Michael",male,,0,0,330971,7.8792,,Q 236 | 1126,1,"Cumings, Mr. John Bradley",male,39,1,0,PC 17599,71.2833,C85,C 237 | 1127,3,"Vendel, Mr. Olof Edvin",male,20,0,0,350416,7.8542,,S 238 | 1128,1,"Warren, Mr. Frank Manley",male,64,1,0,110813,75.25,D37,C 239 | 1129,3,"Baccos, Mr. Raffull",male,20,0,0,2679,7.225,,C 240 | 1130,2,"Hiltunen, Miss. Marta",female,18,1,1,250650,13,,S 241 | 1131,1,"Douglas, Mrs. Walter Donald (Mahala Dutton)",female,48,1,0,PC 17761,106.425,C86,C 242 | 1132,1,"Lindstrom, Mrs. Carl Johan (Sigrid Posse)",female,55,0,0,112377,27.7208,,C 243 | 1133,2,"Christy, Mrs. (Alice Frances)",female,45,0,2,237789,30,,S 244 | 1134,1,"Spedden, Mr. Frederic Oakley",male,45,1,1,16966,134.5,E34,C 245 | 1135,3,"Hyman, Mr. Abraham",male,,0,0,3470,7.8875,,S 246 | 1136,3,"Johnston, Master. William Arthur Willie""""",male,,1,2,W./C. 6607,23.45,,S 247 | 1137,1,"Kenyon, Mr. Frederick R",male,41,1,0,17464,51.8625,D21,S 248 | 1138,2,"Karnes, Mrs. J Frank (Claire Bennett)",female,22,0,0,F.C.C. 13534,21,,S 249 | 1139,2,"Drew, Mr. 
James Vivian",male,42,1,1,28220,32.5,,S 250 | 1140,2,"Hold, Mrs. Stephen (Annie Margaret Hill)",female,29,1,0,26707,26,,S 251 | 1141,3,"Khalil, Mrs. Betros (Zahie Maria"" Elias)""",female,,1,0,2660,14.4542,,C 252 | 1142,2,"West, Miss. Barbara J",female,0.92,1,2,C.A. 34651,27.75,,S 253 | 1143,3,"Abrahamsson, Mr. Abraham August Johannes",male,20,0,0,SOTON/O2 3101284,7.925,,S 254 | 1144,1,"Clark, Mr. Walter Miller",male,27,1,0,13508,136.7792,C89,C 255 | 1145,3,"Salander, Mr. Karl Johan",male,24,0,0,7266,9.325,,S 256 | 1146,3,"Wenzel, Mr. Linhart",male,32.5,0,0,345775,9.5,,S 257 | 1147,3,"MacKay, Mr. George William",male,,0,0,C.A. 42795,7.55,,S 258 | 1148,3,"Mahon, Mr. John",male,,0,0,AQ/4 3130,7.75,,Q 259 | 1149,3,"Niklasson, Mr. Samuel",male,28,0,0,363611,8.05,,S 260 | 1150,2,"Bentham, Miss. Lilian W",female,19,0,0,28404,13,,S 261 | 1151,3,"Midtsjo, Mr. Karl Albert",male,21,0,0,345501,7.775,,S 262 | 1152,3,"de Messemaeker, Mr. Guillaume Joseph",male,36.5,1,0,345572,17.4,,S 263 | 1153,3,"Nilsson, Mr. August Ferdinand",male,21,0,0,350410,7.8542,,S 264 | 1154,2,"Wells, Mrs. Arthur Henry (Addie"" Dart Trevaskis)""",female,29,0,2,29103,23,,S 265 | 1155,3,"Klasen, Miss. Gertrud Emilia",female,1,1,1,350405,12.1833,,S 266 | 1156,2,"Portaluppi, Mr. Emilio Ilario Giuseppe",male,30,0,0,C.A. 34644,12.7375,,C 267 | 1157,3,"Lyntakoff, Mr. Stanko",male,,0,0,349235,7.8958,,S 268 | 1158,1,"Chisholm, Mr. Roderick Robert Crispin",male,,0,0,112051,0,,S 269 | 1159,3,"Warren, Mr. Charles William",male,,0,0,C.A. 49867,7.55,,S 270 | 1160,3,"Howard, Miss. May Elizabeth",female,,0,0,A. 2. 39186,8.05,,S 271 | 1161,3,"Pokrnic, Mr. Mate",male,17,0,0,315095,8.6625,,S 272 | 1162,1,"McCaffry, Mr. Thomas Francis",male,46,0,0,13050,75.2417,C6,C 273 | 1163,3,"Fox, Mr. Patrick",male,,0,0,368573,7.75,,Q 274 | 1164,1,"Clark, Mrs. Walter Miller (Virginia McDowell)",female,26,1,0,13508,136.7792,C89,C 275 | 1165,3,"Lennon, Miss. Mary",female,,1,0,370371,15.5,,Q 276 | 1166,3,"Saade, Mr. Jean Nassr",male,,0,0,2676,7.225,,C 277 | 1167,2,"Bryhl, Miss. Dagmar Jenny Ingeborg ",female,20,1,0,236853,26,,S 278 | 1168,2,"Parker, Mr. Clifford Richard",male,28,0,0,SC 14888,10.5,,S 279 | 1169,2,"Faunthorpe, Mr. Harry",male,40,1,0,2926,26,,S 280 | 1170,2,"Ware, Mr. John James",male,30,1,0,CA 31352,21,,S 281 | 1171,2,"Oxenham, Mr. Percy Thomas",male,22,0,0,W./C. 14260,10.5,,S 282 | 1172,3,"Oreskovic, Miss. Jelka",female,23,0,0,315085,8.6625,,S 283 | 1173,3,"Peacock, Master. Alfred Edward",male,0.75,1,1,SOTON/O.Q. 3101315,13.775,,S 284 | 1174,3,"Fleming, Miss. Honora",female,,0,0,364859,7.75,,Q 285 | 1175,3,"Touma, Miss. Maria Youssef",female,9,1,1,2650,15.2458,,C 286 | 1176,3,"Rosblom, Miss. Salli Helena",female,2,1,1,370129,20.2125,,S 287 | 1177,3,"Dennis, Mr. William",male,36,0,0,A/5 21175,7.25,,S 288 | 1178,3,"Franklin, Mr. Charles (Charles Fardon)",male,,0,0,SOTON/O.Q. 3101314,7.25,,S 289 | 1179,1,"Snyder, Mr. John Pillsbury",male,24,1,0,21228,82.2667,B45,S 290 | 1180,3,"Mardirosian, Mr. Sarkis",male,,0,0,2655,7.2292,F E46,C 291 | 1181,3,"Ford, Mr. Arthur",male,,0,0,A/5 1478,8.05,,S 292 | 1182,1,"Rheims, Mr. George Alexander Lucien",male,,0,0,PC 17607,39.6,,S 293 | 1183,3,"Daly, Miss. Margaret Marcella Maggie""""",female,30,0,0,382650,6.95,,Q 294 | 1184,3,"Nasr, Mr. Mustafa",male,,0,0,2652,7.2292,,C 295 | 1185,1,"Dodge, Dr. Washington",male,53,1,1,33638,81.8583,A34,S 296 | 1186,3,"Wittevrongel, Mr. Camille",male,36,0,0,345771,9.5,,S 297 | 1187,3,"Angheloff, Mr. Minko",male,26,0,0,349202,7.8958,,S 298 | 1188,2,"Laroche, Miss. 
Louise",female,1,1,2,SC/Paris 2123,41.5792,,C 299 | 1189,3,"Samaan, Mr. Hanna",male,,2,0,2662,21.6792,,C 300 | 1190,1,"Loring, Mr. Joseph Holland",male,30,0,0,113801,45.5,,S 301 | 1191,3,"Johansson, Mr. Nils",male,29,0,0,347467,7.8542,,S 302 | 1192,3,"Olsson, Mr. Oscar Wilhelm",male,32,0,0,347079,7.775,,S 303 | 1193,2,"Malachard, Mr. Noel",male,,0,0,237735,15.0458,D,C 304 | 1194,2,"Phillips, Mr. Escott Robert",male,43,0,1,S.O./P.P. 2,21,,S 305 | 1195,3,"Pokrnic, Mr. Tome",male,24,0,0,315092,8.6625,,S 306 | 1196,3,"McCarthy, Miss. Catherine Katie""""",female,,0,0,383123,7.75,,Q 307 | 1197,1,"Crosby, Mrs. Edward Gifford (Catherine Elizabeth Halstead)",female,64,1,1,112901,26.55,B26,S 308 | 1198,1,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.55,C22 C26,S 309 | 1199,3,"Aks, Master. Philip Frank",male,0.83,0,1,392091,9.35,,S 310 | 1200,1,"Hays, Mr. Charles Melville",male,55,1,1,12749,93.5,B69,S 311 | 1201,3,"Hansen, Mrs. Claus Peter (Jennie L Howard)",female,45,1,0,350026,14.1083,,S 312 | 1202,3,"Cacic, Mr. Jego Grga",male,18,0,0,315091,8.6625,,S 313 | 1203,3,"Vartanian, Mr. David",male,22,0,0,2658,7.225,,C 314 | 1204,3,"Sadowitz, Mr. Harry",male,,0,0,LP 1588,7.575,,S 315 | 1205,3,"Carr, Miss. Jeannie",female,37,0,0,368364,7.75,,Q 316 | 1206,1,"White, Mrs. John Stuart (Ella Holmes)",female,55,0,0,PC 17760,135.6333,C32,C 317 | 1207,3,"Hagardon, Miss. Kate",female,17,0,0,AQ/3. 30631,7.7333,,Q 318 | 1208,1,"Spencer, Mr. William Augustus",male,57,1,0,PC 17569,146.5208,B78,C 319 | 1209,2,"Rogers, Mr. Reginald Harry",male,19,0,0,28004,10.5,,S 320 | 1210,3,"Jonsson, Mr. Nils Hilding",male,27,0,0,350408,7.8542,,S 321 | 1211,2,"Jefferys, Mr. Ernest Wilfred",male,22,2,0,C.A. 31029,31.5,,S 322 | 1212,3,"Andersson, Mr. Johan Samuel",male,26,0,0,347075,7.775,,S 323 | 1213,3,"Krekorian, Mr. Neshan",male,25,0,0,2654,7.2292,F E57,C 324 | 1214,2,"Nesson, Mr. Israel",male,26,0,0,244368,13,F2,S 325 | 1215,1,"Rowe, Mr. Alfred G",male,33,0,0,113790,26.55,,S 326 | 1216,1,"Kreuchen, Miss. Emilie",female,39,0,0,24160,211.3375,,S 327 | 1217,3,"Assam, Mr. Ali",male,23,0,0,SOTON/O.Q. 3101309,7.05,,S 328 | 1218,2,"Becker, Miss. Ruth Elizabeth",female,12,2,1,230136,39,F4,S 329 | 1219,1,"Rosenshine, Mr. George (Mr George Thorne"")""",male,46,0,0,PC 17585,79.2,,C 330 | 1220,2,"Clarke, Mr. Charles Valentine",male,29,1,0,2003,26,,S 331 | 1221,2,"Enander, Mr. Ingvar",male,21,0,0,236854,13,,S 332 | 1222,2,"Davies, Mrs. John Morgan (Elizabeth Agnes Mary White) ",female,48,0,2,C.A. 33112,36.75,,S 333 | 1223,1,"Dulles, Mr. William Crothers",male,39,0,0,PC 17580,29.7,A18,C 334 | 1224,3,"Thomas, Mr. Tannous",male,,0,0,2684,7.225,,C 335 | 1225,3,"Nakid, Mrs. Said (Waika Mary"" Mowad)""",female,19,1,1,2653,15.7417,,C 336 | 1226,3,"Cor, Mr. Ivan",male,27,0,0,349229,7.8958,,S 337 | 1227,1,"Maguire, Mr. John Edward",male,30,0,0,110469,26,C106,S 338 | 1228,2,"de Brito, Mr. Jose Joaquim",male,32,0,0,244360,13,,S 339 | 1229,3,"Elias, Mr. Joseph",male,39,0,2,2675,7.2292,,C 340 | 1230,2,"Denbury, Mr. Herbert",male,25,0,0,C.A. 31029,31.5,,S 341 | 1231,3,"Betros, Master. Seman",male,,0,0,2622,7.2292,,C 342 | 1232,2,"Fillbrook, Mr. Joseph Charles",male,18,0,0,C.A. 15185,10.5,,S 343 | 1233,3,"Lundstrom, Mr. Thure Edvin",male,32,0,0,350403,7.5792,,S 344 | 1234,3,"Sage, Mr. John George",male,,1,9,CA. 2343,69.55,,S 345 | 1235,1,"Cardeza, Mrs. James Warburton Martinez (Charlotte Wardle Drake)",female,58,0,1,PC 17755,512.3292,B51 B53 B55,C 346 | 1236,3,"van Billiard, Master. James William",male,,1,1,A/5. 
851,14.5,,S 347 | 1237,3,"Abelseth, Miss. Karen Marie",female,16,0,0,348125,7.65,,S 348 | 1238,2,"Botsford, Mr. William Hull",male,26,0,0,237670,13,,S 349 | 1239,3,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38,0,0,2688,7.2292,,C 350 | 1240,2,"Giles, Mr. Ralph",male,24,0,0,248726,13.5,,S 351 | 1241,2,"Walcroft, Miss. Nellie",female,31,0,0,F.C.C. 13528,21,,S 352 | 1242,1,"Greenfield, Mrs. Leo David (Blanche Strouse)",female,45,0,1,PC 17759,63.3583,D10 D12,C 353 | 1243,2,"Stokes, Mr. Philip Joseph",male,25,0,0,F.C.C. 13540,10.5,,S 354 | 1244,2,"Dibden, Mr. William",male,18,0,0,S.O.C. 14879,73.5,,S 355 | 1245,2,"Herman, Mr. Samuel",male,49,1,2,220845,65,,S 356 | 1246,3,"Dean, Miss. Elizabeth Gladys Millvina""""",female,0.17,1,2,C.A. 2315,20.575,,S 357 | 1247,1,"Julian, Mr. Henry Forbes",male,50,0,0,113044,26,E60,S 358 | 1248,1,"Brown, Mrs. John Murray (Caroline Lane Lamson)",female,59,2,0,11769,51.4792,C101,S 359 | 1249,3,"Lockyer, Mr. Edward",male,,0,0,1222,7.8792,,S 360 | 1250,3,"O'Keefe, Mr. Patrick",male,,0,0,368402,7.75,,Q 361 | 1251,3,"Lindell, Mrs. Edvard Bengtsson (Elin Gerda Persson)",female,30,1,0,349910,15.55,,S 362 | 1252,3,"Sage, Master. William Henry",male,14.5,8,2,CA. 2343,69.55,,S 363 | 1253,2,"Mallet, Mrs. Albert (Antoinette Magnin)",female,24,1,1,S.C./PARIS 2079,37.0042,,C 364 | 1254,2,"Ware, Mrs. John James (Florence Louise Long)",female,31,0,0,CA 31352,21,,S 365 | 1255,3,"Strilic, Mr. Ivan",male,27,0,0,315083,8.6625,,S 366 | 1256,1,"Harder, Mrs. George Achilles (Dorothy Annan)",female,25,1,0,11765,55.4417,E50,C 367 | 1257,3,"Sage, Mrs. John (Annie Bullen)",female,,1,9,CA. 2343,69.55,,S 368 | 1258,3,"Caram, Mr. Joseph",male,,1,0,2689,14.4583,,C 369 | 1259,3,"Riihivouri, Miss. Susanna Juhantytar Sanni""""",female,22,0,0,3101295,39.6875,,S 370 | 1260,1,"Gibson, Mrs. Leonard (Pauline C Boeson)",female,45,0,1,112378,59.4,,C 371 | 1261,2,"Pallas y Castello, Mr. Emilio",male,29,0,0,SC/PARIS 2147,13.8583,,C 372 | 1262,2,"Giles, Mr. Edgar",male,21,1,0,28133,11.5,,S 373 | 1263,1,"Wilson, Miss. Helen Alice",female,31,0,0,16966,134.5,E39 E41,C 374 | 1264,1,"Ismay, Mr. Joseph Bruce",male,49,0,0,112058,0,B52 B54 B56,S 375 | 1265,2,"Harbeck, Mr. William H",male,44,0,0,248746,13,,S 376 | 1266,1,"Dodge, Mrs. Washington (Ruth Vidaver)",female,54,1,1,33638,81.8583,A34,S 377 | 1267,1,"Bowen, Miss. Grace Scott",female,45,0,0,PC 17608,262.375,,C 378 | 1268,3,"Kink, Miss. Maria",female,22,2,0,315152,8.6625,,S 379 | 1269,2,"Cotterill, Mr. Henry Harry""""",male,21,0,0,29107,11.5,,S 380 | 1270,1,"Hipkins, Mr. William Edward",male,55,0,0,680,50,C39,S 381 | 1271,3,"Asplund, Master. Carl Edgar",male,5,4,2,347077,31.3875,,S 382 | 1272,3,"O'Connor, Mr. Patrick",male,,0,0,366713,7.75,,Q 383 | 1273,3,"Foley, Mr. Joseph",male,26,0,0,330910,7.8792,,Q 384 | 1274,3,"Risien, Mrs. Samuel (Emma)",female,,0,0,364498,14.5,,S 385 | 1275,3,"McNamee, Mrs. Neal (Eileen O'Leary)",female,19,1,0,376566,16.1,,S 386 | 1276,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S 387 | 1277,2,"Herman, Miss. Kate",female,24,1,2,220845,65,,S 388 | 1278,3,"Aronsson, Mr. Ernst Axel Algot",male,24,0,0,349911,7.775,,S 389 | 1279,2,"Ashby, Mr. John",male,57,0,0,244346,13,,S 390 | 1280,3,"Canavan, Mr. Patrick",male,21,0,0,364858,7.75,,Q 391 | 1281,3,"Palsson, Master. Paul Folke",male,6,3,1,349909,21.075,,S 392 | 1282,1,"Payne, Mr. Vivian Ponsonby",male,23,0,0,12749,93.5,B24,S 393 | 1283,1,"Lines, Mrs. Ernest H (Elizabeth Lindsey James)",female,51,0,1,PC 17592,39.4,D28,S 394 | 1284,3,"Abbott, Master. 
Eugene Joseph",male,13,0,2,C.A. 2673,20.25,,S 395 | 1285,2,"Gilbert, Mr. William",male,47,0,0,C.A. 30769,10.5,,S 396 | 1286,3,"Kink-Heilmann, Mr. Anton",male,29,3,1,315153,22.025,,S 397 | 1287,1,"Smith, Mrs. Lucien Philip (Mary Eloise Hughes)",female,18,1,0,13695,60,C31,S 398 | 1288,3,"Colbert, Mr. Patrick",male,24,0,0,371109,7.25,,Q 399 | 1289,1,"Frolicher-Stehli, Mrs. Maxmillian (Margaretha Emerentia Stehli)",female,48,1,1,13567,79.2,B41,C 400 | 1290,3,"Larsson-Rondberg, Mr. Edvard A",male,22,0,0,347065,7.775,,S 401 | 1291,3,"Conlon, Mr. Thomas Henry",male,31,0,0,21332,7.7333,,Q 402 | 1292,1,"Bonnell, Miss. Caroline",female,30,0,0,36928,164.8667,C7,S 403 | 1293,2,"Gale, Mr. Harry",male,38,1,0,28664,21,,S 404 | 1294,1,"Gibson, Miss. Dorothy Winifred",female,22,0,1,112378,59.4,,C 405 | 1295,1,"Carrau, Mr. Jose Pedro",male,17,0,0,113059,47.1,,S 406 | 1296,1,"Frauenthal, Mr. Isaac Gerald",male,43,1,0,17765,27.7208,D40,C 407 | 1297,2,"Nourney, Mr. Alfred (Baron von Drachstedt"")""",male,20,0,0,SC/PARIS 2166,13.8625,D38,C 408 | 1298,2,"Ware, Mr. William Jeffery",male,23,1,0,28666,10.5,,S 409 | 1299,1,"Widener, Mr. George Dunton",male,50,1,1,113503,211.5,C80,C 410 | 1300,3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q 411 | 1301,3,"Peacock, Miss. Treasteall",female,3,1,1,SOTON/O.Q. 3101315,13.775,,S 412 | 1302,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.75,,Q 413 | 1303,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37,1,0,19928,90,C78,Q 414 | 1304,3,"Henriksson, Miss. Jenny Lovisa",female,28,0,0,347086,7.775,,S 415 | 1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S 416 | 1306,1,"Oliva y Ocana, Dona. Fermina",female,39,0,0,PC 17758,108.9,C105,C 417 | 1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S 418 | 1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S 419 | 1309,3,"Peter, Master. 
Michael J",male,,1,1,2668,22.3583,,C 420 | -------------------------------------------------------------------------------- /Stacking/两层stacking结构理解beta.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/Stacking/两层stacking结构理解beta.pdf -------------------------------------------------------------------------------- /Xgboost/XGBoost_code/XGBoost算法代码简易实现.md: -------------------------------------------------------------------------------- 1 | 2 | #### 1.部分重要np函数实例,便于理解后面的np调用 3 | 当你看不懂部分代码的时候,请结合引用作者的文档 4 | https://www.zhihu.com/people/chen-zhen-64-12/columns 5 | 6 | 7 | ```python 8 | from __future__ import division, print_function 9 | import numpy as np 10 | import pandas as pd 11 | ``` 12 | 13 | np.ones_like测试实例 14 | 15 | 16 | ```python 17 | test = np.array([[1,2,3],[4,5,6]]) 18 | test_one = np.ones_like(test) 19 | print(test_one) 20 | print(test_one.sum()) 21 | ``` 22 | 23 | [[1 1 1] 24 | [1 1 1]] 25 | 6 26 | 27 | 28 | reshape测试实例 29 | 30 | 31 | ```python 32 | K = np.arange(6) 33 | print(K) 34 | T = K.reshape((3,2)) 35 | print(T) 36 | ``` 37 | 38 | [0 1 2 3 4 5] 39 | [[0 1] 40 | [2 3] 41 | [4 5]] 42 | 43 | 44 | shape测试实例 45 | 46 | 47 | ```python 48 | K = np.arange(6) 49 | print(K) 50 | type(np.shape(K)[0]/2) 51 | ``` 52 | 53 | [0 1 2 3 4 5] 54 | 55 | 56 | 57 | 58 | 59 | float 60 | 61 | 62 | 63 | 测试数组相减求和 64 | 65 | 66 | ```python 67 | def gain(y,y_pred): 68 | K = (y - y_pred).sum() 69 | return K 70 | y = np.array([[1,2],[1,2]]) 71 | y_pred = np.array([[2,3],[2,9]]) 72 | gain(y,y_pred) 73 | ``` 74 | 75 | 76 | 77 | 78 | -10 79 | 80 | 81 | 82 | 定义一个三元二次方程理解np.power的作用 83 | 84 | 85 | ```python 86 | def funciton_test(x_1,x_2,x_3): 87 | y = np.power(x_1,2)+3*x_2+x_3 88 | return y 89 | funciton_test(2,1,3) 90 | ``` 91 | 92 | 93 | 94 | 95 | 10 96 | 97 | 98 | 99 | 测试reshape的用途 100 | 101 | 102 | ```python 103 | z = np.array([[1, 2, 3, 4], 104 | [5, 6, 7, 8], 105 | [9, 10, 11, 12], 106 | [13, 14, 15, 16]]) 107 | print('当前的行列数:',z.shape) 108 | A = np.reshape(z,(8,-1)) 109 | print('reshape后的行列数:',A.shape) 110 | print(A) 111 | ``` 112 | 113 | 当前的行列数: (4, 4) 114 | reshape后的行列数: (8, 2) 115 | [[ 1 2] 116 | [ 3 4] 117 | [ 5 6] 118 | [ 7 8] 119 | [ 9 10] 120 | [11 12] 121 | [13 14] 122 | [15 16]] 123 | 124 | 125 | #### 2.XGBoost正式代码部分 126 | 127 | __future__模块,把下一个新版本的特性导入到当前版本,于是我们就可以在当前版本中测试一些新版本的特性,解决python2中运行pytho3兼容性问题 128 | 129 | 如果某个版本中出现了某个新的功能特性,而且这个特性和当前版本中使用的不兼容 130 | 131 | 也就是它在该版本中不是语言标准,那么我如果想要使用的话就需要从future模块导入 132 | 133 | division 表示精确除法 134 | 135 | progressbar显示完成的进度条 136 | 137 | 138 | ```python 139 | # xgboost算法也将决策树算法作为基函数进行使用 140 | # 导入进度条调度函数,方便展示模型训练进度和倒计时 141 | from utils.decision_tree.decision_tree_model import DecisionTree 142 | from utils.misc import bar_widgets 143 | import progressbar 144 | ``` 145 | 146 | 147 | ```python 148 | # 最小二乘损失 1/2(x-x_0)^2,看不懂下面的两个函数就请对这个函数进行一阶导数求导和二阶导数求导,其中x_0是常数项 149 | class LeastSquaresLoss(): 150 | """Least squares loss""" 151 | 152 | # 定义梯度函数(最小二乘的一阶导数),参数包括真实值和预测值 153 | def gradient(self, actual, predicted): 154 | return actual - predicted 155 | 156 | # 定义海塞函数(最小二乘的二阶导数),参数包括真实值和预测值 157 | # np.ones_like返回一个用1填充所有元素的同型数组或者同型矩阵,因为最小二乘损失的二阶导数是 158 | def hess(self, actual, predicted): 159 | return np.ones_like(actual) 160 | ``` 161 | 162 | 163 | ```python 164 | isinstance(LeastSquaresLoss,object) 165 | ``` 166 | 167 | 168 | 169 | 170 | True 171 | 172 | 173 | 174 | 175 | ```python 176 | # 
XGBoost回归树,从父类决策树继承,是决策树的子类 177 | # 特别说明一点,GBDT和XGBoost在分类问题上都是先调用回归树,然后通过sigmoid函数对输出值做概率转换判断分类 178 | # 有些时候,你会看到以一个下划线开头的实例变量名,比如_name,这样的实例变量外部是可以访问的 179 | # 但按照约定俗成的规定,当你看到这样的变量时,意思是,“虽然我可以被访问,但是,请把我视为私有变量,不要随意访问”。 180 | class XGBoostRegressionTree(DecisionTree): 181 | """ 182 | Regression tree for XGBoost 183 | - 参考文档 - 184 | http://xgboost.readthedocs.io/en/latest/model.html 185 | """ 186 | 187 | # y输入是一个矩阵,np.shape计算矩阵的行数和列数,此处取矩阵列数的一半作为划分点 188 | # 此处划分的目的在于将label划分为两部分 189 | def _split(self, y): 190 | """ y contains y_true in left half of the middle column and 191 | y_pred in the right half. Split and return the two matrices """ 192 | col = int(np.shape(y)[1]/2) 193 | y, y_pred = y[:, :col], y[:, col:] 194 | return y, y_pred 195 | 196 | # 定义打分函数增益值,此处忽略正则化参数λ 197 | # 函数计算切分后的数据集的Gain值 198 | # 这个类本身并没有定义loss属性,一阶导数和二阶导数来自实例化时传入的loss对象,在下面的XGBoost类里损失函数被定义为最小二乘损失 199 | def _gain(self, y, y_pred): 200 | 201 | # 假设这里的函数是平方误差,那么梯度就是残差,这里的结果就是对矩阵求元素对应位置相减,然后对所有元素求和,最后求平方 202 | nominator = np.power((self.loss.gradient(y, y_pred)).sum(), 2) 203 | # hess返回一个与y同型、元素全部为1的数组(最小二乘损失的二阶导数恒为1),求和后即样本数 204 | denominator = self.loss.hess(y, y_pred).sum() 205 | return 0.5 * (nominator / denominator) 206 | 207 | # 该函数通过调用_gain()来计算树节点的纯度,并以此来作为树是否分割的标准 208 | # 对输入的三个参数均执行相同的切分操作,切分为两部分 209 | 210 | def _gain_by_taylor(self, y, y1, y2): 211 | # Split 212 | y, y_pred = self._split(y) 213 | y1, y1_pred = self._split(y1) 214 | y2, y2_pred = self._split(y2) 215 | 216 | # 对三个切分好的参数分别计算最终的增益系数 217 | true_gain = self._gain(y1, y1_pred) 218 | false_gain = self._gain(y2, y2_pred) 219 | gain = self._gain(y, y_pred) 220 | return true_gain + false_gain - gain 221 | 222 | 223 | # 此处忽略了正则化参数λ,因此函数名为近似更新 224 | # 将approximate_update()作为估算子节点取值的方法 225 | # xgboost的树切割完成后,每个叶子节点的取值都已经可以计算,这里返回每个叶节点的预测分数 226 | 227 | def _approximate_update(self, y): 228 | # y split into y, y_pred 229 | y, y_pred = self._split(y) 230 | gradient = np.sum(self.loss.gradient(y, y_pred),axis=0) 231 | hessian = np.sum(self.loss.hess(y, y_pred), axis=0) 232 | # 这里特别注意计算梯度的时候,使用的是最小二乘损失,即(真实值-预测值)的平方,这个地方y是真实值,y_pred是预测值 233 | # 由于gradient取的是(真实值-预测值)而不是损失函数严格意义上的一阶导数(预测值-真实值),所以这里计算update_approximation时没有负号,按照XGBoost的公式推导本来是有负号的 234 | update_approximation = gradient / hessian 235 | return update_approximation 236 | 237 | # 将_gain_by_taylor()作为切割树的标准,将_approximate_update()作为估算子节点取值的方法,传递回给DecisionTree,并以此来构建决策树 238 | # 很多人会看不懂下面这个super函数,这里看起来是继承了XGBoostRegressionTree本身,实际上并不是 239 | # super(XGBoostRegressionTree, self)拿到的是父类DecisionTree,这里调用的是父类的fit方法 240 | # XGBoostRegressionTree(DecisionTree)这个类是个子类,大家有没有发现这个子类并没有定义__init__方法 241 | # 那是因为,在单层继承中,子类若不自定义__init__,将直接继承父类的__init__来完成实例化 242 | 243 | # 这里训练完成后返回的决策树的相关参数,也就是模型 244 | def fit(self, X, y): 245 | self._impurity_calculation = self._gain_by_taylor 246 | self._leaf_value_calculation = self._approximate_update 247 | super(XGBoostRegressionTree, self).fit(X, y) 248 | ``` 249 | 250 |
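上面的 `_gain` 和 `_approximate_update` 都令正则化参数λ=0。下面补充一个极简示意(非原仓库实现,`lam` 是为说明而假设的参数名),展示若把被忽略的λ加回来,叶子权重和结构分数的公式会如何变化:

```python
import numpy as np

# 按XGBoost论文,叶子最优权重 w* 与叶子结构分数 score 为:
#   w* = G / (H + λ)(本仓库的gradient取的是(y - y_pred),故无负号)
#   score = G**2 / (2 * (H + λ))
# λ=0 时分别退化为上面 _approximate_update 和 _gain 的实现
def leaf_weight(G, H, lam=1.0):
    return G / (H + lam)

def leaf_score(G, H, lam=1.0):
    return 0.5 * G ** 2 / (H + lam)

# 最小二乘损失下:G = Σ(y - y_pred),H = 二阶导数(恒为1)之和 = 样本数
y = np.array([5.56, 5.70, 5.91]); y_pred = np.zeros(3)
G, H = (y - y_pred).sum(), float(len(y))
print(leaf_weight(G, H, lam=0.0))  # λ=0,对应 _approximate_update 的结果
print(leaf_score(G, H, lam=1.0))   # λ>0 使分数变小,从而抑制分裂,起到类似剪枝的作用
```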
251 | ```python 252 | # 定义XGBoost模型类。docstring沿用了原仓库的classifier说法,但基学习器是XGBoostRegressionTree,分类问题也是先回归再做概率转换 253 | class XGBoost(object): 254 | """The XGBoost classifier. 255 | 256 | 参考文档: http://xgboost.readthedocs.io/en/latest/model.html 257 | 258 | Parameters: 259 | ----------- 260 | n_estimators: int 261 | 树的数量 262 | The number of classification trees that are used. 263 | learning_rate: float 264 | 梯度下降的学习率 265 | The step length that will be taken when following the negative gradient during 266 | training. 267 | min_samples_split: int 268 | 每棵子树的节点的最小数目(小于后不继续切割) 269 | The minimum number of samples needed to make a split when building a tree. 270 | min_impurity: float 271 | 每棵子树的最小纯度(小于后不继续切割) 272 | The minimum impurity required to split the tree further. 273 | max_depth: int 274 | 每棵子树的最大层数(大于后不继续切割) 275 | The maximum depth of a tree. 276 | """ 277 | 278 | # 构建一个含有n_estimators棵XGBoostRegressionTree的类 279 | def __init__(self, n_estimators=200, learning_rate=0.01, min_samples_split=2, 280 | min_impurity=1e-7, max_depth=2): 281 | self.n_estimators = n_estimators # 树最大生成数量 282 | self.learning_rate = learning_rate # 权重更新步长 283 | self.min_samples_split = min_samples_split # 每棵子树的节点的最小数目(小于后不继续切割) 284 | self.min_impurity = min_impurity # 每棵子树的最小纯度(小于后不继续切割),标准是最小方差 285 | self.max_depth = max_depth # 每棵子树的最大层数(大于后不继续切割) 286 | 287 | self.bar = progressbar.ProgressBar(widgets=bar_widgets) 288 | 289 | # 定义损失函数为最小二乘损失 290 | self.loss = LeastSquaresLoss() 291 | 292 | # 初始化回归树 293 | # for _ in range()表示不关心具体元素内容,就是简单的让下面的循环执行range()次,_表示占位符 294 | self.trees = [] 295 | for _ in range(n_estimators): 296 | tree = XGBoostRegressionTree( 297 | min_samples_split=self.min_samples_split, 298 | min_impurity=min_impurity, 299 | max_depth=self.max_depth, 300 | loss=self.loss) 301 | 302 | self.trees.append(tree) 303 | 304 | 305 | # np.concatenate 按轴向将两个数组组成一个新数组 306 | # 先对X计算数据的样本总量m 307 | # 再将y重塑为m行的列向量准备用于训练 308 | def fit(self, X, y): 309 | # y = to_categorical(y) 310 | m = X.shape[0] 311 | y = np.reshape(y, (m, -1)) 312 | # 生成一个全部为0的、与y行列数相同的数组,该数组主要用于预测值的初始化 313 | # np.concatenate 无法对一维数组按axis=1拼接,但对二维及以上的数组可以;这里按axis=1拼接是把y和y_pred横向拼成一个矩阵 314 | y_pred = np.zeros(np.shape(y)) 315 | for i in self.bar(range(self.n_estimators)): 316 | tree = self.trees[i] 317 | y_and_pred = np.concatenate((y, y_pred), axis=1) 318 | tree.fit(X, y_and_pred) 319 | # 这里是调用了决策树基函数的predict方法,逐个对样本进行预测并返回预测值 320 | update_pred = tree.predict(X) 321 | update_pred = np.reshape(update_pred, (m, -1)) 322 | # 加法模型,预测值是当前轮和上一轮叠加的结果 323 | y_pred += update_pred 324 | 325 | def predict(self, X): 326 | y_pred = None 327 | m = X.shape[0] 328 | # 开始预测 329 | for tree in self.trees: 330 | # 估计梯度和更新预测值 331 | update_pred = tree.predict(X) 332 | update_pred = np.reshape(update_pred, (m, -1)) 333 | if y_pred is None: 334 | # 初始化一个全部为0的同型数组或者同型矩阵 335 | y_pred = np.zeros_like(update_pred) 336 | y_pred += update_pred 337 | 338 | return y_pred 339 | ``` 340 | 341 | 342 | ```python 343 | # 开始使用实例进行测试,实例数据TempLinkoping2016.txt在GBDT文件夹可以找到 344 | import matplotlib.pyplot as plt 345 | from utils.data_manipulation import train_test_split, standardize, to_categorical, normalize 346 | from utils.data_operation import mean_squared_error, accuracy_score 347 | 348 | def main(): 349 | print ("-- XGBoost --") 350 | 351 | # 加载温度数据集,本问题为回归问题 352 | data = pd.read_csv('D:\Machine-Learning-From-Scratch-master\TempLinkoping2016.txt', sep="\t") 353 | 354 | time = np.atleast_2d(data["time"].values).T 355 | temp = np.atleast_2d(data["temp"].values).T 356 | 357 | X = time.reshape((-1, 1)) # Time. Fraction of the year [0, 1] 358 | X = np.insert(X, 0, values=1, axis=1) # 插入偏差项 359 | y = temp[:, 0] # Temperature. 
Reduce to one-dim 360 | 361 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 362 | #print(y_train) 363 | model = XGBoost() 364 | model.fit(X_train, y_train) 365 | y_pred = model.predict(X_test) 366 | 367 | y_pred_line = model.predict(X) 368 | print(y_test[0:5]) 369 | # Color map 370 | cmap = plt.get_cmap('viridis') 371 | 372 | mse = mean_squared_error(y_test, y_pred) 373 | 374 | print ("Mean Squared Error:", mse) 375 | 376 | # Plot the results 377 | m1 = plt.scatter(366 * X_train[:, 1], y_train, color=cmap(0.9), s=10) 378 | m2 = plt.scatter(366 * X_test[:, 1], y_test, color=cmap(0.5), s=10) 379 | m3 = plt.scatter(366 * X_test[:, 1], y_pred, color='black', s=10) 380 | plt.suptitle("Regression Tree") 381 | plt.title("MSE: %.2f" % mse, fontsize=10) 382 | plt.xlabel('Day') 383 | plt.ylabel('Temperature in Celsius') 384 | plt.legend((m1, m2, m3), ("Training data", "Test data", "Prediction"), loc='lower right') 385 | plt.show() 386 | 387 | 388 | 389 | # 小明.py 390 | # 朋友眼中你是小明(__name__ == '小明'), 你自己眼中你是你自己(__name__ == '__main__') 391 | # 你编程很好, 朋友调你去帮他写程序(import 小明, 这时你在朋友眼中: __name__ == '小明') 392 | # 但你晚上也会打开xx网站, 做一些自己的事情(直接运行小明.py, __name__ == '__main__') 393 | # 也就是说,当你被别的文件导入的时候,你的__name__并不是'__main__',而是导入的模块名称,因此被别人导入的时候,下面的代码不会被执行 394 | if __name__ == "__main__": 395 | main() 396 | ``` 397 | 398 | Training: 0% [ ] ETA: --:--:-- 399 | 400 | -- XGBoost -- 401 | 402 | 403 | Training: 100% [------------------------------------------------] Time: 0:00:47 404 | 405 | 406 | [18.8 6.1 -0.8 17.6 5.2] 407 | Mean Squared Error: 122.01499106753589 408 | 409 | 410 | 411 | ![png](output_21_4.png) 412 | 413 | 414 | 415 | ```python 416 | data = pd.read_csv('D:\Machine-Learning-From-Scratch-master\TempLinkoping2016.txt', sep="\t") 417 | ``` 418 | 419 | 420 | ```python 421 | data.head(1) 422 | ``` 423 | 424 | 425 | 426 | 427 |
428 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 |
timetemp
00.0027320.1
457 |
458 | 459 | 460 | 461 | 462 | ```python 463 | import matplotlib.pyplot as plt 464 | 465 | from utils.data_manipulation import train_test_split, standardize, to_categorical, normalize 466 | from utils.data_operation import mean_squared_error, accuracy_score 467 | 468 | def main(): 469 | print ("-- XGBoost --") 470 | 471 | # 加载《统计学习方法》例8.2 472 | x = np.array(range(1,11,1)) 473 | y = np.array([5.56,5.70,5.91,6.40,6.80,7.05,8.9,8.7,9.00,9.05]) 474 | data = pd.DataFrame([x,y]).T 475 | data.columns=['x','y'] 476 | 477 | X = np.atleast_2d(data["x"].values).T 478 | Y = np.atleast_2d(data["y"].values).T 479 | 480 | X = X.reshape((-1, 1)) 481 | X = np.insert(X, 0, values=1, axis=1) 482 | y = Y[:, 0] 483 | 484 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) 485 | #print(y_train) 486 | model = XGBoost() 487 | model.fit(X_train, y_train) 488 | y_pred = model.predict(X_test) 489 | 490 | y_pred_line = model.predict(X) 491 | print(y_test[0:5]) 492 | # Color map 493 | cmap = plt.get_cmap('viridis') 494 | 495 | mse = mean_squared_error(y_test, y_pred) 496 | 497 | print ("Mean Squared Error:", mse) 498 | 499 | # Plot the results 500 | m1 = plt.scatter(366 * X_train[:, 1], y_train, color=cmap(0.9), s=10) 501 | m2 = plt.scatter(366 * X_test[:, 1], y_test, color=cmap(0.5), s=10) 502 | m3 = plt.scatter(366 * X_test[:, 1], y_pred, color='black', s=10) 503 | plt.suptitle("Regression Tree") 504 | plt.title("MSE: %.2f" % mse, fontsize=10) 505 | plt.xlabel('X') 506 | plt.ylabel('Y') 507 | plt.legend((m1, m2, m3), ("Training data", "Test data", "Prediction"), loc='lower right') 508 | plt.show() 509 | 510 | 511 | if __name__ == "__main__": 512 | main() 513 | ``` 514 | 515 | Training: 44% [--------------------- ] ETA: 0:00:00 516 | 517 | -- XGBoost -- 518 | 519 | 520 | Training: 100% [------------------------------------------------] Time: 0:00:00 521 | 522 | 523 | [8.9 9. 
] 524 | Mean Squared Error: 1.838750000000001 525 | 526 | 527 | 528 | ![png](output_24_4.png) 529 | 530 | 531 | 532 | ```python 533 | print('\n'.join([''.join([('365'[(x-y) % len('365')] if 534 | ((x*0.05)**2+(y*0.1)**2-1)**3-(x*0.05)**2*(y*0.1)**3 <= 0 else ' ') 535 | for x in range(-30, 30)]) for y in range(30, -30, -1)])) 536 | ``` 537 | 538 | 539 | 540 | 541 | 542 | 543 | 544 | 545 | 546 | 547 | 548 | 549 | 550 | 551 | 552 | 553 | 554 | 555 | 556 | 653653653 365365365 557 | 65365365365365365 36536536536536536 558 | 36536536536536536536536536536536536536536 559 | 3653653653653653653653653653653653653653653 560 | 365365365365365365365365365365365365365365365 561 | 653653653653653653653653653653653653653653653 562 | 536536536536536536536536536536536536536536536 563 | 365365365365365365365365365365365365365365365 564 | 653653653653653653653653653653653653653653653 565 | 536536536536536536536536536536536536536536536 566 | 6536536536536536536536536536536536536536536 567 | 36536536536536536536536536536536536536536 568 | 65365365365365365365365365365365365365365 569 | 6536536536536536536536536536536536536 570 | 36536536536536536536536536536536536 571 | 536536536536536536536536536536536 572 | 53653653653653653653653653653 573 | 5365365365365365365365365 574 | 536536536536536536536 575 | 365365365365365 576 | 653653653 577 | 536 578 | 6 579 | 580 | 581 | 582 | 583 | 584 | 585 | 586 | 587 | 588 | 589 | 590 | 591 | 592 | 593 | 594 | 595 | 596 | 597 | 598 | 599 | 600 | 601 | ```python 602 | 603 | ``` 604 | -------------------------------------------------------------------------------- /Xgboost/XGBoost_code/output_21_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/Xgboost/XGBoost_code/output_21_4.png -------------------------------------------------------------------------------- /Xgboost/XGBoost_code/output_24_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/Xgboost/XGBoost_code/output_24_4.png -------------------------------------------------------------------------------- /Xgboost/readme.md: -------------------------------------------------------------------------------- 1 | #### XGBoost学习方法 2 | 如果有大佬看到代码内容有问题并愿意指正,请及时联系cethuang@gmail.com,诚挚感谢指导 3 | 4 | 要学习代码,建议先看如下内容!!! 5 | 6 | v1.2 20190722 7 | 8 | - XGBoost的代码实现主要分为以下步骤 9 | 10 | 1. 代码主要使用的损失函数为最小二乘损失,这个损失函数的使用简单,理解方便,不清楚的可以在网上自行搜索资料学习透彻后再进行下一步 11 | 2. 代码主要计算了最小二乘损失的梯度和海赛函数,梯度实际上是函数的一阶导数,海赛实际上是函数的二阶导数,是一阶导函数的导数。这样组成的函数可以构建二阶泰勒公式,泰勒公式主要用于定义一个新的更方便求解极值的函数用于近似原函数。 12 | 3. XGBoost的相比GBDT更大的进步不仅仅在于使用了二阶泰勒公式(GBDT使用的是梯度),能够更好的拟合损失函数。另外一个大的特点是在损失函数中加入了正则化项,实际上,正则化项常见于逻辑斯蒂回归、SVM、神经网络等,这些正则化项主要是对参数θ进行正则化。而XGBoost的正则化主要是对叶子节点进行正则化,这实际上是希望对叶子节点进行惩罚,如果叶子节点生成过多,那么可以通过正则化项对叶子节点这一项的总值进行缩小,那么实际上就起到了剪枝的作用;同时XGBoost还对决策树的分裂次数进行正则化,这实际上是希望分裂出来的决策树相对简单,那么可以更好的解决对未知数据预测能力的过拟合问题。根据《统计学习方法》决策树相关章节的论述,决策树模型的损失函数同样也加入了正则化项目,但是决策树仅仅对叶节点加入了正则化,并未对决策树的分裂次数进行正则化。 13 | 4. 本代码实现为了简单,方便理解,在所有的公式中,都没有加入正则化项,也就是令λ=0,w=0。至于不明白正则化作用的,建议学习的内容是吴恩达的网易云课堂的机器学习正则化相关视频,可以清晰的理解正则化项到底对损失函数做了什么。 14 | 5. 本代码基于GBDT迭代生成,因此相关的测试数据均可以在本仓库下的GBDT下学习后再进行,强烈不建议没有掌握GBDT相关知识的同学直接上来学习XGBoost。 15 | 6. 正确的学习路径应该是决策树-CART回归树-Boosting-AdaBoost-GBT-GBDT-XGBoost-LightGBM。XGBoost相关的列采样方案建议还需要学习补充随机森林相关算法知识。决策树到GBT相关知识非常建议学习《统计学习方法》相关章节,内容翔实,案例简单。 16 | 7. 
XGBoost算法的核心代码主要是基于面向对象设计,对super超类继承等方法不熟悉的同学还需要补充面向对象相关方法,方可理解本代码实现。本代码实现注释了非常多的内容,虽然繁琐,但是方便初学者理解。 17 | 8. 学习本代码可以先结合本仓库GBDT下的GBDT_XGBoost_LGBM算法原理v1.1.PPT对XGBoost和相关公式推导有一个概念理解,然后再进行代码阅读。 18 | 9. 本代码中定义的Gain增益系数,和决策树的Gini基尼系数是两个概念,请勿混淆,至于具体的XGBoost的Gain增益值是如何计算并打分的,请参照第8点的材料进行阅读。 19 | 20 | 21 | v1.1 20190715 22 | 23 | - 添加知乎上一张图说明XGBoost算法的图例,可以结合wepon的算法材料学习 24 | 25 | 26 | 27 | v1.0 20190713 28 | 29 | - 本方法仅建议机器学习初学者参考,大佬求放过 30 | - 对于初学者,不要看到各种Kaggle比赛、腾讯广告算法大赛的大佬用了LGBM(XGBoost变种)等算法,就想着跳过经典机器学习算法,直接去学最牛逼的算法 31 | - 所有牛逼的算法都有一个牛逼的爹。XGBoost很牛逼,是因为有多个牛逼的爹。列举部分如下:决策树爹、正则爹、泰勒公式爹、随机森林爹(列采样基因)、排序爹(特征预排序让分裂点查找可以并行,并非Boosting本身并行)、梯度下降爹…… 32 | - 你应该先去看李航《统计学习方法》中关于CART决策树的基础理论,掌握一棵决策树是如何遍历所有切分点然后找到最佳切分点的 33 | - 你应该先去学习梯度下降算法,至少应该看懂一元函数和多元函数求偏导数的方法,为此你还需要去复习一下极限理论 34 | - 你应该先去学习决策树的损失函数,了解决策树的损失函数是如何控制分裂点的 35 | - 你应该先去学习正则化的思想,强烈建议去看网易云课堂的吴恩达机器学习关于正则化的相关解释,非常适合我这种白痴,至少应该懂得惩罚项到底在干嘛。我一开始看到惩罚项,以为是前面的损失函数做错了什么事情,例如私下河塘洗澡之类的需要受到惩罚,损失函数到底做错了什么。正则化有L1和L2两种,至少应该理解经典的等高线图的交叉点概念 36 | - 先学习Boosting算法,掌握加法模型和Bagging算法的不同之处在哪儿,比如和随机森林的区别点在什么地方 37 | - 然后去掌握提升树算法,理解提升树在拟合数据的时候使用残差的概念,实际上就是在求平方损失函数的负梯度。 38 | - 然后去掌握梯度提升树算法,理解为何需要升级成这个算法:在面临什么场景的时候提升树算法就不灵了,需要使用梯度来求解类似SoftMax这样的损失函数 39 | - 然后你理解了上面的GBDT,你可以开始尝试去学习XGBoost。别急,看蛋疼的原论文的时候很痛苦,找一下知乎,找一下github上可能是东半球最大的学习组织(罗永浩?)等地方,慢慢地啃,一步步地来,才是适合我这样的白痴的学习方法。 40 | -------------------------------------------------------------------------------- /Xgboost/【HP20190616】xgboost_titanic.md: -------------------------------------------------------------------------------- 1 | 2 | ## kaggle泰坦尼克号机器学习xgboost 3 | 4 | 5 | ```python 6 | import numpy as np 7 | import pandas as pd 8 | import re 9 | import sklearn 10 | import os 11 | # 显示当前路径 12 | os.getcwd() 13 | ``` 14 | 15 | 16 | 17 | 18 | 'D:\\jupyter_notebook' 19 | 20 | 21 | 22 | 23 | ```python 24 | # 导入数据 25 | train_ = pd.read_csv('D:/jupyter_notebook/titanic/train.csv') 26 | test_ = pd.read_csv('D:/jupyter_notebook/titanic/test.csv') 27 | ``` 28 | 29 | 30 | ```python 31 | # 根据原始特征的观察构建新特征 32 | # 计算名字的长度 33 | train_['Name_length'] = train_['Name'].apply(len) 34 | # 将旅客是否有客舱号记录二值化(Cabin缺失时为浮点型NaN,故用type判断) 35 | train_['Has_Cabin'] = train_["Cabin"].apply(lambda x: 0 if type(x) == float else 1) 36 | # 构建新特征家庭总人数 37 | train_['FamilySize'] = train_['SibSp'] + train_['Parch'] + 1 38 | # 构建新特征是否独居 39 | train_['IsAlone'] = 0 40 | train_.loc[train_['FamilySize'] == 1, 'IsAlone'] = 1 41 | # 查看乘客登船口岸是否存在缺失值 42 | train_['Embarked'].isnull().value_counts() 43 | # 对乘客登船口岸用固定值填充缺失值 44 | train_['Embarked'] = train_['Embarked'].fillna('S') 45 | # 对票价用中位数填充缺失值 46 | train_['Fare'] = train_['Fare'].fillna(train_['Fare'].median()) 47 | # 生成绝对票价分区,qcut按分位数切分,将每一个值划分到具体的分区区间中去,此处定义为四分位 48 | train_['CategoricalFare'] = pd.qcut(train_['Fare'], 4) 49 | # 生成新变量年龄平均值、年龄标准差 50 | age_avg = train_['Age'].mean() 51 | age_std = train_['Age'].std() 52 | # 统计年龄的缺失值数量 53 | age_null_count = train_['Age'].isnull().sum() 54 | # np.random.randint()产生离散均匀分布的整数,size是产生的元素数量,前两个参数分别为区间的最小值和最大值 55 | age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count) 56 | # 对年龄用生成的一些新数值进行填充 57 | train_['Age'][np.isnan(train_['Age'])] = age_null_random_list 58 | # 转换变量类型为数值类型,便于后期计算 59 | train_['Age'] = train_['Age'].astype(int) 60 | # 对年龄生成新的分箱变量来代替,即将年龄绝对值转换为离散类别 61 | train_['CategoricalAge'] = pd.cut(train_['Age'], 5) 62 | 63 | # 定义正则表达式函数导出旅客的Title 64 | def get_title(name): 65 | # re.search()方法扫描整个字符串,并返回第一个成功的匹配。如果匹配失败,则返回None 66 | title_search = re.search('([A-Za-z]+)\.',name) 67 | if title_search: 68 | return title_search.group(1) 69 | return '' 70 | 71 | # 
取出姓名中尊称部分 72 | train_['Title'] = train_['Name'].apply(get_title) 73 | 74 | # 对姓名的称呼部分做统一 75 | train_['Title'] = train_['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major' 76 | , 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare') 77 | train_['Title'] = train_['Title'].replace('Mlle', 'Miss') 78 | train_['Title'] = train_['Title'].replace('Ms', 'Miss') 79 | train_['Title'] = train_['Title'].replace('Mme', 'Mrs') 80 | 81 | # 对性别从离散型替换为数值型 82 | train_['Sex'] = train_['Sex'].map( {'female': 0, 'male': 1} ).astype(int) 83 | 84 | # 对姓名的称呼部分做数值型变换 85 | title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5} 86 | # 先定义一个字典,然后通过map函数传入字典进行替换 87 | train_['Title'] = train_['Title'].map(title_mapping) 88 | # 最后对缺失值替换为0 89 | train_['Title'] = train_['Title'].fillna(0) 90 | 91 | # 替换登船口岸 92 | train_['Embarked'] = train_['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int) 93 | 94 | # 替换票价的四分位数,该步骤应该有更好的办法做数据处理 95 | # loc函数取出列中某类元素的数据集 96 | train_.loc[ train_['Fare'] <= 7.91, 'Fare'] = 0 97 | train_.loc[(train_['Fare'] > 7.91) & (train_['Fare'] <= 14.454), 'Fare'] = 1 98 | train_.loc[(train_['Fare'] > 14.454) & (train_['Fare'] <= 31), 'Fare'] = 2 99 | train_.loc[ train_['Fare'] > 31, 'Fare'] = 3 100 | train_['Fare'] = train_['Fare'].astype(int) 101 | 102 | # 对年龄进行分段 103 | train_.loc[ train_['Age'] <= 16, 'Age'] = 0 104 | train_.loc[(train_['Age'] > 16) & (train_['Age'] <= 32), 'Age'] = 1 105 | train_.loc[(train_['Age'] > 32) & (train_['Age'] <= 48), 'Age'] = 2 106 | train_.loc[(train_['Age'] > 48) & (train_['Age'] <= 64), 'Age'] = 3 107 | train_.loc[train_['Age'] > 64, 'Age'] = 4 108 | 109 | 110 | # 特征选择,先对处理过的不需要的特征进行删除,定义一个列表,然后批量删除 111 | drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp'] 112 | train_ = train_.drop(drop_elements, axis = 1) 113 | train_ = train_.drop(['CategoricalAge', 'CategoricalFare'], axis = 1) 114 | # test_ = test_.drop(drop_elements, axis = 1) 115 | 116 | train_.head() 117 | ``` 118 | 119 | C:\Users\IBM\Anaconda3\lib\site-packages\ipykernel_launcher.py:27: SettingWithCopyWarning: 120 | A value is trying to be set on a copy of a slice from a DataFrame 121 | 122 | See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy 123 | 124 | 125 | 126 | 127 | 128 |
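注:上面 `train_['Age'][np.isnan(train_['Age'])] = age_null_random_list` 这种链式索引赋值,正是触发上方 SettingWithCopyWarning 的原因。一个惯用的等价写法(仅为示意,沿用上面已定义的变量)是用 `.loc` 一步完成条件赋值:

```python
# 等价写法示意:布尔掩码 + .loc 单步赋值,不会触发 SettingWithCopyWarning
# 掩码为True的行数等于 age_null_count,与右侧随机数组长度一致
train_.loc[train_['Age'].isnull(), 'Age'] = age_null_random_list
```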
129 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 |
SurvivedPclassSexAgeParchFareEmbarkedName_lengthHas_CabinFamilySizeIsAloneTitle
00311000230201
11102031511203
21301010220112
31102030441203
40312010240111
238 |
239 | 240 | 241 | 242 | 243 | ```python 244 | #根据原始特征的观察构建新特征 245 | # 计算名字的长度 246 | test_['Name_length'] = test_['Name'].apply(len) 247 | # 将旅客是否住在头等舱二值化 248 | test_['Has_Cabin'] = test_["Cabin"].apply(lambda x: 0 if type(x) == float else 1) 249 | # 构建新特征家庭总人数 250 | test_['FamilySize'] = test_['SibSp'] + test_['Parch'] + 1 251 | # 构建新特征是否独居 252 | test_['IsAlone'] = 0 253 | test_.loc[test_['FamilySize'] == 1, 'IsAlone'] = 1 254 | # 查看乘客登船口岸存在缺失值 255 | test_['Embarked'].isnull().value_counts() 256 | # 对乘客登船口岸进行固定值填充缺失值 257 | test_['Embarked'] = test_['Embarked'].fillna('S') 258 | # 对票价进行中位数填充缺失值 259 | test_['Fare'] = test_['Fare'].fillna(test_['Fare'].median()) 260 | # 生成绝对票价分区,qcut是根据分区分位定义,将每一个值划为到具体的分区区间中去,此处定义为四分位值 261 | test_['CategoricalFare'] = pd.qcut(test_['Fare'], 4) 262 | # 生成新变量年龄平均值、年龄标准差 263 | age_avg = test_['Age'].mean() 264 | age_std = test_['Age'].std() 265 | # 计算年龄是否有缺失值并统计 266 | age_null_count = test_['Age'].isnull().sum() 267 | # np.random.randint()产生离散均匀分布的整数,size是产生的元素数量,前面分别为最小值和最大值区间 268 | age_null_random_list = np.random.randint(age_avg - age_std, age_avg + age_std, size=age_null_count) 269 | # 对年龄用生成的一些新数值进行填充 270 | test_['Age'][np.isnan(test_['Age'])] = age_null_random_list 271 | # 转换变量类型为数值类型,便于后期计算 272 | test_['Age'] = test_['Age'].astype(int) 273 | # 对年龄生成新的分箱变量中来代替,即将年龄绝对值转换为离散类别 274 | test_['CategoricalAge'] = pd.cut(test_['Age'], 5) 275 | 276 | # 定义正则表达式函数导出旅客的Title 277 | def get_title(name): 278 | # re.search()方法扫描整个字符串,并返回第一个成功的匹配。如果匹配失败,则返回None 279 | title_search = re.search('([A-Za-z]+)\.',name) 280 | if title_search: 281 | return title_search.group(1) 282 | return '' 283 | 284 | # 取出姓名中尊称部分 285 | test_['Title'] = test_['Name'].apply(get_title) 286 | 287 | # 对姓名的称呼部分做统一 288 | test_['Title'] = test_['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major' 289 | , 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare') 290 | test_['Title'] = test_['Title'].replace('Mlle', 'Miss') 291 | test_['Title'] = test_['Title'].replace('Ms', 'Miss') 292 | test_['Title'] = test_['Title'].replace('Mme', 'Mrs') 293 | 294 | # 对性别从离散型替换为数值型 295 | test_['Sex'] = test_['Sex'].map( {'female': 0, 'male': 1} ).astype(int) 296 | 297 | # 对姓名的称呼部分做数值型变换 298 | title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5} 299 | # 先定义一个字典,然后通过map函数传入字典进行替换 300 | test_['Title'] = test_['Title'].map(title_mapping) 301 | # 最后对缺失值替换为0 302 | test_['Title'] = test_['Title'].fillna(0) 303 | 304 | # 替换登船口岸 305 | test_['Embarked'] = test_['Embarked'].map( {'S': 0, 'C': 1, 'Q': 2} ).astype(int) 306 | 307 | # 替换票价的四分位数,该步骤应该有更好的办法做数据处理 308 | # loc函数取出列中某类元素的数据集 309 | test_.loc[ test_['Fare'] <= 7.91, 'Fare'] = 0 310 | test_.loc[(test_['Fare'] > 7.91) & (test_['Fare'] <= 14.454), 'Fare'] = 1 311 | test_.loc[(test_['Fare'] > 14.454) & (test_['Fare'] <= 31), 'Fare'] = 2 312 | test_.loc[ test_['Fare'] > 31, 'Fare'] = 3 313 | test_['Fare'] = test_['Fare'].astype(int) 314 | 315 | # 对年龄进行分段 316 | test_.loc[ test_['Age'] <= 16, 'Age'] = 0 317 | test_.loc[(test_['Age'] > 16) & (test_['Age'] <= 32), 'Age'] = 1 318 | test_.loc[(test_['Age'] > 32) & (test_['Age'] <= 48), 'Age'] = 2 319 | test_.loc[(test_['Age'] > 48) & (test_['Age'] <= 64), 'Age'] = 3 320 | test_.loc[test_['Age'] > 64, 'Age'] = 4 321 | 322 | 323 | # 特征选择,先对处理过的不需要的特征进行删除,定义一个列表,然后批量删除 324 | drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp'] 325 | test_ = test_.drop(drop_elements, axis = 1) 326 | test_ = test_.drop(['CategoricalAge', 'CategoricalFare'], axis = 1) 327 | # test_ = test_.drop(drop_elements, axis = 
1) 328 | 329 | test_.head() 330 | ``` 331 | 332 | C:\Users\IBM\Anaconda3\lib\site-packages\ipykernel_launcher.py:27: SettingWithCopyWarning: 333 | A value is trying to be set on a copy of a slice from a DataFrame 334 | 335 | See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy 336 | 337 | 338 | 339 | 340 | 341 |
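注:test_ 的特征工程与前面 train_ 的代码几乎逐行重复。一个可选的整理思路(仅为示意草稿,函数名 build_features 为假设)是把公共流程封装成函数,对两个数据集复用,避免两份代码改动后不同步:

```python
# 示意:封装公共特征工程,train_/test_ 各调用一次(仅列出部分步骤,其余同上)
def build_features(df):
    df = df.copy()
    df['Name_length'] = df['Name'].apply(len)
    df['Has_Cabin'] = df['Cabin'].notnull().astype(int)
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
    df['IsAlone'] = (df['FamilySize'] == 1).astype(int)
    df['Embarked'] = df['Embarked'].fillna('S')
    df['Fare'] = df['Fare'].fillna(df['Fare'].median())
    # ……年龄填充、分箱、Title提取等步骤同上,略
    return df

# train_ = build_features(pd.read_csv('D:/jupyter_notebook/titanic/train.csv'))
# test_ = build_features(pd.read_csv('D:/jupyter_notebook/titanic/test.csv'))
```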
342 | 355 | 356 | 357 | 358 | 359 | 360 | 361 | 362 | 363 | 364 | 365 | 366 | 367 | 368 | 369 | 370 | 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 436 | 437 | 438 | 439 | 440 | 441 | 442 | 443 | 444 |
PclassSexAgeParchFareEmbarkedName_lengthHas_CabinFamilySizeIsAloneTitle
0312002160111
1302000320203
2213012250111
3311010160111
4301110440303
445 |
446 | 447 | 448 | 449 | 450 | ```python 451 | import xgboost as xgb 452 | import pandas as pd 453 | import numpy as np 454 | import sklearn 455 | import os 456 | from sklearn.model_selection import train_test_split # 导入测试集和验证集划分函数 457 | ``` 458 | 459 | 460 | ```python 461 | X = train_.drop("Survived",axis= 1) # 提取不带标签的数据集 462 | Y = train_["Survived"] # 提取数据集的标签,数据集的标签一般是指用于预测的label 463 | ``` 464 | 465 | 466 | ```python 467 | X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.3, random_state = 0) 468 | ``` 469 | 470 | 471 | ```python 472 | dtrain = xgb.DMatrix(X_train, label=Y_train) 473 | dtest = xgb.DMatrix(X_test, label=Y_test) 474 | ``` 475 | 476 | 477 | ```python 478 | watchlist = [(dtest, 'eval'), (dtrain, 'train')] 479 | param = {'max_depth':3, 'eta':1, 'silent':1, 'objective':'multi:softmax', 'num_class':3} 480 | 481 | bst = xgb.train(param, dtrain, num_boost_round=10, evals=watchlist) 482 | y_hat = bst.predict(dtest) 483 | result = Y_test.values.reshape(1, -1) == y_hat 484 | print('the accuracy:\t', float(np.sum(result)) / len(y_hat)) 485 | ``` 486 | 487 | [0] eval-merror:0.201493 train-merror:0.160514 488 | [1] eval-merror:0.16791 train-merror:0.157303 489 | [2] eval-merror:0.175373 train-merror:0.144462 490 | [3] eval-merror:0.182836 train-merror:0.136437 491 | [4] eval-merror:0.171642 train-merror:0.138042 492 | [5] eval-merror:0.160448 train-merror:0.133226 493 | [6] eval-merror:0.160448 train-merror:0.126806 494 | [7] eval-merror:0.171642 train-merror:0.125201 495 | [8] eval-merror:0.164179 train-merror:0.11878 496 | [9] eval-merror:0.164179 train-merror:0.117175 497 | the accuracy: 0.835820895522388 498 | 499 | 500 | 501 | ```python 502 | 503 | ``` 504 | -------------------------------------------------------------------------------- /Xgboost/一张图说明XGBoost算法.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/echohandsome/Machine_Learning_in_Action_for_smallwhite/7ea64942aafa97aba5684b344d42f159a1e53374/Xgboost/一张图说明XGBoost算法.jpg -------------------------------------------------------------------------------- /kaggle初学者应该如何参加机器学习比赛.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 1.问题建模" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### 1.1 业务理解\n", 15 | "理解赛题的具体含义,理解业务才能构造出与业务相关性高的特征." 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "### 1.2 赛题数据\n", 23 | "拿到数据首先明确数值特征部分和类别特征部分. 然后思考能够构造出哪些特征,并且考虑哪些特征或者数据是不能够使用的." 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "### 1.3 评价指标 AUC\n", 31 | "本数据题采用的是AUC作为评价指标,二分类问题" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "### 1.4 是否存在线下验证\n", 39 | "时序验证 :一般选择最近邻的1-3天\n", 40 | "\n", 41 | "交叉验证:与k相关大, k偏小则性能不稳定. k偏大则计算量大." 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## 2. 
## 2. Exploratory data analysis
Usually done with visualization and statistical tests.

### 2.1 Dataset size and field types
How large is the dataset, and what type is each field?

### 2.2 Missing values
How severe is the missingness, and do any missing values carry special meaning?

### 2.3 Feature redundancy
For example, recording height in both m and cm as two separate features is redundant.

### 2.4 Time information
Watch for potential temporal leakage. This problem does have time features, and according to the baseline the training and test sets were collected over two different time periods.

### 2.5 Label distribution
Is there class imbalance? This problem is severely imbalanced: fraud cases are far rarer than non-fraud cases.

### 2.6 Train and test distributions
Does the training set contain fields the test set lacks, or do the two sets contain different fields?

## 3. Feature engineering

### 3.1 Data preprocessing

#### 3.1.1 Visualizing numerical features: plot them as scatter plots and remove the outliers.

#### 3.1.2 Handling missing values

Missing values are not all dirty data; some carry concrete business meaning and should be filled according to that meaning.

The rest are genuinely missing: fill them with statistics, or leave them unfilled, since tree models can handle missing values (see the sketch below).
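A minimal sketch of this, with made-up column names (`age`, `cabin`):

```python
import numpy as np
import pandas as pd

# Toy data for illustration only.
df = pd.DataFrame({"age": [22.0, np.nan, 35.0, np.nan],
                   "cabin": ["C85", None, None, "E46"]})

print(df.isnull().mean())  # how severe is the missingness per column?

# Missingness with business meaning: encode it explicitly as a flag.
df["has_cabin"] = df["cabin"].notnull().astype(int)

# Genuinely missing numeric values: fill with a statistic...
df["age"] = df["age"].fillna(df["age"].median())
# ...or simply leave the NaNs, since tree models such as XGBoost handle them natively.
```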
#### 3.1.3 Handling erroneous values
Besides values that obviously violate logic or common sense, note that some datasets use a specific sentinel value to stand in for missing data; watch out for these.

Obvious error: a body weight of 1000 kg. Hidden error: by business convention, -1 in some particular feature may have been defined to mean "missing"; such hidden errors also need handling.

#### 3.1.4 Handling false labels
If the training set contains obviously wrong labels (given the business context), delete those samples directly, or replace them using statistical features.

When the label is inconsistent with the evaluation metric, transform the label numerically according to the actual requirement.

### 3.2 Feature extraction

#### 3.2.1 Categorical features

#### 3.2.1.1 Encoding
String features that the model cannot learn from directly can be encoded (ordinal encoding vs. one-hot encoding; the choice depends on whether the feature carries a meaningful order).

#### 3.2.1.2 Count encoding
Counting occurrences of each category reflects its overall popularity. The statistic is very sensitive to outliers, e.g. one extremely large category, which can hurt the model's generalization.

#### 3.2.1.3 Count-rank encoding
Rank the categories by their counts instead; this shrinks the weight of outliers and reduces their influence.
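A minimal sketch of these three encodings on a toy categorical column (`city` is a made-up name):

```python
import pandas as pd

df = pd.DataFrame({"city": ["bj", "sh", "bj", "gz", "bj", "sh"]})

# 3.2.1.1 Ordinal encoding (fine for tree models) and one-hot (for orderless features).
df["city_code"] = df["city"].astype("category").cat.codes
onehot = pd.get_dummies(df["city"], prefix="city")

# 3.2.1.2 Count encoding: the overall popularity of each category.
df["city_count"] = df["city"].map(df["city"].value_counts())

# 3.2.1.3 Count-rank encoding: ranking the counts damps extreme categories.
df["city_count_rank"] = df["city_count"].rank(method="dense")
```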
#### 3.2.1.4 Target encoding
Build features from the label, e.g. statistics of the label per group (in the housing-rent competition, the mean price within each plate).

Doing this naively easily overfits, so use cross-validation:

split the data into several folds, treat n-1 folds as known data and the remaining fold as unknown, construct the feature on the known data, and assign it to the unknown data. This avoids overfitting and improves generalization.

#### 3.2.1.5 Cross combinations
Crossing categorical features with each other yields a finer category granularity. Combining categorical with numerical features captures per-category statistics, e.g. the mean value within each category.

#### 3.2.1.6 Preventing overfitting
Build features with cross statistics (e.g. split the data into 5 parts and build each part from the other 4, repeating five times to assemble a complete set).

Build time-series features (use information from the previous one or two days). In these settings some category values may not appear in both sets

(e.g. plate62 in the rent competition and action type5 in the JD competition); those can be filled with statistical features.
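A minimal sketch of out-of-fold target encoding in the spirit of 3.2.1.4 and 3.2.1.6; `plate` and `price` are made-up names echoing the rent-competition example.

```python
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

df = pd.DataFrame({"plate": ["a", "a", "b", "b", "a", "c"],
                   "price": [10.0, 12.0, 30.0, 28.0, 11.0, 50.0]})

df["plate_te"] = np.nan
for tr_idx, va_idx in KFold(n_splits=3, shuffle=True, random_state=0).split(df):
    # Statistics come from the "known" folds only...
    means = df.iloc[tr_idx].groupby("plate")["price"].mean()
    # ...and are assigned to the held-out fold, so no row sees its own label.
    df.loc[df.index[va_idx], "plate_te"] = df.iloc[va_idx]["plate"].map(means).values

# Categories absent from the known folds stay NaN; fill with a global statistic.
df["plate_te"] = df["plate_te"].fillna(df["price"].mean())
```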
#### 3.2.2 Numerical features

#### 3.2.2.1 Bucketing: convert numerical features into discrete ones, e.g. by quantiles

#### 3.2.2.2 Business-driven feature crosses
Add, subtract, multiply, and divide different features.

#### 3.2.2.3 Cross combinations
Cross the categorical and numerical features to build new features.

#### 3.2.2.4 Time features
These capture periodicity and trend, and the more recent the information, the better it works.

Date variables can be one-hot encoded.

Time-series features can be built with historical shifts (lags) and sliding-window statistics.

#### 3.2.2.5 Multi-valued features

Multi-valued features can be fully expanded with one-hot encoding, summarized by term-frequency counts, or reduced in dimension with word2vec, embeddings, and so on.
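A minimal sketch of these numeric-feature tricks; all column names here are illustrative.

```python
import pandas as pd

df = pd.DataFrame({"fare": [7.3, 71.3, 8.1, 53.1, 8.5],
                   "age": [22, 38, 26, 35, 35],
                   "city": ["bj", "sh", "bj", "sh", "bj"],
                   "dt": pd.to_datetime(["2019-06-01", "2019-06-02",
                                         "2019-06-02", "2019-06-03", "2019-06-04"])})

# 3.2.2.1 Bucketing by quantiles.
df["fare_bin"] = pd.qcut(df["fare"], q=2, labels=False)

# 3.2.2.2 Business-driven arithmetic crosses.
df["fare_per_age"] = df["fare"] / df["age"]

# 3.2.2.3 Category x numeric cross: per-category statistics.
df["city_fare_mean"] = df.groupby("city")["fare"].transform("mean")

# 3.2.2.4 Time features: calendar parts, lags, and sliding windows.
df["dow"] = df["dt"].dt.dayofweek
df["fare_lag1"] = df["fare"].shift(1)
df["fare_roll3"] = df["fare"].rolling(3, min_periods=1).mean()
```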
### 3.3 Feature selection
No method here is guaranteed to be the best; it must be analyzed case by case. Some algorithms, such as XGBoost and LightGBM, effectively perform feature selection on their own.

A further note: some Kaggle workflows now automate hyperparameter tuning with helper libraries, reportedly choosing parameters with Bayesian optimization.

As experienced competitors put it: try several approaches during a competition; no single one is always best.

#### 3.3.1 Filter methods
Chi-square tests and mutual information measure the relevance between x and y; correlation coefficients measure the relevance between features.

#### 3.3.2 Wrapper methods
Forward search: fix a model, start from a few basic features, then add features one by one and keep those that help. A heuristic method that may get stuck in a local optimum.

Backward search: remove features from the model one by one. More time-consuming, and hard to apply when the data is large.

#### 3.3.3 Embedded methods
Select features by the importance scores returned by a tree model. Essentially any boosting model can be used for feature selection, and many baselines do exactly this.
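A minimal sketch of the embedded approach; `X` (a feature DataFrame) and `y` (labels) are assumed inputs, and keeping the top 20 features is an arbitrary illustrative choice.

```python
# Minimal sketch of embedded feature selection (3.3.3).
# `X` (feature DataFrame) and `y` (labels) are assumed to exist already.
import pandas as pd
from xgboost import XGBClassifier

model = XGBClassifier(n_estimators=100, max_depth=3)
model.fit(X, y)

# Rank features by the model's importance scores and keep the strongest ones.
importance = pd.Series(model.feature_importances_, index=X.columns)
top = importance.sort_values(ascending=False).head(20).index
X_selected = X[top]
```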
## 4. Essential models and model fusion

### 4.1 XGBoost and LightGBM

Low demands on feature preprocessing.

Friendly to both categorical and continuous features.

Missing values need no imputation.

### 4.2 Model fusion
In theory: make the individual models strong yet as different as possible. With diversity in features, samples, and model families, fusion achieves better results.

In practice, prepare several feature sets, train on samples with little overlap, and use models from different theoretical families to reach this goal.

Fusion during training is built into tree models, so it can be achieved through tuning.

Result-level fusion: voting (classification), averaging (regression), and stacking.

## 5. Competition wrap-up
A good post-competition summary is even more important than the competition itself.

Summarize promptly after the contest: your overall approach, the key code, your shortcomings, and which experiments you still need to try.

Study the strong solutions: don't stay confined to your own way of thinking; see how others reasoned and what is worth borrowing, and compare to find your own gaps.

For beginners: patience and persistence are essential. Having chosen to compete, invest the time and keep learning; go step by step rather than aiming for a top 10 in one shot.

--------------------------------------------------------------------------------