├── 1. Data Exploration ├── DEA.csv ├── ReadMe.md ├── 专题1:分类问题的数据探索(以Titanic数据集为例).ipynb ├── 专题2:回归问题中的相关系数矩阵与热力图(以Boston数据集为例).ipynb └── 专题3:对iris数据集进行数据探索.ipynb ├── 2. Data Preprocessing ├── ReadMe.md ├── 专题1:数据的标准化、归一化与正则化.ipynb ├── 专题2:One-Hot编码.ipynb ├── 专题3:缺失值处理.ipynb ├── 专题4:共线性问题.ipynb └── 专题5:非正态分布数据的处理.ipynb ├── 3. Feature Engineering ├── 3.1 Feature Selection │ ├── Feature Selection.ipynb │ ├── ReadMe.md │ ├── embedded.py │ ├── filter.py │ └── wrapper.py ├── 3.2 Feature Extraction │ ├── ReadMe.md │ ├── cat_svd.jpg │ ├── pca_evd.py │ ├── pca_svd.py │ ├── 运用TruncatedSVD进行图像处理.ipynb │ └── 验证:sklearn采用SVD实现PCA.ipynb ├── 3.3 Feature Construction │ ├── ReadMe.md │ ├── create_time_feature.py │ ├── high_categorical.py │ ├── 根据时间戳生成时间型索引&透视分析.ipynb │ ├── 生成哑变量.ipynb │ ├── 连续型特征的分箱处理.ipynb │ ├── 长尾数据的处理.ipynb │ └── 高基数类别特征的处理.ipynb └── ReadMe.md ├── 4. Classical Supervised Learning └── ReadMe.md ├── 5. Ensemble Learning ├── Boosting │ ├── GBDT-LR │ │ ├── GBDT系列与LR的融合&性能对比.ipynb │ │ ├── ReadMe.md │ │ ├── gbdt_lr.py │ │ ├── lightgbm_lr.py │ │ └── xgboost_lr.py │ ├── ReadMe.md │ ├── XGB自定义损失&可视化.ipynb │ ├── early_stopping_rounds.ipynb │ ├── gbdt_lr_contrast.jpg │ ├── xgb_custom_lossfunc.py │ ├── xgb_early_stopping.jpg │ ├── xgb_early_stopping.py │ └── xgb_loss.jpg ├── ReadMe.md └── Stacking │ ├── ReadMe.md │ ├── StackingModels_vs_Mlxtend.py │ └── stacking_models.py ├── 6. Cluster Analysis └── ReadMe.md ├── 7. Model Evaluation ├── Pics │ ├── ks curve.jpg │ ├── prc.jpg │ └── roc.jpg ├── ReadMe.md ├── ks_curve.py ├── ks_value.py ├── prc.py ├── roc.py ├── 交叉验证.ipynb ├── 分类与回归模型评估.ipynb └── 绘制KS、ROC、PRC曲线.ipynb ├── 8. Model Persistence ├── ReadMe.md ├── joblib.py └── pickle.py ├── 9. The Foundations of ML └── ReadMe.md └── README.md /1. Data Exploration/DEA.csv: -------------------------------------------------------------------------------- 1 | PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked 2 | 1,0,3,male,22,1,0,7.25,S 3 | 2,1,1,female,38,1,0,71.2833,C 4 | 3,1,3,female,26,0,0,7.925,S 5 | 4,1,1,female,35,1,0,53.1,S 6 | 5,0,3,male,35,0,0,8.05,S 7 | 6,0,3,male,28.87594032,0,0,8.4583,Q 8 | 7,0,1,male,54,0,0,51.8625,S 9 | 8,0,3,male,2,3,1,21.075,S 10 | 9,1,3,female,27,0,2,11.1333,S 11 | 10,1,2,female,14,1,0,30.0708,C 12 | 11,1,3,female,4,1,1,16.7,S 13 | 12,1,1,female,58,0,0,26.55,S 14 | 13,0,3,male,20,0,0,8.05,S 15 | 14,0,3,male,39,1,5,31.275,S 16 | 15,0,3,female,14,0,0,7.8542,S 17 | 16,1,2,female,55,0,0,16,S 18 | 17,0,3,male,2,4,1,29.125,Q 19 | 18,1,2,male,29.203022,0,0,13,S 20 | 19,0,3,female,31,1,0,18,S 21 | 20,1,3,female,22.87822151,0,0,7.225,C 22 | 21,0,2,male,35,0,0,26,S 23 | 22,1,2,male,34,0,0,13,S 24 | 23,1,3,female,15,0,0,8.0292,Q 25 | 24,1,1,male,28,0,0,35.5,S 26 | 25,0,3,female,8,3,1,21.075,S 27 | 26,1,3,female,38,1,5,31.3875,S 28 | 27,0,3,male,27.99400139,0,0,7.225,C 29 | 28,0,1,male,19,3,2,263,S 30 | 29,1,3,female,24.34621239,0,0,7.8792,Q 31 | 30,0,3,male,29.11478615,0,0,7.8958,S 32 | 31,0,1,male,40,0,0,27.7208,C 33 | 32,1,1,female,34.40013123,1,0,146.5208,C 34 | 33,1,3,female,25.35289001,0,0,7.75,Q 35 | 34,0,2,male,66,0,0,10.5,S 36 | 35,0,1,male,28,1,0,82.1708,C 37 | 36,0,1,male,42,1,0,52,S 38 | 37,1,3,male,22.87822151,0,0,7.2292,C 39 | 38,0,3,male,21,0,0,8.05,S 40 | 39,0,3,female,18,2,0,18,S 41 | 40,1,3,female,14,1,0,11.2417,C 42 | 41,0,3,female,40,1,0,9.475,S 43 | 42,0,2,female,27,1,0,21,S 44 | 43,0,3,male,27.39358711,0,0,7.8958,C 45 | 44,1,2,female,3,1,2,41.5792,C 46 | 45,1,3,female,19,0,0,7.8792,Q 47 | 46,0,3,male,29.31497765,0,0,8.05,S 48 | 
47,0,3,male,29.30771255,1,0,15.5,Q 49 | 48,1,3,female,25.35289001,0,0,7.75,Q 50 | 49,0,3,male,22.17304039,2,0,21.6792,C 51 | 50,0,3,female,18,1,0,17.8,S 52 | 51,0,3,male,7,4,1,39.6875,S 53 | 52,0,3,male,21,0,0,7.8,S 54 | 53,1,1,female,49,1,0,76.7292,C 55 | 54,1,2,female,29,1,0,26,S 56 | 55,0,1,male,65,0,1,61.9792,C 57 | 56,1,1,male,40.53792572,0,0,35.5,S 58 | 57,1,2,female,21,0,0,10.5,S 59 | 58,0,3,male,28.5,0,0,7.2292,C 60 | 59,1,2,female,5,1,2,27.75,S 61 | 60,0,3,male,11,5,2,46.9,S 62 | 61,0,3,male,22,0,0,7.2292,C 63 | 62,1,1,female,38,0,0,80,S 64 | 63,0,1,male,45,1,0,83.475,S 65 | 64,0,3,male,4,3,2,27.9,S 66 | 65,0,1,male,43.93253326,0,0,27.7208,C 67 | 66,1,3,male,16.90238953,1,1,15.2458,C 68 | 67,1,2,female,29,0,0,10.5,S 69 | 68,0,3,male,19,0,0,8.1583,S 70 | 69,1,3,female,17,4,2,7.925,S 71 | 70,0,3,male,26,2,0,8.6625,S 72 | 71,0,2,male,32,0,0,10.5,S 73 | 72,0,3,female,16,5,2,46.9,S 74 | 73,0,2,male,21,0,0,73.5,S 75 | 74,0,3,male,26,1,0,14.4542,C 76 | 75,1,3,male,32,0,0,56.4958,S 77 | 76,0,3,male,25,0,0,7.65,S 78 | 77,0,3,male,29.11478615,0,0,7.8958,S 79 | 78,0,3,male,29.31497765,0,0,8.05,S 80 | 79,1,2,male,0.83,0,2,29,S 81 | 80,1,3,female,30,0,0,12.475,S 82 | 81,0,3,male,22,0,0,9,S 83 | 82,1,3,male,29,0,0,9.5,S 84 | 83,1,3,female,24.34621239,0,0,7.7875,Q 85 | 84,0,1,male,28,0,0,47.1,S 86 | 85,1,2,female,17,0,0,10.5,S 87 | 86,1,3,female,33,3,0,15.85,S 88 | 87,0,3,male,16,1,3,34.375,S 89 | 88,0,3,male,29.31497765,0,0,8.05,S 90 | 89,1,1,female,23,3,2,263,S 91 | 90,0,3,male,24,0,0,8.05,S 92 | 91,0,3,male,29,0,0,8.05,S 93 | 92,0,3,male,20,0,0,7.8542,S 94 | 93,0,1,male,46,1,0,61.175,S 95 | 94,0,3,male,26,1,2,20.575,S 96 | 95,0,3,male,59,0,0,7.25,S 97 | 96,0,3,male,29.31497765,0,0,8.05,S 98 | 97,0,1,male,71,0,0,34.6542,C 99 | 98,1,1,male,23,0,1,63.3583,C 100 | 99,1,2,female,34,0,1,23,S 101 | 100,0,2,male,34,1,0,26,S 102 | 101,0,3,female,28,0,0,7.8958,S 103 | 102,0,3,male,29.11478615,0,0,7.8958,S 104 | 103,0,1,male,21,0,1,77.2875,S 105 | 104,0,3,male,33,0,0,8.6542,S 106 | 105,0,3,male,37,2,0,7.925,S 107 | 106,0,3,male,28,0,0,7.8958,S 108 | 107,1,3,female,21,0,0,7.65,S 109 | 108,1,3,male,23.99900627,0,0,7.775,S 110 | 109,0,3,male,38,0,0,7.8958,S 111 | 110,1,3,female,23.98064613,1,0,24.15,Q 112 | 111,0,1,male,47,0,0,52,S 113 | 112,0,3,female,14.5,1,0,14.4542,C 114 | 113,0,3,male,22,0,0,8.05,S 115 | 114,0,3,female,20,1,0,9.825,S 116 | 115,0,3,female,17,0,0,14.4583,C 117 | 116,0,3,male,21,0,0,7.925,S 118 | 117,0,3,male,70.5,0,0,7.75,Q 119 | 118,0,2,male,29,1,0,21,S 120 | 119,0,1,male,24,0,1,247.5208,C 121 | 120,0,3,female,2,4,2,31.275,S 122 | 121,0,2,male,21,2,0,73.5,S 123 | 122,0,3,male,29.31497765,0,0,8.05,S 124 | 123,0,2,male,32.5,1,0,30.0708,C 125 | 124,1,2,female,32.5,0,0,13,S 126 | 125,0,1,male,54,0,1,77.2875,S 127 | 126,1,3,male,12,1,0,11.2417,C 128 | 127,0,3,male,30.46866989,0,0,7.75,Q 129 | 128,1,3,male,24,0,0,7.1417,S 130 | 129,1,3,female,16.60898018,1,1,22.3583,C 131 | 130,0,3,male,45,0,0,6.975,S 132 | 131,0,3,male,33,0,0,7.8958,C 133 | 132,0,3,male,20,0,0,7.05,S 134 | 133,0,3,female,47,1,0,14.5,S 135 | 134,1,2,female,29,1,0,26,S 136 | 135,0,2,male,25,0,0,13,S 137 | 136,0,2,male,23,0,0,15.0458,C 138 | 137,1,1,female,19,0,2,26.2833,S 139 | 138,0,1,male,37,1,0,53.1,S 140 | 139,0,3,male,16,0,0,9.2167,S 141 | 140,0,1,male,24,0,0,79.2,C 142 | 141,0,3,female,19.17977905,0,2,15.2458,C 143 | 142,1,3,female,22,0,0,7.75,S 144 | 143,1,3,female,24,1,0,15.85,S 145 | 144,0,3,male,19,0,0,6.75,Q 146 | 145,0,2,male,18,0,0,11.5,S 147 | 146,0,2,male,19,1,1,36.75,S 148 | 147,1,3,male,27,0,0,7.7958,S 149 | 
148,0,3,female,9,2,2,34.375,S 150 | 149,0,2,male,36.5,0,2,26,S 151 | 150,0,2,male,42,0,0,13,S 152 | 151,0,2,male,51,0,0,12.525,S 153 | 152,1,1,female,22,1,0,66.6,S 154 | 153,0,3,male,55.5,0,0,8.05,S 155 | 154,0,3,male,40.5,0,2,14.5,S 156 | 155,0,3,male,29.71520042,0,0,7.3125,S 157 | 156,0,1,male,51,0,1,61.3792,C 158 | 157,1,3,female,16,0,0,7.7333,Q 159 | 158,0,3,male,30,0,0,8.05,S 160 | 159,0,3,male,28.5287323,0,0,8.6625,S 161 | 160,0,3,male,7.838418007,8,2,69.55,S 162 | 161,0,3,male,44,0,1,16.1,S 163 | 162,1,2,female,40,0,0,15.75,S 164 | 163,0,3,male,26,0,0,7.775,S 165 | 164,0,3,male,17,0,0,8.6625,S 166 | 165,0,3,male,1,4,1,39.6875,S 167 | 166,1,3,male,9,0,2,20.525,S 168 | 167,1,1,female,34.78959656,0,1,55,S 169 | 168,0,3,female,45,1,4,27.9,S 170 | 169,0,1,male,46.45271301,0,0,25.925,S 171 | 170,0,3,male,28,0,0,56.4958,S 172 | 171,0,1,male,61,0,0,33.5,S 173 | 172,0,3,male,4,4,1,29.125,Q 174 | 173,1,3,female,1,1,1,11.1333,S 175 | 174,0,3,male,21,0,0,7.925,S 176 | 175,0,1,male,56,0,0,30.6958,C 177 | 176,0,3,male,18,1,1,7.8542,S 178 | 177,0,3,male,13.49962521,3,1,25.4667,S 179 | 178,0,1,female,50,0,0,28.7125,C 180 | 179,0,2,male,30,0,0,13,S 181 | 180,0,3,male,36,0,0,0,S 182 | 181,0,3,female,7.838418007,8,2,69.55,S 183 | 182,0,2,male,33.93231964,0,0,15.05,C 184 | 183,0,3,male,9,4,2,31.3875,S 185 | 184,1,2,male,1,2,1,39,S 186 | 185,1,3,female,4,0,2,22.025,S 187 | 186,0,1,male,45.12653732,0,0,50,S 188 | 187,1,3,female,24.19193268,1,0,15.5,Q 189 | 188,1,1,male,45,0,0,26.55,S 190 | 189,0,3,male,40,1,1,15.5,Q 191 | 190,0,3,male,36,0,0,7.8958,S 192 | 191,1,2,female,32,0,0,13,S 193 | 192,0,2,male,19,0,0,13,S 194 | 193,1,3,female,19,1,0,7.8542,S 195 | 194,1,2,male,3,1,1,26,S 196 | 195,1,1,female,44,0,0,27.7208,C 197 | 196,1,1,female,58,0,0,146.5208,C 198 | 197,0,3,male,30.46866989,0,0,7.75,Q 199 | 198,0,3,male,42,0,1,8.4042,S 200 | 199,1,3,female,25.35289001,0,0,7.75,Q 201 | 200,0,2,female,24,0,0,13,S 202 | 201,0,3,male,28,0,0,9.5,S 203 | 202,0,3,male,7.838418007,8,2,69.55,S 204 | 203,0,3,male,34,0,0,6.4958,S 205 | 204,0,3,male,45.5,0,0,7.225,C 206 | 205,1,3,male,18,0,0,8.05,S 207 | 206,0,3,female,2,0,1,10.4625,S 208 | 207,0,3,male,32,1,0,15.85,S 209 | 208,1,3,male,26,0,0,18.7875,C 210 | 209,1,3,female,16,0,0,7.75,Q 211 | 210,1,1,male,40,0,0,31,C 212 | 211,0,3,male,24,0,0,7.05,S 213 | 212,1,2,female,35,0,0,21,S 214 | 213,0,3,male,22,0,0,7.25,S 215 | 214,0,2,male,30,0,0,13,S 216 | 215,0,3,male,29.56570816,1,0,7.75,Q 217 | 216,1,1,female,31,1,0,113.275,C 218 | 217,1,3,female,27,0,0,7.925,S 219 | 218,0,2,male,42,1,0,27,S 220 | 219,1,1,female,32,0,0,76.2917,C 221 | 220,0,2,male,30,0,0,10.5,S 222 | 221,1,3,male,16,0,0,8.05,S 223 | 222,0,2,male,27,0,0,13,S 224 | 223,0,3,male,51,0,0,8.05,S 225 | 224,0,3,male,29.11478615,0,0,7.8958,S 226 | 225,1,1,male,38,1,0,90,S 227 | 226,0,3,male,22,0,0,9.35,S 228 | 227,1,2,male,19,0,0,10.5,S 229 | 228,0,3,male,20.5,0,0,7.25,S 230 | 229,0,2,male,18,0,0,13,S 231 | 230,0,3,female,13.49962521,3,1,25.4667,S 232 | 231,1,1,female,35,1,0,83.475,S 233 | 232,0,3,male,29,0,0,7.775,S 234 | 233,0,2,male,59,0,0,13.5,S 235 | 234,1,3,female,5,4,2,31.3875,S 236 | 235,0,2,male,24,0,0,10.5,S 237 | 236,0,3,female,29.71520042,0,0,7.55,S 238 | 237,0,2,male,44,1,0,26,S 239 | 238,1,2,female,8,0,2,26.25,S 240 | 239,0,2,male,19,0,0,10.5,S 241 | 240,0,2,male,33,0,0,12.275,S 242 | 241,0,3,female,27.2393074,1,0,14.4542,C 243 | 242,1,3,female,24.19193268,1,0,15.5,Q 244 | 243,0,2,male,29,0,0,10.5,S 245 | 244,0,3,male,22,0,0,7.125,S 246 | 245,0,3,male,30,0,0,7.225,C 247 | 246,0,1,male,44,2,0,90,Q 248 | 
247,0,3,female,25,0,0,7.775,S 249 | 248,1,2,female,24,0,2,14.5,S 250 | 249,1,1,male,37,1,1,52.5542,S 251 | 250,0,2,male,54,1,0,26,S 252 | 251,0,3,male,29.71520042,0,0,7.25,S 253 | 252,0,3,female,29,1,1,10.4625,S 254 | 253,0,1,male,62,0,0,26.55,S 255 | 254,0,3,male,30,1,0,16.1,S 256 | 255,0,3,female,41,0,2,20.2125,S 257 | 256,1,3,female,29,0,2,15.2458,C 258 | 257,1,1,female,38.57210541,0,0,79.2,C 259 | 258,1,1,female,30,0,0,86.5,S 260 | 259,1,1,female,35,0,0,512.3292,C 261 | 260,1,2,female,50,0,1,26,S 262 | 261,0,3,male,30.46866989,0,0,7.75,Q 263 | 262,1,3,male,3,4,2,31.3875,S 264 | 263,0,1,male,52,1,1,79.65,S 265 | 264,0,1,male,40,0,0,0,S 266 | 265,0,3,female,30.46866989,0,0,7.75,Q 267 | 266,0,2,male,36,0,0,10.5,S 268 | 267,0,3,male,16,4,1,39.6875,S 269 | 268,1,3,male,25,1,0,7.775,S 270 | 269,1,1,female,58,0,1,153.4625,S 271 | 270,1,1,female,35,0,0,135.6333,S 272 | 271,0,1,male,45.65372086,0,0,31,S 273 | 272,1,3,male,25,0,0,0,S 274 | 273,1,2,female,41,0,1,19.5,S 275 | 274,0,1,male,37,0,1,29.7,C 276 | 275,1,3,female,25.35289001,0,0,7.75,Q 277 | 276,1,1,female,63,1,0,77.9583,S 278 | 277,0,3,female,45,0,0,7.75,S 279 | 278,0,2,male,34.65427399,0,0,0,S 280 | 279,0,3,male,7,4,1,29.125,Q 281 | 280,1,3,female,35,1,1,20.25,S 282 | 281,0,3,male,65,0,0,7.75,Q 283 | 282,0,3,male,28,0,0,7.8542,S 284 | 283,0,3,male,16,0,0,9.5,S 285 | 284,1,3,male,19,0,0,8.05,S 286 | 285,0,1,male,46.45271301,0,0,26,S 287 | 286,0,3,male,33,0,0,8.6625,C 288 | 287,1,3,male,30,0,0,9.5,S 289 | 288,0,3,male,22,0,0,7.8958,S 290 | 289,1,2,male,42,0,0,13,S 291 | 290,1,3,female,22,0,0,7.75,Q 292 | 291,1,1,female,26,0,0,78.85,S 293 | 292,1,1,female,19,1,0,91.0792,C 294 | 293,0,2,male,36,0,0,12.875,C 295 | 294,0,3,female,24,0,0,8.85,S 296 | 295,0,3,male,24,0,0,7.8958,S 297 | 296,0,1,male,43.93253326,0,0,27.7208,C 298 | 297,0,3,male,23.5,0,0,7.2292,C 299 | 298,0,1,female,2,1,2,151.55,S 300 | 299,1,1,male,40.53792572,0,0,30.5,S 301 | 300,1,1,female,50,0,1,247.5208,C 302 | 301,1,3,female,25.35289001,0,0,7.75,Q 303 | 302,1,3,male,19.12566566,2,0,23.25,Q 304 | 303,0,3,male,19,0,0,0,S 305 | 304,1,2,female,29.55023003,0,0,12.35,Q 306 | 305,0,3,male,29.31497765,0,0,8.05,S 307 | 306,1,1,male,0.92,1,2,151.55,S 308 | 307,1,1,female,35.30309677,0,0,110.8833,C 309 | 308,1,1,female,17,1,0,108.9,C 310 | 309,0,2,male,30,1,0,24,C 311 | 310,1,1,female,30,0,0,56.9292,C 312 | 311,1,1,female,24,0,0,83.1583,C 313 | 312,1,1,female,18,2,2,262.375,C 314 | 313,0,2,female,26,1,1,26,S 315 | 314,0,3,male,28,0,0,7.8958,S 316 | 315,0,2,male,43,1,1,26.25,S 317 | 316,1,3,female,26,0,0,7.8542,S 318 | 317,1,2,female,24,1,0,26,S 319 | 318,0,2,male,54,0,0,14,S 320 | 319,1,1,female,31,0,2,164.8667,S 321 | 320,1,1,female,40,1,1,134.5,C 322 | 321,0,3,male,22,0,0,7.25,S 323 | 322,0,3,male,27,0,0,7.8958,S 324 | 323,1,2,female,30,0,0,12.35,Q 325 | 324,1,2,female,22,1,1,29,S 326 | 325,0,3,male,7.838418007,8,2,69.55,S 327 | 326,1,1,female,36,0,0,135.6333,C 328 | 327,0,3,male,61,0,0,6.2375,S 329 | 328,1,2,female,36,0,0,13,S 330 | 329,1,3,female,31,1,1,20.525,S 331 | 330,1,1,female,16,0,1,57.9792,C 332 | 331,1,3,female,19.12566566,2,0,23.25,Q 333 | 332,0,1,male,45.5,0,0,28.5,S 334 | 333,0,1,male,38,0,1,153.4625,S 335 | 334,0,3,male,16,2,0,18,S 336 | 335,1,1,female,36.12131882,1,0,133.65,S 337 | 336,0,3,male,29.11478615,0,0,7.8958,S 338 | 337,0,1,male,29,1,0,66.6,S 339 | 338,1,1,female,41,0,0,134.5,C 340 | 339,1,3,male,45,0,0,8.05,S 341 | 340,0,1,male,45,0,0,35.5,S 342 | 341,1,2,male,2,1,1,26,S 343 | 342,1,1,female,24,3,2,263,S 344 | 343,0,2,male,28,0,0,13,S 345 | 
344,0,2,male,25,0,0,13,S 346 | 345,0,2,male,36,0,0,13,S 347 | 346,1,2,female,24,0,0,13,S 348 | 347,1,2,female,40,0,0,13,S 349 | 348,1,3,female,23.84472466,1,0,16.1,S 350 | 349,1,3,male,3,1,1,15.9,S 351 | 350,0,3,male,42,0,0,8.6625,S 352 | 351,0,3,male,23,0,0,9.225,S 353 | 352,0,1,male,45.65372086,0,0,35,S 354 | 353,0,3,male,15,1,1,7.2292,C 355 | 354,0,3,male,25,1,0,17.8,S 356 | 355,0,3,male,27.99400139,0,0,7.225,C 357 | 356,0,3,male,28,0,0,9.5,S 358 | 357,1,1,female,22,0,1,55,S 359 | 358,0,2,female,38,0,0,13,S 360 | 359,1,3,female,24.34621239,0,0,7.8792,Q 361 | 360,1,3,female,24.34621239,0,0,7.8792,Q 362 | 361,0,3,male,40,1,4,27.9,S 363 | 362,0,2,male,29,1,0,27.7208,C 364 | 363,0,3,female,45,0,1,14.4542,C 365 | 364,0,3,male,35,0,0,7.05,S 366 | 365,0,3,male,29.30771255,1,0,15.5,Q 367 | 366,0,3,male,30,0,0,7.25,S 368 | 367,1,1,female,60,1,0,75.25,C 369 | 368,1,3,female,22.87822151,0,0,7.2292,C 370 | 369,1,3,female,25.35289001,0,0,7.75,Q 371 | 370,1,1,female,24,0,0,69.3,C 372 | 371,1,1,male,25,1,0,55.4417,C 373 | 372,0,3,male,18,1,0,6.4958,S 374 | 373,0,3,male,19,0,0,8.05,S 375 | 374,0,1,male,22,0,0,135.6333,C 376 | 375,0,3,female,3,3,1,21.075,S 377 | 376,1,1,female,34.40013123,1,0,82.1708,C 378 | 377,1,3,female,22,0,0,7.25,S 379 | 378,0,1,male,27,0,2,211.5,C 380 | 379,0,3,male,20,0,0,4.0125,C 381 | 380,0,3,male,19,0,0,7.775,S 382 | 381,1,1,female,42,0,0,227.525,C 383 | 382,1,3,female,1,0,2,15.7417,C 384 | 383,0,3,male,32,0,0,7.925,S 385 | 384,1,1,female,35,1,0,52,S 386 | 385,0,3,male,29.11478615,0,0,7.8958,S 387 | 386,0,2,male,18,0,0,73.5,S 388 | 387,0,3,male,1,5,2,46.9,S 389 | 388,1,2,female,36,0,0,13,S 390 | 389,0,3,male,30.06240654,0,0,7.7292,Q 391 | 390,1,2,female,17,0,0,12,C 392 | 391,1,1,male,36,1,2,120,S 393 | 392,1,3,male,21,0,0,7.7958,S 394 | 393,0,3,male,28,2,0,7.925,S 395 | 394,1,1,female,23,1,0,113.275,C 396 | 395,1,3,female,24,0,2,16.7,S 397 | 396,0,3,male,22,0,0,7.7958,S 398 | 397,0,3,female,31,0,0,7.8542,S 399 | 398,0,2,male,46,0,0,26,S 400 | 399,0,2,male,23,0,0,10.5,S 401 | 400,1,2,female,28,0,0,12.65,S 402 | 401,1,3,male,39,0,0,7.925,S 403 | 402,0,3,male,26,0,0,8.05,S 404 | 403,0,3,female,21,1,0,9.825,S 405 | 404,0,3,male,28,1,0,15.85,S 406 | 405,0,3,female,20,0,0,8.6625,S 407 | 406,0,2,male,34,1,0,21,S 408 | 407,0,3,male,51,0,0,7.75,S 409 | 408,1,2,male,3,1,1,18.75,S 410 | 409,0,3,male,21,0,0,7.775,S 411 | 410,0,3,female,13.49962521,3,1,25.4667,S 412 | 411,0,3,male,29.11478615,0,0,7.8958,S 413 | 412,0,3,male,30.06240654,0,0,6.8583,Q 414 | 413,1,1,female,33,1,0,90,Q 415 | 414,0,2,male,34.65427399,0,0,0,S 416 | 415,1,3,male,44,0,0,7.925,S 417 | 416,0,3,female,29.31497765,0,0,8.05,S 418 | 417,1,2,female,34,1,1,32.5,S 419 | 418,1,2,female,18,0,2,13,S 420 | 419,0,2,male,30,0,0,13,S 421 | 420,0,3,female,10,0,2,24.15,S 422 | 421,0,3,male,27.39358711,0,0,7.8958,C 423 | 422,0,3,male,21,0,0,7.7333,Q 424 | 423,0,3,male,29,0,0,7.875,S 425 | 424,0,3,female,28,1,1,14.4,S 426 | 425,0,3,male,18,1,1,20.2125,S 427 | 426,0,3,male,29.71520042,0,0,7.25,S 428 | 427,1,2,female,28,1,0,26,S 429 | 428,1,2,female,19,0,0,26,S 430 | 429,0,3,male,30.46866989,0,0,7.75,Q 431 | 430,1,3,male,32,0,0,8.05,S 432 | 431,1,1,male,28,0,0,26.55,S 433 | 432,1,3,female,23.84472466,1,0,16.1,S 434 | 433,1,2,female,42,1,0,26,S 435 | 434,0,3,male,17,0,0,7.125,S 436 | 435,0,1,male,50,1,0,55.9,S 437 | 436,1,1,female,14,1,2,120,S 438 | 437,0,3,female,21,2,2,34.375,S 439 | 438,1,2,female,24,2,3,18.75,S 440 | 439,0,1,male,64,1,4,263,S 441 | 440,0,2,male,31,0,0,10.5,S 442 | 441,1,2,female,45,1,1,26.25,S 443 | 
442,0,3,male,20,0,0,9.5,S 444 | 443,0,3,male,25,1,0,7.775,S 445 | 444,1,2,female,28,0,0,13,S 446 | 445,1,3,male,23.86882591,0,0,8.1125,S 447 | 446,1,1,male,4,0,2,81.8583,S 448 | 447,1,2,female,13,0,1,19.5,S 449 | 448,1,1,male,34,0,0,26.55,S 450 | 449,1,3,female,5,2,1,19.2583,C 451 | 450,1,1,male,52,0,0,30.5,S 452 | 451,0,2,male,36,1,2,27.75,S 453 | 452,0,3,male,28.66709518,1,0,19.9667,S 454 | 453,0,1,male,30,0,0,27.75,C 455 | 454,1,1,male,49,1,0,89.1042,C 456 | 455,0,3,male,29.31497765,0,0,8.05,S 457 | 456,1,3,male,29,0,0,7.8958,C 458 | 457,0,1,male,65,0,0,26.55,S 459 | 458,1,1,female,39.10777664,1,0,51.8625,S 460 | 459,1,2,female,50,0,0,10.5,S 461 | 460,0,3,male,30.46866989,0,0,7.75,Q 462 | 461,1,1,male,48,0,0,26.55,S 463 | 462,0,3,male,34,0,0,8.05,S 464 | 463,0,1,male,47,0,0,38.5,S 465 | 464,0,2,male,48,0,0,13,S 466 | 465,0,3,male,29.31497765,0,0,8.05,S 467 | 466,0,3,male,38,0,0,7.05,S 468 | 467,0,2,male,34.65427399,0,0,0,S 469 | 468,0,1,male,56,0,0,26.55,S 470 | 469,0,3,male,30.06240654,0,0,7.725,Q 471 | 470,1,3,female,0.75,2,1,19.2583,C 472 | 471,0,3,male,29.71520042,0,0,7.25,S 473 | 472,0,3,male,38,0,0,8.6625,S 474 | 473,1,2,female,33,1,2,27.75,S 475 | 474,1,2,female,23,0,0,13.7917,C 476 | 475,0,3,female,22,0,0,9.8375,S 477 | 476,0,1,male,45.12653732,0,0,52,S 478 | 477,0,2,male,34,1,0,21,S 479 | 478,0,3,male,29,1,0,7.0458,S 480 | 479,0,3,male,22,0,0,7.5208,S 481 | 480,1,3,female,2,0,1,12.2875,S 482 | 481,0,3,male,9,5,2,46.9,S 483 | 482,0,2,male,34.65427399,0,0,0,S 484 | 483,0,3,male,50,0,0,8.05,S 485 | 484,1,3,female,63,0,0,9.5875,S 486 | 485,1,1,male,25,1,0,91.0792,C 487 | 486,0,3,female,13.49962521,3,1,25.4667,S 488 | 487,1,1,female,35,1,0,90,S 489 | 488,0,1,male,58,0,0,29.7,C 490 | 489,0,3,male,30,0,0,8.05,S 491 | 490,1,3,male,9,1,1,15.9,S 492 | 491,0,3,male,28.66709518,1,0,19.9667,S 493 | 492,0,3,male,21,0,0,7.25,S 494 | 493,0,1,male,55,0,0,30.5,S 495 | 494,0,1,male,71,0,0,49.5042,C 496 | 495,0,3,male,21,0,0,8.05,S 497 | 496,0,3,male,28.14227104,0,0,14.4583,C 498 | 497,1,1,female,54,1,0,78.2667,C 499 | 498,0,3,male,29.86346817,0,0,15.1,S 500 | 499,0,1,female,25,1,2,151.55,S 501 | 500,0,3,male,24,0,0,7.7958,S 502 | 501,0,3,male,17,0,0,8.6625,S 503 | 502,0,3,female,21,0,0,7.75,Q 504 | 503,0,3,female,30.06240654,0,0,7.6292,Q 505 | 504,0,3,female,37,0,0,9.5875,S 506 | 505,1,1,female,16,0,0,86.5,S 507 | 506,0,1,male,18,1,0,108.9,C 508 | 507,1,2,female,33,0,2,26,S 509 | 508,1,1,male,41.33691788,0,0,26.55,S 510 | 509,0,3,male,28,0,0,22.525,S 511 | 510,1,3,male,26,0,0,56.4958,S 512 | 511,1,3,male,29,0,0,7.75,Q 513 | 512,0,3,male,29.31497765,0,0,8.05,S 514 | 513,1,1,male,36,0,0,26.2875,S 515 | 514,1,1,female,54,1,0,59.4,C 516 | 515,0,3,male,24,0,0,7.4958,S 517 | 516,0,1,male,47,0,0,34.0208,S 518 | 517,1,2,female,34,0,0,10.5,S 519 | 518,0,3,male,29.99938965,0,0,24.15,Q 520 | 519,1,2,female,36,1,0,26,S 521 | 520,0,3,male,32,0,0,7.8958,S 522 | 521,1,1,female,30,0,0,93.5,S 523 | 522,0,3,male,22,0,0,7.8958,S 524 | 523,0,3,male,27.99400139,0,0,7.225,C 525 | 524,1,1,female,44,0,1,57.9792,C 526 | 525,0,3,male,27.99400139,0,0,7.2292,C 527 | 526,0,3,male,40.5,0,0,7.75,Q 528 | 527,1,2,female,50,0,0,10.5,S 529 | 528,0,1,male,43.36372757,0,0,221.7792,S 530 | 529,0,3,male,39,0,0,7.925,S 531 | 530,0,2,male,23,2,1,11.5,S 532 | 531,1,2,female,2,1,1,26,S 533 | 532,0,3,male,27.99400139,0,0,7.2292,C 534 | 533,0,3,male,17,1,1,7.2292,C 535 | 534,1,3,female,13.77058315,0,2,22.3583,C 536 | 535,0,3,female,30,0,0,8.6625,S 537 | 536,1,2,female,7,0,2,26.25,S 538 | 537,0,1,male,45,0,0,26.55,S 539 | 
538,1,1,female,30,0,0,106.425,C 540 | 539,0,3,male,29.86346817,0,0,14.5,S 541 | 540,1,1,female,22,0,2,49.5,C 542 | 541,1,1,female,36,0,2,71,S 543 | 542,0,3,female,9,4,2,31.275,S 544 | 543,0,3,female,11,4,2,31.275,S 545 | 544,1,2,male,32,1,0,26,S 546 | 545,0,1,male,50,1,0,106.425,C 547 | 546,0,1,male,64,0,0,26,S 548 | 547,1,2,female,19,1,0,26,S 549 | 548,1,2,male,28.81656075,0,0,13.8625,C 550 | 549,0,3,male,33,1,1,20.525,S 551 | 550,1,2,male,8,1,1,36.75,S 552 | 551,1,1,male,17,0,2,110.8833,C 553 | 552,0,2,male,27,0,0,26,S 554 | 553,0,3,male,29.46199226,0,0,7.8292,Q 555 | 554,1,3,male,22,0,0,7.225,C 556 | 555,1,3,female,22,0,0,7.775,S 557 | 556,0,1,male,62,0,0,26.55,S 558 | 557,1,1,female,48,1,0,39.6,C 559 | 558,0,1,male,41.64253998,0,0,227.525,C 560 | 559,1,1,female,39,1,1,79.65,S 561 | 560,1,3,female,36,1,0,17.4,S 562 | 561,0,3,male,30.46866989,0,0,7.75,Q 563 | 562,0,3,male,40,0,0,7.8958,S 564 | 563,0,2,male,28,0,0,13.5,S 565 | 564,0,3,male,29.31497765,0,0,8.05,S 566 | 565,0,3,female,29.31497765,0,0,8.05,S 567 | 566,0,3,male,24,2,0,24.15,S 568 | 567,0,3,male,19,0,0,7.8958,S 569 | 568,0,3,female,29,0,4,21.075,S 570 | 569,0,3,male,27.99400139,0,0,7.2292,C 571 | 570,1,3,male,32,0,0,7.8542,S 572 | 571,1,2,male,62,0,0,10.5,S 573 | 572,1,1,female,53,2,0,51.4792,S 574 | 573,1,1,male,36,0,0,26.3875,S 575 | 574,1,3,female,25.35289001,0,0,7.75,Q 576 | 575,0,3,male,16,0,0,8.05,S 577 | 576,0,3,male,19,0,0,14.5,S 578 | 577,1,2,female,34,0,0,13,S 579 | 578,1,1,female,39,1,0,55.9,S 580 | 579,0,3,female,27.2393074,1,0,14.4583,C 581 | 580,1,3,male,32,0,0,7.925,S 582 | 581,1,2,female,25,1,1,30,S 583 | 582,1,1,female,39,1,1,110.8833,C 584 | 583,0,2,male,54,0,0,26,S 585 | 584,0,1,male,36,0,0,40.125,C 586 | 585,0,3,male,26.80753517,0,0,8.7125,C 587 | 586,1,1,female,18,0,2,79.65,S 588 | 587,0,2,male,47,0,0,15,S 589 | 588,1,1,male,60,1,1,79.2,C 590 | 589,0,3,male,22,0,0,8.05,S 591 | 590,0,3,male,29.31497765,0,0,8.05,S 592 | 591,0,3,male,35,0,0,7.125,S 593 | 592,1,1,female,52,1,0,78.2667,C 594 | 593,0,3,male,47,0,0,7.25,S 595 | 594,0,3,female,21.50617981,0,2,7.75,Q 596 | 595,0,2,male,37,1,0,26,S 597 | 596,0,3,male,36,1,1,24.15,S 598 | 597,1,2,female,29.52746773,0,0,33,S 599 | 598,0,3,male,49,0,0,0,S 600 | 599,0,3,male,27.99400139,0,0,7.225,C 601 | 600,1,1,male,49,1,0,56.9292,C 602 | 601,1,2,female,24,2,1,27,S 603 | 602,0,3,male,29.11478615,0,0,7.8958,S 604 | 603,0,1,male,45.12653732,0,0,42.4,S 605 | 604,0,3,male,44,0,0,8.05,S 606 | 605,1,1,male,35,0,0,26.55,C 607 | 606,0,3,male,36,1,0,15.55,S 608 | 607,0,3,male,30,0,0,7.8958,S 609 | 608,1,1,male,27,0,0,30.5,S 610 | 609,1,2,female,22,1,2,41.5792,C 611 | 610,1,1,female,40,0,0,153.4625,S 612 | 611,0,3,female,39,1,5,31.275,S 613 | 612,0,3,male,29.71520042,0,0,7.05,S 614 | 613,1,3,female,24.19193268,1,0,15.5,Q 615 | 614,0,3,male,30.46866989,0,0,7.75,Q 616 | 615,0,3,male,35,0,0,8.05,S 617 | 616,1,2,female,24,1,2,65,S 618 | 617,0,3,male,34,1,1,14.4,S 619 | 618,0,3,female,26,1,0,16.1,S 620 | 619,1,2,female,4,2,1,39,S 621 | 620,0,2,male,26,0,0,10.5,S 622 | 621,0,3,male,27,1,0,14.4542,C 623 | 622,1,1,male,42,1,0,52.5542,S 624 | 623,1,3,male,20,1,1,15.7417,C 625 | 624,0,3,male,21,0,0,7.8542,S 626 | 625,0,3,male,21,0,0,16.1,S 627 | 626,0,1,male,61,0,0,32.3208,S 628 | 627,0,2,male,57,0,0,12.35,Q 629 | 628,1,1,female,21,0,0,77.9583,S 630 | 629,0,3,male,26,0,0,7.8958,S 631 | 630,0,3,male,30.06240654,0,0,7.7333,Q 632 | 631,1,1,male,80,0,0,30,S 633 | 632,0,3,male,51,0,0,7.0542,S 634 | 633,1,1,male,32,0,0,30.5,C 635 | 634,0,1,male,45.66476822,0,0,0,S 636 | 
635,0,3,female,9,3,2,27.9,S 637 | 636,1,2,female,28,0,0,13,S 638 | 637,0,3,male,32,0,0,7.925,S 639 | 638,0,2,male,31,1,1,26.25,S 640 | 639,0,3,female,41,0,5,39.6875,S 641 | 640,0,3,male,28.96050453,1,0,16.1,S 642 | 641,0,3,male,20,0,0,7.8542,S 643 | 642,1,1,female,24,0,0,69.3,C 644 | 643,0,3,female,2,3,2,27.9,S 645 | 644,1,3,male,23.2102108,0,0,56.4958,S 646 | 645,1,3,female,0.75,2,1,19.2583,C 647 | 646,1,1,male,48,1,0,76.7292,C 648 | 647,0,3,male,19,0,0,7.8958,S 649 | 648,1,1,male,56,0,0,35.5,C 650 | 649,0,3,male,29.71520042,0,0,7.55,S 651 | 650,1,3,female,23,0,0,7.55,S 652 | 651,0,3,male,29.11478615,0,0,7.8958,S 653 | 652,1,2,female,18,0,1,23,S 654 | 653,0,3,male,21,0,0,8.4333,S 655 | 654,1,3,female,24.34621239,0,0,7.8292,Q 656 | 655,0,3,female,18,0,0,6.75,Q 657 | 656,0,2,male,24,2,0,73.5,S 658 | 657,0,3,male,29.11478615,0,0,7.8958,S 659 | 658,0,3,female,32,1,1,15.5,Q 660 | 659,0,2,male,23,0,0,13,S 661 | 660,0,1,male,58,0,2,113.275,C 662 | 661,1,1,male,50,2,0,133.65,S 663 | 662,0,3,male,40,0,0,7.225,C 664 | 663,0,1,male,47,0,0,25.5875,S 665 | 664,0,3,male,36,0,0,7.4958,S 666 | 665,1,3,male,20,1,0,7.925,S 667 | 666,0,2,male,32,2,0,73.5,S 668 | 667,0,2,male,25,0,0,13,S 669 | 668,0,3,male,29.11478615,0,0,7.775,S 670 | 669,0,3,male,43,0,0,8.05,S 671 | 670,1,1,female,39.10777664,1,0,52,S 672 | 671,1,2,female,40,1,1,39,S 673 | 672,0,1,male,31,1,0,52,S 674 | 673,0,2,male,70,0,0,10.5,S 675 | 674,1,2,male,31,0,0,13,S 676 | 675,0,2,male,34.65427399,0,0,0,S 677 | 676,0,3,male,18,0,0,7.775,S 678 | 677,0,3,male,24.5,0,0,8.05,S 679 | 678,1,3,female,18,0,0,9.8417,S 680 | 679,0,3,female,43,1,6,46.9,S 681 | 680,1,1,male,36,0,1,512.3292,C 682 | 681,0,3,female,29.3318119,0,0,8.1375,Q 683 | 682,1,1,male,27,0,0,76.7292,C 684 | 683,0,3,male,20,0,0,9.225,S 685 | 684,0,3,male,14,5,2,46.9,S 686 | 685,0,2,male,60,1,1,39,S 687 | 686,0,2,male,25,1,2,41.5792,C 688 | 687,0,3,male,14,4,1,39.6875,S 689 | 688,0,3,male,19,0,0,10.1708,S 690 | 689,0,3,male,18,0,0,7.7958,S 691 | 690,1,1,female,15,0,1,211.3375,S 692 | 691,1,1,male,31,1,0,57,S 693 | 692,1,3,female,4,0,1,13.4167,C 694 | 693,1,3,male,23.2102108,0,0,56.4958,S 695 | 694,0,3,male,25,0,0,7.225,C 696 | 695,0,1,male,60,0,0,26.55,S 697 | 696,0,2,male,52,0,0,13.5,S 698 | 697,0,3,male,44,0,0,8.05,S 699 | 698,1,3,female,24.94662666,0,0,7.7333,Q 700 | 699,0,1,male,49,1,1,110.8833,C 701 | 700,0,3,male,42,0,0,7.65,S 702 | 701,1,1,female,18,1,0,227.525,C 703 | 702,1,1,male,35,0,0,26.2875,S 704 | 703,0,3,female,18,0,1,14.4542,C 705 | 704,0,3,male,25,0,0,7.7417,Q 706 | 705,0,3,male,26,1,0,7.8542,S 707 | 706,0,2,male,39,0,0,26,S 708 | 707,1,2,female,45,0,0,13.5,S 709 | 708,1,1,male,42,0,0,26.2875,S 710 | 709,1,1,female,22,0,0,151.55,S 711 | 710,1,3,male,16.90238953,1,1,15.2458,C 712 | 711,1,1,female,24,0,0,49.5042,C 713 | 712,0,1,male,46.45271301,0,0,26.55,S 714 | 713,1,1,male,48,1,0,52,S 715 | 714,0,3,male,29,0,0,9.4833,S 716 | 715,0,2,male,52,0,0,13,S 717 | 716,0,3,male,19,0,0,7.65,S 718 | 717,1,1,female,38,0,0,227.525,C 719 | 718,1,2,female,27,0,0,10.5,S 720 | 719,0,3,male,30.21067619,0,0,15.5,Q 721 | 720,0,3,male,33,0,0,7.775,S 722 | 721,1,2,female,6,0,1,33,S 723 | 722,0,3,male,17,1,0,7.0542,S 724 | 723,0,2,male,34,0,0,13,S 725 | 724,0,2,male,50,0,0,13,S 726 | 725,1,1,male,27,1,0,53.1,S 727 | 726,0,3,male,20,0,0,8.6625,S 728 | 727,1,2,female,30,3,0,21,S 729 | 728,1,3,female,24.94662666,0,0,7.7375,Q 730 | 729,0,2,male,25,1,0,26,S 731 | 730,0,3,female,25,1,0,7.925,S 732 | 731,1,1,female,29,0,0,211.3375,S 733 | 732,0,3,male,11,0,0,18.7875,C 734 | 
733,0,2,male,34.65427399,0,0,0,S 735 | 734,0,2,male,23,0,0,13,S 736 | 735,0,2,male,23,0,0,13,S 737 | 736,0,3,male,28.5,0,0,16.1,S 738 | 737,0,3,female,48,1,3,34.375,S 739 | 738,1,1,male,35,0,0,512.3292,C 740 | 739,0,3,male,29.11478615,0,0,7.8958,S 741 | 740,0,3,male,29.11478615,0,0,7.8958,S 742 | 741,1,1,male,40.53792572,0,0,30,S 743 | 742,0,1,male,36,1,0,78.85,S 744 | 743,1,1,female,21,2,2,262.375,C 745 | 744,0,3,male,24,1,0,16.1,S 746 | 745,1,3,male,31,0,0,7.925,S 747 | 746,0,1,male,70,1,1,71,S 748 | 747,0,3,male,16,1,1,20.25,S 749 | 748,1,2,female,30,0,0,13,S 750 | 749,0,1,male,19,1,0,53.1,S 751 | 750,0,3,male,31,0,0,7.75,Q 752 | 751,1,2,female,4,1,1,23,S 753 | 752,1,3,male,6,0,1,12.475,S 754 | 753,0,3,male,33,0,0,9.5,S 755 | 754,0,3,male,23,0,0,7.8958,S 756 | 755,1,2,female,48,1,2,65,S 757 | 756,1,2,male,0.67,1,1,14.5,S 758 | 757,0,3,male,28,0,0,7.7958,S 759 | 758,0,2,male,18,0,0,11.5,S 760 | 759,0,3,male,34,0,0,8.05,S 761 | 760,1,1,female,33,0,0,86.5,S 762 | 761,0,3,male,29.86346817,0,0,14.5,S 763 | 762,0,3,male,41,0,0,7.125,S 764 | 763,1,3,male,20,0,0,7.2292,C 765 | 764,1,1,female,36,1,2,120,S 766 | 765,0,3,male,16,0,0,7.775,S 767 | 766,1,1,female,51,1,0,77.9583,S 768 | 767,0,1,male,43.40534973,0,0,39.6,C 769 | 768,0,3,female,30.5,0,0,7.75,Q 770 | 769,0,3,male,29.09642601,1,0,24.15,Q 771 | 770,0,3,male,32,0,0,8.3625,S 772 | 771,0,3,male,24,0,0,9.5,S 773 | 772,0,3,male,48,0,0,7.8542,S 774 | 773,0,2,female,57,0,0,10.5,S 775 | 774,0,3,male,27.99400139,0,0,7.225,C 776 | 775,1,2,female,54,1,3,23,S 777 | 776,0,3,male,18,0,0,7.75,S 778 | 777,0,3,male,30.46866989,0,0,7.75,Q 779 | 778,1,3,female,5,0,0,12.475,S 780 | 779,0,3,male,30.06240654,0,0,7.7375,Q 781 | 780,1,1,female,43,0,1,211.3375,S 782 | 781,1,3,female,13,0,0,7.2292,C 783 | 782,1,1,female,17,1,0,57,S 784 | 783,0,1,male,29,0,0,30,S 785 | 784,0,3,male,19.7046032,1,2,23.45,S 786 | 785,0,3,male,25,0,0,7.05,S 787 | 786,0,3,male,25,0,0,7.25,S 788 | 787,1,3,female,18,0,0,7.4958,S 789 | 788,0,3,male,8,4,1,29.125,Q 790 | 789,1,3,male,1,1,2,20.575,S 791 | 790,0,1,male,46,0,0,79.2,C 792 | 791,0,3,male,30.46866989,0,0,7.75,Q 793 | 792,0,2,male,16,0,0,26,S 794 | 793,0,3,female,7.838418007,8,2,69.55,S 795 | 794,0,1,male,43.93253326,0,0,30.6958,C 796 | 795,0,3,male,25,0,0,7.8958,S 797 | 796,0,2,male,39,0,0,13,S 798 | 797,1,1,female,49,0,0,25.9292,S 799 | 798,1,3,female,31,0,0,8.6833,S 800 | 799,0,3,male,30,0,0,7.2292,C 801 | 800,0,3,female,30,1,1,24.15,S 802 | 801,0,2,male,34,0,0,13,S 803 | 802,1,2,female,31,1,1,26.25,S 804 | 803,1,1,male,11,1,2,120,S 805 | 804,1,3,male,0.42,0,1,8.5167,C 806 | 805,1,3,male,27,0,0,6.975,S 807 | 806,0,3,male,31,0,0,7.775,S 808 | 807,0,1,male,39,0,0,0,S 809 | 808,0,3,female,18,0,0,7.775,S 810 | 809,0,2,male,39,0,0,13,S 811 | 810,1,1,female,33,1,0,53.1,S 812 | 811,0,3,male,26,0,0,7.8875,S 813 | 812,0,3,male,39,0,0,24.15,S 814 | 813,0,2,male,35,0,0,10.5,S 815 | 814,0,3,female,6,4,2,31.275,S 816 | 815,0,3,male,30.5,0,0,8.05,S 817 | 816,0,1,male,45.66476822,0,0,0,S 818 | 817,0,3,female,23,0,0,7.925,S 819 | 818,0,2,male,31,1,1,37.0042,C 820 | 819,0,3,male,43,0,0,6.45,S 821 | 820,0,3,male,10,3,2,27.9,S 822 | 821,1,1,female,52,1,1,93.5,S 823 | 822,1,3,male,27,0,0,8.6625,S 824 | 823,0,1,male,38,0,0,0,S 825 | 824,1,3,female,27,0,1,12.475,S 826 | 825,0,3,male,2,4,1,39.6875,S 827 | 826,0,3,male,30.06240654,0,0,6.95,Q 828 | 827,0,3,male,28.32599068,0,0,56.4958,S 829 | 828,1,2,male,1,0,2,37.0042,C 830 | 829,1,3,male,25.35289001,0,0,7.75,Q 831 | 830,1,1,female,62,0,0,80,S 832 | 831,1,3,female,15,1,0,14.4542,C 833 | 
832,1,2,male,0.83,1,1,18.75,S 834 | 833,0,3,male,27.99400139,0,0,7.2292,C 835 | 834,0,3,male,23,0,0,7.8542,S 836 | 835,0,3,male,18,0,0,8.3,S 837 | 836,1,1,female,39,1,1,83.1583,C 838 | 837,0,3,male,21,0,0,8.6625,S 839 | 838,0,3,male,29.31497765,0,0,8.05,S 840 | 839,1,3,male,32,0,0,56.4958,S 841 | 840,1,1,male,38.81673813,0,0,29.7,C 842 | 841,0,3,male,20,0,0,7.925,S 843 | 842,0,2,male,16,0,0,10.5,S 844 | 843,1,1,female,30,0,0,31,C 845 | 844,0,3,male,34.5,0,0,6.4375,C 846 | 845,0,3,male,17,0,0,8.6625,S 847 | 846,0,3,male,42,0,0,7.55,S 848 | 847,0,3,male,7.838418007,8,2,69.55,S 849 | 848,0,3,male,35,0,0,7.8958,C 850 | 849,0,2,male,28,0,1,33,S 851 | 850,1,1,female,34.40013123,1,0,89.1042,C 852 | 851,0,3,male,4,4,2,31.275,S 853 | 852,0,3,male,74,0,0,7.775,S 854 | 853,0,3,female,9,1,1,15.2458,C 855 | 854,1,1,female,16,0,1,39.4,S 856 | 855,0,2,female,44,1,0,26,S 857 | 856,1,3,female,18,0,1,9.35,S 858 | 857,1,1,female,45,1,1,164.8667,S 859 | 858,1,1,male,51,0,0,26.55,S 860 | 859,1,3,female,24,0,3,19.2583,C 861 | 860,0,3,male,27.99400139,0,0,7.2292,C 862 | 861,0,3,male,41,2,0,14.1083,S 863 | 862,0,2,male,21,1,0,11.5,S 864 | 863,1,1,female,48,0,0,25.9292,S 865 | 864,0,3,female,7.838418007,8,2,69.55,S 866 | 865,0,2,male,24,0,0,13,S 867 | 866,1,2,female,42,0,0,13,S 868 | 867,1,2,female,27,1,0,13.8583,C 869 | 868,0,1,male,31,0,0,50.4958,S 870 | 869,0,3,male,28.5287323,0,0,9.5,S 871 | 870,1,3,male,4,1,1,11.1333,S 872 | 871,0,3,male,26,0,0,7.8958,S 873 | 872,1,1,female,47,1,1,52.5542,S 874 | 873,0,1,male,33,0,0,5,S 875 | 874,0,3,male,47,0,0,9,S 876 | 875,1,2,female,28,1,0,24,C 877 | 876,1,3,female,15,0,0,7.225,C 878 | 877,0,3,male,20,0,0,9.8458,S 879 | 878,0,3,male,19,0,0,7.8958,S 880 | 879,0,3,male,29.11478615,0,0,7.8958,S 881 | 880,1,1,female,56,0,1,83.1583,C 882 | 881,1,2,female,25,0,1,26,S 883 | 882,0,3,male,33,0,0,7.8958,S 884 | 883,0,3,female,22,0,0,10.5167,S 885 | 884,0,2,male,28,0,0,10.5,S 886 | 885,0,3,male,25,0,0,7.05,S 887 | 886,0,3,female,39,0,5,29.125,Q 888 | 887,0,2,male,27,0,0,13,S 889 | 888,1,1,female,19,0,0,30,S 890 | 889,0,3,female,19.7046032,1,2,23.45,S 891 | 890,1,1,male,26,0,0,30,C 892 | 891,0,3,male,32,0,0,7.75,Q 893 | -------------------------------------------------------------------------------- /1. 
Data Exploration/ReadMe.md: --------------------------------------------------------------------------------

- **Author:** 马肖
- **E-Mail:** maxiaoscut@aliyun.com
- **GitHub:** https://github.com/Albertsr
---

### Data exploration examples (click the links below)
##### [1. Exploring the Titanic dataset](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/1.%20Data%20Exploration/%E4%B8%93%E9%A2%981%EF%BC%9A%E5%88%86%E7%B1%BB%E9%97%AE%E9%A2%98%E7%9A%84%E6%95%B0%E6%8D%AE%E6%8E%A2%E7%B4%A2%28%E4%BB%A5Titanic%E6%95%B0%E6%8D%AE%E9%9B%86%E4%B8%BA%E4%BE%8B%29.ipynb)

##### [2. Correlation matrix and heatmap for a regression problem (Boston dataset)](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/1.%20Data%20Exploration/%E4%B8%93%E9%A2%982%EF%BC%9A%E5%9B%9E%E5%BD%92%E9%97%AE%E9%A2%98%E4%B8%AD%E7%9A%84%E7%9B%B8%E5%85%B3%E7%B3%BB%E6%95%B0%E7%9F%A9%E9%98%B5%E4%B8%8E%E7%83%AD%E5%8A%9B%E5%9B%BE%28%E4%BB%A5Boston%E6%95%B0%E6%8D%AE%E9%9B%86%E4%B8%BA%E4%BE%8B%29.ipynb)

##### [3. Exploring the iris dataset](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/1.%20Data%20Exploration/%E4%B8%93%E9%A2%983%EF%BC%9A%E5%AF%B9iris%E6%95%B0%E6%8D%AE%E9%9B%86%E8%BF%9B%E8%A1%8C%E6%95%B0%E6%8D%AE%E6%8E%A2%E7%B4%A2.ipynb)

---

### A basic data-exploration workflow (Titanic dataset as the example)
*(A consolidated, runnable sketch of these steps appears at the end of this file.)*

#### 1.1 Exploring discrete features
- Passenger counts for each cabin class (Pclass)
  - train["Pclass"].value_counts()
  - sns.countplot(x='Pclass', data=train)

- Effect of cabin class on the survival rate
  - train.pivot_table(index=["Pclass"], values=["Survived"], aggfunc="mean")
  - sns.barplot(x='Pclass', y='Survived', data=train)

- Joint exploration with other discrete variables
  - Pclass and Sex together: in every cabin class, women have a markedly higher survival rate
    - train.pivot_table(index=["Pclass"], values=["Survived"], columns=["Sex"])
    - sns.barplot(x='Pclass', y='Survived', data=train, hue='Sex', dodge=True)

  - Pclass and Embarked together: passengers who embarked at S have the lowest survival rate; those who embarked at C do slightly better than those from Q
    - train.pivot_table(index=["Pclass"], values=["Survived"], columns=["Embarked"])
    - sns.barplot(x='Pclass', y='Survived', data=train, hue='Embarked', dodge=True)

---

#### 1.2 Exploring continuous features

- The box plot shows that Fare contains some extreme outliers; cap these values at the 95th percentile
  - sns.boxplot(train["Fare"], ax=axes[0])
  - Fare_per_95 = np.percentile(train["Fare"], 95)
  - train.loc[train["Fare"] >= Fare_per_95, "Fare"] = Fare_per_95

- KDE curves of a continuous variable reveal how it relates to the class label, and can also guide its later discretization (binning)
  - Age density for survivors vs. non-survivors
    - sns.distplot(train['Age'][train['Survived']==0], ax=axes[0])
    - sns.distplot(train['Age'][train['Survived']==1], ax=axes[0])

  - Fare density for survivors vs. non-survivors
    - sns.distplot(train['Fare'][train['Survived']==0], ax=axes[1])
    - sns.distplot(train['Fare'][train['Survived']==1], ax=axes[1])

- Some features are nominally continuous but take only a few, unevenly distributed values, so they can be treated as discrete
  - e.g. the family-size features
    - sns.barplot(x="Parch", y="Survived", data=train)
    - sns.barplot(x="SibSp", y="Survived", data=train)

---

#### 1.3 Joint exploration of discrete and continuous features

- For both sexes, average age drops as the Pclass number rises (first-class passengers are the oldest); within every class, men are older than women on average
  - train.pivot_table(index=["Pclass"], values=["Age", 'Fare'], columns=["Sex"])
  - sns.barplot(x='Pclass', y='Age', data=train, hue="Sex", dodge=True, errwidth=2)

- Fare rises with cabin class (first class pays the most); within every class, women pay higher fares than men on average
  - train.pivot_table(index=["Pclass"], values=["Fare"], columns=["Sex"])
  - sns.barplot(x='Pclass', y='Fare', data=train, hue="Sex", dodge=True, errwidth=2)
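
The bullets above are fragments lifted from the notebook. As a quick reference, here is a minimal, self-contained sketch that strings the same steps together; it assumes the data is read from the `DEA.csv` sample in this folder, that pandas/numpy/matplotlib/seaborn are installed, and it uses `sns.kdeplot` in place of the older `sns.distplot` calls shown above.

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

train = pd.read_csv("DEA.csv")

# 1.1 Discrete features: counts per class, survival rate by class / sex / port
print(train["Pclass"].value_counts())
print(train.pivot_table(index=["Pclass"], values=["Survived"], columns=["Sex"]))

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
sns.barplot(x="Pclass", y="Survived", hue="Sex", data=train, ax=axes[0])
sns.barplot(x="Pclass", y="Survived", hue="Embarked", data=train, ax=axes[1])

# 1.2 Continuous features: cap Fare outliers at the 95th percentile, then
# compare the Age / Fare densities of survivors vs. non-survivors
fare_p95 = np.percentile(train["Fare"], 95)
train.loc[train["Fare"] >= fare_p95, "Fare"] = fare_p95

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for survived in (0, 1):
    sns.kdeplot(train.loc[train["Survived"] == survived, "Age"],
                ax=axes[0], label=f"Survived={survived}")
    sns.kdeplot(train.loc[train["Survived"] == survived, "Fare"],
                ax=axes[1], label=f"Survived={survived}")
axes[0].legend()
axes[1].legend()

# 1.3 Discrete x continuous: average Age and Fare per class and sex
print(train.pivot_table(index=["Pclass"], values=["Age", "Fare"], columns=["Sex"]))
plt.show()
```
-------------------------------------------------------------------------------- /2. 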
Data Preprocessing/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /2. Data Preprocessing/专题1:数据的标准化、归一化与正则化.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 1 数据的标准化" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### 1.1 scale:sklearn.preprocessing.scale(X, axis=0, with_mean=True, with_std=True, copy=True)\n", 15 | "- with_mean : boolean, True by default, If True, center the data before scaling. 即使得对应axis上的均值为0\n", 16 | "- with_std : boolean, True by default,If True, scale the data to unit variance. 即使得对应axis上的方差为1" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": { 23 | "scrolled": true 24 | }, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "Mean: [0 0 0], \n", 31 | "Std: [1. 1. 1.]\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "import numpy as np\n", 37 | "from sklearn.preprocessing import * \n", 38 | "\n", 39 | "rg = np.random.RandomState(2017)\n", 40 | "X_train = rg.uniform(0, 5, (4,3))\n", 41 | "X_scaled = scale(X_train)\n", 42 | "print('Mean: {}, \\nStd: {}'.format(X_scaled.mean(axis=0, dtype=np.int), X_scaled.std(axis=0)))" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "#### scale的参数axis=0,表示对每列进行标准化,即每列减去此列均值再除以其方差" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "metadata": { 56 | "scrolled": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "def f(array):\n", 61 | " result = (array - np.mean(array)) / np.std(array, ddof=0) # ddof默认为0\n", 62 | " return result\n", 63 | "\n", 64 | "scale_result = np.apply_along_axis(f, axis=0, arr=X_train)\n", 65 | "assert np.allclose(X_scaled, scale_result)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "### 1.2 StandardScaler:sklearn.preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)\n", 73 | "\n", 74 | "可通过fit方法获取某特征的均值与方差,再运用transform方法标准化其他特征\n", 75 | "\n", 76 | "#### 优点:\n", 77 | "1)提升模型的收敛速度\n", 78 | "\n", 79 | "2)使得各指标值都处于同一个量纲上,提升模型的精度" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 3, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/html": [ 90 | "
\n", 91 | "\n", 104 | "\n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | "
sepal length (cm)sepal width (cm)petal length (cm)petal width (cm)
1436.83.25.92.3
1156.43.25.32.3
1027.13.05.92.1
516.43.24.51.5
766.82.84.81.4
\n", 152 | "
" 153 | ], 154 | "text/plain": [ 155 | " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)\n", 156 | "143 6.8 3.2 5.9 2.3\n", 157 | "115 6.4 3.2 5.3 2.3\n", 158 | "102 7.1 3.0 5.9 2.1\n", 159 | "51 6.4 3.2 4.5 1.5\n", 160 | "76 6.8 2.8 4.8 1.4" 161 | ] 162 | }, 163 | "execution_count": 3, 164 | "metadata": {}, 165 | "output_type": "execute_result" 166 | } 167 | ], 168 | "source": [ 169 | "import numpy as np\n", 170 | "import pandas as pd\n", 171 | "from sklearn.datasets import load_iris \n", 172 | "\n", 173 | "dataset = load_iris()\n", 174 | "np.random.seed(2017)\n", 175 | "iris = pd.DataFrame(dataset.data, columns=dataset.feature_names).sample(5)\n", 176 | "iris" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "#### 对某数据框直接调用fit_transform时,等价于单独对每列分别进行scale操作" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 4, 189 | "metadata": { 190 | "scrolled": false 191 | }, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "array([[ 0.372678 , 0.75 , 1.0932857 , 0.96958969],\n", 197 | " [-1.11803399, 0.75 , 0.03526728, 0.96958969],\n", 198 | " [ 1.49071198, -0.5 , 1.0932857 , 0.45927933],\n", 199 | " [-1.11803399, 0.75 , -1.37542395, -1.07165176],\n", 200 | " [ 0.372678 , -1.75 , -0.84641474, -1.32680694]])" 201 | ] 202 | }, 203 | "execution_count": 4, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "scaler = StandardScaler()\n", 210 | "iris_scaled = scaler.fit_transform(iris) \n", 211 | "iris_scaled" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 5, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "(array([0, 0, 0, 0]), array([1., 1., 1., 1.]))" 223 | ] 224 | }, 225 | "execution_count": 5, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "iris_scaled.mean(axis=0, dtype=np.int), iris_scaled.std(axis=0)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 6, 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "data": { 241 | "text/plain": [ 242 | "True" 243 | ] 244 | }, 245 | "execution_count": 6, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "# 等价于对每列单独调用scale\n", 252 | "np.allclose(scaler.fit_transform(iris), scale(iris))" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": {}, 258 | "source": [ 259 | "### 1.2 数据的归一化:将数据映射到指定的范围,用于去除不同维度数据的量纲以及量纲单位" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "##### sklearn.preprocessing.MinMaxScaler(feature_range=(0, 1), copy=True)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "#### 转换过程\n", 274 | "X_std = (X - X.min(axis=0)) / (X.max(axis=0) - X.min(axis=0))\n", 275 | "\n", 276 | "X_scaled = X_std * (max - min) + min" 277 | ] 278 | }, 279 | { 280 | "cell_type": "code", 281 | "execution_count": 7, 282 | "metadata": {}, 283 | "outputs": [ 284 | { 285 | "data": { 286 | "text/plain": [ 287 | "MinMaxScaler(copy=True, feature_range=(0, 1))" 288 | ] 289 | }, 290 | "execution_count": 7, 291 | "metadata": {}, 292 | "output_type": "execute_result" 293 | } 294 | ], 295 | "source": [ 296 | "data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]\n", 297 | "scaler = MinMaxScaler()\n", 298 | "scaler.fit(data)" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | 
"execution_count": 8, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "data": { 308 | "text/plain": [ 309 | "array([[0. , 0. ],\n", 310 | " [0.25, 0.25],\n", 311 | " [0.5 , 0.5 ],\n", 312 | " [1. , 1. ]])" 313 | ] 314 | }, 315 | "execution_count": 8, 316 | "metadata": {}, 317 | "output_type": "execute_result" 318 | } 319 | ], 320 | "source": [ 321 | "scaler.transform(data)" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 9, 327 | "metadata": { 328 | "scrolled": true 329 | }, 330 | "outputs": [ 331 | { 332 | "data": { 333 | "text/plain": [ 334 | "array([[0. , 0. ],\n", 335 | " [0.25, 0.25],\n", 336 | " [0.5 , 0.5 ],\n", 337 | " [1. , 1. ]])" 338 | ] 339 | }, 340 | "execution_count": 9, 341 | "metadata": {}, 342 | "output_type": "execute_result" 343 | } 344 | ], 345 | "source": [ 346 | "# 或者直接调用fit_transform \n", 347 | "scaler.fit_transform(data)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "### 1.3 MaxAbsScaler" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 10, 360 | "metadata": { 361 | "scrolled": true 362 | }, 363 | "outputs": [ 364 | { 365 | "data": { 366 | "text/plain": [ 367 | "array([[ 0.5, -1. , 1. ],\n", 368 | " [ 1. , 0. , 0. ],\n", 369 | " [ 0. , 1. , -0.5]])" 370 | ] 371 | }, 372 | "execution_count": 10, 373 | "metadata": {}, 374 | "output_type": "execute_result" 375 | } 376 | ], 377 | "source": [ 378 | "X_train = np.array([[ 1., -1., 2.],\n", 379 | " [ 2., 0., 0.],\n", 380 | " [ 0., 1., -1.]])\n", 381 | "\n", 382 | "max_abs_scaler = MaxAbsScaler()\n", 383 | "X_train_maxabs = max_abs_scaler.fit_transform(X_train)\n", 384 | "X_train_maxabs " 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 11, 390 | "metadata": { 391 | "scrolled": true 392 | }, 393 | "outputs": [ 394 | { 395 | "data": { 396 | "text/plain": [ 397 | "array([2., 1., 2.])" 398 | ] 399 | }, 400 | "execution_count": 11, 401 | "metadata": {}, 402 | "output_type": "execute_result" 403 | } 404 | ], 405 | "source": [ 406 | "max_abs_scaler.scale_ " 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 12, 412 | "metadata": { 413 | "scrolled": true 414 | }, 415 | "outputs": [ 416 | { 417 | "data": { 418 | "text/plain": [ 419 | "array([[-1.5, -1. , 2. 
]])" 420 | ] 421 | }, 422 | "execution_count": 12, 423 | "metadata": {}, 424 | "output_type": "execute_result" 425 | } 426 | ], 427 | "source": [ 428 | "X_test = np.array([[ -3., -1., 4.]])\n", 429 | "X_test_maxabs = max_abs_scaler.transform(X_test)\n", 430 | "X_test_maxabs " 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "### 1.4 RobustScaler" 438 | ] 439 | }, 440 | { 441 | "cell_type": "markdown", 442 | "metadata": {}, 443 | "source": [ 444 | "#### 转化过程:(x-median) / IQR, IQR等于75分位点减去25分位点处的值" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": 13, 450 | "metadata": {}, 451 | "outputs": [ 452 | { 453 | "data": { 454 | "text/plain": [ 455 | "array([[-0.12669367, 0.61018033, 1.22235048],\n", 456 | " [-2.02310438, -0.03205827, 0.34615926],\n", 457 | " [ 0.12669367, -3.19747007, -0.70069397],\n", 458 | " [ 1.2167336 , 0.03205827, -0.34615926]])" 459 | ] 460 | }, 461 | "execution_count": 13, 462 | "metadata": {}, 463 | "output_type": "execute_result" 464 | } 465 | ], 466 | "source": [ 467 | "np.random.seed(2018)\n", 468 | "X_train = np.random.randn(4,3)\n", 469 | "\n", 470 | "max_abs_scaler = RobustScaler()\n", 471 | "X_train_maxabs = max_abs_scaler.fit_transform(X_train)\n", 472 | "X_train_maxabs " 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 14, 478 | "metadata": {}, 479 | "outputs": [ 480 | { 481 | "data": { 482 | "text/plain": [ 483 | "array([-0.20977884, 0.50624895, 0.34544916])" 484 | ] 485 | }, 486 | "execution_count": 14, 487 | "metadata": {}, 488 | "output_type": "execute_result" 489 | } 490 | ], 491 | "source": [ 492 | "# 求各列的中位数\n", 493 | "max_abs_scaler.center_ " 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 15, 499 | "metadata": { 500 | "scrolled": false 501 | }, 502 | "outputs": [ 503 | { 504 | "data": { 505 | "text/plain": [ 506 | "array([0.52874591, 0.12390117, 1.47498622])" 507 | ] 508 | }, 509 | "execution_count": 15, 510 | "metadata": {}, 511 | "output_type": "execute_result" 512 | } 513 | ], 514 | "source": [ 515 | "# 求各列IQR值\n", 516 | "max_abs_scaler.scale_" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 16, 522 | "metadata": { 523 | "scrolled": false 524 | }, 525 | "outputs": [ 526 | { 527 | "data": { 528 | "text/plain": [ 529 | "True" 530 | ] 531 | }, 532 | "execution_count": 16, 533 | "metadata": {}, 534 | "output_type": "execute_result" 535 | } 536 | ], 537 | "source": [ 538 | "# 验证max_abs_scaler.scale_返回的是否为IQR值\n", 539 | "IQR = np.percentile(X_train, 75, axis=0) - np.percentile(X_train, 25, axis=0)\n", 540 | "np.allclose(max_abs_scaler.scale_ ,IQR)" 541 | ] 542 | }, 543 | { 544 | "cell_type": "markdown", 545 | "metadata": {}, 546 | "source": [ 547 | "## 2 正则化" 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": {}, 553 | "source": [ 554 | "### 2.1 L1正则化:每行各元素除以每行的L1范数" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 17, 560 | "metadata": { 561 | "scrolled": false 562 | }, 563 | "outputs": [ 564 | { 565 | "name": "stdout", 566 | "output_type": "stream", 567 | "text": [ 568 | "L1正则化:\n" 569 | ] 570 | }, 571 | { 572 | "data": { 573 | "text/html": [ 574 | "
\n", 575 | "\n", 588 | "\n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | "
012
00.25-0.250.5
11.000.000.0
20.000.50-0.5
\n", 618 | "
" 619 | ], 620 | "text/plain": [ 621 | " 0 1 2\n", 622 | "0 0.25 -0.25 0.5\n", 623 | "1 1.00 0.00 0.0\n", 624 | "2 0.00 0.50 -0.5" 625 | ] 626 | }, 627 | "execution_count": 17, 628 | "metadata": {}, 629 | "output_type": "execute_result" 630 | } 631 | ], 632 | "source": [ 633 | "x = [[1,-1,2],[2, 0,0],[0, 1, -1]]\n", 634 | "df = pd.DataFrame(x, columns=list('ABC'))\n", 635 | "\n", 636 | "x_norm1 = normalize(x, norm='l1')\n", 637 | "df_norm1 = pd.DataFrame(x_norm1)\n", 638 | "print('L1正则化:')\n", 639 | "df_norm1" 640 | ] 641 | }, 642 | { 643 | "cell_type": "code", 644 | "execution_count": 18, 645 | "metadata": { 646 | "scrolled": true 647 | }, 648 | "outputs": [ 649 | { 650 | "data": { 651 | "text/html": [ 652 | "
\n", 653 | "\n", 666 | "\n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | "
ABC
00.25-0.250.5
11.000.000.0
20.000.50-0.5
\n", 696 | "
" 697 | ], 698 | "text/plain": [ 699 | " A B C\n", 700 | "0 0.25 -0.25 0.5\n", 701 | "1 1.00 0.00 0.0\n", 702 | "2 0.00 0.50 -0.5" 703 | ] 704 | }, 705 | "execution_count": 18, 706 | "metadata": {}, 707 | "output_type": "execute_result" 708 | } 709 | ], 710 | "source": [ 711 | "df_norm1 = df.copy()\n", 712 | "for idx in df.index:\n", 713 | " l1_row = sum(abs(df.iloc[idx]))\n", 714 | " df_norm1.iloc[idx] = df.iloc[idx] / l1_row\n", 715 | " \n", 716 | "df_norm1 " 717 | ] 718 | }, 719 | { 720 | "cell_type": "markdown", 721 | "metadata": {}, 722 | "source": [ 723 | "### 2.2 L2正则化:每行各元素除以每行的L2范数" 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": 19, 729 | "metadata": {}, 730 | "outputs": [ 731 | { 732 | "data": { 733 | "text/html": [ 734 | "
\n", 735 | "\n", 748 | "\n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | "
ABC
01-12
1200
201-1
\n", 778 | "
" 779 | ], 780 | "text/plain": [ 781 | " A B C\n", 782 | "0 1 -1 2\n", 783 | "1 2 0 0\n", 784 | "2 0 1 -1" 785 | ] 786 | }, 787 | "execution_count": 19, 788 | "metadata": {}, 789 | "output_type": "execute_result" 790 | } 791 | ], 792 | "source": [ 793 | "x = [[1,-1,2],[2, 0,0],[0, 1, -1]]\n", 794 | "df = pd.DataFrame(x, columns=list('ABC'))\n", 795 | "df" 796 | ] 797 | }, 798 | { 799 | "cell_type": "code", 800 | "execution_count": 20, 801 | "metadata": {}, 802 | "outputs": [ 803 | { 804 | "data": { 805 | "text/html": [ 806 | "
\n", 807 | "\n", 820 | "\n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | "
012
00.408248-0.4082480.816497
11.0000000.0000000.000000
20.0000000.707107-0.707107
\n", 850 | "
" 851 | ], 852 | "text/plain": [ 853 | " 0 1 2\n", 854 | "0 0.408248 -0.408248 0.816497\n", 855 | "1 1.000000 0.000000 0.000000\n", 856 | "2 0.000000 0.707107 -0.707107" 857 | ] 858 | }, 859 | "execution_count": 20, 860 | "metadata": {}, 861 | "output_type": "execute_result" 862 | } 863 | ], 864 | "source": [ 865 | "x_norm2 = normalize(x, norm='l2')\n", 866 | "df_norm2 = pd.DataFrame(x_norm2)\n", 867 | "df_norm2" 868 | ] 869 | }, 870 | { 871 | "cell_type": "code", 872 | "execution_count": 21, 873 | "metadata": {}, 874 | "outputs": [ 875 | { 876 | "data": { 877 | "text/html": [ 878 | "
\n", 879 | "\n", 892 | "\n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | "
ABC
00.408248-0.4082480.816497
11.0000000.0000000.000000
20.0000000.707107-0.707107
\n", 922 | "
" 923 | ], 924 | "text/plain": [ 925 | " A B C\n", 926 | "0 0.408248 -0.408248 0.816497\n", 927 | "1 1.000000 0.000000 0.000000\n", 928 | "2 0.000000 0.707107 -0.707107" 929 | ] 930 | }, 931 | "execution_count": 21, 932 | "metadata": {}, 933 | "output_type": "execute_result" 934 | } 935 | ], 936 | "source": [ 937 | "df_norm2 = df.copy()\n", 938 | "for idx in df.index:\n", 939 | " l2_row = np.sqrt(sum(np.square(df.iloc[idx])))\n", 940 | " df_norm2.iloc[idx] = df.iloc[idx] / l2_row\n", 941 | "\n", 942 | "df_norm2 " 943 | ] 944 | } 945 | ], 946 | "metadata": { 947 | "kernelspec": { 948 | "display_name": "Python 3", 949 | "language": "python", 950 | "name": "python3" 951 | }, 952 | "language_info": { 953 | "codemirror_mode": { 954 | "name": "ipython", 955 | "version": 3 956 | }, 957 | "file_extension": ".py", 958 | "mimetype": "text/x-python", 959 | "name": "python", 960 | "nbconvert_exporter": "python", 961 | "pygments_lexer": "ipython3", 962 | "version": "3.6.6" 963 | } 964 | }, 965 | "nbformat": 4, 966 | "nbformat_minor": 2 967 | } 968 | -------------------------------------------------------------------------------- /2. Data Preprocessing/专题2:One-Hot编码.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### 3.1 One-Hot 编码" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### 每一列特征需要构建的状态寄存器的位数等于该列特征独立取值的个数\n", 15 | "#### 使用N位状态寄存器来对N个状态进行编码,每个状态都由他独立的寄存器位,并且在任意时候,其中只有一位有效。" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "import numpy as np\n", 27 | "import pandas as pd\n", 28 | "from sklearn import preprocessing" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "data": { 38 | "text/plain": [ 39 | "array([[ 1., 0., 1., 0., 0., 0., 0., 0., 1.],\n", 40 | " [ 0., 1., 0., 1., 0., 1., 0., 0., 0.],\n", 41 | " [ 1., 0., 0., 0., 1., 0., 1., 0., 0.],\n", 42 | " [ 0., 1., 1., 0., 0., 0., 0., 1., 0.]])" 43 | ] 44 | }, 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "output_type": "execute_result" 48 | } 49 | ], 50 | "source": [ 51 | "enc = preprocessing.OneHotEncoder()\n", 52 | "enc.fit_transform([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]).toarray() \n", 53 | "#第一列有两个状态:0、1,需要2位状态寄存器来进行编码,[1,0]表示0,[0,1]表示1\n", 54 | "#第二列有三个状态:0、1、2,需要3位状态寄存器来进行编码,[1,0,0]表示0,[0,1,0]表示1,[0,0,1]表示2\n", 55 | "#第三列有四个状态:0、1、2、3,需要4位状态寄存器来进行编码,[1,0,0,0]表示0,[0,1,0,0]表示1,[0,0,1,0]表示2,[0,0,0,1]表示3" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 4, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "array([[ 1., 0., 0., 1., 0., 0., 0., 0., 1.]])" 67 | ] 68 | }, 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "enc.transform([[0, 1, 3]]).toarray()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "#### 如果训练集中有丢失的分类特征值,必须显式地设置n_values" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "OneHotEncoder(categorical_features='all', dtype=,\n", 94 | " handle_unknown='error', n_values=[2, 3, 4], sparse=True)" 95 | ] 96 | }, 97 | "execution_count": 4, 98 | "metadata": {}, 99 | 
"output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "# n_values=[2, 3, 4]表示第1/2/3列分别有2/3/4个独立取值\n", 104 | "enc = preprocessing.OneHotEncoder(n_values=[2, 3, 4]) \n", 105 | "enc.fit([[1, 2, 3], [0, 2, 0]]) " 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": 5, 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/plain": [ 116 | "array([[0., 1., 1., 0., 0., 1., 0., 0., 0.]])" 117 | ] 118 | }, 119 | "execution_count": 5, 120 | "metadata": {}, 121 | "output_type": "execute_result" 122 | } 123 | ], 124 | "source": [ 125 | "enc.transform([[1, 0, 0]]).toarray()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "### 3.2 pandas.get_dummies构造哑变量" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "##### pandas.get_dummies(data, prefix=None, prefixsep='', dummy_na=False, columns=None, sparse=False, drop_first=False)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 6, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "data": { 149 | "text/html": [ 150 | "
\n", 151 | "\n", 164 | "\n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | "
ExplorerNationQuantity
0FirefoxCN1
1ChromeUS2
2SafariUK3
\n", 194 | "
" 195 | ], 196 | "text/plain": [ 197 | " Explorer Nation Quantity\n", 198 | "0 Firefox CN 1\n", 199 | "1 Chrome US 2\n", 200 | "2 Safari UK 3" 201 | ] 202 | }, 203 | "execution_count": 6, 204 | "metadata": {}, 205 | "output_type": "execute_result" 206 | } 207 | ], 208 | "source": [ 209 | "dict_ = {'Nation': ['CN', 'US', 'UK'], 'Explorer': ['Firefox','Chrome','Safari'], 'Quantity': [1, 2, 3]}\n", 210 | "df = pd.DataFrame(dict_)\n", 211 | "df" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 7, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/html": [ 222 | "
\n", 223 | "\n", 236 | "\n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | "
QuantityExplorer_ChromeExplorer_FirefoxExplorer_SafariNation_CNNation_UKNation_US
01010100
12100001
23001010
\n", 282 | "
" 283 | ], 284 | "text/plain": [ 285 | " Quantity Explorer_Chrome Explorer_Firefox Explorer_Safari Nation_CN \\\n", 286 | "0 1 0 1 0 1 \n", 287 | "1 2 1 0 0 0 \n", 288 | "2 3 0 0 1 0 \n", 289 | "\n", 290 | " Nation_UK Nation_US \n", 291 | "0 0 0 \n", 292 | "1 0 1 \n", 293 | "2 1 0 " 294 | ] 295 | }, 296 | "execution_count": 7, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "df_dummies = pd.get_dummies(df, prefix=['Explorer', 'Nation'], prefix_sep='_') \n", 303 | "df_dummies" 304 | ] 305 | } 306 | ], 307 | "metadata": { 308 | "kernelspec": { 309 | "display_name": "Python 3", 310 | "language": "python", 311 | "name": "python3" 312 | }, 313 | "language_info": { 314 | "codemirror_mode": { 315 | "name": "ipython", 316 | "version": 3 317 | }, 318 | "file_extension": ".py", 319 | "mimetype": "text/x-python", 320 | "name": "python", 321 | "nbconvert_exporter": "python", 322 | "pygments_lexer": "ipython3", 323 | "version": "3.6.4" 324 | } 325 | }, 326 | "nbformat": 4, 327 | "nbformat_minor": 2 328 | } 329 | -------------------------------------------------------------------------------- /2. Data Preprocessing/专题4:共线性问题.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### 共线性对模型有如下4种效应" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "- 参数的估计值变得不准确\n", 15 | "- 参数估计值的标准差变大\n", 16 | "- 参数显著性检验变得不准确,容易将重要的自变量误判为不显著,即针对模型参数的假设检验变得不准确\n", 17 | "- 对于已知数据,模型的预测效果几乎不受影响" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": 1, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import numpy as np\n", 27 | "from sklearn.linear_model import Ridge, LinearRegression\n", 28 | "from sklearn.decomposition import PCA\n", 29 | "from sklearn import datasets\n", 30 | "\n", 31 | "\n", 32 | "dataset = datasets.make_classification(n_samples=5000, n_features=15, n_informative=5, n_redundant=2, n_repeated=2, \n", 33 | " n_classes=2, n_clusters_per_class=2, shuffle=True, random_state=2018)\n", 34 | "X, y = dataset[0], dataset[1]" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 3, 40 | "metadata": {}, 41 | "outputs": [ 42 | { 43 | "data": { 44 | "text/plain": [ 45 | "(-0.5593271141268006, 0.0)" 46 | ] 47 | }, 48 | "execution_count": 3, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "import scipy.stats.stats as scss\n", 55 | "\n", 56 | "# 第一个值是相关系数,第二个值是P值\n", 57 | "scss.pearsonr(X[:, 0], X[:, 1])" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 4, 63 | "metadata": {}, 64 | "outputs": [ 65 | { 66 | "data": { 67 | "text/plain": [ 68 | "SpearmanrResult(correlation=-0.48175831850233264, pvalue=5.732959707734371e-289)" 69 | ] 70 | }, 71 | "execution_count": 4, 72 | "metadata": {}, 73 | "output_type": "execute_result" 74 | } 75 | ], 76 | "source": [ 77 | "scss.spearmanr(X[:, 0], X[:, 1])" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "### 1 岭回归(L2正则项)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "- 在线性回归模型中加入L2惩罚项能比较有效地解决共线性问题\n", 92 | "- 岭回归是一种可用于共线性问题的有偏估计回归方法,实质上是一种改良的最小二乘估计法\n", 93 | "- 它通过放弃最小二乘法的无偏性,以损失部分信息、降低精读为代价来获得更实际和可靠性更强的回归系数" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": 
"stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "coef:\n", 106 | " [-0.00836161 -0.10320428 -0.01964247 -0.00560743 0.02988531 -0.00410644\n", 107 | " 0.00484214 -0.03720753 0.02489295 0.02489295 0.00527641 0.06280543\n", 108 | " -0.00212993 -0.03720753 -0.00726083]\n", 109 | "\n", 110 | "intercept:\n", 111 | " 0.6222527142570053\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "model_ridge = Ridge(alpha=1.0)\n", 117 | "model_ridge.fit(X, y)\n", 118 | "print('coef:\\n', model_ridge.coef_)\n", 119 | "print('\\nintercept:\\n', model_ridge.intercept_)" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "metadata": {}, 125 | "source": [ 126 | "### 2 PCA" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "- 通过主成分分析,将原始特征转换为少数几个主成分,每个主成分是原特征的线性组合\n", 134 | "- 基于主成分做回归分析,可以在不丢失重要数据特征的前提下避开共线性问题" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 6, 140 | "metadata": {}, 141 | "outputs": [ 142 | { 143 | "name": "stdout", 144 | "output_type": "stream", 145 | "text": [ 146 | "ratio_cumsum:\n", 147 | " [0.29512698 0.48842363 0.62481032 0.74903408 0.79338958 0.82974707\n", 148 | " 0.865346 0.90040382 0.93489478 0.96783082 1. 1.\n", 149 | " 1. 1. 1. ]\n", 150 | "\n", 151 | "rule_index: \n", 152 | " (array([ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], dtype=int64),)\n", 153 | "\n", 154 | "shape:\n", 155 | " (5000, 6)\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "model_pca = PCA()\n", 161 | "data_pca = model_pca.fit_transform(X)\n", 162 | "\n", 163 | "# np.cumsum(a, axis=None, dtype=None, out=None)可以在指定的axis上累计求和\n", 164 | "ratio_cumsum = np.cumsum(model_pca.explained_variance_ratio_)\n", 165 | "print('ratio_cumsum:\\n', ratio_cumsum)\n", 166 | "\n", 167 | "# 获取主成分方差占比累积大于0.8的值索引\n", 168 | "rule_index = np.where(ratio_cumsum > 0.8)\n", 169 | "print('\\nrule_index: \\n', rule_index)\n", 170 | "\n", 171 | "# rule_index是一个元组,min_index为其最小值\n", 172 | "min_index = rule_index[0][0]\n", 173 | "\n", 174 | "# 获取data_pca的所有行,前min_index列.\n", 175 | "data_pca_result = data_pca[:, :(min_index + 1)]\n", 176 | "print('\\nshape:\\n', data_pca_result.shape)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 7, 182 | "metadata": {}, 183 | "outputs": [ 184 | { 185 | "name": "stdout", 186 | "output_type": "stream", 187 | "text": [ 188 | "coef: [ 0.03679445 0.04293586 -0.08639767 -0.01217279 -0.09520728 0.00861532]\n", 189 | "\n", 190 | "intercept 0.4988\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "model_linear = LinearRegression()\n", 196 | "model_linear.fit(data_pca_result, y)\n", 197 | "\n", 198 | "print('coef:', model_linear.coef_)\n", 199 | "print('\\nintercept', model_linear.intercept_)" 200 | ] 201 | } 202 | ], 203 | "metadata": { 204 | "kernelspec": { 205 | "display_name": "Python 3", 206 | "language": "python", 207 | "name": "python3" 208 | }, 209 | "language_info": { 210 | "codemirror_mode": { 211 | "name": "ipython", 212 | "version": 3 213 | }, 214 | "file_extension": ".py", 215 | "mimetype": "text/x-python", 216 | "name": "python", 217 | "nbconvert_exporter": "python", 218 | "pygments_lexer": "ipython3", 219 | "version": "3.6.6" 220 | } 221 | }, 222 | "nbformat": 4, 223 | "nbformat_minor": 2 224 | } 225 | -------------------------------------------------------------------------------- /3. 
Feature Engineering/3.1 Feature Selection/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | - **Author:** 马肖 3 | - **E-Mail:** maxiaoscut@aliyun.com 4 | - **GitHub:** https://github.com/Albertsr 5 | 6 | ### 详细分析与解读:[三大特征选择 by MaXiao](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/3.%20Feature%20Engineering/3.1%20Feature%20Selection/Feature%20Selection.ipynb) 7 | --- 8 | 9 | ## 1.Filter (过滤型) 10 | 11 | #### 1.1 概述 12 | - Filter方法运用特定的统计指标对单个特征进行评分,评分高的特征优先被选择 13 | 14 | --- 15 | 16 | #### 1.2 优缺点 17 | - **优点** 18 | - 算法的通用性强,算法复杂度低,适用于大规模数据集 19 | - 可快速去除大量不相关的特征,适合作为特征的预筛选器 20 | 21 | - **缺点**: 22 | - Filter独立地考察单个特征,不考虑与其他特征之间的联系,被保留的特征可能具有冗余性 23 | - Filter不考虑特征对模型性能的影响,被保留的特征对于模型性能来说不一定是最优特征 24 | 25 | --- 26 | 27 | #### 1.3 常见的Filter方法 28 | - **方差阈值法** 29 | - 方差阈值法是一种无监督方法,通过移除方差低于阈值的特征进行选择 30 | - sklearn.feature_selection.VarianceThreshold(threshold=0.0) 31 | 32 | 33 | - **卡方检验** 34 | - 卡方检验只能用于检测**非负特征**与**分类标签列**的独立性,卡方统计量越大,两者相互独立的可能性越小,特征与标签的相关性越强; 35 | - sklearn.feature_selection.chi2(X, y) 36 | 37 | 38 | - **互信息** 39 | - **能捕捉变量之间任何线性或非线性关系:** 既可以用于筛选分类模型的特征,也可以用于筛选回归模型的特征 40 | - **适用于分类模型:** sklearn.feature_selection.mutual_info_classif 41 | - **适用于回归模型:** sklearn.feature_selection.mutual_info_regression 42 | 43 | 44 | - **F检验** 45 | - **只能衡量线性关系:** 既可以用于筛选分类模型的特征,也可以用于筛选回归模型的特征 46 | - **适用于分类模型:** 47 | - sklearn.feature_selection.f_classif(X, y) 48 | - Compute the ANOVA F-value for the provided sample 49 | - **适用于回归模型:** 50 | - sklearn.feature_selection.f_regression(X, y, center=True) 51 | - Univariate linear regression tests 52 | --- 53 | 54 | ## 2.Wrapper (封装型) 55 | 56 | #### 2.1 概述 57 | - Wrapper根据**外部模型**返回的特征重要性,在迭代过程中递归地剔除不重要的特征 58 | 59 | - Wrapper通过**贪心搜索算法,启发式地递归搜索**最佳特征子集,**最佳特征子集**是指所训练的模型具有**最佳的交叉验证性能** 60 | 61 | - 外部模型需要具备coef_或feature_importances_属性来对特征重要性进行评估 62 | 63 | 64 | #### 2.2 优缺点 65 | - **优点**:能将特征之间的非独立性考虑在内,基于外部模型性能筛选出**独立性与解释能力较强**的特征 66 | - **缺点**:相比其他特征选择方法,有更高的计算代价,筛选出的特征子集更易过拟合 67 | 68 | #### 2.3 常见Wrapper方法 69 | - **递归消除特征法(RFE, recursive feature elimination)** 70 | - 通过**逐步剔除回归系数或重要性较小的特征**来进行特征选择 71 | - sklearn.feature_selection.RFE(estimator, n_features_to_select=None, step=1, verbose=0) 72 | 73 | - **sequential feature selection algorithms** 74 | 75 | - **genetic algorithms** 76 | 77 | 78 | --- 79 | 80 | ## 3.Embedded(嵌入型) 81 | 82 | #### 3.1 概述 83 | - **特征选择过程与模型训练过程融为一体,两者在同一个优化过程中完成,即在模型训练过程中同时进行了特征选择** 84 | - **学习器必须具有衡量特征重要性的属性:** 能返回特征系数coef或者特征重要度(feature importances)的算法才可以作为嵌入法的基学习器,例如线性模型和决策树类算法 85 | - **嵌入型与封装型的区别:** 在模型训练过程中是否具备内生性的特征评价准则 86 | 87 | #### 3.2 优缺点 88 | - **优点**:相比wrapper计算消耗更少 89 | - **缺点**:仅限于特定的机器学习算法(specific to a learning machine) 90 | 91 | 92 | #### 3.3 常见的嵌入型方法 93 | - **L1正则化**: 94 | - 典型的嵌入式特征选择方法,能有效降低过拟合风险 95 | - L1正则化可能是不稳定的,如果特征集合中具有共线性特征,则共线性特征可能只保留了一个,没有被选择到的特征不代表不重要 96 | - 如果要确定哪个特征重要,可再通过L2正则方法交叉检验 97 | 98 | 99 | - **决策树类模型**: 100 | - **以CART回归树为例**:若某特征的各个取值作为分裂结点时,平方误差减少量极少,意味着这个特征不能参与CART回归树构建过程,从而在训练CART回归树的同时,也完成了特征选择过程 101 | - **XGBoost**:既是决策树类模型,同时又带有正则化项 102 | 103 | 104 | #### 3.4 **API** 105 | - sklearn.feature_selection.SelectFromModel(estimator, threshold=None, prefit=False, norm_order=1) 106 | - estimator: 模型必须具有feature_importances_或coef_等反映特征重要性的属性 107 | - threshold :大于等于阈值的特征被保留,默认取'mean',还可取'median',以及'1.25*mean'、标量等形式 108 | -------------------------------------------------------------------------------- /3. 
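Supplementary note for the ReadMe above — a minimal sketch (not in the original repo) of the VarianceThreshold filter it lists; the other APIs are demonstrated in embedded.py, filter.py and wrapper.py below, but this one is not. The MinMaxScaler step and the threshold of 0.05 are assumptions chosen for illustration.

```python
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler

X, y = load_breast_cancer(return_X_y=True)

# scale features to [0, 1] first so that their variances are comparable
X_scaled = MinMaxScaler().fit_transform(X)

# drop every feature whose variance falls below the (assumed) threshold of 0.05
selector = VarianceThreshold(threshold=0.05)
X_selected = selector.fit_transform(X_scaled)

print('shape: {} ---> {}'.format(X.shape, X_selected.shape))
print('kept features:', selector.get_support().sum())
```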
Feature Engineering/3.1 Feature Selection/embedded.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | 6 | import matplotlib.pyplot as plt 7 | import numpy as np 8 | import pandas as pd 9 | from sklearn.datasets import load_breast_cancer 10 | from sklearn.ensemble import RandomForestClassifier 11 | from sklearn.feature_selection import SelectFromModel 12 | from sklearn.preprocessing import StandardScaler 13 | from sklearn.svm import LinearSVC 14 | 15 | 16 | cancer = load_breast_cancer() 17 | X, y = load_breast_cancer(return_X_y=True) 18 | X_scaled = StandardScaler().fit_transform(X) 19 | 20 | # 基于L1正则化的特征选择 21 | linear_svc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X_scaled, y) 22 | sfm_linear_svc = SelectFromModel(linear_svc, prefit=False) 23 | sfm_linear_svc.fit(X_scaled, y) 24 | X_selected = sfm_linear_svc.transform(X_scaled) 25 | print('shape: {:} ---> shape:{:}\n'.format(X.shape, X_selected.shape)) 26 | 27 | # get_support属性返回布尔型列表,若特征被保留,则显示True 28 | get_support = {'Support' : sfm_linear_svc.get_support()} 29 | sfm_result = pd.DataFrame(get_support, index=cancer.feature_names) 30 | print(sfm_result[sfm_result['Support']==True]) 31 | 32 | 33 | # 基于树模型进行特征选择 34 | rf = RandomForestClassifier(n_estimators=100, random_state=10) 35 | rf.fit(X, y) 36 | 37 | # 选择特征重要性为1.2倍均值的特征 38 | sfm_rf = SelectFromModel(rf, threshold='1.2*mean',prefit=True) 39 | 40 | #返回所选的特征 41 | X_selected_rf = sfm_rf.transform(X) 42 | print('\nshape:{:}--->shape:{:}'.format(X.shape, X_selected_rf.shape)) 43 | 44 | mask = sfm_rf.get_support() 45 | plt.matshow(mask.reshape(1, -1), cmap=plt.cm.Reds)#, aspect='auto') 46 | plt.xlabel('Feature index') 47 | plt.ylim(-0.5, 0.5) 48 | plt.yticks([-0.5, 0.5]) 49 | plt.show() 50 | -------------------------------------------------------------------------------- /3. Feature Engineering/3.1 Feature Selection/filter.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | 6 | from sklearn.datasets import load_breast_cancer, load_boston 7 | from sklearn.feature_selection import GenericUnivariateSelect, mutual_info_classif, mutual_info_regression 8 | 9 | ''' 10 | API : GenericUnivariateSelect(score_func=mutual_info_classif, mode='percentile', param=85) 11 | 1) 参数score_func为评分函数。 12 | - 对于分类问题,可以取值:chi2(卡方检验)、mutual_info_classif(互信息)、f_classif(F检验) 13 | - 对于回归问题,可以取值:mutual_info_regression、f_regression 14 | 15 | 注意事项: 16 | - 卡方检验可用于检测非负特征与分类标签列的独立性,卡方统计量越大,两者相互独立的可能性越小,特征与标签的相关性越强 17 | - 互信息既能捕捉到线性关系,也能捕捉到非线性关系,因此习惯采用mutual_info_classif或mutual_info_regression 18 | 19 | 2)参数mode为选择模式 20 | - 可以取值:{'percentile', 'k_best', 'fpr', 'fdr', 'fwe'} 21 | - 'fpr' : Select features based on a false positive rate test.只能用于分类问题. 22 | - 'fdr' : Select features based on an estimated false discovery rate.只能用于分类问题. 23 | - 'fwe' : Select features based on family-wise error rate. 
24 | 25 | 3) 参数param的取值范围由参数mode的取值决定,例如mode='percentile',param=80表示取分数位于前80%的特征 26 | ''' 27 | 28 | # 分类问题:乳腺癌数据集 29 | X_cancer, y_cancer = load_breast_cancer(return_X_y=True) 30 | transformer = GenericUnivariateSelect(score_func=mutual_info_classif, mode='percentile', param=85) 31 | X_cancer_selected = transformer.fit_transform(X_cancer, y_cancer) 32 | print("Cancer's shape: {} ---> {}".format(X_cancer.shape, X_cancer_selected.shape)) 33 | 34 | # 回归问题:波士顿房价数据集 35 | X_boston, y_boston = load_boston(return_X_y=True) 36 | transformer = GenericUnivariateSelect(score_func=mutual_info_regression, mode='percentile', param=85) 37 | X_boston_selected = transformer.fit_transform(X_boston, y_boston) 38 | print("Boston's shape: {} ---> {}".format(X_boston.shape, X_boston_selected.shape)) 39 | -------------------------------------------------------------------------------- /3. Feature Engineering/3.1 Feature Selection/wrapper.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import numpy as np 6 | import pandas as pd 7 | from sklearn.datasets import load_boston 8 | from sklearn.feature_selection import RFE, RFECV 9 | from xgboost import XGBRegressor 10 | 11 | ''' 12 | API说明: 13 | sklearn.feature_selection模块提供了两个API可用于wrapper,分别为: 14 | 1. RFE(estimator, n_features_to_select=None, step=1, verbose=0) 15 | 2. RFECV(estimator, step=1, min_features_to_select=1, cv='warn', scoring=None, verbose=0, n_jobs=None) 16 | 17 | - 两者的区别在于RFECV可以通过交叉验证的方式返回最佳的特征数,而RFE需要通过参数n_features_to_select预先指定; 18 | - estimator:模型必须具备coef_或feature_importances_属性用于评估特征重要性。 19 | 一般来说线性模型以及线性核SVM具备coef_属性、决策树类算法具备feature_importances_属性 20 | - step:整数或小数形式,表示每次迭代剔除的特征数或特征占比; 21 | 22 | 属性说明: 23 | 1. RFECV_XGB.support_ :布尔值列表,若特征被保留则相应索引处为True,否则为False 24 | 2. RFECV_XGB.ranking_ :数值型列表,若特征被保留则相应索引处为1,否则大于1,且ranking值越大,特征越不重要 25 | 3. RFECV_XGB.grid_scores_ :数值型列表,表示特征子集的交叉验证分数,与特征是否被选择没有太大关系 26 | 27 | ''' 28 | 29 | X, y = load_boston(return_X_y=True) 30 | xgb = XGBRegressor(learning_rate=0.2, n_estimators=150, random_state=2017) 31 | RFECV_XGB = RFECV(xgb, cv=5, scoring='neg_mean_squared_error', n_jobs=-1) 32 | RFECV_XGB.fit(X, y) 33 | print('Original features: {:}'.format(X.shape[1])) 34 | print('RFECV_XGB features: {:}'.format(RFECV_XGB.n_features_)) 35 | 36 | # 将RFECV_XGB的训练结果用pandas.dataframe进行直观展示 37 | feature_name = load_boston().feature_names 38 | rfecv_dict = {'Support':RFECV_XGB.support_, 'Ranking':RFECV_XGB.ranking_, 'Grid_scores':RFECV_XGB.grid_scores_} 39 | rfecv_result = pd.DataFrame(rfecv_dict, index=feature_name) 40 | # 根据Ranking对rfecv_result升序排列 41 | rfecv_result.sort_values('Ranking', inplace=True) 42 | 43 | # 将保留特征对应的support_与ranking_属性标红 44 | def highlight(s): 45 | if isinstance(s[0], np.bool_): 46 | criterion = s == s.max() 47 | else: 48 | criterion = s == s.min() 49 | return ['color: red' if v else '' for v in criterion] 50 | print(rfecv_result.style.apply(highlight, subset=['Support', 'Ranking'])) 51 | -------------------------------------------------------------------------------- /3. Feature Engineering/3.2 Feature Extraction/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /3. 
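Supplementary note for wrapper.py above — a hedged companion sketch (not part of the original script) showing the plain RFE variant described in its docstring, where the number of features to keep is fixed in advance instead of being chosen by cross-validation. Keeping 6 features is an arbitrary choice for illustration.

```python
from sklearn.datasets import load_boston
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor

X, y = load_boston(return_X_y=True)
xgb = XGBRegressor(learning_rate=0.2, n_estimators=150, random_state=2017)

# keep exactly 6 features (assumed number), eliminating one feature per iteration
rfe = RFE(xgb, n_features_to_select=6, step=1)
rfe.fit(X, y)

print('Original features: {}'.format(X.shape[1]))
print('Selected features: {}'.format(rfe.n_features_))
print('Support mask:', rfe.support_)
```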
Feature Engineering/3.2 Feature Extraction/cat_svd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Machine-Learning/72d58e7187e003aebc4df0ae5914120640c4b80c/3. Feature Engineering/3.2 Feature Extraction/cat_svd.jpg -------------------------------------------------------------------------------- /3. Feature Engineering/3.2 Feature Extraction/pca_evd.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import numpy as np 6 | from numpy import linalg as LA 7 | 8 | class PCA_EVD: 9 | # 参数n_components为保留的主成分数 10 | def __init__(self, matrix, n_components=None): 11 | self.matrix = matrix 12 | self.n_components = matrix.shape[1] if n_components==None else n_components 13 | 14 | # 自定义标准化方法 15 | def scale(self): 16 | def scale_vector(vector): 17 | delta = vector - np.mean(vector) 18 | std = np.std(vector, ddof=0) 19 | return delta / std 20 | matrix_scaled = np.apply_along_axis(arr=self.matrix, func1d=scale_vector, axis=0) 21 | return matrix_scaled 22 | 23 | # 求标准化矩阵的协方差矩阵 24 | def matrix_cov(self): 25 | # rowvar设置为False表示每列代表一个特征,每行代表一个观测值; 默认值为True 26 | # ddof默认值为1,表示是无偏估计 27 | cov_matrix = np.cov(self.scale(), rowvar=False, ddof=1) 28 | return cov_matrix 29 | 30 | # 求投影矩阵、特征值、特征向量 31 | def matrix_eig(self): 32 | # eigenvectors的每一列即为一个特征向量 33 | # the column v[:,i] is the eigenvector corresponding to the eigenvalue w[i] 34 | # https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.eig.html 35 | eigenvalues, eigenvectors = LA.eig(self.matrix_cov()) 36 | 37 | # 根据特征值大小对特征值降序排列,并按相同顺序重排特征向量(特征向量按列存放,因此重排列而非行) 38 | eigen_values = eigenvalues[np.argsort(-eigenvalues)] 39 | eigen_vectors = eigenvectors[:, np.argsort(-eigenvalues)] 40 | 41 | # 选取eigen_vectors的前n_components列,构成的n*n_components型投影矩阵Q 42 | Q = eigen_vectors[:, :self.n_components] 43 | return Q, eigen_values, eigen_vectors 44 | 45 | # 完成降维 46 | def pca_result(self): 47 | Q = self.matrix_eig()[0] 48 | PCA_result = np.dot(self.scale(), Q) 49 | assert PCA_result.shape[1] == self.n_components, '降维后矩阵的列数应等于指定的主成分数' 50 | return PCA_result -------------------------------------------------------------------------------- /3. 
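A hedged usage sketch for the PCA_EVD class above (not part of the original pca_evd.py; it assumes the script is importable from the working directory). Principal components are only defined up to sign, so the comparison against sklearn is done on absolute values and is expected to hold when the leading eigenvalues are distinct.

```python
import numpy as np
from sklearn.datasets import load_wine
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from pca_evd import PCA_EVD  # assumes pca_evd.py above is on the import path

X = load_wine().data

# reduce the wine data to 3 principal components with the custom EVD implementation
result_evd = PCA_EVD(X, n_components=3).pca_result()
print('custom PCA_EVD shape:', result_evd.shape)

# sklearn PCA on the standardized data should agree with the custom result up to sign
X_scaled = StandardScaler().fit_transform(X)
result_sklearn = PCA(n_components=3).fit_transform(X_scaled)
print('agree up to sign:', np.allclose(np.abs(result_evd), np.abs(result_sklearn)))
```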
Feature Engineering/3.2 Feature Extraction/pca_svd.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import numpy as np 6 | from numpy import linalg as LA 7 | 8 | class PCA_SVD: 9 | # 参数n_components为保留的主成分数 10 | def __init__(self, matrix, n_components=None): 11 | self.matrix = matrix 12 | self.n_components = matrix.shape[1] if n_components==None else n_components 13 | 14 | # 自定义标准化方法 15 | def scale(self): 16 | def scale_vector(vector): 17 | delta = vector - np.mean(vector) 18 | std = np.std(vector, ddof=0) 19 | return delta / std 20 | matrix_scaled = np.apply_along_axis(arr=self.matrix, func1d=scale_vector, axis=0) 21 | return matrix_scaled 22 | 23 | # 对标准化后的矩阵进行奇异值分解 24 | def matrix_svd(self): 25 | # 令A为m*n型矩阵,则U、V分别为m阶、n阶正交矩阵 26 | # U的每一个列向量都是A*A.T的特征向量,也称为左奇异向量 27 | # V的每一个行向量都是A.T*A的特征向量,也称为右奇异向量 28 | # sigma是由k个降序排列的奇异值构成的向量,其中k = min(matrix.shape) 29 | U, sigma, V = LA.svd(self.scale()) 30 | 31 | # 非零奇异值的个数不会超过原矩阵的秩,从而不会超过矩阵维度的最小值 32 | assert len(sigma) == min(self.matrix.shape) 33 | return U, sigma, V 34 | 35 | # 通过矩阵V进行PCA,返回最终降维后的矩阵 36 | def pca_result(self): 37 | sigma, V = self.matrix_svd()[1], self.matrix_svd()[2] 38 | # Q为投影矩阵,由V的前n_components个行向量转置后得到 39 | Q = V[:self.n_components, :].T 40 | # 计算标准化后的矩阵在Q上的投影,得到PCA的结果 41 | matrix_pca = np.dot(self.scale(), Q) 42 | # matrix_pca的列数应等于保留的主成分数 43 | assert matrix_pca.shape[1] == self.n_components 44 | return matrix_pca -------------------------------------------------------------------------------- /3. Feature Engineering/3.2 Feature Extraction/验证:sklearn采用SVD实现PCA.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### Author:马肖\n", 8 | "#### E-Mail:maxiaoscut@aliyun.com\n", 9 | "#### GitHub:https://github.com/Albertsr" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "### 1. 
通过SVD自定义实现PCA" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import numpy as np\n", 26 | "from numpy import linalg as LA\n", 27 | "\n", 28 | "class PCA_SVD:\n", 29 | " # 参数n_components为保留的主成分数\n", 30 | " def __init__(self, matrix, n_components=None):\n", 31 | " self.matrix = matrix\n", 32 | " self.n_components = matrix.shape[1] if n_components==None else n_components\n", 33 | " \n", 34 | " # 自定义标准化方法\n", 35 | " def scale(self):\n", 36 | " def scale_vector(vector):\n", 37 | " delta = vector - np.mean(vector)\n", 38 | " std = np.std(vector, ddof=0)\n", 39 | " return delta / std\n", 40 | " matrix_scaled = np.apply_along_axis(arr=self.matrix, func1d=scale_vector, axis=0)\n", 41 | " return matrix_scaled\n", 42 | " \n", 43 | " # 对标准化后的矩阵进行奇异值分解 \n", 44 | " def matrix_svd(self):\n", 45 | " # 令A为m*n型矩阵,则U、V分别为m阶、n阶正交矩阵\n", 46 | " # U的每一个列向量都是A*A.T的特征向量,也称为左奇异向量\n", 47 | " # V的每一个行向量都是A.T*A的特征向量,也称为右奇异向量\n", 48 | " # sigma是由k个降序排列的奇异值构成的向量,其中k = min(matrix.shape)\n", 49 | " U, sigma, V = LA.svd(self.matrix) \n", 50 | " \n", 51 | " # 非零奇异值的个数不会超过原矩阵的秩,从而不会超过矩阵维度的最小值\n", 52 | " assert len(sigma) == min(self.matrix.shape)\n", 53 | " return U, sigma, V \n", 54 | " \n", 55 | " # 通过矩阵V进行PCA,返回最终降维后的矩阵\n", 56 | " def pca_result(self):\n", 57 | " sigma, V = self.matrix_svd()[1], self.matrix_svd()[2]\n", 58 | " \n", 59 | " # 奇异值的平方等于(A^T)*A的特征值\n", 60 | " eigen_values = np.square(sigma[:self.n_components]) / (self.matrix.shape[0]-1)\n", 61 | " \n", 62 | " # Q为投影矩阵,由V的前n_components个行向量转置后得到\n", 63 | " Q = V[:self.n_components, :].T\n", 64 | " \n", 65 | " # 计算标准化后的矩阵在Q上的投影,得到PCA的结果\n", 66 | " matrix_pca = np.dot(self.scale(), Q)\n", 67 | " # matrix_pca的列数应等于保留的主成分数\n", 68 | " assert matrix_pca.shape[1] == self.n_components\n", 69 | " return matrix_pca, eigen_values, Q.T" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### 2. 调用sklearn实现的PCA" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "from sklearn.decomposition import PCA\n", 86 | "from sklearn.preprocessing import StandardScaler\n", 87 | "from sklearn.datasets import load_wine\n", 88 | "\n", 89 | "X = load_wine().data\n", 90 | "row, col = X.shape\n", 91 | "scaler = StandardScaler()\n", 92 | "X_scaled = scaler.fit_transform(X)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### 3. 
验证结果表明:sklearn通过矩阵的奇异值分解实现PCA,而不是矩阵的特征分解" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 3, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "def verify(n_components, dataset = X_scaled):\n", 109 | " # 返回sklearn的PCA结果\n", 110 | " pca_sklearn = PCA(n_components=n_components)\n", 111 | " sklearn_matrix = pca_sklearn.fit_transform(dataset)\n", 112 | " sklearn_eigenvalue = pca_sklearn.explained_variance_\n", 113 | " sklearn_eigenvector = pca_sklearn.components_\n", 114 | " \n", 115 | " # 返回SVD的PCA结果\n", 116 | " pca_custom = PCA_SVD(dataset, n_components=n_components)\n", 117 | " pca_custom_matrix, pca_custom_eigenvalue, pca_custom_eigenvector = pca_custom.pca_result()\n", 118 | " \n", 119 | " # 验证\n", 120 | " verify_eigenvalue = np.allclose(abs(sklearn_eigenvalue), abs(pca_custom_eigenvalue))\n", 121 | " verify_eigenvector = np.allclose(abs(sklearn_eigenvector), abs(pca_custom_eigenvector))\n", 122 | " verify_result = np.allclose(abs(sklearn_matrix), abs(pca_custom_matrix)) \n", 123 | " \n", 124 | " verify_bool = all([verify_eigenvalue, verify_eigenvector, verify_result])\n", 125 | " return verify_bool" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 4, 131 | "metadata": { 132 | "scrolled": true 133 | }, 134 | "outputs": [ 135 | { 136 | "data": { 137 | "text/plain": [ 138 | "True" 139 | ] 140 | }, 141 | "execution_count": 4, 142 | "metadata": {}, 143 | "output_type": "execute_result" 144 | } 145 | ], 146 | "source": [ 147 | "all(map(verify, range(1, col+1)))" 148 | ] 149 | } 150 | ], 151 | "metadata": { 152 | "kernelspec": { 153 | "display_name": "Python 3", 154 | "language": "python", 155 | "name": "python3" 156 | }, 157 | "language_info": { 158 | "codemirror_mode": { 159 | "name": "ipython", 160 | "version": 3 161 | }, 162 | "file_extension": ".py", 163 | "mimetype": "text/x-python", 164 | "name": "python", 165 | "nbconvert_exporter": "python", 166 | "pygments_lexer": "ipython3", 167 | "version": "3.7.1" 168 | } 169 | }, 170 | "nbformat": 4, 171 | "nbformat_minor": 2 172 | } 173 | -------------------------------------------------------------------------------- /3. Feature Engineering/3.3 Feature Construction/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /3. 
Feature Engineering/3.3 Feature Construction/create_time_feature.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | 9 | def create_time_feature(df, time_feature, start_end_month=3): 10 | # 确保time_feature为pandas.pd.DatetimeIndex格式 11 | time_feature = pd.DatetimeIndex(time_feature) 12 | 13 | # pandas里面的Timestamp、DatetimeIndex具有is_month_start、is_month_end属性来判断是否为月初月末 14 | # 参数k用于需要自定义月初月末的范围,即月初k天、月末k天均定义为月初月末 15 | result_list = [] 16 | for i in time_feature: 17 | month_len = i.days_in_month 18 | month_end = [(month_len - j) for j in range(start_end_month)] 19 | month_start_end = list(range(1, start_end_month+1)) 20 | month_start_end.extend(month_end) 21 | result = i.day in month_start_end 22 | result_list.append(int(result)) 23 | df["Is_Month_Start_End"] = result_list 24 | 25 | 26 | # 与周相关的特征 27 | # 小写a:返回星期几的英文缩写,如Mon、Tue;大写A则为完整形式 28 | df["Weekday"] = [i.strftime("%a") for i in time_feature] 29 | 30 | # datetime.isoweekday():返回星期索引,取值范围为1~7,以周一作为每周的第一天 31 | ISO_Weekday = [i.isoweekday() for i in time_feature] 32 | 33 | # 根据星期索引判断是否为周末 34 | df["Is_Weekend"] = [int(i in [6,7]) for i in ISO_Weekday] 35 | 36 | # 大写W:返回在本年度的第几周,周一作为每周的第一天,新年的第一个周一前的日期属于week 0. 37 | df["Week_Order"] = [i.strftime("%W") for i in time_feature] 38 | 39 | 40 | # 与季节相关的特征 41 | def Season(x): 42 | if x in range(1,4): 43 | return "Spring" 44 | elif x in range(4,7): 45 | return "Summer" 46 | elif x in range(7,10): 47 | return "Autumn" 48 | else: 49 | return "Winter" 50 | df["Season"] = time_feature.month.map(Season) 51 | 52 | 53 | # 与日期、时间相关的特征 54 | df["Time"] = time_feature.strftime("%H:%M:%S") 55 | df["Hour_of_Day"] = [i[:2] for i in df["Time"]] 56 | 57 | def Time_Range(x): 58 | if x >= "06:00:00" and x < "12:00:00": 59 | return "AM" 60 | 61 | elif x >= "12:00:00" and x < "19:00:00": 62 | return "PM" 63 | 64 | elif x >= "19:00:00" and x < "23:00:00": 65 | return "Night" 66 | 67 | else: 68 | return "Mid Night" 69 | df["Time_Range"] = df["Time"].map(Time_Range) 70 | 71 | # 小写j: 返回在本年度属于第几天,范围01-366 72 | # 等价于df["Day_Order"] = [i.dayofyear for i in time_feature] 73 | df["Day_Order"] = [i.strftime("%j") for i in time_feature] 74 | 75 | return df.drop(time_feature.name, axis=1) -------------------------------------------------------------------------------- /3. Feature Engineering/3.3 Feature Construction/high_categorical.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import numpy as np 6 | import pandas as pd 7 | 8 | 9 | def high_categorical(dataframe, high_discrete, k=5): 10 | # dataframe为pandas.DataFrame格式 11 | # high_discrete为dataframe的某一列高势集离散型特征,为pandas.Series格式 12 | # k表示上述离散型特征出现频次最高的k个不重复取值 13 | 14 | value_counts = high_discrete.value_counts() 15 | top_categories = list(value_counts[:k].index) 16 | top_categories.append('other') 17 | 18 | high_discrete = high_discrete.apply(lambda category: category if category in top_categories else 'other') 19 | #print(high_discrete) 20 | feature_dummies = pd.get_dummies(high_discrete, prefix=high_discrete.name) 21 | 22 | dataframe = dataframe.join(feature_dummies) 23 | dataframe.drop(high_discrete.name, axis=1, inplace=True) 24 | return dataframe 25 | -------------------------------------------------------------------------------- /3. 
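A hedged usage sketch for the high_categorical helper above (not part of the original script; it assumes high_categorical.py is importable from the working directory). The city names, their sampling probabilities and the choice of k=3 are made-up values for illustration.

```python
import numpy as np
import pandas as pd
from high_categorical import high_categorical  # assumes high_categorical.py above is on the import path

rng = np.random.RandomState(2018)
cities = rng.choice(['Beijing', 'Shanghai', 'Shenzhen', 'Hangzhou', 'Chengdu', 'Wuhan'],
                    size=200, p=[0.35, 0.25, 0.15, 0.1, 0.1, 0.05])
df = pd.DataFrame({'City': cities, 'Amount': rng.uniform(10, 100, 200)})

# keep the 3 most frequent city values, merge the rest into 'other', then one-hot encode
df_encoded = high_categorical(df, df['City'], k=3)
print(df_encoded.columns.tolist())
print(df_encoded.head())
```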
Feature Engineering/3.3 Feature Construction/根据时间戳生成时间型索引&透视分析.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### Author:马肖\n", 8 | "#### E-Mail:maxiaoscut@aliyun.com\n", 9 | "#### GitHub:https://github.com/Albertsr" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "#### 生成实验数据集" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "data": { 26 | "text/html": [ 27 | "
\n", 28 | "\n", 41 | "\n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | "
AmountTime
2017-07-1132.3672752017-07-11 17:30:14.852224
2017-07-12542.4096632017-07-12 18:15:28.303745
2017-07-13802.9196102017-07-13 19:00:41.755266
2017-07-1455.1594032017-07-14 19:45:55.206787
2017-07-15382.2647752017-07-15 20:31:08.658308
\n", 77 | "
" 78 | ], 79 | "text/plain": [ 80 | " Amount Time\n", 81 | "2017-07-11 32.367275 2017-07-11 17:30:14.852224\n", 82 | "2017-07-12 542.409663 2017-07-12 18:15:28.303745\n", 83 | "2017-07-13 802.919610 2017-07-13 19:00:41.755266\n", 84 | "2017-07-14 55.159403 2017-07-14 19:45:55.206787\n", 85 | "2017-07-15 382.264775 2017-07-15 20:31:08.658308" 86 | ] 87 | }, 88 | "execution_count": 1, 89 | "metadata": {}, 90 | "output_type": "execute_result" 91 | } 92 | ], 93 | "source": [ 94 | "import datetime as dt\n", 95 | "import numpy as np\n", 96 | "import pandas as pd\n", 97 | "from create_time_feature import create_time_feature as ctf\n", 98 | "\n", 99 | "\n", 100 | "# 生成日期、时间信息\n", 101 | "today = dt.datetime.today()\n", 102 | "size = 600\n", 103 | "full_time = pd.date_range(end=today, freq='1D 45min 13s 451521us', periods=size)\n", 104 | "\n", 105 | "# 生成交易额信息\n", 106 | "np.random.seed(size)\n", 107 | "consume_num = np.random.uniform(0, 1000, size)\n", 108 | "\n", 109 | "# 运用datetime.strftime(\"%Y-%m-%d\")从完整的时间中分离出字符串格式的日期、时间\n", 110 | "# 运用pd.DatetimeIndex或者pd.to_datetime将字符串格式的日期转化为日期索引\n", 111 | "consume_date = pd.DatetimeIndex(full_time.strftime(\"%Y-%m-%d\"))\n", 112 | "\n", 113 | "# 构建数据框\n", 114 | "dict_ = {\"Amount\": consume_num, \"Time\":full_time}\n", 115 | "sales = pd.DataFrame(dict_, index=consume_date)\n", 116 | "sales.head()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 2, 122 | "metadata": { 123 | "scrolled": true 124 | }, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/html": [ 129 | "
\n", 130 | "\n", 143 | "\n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | "
AmountIs_Month_Start_EndWeekdayIs_WeekendWeek_OrderSeasonHour_of_DayTime_RangeDay_Order
2019-03-17167.8924040Sun110Spring09AM076
2019-03-18728.6186520Mon011Spring10AM077
2019-03-19976.7886690Tue011Spring11AM078
2019-03-20458.9335630Wed011Spring12PM079
2019-03-21421.3156690Thu011Spring12PM080
\n", 221 | "
" 222 | ], 223 | "text/plain": [ 224 | " Amount Is_Month_Start_End Weekday Is_Weekend Week_Order \\\n", 225 | "2019-03-17 167.892404 0 Sun 1 10 \n", 226 | "2019-03-18 728.618652 0 Mon 0 11 \n", 227 | "2019-03-19 976.788669 0 Tue 0 11 \n", 228 | "2019-03-20 458.933563 0 Wed 0 11 \n", 229 | "2019-03-21 421.315669 0 Thu 0 11 \n", 230 | "\n", 231 | " Season Hour_of_Day Time_Range Day_Order \n", 232 | "2019-03-17 Spring 09 AM 076 \n", 233 | "2019-03-18 Spring 10 AM 077 \n", 234 | "2019-03-19 Spring 11 AM 078 \n", 235 | "2019-03-20 Spring 12 PM 079 \n", 236 | "2019-03-21 Spring 12 PM 080 " 237 | ] 238 | }, 239 | "execution_count": 2, 240 | "metadata": {}, 241 | "output_type": "execute_result" 242 | } 243 | ], 244 | "source": [ 245 | "ctf(sales, sales['Time'], 3).tail(5)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "#### 结合时间型特征、数字型特征进行分析" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 3, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "name": "stdout", 262 | "output_type": "stream", 263 | "text": [ 264 | "13259.086647654212\n" 265 | ] 266 | } 267 | ], 268 | "source": [ 269 | "print(sales.loc[\"2018-05\", \"Amount\"].sum())" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 4, 275 | "metadata": { 276 | "scrolled": true 277 | }, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/html": [ 282 | "
\n", 283 | "\n", 296 | "\n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | "
AmountIs_Month_Start_EndIs_Weekend
summeansummeansummean
2017-07-317964.874424398.24372130.15000060.300000
2017-10-3142624.193038478.923517180.202247250.280899
2018-01-3152071.987789578.577642160.177778250.277778
2018-04-3038017.128956442.059639170.197674260.302326
2018-07-3145222.353345508.116330180.202247250.280899
2018-10-3144577.379553500.869433180.202247250.280899
2019-01-3139211.077433440.573904170.191011250.280899
2019-04-3025348.311909528.08983180.166667130.270833
\n", 389 | "
" 390 | ], 391 | "text/plain": [ 392 | " Amount Is_Month_Start_End Is_Weekend \\\n", 393 | " sum mean sum mean sum \n", 394 | "2017-07-31 7964.874424 398.243721 3 0.150000 6 \n", 395 | "2017-10-31 42624.193038 478.923517 18 0.202247 25 \n", 396 | "2018-01-31 52071.987789 578.577642 16 0.177778 25 \n", 397 | "2018-04-30 38017.128956 442.059639 17 0.197674 26 \n", 398 | "2018-07-31 45222.353345 508.116330 18 0.202247 25 \n", 399 | "2018-10-31 44577.379553 500.869433 18 0.202247 25 \n", 400 | "2019-01-31 39211.077433 440.573904 17 0.191011 25 \n", 401 | "2019-04-30 25348.311909 528.089831 8 0.166667 13 \n", 402 | "\n", 403 | " \n", 404 | " mean \n", 405 | "2017-07-31 0.300000 \n", 406 | "2017-10-31 0.280899 \n", 407 | "2018-01-31 0.277778 \n", 408 | "2018-04-30 0.302326 \n", 409 | "2018-07-31 0.280899 \n", 410 | "2018-10-31 0.280899 \n", 411 | "2019-01-31 0.280899 \n", 412 | "2019-04-30 0.270833 " 413 | ] 414 | }, 415 | "execution_count": 4, 416 | "metadata": {}, 417 | "output_type": "execute_result" 418 | } 419 | ], 420 | "source": [ 421 | "sales.resample(\"3M\").agg([np.sum, np.mean])" 422 | ] 423 | }, 424 | { 425 | "cell_type": "code", 426 | "execution_count": 5, 427 | "metadata": {}, 428 | "outputs": [ 429 | { 430 | "data": { 431 | "text/html": [ 432 | "
\n", 433 | "\n", 450 | "\n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | "
summean
AmountAmount
Time_RangeAMMid NightNightPMAMMid NightNightPM
Season
Autumn17030.47811128295.77291013636.21956525423.063901500.896415533.882508413.218775529.647165
Spring20941.08306524287.64981613111.78045425098.884251436.272564495.666323624.370498522.893422
Summer9287.47998015292.5848216982.0251419745.677805386.978332546.163744436.376571487.283890
Winter21182.47340022908.26124215040.55049126773.321497460.488552498.005679485.179048486.787664
\n", 532 | "
" 533 | ], 534 | "text/plain": [ 535 | " sum \\\n", 536 | " Amount \n", 537 | "Time_Range AM Mid Night Night PM \n", 538 | "Season \n", 539 | "Autumn 17030.478111 28295.772910 13636.219565 25423.063901 \n", 540 | "Spring 20941.083065 24287.649816 13111.780454 25098.884251 \n", 541 | "Summer 9287.479980 15292.584821 6982.025141 9745.677805 \n", 542 | "Winter 21182.473400 22908.261242 15040.550491 26773.321497 \n", 543 | "\n", 544 | " mean \n", 545 | " Amount \n", 546 | "Time_Range AM Mid Night Night PM \n", 547 | "Season \n", 548 | "Autumn 500.896415 533.882508 413.218775 529.647165 \n", 549 | "Spring 436.272564 495.666323 624.370498 522.893422 \n", 550 | "Summer 386.978332 546.163744 436.376571 487.283890 \n", 551 | "Winter 460.488552 498.005679 485.179048 486.787664 " 552 | ] 553 | }, 554 | "execution_count": 5, 555 | "metadata": {}, 556 | "output_type": "execute_result" 557 | } 558 | ], 559 | "source": [ 560 | "sales.pivot_table(index=[\"Season\"], values=[\"Amount\"], columns=[\"Time_Range\"], aggfunc=[np.sum, np.mean])" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 6, 566 | "metadata": {}, 567 | "outputs": [ 568 | { 569 | "data": { 570 | "text/plain": [ 571 | "Mid Night 176\n", 572 | "PM 171\n", 573 | "AM 152\n", 574 | "Night 101\n", 575 | "Name: Time_Range, dtype: int64" 576 | ] 577 | }, 578 | "execution_count": 6, 579 | "metadata": {}, 580 | "output_type": "execute_result" 581 | } 582 | ], 583 | "source": [ 584 | "time_sub_dummies = pd.get_dummies(sales[\"Time_Range\"])\n", 585 | "\n", 586 | "# sales.drop(\"time_sub\", axis=1).join(time_sub_dummies).head()\n", 587 | "# sales.join(time_sub_dummies).head()\n", 588 | "\n", 589 | "sales[\"Time_Range\"].value_counts()" 590 | ] 591 | } 592 | ], 593 | "metadata": { 594 | "kernelspec": { 595 | "display_name": "Python 3", 596 | "language": "python", 597 | "name": "python3" 598 | }, 599 | "language_info": { 600 | "codemirror_mode": { 601 | "name": "ipython", 602 | "version": 3 603 | }, 604 | "file_extension": ".py", 605 | "mimetype": "text/x-python", 606 | "name": "python", 607 | "nbconvert_exporter": "python", 608 | "pygments_lexer": "ipython3", 609 | "version": "3.7.1" 610 | } 611 | }, 612 | "nbformat": 4, 613 | "nbformat_minor": 2 614 | } 615 | -------------------------------------------------------------------------------- /3. Feature Engineering/3.3 Feature Construction/生成哑变量.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### Author:马肖\n", 8 | "#### E-Mail:maxiaoscut@aliyun.com\n", 9 | "#### GitHub:https://github.com/Albertsr" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "### 1. One-Hot 编码" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "### 每一列特征需要构建的状态寄存器的位数等于该列特征独立取值的个数\n", 24 | "#### 使用N位状态寄存器来对N个状态进行编码,每个状态都由他独立的寄存器位,并且在任意时候,其中只有一位有效。" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import numpy as np\n", 34 | "import pandas as pd\n", 35 | "from sklearn.preprocessing import OneHotEncoder" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": {}, 42 | "outputs": [ 43 | { 44 | "data": { 45 | "text/html": [ 46 | "
\n", 47 | "\n", 60 | "\n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | "
01
0MaleCN
1FemaleUSA
2FemaleUK
\n", 86 | "
" 87 | ], 88 | "text/plain": [ 89 | " 0 1\n", 90 | "0 Male CN\n", 91 | "1 Female USA\n", 92 | "2 Female UK" 93 | ] 94 | }, 95 | "execution_count": 2, 96 | "metadata": {}, 97 | "output_type": "execute_result" 98 | } 99 | ], 100 | "source": [ 101 | "X = pd.DataFrame([['Male', 'CN'], ['Female', 'USA'], ['Female', 'UK']])\n", 102 | "X" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 3, 108 | "metadata": { 109 | "scrolled": true 110 | }, 111 | "outputs": [ 112 | { 113 | "data": { 114 | "text/plain": [ 115 | "array([[0., 1., 1., 0., 0.],\n", 116 | " [1., 0., 0., 0., 1.],\n", 117 | " [1., 0., 0., 1., 0.]])" 118 | ] 119 | }, 120 | "execution_count": 3, 121 | "metadata": {}, 122 | "output_type": "execute_result" 123 | } 124 | ], 125 | "source": [ 126 | "enc = OneHotEncoder()\n", 127 | "enc.fit_transform(X).toarray()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": 4, 133 | "metadata": { 134 | "scrolled": true 135 | }, 136 | "outputs": [ 137 | { 138 | "data": { 139 | "text/plain": [ 140 | "array([[1., 0., 0., 1., 0.],\n", 141 | " [0., 1., 1., 0., 0.]])" 142 | ] 143 | }, 144 | "execution_count": 4, 145 | "metadata": {}, 146 | "output_type": "execute_result" 147 | } 148 | ], 149 | "source": [ 150 | "enc.transform([['Female', 'UK'], ['Male', 'CN']]).toarray()" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "#### categories_ : list of arrays" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 5, 163 | "metadata": {}, 164 | "outputs": [ 165 | { 166 | "data": { 167 | "text/plain": [ 168 | "[array(['Female', 'Male'], dtype=object),\n", 169 | " array(['CN', 'UK', 'USA'], dtype=object)]" 170 | ] 171 | }, 172 | "execution_count": 5, 173 | "metadata": {}, 174 | "output_type": "execute_result" 175 | } 176 | ], 177 | "source": [ 178 | "enc.categories_" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "#### Return feature names for output features" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 6, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "data": { 195 | "text/plain": [ 196 | "array(['x0_Female', 'x0_Male', 'x1_CN', 'x1_UK', 'x1_USA'], dtype=object)" 197 | ] 198 | }, 199 | "execution_count": 6, 200 | "metadata": {}, 201 | "output_type": "execute_result" 202 | } 203 | ], 204 | "source": [ 205 | "enc.get_feature_names()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "#### 参数handle_unknown\n", 213 | "- handle_unknown='ignore':对于未知类别特征,则对应哑变量全部设置为0值\n", 214 | "- handle_unknown='error':对于未知类别特征,将进行报错" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 7, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "array([[1., 0., 0., 1., 0.],\n", 226 | " [0., 1., 0., 0., 0.]])" 227 | ] 228 | }, 229 | "execution_count": 7, 230 | "metadata": {}, 231 | "output_type": "execute_result" 232 | } 233 | ], 234 | "source": [ 235 | "enc = OneHotEncoder(handle_unknown='ignore')\n", 236 | "enc.fit(X)\n", 237 | "enc.transform([['Female', 'UK'], ['Male', 'JP']]).toarray()" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": 8, 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "got exception\n" 250 | ] 251 | } 252 | ], 253 | "source": [ 254 | "enc = OneHotEncoder(handle_unknown='error')\n", 
255 | "enc.fit(X)\n", 256 | "\n", 257 | "try:\n", 258 | " enc.transform([['Female', 'UK'], ['Male', 'JP']]).toarray()\n", 259 | "except ValueError:\n", 260 | " print('got exception')" 261 | ] 262 | }, 263 | { 264 | "cell_type": "markdown", 265 | "metadata": {}, 266 | "source": [ 267 | "### 2. pandas.get_dummies构造哑变量" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": {}, 273 | "source": [ 274 | "##### pandas.get_dummies(data, prefix=None, prefixsep='', dummy_na=False, columns=None, sparse=False, drop_first=False)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 9, 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "data": { 284 | "text/html": [ 285 | "
\n", 286 | "\n", 299 | "\n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | "
NationExplorerQuantity
0CNFirefox1
1USChrome2
2UKSafari3
\n", 329 | "
" 330 | ], 331 | "text/plain": [ 332 | " Nation Explorer Quantity\n", 333 | "0 CN Firefox 1\n", 334 | "1 US Chrome 2\n", 335 | "2 UK Safari 3" 336 | ] 337 | }, 338 | "execution_count": 9, 339 | "metadata": {}, 340 | "output_type": "execute_result" 341 | } 342 | ], 343 | "source": [ 344 | "dict_ = {'Nation': ['CN', 'US', 'UK'], 'Explorer': ['Firefox','Chrome','Safari'], 'Quantity': [1, 2, 3]}\n", 345 | "df = pd.DataFrame(dict_)\n", 346 | "df" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 10, 352 | "metadata": {}, 353 | "outputs": [ 354 | { 355 | "data": { 356 | "text/html": [ 357 | "
\n", 358 | "\n", 371 | "\n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | "
QuantityNation_CNNation_UKNation_USExplorer_ChromeExplorer_FirefoxExplorer_Safari
01100010
12001100
23010001
\n", 417 | "
" 418 | ], 419 | "text/plain": [ 420 | " Quantity Nation_CN Nation_UK Nation_US Explorer_Chrome \\\n", 421 | "0 1 1 0 0 0 \n", 422 | "1 2 0 0 1 1 \n", 423 | "2 3 0 1 0 0 \n", 424 | "\n", 425 | " Explorer_Firefox Explorer_Safari \n", 426 | "0 1 0 \n", 427 | "1 0 0 \n", 428 | "2 0 1 " 429 | ] 430 | }, 431 | "execution_count": 10, 432 | "metadata": {}, 433 | "output_type": "execute_result" 434 | } 435 | ], 436 | "source": [ 437 | "df_dummies = pd.get_dummies(df, prefix=['Nation', 'Explorer'], prefix_sep='_') \n", 438 | "df_dummies" 439 | ] 440 | } 441 | ], 442 | "metadata": { 443 | "kernelspec": { 444 | "display_name": "Python 3", 445 | "language": "python", 446 | "name": "python3" 447 | }, 448 | "language_info": { 449 | "codemirror_mode": { 450 | "name": "ipython", 451 | "version": 3 452 | }, 453 | "file_extension": ".py", 454 | "mimetype": "text/x-python", 455 | "name": "python", 456 | "nbconvert_exporter": "python", 457 | "pygments_lexer": "ipython3", 458 | "version": "3.7.1" 459 | } 460 | }, 461 | "nbformat": 4, 462 | "nbformat_minor": 2 463 | } 464 | -------------------------------------------------------------------------------- /3. Feature Engineering/3.3 Feature Construction/连续型特征的分箱处理.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### Author:马肖\n", 8 | "#### E-Mail:maxiaoscut@aliyun.com\n", 9 | "#### GitHub:https://github.com/Albertsr" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import pandas as pd \n", 20 | "\n", 21 | "\n", 22 | "rdg = np.random.RandomState(2017)\n", 23 | "age = rdg.randint(1, 78, 20)\n", 24 | "fare = rdg.uniform(10, 100, 20)\n", 25 | "df = pd.DataFrame({'Age':age, 'Fare':fare}).round(2)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "- 离散化后的特征对异常数据有很强的鲁棒性:比如一个特征是年龄>30是1,否则0。\n", 33 | "- 如果特征没有离散化,一个异常数据“年龄300岁”会给模型造成很大的干扰;" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "### 1. 
等距分箱" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "#### 方法一:运用pd.cut()\n", 48 | "- [pandas.cut官方文档](http://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.cut.html)\n", 49 | "- pandas.cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise')" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 2, 55 | "metadata": { 56 | "scrolled": true 57 | }, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "(0.928, 19.0] 8\n", 63 | "(55.0, 73.0] 5\n", 64 | "(37.0, 55.0] 5\n", 65 | "(19.0, 37.0] 2\n", 66 | "Name: Age, dtype: int64" 67 | ] 68 | }, 69 | "execution_count": 2, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "pd.cut(df['Age'], 4).value_counts()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/plain": [ 86 | "中年 8\n", 87 | "少年 8\n", 88 | "老年 3\n", 89 | "青年 1\n", 90 | "Name: Age, dtype: int64" 91 | ] 92 | }, 93 | "execution_count": 3, 94 | "metadata": {}, 95 | "output_type": "execute_result" 96 | } 97 | ], 98 | "source": [ 99 | "age_catogary = pd.cut(df['Age'], bins=[0, 17, 35, 59, 100], labels=['少年', '青年', '中年', '老年'])\n", 100 | "age_catogary.value_counts()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 4, 106 | "metadata": {}, 107 | "outputs": [ 108 | { 109 | "data": { 110 | "text/html": [ 111 | "
\n", 112 | "\n", 125 | "\n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | "
AgeFareAge_少年Age_青年Age_中年Age_老年
06076.210001
11071.771000
27124.760001
31471.501000
44343.190010
\n", 185 | "
" 186 | ], 187 | "text/plain": [ 188 | " Age Fare Age_少年 Age_青年 Age_中年 Age_老年\n", 189 | "0 60 76.21 0 0 0 1\n", 190 | "1 10 71.77 1 0 0 0\n", 191 | "2 71 24.76 0 0 0 1\n", 192 | "3 14 71.50 1 0 0 0\n", 193 | "4 43 43.19 0 0 1 0" 194 | ] 195 | }, 196 | "execution_count": 4, 197 | "metadata": {}, 198 | "output_type": "execute_result" 199 | } 200 | ], 201 | "source": [ 202 | "age_dummies = pd.get_dummies(age_catogary, prefix='Age')\n", 203 | "df = df.join(age_dummies)\n", 204 | "df.head()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "#### 方法二:运用np.digitize进行等距分段" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 5, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 222 | "array([ 1. , 25.33333333, 49.66666667, 74. ])" 223 | ] 224 | }, 225 | "execution_count": 5, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "# 将年龄分为3个区间\n", 232 | "bins = np.linspace(df['Age'].min(), df['Age'].max()+1, 4)\n", 233 | "bins" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 6, 239 | "metadata": {}, 240 | "outputs": [ 241 | { 242 | "data": { 243 | "text/plain": [ 244 | "array([3, 1, 3, 1, 2, 1, 3, 3, 1, 2, 1, 1, 3, 1, 2, 2, 2, 1, 3, 2],\n", 245 | " dtype=int64)" 246 | ] 247 | }, 248 | "execution_count": 6, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "age_bins = np.digitize(df['Age'], bins)\n", 255 | "age_bins" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "### 2. 等频分箱qcut\n", 263 | "- [pandas.qcut官方文档](http://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.qcut.html)\n", 264 | "- Discretize variable into equal-sized buckets based on rank or based on sample quantiles.\n", 265 | "- cut将根据值本身来选择箱子均匀间隔,qcut是根据这些值的频率来选择箱子的均匀间隔" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": 7, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "age_bins = pd.qcut(df['Age'], 4)" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 8, 280 | "metadata": { 281 | "scrolled": true 282 | }, 283 | "outputs": [ 284 | { 285 | "data": { 286 | "text/plain": [ 287 | "(53.0, 73.0] 5\n", 288 | "(37.5, 53.0] 5\n", 289 | "(9.75, 37.5] 5\n", 290 | "(0.999, 9.75] 5\n", 291 | "Name: Age, dtype: int64" 292 | ] 293 | }, 294 | "execution_count": 8, 295 | "metadata": {}, 296 | "output_type": "execute_result" 297 | } 298 | ], 299 | "source": [ 300 | "pd.qcut(df['Age'], 4).value_counts()" 301 | ] 302 | }, 303 | { 304 | "cell_type": "markdown", 305 | "metadata": {}, 306 | "source": [ 307 | "### 3. 自定义区间对费用分段" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 9, 313 | "metadata": { 314 | "scrolled": true 315 | }, 316 | "outputs": [ 317 | { 318 | "data": { 319 | "text/html": [ 320 | "
\n", 321 | "\n", 334 | "\n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | "
AgeFareAge_少年Age_青年Age_中年Age_老年Fare_highFare_lowFare_middle
06076.210001010
11071.771000001
27124.760001100
31471.501000001
44343.190010100
\n", 412 | "
" 413 | ], 414 | "text/plain": [ 415 | " Age Fare Age_少年 Age_青年 Age_中年 Age_老年 Fare_high Fare_low \\\n", 416 | "0 60 76.21 0 0 0 1 0 1 \n", 417 | "1 10 71.77 1 0 0 0 0 0 \n", 418 | "2 71 24.76 0 0 0 1 1 0 \n", 419 | "3 14 71.50 1 0 0 0 0 0 \n", 420 | "4 43 43.19 0 0 1 0 1 0 \n", 421 | "\n", 422 | " Fare_middle \n", 423 | "0 0 \n", 424 | "1 1 \n", 425 | "2 0 \n", 426 | "3 1 \n", 427 | "4 0 " 428 | ] 429 | }, 430 | "execution_count": 9, 431 | "metadata": {}, 432 | "output_type": "execute_result" 433 | } 434 | ], 435 | "source": [ 436 | "def fare_rate_func(x):\n", 437 | " if x <= np.percentile(df['Fare'], 25):\n", 438 | " return 'high'\n", 439 | " elif np.percentile(df['Fare'],25) < x <= np.percentile(df['Fare'], 75):\n", 440 | " return 'middle'\n", 441 | " else:\n", 442 | " return 'low'\n", 443 | " \n", 444 | "df['fare_rate'] = df['Fare'].apply(fare_rate_func)\n", 445 | "# df['fare_rate'] = df['Fare'].map(fare_rate_func)\n", 446 | "fare_dummies = pd.get_dummies(df['fare_rate'], prefix='Fare')\n", 447 | "df.drop(['fare_rate'], axis=1, inplace=True)\n", 448 | "df = df.join(fare_dummies)\n", 449 | "df.head()" 450 | ] 451 | } 452 | ], 453 | "metadata": { 454 | "kernelspec": { 455 | "display_name": "Python 3", 456 | "language": "python", 457 | "name": "python3" 458 | }, 459 | "language_info": { 460 | "codemirror_mode": { 461 | "name": "ipython", 462 | "version": 3 463 | }, 464 | "file_extension": ".py", 465 | "mimetype": "text/x-python", 466 | "name": "python", 467 | "nbconvert_exporter": "python", 468 | "pygments_lexer": "ipython3", 469 | "version": "3.7.1" 470 | } 471 | }, 472 | "nbformat": 4, 473 | "nbformat_minor": 2 474 | } 475 | -------------------------------------------------------------------------------- /3. Feature Engineering/3.3 Feature Construction/高基数类别特征的处理.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### Author:马肖\n", 8 | "#### E-Mail:maxiaoscut@aliyun.com\n", 9 | "#### GitHub:https://github.com/Albertsr" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "\n", 21 | "\n", 22 | "def high_categorical(dataframe, high_discrete, k=3):\n", 23 | " # df为pandas.DataFrame格式\n", 24 | " # feature为df的某一列高势集离散型特征,为pandas.Series格式\n", 25 | " # k表示上述离散型特征出现频次最高的k个不重复取值\n", 26 | " \n", 27 | " value_counts = high_discrete.value_counts()\n", 28 | " top_categories = list(value_counts[:k].index)\n", 29 | " top_categories.append('other')\n", 30 | " \n", 31 | " high_discrete = high_discrete.apply(lambda category: category if category in top_categories else 'other')\n", 32 | " #print(high_discrete)\n", 33 | " feature_dummies = pd.get_dummies(high_discrete, prefix=high_discrete.name)\n", 34 | " \n", 35 | " dataframe = dataframe.join(feature_dummies)\n", 36 | " dataframe.drop(high_discrete.name, axis=1, inplace=True)\n", 37 | " return dataframe" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "### 实验" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 2, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "data": { 54 | "text/html": [ 55 | "
\n", 56 | "\n", 69 | "\n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | "
销售额邮编
014210072
114010114
213010037
310810024
413610029
\n", 105 | "
" 106 | ], 107 | "text/plain": [ 108 | " 销售额 邮编\n", 109 | "0 142 10072\n", 110 | "1 140 10114\n", 111 | "2 130 10037\n", 112 | "3 108 10024\n", 113 | "4 136 10029" 114 | ] 115 | }, 116 | "execution_count": 2, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "np.random.seed(2019)\n", 123 | "zipcode = np.random.randint(10000, 10150, size=5000)\n", 124 | "sales = np.random.randint(100, 150, size=5000)\n", 125 | "df = pd.DataFrame({'销售额':sales, '邮编':zipcode})\n", 126 | "df.head()" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 3, 132 | "metadata": { 133 | "scrolled": true 134 | }, 135 | "outputs": [ 136 | { 137 | "data": { 138 | "text/html": [ 139 | "
\n", 140 | "\n", 153 | "\n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | "
销售额邮编_10001邮编_10012邮编_10075邮编_10114邮编_10126邮编_other
0142000001
1140000100
2130000001
3108000001
4136000001
\n", 219 | "
" 220 | ], 221 | "text/plain": [ 222 | " 销售额 邮编_10001 邮编_10012 邮编_10075 邮编_10114 邮编_10126 邮编_other\n", 223 | "0 142 0 0 0 0 0 1\n", 224 | "1 140 0 0 0 1 0 0\n", 225 | "2 130 0 0 0 0 0 1\n", 226 | "3 108 0 0 0 0 0 1\n", 227 | "4 136 0 0 0 0 0 1" 228 | ] 229 | }, 230 | "execution_count": 3, 231 | "metadata": {}, 232 | "output_type": "execute_result" 233 | } 234 | ], 235 | "source": [ 236 | "high_categorical(df, df['邮编'], k=5).head(5)" 237 | ] 238 | } 239 | ], 240 | "metadata": { 241 | "kernelspec": { 242 | "display_name": "Python 3", 243 | "language": "python", 244 | "name": "python3" 245 | }, 246 | "language_info": { 247 | "codemirror_mode": { 248 | "name": "ipython", 249 | "version": 3 250 | }, 251 | "file_extension": ".py", 252 | "mimetype": "text/x-python", 253 | "name": "python", 254 | "nbconvert_exporter": "python", 255 | "pygments_lexer": "ipython3", 256 | "version": "3.7.1" 257 | } 258 | }, 259 | "nbformat": 4, 260 | "nbformat_minor": 2 261 | } 262 | -------------------------------------------------------------------------------- /3. Feature Engineering/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /4. Classical Supervised Learning/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /5. Ensemble Learning/Boosting/GBDT-LR/GBDT系列与LR的融合&性能对比.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "- **Author:** 马肖\n", 8 | "- **E-Mail:** maxiaoscut@aliyun.com\n", 9 | "- **GitHub:** https://github.com/Albertsr" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "from scipy.sparse import hstack\n", 21 | "from sklearn.datasets import make_classification\n", 22 | "from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier\n", 23 | "from sklearn.linear_model import LogisticRegression\n", 24 | "from sklearn.metrics import roc_auc_score, accuracy_score, f1_score\n", 25 | "from sklearn.model_selection import train_test_split\n", 26 | "from sklearn.preprocessing import OneHotEncoder\n", 27 | "from xgboost import XGBClassifier\n", 28 | "from lightgbm import LGBMClassifier\n", 29 | "\n", 30 | "X, y = make_classification(n_samples=10000, n_features=20, n_informative=18, n_redundant=2,\n", 31 | " n_classes=2, n_clusters_per_class=3, random_state=2017)\n", 32 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "clf_gbdt = GradientBoostingClassifier(n_estimators=50)\n", 42 | "clf_xgb = XGBClassifier(n_estimators=50)\n", 43 | "clf_lgb = LGBMClassifier(n_estimators=50)\n", 44 | "lr = LogisticRegression(max_iter=500, solver='lbfgs')\n", 45 | "\n", 46 | "models = [clf_gbdt, clf_xgb, clf_lgb]\n", 47 | "names = ['GBDT', 'XGBoost', 'LightGBM']\n", 48 | "\n", 49 | "metric_scores = []\n", 50 | "for model,name in zip(models, names):\n", 51 | " model.fit(X_train, y_train)\n", 52 | " y_pred = model.predict(X_test)\n", 53 | " y_pred_prob = model.predict_proba(X_test)[:, 1]\n", 54 | " acc = 
accuracy_score(y_test, y_pred)\n", 55 | " auc = roc_auc_score(y_test, y_pred_prob)\n", 56 | " fscore = f1_score(y_test, y_pred)\n", 57 | "\n", 58 | " if name == 'GBDT':\n", 59 | " X_train_leaves = model.apply(X_train)[:, :, 0]\n", 60 | " X_test_leaves = model.apply(X_test)[:, :, 0]\n", 61 | " \n", 62 | " elif name == 'LightGBM':\n", 63 | " X_train_leaves = model.predict(X_train, pred_leaf=True)\n", 64 | " X_test_leaves = model.predict(X_test, pred_leaf=True)\n", 65 | " else:\n", 66 | " X_train_leaves = model.apply(X_train)\n", 67 | " X_test_leaves = model.apply(X_test)\n", 68 | "\n", 69 | " \n", 70 | " All_leaves = np.r_[X_train_leaves, X_test_leaves]\n", 71 | " All_leaves = All_leaves.astype(np.int32)\n", 72 | "\n", 73 | " enc = OneHotEncoder(categories='auto')\n", 74 | " X_new_feat = enc.fit_transform(All_leaves)\n", 75 | " \n", 76 | " train_samples = X_train_leaves.shape[0]\n", 77 | " X_train_new = X_new_feat[:train_samples, :]\n", 78 | " X_test_new = X_new_feat[train_samples:, :]\n", 79 | "\n", 80 | " X_train_hstack = hstack([X_train_new, X_train])\n", 81 | " X_test_hstack = hstack([X_test_new, X_test])\n", 82 | "\n", 83 | " lr.fit(X_train_hstack, y_train)\n", 84 | " y_pred_2 = lr.predict(X_test_hstack)\n", 85 | " y_pred_prob_2 = lr.predict_proba(X_test_hstack)[:, 1]\n", 86 | "\n", 87 | " new_acc = accuracy_score(y_test, y_pred_2)\n", 88 | " new_auc = roc_auc_score(y_test, y_pred_prob_2)\n", 89 | " new_fscore = f1_score(y_test, y_pred_2)\n", 90 | " score = {'OriginalFeature':[fscore, acc, auc], 'NewFeature':[ new_fscore, new_acc, new_auc]}\n", 91 | " result = pd.DataFrame(score)\n", 92 | " metric_scores.append(result)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 3, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/html": [ 103 | "
\n", 104 | "\n", 117 | "\n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | "
OriginalFeatureNewFeature
GBDT + LRF10.8410700.875536
ACC0.8384000.872400
AUC0.9251390.946116
XGBoost + LRF10.8371360.872116
ACC0.8344000.869200
AUC0.9215740.943909
LightGBM + LRF10.9106580.921269
ACC0.9088000.919600
AUC0.9690110.971790
\n", 177 | "
" 178 | ], 179 | "text/plain": [ 180 | " OriginalFeature NewFeature\n", 181 | "GBDT + LR F1 0.841070 0.875536\n", 182 | " ACC 0.838400 0.872400\n", 183 | " AUC 0.925139 0.946116\n", 184 | "XGBoost + LR F1 0.837136 0.872116\n", 185 | " ACC 0.834400 0.869200\n", 186 | " AUC 0.921574 0.943909\n", 187 | "LightGBM + LR F1 0.910658 0.921269\n", 188 | " ACC 0.908800 0.919600\n", 189 | " AUC 0.969011 0.971790" 190 | ] 191 | }, 192 | "execution_count": 3, 193 | "metadata": {}, 194 | "output_type": "execute_result" 195 | } 196 | ], 197 | "source": [ 198 | "model_names = ['GBDT + LR', 'XGBoost + LR', 'LightGBM + LR'] \n", 199 | "model_metrics = ['F1', 'ACC', 'AUC']\n", 200 | "col_idx = pd.MultiIndex.from_product([model_names, model_metrics])\n", 201 | "df_contrast = pd.concat(metric_scores, axis=0)\n", 202 | "df_contrast.index = col_idx \n", 203 | "df_contrast" 204 | ] 205 | } 206 | ], 207 | "metadata": { 208 | "kernelspec": { 209 | "display_name": "Python 3", 210 | "language": "python", 211 | "name": "python3" 212 | }, 213 | "language_info": { 214 | "codemirror_mode": { 215 | "name": "ipython", 216 | "version": 3 217 | }, 218 | "file_extension": ".py", 219 | "mimetype": "text/x-python", 220 | "name": "python", 221 | "nbconvert_exporter": "python", 222 | "pygments_lexer": "ipython3", 223 | "version": "3.7.1" 224 | } 225 | }, 226 | "nbformat": 4, 227 | "nbformat_minor": 2 228 | } 229 | -------------------------------------------------------------------------------- /5. Ensemble Learning/Boosting/GBDT-LR/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /5. Ensemble Learning/Boosting/GBDT-LR/gbdt_lr.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import numpy as np 6 | from scipy.sparse import hstack 7 | from sklearn.datasets import make_classification 8 | from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.model_selection import train_test_split 11 | from sklearn.metrics import roc_auc_score, accuracy_score 12 | from sklearn.preprocessing import OneHotEncoder 13 | 14 | 15 | # 生成实验数据集 16 | X, y = make_classification(n_samples=10000, n_features=20, n_informative=18, n_redundant=2, 17 | n_classes=2, n_clusters_per_class=3, random_state=2017) 18 | 19 | # 划分训练集和测试集 20 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0) 21 | 22 | 23 | # 不生成新的特征,直接训练 24 | clf = GradientBoostingClassifier(n_estimators=50) 25 | clf.fit(X_train, y_train) 26 | y_pred = clf.predict(X_test) 27 | y_prob = clf.predict_proba(X_test)[:, 1] 28 | acc = accuracy_score(y_test, y_pred) 29 | auc = roc_auc_score(y_test, y_prob) 30 | print("Original featrues") 31 | print("GBDT_ACC: {:.6f}".format(acc)) 32 | print("GBDT_AUC: {:.6f}".format(auc)) 33 | 34 | 35 | # 生成的新特征, apply方法返回每个样本在每颗树叶节点的索引矩阵 36 | X_train_leaves = clf.apply(X_train)[:, :, 0] 37 | X_test_leaves = clf.apply(X_test)[:, :, 0] 38 | 39 | # 将X_train_leaves, X_test_leaves在axis=0方向上合并,再进行OneHotEncoder操作 40 | All_leaves = np.r_[X_train_leaves, X_test_leaves] 41 | 42 | # 索引矩阵每列不是0/1二值型离散特征,因此需要OneHotEncoder操作 43 | enc = OneHotEncoder(categories='auto') 44 | new_features = enc.fit_transform(All_leaves) 45 | 46 | # 根据原训练集、测试集的索引对新特征予以拆分 47 | train_samples = X_train.shape[0] 48 | X_train_new 
= new_features[:train_samples, :] 49 | X_test_new = new_features[train_samples: , :] 50 | 51 | # 将初始训练集与GBDT新生成的特征联合后再训练LR 52 | X_train_hstack = hstack([X_train_new, X_train]) 53 | X_test_hstack = hstack([X_test_new, X_test]) 54 | 55 | lr = LogisticRegression(solver='lbfgs', max_iter=1000) 56 | lr.fit(X_train_hstack, y_train) 57 | y_pred = lr.predict(X_test_hstack) 58 | y_prob = lr.predict_proba(X_test_hstack)[:, 1] 59 | 60 | # 进行预测 61 | GBDT_LR_ACC = accuracy_score(y_test, y_pred) 62 | GBDT_LR_AUC = roc_auc_score(y_test, y_prob) 63 | print("\nNew featrues: ") 64 | print('GBDT_LR_ACC: {:.6f}'.format(GBDT_LR_ACC)) 65 | print('GBDT_LR_AUC: {:.6f}'.format(GBDT_LR_AUC)) -------------------------------------------------------------------------------- /5. Ensemble Learning/Boosting/GBDT-LR/lightgbm_lr.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import numpy as np 6 | from scipy.sparse import hstack 7 | from lightgbm import LGBMClassifier 8 | from sklearn.datasets import make_classification 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.model_selection import train_test_split 11 | from sklearn.metrics import roc_auc_score, accuracy_score 12 | from sklearn.preprocessing import OneHotEncoder 13 | 14 | 15 | # 生成实验数据集 16 | X, y = make_classification(n_samples=10000, n_features=20, n_informative=18, n_redundant=2, 17 | n_classes=2, n_clusters_per_class=3, random_state=2017) 18 | 19 | # 划分训练集和测试集 20 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0) 21 | 22 | 23 | # 不生成新的特征,直接训练,用于后续的性能对比 24 | clf = LGBMClassifier(n_estimators=50) 25 | clf.fit(X_train, y_train) 26 | y_pred = clf.predict(X_test) 27 | y_prob = clf.predict_proba(X_test)[:, 1] 28 | acc = accuracy_score(y_test, y_pred) 29 | auc = roc_auc_score(y_test, y_prob) 30 | print("Original featrues") 31 | print("LGB_ACC: {:.6f}".format(acc)) 32 | print("LGB_AUC: {:.6f}".format(auc)) 33 | 34 | 35 | # 生成的新特征, predict方法返回每个样本在每颗树叶节点的索引矩阵 36 | X_train_leaves = clf.predict(X_train, pred_leaf=True) 37 | X_test_leaves = clf.predict(X_test, pred_leaf=True) 38 | 39 | # 将X_train_leaves, X_test_leaves在axis=0方向上合并,再进行OneHotEncoder操作 40 | All_leaves = np.r_[X_train_leaves, X_test_leaves] 41 | 42 | # 索引矩阵每列不是0/1二值型离散特征,因此需要OneHotEncoder操作 43 | enc = OneHotEncoder(categories='auto') 44 | new_features = enc.fit_transform(All_leaves) 45 | 46 | # 根据原训练集、测试集的索引对新特征予以拆分 47 | train_samples = X_train.shape[0] 48 | X_train_new = new_features[:train_samples, :] 49 | X_test_new = new_features[train_samples: , :] 50 | 51 | # 将初始训练集与GBDT新生成的特征联合后再训练LR 52 | X_train_hstack = hstack([X_train_new, X_train]) 53 | X_test_hstack = hstack([X_test_new, X_test]) 54 | lr = LogisticRegression(solver='lbfgs', max_iter=1000) 55 | lr.fit(X_train_hstack, y_train) 56 | 57 | # 进行预测 58 | y_pred = lr.predict(X_test_hstack) 59 | y_prob = lr.predict_proba(X_test_hstack)[:, 1] 60 | 61 | LGB_LR_ACC = accuracy_score(y_test, y_pred) 62 | LGB_LR_AUC = roc_auc_score(y_test, y_prob) 63 | print("\nNew featrues: ") 64 | print('LGB_LR_ACC: {:.6f}'.format(LGB_LR_ACC)) 65 | print('LGB_LR_AUC: {:.6f}'.format(LGB_LR_AUC)) -------------------------------------------------------------------------------- /5. 
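# Supplementary sketch (illustrative, not a file of the original repo): newer scikit-learn
# releases (>=0.24) no longer expose the private path sklearn.ensemble.gradient_boosting used
# above; the public import below is the stable form. The helper condenses the same
# leaf-index -> one-hot pipeline; the variable and function names are assumptions for illustration.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder

def gbdt_leaf_features(X_train, y_train, X_test, n_estimators=50):
    # Map each sample to the leaf it reaches in every tree, then one-hot encode the indices
    # so a linear model can consume them as sparse binary features.
    gbdt = GradientBoostingClassifier(n_estimators=n_estimators)
    gbdt.fit(X_train, y_train)
    leaf_train = gbdt.apply(X_train)[:, :, 0]   # shape: (n_samples, n_trees)
    leaf_test = gbdt.apply(X_test)[:, :, 0]
    # Fitting the encoder on the training leaves only (handle_unknown='ignore' covers leaf
    # indices unseen in training) avoids concatenating train and test as gbdt_lr.py does.
    enc = OneHotEncoder(categories='auto', handle_unknown='ignore')
    enc.fit(leaf_train)
    return enc.transform(leaf_train), enc.transform(leaf_test)

# Usage sketch, assuming X_train / X_test / y_train exist as above:
# X_tr_leaf, X_te_leaf = gbdt_leaf_features(X_train, y_train, X_test)
# lr = LogisticRegression(max_iter=1000).fit(scipy.sparse.hstack([X_tr_leaf, X_train]), y_train)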
Ensemble Learning/Boosting/GBDT-LR/xgboost_lr.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import numpy as np 6 | from scipy.sparse import hstack 7 | from xgboost import XGBClassifier 8 | from sklearn.datasets import make_classification 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn.model_selection import train_test_split 11 | from sklearn.metrics import roc_auc_score, accuracy_score 12 | from sklearn.preprocessing import OneHotEncoder 13 | 14 | 15 | # 生成实验数据集 16 | X, y = make_classification(n_samples=10000, n_features=20, n_informative=18, n_redundant=2, 17 | n_classes=2, n_clusters_per_class=3, random_state=2017) 18 | 19 | # 划分训练集和测试集 20 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0) 21 | 22 | 23 | # 不生成新的特征,直接训练 24 | clf = XGBClassifier(n_estimators=50) 25 | clf.fit(X_train, y_train) 26 | y_pred = clf.predict(X_test) 27 | y_prob = clf.predict_proba(X_test)[:, 1] 28 | acc = accuracy_score(y_test, y_pred) 29 | auc = roc_auc_score(y_test, y_prob) 30 | print("Original featrues") 31 | print("XGB_ACC: {:.6f}".format(acc)) 32 | print("XGB_AUC: {:.6f}".format(auc)) 33 | 34 | 35 | # 生成的新特征, apply方法返回每个样本在每颗树叶节点的索引矩阵 36 | X_train_leaves = clf.apply(X_train) 37 | X_test_leaves = clf.apply(X_test) 38 | 39 | # 将X_train_leaves, X_test_leaves在axis=0方向上合并,再进行OneHotEncoder操作 40 | All_leaves = np.r_[X_train_leaves, X_test_leaves] 41 | 42 | # 索引矩阵每列不是0/1二值型离散特征,因此需要OneHotEncoder操作 43 | enc = OneHotEncoder(categories='auto') 44 | new_features = enc.fit_transform(All_leaves) 45 | 46 | # 根据原训练集、测试集的索引对新特征予以拆分 47 | train_samples = X_train.shape[0] 48 | X_train_new = new_features[:train_samples, :] 49 | X_test_new = new_features[train_samples: , :] 50 | 51 | # 将初始训练集与GBDT新生成的特征联合后再训练LR 52 | X_train_hstack = hstack([X_train_new, X_train]) 53 | X_test_hstack = hstack([X_test_new, X_test]) 54 | 55 | lr = LogisticRegression(solver='lbfgs', max_iter=1000) 56 | lr.fit(X_train_hstack, y_train) 57 | y_pred = lr.predict(X_test_hstack) 58 | y_prob = lr.predict_proba(X_test_hstack)[:, 1] 59 | 60 | # 进行预测 61 | XGB_LR_ACC = accuracy_score(y_test, y_pred) 62 | XGB_LR_AUC = roc_auc_score(y_test, y_prob) 63 | print("\nNew featrues: ") 64 | print('XGB_LR_ACC: {:.6f}'.format(XGB_LR_ACC)) 65 | print('XGB_LR_AUC: {:.6f}'.format(XGB_LR_AUC)) -------------------------------------------------------------------------------- /5. Ensemble Learning/Boosting/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /5. Ensemble Learning/Boosting/gbdt_lr_contrast.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Machine-Learning/72d58e7187e003aebc4df0ae5914120640c4b80c/5. Ensemble Learning/Boosting/gbdt_lr_contrast.jpg -------------------------------------------------------------------------------- /5. Ensemble Learning/Boosting/xgb_custom_lossfunc.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import numpy as np 6 | from xgboost import XGBClassifier, XGBRegressor 7 | 8 | 9 | # 1. 
以ln(cosh(x))为损失函数 10 | def log_cosh_obj(y_true, y_pred): 11 | delta = y_pred - y_true 12 | grad = np.tanh(delta) 13 | hess = (1.0 - grad*grad) 14 | return grad, hess 15 | 16 | # 回归问题 17 | model = XGBRegressor(objective=log_cosh_obj) 18 | # 分类问题 19 | model = XGBClassifier(objective=log_cosh_obj) 20 | 21 | 22 | # 2. Pseudo-Huber loss function,可以近似替代MAE 23 | def huber_approx_obj(y_true, y_pred, h=1): 24 | # h为Pseudo-Huber loss function中的参数,用于调节坡度,其值越大,图像越陡峭 25 | d = y_pred - y_true 26 | scale = 1 + np.square(d / h) 27 | scale_sqrt = np.sqrt(scale) 28 | grad = d / scale_sqrt 29 | hess = 1 / scale / scale_sqrt 30 | return grad, hess 31 | 32 | # 回归问题 33 | model = XGBRegressor(objective=huber_approx_obj) 34 | # 分类问题 35 | model = XGBClassifier(objective=huber_approx_obj) 36 | 37 | 38 | # 3. 以log(exp(-x) + exp(x))为损失函数:更适合处理分类问题 39 | def log_exp(y_true, y_pred): 40 | d = y_pred - y_true 41 | t1 = np.exp(d) - np.exp(-d) 42 | t2 = np.exp(d) + np.exp(-d) 43 | grad = t1 / t2 44 | hess = 1.0 - grad**2 45 | return grad, hess 46 | 47 | # 分类问题 48 | model = XGBClassifier(objective=log_exp) 49 | -------------------------------------------------------------------------------- /5. Ensemble Learning/Boosting/xgb_early_stopping.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Machine-Learning/72d58e7187e003aebc4df0ae5914120640c4b80c/5. Ensemble Learning/Boosting/xgb_early_stopping.jpg -------------------------------------------------------------------------------- /5. Ensemble Learning/Boosting/xgb_early_stopping.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import numpy as np 6 | import pandas as pd 7 | import seaborn as sns 8 | from matplotlib import pyplot as plt 9 | from xgboost import XGBClassifier 10 | from sklearn.datasets import load_breast_cancer 11 | from sklearn.model_selection import train_test_split 12 | from sklearn.metrics import accuracy_score 13 | 14 | '''API 说明 15 | https://xgboost.readthedocs.io/en/latest/python/python_api.html#xgboost.XGBClassifier.fit 16 | 17 | #### API:fit(X, y, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, 18 | verbose=True, xgb_model=None, sample_weight_eval_set=None, callbacks=None) 19 | 20 | - **参数eval_set:** 设置验证集 21 | - 一个[(X, y)]形式的列表,数据集(X, y)作为验证集; 22 | - eval_set包含n个数据对时,以最后一个数据对validation_n的性能作为验证标准 23 | 24 | - **参数eval_metric:**设置验证指标 25 | - 模型的评判指标,常见的有"rmse"、"mae"、"logloss"、"error"、"merror"、"mlogloss"、"auc"等 26 | - 详情见https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst 27 | - 当设置多个m评判标准时,例如eval_metric=["auc","logloss"],则以后一个"logloss"为准 28 | 29 | - **参数early_stopping_rounds** 30 | - 若early_stopping_rounds=n,意味着模型性能指标在n轮迭代之内没有改善时,将会early_stop 31 | - early_stop发生后,模型会生成3个额外的属性:best_score、best_iteration、best_ntree_limit, 32 | - best_ntree_limit总是等于best_iteration + 1 33 | 34 | - **参数verbose:**布尔型参数,决定是否返回模型在验证集上的性能表现 35 | ''' 36 | 37 | X, y = load_breast_cancer(return_X_y=True) 38 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2018) 39 | model = XGBClassifier(n_estimators=350, learning_rate=0.25, reg_lambda=1.05, random_state=2018) 40 | eval_set = [(X_train, y_train), (X_test, y_test)] 41 | eval_metric = ["auc", "logloss"] 42 | model.fit(X_train, y_train, eval_set=eval_set, eval_metric=eval_metric, early_stopping_rounds=25, verbose=False) 43 | model_best_trees = 
XGBClassifier(n_estimators=model.best_ntree_limit, random_state=2018) 44 | description = "Best_iteration: {:} \nBest_trees: {:} \nBest_score: {:.3f}" 45 | print(description.format(model.best_iteration, model.best_ntree_limit, model.best_score)) 46 | 47 | y_pred = model.predict(X_test) 48 | accuracy = accuracy_score(y_test, y_pred) 49 | print("Accuracy_original: {:.3f}".format(accuracy)) 50 | 51 | model_best_trees.fit(X_train, y_train) 52 | y_pred_revised = model_best_trees.predict(X_test) 53 | accuracy = accuracy_score(y_test, y_pred_revised) 54 | print("Accuracy_revised: {:.3f}".format(accuracy)) 55 | 56 | # model.evals_result()以字典形式存储了模型训练过程中所有验证集上对应的各个性能指标的数值 57 | evals_result = model.evals_result() 58 | assert isinstance(evals_result, dict), 'evals_result为字典格式' 59 | 60 | # 单个性能指标的数值以列表形式存储,且列表长度等于验证次数,可以设置verbose=True来验证 61 | valid1_logloss = evals_result['validation_1']['logloss'] 62 | assert isinstance(valid1_logloss, list), 'valid1_logloss为列表格式' 63 | print('验证次数:{:}'.format(len(valid1_logloss))) 64 | 65 | train_auc = evals_result['validation_0']['auc'] 66 | train_logloss = evals_result['validation_0']['logloss'] 67 | 68 | test_auc = evals_result['validation_1']['auc'] 69 | test_logloss = evals_result['validation_1']['logloss'] 70 | 71 | evals_dict = {'Train_auc':train_auc, 'Test_auc':test_auc, 'Train_Logloss':train_logloss, 'Test_Logloss':test_logloss} 72 | metrics_result = pd.DataFrame(evals_dict) 73 | print(metrics_result.head()) 74 | 75 | 76 | # 将模型在[(X_train, y_train), (X_test, y_test)]上的性能表现随着迭代轮数的趋势可视化 77 | sns.set(font_scale=1.0, style='ticks', palette='summer') 78 | f, axes = plt.subplots(1, 2, figsize=(12, 4.5)) 79 | 80 | x = metrics_result.index 81 | [y1, y2, y3, y4] = [metrics_result[i] for i in metrics_result.columns] 82 | 83 | sns.lineplot(x, y1, ax=axes[0], color='navy', linestyle='--', label='Train AUC') 84 | sns.lineplot(x, y2, ax=axes[0], color="r", label='Test AUC') 85 | sns.lineplot(x, y3, ax=axes[1], color='navy', label='Train Logloss') 86 | sns.lineplot(x, y4, ax=axes[1], color="r", label='Test Logloss') 87 | 88 | # 构建start=0, stop=tick_end, step=10的等差数列,作为x轴的坐标 89 | tick_end = metrics_result.shape[0] 90 | ticks = np.arange(0, tick_end, 10) 91 | 92 | # 设置图像的轴标签 93 | for i in [0, 1]: 94 | if i == 0: 95 | axes[i].set_ylabel('AUC',fontsize=12) 96 | axes[i].set_xlabel('Rounds', fontsize=12) 97 | else: 98 | axes[i].set_ylabel('Logloss', fontsize=12) 99 | axes[i].set_xlabel('Rounds', fontsize=12) 100 | 101 | axes[i].set_xticks(ticks) 102 | sns.despine() 103 | plt.show() 104 | -------------------------------------------------------------------------------- /5. Ensemble Learning/Boosting/xgb_loss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Machine-Learning/72d58e7187e003aebc4df0ae5914120640c4b80c/5. Ensemble Learning/Boosting/xgb_loss.jpg -------------------------------------------------------------------------------- /5. Ensemble Learning/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /5. Ensemble Learning/Stacking/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /5. 
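# Supplementary sketch (illustrative, not a file of the original repo): since scikit-learn 0.22
# the same out-of-fold stacking scheme implemented below by StackingModels is also available as
# sklearn.ensemble.StackingClassifier / StackingRegressor, which can serve as a further reference
# point next to mlxtend's StackingCVClassifier. The base-learner choices here are assumptions.
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

base_learners = [
    ('rf', RandomForestClassifier(n_estimators=50, random_state=2018)),
    ('svc', SVC(kernel='rbf', gamma='auto', probability=True, random_state=2018)),
]
# cv=5: each base learner contributes 5-fold out-of-fold predictions that train the meta learner
stack_clf = StackingClassifier(estimators=base_learners,
                               final_estimator=LogisticRegression(max_iter=1000),
                               cv=5)
# stack_clf.fit(X_train, y_train); stack_clf.predict_proba(X_test)[:, 1]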
Ensemble Learning/Stacking/StackingModels_vs_Mlxtend.py: -------------------------------------------------------------------------------- 1 | # Author:MaXiao 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import numpy as np 6 | from sklearn.svm import SVC, SVR 7 | from xgboost import XGBClassifier, XGBRegressor 8 | from lightgbm import LGBMClassifier, LGBMRegressor 9 | from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor 10 | from sklearn.linear_model import LogisticRegression, LinearRegression 11 | from mlxtend.classifier import StackingCVClassifier 12 | from mlxtend.regressor import StackingCVRegressor 13 | from sklearn.datasets import make_classification, make_regression 14 | from sklearn.model_selection import train_test_split 15 | from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error 16 | from sklearn.preprocessing import StandardScaler 17 | from StackingModels import StackingModels 18 | 19 | 20 | X, y = make_classification(n_samples=10000, n_features=20, n_informative=18, n_clusters_per_class=3, hypercube=1, 21 | class_sep=0.85, random_state=2018) 22 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2018) 23 | 24 | scaler = StandardScaler() 25 | X_train, X_test = map(scaler.fit_transform, [X_train, X_test]) 26 | 27 | rf = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=2018, n_jobs=8) 28 | xgb = XGBClassifier(n_estimators=50, learning_rate=0.75, random_state=2018, n_jobs=8) 29 | lgb = LGBMClassifier(n_estimators=50, learning_rate=0.75, random_state=2018, n_jobs=8) 30 | svc = SVC(kernel='rbf', random_state=2018, probability=True, gamma='auto') 31 | lr = LogisticRegression(max_iter=1000, solver='lbfgs', penalty='l2', n_jobs=8) 32 | models = [rf, xgb, lgb, svc] 33 | y_pred_self, y_prob_self = StackingModels(models=models, meta_model=lr, X_train=X_train, X_test=X_test, y_train=y_train) 34 | acc = accuracy_score(y_test, y_pred_self) 35 | auc = roc_auc_score(y_test, y_prob_self) 36 | print('MyModel: ACC = {:.6f}, AUC = {:.6f}'.format(acc, auc)) 37 | stack_clf = StackingCVClassifier(classifiers=models, meta_classifier=lr, cv=5).fit(X_train, y_train) 38 | y_pred_mxltend, y_prob_mxltend = stack_clf.predict(X_test), stack_clf.predict_proba(X_test)[:, -1] 39 | acc = accuracy_score(y_test, y_pred_mxltend) 40 | auc = roc_auc_score(y_test, y_prob_mxltend) 41 | print('Mlxtend: ACC = {:.6f}, AUC = {:.6f}'.format(acc, auc)) 42 | 43 | 44 | X, y = make_regression(n_samples=5000, n_features=20, n_informative=18, random_state=2018) 45 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2018) 46 | X_train, X_test = map(scaler.fit_transform, [X_train, X_test]) 47 | 48 | rf = RandomForestRegressor(n_estimators=50, max_depth=5, random_state=2018, n_jobs=8) 49 | xgb = XGBRegressor(n_estimators=50, learning_rate=0.75, random_state=2018, n_jobs=8) 50 | lgb = LGBMRegressor(n_estimators=50, learning_rate=0.75, random_state=2018, n_jobs=8) 51 | svr = SVR(kernel='rbf', gamma='auto') 52 | lr = LinearRegression(n_jobs=8) 53 | models = [rf, xgb, lgb, svr] 54 | 55 | y_pred_self = StackingModels(models=models, meta_model=lr, X_train=X_train, 56 | X_test=X_test, y_train=y_train, use_probas=False, task_mode='reg') 57 | mse = mean_squared_error(y_test, y_pred_self) 58 | print('MyModel: MSE = {:.6f}'.format(mse)) 59 | 60 | stack_reg = StackingCVRegressor(regressors=models, meta_regressor=lr, cv=5).fit(X_train, y_train) 61 | y_pred_mxltend = 
stack_reg.predict(X_test) 62 | mse = mean_squared_error(y_test, y_pred_mxltend) 63 | print('Mlxtend: MSE = {:.6f}'.format(mse)) -------------------------------------------------------------------------------- /5. Ensemble Learning/Stacking/stacking_models.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import numpy as np 6 | from sklearn.model_selection import KFold 7 | 8 | 9 | def StackingModels(models, meta_model, X_train, y_train, X_test, task_mode='clf', return_proba=True, cv=5, random_state=2018): 10 | ntrain, ntest = X_train.shape[0], X_test.shape[0] 11 | kf = KFold(n_splits=cv, shuffle=True, random_state=random_state) 12 | 13 | def cross_validator(model): 14 | valid_pred = np.zeros((ntrain)) 15 | test_pred = np.zeros((ntest, cv)) 16 | for i, (train_index, valid_index) in enumerate(kf.split(X_train)): 17 | # 将初始训练集进行K折交叉检验,其中K-1折作为新的训练集,剩余1折作为验证集 18 | X_train_kfold, y_train_kfold = X_train[train_index], y_train[train_index] 19 | X_valid_kfold, y_valid_kfold = X_train[valid_index], y_train[valid_index] 20 | # 训练模型,并对验证集进行预测 21 | model.fit(X_train_kfold, y_train_kfold) 22 | valid_pred[valid_index] = model.predict(X_valid_kfold) 23 | # 对测试集进行预测 24 | test_pred[:, i] = model.predict(X_test) 25 | 26 | if task_mode == 'clf': 27 | test_pred_final = np.array([1 if i>0.5 else 0 for i in test_pred.mean(axis=1)]) 28 | elif task_mode == 'reg': 29 | test_pred_final = test_pred.mean(axis=1) 30 | 31 | return valid_pred, test_pred_final 32 | 33 | # 生成第二级的训练集和测试集 34 | train_second = np.zeros((ntrain, len(models))) 35 | test_second = np.zeros((ntest, len(models))) 36 | for i, j in enumerate(map(cross_validator, models)): 37 | train_second[:, i] = j[0] 38 | test_second[:, i] = j[1] 39 | assert train_second.shape == (ntrain, len(models)) 40 | assert test_second.shape == (ntest, len(models)) 41 | 42 | meta_model.fit(train_second, y_train) 43 | test_pred = meta_model.predict(test_second) 44 | 45 | if task_mode == 'clf' and return_proba: 46 | test_prob = meta_model.predict_proba(test_second)[:, -1] 47 | return test_pred, test_prob 48 | else: 49 | return test_pred 50 | -------------------------------------------------------------------------------- /6. Cluster Analysis/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /7. Model Evaluation/Pics/ks curve.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Machine-Learning/72d58e7187e003aebc4df0ae5914120640c4b80c/7. Model Evaluation/Pics/ks curve.jpg -------------------------------------------------------------------------------- /7. Model Evaluation/Pics/prc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Machine-Learning/72d58e7187e003aebc4df0ae5914120640c4b80c/7. Model Evaluation/Pics/prc.jpg -------------------------------------------------------------------------------- /7. Model Evaluation/Pics/roc.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Albertsr/Machine-Learning/72d58e7187e003aebc4df0ae5914120640c4b80c/7. Model Evaluation/Pics/roc.jpg -------------------------------------------------------------------------------- /7. 
Model Evaluation/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /7. Model Evaluation/ks_curve.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import numpy as np 6 | from sklearn.metrics import confusion_matrix 7 | from matplotlib import pyplot as plt 8 | 9 | 10 | def plot_ks(y_true, y_prob, thresholds_num=1000): 11 | 12 | thresholds = np.linspace(np.min(y_prob), np.max(y_prob), thresholds_num) 13 | def tpr_fpr_delta(threshold): 14 | y_pred = np.array([int(i>threshold) for i in y_prob]) 15 | tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() 16 | tpr = tp / (tp+fn) 17 | fpr = fp / (fp+tn) 18 | delta = tpr - fpr 19 | return tpr, fpr, delta 20 | 21 | tprs, fprs, deltas = np.vectorize(tpr_fpr_delta)(thresholds) 22 | target_tpr = tprs[np.argmax(deltas)] 23 | target_fpr = fprs[np.argmax(deltas)] 24 | target_threshold = thresholds[np.argmax(deltas)] 25 | ks_value = np.max(deltas) 26 | 27 | plt.figure(figsize=(8, 4)) 28 | plt.plot(thresholds, tprs, label='TPR', color='r', linestyle='-', linewidth=1.5) 29 | plt.legend(loc='upper right') 30 | plt.plot(thresholds, fprs, label='FPR', color='k', linestyle='-', linewidth=1.5) 31 | plt.legend(loc='upper right') 32 | plt.xlabel('Threshold', fontsize=10) 33 | plt.ylabel('TPR, FPR', fontsize=10) 34 | plt.annotate('KS Value : {:.6f}'.format(ks_value), xy=(target_threshold+0.01, 0.1+0.5*ks_value)) 35 | plt.xticks() 36 | 37 | 38 | # 要连接的两个点的坐标 39 | x = [[target_threshold, target_threshold]] 40 | y = [[target_fpr, target_tpr]] 41 | 42 | for i in range(len(x)): 43 | plt.plot(x[i], y[i], 'b--', lw=1.5) 44 | plt.scatter(x[i], y[i], c='b', s=15) # s控制点的大小 45 | plt.annotate('TPR : {:.6f}'.format(target_tpr), xy=([target_threshold, target_tpr]), xytext=(0.3, target_tpr), 46 | arrowprops=dict(arrowstyle="<-", color='r')) 47 | plt.annotate('FPR : {:.6f}'.format(target_fpr), xy=([target_threshold, target_fpr]), xytext=(0.3, target_fpr), 48 | arrowprops=dict(arrowstyle="<-", color='k')) 49 | plt.show() 50 | -------------------------------------------------------------------------------- /7. Model Evaluation/ks_value.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | 6 | import numpy as np 7 | from sklearn.metrics import confusion_matrix, make_scorer 8 | 9 | 10 | def get_ks(y_true, y_prob, thresholds_num=500): 11 | # 生成一系列阈值 12 | thresholds = np.linspace(np.min(y_prob), np.max(y_prob), thresholds_num) 13 | 14 | def tpr_fpr_delta(threshold): 15 | y_pred = np.array([int(i>threshold) for i in y_prob]) 16 | tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel() 17 | fpr = fp / (fp+tn) 18 | tpr = tp / (tp+fn) 19 | delta = tpr - fpr 20 | return delta 21 | 22 | max_delta = np.max([tpr_fpr_delta(threshold) for threshold in thresholds]) 23 | return max_delta 24 | -------------------------------------------------------------------------------- /7. 
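# Supplementary sketch (illustrative, not a file of the original repo): the KS statistic computed
# above by scanning a grid of thresholds with confusion_matrix can also be read directly off
# sklearn.metrics.roc_curve, since KS = max(TPR - FPR) over all thresholds; this uses every
# distinct predicted probability as a cut-off and avoids the Python-level loop.
import numpy as np
from sklearn.metrics import roc_curve

def ks_from_roc(y_true, y_prob):
    # roc_curve already returns the (FPR, TPR) pair for each implied threshold
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    return np.max(tpr - fpr)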
Model Evaluation/prc.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | 6 | from sklearn.metrics import precision_recall_curve 7 | from matplotlib import pyplot as plt 8 | 9 | 10 | def plot_prc(y_true, y_prob): 11 | precision, recall, thresholds = precision_recall_curve(y_true, y_prob) 12 | plt.plot(recall, precision, color='red', linestyle='-', linewidth=1.5) 13 | plt.xlabel('TPR', fontsize=10) 14 | plt.ylabel('Precison', fontsize=10) 15 | plt.title('Precison-Recall Curve') 16 | plt.show() -------------------------------------------------------------------------------- /7. Model Evaluation/roc.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | from sklearn.metrics import roc_curve 6 | from matplotlib import pyplot as plt 7 | 8 | def plot_roc(y_true, y_prob): 9 | auc = roc_auc_score(y_true, y_prob) 10 | fprs, tprs, thresholds = roc_curve(y_true, y_prob, pos_label=1) 11 | plt.figure(figsize=(8, 4)) 12 | plt.plot(fprs, tprs, 'r-', label='ROC', lw=1.5) 13 | plt.fill_between(fprs, tprs, color='lightcoral', alpha=.25) 14 | plt.annotate('AUC : {:.3f}'.format(auc), xy=(0.4, 0.4), xytext=(0.4, 0.5), color='k', fontsize=13) 15 | plt.legend(loc='lower right') 16 | plt.xlabel('FPR',fontsize=10) 17 | plt.ylabel('Recall', fontsize=10) 18 | plt.title('ROC') 19 | plt.show() 20 | -------------------------------------------------------------------------------- /7. Model Evaluation/交叉验证.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "#### Author:马肖\n", 8 | "#### E-Mail:maxiaoscut@aliyun.com\n", 9 | "#### GitHub:https://github.com/Albertsr" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## 交叉验证" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": {}, 22 | "source": [ 23 | "- 在未设置验证集的情况下,根据模型在测试集上的表现持续优化模型参数,这一过程可能使得模型在测试集上存在过拟合,因为上述过程中测试集的知识可以“泄漏”到模型中,并且评估指标不再报告泛化性能。\n", 24 | "\n", 25 | "\n", 26 | "- 为了解决这个问题,数据集的某一部分可以作为验证集:模型训练完成以后,在验证集上对模型进行评估,当模型在验证集上性能较好时,最后在测试集上完成评估。\n", 27 | "\n", 28 | "\n", 29 | "- 然而,通过将原始数据分为3个数据集合就大大减少了可用于模型学习的样本数量,并且得到的结果依赖于集合对(训练,验证)的随机选择。解决这个问题的方法是做交叉验证(简称CV)。测试集仍应保留以供最终评估,但不再需要建立专属的验证集。\n", 30 | "\n", 31 | "\n", 32 | "- k-fold交叉验证过程如下:\n", 33 | " - 训练集被不重复地划分(均分)为k份\n", 34 | " - 将每个子集分别做一次验证集,其余的K-1组子集作为训练集,由此得到K个模型\n", 35 | " - 上述K个模型的分类准确率的平均数作为此K-CV下分类器的性能指标" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 1, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "import numpy as np\n", 47 | "import pandas as pd\n", 48 | "pd.set_option(\"precision\",6)\n", 49 | "np.set_printoptions(precision=6)\n", 50 | "\n", 51 | "from sklearn import metrics\n", 52 | "from sklearn.datasets import load_breast_cancer\n", 53 | "from sklearn.model_selection import train_test_split\n", 54 | "from xgboost import XGBClassifier\n", 55 | "\n", 56 | "\n", 57 | "cancer = load_breast_cancer()\n", 58 | "X, y = load_breast_cancer(return_X_y=True)\n", 59 | "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2018)\n", 60 | "clf_xgb = XGBClassifier().fit(X_train, y_train)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": {}, 66 | 
"source": [ 67 | "### 1.1 cross_val_score返回模型交叉验证的分数,有助于准确了解模型的性能" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": {}, 73 | "source": [ 74 | "#### API\n", 75 | "model_selection.cross_val_score(estimator, X, y=None, groups=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch=‘2*n_jobs’)\n", 76 | "\n", 77 | "参数scoring只能取单个值" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "### 参数scoring\n", 85 | "#### 包含scoring参数的函数\n", 86 | "sklearn.model_selection.GridSearchCV\n", 87 | "\n", 88 | "sklearn.model_selection.RandomizedSearchCV\n", 89 | "\n", 90 | "sklearn.model_selection.cross_validate\n", 91 | "\n", 92 | "sklearn.model_selection.cross_val_score\n", 93 | "\n", 94 | "sklearn.model_selection.learning_curve\n", 95 | "\n", 96 | "sklearn.model_selection.validation_curve\n", 97 | "\n", 98 | "sklearn.model_selection.permutation_test_score\n", 99 | "\n", 100 | "#### scoring参数的取值\n", 101 | "http://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 2, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/html": [ 112 | "
\n", 113 | "\n", 126 | "\n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | "
Score
Metrics
accuracy0.960 (+/- 0.016)
roc_auc0.991 (+/- 0.013)
average_precision0.994 (+/- 0.010)
f10.969 (+/- 0.012)
\n", 156 | "
" 157 | ], 158 | "text/plain": [ 159 | " Score\n", 160 | "Metrics \n", 161 | "accuracy 0.960 (+/- 0.016)\n", 162 | "roc_auc 0.991 (+/- 0.013)\n", 163 | "average_precision 0.994 (+/- 0.010)\n", 164 | "f1 0.969 (+/- 0.012)" 165 | ] 166 | }, 167 | "execution_count": 2, 168 | "metadata": {}, 169 | "output_type": "execute_result" 170 | } 171 | ], 172 | "source": [ 173 | "from sklearn.model_selection import cross_val_score\n", 174 | "metric = [\"accuracy\", \"roc_auc\", \"average_precision\", \"f1\"]\n", 175 | "\n", 176 | "# 参数scoring只能取单个值\n", 177 | "scores = [cross_val_score(clf_xgb, X_train, y_train, scoring=i, cv=5) for i in metric]\n", 178 | "result = [(\"%0.3f (+/- %0.3f)\" % (i.mean(), i.std())) for i in scores]\n", 179 | "\n", 180 | "scores_df = pd.DataFrame(result, index=metric, columns=['Score'])\n", 181 | "scores_df.index.set_names(\"Metrics\", inplace=True)\n", 182 | "scores_df" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "### 1.2 model_selection.cross_validate:返回模型的训练时长和分数信息" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "#### API \n", 197 | "model_selection.cross_validate(estimator, X, y=None, groups=None, scoring=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch=‘2*n_jobs’, return_train_score=’warn’)\n", 198 | "\n", 199 | "##### 返回一个字典,包含以下信息\n", 200 | "1) test_score: The score array for test scores on each cv split.\n", 201 | "\n", 202 | "2) train_score: The score array for train scores on each cv split. This is available only if return_train_score parameter is True.\n", 203 | "\n", 204 | "3) fit_time: The time for fitting the estimator on the train set for each cv split.\n", 205 | "\n", 206 | "4) score_time: The time for scoring the estimator on the test set for each cv split. (注:return_train_score无论是否设置为True,都不会返回训练集的评分时间)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 3, 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/plain": [ 217 | "{'fit_time': array([ 0.080217, 0.048279, 0.068588]),\n", 218 | " 'score_time': array([ 0.005009, 0.01 , 0.004011]),\n", 219 | " 'test_accuracy': array([ 0.93007 , 0.950704, 0.950355]),\n", 220 | " 'test_average_precision': array([ 0.983104, 0.984866, 0.996406]),\n", 221 | " 'test_f1': array([ 0.943182, 0.961326, 0.961749]),\n", 222 | " 'test_roc_auc': array([ 0.97631 , 0.980496, 0.993734]),\n", 223 | " 'train_accuracy': array([ 1., 1., 1.]),\n", 224 | " 'train_average_precision': array([ 1., 1., 1.]),\n", 225 | " 'train_f1': array([ 1., 1., 1.]),\n", 226 | " 'train_roc_auc': array([ 1., 1., 1.])}" 227 | ] 228 | }, 229 | "execution_count": 3, 230 | "metadata": {}, 231 | "output_type": "execute_result" 232 | } 233 | ], 234 | "source": [ 235 | "# 参数scoring能取多个值,可用列表或元组形式输入\n", 236 | "from sklearn.model_selection import cross_validate\n", 237 | "scores = cross_validate(clf_xgb, X_train, y_train, scoring=metric, return_train_score=True)\n", 238 | "scores " 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 4, 244 | "metadata": {}, 245 | "outputs": [ 246 | { 247 | "data": { 248 | "text/html": [ 249 | "
\n", 250 | "\n", 263 | "\n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | "
fit_timescore_timetest_accuracytrain_accuracytest_roc_auctrain_roc_auctest_average_precisiontrain_average_precisiontest_f1train_f1
00.0802170.0050090.9300701.00.9763101.00.9831041.00.9431821.0
10.0482790.0100000.9507041.00.9804961.00.9848661.00.9613261.0
20.0685880.0040110.9503551.00.9937341.00.9964061.00.9617491.0
\n", 321 | "
" 322 | ], 323 | "text/plain": [ 324 | " fit_time score_time test_accuracy train_accuracy test_roc_auc \\\n", 325 | "0 0.080217 0.005009 0.930070 1.0 0.976310 \n", 326 | "1 0.048279 0.010000 0.950704 1.0 0.980496 \n", 327 | "2 0.068588 0.004011 0.950355 1.0 0.993734 \n", 328 | "\n", 329 | " train_roc_auc test_average_precision train_average_precision test_f1 \\\n", 330 | "0 1.0 0.983104 1.0 0.943182 \n", 331 | "1 1.0 0.984866 1.0 0.961326 \n", 332 | "2 1.0 0.996406 1.0 0.961749 \n", 333 | "\n", 334 | " train_f1 \n", 335 | "0 1.0 \n", 336 | "1 1.0 \n", 337 | "2 1.0 " 338 | ] 339 | }, 340 | "execution_count": 4, 341 | "metadata": {}, 342 | "output_type": "execute_result" 343 | } 344 | ], 345 | "source": [ 346 | "pd.DataFrame(scores)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": {}, 352 | "source": [ 353 | "### 1.3 cross_val_predict:通过交叉验证获取预测值" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | "#### API\n", 361 | "- [cross_val_predict](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html#sklearn.model_selection.cross_val_predict)\n", 362 | "\n", 363 | "- model_selection.cross_val_predict(estimator, X, y=None, groups=None, cv=None, n_jobs=1, verbose=0, fit_params=None, pre_dispatch=‘2*n_jobs’, method=’predict’)" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 5, 369 | "metadata": {}, 370 | "outputs": [ 371 | { 372 | "name": "stdout", 373 | "output_type": "stream", 374 | "text": [ 375 | "Accuracy : 0.960\n" 376 | ] 377 | } 378 | ], 379 | "source": [ 380 | "from sklearn.model_selection import cross_val_predict\n", 381 | "y_pred = cross_val_predict(clf_xgb, X_train, y_train, cv=5) \n", 382 | "acc = metrics.accuracy_score(y_train, y_pred)\n", 383 | "print(\"Accuracy : %.3f\" % acc)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "## 1.4 多种交叉验证" 391 | ] 392 | }, 393 | { 394 | "cell_type": "markdown", 395 | "metadata": {}, 396 | "source": [ 397 | "#### 1.4.1 K-fold: K-折交叉验证" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 6, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "[2 3 4 5] [0 1]\n", 410 | "[0 1 4 5] [2 3]\n", 411 | "[0 1 2 3] [4 5]\n" 412 | ] 413 | } 414 | ], 415 | "source": [ 416 | "from sklearn.model_selection import KFold\n", 417 | "\n", 418 | "X = [\"a\", \"b\", \"c\", \"d\",\"e\",\"f\"]\n", 419 | "\n", 420 | "#将X分为3份,2份作为训练集,剩余1份作为验证集\n", 421 | "kf = KFold(n_splits=3) \n", 422 | "for train, test in kf.split(X):\n", 423 | " print(\"%s %s\" % (train, test))" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": {}, 429 | "source": [ 430 | "#### 1.4.2 StratifiedKFold: 分层 k 折\n", 431 | "StratifiedKFold 是k-fold的变种,每个小集合中标签类别比例近似于完整训练集中的比例" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 7, 437 | "metadata": { 438 | "scrolled": true 439 | }, 440 | "outputs": [ 441 | { 442 | "name": "stdout", 443 | "output_type": "stream", 444 | "text": [ 445 | "[2 3 6 7 8 9] [0 1 4 5]\n", 446 | "[0 1 3 4 5 8 9] [2 6 7]\n", 447 | "[0 1 2 4 5 6 7] [3 8 9]\n" 448 | ] 449 | } 450 | ], 451 | "source": [ 452 | "from sklearn.model_selection import StratifiedKFold\n", 453 | "\n", 454 | "X = np.ones(10)\n", 455 | "y = [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]\n", 456 | "skf = StratifiedKFold(n_splits=3)\n", 457 | "for train, test in skf.split(X, y):\n", 458 | " 
print(\"%s %s\" % (train, test))" 459 | ] 460 | }, 461 | { 462 | "cell_type": "markdown", 463 | "metadata": {}, 464 | "source": [ 465 | "#### 1.4.2 RepeatedKFold:重复 K-折交叉验证" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 8, 471 | "metadata": {}, 472 | "outputs": [ 473 | { 474 | "name": "stdout", 475 | "output_type": "stream", 476 | "text": [ 477 | "[1 2] [0 3]\n", 478 | "[0 3] [1 2]\n", 479 | "[0 2] [1 3]\n", 480 | "[1 3] [0 2]\n", 481 | "[1 2] [0 3]\n", 482 | "[0 3] [1 2]\n" 483 | ] 484 | } 485 | ], 486 | "source": [ 487 | "from sklearn.model_selection import RepeatedKFold\n", 488 | "X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])\n", 489 | "\n", 490 | "rkf = RepeatedKFold(n_splits=2, n_repeats=3, random_state=123)\n", 491 | "for train, test in rkf.split(X):\n", 492 | " print(\"%s %s\" % (train, test))" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | "#### 1.4.3 留一交叉验证 (LOO)" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": 9, 505 | "metadata": { 506 | "scrolled": true 507 | }, 508 | "outputs": [ 509 | { 510 | "name": "stdout", 511 | "output_type": "stream", 512 | "text": [ 513 | "[1 2 3] [0]\n", 514 | "[0 2 3] [1]\n", 515 | "[0 1 3] [2]\n", 516 | "[0 1 2] [3]\n" 517 | ] 518 | } 519 | ], 520 | "source": [ 521 | "# 只留一个样本作为验证集\n", 522 | "from sklearn.model_selection import LeaveOneOut\n", 523 | "\n", 524 | "X = [1, 2, 3, 4]\n", 525 | "loo = LeaveOneOut()\n", 526 | "for train, test in loo.split(X):\n", 527 | " print(\"%s %s\" % (train, test))" 528 | ] 529 | }, 530 | { 531 | "cell_type": "markdown", 532 | "metadata": { 533 | "collapsed": true 534 | }, 535 | "source": [ 536 | "#### 1.4.4 理解model_selection.ShuffleSplit" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": {}, 542 | "source": [ 543 | "##### Class sklearn.model_selection.ShuffleSplit(n_splits=10, test_size=’default’, train_size=None, random_state=None)" 544 | ] 545 | }, 546 | { 547 | "cell_type": "markdown", 548 | "metadata": {}, 549 | "source": [ 550 | "##### 类的方法:split(X, y=None, groups=None):返回训练集和测试集在被划分数据集X中的索引" 551 | ] 552 | }, 553 | { 554 | "cell_type": "code", 555 | "execution_count": 10, 556 | "metadata": {}, 557 | "outputs": [ 558 | { 559 | "name": "stdout", 560 | "output_type": "stream", 561 | "text": [ 562 | "(426,) 568 0\n", 563 | "(143,) 567 1\n", 564 | "**********************\n", 565 | "(426,) 568 0\n", 566 | "(143,) 567 1\n", 567 | "**********************\n", 568 | "(426,) 568 0\n", 569 | "(143,) 566 3\n", 570 | "**********************\n" 571 | ] 572 | } 573 | ], 574 | "source": [ 575 | "from sklearn.model_selection import ShuffleSplit\n", 576 | "\n", 577 | "\n", 578 | "X, y = load_breast_cancer(return_X_y=True)\n", 579 | "rs = ShuffleSplit(n_splits=3, test_size=.25, random_state=0)\n", 580 | "for train_index, test_index in rs.split(X, y):\n", 581 | " print(train_index.shape, train_index.max(), train_index.min()) \n", 582 | " print(test_index.shape, test_index.max(), test_index.min()) \n", 583 | " print(\"**********************\")" 584 | ] 585 | }, 586 | { 587 | "cell_type": "markdown", 588 | "metadata": {}, 589 | "source": [ 590 | "##### 将ShuffleSplit类作为cross_val_score中的cv参数" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 11, 596 | "metadata": {}, 597 | "outputs": [ 598 | { 599 | "data": { 600 | "text/plain": [ 601 | "array([ 0.9375 , 0.953125, 0.96875 , 0.953125, 0.960938])" 602 | ] 603 | }, 604 | "execution_count": 11, 605 | "metadata": {}, 606 | 
"output_type": "execute_result" 607 | } 608 | ], 609 | "source": [ 610 | "# cv=rs不是指将数据集划分为n_splits等份,而是根据ShuffleSplit函数中的参数test_size来划分n_splits份数据\n", 611 | "rs = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)\n", 612 | "cross_val_score(clf_xgb, X_train, y_train, cv=rs)" 613 | ] 614 | } 615 | ], 616 | "metadata": { 617 | "kernelspec": { 618 | "display_name": "Python 3", 619 | "language": "python", 620 | "name": "python3" 621 | }, 622 | "language_info": { 623 | "codemirror_mode": { 624 | "name": "ipython", 625 | "version": 3 626 | }, 627 | "file_extension": ".py", 628 | "mimetype": "text/x-python", 629 | "name": "python", 630 | "nbconvert_exporter": "python", 631 | "pygments_lexer": "ipython3", 632 | "version": "3.7.1" 633 | } 634 | }, 635 | "nbformat": 4, 636 | "nbformat_minor": 2 637 | } 638 | -------------------------------------------------------------------------------- /8. Model Persistence/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /8. Model Persistence/joblib.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | from sklearn.datasets import load_breast_cancer 6 | from sklearn.externals import joblib 7 | from sklearn.linear_model import LogisticRegression as LR 8 | from sklearn.model_selection import train_test_split 9 | 10 | 11 | X, y = load_breast_cancer(return_X_y=True) 12 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2018) 13 | clf = LR().fit(X_train, y_train) 14 | 15 | 运用joblib序列化和反序列化机器学习模型 16 | with open('cancer_joblib', 'wb') as model: 17 | joblib.dump(clf, model) 18 | 19 | 20 | with open('cancer_joblib','rb') as model: 21 | clf = joblib.load(model) 22 | result = clf.score(X_test, y_test) 23 | print('算法评估结果:{:.2%}'.format(result)) -------------------------------------------------------------------------------- /8. Model Persistence/pickle.py: -------------------------------------------------------------------------------- 1 | # Author:马肖 2 | # E-mail:maxiaoscut@aliyun.com 3 | # Github:https://github.com/Albertsr 4 | 5 | import pickle 6 | from sklearn.datasets import load_breast_cancer 7 | from sklearn.linear_model import LogisticRegression as LR 8 | from sklearn.model_selection import train_test_split 9 | 10 | 11 | X, y = load_breast_cancer(return_X_y=True) 12 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2018) 13 | clf = LR().fit(X_train, y_train) 14 | 15 | # 运用pickle序列化机器学习模型, 保存为字符串形式 16 | s = pickle.dumps(clf) 17 | 18 | # 反序列化 19 | clf_load = pickle.loads(s) 20 | 21 | # 输出模型预测精度 22 | print(clf_load.score(X_test, y_test)) 23 | 24 | # 用dump(object, file) 将模型保存至磁盘 25 | with open('clf_pickle', 'wb') as model: 26 | pickle.dump(clf, model) 27 | 28 | # 运用pickle调用模型,并输出模型结果 29 | with open('clf_pickle', 'rb') as model: 30 | loaded_clf = pickle.load(model) 31 | result = loaded_clf.score(X_test,y_test) 32 | print('算法评估结果:%.2f%%' % (result*100)) -------------------------------------------------------------------------------- /9. 
The Foundations of ML/ReadMe.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | - **Author:** Maxiao 2 | - **E-Mail:** maxiaoscut@aliyun.com 3 | - **GitHub:** https://github.com/Albertsr 4 | --- 5 | 6 | ### 备注: 7 | #### 1. 以下链接均为个人原创整理,理论部分主要为有道云笔记MarkDown文件 8 | #### 2. 部分Markdown文件包含大量LaTeX数学公式,请耐心等待加载 9 | 10 | --- 11 | 12 | ## 第一部分:机器学习算法 13 | 14 | ### 1. 经典非集成监督算法 15 | #### 1.1 【逻辑回归】理论详解:[Logistic Regression](http://note.youdao.com/noteshare?id=a0d1a51a06b27665adb25196b9302a3a&sub=EF059197C6EA477E976ADD9494033C47) 16 | 17 | #### 1.2 【决策树】理论详解:[Decision Tree](http://note.youdao.com/noteshare?id=1df47deec7d30a99b436f3c0e801db24&sub=1BCABBDFFA50404582372B2BCE9159F1) 18 | 19 | #### 1.3 【朴素贝叶斯】理论详解: [Naive Bayesian](http://note.youdao.com/noteshare?id=7c88fb8f65d118d5c820555f865c45a7&sub=5494C55210934C17ADF503B73317851C) 20 | 21 | #### 1.4 【支持向量机】理论详解 22 | - **[1)凸二次规划、拉格朗日对偶性与KKT条件](http://note.youdao.com/noteshare?id=8ed93129261f6f0805be1fd7d3acbc24&sub=6945BC170DD54C559B0F689FEB8AFDE9)** 23 | 24 | - **[2)硬间隔最大化](http://note.youdao.com/noteshare?id=2b792aa786a8d30b1a4e7108cbadf4f1&sub=410912446FD9468ABD77F581020AC8D1)** 25 | 26 | - **[3)软间隔最大化](http://note.youdao.com/noteshare?id=ba08bc2004bde1a8e7a534d942448462&sub=B08F68399D674A44A5B4A09CB19CDEDF)** 27 | 28 | - **[4)Kernel Trick](http://note.youdao.com/noteshare?id=fc70222f0ed0be3e41a93cdd1835bd14&sub=FDE74AB1C280401A831083001581655F)** 29 | 30 | --- 31 | 32 | ### 2. 集成算法 33 | #### 2.1 Boosting 34 | - **1)GBDT:[GBDT理论详解](http://note.youdao.com/noteshare?id=68a1bb88a57b867b54196f18e7ebdfcd&sub=E097CC28CB2747DCBF60FA967D93239A)** 35 | 36 | - **2)GBDT+LR:[GBDT与LR的融合理论详解](http://note.youdao.com/noteshare?id=7a3116acb15caae65a3856e6078aa2f0&sub=46BE3B40DB1A4079AC223991FAC88BD0)** 37 | - **[代码:lightgbm_lr.py](https://github.com/Albertsr/Machine-Learning/blob/master/5.%20Ensemble%20Learning/Boosting/GBDT-LR/lightgbm_lr.py)** 38 | - **[代码:xgboost_lr.py](https://github.com/Albertsr/Machine-Learning/blob/master/5.%20Ensemble%20Learning/Boosting/GBDT-LR/xgboost_lr.py)** 39 | - **[代码:gbdt_lr.py](https://github.com/Albertsr/Machine-Learning/blob/master/5.%20Ensemble%20Learning/Boosting/GBDT-LR/gbdt_lr.py)** 40 | - **[GBDT系列算法与LR融合与性能对比](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/5.%20Ensemble%20Learning/Boosting/GBDT-LR/GBDT%E7%B3%BB%E5%88%97%E4%B8%8ELR%E7%9A%84%E8%9E%8D%E5%90%88%26%E6%80%A7%E8%83%BD%E5%AF%B9%E6%AF%94.ipynb)** 41 | 42 | ![gbdt_lr_contrast](https://github.com/Albertsr/Machine-Learning/blob/master/5.%20Ensemble%20Learning/Boosting/gbdt_lr_contrast.jpg) 43 | 44 | 45 | - **3)XGBoost:[XGBoost理论详解](http://note.youdao.com/noteshare?id=8ec0afbb4b92a3ccfde94decd3bb2432&sub=2A73304730AF4BC0B0F8C53ECCA22917)** 46 | 47 | - **自定义损失函数:** 48 | - **[Jupyter:XGB自定义损失.ipynb](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/5.%20Ensemble%20Learning/Boosting/XGB%E8%87%AA%E5%AE%9A%E4%B9%89%E6%8D%9F%E5%A4%B1%26%E5%8F%AF%E8%A7%86%E5%8C%96.ipynb)** 49 | - **[代码:xgb_custom_lossfunc.py](https://github.com/Albertsr/Machine-Learning/blob/master/5.%20Ensemble%20Learning/Boosting/xgb_custom_lossfunc.py)** 50 | 51 | ![xgb_loss](https://github.com/Albertsr/Machine-Learning/blob/master/5.%20Ensemble%20Learning/Boosting/xgb_loss.jpg) 52 | 53 | - **通过early_stopping确定合理的基学习器个数:** 54 | - 
**[Jupyter:early_stopping_rounds](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/5.%20Ensemble%20Learning/Boosting/early_stopping_rounds.ipynb)** 55 | - **[代码:xgb_early_stopping.py](https://github.com/Albertsr/Machine-Learning/blob/master/5.%20Ensemble%20Learning/Boosting/xgb_early_stopping.py)** 56 | 57 | ![xgb_early_stopping](https://github.com/Albertsr/Machine-Learning/blob/master/5.%20Ensemble%20Learning/Boosting/xgb_early_stopping.jpg) 58 | 59 | - **4)AdaBoost:[AdaBoost详解](http://note.youdao.com/noteshare?id=d0c70dcd9b716b70ecf08fb962279955&sub=27C7D5F6889241868216754956E07E5D)** 60 | 61 | - **5) LightGBM:[LightGBM详解](http://note.youdao.com/noteshare?id=be2a01188207b095ac37af107e0ec614&sub=CA3B40E0068A495EA0019421494423A4)** 62 | 63 | --- 64 | 65 | #### 2.2 Bagging 66 | - **[Bagging减少variance,Boosting减少bias](http://note.youdao.com/noteshare?id=5a75ad193efd2341a2b9a6c7dbf5ba9a&sub=45E8EAAE1075459695FA53B451DB7F1B)** 67 | 68 | 69 | #### 2.3 Stacking 70 | - **1)Stacking:[Stacking详解](http://note.youdao.com/noteshare?id=c7891b8ad0e3013e176cb73536bdfad8&sub=943369E1A3B446FC932951A45BE7986B)** 71 | 72 | - **2)二级Stacking的个人实现:[stacking_models.py](https://github.com/Albertsr/Machine-Learning/blob/master/5.%20Ensemble%20Learning/Stacking/stacking_models.py)** 73 | 74 | - **3)个人实现与Mlxtend对比: [stackingmodels_vs_mlxtend.py](https://github.com/Albertsr/Machine-Learning/blob/master/5.%20Ensemble%20Learning/Stacking/StackingModels_vs_Mlxtend.py)** 75 | 76 | --- 77 | 78 | ### 3. 聚类 79 | #### 3.1 常见聚类算法原理 80 | - **[1)Spectral Clustering详述](http://note.youdao.com/noteshare?id=319bd869104b6674bef01dd0a3024597&sub=7740B67581D04E69A6DF492CD8E5E685)** 81 | - [瑞利商的性质及其证明](http://note.youdao.com/noteshare?id=9f0062a660ded11f2d9434a8b9c3988a&sub=9884B7629E2E417F82666903DA60A873) 82 | - [拉普拉斯矩阵的最小特征值](http://blog.shriphani.com/2015/04/06/the-smallest-eigenvalues-of-a-graph-laplacian/) 83 | 84 | - **[2)密度聚类DBSCAN](http://note.youdao.com/noteshare?id=2f9664802a90dfd9ecb2d421014a9696&sub=0FFCFBB5E9C14B1FA1DCE510700FB23A)** 85 | 86 | - **[3)K均值聚类KMeans](http://note.youdao.com/noteshare?id=393875faf212f47a718fb5bbfce657ce&sub=03362B91CB454DFD985E4EDBA9A06596)** 87 | 88 | - **[4)层次聚类BIRCH](http://note.youdao.com/noteshare?id=a93a6fc70108222262cc93ee3faef0a0&sub=A8848EAC58F04E08AE11D2BC424273B4)** 89 | 90 | #### 3.2 最佳聚类参数 91 | - **基本思想:** 运用GridSearch的思路在参数的笛卡尔积中寻找最佳聚类参数 92 | - **代码实现:[cluster_centers.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/SemiSupervised-ADOA/cluster_centers.py)** 93 | 94 | --- 95 | 96 | ## 第二部分:机器学习重要模块 97 | 98 | ### 1. 
数据探索 99 | - **1.1 综述:[数据探索分析](https://github.com/Albertsr/Machine-Learning/tree/master/1.%20Data%20Exploration)** 100 | 101 | - **1.2 对多个数据进行探索性分析** 102 | - **[1)对Titanic数据集进行探索](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/1.%20Data%20Exploration/%E4%B8%93%E9%A2%981%EF%BC%9A%E5%88%86%E7%B1%BB%E9%97%AE%E9%A2%98%E7%9A%84%E6%95%B0%E6%8D%AE%E6%8E%A2%E7%B4%A2%28%E4%BB%A5Titanic%E6%95%B0%E6%8D%AE%E9%9B%86%E4%B8%BA%E4%BE%8B%29.ipynb)** 103 | 104 | - **[2)回归问题中的相关系数矩阵与热力图(以Boston数据集为例)](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/1.%20Data%20Exploration/%E4%B8%93%E9%A2%982%EF%BC%9A%E5%9B%9E%E5%BD%92%E9%97%AE%E9%A2%98%E4%B8%AD%E7%9A%84%E7%9B%B8%E5%85%B3%E7%B3%BB%E6%95%B0%E7%9F%A9%E9%98%B5%E4%B8%8E%E7%83%AD%E5%8A%9B%E5%9B%BE%28%E4%BB%A5Boston%E6%95%B0%E6%8D%AE%E9%9B%86%E4%B8%BA%E4%BE%8B%29.ipynb)** 105 | 106 | - **[3)对iris数据集进行数据探索](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/1.%20Data%20Exploration/%E4%B8%93%E9%A2%983%EF%BC%9A%E5%AF%B9iris%E6%95%B0%E6%8D%AE%E9%9B%86%E8%BF%9B%E8%A1%8C%E6%95%B0%E6%8D%AE%E6%8E%A2%E7%B4%A2.ipynb)** 107 | 108 | --- 109 | 110 | ### 2. 数据预处理 111 | - **[2.1 数据预处理综述](https://github.com/Albertsr/Machine-Learning/tree/master/2.%20Data%20Preprocessing)** 112 | 113 | - **[2.2 数据的标准化、归一化与正则化](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/2.%20Data%20Preprocessing/%E4%B8%93%E9%A2%981%EF%BC%9A%E6%95%B0%E6%8D%AE%E7%9A%84%E6%A0%87%E5%87%86%E5%8C%96%E3%80%81%E5%BD%92%E4%B8%80%E5%8C%96%E4%B8%8E%E6%AD%A3%E5%88%99%E5%8C%96.ipynb)** 114 | 115 | - **[2.3 One-Hot 编码](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/2.%20Data%20Preprocessing/%E4%B8%93%E9%A2%982%EF%BC%9AOne-Hot%E7%BC%96%E7%A0%81.ipynb)** 116 | 117 | - **[2.4 特征共线性问题](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/2.%20Data%20Preprocessing/%E4%B8%93%E9%A2%984%EF%BC%9A%E5%85%B1%E7%BA%BF%E6%80%A7%E9%97%AE%E9%A2%98.ipynb)** 118 | 119 | - **[2.5 非正态分布数据的处理](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/2.%20Data%20Preprocessing/%E4%B8%93%E9%A2%985%EF%BC%9A%E9%9D%9E%E6%AD%A3%E6%80%81%E5%88%86%E5%B8%83%E6%95%B0%E6%8D%AE%E7%9A%84%E5%A4%84%E7%90%86.ipynb)** 120 | 121 | --- 122 | 123 | ### 3. 
特征工程 124 | #### 3.1 特征选择 125 | - **[1)三大特征选择理论综述](https://github.com/Albertsr/Machine-Learning/blob/master/3.%20Feature%20Engineering/3.1%20Feature%20Selection/ReadMe.md)** 126 | 127 | - **[2)Filter:filter.py](https://github.com/Albertsr/Machine-Learning/blob/master/3.%20Feature%20Engineering/3.1%20Feature%20Selection/filter.py)** 128 | 129 | - **[3)Wrapper:wrapper.py](https://github.com/Albertsr/Machine-Learning/blob/master/3.%20Feature%20Engineering/3.1%20Feature%20Selection/wrapper.py)** 130 | 131 | - **[4)Embedded:embedded.py](https://github.com/Albertsr/Machine-Learning/blob/master/3.%20Feature%20Engineering/3.1%20Feature%20Selection/embedded.py)** 132 | 133 | #### 3.2 特征抽取 134 | - **[1)PCA理论详解](http://note.youdao.com/noteshare?id=596c5a7394109f8da87be7ce74ee5e56&sub=AAB5BEA8761C4C40B0B60E697ED749E9)** 135 | - **奇异值分解(SVD)实现PCA:[pca_svd.py](https://github.com/Albertsr/Machine-Learning/blob/master/3.%20Feature%20Engineering/3.2%20Feature%20Extraction/pca_svd.py)** 136 | - **特征值分解(EVD)实现PCA:[pca_evd.py](https://github.com/Albertsr/Machine-Learning/blob/master/3.%20Feature%20Engineering/3.2%20Feature%20Extraction/pca_evd.py)** 137 | 138 | - **[2)KernelPCA理论详解](http://note.youdao.com/noteshare?id=6841be74d0fcf6f6a121869d6956aad0&sub=4107BFC5B47A49DD86524504B46EA639)** 139 | - **KernelPCA重构矩阵:[KernelPCA重构矩阵理论分析](https://github.com/Albertsr/Anomaly-Detection/tree/master/UnSupervised-Based%20on%20PCA#chapter-1基于样本的重构误差)** 140 | - **KernelPCA异常检测:[Recon_Error_KPCA.py](https://github.com/Albertsr/Anomaly-Detection/blob/master/UnSupervised-Based%20on%20PCA/Recon_Error_KPCA.py)** 141 | 142 | - **[3)SVD理论详解](http://note.youdao.com/noteshare?id=5ebc61d03c25c9164bc461f8fa66827d&sub=56B3F62C7C1445E6B715777AA5F15BDC)** 143 | - **验证:[sklearn采用SVD实现PCA.ipynb](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/3.%20Feature%20Engineering/3.2%20Feature%20Extraction/%E9%AA%8C%E8%AF%81%EF%BC%9Asklearn%E9%87%87%E7%94%A8SVD%E5%AE%9E%E7%8E%B0PCA.ipynb)** 144 | - **运用TruncatedSVD进行图像处理:[truncated_svd_cat](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/3.%20Feature%20Engineering/3.2%20Feature%20Extraction/%E8%BF%90%E7%94%A8TruncatedSVD%E8%BF%9B%E8%A1%8C%E5%9B%BE%E5%83%8F%E5%A4%84%E7%90%86.ipynb)** 145 | 146 | #### 3.3 特征构建 147 | - **1)离散型特征** 148 | - **生成哑变量:[OneHotEncoder + pandas.get_dummies](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/3.%20Feature%20Engineering/3.3%20Feature%20Construction/%E7%94%9F%E6%88%90%E5%93%91%E5%8F%98%E9%87%8F.ipynb)** 149 | 150 | - **高基数类别特征的处理** 151 | - **[代码:high_categorical.py](https://github.com/Albertsr/Machine-Learning/blob/master/3.%20Feature%20Engineering/3.3%20Feature%20Construction/high_categorical.py)** 152 | - **[实例:处理150个不重复取值的邮政编码](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/3.%20Feature%20Engineering/3.3%20Feature%20Construction/%E9%AB%98%E5%9F%BA%E6%95%B0%E7%B1%BB%E5%88%AB%E7%89%B9%E5%BE%81%E7%9A%84%E5%A4%84%E7%90%86.ipynb)** 153 | 154 | - **2)时间型特征的处理** 155 | - **[代码:create_time_feature.py](https://github.com/Albertsr/Machine-Learning/blob/master/3.%20Feature%20Engineering/3.3%20Feature%20Construction/create_time_feature.py)** 156 | - **[实例:根据时间戳生成时间型索引&透视分析.ipynb](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/3.%20Feature%20Engineering/3.3%20Feature%20Construction/%E6%A0%B9%E6%8D%AE%E6%97%B6%E9%97%B4%E6%88%B3%E7%94%9F%E6%88%90%E6%97%B6%E9%97%B4%E5%9E%8B%E7%B4%A2%E5%BC%95%26%E9%80%8F%E8%A7%86%E5%88%86%E6%9E%90.ipynb)** 157 | 158 | 
- **3)连续型特征的处理** 159 | - **[连续型特征的分箱处理.ipynb](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/3.%20Feature%20Engineering/3.3%20Feature%20Construction/%E8%BF%9E%E7%BB%AD%E5%9E%8B%E7%89%B9%E5%BE%81%E7%9A%84%E5%88%86%E7%AE%B1%E5%A4%84%E7%90%86.ipynb)** 160 | - **[长尾数据的处理.ipynb](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/3.%20Feature%20Engineering/3.3%20Feature%20Construction/%E9%95%BF%E5%B0%BE%E6%95%B0%E6%8D%AE%E7%9A%84%E5%A4%84%E7%90%86.ipynb)** 161 | 162 | --- 163 | 164 | ### 4. 模型评估 165 | 166 | #### 4.1 交叉验证与常见评估指标 167 | 168 | - **[1)交叉验证](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/7.%20Model%20Evaluation/%E4%BA%A4%E5%8F%89%E9%AA%8C%E8%AF%81.ipynb)** 169 | - **[2)分类与回归模型评估](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/7.%20Model%20Evaluation/%E5%88%86%E7%B1%BB%E4%B8%8E%E5%9B%9E%E5%BD%92%E6%A8%A1%E5%9E%8B%E8%AF%84%E4%BC%B0.ipynb)** 170 | 171 | #### 4.2 加权覆盖率 172 | - **出处:[蚂蚁金服-风险大脑-支付风险识别大赛(第一赛季)](https://dc.cloud.alipay.com/index#/topic/data?id=4)** 173 | 174 | - **代码实现:[coverage.py](https://github.com/Albertsr/Class-Imbalance/blob/master/5.%20Appropriate%20Metrics/coverage.py)** 175 | 176 | - **定义:** 177 | 178 | ![加权覆盖率](https://github.com/Albertsr/Class-Imbalance/blob/master/5.%20Appropriate%20Metrics/Pics/weighted_coverage.jpg) 179 | 180 | 181 | #### 4.3 G-Mean 182 | - **出处:** [Addressing the Curse of Imbalanced Training Sets: One-Sided Selection](https://cn.bing.com/academic/profile?id=32c7b83b5988bbcad21fdeb24360d5c4&encoded=0&v=paper_preview&mkt=zh-cn) [Miroslav Kubat, Stan Matwin; 1997] 183 | 184 | - **代码实现:** [gmean.py](https://github.com/Albertsr/Class-Imbalance/blob/master/5.%20Appropriate%20Metrics/gmean.py) 185 | 186 | - **定义:** 187 | 188 | ![G-Mean](https://github.com/Albertsr/Class-Imbalance/blob/master/5.%20Appropriate%20Metrics/Pics/gmean.jpg) 189 | 190 | #### 4.4 KS值 191 | - **1)KS值定义:** max(TPR-FPR) 192 | 193 | - **2)KS值代码实现:[ks_value.py](https://github.com/Albertsr/Machine-Learning/blob/master/7.%20Model%20Evaluation/ks_value.py)** 194 | 195 | - **3)KS绘制曲线代码:[ks_curve.py](https://github.com/Albertsr/Machine-Learning/blob/master/7.%20Model%20Evaluation/ks_curve.py)** 196 | 197 | ![ks curve](https://github.com/Albertsr/Machine-Learning/blob/master/7.%20Model%20Evaluation/Pics/ks%20curve.jpg) 198 | 199 | #### 4.5 ROC曲线与PRC曲线的绘制 200 | 201 | - **1)ROC绘制代码:[roc.py](https://github.com/Albertsr/Machine-Learning/blob/master/7.%20Model%20Evaluation/roc.py)** 202 | 203 | ![roc](https://github.com/Albertsr/Machine-Learning/blob/master/7.%20Model%20Evaluation/Pics/roc.jpg) 204 | 205 | - **2)PRC绘制代码:[prc.py](https://github.com/Albertsr/Machine-Learning/blob/master/7.%20Model%20Evaluation/prc.py)** 206 | 207 | ![prc](https://github.com/Albertsr/Machine-Learning/blob/master/7.%20Model%20Evaluation/Pics/prc.jpg) 208 | 209 | - **3)综合:[绘制KS、ROC、PRC曲线.ipynb](https://nbviewer.jupyter.org/github/Albertsr/Machine-Learning/blob/master/7.%20Model%20Evaluation/%E7%BB%98%E5%88%B6KS%E3%80%81ROC%E3%80%81PRC%E6%9B%B2%E7%BA%BF.ipynb)** 210 | --- 211 | 212 | ### 5. 模型持久化 213 | - **运用joblib序列化各反序列化机器学习模型:** [joblib.py](https://github.com/Albertsr/Machine-Learning/blob/master/8.%20Model%20Persistence/joblib.py) 214 | 215 | - **运用pickle序列化/反序列化机器学习模型:** [pickle.py](https://github.com/Albertsr/Machine-Learning/blob/master/8.%20Model%20Persistence/pickle.py) 216 | 217 | --- 218 | 219 | ## 第三部分:基础知识 220 | 221 | ### 1. 
最优化 222 | - **[1.1 泰勒展开式与梯度下降法](http://note.youdao.com/noteshare?id=04b615c3ed519b08b2fadc1b31584b51&sub=BB9C8E31B8E041CAB48EBFFB86F81237)** 223 | 224 | - **[1.2 牛顿法与拟牛顿法](http://note.youdao.com/noteshare?id=a833fad696ba110d0bfb3472ef9e3fb9&sub=E8D135E215314FA0B9C21103EC1AA2DB)** 225 | 226 | - **[1.3 梯度下降法 vs 牛顿法](http://note.youdao.com/noteshare?id=879c45854ec2dc9bb1de214181ce4a67&sub=E09CCD8D6C154B5390F052B3CD159EC7)** 227 | 228 | ### 2. 损失函数 229 | - **[2.1 六大损失函数](http://note.youdao.com/noteshare?id=b269151a475b95393019b80584e4a521&sub=4434BC48320E4A0AB2AAF95B6718B318)** 230 | 231 | - **[2.2 熵、KL散度、交叉熵](http://note.youdao.com/noteshare?id=7824f4e49e0a73f0734864cc10a9b25f&sub=B0D769D4E7DE4A88BABD0C0372E6B26A)** 232 | 233 | - **[2.3 Huber Loss & Fair Loss](http://note.youdao.com/noteshare?id=e724e03dd48476579e6718feedb42bb7&sub=1293076C8DEF41EE934C9DD4A15F3BBA)** 234 | 235 | - **[2.4 经验风险、期望风险、结构风险](http://note.youdao.com/noteshare?id=17dda31b4a34b821ae4b3014a2af13cc&sub=73C8E3F08CB4414E8D393E4FE9461ED0)** 236 | 237 | ### 3. 其他 238 | - **[3.1 Bias-Variance Tradeoff](http://note.youdao.com/noteshare?id=49c9bbe574f4d3c982c82cdde9bb0805&sub=B557C999E1FE42E6BD5BC3C8536A52C7)** 239 | 240 | 241 | - **[3.2 生成模型与判别模型](http://note.youdao.com/noteshare?id=ed50912f4b1a95100513667015f3fa01&sub=DE4495BD714B4BFD8FC33585D3204A2C)** 242 | 243 | --- 244 | --------------------------------------------------------------------------------
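
- **补充示意(极简演示片段,假设已安装 numpy,并非上述理论笔记的原始实现):** 针对 2.2 节「熵、KL散度、交叉熵」,三者满足「交叉熵 = 熵 + KL散度」,即 H(p, q) = H(p) + KL(p‖q),可用下面的片段做数值验证:

```python
import numpy as np

# 示例离散分布:p 为真实分布,q 为预测分布(取值仅作演示,可替换为任意合法概率分布)
p = np.array([0.2, 0.5, 0.3])
q = np.array([0.1, 0.6, 0.3])

entropy = -np.sum(p * np.log(p))           # 熵 H(p)
cross_entropy = -np.sum(p * np.log(q))     # 交叉熵 H(p, q)
kl_divergence = np.sum(p * np.log(p / q))  # KL散度 KL(p || q)

# 数值验证:交叉熵 = 熵 + KL散度
assert np.isclose(cross_entropy, entropy + kl_divergence)
print('H(p,q) = %.6f, H(p) + KL(p||q) = %.6f' % (cross_entropy, entropy + kl_divergence))
```

该恒等式也说明:由于 H(p) 与模型无关,最小化交叉熵与最小化 KL 散度是等价的。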