├── README.md
└── general_baseline.py

/README.md:
--------------------------------------------------------------------------------
# Lessons from Data Science Competitions
How to do data analysis?
How to do data cleaning?
How to do feature engineering? (a systematic approach to feature engineering for relational data)
How to do feature selection?
How to choose a suitable machine learning model?
How to tune hyperparameters?
How to do model ensembling?
How to climb the leaderboard?

The text version is now open source; you can read it on Zhihu: https://zhuanlan.zhihu.com/p/149769029

A text-only PDF version is finished and has been uploaded, together with the slide deck, to my Zhishi Xingqiu (Knowledge Planet). If you would like to discuss competition experience, improve your score quickly, or compete for prize money, you are welcome to talk with me at David's Cabin: https://t.zsxq.com/IMfe2vB

## Contact

E-mail: davidkangyz@163.com

Zhishi Xingqiu: https://t.zsxq.com/IMfe2vB

## Appendix

Summary of my competition results

| Year | Platform | Organizer | Competition | Result | Rank |
| ---- | ---- | ---- | ---- | ---- | ---- |
| 2017 | Kesci | Ping An | Qianhai Credit "Haoxin Cup" Transfer Learning Algorithm Competition | 6th place | 6/600 |
| 2017 | Tianchi | Alibaba Cloud | 2nd Cloud Security Algorithm Challenge | 16th place | 16/959 |
| 2018 | Agricultural Bank of China | ABC Software Development Center | 1st "Athena Cup" Analytics and Mining Competition | 2nd place | 2/581 |
| 2018 | Mashang Finance AI platform | Mashang Consumer Finance | AI Global Challenger Competition — Default Risk Identification | 4th place | 4/107 |
| 2018 | Tianchi | Alibaba Cloud | Qianlima Big Data Competition — Risk Identification Algorithm Track | 5th place | 5/245 |
| 2018 | Ant Financial FinTech platform | Ant Financial | Ant Developer Competition — Payment Risk Identification | 9th place | 9/2986 |
| 2018 | Tianchi | Alibaba | IJCAI 2018 — Alimama International Advertising Algorithm Competition | Top 2% | |
| 2018 | DataFountain | Ping An | Property Insurance Data Modeling Competition — Predicting Driving Risk from Driving Behavior | Top 2% | |
| 2018 | Kaggle | Two Sigma | Two Sigma Financial Modeling Challenge | Top 3% | |
| 2019 | Tianchi | Jinnan District Government, Tianjin | Jinnan Digital Manufacturing Algorithm Challenge | 2nd place | 2/2682 |
| 2019 | Agricultural Bank of China | ABC Software Development Center | 2nd "Athena Cup" Analytics and Mining Competition | 4th place | 4/361 |

Competitions not placed due to time conflicts

| Year | Platform | Organizer | Competition | Result |
| ---- | ---- | ---- | ---- | ---- |
| 2016 | Didi AI platform | Didi Chuxing | 1st Global DI-Tech Algorithm Competition | |
| 2016 | Rong360 (self-hosted) | Rong360 | "Tianji" Financial Risk Control Big Data Competition | |
| 2017 | DataCastle | Rong360 | Smart China Cup — User Loan Risk Prediction | Top 10% |
| 2017 | Kaggle | Sberbank | Sberbank Russian Housing Market | |
| 2017 | Tianchi | AutoNavi | KDD CUP Highway Tollgates Traffic Flow Prediction | |
| 2018 | JD Zhihui platform | JD.com | JData Global Operations Research Optimization Competition | |

/general_baseline.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
# date: 2018
# author: Kang Yan Zhe

import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
import joblib
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve, auc, f1_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
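# Expected project layout, inferred from the relative paths used below (an
# assumption of this cleanup, not documented by the original author):
#   ../data/train.csv   training set; 'ID' index column, label in column 'Label'
#   ../data/pred.csv    prediction set; 'ID' index column
#   ../eda/             feature-importance and feature-selection reports
#   ../config/          persisted chromosome bitmask (chromosome.pkl)
#   ../result/          ROC plots and prediction CSVs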

def gbdt_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # SelectFromModel
    clf = GradientBoostingClassifier(n_estimators=50, random_state=100)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # count the features whose importance is non-zero
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of non-zero features: ' + str(len(feature_score_dict) - m))

    # feature importances, sorted in descending order
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1])
    print('\n')

    f = open('../eda/gbdt_feature_importance.txt', 'w')
    f.write('Rank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()

    # print the selected features
    how_long = matrix_x.shape[1]
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    print('features_chosen:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    print('\n')

    f = open('../eda/gbdt_feature_chose.txt', 'w')
    f.write('Chosen Feature Names:\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()

    # find the non-selected features
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])

    return matrix_x, feature_not_used_name, len(feature_used_name)


def lgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # SelectFromModel
    clf = LGBMClassifier(n_estimators=50)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # count the features whose importance is non-zero
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of non-zero features: ' + str(len(feature_score_dict) - m))

    # print the feature importances
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1])
    print('\n')

    f = open('../eda/lgb_feature_importance.txt', 'w')
    f.write('Rank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()

    # print which fields were actually used
    how_long = matrix_x.shape[1]  # matrix_x is the input matrix after feature selection
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    print('features_chosen:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    print('\n')

    f = open('../eda/lgb_feature_chose.txt', 'w')
    f.write('Chosen Feature Names:\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()

    # find the names of the unused fields
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])

    # build a chromosome string (e.g. '01011100')
    chromosome_temp = ''
    feature_name_ivar = fe_name[:-1]
    for ii in range(len(feature_name_ivar)):
        if feature_name_ivar[ii] in feature_used_name:
            chromosome_temp += '1'
        else:
            chromosome_temp += '0'
    print('Chromosome:')
    print(chromosome_temp)
    joblib.dump(chromosome_temp, '../config/chromosome.pkl')
    print('\n')
    return matrix_x, feature_not_used_name[:], len(feature_used_name)
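
# The chromosome string persisted above encodes the selection result as a
# bitmask over the candidate features, which could seed a genetic-algorithm
# style feature search. A minimal decoding sketch (illustrative only;
# decode_chromosome is not part of the original pipeline):
def decode_chromosome(chromosome, feature_names):
    """Map a bit string such as '01011100' back to the selected feature names.

    feature_names should be the same list the chromosome was built from
    (fe_name[:-1] in lgb_feature_selection above).
    """
    return [name for name, bit in zip(feature_names, chromosome) if bit == '1']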

def xgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # SelectFromModel
    clf = XGBClassifier(n_estimators=50)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # count the features whose importance is non-zero
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of non-zero features: ' + str(len(feature_score_dict) - m))

    # print the feature importances
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('xgb_feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1])
    print('\n')

    f = open('../eda/xgb_feature_importance.txt', 'w')
    f.write('Rank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()

    # print which fields were actually used
    how_long = matrix_x.shape[1]  # matrix_x is the input matrix after feature selection
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    print('features_chosen:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    print('\n')

    f = open('../eda/xgb_feature_chose.txt', 'w')
    f.write('Chosen Feature Names:\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()

    # find the names of the unused fields
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])

    # build a chromosome string (e.g. '01011100')
    chromosome_temp = ''
    feature_name_ivar = fe_name[:-1]
    for ii in range(len(feature_name_ivar)):
        if feature_name_ivar[ii] in feature_used_name:
            chromosome_temp += '1'
        else:
            chromosome_temp += '0'
    print('Chromosome:')
    print(chromosome_temp)
    joblib.dump(chromosome_temp, '../config/chromosome.pkl')
    print('\n')
    return matrix_x, feature_not_used_name[:], len(feature_used_name)
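
# The three *_feature_selection functions above are identical except for the
# fitted estimator. A minimal consolidation sketch (illustrative only;
# model_feature_selection is not in the original code): any estimator exposing
# feature_importances_ works with SelectFromModel the same way.
def model_feature_selection(clf, fe_name, matrix_x_temp, label_y, th):
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)
    mask = sfm.get_support()  # boolean mask of the retained columns
    feature_used_name = [n for n, keep in zip(fe_name, mask) if keep]
    feature_not_used_name = [n for n in fe_name if n not in feature_used_name]
    return matrix_x, feature_not_used_name, len(feature_used_name)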

def data_test_feature_drop(data_test, feature_name_drop):
    # print(feature_name_drop)
    for col in feature_name_drop:
        data_test.drop(col, axis=1, inplace=True)
    print("data_test_shape:")
    print(data_test.shape)
    return data_test.values


def write_predict_results_to_csv(csv_name, uid, prob_list):
    csv_file = open(csv_name, 'w', newline='')
    writer = csv.writer(csv_file)
    combined_list = [['ID', 'pred']]
    if len(uid) == len(prob_list):
        for i in range(len(uid)):
            combined_list.append([str(uid[i]), str(prob_list[i])])
        writer.writerows(combined_list)
        csv_file.close()
    else:
        print('the numbers of IDs and predictions do not match')


def xgb_lgb_cv_modeling():
    """XGBoost-based feature selection, then a 5-fold cross-validated LightGBM
    baseline with ROC/F1 reporting and prediction on the online test set.
    """

    '''Data input'''
    data_train = pd.read_csv('../data/train.csv', index_col='ID')
    data_predict = pd.read_csv('../data/pred.csv', index_col='ID')

    '''trainset feature engineering -- write this part for your specific dataset'''
    data_train_without_label = data_train.drop('Label', axis=1)

    '''Sample'''
    # s = 0
    # np.random.seed(s)
    # sampler = np.random.permutation(len(data_train_without_label.values))
    # data_train_randomized = data_train_without_label.take(sampler)

    feature_name = list(data_train_without_label.columns.values)
    data_predict_user_id = list(data_predict.index.values)

    '''fillna'''
    frames = [data_train_without_label, data_predict]
    data_all = pd.concat(frames)
    data_train_filled = data_train_without_label.fillna(value=data_all.median())

    '''construct train and test dataset'''
    x_temp = data_train_filled.iloc[:, :].values  # independent variables
    y = data_train.iloc[:, -1].values  # dependent variable (label)

    '''Feature selection'''
    X, dropped_feature_name, len_feature_choose = xgb_feature_selection(feature_name, x_temp, y, '0.1*mean')
    # threshold '0.1*mean' selects 10 features
    # threshold '0.00001*mean' selects 14 features

    '''online test dataset -- B_test'''
    # del data_predict['V17']
    # data_predict['UserInfo_242x40'] = data_predict['UserInfo_242'] * data_predict['UserInfo_40']

    data_predict_filled = data_predict.fillna(value=data_all.median())
    data_predict_filled_after_feature_selection = data_test_feature_drop(data_predict_filled, dropped_feature_name)

    '''Split train/test data sets'''
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)  # stratified sampling; 'cv' stands for cross-validation

    '''Choose a classification model'''
    parameter_n_estimators = 100
    classifier = LGBMClassifier(n_estimators=parameter_n_estimators, learning_rate=0.1)

    '''hyperparameter optimization'''
    # param = {
    #     'max_depth': 6,
    #     'num_leaves': 64,
    #     'learning_rate': 0.03,
    #     'scale_pos_weight': 1,
    #     'num_threads': 40,
    #     'objective': 'binary',
    #     'bagging_fraction': 0.7,
    #     'bagging_freq': 1,
    #     'min_sum_hessian_in_leaf': 100
    # }
    #
    # param['is_unbalance'] = 'true'
    # param['metric'] = 'auc'

    # (1) num_leaves
    #
    # LightGBM grows trees leaf-wise, so tree complexity is controlled with
    # num_leaves rather than max_depth.
    #
    # Rough conversion: num_leaves = 2 ** max_depth
    #
    # (2) imbalanced datasets: set param['is_unbalance'] = 'true'
    #
    # (3) bagging parameters: bagging_fraction + bagging_freq (must be set together), feature_fraction
    #
    # (4) min_data_in_leaf, min_sum_hessian_in_leaf
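    # A hedged sketch of how GridSearchCV (imported above but unused in the
    # original script) could drive this section; the grid values are
    # illustrative, not tuned:
    # param_grid = {
    #     'num_leaves': [31, 63, 127],
    #     'learning_rate': [0.03, 0.1],
    #     'n_estimators': [100, 200],
    # }
    # gs = GridSearchCV(LGBMClassifier(), param_grid, scoring='roc_auc', cv=cv)
    # gs.fit(X, y)
    # classifier = gs.best_estimator_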

    '''Model fit, predict and ROC'''
    colors = cycle(['cyan', 'indigo', 'seagreen', 'orange', 'blue'])
    lw = 2
    mean_f1 = 0.0
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 500)
    i_of_roc = 0
    a = 0

    th = 0.5

    for (train_indice, test_indice), color in zip(cv.split(X, y), colors):
        a_model = classifier.fit(X[train_indice], y[train_indice])

        # y_predict_label = a_model.predict(X[test_indice])

        probas_ = a_model.predict_proba(X[test_indice])

        fpr, tpr, thresholds = roc_curve(y[test_indice], probas_[:, 1])

        a += 1

        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0

        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw, color=color, label='ROC fold %d (area = %0.4f)' % (i_of_roc, roc_auc))
        i_of_roc += 1

        # binarize the fold's probabilities at threshold th to compute F1
        label_transformed = probas_[:, 1]
        for i in range(len(label_transformed)):
            if label_transformed[i] > th:
                label_transformed[i] = 1
            else:
                label_transformed[i] = 0
        lt = label_transformed.astype('int32')
        f1 = f1_score(y[test_indice], lt)
        mean_f1 += f1

    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')

    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('mean_auc=' + str(mean_auc))
    print('mean_f1=' + str(mean_f1 / 5))
    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--', label='Mean ROC (area = %0.4f)' % mean_auc, lw=lw)
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate  mean_f1: ' + str(mean_f1 / 5))
    plt.ylabel('True Positive Rate')

    plt.title('ROC_lgb_' + str(len_feature_choose) + '_features_f1_' + str(mean_f1 / 5))
    plt.legend(loc="lower right")
    plt.savefig('../result/pred_ROC_XL' + '_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) +
                '_proba_to_label_using_th_' + str(th) + '.png')
    # plt.show()

    # refit on the full training set before predicting
    a_model = classifier.fit(X, y)

    # label_predict = a_model.predict(data_predict_filled_after_feature_selection)  # predict on B_test
    proba_predict = a_model.predict_proba(data_predict_filled_after_feature_selection)

    '''proba result'''
    result_file_name = '../result/pred_result_XL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + '_proba.csv'
    write_predict_results_to_csv(result_file_name, data_predict_user_id, proba_predict[:, 1].tolist())

    # '''write the results to submit'''
    # result_file_name = '../result/pred_result_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + '.csv'
    # write_predict_results_to_csv(result_file_name, data_predict_user_id, label_predict.tolist())

    '''results file'''
    label_transformed = proba_predict[:, 1]
    for i in range(len(label_transformed)):
        if label_transformed[i] > th:
            label_transformed[i] = 1
        else:
            label_transformed[i] = 0
    lt = label_transformed.astype('int32')
    result_file_name = '../result/pred_result_XL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + \
        '_proba_to_label_using_th_' + str(th) + '.csv'
    write_predict_results_to_csv(result_file_name, data_predict_user_id, lt.tolist())
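
# The original file defines xgb_lgb_cv_modeling() but never calls it; a
# minimal entry point so the script runs directly (assumes the ../data files
# described at the top of the file exist):
if __name__ == '__main__':
    xgb_lgb_cv_modeling()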