├── README.md
└── general_baseline.py

/README.md:
--------------------------------------------------------------------------------
# Lessons from Data Science Competitions
How to do data analysis?
How to do data cleaning?
How to do feature engineering? (a systematic approach to feature engineering for relational data)
How to do feature selection?
How to choose a suitable machine learning model?
How to tune hyperparameters?
How to do model ensembling?
How to climb the leaderboard?

The text version is now open source; you can read it on Zhihu: https://zhuanlan.zhihu.com/p/149769029

A text-only PDF version is finished and has been uploaded, together with the slide deck, to my Zhishi Xingqiu (Knowledge Planet). If you would like to discuss competition experience, improve your score quickly, or compete for prize money, you are welcome to talk with me at David's Cabin: https://t.zsxq.com/IMfe2vB

## Contact

E-mail: davidkangyz@163.com

Zhishi Xingqiu: https://t.zsxq.com/IMfe2vB

## Appendix

Summary of my competition results

| Year | Platform | Organizer | Competition | Result | Rank |
| ---- | ---- | ---- | ---- | ---- | ---- |
| 2017 | Kesci | Ping An | Qianhai Credit "Haoxin Cup" Transfer Learning Algorithm Competition | 6th place | 6/600 |
| 2017 | Tianchi | Alibaba Cloud | 2nd Cloud Security Algorithm Challenge | 16th place | 16/959 |
| 2018 | Agricultural Bank of China | ABC Software Development Center | 1st "Athena Cup" Analytics and Mining Competition | 2nd place | 2/581 |
| 2018 | Mashang Finance AI platform | Mashang Consumer Finance | AI Global Challenger Competition — Default Risk Identification | 4th place | 4/107 |
| 2018 | Tianchi | Alibaba Cloud | Qianlima Big Data Competition — Risk Identification Algorithm Track | 5th place | 5/245 |
| 2018 | Ant Financial FinTech platform | Ant Financial | Ant Developer Competition — Payment Risk Identification | 9th place | 9/2986 |
| 2018 | Tianchi | Alibaba | IJCAI 2018 — Alimama International Advertising Algorithm Competition | Top 2% | |
| 2018 | DataFountain | Ping An | Property Insurance Data Modeling Competition — Predicting Driving Risk from Driving Behavior | Top 2% | |
| 2018 | Kaggle | Two Sigma | Two Sigma Financial Modeling Challenge | Top 3% | |
| 2019 | Tianchi | Jinnan District Government, Tianjin | Jinnan Digital Manufacturing Algorithm Challenge | 2nd place | 2/2682 |
| 2019 | Agricultural Bank of China | ABC Software Development Center | 2nd "Athena Cup" Analytics and Mining Competition | 4th place | 4/361 |

Competitions not placed due to time conflicts

| Year | Platform | Organizer | Competition | Result |
| ---- | ---- | ---- | ---- | ---- |
| 2016 | Didi AI platform | Didi Chuxing | 1st Global DI-Tech Algorithm Competition | |
| 2016 | Rong360 (self-hosted) | Rong360 | "Tianji" Financial Risk Control Big Data Competition | |
| 2017 | DataCastle | Rong360 | Smart China Cup — User Loan Risk Prediction | Top 10% |
| 2017 | Kaggle | Sberbank | Sberbank Russian Housing Market | |
| 2017 | Tianchi | AutoNavi | KDD CUP Highway Tollgates Traffic Flow Prediction | |
| 2018 | JD Zhihui platform | JD.com | JData Global Operations Research Optimization Competition | |

/general_baseline.py:
--------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
# date: 2018
# author: Kang Yan Zhe

import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from itertools import cycle
import joblib
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve, auc, f1_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
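# Expected project layout, inferred from the relative paths used below (an
# assumption of this cleanup, not documented by the original author):
#   ../data/train.csv   training set; 'ID' index column, label in column 'Label'
#   ../data/pred.csv    prediction set; 'ID' index column
#   ../eda/             feature-importance and feature-selection reports
#   ../config/          persisted chromosome bitmask (chromosome.pkl)
#   ../result/          ROC plots and prediction CSVs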

def gbdt_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # SelectFromModel
    clf = GradientBoostingClassifier(n_estimators=50, random_state=100)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # count the features whose importance is non-zero
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of non-zero features: ' + str(len(feature_score_dict) - m))

    # feature importances, sorted in descending order
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1])
    print('\n')

    f = open('../eda/gbdt_feature_importance.txt', 'w')
    f.write('Rank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()

    # print the selected features
    how_long = matrix_x.shape[1]
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    print('features_chosen:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    print('\n')

    f = open('../eda/gbdt_feature_chose.txt', 'w')
    f.write('Chosen Feature Names:\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()

    # find the non-selected features
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])

    return matrix_x, feature_not_used_name, len(feature_used_name)


def lgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # SelectFromModel
    clf = LGBMClassifier(n_estimators=50)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # count the features whose importance is non-zero
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of non-zero features: ' + str(len(feature_score_dict) - m))

    # print the feature importances
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1])
    print('\n')

    f = open('../eda/lgb_feature_importance.txt', 'w')
    f.write('Rank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()

    # print which fields were actually used
    how_long = matrix_x.shape[1]  # matrix_x is the input matrix after feature selection
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    print('features_chosen:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    print('\n')

    f = open('../eda/lgb_feature_chose.txt', 'w')
    f.write('Chosen Feature Names:\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()

    # find the names of the unused fields
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])

    # build a chromosome string (e.g. '01011100')
    chromosome_temp = ''
    feature_name_ivar = fe_name[:-1]
    for ii in range(len(feature_name_ivar)):
        if feature_name_ivar[ii] in feature_used_name:
            chromosome_temp += '1'
        else:
            chromosome_temp += '0'
    print('Chromosome:')
    print(chromosome_temp)
    joblib.dump(chromosome_temp, '../config/chromosome.pkl')
    print('\n')
    return matrix_x, feature_not_used_name[:], len(feature_used_name)
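
# The chromosome string persisted above encodes the selection result as a
# bitmask over the candidate features, which could seed a genetic-algorithm
# style feature search. A minimal decoding sketch (illustrative only;
# decode_chromosome is not part of the original pipeline):
def decode_chromosome(chromosome, feature_names):
    """Map a bit string such as '01011100' back to the selected feature names.

    feature_names should be the same list the chromosome was built from
    (fe_name[:-1] in lgb_feature_selection above).
    """
    return [name for name, bit in zip(feature_names, chromosome) if bit == '1']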

def xgb_feature_selection(fe_name, matrix_x_temp, label_y, th):
    # SelectFromModel
    clf = XGBClassifier(n_estimators=50)
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)

    # count the features whose importance is non-zero
    feature_score_dict = {}
    for fn, s in zip(fe_name, clf.feature_importances_):
        feature_score_dict[fn] = s
    m = 0
    for k in feature_score_dict:
        if feature_score_dict[k] == 0.0:
            m += 1
    print('number of non-zero features: ' + str(len(feature_score_dict) - m))

    # print the feature importances
    feature_score_dict_sorted = sorted(feature_score_dict.items(),
                                       key=lambda d: d[1], reverse=True)
    print('xgb_feature_importance:')
    for ii in range(len(feature_score_dict_sorted)):
        print(feature_score_dict_sorted[ii][0], feature_score_dict_sorted[ii][1])
    print('\n')

    f = open('../eda/xgb_feature_importance.txt', 'w')
    f.write('Rank\tFeature Name\tFeature Importance\n')
    for i in range(len(feature_score_dict_sorted)):
        f.write(str(i) + '\t' + str(feature_score_dict_sorted[i][0]) + '\t' + str(feature_score_dict_sorted[i][1]) + '\n')
    f.close()

    # print which fields were actually used
    how_long = matrix_x.shape[1]  # matrix_x is the input matrix after feature selection
    feature_used_dict_temp = feature_score_dict_sorted[:how_long]
    feature_used_name = []
    for ii in range(len(feature_used_dict_temp)):
        feature_used_name.append(feature_used_dict_temp[ii][0])
    print('features_chosen:')
    for ii in range(len(feature_used_name)):
        print(feature_used_name[ii])
    print('\n')

    f = open('../eda/xgb_feature_chose.txt', 'w')
    f.write('Chosen Feature Names:\n')
    for i in range(len(feature_used_name)):
        f.write(str(feature_used_name[i]) + '\n')
    f.close()

    # find the names of the unused fields
    feature_not_used_name = []
    for i in range(len(fe_name)):
        if fe_name[i] not in feature_used_name:
            feature_not_used_name.append(fe_name[i])

    # build a chromosome string (e.g. '01011100')
    chromosome_temp = ''
    feature_name_ivar = fe_name[:-1]
    for ii in range(len(feature_name_ivar)):
        if feature_name_ivar[ii] in feature_used_name:
            chromosome_temp += '1'
        else:
            chromosome_temp += '0'
    print('Chromosome:')
    print(chromosome_temp)
    joblib.dump(chromosome_temp, '../config/chromosome.pkl')
    print('\n')
    return matrix_x, feature_not_used_name[:], len(feature_used_name)
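
# The three *_feature_selection functions above are identical except for the
# fitted estimator. A minimal consolidation sketch (illustrative only;
# model_feature_selection is not in the original code): any estimator exposing
# feature_importances_ works with SelectFromModel the same way.
def model_feature_selection(clf, fe_name, matrix_x_temp, label_y, th):
    clf.fit(matrix_x_temp, label_y)
    sfm = SelectFromModel(clf, prefit=True, threshold=th)
    matrix_x = sfm.transform(matrix_x_temp)
    mask = sfm.get_support()  # boolean mask of the retained columns
    feature_used_name = [n for n, keep in zip(fe_name, mask) if keep]
    feature_not_used_name = [n for n in fe_name if n not in feature_used_name]
    return matrix_x, feature_not_used_name, len(feature_used_name)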

def data_test_feature_drop(data_test, feature_name_drop):
    # print(feature_name_drop)
    for col in feature_name_drop:
        data_test.drop(col, axis=1, inplace=True)
    print("data_test_shape:")
    print(data_test.shape)
    return data_test.values


def write_predict_results_to_csv(csv_name, uid, prob_list):
    csv_file = open(csv_name, 'w', newline='')
    writer = csv.writer(csv_file)
    combined_list = [['ID', 'pred']]
    if len(uid) == len(prob_list):
        for i in range(len(uid)):
            combined_list.append([str(uid[i]), str(prob_list[i])])
        writer.writerows(combined_list)
        csv_file.close()
    else:
        print('the numbers of IDs and predictions do not match')


def xgb_lgb_cv_modeling():
    """XGBoost-based feature selection, then a 5-fold cross-validated LightGBM
    baseline with ROC/F1 reporting and prediction on the online test set.
    """

    '''Data input'''
    data_train = pd.read_csv('../data/train.csv', index_col='ID')
    data_predict = pd.read_csv('../data/pred.csv', index_col='ID')

    '''trainset feature engineering -- write this part for your specific dataset'''
    data_train_without_label = data_train.drop('Label', axis=1)

    '''Sample'''
    # s = 0
    # np.random.seed(s)
    # sampler = np.random.permutation(len(data_train_without_label.values))
    # data_train_randomized = data_train_without_label.take(sampler)

    feature_name = list(data_train_without_label.columns.values)
    data_predict_user_id = list(data_predict.index.values)

    '''fillna'''
    frames = [data_train_without_label, data_predict]
    data_all = pd.concat(frames)
    data_train_filled = data_train_without_label.fillna(value=data_all.median())

    '''construct train and test dataset'''
    x_temp = data_train_filled.iloc[:, :].values  # independent variables
    y = data_train.iloc[:, -1].values  # dependent variable (label)

    '''Feature selection'''
    X, dropped_feature_name, len_feature_choose = xgb_feature_selection(feature_name, x_temp, y, '0.1*mean')
    # threshold '0.1*mean' selects 10 features
    # threshold '0.00001*mean' selects 14 features

    '''online test dataset -- B_test'''
    # del data_predict['V17']
    # data_predict['UserInfo_242x40'] = data_predict['UserInfo_242'] * data_predict['UserInfo_40']

    data_predict_filled = data_predict.fillna(value=data_all.median())
    data_predict_filled_after_feature_selection = data_test_feature_drop(data_predict_filled, dropped_feature_name)

    '''Split train/test data sets'''
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)  # stratified sampling; 'cv' stands for cross-validation

    '''Choose a classification model'''
    parameter_n_estimators = 100
    classifier = LGBMClassifier(n_estimators=parameter_n_estimators, learning_rate=0.1)

    '''hyperparameter optimization'''
    # param = {
    #     'max_depth': 6,
    #     'num_leaves': 64,
    #     'learning_rate': 0.03,
    #     'scale_pos_weight': 1,
    #     'num_threads': 40,
    #     'objective': 'binary',
    #     'bagging_fraction': 0.7,
    #     'bagging_freq': 1,
    #     'min_sum_hessian_in_leaf': 100
    # }
    #
    # param['is_unbalance'] = 'true'
    # param['metric'] = 'auc'

    # (1) num_leaves
    #
    # LightGBM grows trees leaf-wise, so tree complexity is controlled with
    # num_leaves rather than max_depth.
    #
    # Rough conversion: num_leaves = 2 ** max_depth
    #
    # (2) imbalanced datasets: set param['is_unbalance'] = 'true'
    #
    # (3) bagging parameters: bagging_fraction + bagging_freq (must be set together), feature_fraction
    #
    # (4) min_data_in_leaf, min_sum_hessian_in_leaf
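    # A hedged sketch of how GridSearchCV (imported above but unused in the
    # original script) could drive this section; the grid values are
    # illustrative, not tuned:
    # param_grid = {
    #     'num_leaves': [31, 63, 127],
    #     'learning_rate': [0.03, 0.1],
    #     'n_estimators': [100, 200],
    # }
    # gs = GridSearchCV(LGBMClassifier(), param_grid, scoring='roc_auc', cv=cv)
    # gs.fit(X, y)
    # classifier = gs.best_estimator_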

    '''Model fit, predict and ROC'''
    colors = cycle(['cyan', 'indigo', 'seagreen', 'orange', 'blue'])
    lw = 2
    mean_f1 = 0.0
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 500)
    i_of_roc = 0
    a = 0

    th = 0.5

    for (train_indice, test_indice), color in zip(cv.split(X, y), colors):
        a_model = classifier.fit(X[train_indice], y[train_indice])

        # y_predict_label = a_model.predict(X[test_indice])

        probas_ = a_model.predict_proba(X[test_indice])

        fpr, tpr, thresholds = roc_curve(y[test_indice], probas_[:, 1])

        a += 1

        mean_tpr += np.interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0

        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=lw, color=color, label='ROC fold %d (area = %0.4f)' % (i_of_roc, roc_auc))
        i_of_roc += 1

        # binarize the fold's probabilities at threshold th to compute F1
        label_transformed = probas_[:, 1]
        for i in range(len(label_transformed)):
            if label_transformed[i] > th:
                label_transformed[i] = 1
            else:
                label_transformed[i] = 0
        lt = label_transformed.astype('int32')
        f1 = f1_score(y[test_indice], lt)
        mean_f1 += f1

    plt.plot([0, 1], [0, 1], linestyle='--', lw=lw, color='k', label='Luck')

    mean_tpr /= cv.get_n_splits(X, y)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    print('mean_auc=' + str(mean_auc))
    print('mean_f1=' + str(mean_f1 / 5))
    plt.plot(mean_fpr, mean_tpr, color='g', linestyle='--', label='Mean ROC (area = %0.4f)' % mean_auc, lw=lw)
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate  mean_f1: ' + str(mean_f1 / 5))
    plt.ylabel('True Positive Rate')

    plt.title('ROC_lgb_' + str(len_feature_choose) + '_features_f1_' + str(mean_f1 / 5))
    plt.legend(loc="lower right")
    plt.savefig('../result/pred_ROC_XL' + '_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) +
                '_proba_to_label_using_th_' + str(th) + '.png')
    # plt.show()

    # refit on the full training set before predicting
    a_model = classifier.fit(X, y)

    # label_predict = a_model.predict(data_predict_filled_after_feature_selection)  # predict on B_test
    proba_predict = a_model.predict_proba(data_predict_filled_after_feature_selection)

    '''proba result'''
    result_file_name = '../result/pred_result_XL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + '_proba.csv'
    write_predict_results_to_csv(result_file_name, data_predict_user_id, proba_predict[:, 1].tolist())

    # '''write the results to submit'''
    # result_file_name = '../result/pred_result_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + '.csv'
    # write_predict_results_to_csv(result_file_name, data_predict_user_id, label_predict.tolist())

    '''results file'''
    label_transformed = proba_predict[:, 1]
    for i in range(len(label_transformed)):
        if label_transformed[i] > th:
            label_transformed[i] = 1
        else:
            label_transformed[i] = 0
    lt = label_transformed.astype('int32')
    result_file_name = '../result/pred_result_XL_N_' + str(parameter_n_estimators) + '_features_' + str(len_feature_choose) + \
        '_proba_to_label_using_th_' + str(th) + '.csv'
    write_predict_results_to_csv(result_file_name, data_predict_user_id, lt.tolist())
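
# The original file defines xgb_lgb_cv_modeling() but never calls it; a
# minimal entry point so the script runs directly (assumes the ../data files
# described at the top of the file exist):
if __name__ == '__main__':
    xgb_lgb_cv_modeling()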