├── README.md
├── XGBoost调参
│   ├── xgboost调参.py
│   └── 调参过程结果截图.docx
├── data.rar
├── 南衣-上海财经大学-基于适度贪婪调参策略下的xgboost集成模型.docx
├── 数据预处理
│   └── 数据选择.py
├── 泛化时的程序
│   ├── test_model1.py
│   ├── test_model2.py
│   ├── test_model3.py
│   ├── test_model4.py
│   ├── test_model5.py
│   ├── test_model6.py
│   ├── test_model7.py
│   ├── test_model8.py
│   └── 泛化时的集成学习.py
└── 训练时的程序
    ├── model1.py
    ├── model2.py
    ├── model3.py
    ├── model4.py
    ├── model5.py
    ├── model6.py
    ├── model7.py
    ├── model8.py
    └── 训练时的集成学习.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# -融360-
Code share for the "rejection inference" task of the 3rd Rong360 Tianji Intelligent Finance Algorithm Challenge -- 4th place in the semi-final
>>An XGBoost ensemble model built on a moderately greedy parameter-tuning strategy


The training set released for this task is fairly large: roughly 100,000 rows, each with 6,745 feature columns, which makes processing slow. The processing time on its own is manageable -- I can simply spend more time on it -- but a dataset this size also runs into hardware limits, so the data first has to be filtered down in scale.

--------------------------------------------------------------------------------
/XGBoost调参/xgboost调参.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV  # Performing grid search

df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data0 = df.drop(['id', 'loan_dt', 'tag'], axis=1)

# Row filtering: keep only rows whose missing-value count is under the threshold
data = data0
X = data.isnull().sum(axis=1)  # X holds the missing-value count of each row
x = list()  # x collects the row positions that pass the missing-value threshold
for i in range(len(X)):
    if X.iloc[i] < 600:  # the larger the threshold, the more rows survive: 500 -> 1354, 600 -> 2456, 675 -> 3313, 700 -> 3636
        x.append(i)
data = data.iloc[x]

train = data
target = 'label'

def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics='auc',
                          early_stopping_rounds=early_stopping_rounds,
                          show_stdv=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')
    # Predict on the training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    # Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))

# Choose all predictors except the target column
predictors = [x for x in train.columns if x not in [target]]
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=6,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27)
modelfit(xgb1, train, predictors)
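# Note (illustrative): with early stopping, xgb.cv returns a pandas DataFrame
# holding one row per boosting round that was kept, so cvresult.shape[0] above
# becomes the tuned n_estimators; the cross-validated AUC at that round can be
# read off with:
#   print(cvresult['test-auc-mean'].iloc[-1])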

# First, tune max_depth and min_child_weight together
param_test1 = {
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_child_weight': [1, 2, 3, 4, 5, 6]
}
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch1.fit(train[predictors], train[target])
print("gsearch1 mean test scores:", gsearch1.cv_results_['mean_test_score'],
      " gsearch1.best_params_:", gsearch1.best_params_,
      " gsearch1.best_score_:", gsearch1.best_score_)
# Best:        max_depth=3, min_child_weight=3, score 0.78699
# Second best: max_depth=7, min_child_weight=4, score 0.78245

# With max_depth=3, min_child_weight=3 fixed, tune gamma
param_test2 = {
    'gamma': [i / 10.0 for i in range(0, 5)]
}
gsearch2 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=3,
        min_child_weight=3,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test2,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch2.fit(train[predictors], train[target])
print("gsearch2 mean test scores:", gsearch2.cv_results_['mean_test_score'],
      " gsearch2.best_params_:", gsearch2.best_params_,
      " gsearch2.best_score_:", gsearch2.best_score_)
# Result -- best:   gamma=0.2
#        second best: gamma=0.4

# With max_depth=3, min_child_weight=3, gamma=0.2 fixed, tune subsample and colsample_bytree
param_test3 = {
    'subsample': [i / 10.0 for i in range(1, 10, 2)],
    'colsample_bytree': [i / 10.0 for i in range(1, 10, 2)]
}
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=3,
        min_child_weight=3,
        gamma=0.2,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test3,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch3.fit(train[predictors], train[target])
print("gsearch3 mean test scores:", gsearch3.cv_results_['mean_test_score'],
      " gsearch3.best_params_:", gsearch3.best_params_,
      " gsearch3.best_score_:", gsearch3.best_score_)
# Result: subsample=0.9, colsample_bytree=0.5

# Refine the search around the previous optimum
param_test4 = {
    'subsample': [i / 10.0 for i in range(8, 10)],
    'colsample_bytree': [i / 10.0 for i in range(4, 7)]
}
gsearch4 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=3,
        min_child_weight=3,
        gamma=0.2,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test4,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch4.fit(train[predictors], train[target])
print("gsearch4 mean test scores:", gsearch4.cv_results_['mean_test_score'],
      " gsearch4.best_params_:", gsearch4.best_params_,
      " gsearch4.best_score_:", gsearch4.best_score_)
# Result -- best:   subsample=0.9, colsample_bytree=0.8
#        second best: subsample=0.8, colsample_bytree=0.8
# Parameter group 1: max_depth=3, min_child_weight=3, gamma=0.2, subsample=0.9, colsample_bytree=0.8
# Parameter group 2: max_depth=3, min_child_weight=3, gamma=0.2, subsample=0.8, colsample_bytree=0.8
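# The "moderately greedy" idea behind the branching above: rather than committing
# to the single best setting at each stage, keep the top two candidates and keep
# searching from both. A minimal helper for pulling them out of a fitted
# GridSearchCV (illustrative; top_k_params is our name, not the repo's):
import numpy as np

def top_k_params(gsearch, k=2):
    results = gsearch.cv_results_
    order = np.argsort(results['mean_test_score'])[::-1][:k]
    return [(results['params'][i], results['mean_test_score'][i]) for i in order]

# e.g. top_k_params(gsearch4) returns the two (subsample, colsample_bytree)
# settings that seed parameter groups 1 and 2 above.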

# With max_depth=3, min_child_weight=3, gamma=0.4 fixed, tune subsample and colsample_bytree
param_test5 = {
    'subsample': [i / 10.0 for i in range(1, 10, 2)],
    'colsample_bytree': [i / 10.0 for i in range(1, 10, 2)]
}
gsearch5 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=3,
        min_child_weight=3,
        gamma=0.4,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test5,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch5.fit(train[predictors], train[target])
print("gsearch5 mean test scores:", gsearch5.cv_results_['mean_test_score'],
      " gsearch5.best_params_:", gsearch5.best_params_,
      " gsearch5.best_score_:", gsearch5.best_score_)
# Result -- best:   subsample=0.9, colsample_bytree=0.5
#        second best: subsample=0.95, colsample_bytree=0.5

# Refine
param_test6 = {
    'subsample': [i / 10.0 for i in range(8, 10)],
    'colsample_bytree': [i / 10.0 for i in range(4, 7)]
}
gsearch6 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=3,
        min_child_weight=3,
        gamma=0.4,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test6,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch6.fit(train[predictors], train[target])
print("gsearch6 mean test scores:", gsearch6.cv_results_['mean_test_score'],
      " gsearch6.best_params_:", gsearch6.best_params_,
      " gsearch6.best_score_:", gsearch6.best_score_)
# Parameter group 3: max_depth=3, min_child_weight=3, gamma=0.4, subsample=0.9, colsample_bytree=0.5
# Parameter group 4: max_depth=3, min_child_weight=3, gamma=0.4, subsample=0.95, colsample_bytree=0.5

# Back on the second branch: with max_depth=7, min_child_weight=4 fixed, tune gamma
param_test7 = {
    'gamma': [i / 10.0 for i in range(0, 5)]
}
gsearch7 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=200,
        max_depth=7,
        min_child_weight=4,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test7,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch7.fit(train[predictors], train[target])
print("gsearch7 mean test scores:", gsearch7.cv_results_['mean_test_score'],
      " gsearch7.best_params_:", gsearch7.best_params_,
      " gsearch7.best_score_:", gsearch7.best_score_)
# Result -- best:   gamma=0
#        second best: gamma=0.1

# With max_depth=7, min_child_weight=4, gamma=0 fixed, tune subsample and colsample_bytree
param_test8 = {
    'subsample': [i / 10.0 for i in range(1, 10, 2)],
    'colsample_bytree': [i / 10.0 for i in range(1, 10, 2)]
}
gsearch8 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=177,
        max_depth=7,
        min_child_weight=4,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test8,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch8.fit(train[predictors], train[target])
print("gsearch8 mean test scores:", gsearch8.cv_results_['mean_test_score'],
      " gsearch8.best_params_:", gsearch8.best_params_,
      " gsearch8.best_score_:", gsearch8.best_score_)
# Result: subsample=0.9, colsample_bytree=0.9

# Refine
param_test9 = {
    'subsample': [0.85, 0.9, 0.95],
    'colsample_bytree': [0.85, 0.9, 0.95]
}
gsearch9 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=177,
        max_depth=7,
        min_child_weight=4,
        gamma=0,
        subsample=0.9,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test9,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch9.fit(train[predictors], train[target])
print("gsearch9 mean test scores:", gsearch9.cv_results_['mean_test_score'],
      " gsearch9.best_params_:", gsearch9.best_params_,
      " gsearch9.best_score_:", gsearch9.best_score_)
# Result -- best:   subsample=0.9, colsample_bytree=0.95
#        second best: subsample=0.95, colsample_bytree=0.95
# Parameter group 5: max_depth=7, min_child_weight=4, gamma=0, subsample=0.9, colsample_bytree=0.95
# Parameter group 6: max_depth=7, min_child_weight=4, gamma=0, subsample=0.95, colsample_bytree=0.95

# With max_depth=7, min_child_weight=4, gamma=0.1 fixed, tune subsample and colsample_bytree
param_test10 = {
    'subsample': [i / 10.0 for i in range(1, 10, 2)],
    'colsample_bytree': [i / 10.0 for i in range(1, 10, 2)]
}
gsearch10 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=177,
        max_depth=7,
        min_child_weight=4,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test10,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch10.fit(train[predictors], train[target])
print("gsearch10 mean test scores:", gsearch10.cv_results_['mean_test_score'],
      " gsearch10.best_params_:", gsearch10.best_params_,
      " gsearch10.best_score_:", gsearch10.best_score_)
# Result: subsample=0.9, colsample_bytree=0.3

# Refine
param_test11 = {
    'subsample': [0.85, 0.9, 0.95],
    'colsample_bytree': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4]
}
gsearch11 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=177,
        max_depth=7,
        min_child_weight=4,
        gamma=0.1,
        subsample=0.9,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test11,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch11.fit(train[predictors], train[target])
print("gsearch11 mean test scores:", gsearch11.cv_results_['mean_test_score'],
      " gsearch11.best_params_:", gsearch11.best_params_,
      " gsearch11.best_score_:", gsearch11.best_score_)
# Result -- best:   subsample=0.9, colsample_bytree=0.15
#        second best: subsample=0.9, colsample_bytree=0.3
# Parameter group 7: max_depth=7, min_child_weight=4, gamma=0.1, subsample=0.9, colsample_bytree=0.15
# Parameter group 8: max_depth=7, min_child_weight=4, gamma=0.1, subsample=0.9, colsample_bytree=0.3


##################################################################################################
## In total, eight experimental parameter groups were obtained:

# Parameter group 1: max_depth=3, min_child_weight=3, gamma=0.2, subsample=0.9, colsample_bytree=0.8
# Parameter group 2: max_depth=3, min_child_weight=3, gamma=0.2, subsample=0.8, colsample_bytree=0.8
# Parameter group 3: max_depth=3, min_child_weight=3, gamma=0.4, subsample=0.9, colsample_bytree=0.5
# Parameter group 4: max_depth=3, min_child_weight=3, gamma=0.4, subsample=0.95, colsample_bytree=0.5
# Parameter group 5: max_depth=7, min_child_weight=4, gamma=0, subsample=0.9, colsample_bytree=0.95
# Parameter group 6: max_depth=7, min_child_weight=4, gamma=0, subsample=0.95, colsample_bytree=0.95
# Parameter group 7: max_depth=7, min_child_weight=4, gamma=0.1, subsample=0.9, colsample_bytree=0.15
# Parameter group 8: max_depth=7, min_child_weight=4, gamma=0.1, subsample=0.9, colsample_bytree=0.3

####################################################################################################


# Next, tune reg_alpha and reg_lambda
param_test12 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100]
}
gsearch12 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=177,
        max_depth=3,
        min_child_weight=3,
        gamma=0.2,
        subsample=0.9,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test12,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch12.fit(train[predictors], train[target])
print("gsearch12 mean test scores:", gsearch12.cv_results_['mean_test_score'],
      " gsearch12.best_params_:", gsearch12.best_params_,
      " gsearch12.best_score_:", gsearch12.best_score_)
# Best around reg_alpha=0.00001; keep refining

param_test13 = {
    'reg_alpha': [0, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3]
}
gsearch13 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.05,
        n_estimators=1000,
        max_depth=3,
        min_child_weight=3,
        gamma=0.2,
        subsample=0.9,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test13,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch13.fit(train[predictors], train[target])
print("gsearch13 mean test scores:", gsearch13.cv_results_['mean_test_score'],
      " gsearch13.best_params_:", gsearch13.best_params_,
      " gsearch13.best_score_:", gsearch13.best_score_)
# Final choice: reg_alpha=1e-5


# Now tune reg_lambda
param_test14 = {
    'reg_lambda': [0, 1e-6, 1e-5, 1e-4, 1e-3]
}
gsearch14 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=177,
        max_depth=3,
        min_child_weight=3,
        gamma=0.2,
        reg_alpha=1e-5,
        subsample=0.9,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test14,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch14.fit(train[predictors], train[target])
print("gsearch14 mean test scores:", gsearch14.cv_results_['mean_test_score'],
      " gsearch14.best_params_:", gsearch14.best_params_,
      " gsearch14.best_score_:", gsearch14.best_score_)
# Best in this range: reg_lambda=0.001

param_test15 = {
    'reg_lambda': [1, 10, 100]
}
gsearch15 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=177,
        max_depth=3,
        min_child_weight=3,
        gamma=0.2,
        reg_alpha=1e-5,
        subsample=0.9,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27),
    param_grid=param_test15,
    scoring='roc_auc',
    n_jobs=4,
    cv=5)
gsearch15.fit(train[predictors], train[target])
print("gsearch15 mean test scores:", gsearch15.cv_results_['mean_test_score'],
      " gsearch15.best_params_:", gsearch15.best_params_,
      " gsearch15.best_score_:", gsearch15.best_score_)
# Final choice: reg_lambda=1

# Final regularization parameters: reg_alpha=1e-5, reg_lambda=1
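# Note: the searches above run at learning_rate=0.1 with a few hundred trees so
# the grids stay cheap; the final models (训练时的程序/, 泛化时的程序/) drop
# learning_rate to 0.01 and raise num_boost_round into the thousands instead.
# E.g. parameter group 1 re-fit at the production learning rate (illustrative
# sketch; final1 is our name, and fitting 5000 trees on this data is slow):
final1 = XGBClassifier(learning_rate=0.01, n_estimators=5000,
                       max_depth=3, min_child_weight=3, gamma=0.2,
                       subsample=0.9, colsample_bytree=0.8,
                       reg_alpha=1e-5, reg_lambda=1,
                       objective='binary:logistic', nthread=4, seed=27)
# final1.fit(train[predictors], train[target])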
--------------------------------------------------------------------------------
/XGBoost调参/调参过程结果截图.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/woairong/Xgboost-integration-model-based-on-moderate-greedy-parametric-strategy/383ba81c86cb817670d9a6ee32c97496a7cf6bec/XGBoost调参/调参过程结果截图.docx
--------------------------------------------------------------------------------
/data.rar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/woairong/Xgboost-integration-model-based-on-moderate-greedy-parametric-strategy/383ba81c86cb817670d9a6ee32c97496a7cf6bec/data.rar
--------------------------------------------------------------------------------
/南衣-上海财经大学-基于适度贪婪调参策略下的xgboost集成模型.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/woairong/Xgboost-integration-model-based-on-moderate-greedy-parametric-strategy/383ba81c86cb817670d9a6ee32c97496a7cf6bec/南衣-上海财经大学-基于适度贪婪调参策略下的xgboost集成模型.docx
--------------------------------------------------------------------------------
/数据预处理/数据选择.py:
--------------------------------------------------------------------------------
import pandas as pd

# Load the five raw training-data parts
df1 = pd.read_csv('G:\\ml360\\train\\traindata1.txt', sep='\t')
df2 = pd.read_csv('G:\\ml360\\train\\traindata2.txt', sep='\t')
df3 = pd.read_csv('G:\\ml360\\train\\traindata3.txt', sep='\t')
df4 = pd.read_csv('G:\\ml360\\train\\traindata4.txt', sep='\t')
df5 = pd.read_csv('G:\\ml360\\train\\traindata5.txt', sep='\t')
# Write part 1 to 1104.csv, then append the first 13464 rows of part 2
df20 = df2.iloc[0:13464, :]
df1.to_csv('G:\\ml360\\train\\1104.csv', index=False, sep=',')
df20.to_csv('G:\\ml360\\train\\1104.csv', mode='a', index=False, header=False, sep=',')
df1104 = pd.read_csv('G:\\ml360\\train\\1104.csv')
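# Equivalent single-step construction of 1104.csv (illustrative alternative to
# the write-then-append above; train_df is our name, not the repo's):
train_df = pd.concat([df1, df2.iloc[0:13464]], ignore_index=True)
train_df.to_csv('G:\\ml360\\train\\1104.csv', index=False)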
--------------------------------------------------------------------------------
/泛化时的程序/test_model1.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing (33464 x 6746 after dropping loan_dt/id/tag below)
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
df = df.drop(['loan_dt', 'id', 'tag'], axis=1)
train_x = df.iloc[:, 1:6746]
train_y = df.iloc[:, 0]

dtrain = xgb.DMatrix(train_x, label=train_y)

# Booster parameters:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 3,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.8,
          'min_child_weight': 3,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.2,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=4000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\test_model1')  # save the experimental model
--------------------------------------------------------------------------------
/泛化时的程序/test_model2.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing (33464 x 6746 after dropping loan_dt/id/tag below)
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
df = df.drop(['loan_dt', 'id', 'tag'], axis=1)
train_x = df.iloc[:, 1:6746]
train_y = df.iloc[:, 0]

dtrain = xgb.DMatrix(train_x, label=train_y)

# Booster parameters:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 3,
          'lambda': 1,
          'subsample': 0.8,
          'colsample_bytree': 0.8,
          'min_child_weight': 3,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.2,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=5000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\test_model2')  # save the experimental model
--------------------------------------------------------------------------------
/泛化时的程序/test_model3.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing (33464 x 6746 after dropping loan_dt/id/tag below)
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
df = df.drop(['loan_dt', 'id', 'tag'], axis=1)
train_x = df.iloc[:, 1:6746]
train_y = df.iloc[:, 0]

dtrain = xgb.DMatrix(train_x, label=train_y)

# Booster parameters:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 3,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.5,
          'min_child_weight': 3,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.4,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=6000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\test_model3')  # save the experimental model
--------------------------------------------------------------------------------
/泛化时的程序/test_model4.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing (33464 x 6746 after dropping loan_dt/id/tag below)
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
df = df.drop(['loan_dt', 'id', 'tag'], axis=1)
train_x = df.iloc[:, 1:6746]
train_y = df.iloc[:, 0]

dtrain = xgb.DMatrix(train_x, label=train_y)

# Booster parameters:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 3,
          'lambda': 1,
          'subsample': 0.95,
          'colsample_bytree': 0.95,
          'min_child_weight': 3,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.4,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=4000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\test_model4')  # save the experimental model
--------------------------------------------------------------------------------
/泛化时的程序/test_model5.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing (33464 x 6746 after dropping loan_dt/id/tag below)
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
df = df.drop(['loan_dt', 'id', 'tag'], axis=1)
train_x = df.iloc[:, 1:6746]
train_y = df.iloc[:, 0]

dtrain = xgb.DMatrix(train_x, label=train_y)

# Booster parameters:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 7,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.95,
          'min_child_weight': 4,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=1000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\test_model5')  # save the experimental model
--------------------------------------------------------------------------------
/泛化时的程序/test_model6.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing (33464 x 6746 after dropping loan_dt/id/tag below)
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
df = df.drop(['loan_dt', 'id', 'tag'], axis=1)
train_x = df.iloc[:, 1:6746]
train_y = df.iloc[:, 0]

dtrain = xgb.DMatrix(train_x, label=train_y)

# Booster parameters:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 7,
          'lambda': 1,
          'subsample': 0.95,
          'colsample_bytree': 0.95,
          'min_child_weight': 4,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=6745, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\test_model6')  # save the experimental model
--------------------------------------------------------------------------------
/泛化时的程序/test_model7.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing (33464 x 6746 after dropping loan_dt/id/tag below)
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
df = df.drop(['loan_dt', 'id', 'tag'], axis=1)
train_x = df.iloc[:, 1:6746]
train_y = df.iloc[:, 0]

dtrain = xgb.DMatrix(train_x, label=train_y)

# Booster parameters:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 7,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.3,
          'min_child_weight': 4,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.1,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=6500, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\test_model7')  # save the experimental model
--------------------------------------------------------------------------------
/泛化时的程序/test_model8.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing (33464 x 6746 after dropping loan_dt/id/tag below)
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
df = df.drop(['loan_dt', 'id', 'tag'], axis=1)
train_x = df.iloc[:, 1:6746]
train_y = df.iloc[:, 0]

dtrain = xgb.DMatrix(train_x, label=train_y)

# Booster parameters:
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 7,
          'lambda': 1,
          'subsample': 0.95,
          'colsample_bytree': 0.4,
          'min_child_weight': 4,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.1,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=5000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\test_model8')  # save the experimental model
--------------------------------------------------------------------------------
/泛化时的程序/泛化时的集成学习.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb

# Load the test set released for the competition
test = pd.read_csv('G:\\ml360\\train\\test\\test(p).csv')  # test(p).csv is the released test-set data
id = test['id']
test = test.iloc[:, 2:6747]

dtest = xgb.DMatrix(test)

bst1 = xgb.Booster(model_file='G:/ml360/train/test/test_model1')
bst2 = xgb.Booster(model_file='G:/ml360/train/test/test_model2')
bst3 = xgb.Booster(model_file='G:/ml360/train/test/test_model3')
bst4 = xgb.Booster(model_file='G:/ml360/train/test/test_model4')
bst5 = xgb.Booster(model_file='G:/ml360/train/test/test_model5')
bst6 = xgb.Booster(model_file='G:/ml360/train/test/test_model6')
bst7 = xgb.Booster(model_file='G:/ml360/train/test/test_model7')
bst8 = xgb.Booster(model_file='G:/ml360/train/test/test_model8')

ypred1 = bst1.predict(dtest)
ypred2 = bst2.predict(dtest)
ypred3 = bst3.predict(dtest)
ypred4 = bst4.predict(dtest)
ypred5 = bst5.predict(dtest)
ypred6 = bst6.predict(dtest)
ypred7 = bst7.predict(dtest)
ypred8 = bst8.predict(dtest)

# Weighted blend of the eight base models
ypred = 0.296*ypred1 + 0.148*ypred2 + 0.148*ypred3 + 0.074*ypred4 + 0.148*ypred5 + 0.074*ypred6 + 0.074*ypred7 + 0.038*ypred8

# Save the submission
pd_ypred = pd.DataFrame(list(ypred), columns=['prob'])
result = pd.concat([id, pd_ypred], axis=1)
result.to_csv('G:\\ml360\\train\\test\\result.txt', index=False, sep=',')
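# The blend weights above sum to 1.0 (0.296 + 3*0.148 + 3*0.074 + 0.038), so the
# output stays a valid probability. A generic form of the same blend
# (illustrative; preds/weights/ypred_blend are our names):
import numpy as np
preds = [ypred1, ypred2, ypred3, ypred4, ypred5, ypred6, ypred7, ypred8]
weights = np.array([0.296, 0.148, 0.148, 0.074, 0.148, 0.074, 0.074, 0.038])
ypred_blend = np.average(np.vstack(preds), axis=0, weights=weights)
assert np.allclose(ypred_blend, ypred)  # same result as the hand-written sum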
--------------------------------------------------------------------------------
/训练时的程序/model1.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import plot_importance
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters (parameter group 1):
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 3,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.8,
          'min_child_weight': 3,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.2,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=5000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\model1')  # save the experimental model

ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5) * 1

# Plot feature-importance scores
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 800
plot_importance(bst)

# Report metrics and plot the ROC curve
from sklearn import metrics
print('Results under parameter set 1:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (parameter set 1)')
plt.legend(loc="lower right")
plt.show()
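# The eight training scripts (model1.py .. model8.py) are identical except for
# the tuned parameter group and num_boost_round, so they can be collapsed into
# one loop (illustrative sketch; BASE, PARAM_GROUPS and ROUNDS are our names):
BASE = {'booster': 'gbtree', 'objective': 'binary:logistic', 'eval_metric': 'auc',
        'lambda': 1, 'alpha': 1e-5, 'seed': 0, 'nthread': 4, 'verbosity': 0,
        'learning_rate': 0.01}
PARAM_GROUPS = [
    dict(max_depth=3, min_child_weight=3, gamma=0.2, subsample=0.9, colsample_bytree=0.8),
    dict(max_depth=3, min_child_weight=3, gamma=0.2, subsample=0.8, colsample_bytree=0.8),
    dict(max_depth=3, min_child_weight=3, gamma=0.4, subsample=0.9, colsample_bytree=0.5),
    dict(max_depth=3, min_child_weight=3, gamma=0.4, subsample=0.95, colsample_bytree=0.5),
    dict(max_depth=7, min_child_weight=4, gamma=0, subsample=0.9, colsample_bytree=0.95),
    dict(max_depth=7, min_child_weight=4, gamma=0, subsample=0.95, colsample_bytree=0.95),
    dict(max_depth=7, min_child_weight=4, gamma=0.1, subsample=0.9, colsample_bytree=0.15),
    dict(max_depth=7, min_child_weight=4, gamma=0.1, subsample=0.9, colsample_bytree=0.3),
]
ROUNDS = [5000, 5000, 5000, 5000, 1000, 1000, 1000, 2000]  # as used in model1..model8
for i, (grp, rounds) in enumerate(zip(PARAM_GROUPS, ROUNDS), 1):
    booster = xgb.train({**BASE, **grp}, dtrain, num_boost_round=rounds,
                        evals=[(dtrain, 'train')])
    booster.save_model('G:\\ml360\\train\\test\\model%d' % i)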
--------------------------------------------------------------------------------
/训练时的程序/model2.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import plot_importance
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters (parameter group 2):
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 3,
          'lambda': 1,
          'subsample': 0.8,
          'colsample_bytree': 0.8,
          'min_child_weight': 3,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.2,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=5000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\model2')  # save the experimental model

ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5) * 1

# Plot feature-importance scores
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 800
plot_importance(bst)

# Report metrics and plot the ROC curve
from sklearn import metrics
print('Results under parameter set 2:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (parameter set 2)')
plt.legend(loc="lower right")
plt.show()
--------------------------------------------------------------------------------
/训练时的程序/model3.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import plot_importance
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters (parameter group 3):
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 3,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.5,
          'min_child_weight': 3,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.4,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=5000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\model3')  # save the experimental model

ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5) * 1

# Plot feature-importance scores
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 800
plot_importance(bst)

# Report metrics and plot the ROC curve
from sklearn import metrics
print('Results under parameter set 3:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (parameter set 3)')
plt.legend(loc="lower right")
plt.show()
--------------------------------------------------------------------------------
/训练时的程序/model4.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import plot_importance
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters (parameter group 4):
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 3,
          'lambda': 1,
          'subsample': 0.95,
          'colsample_bytree': 0.5,
          'min_child_weight': 3,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.4,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=5000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\model4')  # save the experimental model

ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5) * 1

# Plot feature-importance scores
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 800
plot_importance(bst)

# Report metrics and plot the ROC curve
from sklearn import metrics
print('Results under parameter set 4:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (parameter set 4)')
plt.legend(loc="lower right")
plt.show()
--------------------------------------------------------------------------------
/训练时的程序/model5.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import plot_importance
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters (parameter group 5):
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 7,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.95,
          'min_child_weight': 4,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0,
          'learning_rate': 0.01}  # 0.8319/0.8299--0.8302 at num_boost_round = 4000/7000
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=1000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\model5')  # save the experimental model

ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5) * 1

# Plot feature-importance scores
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 800
plot_importance(bst)

# Report metrics and plot the ROC curve
from sklearn import metrics
print('Results under parameter set 5:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (parameter set 5)')
plt.legend(loc="lower right")
plt.show()
--------------------------------------------------------------------------------
/训练时的程序/model6.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import plot_importance
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters (parameter group 6):
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 7,
          'lambda': 1,
          'subsample': 0.95,
          'colsample_bytree': 0.95,
          'min_child_weight': 4,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=1000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\model6')  # save the experimental model

ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5) * 1

# Plot feature-importance scores
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 800
plot_importance(bst)

# Report metrics and plot the ROC curve
from sklearn import metrics
print('Results under parameter set 6:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (parameter set 6)')
plt.legend(loc="lower right")
plt.show()
--------------------------------------------------------------------------------
/训练时的程序/model7.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import plot_importance
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters (parameter group 7):
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 7,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.15,
          'min_child_weight': 4,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.1,
          'learning_rate': 0.01}
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=1000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\model7')  # save the experimental model

ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5) * 1

# Plot feature-importance scores
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 800
plot_importance(bst)

# Report metrics and plot the ROC curve
from sklearn import metrics
print('Results under parameter set 7:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (parameter set 7)')
plt.legend(loc="lower right")
plt.show()
--------------------------------------------------------------------------------
/训练时的程序/model8.py:
--------------------------------------------------------------------------------
import pandas as pd
from xgboost import plot_importance
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters (parameter group 8):
params = {'booster': 'gbtree',
          'objective': 'binary:logistic',
          'eval_metric': 'auc',
          'max_depth': 7,
          'lambda': 1,
          'subsample': 0.9,
          'colsample_bytree': 0.3,
          'min_child_weight': 4,
          'alpha': 1e-5,
          'seed': 0,
          'nthread': 4,
          'verbosity': 0,
          'gamma': 0.1,
          'learning_rate': 0.01}  # 0.8319/0.8299--0.8302 at num_boost_round = 4000/7000
watchlist = [(dtrain, 'train')]
bst = xgb.train(params, dtrain, num_boost_round=2000, evals=watchlist)
bst.save_model('G:\\ml360\\train\\test\\model8')  # save the experimental model

ypred = bst.predict(dtest)
y_pred = (ypred >= 0.5) * 1

# Plot feature-importance scores
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 20, 800
plot_importance(bst)

# Report metrics and plot the ROC curve
from sklearn import metrics
print('Results under parameter set 8:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (parameter set 8)')
plt.legend(loc="lower right")
plt.show()
--------------------------------------------------------------------------------
/训练时的程序/训练时的集成学习.py:
--------------------------------------------------------------------------------
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Load the preprocessed training set; 1104.csv is the 33464 x 6749 table from
# preprocessing
df = pd.read_csv('G:\\ml360\\train\\1104.csv')
data = df.iloc[:, 4:6749]
flag = df['label']
train_x, test_x, train_y, test_y = train_test_split(data, flag, test_size=0.3, random_state=0)

dtest = xgb.DMatrix(test_x)

bst1 = xgb.Booster(model_file='G:/ml360/train/test/model1')
bst2 = xgb.Booster(model_file='G:/ml360/train/test/model2')
bst3 = xgb.Booster(model_file='G:/ml360/train/test/model3')
bst4 = xgb.Booster(model_file='G:/ml360/train/test/model4')
bst5 = xgb.Booster(model_file='G:/ml360/train/test/model5')
bst6 = xgb.Booster(model_file='G:/ml360/train/test/model6')
bst7 = xgb.Booster(model_file='G:/ml360/train/test/model7')
bst8 = xgb.Booster(model_file='G:/ml360/train/test/model8')

ypred1 = bst1.predict(dtest)
ypred2 = bst2.predict(dtest)
ypred3 = bst3.predict(dtest)
ypred4 = bst4.predict(dtest)
ypred5 = bst5.predict(dtest)
ypred6 = bst6.predict(dtest)
ypred7 = bst7.predict(dtest)
ypred8 = bst8.predict(dtest)

# Weighted blend of the eight base models
ypred = 0.296*ypred1 + 0.148*ypred2 + 0.148*ypred3 + 0.074*ypred4 + 0.148*ypred5 + 0.074*ypred6 + 0.074*ypred7 + 0.038*ypred8
y_pred = (ypred >= 0.5) * 1

from sklearn import metrics
print('Results for the ensemble:')
print('AUC: %.4f' % metrics.roc_auc_score(test_y, ypred))
print('ACC: %.4f' % metrics.accuracy_score(test_y, y_pred))
print('Recall: %.4f' % metrics.recall_score(test_y, y_pred))
print('F1-score: %.4f' % metrics.f1_score(test_y, y_pred))
print('Precision: %.4f' % metrics.precision_score(test_y, y_pred))
print(metrics.confusion_matrix(test_y, y_pred))

fpr, tpr, threshold = roc_curve(test_y, ypred)  # compute true/false positive rates
roc_auc = auc(fpr, tpr)  # compute the AUC

plt.figure(figsize=(10, 10))
lw = 2
plt.plot(fpr, tpr, color='darkorange', lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)  # FPR on the x-axis, TPR on the y-axis
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic (ensemble)')
plt.legend(loc="lower right")
plt.show()
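# Side-by-side check (illustrative): AUC of each base model against the blend
# on the same held-out split:
for name, pred in [('model1', ypred1), ('model2', ypred2), ('model3', ypred3),
                   ('model4', ypred4), ('model5', ypred5), ('model6', ypred6),
                   ('model7', ypred7), ('model8', ypred8), ('blend', ypred)]:
    print('%s AUC: %.4f' % (name, metrics.roc_auc_score(test_y, pred)))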