# -*- coding: utf-8 -*-
# Repository layout:
#   ├── README.md
#   └── model_yw.py
#
# /README.md
#   # User-purchase-forecast-in-consumer-finance-scenarios
#   ### Entry for the 2018 China Merchants Bank financial forecasting competition
#   ### 1. Data preprocessing, feature engineering and model prediction all live in the .py file;
#   ### 2. Single-model score: 0.860;
#   ### 3. Reached the finals through model ensembling;
#   ### 4. The result is not great -- please be kind.
#
# /model_yw.py
"""
Created on Tue Jun 5 11:23:12 2018

@author: yuwei
"""

import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve
from sklearn import metrics


path ='..//dataset//'

# Look-back windows, in days, used by genFeatureLog: a window of ``d`` days
# keeps the rows whose day-of-month (OCC_TIM) is >= 32 - d.
_DAY_WINDOWS = (31, 24, 21, 18, 14, 10, 7, 5, 4, 3, 2, 1)
# Hour thresholds applied on top of the last (1-day) window.
_HOUR_THRESHOLDS = (12, 18, 21, 24)
# (log column, feature-name prefix) pairs for the distinct-label counts.
_LABEL_COLUMNS = (
    ('EVT_LBL', 'diff_lbl_'),
    ('EVT_LBL_0', 'diff_lbl_0_'),
    ('EVT_LBL_1', 'diff_lbl_1_'),
    ('EVT_LBL_2', 'diff_lbl_2_'),
)


#%%
def _parse_hour(stamp):
    """Return the hour taken from characters 11-12 of ``str(stamp)``.

    NOTE(review): hours 0 *and* 1 are both mapped to 24 (``> 1``); ``> 0`` may
    have been intended, but the original mapping is kept so that feature
    values do not change.
    """
    hour = int(str(stamp)[11:13])
    return hour if hour > 1 else 24


def _load_log(file_path):
    """Read one tab-separated APP log file and derive the helper columns.

    Adds ``EVT_LBL_0/1/2`` (the first three '-'-separated parts of
    ``EVT_LBL``) and ``OCC_TIM_HOUR``, then replaces ``OCC_TIM`` with the
    integer found at characters 8-9 of its string form (the day of month,
    assuming a 'YYYY-MM-DD HH:MM:SS' timestamp -- TODO confirm format).
    """
    log = pd.read_csv(file_path, sep='\t')
    parts = log['EVT_LBL'].str.split('-')
    for level in range(3):
        log['EVT_LBL_%d' % level] = parts.str[level]
    # The hour must be derived before OCC_TIM is overwritten with the day.
    log['OCC_TIM_HOUR'] = log['OCC_TIM'].map(_parse_hour)
    log['OCC_TIM'] = log['OCC_TIM'].map(lambda stamp: int(str(stamp)[8:10]))
    return log


def loadData(path):
    """Load the competition tables found under ``path``.

    Parameters
    ----------
    path : str
        Directory prefix (including the trailing separator) that is
        concatenated with the fixed file names.

    Returns
    -------
    tuple
        ``(train_log, train_agg, train_flg, test_log, test_agg)`` where the
        two log frames carry the derived columns described in ``_load_log``.
    """
    # Training set: APP behaviour log, attribute/consumption table, labels.
    train_log = _load_log(path+'train_log.csv')
    train_agg = pd.read_csv(path+'train_agg.csv', sep='\t')
    train_flg = pd.read_csv(path+'train_flg.csv', sep='\t')
    # Test set: same tables, without labels.
    test_log = _load_log(path+'test_log.csv')
    test_agg = pd.read_csv(path+'test_agg.csv', sep='\t')

    return train_log,train_agg,train_flg,test_log,test_agg


#%%
def _sum_v(frame, first, last):
    """Return ``V<first> + ... + V<last>`` added left to right.

    Plain ``+`` is used (rather than ``DataFrame.sum``) so NaN propagates and
    the floating-point accumulation order matches a hand-written sum.
    """
    total = frame['V%d' % first]
    for k in range(first + 1, last + 1):
        total = total + frame['V%d' % k]
    return total


def genFeatureAgg(data,agg):
    """Merge the agg table (V1..V30) into ``data`` and add sum/ratio features.

    ``data`` and ``agg`` are merged on their common columns (pandas default).
    Added columns, in order: the V1..V30 total, three 10-column sums, six
    5-column sums, and each 5-column sum divided by the total.

    The historical column names ``V10_to_V20`` and ``V20_to_V30`` actually
    cover V11..V20 and V21..V30; the names are kept unchanged.
    """
    ans = pd.merge(data, agg)

    ans['V1_to_V30'] = _sum_v(ans, 1, 30)
    ans['V1_to_V10'] = _sum_v(ans, 1, 10)
    ans['V10_to_V20'] = _sum_v(ans, 11, 20)
    ans['V20_to_V30'] = _sum_v(ans, 21, 30)

    groups = [(first, first + 4) for first in range(1, 31, 5)]
    for first, last in groups:
        ans['V%d_to_V%d' % (first, last)] = _sum_v(ans, first, last)
    # Share of every 5-column group in the overall total; a zero total gives
    # inf/NaN, which is left as-is.
    for first, last in groups:
        name = 'V%d_to_V%d' % (first, last)
        ans[name + '_rate'] = ans[name] / ans['V1_to_V30']

    return ans


#%%
def _window_features(window, suffix):
    """Aggregate one slice of the log into per-user features.

    Returns a frame with ``USRID`` plus, in this order: ``count_<suffix>``
    (rows per user), ``diff_lbl_<suffix>`` / ``diff_lbl_0_<suffix>`` /
    ``diff_lbl_1_<suffix>`` / ``diff_lbl_2_<suffix>`` (distinct label values
    per user) and ``type_0_count_<suffix>`` / ``type_2_count_<suffix>`` (rows
    per user with ``TCH_TYP`` 0 / 2).
    """
    by_user = window.groupby('USRID')
    columns = [by_user.size().rename('count_' + suffix)]
    for label_col, prefix in _LABEL_COLUMNS:
        columns.append(by_user[label_col].nunique().rename(prefix + suffix))
    for touch_type in (0, 2):
        hits = window[window['TCH_TYP'] == touch_type]
        columns.append(
            hits.groupby('USRID').size().rename(
                'type_%d_count_%s' % (touch_type, suffix)))
    # Users missing from one of the series get NaN in that column.
    return pd.concat(columns, axis=1).rename_axis('USRID').reset_index()


def genFeatureLog(data,logData):
    """Left-join per-user log statistics for several time windows onto ``data``.

    For every look-back window in ``_DAY_WINDOWS`` the features of
    ``_window_features`` are added with the window length as suffix
    (``count_31``, ``diff_lbl_31``, ...).  The last, 1-day window is then
    narrowed further by ``OCC_TIM_HOUR >= h`` for each ``h`` in
    ``_HOUR_THRESHOLDS`` and the same features are added with suffix
    ``hour<h>`` (``count_hour12``, ...).

    The hour features used to be named with the day index (always ``1``), so
    all four hour windows produced identical column names and collided in
    the merges; they are now named after the hour threshold.  Column order
    and values are unchanged.

    Neither ``data`` nor ``logData`` is modified.
    """
    ans = data
    window = logData

    # The windows shrink monotonically, so each filter is applied to the
    # previous, already-filtered slice.
    for days in _DAY_WINDOWS:
        window = window[window['OCC_TIM'] >= 32 - days]
        ans = pd.merge(ans, _window_features(window, str(days)),
                       on='USRID', how='left')

    # Activity in the last hours of the final day (``window`` is now the
    # 1-day slice).
    for hour in _HOUR_THRESHOLDS:
        window = window[window['OCC_TIM_HOUR'] >= hour]
        ans = pd.merge(ans, _window_features(window, 'hour%d' % hour),
                       on='USRID', how='left')
    return ans


#%%
# Booster settings shared by the final model (modelXgb) and the
# cross-validation model (xgb_model) so the two cannot drift apart.
_XGB_PARAMS = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'eta': 0.03,
    'max_depth': 6,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'scale_pos_weight': 1,
    'min_child_weight': 18,
}
_NUM_BOOST_ROUND = 370


def modelXgb(train,test):
    """Train XGBoost on ``train`` and score ``test``.

    Both frames must contain ``USRID`` and ``FLAG``; every other column is
    used as a feature, by position.

    Returns a frame with ``USRID`` and the predicted probability ``RST``,
    sorted by ``RST`` in descending order.
    """
    train_y = train['FLAG'].values
    train_x = train.drop(['USRID','FLAG'],axis=1).values
    test_x = test.drop(['USRID','FLAG'],axis=1).values

    dtrain = xgb.DMatrix(train_x, label=train_y)
    dtest = xgb.DMatrix(test_x)

    # Only the training set is watched: this reports training AUC per round.
    watchlist = [(dtrain,'train')]
    bst = xgb.train(dict(_XGB_PARAMS), dtrain,
                    num_boost_round=_NUM_BOOST_ROUND, evals=watchlist)

    # Copy so that adding RST does not write into a slice of ``test``.
    test_xy = test[['USRID']].copy()
    test_xy['RST'] = bst.predict(dtest)
    return test_xy.sort_values('RST', ascending=False)


#%%
def oneHot(logData,ans):
    """Add per-user occurrence counts of every EVT_LBL_0/1/2 value to ``ans``.

    For each of the three label levels one column per distinct value is
    left-joined on ``USRID``; the cell holds how many log rows the user has
    with that value, or NaN if none.  Columns are named
    ``<level>_oh_<value>`` (for example ``EVT_LBL_0_oh_163``).

    The names are flattened to plain strings because merging the two-level
    columns produced by ``unstack`` into a single-level frame is rejected by
    current pandas.  ``logData`` is not modified.
    """
    for level in ('EVT_LBL_0', 'EVT_LBL_1', 'EVT_LBL_2'):
        counts = logData.groupby(['USRID', level]).size().unstack()
        counts.columns = ['%s_oh_%s' % (level, value)
                          for value in counts.columns]
        ans = pd.merge(ans, counts.reset_index(), on='USRID', how='left')

    return ans


#%%
def validate(all_train):
    """Run 5-fold stratified cross-validation and report the AUC.

    Prints the fold indices, predictions, labels and AUC for every fold and
    the mean AUC at the end.  Returns the mean AUC.

    ``FLAG`` is assumed to hold 0/1 labels (the caller filters out the -1
    placeholder rows), so 1 is used as the positive label.
    """
    train_x = all_train.drop(['USRID', 'FLAG'], axis=1).values
    train_y = all_train['FLAG'].values
    auc_list = []

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
    for train_index, test_index in skf.split(train_x, train_y):
        print('\n')
        print('Train: %s | test: %s' % (train_index, test_index))
        X_train, X_test = train_x[train_index], train_x[test_index]
        y_train, y_test = train_y[train_index], train_y[test_index]

        pred_value = xgb_model(X_train, y_train, X_test)
        print(pred_value)
        print(y_test)

        fpr, tpr, _ = roc_curve(y_test, pred_value, pos_label=1)
        auc = metrics.auc(fpr, tpr)
        print('auc value:',auc)
        auc_list.append(auc)

    mean_auc = np.mean(auc_list)
    print('validate result:',mean_auc)
    return mean_auc


def xgb_model(train_set_x,train_set_y,test_set_x):
    """Fit the shared XGBoost configuration and return predictions.

    ``train_set_x`` / ``train_set_y`` are the training features and labels;
    the return value is ``model.predict`` evaluated on ``test_set_x``.
    """
    dtrain = xgb.DMatrix(train_set_x, label=train_set_y)
    dvali = xgb.DMatrix(test_set_x)
    model = xgb.train(dict(_XGB_PARAMS), dtrain,
                      num_boost_round=_NUM_BOOST_ROUND)
    return model.predict(dvali)


#%%

if __name__ == '__main__':
    # Script entry point.
    # Load the raw tables.
    train_log,train_agg,train_flg,test_log,test_agg = loadData(path)
    # Test users get the placeholder label -1 so train and test can share one
    # feature pipeline; copy first so the new column is not written to a slice.
    test_flg = test_agg[['USRID']].copy()
    test_flg['FLAG'] = -1
    # Stack train and test.
    flg = pd.concat([train_flg,test_flg],axis=0)
    agg = pd.concat([train_agg,test_agg],axis=0)
    log = pd.concat([train_log,test_log],axis=0)
    # Features from the agg table.
    data = genFeatureAgg(flg,agg)
    # Features from the log table.
    data = genFeatureLog(data,log)
    # Per-value counts of the EVT_LBL levels.
    data = oneHot(log,data)
    # Split back into train and test using the placeholder label.
    trainset = data[data['FLAG']!=-1]
    testset = data[data['FLAG']==-1]

    # Model validation (disabled).
    # validate(trainset)

    # Train the final model and write the submission.
    answer = modelXgb(trainset,testset)
    pd.Series(np.array(answer['RST'].values)).plot(figsize=(8, 8))
    answer.to_csv("yw.csv",index=False,sep='\t')