├── README.md
└── model_yw.py


/README.md:
--------------------------------------------------------------------------------
1 | # User-purchase-forecast-in-consumer-finance-scenarios
2 | ### 参加2018年招行金融预测比赛
3 | ### 1、从数据预处理、到特征工程、到模型预测均在py文件中；
4 | ### 2、单模型0.860；
5 | ### 3、最终通过融合进入决赛；
6 | ### 4、成绩不够好，望大佬们莫嘲笑。
7 | 


--------------------------------------------------------------------------------
/model_yw.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | """
  3 | Created on Tue Jun  5 11:23:12 2018
  4 | 
  5 | @author: yuwei
  6 | """
  7 | 
  8 | import pandas as pd
  9 | import xgboost as xgb
 10 | import numpy as np
 11 | from sklearn.model_selection import StratifiedKFold
 12 | from sklearn.metrics import roc_curve
 13 | from sklearn import metrics
 14 | 
 15 | 
# Directory that holds the competition TSV files. loadData() appends file
# names to it directly (path + 'train_log.csv'), so it must end with a
# path separator.
path ='..//dataset//'
 17 | 
 18 | #%%
 19 | def loadData(path):
 20 |     "读取数据集"
 21 |     "训练集"
 22 |     #APP操作行为日志
 23 |     train_log = pd.read_table(path+'train_log.csv',sep='\t')
 24 |     #切分EVT_LBL
 25 |     train_log['EVT_LBL_0'] = train_log.EVT_LBL.map(lambda x:x.split('-')[0])
 26 |     train_log['EVT_LBL_1'] = train_log.EVT_LBL.map(lambda x:x.split('-')[1])
 27 |     train_log['EVT_LBL_2'] = train_log.EVT_LBL.map(lambda x:x.split('-')[2])
 28 |     #获取时间
 29 |     train_log['OCC_TIM_HOUR'] = train_log.OCC_TIM.map(lambda x :int(str(x)[11:13]) if int(str(x)[11:13])>1 else 24)
 30 |     train_log['OCC_TIM'] = train_log.OCC_TIM.map(lambda x :int(str(x)[8:10]))
 31 | 
 32 | #    #获取时间
 33 | #    train_log['date'] = train_log.OCC_TIM.map(lambda x :datetime.datetime.strptime(str(x),'%Y-%m-%d %H:%M:%S'))
 34 | #    train_log['OCC_TIM'] = train_log.date.map(lambda x :x.day)
 35 | #    train_log['OCC_TIM_HOUR'] = train_log.date.map(lambda x :x.hour)
 36 |     #个人属性与信用卡消费数据
 37 |     train_agg = pd.read_table(path+'train_agg.csv',sep='\t')
 38 |     #标签数据
 39 |     train_flg = pd.read_table(path+'train_flg.csv',sep='\t')
 40 |     "测试集"
 41 |     test_log = pd.read_table(path+'test_log.csv',sep='\t')
 42 |     test_log['EVT_LBL_0'] = test_log.EVT_LBL.map(lambda x:x.split('-')[0])
 43 |     test_log['EVT_LBL_1'] = test_log.EVT_LBL.map(lambda x:x.split('-')[1])
 44 |     test_log['EVT_LBL_2'] = test_log.EVT_LBL.map(lambda x:x.split('-')[2])
 45 |     #获取时间
 46 |     test_log['OCC_TIM_HOUR'] = test_log.OCC_TIM.map(lambda x :int(str(x)[11:13]) if int(str(x)[11:13])>1 else 24)
 47 |     test_log['OCC_TIM'] = test_log.OCC_TIM.map(lambda x :int(str(x)[8:10]))
 48 | #    #获取时间
 49 | #    test_log['date'] = test_log.OCC_TIM.map(lambda x :datetime.datetime.strptime(str(x),'%Y-%m-%d %H:%M:%S'))
 50 | #    test_log['OCC_TIM'] = test_log.date.map(lambda x :x.day)
 51 | #    test_log['OCC_TIM_HOUR'] = train_log.date.map(lambda x :x.hour) 
 52 |     test_agg = pd.read_table(path+'test_agg.csv',sep='\t')
 53 | 
 54 |     return train_log,train_agg,train_flg,test_log,test_agg
 55 | 
 56 | #%%
 57 | def genFeatureAgg(data,agg):
 58 |     "特征提取--agg表:V1-V30"
 59 |     
 60 |     #保存原始表
 61 |     ans = data.copy()
 62 | 
 63 |     "提取agg表特征"
 64 |     #V1到V30属性
 65 |     ans = pd.merge(ans,agg)
 66 |     #V1到V30求和
 67 |     ans['V1_to_V30'] = ans['V1']+ans['V2']+ans['V3']+ans['V4']+ans['V5']+ans['V6']+ans['V7']+ans['V8']+ans['V9']+ans['V10']+\
 68 |     ans['V11']+ans['V12']+ans['V13']+ans['V14']+ans['V15']+ans['V16']+ans['V17']+ans['V18']+ans['V19']+ans['V20']+\
 69 |     ans['V21']+ans['V22']+ans['V23']+ans['V24']+ans['V25']+ans['V26']+ans['V27']+ans['V28']+ans['V29']+ans['V30']
 70 |     #V1到V10求和
 71 |     ans['V1_to_V10'] = ans['V1']+ans['V2']+ans['V3']+ans['V4']+ans['V5']+ans['V6']+ans['V7']+ans['V8']+ans['V9']+ans['V10']
 72 |     #V11到V20求和
 73 |     ans['V10_to_V20'] = ans['V11']+ans['V12']+ans['V13']+ans['V14']+ans['V15'] + ans['V16']+ans['V17']+ans['V18']+ans['V19']+ans['V20']
 74 |     #V20到V30求和
 75 |     ans['V20_to_V30'] = ans['V21']+ans['V22']+ans['V23']+ans['V24']+ans['V25']+ ans['V26']+ans['V27']+ans['V28']+ans['V29']+ans['V30']
 76 |     #V1到V5求和
 77 |     ans['V1_to_V5'] = ans['V1']+ans['V2']+ans['V3']+ans['V4']+ans['V5']
 78 |     #V6到V10求和
 79 |     ans['V6_to_V10'] = ans['V6']+ans['V7']+ans['V8']+ans['V9']+ans['V10']
 80 |     #V11到V15求和
 81 |     ans['V11_to_V15'] = ans['V11']+ans['V12']+ans['V13']+ans['V14']+ans['V15']
 82 |     #V16到V20求和
 83 |     ans['V16_to_V20'] = ans['V16']+ans['V17']+ans['V18']+ans['V19']+ans['V20']
 84 |     #V21到V25求和
 85 |     ans['V21_to_V25'] = ans['V21']+ans['V22']+ans['V23']+ans['V24']+ans['V25']
 86 |     #V26到V30求和
 87 |     ans['V26_to_V30'] = ans['V26']+ans['V27']+ans['V28']+ans['V29']+ans['V30']
 88 |     #求除法
 89 |     ans['V1_to_V5_rate'] = ans['V1_to_V5']/ans['V1_to_V30']
 90 |     ans['V6_to_V10_rate'] = ans['V6_to_V10']/ans['V1_to_V30']
 91 |     ans['V11_to_V15_rate'] = ans['V11_to_V15']/ans['V1_to_V30']
 92 |     ans['V16_to_V20_rate'] = ans['V16_to_V20']/ans['V1_to_V30']
 93 |     ans['V21_to_V25_rate'] = ans['V21_to_V25']/ans['V1_to_V30']
 94 |     ans['V26_to_V30_rate'] = ans['V26_to_V30']/ans['V1_to_V30']
 95 | 
 96 |     return ans
 97 | 
 98 | #%%
 99 | def genFeatureLog(data,logData):
100 |     "特征提取"
101 |     #保存原始表
102 |     ans = data.copy()
103 |     log = logData.copy()
104 |     
105 |     #按不同的天数粒度提取特征
106 |     for i in [31,24,21,18,14,10,7,5,4,3,2,1]:
107 | 
108 |        log = log[(log.OCC_TIM>=32-i)]
109 |        #共计出现多少次
110 |        log['count_'+str(i)] = log['USRID']
111 |        feat = pd.pivot_table(log,index=['USRID'],values='count_'+str(i),aggfunc='count').reset_index()
112 |        ans = pd.merge(ans,feat,on='USRID',how='left')
113 |        #共交互多少个不同的LBL
114 |        log['diff_lbl_'+str(i)] = log['USRID']
115 |        feat = pd.pivot_table(log,index=['USRID','EVT_LBL'],values='diff_lbl_'+str(i),aggfunc='count').reset_index()
116 |        feat['diff_lbl_'+str(i)] = feat['USRID']
117 |        feat = pd.pivot_table(feat,index=['USRID'],values='diff_lbl_'+str(i),aggfunc='count').reset_index()
118 |        ans = pd.merge(ans,feat,on='USRID',how='left')
119 |        #共交互多少个不同的LBL_0
120 |        log['diff_lbl_0_'+str(i)] = log['USRID']
121 |        feat = pd.pivot_table(log,index=['USRID','EVT_LBL_0'],values='diff_lbl_0_'+str(i),aggfunc='count').reset_index()
122 |        feat['diff_lbl_0_'+str(i)] = feat['USRID']
123 |        feat = pd.pivot_table(feat,index=['USRID'],values='diff_lbl_0_'+str(i),aggfunc='count').reset_index()
124 |        ans = pd.merge(ans,feat,on='USRID',how='left')
125 |        #共交互多少个不同的LBL_1
126 |        log['diff_lbl_1_'+str(i)] = log['USRID']
127 |        feat = pd.pivot_table(log,index=['USRID','EVT_LBL_1'],values='diff_lbl_1_'+str(i),aggfunc='count').reset_index()
128 |        feat['diff_lbl_1_'+str(i)] = feat['USRID']
129 |        feat = pd.pivot_table(feat,index=['USRID'],values='diff_lbl_1_'+str(i),aggfunc='count').reset_index()
130 |        ans = pd.merge(ans,feat,on='USRID',how='left')
131 |        #共交互多少个不同的LBL_2
132 |        log['diff_lbl_2_'+str(i)] = log['USRID']
133 |        feat = pd.pivot_table(log,index=['USRID','EVT_LBL_2'],values='diff_lbl_2_'+str(i),aggfunc='count').reset_index()
134 |        feat['diff_lbl_2_'+str(i)] = feat['USRID']
135 |        feat = pd.pivot_table(feat,index=['USRID'],values='diff_lbl_2_'+str(i),aggfunc='count').reset_index()
136 |        ans = pd.merge(ans,feat,on='USRID',how='left')
137 |     
138 |        #统计type为0的次数
139 |        type_0 = log[log.TCH_TYP==0]
140 |        type_0['type_0_count_'+str(i)] = type_0['USRID']
141 |        feat = pd.pivot_table(type_0,index=['USRID'],values='type_0_count_'+str(i),aggfunc='count').reset_index()
142 |        ans = pd.merge(ans,feat,on='USRID',how='left')
143 |        del type_0
144 |        #统计type为2的次数
145 |        type_2 = log[log.TCH_TYP==2]
146 |        type_2['type_2_count_'+str(i)] = type_2['USRID']
147 |        feat = pd.pivot_table(type_2,index=['USRID'],values='type_2_count_'+str(i),aggfunc='count').reset_index()
148 |        ans = pd.merge(ans,feat,on='USRID',how='left')
149 |        del type_2
150 |        
151 |        #计算最近几个小时次数
152 |        if i == 1:
153 |           for j in [12,18,21,24]:
154 |               log = log[(log.OCC_TIM_HOUR>=j)]
155 |               #共计出现多少次
156 |               log['count_hour'+str(i)] = log['USRID']
157 |               feat = pd.pivot_table(log,index=['USRID'],values='count_hour'+str(i),aggfunc='count').reset_index()
158 |               ans = pd.merge(ans,feat,on='USRID',how='left')
159 |               #共交互多少个不同的LBL
160 |               log['diff_lbl_hour'+str(i)] = log['USRID']
161 |               feat = pd.pivot_table(log,index=['USRID','EVT_LBL'],values='diff_lbl_hour'+str(i),aggfunc='count').reset_index()
162 |               feat['diff_lbl_hour'+str(i)] = feat['USRID']
163 |               feat = pd.pivot_table(feat,index=['USRID'],values='diff_lbl_hour'+str(i),aggfunc='count').reset_index()
164 |               ans = pd.merge(ans,feat,on='USRID',how='left')
165 |               #共交互多少个不同的LBL_0
166 |               log['diff_lbl_0_hour'+str(i)] = log['USRID']
167 |               feat = pd.pivot_table(log,index=['USRID','EVT_LBL_0'],values='diff_lbl_0_hour'+str(i),aggfunc='count').reset_index()
168 |               feat['diff_lbl_0_hour'+str(i)] = feat['USRID']
169 |               feat = pd.pivot_table(feat,index=['USRID'],values='diff_lbl_0_hour'+str(i),aggfunc='count').reset_index()
170 |               ans = pd.merge(ans,feat,on='USRID',how='left')
171 |               #共交互多少个不同的LBL_1
172 |               log['diff_lbl_1_hour'+str(i)] = log['USRID']
173 |               feat = pd.pivot_table(log,index=['USRID','EVT_LBL_1'],values='diff_lbl_1_hour'+str(i),aggfunc='count').reset_index()
174 |               feat['diff_lbl_1_hour'+str(i)] = feat['USRID']
175 |               feat = pd.pivot_table(feat,index=['USRID'],values='diff_lbl_1_hour'+str(i),aggfunc='count').reset_index()
176 |               ans = pd.merge(ans,feat,on='USRID',how='left')
177 |               #共交互多少个不同的LBL_2
178 |               log['diff_lbl_2_hour'+str(i)] = log['USRID']
179 |               feat = pd.pivot_table(log,index=['USRID','EVT_LBL_2'],values='diff_lbl_2_hour'+str(i),aggfunc='count').reset_index()
180 |               feat['diff_lbl_2_hour'+str(i)] = feat['USRID']
181 |               feat = pd.pivot_table(feat,index=['USRID'],values='diff_lbl_2_hour'+str(i),aggfunc='count').reset_index()
182 |               ans = pd.merge(ans,feat,on='USRID',how='left')
183 |            
184 |               #统计type为0的次数
185 |               type_0 = log[log.TCH_TYP==0]
186 |               type_0['type_0_count_hour'+str(i)] = type_0['USRID']
187 |               feat = pd.pivot_table(type_0,index=['USRID'],values='type_0_count_hour'+str(i),aggfunc='count').reset_index()
188 |               ans = pd.merge(ans,feat,on='USRID',how='left')
189 |               del type_0
190 |               #统计type为2的次数
191 |               type_2 = log[log.TCH_TYP==2]
192 |               type_2['type_2_count_hour'+str(i)] = type_2['USRID']
193 |               feat = pd.pivot_table(type_2,index=['USRID'],values='type_2_count_hour'+str(i),aggfunc='count').reset_index()
194 |               ans = pd.merge(ans,feat,on='USRID',how='left')
195 |     return ans
196 | 
197 |     
198 | #%%
def modelXgb(train,test):
    """Fit XGBoost on all of ``train`` and score ``test``.

    Every column except ``USRID`` and ``FLAG`` is used as a feature, taken in
    column order, so ``train`` and ``test`` must share the same column
    layout. ``FLAG`` is the binary training label.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames that both contain ``USRID`` and ``FLAG``.

    Returns
    -------
    pandas.DataFrame
        Columns ``USRID`` and ``RST`` (predicted probability), sorted by
        ``RST`` in descending order.
    """
    non_features = ['USRID', 'FLAG']
    dtrain = xgb.DMatrix(train.drop(non_features, axis=1).values,
                         label=train['FLAG'].values)
    dtest = xgb.DMatrix(test.drop(non_features, axis=1).values)

    # Model parameters (trailing comments keep earlier settings).
    params = {'booster': 'gbtree',
              'objective':'binary:logistic',
              'eval_metric':'auc',
              'eta': 0.03,
              'max_depth': 6,  # 6
              'colsample_bytree': 0.8,#0.8
              'subsample': 0.8,
              'scale_pos_weight': 1,
              'min_child_weight': 18  # 2
              }
    # Train; the watchlist only reports AUC on the training data itself.
    watchlist = [(dtrain,'train')]
    bst = xgb.train(params, dtrain, num_boost_round=370,evals=watchlist)

    # Predict. An explicit copy is taken so that adding RST does not write
    # into a selection of ``test`` (SettingWithCopyWarning).
    test_xy = test[['USRID']].copy()
    test_xy['RST'] = bst.predict(dtest)
    return test_xy.sort_values('RST', ascending=False)
230 | 
231 | #%%
def oneHot(logData,ans):
    """Attach per-user occurrence counts of every ``EVT_LBL_0/1/2`` value.

    For each of the three label levels one column per distinct value is
    added, named ``<level>_oh_<value>`` (for example ``EVT_LBL_0_oh_12``).
    It holds how many log rows of that user carry the value, or NaN when the
    user never produced it. Despite the function name these are counts, not
    0/1 indicators.

    The columns are flat strings. The previous implementation merged the
    two-level columns produced by ``unstack()`` straight into the
    single-level ``ans``, which recent pandas releases reject.

    Parameters
    ----------
    logData : pandas.DataFrame
        Log table with ``USRID`` and ``EVT_LBL_0..2``; not modified.
    ans : pandas.DataFrame
        Frame with a ``USRID`` column to extend.

    Returns
    -------
    pandas.DataFrame
        ``ans`` left-joined with the count columns on ``USRID``.
    """
    for level in ('EVT_LBL_0', 'EVT_LBL_1', 'EVT_LBL_2'):
        # Rows: users, columns: label values, cells: occurrence counts.
        counts = logData.groupby(['USRID', level]).size().unstack()
        counts.columns = ['%s_oh_%s' % (level, value)
                          for value in counts.columns]
        ans = pd.merge(ans, counts.reset_index(), on='USRID', how='left')

    return ans
257 | 
258 | #%%
def validate(all_train):
    """Estimate the model's AUC with 5-fold stratified cross-validation.

    Every column of ``all_train`` except ``USRID`` and ``FLAG`` is a feature
    and ``FLAG`` is the label, with ``FLAG == 1`` as the positive class. Each
    fold is fitted and scored with ``xgb_model``; the fold indices, the
    predictions, the true labels and the fold AUC are printed, followed by
    the mean AUC.

    Parameters
    ----------
    all_train : pandas.DataFrame
        Labelled feature frame.

    Returns
    -------
    float
        Mean AUC over the five folds (also printed).
    """
    train_x = all_train.drop(['USRID', 'FLAG'], axis=1).values
    train_y = all_train['FLAG'].values
    auc_list = []

    # Fixed seed so the folds are reproducible between runs.
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)
    for train_index, test_index in skf.split(train_x, train_y):
        print('\n')
        print('Train: %s | test: %s' % (train_index, test_index))
        X_train, X_test = train_x[train_index], train_x[test_index]
        y_train, y_test = train_y[train_index], train_y[test_index]

        pred_value = xgb_model(X_train, y_train, X_test)
        print(pred_value)
        print(y_test)

        # Score the raw predictions against the raw labels. Shifting both by
        # +1 (and using pos_label=2) adds nothing and can round distinct
        # small scores to the same float, creating artificial ties.
        fpr, tpr, _ = roc_curve(y_test, pred_value, pos_label=1)

        auc = metrics.auc(fpr, tpr)
        print('auc value:',auc)
        auc_list.append(auc)

    mean_auc = np.mean(auc_list)
    print('validate result:',mean_auc)
    return mean_auc
288 |     
289 |     
def xgb_model(train_set_x,train_set_y,test_set_x):
    """Fit a 370-round XGBoost binary classifier and score ``test_set_x``.

    ``train_set_x`` / ``train_set_y`` are the training features and labels;
    the return value is whatever ``Booster.predict`` yields for
    ``test_set_x`` under the ``binary:logistic`` objective.
    """
    # Booster settings; identical to the ones used for the final model.
    booster_config = dict(
        booster='gbtree',
        objective='binary:logistic',
        eval_metric='auc',
        eta=0.03,
        max_depth=6,
        colsample_bytree=0.8,
        subsample=0.8,
        scale_pos_weight=1,
        min_child_weight=18,
    )
    fitted = xgb.train(
        booster_config,
        xgb.DMatrix(train_set_x, label=train_set_y),
        num_boost_round=370,
    )
    return fitted.predict(xgb.DMatrix(test_set_x))
307 | 
308 | #%%
309 | 
if __name__ == '__main__':
    # Script entry point: load data, build features, train, write predictions.

    # Load the raw tables.
    train_log, train_agg, train_flg, test_log, test_agg = loadData(path)
    # Test users get the placeholder label -1. ``assign`` builds a new frame
    # instead of writing a column into a selection of ``test_agg``.
    test_flg = test_agg[['USRID']].assign(FLAG=-1)

    # Stack train and test so features are computed on both in one pass.
    flg = pd.concat([train_flg, test_flg], axis=0)
    agg = pd.concat([train_agg, test_agg], axis=0)
    log = pd.concat([train_log, test_log], axis=0)

    # Features from the agg table.
    data = genFeatureAgg(flg, agg)
    # Features from the log table.
    data = genFeatureLog(data, log)
    # Per-value counts of the EVT_LBL parts from the log table.
    data = oneHot(log, data)

    # Split back into train and test using the placeholder label.
    trainset = data[data['FLAG'] != -1]
    testset = data[data['FLAG'] == -1]

    # Optional cross-validation.
#   validate(trainset)

    # Train the final model and write the ranked predictions.
    answer = modelXgb(trainset, testset)
    pd.Series(np.array(answer['RST'].values)).plot(figsize=(8, 8))
    answer.to_csv("yw.csv",index=None,sep='\t')
336 | 
337 | 
338 | 


--------------------------------------------------------------------------------