├── README.md ├── extract_feature.py ├── model2-cy.py ├── xgb.py ├── 奇点计划-决赛-X-Driver-答辩PPT.pdf └── 模型三 ├── fea_log.py ├── fea_tfidf.py ├── fea_word2vec.py ├── model.py ├── util_fea.py └── util_fs.py /README.md: -------------------------------------------------------------------------------- 1 | ## Competition link 2 | [Competition link](https://www.datafountain.cn/competitions/287/details/rule) 3 | 4 | 5 | ## Model 1 6 | Run order: extract_feature.py -> xgb.py 7 | 8 | You only need to change the file paths in the corresponding places before running (the directory layout the scripts assume is sketched below). 9 | 10 | ## Model 2 11 | model2-cy.py 12 | Run it directly. 13 | 14 | ## Model 3 15 | See the "模型三" folder for details. 16 | 17 |
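The relative paths hard-coded in the scripts (../orig_data/, ../fea/, ../submit/, ../result/) imply a working layout roughly like the one below; model2-cy.py is the exception and reads and writes its CSV files in its own working directory. Names other than those four directories are illustrative, not part of the repository.

project/
├── code/            <- this repository; run the scripts from here
├── orig_data/       <- train_agg.csv, train_log.csv, train_flg.csv, test_agg.csv, test_log.csv
├── fea/             <- feature files written by extract_feature.py and 模型三/fea_*.py
├── submit/          <- test_result.csv written by xgb.py
└── result/          <- test_result.csv written by 模型三/model.py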
18 | 最终成绩为三个队友模型融合的成绩 ,成绩为:0.86366999 19 | -------------------------------------------------------------------------------- /extract_feature.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | from collections import Counter 4 | import scipy.stats as sp 5 | import time 6 | import datetime 7 | 8 | 9 | 10 | def get_continue_launch_count(strs,parm): 11 | time = strs.split(":") 12 | time = dict(Counter(time)) 13 | time = sorted(time.items(), key=lambda x: x[0], reverse=False) 14 | key_list = [] 15 | value_list = [] 16 | if len(time) == 1: 17 | return -2 18 | for key,value in dict(time).items(): 19 | key_list.append(int(key)) 20 | value_list.append(int(value)) 21 | 22 | if np.mean(np.diff(key_list, 1)) == 1: 23 | if parm == '1': 24 | return np.mean(value_list) 25 | elif parm == '2': 26 | return np.max(value_list) 27 | elif parm == '3': 28 | return np.min(value_list) 29 | elif parm == '4': 30 | return np.sum(value_list) 31 | elif parm == '5': 32 | return np.std(value_list) 33 | else: 34 | return -1 35 | 36 | 37 | 38 | def get_time_gap(strs,parm): 39 | time = strs.split(":") 40 | time = list(set(time)) 41 | time = sorted(list(map(lambda x:int(x),time))) 42 | time_gap = [] 43 | #用户只在当天活跃 44 | if len(time) == 1: 45 | return -20 46 | 47 | for index, value in enumerate(time): 48 | if index <= len(time) - 2: 49 | gap = abs(time[index] - time[index + 1]) 50 | time_gap.append(gap) 51 | 52 | if parm == '1': 53 | return np.mean(time_gap) 54 | elif parm == '2': 55 | return np.max(time_gap) 56 | elif parm == '3': 57 | return np.min(time_gap) 58 | elif parm == '4': 59 | return np.std(time_gap) 60 | elif parm == '5': 61 | return sp.stats.skew(time_gap) 62 | elif parm == '6': 63 | return sp.stats.kurtosis(time_gap) 64 | 65 | 66 | def get_week(day): 67 | day = int(day) 68 | if day >= 1 and day <= 7: 69 | return 1 70 | 71 | if day >= 8 and day <= 14: 72 | return 2 73 | 74 | if day >= 15 and day <= 21: 75 | return 3 76 | 77 | if day >= 22 and day <= 28: 78 | return 4 79 | 80 | if day >= 28: 81 | return 5 82 | 83 | 84 | def cur_day_repeat_count(strs): 85 | time = strs.split(":") 86 | time = dict(Counter(time)) 87 | time = sorted(time.items(), key=lambda x: x[1], reverse=False) 88 | # 一天一次启动 89 | if (len(time) == 1) & (time[0][1] == 1): 90 | return 0 91 | # 一天多次启动 92 | elif (len(time) == 1) & (time[0][1] > 1): 93 | return 1 94 | # 多天多次启动 95 | elif (len(time) > 1) & (time[0][1] >= 2): 96 | return 2 97 | else: 98 | return 3 99 | 100 | 101 | def get_lianxu_day(day_list): 102 | time = day_list.split(":") 103 | time = list(map(lambda x:int(x),time)) 104 | m = np.array(time) 105 | if len(set(m)) == 1: 106 | return -1 107 | m = list(set(m)) 108 | if len(m) == 0: 109 | return -20 110 | n = np.where(np.diff(m) == 1)[0] 111 | i = 0 112 | result = [] 113 | while i < len(n) - 1: 114 | state = 1 115 | while n[i + 1] - n[i] == 1: 116 | state += 1 117 | i += 1 118 | if i == len(n) - 1: 119 | break 120 | if state == 1: 121 | i += 1 122 | result.append(2) 123 | else: 124 | i += 1 125 | result.append(state + 1) 126 | if len(n) == 1: 127 | result.append(2) 128 | if len(result) != 0: 129 | # print(result) 130 | return np.max(result) 131 | 132 | 133 | def load_csv(): 134 | train_agg = pd.read_csv('../orig_data/train_agg.csv',sep='\t') 135 | train_log = pd.read_csv('../orig_data/train_log.csv', sep='\t') 136 | train_flg = pd.read_csv('../orig_data/train_flg.csv', sep='\t') 137 | 138 | test_agg = pd.read_csv('../orig_data/test_agg.csv', sep='\t') 139 | 
test_log = pd.read_csv('../orig_data/test_log.csv', sep='\t') 140 | 141 | return train_agg,train_log,train_flg,test_agg,test_log 142 | 143 | 144 | 145 | 146 | def merge_table(train_agg, train_log, train_flg, test_agg, test_log): 147 | train_log['label'] = 1 148 | test_log['label'] = 0 149 | 150 | data = pd.concat([train_log,test_log],axis=0) 151 | data = extract_feature(data) 152 | 153 | train_log = data[data.label == 1] 154 | test_log = data[data.label == 0] 155 | 156 | del train_log['label'] 157 | del test_log['label'] 158 | 159 | all_train = pd.merge(train_flg, train_agg, on=['USRID'], how='left') 160 | train = pd.merge(all_train,train_log,on='USRID',how='left') 161 | test = pd.merge(test_agg,test_log,on='USRID',how='left') 162 | 163 | return train,test 164 | 165 | 166 | def extract_feature(data): 167 | data['cate_1'] = data['EVT_LBL'].apply(lambda x: int(x.split('-')[0])) 168 | data['cate_2'] = data['EVT_LBL'].apply(lambda x: int(x.split('-')[1])) 169 | data['cate_3'] = data['EVT_LBL'].apply(lambda x: int(x.split('-')[2])) 170 | data['day'] = data['OCC_TIM'].apply(lambda x: int(x[8:10])) 171 | data['hour'] = data['OCC_TIM'].apply(lambda x: int(x[11:13])) 172 | data['week'] = data['day'].apply(get_week) 173 | 174 | 175 | feat1 = data.groupby(['USRID'], as_index=False)['OCC_TIM'].agg({"user_count": "count"}) 176 | feat2 = data.groupby(['USRID'], as_index=False)['day'].agg({"user_act_day_count": "nunique"}) 177 | feat3 = data[['USRID', 'day']] 178 | feat3['day'] = feat3['day'].astype('str') 179 | feat3 = feat3.groupby(['USRID'])['day'].agg(lambda x: ':'.join(x)).reset_index() 180 | feat3.rename(columns={'day': 'act_list'}, inplace=True) 181 | # 用户是否多天有多次启动(均值) 182 | feat3['time_gap_mean'] = feat3['act_list'].apply(get_time_gap,args=('1')) 183 | # 最大 184 | feat3['time_gap_max'] = feat3['act_list'].apply(get_time_gap,args=('2')) 185 | # 最小 186 | feat3['time_gap_min'] = feat3['act_list'].apply(get_time_gap,args=('3')) 187 | # 方差 188 | feat3['time_gap_std'] = feat3['act_list'].apply(get_time_gap,args=('4')) 189 | # 锋度 190 | feat3['time_gap_skew'] = feat3['act_list'].apply(get_time_gap, args=('5')) 191 | # 偏度 192 | feat3['time_gap_kurt'] = feat3['act_list'].apply(get_time_gap, args=('6')) 193 | # 平均行为次数 194 | feat3['mean_act_count'] = feat3['act_list'].apply(lambda x: len(x.split(":")) / len(set(x.split(":")))) 195 | # 平均行为日期 196 | feat3['act_mean_date'] = feat3['act_list'].apply(lambda x: np.sum([int(ele) for ele in x.split(":")]) / len(x.split(":"))) 197 | # 活动天数占当月的比率 198 | # feat3['act_rate'] = feat3['act_list'].apply(lambda x: len(list(set(x.split(":")))) / 31) 199 | # 用户是否当天有多次启动 200 | feat3['cur_day_repeat_count'] = feat3['act_list'].apply(cur_day_repeat_count) 201 | # 连续几天启动次数的均值, 202 | feat3['con_act_day_count_mean'] = feat3['act_list'].apply(get_continue_launch_count, args=('1')) 203 | # 最大值, 204 | feat3['con_act_day_count_max'] = feat3['act_list'].apply(get_continue_launch_count, args=('2')) 205 | # 最小值 206 | feat3['con_act_day_count_min'] = feat3['act_list'].apply(get_continue_launch_count, args=('3')) 207 | # 次数 208 | feat3['con_act_day_count_total'] = feat3['act_list'].apply(get_continue_launch_count, args=('4')) 209 | # 方差 210 | feat3['con_act_day_count_std'] = feat3['act_list'].apply(get_continue_launch_count, args=('5')) 211 | feat3['con_act_max'] = feat3['act_list'].apply(get_lianxu_day) 212 | del feat3['act_list'] 213 | 214 | # 用户发生行为的天数 215 | feat4 = data.groupby(['USRID'], as_index=False)['cate_1'].agg({'user_cate_1_count': "count"}) 216 | feat5 = data.groupby(['USRID'], 
as_index=False)['cate_2'].agg({'user_cate_2_count': "count"}) 217 | feat6 = data.groupby(['USRID'], as_index=False)['cate_3'].agg({'user_cate_3_count': "count"}) 218 | 219 | # 判断时期是否为高峰日 220 | higt_act_day_list = [7, 14, 21, 28] 221 | feat8 = data[['USRID', 'day']] 222 | feat8['is_higt_act'] = feat8['day'].apply(lambda x: 1 if x in higt_act_day_list else 0) 223 | feat8 = feat8.drop_duplicates(subset=['USRID']) 224 | 225 | 226 | feat10 = data.groupby(['USRID','day'], as_index=False)['TCH_TYP'].agg({'user_per_count': "count"}) 227 | feat10_copy = feat10.copy() 228 | # 用户平均每天启动次数 229 | feat11 = feat10_copy.groupby(['USRID'],as_index=False)['user_per_count'].agg({"user_per_count_mean":"mean"}) 230 | # 用户启动次数最大值 231 | feat12 = feat10_copy.groupby(['USRID'], as_index=False)['user_per_count'].agg({"user_per_count_max": "max"}) 232 | # 用户启动次数最小值 233 | feat13 = feat10_copy.groupby(['USRID'], as_index=False)['user_per_count'].agg({"user_per_count_min": "min"}) 234 | # 用户每天启动次数的众值 235 | feat14 = feat10_copy.groupby(['USRID'], as_index=False)['user_per_count'].agg({"user_mode_count":lambda x: x.value_counts().index[0]}) 236 | # 方差 237 | feat15 = feat10_copy.groupby(['USRID'], as_index=False)['user_per_count'].agg({"user_std_count":np.std}) 238 | # 峰度 239 | feat16 = feat10_copy.groupby(['USRID'], as_index=False)['user_per_count'].agg({"user_skew_count": sp.stats.skew}) 240 | # 偏度 241 | feat17 = feat10_copy.groupby(['USRID'], as_index=False)['user_per_count'].agg({"user_kurt_count": sp.stats.kurtosis}) 242 | # 中位数 243 | feat18 = feat10_copy.groupby(['USRID'], as_index=False)['user_per_count'].agg({"user_median_count": np.median}) 244 | 245 | feat27 = data[['USRID', 'OCC_TIM']] 246 | feat27['OCC_TIM'] = feat27['OCC_TIM'].apply(lambda x: time.mktime(time.strptime(x, "%Y-%m-%d %H:%M:%S"))) 247 | log = feat27.sort_values(['USRID', 'OCC_TIM']) 248 | log['next_time'] = log.groupby(['USRID'])['OCC_TIM'].diff(-1).apply(np.abs) 249 | log = log.groupby(['USRID'], as_index=False)['next_time'].agg({ 250 | 'next_time_mean': np.mean, 251 | 'next_time_std': np.std, 252 | 'next_time_min': np.min, 253 | 'next_time_max': np.max 254 | }) 255 | 256 | # 每周的平均消费次数 257 | feat28_sp = data.groupby(['USRID','week'], as_index=False)['TCH_TYP'].agg({'user_per_week_count': "count"}) 258 | feat28_sp_copy = feat28_sp.copy() 259 | # 用户平均每天启动次数 260 | feat11_sp = feat28_sp_copy.groupby(['USRID'], as_index=False)['user_per_week_count'].agg({"user_per_week_count_mean": "mean"}) 261 | # 用户启动次数最大值 262 | feat12_sp = feat28_sp_copy.groupby(['USRID'], as_index=False)['user_per_week_count'].agg({"user_per_week_count_max": "max"}) 263 | # 用户启动次数最小值 264 | feat13_sp = feat28_sp_copy.groupby(['USRID'], as_index=False)['user_per_week_count'].agg({"user_per_week_count_min": "min"}) 265 | # 用户每天启动次数的众值 266 | feat14_sp = feat28_sp_copy.groupby(['USRID'], as_index=False)['user_per_week_count'].agg({"user_per_week_count_mode": lambda x: x.value_counts().index[0]}) 267 | # 方差 268 | feat15_sp = feat28_sp_copy.groupby(['USRID'], as_index=False)['user_per_week_count'].agg({"user_per_week_count_std": np.std}) 269 | # 峰度 270 | feat16_sp = feat28_sp_copy.groupby(['USRID'], as_index=False)['user_per_week_count'].agg({"user_per_week_count_skew": sp.stats.skew}) 271 | # 偏度 272 | feat17_sp = feat28_sp_copy.groupby(['USRID'], as_index=False)['user_per_week_count'].agg({"user_per_week_count_kurt": sp.stats.kurtosis}) 273 | # 中位数 274 | feat18_sp = feat28_sp_copy.groupby(['USRID'], as_index=False)['user_per_week_count'].agg({"user_per_week_count_median": np.median}) 
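    # Side note (not part of the original pipeline): the .agg({'new_name': func}) renaming form used
    # throughout this function was deprecated in pandas 0.20 and removed in pandas 1.0. On newer
    # pandas the same weekly statistics can be expressed with named aggregation; the sketch below is
    # left commented out because the rest of this script relies on the older API:
    # feat_week_stats = feat28_sp_copy.groupby('USRID', as_index=False).agg(
    #     week_cnt_mean=('user_per_week_count', 'mean'),
    #     week_cnt_max=('user_per_week_count', 'max'),
    #     week_cnt_min=('user_per_week_count', 'min'),
    #     week_cnt_std=('user_per_week_count', 'std'),
    #     week_cnt_median=('user_per_week_count', 'median'),
    #     week_cnt_skew=('user_per_week_count', sp.stats.skew),
    #     week_cnt_kurt=('user_per_week_count', sp.stats.kurtosis))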
275 | 276 | 277 | 278 | 279 | # 离周末越近,越消费的可能性比较大,统计前2天的特征 280 | before_three = data[(data.day >= 28) & (data.day <= 31)] 281 | before_three_copy = before_three.copy() 282 | 283 | feat1_before = before_three_copy.groupby(['USRID'], as_index=False)['OCC_TIM'].agg({"user_count_before": "count"}) 284 | feat2_before = before_three_copy.groupby(['USRID'], as_index=False)['day'].agg({"user_act_day_count_before": "nunique"}) 285 | feat3_before = before_three_copy[['USRID', 'day']] 286 | feat3_before['day'] = feat3_before['day'].astype('str') 287 | feat3_before = feat3_before.groupby(['USRID'])['day'].agg(lambda x: ':'.join(x)).reset_index() 288 | feat3_before.rename(columns={'day': 'act_list'}, inplace=True) 289 | # 用户是否多天有多次启动(均值) 290 | feat3_before['before_time_gap_mean'] = feat3_before['act_list'].apply(get_time_gap, args=('1')) 291 | # 最大 292 | feat3_before['before_time_gap_max'] = feat3_before['act_list'].apply(get_time_gap, args=('2')) 293 | # 最小 294 | feat3_before['before_time_gap_min'] = feat3_before['act_list'].apply(get_time_gap, args=('3')) 295 | # 方差 296 | feat3_before['before_time_gap_std'] = feat3_before['act_list'].apply(get_time_gap, args=('4')) 297 | # 锋度 298 | feat3_before['before_time_gap_skew'] = feat3_before['act_list'].apply(get_time_gap, args=('5')) 299 | # 偏度 300 | feat3_before['before_time_gap_kurt'] = feat3_before['act_list'].apply(get_time_gap, args=('6')) 301 | # 平均行为次数 302 | feat3_before['before_mean_act_count'] = feat3_before['act_list'].apply(lambda x: len(x.split(":")) / len(set(x.split(":")))) 303 | # 平均行为日期 304 | feat3_before['before_act_mean_date'] = feat3_before['act_list'].apply(lambda x: np.sum([int(ele) for ele in x.split(":")]) / len(x.split(":"))) 305 | # 用户是否当天有多次启动 306 | feat3_before['before_cur_day_repeat_count'] = feat3_before['act_list'].apply(cur_day_repeat_count) 307 | # 连续几天启动次数的均值, 308 | feat3_before['before_con_act_day_count_mean'] = feat3_before['act_list'].apply(get_continue_launch_count, args=('1')) 309 | # 最大值, 310 | feat3_before['before_con_act_day_count_max'] = feat3_before['act_list'].apply(get_continue_launch_count, args=('2')) 311 | # 最小值 312 | feat3_before['before_con_act_day_count_min'] = feat3_before['act_list'].apply(get_continue_launch_count, args=('3')) 313 | # 次数 314 | feat3_before['before_con_act_day_count_total'] = feat3_before['act_list'].apply(get_continue_launch_count, args=('4')) 315 | # 方差 316 | feat3_before['before_con_act_day_count_std'] = feat3_before['act_list'].apply(get_continue_launch_count, args=('5')) 317 | feat3_before['before_con_act_max'] = feat3_before['act_list'].apply(get_lianxu_day) 318 | del feat3_before['act_list'] 319 | 320 | # 用户发生行为的天数 321 | feat4_before = before_three.groupby(['USRID'], as_index=False)['cate_1'].agg({'before_user_cate_1_count': "count"}) 322 | feat5_before = before_three.groupby(['USRID'], as_index=False)['cate_2'].agg({'before_user_cate_2_count': "count"}) 323 | feat6_before = before_three.groupby(['USRID'], as_index=False)['cate_3'].agg({'before_user_cate_3_count': "count"}) 324 | 325 | 326 | feat28 = pd.crosstab(data['USRID'],data['TCH_TYP']).reset_index() 327 | feat29 = pd.crosstab(data.USRID,data.cate_1).reset_index() 328 | feat30 = pd.crosstab(data.USRID, data.cate_2).reset_index() 329 | feat31 = pd.crosstab(data.USRID, data.cate_3).reset_index() 330 | feat32 = pd.crosstab(data.USRID,data.hour).reset_index() 331 | feat34 = pd.crosstab(data.USRID,data.week).reset_index() 332 | 333 | 334 | 335 | 336 | 337 | 338 | 339 | 340 | 341 | 342 | 343 | 344 | data = data[['USRID','label']] 345 
| data = data.drop_duplicates(subset='USRID') 346 | data = pd.merge(data, feat1, on=['USRID'], how='left') 347 | data = pd.merge(data, feat2, on=['USRID'], how='left') 348 | data = pd.merge(data, feat3, on=['USRID'], how='left') 349 | data = pd.merge(data, feat4, on=['USRID'], how='left') 350 | data = pd.merge(data, feat5, on=['USRID'], how='left') 351 | data = pd.merge(data, feat6, on=['USRID'], how='left') 352 | data = pd.merge(data, feat8, on=['USRID'], how='left') 353 | data = pd.merge(data, feat11, on=['USRID'], how='left') 354 | data = pd.merge(data, feat12, on=['USRID'], how='left') 355 | data = pd.merge(data, feat13, on=['USRID'], how='left') 356 | data = pd.merge(data, feat14, on=['USRID'], how='left') 357 | data = pd.merge(data, feat15, on=['USRID'], how='left') 358 | data = pd.merge(data, feat16, on=['USRID'], how='left') 359 | data = pd.merge(data, feat17, on=['USRID'], how='left') 360 | data = pd.merge(data, feat18, on=['USRID'], how='left') 361 | data = pd.merge(data, log, on=['USRID'], how='left') 362 | data = pd.merge(data, feat28, on=['USRID'], how='left') 363 | data = pd.merge(data, feat29, on=['USRID'], how='left') 364 | data = pd.merge(data, feat30, on=['USRID'], how='left') 365 | data = pd.merge(data, feat31, on=['USRID'], how='left') 366 | data = pd.merge(data, feat32, on=['USRID'], how='left') 367 | data = pd.merge(data, feat34, on=['USRID'], how='left') 368 | 369 | data = pd.merge(data, feat11_sp, on=['USRID'], how='left') 370 | data = pd.merge(data, feat12_sp, on=['USRID'], how='left') 371 | data = pd.merge(data, feat13_sp, on=['USRID'], how='left') 372 | data = pd.merge(data, feat14_sp, on=['USRID'], how='left') 373 | data = pd.merge(data, feat15_sp, on=['USRID'], how='left') 374 | data = pd.merge(data, feat16_sp, on=['USRID'], how='left') 375 | data = pd.merge(data, feat17_sp, on=['USRID'], how='left') 376 | data = pd.merge(data, feat18_sp, on=['USRID'], how='left') 377 | 378 | data = pd.merge(data, feat1_before, on=['USRID'], how='left') 379 | data = pd.merge(data, feat2_before, on=['USRID'], how='left') 380 | data = pd.merge(data, feat3_before, on=['USRID'], how='left') 381 | data = pd.merge(data, feat4_before, on=['USRID'], how='left') 382 | data = pd.merge(data, feat5_before, on=['USRID'], how='left') 383 | data = pd.merge(data, feat6_before, on=['USRID'], how='left') 384 | 385 | return data 386 | 387 | 388 | def main(): 389 | train_agg, train_log, train_flg, test_agg, test_log = load_csv() 390 | train, test = merge_table(train_agg, train_log, train_flg, test_agg, test_log) 391 | train.to_csv('../fea/train.csv',sep='\t',index=None) 392 | test.to_csv('../fea/test.csv', sep='\t', index=None) 393 | 394 | 395 | if __name__ == '__main__': 396 | main() -------------------------------------------------------------------------------- /model2-cy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | #用来划分训练集和验证集 3 | import pandas as pd 4 | 5 | import xgboost as xgb 6 | import operator 7 | from functools import reduce 8 | import numpy as np 9 | import scipy.stats as sp 10 | import pandas as pd 11 | train_user_info = pd.read_csv(r'train_agg.csv',sep='\t') 12 | train_app_log = pd.read_csv(r'train_log.csv',sep='\t') 13 | 14 | train_flag = pd.read_csv(r'train_flg.csv',sep='\t') 15 | 16 | test_user_info = pd.read_csv(r'test_agg.csv',sep='\t') 17 | test_app_log = pd.read_csv(r'test_log.csv',sep='\t') 18 | 19 | #添加week 20 | train_app_log['week'] =pd.to_datetime(train_app_log.OCC_TIM) 21 | train_app_log.week = 
list(map(lambda x:x.weekday(),train_app_log.week)) 22 | test_app_log['week'] =pd.to_datetime(test_app_log.OCC_TIM) 23 | test_app_log.week = list(map(lambda x:x.weekday(),test_app_log.week)) 24 | 25 | 26 | # 切割字符串--训练集 27 | temp=train_app_log['EVT_LBL'].str.split('-') 28 | temp1 = list(map(lambda line: line[0],temp)) 29 | temp2 = list(map(lambda line: line[1],temp)) 30 | temp3 = list(map(lambda line: line[2],temp)) 31 | train_app_log['EVT_LBL_1'] = temp1 32 | train_app_log['EVT_LBL_2'] = temp2 33 | train_app_log['EVT_LBL_3'] = temp3 34 | 35 | 36 | temptemp=train_app_log['OCC_TIM'].str.split(' ') 37 | # 加日 38 | temp = list(map(lambda line: line[0], temptemp)) 39 | train_app_log['time'] = temp 40 | time = train_app_log['time'].str.split('-') 41 | day = list(map(lambda line: line[2], time)) 42 | train_app_log['day'] = day 43 | del train_app_log['time'] 44 | # 加时分秒 45 | temp = list(map(lambda line: line[1], temptemp)) 46 | train_app_log['time'] = temp 47 | time = train_app_log['time'].str.split(':') 48 | hour = list(map(lambda line: line[0], time)) 49 | minu = list(map(lambda line: line[1], time)) 50 | sec = list(map(lambda line: line[2], time)) 51 | train_app_log['hour'] = hour 52 | train_app_log['minu'] = minu 53 | train_app_log['sec'] = sec 54 | del train_app_log['time'] 55 | train_app_log.hour = list(map(lambda x:int(x),train_app_log.hour)) 56 | train_app_log.minu = list(map(lambda x:int(x),train_app_log.minu)) 57 | train_app_log.sec = list(map(lambda x:int(x),train_app_log.sec)) 58 | train_app_log.day = list(map(lambda x:int(x),train_app_log.day)) 59 | 60 | # 切割字符串--测试集 61 | temp=test_app_log['EVT_LBL'].str.split('-') 62 | temp1 = list(map(lambda line: line[0], temp)) 63 | temp2 = list(map(lambda line: line[1], temp)) 64 | temp3 = list(map(lambda line: line[2], temp)) 65 | test_app_log['EVT_LBL_1'] = temp1 66 | test_app_log['EVT_LBL_2'] = temp2 67 | test_app_log['EVT_LBL_3'] = temp3 68 | 69 | 70 | temptemp=test_app_log['OCC_TIM'].str.split(' ') 71 | # 加日 72 | temp = list(map(lambda line: line[0], temptemp)) 73 | test_app_log['time'] = temp 74 | time = test_app_log['time'].str.split('-') 75 | day = list(map(lambda line: line[2], time)) 76 | test_app_log['day'] = day 77 | del test_app_log['time'] 78 | # 加时分秒 79 | temp = list(map(lambda line: line[1], temptemp)) 80 | test_app_log['time'] = temp 81 | time = test_app_log['time'].str.split(':') 82 | hour = list(map(lambda line: line[0], time)) 83 | minu = list(map(lambda line: line[1], time)) 84 | sec = list(map(lambda line: line[2], time)) 85 | test_app_log['hour'] = hour 86 | test_app_log['minu'] = minu 87 | test_app_log['sec'] = sec 88 | del test_app_log['time'] 89 | test_app_log.hour = list(map(lambda x:int(x),test_app_log.hour)) 90 | test_app_log.minu = list(map(lambda x:int(x),test_app_log.minu)) 91 | test_app_log.sec = list(map(lambda x:int(x),test_app_log.sec)) 92 | test_app_log.day = list(map(lambda x:int(x),test_app_log.day)) 93 | def get_av_time_dis(x):#apply 94 | 95 | x=x.str_day 96 | if x!=x: 97 | x='-1' 98 | # print(x) 99 | day = x.split(':') 100 | day = list(set(day)) 101 | day = list(map(lambda x:float(x),day)) 102 | day.sort() 103 | if day is None or len(day) == 0: 104 | return 0 105 | m={} 106 | res = 0 107 | for i in day: 108 | if i not in m: 109 | l=0 110 | r=0 111 | if i-1 in m: 112 | l = m[i-1] 113 | if i+1 in m: 114 | r = m[i+1] 115 | m[i] = 1+r+l 116 | m[i+r] = 1+r+l 117 | m[i-l] = 1+r+l 118 | res = max(res,m[i]) 119 | return res 120 | # 训练 121 | # xgboost 122 | def xgboosts(df_train,df_test,df_eval): 123 | 124 | 125 | 
print('xgb---training') 126 | # XGB 'shop_star_level','shop_review_num_level','context_page_id','item_pv_level','item_collected_level','item_sales_level','item_price_level','user_star_level','user_occupation_id','user_age_level','item_category_list3','item_category_list2','item_category_list1','item_city_id','item_brand_id','context_id', 127 | feature1 = [x for x in df_train.columns if x not in ['USRID','EVT_LBL','OCC_TIM','TCH_TYP','FLAG']] 128 | feature2 = [x for x in df_test.columns if x not in ['USRID','EVT_LBL','OCC_TIM','TCH_TYP','FLAG']] 129 | feature = [v for v in feature1 if v in feature2] 130 | 131 | dtrain = xgb.DMatrix(df_train[feature].values,df_train['FLAG'].values) 132 | dpre = xgb.DMatrix(df_test[feature].values) 133 | deva = xgb.DMatrix(df_eval[feature].values,df_eval['FLAG'].values) 134 | deva2 = xgb.DMatrix(df_eval[feature].values) 135 | param = {'max_depth': 5, 136 | 'eta': 0.02, 137 | # 'objective': 'binary:logistic', 138 | 'objective': 'rank:pairwise', 139 | 'eval_metric': 'auc', 140 | 'colsample_bytree':0.8, 141 | 'subsample':0.8, 142 | 'scale_pos_weight':1, 143 | # 'booster':'gblinear', 144 | 'silent':1, 145 | 'min_child_weight':18 146 | } 147 | # param['nthread'] =5 148 | print('xxxxxx') 149 | watchlist = [(deva, 'eval'), (dtrain, 'train')] 150 | num_round =600 151 | bst = xgb.train(param, dtrain, num_round, watchlist) 152 | print('xxxxxx') 153 | # 进行预测 154 | # dtest= xgb.DMatrix(predict) 155 | preds2 = bst.predict(dpre) 156 | # 保存整体结果。 157 | predict = df_test[['USRID']] 158 | predict['rst'] = preds2 159 | # temp = predict.drop_duplicates(['user_id']) # 去重 160 | predict.to_csv('test_result.csv', encoding='utf-8', index=None,sep='\t') 161 | # 提取特征 162 | def getF(user_info,data): 163 | # 1、用户总的点击次数 164 | temp = data.groupby(['USRID'])['EVT_LBL'].agg({'user_sum': np.size}) # 165 | temp = temp.reset_index() 166 | result = pd.merge(user_info, temp, on=['USRID'], how='left') # 167 | # 2、取第一级模块,用户对各个模块的点击数 168 | data['EVT_LBL_1_new1'] = list(map(lambda x: 'EVT_LBL_1_new1' + str(x), data.EVT_LBL_1)) 169 | temp = pd.crosstab(data.USRID,data.EVT_LBL_1_new1).reset_index() 170 | del data['EVT_LBL_1_new1'] 171 | result = pd.merge(result, temp, on=['USRID'], how='left') # 172 | # 3、取第二级模块,用户对各个模块的点击数 173 | data['EVT_LBL_2_new1'] = list(map(lambda x: 'EVT_LBL_2_new1' + str(x), data.EVT_LBL_1)) 174 | temp = pd.crosstab(data.USRID,data.EVT_LBL_2_new1).reset_index() 175 | del data['EVT_LBL_2_new1'] 176 | result = pd.merge(result, temp, on=['USRID'], how='left') # 177 | # 3、取第三级模块,用户对各个模块的点击数 178 | data['EVT_LBL_3_new1'] = list(map(lambda x: 'EVT_LBL_3_new1' + str(x), data.EVT_LBL_1)) 179 | temp = pd.crosstab(data.USRID,data.EVT_LBL_3_new1).reset_index() 180 | del data['EVT_LBL_3_new1'] 181 | result = pd.merge(result, temp, on=['USRID'], how='left') # 182 | 183 | # 6、各个用户在各个小时的点击量,离散 184 | data['hour_new1'] = list(map(lambda x: 'hour_new1' + str(x), data.hour)) 185 | temp = pd.crosstab(data.USRID,data.hour_new1).reset_index() 186 | del data['hour_new1'] 187 | result = pd.merge(result, temp, on=['USRID'], how='left') # 188 | 189 | 190 | 191 | # 7、各个用户在各个星期几的点击量,离散 192 | data['week_new1'] = list(map(lambda x: 'week_new1' + str(x), data.week)) 193 | temp = pd.crosstab(data.USRID,data.week_new1).reset_index() 194 | del data['week_new1'] 195 | result = pd.merge(result, temp, on=['USRID'], how='left') # 196 | 197 | 198 | # 9、用户的平均点击时间间隔,最大时间间隔,最小时间间隔, 199 | temp = data.sort_values(['OCC_TIM'], ascending=True) 200 | temp['OCC_TIM'] =pd.to_datetime(temp.OCC_TIM) 201 | temp['next_time'] = 
temp.groupby(['USRID'])['OCC_TIM'].diff(1) 202 | temp['next_time'] = temp['next_time']/np.timedelta64(1,'s') 203 | temp2=temp 204 | # average 205 | temp = temp.groupby(['USRID'])['next_time'].agg({'avg_time': np.mean}) 206 | temp = temp.reset_index() 207 | result = pd.merge(result, temp, on=['USRID'], how='left') # 208 | # median 209 | temp = temp2.groupby(['USRID'])['next_time'].agg({'medain_time': np.median}) 210 | temp = temp.reset_index() 211 | result = pd.merge(result, temp, on=['USRID'], how='left') # 212 | # max 213 | temp = temp2.groupby(['USRID'])['next_time'].agg({'max_time': np.max}) 214 | temp = temp.reset_index() 215 | result = pd.merge(result, temp, on=['USRID'], how='left') # 216 | # min 217 | temp = temp2.groupby(['USRID'])['next_time'].agg({'min_time': np.min}) 218 | temp = temp.reset_index() 219 | result = pd.merge(result, temp, on=['USRID'], how='left') # 220 | # sp.stats.skew 偏度 221 | temp = temp2.groupby(['USRID'])['next_time'].agg({'skew_time': sp.stats.skew}) 222 | temp = temp.reset_index() 223 | result = pd.merge(result, temp, on=['USRID'], how='left') # 224 | # sp.stats.kurtosis 峰度 225 | temp = temp2.groupby(['USRID'])['next_time'].agg({'kurt_time': sp.stats.kurtosis}) 226 | temp = temp.reset_index() 227 | result = pd.merge(result, temp, on=['USRID'], how='left') # 228 | 229 | # 10、用户有多少天点击/有多少天点击多次 230 | temp = data.drop_duplicates(['USRID','day'])#去重 231 | temp = temp.groupby(['USRID'])['day'].agg({'howmany_day_click': np.size}) 232 | temp = temp.reset_index() 233 | result = pd.merge(result, temp, on=['USRID'], how='left') # 234 | 235 | # 12、用户是否重复点击过同一个模块 236 | # 14、用户对于各个事件类型历史发生的数量。 237 | data['TCH_TYP_new1'] = list(map(lambda x: 'TCH_TYP_new1' + str(x), data.TCH_TYP)) 238 | temp = pd.crosstab(data.USRID,data.TCH_TYP_new1).reset_index() 239 | result = pd.merge(result, temp, on=['USRID'], how='left') # 240 | 241 | # 用户每天平均点击量 242 | temp = data.groupby(['USRID','day'])['EVT_LBL'].agg({'day_user_sum': np.size}) # 243 | temp = temp.reset_index() 244 | data = pd.merge(data, temp, on=['USRID','day'], how='left') # 245 | temp = data.drop_duplicates(['USRID','day'])#去重 246 | temp = temp.groupby(['USRID'])['day_user_sum'].agg({'day_user_mean': np.mean}) # 247 | temp = temp.reset_index() 248 | result = pd.merge(result, temp, on=['USRID'], how='left') # 249 | # 用户每天点击量方差 250 | temp = data.drop_duplicates(['USRID','day'])#去重 251 | temp = temp.groupby(['USRID'])['day_user_sum'].agg({'day_user_var': np.var}) # 252 | temp = temp.reset_index() 253 | result = pd.merge(result, temp, on=['USRID'], how='left') # 254 | # 用户每天点击量标准差 255 | temp = data.drop_duplicates(['USRID','day'])#去重 256 | temp = temp.groupby(['USRID'])['day_user_sum'].agg({'day_user_std': np.std}) # 257 | temp = temp.reset_index() 258 | result = pd.merge(result, temp, on=['USRID'], how='left') # 259 | # 用户每天点击量中位数 260 | temp = data.drop_duplicates(['USRID','day'])#去重 261 | temp = temp.groupby(['USRID'])['day_user_sum'].agg({'day_user_median': np.median}) # 262 | temp = temp.reset_index() 263 | result = pd.merge(result, temp, on=['USRID'], how='left') # 264 | # 用户每天点击量max 265 | temp = data.drop_duplicates(['USRID','day'])#去重 266 | temp = temp.groupby(['USRID'])['day_user_sum'].agg({'day_user_max': np.max}) # 267 | temp = temp.reset_index() 268 | result = pd.merge(result, temp, on=['USRID'], how='left') # 269 | # 用户每天点击量min 270 | temp = data.drop_duplicates(['USRID','day'])#去重 271 | temp = temp.groupby(['USRID'])['day_user_sum'].agg({'day_user_min': np.min}) # 272 | temp = temp.reset_index() 273 | result = 
pd.merge(result, temp, on=['USRID'], how='left') # 274 | # 用户每天点击量sp.stats.skew 偏度 275 | temp = data.drop_duplicates(['USRID','day'])#去重 276 | temp = temp.groupby(['USRID'])['day_user_sum'].agg({'day_user_skew': sp.stats.skew}) # 277 | temp = temp.reset_index() 278 | result = pd.merge(result, temp, on=['USRID'], how='left') # 279 | # 用户每天点击量sp.stats.kurtosis 峰度 280 | temp = data.drop_duplicates(['USRID','day'])#去重 281 | temp = temp.groupby(['USRID'])['day_user_sum'].agg({'day_user_kurt': sp.stats.kurtosis}) # 282 | temp = temp.reset_index() 283 | result = pd.merge(result, temp, on=['USRID'], how='left') # 284 | # 用户最大连续点击天数 285 | temp = data[['USRID', 'day']] 286 | temp['day'] = temp['day'].astype('str') 287 | temp = temp.groupby(['USRID'])['day'].agg(lambda x: ':'.join(x)).reset_index() 288 | temp = temp.drop_duplicates(['USRID', 'day']) # 去重 289 | temp.rename(columns={'day': 'str_day'}, inplace=True) 290 | temp['max_continue_day'] = temp.apply(get_av_time_dis, axis=1)#apply 291 | temp = temp[['USRID','max_continue_day']] 292 | result = pd.merge(result, temp, on=['USRID'], how='left') # 293 | 294 | # 118、区间内最后一次活跃距离区间末端的天数 295 | 296 | temp = data.groupby(['USRID'])['day'].agg({'last_day': np.max}).reset_index() # 297 | temp.last_day = 30-temp.last_day 298 | result = pd.merge(result, temp, on=['USRID'], how='left') # 299 | 300 | 301 | 302 | # 最后一天的统计值 303 | Fregion1 = data[data.day==30] 304 | # 1、用户总的点击次数 305 | temp = Fregion1.groupby(['USRID'])['EVT_LBL'].agg({'user_sum_30': np.size}) # 306 | temp = temp.reset_index() 307 | result = pd.merge(result, temp, on=['USRID'], how='left') # 308 | # 2、取第一级模块,用户对各个模块的点击数 309 | Fregion1['EVT_LBL_1_new1'] = list(map(lambda x: 'EVT_LBL_1_new1_' + str(x), Fregion1.EVT_LBL_1)) 310 | temp = pd.crosstab(Fregion1.USRID,Fregion1.EVT_LBL_1_new1).reset_index() 311 | result = pd.merge(result, temp, on=['USRID'], how='left') # 312 | # 3、取第二级模块,用户对各个模块的点击数 313 | Fregion1['EVT_LBL_2_new1'] = list(map(lambda x: 'EVT_LBL_2_new1_' + str(x), Fregion1.EVT_LBL_2)) 314 | temp = pd.crosstab(Fregion1.USRID,Fregion1.EVT_LBL_2_new1).reset_index() 315 | result = pd.merge(result, temp, on=['USRID'], how='left') # 316 | # 3、取第三级模块,用户对各个模块的点击数 317 | Fregion1['EVT_LBL_3_new1'] = list(map(lambda x: 'EVT_LBL_3_new1_' + str(x), Fregion1.EVT_LBL_3)) 318 | temp = pd.crosstab(Fregion1.USRID,Fregion1.EVT_LBL_3_new1).reset_index() 319 | result = pd.merge(result, temp, on=['USRID'], how='left') # 320 | 321 | # 6、各个用户在各个小时的点击量,离散 322 | Fregion1['hour_new1'] = list(map(lambda x: 'hour_new1_' + str(x), Fregion1.hour)) 323 | temp = pd.crosstab(Fregion1.USRID,Fregion1.hour_new1).reset_index() 324 | result = pd.merge(result, temp, on=['USRID'], how='left') # 325 | # 14、用户对于各个事件类型历史发生的数量。 326 | Fregion1['TCH_TYP_new1'] = list(map(lambda x: 'TCH_TYP_new1_' + str(x), Fregion1.TCH_TYP)) 327 | temp = pd.crosstab(Fregion1.USRID,Fregion1.TCH_TYP_new1).reset_index() 328 | result = pd.merge(result, temp, on=['USRID'], how='left') # 329 | # 330 | # 最后2天的统计值 331 | Fregion1 = data[data.day>=29] 332 | # 1、用户总的点击次数 333 | temp = Fregion1.groupby(['USRID'])['EVT_LBL'].agg({'user_sum_29': np.size}) # 334 | temp = temp.reset_index() 335 | result = pd.merge(result, temp, on=['USRID'], how='left') # 336 | # 2、取第一级模块,用户对各个模块的点击数 337 | Fregion1['EVT_LBL_1_new2'] = list(map(lambda x: 'EVT_LBL_1_new2_' + str(x), Fregion1.EVT_LBL_1)) 338 | temp = pd.crosstab(Fregion1.USRID,Fregion1.EVT_LBL_1_new2).reset_index() 339 | result = pd.merge(result, temp, on=['USRID'], how='left') # 340 | # 3、取第二级模块,用户对各个模块的点击数 341 | 
Fregion1['EVT_LBL_2_new2'] = list(map(lambda x: 'EVT_LBL_2_new2_' + str(x), Fregion1.EVT_LBL_2)) 342 | temp = pd.crosstab(Fregion1.USRID,Fregion1.EVT_LBL_2_new2).reset_index() 343 | result = pd.merge(result, temp, on=['USRID'], how='left') # 344 | # 3、取第三级模块,用户对各个模块的点击数 345 | Fregion1['EVT_LBL_3_new2'] = list(map(lambda x: 'EVT_LBL_3_new2_' + str(x), Fregion1.EVT_LBL_3)) 346 | temp = pd.crosstab(Fregion1.USRID,Fregion1.EVT_LBL_3_new2).reset_index() 347 | result = pd.merge(result, temp, on=['USRID'], how='left') # 348 | 349 | # 6、各个用户在各个小时的点击量,离散 350 | Fregion1['hour_new2'] = list(map(lambda x: 'hour_new2_' + str(x), Fregion1.hour)) 351 | temp = pd.crosstab(Fregion1.USRID,Fregion1.hour_new2).reset_index() 352 | result = pd.merge(result, temp, on=['USRID'], how='left') # 353 | # 14、用户对于各个事件类型历史发生的数量。 354 | Fregion1['TCH_TYP_new2'] = list(map(lambda x: 'TCH_TYP_new2_' + str(x), Fregion1.TCH_TYP)) 355 | temp = pd.crosstab(Fregion1.USRID,Fregion1.TCH_TYP_new2).reset_index() 356 | result = pd.merge(result, temp, on=['USRID'], how='left') # 357 | 358 | # ################################比例特征################################################################################## 359 | #1,用户的各种行为类型占用户的总行为比例(总的) 360 | result['tch_type_0_rate'] = result['TCH_TYP_new10']/result['user_sum'] 361 | # result['tch_type_1_rate'] = result['TCH_TYP_new11'] / result['user_sum'] 362 | result['tch_type_2_rate'] = result['TCH_TYP_new12'] / result['user_sum'] 363 | # 2,用户各个星期点击量占比(总的) 364 | result['week_1_rate'] = result['week_new11'] / result['user_sum'] 365 | result['week_2_rate'] = result['week_new12'] / result['user_sum'] 366 | result['week_3_rate'] = result['week_new13'] / result['user_sum'] 367 | result['week_4_rate'] = result['week_new14'] / result['user_sum'] 368 | result['week_5_rate'] = result['week_new15'] / result['user_sum'] 369 | result['week_6_rate'] = result['week_new16'] / result['user_sum'] 370 | result['week_7_rate'] = result['week_new10'] / result['user_sum'] 371 | # 3,用户各个小时的点击量占比(总的) 372 | result['hour_0_rate'] = result['hour_new10'] / result['user_sum'] 373 | result['hour_1_rate'] = result['hour_new11'] / result['user_sum'] 374 | result['hour_2_rate'] = result['hour_new12'] / result['user_sum'] 375 | result['hour_3_rate'] = result['hour_new13'] / result['user_sum'] 376 | result['hour_4_rate'] = result['hour_new14'] / result['user_sum'] 377 | result['hour_5_rate'] = result['hour_new15'] / result['user_sum'] 378 | result['hour_6_rate'] = result['hour_new16'] / result['user_sum'] 379 | result['hour_7_rate'] = result['hour_new17'] / result['user_sum'] 380 | result['hour_8_rate'] = result['hour_new18'] / result['user_sum'] 381 | result['hour_9_rate'] = result['hour_new19'] / result['user_sum'] 382 | result['hour_10_rate'] = result['hour_new110'] / result['user_sum'] 383 | result['hour_11_rate'] = result['hour_new111'] / result['user_sum'] 384 | result['hour_12_rate'] = result['hour_new112'] / result['user_sum'] 385 | result['hour_13_rate'] = result['hour_new113'] / result['user_sum'] 386 | result['hour_14_rate'] = result['hour_new114'] / result['user_sum'] 387 | result['hour_15_rate'] = result['hour_new115'] / result['user_sum'] 388 | result['hour_16_rate'] = result['hour_new116'] / result['user_sum'] 389 | result['hour_17_rate'] = result['hour_new117'] / result['user_sum'] 390 | result['hour_18_rate'] = result['hour_new118'] / result['user_sum'] 391 | result['hour_19_rate'] = result['hour_new119'] / result['user_sum'] 392 | result['hour_20_rate'] = result['hour_new120'] / result['user_sum'] 393 | 
result['hour_21_rate'] = result['hour_new121'] / result['user_sum'] 394 | result['hour_22_rate'] = result['hour_new122'] / result['user_sum'] 395 | result['hour_23_rate'] = result['hour_new123'] / result['user_sum'] 396 | 397 | # 1,用户的各种行为类型占用户的总行为比例(最后一天) 398 | result['TCH_TYP_new1_0_rate'] = result['TCH_TYP_new1_0'] / result['user_sum_30'] 399 | # result['TCH_TYP_new1_1_rate'] = result['TCH_TYP_new1_1'] / result['user_sum_30'] 400 | result['TCH_TYP_new1_2_rate'] = result['TCH_TYP_new1_2'] / result['user_sum_30'] 401 | 402 | 403 | 404 | # 1,用户的各种行为类型占用户的总行为比例(最后2天) 405 | result['TCH_TYP_new2_0_rate'] = result['TCH_TYP_new2_0'] / result['user_sum_29'] 406 | # result['TCH_TYP_new2_1_rate'] = result['TCH_TYP_new2_1'] / result['user_sum_29'] 407 | result['TCH_TYP_new2_2_rate'] = result['TCH_TYP_new2_2'] / result['user_sum_29'] 408 | 409 | 410 | #################################################################################################################### 411 | # 分为前10天,中间10天,后十天 412 | # 前10的统计值 413 | Fregion1 = data[data.day <=10] 414 | # 1、用户总的点击次数 415 | temp = Fregion1.groupby(['USRID'])['EVT_LBL'].agg({'user_sum_one': np.size}) # 416 | temp = temp.reset_index() 417 | result = pd.merge(result, temp, on=['USRID'], how='left') # 418 | 419 | # 14、用户对于各个事件类型历史发生的数量。 420 | Fregion1['TCH_TYP_new3'] = list(map(lambda x: 'TCH_TYP_new3_' + str(x), Fregion1.TCH_TYP)) 421 | temp = pd.crosstab(Fregion1.USRID, Fregion1.TCH_TYP_new3).reset_index() 422 | result = pd.merge(result, temp, on=['USRID'], how='left') # 423 | ##########################3 424 | # 中间10的统计值 425 | Fregion1 = data[(data.day > 10)&(data.day<=20)] 426 | # 1、用户总的点击次数 427 | temp = Fregion1.groupby(['USRID'])['EVT_LBL'].agg({'user_sum_two': np.size}) # 428 | temp = temp.reset_index() 429 | result = pd.merge(result, temp, on=['USRID'], how='left') # 430 | 431 | # 14、用户对于各个事件类型历史发生的数量。 432 | Fregion1['TCH_TYP_new4'] = list(map(lambda x: 'TCH_TYP_new4_' + str(x), Fregion1.TCH_TYP)) 433 | temp = pd.crosstab(Fregion1.USRID, Fregion1.TCH_TYP_new4).reset_index() 434 | result = pd.merge(result, temp, on=['USRID'], how='left') # 435 | 436 | ##################################3 437 | # 最后10的统计值 438 | Fregion1 = data[data.day > 20] 439 | # 1、用户总的点击次数 440 | temp = Fregion1.groupby(['USRID'])['EVT_LBL'].agg({'user_sum_three': np.size}) # 441 | temp = temp.reset_index() 442 | result = pd.merge(result, temp, on=['USRID'], how='left') # 443 | 444 | # 14、用户对于各个事件类型历史发生的数量。 445 | Fregion1['TCH_TYP_new5'] = list(map(lambda x: 'TCH_TYP_new5_' + str(x), Fregion1.TCH_TYP)) 446 | temp = pd.crosstab(Fregion1.USRID, Fregion1.TCH_TYP_new5).reset_index() 447 | result = pd.merge(result, temp, on=['USRID'], how='left') # 448 | 449 | 450 | 451 | 452 | 453 | print(result) 454 | return result 455 | 456 | train = getF(train_user_info,train_app_log) 457 | train = pd.merge(train, train_flag, on=['USRID'], how='left') # 458 | test = getF(test_user_info,test_app_log) 459 | train.to_csv('train_last.csv', encoding='utf-8', index=None) 460 | test.to_csv('test_last.csv', encoding='utf-8', index=None) 461 | train = pd.read_csv(r'train_last.csv') 462 | test = pd.read_csv(r'test_last.csv') 463 | xgboosts(train,test,train) 464 | -------------------------------------------------------------------------------- /xgb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import xgboost as xgb 3 | import matplotlib.pyplot as plt 4 | import operator 5 | import matplotlib.pyplot as plt 6 | 7 | def xgb_model(train,test): 8 | 
train_x = train.drop(['USRID','FLAG','day'], axis=1).values 9 | train_y = train['FLAG'].values 10 | test_x = test.drop(['USRID','day'], axis=1).values 11 | 12 | xgb_train = xgb.DMatrix(train_x, label=train_y) 13 | xgb_test = xgb.DMatrix(test_x) 14 | 15 | params = {'booster': 'gbtree', 16 | 'objective': 'rank:pairwise', # 二分类的问题 17 | # 'gamma':0.1, # 用于控制是否后剪枝的参数,越大越保守,一般0.1、0.2这样子。 18 | 'max_depth': 5, # 构建树的深度,越大越容易过拟合 19 | # 'lambda':2, # 控制模型复杂度的权重值的L2正则化项参数,参数越大,模型越不容易过拟合。 20 | 'subsample': 0.7, # 随机采样训练样本 21 | 'colsample_bytree': 0.7, # 生成树时进行的列采样 22 | 'min_child_weight': 3, 23 | # 这个参数默认是 1,是每个叶子里面 h 的和至少是多少,对正负样本不均衡时的 0-1 分类而言 24 | # ,假设 h 在 0.01 附近,min_child_weight 为 1 意味着叶子节点中最少需要包含 100 个样本。 25 | # 这个参数非常影响结果,控制叶子节点中二阶导的和的最小值,该参数值越小,越容易 overfitting。 26 | 'silent': 0, # 设置成1则没有运行信息输出,最好是设置为0. 27 | 'eta': 0.03, # 如同学习率 28 | 'nthread': 7, # cpu 线程数 29 | 'eval_metric': 'auc' # 评价方式 30 | } 31 | 32 | plst = list(params.items()) 33 | num_rounds = 500 # 迭代次数 34 | watchlist = [(xgb_train, 'train')] 35 | # early_stopping_rounds 当设置的迭代次数较大时,early_stopping_rounds 可在一定的迭代次数内准确率没有提升就停止训练 36 | model = xgb.train(plst, xgb_train, num_rounds, watchlist) 37 | pred_value = model.predict(xgb_test) 38 | 39 | return pred_value 40 | 41 | 42 | def gene_result(pred_value,test_range): 43 | tess = test_range[["USRID"]] 44 | a = pd.DataFrame(pred_value, columns=["RST"]) 45 | res = pd.concat([tess, a["RST"]], axis=1) 46 | res.to_csv("../submit/test_result.csv", index=None,sep='\t') 47 | 48 | def load_csv(): 49 | train = pd.read_csv('../fea/train.csv', sep='\t') 50 | test = pd.read_csv('../fea/test.csv', sep='\t') 51 | 52 | train.fillna(-999, inplace=True) 53 | test.fillna(-999, inplace=True) 54 | 55 | return train,test 56 | 57 | 58 | def main(): 59 | train,test = load_csv() 60 | pred_value = xgb_model(train,test) 61 | gene_result(pred_value, test) 62 | 63 | 64 | if __name__ == '__main__': 65 | main() 66 | 67 | 68 | 69 | 70 | -------------------------------------------------------------------------------- /奇点计划-决赛-X-Driver-答辩PPT.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sunwantong/China-Merchants-Bank-credit-card-Cente-User-purchase-forecast/e3fc87d55fce129249cafcbbf49c85f44e77bfdb/奇点计划-决赛-X-Driver-答辩PPT.pdf -------------------------------------------------------------------------------- /模型三/fea_log.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jun 8 21:32:49 2018 4 | 5 | @author: Alan Yan 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | from util_fea import get_stat_fea 11 | import time 12 | 13 | 14 | def time2stamp(a_datetime_str): 15 | a_datetime = time.strptime(a_datetime_str, "%Y-%m-%d %H:%M:%S") 16 | return time.mktime(a_datetime) 17 | 18 | def date2stamp(a_datetime_str): 19 | a_datetime_str = a_datetime_str.split(' ')[0] 20 | a_datetime = time.strptime(a_datetime_str, "%Y-%m-%d") 21 | return time.mktime(a_datetime) 22 | 23 | def time2hour(a_datetime_str): 24 | a_hour = int(a_datetime_str.split(' ')[1].split(':')[0]) 25 | return a_hour 26 | 27 | def time2period(a_datetime_str): 28 | a_hour = int(a_datetime_str.split(' ')[1].split(':')[0]) 29 | if a_hour > 8 and a_hour < 13: 30 | return 1 31 | elif a_hour >= 13 and a_hour < 19: 32 | return 2 33 | elif a_hour >= 19 and a_hour < 23: 34 | return 3 35 | else: 36 | return 0 37 | 38 | 39 | def day_list_fea(a_day_list): 40 | if len(a_day_list) > 1: 41 | a_day_list.sort() 42 | 
a_sub_day_list = [a_day_list[i] - a_day_list[i-1] for i in range(1, len(a_day_list))] 43 | a_fea_day_list = get_stat_fea(a_sub_day_list) 44 | else: 45 | a_fea_day_list = [0] * len(get_stat_fea([1, 1])) 46 | return a_fea_day_list 47 | 48 | def fea_log(a_id): 49 | a_df = train_log_df.loc[a_id] 50 | 51 | a_date_list = list(set(a_df['OCC_DATE'].values)) 52 | a_date_list.sort() 53 | a_date_rate = len(a_date_list) / len(a_df) # fea 54 | fea_date = get_stat_fea(a_date_list) 55 | 56 | a_date_list_1 = a_date_list[:int(len(a_date_list) / 2)] 57 | a_date_list_2 = a_date_list[int(len(a_date_list) / 2):] 58 | fea_date_sub_1 = day_list_fea(a_date_list_1) 59 | fea_date_sub_2 = day_list_fea(a_date_list_2) 60 | fea_date_sub_1_2 = list(np.array(fea_date_sub_2) - np.array(fea_date_sub_1)) 61 | 62 | a_date_count = a_df['OCC_DATE'].value_counts() 63 | a_date_count = a_date_count.sort_index() 64 | a_date_count = list(a_date_count.values) 65 | fea_date_count = get_stat_fea(a_date_count) 66 | 67 | a_fea_all = fea_date + fea_date_sub_1_2 + fea_date_count 68 | a_fea_all.append(a_date_rate) 69 | a_fea_all.append(a_id) 70 | 71 | return a_fea_all 72 | 73 | 74 | train_log_df = pd.read_csv('../orig_data/train_log.csv', sep='\t') 75 | #train_log_df = pd.read_csv('../orig_data/test_log.csv', sep='\t') 76 | 77 | train_log_df.set_index(['USRID', 'OCC_TIM'], drop=False, inplace=True) 78 | train_log_df = train_log_df.sort_index() 79 | user_id_list = list(set(train_log_df['USRID'].values)) 80 | 81 | train_log_df['OCC_STAMP'] = train_log_df['OCC_TIM'].apply(time2stamp) 82 | train_log_df['OCC_DATE'] = train_log_df['OCC_TIM'].apply(date2stamp) 83 | train_log_df['OCC_PERIOD'] = train_log_df['OCC_TIM'].apply(time2period) 84 | train_log_df['OCC_HOUR'] = train_log_df['OCC_TIM'].apply(time2hour) 85 | fea_mat = list(map(fea_log, user_id_list)) 86 | 87 | fea_name_list_1 = ['var_1', 'std_1', 'max_1', 'min_1', 't_max_min_1', 'ent_1', 'median_1', 88 | 'mode_1', 'rate_1_max', 'rate_1_min', 'len_1', 'sum_1'] 89 | fea_name_list_2 = [item.replace('_1', '_2') for item in fea_name_list_1] 90 | fea_name_list_3 = [item.replace('_1', '_3') for item in fea_name_list_1] 91 | fea_name_list = fea_name_list_1 + fea_name_list_2 + fea_name_list_3 92 | fea_name_list.append('date_rate') 93 | fea_name_list.append('USRID') 94 | 95 | fea_all_df = pd.DataFrame(fea_mat) 96 | fea_all_df.columns = fea_name_list 97 | 98 | fea_all_df.to_csv('../fea/fea_log_train.csv', index=False) 99 | #fea_all_df.to_csv('../fea/fea_log_test.csv', index=False) 100 | -------------------------------------------------------------------------------- /模型三/fea_tfidf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jun 18 17:22:07 2018 4 | 5 | @author: Alan Yan 6 | """ 7 | 8 | import pandas as pd 9 | from sklearn.feature_extraction.text import CountVectorizer 10 | 11 | def get_str(a_df): 12 | str_list = a_df['EVT_LBL'].values 13 | return ' '.join(str_list) 14 | 15 | df1 = pd.read_csv('../orig_data/train_log.csv', sep='\t') 16 | df2 = pd.read_csv('../orig_data/test_log.csv', sep='\t') 17 | df = pd.concat([df1, df2], axis=0) 18 | df.set_index(['USRID', 'OCC_TIM'], inplace=True, drop=False) 19 | df = df.sort_index() 20 | df = df.reset_index(drop=True) 21 | str_df = pd.pivot_table(df, index='USRID', values=['EVT_LBL'], aggfunc=get_str) 22 | 23 | vectorizer = CountVectorizer(min_df=2, token_pattern=r"\b\w+\b") # 保留单字 24 | corpus = list(str_df['EVT_LBL'].values) 25 | X_tfidf = 
vectorizer.fit_transform(corpus) 26 | fea_name = vectorizer.get_feature_names() 27 | X_tfidf = X_tfidf.todense() 28 | count_fea_df = pd.DataFrame(X_tfidf) 29 | count_fea_df.columns = ['w_'+item for item in fea_name] 30 | count_fea_df['USRID'] = list(str_df.index) 31 | count_fea_df.to_csv('../fea/fea_tfidf_all.csv', index=False) 32 | 33 | -------------------------------------------------------------------------------- /模型三/fea_word2vec.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Sun Jun 17 21:30:39 2018 4 | 5 | @author: Alan Yan 6 | """ 7 | 8 | import pandas as pd 9 | import gensim 10 | import re 11 | import numpy as np 12 | 13 | 14 | def get_str(a_df): 15 | str_list = a_df['EVT_LBL'].values 16 | return ' '.join(str_list) 17 | 18 | 19 | def get_word2vec_fea(content): 20 | content = re.sub(r"\s{2,}", " ", content) 21 | content_list = content.strip().split(' ') 22 | fea_vec_one = np.zeros(100) 23 | for item in content_list: 24 | if item in word_set: 25 | fea_vec_one += model.wv[item] 26 | fea_vec_one = fea_vec_one / len(content_list) 27 | fea_vec_one = [int(item*1000)/1000 for item in fea_vec_one] 28 | return fea_vec_one 29 | 30 | df1 = pd.read_csv('../orig_data/train_log.csv', sep='\t') 31 | df2 = pd.read_csv('../orig_data/test_log.csv', sep='\t') 32 | df = pd.concat([df1, df2], axis=0) 33 | df.set_index(['USRID', 'OCC_TIM'], inplace=True, drop=False) 34 | df = df.sort_index() 35 | df = df.reset_index(drop=True) 36 | 37 | #user_id = list(set(df['USRID'].values)) 38 | 39 | str_df = pd.pivot_table(df, index='USRID', values=['EVT_LBL'], aggfunc=get_str) 40 | data = str_df['EVT_LBL'].values 41 | data = [item.split(' ') for item in data] 42 | model = gensim.models.Word2Vec(data, min_count=1, size=100) 43 | word_set = set(model.wv.index2word) 44 | w2v_fea_mat = list(str_df['EVT_LBL'].apply(get_word2vec_fea).values) 45 | w2v_df = pd.DataFrame(w2v_fea_mat) 46 | w2v_df['USRID'] = list(str_df.index) 47 | w2v_df.to_csv('../fea/fea_w2v_all.csv', index=False) -------------------------------------------------------------------------------- /模型三/model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jun 6 19:49:57 2018 4 | 5 | @author: Alan Yan 6 | """ 7 | 8 | import xgboost as xgb 9 | import pandas as pd 10 | from sklearn import metrics 11 | import numpy as np 12 | from scipy.sparse import csr_matrix 13 | from util_fs import xgb_fea_select 14 | 15 | 16 | def xgb_clf(train_x, train_y, test_x): 17 | dtrain=xgb.DMatrix(train_x,label=train_y) 18 | dtest=xgb.DMatrix(test_x) 19 | params = {'booster':'gbtree', 20 | 'max_depth': 3, 21 | 'colsample_bytree': 0.7, 22 | 'subsample': 0.7, 23 | 'eta': 0.03, 24 | 'silent': 1, 25 | # 'objective': 'binary:logistic', 26 | 'objective': 'rank:pairwise', 27 | 'min_child_weight': 6, # 这儿不是3就是6 28 | 'seed': 10, 29 | 'eval_metric':'auc', 30 | 'scale_pos_weight': 3176 / 76824} 31 | watchlist = [(dtrain,'train')] 32 | bst=xgb.train(params,dtrain,num_boost_round=1000,evals=watchlist, 33 | early_stopping_rounds=100) 34 | ypred=bst.predict(dtest) 35 | return ypred 36 | 37 | train_log_df = pd.read_csv('../fea/fea_log_train.csv') 38 | train_agg_df = pd.read_csv('../orig_data/train_agg.csv', sep='\t') 39 | train_df = pd.merge(train_agg_df, train_log_df, on='USRID', how='left') 40 | train_label_df = pd.read_csv('../orig_data/train_flg.csv', sep='\t') 41 | train_df = pd.merge(train_df, train_label_df, 
on='USRID', how='left') 42 | tfidf_df = pd.read_csv('../fea/fea_tfidf_all.csv') 43 | train_df = pd.merge(tfidf_df, train_df, on='USRID', how='right') 44 | w2v_df = pd.read_csv('../fea/fea_w2v_all.csv') 45 | train_df = pd.merge(w2v_df, train_df, on='USRID', how='right') 46 | 47 | test_log_df = pd.read_csv('../fea/fea_log_test.csv') 48 | test_agg_df = pd.read_csv('../orig_data/test_agg.csv', sep='\t') 49 | test_df = pd.merge(test_agg_df, test_log_df, on='USRID', how='left') 50 | test_df = pd.merge(tfidf_df, test_df, on='USRID', how='right') 51 | test_df = pd.merge(w2v_df, test_df, on='USRID', how='right') 52 | 53 | 54 | # 特征选择 55 | y_train = train_df['FLAG'].values 56 | del train_df['FLAG'], train_df['USRID'] 57 | X_train = train_df.values 58 | fea_name_list = train_df.columns 59 | X_train = X_train.astype(np.float64) 60 | X_train = csr_matrix(X_train) 61 | fea_name_new = xgb_fea_select(X_train, y_train, fea_name_list) 62 | train_df = train_df[fea_name_new] 63 | print('特征选择完成。') 64 | 65 | 66 | test_id = test_df['USRID'].values 67 | del test_df['USRID'] 68 | test_df = test_df[fea_name_new] 69 | 70 | X_train = train_df.values 71 | X_train = X_train.astype(np.float64) 72 | X_train = csr_matrix(X_train) 73 | X_test = test_df.values 74 | X_test = X_test.astype(np.float64) 75 | X_test = csr_matrix(X_test) 76 | 77 | y_pred_prob = xgb_clf(X_train, y_train, X_test) 78 | pd.Series(np.sort(y_pred_prob)).plot() 79 | 80 | result_df = pd.DataFrame() 81 | result_df['USRID'] = test_id 82 | result_df['RST'] = y_pred_prob 83 | result_df.to_csv('../result/test_result.csv', index=False, sep='\t') -------------------------------------------------------------------------------- /模型三/util_fea.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jun 1 10:43:50 2018 4 | 5 | @author: Alan Yan 6 | """ 7 | from math import log 8 | from scipy.stats import mode 9 | import numpy as np 10 | 11 | 12 | def cal_ent(a_list): 13 | item_count = {} 14 | for item in a_list: 15 | if item not in item_count.keys(): 16 | item_count[item] = 0 17 | item_count[item] += 1 18 | ent = 0.0 19 | for key in item_count: 20 | prob = float(item_count[key]) / len(a_list) 21 | ent -= prob * log(prob, 2) 22 | return ent 23 | 24 | def get_stat_fea(a_list): 25 | t_array = np.array(a_list) 26 | var_t = t_array.var() # t序列的方差 27 | std_t = t_array.std() # t序列的标准差 28 | max_t = t_array.max() # t序列的最大值 29 | min_t = t_array.min() # t序列的最小值 30 | t_max_min = t_array.max() - t_array.min() # t序列的极差 31 | t_ent = cal_ent(t_array) # t序列的熵 32 | median_t = np.median(t_array) # t序列的中位数 33 | mode_t = mode(t_array)[0][0] # t序列的众数 34 | rate_t_max = (t_array.argmax() + 1) * 1.0 / len(t_array) # 最大值位置 35 | rate_t_min = (t_array.argmin() + 1) * 1.0 / len(t_array) # 最小值位置 36 | len_t = len(t_array) # t序列的长度 37 | sum_t = sum(t_array) # t序列的和 38 | fea_stat_list = [var_t, std_t, max_t, min_t, t_max_min, t_ent, median_t, 39 | mode_t, rate_t_max, rate_t_min, len_t, sum_t] 40 | return fea_stat_list 41 | 42 | 43 | -------------------------------------------------------------------------------- /模型三/util_fs.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Jun 20 17:48:05 2018 4 | 5 | @author: Alan Yan 6 | """ 7 | 8 | import xgboost as xgb 9 | 10 | def xgb_fea_select(train_x, train_y, f_name_list): 11 | rate_fea = 0.5 12 | dtrain=xgb.DMatrix(train_x, label=train_y) 13 | params = {'booster':'gbtree', 14 | 
'max_depth': 3, 15 | 'colsample_bytree': 0.7, 16 | 'subsample': 0.7, 17 | 'eta': 0.03, 18 | 'silent': 1, 19 | # 'objective': 'binary:logistic', 20 | 'objective': 'rank:pairwise', 21 | 'min_child_weight': 3, 22 | 'seed': 10, 23 | 'eval_metric':'auc', 24 | 'scale_pos_weight': 3176 / 76824} 25 | watchlist = [(dtrain,'train')] 26 | bst=xgb.train(params,dtrain,num_boost_round=1000,evals=watchlist, 27 | early_stopping_rounds=100) 28 | fscore_dict = bst.get_fscore() 29 | sorted_fs_dict = sorted(fscore_dict.items(),key = lambda x:x[1],reverse = True) 30 | fea_id_set = set([int(item[0][1:]) for item in sorted_fs_dict[:int(len(sorted_fs_dict)*rate_fea)]]) 31 | f_name_list = [item for i, item in enumerate(f_name_list) if i in fea_id_set] 32 | return f_name_list --------------------------------------------------------------------------------
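Note: the script that fuses the three models' submissions into the final result (the README reports the fused score 0.86366999) is not included in the repository. A minimal sketch of one common fusion approach, rank-averaging the three tab-separated test_result.csv files written by xgb.py, model2-cy.py and 模型三/model.py, is shown below; the input/output paths and the equal weights are assumptions, not the team's actual recipe.

import pandas as pd

# Read the three submissions where the repo's scripts write them (adjust paths as needed).
m1 = pd.read_csv('../submit/test_result.csv', sep='\t')                        # Model 1: USRID, RST
m2 = pd.read_csv('test_result.csv', sep='\t').rename(columns={'rst': 'RST'})   # Model 2: USRID, rst
m3 = pd.read_csv('../result/test_result.csv', sep='\t')                        # Model 3: USRID, RST

blend = m1[['USRID']].copy()
for i, sub in enumerate([m1, m2, m3]):
    # Percentile ranks make differently scaled scores (e.g. rank:pairwise margins) comparable.
    scores = sub.set_index('USRID')['RST'].rank(pct=True)
    blend['r%d' % i] = blend['USRID'].map(scores)
blend['RST'] = blend[['r0', 'r1', 'r2']].mean(axis=1)
blend[['USRID', 'RST']].to_csv('blend_result.csv', sep='\t', index=False)  # hypothetical output name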