├── SleepTight_CodeInstruction.docx ├── README.md ├── .gitignore ├── v2_win.py ├── v1_win.py ├── v1_1.py ├── v2_1.py ├── get_res.py ├── user_cate.py └── user_cate2.py /SleepTight_CodeInstruction.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anzhizh/2019-taida-jdata-top3/HEAD/SleepTight_CodeInstruction.docx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 2019-taida-jdata-top3 2 | 3 | 方案描述和代码运行都在SleepTight_CodeInstruction中,整体方案着重特征工程,在模型构造和最终提交文件处理上有很大提升空间。 4 | 5 | 参赛收获:核心在于构建整个模型的体系,包含模型框架、特征维度等,保证每一个特征的加入都会使得模型更加丰富立体。仅个人看法。 6 | 7 | 感谢太白南路点子王、鱼遇雨欲语与余、小幸运,特别致谢太白南路点子王的各路好点子。 8 | 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | 8 | # Runtime data 9 | pids 10 | *.pid 11 | *.seed 12 | *.pid.lock 13 | 14 | # Directory for instrumented libs generated by jscoverage/JSCover 15 | lib-cov 16 | 17 | # Coverage directory used by tools like istanbul 18 | coverage 19 | 20 | # nyc test coverage 21 | .nyc_output 22 | 23 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 24 | .grunt 25 | 26 | # Bower dependency directory (https://bower.io/) 27 | bower_components 28 | 29 | # node-waf configuration 30 | .lock-wscript 31 | 32 | # Compiled binary addons (https://nodejs.org/api/addons.html) 33 | build/Release 34 | 35 | # Dependency directories 36 | node_modules/ 37 | jspm_packages/ 38 | 39 | # TypeScript v1 declaration files 40 | typings/ 41 | 42 | # Optional npm cache directory 43 | .npm 44 | 45 | # Optional eslint cache 46 | .eslintcache 47 | 48 | # Optional REPL history 49 | .node_repl_history 50 | 51 | # Output of 'npm pack' 
52 | *.tgz 53 | 54 | # Yarn Integrity file 55 | .yarn-integrity 56 | 57 | # dotenv environment variables file 58 | .env 59 | 60 | # next.js build output 61 | .next 62 | -------------------------------------------------------------------------------- /v2_win.py: -------------------------------------------------------------------------------- 1 | # 5天标签 2 | from user_cate_shop2 import * 3 | 4 | ignore_feat = ['label', 'type', 'user_id', 'cate','shop_id', 'sku_id','action_time', 'dt', 'market_time', 'shop_reg_tm', 5 | 'user_reg_tm', 'vender_id', 'module_id'] 6 | 7 | test_start_date = '2018-03-15' 8 | test_end_date = '2018-04-16' 9 | 10 | label_start_date = '2018-04-11' 11 | label_end_date = '2018-04-16' 12 | train_start_date = '2018-03-10' 13 | train_end_date = '2018-04-11' 14 | 15 | # train 16 | training_data = make_train_set(train_start_date, train_end_date, label_start_date, label_end_date, 30) 17 | 18 | feats = [f for f in training_data.columns if f not in ignore_feat] 19 | print(feats) 20 | print(len(feats)) 21 | label = training_data['label'].copy() 22 | user_index = training_data[['user_id', 'cate', 'shop_id']].copy() 23 | train = training_data[feats].values 24 | 25 | # test 26 | sub_training_data = make_test_set(test_start_date, test_end_date, 30) 27 | feats = [f for f in sub_training_data.columns if f not in ignore_feat] 28 | print(feats) 29 | print(len(feats)) 30 | sub_user_index = sub_training_data[['user_id', 'cate', 'shop_id']].copy() 31 | test = sub_training_data[feats].values 32 | print('test shape: ', test.shape) 33 | 34 | lgb_train_F12_5(train, label, test, sub_user_index) 35 | 36 | 37 | -------------------------------------------------------------------------------- /v1_win.py: -------------------------------------------------------------------------------- 1 | # 5天标签 2 | from user_cate2 import * 3 | ignore_feat = ['label', 'type', 'user_id', 'cate','shop_id', 'sku_id','action_time', 'dt', 'market_time', 'shop_reg_tm', 4 | 'user_reg_tm', 'vender_id', 
'module_id', 'cate_feat13_11', 'F11_feat13_11'] 5 | 6 | test_start_date = '2018-03-15' 7 | test_end_date = '2018-04-16' 8 | 9 | label_start_date = '2018-04-11' 10 | label_end_date = '2018-04-16' 11 | train_start_date = '2018-03-10' 12 | train_end_date = '2018-04-11' 13 | 14 | # train 15 | training_data = make_train_set(train_start_date, train_end_date, label_start_date, label_end_date, 30) 16 | 17 | feats = [f for f in training_data.columns if f not in ignore_feat] 18 | print(feats) 19 | print(len(feats)) 20 | label = training_data['label'].copy() 21 | user_index = training_data[['user_id', 'cate']].copy() 22 | train = training_data[feats].copy() 23 | del training_data 24 | # test 25 | sub_training_data = make_test_set(test_start_date, test_end_date, 30) 26 | feats = [f for f in sub_training_data.columns if f not in ignore_feat] 27 | print(feats) 28 | print(len(feats)) 29 | sub_user_index = sub_training_data[['user_id', 'cate']].copy() 30 | test = sub_training_data[feats].copy() 31 | print('test shape: ', test.shape) 32 | del sub_training_data 33 | lgb_train(train, label, test, sub_user_index) 34 | 35 | 36 | -------------------------------------------------------------------------------- /v1_1.py: -------------------------------------------------------------------------------- 1 | from user_cate import * 2 | 3 | ignore_feat = ['label', 'type', 'user_id', 'cate', 'shop_id', 'sku_id', 'action_time', 'dt', 'market_time', 'shop_reg_tm', 4 | 'user_reg_tm'] 5 | 6 | label_start_date = '2018-04-09' 7 | label_end_date = '2018-04-16' 8 | train_start_date = '2018-03-08' 9 | train_end_date = '2018-04-09' 10 | test_start_date = '2018-03-15' 11 | test_end_date = '2018-04-16' 12 | 13 | # train 14 | training_data = make_train_set(train_start_date, train_end_date, label_start_date, label_end_date) 15 | 16 | # test 17 | sub_training_data = make_test_set(test_start_date, test_end_date) 18 | 19 | 20 | # train 21 | feats_train = [f for f in training_data.columns if f not in 
ignore_feat] 22 | print(len(feats_train)) 23 | label = training_data['label'].copy() 24 | user_index = training_data[['user_id', 'cate']].copy() 25 | print('train shape: ', training_data.shape) 26 | train = training_data[feats_train].copy() 27 | print('train shape: ', train.shape) 28 | 29 | # test 30 | feats_test = [f for f in sub_training_data.columns if f not in ignore_feat] 31 | print(feats_test) 32 | print(len(feats_test)) 33 | sub_user_index = sub_training_data[['user_id', 'cate']].copy() 34 | test = sub_training_data[feats_test].copy() 35 | print('test shape: ', test.shape) 36 | 37 | # 训练 38 | lgb_train_F11_7(train, label, test, sub_user_index) 39 | -------------------------------------------------------------------------------- /v2_1.py: -------------------------------------------------------------------------------- 1 | # 一个月用户集 全量特征集 一周标签集 2 | 3 | from user_cate_shop import * # 清洗数据 4 | 5 | ignore_feat = ['label', 'type', 'user_id', 'cate','shop_id', 'sku_id','action_time', 'dt', 'market_time', 'shop_reg_tm', 6 | 'user_reg_tm', 'vender_id', 'module_id'] 7 | 8 | label_start_date = '2018-04-09' 9 | label_end_date = '2018-04-16' 10 | train_start_date = '2018-03-08' 11 | train_end_date = '2018-04-09' 12 | test_start_date = '2018-03-15' 13 | test_end_date = '2018-04-16' 14 | 15 | training_data = make_train_set_F12_7(train_start_date, train_end_date, label_start_date, label_end_date, start='2018-02-01') 16 | sub_training_data = make_test_set_F12_7(test_start_date, test_end_date, start='2018-02-01') 17 | 18 | # train 19 | feats = [f for f in training_data.columns if f not in ignore_feat] 20 | print(feats) 21 | print(len(feats)) 22 | label = training_data['label'].copy() 23 | user_index = training_data[['user_id', 'cate', 'shop_id']].copy() 24 | train = training_data[feats].copy() 25 | 26 | # test 27 | feats = [f for f in sub_training_data.columns if f not in ignore_feat] 28 | print(feats) 29 | print(len(feats)) 30 | sub_user_index = sub_training_data[['user_id', 
'cate', 'shop_id']].copy() 31 | test = sub_training_data[feats].copy() 32 | print('test shape: ', test.shape) 33 | 34 | del training_data, sub_training_data 35 | 36 | lgb_train_F12_7(train, label, test, sub_user_index) 37 | 38 | 39 | -------------------------------------------------------------------------------- /get_res.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | # 数据准备 提速 4 | action_path = "./data/jdata_action.csv" 5 | comment_path = "./data/jdata_comment.csv" 6 | product_path = "./data/jdata_product.csv" 7 | user_path = "./data/jdata_user.csv" 8 | shop_path = "./data/jdata_shop.csv" 9 | 10 | user = pd.read_csv(user_path, sep=',') 11 | product = pd.read_csv(product_path, sep=',') 12 | action = pd.read_csv(action_path, sep=',') 13 | comment = pd.read_csv(comment_path, sep=',') 14 | shop = pd.read_csv(shop_path, sep=',') 15 | 16 | pickle.dump(user, open('./cache/origin_user.pkl', 'wb')) 17 | pickle.dump(product, open('./cache/origin_product.pkl', 'wb')) 18 | pickle.dump(action, open('./cache/origin_action.pkl', 'wb')) 19 | pickle.dump(comment, open('./cache/origin_comment.pkl', 'wb')) 20 | pickle.dump(shop, open('./cache/origin_shop.pkl', 'wb')) 21 | 22 | """ 23 | F12_7 24 | """ 25 | from user_cate_shop import make_train_set_F12_7, make_test_set_F12_7,lgb_train_F12_7 26 | 27 | ignore_feat = ['label', 'type', 'user_id', 'cate','shop_id', 'sku_id','action_time', 'dt', 'market_time', 'shop_reg_tm', 28 | 'user_reg_tm', 'vender_id', 'module_id'] 29 | 30 | label_start_date = '2018-04-09' 31 | label_end_date = '2018-04-16' 32 | train_start_date = '2018-03-08' 33 | train_end_date = '2018-04-09' 34 | test_start_date = '2018-03-15' 35 | test_end_date = '2018-04-16' 36 | 37 | training_data = make_train_set_F12_7(train_start_date, train_end_date, label_start_date, label_end_date, start='2018-02-01') 38 | sub_training_data = make_test_set_F12_7(test_start_date, test_end_date, 
start='2018-02-01') 39 | 40 | # train 41 | feats = [f for f in training_data.columns if f not in ignore_feat] 42 | print(feats) 43 | print(len(feats)) 44 | label = training_data['label'].copy() 45 | train = training_data[feats].copy() 46 | 47 | # test 48 | feats = [f for f in sub_training_data.columns if f not in ignore_feat] 49 | print(feats) 50 | print(len(feats)) 51 | sub_user_index = sub_training_data[['user_id', 'cate', 'shop_id']].copy() 52 | test = sub_training_data[feats].copy() 53 | print('test shape: ', test.shape) 54 | 55 | lgb_train_F12_7(train, label, test, sub_user_index) 56 | 57 | """ 58 | F11_7 59 | """ 60 | from user_cate import make_train_set_F11_7, make_test_set_F11_7, lgb_train_F11_7 61 | 62 | training_data = make_train_set_F11_7(train_start_date, train_end_date, label_start_date, label_end_date) 63 | sub_training_data = make_test_set_F11_7(test_start_date, test_end_date) 64 | 65 | # train 66 | feats_train = [f for f in training_data.columns if f not in ignore_feat] 67 | print(len(feats_train)) 68 | label = training_data['label'].copy() 69 | print('train shape: ', training_data.shape) 70 | train = training_data[feats_train].copy() 71 | print('train shape: ', train.shape) 72 | 73 | # test 74 | feats_test = [f for f in sub_training_data.columns if f not in ignore_feat] 75 | print(feats_test) 76 | print(len(feats_test)) 77 | sub_user_index = sub_training_data[['user_id', 'cate']].copy() 78 | test = sub_training_data[feats_test].copy() 79 | print('test shape: ', test.shape) 80 | 81 | # 训练 82 | lgb_train_F11_7(train, label, test, sub_user_index) 83 | 84 | """ 85 | F12_5 86 | """ 87 | from user_cate_shop2 import make_train_set_F12_5, make_test_set_F12_5, lgb_train_F12_5 88 | test_start_date = '2018-03-15' 89 | test_end_date = '2018-04-16' 90 | 91 | label_start_date = '2018-04-11' 92 | label_end_date = '2018-04-16' 93 | train_start_date = '2018-03-10' 94 | train_end_date = '2018-04-11' 95 | 96 | training_data = make_train_set_F12_5(train_start_date, 
train_end_date, label_start_date, label_end_date, 30) 97 | feats = [f for f in training_data.columns if f not in ignore_feat] 98 | print(feats) 99 | print(len(feats)) 100 | label = training_data['label'].copy() 101 | train = training_data[feats].values 102 | 103 | sub_training_data = make_test_set_F12_5(test_start_date, test_end_date, 30) 104 | feats = [f for f in sub_training_data.columns if f not in ignore_feat] 105 | print(feats) 106 | print(len(feats)) 107 | sub_user_index = sub_training_data[['user_id', 'cate', 'shop_id']].copy() 108 | test = sub_training_data[feats].values 109 | print('test shape: ', test.shape) 110 | 111 | lgb_train_F12_5(train, label, test, sub_user_index) 112 | 113 | """ 114 | F11_5 115 | """ 116 | from user_cate2 import make_train_set_F11_5, make_test_set_F11_5, lgb_train_F11_5 117 | training_data = make_train_set_F11_5(train_start_date, train_end_date, label_start_date, label_end_date, 30) 118 | 119 | feats = [f for f in training_data.columns if f not in ignore_feat] 120 | print(feats) 121 | print(len(feats)) 122 | label = training_data['label'].copy() 123 | user_index = training_data[['user_id', 'cate']].copy() 124 | train = training_data[feats].copy() 125 | del training_data 126 | 127 | sub_training_data = make_test_set_F11_5(test_start_date, test_end_date, 30) 128 | feats = [f for f in sub_training_data.columns if f not in ignore_feat] 129 | print(feats) 130 | print(len(feats)) 131 | sub_user_index = sub_training_data[['user_id', 'cate']].copy() 132 | test = sub_training_data[feats].copy() 133 | print('test shape: ', test.shape) 134 | del sub_training_data 135 | lgb_train_F11_5(train, label, test, sub_user_index) 136 | 137 | """ 138 | MERGE1 139 | """ 140 | import pandas as pd 141 | 142 | f11_col = ['user_id','cate'] 143 | f12_col = ['user_id','cate','shop_id'] 144 | 145 | # 新的的F11 F12结果 146 | f11_new_best_prob = pd.read_csv('./res/sub_F11_7.csv') 147 | f12_new_best_prob = pd.read_csv('./res/sub_F12_7.csv') 148 | 149 | # 5天的结果 150 | 
f12_5days_F11_prob = pd.read_csv('./res/sub_F11_5.csv')
f12_5days_F12_prob = pd.read_csv('./res/sub_F12_5.csv')


def get_old_best(f11_best_prob, f12_best_prob):
    '''
    Blend the ranked F11 (user/cate) and F12 (user/cate/shop) predictions
    into a single submission frame.
    '''
    f11_col = ['user_id','cate']
    f12_col = ['user_id','cate','shop_id']
    # Keep only the top-32k rows of each ranked prediction list.
    f11_best_prob = f11_best_prob[f11_col][:32000]
    f12_best_prob = f12_best_prob[f12_col][:32000]
    # Unique (user, cate) pairs from F11, expanded to shops via the F12 list.
    unique_pairs = f11_best_prob.drop_duplicates(f11_col)[f11_col]
    expanded = unique_pairs.merge(f12_best_prob, on=f11_col, how='inner')
    # F12's own top-15k takes priority; de-dup keeps the first occurrence.
    stacked = pd.concat([f12_best_prob.head(15000), expanded], axis=0, ignore_index=True)
    output_csv = stacked.drop_duplicates(f12_col, keep='first')
    print('output_csv.shape:', output_csv.shape)
    return output_csv


# Blend the 5-day F11/F12 pair.
f12_5days_best_prob = get_old_best(f12_5days_F11_prob, f12_5days_F12_prob)

# Blend the 7-day F11/F12 pair.
new_best = get_old_best(f11_new_best_prob, f12_new_best_prob)

# Head slices of the 5-day blend, candidates to append to the 7-day blend.
fiveDay20k = f12_5days_best_prob[f12_col].head(20000)
fiveDay15k = f12_5days_best_prob[f12_col].head(15000)
fiveDay10k = f12_5days_best_prob[f12_col].head(10000)
fiveDay5k = f12_5days_best_prob[f12_col].head(5000)

print('原结果:', new_best.shape[0])

merge5day_5k = pd.concat([new_best, fiveDay5k], ignore_index=True)
print('融合5天模型5k去重:', merge5day_5k.drop_duplicates().shape[0])

merge5day_10k = pd.concat([new_best, fiveDay10k], ignore_index=True)
print('融合5天模型10k去重:', merge5day_10k.drop_duplicates().shape[0])

merge5day_15k = pd.concat([new_best, fiveDay15k], ignore_index=True)
print('融合5天模型15k去重:', merge5day_15k.drop_duplicates().shape[0])

merge5day_20k = pd.concat([new_best, fiveDay20k], ignore_index=True)
print('融合5天模型20k去重:', merge5day_20k.drop_duplicates().shape[0])

# Write results (adjust output paths as needed).
merge5day_5k.drop_duplicates().to_csv('./res/merge5day_5k.csv', index=False)
merge5day_10k.drop_duplicates().to_csv('./res/merge5day_10k.csv',index=False) 200 | merge5day_15k.drop_duplicates().to_csv('./res/merge5day_15k.csv',index=False) 201 | merge5day_20k.drop_duplicates().to_csv('./res/merge5day_20k.csv',index=False) 202 | new_best.drop_duplicates().to_csv('./res/new_best.csv',index=False) 203 | 204 | 205 | """ 206 | MERGE2 207 | """ 208 | # 新的的F11 F12结果 209 | f11_best_prob = pd.read_csv('./res/sub_F11_7.csv') 210 | f12_best_prob = pd.read_csv('./res/sub_F12_7.csv') 211 | 212 | # 5天的结果 213 | f11_5days_prob = pd.read_csv('./res/sub_F11_5.csv') 214 | f12_5days_prob = pd.read_csv('./res/sub_F12_5.csv') 215 | 216 | df1 = pd.merge(f11_best_prob, f11_5days_prob, on=f12_col, how='outer') 217 | df1 = df1.fillna(0) 218 | print(df1.shape) 219 | 220 | df2 = pd.merge(f12_best_prob, f12_5days_prob, on=f12_col, how='outer') 221 | df2 = df2.fillna(0) 222 | print(df2.shape) 223 | 224 | # 6:4 225 | df1['label']=0.6*df1['label_x']+0.4*df1['label_y'] 226 | df1.sort_values(by=['label'], ascending=[0],inplace=True) 227 | df1 = df1.head(32000) 228 | df2['label']=0.6*df2['label_x']+0.4*df2['label_y'] 229 | df2.sort_values(by=['label'], ascending=[0],inplace=True) 230 | df2 = df2.head(32000) 231 | print('6/4: ') 232 | sub1 = get_old_best(df1, df2) 233 | 234 | # 5:5 235 | df1['label']=0.5*df1['label_x']+0.5*df1['label_y'] 236 | df1.sort_values(by=['label'], ascending=[0],inplace=True) 237 | df1 = df1.head(32000) 238 | df2['label']=0.5*df2['label_x']+0.5*df2['label_y'] 239 | df2.sort_values(by=['label'], ascending=[0],inplace=True) 240 | df2 = df2.head(32000) 241 | print('5/5: ') 242 | sub2 = get_old_best(df1, df2) 243 | 244 | # 7:3 245 | df1['label']=0.7*df1['label_x']+0.3*df1['label_y'] 246 | df1.sort_values(by=['label'], ascending=[0],inplace=True) 247 | df1 = df1.head(32000) 248 | df2['label']=0.7*df2['label_x']+0.3*df2['label_y'] 249 | df2.sort_values(by=['label'], ascending=[0],inplace=True) 250 | df2 = df2.head(32000) 251 | print('7/3: ') 252 | sub3 = 
get_old_best(df1, df2) 253 | 254 | a = pd.merge(sub1, sub2, on=['user_id', 'cate', 'shop_id']) 255 | b = pd.merge(sub1, sub3, on=['user_id', 'cate', 'shop_id']) 256 | c = pd.merge(sub2, sub3, on=['user_id', 'cate', 'shop_id']) 257 | print("6/4与5/5: ", a.shape) 258 | print("6/4与7/3: ", b.shape) 259 | print("5/5与7/3: ", c.shape) 260 | 261 | sub2.drop_duplicates().to_csv('./res/merge_2.csv',index=False) -------------------------------------------------------------------------------- /user_cate.py: -------------------------------------------------------------------------------- 1 | from user_cate_shop import * 2 | 3 | 4 | # 行为比例特征(2.01-4.08) 滑窗 5 | def get_accumulate_user_feat_v1(start_date, end_date): 6 | dump_path = './cache/user_feat_accumulate_F11_7_%s_%s.pkl' % (start_date, end_date) 7 | 8 | if os.path.exists(dump_path): 9 | f11_actions = pd.read_pickle(dump_path) 10 | else: 11 | actions = get_actions_product(start_date, end_date) 12 | 13 | df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date)) 14 | actions = pd.concat([actions[['user_id', 'cate']], df], axis=1) 15 | 16 | # 索引 17 | f11_actions = actions[['user_id', 'cate']].drop_duplicates() 18 | 19 | actions1 = actions.drop(['cate'], axis=1) 20 | actions1 = actions1.groupby(['user_id'], as_index=False).sum().add_prefix('user_id_') 21 | actions1['user_action_1_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_1' % (start_date, end_date)] 22 | actions1['user_action_4_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_4' % (start_date, end_date)] 23 | actions1['user_action_3_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_3' % (start_date, end_date)] 24 | actions1.rename(columns={'user_id_user_id': 'user_id'}, inplace=True) 25 | 26 | 
actions2 = actions.drop(['user_id'], axis=1) 27 | actions2 = actions2.groupby(['cate'], as_index=False).sum().add_prefix('cate_') 28 | actions2['cate_action_1_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_1' % (start_date, end_date)] 29 | actions2['cate_action_4_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_4' % (start_date, end_date)] 30 | actions2['cate_action_3_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_3' % (start_date, end_date)] 31 | actions2.rename(columns={'cate_cate': 'cate'}, inplace=True) 32 | 33 | actions4 = actions 34 | actions4 = actions4.groupby(['user_id', 'cate'], as_index=False).sum().add_prefix('user_cate_shop_id_') 35 | actions4['user_cate_shop_id_action_1_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_1' % (start_date, end_date)] 36 | actions4['user_cate_shop_id_action_4_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_4' % (start_date, end_date)] 37 | actions4['user_cate_shop_id_action_3_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_3' % (start_date, end_date)] 38 | actions4.rename(columns={'user_cate_shop_id_user_id': 'user_id', 'user_cate_shop_id_cate': 'cate'}, inplace=True) 39 | 40 | # 拼接 41 | f11_actions = f11_actions.merge(actions1, on='user_id', how='left') 42 | f11_actions = f11_actions.merge(actions2, on='cate', how='left') 43 | f11_actions = f11_actions.merge(actions4, on=['user_id', 'cate'], how='left') 44 | # f11_actions.to_pickle(dump_path) 45 | 46 | print('accumulate user 
finished') 47 | return f11_actions 48 | 49 | 50 | # 读取行为数据,与产品数据拼接(用于生成购物车特征) 51 | def get_actions_product_cart(start_date, end_date): 52 | dump_path = './cache/all_action_product_cart__F11_7_%s_%s.pkl' % (start_date, end_date) 53 | if os.path.exists(dump_path): 54 | actions = pd.read_pickle(dump_path) 55 | else: 56 | actions = pd.read_pickle('./cache/origin_action.pkl') 57 | product = pd.read_pickle('./cache/origin_product.pkl') 58 | shop = pd.read_pickle('./cache/origin_shop.pkl') 59 | actions['action_time'] = pd.to_datetime(actions['action_time']) 60 | actions = actions[(actions.action_time >= start_date) & (actions.action_time < end_date)] 61 | actions = actions[actions['sku_id'].isin(product['sku_id'])] # 行为中sku_id不在product中的 62 | actions = pd.merge(actions, product, on='sku_id', how='left') 63 | actions = actions[actions['cate'] != 13] # cate13的数据没有购买行为 64 | actions = pd.merge(actions, shop[['shop_id', 'vender_id']], on=['shop_id'], how='left') 65 | print(actions.shape) 66 | actions = actions[actions['vender_id'] != 3666] # 数据没有购买行为 67 | print(actions.shape) 68 | # actions.to_pickle(dump_path) 69 | return actions 70 | 71 | 72 | def get_accumulate_user_feat_v1_cart(start_date, end_date): 73 | dump_path = './cache/user_feat_accumulate_F11_7_%s_%s.pkl' % (start_date, end_date) 74 | 75 | if os.path.exists(dump_path): 76 | f11_actions = pd.read_pickle(dump_path) 77 | else: 78 | actions = get_actions_product_cart(start_date, end_date) 79 | 80 | df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date)) 81 | actions = pd.concat([actions[['user_id', 'cate']], df], axis=1) 82 | 83 | # 索引 84 | f11_actions = actions[['user_id', 'cate']].drop_duplicates() 85 | 86 | actions1 = actions.drop(['cate'], axis=1) 87 | actions1 = actions1.groupby(['user_id'], as_index=False).sum().add_prefix('user_id_') 88 | actions1['user_action_1_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / 
actions1['user_id_%s-%s-action_1' % (start_date, end_date)] 89 | actions1['user_action_4_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_4' % (start_date, end_date)] 90 | actions1['user_action_3_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_3' % (start_date, end_date)] 91 | actions1['user_action_5_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % ( 92 | start_date, end_date)] / actions1['user_id_%s-%s-action_5' % (start_date, end_date)] 93 | 94 | actions1.rename(columns={'user_id_user_id': 'user_id'}, inplace=True) 95 | 96 | actions2 = actions.drop(['user_id'], axis=1) 97 | actions2 = actions2.groupby(['cate'], as_index=False).sum().add_prefix('cate_') 98 | actions2['cate_action_1_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_1' % (start_date, end_date)] 99 | actions2['cate_action_4_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_4' % (start_date, end_date)] 100 | actions2['cate_action_3_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_3' % (start_date, end_date)] 101 | actions2['cate_action_5_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % ( 102 | start_date, end_date)] / actions2['cate_%s-%s-action_5' % (start_date, end_date)] 103 | 104 | actions2.rename(columns={'cate_cate': 'cate'}, inplace=True) 105 | 106 | actions4 = actions 107 | actions4 = actions4.groupby(['user_id', 'cate'], as_index=False).sum().add_prefix('user_cate_shop_id_') 108 | actions4['user_cate_shop_id_action_1_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / 
actions4['user_cate_shop_id_%s-%s-action_1' % (start_date, end_date)] 109 | actions4['user_cate_shop_id_action_4_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_4' % (start_date, end_date)] 110 | actions4['user_cate_shop_id_action_3_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_3' % (start_date, end_date)] 111 | actions4['user_cate_shop_id_action_5_ratio_%s_%s' % (start_date, end_date)] = actions4[ 112 | 'user_cate_shop_id_%s-%s-action_2' % ( 113 | start_date, end_date)] / \ 114 | actions4[ 115 | 'user_cate_shop_id_%s-%s-action_5' % ( 116 | start_date, end_date)] 117 | 118 | actions4.rename(columns={'user_cate_shop_id_user_id': 'user_id', 'user_cate_shop_id_cate': 'cate'}, inplace=True) 119 | 120 | # 拼接 121 | f11_actions = f11_actions.merge(actions1, on='user_id', how='left') 122 | f11_actions = f11_actions.merge(actions2, on='cate', how='left') 123 | f11_actions = f11_actions.merge(actions4, on=['user_id', 'cate'], how='left') 124 | #f11_actions.to_pickle(dump_path) 125 | 126 | print('accumulate user finished') 127 | return f11_actions 128 | 129 | 130 | # 基础统计特征 131 | def get_stat_feat_v1(start_date, end_date): 132 | dump_path = './cache/stat_feat_accumulate_F11_7_%s_%s.pkl' % (start_date, end_date) 133 | if os.path.exists(dump_path): 134 | action = pd.read_pickle(dump_path) 135 | else: 136 | action = get_actions_product(start_date, end_date) 137 | action_index = action[['user_id', 'cate']].drop_duplicates() 138 | 139 | # 行为onehot 140 | action_type = pd.get_dummies(action['type']) 141 | action_type.columns = ['act_1', 'act_2', 'act_3', 'act_4'] 142 | action_type = action_type[['act_1', 'act_2', 'act_3', 'act_4']] 143 | action_type['cate'] = action['cate'] 144 | action_type['user_id'] = action['user_id'] 145 | action_type['shop_id'] = action['shop_id'] 
146 | 147 | # 基于user_id的统计特征 148 | user_stat = action[['user_id']].drop_duplicates() 149 | user_action_count = action.groupby('user_id')['type'].count() 150 | user_order_count = action_type.groupby('user_id')['act_2'].sum() 151 | user_order_rate = user_order_count / (user_action_count).fillna(0) 152 | user_cate_count = action.groupby('user_id')['cate'].nunique() 153 | user_sku_count = action.groupby('user_id')['sku_id'].nunique() 154 | user_shop_count = action.groupby('user_id')['shop_id'].nunique() 155 | 156 | user_stat['user_action_count_%s_%s' % (start_date, end_date)] = user_action_count 157 | user_stat['user_order_rate_%s_%s' % (start_date, end_date)] = user_order_rate 158 | user_stat['user_cate_count_%s_%s' % (start_date, end_date)] = user_cate_count 159 | user_stat['user_sku_count_%s_%s' % (start_date, end_date)] = user_sku_count 160 | user_stat['user_shop_count_%s_%s' % (start_date, end_date)] = user_shop_count 161 | 162 | # 基于cate的统计特征 163 | cate_stat = action[['cate']].drop_duplicates() 164 | 165 | # cate下的用户特征 166 | cate_user_count = action.groupby('cate')['user_id'].count() 167 | cate_user_nunique = action.groupby('cate')['user_id'].nunique() 168 | cate_order_count = action_type.groupby('cate')['act_2'].sum() 169 | cate_order_rate = cate_order_count / cate_user_count 170 | 171 | # cate下:购买用户/总用户 172 | cate_order_user_count = action_type.groupby(['cate', 'user_id'])['act_2'].sum().reset_index() 173 | cate_order_user_count = cate_order_user_count[cate_order_user_count.act_2 > 0].groupby('cate')['user_id'].nunique() 174 | cate_order_user_rate = (cate_order_user_count / cate_user_nunique) 175 | cate_sku_nunique = action.groupby('cate')['sku_id'].nunique() 176 | 177 | # cate下的店铺特征 178 | cate_shop_count = action.groupby('cate')['shop_id'].count() 179 | cate_shop_nunique = action.groupby('cate')['shop_id'].nunique() 180 | cate_shop_order_count = action_type.groupby('cate')['act_2'].sum() 181 | cate_shop_order_rate = cate_shop_order_count / cate_shop_count 182 
| 183 | # cate下: 购买店铺/总店铺 184 | cate_order_shop_count = action_type.groupby(['cate', 'shop_id'])['act_2'].sum().reset_index() 185 | cate_order_shop_count = cate_order_shop_count[cate_order_shop_count.act_2 > 0].groupby('cate')['shop_id'].nunique() 186 | cate_order_shop_rate = (cate_order_shop_count / cate_shop_nunique) 187 | 188 | cate_stat['cate_user_count_%s_%s' % (start_date, end_date)] = cate_user_count 189 | cate_stat['cate_user_nunique_%s_%s' % (start_date, end_date)] = cate_user_nunique 190 | cate_stat['cate_order_rate_%s_%s' % (start_date, end_date)] = cate_order_rate.fillna(0) 191 | cate_stat['cate_order_user_count_%s_%s' % (start_date, end_date)] = cate_order_user_count 192 | cate_stat['cate_order_user_rate_%s_%s' % (start_date, end_date)] = cate_order_user_rate 193 | cate_stat['cate_sku_nunique_%s_%s' % (start_date, end_date)] = cate_sku_nunique 194 | cate_stat['cate_shop_nunique_%s_%s' % (start_date, end_date)] = cate_shop_nunique 195 | 196 | cate_stat['cate_shop_order_rate_%s_%s' % (start_date, end_date)] = cate_shop_order_rate 197 | cate_stat['cate_order_shop_count_%s_%s' % (start_date, end_date)] = cate_order_shop_count 198 | cate_stat['cate_order_shop_rate_%s_%s' % (start_date, end_date)] = cate_order_shop_rate 199 | 200 | action = pd.merge(action_index, user_stat, on='user_id', how='left') 201 | action = pd.merge(action, cate_stat, on='cate', how='left') 202 | #action.to_pickle(dump_path) 203 | print('stat_feat finished') 204 | return action 205 | 206 | 207 | # 交叉特征 208 | def get_cross_feat_v1(start_date, end_date): 209 | dump_path = './cache/cross_feat_F11_7_%s_%s.pkl' % (start_date, end_date) 210 | if os.path.exists(dump_path): 211 | actions = pd.read_pickle(dump_path) 212 | else: 213 | actions = get_actions_product(start_date, end_date)[['user_id', 'cate']] 214 | actions['cnt'] = 0 215 | 216 | action1 = actions.groupby(['user_id', 'cate'], as_index=False).count() 217 | 218 | action2 = actions.groupby('user_id', as_index=False).count() 219 | del 
action2['cate'] 220 | action2.columns = ['user_id', 'user_cnt'] 221 | 222 | action3 = actions.groupby('cate', as_index=False).count() 223 | del action3['user_id'] 224 | action3.columns = ['cate', 'cate_cnt'] 225 | actions = pd.merge(action1, action2, how='left', on='user_id') 226 | actions = pd.merge(actions, action3, how='left', on='cate') 227 | 228 | actions['user_cnt'] = actions['cnt'] / actions['user_cnt'] 229 | actions['cate_cnt'] = actions['cnt'] / actions['cate_cnt'] 230 | del actions['cnt'] 231 | #pickle.dump(actions, open(dump_path, 'wb')) 232 | actions.columns = ['user_id', 'cate'] + ['cross_feat_' + str(i) for i in range(1, actions.shape[1] - 1)] 233 | print('cross feature finished') 234 | return actions 235 | 236 | 237 | # U_B对行为1,2,4,5进行 浏览次数/用户总浏览次数(或者物品的浏览次数) 238 | def get_user_feat15_v1(start_date, end_date): 239 | dump_path = './cache/user_feat15_v1_F11_7_%s_%s.pkl' % (start_date, end_date) 240 | if os.path.exists(dump_path): 241 | actions = pd.read_pickle(dump_path) 242 | actions.columns = ['user_id', 'cate'] + ['user_feat15_' + str(i) for i in 243 | range(1, actions.shape[1] - 1)] 244 | return actions 245 | else: 246 | temp = None 247 | df = get_actions_product(start_date, end_date)[['user_id', 'cate', 'type']] 248 | for i in (1, 2, 3): 249 | actions = df[df['type'] == i] 250 | action1 = actions.groupby(['user_id', 'cate'], as_index=False).count() 251 | action1.columns = ['user_id', 'cate', 'visit'] 252 | 253 | action2 = actions.groupby('user_id', as_index=False).count() 254 | del action2['type'] 255 | action2.columns = ['user_id', 'user_visits_cate'] 256 | 257 | action4 = actions.groupby('cate', as_index=False).count() 258 | del action4['type'] 259 | action4.columns = ['cate', 'cate_visits_user'] 260 | 261 | actions = pd.merge(action1, action2, how='left', on='user_id') 262 | actions = pd.merge(actions, action4, how='left', on='cate') 263 | 264 | actions['visit_rate_user1'] = actions['visit'] / actions['user_visits_cate'] 265 | 
actions['visit_rate_cate1'] = actions['visit'] / actions['cate_visits_user'] 266 | if temp is None: 267 | temp = actions 268 | else: 269 | temp = pd.merge(temp, actions, how="outer", on=['user_id', 'cate']) 270 | #pickle.dump(temp, open(dump_path, 'wb')) 271 | temp.columns = ['user_id', 'cate'] + ['user_feat15_' + str(i) for i in 272 | range(1, temp.shape[1] - 1)] 273 | return temp 274 | 275 | 276 | # 标签 277 | def get_labels_v1(start_date, end_date): 278 | dump_path = './cache/labels_F11_7_%s_%s.pkl' % (start_date, end_date) 279 | if os.path.exists(dump_path): 280 | actions = pd.read_pickle(dump_path) 281 | else: 282 | actions = get_actions_product(start_date, end_date) 283 | actions = actions[actions['type'] == 2] 284 | actions = actions.groupby(['user_id', 'cate'], as_index=False).sum() 285 | actions['label'] = 1 286 | actions = actions[['user_id', 'cate', 'label']] 287 | #actions.to_pickle(dump_path) 288 | print('label finished') 289 | return actions 290 | 291 | 292 | def make_train_set_F11_7(train_start_date, train_end_date, test_start_date, test_end_date, days=30): 293 | dump_path = './cache/train_set_F11_7_%s_%s_%s_%s.pkl' % ( 294 | train_start_date, train_end_date, test_start_date, test_end_date) 295 | if os.path.exists(dump_path): 296 | actions = pd.read_pickle(dump_path) 297 | else: 298 | # 索引 299 | f11_actions = get_actions_product(train_start_date, train_end_date) 300 | f11_actions = f11_actions[['user_id', 'cate']].drop_duplicates() 301 | 302 | # 标签 303 | labels = get_labels_v1(test_start_date, test_end_date) 304 | 305 | # 特征 306 | start_days = "2018-02-01" 307 | user = get_basic_user_feat() 308 | product_stat = get_product_stat_feat(start_days, train_end_date) 309 | time = get_time_feat(start_days, train_end_date) 310 | stat_feat = get_stat_feat_v1(start_days, train_end_date) 311 | user_feat = user_features(start_days, train_end_date) 312 | cross_feat = get_cross_feat_v1(start_days, train_end_date) 313 | 314 | user_feat1 = get_user_feat1(start_days, 
train_end_date) 315 | user_feat2 = get_user_feat2(start_days, train_end_date) 316 | user_feat3 = get_user_feat3(start_days, train_end_date) 317 | user_feat5 = get_user_feat5(start_days, train_end_date) 318 | user_feat6 = get_user_feat6(start_days, train_end_date) 319 | user_feat7 = get_user_feat7(start_days, train_end_date) 320 | user_feat8 = get_user_feat8(start_days, train_end_date) 321 | user_feat9 = get_user_feat9(start_days, train_end_date) 322 | user_feat10 = get_user_feat10(start_days, train_end_date) 323 | user_feat11 = get_user_feat11(start_days, train_end_date) 324 | user_feat12 = get_user_feat12(start_days, train_end_date) 325 | user_feat13 = get_user_feat13(start_days, train_end_date) 326 | user_feat14 = get_user_feat14(start_days, train_end_date) 327 | user_feat15 = get_user_feat15_v1(start_days, train_end_date) 328 | 329 | cate_feat1 = get_cate_feat_1(start_days, train_end_date) 330 | cate_feat2 = get_cate_feat_2(start_days, train_end_date) 331 | cate_feat3 = get_cate_feat_3(start_days, train_end_date) 332 | cate_feat4 = get_cate_feat_4(start_days, train_end_date) 333 | cate_feat5 = get_cate_feat_5(start_days, train_end_date) 334 | cate_feat6 = get_cate_feat_6(start_days, train_end_date) 335 | cate_feat7 = get_cate_feat_7(start_days, train_end_date) 336 | cate_feat8 = get_cate_feat_8(start_days, train_end_date) 337 | cate_feat9 = get_cate_feat_9(start_days, train_end_date) 338 | cate_feat10 = get_cate_feat_10(start_days, train_end_date) 339 | cate_feat11 = get_cate_feat_11(start_days, train_end_date) 340 | 341 | F11_feat1 = get_F11_feat_1(start_days, train_end_date) 342 | F11_feat3 = get_F11_feat_3(start_days, train_end_date) 343 | F11_feat4 = get_F11_feat_4(start_days, train_end_date) 344 | F11_feat5 = get_F11_feat_5(start_days, train_end_date) 345 | F11_feat6 = get_F11_feat_6(start_days, train_end_date) 346 | F11_feat7 = get_F11_feat_7(start_days, train_end_date) 347 | F11_feat8 = get_F11_feat_8(start_days, train_end_date) 348 | F11_feat9 = 
get_F11_feat_9(start_days, train_end_date) 349 | F11_feat10 = get_F11_feat_10(start_days, train_end_date) 350 | F11_feat11 = get_F11_feat_11(start_days, train_end_date) 351 | 352 | # 滑窗行为特征 353 | actions = None 354 | for i in (3, 5, 7, 14, 21, 30): 355 | start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i) 356 | start_days = start_days.strftime('%Y-%m-%d') 357 | if actions is None: 358 | actions = get_accumulate_user_feat_v1(start_days, train_end_date) 359 | else: 360 | actions1 = get_accumulate_user_feat_v1(start_days, train_end_date) 361 | actions = pd.merge(actions, actions1, how='left', on=['user_id', 'cate']) 362 | 363 | # 前一天滑窗行为 包含cart 364 | start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=1) 365 | start_days = start_days.strftime('%Y-%m-%d') 366 | actions_cart = get_accumulate_user_feat_v1_cart(start_days, train_end_date) 367 | 368 | # act_5 369 | # act5_feat = pd.read_csv('./cache_final/train_lastday_act5_stat.csv') 370 | act5_feat = get_last1day_cart_fearture(start_days, train_end_date, 1) 371 | act5_feat = act5_feat.groupby(['user_id', 'cate'], as_index=False).sum() 372 | del act5_feat['shop_id'] 373 | 374 | f11_actions = pd.merge(f11_actions, labels, how='left', on=['user_id', 'cate']) 375 | f11_actions = f11_actions.fillna(0) 376 | 377 | # 负采样 378 | print('train data size:', f11_actions.shape[0]) 379 | f11_actions_1 = f11_actions[f11_actions['label'] == 1] 380 | f11_actions_0 = f11_actions[f11_actions['label'] == 0] 381 | frac1 = (f11_actions_1.shape[0] * 30) / f11_actions_0.shape[0] # 负样本为正样本30倍 382 | f11_actions_0 = f11_actions_0.sample(frac=frac1).reset_index(drop=True) 383 | f11_actions = pd.concat([f11_actions_1, f11_actions_0], axis=0, ignore_index=True) 384 | f11_actions = f11_actions.sample(frac=1).reset_index(drop=True) 385 | print('train data size after sample:', f11_actions.shape[0]) 386 | 387 | actions = pd.merge(f11_actions, actions, how='left', on=['user_id', 'cate']) 388 | actions = 
pd.merge(actions, actions_cart, how='left', on=['user_id', 'cate']) 389 | actions = pd.merge(actions, user, how='left', on='user_id') 390 | actions = pd.merge(actions, time, how='left', on='user_id') 391 | actions = pd.merge(actions, stat_feat, how='left', on=['user_id', 'cate']) 392 | actions = pd.merge(actions, product_stat, how='left', on='cate') 393 | 394 | actions = pd.merge(actions, user_feat1, how='left', on='user_id') 395 | actions = pd.merge(actions, user_feat2, how='left', on='user_id') 396 | actions = pd.merge(actions, user_feat3, how='left', on='user_id') 397 | actions = pd.merge(actions, user_feat5, how='left', on='user_id') 398 | actions = pd.merge(actions, user_feat6, how='left', on='user_id') 399 | actions = pd.merge(actions, user_feat7, how='left', on='user_id') 400 | actions = pd.merge(actions, user_feat8, how='left', on='user_id') 401 | actions = pd.merge(actions, user_feat9, how='left', on='user_id') 402 | actions = pd.merge(actions, user_feat10, how='left', on='user_id') 403 | actions = pd.merge(actions, user_feat11, how='left', on='user_id') 404 | actions = pd.merge(actions, user_feat12, how='left', on='user_id') 405 | actions = pd.merge(actions, user_feat13, how='left', on='user_id') 406 | actions = pd.merge(actions, user_feat14, how='left', on='user_id') 407 | actions = pd.merge(actions, user_feat15, how='left', on=['user_id', 'cate']) 408 | actions = pd.merge(actions, user_feat, how='left', on='user_id') 409 | """ 410 | cate 411 | """ 412 | actions = pd.merge(actions, cate_feat1, how='left', on='cate') 413 | actions = pd.merge(actions, cate_feat2, how='left', on='cate') 414 | actions = pd.merge(actions, cate_feat3, how='left', on='cate') 415 | actions = pd.merge(actions, cate_feat4, how='left', on='cate') 416 | actions = pd.merge(actions, cate_feat5, how='left', on='cate') 417 | actions = pd.merge(actions, cate_feat6, how='left', on='cate') 418 | actions = pd.merge(actions, cate_feat7, how='left', on='cate') 419 | actions = 
pd.merge(actions, cate_feat8, how='left', on='cate') 420 | actions = pd.merge(actions, cate_feat9, how='left', on='cate') 421 | actions = pd.merge(actions, cate_feat10, how='left', on='cate') 422 | actions = pd.merge(actions, cate_feat11, how='left', on='cate') # 用于concat 423 | print('actions1 finished') 424 | """ 425 | F11 426 | """ 427 | actions = pd.merge(actions, F11_feat1, how='left', on=['user_id', 'cate']) 428 | actions = pd.merge(actions, F11_feat3, how='left', on=['user_id', 'cate']) 429 | actions = pd.merge(actions, F11_feat4, how='left', on=['user_id', 'cate']) 430 | actions = pd.merge(actions, F11_feat5, how='left', on=['user_id', 'cate']) 431 | actions = pd.merge(actions, F11_feat6, how='left', on=['user_id', 'cate']) 432 | actions = pd.merge(actions, F11_feat7, how='left', on=['user_id', 'cate']) 433 | actions = pd.merge(actions, F11_feat8, how='left', on=['user_id', 'cate']) 434 | actions = pd.merge(actions, F11_feat9, how='left', on=['user_id', 'cate']) 435 | actions = pd.merge(actions, F11_feat10, how='left', on=['user_id', 'cate']) 436 | actions = pd.merge(actions, F11_feat11, how='left', on=['user_id', 'cate']) 437 | 438 | actions = pd.merge(actions, act5_feat, how='left', on=['user_id', 'cate']) 439 | actions = pd.merge(actions, cross_feat, how='left', on=['user_id', 'cate']) 440 | actions = actions.fillna(0) 441 | print('train_set finised') 442 | return actions 443 | 444 | 445 | def make_test_set_F11_7(train_start_date, train_end_date): 446 | dump_path = './cache/test_set_F11_7_%s_%s.pkl' % (train_start_date, train_end_date) 447 | if os.path.exists(dump_path): 448 | actions = pd.read_pickle(dump_path) 449 | else: 450 | # 索引 451 | f11_actions = get_actions_product(train_start_date, train_end_date) 452 | f11_actions = f11_actions[['user_id', 'cate']] .drop_duplicates() 453 | 454 | # 特征 455 | start_days = "2018-02-01" 456 | user = get_basic_user_feat() 457 | product_stat = get_product_stat_feat(start_days, train_end_date) 458 | time = 
get_time_feat(start_days, train_end_date) 459 | stat_feat = get_stat_feat_v1(start_days, train_end_date) 460 | user_feat = user_features(start_days, train_end_date) 461 | cross_feat = get_cross_feat_v1(start_days, train_end_date) 462 | 463 | user_feat1 = get_user_feat1(start_days, train_end_date) 464 | user_feat2 = get_user_feat2(start_days, train_end_date) 465 | user_feat3 = get_user_feat3(start_days, train_end_date) 466 | user_feat5 = get_user_feat5(start_days, train_end_date) 467 | user_feat6 = get_user_feat6(start_days, train_end_date) 468 | user_feat7 = get_user_feat7(start_days, train_end_date) 469 | user_feat8 = get_user_feat8(start_days, train_end_date) 470 | user_feat9 = get_user_feat9(start_days, train_end_date) 471 | user_feat10 = get_user_feat10(start_days, train_end_date) 472 | user_feat11 = get_user_feat11(start_days, train_end_date) 473 | user_feat12 = get_user_feat12(start_days, train_end_date) 474 | user_feat13 = get_user_feat13(start_days, train_end_date) 475 | user_feat14 = get_user_feat14(start_days, train_end_date) 476 | user_feat15 = get_user_feat15_v1(start_days, train_end_date) 477 | 478 | 479 | cate_feat1 = get_cate_feat_1(start_days, train_end_date) 480 | cate_feat2 = get_cate_feat_2(start_days, train_end_date) 481 | cate_feat3 = get_cate_feat_3(start_days, train_end_date) 482 | cate_feat4 = get_cate_feat_4(start_days, train_end_date) 483 | cate_feat5 = get_cate_feat_5(start_days, train_end_date) 484 | cate_feat6 = get_cate_feat_6(start_days, train_end_date) 485 | cate_feat7 = get_cate_feat_7(start_days, train_end_date) 486 | cate_feat8 = get_cate_feat_8(start_days, train_end_date) 487 | cate_feat9 = get_cate_feat_9(start_days, train_end_date) 488 | cate_feat10 = get_cate_feat_10(start_days, train_end_date) 489 | cate_feat11 = get_cate_feat_11(start_days, train_end_date) 490 | 491 | F11_feat1 = get_F11_feat_1(start_days, train_end_date) 492 | F11_feat3 = get_F11_feat_3(start_days, train_end_date) 493 | F11_feat4 = 
get_F11_feat_4(start_days, train_end_date) 494 | F11_feat5 = get_F11_feat_5(start_days, train_end_date) 495 | F11_feat6 = get_F11_feat_6(start_days, train_end_date) 496 | F11_feat7 = get_F11_feat_7(start_days, train_end_date) 497 | F11_feat8 = get_F11_feat_8(start_days, train_end_date) 498 | F11_feat9 = get_F11_feat_9(start_days, train_end_date) 499 | F11_feat10 = get_F11_feat_10(start_days, train_end_date) 500 | F11_feat11 = get_F11_feat_11(start_days, train_end_date) 501 | 502 | # generate 时间窗口 503 | actions = None 504 | for i in (3, 5, 7, 14, 21, 30): 505 | start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i) 506 | start_days = start_days.strftime('%Y-%m-%d') 507 | if actions is None: 508 | actions = get_accumulate_user_feat_v1(start_days, train_end_date) 509 | else: 510 | actions1 = get_accumulate_user_feat_v1(start_days, train_end_date) 511 | actions = pd.merge(actions, actions1, how='left', on=['user_id', 'cate']) 512 | print(actions.shape) 513 | 514 | # 前一天滑窗行为 包含cart 515 | start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=1) 516 | start_days = start_days.strftime('%Y-%m-%d') 517 | actions_cart = get_accumulate_user_feat_v1_cart(start_days, train_end_date) 518 | 519 | # act_5 520 | # act5_feat = pd.read_csv('./cache_final/test_lastday_act5_stat.csv') 521 | act5_feat = get_last1day_cart_fearture(start_days, train_end_date, 1) 522 | act5_feat = act5_feat.groupby(['user_id', 'cate'], as_index=False).sum() 523 | del act5_feat['shop_id'] 524 | 525 | actions = pd.merge(f11_actions, actions, how='left', on=['user_id', 'cate']) 526 | actions = pd.merge(actions, actions_cart, how='left', on=['user_id', 'cate']) 527 | actions = pd.merge(actions, user, how='left', on='user_id') 528 | actions = pd.merge(actions, time, how='left', on='user_id') 529 | actions = pd.merge(actions, stat_feat, how='left', on=['user_id', 'cate']) 530 | actions = pd.merge(actions, product_stat, how='left', on='cate') 531 | 532 | actions = 
pd.merge(actions, user_feat1, how='left', on='user_id') 533 | actions = pd.merge(actions, user_feat2, how='left', on='user_id') 534 | actions = pd.merge(actions, user_feat3, how='left', on='user_id') 535 | actions = pd.merge(actions, user_feat5, how='left', on='user_id') 536 | actions = pd.merge(actions, user_feat6, how='left', on='user_id') 537 | actions = pd.merge(actions, user_feat7, how='left', on='user_id') 538 | actions = pd.merge(actions, user_feat8, how='left', on='user_id') 539 | actions = pd.merge(actions, user_feat9, how='left', on='user_id') 540 | actions = pd.merge(actions, user_feat10, how='left', on='user_id') 541 | actions = pd.merge(actions, user_feat11, how='left', on='user_id') 542 | actions = pd.merge(actions, user_feat12, how='left', on='user_id') 543 | actions = pd.merge(actions, user_feat13, how='left', on='user_id') 544 | actions = pd.merge(actions, user_feat14, how='left', on='user_id') 545 | actions = pd.merge(actions, user_feat15, how='left', on=['user_id', 'cate']) 546 | actions = pd.merge(actions, user_feat, how='left', on='user_id') 547 | """ 548 | cate 549 | """ 550 | actions = pd.merge(actions, cate_feat1, how='left', on='cate') 551 | actions = pd.merge(actions, cate_feat2, how='left', on='cate') 552 | actions = pd.merge(actions, cate_feat3, how='left', on='cate') 553 | actions = pd.merge(actions, cate_feat4, how='left', on='cate') 554 | actions = pd.merge(actions, cate_feat5, how='left', on='cate') 555 | actions = pd.merge(actions, cate_feat6, how='left', on='cate') 556 | actions = pd.merge(actions, cate_feat7, how='left', on='cate') 557 | actions = pd.merge(actions, cate_feat8, how='left', on='cate') 558 | actions = pd.merge(actions, cate_feat9, how='left', on='cate') 559 | actions = pd.merge(actions, cate_feat10, how='left', on='cate') 560 | actions = pd.merge(actions, cate_feat11, how='left', on='cate') 561 | print('actions1 finished') 562 | """ 563 | F11 564 | """ 565 | actions = pd.merge(actions, F11_feat1, how='left', 
on=['user_id', 'cate']) 566 | actions = pd.merge(actions, F11_feat3, how='left', on=['user_id', 'cate']) 567 | actions = pd.merge(actions, F11_feat4, how='left', on=['user_id', 'cate']) 568 | actions = pd.merge(actions, F11_feat5, how='left', on=['user_id', 'cate']) 569 | actions = pd.merge(actions, F11_feat6, how='left', on=['user_id', 'cate']) 570 | actions = pd.merge(actions, F11_feat7, how='left', on=['user_id', 'cate']) 571 | actions = pd.merge(actions, F11_feat8, how='left', on=['user_id', 'cate']) 572 | actions = pd.merge(actions, F11_feat9, how='left', on=['user_id', 'cate']) 573 | actions = pd.merge(actions, F11_feat10, how='left', on=['user_id', 'cate']) 574 | actions = pd.merge(actions, F11_feat11, how='left', on=['user_id', 'cate']) 575 | 576 | actions = pd.merge(actions, act5_feat, how='left', on=['user_id', 'cate']) 577 | actions = pd.merge(actions, cross_feat, how='left', on=['user_id', 'cate']) 578 | actions = actions.fillna(0) 579 | del stat_feat, f11_actions 580 | print('test_set finished') 581 | return actions 582 | 583 | 584 | def lgb_train_F11_7(X_train1, y_train1, X_test1, sub_user_index): 585 | # 提交结果 586 | sub = sub_user_index[['user_id', 'cate']].copy() 587 | sub['shop_id'] = 0 588 | sub['label'] = 0 589 | 590 | # 训练测试集 591 | X_train = X_train1.values 592 | y_train = y_train1.values 593 | X_test = X_test1.values 594 | 595 | del X_train1, y_train1, X_test1 596 | 597 | print('================================') 598 | print(X_train.shape) 599 | print(X_test.shape) 600 | print('================================') 601 | 602 | xx_logloss = [] 603 | oof_preds = np.zeros(X_train.shape[0]) 604 | N = 5 605 | skf = StratifiedKFold(n_splits=N, random_state=1024, shuffle=True) 606 | 607 | params = { 608 | 'learning_rate': 0.01, 609 | 'boosting_type': 'gbdt', 610 | 'objective': 'binary', 611 | 'metric': 'binary_logloss', 612 | 'num_leaves': 31, 613 | 'feature_fraction': 0.8, 614 | 'bagging_fraction': 0.8, 615 | 'bagging_freq': 5, 616 | 'seed': 1, 617 | 
'bagging_seed': 1, 618 | 'feature_fraction_seed': 7, 619 | 'min_data_in_leaf': 20, 620 | 'nthread': -1, # -1 621 | 'verbose': -1, 622 | } 623 | for k, (train_index, test_index) in enumerate(skf.split(X_train, y_train)): 624 | print('train _K_ flod', k) 625 | 626 | lgb_train = lgb.Dataset(X_train[train_index], y_train[train_index]) 627 | lgb_evals = lgb.Dataset(X_train[test_index], y_train[test_index], reference=lgb_train) 628 | 629 | lgbm = lgb.train(params, lgb_train, num_boost_round=50000, valid_sets=[lgb_train, lgb_evals], 630 | valid_names=['train', 'valid'], early_stopping_rounds=100, verbose_eval=200) 631 | 632 | sub['label'] += lgbm.predict(X_test, num_iteration=lgbm.best_iteration) / N 633 | oof_preds[test_index] = lgbm.predict(X_train[test_index], num_iteration=lgbm.best_iteration) 634 | xx_logloss.append(lgbm.best_score['valid']['binary_logloss']) 635 | print(xx_logloss) 636 | a = np.mean(xx_logloss) 637 | a = round(a, 5) 638 | print(a) 639 | 640 | sub = sub.sort_values(by='label', ascending=False) 641 | sub = sub.head(50000) 642 | sub = sub[['user_id', 'cate', 'shop_id','label']] 643 | 644 | sub.to_csv('./res/sub_F11_7.csv', index=False, index_label=False) -------------------------------------------------------------------------------- /user_cate2.py: -------------------------------------------------------------------------------- 1 | from user_cate_shop2 import * 2 | 3 | 4 | # 读取行为数据,与产品数据拼接(用于生成购物车特征) 5 | def get_actions_product_cart(start_date, end_date): 6 | dump_path = './cache/all_action_product_cart_F11_5_%s_%s.pkl' % (start_date, end_date) 7 | if os.path.exists(dump_path): 8 | actions = pd.read_pickle(dump_path) 9 | else: 10 | actions = pd.read_pickle('./cache/origin_action.pkl') 11 | product = pd.read_pickle('./cache/origin_product.pkl') 12 | shop = pd.read_pickle('./cache/origin_shop.pkl') 13 | actions['action_time'] = pd.to_datetime(actions['action_time']) 14 | actions = actions[(actions.action_time >= start_date) & (actions.action_time < 
end_date)] 15 | actions = actions[actions['sku_id'].isin(product['sku_id'])] # 行为中sku_id不在product中的 16 | actions = pd.merge(actions, product, on='sku_id', how='left') 17 | actions = actions[actions['cate'] != 13] # cate13的数据没有购买行为 18 | actions = pd.merge(actions, shop[['shop_id', 'vender_id']], on=['shop_id'], how='left') 19 | print(actions.shape) 20 | actions = actions[actions['vender_id'] != 3666] # 数据没有购买行为 21 | print(actions.shape) 22 | #actions.to_pickle(dump_path) 23 | return actions 24 | 25 | 26 | # 行为比例特征(2.01-4.08) 滑窗 27 | def get_accumulate_user_feat_v1(start_date, end_date): 28 | dump_path = './cache/user_feat_v1_accumulate_F11_5_%s_%s.pkl' % (start_date, end_date) 29 | 30 | if os.path.exists(dump_path): 31 | f11_actions = pd.read_pickle(dump_path) 32 | else: 33 | actions = get_actions_product(start_date, end_date) 34 | 35 | df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date)) 36 | actions = pd.concat([actions[['user_id', 'cate']], df], axis=1) 37 | 38 | # 索引 39 | f11_actions = actions[['user_id', 'cate']].drop_duplicates() 40 | 41 | actions1 = actions.drop(['cate'], axis=1) 42 | actions1 = actions1.groupby(['user_id'], as_index=False).sum().add_prefix('user_id_') 43 | actions1['user_action_1_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_1' % (start_date, end_date)] 44 | actions1['user_action_4_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_4' % (start_date, end_date)] 45 | actions1['user_action_3_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_3' % (start_date, end_date)] 46 | actions1.rename(columns={'user_id_user_id': 'user_id'}, inplace=True) 47 | 48 | actions2 = actions.drop(['user_id'], axis=1) 49 | actions2 = actions2.groupby(['cate'], 
as_index=False).sum().add_prefix('cate_') 50 | actions2['cate_action_1_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_1' % (start_date, end_date)] 51 | actions2['cate_action_4_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_4' % (start_date, end_date)] 52 | actions2['cate_action_3_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_3' % (start_date, end_date)] 53 | actions2.rename(columns={'cate_cate': 'cate'}, inplace=True) 54 | 55 | actions4 = actions 56 | actions4 = actions4.groupby(['user_id', 'cate'], as_index=False).sum().add_prefix('user_cate_shop_id_') 57 | actions4['user_cate_shop_id_action_1_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_1' % (start_date, end_date)] 58 | actions4['user_cate_shop_id_action_4_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_4' % (start_date, end_date)] 59 | actions4['user_cate_shop_id_action_3_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_3' % (start_date, end_date)] 60 | actions4.rename(columns={'user_cate_shop_id_user_id': 'user_id', 'user_cate_shop_id_cate': 'cate'}, inplace=True) 61 | 62 | # 拼接 63 | f11_actions = f11_actions.merge(actions1, on='user_id', how='left') 64 | f11_actions = f11_actions.merge(actions2, on='cate', how='left') 65 | f11_actions = f11_actions.merge(actions4, on=['user_id', 'cate'], how='left') 66 | #f11_actions.to_pickle(dump_path) 67 | print('accumulate user finished') 68 | return f11_actions 69 | 70 | 71 | def 
get_accumulate_user_cart_feat_v1(start_date, end_date): 72 | dump_path = './cache/user_cart_feat_v1_accumulate_F11_5_%s_%s.pkl' % (start_date, end_date) 73 | 74 | if os.path.exists(dump_path): 75 | f11_actions = pd.read_pickle(dump_path) 76 | else: 77 | actions = get_actions_product_cart(start_date, end_date) 78 | 79 | df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date)) 80 | actions = pd.concat([actions[['user_id', 'cate']], df], axis=1) 81 | 82 | # 索引 83 | f11_actions = actions[['user_id', 'cate']].drop_duplicates() 84 | 85 | actions1 = actions.drop(['cate'], axis=1) 86 | actions1 = actions1.groupby(['user_id'], as_index=False).sum().add_prefix('user_id_') 87 | actions1['user_action_1_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_1' % (start_date, end_date)] 88 | actions1['user_action_4_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_4' % (start_date, end_date)] 89 | actions1['user_action_3_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_3' % (start_date, end_date)] 90 | actions1['user_action_5_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % ( 91 | start_date, end_date)] / actions1['user_id_%s-%s-action_5' % (start_date, end_date)] 92 | 93 | actions1.rename(columns={'user_id_user_id': 'user_id'}, inplace=True) 94 | 95 | actions2 = actions.drop(['user_id'], axis=1) 96 | actions2 = actions2.groupby(['cate'], as_index=False).sum().add_prefix('cate_') 97 | actions2['cate_action_1_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_1' % (start_date, end_date)] 98 | actions2['cate_action_4_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, 
end_date)] / actions2['cate_%s-%s-action_4' % (start_date, end_date)] 99 | actions2['cate_action_3_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_3' % (start_date, end_date)] 100 | actions2['cate_action_5_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % ( 101 | start_date, end_date)] / actions2['cate_%s-%s-action_5' % (start_date, end_date)] 102 | 103 | actions2.rename(columns={'cate_cate': 'cate'}, inplace=True) 104 | 105 | actions4 = actions 106 | actions4 = actions4.groupby(['user_id', 'cate'], as_index=False).sum().add_prefix('user_cate_shop_id_') 107 | actions4['user_cate_shop_id_action_1_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_1' % (start_date, end_date)] 108 | actions4['user_cate_shop_id_action_4_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_4' % (start_date, end_date)] 109 | actions4['user_cate_shop_id_action_3_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_3' % (start_date, end_date)] 110 | actions4['user_cate_shop_id_action_5_ratio_%s_%s' % (start_date, end_date)] = actions4[ 111 | 'user_cate_shop_id_%s-%s-action_2' % ( 112 | start_date, end_date)] / \ 113 | actions4[ 114 | 'user_cate_shop_id_%s-%s-action_5' % ( 115 | start_date, end_date)] 116 | 117 | actions4.rename(columns={'user_cate_shop_id_user_id': 'user_id', 'user_cate_shop_id_cate': 'cate'}, inplace=True) 118 | 119 | # 拼接 120 | f11_actions = f11_actions.merge(actions1, on='user_id', how='left') 121 | f11_actions = f11_actions.merge(actions2, on='cate', how='left') 122 | f11_actions = f11_actions.merge(actions4, on=['user_id', 'cate'], how='left') 123 | 
# 基础统计特征 (basic statistical features)
def get_stat_feat_v1(start_date, end_date):
    """Basic statistical features per user and per cate over [start_date, end_date).

    From the product action log computes:
      * user level: total action count, order rate (act_2 / all actions),
        distinct cate / sku / shop counts;
      * cate level: user counts, order rates, the share of users and of shops
        with at least one purchase (act_2), distinct sku / shop counts.
    Returns one row per (user_id, cate) pair with both feature sets merged on.

    Bug fixes vs. the previous revision:
      * ``user_stat`` / ``cate_stat`` are now indexed by their key before the
        groupby Series are assigned, so values align by user_id / cate rather
        than by arbitrary positional row labels;
      * ``fillna(0)`` is applied to the order-rate ratio itself instead of to
        its (never-NaN) denominator.
    """
    dump_path = './cache/stat_feat_accumulate_v1_F11_5_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        action = pd.read_pickle(dump_path)
    else:
        action = get_actions_product(start_date, end_date)
        action_index = action[['user_id', 'cate']].drop_duplicates()

        # one-hot of the action type (act_2 appears to be the purchase action,
        # cf. get_labels_v1 which labels type == 2 — TODO confirm)
        action_type = pd.get_dummies(action['type'])
        action_type.columns = ['act_1', 'act_2', 'act_3', 'act_4']
        action_type = action_type[['act_1', 'act_2', 'act_3', 'act_4']]
        action_type['cate'] = action['cate']
        action_type['user_id'] = action['user_id']
        action_type['shop_id'] = action['shop_id']

        # ---- user-level statistics; index by user_id so the groupby Series
        # align on the key (fix for positional-label misalignment) ----
        user_stat = action[['user_id']].drop_duplicates().set_index('user_id')
        user_action_count = action.groupby('user_id')['type'].count()
        user_order_count = action_type.groupby('user_id')['act_2'].sum()
        # fillna belongs on the ratio, not the denominator (fix)
        user_order_rate = (user_order_count / user_action_count).fillna(0)
        user_cate_count = action.groupby('user_id')['cate'].nunique()
        user_sku_count = action.groupby('user_id')['sku_id'].nunique()
        user_shop_count = action.groupby('user_id')['shop_id'].nunique()

        user_stat['user_action_count_%s_%s' % (start_date, end_date)] = user_action_count
        user_stat['user_order_rate_%s_%s' % (start_date, end_date)] = user_order_rate
        user_stat['user_cate_count_%s_%s' % (start_date, end_date)] = user_cate_count
        user_stat['user_sku_count_%s_%s' % (start_date, end_date)] = user_sku_count
        user_stat['user_shop_count_%s_%s' % (start_date, end_date)] = user_shop_count
        user_stat = user_stat.reset_index()  # user_id back to a column for the merge

        # ---- cate-level statistics (same alignment fix) ----
        cate_stat = action[['cate']].drop_duplicates().set_index('cate')

        # users under each cate
        cate_user_count = action.groupby('cate')['user_id'].count()
        cate_user_nunique = action.groupby('cate')['user_id'].nunique()
        cate_order_count = action_type.groupby('cate')['act_2'].sum()
        cate_order_rate = cate_order_count / cate_user_count

        # per cate: users with >= 1 purchase / distinct users
        cate_order_user_count = action_type.groupby(['cate', 'user_id'])['act_2'].sum().reset_index()
        cate_order_user_count = cate_order_user_count[cate_order_user_count.act_2 > 0].groupby('cate')['user_id'].nunique()
        cate_order_user_rate = (cate_order_user_count / cate_user_nunique)
        cate_sku_nunique = action.groupby('cate')['sku_id'].nunique()

        # shops under each cate
        cate_shop_count = action.groupby('cate')['shop_id'].count()
        cate_shop_nunique = action.groupby('cate')['shop_id'].nunique()
        # NOTE: identical to cate_order_count above (grouped by cate only);
        # kept for column compatibility
        cate_shop_order_count = action_type.groupby('cate')['act_2'].sum()
        cate_shop_order_rate = cate_shop_order_count / cate_shop_count

        # per cate: shops with >= 1 purchase / distinct shops
        cate_order_shop_count = action_type.groupby(['cate', 'shop_id'])['act_2'].sum().reset_index()
        cate_order_shop_count = cate_order_shop_count[cate_order_shop_count.act_2 > 0].groupby('cate')['shop_id'].nunique()
        cate_order_shop_rate = (cate_order_shop_count / cate_shop_nunique)

        cate_stat['cate_user_count_%s_%s' % (start_date, end_date)] = cate_user_count
        cate_stat['cate_user_nunique_%s_%s' % (start_date, end_date)] = cate_user_nunique
        cate_stat['cate_order_rate_%s_%s' % (start_date, end_date)] = cate_order_rate.fillna(0)
        cate_stat['cate_order_user_count_%s_%s' % (start_date, end_date)] = cate_order_user_count
        cate_stat['cate_order_user_rate_%s_%s' % (start_date, end_date)] = cate_order_user_rate
        cate_stat['cate_sku_nunique_%s_%s' % (start_date, end_date)] = cate_sku_nunique
        cate_stat['cate_shop_nunique_%s_%s' % (start_date, end_date)] = cate_shop_nunique

        cate_stat['cate_shop_order_rate_%s_%s' % (start_date, end_date)] = cate_shop_order_rate
        cate_stat['cate_order_shop_count_%s_%s' % (start_date, end_date)] = cate_order_shop_count
        cate_stat['cate_order_shop_rate_%s_%s' % (start_date, end_date)] = cate_order_shop_rate
        cate_stat = cate_stat.reset_index()  # cate back to a column for the merge

        action = pd.merge(action_index, user_stat, on='user_id', how='left')
        action = pd.merge(action, cate_stat, on='cate', how='left')
        #action.to_pickle(dump_path)
    print('stat_feat finished')
    return action
# 交叉特征 (cross features)
def get_cross_feat_v1(start_date, end_date):
    """Cross features over [start_date, end_date): for each (user_id, cate)
    pair, its action count as a share of the user's total actions
    (cross_feat_1) and of the cate's total actions (cross_feat_2).
    """
    dump_path = './cache/cross_feat_v1_F11_5_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_pickle(dump_path)
    else:
        actions = get_actions_product(start_date, end_date)[['user_id', 'cate']]
        actions['cnt'] = 0  # dummy column so count() yields a row count

        # rows per (user, cate)
        action1 = actions.groupby(['user_id', 'cate'], as_index=False).count()

        # rows per user
        action2 = actions.groupby('user_id', as_index=False).count()
        del action2['cate']
        action2.columns = ['user_id', 'user_cnt']

        # rows per cate
        action3 = actions.groupby('cate', as_index=False).count()
        del action3['user_id']
        action3.columns = ['cate', 'cate_cnt']
        actions = pd.merge(action1, action2, how='left', on='user_id')
        actions = pd.merge(actions, action3, how='left', on='cate')

        # turn raw counts into shares, in place
        actions['user_cnt'] = actions['cnt'] / actions['user_cnt']
        actions['cate_cnt'] = actions['cnt'] / actions['cate_cnt']
        del actions['cnt']
        #pickle.dump(actions, open(dump_path, 'wb'))
    # generic rename happens on both branches (a cached frame is pre-rename)
    actions.columns = ['user_id', 'cate'] + ['cross_feat_' + str(i) for i in range(1, actions.shape[1] - 1)]
    print('cross feature finished')
    return actions


# U_B: per action type, (user, cate) visit count / the user's total count
# (or the cate's total count).
# NOTE(review): the original comment mentions action types 1, 2, 4, 5 but the
# loop below iterates (1, 2, 3) — confirm which is intended.
def get_user_feat15_v1(start_date, end_date):
    """Per-action-type visit-share features per (user_id, cate), outer-merged
    across types 1-3 and renamed to user_feat15_1..n.
    """
    dump_path = './cache/user_feat15_v1_F11_5_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_pickle(dump_path)
        actions.columns = ['user_id', 'cate'] + ['user_feat15_' + str(i) for i in
                                                 range(1, actions.shape[1] - 1)]
        return actions
    else:
        temp = None
        df = get_actions_product(start_date, end_date)[['user_id', 'cate', 'type']]
        for i in (1, 2, 3):
            actions = df[df['type'] == i]
            # rows per (user, cate) for this action type
            action1 = actions.groupby(['user_id', 'cate'], as_index=False).count()
            action1.columns = ['user_id', 'cate', 'visit']

            # rows per user
            action2 = actions.groupby('user_id', as_index=False).count()
            del action2['type']
            action2.columns = ['user_id', 'user_visits_cate']

            # rows per cate
            action4 = actions.groupby('cate', as_index=False).count()
            del action4['type']
            action4.columns = ['cate', 'cate_visits_user']

            actions = pd.merge(action1, action2, how='left', on='user_id')
            actions = pd.merge(actions, action4, how='left', on='cate')

            actions['visit_rate_user1'] = actions['visit'] / actions['user_visits_cate']
            actions['visit_rate_cate1'] = actions['visit'] / actions['cate_visits_user']
            if temp is None:
                temp = actions
            else:
                # outer merge keeps pairs that only occur for some action types
                temp = pd.merge(temp, actions, how="outer", on=['user_id', 'cate'])
        #pickle.dump(temp, open(dump_path, 'wb'))
        temp.columns = ['user_id', 'cate'] + ['user_feat15_' + str(i) for i in
                                              range(1, temp.shape[1] - 1)]
        return temp
def get_last1day_cart_fearture(start_date, end_date, day):
    """Cart-vs-purchase features over the `day` days ending at `end_date`.

    For every (user_id, cate, shop_id) triple returns:
      * lastday_sum_act_5 / lastday_sum_act_2 — summed cart / purchase counts,
      * cart_not_buy   — cart count where no purchase happened in the window,
      * cart_minus_buy — cart count minus purchase count.

    Note: `start_date` is not used; the window is derived from `end_date`.
    """
    win_end = pd.to_datetime(end_date)
    win_start = win_end - timedelta(days=day)

    # back to 'YYYY-MM-DD' strings for the loader
    win_end = str(win_end.date())
    win_start = str(win_start.date())
    acts = get_actions_product_cart(win_start, win_end)
    print('from:', acts.action_time.min(), ' to:', acts.action_time.max())

    # one-hot the action type as compact int8 columns act_1..act_5
    onehot = pd.get_dummies(acts.type, prefix='act').astype('int8')
    acts = pd.concat([acts[['user_id', 'cate', 'shop_id', 'sku_id', 'action_time']], onehot], axis=1)

    stats = (acts.groupby(['user_id', 'cate', 'shop_id'])[['act_5', 'act_2']]
                 .sum()
                 .add_prefix('lastday_sum_')
                 .reset_index())

    no_purchase = stats['lastday_sum_act_2'] == 0
    stats['cart_not_buy'] = stats['lastday_sum_act_5'] * no_purchase
    stats['cart_minus_buy'] = stats['lastday_sum_act_5'] - stats['lastday_sum_act_2']

    return stats
# 标签 (labels)
def get_labels_v1(start_date, end_date):
    """Label = 1 for every (user_id, cate) pair with at least one purchase
    (type == 2) inside [start_date, end_date); pairs without a purchase are
    simply absent (callers fill them with 0 after a left merge).
    """
    dump_path = './cache/labels_v1_F11_5_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_pickle(dump_path)
    else:
        actions = get_actions_product(start_date, end_date)
        actions = actions[actions['type'] == 2]
        actions = actions.groupby(['user_id', 'cate'], as_index=False).sum()
        actions['label'] = 1
        actions = actions[['user_id', 'cate', 'label']]
        #actions.to_pickle(dump_path)
    print('label finished')
    return actions


def make_train_set_F11_5(train_start_date, train_end_date, test_start_date, test_end_date, start):
    """Assemble the F11 (user, cate) training set.

    Candidates are all (user_id, cate) pairs active in
    [train_start_date, train_end_date); labels come from purchases in
    [test_start_date, test_end_date).  Feature families are computed on a
    fixed window starting 2018-02-01 plus sliding windows ending at
    train_end_date, then left-merged onto the negatively-sampled candidates.

    NOTE(review): `start` is never used in the body — confirm whether it can
    be dropped at the call sites.
    NOTE(review): labels use `get_labels` (presumably from the star-imported
    base module), not the local get_labels_v1 — confirm this is intentional.
    """
    dump_path = './cache/train_set_v1_F11_5_%s_%s_%s_%s.pkl' % (
        train_start_date, train_end_date, test_start_date, test_end_date)
    if os.path.exists(dump_path):
        actions = pd.read_pickle(dump_path)
    else:
        # candidate index: every (user, cate) pair active in the feature window
        f11_actions = get_actions_product(train_start_date, train_end_date)
        f11_actions = f11_actions.drop_duplicates(['user_id', 'cate'])
        f11_actions = f11_actions[['user_id', 'cate']]

        # labels from the following window
        labels = get_labels(test_start_date, test_end_date)

        # fixed-window features
        start_days = "2018-02-01"  #
        user = get_basic_user_feat()
        product_stat = get_product_stat_feat(start_days, train_end_date)
        time = get_time_feat(start_days, train_end_date)
        stat_feat = get_stat_feat_v1(start_days, train_end_date)
        user_feat = user_features(start_days, train_end_date)
        cross_feat = get_cross_feat_v1(start_days, train_end_date)

        # user-level feature families
        user_feat1 = get_user_feat1(start_days, train_end_date)
        user_feat2 = get_user_feat2(start_days, train_end_date)
        user_feat3 = get_user_feat3(start_days, train_end_date)
        user_feat5 = get_user_feat5(start_days, train_end_date)
        user_feat6 = get_user_feat6(start_days, train_end_date)
        user_feat7 = get_user_feat7(start_days, train_end_date)
        user_feat8 = get_user_feat8(start_days, train_end_date)
        user_feat9 = get_user_feat9(start_days, train_end_date)
        user_feat10 = get_user_feat10(start_days, train_end_date)
        user_feat11 = get_user_feat11(start_days, train_end_date)
        user_feat12 = get_user_feat12(start_days, train_end_date)
        user_feat13 = get_user_feat13(start_days, train_end_date)
        user_feat14 = get_user_feat14(start_days, train_end_date)
        user_feat15 = get_user_feat15_v1(start_days, train_end_date)  #

        # cate-level feature families
        cate_feat1 = get_cate_feat_1(start_days, train_end_date)
        cate_feat2 = get_cate_feat_2(start_days, train_end_date)
        cate_feat3 = get_cate_feat_3(start_days, train_end_date)
        cate_feat4 = get_cate_feat_4(start_days, train_end_date)
        cate_feat5 = get_cate_feat_5(start_days, train_end_date)
        cate_feat6 = get_cate_feat_6(start_days, train_end_date)
        cate_feat7 = get_cate_feat_7(start_days, train_end_date)
        cate_feat8 = get_cate_feat_8(start_days, train_end_date)
        cate_feat9 = get_cate_feat_9(start_days, train_end_date)
        cate_feat10 = get_cate_feat_10(start_days, train_end_date)
        cate_feat11 = get_cate_feat_11(start_days, train_end_date)

        # (user, cate)-level feature families
        F11_feat1 = get_F11_feat_1(start_days, train_end_date)
        F11_feat3 = get_F11_feat_3(start_days, train_end_date)
        F11_feat4 = get_F11_feat_4(start_days, train_end_date)
        F11_feat5 = get_F11_feat_5(start_days, train_end_date)
        F11_feat6 = get_F11_feat_6(start_days, train_end_date)
        F11_feat7 = get_F11_feat_7(start_days, train_end_date)
        F11_feat8 = get_F11_feat_8(start_days, train_end_date)
        F11_feat9 = get_F11_feat_9(start_days, train_end_date)
        F11_feat10 = get_F11_feat_10(start_days, train_end_date)
        F11_feat11 = get_F11_feat_11(start_days, train_end_date)

        # sliding-window behaviour features (5/7/14/21/30 days before train_end_date)
        actions = None
        for i in (5, 7, 14, 21, 30):
            start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)
            start_days = start_days.strftime('%Y-%m-%d')
            if actions is None:
                actions = get_accumulate_user_feat_v1(start_days, train_end_date)
            else:
                actions1 = get_accumulate_user_feat_v1(start_days, train_end_date)
                actions = pd.merge(actions, actions1, how='left', on=['user_id', 'cate'])

        # last-3-days window including cart actions
        start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=3)
        start_days = start_days.strftime('%Y-%m-%d')
        actions_cart = get_accumulate_user_cart_feat_v1(start_days, train_end_date)

        # act_5 (cart) features, collapsed from shop level to (user, cate)
        act5_feat = get_last1day_cart_fearture(start_days, train_end_date, 3)
        act5_feat = act5_feat.groupby(['user_id', 'cate'], as_index=False).sum()
        del act5_feat['shop_id']

        # negative sampling: keep all positives plus ~30x as many negatives
        f11_actions = pd.merge(f11_actions, labels, how='left', on=['user_id', 'cate'])
        f11_actions = f11_actions.fillna(0)
        print('train data size:', f11_actions.shape[0])
        f11_actions_1 = f11_actions[f11_actions['label'] == 1]
        f11_actions_0 = f11_actions[f11_actions['label'] == 0]
        frac1 = (f11_actions_1.shape[0] * 30) / f11_actions_0.shape[0]  # negatives = 30x positives
        f11_actions_0 = f11_actions_0.sample(frac=frac1).reset_index(drop=True)
        f11_actions = pd.concat([f11_actions_1, f11_actions_0], axis=0, ignore_index=True)
        f11_actions = f11_actions.sample(frac=1).reset_index(drop=True)  # shuffle
        print('train data size after sample:', f11_actions.shape[0])

        # merge every feature family onto the sampled candidates
        actions = pd.merge(f11_actions, actions, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, actions_cart, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, user, how='left', on='user_id')
        actions = pd.merge(actions, time, how='left', on='user_id')
        actions = pd.merge(actions, stat_feat, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, product_stat, how='left', on='cate')

        actions = pd.merge(actions, user_feat1, how='left', on='user_id')
        actions = pd.merge(actions, user_feat2, how='left', on='user_id')
        actions = pd.merge(actions, user_feat3, how='left', on='user_id')
        actions = pd.merge(actions, user_feat5, how='left', on='user_id')
        actions = pd.merge(actions, user_feat6, how='left', on='user_id')
        actions = pd.merge(actions, user_feat7, how='left', on='user_id')
        actions = pd.merge(actions, user_feat8, how='left', on='user_id')
        actions = pd.merge(actions, user_feat9, how='left', on='user_id')
        actions = pd.merge(actions, user_feat10, how='left', on='user_id')
        actions = pd.merge(actions, user_feat11, how='left', on='user_id')
        actions = pd.merge(actions, user_feat12, how='left', on='user_id')
        actions = pd.merge(actions, user_feat13, how='left', on='user_id')
        actions = pd.merge(actions, user_feat14, how='left', on='user_id')
        actions = pd.merge(actions, user_feat, how='left', on='user_id')
        actions = pd.merge(actions, user_feat15, how='left', on=['user_id', 'cate'])

        """
        cate
        """
        actions = pd.merge(actions, cate_feat1, how='left', on='cate')
        actions = pd.merge(actions, cate_feat2, how='left', on='cate')
        actions = pd.merge(actions, cate_feat3, how='left', on='cate')
        actions = pd.merge(actions, cate_feat4, how='left', on='cate')
        actions = pd.merge(actions, cate_feat5, how='left', on='cate')
        actions = pd.merge(actions, cate_feat6, how='left', on='cate')
        actions = pd.merge(actions, cate_feat7, how='left', on='cate')
        actions = pd.merge(actions, cate_feat8, how='left', on='cate')
        actions = pd.merge(actions, cate_feat9, how='left', on='cate')
        actions = pd.merge(actions, cate_feat10, how='left', on='cate')
        actions = pd.merge(actions, cate_feat11, how='left', on='cate')
        print('cate finished')
        """
        F11
        """
        actions = pd.merge(actions, F11_feat1, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat3, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat4, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat5, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat6, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat7, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat8, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat9, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat10, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat11, how='left', on=['user_id', 'cate'])
        print('F11 finished')

        actions = pd.merge(actions, act5_feat, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, cross_feat, how='left', on=['user_id', 'cate'])
        actions = actions.fillna(0)  # fills NaN only; inf from ratio features passes through
        # actions.to_pickle(dump_path)
    print('train_set finised')
    return actions
def make_test_set_F11_5(train_start_date, train_end_date, start):
    """Assemble the F11 (user, cate) prediction set.

    Mirrors make_train_set_F11_5 but without labels or negative sampling:
    candidates are all (user_id, cate) pairs active in
    [train_start_date, train_end_date), with the same feature families
    left-merged on.

    NOTE(review): `start` is never used in the body — confirm whether it can
    be dropped at the call sites.
    """
    dump_path = './cache/test_set_F11_5_%s_%s.pkl' % (train_start_date, train_end_date)
    if os.path.exists(dump_path):
        actions = pd.read_pickle(dump_path)
    else:
        # candidate index: every (user, cate) pair active in the feature window
        f11_actions = get_actions_product(train_start_date, train_end_date)
        f11_actions = f11_actions.drop_duplicates(['user_id', 'cate'])
        f11_actions = f11_actions[['user_id', 'cate']]  #

        # fixed-window features
        start_days = "2018-02-01"  #
        user = get_basic_user_feat()
        product_stat = get_product_stat_feat(start_days, train_end_date)
        time = get_time_feat(start_days, train_end_date)
        stat_feat = get_stat_feat_v1(start_days, train_end_date)
        user_feat = user_features(start_days, train_end_date)
        cross_feat = get_cross_feat_v1(start_days, train_end_date)

        # user-level feature families
        user_feat1 = get_user_feat1(start_days, train_end_date)
        user_feat2 = get_user_feat2(start_days, train_end_date)
        user_feat3 = get_user_feat3(start_days, train_end_date)
        user_feat5 = get_user_feat5(start_days, train_end_date)
        user_feat6 = get_user_feat6(start_days, train_end_date)
        user_feat7 = get_user_feat7(start_days, train_end_date)
        user_feat8 = get_user_feat8(start_days, train_end_date)
        user_feat9 = get_user_feat9(start_days, train_end_date)
        user_feat10 = get_user_feat10(start_days, train_end_date)
        user_feat11 = get_user_feat11(start_days, train_end_date)
        user_feat12 = get_user_feat12(start_days, train_end_date)
        user_feat13 = get_user_feat13(start_days, train_end_date)
        user_feat14 = get_user_feat14(start_days, train_end_date)
        user_feat15 = get_user_feat15_v1(start_days, train_end_date)  #

        # cate-level feature families
        cate_feat1 = get_cate_feat_1(start_days, train_end_date)
        cate_feat2 = get_cate_feat_2(start_days, train_end_date)
        cate_feat3 = get_cate_feat_3(start_days, train_end_date)
        cate_feat4 = get_cate_feat_4(start_days, train_end_date)
        cate_feat5 = get_cate_feat_5(start_days, train_end_date)
        cate_feat6 = get_cate_feat_6(start_days, train_end_date)
        cate_feat7 = get_cate_feat_7(start_days, train_end_date)
        cate_feat8 = get_cate_feat_8(start_days, train_end_date)
        cate_feat9 = get_cate_feat_9(start_days, train_end_date)
        cate_feat10 = get_cate_feat_10(start_days, train_end_date)
        cate_feat11 = get_cate_feat_11(start_days, train_end_date)

        # (user, cate)-level feature families
        F11_feat1 = get_F11_feat_1(start_days, train_end_date)
        F11_feat3 = get_F11_feat_3(start_days, train_end_date)
        F11_feat4 = get_F11_feat_4(start_days, train_end_date)
        F11_feat5 = get_F11_feat_5(start_days, train_end_date)
        F11_feat6 = get_F11_feat_6(start_days, train_end_date)
        F11_feat7 = get_F11_feat_7(start_days, train_end_date)
        F11_feat8 = get_F11_feat_8(start_days, train_end_date)
        F11_feat9 = get_F11_feat_9(start_days, train_end_date)
        F11_feat10 = get_F11_feat_10(start_days, train_end_date)
        F11_feat11 = get_F11_feat_11(start_days, train_end_date)

        # sliding-window behaviour features (5/7/14/21/30 days before train_end_date)
        actions = None
        for i in (5, 7, 14, 21, 30):
            start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)
            start_days = start_days.strftime('%Y-%m-%d')
            if actions is None:
                actions = get_accumulate_user_feat_v1(start_days, train_end_date)
            else:
                actions1 = get_accumulate_user_feat_v1(start_days, train_end_date)
                actions = pd.merge(actions, actions1, how='left', on=['user_id', 'cate'])

        # last-3-days window including cart actions
        start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=3)
        start_days = start_days.strftime('%Y-%m-%d')
        actions_cart = get_accumulate_user_cart_feat_v1(start_days, train_end_date)

        # act_5 (cart) features, collapsed from shop level to (user, cate)
        act5_feat = get_last1day_cart_fearture(start_days, train_end_date, 3)
        act5_feat = act5_feat.groupby(['user_id', 'cate'], as_index=False).sum()
        del act5_feat['shop_id']

        # merge every feature family onto the candidates
        actions = pd.merge(f11_actions, actions, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, actions_cart, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, user, how='left', on='user_id')
        actions = pd.merge(actions, time, how='left', on='user_id')
        actions = pd.merge(actions, stat_feat, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, product_stat, how='left', on='cate')

        actions = pd.merge(actions, user_feat1, how='left', on='user_id')
        actions = pd.merge(actions, user_feat2, how='left', on='user_id')
        actions = pd.merge(actions, user_feat3, how='left', on='user_id')
        actions = pd.merge(actions, user_feat5, how='left', on='user_id')
        actions = pd.merge(actions, user_feat6, how='left', on='user_id')
        actions = pd.merge(actions, user_feat7, how='left', on='user_id')
        actions = pd.merge(actions, user_feat8, how='left', on='user_id')
        actions = pd.merge(actions, user_feat9, how='left', on='user_id')
        actions = pd.merge(actions, user_feat10, how='left', on='user_id')
        actions = pd.merge(actions, user_feat11, how='left', on='user_id')
        actions = pd.merge(actions, user_feat12, how='left', on='user_id')
        actions = pd.merge(actions, user_feat13, how='left', on='user_id')
        actions = pd.merge(actions, user_feat14, how='left', on='user_id')
        actions = pd.merge(actions, user_feat, how='left', on='user_id')
        actions = pd.merge(actions, user_feat15, how='left', on=['user_id', 'cate'])

        """
        cate
        """
        actions = pd.merge(actions, cate_feat1, how='left', on='cate')
        actions = pd.merge(actions, cate_feat2, how='left', on='cate')
        actions = pd.merge(actions, cate_feat3, how='left', on='cate')
        actions = pd.merge(actions, cate_feat4, how='left', on='cate')
        actions = pd.merge(actions, cate_feat5, how='left', on='cate')
        actions = pd.merge(actions, cate_feat6, how='left', on='cate')
        actions = pd.merge(actions, cate_feat7, how='left', on='cate')
        actions = pd.merge(actions, cate_feat8, how='left', on='cate')
        actions = pd.merge(actions, cate_feat9, how='left', on='cate')
        actions = pd.merge(actions, cate_feat10, how='left', on='cate')
        actions = pd.merge(actions, cate_feat11, how='left', on='cate')
        print('cate finished')
        """
        F11
        """
        actions = pd.merge(actions, F11_feat1, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat3, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat4, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat5, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat6, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat7, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat8, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat9, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat10, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat11, how='left', on=['user_id', 'cate'])
        print('F11 finished')

        actions = pd.merge(actions, act5_feat, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, cross_feat, how='left', on=['user_id', 'cate'])
        actions = actions.fillna(0)  # fills NaN only; inf from ratio features passes through
        del stat_feat, f11_actions  # free the largest intermediates
    print('test_set finished')
    return actions
def lgb_train_F11_5(X_train1, y_train1, X_test1, sub_user_index):
    """Train a 5-fold LightGBM binary classifier and write ./res/sub_F11_5.csv.

    Parameters
    ----------
    X_train1, y_train1, X_test1 : DataFrame/Series or ndarray
        Train features/labels and test features.  ``np.asarray`` is used so
        both pandas objects and raw ndarrays are accepted — the companion
        driver scripts pass ``training_data[feats].values`` (ndarrays), which
        the previous ``.values`` access would have crashed on.
    sub_user_index : DataFrame
        'user_id' and 'cate' columns row-aligned with X_test1.

    Side effects: prints shapes / per-fold logloss and writes the 50000
    highest-scoring rows (shop_id fixed to 0) to ./res/sub_F11_5.csv.
    """
    # submission frame; shop_id is a constant placeholder for this track
    sub = sub_user_index[['user_id', 'cate']].copy()
    sub['shop_id'] = 0
    sub['label'] = 0

    # accept pandas objects or ndarrays alike (fix: `.values` on an ndarray
    # raises AttributeError)
    X_train = np.asarray(X_train1)
    y_train = np.asarray(y_train1)
    X_test = np.asarray(X_test1)

    del X_train1, y_train1, X_test1

    print('================================')
    print(X_train.shape)
    print(X_test.shape)
    print('================================')

    xx_logloss = []
    # out-of-fold predictions; computed for local inspection, not returned
    oof_preds = np.zeros(X_train.shape[0])
    N = 5
    skf = StratifiedKFold(n_splits=N, random_state=1024, shuffle=True)

    params = {
        'learning_rate': 0.01,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'num_leaves': 31,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': 1,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': -1,  # -1
        'verbose': -1,
    }
    for k, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        print('train _K_ flod', k)

        lgb_train = lgb.Dataset(X_train[train_index], y_train[train_index])
        lgb_evals = lgb.Dataset(X_train[test_index], y_train[test_index], reference=lgb_train)

        # NOTE(review): early_stopping_rounds / verbose_eval moved to callbacks
        # in lightgbm >= 4 — confirm the pinned version before upgrading.
        lgbm = lgb.train(params, lgb_train, num_boost_round=50000, valid_sets=[lgb_train, lgb_evals],
                         valid_names=['train', 'valid'], early_stopping_rounds=100, verbose_eval=200)

        # average the test predictions over folds; record OOF preds and logloss
        sub['label'] += lgbm.predict(X_test, num_iteration=lgbm.best_iteration) / N
        oof_preds[test_index] = lgbm.predict(X_train[test_index], num_iteration=lgbm.best_iteration)
        xx_logloss.append(lgbm.best_score['valid']['binary_logloss'])
    print(xx_logloss)
    a = np.mean(xx_logloss)
    a = round(a, 5)
    print(a)

    # keep the 50k highest-scoring (user, cate) pairs for submission
    sub = sub.sort_values(by='label', ascending=False)
    sub = sub.head(50000)
    sub = sub[['user_id', 'cate', 'shop_id', 'label']]

    sub.to_csv('./res/sub_F11_5.csv', index=False, index_label=False)