├── README.md
├── cache
│   └── README.md
├── data
│   └── README.md
├── gen_feat.py
├── sub
│   └── README.md
└── train.py
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/foursking1/jd/5f1e22ff4e76e7731e04f557421e8f9cd4e9e873/README.md
--------------------------------------------------------------------------------
/cache/README.md:
--------------------------------------------------------------------------------
# Cache directory
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
# Data directory
--------------------------------------------------------------------------------
/gen_feat.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import math
import os
import pickle
from datetime import datetime
from datetime import timedelta

import numpy as np
import pandas as pd

action_1_path = "./data/JData_Action_201602.csv"
action_2_path = "./data/JData_Action_201603.csv"
action_3_path = "./data/JData_Action_201604.csv"
comment_path = "./data/JData_Comment.csv"
product_path = "./data/JData_Product.csv"
user_path = "./data/JData_User.csv"

# Snapshot dates ('dt') of the comment table; used to pick the latest
# comment cut taken strictly before a feature window's end date.
comment_date = ["2016-02-01", "2016-02-08", "2016-02-15", "2016-02-22",
                "2016-02-29", "2016-03-07", "2016-03-14", "2016-03-21",
                "2016-03-28", "2016-04-04", "2016-04-11", "2016-04-15"]


def convert_age(age_str):
    """Map the raw age buckets to ordinal codes (-1 = unparsable)."""
    if age_str == u'-1':
        return 0
    elif age_str == u'15岁以下':    # under 15
        return 1
    elif age_str == u'16-25岁':    # 16-25
        return 2
    elif age_str == u'26-35岁':    # 26-35
        return 3
    elif age_str == u'36-45岁':    # 36-45
        return 4
    elif age_str == u'46-55岁':    # 46-55
        return 5
    elif age_str == u'56岁以上':    # 56 and over
        return 6
    else:
        return -1


def get_basic_user_feat():
    """One-hot encoded user profile: age bucket, sex, user level."""
    dump_path = './cache/basic_user.pkl'
    if os.path.exists(dump_path):
        user = pickle.load(open(dump_path, 'rb'))
    else:
        user = pd.read_csv(user_path, encoding='gbk')
        user['age'] = user['age'].map(convert_age)
        age_df = pd.get_dummies(user["age"], prefix="age")
        sex_df = pd.get_dummies(user["sex"], prefix="sex")
        user_lv_df = pd.get_dummies(user["user_lv_cd"], prefix="user_lv_cd")
        user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1)
        pickle.dump(user, open(dump_path, 'wb'))
    return user


def get_basic_product_feat():
    """One-hot encoded product attributes a1/a2/a3 plus cate and brand."""
    dump_path = './cache/basic_product.pkl'
    if os.path.exists(dump_path):
        product = pickle.load(open(dump_path, 'rb'))
    else:
        product = pd.read_csv(product_path)
        attr1_df = pd.get_dummies(product["a1"], prefix="a1")
        attr2_df = pd.get_dummies(product["a2"], prefix="a2")
        attr3_df = pd.get_dummies(product["a3"], prefix="a3")
        product = pd.concat([product[['sku_id', 'cate', 'brand']], attr1_df, attr2_df, attr3_df], axis=1)
        pickle.dump(product, open(dump_path, 'wb'))
    return product


def get_actions_1():
    return pd.read_csv(action_1_path)


def get_actions_2():
    return pd.read_csv(action_2_path)


def get_actions_3():
    return pd.read_csv(action_3_path)
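# Every feature builder in this module hand-rolls the same memoize-to-pickle
# pattern (return the ./cache file if present, otherwise compute and dump).
# As an illustration only -- nothing below uses it -- the pattern could be
# factored into a decorator such as this hypothetical helper:
def _pickle_cache(path_fmt):
    """Sketch: cache a function's result at path_fmt % args."""
    def wrap(fn):
        def inner(*args):
            path = path_fmt % args
            if os.path.exists(path):
                return pickle.load(open(path, 'rb'))
            out = fn(*args)
            pickle.dump(out, open(path, 'wb'))
            return out
        return inner
    return wrap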
def get_actions(start_date, end_date):
    """Concatenate the three monthly action logs and keep the rows with
    start_date <= time < end_date.

    :param start_date: inclusive lower bound, 'YYYY-MM-DD'
    :param end_date: exclusive upper bound, 'YYYY-MM-DD'
    :return: actions: pd.DataFrame
    """
    dump_path = './cache/all_action_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        action_1 = get_actions_1()
        action_2 = get_actions_2()
        action_3 = get_actions_3()
        actions = pd.concat([action_1, action_2, action_3])  # type: pd.DataFrame
        actions = actions[(actions.time >= start_date) & (actions.time < end_date)]
        pickle.dump(actions, open(dump_path, 'wb'))
    return actions


def get_action_feat(start_date, end_date):
    """Per (user_id, sku_id): count of each action type inside the window."""
    # Keyed separately from get_accumulate_action_feat so the two caches
    # cannot overwrite each other.
    dump_path = './cache/action_window_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        actions = get_actions(start_date, end_date)
        actions = actions[['user_id', 'sku_id', 'type']]
        df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date))
        actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
        actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()
        del actions['type']
        pickle.dump(actions, open(dump_path, 'wb'))
    return actions


def get_accumulate_action_feat(start_date, end_date):
    """Action counts per (user, sku, cate, brand), decayed by recency."""
    dump_path = './cache/action_accumulate_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        actions = get_actions(start_date, end_date)
        df = pd.get_dummies(actions['type'], prefix='action')
        actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
        # Decay recent behaviour exponentially with its age in days.
        actions['weights'] = actions['time'].map(
            lambda x: datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        actions['weights'] = actions['weights'].map(lambda x: math.exp(-x.days))
        print(actions.head(10))
        actions['action_1'] = actions['action_1'] * actions['weights']
        actions['action_2'] = actions['action_2'] * actions['weights']
        actions['action_3'] = actions['action_3'] * actions['weights']
        actions['action_4'] = actions['action_4'] * actions['weights']
        actions['action_5'] = actions['action_5'] * actions['weights']
        actions['action_6'] = actions['action_6'] * actions['weights']
        del actions['model_id']
        del actions['type']
        del actions['time']
        del actions['weights']
        actions = actions.groupby(['user_id', 'sku_id', 'cate', 'brand'], as_index=False).sum()
        pickle.dump(actions, open(dump_path, 'wb'))
    return actions
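# Worked example of the decay above: with end_date = '2016-04-11', an action
# logged at '2016-04-08 10:00:00' is 2 whole days old and contributes
# exp(-2) ~= 0.135 of a raw count, while a 7-day-old action contributes
# exp(-7) ~= 0.0009 -- so counts from the last day or two dominate.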
def get_comments_product_feat(start_date, end_date):
    """Per sku: bad-comment flag/rate and one-hot comment volume bucket."""
    dump_path = './cache/comments_accumulate_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        comments = pickle.load(open(dump_path, 'rb'))
    else:
        comments = pd.read_csv(comment_path)
        # Use the latest comment snapshot taken strictly before end_date.
        comment_date_end = end_date
        comment_date_begin = comment_date[0]
        for date in reversed(comment_date):
            if date < comment_date_end:
                comment_date_begin = date
                break
        comments = comments[(comments.dt >= comment_date_begin) & (comments.dt < comment_date_end)]
        df = pd.get_dummies(comments['comment_num'], prefix='comment_num')
        comments = pd.concat([comments, df], axis=1)  # type: pd.DataFrame
        comments = comments[['sku_id', 'has_bad_comment', 'bad_comment_rate',
                             'comment_num_1', 'comment_num_2', 'comment_num_3', 'comment_num_4']]
        pickle.dump(comments, open(dump_path, 'wb'))
    return comments


def get_accumulate_user_feat(start_date, end_date):
    """Per user: orders (type 4) relative to each other action type."""
    feature = ['user_id', 'user_action_1_ratio', 'user_action_2_ratio', 'user_action_3_ratio',
               'user_action_5_ratio', 'user_action_6_ratio']
    dump_path = './cache/user_feat_accumulate_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        actions = get_actions(start_date, end_date)
        df = pd.get_dummies(actions['type'], prefix='action')
        actions = pd.concat([actions['user_id'], df], axis=1)
        actions = actions.groupby(['user_id'], as_index=False).sum()
        # Conversion rates; a zero denominator maps to 0 rather than inf,
        # which the later fillna(0) could not clean up.
        for t in (1, 2, 3, 5, 6):
            actions['user_action_%d_ratio' % t] = (
                actions['action_4'] / actions['action_%d' % t].replace(0, np.nan)).fillna(0)
        actions = actions[feature]
        pickle.dump(actions, open(dump_path, 'wb'))
    return actions


def get_accumulate_product_feat(start_date, end_date):
    """Per sku: the same zero-guarded conversion rates as the user features."""
    feature = ['sku_id', 'product_action_1_ratio', 'product_action_2_ratio', 'product_action_3_ratio',
               'product_action_5_ratio', 'product_action_6_ratio']
    dump_path = './cache/product_feat_accumulate_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        actions = get_actions(start_date, end_date)
        df = pd.get_dummies(actions['type'], prefix='action')
        actions = pd.concat([actions['sku_id'], df], axis=1)
        actions = actions.groupby(['sku_id'], as_index=False).sum()
        for t in (1, 2, 3, 5, 6):
            actions['product_action_%d_ratio' % t] = (
                actions['action_4'] / actions['action_%d' % t].replace(0, np.nan)).fillna(0)
        actions = actions[feature]
        pickle.dump(actions, open(dump_path, 'wb'))
    return actions


def get_labels(start_date, end_date):
    """Label = 1 for every (user_id, sku_id) ordered (type 4) in the window."""
    dump_path = './cache/labels_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        actions = get_actions(start_date, end_date)
        actions = actions[actions['type'] == 4]
        actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()
        actions['label'] = 1
        actions = actions[['user_id', 'sku_id', 'label']]
        pickle.dump(actions, open(dump_path, 'wb'))
    return actions
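# Per the JData data description, the 'type' codes used above are:
# 1 = browse, 2 = add to cart, 3 = remove from cart, 4 = order,
# 5 = favorite, 6 = click; type 4 (order) is the conversion event that both
# the ratio features and get_labels key on. The two set builders below join
# these per-entity features onto multi-scale windows of raw action counts.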
def make_test_set(train_start_date, train_end_date):
    """Feature matrix for the submission window (no labels available)."""
    dump_path = './cache/test_set_%s_%s.pkl' % (train_start_date, train_end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        start_days = "2016-02-01"
        user = get_basic_user_feat()
        product = get_basic_product_feat()
        user_acc = get_accumulate_user_feat(start_days, train_end_date)
        product_acc = get_accumulate_product_feat(start_days, train_end_date)
        comment_acc = get_comments_product_feat(train_start_date, train_end_date)

        # Sliding windows of action counts, widening back from train_end_date.
        # actions = get_accumulate_action_feat(train_start_date, train_end_date)
        actions = None
        for i in (1, 2, 3, 5, 7, 10, 15, 21, 30):
            start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)
            start_days = start_days.strftime('%Y-%m-%d')
            if actions is None:
                actions = get_action_feat(start_days, train_end_date)
            else:
                actions = pd.merge(actions, get_action_feat(start_days, train_end_date), how='left',
                                   on=['user_id', 'sku_id'])

        actions = pd.merge(actions, user, how='left', on='user_id')
        actions = pd.merge(actions, user_acc, how='left', on='user_id')
        actions = pd.merge(actions, product, how='left', on='sku_id')
        actions = pd.merge(actions, product_acc, how='left', on='sku_id')
        actions = pd.merge(actions, comment_acc, how='left', on='sku_id')
        actions = actions.fillna(0)
        # The competition only scores category 8.
        actions = actions[actions['cate'] == 8]
        pickle.dump(actions, open(dump_path, 'wb'))

    users = actions[['user_id', 'sku_id']].copy()
    del actions['user_id']
    del actions['sku_id']
    return users, actions


def make_train_set(train_start_date, train_end_date, test_start_date, test_end_date, days=30):
    """Feature matrix for [train_start_date, train_end_date), labelled with
    orders from the later window [test_start_date, test_end_date)."""
    dump_path = './cache/train_set_%s_%s_%s_%s.pkl' % (train_start_date, train_end_date,
                                                       test_start_date, test_end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        start_days = "2016-02-01"
        user = get_basic_user_feat()
        product = get_basic_product_feat()
        user_acc = get_accumulate_user_feat(start_days, train_end_date)
        product_acc = get_accumulate_product_feat(start_days, train_end_date)
        comment_acc = get_comments_product_feat(train_start_date, train_end_date)
        labels = get_labels(test_start_date, test_end_date)

        # Same sliding-window action counts as make_test_set.
        # actions = get_accumulate_action_feat(train_start_date, train_end_date)
        actions = None
        for i in (1, 2, 3, 5, 7, 10, 15, 21, 30):
            start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)
            start_days = start_days.strftime('%Y-%m-%d')
            if actions is None:
                actions = get_action_feat(start_days, train_end_date)
            else:
                actions = pd.merge(actions, get_action_feat(start_days, train_end_date), how='left',
                                   on=['user_id', 'sku_id'])

        actions = pd.merge(actions, user, how='left', on='user_id')
        actions = pd.merge(actions, user_acc, how='left', on='user_id')
        actions = pd.merge(actions, product, how='left', on='sku_id')
        actions = pd.merge(actions, product_acc, how='left', on='sku_id')
        actions = pd.merge(actions, comment_acc, how='left', on='sku_id')
        actions = pd.merge(actions, labels, how='left', on=['user_id', 'sku_id'])
        actions = actions.fillna(0)
        pickle.dump(actions, open(dump_path, 'wb'))

    users = actions[['user_id', 'sku_id']].copy()
    labels = actions['label'].copy()
    del actions['user_id']
    del actions['sku_id']
    del actions['label']

    return users, actions, labels
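# report() mirrors the score formula published for the JData 2016 contest:
# with user-level precision P1 / recall R1 and pair-level precision P2 /
# recall R2,
#     F11 = 6*R1*P1 / (5*R1 + P1)
#     F12 = 5*P2*R2 / (2*R2 + 3*P2)
#     score = 0.4*F11 + 0.6*F12
# so predicting the right users dominates F11, while exact (user, sku)
# pairs drive F12.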
def report(pred, label):
    actions = label
    result = pred

    # All (user, sku) pairs that were actually bought
    all_user_item_pair = actions['user_id'].map(str) + '-' + actions['sku_id'].map(str)
    all_user_item_pair = np.array(all_user_item_pair)
    # All users who actually bought
    all_user_set = actions['user_id'].unique()

    # Users and (user, sku) pairs predicted to buy
    all_user_test_set = result['user_id'].unique()
    all_user_test_item_pair = result['user_id'].map(str) + '-' + result['sku_id'].map(str)
    all_user_test_item_pair = np.array(all_user_test_item_pair)

    # User-level precision and recall
    pos, neg = 0, 0
    for user_id in all_user_test_set:
        if user_id in all_user_set:
            pos += 1
        else:
            neg += 1
    all_user_acc = 1.0 * pos / (pos + neg)
    all_user_recall = 1.0 * pos / len(all_user_set)
    print('user-level precision: ' + str(all_user_acc))
    print('user-level recall:    ' + str(all_user_recall))

    # Pair-level precision and recall
    pos, neg = 0, 0
    for user_item_pair in all_user_test_item_pair:
        if user_item_pair in all_user_item_pair:
            pos += 1
        else:
            neg += 1
    all_item_acc = 1.0 * pos / (pos + neg)
    all_item_recall = 1.0 * pos / len(all_user_item_pair)
    print('pair-level precision: ' + str(all_item_acc))
    print('pair-level recall:    ' + str(all_item_recall))
    F11 = 6.0 * all_user_recall * all_user_acc / (5.0 * all_user_recall + all_user_acc)
    F12 = 5.0 * all_item_acc * all_item_recall / (2.0 * all_item_recall + 3 * all_item_acc)
    score = 0.4 * F11 + 0.6 * F12
    print('F11=' + str(F11))
    print('F12=' + str(F12))
    print('score=' + str(score))


if __name__ == '__main__':
    train_start_date = '2016-02-01'
    train_end_date = '2016-03-01'
    test_start_date = '2016-03-01'
    test_end_date = '2016-03-05'
    user, action, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    print(user.head(10))
    print(action.head(10))
--------------------------------------------------------------------------------
/sub/README.md:
--------------------------------------------------------------------------------
# Results directory
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
__author__ = 'foursking'
from gen_feat import make_train_set
from gen_feat import make_test_set
from gen_feat import report
from sklearn.model_selection import train_test_split
import xgboost as xgb
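# Assumed date layout (inferred from the windows below): features are built
# up to 2016-04-11 and labelled with orders from 2016-04-11..2016-04-16 for
# training; the submission then scores features built up to 2016-04-16 to
# predict the competition's target window beginning 2016-04-16.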
def xgboost_make_submission():
    train_start_date = '2016-03-10'
    train_end_date = '2016-04-11'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'

    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'

    user_index, training_data, label = make_train_set(train_start_date, train_end_date,
                                                      test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data.values, label.values,
                                                        test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # xgb.train is configured via 'eta' and num_round
    # (not sklearn's learning_rate/n_estimators).
    param = {'max_depth': 3, 'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0,
             'colsample_bytree': 0.8, 'scale_pos_weight': 1, 'eta': 0.05,
             'silent': 1, 'objective': 'binary:logistic', 'nthread': 4}
    num_round = 283
    plst = list(param.items())
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)
    sub_user_index, sub_training_data = make_test_set(sub_start_date, sub_end_date)
    sub_training_data = xgb.DMatrix(sub_training_data.values)
    y = bst.predict(sub_training_data)
    sub_user_index['label'] = y
    # Keep candidates above the probability threshold, then one sku per user.
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    pred.to_csv('./sub/submission.csv', index=False, index_label=False)


def xgboost_cv():
    train_start_date = '2016-03-05'
    train_end_date = '2016-04-06'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'

    sub_start_date = '2016-02-05'
    sub_end_date = '2016-03-05'
    sub_test_start_date = '2016-03-05'
    sub_test_end_date = '2016-03-10'

    user_index, training_data, label = make_train_set(train_start_date, train_end_date,
                                                      test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data, label,
                                                        test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    param = {'max_depth': 10, 'eta': 0.05, 'silent': 1,
             'objective': 'binary:logistic', 'nthread': 4}
    num_round = 4000
    plst = list(param.items())
    plst += [('eval_metric', 'auc'), ('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)

    # Score an earlier, fully labelled window to estimate the leaderboard metric.
    sub_user_index, sub_training_data, sub_label = make_train_set(sub_start_date, sub_end_date,
                                                                  sub_test_start_date, sub_test_end_date)
    y = bst.predict(xgb.DMatrix(sub_training_data))

    pred = sub_user_index.copy()
    y_true = sub_user_index.copy()
    pred['label'] = y
    y_true['label'] = sub_label
    report(pred, y_true)


if __name__ == '__main__':
    # xgboost_cv()
    xgboost_make_submission()
--------------------------------------------------------------------------------