├── README.md
├── cache
│   └── README.md
├── data
│   └── README.md
├── gen_feat.py
├── sub
│   └── README.md
└── train.py
/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/foursking1/jd/5f1e22ff4e76e7731e04f557421e8f9cd4e9e873/README.md
--------------------------------------------------------------------------------
/cache/README.md:
--------------------------------------------------------------------------------
# Cache directory
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
# Data directory
--------------------------------------------------------------------------------
/gen_feat.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import math
import os
import pickle
from datetime import datetime
from datetime import timedelta

import numpy as np
import pandas as pd

action_1_path = "./data/JData_Action_201602.csv"
action_2_path = "./data/JData_Action_201603.csv"
action_3_path = "./data/JData_Action_201604.csv"
comment_path = "./data/JData_Comment.csv"
product_path = "./data/JData_Product.csv"
user_path = "./data/JData_User.csv"

# Snapshot dates ('dt') of the comment table; used to pick the latest
# comment cut taken strictly before a feature window's end date.
comment_date = ["2016-02-01", "2016-02-08", "2016-02-15", "2016-02-22",
                "2016-02-29", "2016-03-07", "2016-03-14", "2016-03-21",
                "2016-03-28", "2016-04-04", "2016-04-11", "2016-04-15"]


def convert_age(age_str):
    """Map the raw age buckets to ordinal codes (-1 = unparsable)."""
    if age_str == u'-1':
        return 0
    elif age_str == u'15岁以下':    # under 15
        return 1
    elif age_str == u'16-25岁':    # 16-25
        return 2
    elif age_str == u'26-35岁':    # 26-35
        return 3
    elif age_str == u'36-45岁':    # 36-45
        return 4
    elif age_str == u'46-55岁':    # 46-55
        return 5
    elif age_str == u'56岁以上':    # 56 and over
        return 6
    else:
        return -1


def get_basic_user_feat():
    """One-hot encoded user profile: age bucket, sex, user level."""
    dump_path = './cache/basic_user.pkl'
    if os.path.exists(dump_path):
        user = pickle.load(open(dump_path, 'rb'))
    else:
        user = pd.read_csv(user_path, encoding='gbk')
        user['age'] = user['age'].map(convert_age)
        age_df = pd.get_dummies(user["age"], prefix="age")
        sex_df = pd.get_dummies(user["sex"], prefix="sex")
        user_lv_df = pd.get_dummies(user["user_lv_cd"], prefix="user_lv_cd")
        user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1)
        pickle.dump(user, open(dump_path, 'wb'))
    return user


def get_basic_product_feat():
    """One-hot encoded product attributes a1/a2/a3 plus cate and brand."""
    dump_path = './cache/basic_product.pkl'
    if os.path.exists(dump_path):
        product = pickle.load(open(dump_path, 'rb'))
    else:
        product = pd.read_csv(product_path)
        attr1_df = pd.get_dummies(product["a1"], prefix="a1")
        attr2_df = pd.get_dummies(product["a2"], prefix="a2")
        attr3_df = pd.get_dummies(product["a3"], prefix="a3")
        product = pd.concat([product[['sku_id', 'cate', 'brand']], attr1_df, attr2_df, attr3_df], axis=1)
        pickle.dump(product, open(dump_path, 'wb'))
    return product


def get_actions_1():
    return pd.read_csv(action_1_path)


def get_actions_2():
    return pd.read_csv(action_2_path)


def get_actions_3():
    return pd.read_csv(action_3_path)
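# Every feature builder in this module hand-rolls the same memoize-to-pickle
# pattern (return the ./cache file if present, otherwise compute and dump).
# As an illustration only -- nothing below uses it -- the pattern could be
# factored into a decorator such as this hypothetical helper:
def _pickle_cache(path_fmt):
    """Sketch: cache a function's result at path_fmt % args."""
    def wrap(fn):
        def inner(*args):
            path = path_fmt % args
            if os.path.exists(path):
                return pickle.load(open(path, 'rb'))
            out = fn(*args)
            pickle.dump(out, open(path, 'wb'))
            return out
        return inner
    return wrap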
def get_actions(start_date, end_date):
    """Concatenate the three monthly action logs and keep the rows with
    start_date <= time < end_date.

    :param start_date: inclusive lower bound, 'YYYY-MM-DD'
    :param end_date: exclusive upper bound, 'YYYY-MM-DD'
    :return: actions: pd.DataFrame
    """
    dump_path = './cache/all_action_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        action_1 = get_actions_1()
        action_2 = get_actions_2()
        action_3 = get_actions_3()
        actions = pd.concat([action_1, action_2, action_3])  # type: pd.DataFrame
        actions = actions[(actions.time >= start_date) & (actions.time < end_date)]
        pickle.dump(actions, open(dump_path, 'wb'))
    return actions


def get_action_feat(start_date, end_date):
    """Per (user_id, sku_id): count of each action type inside the window."""
    # Keyed separately from get_accumulate_action_feat so the two caches
    # cannot overwrite each other.
    dump_path = './cache/action_window_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        actions = get_actions(start_date, end_date)
        actions = actions[['user_id', 'sku_id', 'type']]
        df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date))
        actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
        actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()
        del actions['type']
        pickle.dump(actions, open(dump_path, 'wb'))
    return actions


def get_accumulate_action_feat(start_date, end_date):
    """Action counts per (user, sku, cate, brand), decayed by recency."""
    dump_path = './cache/action_accumulate_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        actions = get_actions(start_date, end_date)
        df = pd.get_dummies(actions['type'], prefix='action')
        actions = pd.concat([actions, df], axis=1)  # type: pd.DataFrame
        # Decay recent behaviour exponentially with its age in days.
        actions['weights'] = actions['time'].map(
            lambda x: datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
        actions['weights'] = actions['weights'].map(lambda x: math.exp(-x.days))
        print(actions.head(10))
        actions['action_1'] = actions['action_1'] * actions['weights']
        actions['action_2'] = actions['action_2'] * actions['weights']
        actions['action_3'] = actions['action_3'] * actions['weights']
        actions['action_4'] = actions['action_4'] * actions['weights']
        actions['action_5'] = actions['action_5'] * actions['weights']
        actions['action_6'] = actions['action_6'] * actions['weights']
        del actions['model_id']
        del actions['type']
        del actions['time']
        del actions['weights']
        actions = actions.groupby(['user_id', 'sku_id', 'cate', 'brand'], as_index=False).sum()
        pickle.dump(actions, open(dump_path, 'wb'))
    return actions
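# Worked example of the decay above: with end_date = '2016-04-11', an action
# logged at '2016-04-08 10:00:00' is 2 whole days old and contributes
# exp(-2) ~= 0.135 of a raw count, while a 7-day-old action contributes
# exp(-7) ~= 0.0009 -- so counts from the last day or two dominate.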
def get_comments_product_feat(start_date, end_date):
    """Per sku: bad-comment flag/rate and one-hot comment volume bucket."""
    dump_path = './cache/comments_accumulate_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        comments = pickle.load(open(dump_path, 'rb'))
    else:
        comments = pd.read_csv(comment_path)
        # Use the latest comment snapshot taken strictly before end_date.
        comment_date_end = end_date
        comment_date_begin = comment_date[0]
        for date in reversed(comment_date):
            if date < comment_date_end:
                comment_date_begin = date
                break
        comments = comments[(comments.dt >= comment_date_begin) & (comments.dt < comment_date_end)]
        df = pd.get_dummies(comments['comment_num'], prefix='comment_num')
        comments = pd.concat([comments, df], axis=1)  # type: pd.DataFrame
        comments = comments[['sku_id', 'has_bad_comment', 'bad_comment_rate',
                             'comment_num_1', 'comment_num_2', 'comment_num_3', 'comment_num_4']]
        pickle.dump(comments, open(dump_path, 'wb'))
    return comments


def get_accumulate_user_feat(start_date, end_date):
    """Per user: orders (type 4) relative to each other action type."""
    feature = ['user_id', 'user_action_1_ratio', 'user_action_2_ratio', 'user_action_3_ratio',
               'user_action_5_ratio', 'user_action_6_ratio']
    dump_path = './cache/user_feat_accumulate_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        actions = get_actions(start_date, end_date)
        df = pd.get_dummies(actions['type'], prefix='action')
        actions = pd.concat([actions['user_id'], df], axis=1)
        actions = actions.groupby(['user_id'], as_index=False).sum()
        # Conversion rates; a zero denominator maps to 0 rather than inf,
        # which the later fillna(0) could not clean up.
        for t in (1, 2, 3, 5, 6):
            actions['user_action_%d_ratio' % t] = (
                actions['action_4'] / actions['action_%d' % t].replace(0, np.nan)).fillna(0)
        actions = actions[feature]
        pickle.dump(actions, open(dump_path, 'wb'))
    return actions


def get_accumulate_product_feat(start_date, end_date):
    """Per sku: the same zero-guarded conversion rates as the user features."""
    feature = ['sku_id', 'product_action_1_ratio', 'product_action_2_ratio', 'product_action_3_ratio',
               'product_action_5_ratio', 'product_action_6_ratio']
    dump_path = './cache/product_feat_accumulate_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        actions = get_actions(start_date, end_date)
        df = pd.get_dummies(actions['type'], prefix='action')
        actions = pd.concat([actions['sku_id'], df], axis=1)
        actions = actions.groupby(['sku_id'], as_index=False).sum()
        for t in (1, 2, 3, 5, 6):
            actions['product_action_%d_ratio' % t] = (
                actions['action_4'] / actions['action_%d' % t].replace(0, np.nan)).fillna(0)
        actions = actions[feature]
        pickle.dump(actions, open(dump_path, 'wb'))
    return actions


def get_labels(start_date, end_date):
    """Label = 1 for every (user_id, sku_id) ordered (type 4) in the window."""
    dump_path = './cache/labels_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        actions = get_actions(start_date, end_date)
        actions = actions[actions['type'] == 4]
        actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()
        actions['label'] = 1
        actions = actions[['user_id', 'sku_id', 'label']]
        pickle.dump(actions, open(dump_path, 'wb'))
    return actions
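# Per the JData data description, the 'type' codes used above are:
# 1 = browse, 2 = add to cart, 3 = remove from cart, 4 = order,
# 5 = favorite, 6 = click; type 4 (order) is the conversion event that both
# the ratio features and get_labels key on. The two set builders below join
# these per-entity features onto multi-scale windows of raw action counts.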
def make_test_set(train_start_date, train_end_date):
    """Feature matrix for the submission window (no labels available)."""
    dump_path = './cache/test_set_%s_%s.pkl' % (train_start_date, train_end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        start_days = "2016-02-01"
        user = get_basic_user_feat()
        product = get_basic_product_feat()
        user_acc = get_accumulate_user_feat(start_days, train_end_date)
        product_acc = get_accumulate_product_feat(start_days, train_end_date)
        comment_acc = get_comments_product_feat(train_start_date, train_end_date)

        # Sliding windows of action counts, widening back from train_end_date.
        # actions = get_accumulate_action_feat(train_start_date, train_end_date)
        actions = None
        for i in (1, 2, 3, 5, 7, 10, 15, 21, 30):
            start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)
            start_days = start_days.strftime('%Y-%m-%d')
            if actions is None:
                actions = get_action_feat(start_days, train_end_date)
            else:
                actions = pd.merge(actions, get_action_feat(start_days, train_end_date), how='left',
                                   on=['user_id', 'sku_id'])

        actions = pd.merge(actions, user, how='left', on='user_id')
        actions = pd.merge(actions, user_acc, how='left', on='user_id')
        actions = pd.merge(actions, product, how='left', on='sku_id')
        actions = pd.merge(actions, product_acc, how='left', on='sku_id')
        actions = pd.merge(actions, comment_acc, how='left', on='sku_id')
        actions = actions.fillna(0)
        # The competition only scores category 8.
        actions = actions[actions['cate'] == 8]
        pickle.dump(actions, open(dump_path, 'wb'))

    users = actions[['user_id', 'sku_id']].copy()
    del actions['user_id']
    del actions['sku_id']
    return users, actions


def make_train_set(train_start_date, train_end_date, test_start_date, test_end_date, days=30):
    """Feature matrix for [train_start_date, train_end_date), labelled with
    orders from the later window [test_start_date, test_end_date)."""
    dump_path = './cache/train_set_%s_%s_%s_%s.pkl' % (train_start_date, train_end_date,
                                                       test_start_date, test_end_date)
    if os.path.exists(dump_path):
        actions = pickle.load(open(dump_path, 'rb'))
    else:
        start_days = "2016-02-01"
        user = get_basic_user_feat()
        product = get_basic_product_feat()
        user_acc = get_accumulate_user_feat(start_days, train_end_date)
        product_acc = get_accumulate_product_feat(start_days, train_end_date)
        comment_acc = get_comments_product_feat(train_start_date, train_end_date)
        labels = get_labels(test_start_date, test_end_date)

        # Same sliding-window action counts as make_test_set.
        # actions = get_accumulate_action_feat(train_start_date, train_end_date)
        actions = None
        for i in (1, 2, 3, 5, 7, 10, 15, 21, 30):
            start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)
            start_days = start_days.strftime('%Y-%m-%d')
            if actions is None:
                actions = get_action_feat(start_days, train_end_date)
            else:
                actions = pd.merge(actions, get_action_feat(start_days, train_end_date), how='left',
                                   on=['user_id', 'sku_id'])

        actions = pd.merge(actions, user, how='left', on='user_id')
        actions = pd.merge(actions, user_acc, how='left', on='user_id')
        actions = pd.merge(actions, product, how='left', on='sku_id')
        actions = pd.merge(actions, product_acc, how='left', on='sku_id')
        actions = pd.merge(actions, comment_acc, how='left', on='sku_id')
        actions = pd.merge(actions, labels, how='left', on=['user_id', 'sku_id'])
        actions = actions.fillna(0)
        pickle.dump(actions, open(dump_path, 'wb'))

    users = actions[['user_id', 'sku_id']].copy()
    labels = actions['label'].copy()
    del actions['user_id']
    del actions['sku_id']
    del actions['label']

    return users, actions, labels
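# report() mirrors the score formula published for the JData 2016 contest:
# with user-level precision P1 / recall R1 and pair-level precision P2 /
# recall R2,
#     F11 = 6*R1*P1 / (5*R1 + P1)
#     F12 = 5*P2*R2 / (2*R2 + 3*P2)
#     score = 0.4*F11 + 0.6*F12
# so predicting the right users dominates F11, while exact (user, sku)
# pairs drive F12.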
def report(pred, label):
    actions = label
    result = pred

    # All (user, sku) pairs that were actually bought
    all_user_item_pair = actions['user_id'].map(str) + '-' + actions['sku_id'].map(str)
    all_user_item_pair = np.array(all_user_item_pair)
    # All users who actually bought
    all_user_set = actions['user_id'].unique()

    # Users and (user, sku) pairs predicted to buy
    all_user_test_set = result['user_id'].unique()
    all_user_test_item_pair = result['user_id'].map(str) + '-' + result['sku_id'].map(str)
    all_user_test_item_pair = np.array(all_user_test_item_pair)

    # User-level precision and recall
    pos, neg = 0, 0
    for user_id in all_user_test_set:
        if user_id in all_user_set:
            pos += 1
        else:
            neg += 1
    all_user_acc = 1.0 * pos / (pos + neg)
    all_user_recall = 1.0 * pos / len(all_user_set)
    print('user-level precision: ' + str(all_user_acc))
    print('user-level recall:    ' + str(all_user_recall))

    # Pair-level precision and recall
    pos, neg = 0, 0
    for user_item_pair in all_user_test_item_pair:
        if user_item_pair in all_user_item_pair:
            pos += 1
        else:
            neg += 1
    all_item_acc = 1.0 * pos / (pos + neg)
    all_item_recall = 1.0 * pos / len(all_user_item_pair)
    print('pair-level precision: ' + str(all_item_acc))
    print('pair-level recall:    ' + str(all_item_recall))
    F11 = 6.0 * all_user_recall * all_user_acc / (5.0 * all_user_recall + all_user_acc)
    F12 = 5.0 * all_item_acc * all_item_recall / (2.0 * all_item_recall + 3 * all_item_acc)
    score = 0.4 * F11 + 0.6 * F12
    print('F11=' + str(F11))
    print('F12=' + str(F12))
    print('score=' + str(score))


if __name__ == '__main__':
    train_start_date = '2016-02-01'
    train_end_date = '2016-03-01'
    test_start_date = '2016-03-01'
    test_end_date = '2016-03-05'
    user, action, label = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date)
    print(user.head(10))
    print(action.head(10))
--------------------------------------------------------------------------------
/sub/README.md:
--------------------------------------------------------------------------------
# Results directory
--------------------------------------------------------------------------------
/train.py:
--------------------------------------------------------------------------------
__author__ = 'foursking'
from gen_feat import make_train_set
from gen_feat import make_test_set
from gen_feat import report
from sklearn.model_selection import train_test_split
import xgboost as xgb
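# Assumed date layout (inferred from the windows below): features are built
# up to 2016-04-11 and labelled with orders from 2016-04-11..2016-04-16 for
# training; the submission then scores features built up to 2016-04-16 to
# predict the competition's target window beginning 2016-04-16.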
def xgboost_make_submission():
    train_start_date = '2016-03-10'
    train_end_date = '2016-04-11'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'

    sub_start_date = '2016-03-15'
    sub_end_date = '2016-04-16'

    user_index, training_data, label = make_train_set(train_start_date, train_end_date,
                                                      test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data.values, label.values,
                                                        test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    # xgb.train is configured via 'eta' and num_round
    # (not sklearn's learning_rate/n_estimators).
    param = {'max_depth': 3, 'min_child_weight': 5, 'gamma': 0, 'subsample': 1.0,
             'colsample_bytree': 0.8, 'scale_pos_weight': 1, 'eta': 0.05,
             'silent': 1, 'objective': 'binary:logistic', 'nthread': 4}
    num_round = 283
    plst = list(param.items())
    plst += [('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)
    sub_user_index, sub_training_data = make_test_set(sub_start_date, sub_end_date)
    sub_training_data = xgb.DMatrix(sub_training_data.values)
    y = bst.predict(sub_training_data)
    sub_user_index['label'] = y
    # Keep candidates above the probability threshold, then one sku per user.
    pred = sub_user_index[sub_user_index['label'] >= 0.03]
    pred = pred[['user_id', 'sku_id']]
    pred = pred.groupby('user_id').first().reset_index()
    pred['user_id'] = pred['user_id'].astype(int)
    pred.to_csv('./sub/submission.csv', index=False, index_label=False)


def xgboost_cv():
    train_start_date = '2016-03-05'
    train_end_date = '2016-04-06'
    test_start_date = '2016-04-11'
    test_end_date = '2016-04-16'

    sub_start_date = '2016-02-05'
    sub_end_date = '2016-03-05'
    sub_test_start_date = '2016-03-05'
    sub_test_end_date = '2016-03-10'

    user_index, training_data, label = make_train_set(train_start_date, train_end_date,
                                                      test_start_date, test_end_date)
    X_train, X_test, y_train, y_test = train_test_split(training_data, label,
                                                        test_size=0.2, random_state=0)
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dtest = xgb.DMatrix(X_test, label=y_test)
    param = {'max_depth': 10, 'eta': 0.05, 'silent': 1,
             'objective': 'binary:logistic', 'nthread': 4}
    num_round = 4000
    plst = list(param.items())
    plst += [('eval_metric', 'auc'), ('eval_metric', 'logloss')]
    evallist = [(dtest, 'eval'), (dtrain, 'train')]
    bst = xgb.train(plst, dtrain, num_round, evallist)

    # Score an earlier, fully labelled window to estimate the leaderboard metric.
    sub_user_index, sub_training_data, sub_label = make_train_set(sub_start_date, sub_end_date,
                                                                  sub_test_start_date, sub_test_end_date)
    y = bst.predict(xgb.DMatrix(sub_training_data))

    pred = sub_user_index.copy()
    y_true = sub_user_index.copy()
    pred['label'] = y
    y_true['label'] = sub_label
    report(pred, y_true)


if __name__ == '__main__':
    # xgboost_cv()
    xgboost_make_submission()
--------------------------------------------------------------------------------