├── SleepTight_CodeInstruction.docx ├── README.md ├── .gitignore ├── v2_win.py ├── v1_win.py ├── v1_1.py ├── v2_1.py ├── get_res.py ├── user_cate.py └── user_cate2.py /SleepTight_CodeInstruction.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/anzhizh/2019-taida-jdata-top3/HEAD/SleepTight_CodeInstruction.docx -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 2019-taida-jdata-top3 2 | 3 | 方案描述和代码运行都在SleepTight_CodeInstruction中,整体方案着重特征工程,在模型构造和最终提交文件处理上有很大提升空间。 4 | 5 | 参赛收获:核心在于构建整个模型的体系,包含模型框架、特征维度等,保证每一个特征的加入都会使得模型更加丰富立体。仅个人看法。 6 | 7 | 感谢太白南路点子王、鱼遇雨欲语与余、小幸运,特别致谢太白南路点子王的各路好点子。 8 | 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | 8 | # Runtime data 9 | pids 10 | *.pid 11 | *.seed 12 | *.pid.lock 13 | 14 | # Directory for instrumented libs generated by jscoverage/JSCover 15 | lib-cov 16 | 17 | # Coverage directory used by tools like istanbul 18 | coverage 19 | 20 | # nyc test coverage 21 | .nyc_output 22 | 23 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 24 | .grunt 25 | 26 | # Bower dependency directory (https://bower.io/) 27 | bower_components 28 | 29 | # node-waf configuration 30 | .lock-wscript 31 | 32 | # Compiled binary addons (https://nodejs.org/api/addons.html) 33 | build/Release 34 | 35 | # Dependency directories 36 | node_modules/ 37 | jspm_packages/ 38 | 39 | # TypeScript v1 declaration files 40 | typings/ 41 | 42 | # Optional npm cache directory 43 | .npm 44 | 45 | # Optional eslint cache 46 | .eslintcache 47 | 48 | # Optional REPL history 49 | .node_repl_history 50 | 51 | # Output of 'npm pack' 
52 | *.tgz 53 | 54 | # Yarn Integrity file 55 | .yarn-integrity 56 | 57 | # dotenv environment variables file 58 | .env 59 | 60 | # next.js build output 61 | .next 62 | -------------------------------------------------------------------------------- /v2_win.py: -------------------------------------------------------------------------------- 1 | # 5天标签 2 | from user_cate_shop2 import * 3 | 4 | ignore_feat = ['label', 'type', 'user_id', 'cate','shop_id', 'sku_id','action_time', 'dt', 'market_time', 'shop_reg_tm', 5 | 'user_reg_tm', 'vender_id', 'module_id'] 6 | 7 | test_start_date = '2018-03-15' 8 | test_end_date = '2018-04-16' 9 | 10 | label_start_date = '2018-04-11' 11 | label_end_date = '2018-04-16' 12 | train_start_date = '2018-03-10' 13 | train_end_date = '2018-04-11' 14 | 15 | # train 16 | training_data = make_train_set(train_start_date, train_end_date, label_start_date, label_end_date, 30) 17 | 18 | feats = [f for f in training_data.columns if f not in ignore_feat] 19 | print(feats) 20 | print(len(feats)) 21 | label = training_data['label'].copy() 22 | user_index = training_data[['user_id', 'cate', 'shop_id']].copy() 23 | train = training_data[feats].values 24 | 25 | # test 26 | sub_training_data = make_test_set(test_start_date, test_end_date, 30) 27 | feats = [f for f in sub_training_data.columns if f not in ignore_feat] 28 | print(feats) 29 | print(len(feats)) 30 | sub_user_index = sub_training_data[['user_id', 'cate', 'shop_id']].copy() 31 | test = sub_training_data[feats].values 32 | print('test shape: ', test.shape) 33 | 34 | lgb_train_F12_5(train, label, test, sub_user_index) 35 | 36 | 37 | -------------------------------------------------------------------------------- /v1_win.py: -------------------------------------------------------------------------------- 1 | # 5天标签 2 | from user_cate2 import * 3 | ignore_feat = ['label', 'type', 'user_id', 'cate','shop_id', 'sku_id','action_time', 'dt', 'market_time', 'shop_reg_tm', 4 | 'user_reg_tm', 'vender_id', 
'module_id', 'cate_feat13_11', 'F11_feat13_11'] 5 | 6 | test_start_date = '2018-03-15' 7 | test_end_date = '2018-04-16' 8 | 9 | label_start_date = '2018-04-11' 10 | label_end_date = '2018-04-16' 11 | train_start_date = '2018-03-10' 12 | train_end_date = '2018-04-11' 13 | 14 | # train 15 | training_data = make_train_set(train_start_date, train_end_date, label_start_date, label_end_date, 30) 16 | 17 | feats = [f for f in training_data.columns if f not in ignore_feat] 18 | print(feats) 19 | print(len(feats)) 20 | label = training_data['label'].copy() 21 | user_index = training_data[['user_id', 'cate']].copy() 22 | train = training_data[feats].copy() 23 | del training_data 24 | # test 25 | sub_training_data = make_test_set(test_start_date, test_end_date, 30) 26 | feats = [f for f in sub_training_data.columns if f not in ignore_feat] 27 | print(feats) 28 | print(len(feats)) 29 | sub_user_index = sub_training_data[['user_id', 'cate']].copy() 30 | test = sub_training_data[feats].copy() 31 | print('test shape: ', test.shape) 32 | del sub_training_data 33 | lgb_train(train, label, test, sub_user_index) 34 | 35 | 36 | -------------------------------------------------------------------------------- /v1_1.py: -------------------------------------------------------------------------------- 1 | from user_cate import * 2 | 3 | ignore_feat = ['label', 'type', 'user_id', 'cate', 'shop_id', 'sku_id', 'action_time', 'dt', 'market_time', 'shop_reg_tm', 4 | 'user_reg_tm'] 5 | 6 | label_start_date = '2018-04-09' 7 | label_end_date = '2018-04-16' 8 | train_start_date = '2018-03-08' 9 | train_end_date = '2018-04-09' 10 | test_start_date = '2018-03-15' 11 | test_end_date = '2018-04-16' 12 | 13 | # train 14 | training_data = make_train_set(train_start_date, train_end_date, label_start_date, label_end_date) 15 | 16 | # test 17 | sub_training_data = make_test_set(test_start_date, test_end_date) 18 | 19 | 20 | # train 21 | feats_train = [f for f in training_data.columns if f not in 
ignore_feat] 22 | print(len(feats_train)) 23 | label = training_data['label'].copy() 24 | user_index = training_data[['user_id', 'cate']].copy() 25 | print('train shape: ', training_data.shape) 26 | train = training_data[feats_train].copy() 27 | print('train shape: ', train.shape) 28 | 29 | # test 30 | feats_test = [f for f in sub_training_data.columns if f not in ignore_feat] 31 | print(feats_test) 32 | print(len(feats_test)) 33 | sub_user_index = sub_training_data[['user_id', 'cate']].copy() 34 | test = sub_training_data[feats_test].copy() 35 | print('test shape: ', test.shape) 36 | 37 | # 训练 38 | lgb_train_F11_7(train, label, test, sub_user_index) 39 | -------------------------------------------------------------------------------- /v2_1.py: -------------------------------------------------------------------------------- 1 | # 一个月用户集 全量特征集 一周标签集 2 | 3 | from user_cate_shop import * # 清洗数据 4 | 5 | ignore_feat = ['label', 'type', 'user_id', 'cate','shop_id', 'sku_id','action_time', 'dt', 'market_time', 'shop_reg_tm', 6 | 'user_reg_tm', 'vender_id', 'module_id'] 7 | 8 | label_start_date = '2018-04-09' 9 | label_end_date = '2018-04-16' 10 | train_start_date = '2018-03-08' 11 | train_end_date = '2018-04-09' 12 | test_start_date = '2018-03-15' 13 | test_end_date = '2018-04-16' 14 | 15 | training_data = make_train_set_F12_7(train_start_date, train_end_date, label_start_date, label_end_date, start='2018-02-01') 16 | sub_training_data = make_test_set_F12_7(test_start_date, test_end_date, start='2018-02-01') 17 | 18 | # train 19 | feats = [f for f in training_data.columns if f not in ignore_feat] 20 | print(feats) 21 | print(len(feats)) 22 | label = training_data['label'].copy() 23 | user_index = training_data[['user_id', 'cate', 'shop_id']].copy() 24 | train = training_data[feats].copy() 25 | 26 | # test 27 | feats = [f for f in sub_training_data.columns if f not in ignore_feat] 28 | print(feats) 29 | print(len(feats)) 30 | sub_user_index = sub_training_data[['user_id', 
'cate', 'shop_id']].copy() 31 | test = sub_training_data[feats].copy() 32 | print('test shape: ', test.shape) 33 | 34 | del training_data, sub_training_data 35 | 36 | lgb_train_F12_7(train, label, test, sub_user_index) 37 | 38 | 39 | -------------------------------------------------------------------------------- /get_res.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import pickle 3 | # 数据准备 提速 4 | action_path = "./data/jdata_action.csv" 5 | comment_path = "./data/jdata_comment.csv" 6 | product_path = "./data/jdata_product.csv" 7 | user_path = "./data/jdata_user.csv" 8 | shop_path = "./data/jdata_shop.csv" 9 | 10 | user = pd.read_csv(user_path, sep=',') 11 | product = pd.read_csv(product_path, sep=',') 12 | action = pd.read_csv(action_path, sep=',') 13 | comment = pd.read_csv(comment_path, sep=',') 14 | shop = pd.read_csv(shop_path, sep=',') 15 | 16 | pickle.dump(user, open('./cache/origin_user.pkl', 'wb')) 17 | pickle.dump(product, open('./cache/origin_product.pkl', 'wb')) 18 | pickle.dump(action, open('./cache/origin_action.pkl', 'wb')) 19 | pickle.dump(comment, open('./cache/origin_comment.pkl', 'wb')) 20 | pickle.dump(shop, open('./cache/origin_shop.pkl', 'wb')) 21 | 22 | """ 23 | F12_7 24 | """ 25 | from user_cate_shop import make_train_set_F12_7, make_test_set_F12_7,lgb_train_F12_7 26 | 27 | ignore_feat = ['label', 'type', 'user_id', 'cate','shop_id', 'sku_id','action_time', 'dt', 'market_time', 'shop_reg_tm', 28 | 'user_reg_tm', 'vender_id', 'module_id'] 29 | 30 | label_start_date = '2018-04-09' 31 | label_end_date = '2018-04-16' 32 | train_start_date = '2018-03-08' 33 | train_end_date = '2018-04-09' 34 | test_start_date = '2018-03-15' 35 | test_end_date = '2018-04-16' 36 | 37 | training_data = make_train_set_F12_7(train_start_date, train_end_date, label_start_date, label_end_date, start='2018-02-01') 38 | sub_training_data = make_test_set_F12_7(test_start_date, test_end_date, 
start='2018-02-01') 39 | 40 | # train 41 | feats = [f for f in training_data.columns if f not in ignore_feat] 42 | print(feats) 43 | print(len(feats)) 44 | label = training_data['label'].copy() 45 | train = training_data[feats].copy() 46 | 47 | # test 48 | feats = [f for f in sub_training_data.columns if f not in ignore_feat] 49 | print(feats) 50 | print(len(feats)) 51 | sub_user_index = sub_training_data[['user_id', 'cate', 'shop_id']].copy() 52 | test = sub_training_data[feats].copy() 53 | print('test shape: ', test.shape) 54 | 55 | lgb_train_F12_7(train, label, test, sub_user_index) 56 | 57 | """ 58 | F11_7 59 | """ 60 | from user_cate import make_train_set_F11_7, make_test_set_F11_7, lgb_train_F11_7 61 | 62 | training_data = make_train_set_F11_7(train_start_date, train_end_date, label_start_date, label_end_date) 63 | sub_training_data = make_test_set_F11_7(test_start_date, test_end_date) 64 | 65 | # train 66 | feats_train = [f for f in training_data.columns if f not in ignore_feat] 67 | print(len(feats_train)) 68 | label = training_data['label'].copy() 69 | print('train shape: ', training_data.shape) 70 | train = training_data[feats_train].copy() 71 | print('train shape: ', train.shape) 72 | 73 | # test 74 | feats_test = [f for f in sub_training_data.columns if f not in ignore_feat] 75 | print(feats_test) 76 | print(len(feats_test)) 77 | sub_user_index = sub_training_data[['user_id', 'cate']].copy() 78 | test = sub_training_data[feats_test].copy() 79 | print('test shape: ', test.shape) 80 | 81 | # 训练 82 | lgb_train_F11_7(train, label, test, sub_user_index) 83 | 84 | """ 85 | F12_5 86 | """ 87 | from user_cate_shop2 import make_train_set_F12_5, make_test_set_F12_5, lgb_train_F12_5 88 | test_start_date = '2018-03-15' 89 | test_end_date = '2018-04-16' 90 | 91 | label_start_date = '2018-04-11' 92 | label_end_date = '2018-04-16' 93 | train_start_date = '2018-03-10' 94 | train_end_date = '2018-04-11' 95 | 96 | training_data = make_train_set_F12_5(train_start_date, 
train_end_date, label_start_date, label_end_date, 30) 97 | feats = [f for f in training_data.columns if f not in ignore_feat] 98 | print(feats) 99 | print(len(feats)) 100 | label = training_data['label'].copy() 101 | train = training_data[feats].values 102 | 103 | sub_training_data = make_test_set_F12_5(test_start_date, test_end_date, 30) 104 | feats = [f for f in sub_training_data.columns if f not in ignore_feat] 105 | print(feats) 106 | print(len(feats)) 107 | sub_user_index = sub_training_data[['user_id', 'cate', 'shop_id']].copy() 108 | test = sub_training_data[feats].values 109 | print('test shape: ', test.shape) 110 | 111 | lgb_train_F12_5(train, label, test, sub_user_index) 112 | 113 | """ 114 | F11_5 115 | """ 116 | from user_cate2 import make_train_set_F11_5, make_test_set_F11_5, lgb_train_F11_5 117 | training_data = make_train_set_F11_5(train_start_date, train_end_date, label_start_date, label_end_date, 30) 118 | 119 | feats = [f for f in training_data.columns if f not in ignore_feat] 120 | print(feats) 121 | print(len(feats)) 122 | label = training_data['label'].copy() 123 | user_index = training_data[['user_id', 'cate']].copy() 124 | train = training_data[feats].copy() 125 | del training_data 126 | 127 | sub_training_data = make_test_set_F11_5(test_start_date, test_end_date, 30) 128 | feats = [f for f in sub_training_data.columns if f not in ignore_feat] 129 | print(feats) 130 | print(len(feats)) 131 | sub_user_index = sub_training_data[['user_id', 'cate']].copy() 132 | test = sub_training_data[feats].copy() 133 | print('test shape: ', test.shape) 134 | del sub_training_data 135 | lgb_train_F11_5(train, label, test, sub_user_index) 136 | 137 | """ 138 | MERGE1 139 | """ 140 | import pandas as pd 141 | 142 | f11_col = ['user_id','cate'] 143 | f12_col = ['user_id','cate','shop_id'] 144 | 145 | # 新的的F11 F12结果 146 | f11_new_best_prob = pd.read_csv('./res/sub_F11_7.csv') 147 | f12_new_best_prob = pd.read_csv('./res/sub_F12_7.csv') 148 | 149 | # 5天的结果 150 | 
f12_5days_F11_prob = pd.read_csv('./res/sub_F11_5.csv')
f12_5days_F12_prob = pd.read_csv('./res/sub_F12_5.csv')


def get_old_best(f11_best_prob, f12_best_prob):
    '''
    Blend the ranked F11 (user/cate) and F12 (user/cate/shop) predictions
    into a single submission frame.
    '''
    f11_col = ['user_id','cate']
    f12_col = ['user_id','cate','shop_id']
    # Keep only the top-32k rows of each ranked prediction list.
    f11_best_prob = f11_best_prob[f11_col][:32000]
    f12_best_prob = f12_best_prob[f12_col][:32000]
    # Unique (user, cate) pairs from F11, expanded to shops via the F12 list.
    unique_pairs = f11_best_prob.drop_duplicates(f11_col)[f11_col]
    expanded = unique_pairs.merge(f12_best_prob, on=f11_col, how='inner')
    # F12's own top-15k takes priority; de-dup keeps the first occurrence.
    stacked = pd.concat([f12_best_prob.head(15000), expanded], axis=0, ignore_index=True)
    output_csv = stacked.drop_duplicates(f12_col, keep='first')
    print('output_csv.shape:', output_csv.shape)
    return output_csv


# Blend the 5-day F11/F12 pair.
f12_5days_best_prob = get_old_best(f12_5days_F11_prob, f12_5days_F12_prob)

# Blend the 7-day F11/F12 pair.
new_best = get_old_best(f11_new_best_prob, f12_new_best_prob)

# Head slices of the 5-day blend, candidates to append to the 7-day blend.
fiveDay20k = f12_5days_best_prob[f12_col].head(20000)
fiveDay15k = f12_5days_best_prob[f12_col].head(15000)
fiveDay10k = f12_5days_best_prob[f12_col].head(10000)
fiveDay5k = f12_5days_best_prob[f12_col].head(5000)

print('原结果:', new_best.shape[0])

merge5day_5k = pd.concat([new_best, fiveDay5k], ignore_index=True)
print('融合5天模型5k去重:', merge5day_5k.drop_duplicates().shape[0])

merge5day_10k = pd.concat([new_best, fiveDay10k], ignore_index=True)
print('融合5天模型10k去重:', merge5day_10k.drop_duplicates().shape[0])

merge5day_15k = pd.concat([new_best, fiveDay15k], ignore_index=True)
print('融合5天模型15k去重:', merge5day_15k.drop_duplicates().shape[0])

merge5day_20k = pd.concat([new_best, fiveDay20k], ignore_index=True)
print('融合5天模型20k去重:', merge5day_20k.drop_duplicates().shape[0])

# Write results (adjust output paths as needed).
merge5day_5k.drop_duplicates().to_csv('./res/merge5day_5k.csv', index=False)
merge5day_10k.drop_duplicates().to_csv('./res/merge5day_10k.csv',index=False) 200 | merge5day_15k.drop_duplicates().to_csv('./res/merge5day_15k.csv',index=False) 201 | merge5day_20k.drop_duplicates().to_csv('./res/merge5day_20k.csv',index=False) 202 | new_best.drop_duplicates().to_csv('./res/new_best.csv',index=False) 203 | 204 | 205 | """ 206 | MERGE2 207 | """ 208 | # 新的的F11 F12结果 209 | f11_best_prob = pd.read_csv('./res/sub_F11_7.csv') 210 | f12_best_prob = pd.read_csv('./res/sub_F12_7.csv') 211 | 212 | # 5天的结果 213 | f11_5days_prob = pd.read_csv('./res/sub_F11_5.csv') 214 | f12_5days_prob = pd.read_csv('./res/sub_F12_5.csv') 215 | 216 | df1 = pd.merge(f11_best_prob, f11_5days_prob, on=f12_col, how='outer') 217 | df1 = df1.fillna(0) 218 | print(df1.shape) 219 | 220 | df2 = pd.merge(f12_best_prob, f12_5days_prob, on=f12_col, how='outer') 221 | df2 = df2.fillna(0) 222 | print(df2.shape) 223 | 224 | # 6:4 225 | df1['label']=0.6*df1['label_x']+0.4*df1['label_y'] 226 | df1.sort_values(by=['label'], ascending=[0],inplace=True) 227 | df1 = df1.head(32000) 228 | df2['label']=0.6*df2['label_x']+0.4*df2['label_y'] 229 | df2.sort_values(by=['label'], ascending=[0],inplace=True) 230 | df2 = df2.head(32000) 231 | print('6/4: ') 232 | sub1 = get_old_best(df1, df2) 233 | 234 | # 5:5 235 | df1['label']=0.5*df1['label_x']+0.5*df1['label_y'] 236 | df1.sort_values(by=['label'], ascending=[0],inplace=True) 237 | df1 = df1.head(32000) 238 | df2['label']=0.5*df2['label_x']+0.5*df2['label_y'] 239 | df2.sort_values(by=['label'], ascending=[0],inplace=True) 240 | df2 = df2.head(32000) 241 | print('5/5: ') 242 | sub2 = get_old_best(df1, df2) 243 | 244 | # 7:3 245 | df1['label']=0.7*df1['label_x']+0.3*df1['label_y'] 246 | df1.sort_values(by=['label'], ascending=[0],inplace=True) 247 | df1 = df1.head(32000) 248 | df2['label']=0.7*df2['label_x']+0.3*df2['label_y'] 249 | df2.sort_values(by=['label'], ascending=[0],inplace=True) 250 | df2 = df2.head(32000) 251 | print('7/3: ') 252 | sub3 = 
get_old_best(df1, df2) 253 | 254 | a = pd.merge(sub1, sub2, on=['user_id', 'cate', 'shop_id']) 255 | b = pd.merge(sub1, sub3, on=['user_id', 'cate', 'shop_id']) 256 | c = pd.merge(sub2, sub3, on=['user_id', 'cate', 'shop_id']) 257 | print("6/4与5/5: ", a.shape) 258 | print("6/4与7/3: ", b.shape) 259 | print("5/5与7/3: ", c.shape) 260 | 261 | sub2.drop_duplicates().to_csv('./res/merge_2.csv',index=False) -------------------------------------------------------------------------------- /user_cate.py: -------------------------------------------------------------------------------- 1 | from user_cate_shop import * 2 | 3 | 4 | # 行为比例特征(2.01-4.08) 滑窗 5 | def get_accumulate_user_feat_v1(start_date, end_date): 6 | dump_path = './cache/user_feat_accumulate_F11_7_%s_%s.pkl' % (start_date, end_date) 7 | 8 | if os.path.exists(dump_path): 9 | f11_actions = pd.read_pickle(dump_path) 10 | else: 11 | actions = get_actions_product(start_date, end_date) 12 | 13 | df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date)) 14 | actions = pd.concat([actions[['user_id', 'cate']], df], axis=1) 15 | 16 | # 索引 17 | f11_actions = actions[['user_id', 'cate']].drop_duplicates() 18 | 19 | actions1 = actions.drop(['cate'], axis=1) 20 | actions1 = actions1.groupby(['user_id'], as_index=False).sum().add_prefix('user_id_') 21 | actions1['user_action_1_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_1' % (start_date, end_date)] 22 | actions1['user_action_4_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_4' % (start_date, end_date)] 23 | actions1['user_action_3_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_3' % (start_date, end_date)] 24 | actions1.rename(columns={'user_id_user_id': 'user_id'}, inplace=True) 25 | 26 | 
actions2 = actions.drop(['user_id'], axis=1) 27 | actions2 = actions2.groupby(['cate'], as_index=False).sum().add_prefix('cate_') 28 | actions2['cate_action_1_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_1' % (start_date, end_date)] 29 | actions2['cate_action_4_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_4' % (start_date, end_date)] 30 | actions2['cate_action_3_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_3' % (start_date, end_date)] 31 | actions2.rename(columns={'cate_cate': 'cate'}, inplace=True) 32 | 33 | actions4 = actions 34 | actions4 = actions4.groupby(['user_id', 'cate'], as_index=False).sum().add_prefix('user_cate_shop_id_') 35 | actions4['user_cate_shop_id_action_1_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_1' % (start_date, end_date)] 36 | actions4['user_cate_shop_id_action_4_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_4' % (start_date, end_date)] 37 | actions4['user_cate_shop_id_action_3_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_3' % (start_date, end_date)] 38 | actions4.rename(columns={'user_cate_shop_id_user_id': 'user_id', 'user_cate_shop_id_cate': 'cate'}, inplace=True) 39 | 40 | # 拼接 41 | f11_actions = f11_actions.merge(actions1, on='user_id', how='left') 42 | f11_actions = f11_actions.merge(actions2, on='cate', how='left') 43 | f11_actions = f11_actions.merge(actions4, on=['user_id', 'cate'], how='left') 44 | # f11_actions.to_pickle(dump_path) 45 | 46 | print('accumulate user 
finished') 47 | return f11_actions 48 | 49 | 50 | # 读取行为数据,与产品数据拼接(用于生成购物车特征) 51 | def get_actions_product_cart(start_date, end_date): 52 | dump_path = './cache/all_action_product_cart__F11_7_%s_%s.pkl' % (start_date, end_date) 53 | if os.path.exists(dump_path): 54 | actions = pd.read_pickle(dump_path) 55 | else: 56 | actions = pd.read_pickle('./cache/origin_action.pkl') 57 | product = pd.read_pickle('./cache/origin_product.pkl') 58 | shop = pd.read_pickle('./cache/origin_shop.pkl') 59 | actions['action_time'] = pd.to_datetime(actions['action_time']) 60 | actions = actions[(actions.action_time >= start_date) & (actions.action_time < end_date)] 61 | actions = actions[actions['sku_id'].isin(product['sku_id'])] # 行为中sku_id不在product中的 62 | actions = pd.merge(actions, product, on='sku_id', how='left') 63 | actions = actions[actions['cate'] != 13] # cate13的数据没有购买行为 64 | actions = pd.merge(actions, shop[['shop_id', 'vender_id']], on=['shop_id'], how='left') 65 | print(actions.shape) 66 | actions = actions[actions['vender_id'] != 3666] # 数据没有购买行为 67 | print(actions.shape) 68 | # actions.to_pickle(dump_path) 69 | return actions 70 | 71 | 72 | def get_accumulate_user_feat_v1_cart(start_date, end_date): 73 | dump_path = './cache/user_feat_accumulate_F11_7_%s_%s.pkl' % (start_date, end_date) 74 | 75 | if os.path.exists(dump_path): 76 | f11_actions = pd.read_pickle(dump_path) 77 | else: 78 | actions = get_actions_product_cart(start_date, end_date) 79 | 80 | df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date)) 81 | actions = pd.concat([actions[['user_id', 'cate']], df], axis=1) 82 | 83 | # 索引 84 | f11_actions = actions[['user_id', 'cate']].drop_duplicates() 85 | 86 | actions1 = actions.drop(['cate'], axis=1) 87 | actions1 = actions1.groupby(['user_id'], as_index=False).sum().add_prefix('user_id_') 88 | actions1['user_action_1_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / 
actions1['user_id_%s-%s-action_1' % (start_date, end_date)] 89 | actions1['user_action_4_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_4' % (start_date, end_date)] 90 | actions1['user_action_3_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_3' % (start_date, end_date)] 91 | actions1['user_action_5_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % ( 92 | start_date, end_date)] / actions1['user_id_%s-%s-action_5' % (start_date, end_date)] 93 | 94 | actions1.rename(columns={'user_id_user_id': 'user_id'}, inplace=True) 95 | 96 | actions2 = actions.drop(['user_id'], axis=1) 97 | actions2 = actions2.groupby(['cate'], as_index=False).sum().add_prefix('cate_') 98 | actions2['cate_action_1_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_1' % (start_date, end_date)] 99 | actions2['cate_action_4_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_4' % (start_date, end_date)] 100 | actions2['cate_action_3_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_3' % (start_date, end_date)] 101 | actions2['cate_action_5_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % ( 102 | start_date, end_date)] / actions2['cate_%s-%s-action_5' % (start_date, end_date)] 103 | 104 | actions2.rename(columns={'cate_cate': 'cate'}, inplace=True) 105 | 106 | actions4 = actions 107 | actions4 = actions4.groupby(['user_id', 'cate'], as_index=False).sum().add_prefix('user_cate_shop_id_') 108 | actions4['user_cate_shop_id_action_1_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / 
actions4['user_cate_shop_id_%s-%s-action_1' % (start_date, end_date)] 109 | actions4['user_cate_shop_id_action_4_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_4' % (start_date, end_date)] 110 | actions4['user_cate_shop_id_action_3_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_3' % (start_date, end_date)] 111 | actions4['user_cate_shop_id_action_5_ratio_%s_%s' % (start_date, end_date)] = actions4[ 112 | 'user_cate_shop_id_%s-%s-action_2' % ( 113 | start_date, end_date)] / \ 114 | actions4[ 115 | 'user_cate_shop_id_%s-%s-action_5' % ( 116 | start_date, end_date)] 117 | 118 | actions4.rename(columns={'user_cate_shop_id_user_id': 'user_id', 'user_cate_shop_id_cate': 'cate'}, inplace=True) 119 | 120 | # 拼接 121 | f11_actions = f11_actions.merge(actions1, on='user_id', how='left') 122 | f11_actions = f11_actions.merge(actions2, on='cate', how='left') 123 | f11_actions = f11_actions.merge(actions4, on=['user_id', 'cate'], how='left') 124 | #f11_actions.to_pickle(dump_path) 125 | 126 | print('accumulate user finished') 127 | return f11_actions 128 | 129 | 130 | # 基础统计特征 131 | def get_stat_feat_v1(start_date, end_date): 132 | dump_path = './cache/stat_feat_accumulate_F11_7_%s_%s.pkl' % (start_date, end_date) 133 | if os.path.exists(dump_path): 134 | action = pd.read_pickle(dump_path) 135 | else: 136 | action = get_actions_product(start_date, end_date) 137 | action_index = action[['user_id', 'cate']].drop_duplicates() 138 | 139 | # 行为onehot 140 | action_type = pd.get_dummies(action['type']) 141 | action_type.columns = ['act_1', 'act_2', 'act_3', 'act_4'] 142 | action_type = action_type[['act_1', 'act_2', 'act_3', 'act_4']] 143 | action_type['cate'] = action['cate'] 144 | action_type['user_id'] = action['user_id'] 145 | action_type['shop_id'] = action['shop_id'] 
146 | 147 | # 基于user_id的统计特征 148 | user_stat = action[['user_id']].drop_duplicates() 149 | user_action_count = action.groupby('user_id')['type'].count() 150 | user_order_count = action_type.groupby('user_id')['act_2'].sum() 151 | user_order_rate = user_order_count / (user_action_count).fillna(0) 152 | user_cate_count = action.groupby('user_id')['cate'].nunique() 153 | user_sku_count = action.groupby('user_id')['sku_id'].nunique() 154 | user_shop_count = action.groupby('user_id')['shop_id'].nunique() 155 | 156 | user_stat['user_action_count_%s_%s' % (start_date, end_date)] = user_action_count 157 | user_stat['user_order_rate_%s_%s' % (start_date, end_date)] = user_order_rate 158 | user_stat['user_cate_count_%s_%s' % (start_date, end_date)] = user_cate_count 159 | user_stat['user_sku_count_%s_%s' % (start_date, end_date)] = user_sku_count 160 | user_stat['user_shop_count_%s_%s' % (start_date, end_date)] = user_shop_count 161 | 162 | # 基于cate的统计特征 163 | cate_stat = action[['cate']].drop_duplicates() 164 | 165 | # cate下的用户特征 166 | cate_user_count = action.groupby('cate')['user_id'].count() 167 | cate_user_nunique = action.groupby('cate')['user_id'].nunique() 168 | cate_order_count = action_type.groupby('cate')['act_2'].sum() 169 | cate_order_rate = cate_order_count / cate_user_count 170 | 171 | # cate下:购买用户/总用户 172 | cate_order_user_count = action_type.groupby(['cate', 'user_id'])['act_2'].sum().reset_index() 173 | cate_order_user_count = cate_order_user_count[cate_order_user_count.act_2 > 0].groupby('cate')['user_id'].nunique() 174 | cate_order_user_rate = (cate_order_user_count / cate_user_nunique) 175 | cate_sku_nunique = action.groupby('cate')['sku_id'].nunique() 176 | 177 | # cate下的店铺特征 178 | cate_shop_count = action.groupby('cate')['shop_id'].count() 179 | cate_shop_nunique = action.groupby('cate')['shop_id'].nunique() 180 | cate_shop_order_count = action_type.groupby('cate')['act_2'].sum() 181 | cate_shop_order_rate = cate_shop_order_count / cate_shop_count 182 
| 183 | # cate下: 购买店铺/总店铺 184 | cate_order_shop_count = action_type.groupby(['cate', 'shop_id'])['act_2'].sum().reset_index() 185 | cate_order_shop_count = cate_order_shop_count[cate_order_shop_count.act_2 > 0].groupby('cate')['shop_id'].nunique() 186 | cate_order_shop_rate = (cate_order_shop_count / cate_shop_nunique) 187 | 188 | cate_stat['cate_user_count_%s_%s' % (start_date, end_date)] = cate_user_count 189 | cate_stat['cate_user_nunique_%s_%s' % (start_date, end_date)] = cate_user_nunique 190 | cate_stat['cate_order_rate_%s_%s' % (start_date, end_date)] = cate_order_rate.fillna(0) 191 | cate_stat['cate_order_user_count_%s_%s' % (start_date, end_date)] = cate_order_user_count 192 | cate_stat['cate_order_user_rate_%s_%s' % (start_date, end_date)] = cate_order_user_rate 193 | cate_stat['cate_sku_nunique_%s_%s' % (start_date, end_date)] = cate_sku_nunique 194 | cate_stat['cate_shop_nunique_%s_%s' % (start_date, end_date)] = cate_shop_nunique 195 | 196 | cate_stat['cate_shop_order_rate_%s_%s' % (start_date, end_date)] = cate_shop_order_rate 197 | cate_stat['cate_order_shop_count_%s_%s' % (start_date, end_date)] = cate_order_shop_count 198 | cate_stat['cate_order_shop_rate_%s_%s' % (start_date, end_date)] = cate_order_shop_rate 199 | 200 | action = pd.merge(action_index, user_stat, on='user_id', how='left') 201 | action = pd.merge(action, cate_stat, on='cate', how='left') 202 | #action.to_pickle(dump_path) 203 | print('stat_feat finished') 204 | return action 205 | 206 | 207 | # 交叉特征 208 | def get_cross_feat_v1(start_date, end_date): 209 | dump_path = './cache/cross_feat_F11_7_%s_%s.pkl' % (start_date, end_date) 210 | if os.path.exists(dump_path): 211 | actions = pd.read_pickle(dump_path) 212 | else: 213 | actions = get_actions_product(start_date, end_date)[['user_id', 'cate']] 214 | actions['cnt'] = 0 215 | 216 | action1 = actions.groupby(['user_id', 'cate'], as_index=False).count() 217 | 218 | action2 = actions.groupby('user_id', as_index=False).count() 219 | del 
action2['cate'] 220 | action2.columns = ['user_id', 'user_cnt'] 221 | 222 | action3 = actions.groupby('cate', as_index=False).count() 223 | del action3['user_id'] 224 | action3.columns = ['cate', 'cate_cnt'] 225 | actions = pd.merge(action1, action2, how='left', on='user_id') 226 | actions = pd.merge(actions, action3, how='left', on='cate') 227 | 228 | actions['user_cnt'] = actions['cnt'] / actions['user_cnt'] 229 | actions['cate_cnt'] = actions['cnt'] / actions['cate_cnt'] 230 | del actions['cnt'] 231 | #pickle.dump(actions, open(dump_path, 'wb')) 232 | actions.columns = ['user_id', 'cate'] + ['cross_feat_' + str(i) for i in range(1, actions.shape[1] - 1)] 233 | print('cross feature finished') 234 | return actions 235 | 236 | 237 | # U_B对行为1,2,4,5进行 浏览次数/用户总浏览次数(或者物品的浏览次数) 238 | def get_user_feat15_v1(start_date, end_date): 239 | dump_path = './cache/user_feat15_v1_F11_7_%s_%s.pkl' % (start_date, end_date) 240 | if os.path.exists(dump_path): 241 | actions = pd.read_pickle(dump_path) 242 | actions.columns = ['user_id', 'cate'] + ['user_feat15_' + str(i) for i in 243 | range(1, actions.shape[1] - 1)] 244 | return actions 245 | else: 246 | temp = None 247 | df = get_actions_product(start_date, end_date)[['user_id', 'cate', 'type']] 248 | for i in (1, 2, 3): 249 | actions = df[df['type'] == i] 250 | action1 = actions.groupby(['user_id', 'cate'], as_index=False).count() 251 | action1.columns = ['user_id', 'cate', 'visit'] 252 | 253 | action2 = actions.groupby('user_id', as_index=False).count() 254 | del action2['type'] 255 | action2.columns = ['user_id', 'user_visits_cate'] 256 | 257 | action4 = actions.groupby('cate', as_index=False).count() 258 | del action4['type'] 259 | action4.columns = ['cate', 'cate_visits_user'] 260 | 261 | actions = pd.merge(action1, action2, how='left', on='user_id') 262 | actions = pd.merge(actions, action4, how='left', on='cate') 263 | 264 | actions['visit_rate_user1'] = actions['visit'] / actions['user_visits_cate'] 265 | 
actions['visit_rate_cate1'] = actions['visit'] / actions['cate_visits_user'] 266 | if temp is None: 267 | temp = actions 268 | else: 269 | temp = pd.merge(temp, actions, how="outer", on=['user_id', 'cate']) 270 | #pickle.dump(temp, open(dump_path, 'wb')) 271 | temp.columns = ['user_id', 'cate'] + ['user_feat15_' + str(i) for i in 272 | range(1, temp.shape[1] - 1)] 273 | return temp 274 | 275 | 276 | # 标签 277 | def get_labels_v1(start_date, end_date): 278 | dump_path = './cache/labels_F11_7_%s_%s.pkl' % (start_date, end_date) 279 | if os.path.exists(dump_path): 280 | actions = pd.read_pickle(dump_path) 281 | else: 282 | actions = get_actions_product(start_date, end_date) 283 | actions = actions[actions['type'] == 2] 284 | actions = actions.groupby(['user_id', 'cate'], as_index=False).sum() 285 | actions['label'] = 1 286 | actions = actions[['user_id', 'cate', 'label']] 287 | #actions.to_pickle(dump_path) 288 | print('label finished') 289 | return actions 290 | 291 | 292 | def make_train_set_F11_7(train_start_date, train_end_date, test_start_date, test_end_date, days=30): 293 | dump_path = './cache/train_set_F11_7_%s_%s_%s_%s.pkl' % ( 294 | train_start_date, train_end_date, test_start_date, test_end_date) 295 | if os.path.exists(dump_path): 296 | actions = pd.read_pickle(dump_path) 297 | else: 298 | # 索引 299 | f11_actions = get_actions_product(train_start_date, train_end_date) 300 | f11_actions = f11_actions[['user_id', 'cate']].drop_duplicates() 301 | 302 | # 标签 303 | labels = get_labels_v1(test_start_date, test_end_date) 304 | 305 | # 特征 306 | start_days = "2018-02-01" 307 | user = get_basic_user_feat() 308 | product_stat = get_product_stat_feat(start_days, train_end_date) 309 | time = get_time_feat(start_days, train_end_date) 310 | stat_feat = get_stat_feat_v1(start_days, train_end_date) 311 | user_feat = user_features(start_days, train_end_date) 312 | cross_feat = get_cross_feat_v1(start_days, train_end_date) 313 | 314 | user_feat1 = get_user_feat1(start_days, 
train_end_date) 315 | user_feat2 = get_user_feat2(start_days, train_end_date) 316 | user_feat3 = get_user_feat3(start_days, train_end_date) 317 | user_feat5 = get_user_feat5(start_days, train_end_date) 318 | user_feat6 = get_user_feat6(start_days, train_end_date) 319 | user_feat7 = get_user_feat7(start_days, train_end_date) 320 | user_feat8 = get_user_feat8(start_days, train_end_date) 321 | user_feat9 = get_user_feat9(start_days, train_end_date) 322 | user_feat10 = get_user_feat10(start_days, train_end_date) 323 | user_feat11 = get_user_feat11(start_days, train_end_date) 324 | user_feat12 = get_user_feat12(start_days, train_end_date) 325 | user_feat13 = get_user_feat13(start_days, train_end_date) 326 | user_feat14 = get_user_feat14(start_days, train_end_date) 327 | user_feat15 = get_user_feat15_v1(start_days, train_end_date) 328 | 329 | cate_feat1 = get_cate_feat_1(start_days, train_end_date) 330 | cate_feat2 = get_cate_feat_2(start_days, train_end_date) 331 | cate_feat3 = get_cate_feat_3(start_days, train_end_date) 332 | cate_feat4 = get_cate_feat_4(start_days, train_end_date) 333 | cate_feat5 = get_cate_feat_5(start_days, train_end_date) 334 | cate_feat6 = get_cate_feat_6(start_days, train_end_date) 335 | cate_feat7 = get_cate_feat_7(start_days, train_end_date) 336 | cate_feat8 = get_cate_feat_8(start_days, train_end_date) 337 | cate_feat9 = get_cate_feat_9(start_days, train_end_date) 338 | cate_feat10 = get_cate_feat_10(start_days, train_end_date) 339 | cate_feat11 = get_cate_feat_11(start_days, train_end_date) 340 | 341 | F11_feat1 = get_F11_feat_1(start_days, train_end_date) 342 | F11_feat3 = get_F11_feat_3(start_days, train_end_date) 343 | F11_feat4 = get_F11_feat_4(start_days, train_end_date) 344 | F11_feat5 = get_F11_feat_5(start_days, train_end_date) 345 | F11_feat6 = get_F11_feat_6(start_days, train_end_date) 346 | F11_feat7 = get_F11_feat_7(start_days, train_end_date) 347 | F11_feat8 = get_F11_feat_8(start_days, train_end_date) 348 | F11_feat9 = 
get_F11_feat_9(start_days, train_end_date) 349 | F11_feat10 = get_F11_feat_10(start_days, train_end_date) 350 | F11_feat11 = get_F11_feat_11(start_days, train_end_date) 351 | 352 | # 滑窗行为特征 353 | actions = None 354 | for i in (3, 5, 7, 14, 21, 30): 355 | start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i) 356 | start_days = start_days.strftime('%Y-%m-%d') 357 | if actions is None: 358 | actions = get_accumulate_user_feat_v1(start_days, train_end_date) 359 | else: 360 | actions1 = get_accumulate_user_feat_v1(start_days, train_end_date) 361 | actions = pd.merge(actions, actions1, how='left', on=['user_id', 'cate']) 362 | 363 | # 前一天滑窗行为 包含cart 364 | start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=1) 365 | start_days = start_days.strftime('%Y-%m-%d') 366 | actions_cart = get_accumulate_user_feat_v1_cart(start_days, train_end_date) 367 | 368 | # act_5 369 | # act5_feat = pd.read_csv('./cache_final/train_lastday_act5_stat.csv') 370 | act5_feat = get_last1day_cart_fearture(start_days, train_end_date, 1) 371 | act5_feat = act5_feat.groupby(['user_id', 'cate'], as_index=False).sum() 372 | del act5_feat['shop_id'] 373 | 374 | f11_actions = pd.merge(f11_actions, labels, how='left', on=['user_id', 'cate']) 375 | f11_actions = f11_actions.fillna(0) 376 | 377 | # 负采样 378 | print('train data size:', f11_actions.shape[0]) 379 | f11_actions_1 = f11_actions[f11_actions['label'] == 1] 380 | f11_actions_0 = f11_actions[f11_actions['label'] == 0] 381 | frac1 = (f11_actions_1.shape[0] * 30) / f11_actions_0.shape[0] # 负样本为正样本30倍 382 | f11_actions_0 = f11_actions_0.sample(frac=frac1).reset_index(drop=True) 383 | f11_actions = pd.concat([f11_actions_1, f11_actions_0], axis=0, ignore_index=True) 384 | f11_actions = f11_actions.sample(frac=1).reset_index(drop=True) 385 | print('train data size after sample:', f11_actions.shape[0]) 386 | 387 | actions = pd.merge(f11_actions, actions, how='left', on=['user_id', 'cate']) 388 | actions = 
pd.merge(actions, actions_cart, how='left', on=['user_id', 'cate']) 389 | actions = pd.merge(actions, user, how='left', on='user_id') 390 | actions = pd.merge(actions, time, how='left', on='user_id') 391 | actions = pd.merge(actions, stat_feat, how='left', on=['user_id', 'cate']) 392 | actions = pd.merge(actions, product_stat, how='left', on='cate') 393 | 394 | actions = pd.merge(actions, user_feat1, how='left', on='user_id') 395 | actions = pd.merge(actions, user_feat2, how='left', on='user_id') 396 | actions = pd.merge(actions, user_feat3, how='left', on='user_id') 397 | actions = pd.merge(actions, user_feat5, how='left', on='user_id') 398 | actions = pd.merge(actions, user_feat6, how='left', on='user_id') 399 | actions = pd.merge(actions, user_feat7, how='left', on='user_id') 400 | actions = pd.merge(actions, user_feat8, how='left', on='user_id') 401 | actions = pd.merge(actions, user_feat9, how='left', on='user_id') 402 | actions = pd.merge(actions, user_feat10, how='left', on='user_id') 403 | actions = pd.merge(actions, user_feat11, how='left', on='user_id') 404 | actions = pd.merge(actions, user_feat12, how='left', on='user_id') 405 | actions = pd.merge(actions, user_feat13, how='left', on='user_id') 406 | actions = pd.merge(actions, user_feat14, how='left', on='user_id') 407 | actions = pd.merge(actions, user_feat15, how='left', on=['user_id', 'cate']) 408 | actions = pd.merge(actions, user_feat, how='left', on='user_id') 409 | """ 410 | cate 411 | """ 412 | actions = pd.merge(actions, cate_feat1, how='left', on='cate') 413 | actions = pd.merge(actions, cate_feat2, how='left', on='cate') 414 | actions = pd.merge(actions, cate_feat3, how='left', on='cate') 415 | actions = pd.merge(actions, cate_feat4, how='left', on='cate') 416 | actions = pd.merge(actions, cate_feat5, how='left', on='cate') 417 | actions = pd.merge(actions, cate_feat6, how='left', on='cate') 418 | actions = pd.merge(actions, cate_feat7, how='left', on='cate') 419 | actions = 
pd.merge(actions, cate_feat8, how='left', on='cate') 420 | actions = pd.merge(actions, cate_feat9, how='left', on='cate') 421 | actions = pd.merge(actions, cate_feat10, how='left', on='cate') 422 | actions = pd.merge(actions, cate_feat11, how='left', on='cate') # 用于concat 423 | print('actions1 finished') 424 | """ 425 | F11 426 | """ 427 | actions = pd.merge(actions, F11_feat1, how='left', on=['user_id', 'cate']) 428 | actions = pd.merge(actions, F11_feat3, how='left', on=['user_id', 'cate']) 429 | actions = pd.merge(actions, F11_feat4, how='left', on=['user_id', 'cate']) 430 | actions = pd.merge(actions, F11_feat5, how='left', on=['user_id', 'cate']) 431 | actions = pd.merge(actions, F11_feat6, how='left', on=['user_id', 'cate']) 432 | actions = pd.merge(actions, F11_feat7, how='left', on=['user_id', 'cate']) 433 | actions = pd.merge(actions, F11_feat8, how='left', on=['user_id', 'cate']) 434 | actions = pd.merge(actions, F11_feat9, how='left', on=['user_id', 'cate']) 435 | actions = pd.merge(actions, F11_feat10, how='left', on=['user_id', 'cate']) 436 | actions = pd.merge(actions, F11_feat11, how='left', on=['user_id', 'cate']) 437 | 438 | actions = pd.merge(actions, act5_feat, how='left', on=['user_id', 'cate']) 439 | actions = pd.merge(actions, cross_feat, how='left', on=['user_id', 'cate']) 440 | actions = actions.fillna(0) 441 | print('train_set finised') 442 | return actions 443 | 444 | 445 | def make_test_set_F11_7(train_start_date, train_end_date): 446 | dump_path = './cache/test_set_F11_7_%s_%s.pkl' % (train_start_date, train_end_date) 447 | if os.path.exists(dump_path): 448 | actions = pd.read_pickle(dump_path) 449 | else: 450 | # 索引 451 | f11_actions = get_actions_product(train_start_date, train_end_date) 452 | f11_actions = f11_actions[['user_id', 'cate']] .drop_duplicates() 453 | 454 | # 特征 455 | start_days = "2018-02-01" 456 | user = get_basic_user_feat() 457 | product_stat = get_product_stat_feat(start_days, train_end_date) 458 | time = 
get_time_feat(start_days, train_end_date) 459 | stat_feat = get_stat_feat_v1(start_days, train_end_date) 460 | user_feat = user_features(start_days, train_end_date) 461 | cross_feat = get_cross_feat_v1(start_days, train_end_date) 462 | 463 | user_feat1 = get_user_feat1(start_days, train_end_date) 464 | user_feat2 = get_user_feat2(start_days, train_end_date) 465 | user_feat3 = get_user_feat3(start_days, train_end_date) 466 | user_feat5 = get_user_feat5(start_days, train_end_date) 467 | user_feat6 = get_user_feat6(start_days, train_end_date) 468 | user_feat7 = get_user_feat7(start_days, train_end_date) 469 | user_feat8 = get_user_feat8(start_days, train_end_date) 470 | user_feat9 = get_user_feat9(start_days, train_end_date) 471 | user_feat10 = get_user_feat10(start_days, train_end_date) 472 | user_feat11 = get_user_feat11(start_days, train_end_date) 473 | user_feat12 = get_user_feat12(start_days, train_end_date) 474 | user_feat13 = get_user_feat13(start_days, train_end_date) 475 | user_feat14 = get_user_feat14(start_days, train_end_date) 476 | user_feat15 = get_user_feat15_v1(start_days, train_end_date) 477 | 478 | 479 | cate_feat1 = get_cate_feat_1(start_days, train_end_date) 480 | cate_feat2 = get_cate_feat_2(start_days, train_end_date) 481 | cate_feat3 = get_cate_feat_3(start_days, train_end_date) 482 | cate_feat4 = get_cate_feat_4(start_days, train_end_date) 483 | cate_feat5 = get_cate_feat_5(start_days, train_end_date) 484 | cate_feat6 = get_cate_feat_6(start_days, train_end_date) 485 | cate_feat7 = get_cate_feat_7(start_days, train_end_date) 486 | cate_feat8 = get_cate_feat_8(start_days, train_end_date) 487 | cate_feat9 = get_cate_feat_9(start_days, train_end_date) 488 | cate_feat10 = get_cate_feat_10(start_days, train_end_date) 489 | cate_feat11 = get_cate_feat_11(start_days, train_end_date) 490 | 491 | F11_feat1 = get_F11_feat_1(start_days, train_end_date) 492 | F11_feat3 = get_F11_feat_3(start_days, train_end_date) 493 | F11_feat4 = 
get_F11_feat_4(start_days, train_end_date) 494 | F11_feat5 = get_F11_feat_5(start_days, train_end_date) 495 | F11_feat6 = get_F11_feat_6(start_days, train_end_date) 496 | F11_feat7 = get_F11_feat_7(start_days, train_end_date) 497 | F11_feat8 = get_F11_feat_8(start_days, train_end_date) 498 | F11_feat9 = get_F11_feat_9(start_days, train_end_date) 499 | F11_feat10 = get_F11_feat_10(start_days, train_end_date) 500 | F11_feat11 = get_F11_feat_11(start_days, train_end_date) 501 | 502 | # generate 时间窗口 503 | actions = None 504 | for i in (3, 5, 7, 14, 21, 30): 505 | start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i) 506 | start_days = start_days.strftime('%Y-%m-%d') 507 | if actions is None: 508 | actions = get_accumulate_user_feat_v1(start_days, train_end_date) 509 | else: 510 | actions1 = get_accumulate_user_feat_v1(start_days, train_end_date) 511 | actions = pd.merge(actions, actions1, how='left', on=['user_id', 'cate']) 512 | print(actions.shape) 513 | 514 | # 前一天滑窗行为 包含cart 515 | start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=1) 516 | start_days = start_days.strftime('%Y-%m-%d') 517 | actions_cart = get_accumulate_user_feat_v1_cart(start_days, train_end_date) 518 | 519 | # act_5 520 | # act5_feat = pd.read_csv('./cache_final/test_lastday_act5_stat.csv') 521 | act5_feat = get_last1day_cart_fearture(start_days, train_end_date, 1) 522 | act5_feat = act5_feat.groupby(['user_id', 'cate'], as_index=False).sum() 523 | del act5_feat['shop_id'] 524 | 525 | actions = pd.merge(f11_actions, actions, how='left', on=['user_id', 'cate']) 526 | actions = pd.merge(actions, actions_cart, how='left', on=['user_id', 'cate']) 527 | actions = pd.merge(actions, user, how='left', on='user_id') 528 | actions = pd.merge(actions, time, how='left', on='user_id') 529 | actions = pd.merge(actions, stat_feat, how='left', on=['user_id', 'cate']) 530 | actions = pd.merge(actions, product_stat, how='left', on='cate') 531 | 532 | actions = 
pd.merge(actions, user_feat1, how='left', on='user_id') 533 | actions = pd.merge(actions, user_feat2, how='left', on='user_id') 534 | actions = pd.merge(actions, user_feat3, how='left', on='user_id') 535 | actions = pd.merge(actions, user_feat5, how='left', on='user_id') 536 | actions = pd.merge(actions, user_feat6, how='left', on='user_id') 537 | actions = pd.merge(actions, user_feat7, how='left', on='user_id') 538 | actions = pd.merge(actions, user_feat8, how='left', on='user_id') 539 | actions = pd.merge(actions, user_feat9, how='left', on='user_id') 540 | actions = pd.merge(actions, user_feat10, how='left', on='user_id') 541 | actions = pd.merge(actions, user_feat11, how='left', on='user_id') 542 | actions = pd.merge(actions, user_feat12, how='left', on='user_id') 543 | actions = pd.merge(actions, user_feat13, how='left', on='user_id') 544 | actions = pd.merge(actions, user_feat14, how='left', on='user_id') 545 | actions = pd.merge(actions, user_feat15, how='left', on=['user_id', 'cate']) 546 | actions = pd.merge(actions, user_feat, how='left', on='user_id') 547 | """ 548 | cate 549 | """ 550 | actions = pd.merge(actions, cate_feat1, how='left', on='cate') 551 | actions = pd.merge(actions, cate_feat2, how='left', on='cate') 552 | actions = pd.merge(actions, cate_feat3, how='left', on='cate') 553 | actions = pd.merge(actions, cate_feat4, how='left', on='cate') 554 | actions = pd.merge(actions, cate_feat5, how='left', on='cate') 555 | actions = pd.merge(actions, cate_feat6, how='left', on='cate') 556 | actions = pd.merge(actions, cate_feat7, how='left', on='cate') 557 | actions = pd.merge(actions, cate_feat8, how='left', on='cate') 558 | actions = pd.merge(actions, cate_feat9, how='left', on='cate') 559 | actions = pd.merge(actions, cate_feat10, how='left', on='cate') 560 | actions = pd.merge(actions, cate_feat11, how='left', on='cate') 561 | print('actions1 finished') 562 | """ 563 | F11 564 | """ 565 | actions = pd.merge(actions, F11_feat1, how='left', 
on=['user_id', 'cate']) 566 | actions = pd.merge(actions, F11_feat3, how='left', on=['user_id', 'cate']) 567 | actions = pd.merge(actions, F11_feat4, how='left', on=['user_id', 'cate']) 568 | actions = pd.merge(actions, F11_feat5, how='left', on=['user_id', 'cate']) 569 | actions = pd.merge(actions, F11_feat6, how='left', on=['user_id', 'cate']) 570 | actions = pd.merge(actions, F11_feat7, how='left', on=['user_id', 'cate']) 571 | actions = pd.merge(actions, F11_feat8, how='left', on=['user_id', 'cate']) 572 | actions = pd.merge(actions, F11_feat9, how='left', on=['user_id', 'cate']) 573 | actions = pd.merge(actions, F11_feat10, how='left', on=['user_id', 'cate']) 574 | actions = pd.merge(actions, F11_feat11, how='left', on=['user_id', 'cate']) 575 | 576 | actions = pd.merge(actions, act5_feat, how='left', on=['user_id', 'cate']) 577 | actions = pd.merge(actions, cross_feat, how='left', on=['user_id', 'cate']) 578 | actions = actions.fillna(0) 579 | del stat_feat, f11_actions 580 | print('test_set finished') 581 | return actions 582 | 583 | 584 | def lgb_train_F11_7(X_train1, y_train1, X_test1, sub_user_index): 585 | # 提交结果 586 | sub = sub_user_index[['user_id', 'cate']].copy() 587 | sub['shop_id'] = 0 588 | sub['label'] = 0 589 | 590 | # 训练测试集 591 | X_train = X_train1.values 592 | y_train = y_train1.values 593 | X_test = X_test1.values 594 | 595 | del X_train1, y_train1, X_test1 596 | 597 | print('================================') 598 | print(X_train.shape) 599 | print(X_test.shape) 600 | print('================================') 601 | 602 | xx_logloss = [] 603 | oof_preds = np.zeros(X_train.shape[0]) 604 | N = 5 605 | skf = StratifiedKFold(n_splits=N, random_state=1024, shuffle=True) 606 | 607 | params = { 608 | 'learning_rate': 0.01, 609 | 'boosting_type': 'gbdt', 610 | 'objective': 'binary', 611 | 'metric': 'binary_logloss', 612 | 'num_leaves': 31, 613 | 'feature_fraction': 0.8, 614 | 'bagging_fraction': 0.8, 615 | 'bagging_freq': 5, 616 | 'seed': 1, 617 | 
'bagging_seed': 1, 618 | 'feature_fraction_seed': 7, 619 | 'min_data_in_leaf': 20, 620 | 'nthread': -1, # -1 621 | 'verbose': -1, 622 | } 623 | for k, (train_index, test_index) in enumerate(skf.split(X_train, y_train)): 624 | print('train _K_ flod', k) 625 | 626 | lgb_train = lgb.Dataset(X_train[train_index], y_train[train_index]) 627 | lgb_evals = lgb.Dataset(X_train[test_index], y_train[test_index], reference=lgb_train) 628 | 629 | lgbm = lgb.train(params, lgb_train, num_boost_round=50000, valid_sets=[lgb_train, lgb_evals], 630 | valid_names=['train', 'valid'], early_stopping_rounds=100, verbose_eval=200) 631 | 632 | sub['label'] += lgbm.predict(X_test, num_iteration=lgbm.best_iteration) / N 633 | oof_preds[test_index] = lgbm.predict(X_train[test_index], num_iteration=lgbm.best_iteration) 634 | xx_logloss.append(lgbm.best_score['valid']['binary_logloss']) 635 | print(xx_logloss) 636 | a = np.mean(xx_logloss) 637 | a = round(a, 5) 638 | print(a) 639 | 640 | sub = sub.sort_values(by='label', ascending=False) 641 | sub = sub.head(50000) 642 | sub = sub[['user_id', 'cate', 'shop_id','label']] 643 | 644 | sub.to_csv('./res/sub_F11_7.csv', index=False, index_label=False) -------------------------------------------------------------------------------- /user_cate2.py: -------------------------------------------------------------------------------- 1 | from user_cate_shop2 import * 2 | 3 | 4 | # 读取行为数据,与产品数据拼接(用于生成购物车特征) 5 | def get_actions_product_cart(start_date, end_date): 6 | dump_path = './cache/all_action_product_cart_F11_5_%s_%s.pkl' % (start_date, end_date) 7 | if os.path.exists(dump_path): 8 | actions = pd.read_pickle(dump_path) 9 | else: 10 | actions = pd.read_pickle('./cache/origin_action.pkl') 11 | product = pd.read_pickle('./cache/origin_product.pkl') 12 | shop = pd.read_pickle('./cache/origin_shop.pkl') 13 | actions['action_time'] = pd.to_datetime(actions['action_time']) 14 | actions = actions[(actions.action_time >= start_date) & (actions.action_time < 
end_date)] 15 | actions = actions[actions['sku_id'].isin(product['sku_id'])] # 行为中sku_id不在product中的 16 | actions = pd.merge(actions, product, on='sku_id', how='left') 17 | actions = actions[actions['cate'] != 13] # cate13的数据没有购买行为 18 | actions = pd.merge(actions, shop[['shop_id', 'vender_id']], on=['shop_id'], how='left') 19 | print(actions.shape) 20 | actions = actions[actions['vender_id'] != 3666] # 数据没有购买行为 21 | print(actions.shape) 22 | #actions.to_pickle(dump_path) 23 | return actions 24 | 25 | 26 | # 行为比例特征(2.01-4.08) 滑窗 27 | def get_accumulate_user_feat_v1(start_date, end_date): 28 | dump_path = './cache/user_feat_v1_accumulate_F11_5_%s_%s.pkl' % (start_date, end_date) 29 | 30 | if os.path.exists(dump_path): 31 | f11_actions = pd.read_pickle(dump_path) 32 | else: 33 | actions = get_actions_product(start_date, end_date) 34 | 35 | df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date)) 36 | actions = pd.concat([actions[['user_id', 'cate']], df], axis=1) 37 | 38 | # 索引 39 | f11_actions = actions[['user_id', 'cate']].drop_duplicates() 40 | 41 | actions1 = actions.drop(['cate'], axis=1) 42 | actions1 = actions1.groupby(['user_id'], as_index=False).sum().add_prefix('user_id_') 43 | actions1['user_action_1_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_1' % (start_date, end_date)] 44 | actions1['user_action_4_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_4' % (start_date, end_date)] 45 | actions1['user_action_3_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_3' % (start_date, end_date)] 46 | actions1.rename(columns={'user_id_user_id': 'user_id'}, inplace=True) 47 | 48 | actions2 = actions.drop(['user_id'], axis=1) 49 | actions2 = actions2.groupby(['cate'], 
as_index=False).sum().add_prefix('cate_') 50 | actions2['cate_action_1_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_1' % (start_date, end_date)] 51 | actions2['cate_action_4_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_4' % (start_date, end_date)] 52 | actions2['cate_action_3_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_3' % (start_date, end_date)] 53 | actions2.rename(columns={'cate_cate': 'cate'}, inplace=True) 54 | 55 | actions4 = actions 56 | actions4 = actions4.groupby(['user_id', 'cate'], as_index=False).sum().add_prefix('user_cate_shop_id_') 57 | actions4['user_cate_shop_id_action_1_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_1' % (start_date, end_date)] 58 | actions4['user_cate_shop_id_action_4_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_4' % (start_date, end_date)] 59 | actions4['user_cate_shop_id_action_3_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_3' % (start_date, end_date)] 60 | actions4.rename(columns={'user_cate_shop_id_user_id': 'user_id', 'user_cate_shop_id_cate': 'cate'}, inplace=True) 61 | 62 | # 拼接 63 | f11_actions = f11_actions.merge(actions1, on='user_id', how='left') 64 | f11_actions = f11_actions.merge(actions2, on='cate', how='left') 65 | f11_actions = f11_actions.merge(actions4, on=['user_id', 'cate'], how='left') 66 | #f11_actions.to_pickle(dump_path) 67 | print('accumulate user finished') 68 | return f11_actions 69 | 70 | 71 | def 
get_accumulate_user_cart_feat_v1(start_date, end_date): 72 | dump_path = './cache/user_cart_feat_v1_accumulate_F11_5_%s_%s.pkl' % (start_date, end_date) 73 | 74 | if os.path.exists(dump_path): 75 | f11_actions = pd.read_pickle(dump_path) 76 | else: 77 | actions = get_actions_product_cart(start_date, end_date) 78 | 79 | df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date)) 80 | actions = pd.concat([actions[['user_id', 'cate']], df], axis=1) 81 | 82 | # 索引 83 | f11_actions = actions[['user_id', 'cate']].drop_duplicates() 84 | 85 | actions1 = actions.drop(['cate'], axis=1) 86 | actions1 = actions1.groupby(['user_id'], as_index=False).sum().add_prefix('user_id_') 87 | actions1['user_action_1_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_1' % (start_date, end_date)] 88 | actions1['user_action_4_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_4' % (start_date, end_date)] 89 | actions1['user_action_3_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % (start_date, end_date)] / actions1['user_id_%s-%s-action_3' % (start_date, end_date)] 90 | actions1['user_action_5_ratio_%s_%s' % (start_date, end_date)] = actions1['user_id_%s-%s-action_2' % ( 91 | start_date, end_date)] / actions1['user_id_%s-%s-action_5' % (start_date, end_date)] 92 | 93 | actions1.rename(columns={'user_id_user_id': 'user_id'}, inplace=True) 94 | 95 | actions2 = actions.drop(['user_id'], axis=1) 96 | actions2 = actions2.groupby(['cate'], as_index=False).sum().add_prefix('cate_') 97 | actions2['cate_action_1_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_1' % (start_date, end_date)] 98 | actions2['cate_action_4_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, 
end_date)] / actions2['cate_%s-%s-action_4' % (start_date, end_date)] 99 | actions2['cate_action_3_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % (start_date, end_date)] / actions2['cate_%s-%s-action_3' % (start_date, end_date)] 100 | actions2['cate_action_5_ratio_%s_%s' % (start_date, end_date)] = actions2['cate_%s-%s-action_2' % ( 101 | start_date, end_date)] / actions2['cate_%s-%s-action_5' % (start_date, end_date)] 102 | 103 | actions2.rename(columns={'cate_cate': 'cate'}, inplace=True) 104 | 105 | actions4 = actions 106 | actions4 = actions4.groupby(['user_id', 'cate'], as_index=False).sum().add_prefix('user_cate_shop_id_') 107 | actions4['user_cate_shop_id_action_1_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_1' % (start_date, end_date)] 108 | actions4['user_cate_shop_id_action_4_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_4' % (start_date, end_date)] 109 | actions4['user_cate_shop_id_action_3_ratio_%s_%s' % (start_date, end_date)] = actions4['user_cate_shop_id_%s-%s-action_2' % (start_date, end_date)] / actions4['user_cate_shop_id_%s-%s-action_3' % (start_date, end_date)] 110 | actions4['user_cate_shop_id_action_5_ratio_%s_%s' % (start_date, end_date)] = actions4[ 111 | 'user_cate_shop_id_%s-%s-action_2' % ( 112 | start_date, end_date)] / \ 113 | actions4[ 114 | 'user_cate_shop_id_%s-%s-action_5' % ( 115 | start_date, end_date)] 116 | 117 | actions4.rename(columns={'user_cate_shop_id_user_id': 'user_id', 'user_cate_shop_id_cate': 'cate'}, inplace=True) 118 | 119 | # 拼接 120 | f11_actions = f11_actions.merge(actions1, on='user_id', how='left') 121 | f11_actions = f11_actions.merge(actions2, on='cate', how='left') 122 | f11_actions = f11_actions.merge(actions4, on=['user_id', 'cate'], how='left') 123 | 
# 基础统计特征 (basic statistical features)
def get_stat_feat_v1(start_date, end_date):
    """Basic statistical features per user and per cate over [start_date, end_date).

    From the product action log computes:
      * user level: total action count, order rate (act_2 / all actions),
        distinct cate / sku / shop counts;
      * cate level: user counts, order rates, the share of users and of shops
        with at least one purchase (act_2), distinct sku / shop counts.
    Returns one row per (user_id, cate) pair with both feature sets merged on.

    Bug fixes vs. the previous revision:
      * ``user_stat`` / ``cate_stat`` are now indexed by their key before the
        groupby Series are assigned, so values align by user_id / cate rather
        than by arbitrary positional row labels;
      * ``fillna(0)`` is applied to the order-rate ratio itself instead of to
        its (never-NaN) denominator.
    """
    dump_path = './cache/stat_feat_accumulate_v1_F11_5_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        action = pd.read_pickle(dump_path)
    else:
        action = get_actions_product(start_date, end_date)
        action_index = action[['user_id', 'cate']].drop_duplicates()

        # one-hot of the action type (act_2 appears to be the purchase action,
        # cf. get_labels_v1 which labels type == 2 — TODO confirm)
        action_type = pd.get_dummies(action['type'])
        action_type.columns = ['act_1', 'act_2', 'act_3', 'act_4']
        action_type = action_type[['act_1', 'act_2', 'act_3', 'act_4']]
        action_type['cate'] = action['cate']
        action_type['user_id'] = action['user_id']
        action_type['shop_id'] = action['shop_id']

        # ---- user-level statistics; index by user_id so the groupby Series
        # align on the key (fix for positional-label misalignment) ----
        user_stat = action[['user_id']].drop_duplicates().set_index('user_id')
        user_action_count = action.groupby('user_id')['type'].count()
        user_order_count = action_type.groupby('user_id')['act_2'].sum()
        # fillna belongs on the ratio, not the denominator (fix)
        user_order_rate = (user_order_count / user_action_count).fillna(0)
        user_cate_count = action.groupby('user_id')['cate'].nunique()
        user_sku_count = action.groupby('user_id')['sku_id'].nunique()
        user_shop_count = action.groupby('user_id')['shop_id'].nunique()

        user_stat['user_action_count_%s_%s' % (start_date, end_date)] = user_action_count
        user_stat['user_order_rate_%s_%s' % (start_date, end_date)] = user_order_rate
        user_stat['user_cate_count_%s_%s' % (start_date, end_date)] = user_cate_count
        user_stat['user_sku_count_%s_%s' % (start_date, end_date)] = user_sku_count
        user_stat['user_shop_count_%s_%s' % (start_date, end_date)] = user_shop_count
        user_stat = user_stat.reset_index()  # user_id back to a column for the merge

        # ---- cate-level statistics (same alignment fix) ----
        cate_stat = action[['cate']].drop_duplicates().set_index('cate')

        # users under each cate
        cate_user_count = action.groupby('cate')['user_id'].count()
        cate_user_nunique = action.groupby('cate')['user_id'].nunique()
        cate_order_count = action_type.groupby('cate')['act_2'].sum()
        cate_order_rate = cate_order_count / cate_user_count

        # per cate: users with >= 1 purchase / distinct users
        cate_order_user_count = action_type.groupby(['cate', 'user_id'])['act_2'].sum().reset_index()
        cate_order_user_count = cate_order_user_count[cate_order_user_count.act_2 > 0].groupby('cate')['user_id'].nunique()
        cate_order_user_rate = (cate_order_user_count / cate_user_nunique)
        cate_sku_nunique = action.groupby('cate')['sku_id'].nunique()

        # shops under each cate
        cate_shop_count = action.groupby('cate')['shop_id'].count()
        cate_shop_nunique = action.groupby('cate')['shop_id'].nunique()
        # NOTE: identical to cate_order_count above (grouped by cate only);
        # kept for column compatibility
        cate_shop_order_count = action_type.groupby('cate')['act_2'].sum()
        cate_shop_order_rate = cate_shop_order_count / cate_shop_count

        # per cate: shops with >= 1 purchase / distinct shops
        cate_order_shop_count = action_type.groupby(['cate', 'shop_id'])['act_2'].sum().reset_index()
        cate_order_shop_count = cate_order_shop_count[cate_order_shop_count.act_2 > 0].groupby('cate')['shop_id'].nunique()
        cate_order_shop_rate = (cate_order_shop_count / cate_shop_nunique)

        cate_stat['cate_user_count_%s_%s' % (start_date, end_date)] = cate_user_count
        cate_stat['cate_user_nunique_%s_%s' % (start_date, end_date)] = cate_user_nunique
        cate_stat['cate_order_rate_%s_%s' % (start_date, end_date)] = cate_order_rate.fillna(0)
        cate_stat['cate_order_user_count_%s_%s' % (start_date, end_date)] = cate_order_user_count
        cate_stat['cate_order_user_rate_%s_%s' % (start_date, end_date)] = cate_order_user_rate
        cate_stat['cate_sku_nunique_%s_%s' % (start_date, end_date)] = cate_sku_nunique
        cate_stat['cate_shop_nunique_%s_%s' % (start_date, end_date)] = cate_shop_nunique

        cate_stat['cate_shop_order_rate_%s_%s' % (start_date, end_date)] = cate_shop_order_rate
        cate_stat['cate_order_shop_count_%s_%s' % (start_date, end_date)] = cate_order_shop_count
        cate_stat['cate_order_shop_rate_%s_%s' % (start_date, end_date)] = cate_order_shop_rate
        cate_stat = cate_stat.reset_index()  # cate back to a column for the merge

        action = pd.merge(action_index, user_stat, on='user_id', how='left')
        action = pd.merge(action, cate_stat, on='cate', how='left')
        #action.to_pickle(dump_path)
    print('stat_feat finished')
    return action
# 交叉特征 (cross features)
def get_cross_feat_v1(start_date, end_date):
    """Cross features over [start_date, end_date): for each (user_id, cate)
    pair, its action count as a share of the user's total actions
    (cross_feat_1) and of the cate's total actions (cross_feat_2).
    """
    dump_path = './cache/cross_feat_v1_F11_5_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_pickle(dump_path)
    else:
        actions = get_actions_product(start_date, end_date)[['user_id', 'cate']]
        actions['cnt'] = 0  # dummy column so count() yields a row count

        # rows per (user, cate)
        action1 = actions.groupby(['user_id', 'cate'], as_index=False).count()

        # rows per user
        action2 = actions.groupby('user_id', as_index=False).count()
        del action2['cate']
        action2.columns = ['user_id', 'user_cnt']

        # rows per cate
        action3 = actions.groupby('cate', as_index=False).count()
        del action3['user_id']
        action3.columns = ['cate', 'cate_cnt']
        actions = pd.merge(action1, action2, how='left', on='user_id')
        actions = pd.merge(actions, action3, how='left', on='cate')

        # turn raw counts into shares, in place
        actions['user_cnt'] = actions['cnt'] / actions['user_cnt']
        actions['cate_cnt'] = actions['cnt'] / actions['cate_cnt']
        del actions['cnt']
        #pickle.dump(actions, open(dump_path, 'wb'))
    # generic rename happens on both branches (a cached frame is pre-rename)
    actions.columns = ['user_id', 'cate'] + ['cross_feat_' + str(i) for i in range(1, actions.shape[1] - 1)]
    print('cross feature finished')
    return actions


# U_B: per action type, (user, cate) visit count / the user's total count
# (or the cate's total count).
# NOTE(review): the original comment mentions action types 1, 2, 4, 5 but the
# loop below iterates (1, 2, 3) — confirm which is intended.
def get_user_feat15_v1(start_date, end_date):
    """Per-action-type visit-share features per (user_id, cate), outer-merged
    across types 1-3 and renamed to user_feat15_1..n.
    """
    dump_path = './cache/user_feat15_v1_F11_5_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_pickle(dump_path)
        actions.columns = ['user_id', 'cate'] + ['user_feat15_' + str(i) for i in
                                                 range(1, actions.shape[1] - 1)]
        return actions
    else:
        temp = None
        df = get_actions_product(start_date, end_date)[['user_id', 'cate', 'type']]
        for i in (1, 2, 3):
            actions = df[df['type'] == i]
            # rows per (user, cate) for this action type
            action1 = actions.groupby(['user_id', 'cate'], as_index=False).count()
            action1.columns = ['user_id', 'cate', 'visit']

            # rows per user
            action2 = actions.groupby('user_id', as_index=False).count()
            del action2['type']
            action2.columns = ['user_id', 'user_visits_cate']

            # rows per cate
            action4 = actions.groupby('cate', as_index=False).count()
            del action4['type']
            action4.columns = ['cate', 'cate_visits_user']

            actions = pd.merge(action1, action2, how='left', on='user_id')
            actions = pd.merge(actions, action4, how='left', on='cate')

            actions['visit_rate_user1'] = actions['visit'] / actions['user_visits_cate']
            actions['visit_rate_cate1'] = actions['visit'] / actions['cate_visits_user']
            if temp is None:
                temp = actions
            else:
                # outer merge keeps pairs that only occur for some action types
                temp = pd.merge(temp, actions, how="outer", on=['user_id', 'cate'])
        #pickle.dump(temp, open(dump_path, 'wb'))
        temp.columns = ['user_id', 'cate'] + ['user_feat15_' + str(i) for i in
                                              range(1, temp.shape[1] - 1)]
        return temp
def get_last1day_cart_fearture(start_date, end_date, day):
    """Cart-vs-purchase features over the `day` days ending at `end_date`.

    For every (user_id, cate, shop_id) triple returns:
      * lastday_sum_act_5 / lastday_sum_act_2 — summed cart / purchase counts,
      * cart_not_buy   — cart count where no purchase happened in the window,
      * cart_minus_buy — cart count minus purchase count.

    Note: `start_date` is not used; the window is derived from `end_date`.
    """
    win_end = pd.to_datetime(end_date)
    win_start = win_end - timedelta(days=day)

    # back to 'YYYY-MM-DD' strings for the loader
    win_end = str(win_end.date())
    win_start = str(win_start.date())
    acts = get_actions_product_cart(win_start, win_end)
    print('from:', acts.action_time.min(), ' to:', acts.action_time.max())

    # one-hot the action type as compact int8 columns act_1..act_5
    onehot = pd.get_dummies(acts.type, prefix='act').astype('int8')
    acts = pd.concat([acts[['user_id', 'cate', 'shop_id', 'sku_id', 'action_time']], onehot], axis=1)

    stats = (acts.groupby(['user_id', 'cate', 'shop_id'])[['act_5', 'act_2']]
                 .sum()
                 .add_prefix('lastday_sum_')
                 .reset_index())

    no_purchase = stats['lastday_sum_act_2'] == 0
    stats['cart_not_buy'] = stats['lastday_sum_act_5'] * no_purchase
    stats['cart_minus_buy'] = stats['lastday_sum_act_5'] - stats['lastday_sum_act_2']

    return stats
# 标签 (labels)
def get_labels_v1(start_date, end_date):
    """Label = 1 for every (user_id, cate) pair with at least one purchase
    (type == 2) inside [start_date, end_date); pairs without a purchase are
    simply absent (callers fill them with 0 after a left merge).
    """
    dump_path = './cache/labels_v1_F11_5_%s_%s.pkl' % (start_date, end_date)
    if os.path.exists(dump_path):
        actions = pd.read_pickle(dump_path)
    else:
        actions = get_actions_product(start_date, end_date)
        actions = actions[actions['type'] == 2]
        actions = actions.groupby(['user_id', 'cate'], as_index=False).sum()
        actions['label'] = 1
        actions = actions[['user_id', 'cate', 'label']]
        #actions.to_pickle(dump_path)
    print('label finished')
    return actions


def make_train_set_F11_5(train_start_date, train_end_date, test_start_date, test_end_date, start):
    """Assemble the F11 (user, cate) training set.

    Candidates are all (user_id, cate) pairs active in
    [train_start_date, train_end_date); labels come from purchases in
    [test_start_date, test_end_date).  Feature families are computed on a
    fixed window starting 2018-02-01 plus sliding windows ending at
    train_end_date, then left-merged onto the negatively-sampled candidates.

    NOTE(review): `start` is never used in the body — confirm whether it can
    be dropped at the call sites.
    NOTE(review): labels use `get_labels` (presumably from the star-imported
    base module), not the local get_labels_v1 — confirm this is intentional.
    """
    dump_path = './cache/train_set_v1_F11_5_%s_%s_%s_%s.pkl' % (
        train_start_date, train_end_date, test_start_date, test_end_date)
    if os.path.exists(dump_path):
        actions = pd.read_pickle(dump_path)
    else:
        # candidate index: every (user, cate) pair active in the feature window
        f11_actions = get_actions_product(train_start_date, train_end_date)
        f11_actions = f11_actions.drop_duplicates(['user_id', 'cate'])
        f11_actions = f11_actions[['user_id', 'cate']]

        # labels from the following window
        labels = get_labels(test_start_date, test_end_date)

        # fixed-window features
        start_days = "2018-02-01"  #
        user = get_basic_user_feat()
        product_stat = get_product_stat_feat(start_days, train_end_date)
        time = get_time_feat(start_days, train_end_date)
        stat_feat = get_stat_feat_v1(start_days, train_end_date)
        user_feat = user_features(start_days, train_end_date)
        cross_feat = get_cross_feat_v1(start_days, train_end_date)

        # user-level feature families
        user_feat1 = get_user_feat1(start_days, train_end_date)
        user_feat2 = get_user_feat2(start_days, train_end_date)
        user_feat3 = get_user_feat3(start_days, train_end_date)
        user_feat5 = get_user_feat5(start_days, train_end_date)
        user_feat6 = get_user_feat6(start_days, train_end_date)
        user_feat7 = get_user_feat7(start_days, train_end_date)
        user_feat8 = get_user_feat8(start_days, train_end_date)
        user_feat9 = get_user_feat9(start_days, train_end_date)
        user_feat10 = get_user_feat10(start_days, train_end_date)
        user_feat11 = get_user_feat11(start_days, train_end_date)
        user_feat12 = get_user_feat12(start_days, train_end_date)
        user_feat13 = get_user_feat13(start_days, train_end_date)
        user_feat14 = get_user_feat14(start_days, train_end_date)
        user_feat15 = get_user_feat15_v1(start_days, train_end_date)  #

        # cate-level feature families
        cate_feat1 = get_cate_feat_1(start_days, train_end_date)
        cate_feat2 = get_cate_feat_2(start_days, train_end_date)
        cate_feat3 = get_cate_feat_3(start_days, train_end_date)
        cate_feat4 = get_cate_feat_4(start_days, train_end_date)
        cate_feat5 = get_cate_feat_5(start_days, train_end_date)
        cate_feat6 = get_cate_feat_6(start_days, train_end_date)
        cate_feat7 = get_cate_feat_7(start_days, train_end_date)
        cate_feat8 = get_cate_feat_8(start_days, train_end_date)
        cate_feat9 = get_cate_feat_9(start_days, train_end_date)
        cate_feat10 = get_cate_feat_10(start_days, train_end_date)
        cate_feat11 = get_cate_feat_11(start_days, train_end_date)

        # (user, cate)-level feature families
        F11_feat1 = get_F11_feat_1(start_days, train_end_date)
        F11_feat3 = get_F11_feat_3(start_days, train_end_date)
        F11_feat4 = get_F11_feat_4(start_days, train_end_date)
        F11_feat5 = get_F11_feat_5(start_days, train_end_date)
        F11_feat6 = get_F11_feat_6(start_days, train_end_date)
        F11_feat7 = get_F11_feat_7(start_days, train_end_date)
        F11_feat8 = get_F11_feat_8(start_days, train_end_date)
        F11_feat9 = get_F11_feat_9(start_days, train_end_date)
        F11_feat10 = get_F11_feat_10(start_days, train_end_date)
        F11_feat11 = get_F11_feat_11(start_days, train_end_date)

        # sliding-window behaviour features (5/7/14/21/30 days before train_end_date)
        actions = None
        for i in (5, 7, 14, 21, 30):
            start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)
            start_days = start_days.strftime('%Y-%m-%d')
            if actions is None:
                actions = get_accumulate_user_feat_v1(start_days, train_end_date)
            else:
                actions1 = get_accumulate_user_feat_v1(start_days, train_end_date)
                actions = pd.merge(actions, actions1, how='left', on=['user_id', 'cate'])

        # last-3-days window including cart actions
        start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=3)
        start_days = start_days.strftime('%Y-%m-%d')
        actions_cart = get_accumulate_user_cart_feat_v1(start_days, train_end_date)

        # act_5 (cart) features, collapsed from shop level to (user, cate)
        act5_feat = get_last1day_cart_fearture(start_days, train_end_date, 3)
        act5_feat = act5_feat.groupby(['user_id', 'cate'], as_index=False).sum()
        del act5_feat['shop_id']

        # negative sampling: keep all positives plus ~30x as many negatives
        f11_actions = pd.merge(f11_actions, labels, how='left', on=['user_id', 'cate'])
        f11_actions = f11_actions.fillna(0)
        print('train data size:', f11_actions.shape[0])
        f11_actions_1 = f11_actions[f11_actions['label'] == 1]
        f11_actions_0 = f11_actions[f11_actions['label'] == 0]
        frac1 = (f11_actions_1.shape[0] * 30) / f11_actions_0.shape[0]  # negatives = 30x positives
        f11_actions_0 = f11_actions_0.sample(frac=frac1).reset_index(drop=True)
        f11_actions = pd.concat([f11_actions_1, f11_actions_0], axis=0, ignore_index=True)
        f11_actions = f11_actions.sample(frac=1).reset_index(drop=True)  # shuffle
        print('train data size after sample:', f11_actions.shape[0])

        # merge every feature family onto the sampled candidates
        actions = pd.merge(f11_actions, actions, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, actions_cart, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, user, how='left', on='user_id')
        actions = pd.merge(actions, time, how='left', on='user_id')
        actions = pd.merge(actions, stat_feat, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, product_stat, how='left', on='cate')

        actions = pd.merge(actions, user_feat1, how='left', on='user_id')
        actions = pd.merge(actions, user_feat2, how='left', on='user_id')
        actions = pd.merge(actions, user_feat3, how='left', on='user_id')
        actions = pd.merge(actions, user_feat5, how='left', on='user_id')
        actions = pd.merge(actions, user_feat6, how='left', on='user_id')
        actions = pd.merge(actions, user_feat7, how='left', on='user_id')
        actions = pd.merge(actions, user_feat8, how='left', on='user_id')
        actions = pd.merge(actions, user_feat9, how='left', on='user_id')
        actions = pd.merge(actions, user_feat10, how='left', on='user_id')
        actions = pd.merge(actions, user_feat11, how='left', on='user_id')
        actions = pd.merge(actions, user_feat12, how='left', on='user_id')
        actions = pd.merge(actions, user_feat13, how='left', on='user_id')
        actions = pd.merge(actions, user_feat14, how='left', on='user_id')
        actions = pd.merge(actions, user_feat, how='left', on='user_id')
        actions = pd.merge(actions, user_feat15, how='left', on=['user_id', 'cate'])

        """
        cate
        """
        actions = pd.merge(actions, cate_feat1, how='left', on='cate')
        actions = pd.merge(actions, cate_feat2, how='left', on='cate')
        actions = pd.merge(actions, cate_feat3, how='left', on='cate')
        actions = pd.merge(actions, cate_feat4, how='left', on='cate')
        actions = pd.merge(actions, cate_feat5, how='left', on='cate')
        actions = pd.merge(actions, cate_feat6, how='left', on='cate')
        actions = pd.merge(actions, cate_feat7, how='left', on='cate')
        actions = pd.merge(actions, cate_feat8, how='left', on='cate')
        actions = pd.merge(actions, cate_feat9, how='left', on='cate')
        actions = pd.merge(actions, cate_feat10, how='left', on='cate')
        actions = pd.merge(actions, cate_feat11, how='left', on='cate')
        print('cate finished')
        """
        F11
        """
        actions = pd.merge(actions, F11_feat1, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat3, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat4, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat5, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat6, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat7, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat8, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat9, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat10, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat11, how='left', on=['user_id', 'cate'])
        print('F11 finished')

        actions = pd.merge(actions, act5_feat, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, cross_feat, how='left', on=['user_id', 'cate'])
        actions = actions.fillna(0)  # fills NaN only; inf from ratio features passes through
        # actions.to_pickle(dump_path)
    print('train_set finised')
    return actions
def make_test_set_F11_5(train_start_date, train_end_date, start):
    """Assemble the F11 (user, cate) prediction set.

    Mirrors make_train_set_F11_5 but without labels or negative sampling:
    candidates are all (user_id, cate) pairs active in
    [train_start_date, train_end_date), with the same feature families
    left-merged on.

    NOTE(review): `start` is never used in the body — confirm whether it can
    be dropped at the call sites.
    """
    dump_path = './cache/test_set_F11_5_%s_%s.pkl' % (train_start_date, train_end_date)
    if os.path.exists(dump_path):
        actions = pd.read_pickle(dump_path)
    else:
        # candidate index: every (user, cate) pair active in the feature window
        f11_actions = get_actions_product(train_start_date, train_end_date)
        f11_actions = f11_actions.drop_duplicates(['user_id', 'cate'])
        f11_actions = f11_actions[['user_id', 'cate']]  #

        # fixed-window features
        start_days = "2018-02-01"  #
        user = get_basic_user_feat()
        product_stat = get_product_stat_feat(start_days, train_end_date)
        time = get_time_feat(start_days, train_end_date)
        stat_feat = get_stat_feat_v1(start_days, train_end_date)
        user_feat = user_features(start_days, train_end_date)
        cross_feat = get_cross_feat_v1(start_days, train_end_date)

        # user-level feature families
        user_feat1 = get_user_feat1(start_days, train_end_date)
        user_feat2 = get_user_feat2(start_days, train_end_date)
        user_feat3 = get_user_feat3(start_days, train_end_date)
        user_feat5 = get_user_feat5(start_days, train_end_date)
        user_feat6 = get_user_feat6(start_days, train_end_date)
        user_feat7 = get_user_feat7(start_days, train_end_date)
        user_feat8 = get_user_feat8(start_days, train_end_date)
        user_feat9 = get_user_feat9(start_days, train_end_date)
        user_feat10 = get_user_feat10(start_days, train_end_date)
        user_feat11 = get_user_feat11(start_days, train_end_date)
        user_feat12 = get_user_feat12(start_days, train_end_date)
        user_feat13 = get_user_feat13(start_days, train_end_date)
        user_feat14 = get_user_feat14(start_days, train_end_date)
        user_feat15 = get_user_feat15_v1(start_days, train_end_date)  #

        # cate-level feature families
        cate_feat1 = get_cate_feat_1(start_days, train_end_date)
        cate_feat2 = get_cate_feat_2(start_days, train_end_date)
        cate_feat3 = get_cate_feat_3(start_days, train_end_date)
        cate_feat4 = get_cate_feat_4(start_days, train_end_date)
        cate_feat5 = get_cate_feat_5(start_days, train_end_date)
        cate_feat6 = get_cate_feat_6(start_days, train_end_date)
        cate_feat7 = get_cate_feat_7(start_days, train_end_date)
        cate_feat8 = get_cate_feat_8(start_days, train_end_date)
        cate_feat9 = get_cate_feat_9(start_days, train_end_date)
        cate_feat10 = get_cate_feat_10(start_days, train_end_date)
        cate_feat11 = get_cate_feat_11(start_days, train_end_date)

        # (user, cate)-level feature families
        F11_feat1 = get_F11_feat_1(start_days, train_end_date)
        F11_feat3 = get_F11_feat_3(start_days, train_end_date)
        F11_feat4 = get_F11_feat_4(start_days, train_end_date)
        F11_feat5 = get_F11_feat_5(start_days, train_end_date)
        F11_feat6 = get_F11_feat_6(start_days, train_end_date)
        F11_feat7 = get_F11_feat_7(start_days, train_end_date)
        F11_feat8 = get_F11_feat_8(start_days, train_end_date)
        F11_feat9 = get_F11_feat_9(start_days, train_end_date)
        F11_feat10 = get_F11_feat_10(start_days, train_end_date)
        F11_feat11 = get_F11_feat_11(start_days, train_end_date)

        # sliding-window behaviour features (5/7/14/21/30 days before train_end_date)
        actions = None
        for i in (5, 7, 14, 21, 30):
            start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=i)
            start_days = start_days.strftime('%Y-%m-%d')
            if actions is None:
                actions = get_accumulate_user_feat_v1(start_days, train_end_date)
            else:
                actions1 = get_accumulate_user_feat_v1(start_days, train_end_date)
                actions = pd.merge(actions, actions1, how='left', on=['user_id', 'cate'])

        # last-3-days window including cart actions
        start_days = datetime.strptime(train_end_date, '%Y-%m-%d') - timedelta(days=3)
        start_days = start_days.strftime('%Y-%m-%d')
        actions_cart = get_accumulate_user_cart_feat_v1(start_days, train_end_date)

        # act_5 (cart) features, collapsed from shop level to (user, cate)
        act5_feat = get_last1day_cart_fearture(start_days, train_end_date, 3)
        act5_feat = act5_feat.groupby(['user_id', 'cate'], as_index=False).sum()
        del act5_feat['shop_id']

        # merge every feature family onto the candidates
        actions = pd.merge(f11_actions, actions, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, actions_cart, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, user, how='left', on='user_id')
        actions = pd.merge(actions, time, how='left', on='user_id')
        actions = pd.merge(actions, stat_feat, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, product_stat, how='left', on='cate')

        actions = pd.merge(actions, user_feat1, how='left', on='user_id')
        actions = pd.merge(actions, user_feat2, how='left', on='user_id')
        actions = pd.merge(actions, user_feat3, how='left', on='user_id')
        actions = pd.merge(actions, user_feat5, how='left', on='user_id')
        actions = pd.merge(actions, user_feat6, how='left', on='user_id')
        actions = pd.merge(actions, user_feat7, how='left', on='user_id')
        actions = pd.merge(actions, user_feat8, how='left', on='user_id')
        actions = pd.merge(actions, user_feat9, how='left', on='user_id')
        actions = pd.merge(actions, user_feat10, how='left', on='user_id')
        actions = pd.merge(actions, user_feat11, how='left', on='user_id')
        actions = pd.merge(actions, user_feat12, how='left', on='user_id')
        actions = pd.merge(actions, user_feat13, how='left', on='user_id')
        actions = pd.merge(actions, user_feat14, how='left', on='user_id')
        actions = pd.merge(actions, user_feat, how='left', on='user_id')
        actions = pd.merge(actions, user_feat15, how='left', on=['user_id', 'cate'])

        """
        cate
        """
        actions = pd.merge(actions, cate_feat1, how='left', on='cate')
        actions = pd.merge(actions, cate_feat2, how='left', on='cate')
        actions = pd.merge(actions, cate_feat3, how='left', on='cate')
        actions = pd.merge(actions, cate_feat4, how='left', on='cate')
        actions = pd.merge(actions, cate_feat5, how='left', on='cate')
        actions = pd.merge(actions, cate_feat6, how='left', on='cate')
        actions = pd.merge(actions, cate_feat7, how='left', on='cate')
        actions = pd.merge(actions, cate_feat8, how='left', on='cate')
        actions = pd.merge(actions, cate_feat9, how='left', on='cate')
        actions = pd.merge(actions, cate_feat10, how='left', on='cate')
        actions = pd.merge(actions, cate_feat11, how='left', on='cate')
        print('cate finished')
        """
        F11
        """
        actions = pd.merge(actions, F11_feat1, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat3, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat4, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat5, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat6, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat7, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat8, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat9, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat10, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, F11_feat11, how='left', on=['user_id', 'cate'])
        print('F11 finished')

        actions = pd.merge(actions, act5_feat, how='left', on=['user_id', 'cate'])
        actions = pd.merge(actions, cross_feat, how='left', on=['user_id', 'cate'])
        actions = actions.fillna(0)  # fills NaN only; inf from ratio features passes through
        del stat_feat, f11_actions  # free the largest intermediates
    print('test_set finished')
    return actions
def lgb_train_F11_5(X_train1, y_train1, X_test1, sub_user_index):
    """Train a 5-fold LightGBM binary classifier and write ./res/sub_F11_5.csv.

    Parameters
    ----------
    X_train1, y_train1, X_test1 : DataFrame/Series or ndarray
        Train features/labels and test features.  ``np.asarray`` is used so
        both pandas objects and raw ndarrays are accepted — the companion
        driver scripts pass ``training_data[feats].values`` (ndarrays), which
        the previous ``.values`` access would have crashed on.
    sub_user_index : DataFrame
        'user_id' and 'cate' columns row-aligned with X_test1.

    Side effects: prints shapes / per-fold logloss and writes the 50000
    highest-scoring rows (shop_id fixed to 0) to ./res/sub_F11_5.csv.
    """
    # submission frame; shop_id is a constant placeholder for this track
    sub = sub_user_index[['user_id', 'cate']].copy()
    sub['shop_id'] = 0
    sub['label'] = 0

    # accept pandas objects or ndarrays alike (fix: `.values` on an ndarray
    # raises AttributeError)
    X_train = np.asarray(X_train1)
    y_train = np.asarray(y_train1)
    X_test = np.asarray(X_test1)

    del X_train1, y_train1, X_test1

    print('================================')
    print(X_train.shape)
    print(X_test.shape)
    print('================================')

    xx_logloss = []
    # out-of-fold predictions; computed for local inspection, not returned
    oof_preds = np.zeros(X_train.shape[0])
    N = 5
    skf = StratifiedKFold(n_splits=N, random_state=1024, shuffle=True)

    params = {
        'learning_rate': 0.01,
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'binary_logloss',
        'num_leaves': 31,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': 1,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': -1,  # -1
        'verbose': -1,
    }
    for k, (train_index, test_index) in enumerate(skf.split(X_train, y_train)):
        print('train _K_ flod', k)

        lgb_train = lgb.Dataset(X_train[train_index], y_train[train_index])
        lgb_evals = lgb.Dataset(X_train[test_index], y_train[test_index], reference=lgb_train)

        # NOTE(review): early_stopping_rounds / verbose_eval moved to callbacks
        # in lightgbm >= 4 — confirm the pinned version before upgrading.
        lgbm = lgb.train(params, lgb_train, num_boost_round=50000, valid_sets=[lgb_train, lgb_evals],
                         valid_names=['train', 'valid'], early_stopping_rounds=100, verbose_eval=200)

        # average the test predictions over folds; record OOF preds and logloss
        sub['label'] += lgbm.predict(X_test, num_iteration=lgbm.best_iteration) / N
        oof_preds[test_index] = lgbm.predict(X_train[test_index], num_iteration=lgbm.best_iteration)
        xx_logloss.append(lgbm.best_score['valid']['binary_logloss'])
    print(xx_logloss)
    a = np.mean(xx_logloss)
    a = round(a, 5)
    print(a)

    # keep the 50k highest-scoring (user, cate) pairs for submission
    sub = sub.sort_values(by='label', ascending=False)
    sub = sub.head(50000)
    sub = sub[['user_id', 'cate', 'shop_id', 'label']]

    sub.to_csv('./res/sub_F11_5.csv', index=False, index_label=False)