├── JData_第一炉香_12_代码运行说明 解题思路.pdf
├── start.sh
├── merge_result.py
├── README.md
├── preprocessing.py
├── Umodel_1.py
└── Umodel_2.py
/JData_第一炉香_12_代码运行说明 解题思路.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hecongqing/2017-jdata-competition/HEAD/JData_第一炉香_12_代码运行说明 解题思路.pdf
--------------------------------------------------------------------------------
/start.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python preprocessing.py
3 | python USModel.py
4 | python Umodel_0.py
5 | python Umodel_1.py
6 | python Umodel_2.py
7 | python merge_result.py
8 |
9 |
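10 | # Each model script is expected to write its ranked predictions to ./sub/<script_name>.csv;
11 | # merge_result.py then blends them into ./sub/best_result.csv (see merge_result.py below).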
--------------------------------------------------------------------------------
/merge_result.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*-coding:utf-8-*-
3 | import pandas as pd
4 | # Blend the three U-model scores with fixed weights and take the top 700 users
5 | def u_id():
6 | df1=pd.read_csv('./sub/Umodel_0.csv')
7 | df1.columns=['user_id','label1']
8 |
9 | df2=pd.read_csv('./sub/Umodel_1.csv')
10 | df2.columns=['user_id','label2']
11 |
12 | df3=pd.read_csv('./sub/Umodel_2.csv')
13 | df3.columns=['user_id','label3']
14 |
15 | df=pd.merge(df1,df2,on='user_id',how='outer')
16 | df=pd.merge(df,df3,on='user_id',how='outer')
17 | df['label']=0.3*df['label1']+0.3*df['label2']+0.4*df['label3']
18 | df.sort_values(by=['label'],ascending=[0],inplace=True)
19 | df=df[['user_id','label']].reset_index(drop=True)
20 | df=df[['user_id']]
21 | return df[:700]
22 | # Take the top 325 users from the USModel result
23 | def us_id():
24 | df=pd.read_csv('./sub/USModel.csv')
25 | df=df[['user_id']]
26 | return df[:325]
27 | # Union of the U-model top 700 and the USModel top 325; after deduplication 802 users remain
28 | def merge_u_us():
29 | u = u_id()
30 | us = us_id()
31 | df=pd.merge(u,us,on='user_id',how='outer')
32 | df=df.drop_duplicates('user_id')
33 | return df
34 |
35 | # Join the 802 users with the USModel ['user_id','sku_id'] pairs to build the final result
36 | def result():
37 | u = merge_u_us()
38 | us=pd.read_csv( './sub/USModel.csv')
39 | us=us[['user_id','sku_id']]
40 | us=us.astype('int')
41 | result=pd.merge(u,us,how='left',on='user_id')
42 | print('===========>>> printing final result:')
43 | result=result.fillna(0)
44 | result=result.astype('int')
45 |
46 | result.to_csv('./sub/best_result.csv',index=False)
47 | return result
48 |
49 | print (result())
50 |
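51 | # Inputs : ./sub/Umodel_0.csv, ./sub/Umodel_1.csv, ./sub/Umodel_2.csv (user_id, score)
52 | # and ./sub/USModel.csv (user_id, sku_id, ...), as read above.
53 | # Output : ./sub/best_result.csv with integer columns (user_id, sku_id); users absent
54 | # from USModel.csv get sku_id 0 via fillna(0).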
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 2017 JData Competition
2 | High-potential user purchase-intent prediction - rank 12
3 |
4 | # Competition URL
5 |
6 | https://www.datafountain.cn/competitions/247/details/data-evaluation
7 |
8 | # Task
9 | The competition is based on real (anonymized) user, product, and action data from the JD.com mall. Using data-mining techniques and machine-learning algorithms, teams build a model of users' purchase behavior,
10 | outputting matches between high-potential users and target products to supply high-quality target groups for precision marketing. The organizers also hope teams will uncover the latent meaning behind the data and give e-commerce users
11 | a simpler, faster, and more worry-free shopping experience.
12 | Participants use historical sales data for products across several JD categories to build a model that predicts users' purchase intent, over the next 5 days, for products in a target category. For every
13 | user appearing in the training set, the model must predict whether that user buys a product in the target category within the next 5 days and, if so, the SKU_ID of the purchased product. The evaluation computes a weighted score over the submitted predictions.
14 |
15 | # Scoring formula
16 | The submitted result file contains purchase-intent predictions for all users. Each user's prediction has two parts:
17 | 
18 | 1. Whether the user orders any product in P between 2016-04-16 and 2016-04-20. The submission contains only the users predicted to order; users predicted not to order must not appear. If the prediction is correct, the evaluation sets label=1, otherwise label=0;
19 | 
20 | 2. If an order is predicted, the ordered sku_id (only one sku_id is submitted). If the sku_id is correct, the evaluation sets pred=1, otherwise pred=0.
21 |
22 | For each submitted result file, the score is computed as:
23 | 
24 |     Score = 0.4 * F11 + 0.6 * F12
25 | 
26 | The two F1 values are defined as:
27 | 
28 |     F11 = 6 * Recall * Precise / (5 * Recall + Precise)
29 | 
30 |     F12 = 5 * Recall * Precise / (2 * Recall + 3 * Precise)
31 | 
32 | where Precise is the precision and Recall is the recall: F11 is the F1 over label=1/0 (the user-level predictions), and F12 is the F1 over pred=1/0 (the sku_id predictions).
33 |
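34 | 
35 | A minimal sketch of the score computation under the formulas above (toy precision/recall values; not the official evaluation code):
36 | 
37 | ```python
38 | def score(p_user, r_user, p_sku, r_sku):
39 |     # F11: F1 over the user-level predictions (label)
40 |     f11 = 6 * r_user * p_user / (5 * r_user + p_user)
41 |     # F12: F1 over the SKU-level predictions (pred)
42 |     f12 = 5 * r_sku * p_sku / (2 * r_sku + 3 * p_sku)
43 |     return 0.4 * f11 + 0.6 * f12
44 | 
45 | print(score(0.10, 0.40, 0.05, 0.20))
46 | ```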
--------------------------------------------------------------------------------
/preprocessing.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import os
3 | import pandas as pd
4 | import numpy as np
5 | from datetime import datetime, timedelta
6 | from sklearn import preprocessing
7 | 
8 | path = './'
9 | 
10 | 
11 | def concat_action():
12 | action1 = pd.read_csv(path+'/data/JData_Action_201602.csv')
13 | action2 = pd.read_csv(path+'/data/JData_Action_201603.csv')
14 | action3 = pd.read_csv(path+'/data/JData_Action_201604.csv')
15 | action = pd.concat([action1,action2,action3]).sort_values(by='time')
16 | action.to_csv('./data/JData_Action.csv', index=False)
17 |
18 | def map_user_reg(x):
29 | if d>=0 and d<=3:
30 | d = 1
31 | elif d>3 and d<=6:
32 | d = 2
33 | elif d>6 and d<=12:
34 | d = 3
35 | elif d>12 and d<=24:
36 | d = 4
37 | elif d>24 and d<=48:
38 | d = 5
39 | else:
40 | d = 6
41 | return d
42 |
43 | def user_process():
44 | user = pd.read_csv(path + '/data/JData_User.csv', encoding='gbk', parse_dates=[4])
45 | user = user.drop_duplicates('user_id')
46 | #user = user[user['user_reg_tm']
112 | action_1 = action_1[(action_1.time >= start_date) & (action_1.time < end_date)]
113 | action_2 = get_actions_2()
114 | action_2 = action_2[(action_2.time >= start_date) & (action_2.time < end_date)]
115 | actions = pd.concat([action_1, action_2])
116 | action_3 = get_actions_3()
117 | action_3 = action_3[(action_3.time >= start_date) & (action_3.time < end_date)]
118 | actions = pd.concat([actions, action_3]) # type: pd.DataFrame
119 | actions = actions[(actions.time >= start_date) & (actions.time < end_date)]
120 | actions.to_csv(dump_path, index=False)
121 | # actions['user_id']=actions['user_id'].astype('int')
122 | return actions
123 |
124 | # Number of days between two timestamps
125 | def get_day_chaju(x, end_date):
126 | # x=x.split(' ')[0]
127 | x = datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
128 | end_date = datetime.strptime(end_date, '%Y-%m-%d')
129 | return (end_date - x).days
130 |
131 |
132 | def get_action_feat(start_date, end_date,k):
133 | dump_path = './cache/u_action_%s_%s_%s.csv' % (start_date, end_date,k)
134 | if os.path.exists(dump_path):
135 | actions = pd.read_csv(dump_path)
136 | else:
137 | start_days=pd.to_datetime(end_date)-timedelta(days=k)
138 | start_days=str(start_days).split(' ')[0]
139 | actions = get_actions(start_days, end_date)
140 | actions = actions[['user_id', 'type']]
141 | df = pd.get_dummies(actions['type'], prefix='type')
142 | actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame
143 | actions = actions.groupby('user_id', as_index=False).sum()
144 | min_max_scaler = preprocessing.MinMaxScaler()
145 | df = min_max_scaler.fit_transform(actions.drop(['user_id','type'],axis=1).values)
146 | df = pd.DataFrame(df)
147 | df.columns=['u_action_'+str(k)+'_'+str(i) for i in range(1,df.shape[1]+1)]
148 | actions = pd.concat([actions[['user_id']], df], axis=1)
149 | actions.to_csv(dump_path, index=False)
150 | return actions
151 |
152 |
153 |
154 |
155 |
156 |
157 | # User action-to-purchase conversion ratios
158 | def get_action_user_feat1(start_date, end_date):
159 | feature = ['user_id', 'user_action_1_ratio', 'user_action_2_ratio', 'user_action_3_ratio',
160 | 'user_action_5_ratio', 'user_action_6_ratio']
161 | dump_path = './cache/user_feat_accumulate_xiugai_%s_%s.csv' % (start_date, end_date)
162 | if os.path.exists(dump_path):
163 | actions = pd.read_csv(dump_path)
164 | else:
165 | actions = get_actions(start_date, end_date)
166 | df = pd.get_dummies(actions['type'], prefix='action')
167 | actions = pd.concat([actions['user_id'], df], axis=1)
168 | actions = actions.groupby(['user_id'], as_index=False).sum()
169 | actions['user_action_1_ratio'] = actions['action_4'] / actions['action_1']
170 | actions['user_action_2_ratio'] = actions['action_4'] / actions['action_2']
171 | # actions['user_action_3_ratio'] = actions['action_4'] / actions['action_3']
172 | actions['user_action_3_ratio'] = actions['action_3'] / actions['action_2']
173 | actions['user_action_5_ratio'] = actions['action_4'] / actions['action_5']
174 | actions['user_action_6_ratio'] = actions['action_4'] / actions['action_6']
175 | # type 3 = remove from cart
176 | actions = actions[feature]
177 | actions.to_csv(dump_path, index=False)
178 | return actions
179 |
180 |
181 | # print get_accumulate_user_feat('2016-03-10','2016-04-11')
182 | # Days with visits before purchase
183 | # Days with visits before purchase / cart-add / follow
184 | def get_action_user_feat2(start_date, end_date):
185 | dump_path = './cache/user_feat2_after_%s_%s.csv' % (start_date, end_date)
186 | if os.path.exists(dump_path):
187 | actions = pd.read_csv(dump_path)
188 |
189 | else:
190 | # days with visits before buying
191 | def user_feat_2_1(start_date, end_date):
192 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']]
193 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0])
194 | # actions=actions.drop_duplicates(['user_id','time'],keep='first')
195 | visit = actions[actions['type'] == 1]
196 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first')
197 | del visit['time']
198 | del actions['time']
199 | visit = visit.groupby('user_id', as_index=False).count()
200 | visit.columns = ['user_id', 'visit']
201 | buy = actions[actions['type'] == 4]
202 | buy = buy.groupby('user_id', as_index=False).count()
203 | buy.columns = ['user_id', 'buy']
204 | actions = pd.merge(visit, buy, on='user_id', how='left')
205 | actions['visit_day_before_buy'] = actions['visit'] / actions['buy']
206 | del actions['buy']
207 | del actions['visit']
208 | return actions
209 |
210 | # days with visits before adding to cart
211 | def user_feat_2_2(start_date, end_date):
212 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']]
213 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0])
214 | # actions=actions.drop_duplicates(['user_id','time'],keep='first')
215 | visit = actions[actions['type'] == 1]
216 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first')
217 | del visit['time']
218 | del actions['time']
219 | visit = visit.groupby('user_id', as_index=False).count()
220 | visit.columns = ['user_id', 'visit']
221 | addtoshopping = actions[actions['type'] == 2]
222 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count()
223 | addtoshopping.columns = ['user_id', 'addtoshopping']
224 | actions = pd.merge(visit, addtoshopping, on='user_id', how='left')
225 | actions['visit_day_before_addtoshopping'] = actions['visit'] / actions['addtoshopping']
226 | del actions['addtoshopping']
227 | del actions['visit']
228 | return actions
229 |
230 | # days with visits before following
231 | def user_feat_2_3(start_date, end_date):
232 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']]
233 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0])
234 | # actions=actions.drop_duplicates(['user_id','time'],keep='first')
235 | visit = actions[actions['type'] == 1]
236 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first')
237 | del visit['time']
238 | del actions['time']
239 | visit = visit.groupby('user_id', as_index=False).count()
240 | visit.columns = ['user_id', 'visit']
241 | guanzhu = actions[actions['type'] == 5]
242 | guanzhu = guanzhu.groupby('user_id', as_index=False).count()
243 | guanzhu.columns = ['user_id', 'guanzhu']
244 | actions = pd.merge(visit, guanzhu, on='user_id', how='left')
245 | actions['visit_day_before_guanzhu'] = actions['visit'] / actions['guanzhu']
246 | del actions['guanzhu']
247 | del actions['visit']
248 | return actions
249 |
250 | # days with cart-adds before buying
251 | def user_feat_2_4(start_date, end_date):
252 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']]
253 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0])
254 | # actions=actions.drop_duplicates(['user_id','time'],keep='first')
255 | addtoshopping = actions[actions['type'] == 2]
256 | addtoshopping = addtoshopping.drop_duplicates(['user_id', 'time'], keep='first')
257 | del addtoshopping['time']
258 | del actions['time']
259 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count()
260 | addtoshopping.columns = ['user_id', 'addtoshopping']
261 | buy = actions[actions['type'] == 4]
262 | buy = buy.groupby('user_id', as_index=False).count()
263 | buy.columns = ['user_id', 'buy']
264 | actions = pd.merge(addtoshopping, buy, on='user_id', how='left')
265 | actions['addtoshopping_day_before_buy'] = actions['addtoshopping'] / actions['buy']
266 | del actions['buy']
267 | del actions['addtoshopping']
268 | return actions
269 |
270 | # days with follows before buying
271 | def user_feat_2_5(start_date, end_date):
272 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']]
273 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0])
274 | guanzhu = actions[actions['type'] == 5]
275 | guanzhu = guanzhu.drop_duplicates(['user_id', 'time'], keep='first')
276 | del guanzhu['time']
277 | del actions['time']
278 | guanzhu = guanzhu.groupby('user_id', as_index=False).count()
279 | guanzhu.columns = ['user_id', 'guanzhu']
280 | buy = actions[actions['type'] == 4]
281 | buy = buy.groupby('user_id', as_index=False).count()
282 | buy.columns = ['user_id', 'buy']
283 | actions = pd.merge(guanzhu, buy, on='user_id', how='left')
284 | actions['guanzhu_day_before_buy'] = actions['guanzhu'] / actions['buy']
285 | del actions['buy']
286 | del actions['guanzhu']
287 | return actions
288 |
289 | actions = pd.merge(user_feat_2_1(start_date, end_date), user_feat_2_2(start_date, end_date), on='user_id',
290 | how='outer')
291 | actions = pd.merge(actions, user_feat_2_3(start_date, end_date), on='user_id', how='outer')
292 | actions = pd.merge(actions, user_feat_2_4(start_date, end_date), on='user_id', how='outer')
293 | actions = pd.merge(actions, user_feat_2_5(start_date, end_date), on='user_id', how='outer')
294 | user_id = actions['user_id']
295 | del actions['user_id']
296 | actions = actions.fillna(0)
297 | min_max_scale = preprocessing.MinMaxScaler()
298 | actions = min_max_scale.fit_transform(actions.values)
299 | actions = pd.concat([user_id, pd.DataFrame(actions)], axis=1)
300 | actions.to_csv(dump_path, index=False)
301 | actions.columns = ['user_id'] + ['u_feat2_' + str(i) for i in range(1, actions.shape[1])]
302 | return actions
303 |
304 |
305 |
306 |
307 | # # Total number of distinct SKUs per user
308 | # def get_action_user_feat5(start_date, end_date):
309 | # dump_path = './cache/user_feat5_%s_%s.csv' % (start_date, end_date)
310 | # if os.path.exists(dump_path):
311 | # actions = pd.read_csv(dump_path)
312 | # else:
313 | # actions = get_actions(start_date, end_date)[['user_id', 'sku_id']]
314 | # actions = actions.drop_duplicates(['user_id', 'sku_id'], keep='first')
315 | # actions = actions.groupby('user_id', as_index=False).count()
316 | # actions.columns = ['user_id', 'sku_num']
317 | # actions['sku_num'] = actions['sku_num'].astype('float')
318 | # actions['sku_num'] = actions['sku_num'].map(
319 | # lambda x: (x - actions['sku_num'].min()) / (actions['sku_num'].max() - actions['sku_num'].min()))
320 | # actions.to_csv(dump_path, index=False)
321 | # actions.columns = ['user_id'] + ['u_feat5_' + str(i) for i in range(1, actions.shape[1])]
322 | # return actions
323 |
324 |
325 | # Average gap in days between user visits
326 | def get_action_user_feat6(start_date, end_date):
327 | dump_path = './cache/user_feat6_%s_%s.csv' % (start_date, end_date)
328 | if os.path.exists(dump_path):
329 | actions = pd.read_csv(dump_path)
330 | else:
331 |
332 | df = get_actions(start_date, end_date)[['user_id', 'time']]
333 | # df['user_id']=df['user_id'].astype('int')
334 | df['time'] = df['time'].map(lambda x: x.split(' ')[0])
335 | df = df.drop_duplicates(['user_id', 'time'], keep='first')
336 | df['time'] = df['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d'))
337 | actions = df.groupby('user_id', as_index=False).agg(lambda x: x.diff().mean())
338 | actions['avg_visit'] = actions['time'].dt.days
339 | del actions['time']
340 | actions.to_csv(dump_path, index=False)
341 | actions.columns = ['user_id'] + ['u_feat6_' + str(i) for i in range(1, actions.shape[1])]
342 | return actions
343 |
344 |
345 | # Average gap between occurrences of each of the six action types
346 | def get_action_user_feat6_six(start_date, end_date):
347 | dump_path = './cache/user_feat6_six_%s_%s.csv' % (start_date, end_date)
348 | if os.path.exists(dump_path):
349 | actions = pd.read_csv(dump_path)
350 | else:
351 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']]
352 | df['time'] = df['time'].map(lambda x: (-1) * get_day_chaju(x, start_date))
353 | df = df.drop_duplicates(['user_id', 'time', 'type'], keep='first')
354 | actions = df.groupby(['user_id', 'type']).agg(lambda x: np.diff(x).mean())
355 | actions = actions.unstack()
356 | actions.columns = list(range(actions.shape[1]))
357 | actions = actions.reset_index()
358 | actions.to_csv(dump_path, index=False)
359 | actions.columns = ['user_id'] + ['u_feat6_six_' + str(i) for i in range(1, actions.shape[1])]
360 | return actions
361 |
362 |
363 | # Action frequency by type (count per hour of activity)
364 | def get_action_user_feat7(start_date, end_date):
365 | dump_path = './cache/user_feat7_six_%s_%s.csv' % (start_date, end_date)
366 | if os.path.exists(dump_path):
367 | actions = pd.read_csv(dump_path)
368 | else:
369 | df = get_actions(start_date, end_date)[['user_id', 'type', 'time']]
370 | actions = df.groupby(['user_id', 'type'], as_index=False).count()
371 |
372 | time_min = df.groupby(['user_id', 'type'], as_index=False).min()
373 | time_max = df.groupby(['user_id', 'type'], as_index=False).max()
374 |
375 | time_cha = pd.merge(time_max, time_min, on=['user_id', 'type'], how='left')
376 | time_cha['time_x'] = time_cha['time_x'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
377 | time_cha['time_y'] = time_cha['time_y'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
378 |
379 | time_cha['cha_hour'] = 1 + (time_cha['time_x'] - time_cha['time_y']).dt.days * 24 + (time_cha['time_x'] - time_cha['time_y']).dt.seconds // 3600
382 | del time_cha['time_x']
383 | del time_cha['time_y']
384 | # time_cha=time_cha.fillna(1)
385 |
386 | actions = pd.merge(time_cha, actions, on=['user_id', 'type'], how="left")
387 | actions = actions.groupby(['user_id', 'type']).sum()
388 | actions['cnt/time'] = actions['time'] / actions["cha_hour"]
389 | actions = actions.unstack()
390 | actions.columns = list(range(actions.shape[1]))
391 | actions = actions.reset_index()
392 | actions = actions.fillna(0)
393 | actions.to_csv(dump_path, index=False)
394 | actions.columns = ['user_id'] + ['u_feat7_' + str(i) for i in range(1, actions.shape[1])]
395 | return actions
396 |
397 |
398 | def user_top_k_0_1(start_date, end_date):
399 | actions = get_actions(start_date, end_date)
400 | actions = actions[['user_id', 'sku_id', 'type']]
401 | df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date))
402 | actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame
403 | actions = actions.groupby('user_id', as_index=False).sum()
404 | del actions['type']
405 | del actions['sku_id']
406 | user_id = actions['user_id']
407 | del actions['user_id']
408 | actions = actions.applymap(lambda x: 1 if x > 0 else 0)
409 | actions = pd.concat([user_id, actions], axis=1)
410 | return actions
411 |
412 |
413 | # 0/1 indicators of user actions over the last K days
414 | def get_action_user_feat8(start_date, end_date):
415 | dump_path = './cache/user_feat8_%s_%s.csv' % (start_date, end_date)
416 | if os.path.exists(dump_path):
417 | actions = pd.read_csv(dump_path)
418 | else:
419 | actions = None
420 | for i in (1, 2, 3, 4, 5, 6, 7, 15, 30):
421 | print(i)
422 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=i)
423 | start_days = start_days.strftime('%Y-%m-%d')
424 | if actions is None:
425 | actions = user_top_k_0_1(start_days, end_date)
426 | else:
427 | actions = pd.merge(actions, user_top_k_0_1(start_days, end_date), how='outer', on='user_id')
428 | actions.to_csv(dump_path, index=False)
429 | actions.columns = ['user_id'] + ['u_feat8_' + str(i) for i in range(1, actions.shape[1])]
430 | return actions
431 |
432 |
433 | # User repeat-purchase rate
434 | def get_action_user_feat8_2(start_date, end_date):
435 | dump_path = './cache/product_feat8_2_%s_%s.csv' % (start_date, end_date)
436 | if os.path.exists(dump_path):
437 | actions = pd.read_csv(dump_path)
438 | else:
439 | df = get_actions(start_date, end_date)[['user_id', 'sku_id', 'type']]
440 | df = df[df['type'] == 4] # 购买的行为
441 | df = df.groupby(['user_id', 'sku_id'], as_index=False).count()
442 | df.columns = ['user_id', 'sku_id', 'count1']
443 | df['count1'] = df['count1'].map(lambda x: 1 if x > 1 else 0)
444 | grouped = df.groupby(['user_id'], as_index=False)
445 | actions = grouped.count()[['user_id', 'count1']]
446 | actions.columns = ['user_id', 'count']
447 | re_count = grouped.sum()[['user_id', 'count1']]
448 | re_count.columns = ['user_id', 're_count']
449 | actions = pd.merge(actions, re_count, on='user_id', how='left')
450 | re_buy_rate = actions['re_count'] / actions['count']
451 | actions = pd.concat([actions['user_id'], re_buy_rate], axis=1)
452 | actions.columns = ['user_id', 're_buy_rate']
453 | actions.to_csv(dump_path, index=False)
454 | actions.columns = ['user_id'] + ['u_feat8_2_' + str(i) for i in range(1, actions.shape[1])]
455 | return actions
456 |
457 |
458 | # Days from the most recent action of each type to the end date
459 | def get_action_user_feat9(start_date, end_date):
460 | dump_path = './cache/user_feat9_%s_%s.csv' % (start_date, end_date)
461 | if os.path.exists(dump_path):
462 | actions = pd.read_csv(dump_path)
463 | else:
464 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']]
465 | # df['time'] = df['time'].map(lambda x: (-1)*get_day_chaju(x,start_date))
466 | df = df.drop_duplicates(['user_id', 'type'], keep='last')
467 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) + 1)
468 | actions = df.groupby(['user_id', 'type']).sum()
469 | actions = actions.unstack()
470 | actions.columns = list(range(actions.shape[1]))
471 | actions = actions.reset_index()
472 | actions = actions.fillna(30)
473 | actions.to_csv(dump_path, index=False)
474 | actions.columns = ['user_id'] + ['u_feat9_' + str(i) for i in range(1, actions.shape[1])]
475 | return actions
476 |
477 |
478 | # Count of actions on the most recent active day per type, min-max normalized
479 | def get_action_user_feat10(start_date, end_date):
480 | dump_path = './cache/user_feat10_%s_%s.csv' % (start_date, end_date)
481 | if os.path.exists(dump_path):
482 | actions = pd.read_csv(dump_path)
483 | else:
484 |
485 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']]
486 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) + 1)
487 |
488 | idx = df.groupby(['user_id', 'type'])['time'].transform(min)
489 | idx1 = idx == df['time']
490 | actions = df[idx1].groupby(["user_id", "type"]).count()
491 | actions = actions.unstack()
492 | actions.columns = list(range(actions.shape[1]))
493 | actions = actions.fillna(0)
494 | actions = actions.reset_index()
495 |
496 | user_sku = actions[['user_id']]
497 | del actions['user_id']
498 | min_max_scaler = preprocessing.MinMaxScaler()
499 | actions = min_max_scaler.fit_transform(actions.values)
500 | actions = pd.DataFrame(actions)
501 | actions = pd.concat([user_sku, actions], axis=1)
502 |
503 | actions.to_csv(dump_path, index=False)
504 | actions.columns = ['user_id'] + ['u_feat10_' + str(i) for i in range(1, actions.shape[1])]
505 | return actions
506 |
507 |
508 | # Action counts by type within the most recent n-day window
509 | def get_action_user_feat11(start_date, end_date, n):
510 | dump_path = './cache/user_feat11_%s_%s_%s.csv' % (start_date, end_date, n)
511 | if os.path.exists(dump_path):
512 | actions = pd.read_csv(dump_path)
513 | else:
514 |
515 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']]
516 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n)
517 | df = df[df['time'] == 0]
518 | del df['time']
519 | temp = pd.get_dummies(df['type'], prefix='type')
520 | del df['type']
521 | actions = pd.concat([df, temp], axis=1)
522 | actions = actions.groupby(['user_id'], as_index=False).sum()
523 | user_sku = actions[['user_id']]
524 | del actions['user_id']
525 | min_max_scaler = preprocessing.MinMaxScaler()
526 | actions = min_max_scaler.fit_transform(actions.values)
527 | actions = pd.DataFrame(actions)
528 | actions = pd.concat([user_sku, actions], axis=1)
529 | actions.to_csv(dump_path, index=False)
530 | actions.columns = ['user_id'] + ['u_feat11_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
531 | return actions
532 |
533 |
534 | def get_action_user_feat12(start_date, end_date):
535 | dump_path = './cache/user_feat12_%s_%s.csv' % (start_date, end_date)
536 | if os.path.exists(dump_path):
537 | actions = pd.read_csv(dump_path)
538 | else:
539 | actions = get_actions(start_date, end_date)[['user_id', 'time', 'type']]
540 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0])
541 | actions = actions.drop_duplicates(['user_id', 'time', 'type'], keep='first')
542 | actions['day'] = actions['time'].map(
543 | lambda x: (datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(x, '%Y-%m-%d')).days)
544 | result = None
545 | for i in (2, 3, 7, 14, 28): # level size in days
546 | print ('i%s' % i)
547 | actions['level%s' % i] = actions['day'].map(lambda x: x // i)
548 | a=set(actions['level%s' % i].tolist())
549 | for j in (1, 2, 3, 4, 5, 6): # action type
550 | print ('j%s' % j)
551 | df = actions[actions['type'] == j][['user_id', 'level%s' % i, 'time']]
552 | df = df.groupby(['user_id', 'level%s' % i]).count()
553 | df = df.unstack()
554 | b=df.columns.levels[1].tolist()
555 | df.columns = ['u_feat12_' + str('level%s_' % i) + str(j) + '_' + str(k) for k in df.columns.levels[1].tolist()]
556 | if len(list(a-set(b)))!=0:
557 | c=list(a-set(b))
558 | for k in c:
559 | df['u_feat12_'+str('level%s_' % i)+str(j)+'_'+ str(k)]=0
560 | columns=df.columns
561 | col_order={} # map each column to its trailing level index
562 | for column in columns:
563 | k=int(column.split('_')[-1])
564 | col_order[column]=k
565 | columns=sorted(col_order.items(),key=lambda x: x[1])
566 | columns=[pair[0] for pair in columns]
567 | df=df[columns]
568 | df = df.reset_index()
569 | if result is None:
570 | result = df
571 | else:
572 | result = pd.merge(result, df, on='user_id', how='left')
573 | columns = result.columns
574 | user_id = result['user_id']
575 | del result['user_id']
576 | actions = result.fillna(0)
577 |
578 | min_max_scaler = preprocessing.MinMaxScaler()
579 | actions = min_max_scaler.fit_transform(actions.values)
580 | actions = pd.DataFrame(actions)
581 | actions = pd.concat([user_id, actions], axis=1)
582 | actions.columns=columns
583 | actions.to_csv(dump_path, index=False)
584 | return actions
585 |
586 |
587 |
588 | # Days active within each n-day level, by action type
589 | def get_action_user_feat13(start_date, end_date, n):
590 | dump_path = './cache/user_feat13_%s_%s_%s.csv' % (start_date, end_date, n)
591 | if os.path.exists(dump_path):
592 | actions = pd.read_csv(dump_path)
593 | else:
594 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']]
595 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n)
596 | df = df.drop_duplicates(['user_id', 'type', 'time'], keep='first')
597 | actions = df.groupby(['user_id', 'type']).count()
598 | actions = actions.unstack()
599 | actions.columns = list(range(actions.shape[1]))
600 | actions = actions.fillna(0)
601 | actions = actions.reset_index()
602 | user_sku = actions[['user_id']]
603 | del actions['user_id']
604 | min_max_scaler = preprocessing.MinMaxScaler()
605 | actions = min_max_scaler.fit_transform(actions.values)
606 | actions = pd.DataFrame(actions)
607 | actions = pd.concat([user_sku, actions], axis=1)
608 | actions.to_csv(dump_path, index=False)
609 | actions.columns = ['user_id'] + ['u_feat13_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
610 | return actions
611 |
612 |
613 | def get_action_user_feat14(start_date, end_date):
614 | dump_path = './cache/user_feat14_%s_%s.csv' % (start_date, end_date)
615 | if os.path.exists(dump_path):
616 | actions = pd.read_csv(dump_path)
617 | else:
618 | n = 5
619 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']]
620 | df = df[df['type'] == 4][['user_id', 'time']]
621 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n)
622 | days = np.max(df['time'])
623 |
624 | df['cnt'] = 0
625 | actions = df.groupby(['user_id', 'time']).count()
626 |
627 | actions = actions.unstack()
628 |
629 | actions.columns = list(range(actions.shape[1]))
630 | actions = actions.reset_index()
631 |
632 | actions = actions.fillna(0)
633 | user_sku = actions[['user_id']]
634 | del actions['user_id']
635 | min_max_scaler = preprocessing.MinMaxScaler()
636 | actions = min_max_scaler.fit_transform(actions.values)
637 | actions = pd.DataFrame(actions)
638 | actions = pd.concat([user_sku, actions], axis=1)
639 | actions.to_csv(dump_path, index=False)
640 | actions.columns = ['user_id'] + ['u_feat14_' + str(i) for i in range(1, actions.shape[1])]
641 | return actions
642 |
643 |
644 | # Visit counts before purchase / cart-add / follow
645 | def get_action_user_feat15(start_date, end_date):
646 | dump_path = './cache/user_feat15_%s_%s.csv' % (start_date, end_date)
647 | if os.path.exists(dump_path):
648 | actions = pd.read_csv(dump_path)
649 | else:
650 | # visits before buying
651 | def user_feat_15_1(start_date, end_date):
652 | actions = get_actions(start_date, end_date)[['user_id', 'type']]
653 | visit = actions[actions['type'] == 1]
654 | visit = visit.groupby('user_id', as_index=False).count()
655 | visit.columns = ['user_id', 'visit']
656 | buy = actions[actions['type'] == 4]
657 | buy = buy.groupby('user_id', as_index=False).count()
658 | buy.columns = ['user_id', 'buy']
659 | actions = pd.merge(visit, buy, on='user_id', how='left')
660 | actions['visit_num_before_buy'] = actions['visit'] / actions['buy']
661 | del actions['buy']
662 | del actions['visit']
663 | return actions
664 |
665 | # visits before adding to cart
666 | def user_feat_15_2(start_date, end_date):
667 | actions = get_actions(start_date, end_date)[['user_id', 'type']]
668 | visit = actions[actions['type'] == 1]
669 | visit = visit.groupby('user_id', as_index=False).count()
670 | visit.columns = ['user_id', 'visit']
671 | addtoshopping = actions[actions['type'] == 2]
672 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count()
673 | addtoshopping.columns = ['user_id', 'addtoshopping']
674 | actions = pd.merge(visit, addtoshopping, on='user_id', how='left')
675 | actions['visit_num_before_addtoshopping'] = actions['visit'] / actions['addtoshopping']
676 | del actions['addtoshopping']
677 | del actions['visit']
678 | return actions
679 |
680 | # visits before following
681 | def user_feat_15_3(start_date, end_date):
682 | actions = get_actions(start_date, end_date)[['user_id', 'type']]
683 | visit = actions[actions['type'] == 1]
684 | visit = visit.groupby('user_id', as_index=False).count()
685 | visit.columns = ['user_id', 'visit']
686 | guanzhu = actions[actions['type'] == 5]
687 | guanzhu = guanzhu.groupby('user_id', as_index=False).count()
688 | guanzhu.columns = ['user_id', 'guanzhu']
689 | actions = pd.merge(visit, guanzhu, on='user_id', how='left')
690 | actions['visit_num_before_guanzhu'] = actions['visit'] / actions['guanzhu']
691 | del actions['guanzhu']
692 | del actions['visit']
693 | return actions
694 |
695 | # cart-adds before buying
696 | def user_feat_15_4(start_date, end_date):
697 | actions = get_actions(start_date, end_date)[['user_id', 'type']]
698 | addtoshopping = actions[actions['type'] == 2]
699 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count()
700 | addtoshopping.columns = ['user_id', 'addtoshopping']
701 | buy = actions[actions['type'] == 4]
702 | buy = buy.groupby('user_id', as_index=False).count()
703 | buy.columns = ['user_id', 'buy']
704 | actions = pd.merge(addtoshopping, buy, on='user_id', how='left')
705 | actions['addtoshopping_num_before_buy'] = actions['addtoshopping'] / actions['buy']
706 | del actions['buy']
707 | del actions['addtoshopping']
708 | return actions
709 |
710 | # follows before buying
711 | def user_feat_15_5(start_date, end_date):
712 | actions = get_actions(start_date, end_date)[['user_id', 'type']]
713 | guanzhu = actions[actions['type'] == 5]
714 | guanzhu = guanzhu.groupby('user_id', as_index=False).count()
715 | guanzhu.columns = ['user_id', 'guanzhu']
716 | buy = actions[actions['type'] == 4]
717 | buy = buy.groupby('user_id', as_index=False).count()
718 | buy.columns = ['user_id', 'buy']
719 | actions = pd.merge(guanzhu, buy, on='user_id', how='left')
720 | actions['guanzhu_num_before_buy'] = actions['guanzhu'] / actions['buy']
721 | del actions['buy']
722 | del actions['guanzhu']
723 | return actions
724 |
725 | actions = pd.merge(user_feat_15_1(start_date, end_date), user_feat_15_2(start_date, end_date), on='user_id',
726 | how='outer')
727 | actions = pd.merge(actions, user_feat_15_3(start_date, end_date), on='user_id', how='outer')
728 | actions = pd.merge(actions, user_feat_15_4(start_date, end_date), on='user_id', how='outer')
729 | actions = pd.merge(actions, user_feat_15_5(start_date, end_date), on='user_id', how='outer')
730 | user_id = actions['user_id']
731 | del actions['user_id']
732 | actions = actions.fillna(0)
733 | min_max_scale = preprocessing.MinMaxScaler()
734 | actions = min_max_scale.fit_transform(actions.values)
735 | actions = pd.concat([user_id, pd.DataFrame(actions)], axis=1)
736 |
737 | actions.to_csv(dump_path, index=False)
738 | actions.columns = ['user_id'] + ['u_feat15_' + str(i) for i in range(1, actions.shape[1])]
739 | return actions
740 |
741 |
742 | # Share of each action type in the user's total actions
743 | def get_action_user_feat16(start_date, end_date):
744 | dump_path = './cache/user_feat16_%s_%s.csv' % (start_date, end_date)
745 | if os.path.exists(dump_path):
746 | actions = pd.read_csv(dump_path)
747 | else:
748 | actions = get_actions(start_date, end_date)[['user_id', 'type']]
749 | actions['cnt'] = 0
750 | action1 = actions.groupby(['user_id', 'type']).count()
751 | action1 = action1.unstack()
752 | index_col = list(range(action1.shape[1]))
753 | action1.columns = index_col
754 | action1 = action1.reset_index()
755 | action2 = actions.groupby('user_id', as_index=False).count()
756 | del action2['type']
757 | action2.columns = ['user_id', 'cnt']
758 | actions = pd.merge(action1, action2, how='left', on='user_id')
759 | for i in index_col:
760 | actions[i] = actions[i] / actions['cnt']
761 | del actions['cnt']
762 | actions.to_csv(dump_path, index=False)
763 | actions.columns = ['user_id'] + ['u_feat16_' + str(i) for i in range(1, actions.shape[1])]
764 | return actions
765 |
766 |
767 | # Last n days: user's actions on P-set products vs. all products (for n > 7, divide by the overall counts; otherwise keep raw counts)
768 | def get_action_user_feat0509_1_30(start_date, end_date, n):
769 | dump_path = './cache/user_feat0509_1_30_%s_%s_%s.csv' % (start_date, end_date, n)
770 | if os.path.exists(dump_path):
771 | actions = pd.read_csv(dump_path)
772 | else:
773 |
774 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
775 | start_days = datetime.strftime(start_days, '%Y-%m-%d')
776 |
777 | actions = get_actions(start_days, end_date)[['user_id', 'sku_id', 'type']]
778 | actions_dummy = pd.get_dummies(actions['type'], prefix='actions')
779 | actions = pd.concat([actions, actions_dummy], axis=1)
780 | del actions['type']
781 |
782 | P = get_basic_product_feat()[['sku_id']]
783 | P['label'] = 1
784 | actions_sub = pd.merge(actions, P, on='sku_id', how='left')
785 | actions_sub = actions_sub[actions_sub['label'] == 1]
786 | del actions_sub['label']
787 |
788 | actions_sub = actions_sub.groupby(['user_id'], as_index=False).sum()
789 | del actions_sub['sku_id']
790 | actions_all = actions.groupby(['user_id'], as_index=False).sum()
791 | del actions_all['sku_id']
792 |
793 | if n > 7:
794 | actions = pd.merge(actions_all, actions_sub, on=['user_id'], how='left')
795 | # print actions.head()
796 | for i in range(1, 7):
797 | actions['actions_%s' % i] = actions['actions_%s_y' % i] / actions['actions_%s_x' % i]
798 | # actions=actions[['user_id','actions_1','actions_2','actions_3','actions_4','actions_5','actions_6']]
799 |
800 | else:
801 | actions = pd.merge(actions_all, actions_sub, on=['user_id'], how='left')
802 | actions.to_csv(dump_path, index=False)
803 | actions.columns = ['user_id'] + ['u_feat30_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
804 |
805 | return actions
806 |
807 |
808 |
882 |
883 | # Time gap in hours from first click to last purchase of the same SKU
884 | def get_action_user_feat0515_2_1(start_date,end_date):
885 | dump_path='./cache/get_action_user_feat0515_2_1_%s_%s.csv'%(start_date,end_date)
886 | if os.path.exists(dump_path):
887 | actions = pd.read_csv(dump_path)
888 | else:
889 | actions = get_actions(start_date,end_date)
890 | actions_dianji=actions[actions['type']==6][['user_id','sku_id','time']]
891 | actions_dianji['time_dianji'] = actions_dianji['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
892 | actions_dianji = actions_dianji[['user_id', 'sku_id','time_dianji']]
893 | actions_dianji= actions_dianji.drop_duplicates(['user_id', 'sku_id'], keep='first')
894 |
895 |
896 | actions_goumai=actions[actions['type']==4][['user_id','sku_id','time']]
897 | actions_goumai['time_goumai'] = actions_goumai['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
898 | actions_goumai = actions_goumai[['user_id', 'sku_id','time_goumai']]
899 | actions_goumai= actions_goumai.drop_duplicates(['user_id', 'sku_id'], keep='last')
900 |
901 | actions = pd.merge(actions_dianji,actions_goumai,on=['user_id','sku_id'],how='inner')
902 | actions['time_jiange']=actions['time_goumai']-actions['time_dianji']
903 | actions=actions.drop(['sku_id','time_goumai','time_dianji'],axis=1)
904 | actions['time_jiange']=actions['time_jiange'].map(lambda x:x.days*24+x.seconds//3600+1)
905 |
906 | actions_min = actions.groupby('user_id').min().reset_index()
907 | actions_min.columns = ['user_id','time_min']
908 | # actions_mean = actions.groupby('user_id').mean().reset_index()
909 | # actions_mean.columns = ['user_id','time_mean']
910 | actions_max = actions.groupby('user_id').max().reset_index()
911 | actions_max.columns = ['user_id','time_max']
912 | actions=pd.merge(actions_min,actions_max,on='user_id',how='left')
913 |
914 | user_id = actions[['user_id']]
915 | del actions['user_id']
916 | actions = actions.fillna(0)
917 | columns = actions.columns
918 | min_max_scale = preprocessing.MinMaxScaler()
919 | actions = min_max_scale.fit_transform(actions.values)
920 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
921 | actions.to_csv(dump_path,index=False)
922 | return actions
923 |
924 |
925 | # Per-user action counts for each cate
926 | def get_action_user_feat0515_2_2(start_date,end_date):
927 | dump_path='./cache/get_action_user_feat0515_2_2_%s_%s.csv'%(start_date,end_date)
928 | if os.path.exists(dump_path):
929 | actions = pd.read_csv(dump_path)
930 | else:
931 | actions = get_actions(start_date,end_date)
932 | actions = actions[['user_id','cate']]
933 | cate_col = pd.get_dummies(actions['cate'],prefix='cate')
934 | actions=pd.concat([actions[['user_id']],cate_col],axis=1)
935 | actions= actions.groupby('user_id').sum().reset_index()
936 |
937 | user_id = actions[['user_id']]
938 | del actions['user_id']
939 | actions = actions.fillna(0)
940 | columns = actions.columns
941 | min_max_scale = preprocessing.MinMaxScaler()
942 | actions = min_max_scale.fit_transform(actions.values)
943 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
944 | actions.to_csv(dump_path,index=False)
945 | return actions
946 |
947 |
948 | # Cart-add and follow counts over the window, overall and for cate 8
949 | def get_action_user_feat0515_2_3(start_date, end_date, n):
950 | dump_path = './cache/get_action_user_feat0515_2_3_%s_%s_%s_1.csv' % (start_date, end_date, n)
951 | if os.path.exists(dump_path):
952 | actions = pd.read_csv(dump_path)
953 | else:
954 |
955 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
956 | start_days = datetime.strftime(start_days, '%Y-%m-%d')
957 |
958 | actions = get_actions(start_days,end_date)[['user_id','type','cate']]
959 | actions_gouwuche=actions[actions['type']==2]
960 | actions_gouwuche_1= actions_gouwuche[['user_id','type']]
961 | actions_gouwuche_1= actions_gouwuche_1.groupby('user_id').count().reset_index()
962 | actions_gouwuche_1.columns = ['user_id',str(n)+'gouwuche_add']
963 |
964 | actions_gouwuche_2= actions_gouwuche[actions_gouwuche['cate']==8][['user_id','type']]
965 | actions_gouwuche_2= actions_gouwuche_2.groupby('user_id').count().reset_index()
966 | actions_gouwuche_2.columns = ['user_id',str(n)+'gouwuche_add_cate_8']
967 |
968 | actions_guanzhu=actions[actions['type']==5]
969 | actions_guanzhu_1= actions_guanzhu[['user_id','type']]
970 | actions_guanzhu_1= actions_guanzhu_1.groupby('user_id').count().reset_index()
971 | actions_guanzhu_1.columns = ['user_id',str(n)+'guanzhu_add']
972 |
973 | actions_guanzhu_2= actions_guanzhu[actions_guanzhu['cate']==8][['user_id','type']]
974 | actions_guanzhu_2= actions_guanzhu_2.groupby('user_id').count().reset_index()
975 | actions_guanzhu_2.columns = ['user_id',str(n)+'guanzhu_add_cate_8']
976 |
977 | actions = pd.merge(actions_gouwuche_1,actions_gouwuche_2,on='user_id',how ='outer')
978 | actions = pd.merge(actions,actions_guanzhu_1,on='user_id',how ='outer')
979 | actions = pd.merge(actions,actions_guanzhu_2,on='user_id',how ='outer')
980 | actions=actions.fillna(0)
981 |
982 | user_id = actions[['user_id']]
983 | del actions['user_id']
984 | actions = actions.fillna(0)
985 | columns = actions.columns
986 | min_max_scale = preprocessing.MinMaxScaler()
987 | actions = min_max_scale.fit_transform(actions.values)
988 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
989 | actions.to_csv(dump_path, index=False)
990 |
991 |
992 | return actions
993 |
994 | # Within the last n days, on how many distinct days each action type occurred
995 | def get_action_user_feat0515_2_4(start_date, end_date, n):
996 | dump_path = './cache/get_action_user_feat0515_2_4_%s_%s_%s.csv' % (start_date, end_date, n)
997 | if os.path.exists(dump_path):
998 | actions = pd.read_csv(dump_path)
999 | else:
1000 |
1001 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
1002 | start_days = datetime.strftime(start_days, '%Y-%m-%d')
1003 |
1004 | actions = get_actions(start_days,end_date)[['user_id','type','time']]
1005 | actions['time'] = actions['time'].map(lambda x: (datetime.strptime(end_date,'%Y-%m-%d')-datetime.strptime(x, '%Y-%m-%d %H:%M:%S')).days)
1006 | actions=actions.drop_duplicates(['user_id','type','time'])
1007 | actions = actions.groupby(['user_id','type']).count()
1008 | actions.columns = [str(n)+'day_nums']
1009 | actions=actions.unstack()
1010 | actions=actions.reset_index()
1011 | actions.columns = ['user_id'] + ['get_action_user_feat0515_2_4_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
1012 | actions=actions.fillna(0)
1013 |
1014 | user_id = actions[['user_id']]
1015 | del actions['user_id']
1016 | actions = actions.fillna(0)
1017 | columns = actions.columns
1018 | min_max_scale = preprocessing.MinMaxScaler()
1019 | actions = min_max_scale.fit_transform(actions.values)
1020 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
1021 | actions.to_csv(dump_path, index=False)
1022 | return actions
1023 |
1024 |
1025 | # Distinct SKUs per user that were browsed/carted/bought/followed/clicked
1026 | def get_action_user_feat5(start_date, end_date):
1027 | dump_path = './cache/user_feat5_a_%s_%s.csv' % (start_date, end_date)
1028 | if os.path.exists(dump_path):
1029 | actions = pd.read_csv(dump_path)
1030 | else:
1031 | actions = get_actions(start_date, end_date)
1032 | action=None
1033 | for i in (1,2,4,5,6):
1034 | df=actions[actions['type']==i][['user_id', 'sku_id']]
1035 | df = df.drop_duplicates(['user_id', 'sku_id'], keep='first')
1036 | df = df.groupby('user_id', as_index=False).count()
1037 | df.columns = ['user_id', 'num_%s'%i]
1038 | if i==1:
1039 | action=df
1040 | else:
1041 | action=pd.merge(action,df,on='user_id',how='outer')
1042 | actions=action.fillna(0)
1043 | actions = actions.astype('float')
1044 | user=actions[['user_id']]
1045 | min_max_scaler = preprocessing.MinMaxScaler()
1046 | actions = min_max_scaler.fit_transform(actions.drop(['user_id'],axis=1).values)
1047 | actions = pd.DataFrame(actions)
1048 | actions = pd.concat([user, actions], axis=1)
1049 | actions.to_csv(dump_path, index=False)
1050 | actions.columns = ['user_id'] + ['u_feat5_' + str(i) for i in range(1, actions.shape[1])]
1051 | return actions
1052 |
1053 | # Distinct browsed/carted/bought/followed/clicked SKUs within the last k days
1054 | def get_action_u0515_feat5(start_date,end_date,k):
1055 | dump_path = './cache/u0515_feat5_%s_%s_%s.csv' % (start_date, end_date,k)
1056 | if os.path.exists(dump_path):
1057 | actions = pd.read_csv(dump_path)
1058 | else:
1059 | start_days=pd.to_datetime(end_date)-timedelta(days=k)
1060 | start_days=str(start_days).split(' ')[0]
1061 | actions=get_action_user_feat5(start_days, end_date)
1062 | actions.to_csv(dump_path,index=False)
1063 | actions.columns=['user_id']+['u0515_feat5_'+str(k)+'_'+str(i) for i in range(1,actions.shape[1])]
1064 | return actions
1065 |
1066 |
1067 | # Hours from the earliest interaction to end_date (full set and P subset)
1068 | def get_action_u0524_feat1(start_date,end_date):
1069 | dump_path = './cache/u0524_feat1_%s_%s.csv' % (start_date, end_date,)
1070 | if os.path.exists(dump_path):
1071 | actions = pd.read_csv(dump_path)
1072 | else:
1073 | # full action set
1074 | actions=get_actions(start_date,end_date)[['user_id','time']]
1075 | actions=actions.groupby('user_id',as_index=False).first()
1076 | actions['time_diff_early']=pd.to_datetime(end_date)-pd.to_datetime(actions['time'])
1077 | actions['time_diff_early']=actions['time_diff_early'].dt.days*24+actions['time_diff_early'].dt.seconds//3600
1078 | actions=actions[['user_id','time_diff_early']]
1079 | # subset (P set)
1080 | sub_actions=sub_get_actions(start_date,end_date)[['user_id','time']]
1081 | sub_actions=sub_actions.groupby('user_id',as_index=False).first()
1082 | sub_actions['sub_time_diff_early']=pd.to_datetime(end_date)-pd.to_datetime(sub_actions['time'])
1083 | sub_actions['sub_time_diff_early']=sub_actions['sub_time_diff_early'].dt.days*24+sub_actions['sub_time_diff_early'].dt.seconds//3600
1084 | sub_actions = sub_actions[['user_id', 'sub_time_diff_early']]
1085 |
1086 | actions=pd.merge(actions,sub_actions,on='user_id',how='left')
1087 | actions=actions.fillna(0)
1088 | min_max_scale = preprocessing.MinMaxScaler()
1089 | action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values)
1090 | actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1)
1091 | actions.to_csv(dump_path,index=False)
1092 | actions.columns=['user_id']+['u0524_feat1_'+str(i)for i in range(1,actions.shape[1])]
1093 | return actions
1094 |
1095 | # Hours from the most recent interaction to end_date (full set and P subset)
1096 | def get_action_u0524_feat2(start_date,end_date):
1097 | dump_path = './cache/u0524_feat2_%s_%s.csv' % (start_date, end_date,)
1098 | if os.path.exists(dump_path):
1099 | actions = pd.read_csv(dump_path)
1100 | else:
1101 | # full action set
1102 | actions = get_actions(start_date, end_date)[['user_id', 'time']]
1103 | actions = actions.groupby('user_id', as_index=False).last()
1104 | actions['time_diff_recent'] = pd.to_datetime(end_date) - pd.to_datetime(actions['time'])
1105 | actions['time_diff_recent'] = actions['time_diff_recent'].dt.days * 24 + actions['time_diff_recent'].dt.seconds // 3600
1106 | actions = actions[['user_id', 'time_diff_recent']]
1107 | # subset (P set)
1108 | sub_actions = sub_get_actions(start_date, end_date)[['user_id', 'time']]
1109 | sub_actions = sub_actions.groupby('user_id', as_index=False).last()
1110 | sub_actions['sub_time_diff_recent'] = pd.to_datetime(end_date) - pd.to_datetime(sub_actions['time'])
1111 | sub_actions['sub_time_diff_recent'] = sub_actions['sub_time_diff_recent'].dt.days * 24 + sub_actions['sub_time_diff_recent'].dt.seconds // 3600
1112 | sub_actions = sub_actions[['user_id', 'sub_time_diff_recent']]
1113 |
1114 | actions = pd.merge(actions, sub_actions, on='user_id', how='left')
1115 | actions=actions.fillna(0)
1116 | min_max_scale = preprocessing.MinMaxScaler()
1117 | action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values)
1118 | actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1)
1119 | actions.to_csv(dump_path,index=False)
1120 | actions.columns = ['user_id'] + ['u0524_feat2_' + str(i) for i in range(1, actions.shape[1])]
1121 | return actions
1122 |
1123 |
1124 | # Number of distinct active days (full set and P subset)
1125 | def get_action_u0524_feat3(start_date,end_date):
1126 | dump_path = './cache/u0524_feat3_%s_%s.csv' % (start_date, end_date,)
1127 | if os.path.exists(dump_path):
1128 | actions = pd.read_csv(dump_path)
1129 | else:
1130 | # full action set
1131 | actions=get_actions(start_date,end_date)
1132 | actions['time']=pd.to_datetime(actions['time']).dt.date
1133 | actions=actions.drop_duplicates(['user_id','time'])[['user_id','time']]
1134 | actions=actions.groupby('user_id',as_index=False).count()
1135 | # subset (P set)
1136 | sub_actions=sub_get_actions(start_date,end_date)
1137 | sub_actions['time']=pd.to_datetime(sub_actions['time']).dt.date
1138 | sub_actions=sub_actions.drop_duplicates(['user_id','time'])[['user_id','time']]
1139 | sub_actions=sub_actions.groupby('user_id',as_index=False).count()
1140 | actions=pd.merge(actions,sub_actions,on='user_id',how='left')
1141 | actions=actions.fillna(0)
1142 | min_max_scale = preprocessing.MinMaxScaler()
1143 | action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values)
1144 | actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1)
1145 | actions.to_csv(dump_path,index=False)
1146 | actions.columns=['user_id']+['u0524_feat3_'+str(i) for i in range(1,actions.shape[1])]
1147 | return actions
1148 |
1149 |
1150 | # Click-module features: counts of clicks on specific model_ids
1151 | def get_action_user_feat0509_1_31(start_date,end_date,n):
1152 | dump_path='./cache/user_feat0509_1_31_%s_%s_%s.csv'%(start_date,end_date,n)
1153 | if os.path.exists(dump_path):
1154 | actions = pd.read_csv(dump_path)
1155 | else:
1156 | start_days=datetime.strptime(end_date,'%Y-%m-%d')-timedelta(days=n)
1157 | start_days=datetime.strftime(start_days,'%Y-%m-%d')
1158 | actions=get_actions(start_days,end_date)
1159 | actions=actions[actions['type']==6][['user_id','model_id']]
1160 |
1161 | # actions = actions.drop('type',axis=1)
1162 |
1163 | actions_click_sum=actions[['user_id','model_id']].groupby('user_id').count().reset_index()
1164 | actions_click_sum.columns = ['user_id',str(n)+'click_sum_all']
1165 | actions[str(n)+'u_click14_history'] = actions['model_id'].map(lambda x: int(x == 14))
1166 | actions[str(n)+'u_click21_history'] = actions['model_id'].map(lambda x: int(x == 21))
1167 | actions[str(n)+'u_click28_history'] = actions['model_id'].map(lambda x: int(x == 28))
1168 | actions[str(n)+'u_click110_history'] = actions['model_id'].map(lambda x: int(x == 110))
1169 | actions[str(n)+'u_click210_history'] = actions['model_id'].map(lambda x: int(x == 210))
1170 | actions = actions.groupby('user_id').sum().reset_index().drop('model_id', axis=1)
1171 | # actions.to_csv(dump_path,index=False)
1172 | actions = pd.merge(actions,actions_click_sum,how='left',on='user_id')
1173 |
1174 | actions[str(n)+'u_click14/click_sum_history'] = actions[str(n)+'u_click14_history']/actions[str(n)+'click_sum_all']
1175 | actions[str(n)+'u_click21/click_sum_history'] = actions[str(n)+'u_click21_history']/actions[str(n)+'click_sum_all']
1176 | actions[str(n)+'u_click28/click_sum_history'] = actions[str(n)+'u_click28_history']/actions[str(n)+'click_sum_all']
1177 | actions[str(n)+'u_click110/click_sum_history'] = actions[str(n)+'u_click110_history']/actions[str(n)+'click_sum_all']
1178 | actions[str(n)+'u_click210/click_sum_history'] = actions[str(n)+'u_click210_history']/actions[str(n)+'click_sum_all']
1179 |
1180 | user_id = actions[['user_id']]
1181 | del actions['user_id']
1182 | actions = actions.fillna(0)
1183 | columns = actions.columns
1184 | min_max_scale = preprocessing.MinMaxScaler()
1185 | actions = min_max_scale.fit_transform(actions.values)
1186 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
1187 | actions.to_csv(dump_path,index=False)
1188 | return actions
1189 | # U model: users' cate-8 purchases vs. purchases in other cates
1190 | def get_action_u0513_feat16(start_date,end_date):
1191 | dump_path = './cache/u0513_feat16_%s_%s.csv' % (start_date, end_date)
1192 | if os.path.exists(dump_path):
1193 | actions = pd.read_csv(dump_path)
1194 | else:
1195 | df = get_actions(start_date, end_date)[['user_id', 'type', 'cate']]
1196 | df = df[df['type'] == 4]
1197 | df = df.groupby(['user_id', 'cate']).count()
1198 | df = df.unstack().reset_index()
1199 | df.columns = ['user_id'] + ['cate' + str(i) for i in range(4, 12)]
1200 | df = df.fillna(0)
1201 | sum1 = df.drop(['user_id', 'cate8'], axis=1).apply(sum, axis=1)
1202 | sum2 = df.drop(['user_id'], axis=1).apply(sum, axis=1)
1203 | actions = pd.concat([df[['user_id', 'cate8']], sum1, sum2], axis=1)
1204 | actions.columns = ['user_id', 'cate8', 'sum_other_cate', 'sum']
1205 | actions['cate8_rate'] = actions['cate8'] / actions['sum']
1206 | actions['sum_other_cate_rate'] = actions['sum_other_cate'] / actions['sum']
1207 | del actions['sum']
1208 | actions.to_csv(dump_path,index=False)
1209 | return actions
1210 |
1211 | #get_action_u0513_feat16('2016-02-01','2016-04-16')
1212 | # User time-level action features
1213 | def get_action_user_feat_six_xingwei(start_date, end_date, n):
1214 | dump_path = './cache/user_six_action_%s_%s_%s_int.csv' % (start_date, end_date, n)
1215 | if os.path.exists(dump_path):
1216 | actions = pd.read_csv(dump_path)
1217 | print("user_zlzl" + str(n))
1218 |
1219 | else:
1220 | actions = get_actions(start_date, end_date)
1221 | actions['time'] = actions['time'].map(lambda x: get_day_chaju(x, end_date) // n)
1222 | num_day = np.max(actions['time'])
1223 | df = None
1224 | print(num_day)
1225 | for i in range(min(num_day + 1, 6)):
1226 | in_temp = pd.get_dummies(actions['type'], prefix="user_action_time_" + str(i))
1227 | temp = actions[actions['time'] == i]
1228 | temp = pd.concat([temp['user_id'], in_temp], axis=1)
1229 |
1230 | feature = ['user_id']
1231 | for j in range(1, 7, 1):
1232 | feature.append('user_action_time_' + str(i) + '_' + str(j))
1233 |
1234 | temp = temp.groupby(['user_id'], as_index=False).sum()
1235 | temp.columns = feature
1236 | if df is None:
1237 | df = temp
1238 | else:
1239 | df = pd.merge(df, temp, how='outer', on='user_id')
1240 | df.columns = ['user_id'] + ['get_action_user_feat_six_xingwei_' + str(n) + '_' + str(i) for i in range(1, df.shape[1])]
1241 | df.to_csv(dump_path, index=False)
1242 | actions=df
1243 |
1244 | # user_id = actions[['user_id']]
1245 | # del actions['user_id']
1246 | # actions = actions.fillna(0)
1247 | # actions=actions.replace(np.inf,0)
1248 | # # print(actions.head())
1249 | # columns = actions.columns
1250 |
1251 | # min_max_scale = preprocessing.MinMaxScaler()
1252 | # actions=actions.replace(np.inf,0)
1253 | # actions = min_max_scale.fit_transform(actions.values)
1254 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
1255 | actions.columns = ['user_id'] + ['get_action_user_feat_six_xingwei_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
1256 | return actions
1257 |
1258 |
1259 | def deal_user_six_deal(start_date, end_date, n):
1260 | dump_path = './cache/deal_user_six_action_%s_%s_%s_int.csv' % (start_date, end_date, n)
1261 | if os.path.exists(dump_path):
1262 | actions = pd.read_csv(dump_path)
1263 | actions.columns = ['user_id'] + ['u_featsix_deal_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
1264 | return actions
1265 | else:
1266 |         temp = get_action_user_feat_six_xingwei(start_date, end_date, n)  # modified
1267 | time1 = datetime.now()
1268 | columns = ["user_id"]
1269 | all_col = temp.shape[1] - 1
1270 | temp.columns = columns + list(range(all_col))
1271 | temp = temp.fillna(0)
1272 | columns = ['user_id']
1273 | for j in range(0, 6, 1):
1274 | temp["zl_" + str(j)] = 0
1275 | columns.append("zl_" + str(j))
1276 | for k in range(j, all_col, 6):
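     |                 # power-law time decay: bucket k//6 (0 = most recent) is weighted (k//6 + 1) ** -0.67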
1277 | temp["zl_" + str(j)] = temp["zl_" + str(j)] + temp[k].map(lambda x: x * ((k // 6 + 1) ** (-0.67)))
1278 |             col = temp["zl_" + str(j)]
1279 |             temp["zl_" + str(j)] = (col - col.min()) / (col.max() - col.min())
1280 |         temp = temp[columns]
1281 |         temp.to_csv(dump_path, index=False)
1282 |         temp.columns = ['user_id'] + ['u_featsix_deal_' + str(n) + '_' + str(i) for i in range(1, temp.shape[1])]
1283 |         return temp
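     | # Example of the decay above: with n=7, actions 0-6 days old fall in bucket 0 (weight 1.0),
     | # 7-13 days old in bucket 1 (2 ** -0.67 ≈ 0.63), 14-20 days old in bucket 2 (3 ** -0.67 ≈ 0.48).
     | 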
1284 | # # get user sku
1285 | # def get_user(start_date, end_date):
1286 | # dump_path = './cache/user_sku_%s_%s.csv' % (start_date, end_date)
1287 | # if os.path.exists(dump_path):
1288 | # actions = pd.read_csv(dump_path)
1289 | # else:
1290 | # actions = get_actions(start_date, end_date)
1291 | # actions = actions[(actions['type'] == 2) | (actions['type'] == 5) | (actions['type'] == 4)]
1292 | # actions=actions[actions['cate']==8]
1293 | # actions = actions[['user_id']]
1294 | # actions = actions.drop_duplicates(['user_id'], keep='first')
1295 | # actions.to_csv(dump_path, index=False)
1296 | # return actions
1297 |
1298 |
1299 | # user behavior in the k days before a purchase (per-sku counts, then min/max/mean per user)
1300 | def get_action_u0509_feat_28(start_date, end_date,k):
1301 | dump_path = './cache/u0509_feat_28_%s_%s_%s.csv' % (start_date, end_date,k)
1302 | if os.path.exists(dump_path):
1303 | actions = pd.read_csv(dump_path)
1304 | else:
1305 | actions = get_actions(start_date, end_date)
1306 | actions = actions[actions['type'] == 4]
1307 | actions['time_buy'] = actions['time'].map(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d'))
1308 | actions = actions[['user_id', 'sku_id', 'time_buy']].reset_index(drop=True)
1309 | actions['before_time_buy'] = actions['time_buy'] - timedelta(days=k)
1310 |
1311 | df = get_actions('2016-02-01','2016-04-16')[['user_id', 'sku_id', 'time', 'type']]
1312 | df['time'] = df['time'].map(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d'))
1313 | df = pd.merge(df, actions, on=['user_id', 'sku_id'], how='left')
1314 | df = df.dropna(axis=0, how='any')
1315 | df['before_days'] = (df['time'] - df['before_time_buy']).dt.days
1316 | df['days'] = (df['time'] - df['time_buy']).dt.days
1317 | df = df[(df['before_days'] >= 0) & (df['days'] < 0)]
1318 | df_dummy = pd.get_dummies(df['type'], prefix='type')
1319 |
1320 | df = pd.concat([df, df_dummy], axis=1)[
1321 | ['user_id', 'sku_id', 'type_1', 'type_2', 'type_3', 'type_4', 'type_5', 'type_6']]
1322 |
1323 | df = df.groupby(['user_id', 'sku_id'], as_index=False).sum()
1324 | del df['sku_id']
1325 | df = df.groupby('user_id', as_index=False).agg(['min', 'max', 'mean'])
1326 | df = df.reset_index()
1327 | df.columns = ['user_id'] + ['u0509_feat28_' + str(k) + '_' + i for i in (
1328 | 'type_1_min', 'type_1_max', 'type_1_mean', 'type_2_min', 'type_2_max', 'type_2_mean',
1329 | 'type_3_min', 'type_3_max', 'type_3_mean', 'type_4_min', 'type_4_max', 'type_4_mean',
1330 | 'type_5_min', 'type_5_max', 'type_5_mean', 'type_6_min', 'type_6_max', 'type_6_mean')]
1331 | min_max_scaler = preprocessing.MinMaxScaler()
1332 | actions = min_max_scaler.fit_transform(df.drop('user_id', axis=1).values)
1333 | actions = pd.DataFrame(actions)
1334 | actions = pd.concat([df[['user_id']], actions], axis=1)
1335 | actions.columns = ['user_id']+['u0509_feat_28_'+str(i) for i in range(1,actions.shape[1])]
1336 | actions.to_csv(dump_path,index=False)
1337 | actions.columns = ['user_id']+['u0509_feat_28_'+str(k)+"_"+str(i) for i in range(1,actions.shape[1])]
1338 | return actions
1339 |
1340 | # number of cate=8 brands the user viewed, and cate=8 brands viewed / all brands viewed
1341 | def get_action_u0509_feat_29(start_date,end_date):
1342 | dump_path = './cache/u0509_feat_29_%s_%s.csv' % (start_date, end_date)
1343 | if os.path.exists(dump_path):
1344 | actions = pd.read_csv(dump_path)
1345 | else:
1346 | actions=get_actions(start_date,end_date)
1347 | df1=actions[actions['cate']==8].drop_duplicates(['user_id','brand'])[['user_id','brand']]
1348 | df1=df1.groupby(['user_id'],as_index=False).count()
1349 | df1.columns=['user_id','brand_cate=8']
1350 | df2=actions.drop_duplicates(['user_id','brand'])[['user_id','brand']]
1351 | df2 = df2.groupby(['user_id'], as_index=False).count()
1352 | df2.columns=['user_id','brand_cate_all']
1353 | df=pd.merge(df1,df2,on='user_id',how='right')
1354 | df['rate']=df['brand_cate=8']/df['brand_cate_all']
1355 | # print df
1356 | actions=df.fillna(0)
1357 | actions.to_csv(dump_path,index=False)
1358 |     actions.columns = ['user_id'] + ['u0509_feat_29_' + str(i) for i in range(1, actions.shape[1])]
1359 | return actions
1360 |
1361 | def get_action_u0521_feat_31(start_date,end_date,k):
1362 | dump_path = './cache/u0509_feat_31_%s_%s_%s.csv' % (start_date, end_date,k)
1363 | if os.path.exists(dump_path):
1364 | actions = pd.read_csv(dump_path)
1365 | else:
1366 | start_days=pd.to_datetime(end_date)-timedelta(days=k)
1367 |         start_days=datetime.strftime(start_days,'%Y-%m-%d')
1368 | actions=get_actions(start_days,end_date)
1369 | df1=actions[actions['cate']==8].drop_duplicates(['user_id','cate'])[['user_id','cate']]
1370 | df1=df1.groupby('user_id',as_index=False).count()
1371 | df1.columns=['user_id','cate8']
1372 | df2=actions.drop_duplicates(['user_id','cate'])[['user_id','cate']]
1373 | df2=df2.groupby('user_id',as_index=False).count()
1374 | actions=pd.merge(df1,df2,on='user_id',how='right')
1375 | actions['cate8/cate']=actions['cate8']/actions['cate']
1376 | actions=actions.fillna(0)
1377 | min_max_scaler = preprocessing.MinMaxScaler()
1378 | df = min_max_scaler.fit_transform(actions[['cate8','cate']].values)
1379 | df = pd.DataFrame(df)
1380 | actions = pd.concat([actions[['user_id','cate8/cate']], df], axis=1)
1381 | actions.to_csv(dump_path,index=False)
1382 | actions.columns=['user_id']+['u0509_feat_31_'+str(k)+'_'+str(i)for i in range(1,actions.shape[1])]
1383 | return actions
1384 |
1385 |
1386 | def get_action_u0521_feat_32(start_date,end_date):
1387 | dump_path = './cache/u0509_feat_32_%s_%s.csv' % (start_date, end_date)
1388 | if os.path.exists(dump_path):
1389 | actions = pd.read_csv(dump_path)
1390 | else:
1391 | actions=get_actions(start_date,end_date)
1392 | actions=actions[actions['cate']==8][['user_id','brand']]
1393 | df1=actions.drop_duplicates(['user_id','brand']).groupby('user_id',as_index=False).count()
1394 | df1.columns=['user_id','brand_num']
1395 | df2=actions.groupby('user_id',as_index=False).count()
1396 | actions=pd.merge(df1,df2,on='user_id',how='left')
1397 |         actions['brand/brand_num'] = actions['brand'] / actions['brand_num']  # actions per distinct brand
1398 | actions=actions.fillna(0)
1399 | min_max_scaler = preprocessing.MinMaxScaler()
1400 | df = min_max_scaler.fit_transform(actions.drop(['user_id'],axis=1).values)
1401 | df = pd.DataFrame(df)
1402 | actions = pd.concat([actions[['user_id']], df], axis=1)
1403 | actions.to_csv(dump_path, index=False)
1404 | actions.columns = ['user_id'] + ['u0509_feat_32_' + str(i) for i in range(1, actions.shape[1])]
1405 | return actions
1406 |
1407 | def get_action_user_feat7_0522_huachuang(start_date, end_date,n):
1408 | dump_path = './cache/user_feat7_six_%s_%s_%s_0522.csv' % (start_date, end_date,n)
1409 | if os.path.exists(dump_path):
1410 | actions = pd.read_csv(dump_path)
1411 | else:
1412 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
1413 | start_days = datetime.strftime(start_days, '%Y-%m-%d')
1414 |
1415 | df = get_actions(start_days, end_date)[['user_id', 'type', 'time']]
1416 | actions = df.groupby(['user_id', 'type'], as_index=False).count()
1417 |
1418 | time_min = df.groupby(['user_id', 'type'], as_index=False).min()
1419 | time_max = df.groupby(['user_id', 'type'], as_index=False).max()
1420 |
1421 | time_cha = pd.merge(time_max, time_min, on=['user_id', 'type'], how='left')
1422 | time_cha['time_x'] = time_cha['time_x'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
1423 | time_cha['time_y'] = time_cha['time_y'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
1424 |
1425 |         # hours (+1) between the first and last action of each (user, type) pair
1426 |         delta = time_cha['time_x'] - time_cha['time_y']
1427 |         time_cha['cha_hour'] = 1 + delta.dt.days * 24 + delta.dt.seconds // 3600
1428 | del time_cha['time_x']
1429 | del time_cha['time_y']
1430 | # time_cha=time_cha.fillna(1)
1431 |
1432 | actions = pd.merge(time_cha, actions, on=['user_id', 'type'], how="left")
1433 | actions = actions.groupby(['user_id', 'type']).sum()
1434 | actions['cnt/time'] = actions['time'] / actions["cha_hour"]
1435 | actions = actions.unstack()
1436 | actions.columns = list(range(actions.shape[1]))
1437 | actions = actions.reset_index()
1438 | actions = actions.fillna(0)
1439 | actions.to_csv(dump_path, index=False)
1440 | actions.columns = ['user_id'] + ['u_feat7_' +str(n)+"_"+ str(i) for i in range(1, actions.shape[1])]
1441 | return actions
1442 |
1443 | def get_user_labels(test_start_date,test_end_date):
1444 | dump_path = './cache/user_labels_%s_%s_11.csv' % (test_start_date, test_end_date)
1445 | if os.path.exists(dump_path):
1446 | actions = pd.read_csv(dump_path)
1447 | else:
1448 | actions = get_actions(test_start_date, test_end_date)
1449 | actions = actions[actions['cate']==8]
1450 | actions = actions[actions['type'] == 4].drop_duplicates(['user_id'])[['user_id']]
1451 |         actions['label'] = 1
1452 |         actions.to_csv(dump_path, index=False)
1453 | return actions
1454 |
1455 | print("U model 1 finish part_0")
1456 |
1457 | #########################################################################################################
1458 |
1459 |
1510 | # In[2]:
1511 |
1512 | import os
1513 | from datetime import datetime
1514 | from datetime import timedelta
1515 |
1516 | # -*- coding: utf-8 -*-
1517 | """
1518 | Created on Sun May 14 10:27:41 2017
1519 | @author: 老虎趴趴走
1520 | """
1521 | import pandas as pd
1522 | import numpy as np
1523 | # import datetime
1524 | import math
1525 |
1526 | def user_features(user, ful_action, sub_action, end_date):
1527 | dump_path='./cache/user_features_%s_0514_2.csv'%(end_date)
1528 | if os.path.exists(dump_path):
1529 | actions = pd.read_csv(dump_path)
1530 |
1531 | else:
1532 | end_date=pd.to_datetime(end_date)
1533 | day = timedelta(1, 0)
1534 | print('=====> 提取特征...')
1535 |         # NOTE: only the 1/3/5/30-day window slices are recoverable from the source here; the
1536 |         # names sub_3/sub_5/sub_30/ful_5/ful_30 and the elided feature construction below are
1537 |         # reconstructed assumptions following the visible sub_1 pattern.
1538 |         sub_1 = sub_action[(sub_action['time'] >= end_date - 1*day) & (sub_action['time'] < end_date)]
1539 |         sub_3 = sub_action[(sub_action['time'] >= end_date - 3*day) & (sub_action['time'] < end_date)]
1540 |         sub_5 = sub_action[(sub_action['time'] >= end_date - 5*day) & (sub_action['time'] < end_date)]
1541 |         sub_30 = sub_action[(sub_action['time'] >= end_date - 30*day) & (sub_action['time'] < end_date)]
1542 |         ful_5 = ful_action[(ful_action['time'] >= end_date - 5*day) & (ful_action['time'] < end_date)]
1543 |         ful_30 = ful_action[(ful_action['time'] >= end_date - 30*day) & (ful_action['time'] < end_date)]
1544 |         # ... (per-window user-feature construction elided) ...
1762 |         print('=====> 完成!')
1763 | actions.to_csv(dump_path,index=False)
1764 |
1765 | # user_id = actions[['user_id']]
1766 | # del actions['user_id']
1767 | # actions = actions.fillna(0)
1768 | # actions=actions.replace(np.inf,0)
1769 | # print(actions.head())
1770 | # columns = actions.columns
1771 |
1772 | # min_max_scale = preprocessing.MinMaxScaler()
1773 | # actions=actions.replace(np.inf,0)
1774 | # actions = min_max_scale.fit_transform(actions.values)
1775 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
1776 | return actions
1777 |
1778 | import pandas as pd
1779 | ful_action = pd.read_csv('./data/JData_Action.csv', parse_dates=[2], infer_datetime_format=True)
1780 | sub_action = pd.read_csv('./data/JData_subset_action.csv', parse_dates=[2, 7], infer_datetime_format=True)
1781 | user = pd.read_csv('./data/JData_modified_user.csv', parse_dates=[4])
1782 | # user_features(user,ful_action,sub_action,'2016-04-11')
1783 | print("U model 1 finish part_1")
1784 | ######################################################################################
1785 |
1786 |
1792 | # In[6]:
1793 |
1794 | # test set
1795 | # ful_action = pd.read_csv('./data/JData_Action.csv', parse_dates=[2], infer_datetime_format=True)
1796 | # sel_action = pd.read_csv('./data/JData_subset_action.csv', parse_dates=[2, 7], infer_datetime_format=True)
1797 | def make_test_set(train_start_date, train_end_date,user,ful_action,sub_action):
1798 | dump_path = './cache/bu0525model_1_u_test_set_%s_%s.csv' % (train_start_date, train_end_date)
1799 | if os.path.exists(dump_path):
1800 | actions = pd.read_csv(dump_path)
1801 | else:
1802 | start_days=str(pd.to_datetime(train_end_date)-timedelta(days=30)).split(' ')[0]
1803 | actions_1 = get_actions(start_days, train_end_date)
1804 | actions=actions_1[actions_1['cate']==8][['user_id']].drop_duplicates(['user_id'])
1805 |
1806 |
1807 |
1808 | print (actions.shape)
1809 |
1810 | start_days = "2016-02-01"
1811 | # actions = pd.merge(actions,get_basic_user_feat() , how='left', on='user_id')
1812 | # print(actions.shape)
1813 | #
1814 |
1815 | # actions = pd.merge(actions, get_action_user_feat1(start_days, train_end_date), how='left', on='user_id')
1816 | # print(actions.shape)
1817 | actions = pd.merge(actions, get_action_user_feat2(start_days, train_end_date), how='left', on='user_id')
1818 | print(actions.shape)
1819 | actions = pd.merge(actions, get_action_user_feat5(start_days, train_end_date), how='left', on='user_id')
1820 | print(actions.shape)
1821 | actions = pd.merge(actions, get_action_user_feat6(start_days, train_end_date), how='left', on='user_id')
1822 | print(actions.shape)
1823 | actions = pd.merge(actions, get_action_user_feat6_six(start_days, train_end_date), how='left', on='user_id')
1824 | print(actions.shape)
1825 | actions = pd.merge(actions, get_action_user_feat7(start_days, train_end_date), how='left', on='user_id')
1826 | print(actions.shape)
1827 | actions = pd.merge(actions, get_action_user_feat8(start_days, train_end_date), how='left', on='user_id')
1828 | print (actions.shape)
1829 | actions = pd.merge(actions, get_action_user_feat8_2(start_days, train_end_date), how='left', on='user_id')
1830 | print (actions.shape)
1831 | actions = pd.merge(actions, get_action_user_feat9(start_days, train_end_date), how='left', on='user_id')
1832 | print (actions.shape)
1833 | actions = pd.merge(actions, get_action_user_feat10(start_days, train_end_date), how='left', on='user_id')
1834 | print (actions.shape)
1835 | actions = pd.merge(actions, get_action_user_feat12(train_start_date, train_end_date), how='left', on='user_id')
1836 | print (actions.shape)
1837 | actions = pd.merge(actions, get_action_user_feat14(train_start_date, train_end_date), how='left', on='user_id')
1838 | print (actions.shape)
1839 | actions = pd.merge(actions, get_action_user_feat15(start_days, train_end_date), how='left', on='user_id')
1840 | print (actions.shape)
1841 | actions = pd.merge(actions, get_action_user_feat16(start_days, train_end_date), how='left', on='user_id')
1842 | print (actions.shape)
1843 | actions = pd.merge(actions, get_action_u0513_feat16(start_days, train_end_date), how='left', on='user_id')
1844 | print (actions.shape)
1845 | actions = pd.merge(actions, user_features(user,ful_action,sub_action,train_end_date), how='left', on='user_id')
1846 | print (actions.shape)
1847 | actions = pd.merge(actions, get_action_user_feat0515_2_1(train_start_date, train_end_date), how='left', on='user_id')
1848 | print (actions.shape)
1849 | actions = pd.merge(actions, get_action_user_feat0515_2_2(train_start_date, train_end_date), how='left', on='user_id')
1850 | print (actions.shape)
1851 |
1852 |         # model 1 and model 2
1853 | actions = pd.merge(actions, get_action_u0509_feat_29(train_start_date, train_end_date), how='left', on='user_id')
1854 | print (actions.shape)
1855 |         # model 2 only
1856 | # actions = pd.merge(actions, get_action_u0521_feat_32(train_start_date, train_end_date), how='left', on='user_id')
1857 |
1858 |
1859 | # actions = pd.merge(actions, get_action_u0524_feat1(start_days, train_end_date), how='left', on='user_id')
1860 | # print (actions.shape)
1861 |
1862 | # actions = pd.merge(actions, get_action_u0524_feat2(start_days, train_end_date), how='left', on='user_id')
1863 | # print (actions.shape)
1864 | # actions = pd.merge(actions, get_action_u0524_feat3(start_days, train_end_date), how='left', on='user_id')
1865 | # print (actions.shape)
1866 |
1867 | for i in (1, 2, 3, 7, 14, 28):
1868 | actions = pd.merge(actions, get_action_user_feat_six_xingwei(train_start_date, train_end_date, i), how='left',on='user_id')
1869 | actions = pd.merge(actions, deal_user_six_deal(train_start_date, train_end_date, i), how='left',on='user_id')
1870 | actions = pd.merge(actions, get_action_user_feat11(train_start_date, train_end_date, i), how='left',on='user_id')
1871 | actions = pd.merge(actions, get_action_user_feat13(train_start_date, train_end_date, i), how='left',on='user_id')
1872 | actions = pd.merge(actions, get_action_user_feat0509_1_30(train_start_date, train_end_date, i), how='left',on='user_id')
1873 | actions = pd.merge(actions, get_action_user_feat0515_2_3(train_start_date, train_end_date, i), how='left',on='user_id')
1874 | actions = pd.merge(actions, get_action_feat(train_start_date, train_end_date,i), how='left', on='user_id')
1875 | actions = pd.merge(actions, get_action_user_feat0515_2_4(train_start_date, train_end_date,i), how='left', on='user_id')
1876 | actions = pd.merge(actions, get_action_u0515_feat5(train_start_date, train_end_date,i), how='left', on='user_id')
1877 |             # model 1 and model 2
1878 | actions = pd.merge(actions, get_action_u0509_feat_28(train_start_date, train_end_date,i), how='left', on='user_id')
1879 |             if i <= 10:
1880 | actions = pd.merge(actions,get_action_user_feat0509_1_31(train_start_date, train_end_date,i), how='left', on='user_id')
1881 |             # model 2 only
1882 | # actions = pd.merge(actions, get_action_u0521_feat_31(train_start_date, train_end_date,i), how='left', on='user_id')
1883 | # actions = pd.merge(actions, get_action_user_feat7_0522_huachuang(train_start_date, train_end_date,i), how='left', on='user_id')
1884 | print(actions.shape)
1885 | print(actions.shape)
1886 |
1887 | actions = actions.fillna(0)
1888 | # user_id = actions[['user_id']]
1889 | # del actions['user_id']
1890 | # actions = actions.fillna(0)
1891 | # actions=actions.replace(np.inf,0)
1892 | # # print(actions.head())
1893 | # columns = actions.columns
1894 |
1895 | # min_max_scale = preprocessing.MinMaxScaler()
1896 | # actions=actions.replace(np.inf,0)
1897 | # actions = min_max_scale.fit_transform(actions.values)
1898 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
1899 | # actions.to_csv(dump_path,index=False)
1900 | return actions
1901 |
1902 |
1903 | # training set
1904 | def make_train_set(train_start_date, train_end_date, test_start_date, test_end_date,user,ful_action,sub_action):
1905 | dump_path = './cache/bu0525model_1_u_train_set_%s_%s_%s_%s.csv' % (train_start_date, train_end_date, test_start_date, test_end_date)
1906 | if os.path.exists(dump_path):
1907 | actions = pd.read_csv(dump_path)
1908 | else:
1909 |
1910 | start_days=str(pd.to_datetime(train_end_date)-timedelta(days=30)).split(' ')[0]
1911 | actions_1 = get_actions(start_days, train_end_date)
1912 | actions=actions_1[actions_1['cate']==8][['user_id']].drop_duplicates(['user_id'])
1913 | # buy_actions = actions_1[(actions_1['type']==4)&(actions_1['cate']==8)][['user_id']].drop_duplicates()
1914 | # actions = actions[actions['user_id'].isin(buy_actions['user_id'])==False]
1915 |
1916 |
1917 |
1918 | # print (actions.shape)
1919 |
1920 | # start_days=str(pd.to_datetime(train_end_date)-timedelta(days=30)).split(' ')[0]
1921 | # actions_1 = get_actions(start_days, train_end_date)
1922 | # actions_1 = actions_1[(actions_1['type']==2)|(actions_1['type']==4)|(actions_1['type']==5)]
1923 | # actions_1=actions_1[actions_1['cate']==8][['user_id']].drop_duplicates(['user_id'])
1924 | # actions = pd.concat([actions,actions_1]).drop_duplicates(['user_id'])
1925 | print (actions.shape)
1926 | # start_days = train_start_date
1927 | start_days = "2016-02-01"
1928 | # actions = pd.merge(actions,get_basic_user_feat() , how='left', on='user_id')
1929 | print(actions.shape)
1930 |
1931 | # actions = pd.merge(actions, get_action_user_feat1(start_days, train_end_date), how='left', on='user_id')
1932 | # print(actions.shape)
1933 | actions = pd.merge(actions, get_action_user_feat2(start_days, train_end_date), how='left', on='user_id')
1934 | print(actions.shape)
1935 | actions = pd.merge(actions, get_action_user_feat5(start_days, train_end_date), how='left', on='user_id')
1936 | print(actions.shape)
1937 | actions = pd.merge(actions, get_action_user_feat6(start_days, train_end_date), how='left', on='user_id')
1938 | print(actions.shape)
1939 | actions = pd.merge(actions, get_action_user_feat6_six(start_days, train_end_date), how='left', on='user_id')
1940 | print(actions.shape)
1941 | actions = pd.merge(actions, get_action_user_feat7(start_days, train_end_date), how='left', on='user_id')
1942 | print(actions.shape)
1943 | actions = pd.merge(actions, get_action_user_feat8(start_days, train_end_date), how='left', on='user_id')
1944 | print (actions.shape)
1945 | actions = pd.merge(actions, get_action_user_feat8_2(start_days, train_end_date), how='left', on='user_id')
1946 | print (actions.shape)
1947 | actions = pd.merge(actions, get_action_user_feat9(start_days, train_end_date), how='left', on='user_id')
1948 | print (actions.shape)
1949 | actions = pd.merge(actions, get_action_user_feat10(start_days, train_end_date), how='left', on='user_id')
1950 | print (actions.shape)
1951 | actions = pd.merge(actions, get_action_user_feat12(train_start_date, train_end_date), how='left', on='user_id')
1952 | print (actions.shape)
1953 | actions = pd.merge(actions, get_action_user_feat14(train_start_date, train_end_date), how='left', on='user_id')
1954 | print (actions.shape)
1955 | actions = pd.merge(actions, get_action_user_feat15(start_days, train_end_date), how='left', on='user_id')
1956 | print (actions.shape)
1957 | actions = pd.merge(actions, get_action_user_feat16(start_days, train_end_date), how='left', on='user_id')
1958 | print (actions.shape)
1959 | actions = pd.merge(actions, get_action_u0513_feat16(start_days, train_end_date), how='left', on='user_id')
1960 | print (actions.shape)
1961 | actions = pd.merge(actions, user_features(user,ful_action,sub_action,train_end_date), how='left', on='user_id')
1962 | print (actions.shape)
1963 |
1964 | actions = pd.merge(actions, get_action_user_feat0515_2_1(train_start_date, train_end_date), how='left', on='user_id')
1965 | print (actions.shape)
1966 | actions = pd.merge(actions, get_action_user_feat0515_2_2(train_start_date, train_end_date), how='left', on='user_id')
1967 | print (actions.shape)
1968 |
1969 | actions = pd.merge(actions, get_action_u0509_feat_29(train_start_date, train_end_date), how='left', on='user_id')
1970 | # actions = pd.merge(actions, get_action_u0521_feat_32(train_start_date, train_end_date), how='left', on='user_id')
1971 |
1972 | # actions = pd.merge(actions, get_action_u0524_feat1(start_days, train_end_date), how='left', on='user_id')
1973 | # print (actions.shape)
1974 |
1975 | # actions = pd.merge(actions, get_action_u0524_feat2(start_days, train_end_date), how='left', on='user_id')
1976 | # print (actions.shape)
1977 | # actions = pd.merge(actions, get_action_u0524_feat3(start_days, train_end_date), how='left', on='user_id')
1978 | # print (actions.shape)
1979 | print (actions.shape)
1980 | for i in (1, 2, 3,7, 14, 28):
1981 | actions = pd.merge(actions, get_action_user_feat_six_xingwei(train_start_date, train_end_date, i), how='left',on='user_id')
1982 | actions = pd.merge(actions, deal_user_six_deal(train_start_date, train_end_date, i), how='left',on='user_id')
1983 | actions = pd.merge(actions, get_action_user_feat11(train_start_date, train_end_date, i), how='left',on='user_id')
1984 | actions = pd.merge(actions, get_action_user_feat13(train_start_date, train_end_date, i), how='left',on='user_id')
1985 | actions = pd.merge(actions, get_action_user_feat0509_1_30(train_start_date, train_end_date, i), how='left',on='user_id')
1986 | actions = pd.merge(actions, get_action_user_feat0515_2_3(train_start_date, train_end_date, i), how='left',on='user_id')
1987 | actions = pd.merge(actions, get_action_feat(train_start_date, train_end_date,i), how='left', on='user_id')
1988 | actions = pd.merge(actions, get_action_user_feat0515_2_4(train_start_date, train_end_date,i), how='left', on='user_id')
1989 | actions = pd.merge(actions, get_action_u0515_feat5(train_start_date, train_end_date,i), how='left', on='user_id')
1990 | actions = pd.merge(actions, get_action_u0509_feat_28(train_start_date, train_end_date,i), how='left', on='user_id')
1991 |             if i <= 10:
1992 | actions = pd.merge(actions,get_action_user_feat0509_1_31(train_start_date, train_end_date,i), how='left', on='user_id')
1993 | # actions = pd.merge(actions, get_action_u0521_feat_31(train_start_date, train_end_date,i), how='left', on='user_id')
1994 |
1995 | # actions = pd.merge(actions, get_action_user_feat7_0522_huachuang(train_start_date, train_end_date,i), how='left', on='user_id')
1996 | print(actions.shape)
1997 | actions = pd.merge(actions, get_user_labels(test_start_date, test_end_date), how='left', on='user_id')
1998 |
1999 | actions = actions.fillna(0)
2000 | print(actions.shape)
2001 | # user_id = actions[['user_id']]
2002 | # del actions['user_id']
2003 | # actions = actions.fillna(0)
2004 | # actions=actions.replace(np.inf,0)
2005 | # # print(actions.head())
2006 | # columns = actions.columns
2007 |
2008 | # min_max_scale = preprocessing.MinMaxScaler()
2009 | # actions=actions.replace(np.inf,0)
2010 | # actions = min_max_scale.fit_transform(actions.values)
2011 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
2012 | # actions.to_csv(dump_path,index=False)
2013 | return actions
2014 |
2015 | print("U model 1 finish part_3")
2016 |
2017 |
2018 |
2019 |
2020 |
2021 |
2022 | ###########################################################################################
2023 |
2024 |
2040 | # In[7]:
2041 |
2042 | #!/usr/bin/python
2043 |
2044 | import numpy as np
2045 | import xgboost as xgb
2046 | # from user_feat import *
2047 | from sklearn.model_selection import train_test_split
2048 |
2049 |
2050 | train_start_date = '2016-03-10'
2051 | train_end_date = '2016-04-11'
2052 | test_start_date = '2016-04-11'
2053 | test_end_date = '2016-04-16'
2054 |
2055 | # train_start_date='2016-03-05'
2056 | # train_end_date='2016-04-06'
2057 | # test_start_date='2016-04-06'
2058 | # test_end_date='2016-04-11'
2059 |
2060 | sub_start_date = '2016-03-15'
2061 | sub_end_date = '2016-04-16'
2062 |
2063 | # training data set
2064 | actions = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date,user,ful_action,sub_action)
2065 | # print(np.isinf(actions))
2066 | # print(np.isnan(actions))
2067 |
2068 | actions
2069 |
2070 |
2071 | feature_name = actions.columns.values
2072 |
2073 | # for index in feature_name[1:-1]:
2074 | # actions["r"+index]=actions[index].rank(method='max')/actions.shape[0]
2075 |
2076 | print(actions.shape)
2077 | actions_pos = actions[actions['label']==1]
2078 | actions_neg = actions[actions['label']==0]
2079 |
2080 |
2081 | print("+++++++++++++++++++++++")
2082 |
2083 |
2084 |
2085 | train,test=train_test_split(actions.values,test_size=0.2,random_state=0)
2086 | train=pd.DataFrame(train,columns=actions.columns)
2087 | test=pd.DataFrame(test,columns=actions.columns)
2088 |
2089 | X_train=train.drop(['user_id','label'],axis=1)
2090 | X_test=test.drop(['user_id','label'],axis=1)
2091 | y_train=train[['label']]
2092 | y_test=test[['label']]
2093 | train_index=train[['user_id']].copy()
2094 | test_index=test[['user_id']].copy()
2095 |
2096 |
2097 |
2098 |
2099 |
2100 | # test data set (for submission)
2101 | sub_test_data = make_test_set(sub_start_date, sub_end_date,user,ful_action,sub_action)
2102 |
2103 | feature_name = sub_test_data.columns.values
2104 | # for index in feature_name[1:]:
2105 | # sub_test_data["r"+index]=sub_test_data[index].rank(method='max')/sub_test_data.shape[0]
2106 |
2107 |
2108 | sub_trainning_data=sub_test_data.drop(['user_id'],axis=1)
2109 | sub_user_index=sub_test_data[['user_id']].copy()
2110 |
2111 | print("U model 1 finish part_4")
2112 |
2113 | ########################################################################
2114 |
2115 |
2121 | # In[9]:
2122 |
2123 | print ('==========>>>train xgboost model ....')
2124 |
2125 | dtrain = xgb.DMatrix(X_train,label=y_train)
2126 | dtest = xgb.DMatrix(X_test,label=y_test)
2127 | param = {'learning_rate' : 0.1,
2128 | 'n_estimators': 1000,
2129 | 'max_depth': 3,
2130 | 'min_child_weight': 5,
2131 | 'gamma': 0,
2132 | 'subsample': 1.0,
2133 | 'colsample_bytree': 0.8,
2134 | 'eta': 0.05,
2135 | 'silent': 1,
2136 |          'objective': 'binary:logistic',
2137 |          'scale_pos_weight': 1}
2139 |
2140 |
2141 |
2142 | num_round =120
2143 | plst = list(param.items())
2144 | plst += [('eval_metric', 'logloss')]
2145 |
2146 | evallist = [(dtest, 'eval'), (dtrain, 'train')]
2147 | bst=xgb.train(plst,dtrain,num_round,evallist,early_stopping_rounds=10)
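     | # NOTE: xgboost applies early stopping to the last set in evallist, i.e. (dtrain, 'train')
     | # here; to stop on validation logloss instead, put (dtest, 'eval') last in evallist.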
2148 |
2149 |
2150 |
2151 |
2152 | # ============================================>>>>
2153 | print ('==========>>>predict test data label')
2154 |
2155 |
2156 | sub_trainning_data_1 = xgb.DMatrix(sub_trainning_data)
2157 | y = bst.predict(sub_trainning_data_1)
2158 | sub_user_index['label'] = y
2160 |
2161 | # print(sub_user_index.head())
2162 |
2163 | pred=sub_user_index
2164 | #pred.sort_values(by=['user_id','label'],ascending=[0,0],inplace=True)
2165 | pred=pred.sort_values(by=['user_id','label'],ascending=[0,0])
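     | # rows are sorted by score descending within each user, so first() keeps the highest-score row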
2166 | pred = pred.groupby('user_id').first().reset_index()
2167 | result=pred.sort_values(by=['label'],ascending=[0])
2168 | result['user_id']=result['user_id'].astype('int')
2169 |
2170 |
2171 | result.to_csv('./sub/Umodel_1.csv', index=False, index_label=False)
2172 |
2173 | print("U model 1 finish part_5")
2174 |
2175 |
--------------------------------------------------------------------------------
/Umodel_2.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # In[1]:
5 |
6 | #!/usr/bin/env python
7 |
8 | import time
9 |
10 | from datetime import datetime
11 | from datetime import timedelta
12 | import pandas as pd
13 | import pickle
14 | import os
15 | import math
16 | import numpy as np
17 | from sklearn import preprocessing
18 | import matplotlib.pyplot as plt
19 |
20 | action_1_path = "./data/JData_Action_201602.csv"
21 | action_2_path = "./data/JData_Action_201603.csv"
22 | action_3_path = "./data/JData_Action_201604.csv"
23 | user_path = "./data/JData_User.csv"
24 | product_path = "./data/JData_Product.csv"
25 |
26 |
27 | def convert_age(age_str):
28 | if age_str == u'-1':
29 | return 0
30 | elif age_str == u'15岁以下':
31 | return 1
32 | elif age_str == u'16-25岁':
33 | return 2
34 | elif age_str == u'26-35岁':
35 | return 3
36 | elif age_str == u'36-45岁':
37 | return 4
38 | elif age_str == u'46-55岁':
39 | return 5
40 | elif age_str == u'56岁以上':
41 | return 6
42 | else:
43 | return -1
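   | # e.g. convert_age(u'26-35岁') -> 3; unrecognized strings map to -1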
44 |
45 |
46 | # basic user profile (one-hot age / sex / user level)
47 | def get_basic_user_feat():
48 | dump_path = './cache/basic_user.csv'
49 | if os.path.exists(dump_path):
50 | user = pd.read_csv(dump_path)
51 | else:
52 | user = pd.read_csv(user_path, encoding='gbk')
53 | user['age'] = user['age'].map(convert_age)
54 | age_df = pd.get_dummies(user["age"], prefix="age")
55 | sex_df = pd.get_dummies(user["sex"], prefix="sex")
56 | user_lv_df = pd.get_dummies(user["user_lv_cd"], prefix="user_lv_cd")
57 | user = pd.concat([user['user_id'], age_df, sex_df, user_lv_df], axis=1)
58 | user.to_csv(dump_path, index=False)
59 | return user
60 |
61 | # basic product info (one-hot a1 / a2 / a3 attributes)
62 | def get_basic_product_feat():
63 | dump_path = './cache/basic_product.csv'
64 | if os.path.exists(dump_path):
65 | product = pd.read_csv(dump_path)
66 | else:
67 | product = pd.read_csv(product_path)
68 | attr1_df = pd.get_dummies(product["a1"], prefix="a1")
69 | attr2_df = pd.get_dummies(product["a2"], prefix="a2")
70 | attr3_df = pd.get_dummies(product["a3"], prefix="a3")
71 | product = pd.concat([product[['sku_id', 'cate', 'brand']], attr1_df, attr2_df, attr3_df], axis=1)
72 | product.to_csv(dump_path, index=False)
73 | return product
74 |
75 | def get_actions_1():
76 | action = pd.read_csv(action_1_path)
77 | return action
78 |
79 |
80 | def get_actions_2():
81 | action2 = pd.read_csv(action_2_path)
82 | return action2
83 |
84 |
85 | def get_actions_3():
86 | action3 = pd.read_csv(action_3_path)
87 | return action3
88 |
89 | def sub_get_actions(start_date,end_date):
90 | dump_path = './cache/sub_action_%s_%s.csv' % (start_date, end_date)
91 | if os.path.exists(dump_path):
92 | actions = pd.read_csv(dump_path)
93 | else:
94 | actions=get_actions(start_date,end_date)
95 | actions=actions[actions['cate']==8]
96 | actions.to_csv(dump_path,index=False)
97 | return actions
98 |
99 | # action (behavior) records
100 | def get_actions(start_date, end_date):
101 | """
102 |
103 | :param start_date:
104 | :param end_date:
105 | :return: actions: pd.Dataframe
106 | """
107 | dump_path = './cache/all_action_%s_%s.csv' % (start_date, end_date)
108 | if os.path.exists(dump_path):
109 | actions = pd.read_csv(dump_path)
110 | else:
111 | action_1 = get_actions_1()
112 | action_1 = action_1[(action_1.time >= start_date) & (action_1.time < end_date)]
113 | action_2 = get_actions_2()
114 | action_2 = action_2[(action_2.time >= start_date) & (action_2.time < end_date)]
115 | actions = pd.concat([action_1, action_2])
116 | action_3 = get_actions_3()
117 | action_3 = action_3[(action_3.time >= start_date) & (action_3.time < end_date)]
118 | actions = pd.concat([actions, action_3]) # type: pd.DataFrame
119 | actions = actions[(actions.time >= start_date) & (actions.time < end_date)]
120 | actions.to_csv(dump_path, index=False)
121 | # actions['user_id']=actions['user_id'].astype('int')
122 | return actions
123 |
124 | # number of whole days between a timestamp and end_date
125 | def get_day_chaju(x, end_date):
126 | # x=x.split(' ')[0]
127 | x = datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
128 | end_date = datetime.strptime(end_date, '%Y-%m-%d')
129 | return (end_date - x).days
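    | # e.g. get_day_chaju('2016-04-10 12:00:00', '2016-04-16') -> 5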
130 |
131 |
132 |
133 |
134 | # # total of all actions per (user, sku)
135 | # def get_action_feat(start_date, end_date):
136 | # dump_path = './cache/action_%s_%s.csv' % (start_date, end_date)
137 | # if os.path.exists(dump_path):
138 | # actions = pd.read_csv(dump_path)
139 | # else:
140 | # actions = get_actions(start_date, end_date)
141 | # actions = actions[['user_id', 'sku_id', 'type']]
142 | # df = pd.get_dummies(actions['type'], prefix='action')
143 | # actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame
144 | # actions = actions.groupby(['user_id', 'sku_id'], as_index=False).sum()
145 | # del actions['type']
146 | # actions.to_csv(dump_path, index=False)
147 | # return actions
148 | # action-count totals over the last k days (sliding window)
149 |
150 | #user_id,u_action_1_1,u_action_1_2,u_action_1_3,u_action_1_4,u_action_1_5,u_action_1_6
151 | def get_action_feat(start_date, end_date,k):
152 | dump_path = './cache/u_action_%s_%s_%s.csv' % (start_date, end_date,k)
153 | if os.path.exists(dump_path):
154 | actions = pd.read_csv(dump_path)
155 | else:
156 | start_days=pd.to_datetime(end_date)-timedelta(days=k)
157 | start_days=str(start_days).split(' ')[0]
158 | actions = get_actions(start_days, end_date)
159 | actions = actions[['user_id', 'type']]
160 | df = pd.get_dummies(actions['type'], prefix='type')
161 | actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame
162 | actions = actions.groupby('user_id', as_index=False).sum()
163 | min_max_scaler = preprocessing.MinMaxScaler()
164 | df = min_max_scaler.fit_transform(actions.drop(['user_id','type'],axis=1).values)
165 | df = pd.DataFrame(df)
166 | df.columns=['u_action_'+str(k)+'_'+str(i) for i in range(1,df.shape[1]+1)]
167 | actions = pd.concat([actions[['user_id']], df], axis=1)
168 | actions.to_csv(dump_path, index=False)
169 | return actions
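    | # e.g. get_action_feat('2016-02-01', '2016-04-11', 3) returns per-user, min-max scaled
    | # counts of the six action types over the 3 days before 2016-04-11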
170 |
171 |
172 |
173 |
174 |
175 |
176 | # user action conversion ratios (purchases relative to the other action types)
177 | def get_action_user_feat1(start_date, end_date):
178 | feature = ['user_id', 'user_action_1_ratio', 'user_action_2_ratio', 'user_action_3_ratio',
179 | 'user_action_5_ratio', 'user_action_6_ratio']
180 | dump_path = './cache/user_feat_accumulate_xiugai_%s_%s.csv' % (start_date, end_date)
181 | if os.path.exists(dump_path):
182 | actions = pd.read_csv(dump_path)
183 | else:
184 | actions = get_actions(start_date, end_date)
185 | df = pd.get_dummies(actions['type'], prefix='action')
186 | actions = pd.concat([actions['user_id'], df], axis=1)
187 | actions = actions.groupby(['user_id'], as_index=False).sum()
188 | actions['user_action_1_ratio'] = actions['action_4'] / actions['action_1']
189 | actions['user_action_2_ratio'] = actions['action_4'] / actions['action_2']
190 | # actions['user_action_3_ratio'] = actions['action_4'] / actions['action_3']
191 | actions['user_action_3_ratio'] = actions['action_3'] / actions['action_2']
192 | actions['user_action_5_ratio'] = actions['action_4'] / actions['action_5']
193 | actions['user_action_6_ratio'] = actions['action_4'] / actions['action_6']
194 |         # type 3 = cart deletion
195 | actions = actions[feature]
196 | actions.to_csv(dump_path, index=False)
197 | return actions
198 |
199 |
200 | # print get_accumulate_user_feat('2016-03-10','2016-04-11')
201 | # visit-days before purchase
202 | # visit-days per purchase / add-to-cart / follow
203 | def get_action_user_feat2(start_date, end_date):
204 | dump_path = './cache/user_feat2_after_%s_%s.csv' % (start_date, end_date)
205 | if os.path.exists(dump_path):
206 | actions = pd.read_csv(dump_path)
207 |
208 | else:
209 |         # visit-days per purchase
210 | def user_feat_2_1(start_date, end_date):
211 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']]
212 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0])
213 | # actions=actions.drop_duplicates(['user_id','time'],keep='first')
214 | visit = actions[actions['type'] == 1]
215 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first')
216 | del visit['time']
217 | del actions['time']
218 | visit = visit.groupby('user_id', as_index=False).count()
219 | visit.columns = ['user_id', 'visit']
220 | buy = actions[actions['type'] == 4]
221 | buy = buy.groupby('user_id', as_index=False).count()
222 | buy.columns = ['user_id', 'buy']
223 | actions = pd.merge(visit, buy, on='user_id', how='left')
224 | actions['visit_day_before_buy'] = actions['visit'] / actions['buy']
225 | del actions['buy']
226 | del actions['visit']
227 | return actions
228 |
229 |         # visit-days per add-to-cart
230 | def user_feat_2_2(start_date, end_date):
231 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']]
232 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0])
233 | # actions=actions.drop_duplicates(['user_id','time'],keep='first')
234 | visit = actions[actions['type'] == 1]
235 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first')
236 | del visit['time']
237 | del actions['time']
238 | visit = visit.groupby('user_id', as_index=False).count()
239 | visit.columns = ['user_id', 'visit']
240 | addtoshopping = actions[actions['type'] == 2]
241 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count()
242 | addtoshopping.columns = ['user_id', 'addtoshopping']
243 | actions = pd.merge(visit, addtoshopping, on='user_id', how='left')
244 | actions['visit_day_before_addtoshopping'] = actions['visit'] / actions['addtoshopping']
245 | del actions['addtoshopping']
246 | del actions['visit']
247 | return actions
248 |
249 |         # visit-days per follow
250 | def user_feat_2_3(start_date, end_date):
251 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']]
252 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0])
253 | # actions=actions.drop_duplicates(['user_id','time'],keep='first')
254 | visit = actions[actions['type'] == 1]
255 | visit = visit.drop_duplicates(['user_id', 'time'], keep='first')
256 | del visit['time']
257 | del actions['time']
258 | visit = visit.groupby('user_id', as_index=False).count()
259 | visit.columns = ['user_id', 'visit']
260 | guanzhu = actions[actions['type'] == 5]
261 | guanzhu = guanzhu.groupby('user_id', as_index=False).count()
262 | guanzhu.columns = ['user_id', 'guanzhu']
263 | actions = pd.merge(visit, guanzhu, on='user_id', how='left')
264 | actions['visit_day_before_guanzhu'] = actions['visit'] / actions['guanzhu']
265 | del actions['guanzhu']
266 | del actions['visit']
267 | return actions
268 |
269 |         # add-to-cart days per purchase
270 | def user_feat_2_4(start_date, end_date):
271 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']]
272 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0])
273 | # actions=actions.drop_duplicates(['user_id','time'],keep='first')
274 | addtoshopping = actions[actions['type'] == 2]
275 | addtoshopping = addtoshopping.drop_duplicates(['user_id', 'time'], keep='first')
276 | del addtoshopping['time']
277 | del actions['time']
278 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count()
279 | addtoshopping.columns = ['user_id', 'addtoshopping']
280 | buy = actions[actions['type'] == 4]
281 | buy = buy.groupby('user_id', as_index=False).count()
282 | buy.columns = ['user_id', 'buy']
283 | actions = pd.merge(addtoshopping, buy, on='user_id', how='left')
284 | actions['addtoshopping_day_before_buy'] = actions['addtoshopping'] / actions['buy']
285 | del actions['buy']
286 | del actions['addtoshopping']
287 | return actions
288 |
289 |         # follow days per purchase
290 | def user_feat_2_5(start_date, end_date):
291 | actions = get_actions(start_date, end_date)[['user_id', 'type', 'time']]
292 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0])
293 | guanzhu = actions[actions['type'] == 5]
294 | guanzhu = guanzhu.drop_duplicates(['user_id', 'time'], keep='first')
295 | del guanzhu['time']
296 | del actions['time']
297 | guanzhu = guanzhu.groupby('user_id', as_index=False).count()
298 | guanzhu.columns = ['user_id', 'guanzhu']
299 | buy = actions[actions['type'] == 4]
300 | buy = buy.groupby('user_id', as_index=False).count()
301 | buy.columns = ['user_id', 'buy']
302 | actions = pd.merge(guanzhu, buy, on='user_id', how='left')
303 | actions['guanzhu_day_before_buy'] = actions['guanzhu'] / actions['buy']
304 | del actions['buy']
305 | del actions['guanzhu']
306 | return actions
307 |
308 | actions = pd.merge(user_feat_2_1(start_date, end_date), user_feat_2_2(start_date, end_date), on='user_id',
309 | how='outer')
310 | actions = pd.merge(actions, user_feat_2_3(start_date, end_date), on='user_id', how='outer')
311 | actions = pd.merge(actions, user_feat_2_4(start_date, end_date), on='user_id', how='outer')
312 | actions = pd.merge(actions, user_feat_2_5(start_date, end_date), on='user_id', how='outer')
313 | user_id = actions['user_id']
314 | del actions['user_id']
315 | actions = actions.fillna(0)
316 | min_max_scale = preprocessing.MinMaxScaler()
317 | actions = min_max_scale.fit_transform(actions.values)
318 | actions = pd.concat([user_id, pd.DataFrame(actions)], axis=1)
319 | actions.to_csv(dump_path, index=False)
320 | actions.columns = ['user_id'] + ['u_feat2_' + str(i) for i in range(1, actions.shape[1])]
321 | return actions
322 |
323 |
324 |
325 |
326 | # # total number of distinct skus a user touched
327 | # def get_action_user_feat5(start_date, end_date):
328 | # dump_path = './cache/user_feat5_%s_%s.csv' % (start_date, end_date)
329 | # if os.path.exists(dump_path):
330 | # actions = pd.read_csv(dump_path)
331 | # else:
332 | # actions = get_actions(start_date, end_date)[['user_id', 'sku_id']]
333 | # actions = actions.drop_duplicates(['user_id', 'sku_id'], keep='first')
334 | # actions = actions.groupby('user_id', as_index=False).count()
335 | # actions.columns = ['user_id', 'sku_num']
336 | # actions['sku_num'] = actions['sku_num'].astype('float')
337 | # actions['sku_num'] = actions['sku_num'].map(
338 | # lambda x: (x - actions['sku_num'].min()) / (actions['sku_num'].max() - actions['sku_num'].min()))
339 | # actions.to_csv(dump_path, index=False)
340 | # actions.columns = ['user_id'] + ['u_feat5_' + str(i) for i in range(1, actions.shape[1])]
341 | # return actions
342 |
343 |
344 | # average interval (days) between a user's active days
345 | def get_action_user_feat6(start_date, end_date):
346 | dump_path = './cache/user_feat6_%s_%s.csv' % (start_date, end_date)
347 | if os.path.exists(dump_path):
348 | actions = pd.read_csv(dump_path)
349 | else:
350 |
351 | df = get_actions(start_date, end_date)[['user_id', 'time']]
352 | # df['user_id']=df['user_id'].astype('int')
353 | df['time'] = df['time'].map(lambda x: x.split(' ')[0])
354 | df = df.drop_duplicates(['user_id', 'time'], keep='first')
355 | df['time'] = df['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d'))
356 |         actions = df.groupby('user_id')['time'].agg(lambda x: x.diff().mean()).reset_index()
357 |         actions['avg_visit'] = actions['time'].dt.days
358 | del actions['time']
359 | actions.to_csv(dump_path, index=False)
360 | actions.columns = ['user_id'] + ['u_feat6_' + str(i) for i in range(1, actions.shape[1])]
361 | return actions
362 |
363 |
364 | # average interval between actions, per action type
365 | def get_action_user_feat6_six(start_date, end_date):
366 | dump_path = './cache/user_feat6_six_%s_%s.csv' % (start_date, end_date)
367 | if os.path.exists(dump_path):
368 | actions = pd.read_csv(dump_path)
369 | else:
370 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']]
371 | df['time'] = df['time'].map(lambda x: (-1) * get_day_chaju(x, start_date))
372 | df = df.drop_duplicates(['user_id', 'time', 'type'], keep='first')
373 | actions = df.groupby(['user_id', 'type']).agg(lambda x: np.diff(x).mean())
374 | actions = actions.unstack()
375 | actions.columns = list(range(actions.shape[1]))
376 | actions = actions.reset_index()
377 | actions.to_csv(dump_path, index=False)
378 | actions.columns = ['user_id'] + ['u_feat6_six_' + str(i) for i in range(1, actions.shape[1])]
379 | return actions
380 |
381 |
382 | # action frequency per type: count / hours between first and last action
383 | def get_action_user_feat7(start_date, end_date):
384 | dump_path = './cache/user_feat7_six_%s_%s.csv' % (start_date, end_date)
385 | if os.path.exists(dump_path):
386 | actions = pd.read_csv(dump_path)
387 | else:
388 | df = get_actions(start_date, end_date)[['user_id', 'type', 'time']]
389 | actions = df.groupby(['user_id', 'type'], as_index=False).count()
390 |
391 | time_min = df.groupby(['user_id', 'type'], as_index=False).min()
392 | time_max = df.groupby(['user_id', 'type'], as_index=False).max()
393 |
394 | time_cha = pd.merge(time_max, time_min, on=['user_id', 'type'], how='left')
395 | time_cha['time_x'] = time_cha['time_x'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
396 | time_cha['time_y'] = time_cha['time_y'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
397 |
398 |         # hours (+1) between the first and last action of each (user, type) pair
399 |         delta = time_cha['time_x'] - time_cha['time_y']
400 |         time_cha['cha_hour'] = 1 + delta.dt.days * 24 + delta.dt.seconds // 3600
401 | del time_cha['time_x']
402 | del time_cha['time_y']
403 | # time_cha=time_cha.fillna(1)
404 |
405 | actions = pd.merge(time_cha, actions, on=['user_id', 'type'], how="left")
406 | actions = actions.groupby(['user_id', 'type']).sum()
407 | actions['cnt/time'] = actions['time'] / actions["cha_hour"]
408 | actions = actions.unstack()
409 | actions.columns = list(range(actions.shape[1]))
410 | actions = actions.reset_index()
411 | actions = actions.fillna(0)
412 | actions.to_csv(dump_path, index=False)
413 | actions.columns = ['user_id'] + ['u_feat7_' + str(i) for i in range(1, actions.shape[1])]
414 | return actions
415 |
416 |
417 | def user_top_k_0_1(start_date, end_date):
418 | actions = get_actions(start_date, end_date)
419 | actions = actions[['user_id', 'sku_id', 'type']]
420 | df = pd.get_dummies(actions['type'], prefix='%s-%s-action' % (start_date, end_date))
421 | actions = pd.concat([actions, df], axis=1) # type: pd.DataFrame
422 | actions = actions.groupby('user_id', as_index=False).sum()
423 | del actions['type']
424 | del actions['sku_id']
425 | user_id = actions['user_id']
426 | del actions['user_id']
427 | actions = actions.applymap(lambda x: 1 if x > 0 else 0)
428 | actions = pd.concat([user_id, actions], axis=1)
429 | return actions
430 |
431 |
432 | # 0/1 flags for each action type over the last K days
433 | def get_action_user_feat8(start_date, end_date):
434 | dump_path = './cache/user_feat8_%s_%s.csv' % (start_date, end_date)
435 | if os.path.exists(dump_path):
436 | actions = pd.read_csv(dump_path)
437 | else:
438 | actions = None
439 | for i in (1, 2, 3, 4, 5, 6, 7, 15, 30):
440 | print(i)
441 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=i)
442 | start_days = start_days.strftime('%Y-%m-%d')
443 | if actions is None:
444 | actions = user_top_k_0_1(start_days, end_date)
445 | else:
446 | actions = pd.merge(actions, user_top_k_0_1(start_days, end_date), how='outer', on='user_id')
447 | actions.to_csv(dump_path, index=False)
448 | actions.columns = ['user_id'] + ['u_feat8_' + str(i) for i in range(1, actions.shape[1])]
449 | return actions
450 |
451 |
452 | # user repeat-purchase rate (skus bought more than once / distinct skus bought)
453 | def get_action_user_feat8_2(start_date, end_date):
454 | dump_path = './cache/product_feat8_2_%s_%s.csv' % (start_date, end_date)
455 | if os.path.exists(dump_path):
456 | actions = pd.read_csv(dump_path)
457 | else:
458 | df = get_actions(start_date, end_date)[['user_id', 'sku_id', 'type']]
459 | df = df[df['type'] == 4] # 购买的行为
460 | df = df.groupby(['user_id', 'sku_id'], as_index=False).count()
461 | df.columns = ['user_id', 'sku_id', 'count1']
462 | df['count1'] = df['count1'].map(lambda x: 1 if x > 1 else 0)
463 | grouped = df.groupby(['user_id'], as_index=False)
464 | actions = grouped.count()[['user_id', 'count1']]
465 | actions.columns = ['user_id', 'count']
466 | re_count = grouped.sum()[['user_id', 'count1']]
467 | re_count.columns = ['user_id', 're_count']
468 | actions = pd.merge(actions, re_count, on='user_id', how='left')
469 | re_buy_rate = actions['re_count'] / actions['count']
470 | actions = pd.concat([actions['user_id'], re_buy_rate], axis=1)
471 | actions.columns = ['user_id', 're_buy_rate']
472 | actions.to_csv(dump_path, index=False)
473 | actions.columns = ['user_id'] + ['u_feat8_2_' + str(i) for i in range(1, actions.shape[1])]
474 | return actions
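    | # e.g. a user who bought 4 distinct skus, one of them more than once -> re_buy_rate = 1/4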
475 |
476 |
477 | # days from the most recent action of each type to end_date
478 | def get_action_user_feat9(start_date, end_date):
479 | dump_path = './cache/user_feat9_%s_%s.csv' % (start_date, end_date)
480 | if os.path.exists(dump_path):
481 | actions = pd.read_csv(dump_path)
482 | else:
483 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']]
484 | # df['time'] = df['time'].map(lambda x: (-1)*get_day_chaju(x,start_date))
485 | df = df.drop_duplicates(['user_id', 'type'], keep='last')
486 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) + 1)
487 | actions = df.groupby(['user_id', 'type']).sum()
488 | actions = actions.unstack()
489 | actions.columns = list(range(actions.shape[1]))
490 | actions = actions.reset_index()
491 | actions = actions.fillna(30)
492 | actions.to_csv(dump_path, index=False)
493 | actions.columns = ['user_id'] + ['u_feat9_' + str(i) for i in range(1, actions.shape[1])]
494 | return actions
495 |
496 |
497 | # action counts on the most recent active day per type, min-max normalized
498 | def get_action_user_feat10(start_date, end_date):
499 | dump_path = './cache/user_feat10_%s_%s.csv' % (start_date, end_date)
500 | if os.path.exists(dump_path):
501 | actions = pd.read_csv(dump_path)
502 | else:
503 |
504 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']]
505 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) + 1)
506 |
507 | idx = df.groupby(['user_id', 'type'])['time'].transform(min)
508 | idx1 = idx == df['time']
509 | actions = df[idx1].groupby(["user_id", "type"]).count()
510 | actions = actions.unstack()
511 | actions.columns = list(range(actions.shape[1]))
512 | actions = actions.fillna(0)
513 | actions = actions.reset_index()
514 |
515 | user_sku = actions[['user_id']]
516 | del actions['user_id']
517 | min_max_scaler = preprocessing.MinMaxScaler()
518 | actions = min_max_scaler.fit_transform(actions.values)
519 | actions = pd.DataFrame(actions)
520 | actions = pd.concat([user_sku, actions], axis=1)
521 |
522 | actions.to_csv(dump_path, index=False)
523 | actions.columns = ['user_id'] + ['u_feat10_' + str(i) for i in range(1, actions.shape[1])]
524 | return actions
525 |
526 |
527 | # action-type counts in the user's most recent n-day bucket
528 | def get_action_user_feat11(start_date, end_date, n):
529 | dump_path = './cache/user_feat11_%s_%s_%s.csv' % (start_date, end_date, n)
530 | if os.path.exists(dump_path):
531 | actions = pd.read_csv(dump_path)
532 | else:
533 |
534 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']]
535 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n)
536 | df = df[df['time'] == 0]
537 | del df['time']
538 | temp = pd.get_dummies(df['type'], prefix='type')
539 | del df['type']
540 | actions = pd.concat([df, temp], axis=1)
541 | actions = actions.groupby(['user_id'], as_index=False).sum()
542 | user_sku = actions[['user_id']]
543 | del actions['user_id']
544 | min_max_scaler = preprocessing.MinMaxScaler()
545 | actions = min_max_scaler.fit_transform(actions.values)
546 | actions = pd.DataFrame(actions)
547 | actions = pd.concat([user_sku, actions], axis=1)
548 | actions.to_csv(dump_path, index=False)
549 | actions.columns = ['user_id'] + ['u_feat11_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
550 | return actions
551 |
552 |
553 | def get_action_user_feat12(start_date, end_date):
554 | dump_path = './cache/user_feat12_%s_%s.csv' % (start_date, end_date)
555 | if os.path.exists(dump_path):
556 | actions = pd.read_csv(dump_path)
557 | else:
558 | actions = get_actions(start_date, end_date)[['user_id', 'time', 'type']]
559 | actions['time'] = actions['time'].map(lambda x: x.split(' ')[0])
560 | actions = actions.drop_duplicates(['user_id', 'time', 'type'], keep='first')
561 | actions['day'] = actions['time'].map(
562 | lambda x: (datetime.strptime(end_date, '%Y-%m-%d') - datetime.strptime(x, '%Y-%m-%d')).days)
563 | result = None
564 |         for i in (2, 3, 7, 14, 28):  # bucket size in days
565 | print ('i%s' % i)
566 | actions['level%s' % i] = actions['day'].map(lambda x: x // i)
567 | a=set(actions['level%s' % i].tolist())
568 |             for j in (1, 2, 3, 4, 5, 6):  # action type
569 | print ('j%s' % j)
570 | df = actions[actions['type'] == j][['user_id', 'level%s' % i, 'time']]
571 | df = df.groupby(['user_id', 'level%s' % i]).count()
572 | df = df.unstack()
573 | b=df.columns.levels[1].tolist()
574 | df.columns = ['u_feat12_' + str('level%s_' % i) + str(j) + '_' + str(k) for k in df.columns.levels[1].tolist()]
575 | if len(list(a-set(b)))!=0:
576 | c=list(a-set(b))
577 | for k in c:
578 | df['u_feat12_'+str('level%s_' % i)+str(j)+'_'+ str(k)]=0
579 |                 # order the bucket columns numerically by their index suffix
580 |                 columns = sorted(df.columns, key=lambda col: int(col.split('_')[-1]))
581 |                 df = df[columns]
587 | df = df.reset_index()
588 | if result is None:
589 | result = df
590 | else:
591 | result = pd.merge(result, df, on='user_id', how='left')
592 | columns = result.columns
593 | user_id = result['user_id']
594 | del result['user_id']
595 | actions = result.fillna(0)
596 |
597 | min_max_scaler = preprocessing.MinMaxScaler()
598 | actions = min_max_scaler.fit_transform(actions.values)
599 | actions = pd.DataFrame(actions)
600 | actions = pd.concat([user_id, actions], axis=1)
601 | actions.columns=columns
602 | actions.to_csv(dump_path, index=False)
603 | return actions
604 |
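     | # Sketch of the level-bucketing idea in get_action_user_feat12 (hypothetical data):
     | # day offsets are mapped into buckets of width i, and buckets a user never hit are
     | # added back as all-zero columns so every user row has the same feature layout.
     | def _demo_level_buckets(i=3):
     |     import pandas as pd
     |     df = pd.DataFrame({'user_id': [1, 1, 2], 'day': [0, 7, 2]})
     |     df['level'] = df['day'] // i  # bucket index: 0 = most recent i days
     |     counts = df.groupby(['user_id', 'level']).size().unstack(fill_value=0)
     |     for k in range(df['day'].max() // i + 1):  # restore empty buckets
     |         if k not in counts.columns:
     |             counts[k] = 0
     |     return counts[sorted(counts.columns)].reset_index()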
605 |
606 |
607 | # Number of n-day buckets in which each action type occurred
608 | def get_action_user_feat13(start_date, end_date, n):
609 | dump_path = './cache/user_feat13_%s_%s_%s.csv' % (start_date, end_date, n)
610 | if os.path.exists(dump_path):
611 | actions = pd.read_csv(dump_path)
612 | else:
613 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']]
614 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n)
615 | df = df.drop_duplicates(['user_id', 'type', 'time'], keep='first')
616 | actions = df.groupby(['user_id', 'type']).count()
617 | actions = actions.unstack()
618 | actions.columns = list(range(actions.shape[1]))
619 | actions = actions.fillna(0)
620 | actions = actions.reset_index()
621 | user_sku = actions[['user_id']]
622 | del actions['user_id']
623 | min_max_scaler = preprocessing.MinMaxScaler()
624 | actions = min_max_scaler.fit_transform(actions.values)
625 | actions = pd.DataFrame(actions)
626 | actions = pd.concat([user_sku, actions], axis=1)
627 | actions.to_csv(dump_path, index=False)
628 | actions.columns = ['user_id'] + ['u_feat13_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
629 | return actions
630 |
631 |
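     | # Purchase (type 4) counts per 5-day bucket, min-max scaled per column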
632 | def get_action_user_feat14(start_date, end_date):
633 | dump_path = './cache/user_feat14_%s_%s.csv' % (start_date, end_date)
634 | if os.path.exists(dump_path):
635 | actions = pd.read_csv(dump_path)
636 | else:
637 | n = 5
638 | df = get_actions(start_date, end_date)[['user_id', 'time', 'type']]
639 | df = df[df['type'] == 4][['user_id', 'time']]
640 | df['time'] = df['time'].map(lambda x: get_day_chaju(x, end_date) // n)
641 | days = np.max(df['time'])
642 |
643 | df['cnt'] = 0
644 | actions = df.groupby(['user_id', 'time']).count()
645 |
646 | actions = actions.unstack()
647 |
648 | actions.columns = list(range(actions.shape[1]))
649 | actions = actions.reset_index()
650 |
651 | actions = actions.fillna(0)
652 | user_sku = actions[['user_id']]
653 | del actions['user_id']
654 | min_max_scaler = preprocessing.MinMaxScaler()
655 | actions = min_max_scaler.fit_transform(actions.values)
656 | actions = pd.DataFrame(actions)
657 | actions = pd.concat([user_sku, actions], axis=1)
658 | actions.to_csv(dump_path, index=False)
659 | actions.columns = ['user_id'] + ['u_feat14_' + str(i) for i in range(1, actions.shape[1])]
660 | return actions
661 |
662 |
663 | # Visits per purchase / add-to-cart / follow (how many views precede each action)
664 | def get_action_user_feat15(start_date, end_date):
665 | dump_path = './cache/user_feat15_%s_%s.csv' % (start_date, end_date)
666 | if os.path.exists(dump_path):
667 | actions = pd.read_csv(dump_path)
668 | else:
669 |         # visits per purchase
670 | def user_feat_15_1(start_date, end_date):
671 | actions = get_actions(start_date, end_date)[['user_id', 'type']]
672 | visit = actions[actions['type'] == 1]
673 | visit = visit.groupby('user_id', as_index=False).count()
674 | visit.columns = ['user_id', 'visit']
675 | buy = actions[actions['type'] == 4]
676 | buy = buy.groupby('user_id', as_index=False).count()
677 | buy.columns = ['user_id', 'buy']
678 | actions = pd.merge(visit, buy, on='user_id', how='left')
679 | actions['visit_num_before_buy'] = actions['visit'] / actions['buy']
680 | del actions['buy']
681 | del actions['visit']
682 | return actions
683 |
684 |         # visits per add-to-cart
685 | def user_feat_15_2(start_date, end_date):
686 | actions = get_actions(start_date, end_date)[['user_id', 'type']]
687 | visit = actions[actions['type'] == 1]
688 | visit = visit.groupby('user_id', as_index=False).count()
689 | visit.columns = ['user_id', 'visit']
690 | addtoshopping = actions[actions['type'] == 2]
691 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count()
692 | addtoshopping.columns = ['user_id', 'addtoshopping']
693 | actions = pd.merge(visit, addtoshopping, on='user_id', how='left')
694 | actions['visit_num_before_addtoshopping'] = actions['visit'] / actions['addtoshopping']
695 | del actions['addtoshopping']
696 | del actions['visit']
697 | return actions
698 |
699 |         # visits per follow
700 | def user_feat_15_3(start_date, end_date):
701 | actions = get_actions(start_date, end_date)[['user_id', 'type']]
702 | visit = actions[actions['type'] == 1]
703 | visit = visit.groupby('user_id', as_index=False).count()
704 | visit.columns = ['user_id', 'visit']
705 | guanzhu = actions[actions['type'] == 5]
706 | guanzhu = guanzhu.groupby('user_id', as_index=False).count()
707 | guanzhu.columns = ['user_id', 'guanzhu']
708 | actions = pd.merge(visit, guanzhu, on='user_id', how='left')
709 | actions['visit_num_before_guanzhu'] = actions['visit'] / actions['guanzhu']
710 | del actions['guanzhu']
711 | del actions['visit']
712 | return actions
713 |
714 |         # add-to-carts per purchase
715 | def user_feat_15_4(start_date, end_date):
716 | actions = get_actions(start_date, end_date)[['user_id', 'type']]
717 | addtoshopping = actions[actions['type'] == 2]
718 | addtoshopping = addtoshopping.groupby('user_id', as_index=False).count()
719 | addtoshopping.columns = ['user_id', 'addtoshopping']
720 | buy = actions[actions['type'] == 4]
721 | buy = buy.groupby('user_id', as_index=False).count()
722 | buy.columns = ['user_id', 'buy']
723 | actions = pd.merge(addtoshopping, buy, on='user_id', how='left')
724 | actions['addtoshopping_num_before_buy'] = actions['addtoshopping'] / actions['buy']
725 | del actions['buy']
726 | del actions['addtoshopping']
727 | return actions
728 |
729 |         # follows per purchase
730 | def user_feat_15_5(start_date, end_date):
731 | actions = get_actions(start_date, end_date)[['user_id', 'type']]
732 | guanzhu = actions[actions['type'] == 5]
733 | guanzhu = guanzhu.groupby('user_id', as_index=False).count()
734 | guanzhu.columns = ['user_id', 'guanzhu']
735 | buy = actions[actions['type'] == 4]
736 | buy = buy.groupby('user_id', as_index=False).count()
737 | buy.columns = ['user_id', 'buy']
738 | actions = pd.merge(guanzhu, buy, on='user_id', how='left')
739 | actions['guanzhu_num_before_buy'] = actions['guanzhu'] / actions['buy']
740 | del actions['buy']
741 | del actions['guanzhu']
742 | return actions
743 |
744 | actions = pd.merge(user_feat_15_1(start_date, end_date), user_feat_15_2(start_date, end_date), on='user_id',
745 | how='outer')
746 | actions = pd.merge(actions, user_feat_15_3(start_date, end_date), on='user_id', how='outer')
747 | actions = pd.merge(actions, user_feat_15_4(start_date, end_date), on='user_id', how='outer')
748 | actions = pd.merge(actions, user_feat_15_5(start_date, end_date), on='user_id', how='outer')
749 | user_id = actions['user_id']
750 | del actions['user_id']
751 | actions = actions.fillna(0)
752 | min_max_scale = preprocessing.MinMaxScaler()
753 | actions = min_max_scale.fit_transform(actions.values)
754 | actions = pd.concat([user_id, pd.DataFrame(actions)], axis=1)
755 |
756 | actions.to_csv(dump_path, index=False)
757 | actions.columns = ['user_id'] + ['u_feat15_' + str(i) for i in range(1, actions.shape[1])]
758 | return actions
759 |
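     | # Sketch of the conversion ratios above (toy numbers): aligning per-user counts of
     | # two action types and dividing gives "how many X per Y"; users who never performed
     | # the denominator action come out NaN and are zero-filled later.
     | def _demo_visits_per_buy():
     |     import pandas as pd
     |     a = pd.DataFrame({'user_id': [1, 1, 1, 2], 'type': [1, 1, 4, 1]})
     |     visit = a[a['type'] == 1].groupby('user_id').size().rename('visit')
     |     buy = a[a['type'] == 4].groupby('user_id').size().rename('buy')
     |     out = pd.concat([visit, buy], axis=1)
     |     out['visit_num_before_buy'] = out['visit'] / out['buy']  # NaN for user 2
     |     return out.reset_index()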
760 |
761 | # Cross-ratios among user action types (each type's share of all actions)
762 | def get_action_user_feat16(start_date, end_date):
763 | dump_path = './cache/user_feat16_%s_%s.csv' % (start_date, end_date)
764 | if os.path.exists(dump_path):
765 | actions = pd.read_csv(dump_path)
766 | else:
767 | actions = get_actions(start_date, end_date)[['user_id', 'type']]
768 | actions['cnt'] = 0
769 | action1 = actions.groupby(['user_id', 'type']).count()
770 | action1 = action1.unstack()
771 | index_col = list(range(action1.shape[1]))
772 | action1.columns = index_col
773 | action1 = action1.reset_index()
774 | action2 = actions.groupby('user_id', as_index=False).count()
775 | del action2['type']
776 | action2.columns = ['user_id', 'cnt']
777 | actions = pd.merge(action1, action2, how='left', on='user_id')
778 | for i in index_col:
779 | actions[i] = actions[i] / actions['cnt']
780 | del actions['cnt']
781 | actions.to_csv(dump_path, index=False)
782 | actions.columns = ['user_id'] + ['u_feat16_' + str(i) for i in range(1, actions.shape[1])]
783 | return actions
784 |
785 |
786 | # Items from candidate set P the user touched in the last k days vs. all items touched (for k <= 7 keep the raw counts; for k > 7 also take the P-share ratio)
787 | def get_action_user_feat0509_1_30(start_date, end_date, n):
788 | dump_path = './cache/user_feat0509_1_30_%s_%s_%s.csv' % (start_date, end_date, n)
789 | if os.path.exists(dump_path):
790 | actions = pd.read_csv(dump_path)
791 | else:
792 |
793 |         start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
794 | start_days = datetime.strftime(start_days, '%Y-%m-%d')
795 |
796 | actions = get_actions(start_days, end_date)[['user_id', 'sku_id', 'type']]
797 | actions_dummy = pd.get_dummies(actions['type'], prefix='actions')
798 | actions = pd.concat([actions, actions_dummy], axis=1)
799 | del actions['type']
800 |
801 | P = get_basic_product_feat()[['sku_id']]
802 | P['label'] = 1
803 | actions_sub = pd.merge(actions, P, on='sku_id', how='left')
804 | actions_sub = actions_sub[actions_sub['label'] == 1]
805 | del actions_sub['label']
806 |
807 | actions_sub = actions_sub.groupby(['user_id'], as_index=False).sum()
808 | del actions_sub['sku_id']
809 | actions_all = actions.groupby(['user_id'], as_index=False).sum()
810 | del actions_all['sku_id']
811 |
812 | if n > 7:
813 | actions = pd.merge(actions_all, actions_sub, on=['user_id'], how='left')
814 | # print actions.head()
815 | for i in range(1, 7):
816 | actions['actions_%s' % i] = actions['actions_%s_y' % i] / actions['actions_%s_x' % i]
817 | # actions=actions[['user_id','actions_1','actions_2','actions_3','actions_4','actions_5','actions_6']]
818 |
819 | else:
820 | actions = pd.merge(actions_all, actions_sub, on=['user_id'], how='left')
821 | actions.to_csv(dump_path, index=False)
822 | actions.columns = ['user_id'] + ['u_feat30_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
823 |
824 | return actions
825 |
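     | # Sketch of the subset-tagging trick above (made-up names): marking the candidate
     | # catalogue P with label=1 and left-merging lets one groupby produce "actions on P
     | # items" alongside "actions on everything", which the n > 7 branch then ratios.
     | def _demo_subset_share():
     |     import pandas as pd
     |     acts = pd.DataFrame({'user_id': [1, 1, 1], 'sku_id': [10, 11, 12], 'n_act': [2, 3, 5]})
     |     P = pd.DataFrame({'sku_id': [10, 11], 'label': 1})
     |     tagged = acts.merge(P, on='sku_id', how='left')
     |     on_p = tagged[tagged['label'] == 1].groupby('user_id')['n_act'].sum()
     |     total = tagged.groupby('user_id')['n_act'].sum()
     |     return (on_p / total).rename('p_share').reset_index()  # 5/10 = 0.5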
826 |
827 |
901 |
902 | # Hours from a user's first click on an item to the last purchase of it (min/max per user)
903 | def get_action_user_feat0515_2_1(start_date,end_date):
904 | dump_path='./cache/get_action_user_feat0515_2_1_%s_%s.csv'%(start_date,end_date)
905 | if os.path.exists(dump_path):
906 | actions = pd.read_csv(dump_path)
907 | else:
908 | actions = get_actions(start_date,end_date)
909 | actions_dianji=actions[actions['type']==6][['user_id','sku_id','time']]
910 | actions_dianji['time_dianji'] = actions_dianji['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
911 | actions_dianji = actions_dianji[['user_id', 'sku_id','time_dianji']]
912 | actions_dianji= actions_dianji.drop_duplicates(['user_id', 'sku_id'], keep='first')
913 |
914 |
915 | actions_goumai=actions[actions['type']==4][['user_id','sku_id','time']]
916 | actions_goumai['time_goumai'] = actions_goumai['time'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
917 | actions_goumai = actions_goumai[['user_id', 'sku_id','time_goumai']]
918 | actions_goumai= actions_goumai.drop_duplicates(['user_id', 'sku_id'], keep='last')
919 |
920 | actions = pd.merge(actions_dianji,actions_goumai,on=['user_id','sku_id'],how='inner')
921 | actions['time_jiange']=actions['time_goumai']-actions['time_dianji']
922 | actions=actions.drop(['sku_id','time_goumai','time_dianji'],axis=1)
923 | actions['time_jiange']=actions['time_jiange'].map(lambda x:x.days*24+x.seconds//3600+1)
924 |
925 | actions_min = actions.groupby('user_id').min().reset_index()
926 | actions_min.columns = ['user_id','time_min']
927 | # actions_mean = actions.groupby('user_id').mean().reset_index()
928 | # actions_mean.columns = ['user_id','time_mean']
929 | actions_max = actions.groupby('user_id').max().reset_index()
930 | actions_max.columns = ['user_id','time_max']
931 | actions=pd.merge(actions_min,actions_max,on='user_id',how='left')
932 |
933 | user_id = actions[['user_id']]
934 | del actions['user_id']
935 | actions = actions.fillna(0)
936 | columns = actions.columns
937 | min_max_scale = preprocessing.MinMaxScaler()
938 | actions = min_max_scale.fit_transform(actions.values)
939 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
940 | actions.to_csv(dump_path,index=False)
941 | return actions
942 |
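     | # Sketch of the hour-gap arithmetic above: a Timedelta is flattened to whole hours
     | # as days*24 + seconds//3600, with +1 so a same-hour click-and-buy still counts as 1.
     | def _demo_click_to_buy_hours():
     |     import pandas as pd
     |     click = pd.Timestamp('2016-04-01 10:30:00')  # hypothetical first click
     |     buy = pd.Timestamp('2016-04-02 13:00:00')    # hypothetical last purchase
     |     gap = buy - click
     |     return gap.days * 24 + gap.seconds // 3600 + 1  # 24 + 2 + 1 = 27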
943 |
944 | # Per-user action counts in each cate
945 | def get_action_user_feat0515_2_2(start_date,end_date):
946 | dump_path='./cache/get_action_user_feat0515_2_2_%s_%s.csv'%(start_date,end_date)
947 | if os.path.exists(dump_path):
948 | actions = pd.read_csv(dump_path)
949 | else:
950 |         actions = get_actions(start_date, end_date)[['user_id', 'cate']]
952 | cate_col = pd.get_dummies(actions['cate'],prefix='cate')
953 | actions=pd.concat([actions[['user_id']],cate_col],axis=1)
954 | actions= actions.groupby('user_id').sum().reset_index()
955 |
956 | user_id = actions[['user_id']]
957 | del actions['user_id']
958 | actions = actions.fillna(0)
959 | columns = actions.columns
960 | min_max_scale = preprocessing.MinMaxScaler()
961 | actions = min_max_scale.fit_transform(actions.values)
962 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
963 | actions.to_csv(dump_path,index=False)
964 | return actions
965 |
966 |
967 | # Add-to-cart and follow counts for each user in the last n days (overall and within cate 8)
968 | def get_action_user_feat0515_2_3(start_date, end_date, n):
969 | dump_path = './cache/get_action_user_feat0515_2_3_%s_%s_%s_1.csv' % (start_date, end_date, n)
970 | if os.path.exists(dump_path):
971 | actions = pd.read_csv(dump_path)
972 | else:
973 |
974 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
975 | start_days = datetime.strftime(start_days, '%Y-%m-%d')
976 |
977 | actions = get_actions(start_days,end_date)[['user_id','type','cate']]
978 | actions_gouwuche=actions[actions['type']==2]
979 | actions_gouwuche_1= actions_gouwuche[['user_id','type']]
980 | actions_gouwuche_1= actions_gouwuche_1.groupby('user_id').count().reset_index()
981 | actions_gouwuche_1.columns = ['user_id',str(n)+'gouwuche_add']
982 |
983 | actions_gouwuche_2= actions_gouwuche[actions_gouwuche['cate']==8][['user_id','type']]
984 | actions_gouwuche_2= actions_gouwuche_2.groupby('user_id').count().reset_index()
985 | actions_gouwuche_2.columns = ['user_id',str(n)+'gouwuche_add_cate_8']
986 |
987 | actions_guanzhu=actions[actions['type']==5]
988 | actions_guanzhu_1= actions_guanzhu[['user_id','type']]
989 | actions_guanzhu_1= actions_guanzhu_1.groupby('user_id').count().reset_index()
990 | actions_guanzhu_1.columns = ['user_id',str(n)+'guanzhu_add']
991 |
992 | actions_guanzhu_2= actions_guanzhu[actions_guanzhu['cate']==8][['user_id','type']]
993 | actions_guanzhu_2= actions_guanzhu_2.groupby('user_id').count().reset_index()
994 | actions_guanzhu_2.columns = ['user_id',str(n)+'guanzhu_add_cate_8']
995 |
996 | actions = pd.merge(actions_gouwuche_1,actions_gouwuche_2,on='user_id',how ='outer')
997 | actions = pd.merge(actions,actions_guanzhu_1,on='user_id',how ='outer')
998 | actions = pd.merge(actions,actions_guanzhu_2,on='user_id',how ='outer')
999 | actions=actions.fillna(0)
1000 |
1001 | user_id = actions[['user_id']]
1002 | del actions['user_id']
1003 | actions = actions.fillna(0)
1004 | columns = actions.columns
1005 | min_max_scale = preprocessing.MinMaxScaler()
1006 | actions = min_max_scale.fit_transform(actions.values)
1007 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
1008 | actions.to_csv(dump_path, index=False)
1009 |
1010 |
1011 | return actions
1012 |
1013 | # Within the last n days: on how many distinct days the user performed each action type
1014 | def get_action_user_feat0515_2_4(start_date, end_date, n):
1015 | dump_path = './cache/get_action_user_feat0515_2_4_%s_%s_%s.csv' % (start_date, end_date, n)
1016 | if os.path.exists(dump_path):
1017 | actions = pd.read_csv(dump_path)
1018 | else:
1019 |
1020 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
1021 | start_days = datetime.strftime(start_days, '%Y-%m-%d')
1022 |
1023 | actions = get_actions(start_days,end_date)[['user_id','type','time']]
1024 | actions['time'] = actions['time'].map(lambda x: (datetime.strptime(end_date,'%Y-%m-%d')-datetime.strptime(x, '%Y-%m-%d %H:%M:%S')).days)
1025 | actions=actions.drop_duplicates(['user_id','type','time'])
1026 | actions = actions.groupby(['user_id','type']).count()
1027 | actions.columns = [str(n)+'day_nums']
1028 | actions=actions.unstack()
1029 | actions=actions.reset_index()
1030 | actions.columns = ['user_id'] + ['get_action_user_feat0515_2_4_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
1031 | actions=actions.fillna(0)
1032 |
1033 | user_id = actions[['user_id']]
1034 | del actions['user_id']
1035 | actions = actions.fillna(0)
1036 | columns = actions.columns
1037 | min_max_scale = preprocessing.MinMaxScaler()
1038 | actions = min_max_scale.fit_transform(actions.values)
1039 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
1040 | actions.to_csv(dump_path, index=False)
1041 | return actions
1042 |
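     | # Sketch of the distinct-day counting above: deduplicating (user, type, day) before
     | # the groupby turns raw action counts into "days with at least one such action".
     | def _demo_active_days():
     |     import pandas as pd
     |     df = pd.DataFrame({'user_id': [1, 1, 1], 'type': [1, 1, 1], 'day': [0, 0, 3]})
     |     df = df.drop_duplicates(['user_id', 'type', 'day'])
     |     return df.groupby(['user_id', 'type']).count()  # 2 distinct active days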
1043 |
1044 | # Distinct items per user for each action type (browse / add-to-cart / buy / follow / click)
1045 | def get_action_user_feat5(start_date, end_date):
1046 | dump_path = './cache/user_feat5_a_%s_%s.csv' % (start_date, end_date)
1047 | if os.path.exists(dump_path):
1048 | actions = pd.read_csv(dump_path)
1049 | else:
1050 | actions = get_actions(start_date, end_date)
1051 | action=None
1052 | for i in (1,2,4,5,6):
1053 | df=actions[actions['type']==i][['user_id', 'sku_id']]
1054 | df = df.drop_duplicates(['user_id', 'sku_id'], keep='first')
1055 | df = df.groupby('user_id', as_index=False).count()
1056 | df.columns = ['user_id', 'num_%s'%i]
1057 | if i==1:
1058 | action=df
1059 | else:
1060 | action=pd.merge(action,df,on='user_id',how='outer')
1061 | actions=action.fillna(0)
1062 | actions = actions.astype('float')
1063 | user=actions[['user_id']]
1064 | min_max_scaler = preprocessing.MinMaxScaler()
1065 | actions = min_max_scaler.fit_transform(actions.drop(['user_id'],axis=1).values)
1066 | actions = pd.DataFrame(actions)
1067 | actions = pd.concat([user, actions], axis=1)
1068 | actions.to_csv(dump_path, index=False)
1069 | actions.columns = ['user_id'] + ['u_feat5_' + str(i) for i in range(1, actions.shape[1])]
1070 | return actions
1071 |
1072 | # Same distinct-item counts as get_action_user_feat5, restricted to the last k days
1073 | def get_action_u0515_feat5(start_date,end_date,k):
1074 | dump_path = './cache/u0515_feat5_%s_%s_%s.csv' % (start_date, end_date,k)
1075 | if os.path.exists(dump_path):
1076 | actions = pd.read_csv(dump_path)
1077 | else:
1078 | start_days=pd.to_datetime(end_date)-timedelta(days=k)
1079 | start_days=str(start_days).split(' ')[0]
1080 | actions=get_action_user_feat5(start_days, end_date)
1081 | actions.to_csv(dump_path,index=False)
1082 | actions.columns=['user_id']+['u0515_feat5_'+str(k)+'_'+str(i) for i in range(1,actions.shape[1])]
1083 | return actions
1084 |
1085 |
1086 | # Hours since the user's earliest interaction (full set and target subset)
1087 | def get_action_u0524_feat1(start_date,end_date):
1088 | dump_path = './cache/u0524_feat1_%s_%s.csv' % (start_date, end_date,)
1089 | if os.path.exists(dump_path):
1090 | actions = pd.read_csv(dump_path)
1091 | else:
1092 |         # full action set
1093 | actions=get_actions(start_date,end_date)[['user_id','time']]
1094 | actions=actions.groupby('user_id',as_index=False).first()
1095 | actions['time_diff_early']=pd.to_datetime(end_date)-pd.to_datetime(actions['time'])
1096 | actions['time_diff_early']=actions['time_diff_early'].dt.days*24+actions['time_diff_early'].dt.seconds//3600
1097 | actions=actions[['user_id','time_diff_early']]
1098 |         # target-subset actions
1099 | sub_actions=sub_get_actions(start_date,end_date)[['user_id','time']]
1100 | sub_actions=sub_actions.groupby('user_id',as_index=False).first()
1101 | sub_actions['sub_time_diff_early']=pd.to_datetime(end_date)-pd.to_datetime(sub_actions['time'])
1102 | sub_actions['sub_time_diff_early']=sub_actions['sub_time_diff_early'].dt.days*24+sub_actions['sub_time_diff_early'].dt.seconds//3600
1103 | sub_actions = sub_actions[['user_id', 'sub_time_diff_early']]
1104 |
1105 | actions=pd.merge(actions,sub_actions,on='user_id',how='left')
1106 | actions=actions.fillna(0)
1107 | min_max_scale = preprocessing.MinMaxScaler()
1108 | action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values)
1109 | actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1)
1110 | actions.to_csv(dump_path,index=False)
1111 | actions.columns=['user_id']+['u0524_feat1_'+str(i)for i in range(1,actions.shape[1])]
1112 | return actions
1113 |
1114 | # Hours since the user's most recent interaction (full set and target subset)
1115 | def get_action_u0524_feat2(start_date,end_date):
1116 | dump_path = './cache/u0524_feat2_%s_%s.csv' % (start_date, end_date,)
1117 | if os.path.exists(dump_path):
1118 | actions = pd.read_csv(dump_path)
1119 | else:
1120 |         # full action set
1121 | actions = get_actions(start_date, end_date)[['user_id', 'time']]
1122 | actions = actions.groupby('user_id', as_index=False).last()
1123 | actions['time_diff_recent'] = pd.to_datetime(end_date) - pd.to_datetime(actions['time'])
1124 | actions['time_diff_recent'] = actions['time_diff_recent'].dt.days * 24 + actions['time_diff_recent'].dt.seconds // 3600
1125 | actions = actions[['user_id', 'time_diff_recent']]
1126 |         # target-subset actions
1127 | sub_actions = sub_get_actions(start_date, end_date)[['user_id', 'time']]
1128 | sub_actions = sub_actions.groupby('user_id', as_index=False).last()
1129 | sub_actions['sub_time_diff_recent'] = pd.to_datetime(end_date) - pd.to_datetime(sub_actions['time'])
1130 | sub_actions['sub_time_diff_recent'] = sub_actions['sub_time_diff_recent'].dt.days * 24 + sub_actions['sub_time_diff_recent'].dt.seconds // 3600
1131 | sub_actions = sub_actions[['user_id', 'sub_time_diff_recent']]
1132 |
1133 | actions = pd.merge(actions, sub_actions, on='user_id', how='left')
1134 | actions=actions.fillna(0)
1135 | min_max_scale = preprocessing.MinMaxScaler()
1136 | action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values)
1137 | actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1)
1138 | actions.to_csv(dump_path,index=False)
1139 | actions.columns = ['user_id'] + ['u0524_feat2_' + str(i) for i in range(1, actions.shape[1])]
1140 | return actions
1141 |
1142 |
1143 | # Number of active days (full set and target subset)
1144 | def get_action_u0524_feat3(start_date,end_date):
1145 | dump_path = './cache/u0524_feat3_%s_%s.csv' % (start_date, end_date,)
1146 | if os.path.exists(dump_path):
1147 | actions = pd.read_csv(dump_path)
1148 | else:
1149 |         # full action set
1150 | actions=get_actions(start_date,end_date)
1151 | actions['time']=pd.to_datetime(actions['time']).dt.date
1152 | actions=actions.drop_duplicates(['user_id','time'])[['user_id','time']]
1153 | actions=actions.groupby('user_id',as_index=False).count()
1154 |         # target-subset actions
1155 | sub_actions=sub_get_actions(start_date,end_date)
1156 | sub_actions['time']=pd.to_datetime(sub_actions['time']).dt.date
1157 | sub_actions=sub_actions.drop_duplicates(['user_id','time'])[['user_id','time']]
1158 | sub_actions=sub_actions.groupby('user_id',as_index=False).count()
1159 | actions=pd.merge(actions,sub_actions,on='user_id',how='left')
1160 | actions=actions.fillna(0)
1161 | min_max_scale = preprocessing.MinMaxScaler()
1162 | action = min_max_scale.fit_transform(actions.drop(['user_id'], axis=1).values)
1163 | actions = pd.concat([actions[['user_id']], pd.DataFrame(action)], axis=1)
1164 | actions.to_csv(dump_path,index=False)
1165 | actions.columns=['user_id']+['u0524_feat3_'+str(i) for i in range(1,actions.shape[1])]
1166 | return actions
1167 |
1168 |
1169 | # Click-module (model_id) statistics over the last n days
1170 | def get_action_user_feat0509_1_31(start_date,end_date,n):
1171 | dump_path='./cache/user_feat0509_1_31_%s_%s_%s.csv'%(start_date,end_date,n)
1172 | if os.path.exists(dump_path):
1173 | actions = pd.read_csv(dump_path)
1174 | else:
1175 | start_days=datetime.strptime(end_date,'%Y-%m-%d')-timedelta(days=n)
1176 | start_days=datetime.strftime(start_days,'%Y-%m-%d')
1177 | actions=get_actions(start_days,end_date)
1178 | actions=actions[actions['type']==6][['user_id','model_id']]
1179 |
1180 | # actions = actions.drop('type',axis=1)
1181 |
1182 | actions_click_sum=actions[['user_id','model_id']].groupby('user_id').count().reset_index()
1183 | actions_click_sum.columns = ['user_id',str(n)+'click_sum_all']
1184 | actions[str(n)+'u_click14_history'] = actions['model_id'].map(lambda x: int(x == 14))
1185 | actions[str(n)+'u_click21_history'] = actions['model_id'].map(lambda x: int(x == 21))
1186 | actions[str(n)+'u_click28_history'] = actions['model_id'].map(lambda x: int(x == 28))
1187 | actions[str(n)+'u_click110_history'] = actions['model_id'].map(lambda x: int(x == 110))
1188 | actions[str(n)+'u_click210_history'] = actions['model_id'].map(lambda x: int(x == 210))
1189 | actions = actions.groupby('user_id').sum().reset_index().drop('model_id', axis=1)
1190 | # actions.to_csv(dump_path,index=False)
1191 | actions = pd.merge(actions,actions_click_sum,how='left',on='user_id')
1192 |
1193 | actions[str(n)+'u_click14/click_sum_history'] = actions[str(n)+'u_click14_history']/actions[str(n)+'click_sum_all']
1194 | actions[str(n)+'u_click21/click_sum_history'] = actions[str(n)+'u_click21_history']/actions[str(n)+'click_sum_all']
1195 | actions[str(n)+'u_click28/click_sum_history'] = actions[str(n)+'u_click28_history']/actions[str(n)+'click_sum_all']
1196 | actions[str(n)+'u_click110/click_sum_history'] = actions[str(n)+'u_click110_history']/actions[str(n)+'click_sum_all']
1197 | actions[str(n)+'u_click210/click_sum_history'] = actions[str(n)+'u_click210_history']/actions[str(n)+'click_sum_all']
1198 |
1199 | user_id = actions[['user_id']]
1200 | del actions['user_id']
1201 | actions = actions.fillna(0)
1202 | columns = actions.columns
1203 | min_max_scale = preprocessing.MinMaxScaler()
1204 | actions = min_max_scale.fit_transform(actions.values)
1205 | actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
1206 | actions.to_csv(dump_path,index=False)
1207 | return actions
1208 | # U model: the user's cate-8 purchases vs. purchases in other cates (counts and shares)
1209 | def get_action_u0513_feat16(start_date,end_date):
1210 | dump_path = './cache/u0513_feat16_%s_%s.csv' % (start_date, end_date)
1211 | if os.path.exists(dump_path):
1212 | actions = pd.read_csv(dump_path)
1213 | else:
1214 | df = get_actions(start_date, end_date)[['user_id', 'type', 'cate']]
1215 | df = df[df['type'] == 4]
1216 | df = df.groupby(['user_id', 'cate']).count()
1217 | df = df.unstack().reset_index()
1218 | df.columns = ['user_id'] + ['cate' + str(i) for i in range(4, 12)]
1219 | df = df.fillna(0)
1220 | sum1 = df.drop(['user_id', 'cate8'], axis=1).apply(sum, axis=1)
1221 | sum2 = df.drop(['user_id'], axis=1).apply(sum, axis=1)
1222 | actions = pd.concat([df[['user_id', 'cate8']], sum1, sum2], axis=1)
1223 | actions.columns = ['user_id', 'cate8', 'sum_other_cate', 'sum']
1224 | actions['cate8_rate'] = actions['cate8'] / actions['sum']
1225 | actions['sum_other_cate_rate'] = actions['sum_other_cate'] / actions['sum']
1226 | del actions['sum']
1227 | actions.to_csv(dump_path,index=False)
1228 | return actions
1229 |
1230 | #get_action_u0513_feat16('2016-02-01','2016-04-16')
1231 | # User time-bucket (level) features
1232 | def get_action_user_feat_six_xingwei(start_date, end_date, n):
1233 | dump_path = './cache/user_six_action_%s_%s_%s_int.csv' % (start_date, end_date, n)
1234 | if os.path.exists(dump_path):
1235 | actions = pd.read_csv(dump_path)
1236 | print("user_zlzl" + str(n))
1237 |
1238 | else:
1239 | actions = get_actions(start_date, end_date)
1240 | actions['time'] = actions['time'].map(lambda x: get_day_chaju(x, end_date) // n)
1241 | num_day = np.max(actions['time'])
1242 | df = None
1243 | print(num_day)
1244 | for i in range(min(num_day + 1, 6)):
1245 | in_temp = pd.get_dummies(actions['type'], prefix="user_action_time_" + str(i))
1246 | temp = actions[actions['time'] == i]
1247 | temp = pd.concat([temp['user_id'], in_temp], axis=1)
1248 |
1249 | feature = ['user_id']
1250 | for j in range(1, 7, 1):
1251 | feature.append('user_action_time_' + str(i) + '_' + str(j))
1252 |
1253 | temp = temp.groupby(['user_id'], as_index=False).sum()
1254 | temp.columns = feature
1255 | if df is None:
1256 | df = temp
1257 | else:
1258 | df = pd.merge(df, temp, how='outer', on='user_id')
1259 | df.columns = ['user_id'] + ['get_action_user_feat_six_xingwei_' + str(n) + '_' + str(i) for i in range(1, df.shape[1])]
1260 | df.to_csv(dump_path, index=False)
1261 | actions=df
1262 |
1263 | # user_id = actions[['user_id']]
1264 | # del actions['user_id']
1265 | # actions = actions.fillna(0)
1266 | # actions=actions.replace(np.inf,0)
1267 | # # print(actions.head())
1268 | # columns = actions.columns
1269 |
1270 | # min_max_scale = preprocessing.MinMaxScaler()
1271 | # actions=actions.replace(np.inf,0)
1272 | # actions = min_max_scale.fit_transform(actions.values)
1273 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
1274 | actions.columns = ['user_id'] + ['get_action_user_feat_six_xingwei_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
1275 | return actions
1276 |
1277 |
1278 | def deal_user_six_deal(start_date, end_date, n):
1279 | dump_path = './cache/deal_user_six_action_%s_%s_%s_int.csv' % (start_date, end_date, n)
1280 | if os.path.exists(dump_path):
1281 | actions = pd.read_csv(dump_path)
1282 | actions.columns = ['user_id'] + ['u_featsix_deal_' + str(n) + '_' + str(i) for i in range(1, actions.shape[1])]
1283 | return actions
1284 | else:
1285 | temp = get_action_user_feat_six_xingwei(start_date, end_date, n) # 修改
1286 | time1 = datetime.now()
1287 | columns = ["user_id"]
1288 | all_col = temp.shape[1] - 1
1289 | temp.columns = columns + list(range(all_col))
1290 | temp = temp.fillna(0)
1291 | columns = ['user_id']
1292 | for j in range(0, 6, 1):
1293 | temp["zl_" + str(j)] = 0
1294 | columns.append("zl_" + str(j))
1295 | for k in range(j, all_col, 6):
1296 | temp["zl_" + str(j)] = temp["zl_" + str(j)] + temp[k].map(lambda x: x * ((k // 6 + 1) ** (-0.67)))
1297 | temp["zl_" + str(j)] = temp["zl_" + str(j)].map(lambda x: (x - np.min(temp["zl_" + str(j)])) / (
1298 | np.max(temp["zl_" + str(j)]) - np.min(temp["zl_" + str(j)])))
1299 | temp = temp[columns]
1300 | temp.to_csv(dump_path, index=False)
1301 | return temp
1302 |
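     | # Sketch of the recency decay in deal_user_six_deal: counts from older n-day buckets
     | # are down-weighted by (bucket_index + 1) ** -0.67 before summing, so recent activity
     | # dominates. The exponent matches the code above; the counts are made up.
     | def _demo_bucket_decay(bucket_counts=(4, 2, 1)):
     |     weighted = sum(c * ((b + 1) ** (-0.67)) for b, c in enumerate(bucket_counts))
     |     return weighted  # 4*1 + 2*0.63 + 1*0.48 ≈ 5.74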
1303 | # # get user sku
1304 | # def get_user(start_date, end_date):
1305 | # dump_path = './cache/user_sku_%s_%s.csv' % (start_date, end_date)
1306 | # if os.path.exists(dump_path):
1307 | # actions = pd.read_csv(dump_path)
1308 | # else:
1309 | # actions = get_actions(start_date, end_date)
1310 | # actions = actions[(actions['type'] == 2) | (actions['type'] == 5) | (actions['type'] == 4)]
1311 | # actions=actions[actions['cate']==8]
1312 | # actions = actions[['user_id']]
1313 | # actions = actions.drop_duplicates(['user_id'], keep='first')
1314 | # actions.to_csv(dump_path, index=False)
1315 | # return actions
1316 |
1317 |
1318 | # User behavior in the k days before each purchase
1319 | def get_action_u0509_feat_28(start_date, end_date,k):
1320 | dump_path = './cache/u0509_feat_28_%s_%s_%s.csv' % (start_date, end_date,k)
1321 | if os.path.exists(dump_path):
1322 | actions = pd.read_csv(dump_path)
1323 | else:
1324 | actions = get_actions(start_date, end_date)
1325 | actions = actions[actions['type'] == 4]
1326 | actions['time_buy'] = actions['time'].map(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d'))
1327 | actions = actions[['user_id', 'sku_id', 'time_buy']].reset_index(drop=True)
1328 | actions['before_time_buy'] = actions['time_buy'] - timedelta(days=k)
1329 |
1330 | df = get_actions('2016-02-01','2016-04-16')[['user_id', 'sku_id', 'time', 'type']]
1331 | df['time'] = df['time'].map(lambda x: datetime.strptime(x.split(' ')[0], '%Y-%m-%d'))
1332 | df = pd.merge(df, actions, on=['user_id', 'sku_id'], how='left')
1333 | df = df.dropna(axis=0, how='any')
1334 | df['before_days'] = (df['time'] - df['before_time_buy']).dt.days
1335 | df['days'] = (df['time'] - df['time_buy']).dt.days
1336 | df = df[(df['before_days'] >= 0) & (df['days'] < 0)]
1337 | df_dummy = pd.get_dummies(df['type'], prefix='type')
1338 |
1339 | df = pd.concat([df, df_dummy], axis=1)[
1340 | ['user_id', 'sku_id', 'type_1', 'type_2', 'type_3', 'type_4', 'type_5', 'type_6']]
1341 |
1342 | df = df.groupby(['user_id', 'sku_id'], as_index=False).sum()
1343 | del df['sku_id']
1344 | df = df.groupby('user_id', as_index=False).agg(['min', 'max', 'mean'])
1345 | df = df.reset_index()
1346 | df.columns = ['user_id'] + ['u0509_feat28_' + str(k) + '_' + i for i in (
1347 | 'type_1_min', 'type_1_max', 'type_1_mean', 'type_2_min', 'type_2_max', 'type_2_mean',
1348 | 'type_3_min', 'type_3_max', 'type_3_mean', 'type_4_min', 'type_4_max', 'type_4_mean',
1349 | 'type_5_min', 'type_5_max', 'type_5_mean', 'type_6_min', 'type_6_max', 'type_6_mean')]
1350 | min_max_scaler = preprocessing.MinMaxScaler()
1351 | actions = min_max_scaler.fit_transform(df.drop('user_id', axis=1).values)
1352 | actions = pd.DataFrame(actions)
1353 | actions = pd.concat([df[['user_id']], actions], axis=1)
1354 | actions.columns = ['user_id']+['u0509_feat_28_'+str(i) for i in range(1,actions.shape[1])]
1355 | actions.to_csv(dump_path,index=False)
1356 | actions.columns = ['user_id']+['u0509_feat_28_'+str(k)+"_"+str(i) for i in range(1,actions.shape[1])]
1357 | return actions
1358 |
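     | # Sketch of the multi-statistic aggregation above: agg(['min', 'max', 'mean']) yields
     | # a two-level column index, which the code flattens by assigning explicit names.
     | def _demo_multi_agg():
     |     import pandas as pd
     |     df = pd.DataFrame({'user_id': [1, 1, 2], 'type_1': [3, 5, 2]})
     |     out = df.groupby('user_id').agg(['min', 'max', 'mean'])
     |     out.columns = ['_'.join(c) for c in out.columns]  # e.g. 'type_1_min'
     |     return out.reset_index()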
1359 | # Number of cate-8 brands the user viewed, and its share of all brands viewed
1360 | def get_action_u0509_feat_29(start_date,end_date):
1361 | dump_path = './cache/u0509_feat_29_%s_%s.csv' % (start_date, end_date)
1362 | if os.path.exists(dump_path):
1363 | actions = pd.read_csv(dump_path)
1364 | else:
1365 | actions=get_actions(start_date,end_date)
1366 | df1=actions[actions['cate']==8].drop_duplicates(['user_id','brand'])[['user_id','brand']]
1367 | df1=df1.groupby(['user_id'],as_index=False).count()
1368 | df1.columns=['user_id','brand_cate=8']
1369 | df2=actions.drop_duplicates(['user_id','brand'])[['user_id','brand']]
1370 | df2 = df2.groupby(['user_id'], as_index=False).count()
1371 | df2.columns=['user_id','brand_cate_all']
1372 | df=pd.merge(df1,df2,on='user_id',how='right')
1373 | df['rate']=df['brand_cate=8']/df['brand_cate_all']
1374 | # print df
1375 | actions=df.fillna(0)
1376 | actions.to_csv(dump_path,index=False)
1377 |     actions.columns=['user_id']+['u0509_feat_29_'+str(i) for i in range(1,actions.shape[1])]
1378 | return actions
1379 |
1380 | def get_action_u0521_feat_31(start_date,end_date,k):
1381 | dump_path = './cache/u0509_feat_31_%s_%s_%s.csv' % (start_date, end_date,k)
1382 | if os.path.exists(dump_path):
1383 | actions = pd.read_csv(dump_path)
1384 | else:
1385 | start_days=pd.to_datetime(end_date)-timedelta(days=k)
1386 |         start_days=datetime.strftime(start_days,'%Y-%m-%d')
1387 | actions=get_actions(start_days,end_date)
1388 | df1=actions[actions['cate']==8].drop_duplicates(['user_id','cate'])[['user_id','cate']]
1389 | df1=df1.groupby('user_id',as_index=False).count()
1390 | df1.columns=['user_id','cate8']
1391 | df2=actions.drop_duplicates(['user_id','cate'])[['user_id','cate']]
1392 | df2=df2.groupby('user_id',as_index=False).count()
1393 | actions=pd.merge(df1,df2,on='user_id',how='right')
1394 | actions['cate8/cate']=actions['cate8']/actions['cate']
1395 | actions=actions.fillna(0)
1396 | min_max_scaler = preprocessing.MinMaxScaler()
1397 | df = min_max_scaler.fit_transform(actions[['cate8','cate']].values)
1398 | df = pd.DataFrame(df)
1399 | actions = pd.concat([actions[['user_id','cate8/cate']], df], axis=1)
1400 | actions.to_csv(dump_path,index=False)
1401 | actions.columns=['user_id']+['u0509_feat_31_'+str(k)+'_'+str(i)for i in range(1,actions.shape[1])]
1402 | return actions
1403 |
1404 |
1405 | def get_action_u0521_feat_32(start_date,end_date):
1406 | dump_path = './cache/u0509_feat_32_%s_%s.csv' % (start_date, end_date)
1407 | if os.path.exists(dump_path):
1408 | actions = pd.read_csv(dump_path)
1409 | else:
1410 | actions=get_actions(start_date,end_date)
1411 | actions=actions[actions['cate']==8][['user_id','brand']]
1412 | df1=actions.drop_duplicates(['user_id','brand']).groupby('user_id',as_index=False).count()
1413 | df1.columns=['user_id','brand_num']
1414 | df2=actions.groupby('user_id',as_index=False).count()
1415 | actions=pd.merge(df1,df2,on='user_id',how='left')
1416 | actions['brand_num/brand']=actions['brand']/actions['brand_num']
1417 | actions=actions.fillna(0)
1418 | min_max_scaler = preprocessing.MinMaxScaler()
1419 | df = min_max_scaler.fit_transform(actions.drop(['user_id'],axis=1).values)
1420 | df = pd.DataFrame(df)
1421 | actions = pd.concat([actions[['user_id']], df], axis=1)
1422 | actions.to_csv(dump_path, index=False)
1423 | actions.columns = ['user_id'] + ['u0509_feat_32_' + str(i) for i in range(1, actions.shape[1])]
1424 | return actions
1425 |
1426 | def get_action_user_feat7_0522_huachuang(start_date, end_date,n):
1427 | dump_path = './cache/user_feat7_six_%s_%s_%s_0522.csv' % (start_date, end_date,n)
1428 | if os.path.exists(dump_path):
1429 | actions = pd.read_csv(dump_path)
1430 | else:
1431 | start_days = datetime.strptime(end_date, '%Y-%m-%d') - timedelta(days=n)
1432 | start_days = datetime.strftime(start_days, '%Y-%m-%d')
1433 |
1434 | df = get_actions(start_days, end_date)[['user_id', 'type', 'time']]
1435 | actions = df.groupby(['user_id', 'type'], as_index=False).count()
1436 |
1437 | time_min = df.groupby(['user_id', 'type'], as_index=False).min()
1438 | time_max = df.groupby(['user_id', 'type'], as_index=False).max()
1439 |
1440 | time_cha = pd.merge(time_max, time_min, on=['user_id', 'type'], how='left')
1441 | time_cha['time_x'] = time_cha['time_x'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
1442 | time_cha['time_y'] = time_cha['time_y'].map(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
1443 |
1444 |         cha = time_cha['time_x'] - time_cha['time_y']
1445 |         # whole-hour span between first and last action of the type (+1 so same-hour pairs count as 1)
1446 |         time_cha['cha_hour'] = 1 + cha.dt.days * 24 + cha.dt.seconds // 3600
1447 | del time_cha['time_x']
1448 | del time_cha['time_y']
1449 | # time_cha=time_cha.fillna(1)
1450 |
1451 | actions = pd.merge(time_cha, actions, on=['user_id', 'type'], how="left")
1452 | actions = actions.groupby(['user_id', 'type']).sum()
1453 | actions['cnt/time'] = actions['time'] / actions["cha_hour"]
1454 | actions = actions.unstack()
1455 | actions.columns = list(range(actions.shape[1]))
1456 | actions = actions.reset_index()
1457 | actions = actions.fillna(0)
1458 | actions.to_csv(dump_path, index=False)
1459 | actions.columns = ['user_id'] + ['u_feat7_' +str(n)+"_"+ str(i) for i in range(1, actions.shape[1])]
1460 | return actions
1461 |
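     | # Label for the U model: 1 if the user bought any cate-8 item in the test window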
1462 | def get_user_labels(test_start_date,test_end_date):
1463 | dump_path = './cache/user_labels_%s_%s_11.csv' % (test_start_date, test_end_date)
1464 | if os.path.exists(dump_path):
1465 | actions = pd.read_csv(dump_path)
1466 | else:
1467 | actions = get_actions(test_start_date, test_end_date)
1468 | actions = actions[actions['cate']==8]
1469 | actions = actions[actions['type'] == 4].drop_duplicates(['user_id'])[['user_id']]
1470 |         actions['label'] = 1
1471 |         actions.to_csv(dump_path, index=False)  # cache, so the dump_path check above can hit
1472 | return actions
1473 |
1474 |
1475 | print("U model 2 finish part_0")
1476 | #########################################################################################################
1477 |
1478 |
1489 | # In[3]:
1490 |
1491 | import os
1492 | from datetime import datetime
1493 | from datetime import timedelta
1494 |
1495 | # -*- coding: utf-8 -*-
1496 | """
1497 | Created on Sun May 14 10:27:41 2017
1498 | @author: 老虎趴趴走
1499 | """
1500 | import pandas as pd
1501 | import numpy as np
1502 | # import datetime
1503 | import math
1504 |
1505 | def user_features(user, ful_action, sub_action, end_date):
1506 | dump_path='./cache/user_features_%s_0514_2.csv'%(end_date)
1507 | if os.path.exists(dump_path):
1508 | actions = pd.read_csv(dump_path)
1509 |
1510 | else:
1511 | end_date=pd.to_datetime(end_date)
1512 | day = timedelta(1, 0)
1513 |         print('=====> Extracting features...')
1514 |         sub_1 = sub_action[(sub_action['time'] >= end_date - 1*day) & (sub_action['time'] < end_date)]
     |         # [original file lines 1515-1741 did not survive extraction; the remaining fragments
     |         #  show the same slicing repeated for sub_action over the last 3/5/30 days and for
     |         #  ful_action over the last 5/30 days, followed by the aggregation that assembles
     |         #  the per-window user features into `actions`]
     |         print('=====> Done!')
1742 |         actions.to_csv(dump_path,index=False)
1743 |
1744 | # user_id = actions[['user_id']]
1745 | # del actions['user_id']
1746 | # actions = actions.fillna(0)
1747 | # actions=actions.replace(np.inf,0)
1748 | # print(actions.head())
1749 | # columns = actions.columns
1750 |
1751 | # min_max_scale = preprocessing.MinMaxScaler()
1752 | # actions=actions.replace(np.inf,0)
1753 | # actions = min_max_scale.fit_transform(actions.values)
1754 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
1755 | return actions
1756 |
1757 | import pandas as pd
1758 | ful_action = pd.read_csv('./data/JData_Action.csv', parse_dates=[2], infer_datetime_format=True)
1759 | sub_action = pd.read_csv('./data/JData_subset_action.csv', parse_dates=[2, 7], infer_datetime_format=True)
1760 | user = pd.read_csv('./data/JData_modified_user.csv', parse_dates=[4])
1761 | # user_features(user, ful_action, sub_action, '2016-04-11')
1762 |
1763 | print("U model 2 finish part_1")
1764 | ######################################################################################
1765 |
1766 |
1787 | # In[8]:
1788 |
1789 | # test set
1790 | # ful_action = pd.read_csv('./data/JData_Action.csv', parse_dates=[2], infer_datetime_format=True)
1791 | # sel_action = pd.read_csv('./data/JData_subset_action.csv', parse_dates=[2, 7], infer_datetime_format=True)
1792 | def make_test_set(train_start_date, train_end_date,user,ful_action,sub_action):
1793 | dump_path = './cache/bu10525model_2_u_test_set_%s_%s.csv' % (train_start_date, train_end_date)
1794 | if os.path.exists(dump_path):
1795 | actions = pd.read_csv(dump_path)
1796 | else:
1797 | start_days=str(pd.to_datetime(train_end_date)-timedelta(days=30)).split(' ')[0]
1798 | actions_1 = get_actions(start_days, train_end_date)
1799 | actions=actions_1[actions_1['cate']==8][['user_id']].drop_duplicates(['user_id'])
1800 | # buy_actions = actions_1[(actions_1['type']==4)&(actions_1['cate']==8)][['user_id']].drop_duplicates()
1801 | # actions = actions[actions['user_id'].isin(buy_actions['user_id'])==False]
1802 |
1803 | # start_days=str(pd.to_datetime(train_end_date)-timedelta(days=30)).split(' ')[0]
1804 | # actions_1 = get_actions(start_days, train_end_date)
1805 | # actions_1 = actions_1[(actions_1['type']==2)|(actions_1['type']==4)|(actions_1['type']==5)]
1806 | # actions_1=actions_1[actions_1['cate']==8][['user_id']].drop_duplicates(['user_id'])
1807 |
1808 |
1809 | # actions = pd.concat([actions,actions_1]).drop_duplicates(['user_id'])
1810 |
1811 |
1812 | print (actions.shape)
1813 | # start_days = train_start_date
1814 | start_days = "2016-02-01"
1815 | # actions = pd.merge(actions,get_basic_user_feat() , how='left', on='user_id')
1816 | # print(actions.shape)
1817 | #
1818 |
1819 | # actions = pd.merge(actions, get_action_user_feat1(start_days, train_end_date), how='left', on='user_id')
1820 | # print(actions.shape)
1821 | actions = pd.merge(actions, get_action_user_feat2(start_days, train_end_date), how='left', on='user_id')
1822 | print(actions.shape)
1823 | actions = pd.merge(actions, get_action_user_feat5(start_days, train_end_date), how='left', on='user_id')
1824 | print(actions.shape)
1825 | actions = pd.merge(actions, get_action_user_feat6(start_days, train_end_date), how='left', on='user_id')
1826 | print(actions.shape)
1827 | actions = pd.merge(actions, get_action_user_feat6_six(start_days, train_end_date), how='left', on='user_id')
1828 | print(actions.shape)
1829 | actions = pd.merge(actions, get_action_user_feat7(start_days, train_end_date), how='left', on='user_id')
1830 | print(actions.shape)
1831 | actions = pd.merge(actions, get_action_user_feat8(start_days, train_end_date), how='left', on='user_id')
1832 | print (actions.shape)
1833 | actions = pd.merge(actions, get_action_user_feat8_2(start_days, train_end_date), how='left', on='user_id')
1834 | print (actions.shape)
1835 | actions = pd.merge(actions, get_action_user_feat9(start_days, train_end_date), how='left', on='user_id')
1836 | print (actions.shape)
1837 | actions = pd.merge(actions, get_action_user_feat10(start_days, train_end_date), how='left', on='user_id')
1838 | print (actions.shape)
1839 | actions = pd.merge(actions, get_action_user_feat12(train_start_date, train_end_date), how='left', on='user_id')
1840 | print (actions.shape)
1841 | actions = pd.merge(actions, get_action_user_feat14(train_start_date, train_end_date), how='left', on='user_id')
1842 | print (actions.shape)
1843 | actions = pd.merge(actions, get_action_user_feat15(start_days, train_end_date), how='left', on='user_id')
1844 | print (actions.shape)
1845 | actions = pd.merge(actions, get_action_user_feat16(start_days, train_end_date), how='left', on='user_id')
1846 | print (actions.shape)
1847 | actions = pd.merge(actions, get_action_u0513_feat16(start_days, train_end_date), how='left', on='user_id')
1848 | print (actions.shape)
1849 | actions = pd.merge(actions, user_features(user,ful_action,sub_action,train_end_date), how='left', on='user_id')
1850 | print (actions.shape)
1851 | actions = pd.merge(actions, get_action_user_feat0515_2_1(train_start_date, train_end_date), how='left', on='user_id')
1852 | print (actions.shape)
1853 | actions = pd.merge(actions, get_action_user_feat0515_2_2(train_start_date, train_end_date), how='left', on='user_id')
1854 | print (actions.shape)
1855 |
1856 |         # model 1 and model 2
1857 | actions = pd.merge(actions, get_action_u0509_feat_29(train_start_date, train_end_date), how='left', on='user_id')
1858 | print (actions.shape)
1859 |         # model 2
1860 | actions = pd.merge(actions, get_action_u0521_feat_32(train_start_date, train_end_date), how='left', on='user_id')
1861 |
1862 |
1863 | # actions = pd.merge(actions, get_action_u0524_feat1(start_days, train_end_date), how='left', on='user_id')
1864 | # print (actions.shape)
1865 |
1866 | # actions = pd.merge(actions, get_action_u0524_feat2(start_days, train_end_date), how='left', on='user_id')
1867 | # print (actions.shape)
1868 | # actions = pd.merge(actions, get_action_u0524_feat3(start_days, train_end_date), how='left', on='user_id')
1869 | # print (actions.shape)
1870 |
1871 | for i in (1, 2, 3, 7, 14, 28):
1872 | actions = pd.merge(actions, get_action_user_feat_six_xingwei(train_start_date, train_end_date, i), how='left',on='user_id')
1873 | actions = pd.merge(actions, deal_user_six_deal(train_start_date, train_end_date, i), how='left',on='user_id')
1874 | actions = pd.merge(actions, get_action_user_feat11(train_start_date, train_end_date, i), how='left',on='user_id')
1875 | actions = pd.merge(actions, get_action_user_feat13(train_start_date, train_end_date, i), how='left',on='user_id')
1876 | actions = pd.merge(actions, get_action_user_feat0509_1_30(train_start_date, train_end_date, i), how='left',on='user_id')
1877 | actions = pd.merge(actions, get_action_user_feat0515_2_3(train_start_date, train_end_date, i), how='left',on='user_id')
1878 | actions = pd.merge(actions, get_action_feat(train_start_date, train_end_date,i), how='left', on='user_id')
1879 | actions = pd.merge(actions, get_action_user_feat0515_2_4(train_start_date, train_end_date,i), how='left', on='user_id')
1880 | actions = pd.merge(actions, get_action_u0515_feat5(train_start_date, train_end_date,i), how='left', on='user_id')
1881 |                 # model 1 and model 2
1882 | actions = pd.merge(actions, get_action_u0509_feat_28(train_start_date, train_end_date,i), how='left', on='user_id')
1883 | if(i<=10):
1884 | actions = pd.merge(actions,get_action_user_feat0509_1_31(train_start_date, train_end_date,i), how='left', on='user_id')
1885 |                 # model 2
1886 | actions = pd.merge(actions, get_action_u0521_feat_31(train_start_date, train_end_date,i), how='left', on='user_id')
1887 | actions = pd.merge(actions, get_action_user_feat7_0522_huachuang(train_start_date, train_end_date,i), how='left', on='user_id')
1888 | print(actions.shape)
1889 | print(actions.shape)
1890 |
1891 | actions = actions.fillna(0)
1892 | # user_id = actions[['user_id']]
1893 | # del actions['user_id']
1894 | # actions = actions.fillna(0)
1895 | # actions=actions.replace(np.inf,0)
1896 | # # print(actions.head())
1897 | # columns = actions.columns
1898 |
1899 | # min_max_scale = preprocessing.MinMaxScaler()
1900 | # actions=actions.replace(np.inf,0)
1901 | # actions = min_max_scale.fit_transform(actions.values)
1902 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
1903 | # actions.to_csv(dump_path,index=False)
1904 | return actions
1905 |
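     | # Sketch of the assembly pattern used by make_test_set/make_train_set: start from the
     | # candidate user list and left-merge each feature frame on user_id, so users missing a
     | # feature get NaN (zero-filled at the end). `feature_frames` is a stand-in name.
     | def _demo_assemble(candidates, feature_frames):
     |     import functools
     |     import pandas as pd
     |     merged = functools.reduce(
     |         lambda acc, f: pd.merge(acc, f, how='left', on='user_id'),
     |         feature_frames, candidates)
     |     return merged.fillna(0)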
1906 |
1907 | # training set
1908 | def make_train_set(train_start_date, train_end_date, test_start_date, test_end_date,user,ful_action,sub_action):
1909 | dump_path = './cache/bu10525model_2_u_train_set_%s_%s_%s_%s.csv' % (train_start_date, train_end_date, test_start_date, test_end_date)
1910 | if os.path.exists(dump_path):
1911 | actions = pd.read_csv(dump_path)
1912 | else:
1913 |
1914 | start_days=str(pd.to_datetime(train_end_date)-timedelta(days=30)).split(' ')[0]
1915 | actions_1 = get_actions(start_days, train_end_date)
1916 | actions=actions_1[actions_1['cate']==8][['user_id']].drop_duplicates(['user_id'])
1917 | # buy_actions = actions_1[(actions_1['type']==4)&(actions_1['cate']==8)][['user_id']].drop_duplicates()
1918 | # actions = actions[actions['user_id'].isin(buy_actions['user_id'])==False]
1919 |
1920 |
1921 |
1922 | # print (actions.shape)
1923 |
1924 | # start_days=str(pd.to_datetime(train_end_date)-timedelta(days=30)).split(' ')[0]
1925 | # actions_1 = get_actions(start_days, train_end_date)
1926 | # actions_1 = actions_1[(actions_1['type']==2)|(actions_1['type']==4)|(actions_1['type']==5)]
1927 | # actions_1=actions_1[actions_1['cate']==8][['user_id']].drop_duplicates(['user_id'])
1928 | # actions = pd.concat([actions,actions_1]).drop_duplicates(['user_id'])
1929 | print (actions.shape)
1930 | # start_days = train_start_date
1931 | start_days = "2016-02-01"
1932 | # actions = pd.merge(actions,get_basic_user_feat() , how='left', on='user_id')
1933 | print(actions.shape)
1934 |
1935 | # actions = pd.merge(actions, get_action_user_feat1(start_days, train_end_date), how='left', on='user_id')
1936 | # print(actions.shape)
1937 | actions = pd.merge(actions, get_action_user_feat2(start_days, train_end_date), how='left', on='user_id')
1938 | print(actions.shape)
1939 | actions = pd.merge(actions, get_action_user_feat5(start_days, train_end_date), how='left', on='user_id')
1940 | print(actions.shape)
1941 | actions = pd.merge(actions, get_action_user_feat6(start_days, train_end_date), how='left', on='user_id')
1942 | print(actions.shape)
1943 | actions = pd.merge(actions, get_action_user_feat6_six(start_days, train_end_date), how='left', on='user_id')
1944 | print(actions.shape)
1945 | actions = pd.merge(actions, get_action_user_feat7(start_days, train_end_date), how='left', on='user_id')
1946 | print(actions.shape)
1947 | actions = pd.merge(actions, get_action_user_feat8(start_days, train_end_date), how='left', on='user_id')
1948 | print (actions.shape)
1949 | actions = pd.merge(actions, get_action_user_feat8_2(start_days, train_end_date), how='left', on='user_id')
1950 | print (actions.shape)
1951 | actions = pd.merge(actions, get_action_user_feat9(start_days, train_end_date), how='left', on='user_id')
1952 | print (actions.shape)
1953 | actions = pd.merge(actions, get_action_user_feat10(start_days, train_end_date), how='left', on='user_id')
1954 | print (actions.shape)
1955 | actions = pd.merge(actions, get_action_user_feat12(train_start_date, train_end_date), how='left', on='user_id')
1956 | print (actions.shape)
1957 | actions = pd.merge(actions, get_action_user_feat14(train_start_date, train_end_date), how='left', on='user_id')
1958 | print (actions.shape)
1959 | actions = pd.merge(actions, get_action_user_feat15(start_days, train_end_date), how='left', on='user_id')
1960 | print (actions.shape)
1961 | actions = pd.merge(actions, get_action_user_feat16(start_days, train_end_date), how='left', on='user_id')
1962 | print (actions.shape)
1963 | actions = pd.merge(actions, get_action_u0513_feat16(start_days, train_end_date), how='left', on='user_id')
1964 | print (actions.shape)
1965 | actions = pd.merge(actions, user_features(user,ful_action,sub_action,train_end_date), how='left', on='user_id')
1966 | print (actions.shape)
1967 |
1968 | actions = pd.merge(actions, get_action_user_feat0515_2_1(train_start_date, train_end_date), how='left', on='user_id')
1969 | print (actions.shape)
1970 | actions = pd.merge(actions, get_action_user_feat0515_2_2(train_start_date, train_end_date), how='left', on='user_id')
1971 | print (actions.shape)
1972 |
1973 | actions = pd.merge(actions, get_action_u0509_feat_29(train_start_date, train_end_date), how='left', on='user_id')
1974 | actions = pd.merge(actions, get_action_u0521_feat_32(train_start_date, train_end_date), how='left', on='user_id')
1975 |
1976 | # actions = pd.merge(actions, get_action_u0524_feat1(start_days, train_end_date), how='left', on='user_id')
1977 | # print (actions.shape)
1978 |
1979 | # actions = pd.merge(actions, get_action_u0524_feat2(start_days, train_end_date), how='left', on='user_id')
1980 | # print (actions.shape)
1981 | # actions = pd.merge(actions, get_action_u0524_feat3(start_days, train_end_date), how='left', on='user_id')
1982 | # print (actions.shape)
1983 | print (actions.shape)
for i in (1, 2, 3, 7, 14, 28):
    actions = pd.merge(actions, get_action_user_feat_six_xingwei(train_start_date, train_end_date, i), how='left', on='user_id')
    actions = pd.merge(actions, deal_user_six_deal(train_start_date, train_end_date, i), how='left', on='user_id')
    actions = pd.merge(actions, get_action_user_feat11(train_start_date, train_end_date, i), how='left', on='user_id')
    actions = pd.merge(actions, get_action_user_feat13(train_start_date, train_end_date, i), how='left', on='user_id')
    actions = pd.merge(actions, get_action_user_feat0509_1_30(train_start_date, train_end_date, i), how='left', on='user_id')
    actions = pd.merge(actions, get_action_user_feat0515_2_3(train_start_date, train_end_date, i), how='left', on='user_id')
    actions = pd.merge(actions, get_action_feat(train_start_date, train_end_date, i), how='left', on='user_id')
    actions = pd.merge(actions, get_action_user_feat0515_2_4(train_start_date, train_end_date, i), how='left', on='user_id')
    actions = pd.merge(actions, get_action_u0515_feat5(train_start_date, train_end_date, i), how='left', on='user_id')
    actions = pd.merge(actions, get_action_u0509_feat_28(train_start_date, train_end_date, i), how='left', on='user_id')
    if i <= 10:
        actions = pd.merge(actions, get_action_user_feat0509_1_31(train_start_date, train_end_date, i), how='left', on='user_id')
        actions = pd.merge(actions, get_action_u0521_feat_31(train_start_date, train_end_date, i), how='left', on='user_id')

    actions = pd.merge(actions, get_action_user_feat7_0522_huachuang(train_start_date, train_end_date, i), how='left', on='user_id')
2000 | print(actions.shape)
2001 | actions = pd.merge(actions, get_user_labels(test_start_date, test_end_date), how='left', on='user_id')
2002 |
2003 | actions = actions.fillna(0)
2004 | print(actions.shape)
2005 | # user_id = actions[['user_id']]
2006 | # del actions['user_id']
2007 | # actions = actions.fillna(0)
2008 | # actions=actions.replace(np.inf,0)
2009 | # # print(actions.head())
2010 | # columns = actions.columns
2011 |
2012 | # min_max_scale = preprocessing.MinMaxScaler()
2013 | # actions=actions.replace(np.inf,0)
2014 | # actions = min_max_scale.fit_transform(actions.values)
2015 | # actions = pd.concat([user_id, pd.DataFrame(actions,columns = columns)], axis=1)
2016 | # actions.to_csv(dump_path,index=False)
2017 | return actions
2018 |
2019 |
2020 | print("U model 2 finish part_3")
2021 |
2022 |
2023 |
2024 |
2025 |
2026 |
2027 | ###########################################################################################
2028 |
2029 |
2030 | # In[ ]:
2031 |
2032 |
2033 |
2034 |
2035 | # In[9]:
2036 |
2037 | #!/usr/bin/python
2038 |
import numpy as np
import pandas as pd  # used below; presumably imported earlier in the full file, repeated so this part runs standalone
import xgboost as xgb
# from user_feat import *
from datetime import datetime  # datetime.now() is used below to build a timestamp
from sklearn.model_selection import train_test_split
2043 |
2044 |
2045 | train_start_date = '2016-03-10'
2046 | train_end_date = '2016-04-11'
2047 | test_start_date = '2016-04-11'
2048 | test_end_date = '2016-04-16'
2049 |
2050 | # train_start_date='2016-03-05'
2051 | # train_end_date='2016-04-06'
2052 | # test_start_date='2016-04-06'
2053 | # test_end_date='2016-04-11'
2054 |
2055 | sub_start_date = '2016-03-15'
2056 | sub_end_date = '2016-04-16'
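
# Added sanity check (assumption: the windows above are intentional): the label
# window is the 5-day target period, and the train and submission feature windows
# are both 32 days, so both feature sets are built over equal-length histories.
assert (pd.Timestamp(test_end_date) - pd.Timestamp(test_start_date)).days == 5
assert (pd.Timestamp(train_end_date) - pd.Timestamp(train_start_date)).days == \
       (pd.Timestamp(sub_end_date) - pd.Timestamp(sub_start_date)).days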
2057 |
# Training data set
actions = make_train_set(train_start_date, train_end_date, test_start_date, test_end_date, user, ful_action, sub_action)
2060 | # print(np.isinf(actions))
2061 | # print(np.isnan(actions))
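# Added sketch: ratio-style features can yield +/-inf; xgboost treats NaN as
# missing but may reject inf, which the commented isinf/isnan probes above seem
# to have been checking for. Zeroing them out, if needed, would be:
# actions = actions.replace([np.inf, -np.inf], 0)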
2062 |
2063 |
2064 |
2065 |
2066 |
2067 | # for index in feature_name[1:-1]:
2068 | # actions["r"+index]=actions[index].rank(method='max')/actions.shape[0]
2069 |
2070 |
2071 | ## train_neg,test_neg=train_test_split(actions_neg.values,test_size=0.15,random_state=0)
2072 |
2073 | # #test_neg = pd.DataFrame(test_neg,columns=actions_neg.columns)
2074 |
2075 | # #actions=pd.concat([actions_pos,test_neg])
2076 |
2077 | # actions_pos= pd.concat([actions_pos,actions_pos])
2078 | # actions_pos= pd.concat([actions_pos,actions_pos])
2079 | # actions_pos= pd.concat([actions_pos,actions_pos])
2080 | # actions_pos= pd.concat([actions_pos,actions_pos])
2081 | # actions=pd.concat([actions_pos,actions_neg])
2082 | print("+++++++++++++++++++++++")
2083 |
2084 |
2085 |
train, test = train_test_split(actions.values, test_size=0.2, random_state=0)
train = pd.DataFrame(train, columns=actions.columns)
test = pd.DataFrame(test, columns=actions.columns)

X_train = train.drop(['user_id', 'label'], axis=1)
X_test = test.drop(['user_id', 'label'], axis=1)
y_train = train[['label']]
y_test = test[['label']]
train_index = train[['user_id']].copy()
test_index = test[['user_id']].copy()
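
# Alternative sketch (not what the script above does): purchases are rare, so a
# stratified split would keep the same positive rate in both folds:
# train, test = train_test_split(actions, test_size=0.2, random_state=0,
#                                stratify=actions['label'])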
2096 |
2097 |
# Test (submission) data set
sub_test_data = make_test_set(sub_start_date, sub_end_date, user, ful_action, sub_action)
sub_trainning_data = sub_test_data.drop(['user_id'], axis=1)
sub_user_index = sub_test_data[['user_id']].copy()
2102 |
2103 |
2104 | print("U model 2 finish part_4")
2105 |
2106 | ########################################################################
2107 |
2108 |
2109 | # In[ ]:
2110 |
2111 |
2112 |
2113 |
2114 | # In[11]:
2115 |
2116 |
2117 | print ('==========>>>train xgboost model ....')
2118 |
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
param = {'learning_rate': 0.1,    # alias of 'eta' in recent xgboost; the 'eta': 0.05 below is what takes effect here
         'n_estimators': 1000,    # sklearn-wrapper parameter; xgb.train ignores it (num_round below sets the rounds)
         'max_depth': 3,
         'min_child_weight': 5,
         'gamma': 0,
         'subsample': 1.0,
         'colsample_bytree': 0.8,
         'eta': 0.05,
         'silent': 1,
         'objective': 'binary:logistic',
         'scale_pos_weight': 1}
2133 |
2134 |
2135 |
num_round = 120
plst = list(param.items())
plst += [('eval_metric', 'logloss')]

evallist = [(dtest, 'eval'), (dtrain, 'train')]
bst = xgb.train(plst, dtrain, num_round, evallist, early_stopping_rounds=10)
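
# Added note (assumes the 2017-era xgboost API): with early_stopping_rounds the
# booster keeps all trained rounds, and predict() does not truncate to the best
# one automatically; the predict call further below could pass it explicitly:
# y = bst.predict(sub_trainning_data_1, ntree_limit=bst.best_ntree_limit)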
2142 |
2143 |
2144 | # ============================================>>>>
2145 | print ('==========>>>predict test data label')
2146 |
2147 |
2148 | sub_trainning_data_1 = xgb.DMatrix(sub_trainning_data)
2149 | y = bst.predict(sub_trainning_data_1)
sub_user_index['label'] = y
pred = sub_user_index

# print(sub_user_index.head())
# keep one row per user_id: sort so each user's highest score comes first, then take it
pred = pred.sort_values(by=['user_id', 'label'], ascending=[False, False])
pred = pred.groupby('user_id').first().reset_index()
result = pred.sort_values(by=['label'], ascending=[False])
result['user_id'] = result['user_id'].astype('int')
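
# Equivalent sketch: for a frame with only ['user_id', 'label'] columns, the
# sort + groupby().first() sequence above collapses to a plain per-user max:
# result = (pred.groupby('user_id', as_index=False)['label'].max()
#               .sort_values('label', ascending=False))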
2161 |
2162 |
name = str(datetime.now()).replace(':', '-').split('.')[0]  # timestamp string; not used in the fixed filename below
result.to_csv('./sub/Umodel_2.csv', index=False)
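# Added sketch: 'name' is computed above but unused; a timestamped copy of the
# submission (hypothetical path pattern) could be kept alongside the fixed one:
# result.to_csv('./sub/Umodel_2_{}.csv'.format(name), index=False)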
2165 | print("U model 2 finish part_5")
2166 |
2167 |
--------------------------------------------------------------------------------